reduce number of Unicode private-use characters

This narrows the range of Unicode codepoints fish reserves for its own
use from U+E000 thru U+F8FE (6399 codepoints) to U+F600 thru U+F73F (320
codepoints). This is still not ideal since fish shouldn't be using any
Unicode private-use codepoints but it's a step in the right direction.

This partially addresses issue #2684.
This commit is contained in:
Kurtis Rader 2016-01-21 19:56:39 -08:00
parent b41b962336
commit f2246dfb34
10 changed files with 184 additions and 186 deletions

View file

@ -215,6 +215,11 @@ cd /usr/local/bin
rm -f fish fish_indent rm -f fish fish_indent
\endfish \endfish
<hr>
\section faq-reserved-chars Unicode private-use characters reserved by fish
Fish reserves the <a href="http://www.unicode.org/faq/private_use.html">Unicode private-use character range</a> from U+F600 through U+F73F for internal use. Any attempt to feed characters in that range to fish will result in them being replaced by the Unicode "replacement character" U+FFFD. This includes both interactive input and any file read by fish (but not programs run by fish).
\htmlonly[block] \htmlonly[block]
</div> </div>
\endhtmlonly \endhtmlonly

View file

@ -35,34 +35,59 @@
typedef std::wstring wcstring; typedef std::wstring wcstring;
typedef std::vector<wcstring> wcstring_list_t; typedef std::vector<wcstring> wcstring_list_t;
/** // Maximum number of bytes used by a single utf-8 character.
Maximum number of bytes used by a single utf-8 character
*/
#define MAX_UTF8_BYTES 6 #define MAX_UTF8_BYTES 6
/** // Highest legal ASCII value.
This is in the unicode private use area.
*/
#define ENCODE_DIRECT_BASE 0xf100
/**
Highest legal ascii value
*/
#define ASCII_MAX 127u #define ASCII_MAX 127u
/** // Highest legal 16-bit Unicode value.
Highest legal 16-bit unicode value #define UCS2_MAX 0xFFFFu
*/
#define UCS2_MAX 0xffffu
/** // Highest legal byte value.
Highest legal byte value #define BYTE_MAX 0xFFu
*/
#define BYTE_MAX 0xffu
/** BOM value */ // Unicode BOM value.
#define UTF8_BOM_WCHAR 0xFEFFu #define UTF8_BOM_WCHAR 0xFEFFu
// Unicode replacement character.
#define REPLACEMENT_WCHAR 0xFFFDu

// Use Unicode "noncharacters" for internal characters as much as we can. This
// gives us 32 "characters" for internal use that we can guarantee should not
// appear in our input stream. See http://www.unicode.org/faq/private_use.html.
//
// RESERVED_CHAR_END is exclusive: the usable values run from
// RESERVED_CHAR_BASE through RESERVED_CHAR_END - 1.
#define RESERVED_CHAR_BASE 0xFDD0u
#define RESERVED_CHAR_END 0xFDF0u

// Split the available noncharacter values into two ranges of 16 values each
// to ensure there are no conflicts among the places we use these special
// characters. Each *_END value is exclusive and doubles as the base of the
// range that follows it.
#define EXPAND_RESERVED_BASE RESERVED_CHAR_BASE
#define EXPAND_RESERVED_END (EXPAND_RESERVED_BASE + 16)
#define WILDCARD_RESERVED_BASE EXPAND_RESERVED_END
#define WILDCARD_RESERVED_END (WILDCARD_RESERVED_BASE + 16)

// Make sure the ranges defined above don't exceed the range for noncharacters.
// This is to make sure we didn't do something stupid in subdividing the
// Unicode range for our needs.
#if WILDCARD_RESERVED_END > RESERVED_CHAR_END
#error "WILDCARD_RESERVED_END exceeds RESERVED_CHAR_END: the reserved sub-ranges overflow the noncharacter block"
#endif
// These are in the Unicode private-use range. We really shouldn't use this
// range but have little choice in the matter given how our lexer/parser works.
// We can't use non-characters for these two ranges because there are only 66 of
// them and we need at least 256 + 64.
//
// If sizeof(wchar_t) == 4 we could avoid using private-use chars; however, that
// would result in fish having different behavior on machines with 16 versus 32
// bit wchar_t. It's better that fish behave the same on both types of systems.
//
// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know
// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF)
// on Mac OS X. See http://www.unicode.org/faq/private_use.html.
//
// Each *_END value is exclusive (one past the last usable value).
#define ENCODE_DIRECT_BASE 0xF600u
#define ENCODE_DIRECT_END (ENCODE_DIRECT_BASE + 256)
#define INPUT_COMMON_BASE 0xF700u
#define INPUT_COMMON_END (INPUT_COMMON_BASE + 64)

// Compile-time sanity checks: the two private-use ranges must not overlap each
// other and must stay below the 0xF800 - 0xF8FF block we deliberately avoid.
#if (ENCODE_DIRECT_BASE < INPUT_COMMON_END) && (INPUT_COMMON_BASE < ENCODE_DIRECT_END)
#error "ENCODE_DIRECT and INPUT_COMMON private-use ranges overlap"
#endif
#if (ENCODE_DIRECT_END > 0xF800u) || (INPUT_COMMON_END > 0xF800u)
#error "private-use ranges extend into the 0xF800 - 0xF8FF block that must be left unused"
#endif
/* Flags for unescape_string functions */ /* Flags for unescape_string functions */
enum enum
{ {

View file

@ -26,92 +26,64 @@
enum enum
{ {
/** Flag specifying that cmdsubst expansion should be skipped */ // Flag specifying that cmdsubst expansion should be skipped.
EXPAND_SKIP_CMDSUBST = 1 << 0, EXPAND_SKIP_CMDSUBST = 1 << 0,
// Flag specifying that variable expansion should be skipped.
/** Flag specifying that variable expansion should be skipped */
EXPAND_SKIP_VARIABLES = 1 << 1, EXPAND_SKIP_VARIABLES = 1 << 1,
// Flag specifying that wildcard expansion should be skipped.
/** Flag specifying that wildcard expansion should be skipped */
EXPAND_SKIP_WILDCARDS = 1 << 2, EXPAND_SKIP_WILDCARDS = 1 << 2,
// The expansion is being done for tab or auto completions. Returned
/** // completions may have the wildcard as a prefix instead of a match.
The expansion is being done for tab or auto completions. Returned completions may have the wildcard as a prefix instead of a match.
*/
EXPAND_FOR_COMPLETIONS = 1 << 3, EXPAND_FOR_COMPLETIONS = 1 << 3,
// Only match files that are executable by the current user. Only
/** Only match files that are executable by the current user. Only applicable together with ACCEPT_INCOMPLETE. */ // applicable together with ACCEPT_INCOMPLETE.
EXECUTABLES_ONLY = 1 << 4, EXECUTABLES_ONLY = 1 << 4,
// Only match directories. Only applicable together with ACCEPT_INCOMPLETE.
/** Only match directories. Only applicable together with ACCEPT_INCOMPLETE. */
DIRECTORIES_ONLY = 1 << 5, DIRECTORIES_ONLY = 1 << 5,
// Don't generate descriptions.
/** Don't generate descriptions */
EXPAND_NO_DESCRIPTIONS = 1 << 6, EXPAND_NO_DESCRIPTIONS = 1 << 6,
// Don't expand jobs (but you can still expand processes). This is because
/** Don't expand jobs (but you can still expand processes). This is because job expansion is not thread safe. */ // job expansion is not thread safe.
EXPAND_SKIP_JOBS = 1 << 7, EXPAND_SKIP_JOBS = 1 << 7,
// Don't expand home directories.
/** Don't expand home directories */
EXPAND_SKIP_HOME_DIRECTORIES = 1 << 8, EXPAND_SKIP_HOME_DIRECTORIES = 1 << 8,
// Allow fuzzy matching.
/** Allow fuzzy matching */
EXPAND_FUZZY_MATCH = 1 << 9, EXPAND_FUZZY_MATCH = 1 << 9,
// Disallow directory abbreviations like /u/l/b for /usr/local/bin. Only
/** Disallow directory abbreviations like /u/l/b for /usr/local/bin. Only applicable if EXPAND_FUZZY_MATCH is set. */ // applicable if EXPAND_FUZZY_MATCH is set.
EXPAND_NO_FUZZY_DIRECTORIES = 1 << 10, EXPAND_NO_FUZZY_DIRECTORIES = 1 << 10,
// Do expansions specifically to support cd (CDPATH, etc).
/** Do expansions specifically to support cd (CDPATH, etc) */
EXPAND_SPECIAL_CD = 1 << 11 EXPAND_SPECIAL_CD = 1 << 11
}; };
typedef int expand_flags_t; typedef int expand_flags_t;
/**
Use unencoded private-use keycodes for internal characters
*/
#define EXPAND_RESERVED 0xf000
/**
End of range reserved for expand
*/
#define EXPAND_RESERVED_END 0xf000f
class completion_t; class completion_t;
enum enum
{ {
/** Character represeting a home directory */ // Character representing a home directory.
HOME_DIRECTORY = EXPAND_RESERVED, HOME_DIRECTORY = EXPAND_RESERVED_BASE,
// Character representing process expansion.
/** Character represeting process expansion */
PROCESS_EXPAND, PROCESS_EXPAND,
// Character representing variable expansion.
/** Character representing variable expansion */
VARIABLE_EXPAND, VARIABLE_EXPAND,
// Character representing variable expansion into a single element.
/** Character rpresenting variable expansion into a single element*/
VARIABLE_EXPAND_SINGLE, VARIABLE_EXPAND_SINGLE,
// Character representing the start of a bracket expansion.
/** Character representing the start of a bracket expansion */
BRACKET_BEGIN, BRACKET_BEGIN,
// Character representing the end of a bracket expansion.
/** Character representing the end of a bracket expansion */
BRACKET_END, BRACKET_END,
// Character representing separation between two bracket elements.
/** Character representing separation between two bracket elements */
BRACKET_SEP, BRACKET_SEP,
/** // Separate subtokens in a token with this character.
Separate subtokens in a token with this character.
*/
INTERNAL_SEPARATOR, INTERNAL_SEPARATOR,
// Character representing an empty variable expansion. Only used
/** // transitively while expanding variables.
Character representing an empty variable expansion.
Only used transitively while expanding variables.
*/
VARIABLE_EXPAND_EMPTY, VARIABLE_EXPAND_EMPTY,
} // This is a special pseudo-char that is not used other than to mark the
; // end of the special characters so we can sanity check the enum range.
EXPAND_SENTINAL
};
/** These are the possible return values for expand_string. Note how zero value is the only error. */ /** These are the possible return values for expand_string. Note how zero value is the only error. */
enum expand_error_t enum expand_error_t

View file

@ -22,6 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
#include "config.h" #include "config.h"
#include <assert.h>
#include <limits.h> #include <limits.h>
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
@ -63,6 +64,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
#include "input.h" #include "input.h"
#include "io.h" #include "io.h"
#include "fish_version.h" #include "fish_version.h"
#include "input_common.h"
#include "wildcard.h"
/* PATH_MAX may not exist */ /* PATH_MAX may not exist */
#ifndef PATH_MAX #ifndef PATH_MAX
@ -484,6 +487,14 @@ int main(int argc, char **argv)
int res=1; int res=1;
int my_optind=0; int my_optind=0;
// We can't do this at compile time due to the use of enum symbols.
assert(EXPAND_SENTINAL >= EXPAND_RESERVED_BASE &&
EXPAND_SENTINAL <= EXPAND_RESERVED_END);
assert(ANY_SENTINAL >= WILDCARD_RESERVED_BASE &&
ANY_SENTINAL <= WILDCARD_RESERVED_END);
assert(R_SENTINAL >= INPUT_COMMON_BASE &&
R_SENTINAL <= INPUT_COMMON_END);
set_main_thread(); set_main_thread();
setup_fork_guards(); setup_fork_guards();

View file

@ -335,7 +335,7 @@ static bool has_expand_reserved(const wcstring &str)
for (size_t i=0; i < str.size(); i++) for (size_t i=0; i < str.size(); i++)
{ {
wchar_t wc = str.at(i); wchar_t wc = str.at(i);
if (wc >= EXPAND_RESERVED && wc <= EXPAND_RESERVED_END) if (wc >= EXPAND_RESERVED_BASE && wc <= EXPAND_RESERVED_END)
{ {
result = true; result = true;
break; break;

View file

@ -16,77 +16,11 @@ inputrc information for key bindings.
#include "env.h" #include "env.h"
#include "input_common.h" #include "input_common.h"
#define DEFAULT_BIND_MODE L"default" #define DEFAULT_BIND_MODE L"default"
#define FISH_BIND_MODE_VAR L"fish_bind_mode" #define FISH_BIND_MODE_VAR L"fish_bind_mode"
/**
Key codes for inputrc-style keyboard functions that are passed on
to the caller of input_read()
NOTE: IF YOU MODIFY THIS YOU MUST UPDATE THE name_arr AND code_arr VARIABLES TO MATCH!
*/
enum
{
R_BEGINNING_OF_LINE = R_NULL+10, /* This give input_common ten slots for lowlevel keycodes */
R_END_OF_LINE,
R_FORWARD_CHAR,
R_BACKWARD_CHAR,
R_FORWARD_WORD,
R_BACKWARD_WORD,
R_FORWARD_BIGWORD,
R_BACKWARD_BIGWORD,
R_HISTORY_SEARCH_BACKWARD,
R_HISTORY_SEARCH_FORWARD,
R_DELETE_CHAR,
R_BACKWARD_DELETE_CHAR,
R_KILL_LINE,
R_YANK,
R_YANK_POP,
R_COMPLETE,
R_COMPLETE_AND_SEARCH,
R_BEGINNING_OF_HISTORY,
R_END_OF_HISTORY,
R_BACKWARD_KILL_LINE,
R_KILL_WHOLE_LINE,
R_KILL_WORD,
R_KILL_BIGWORD,
R_BACKWARD_KILL_WORD,
R_BACKWARD_KILL_PATH_COMPONENT,
R_BACKWARD_KILL_BIGWORD,
R_HISTORY_TOKEN_SEARCH_BACKWARD,
R_HISTORY_TOKEN_SEARCH_FORWARD,
R_SELF_INSERT,
R_TRANSPOSE_CHARS,
R_TRANSPOSE_WORDS,
R_UPCASE_WORD,
R_DOWNCASE_WORD,
R_CAPITALIZE_WORD,
R_VI_ARG_DIGIT,
R_VI_DELETE_TO,
R_EXECUTE,
R_BEGINNING_OF_BUFFER,
R_END_OF_BUFFER,
R_REPAINT,
R_FORCE_REPAINT,
R_UP_LINE,
R_DOWN_LINE,
R_SUPPRESS_AUTOSUGGESTION,
R_ACCEPT_AUTOSUGGESTION,
R_BEGIN_SELECTION,
R_END_SELECTION,
R_KILL_SELECTION,
R_FORWARD_JUMP,
R_BACKWARD_JUMP,
R_AND,
R_CANCEL
};
wcstring describe_char(wint_t c); wcstring describe_char(wint_t c);
#define R_MIN R_NULL
#define R_MAX R_CANCEL
/** /**
Initialize the terminal by calling setupterm, and set up arrays Initialize the terminal by calling setupterm, and set up arrays
used by readch to detect escape sequences for special keys. used by readch to detect escape sequences for special keys.

View file

@ -8,22 +8,77 @@ Header file for the low level input library
#include <stddef.h> #include <stddef.h>
/** #include "common.h"
Use unencoded private-use keycodes for internal characters
*/
#define INPUT_COMMON_RESERVED 0xe000
enum enum
{ {
/** R_MIN = INPUT_COMMON_BASE,
R_NULL is sometimes returned by the input when a character was // R_NULL is sometimes returned by the input when a character was requested
requested but none could be delivered, or when an exception // but none could be delivered, or when an exception happened.
happened. R_NULL = R_MIN,
*/ R_EOF,
R_NULL = INPUT_COMMON_RESERVED, // Key codes for inputrc-style keyboard functions that are passed on
R_EOF // to the caller of input_read().
} //
; // NOTE: If you modify this sequence of symbols you must update the
// name_arr, code_arr and desc_arr variables in input.cpp to match!
R_BEGINNING_OF_LINE,
R_END_OF_LINE,
R_FORWARD_CHAR,
R_BACKWARD_CHAR,
R_FORWARD_WORD,
R_BACKWARD_WORD,
R_FORWARD_BIGWORD,
R_BACKWARD_BIGWORD,
R_HISTORY_SEARCH_BACKWARD,
R_HISTORY_SEARCH_FORWARD,
R_DELETE_CHAR,
R_BACKWARD_DELETE_CHAR,
R_KILL_LINE,
R_YANK,
R_YANK_POP,
R_COMPLETE,
R_COMPLETE_AND_SEARCH,
R_BEGINNING_OF_HISTORY,
R_END_OF_HISTORY,
R_BACKWARD_KILL_LINE,
R_KILL_WHOLE_LINE,
R_KILL_WORD,
R_KILL_BIGWORD,
R_BACKWARD_KILL_WORD,
R_BACKWARD_KILL_PATH_COMPONENT,
R_BACKWARD_KILL_BIGWORD,
R_HISTORY_TOKEN_SEARCH_BACKWARD,
R_HISTORY_TOKEN_SEARCH_FORWARD,
R_SELF_INSERT,
R_TRANSPOSE_CHARS,
R_TRANSPOSE_WORDS,
R_UPCASE_WORD,
R_DOWNCASE_WORD,
R_CAPITALIZE_WORD,
R_VI_ARG_DIGIT,
R_VI_DELETE_TO,
R_EXECUTE,
R_BEGINNING_OF_BUFFER,
R_END_OF_BUFFER,
R_REPAINT,
R_FORCE_REPAINT,
R_UP_LINE,
R_DOWN_LINE,
R_SUPPRESS_AUTOSUGGESTION,
R_ACCEPT_AUTOSUGGESTION,
R_BEGIN_SELECTION,
R_END_SELECTION,
R_KILL_SELECTION,
R_FORWARD_JUMP,
R_BACKWARD_JUMP,
R_AND,
R_CANCEL,
R_MAX = R_CANCEL,
// This is a special pseudo-char that is not used other than to mark the
// end of the special characters so we can sanity check the enum range.
R_SENTINAL
};
/** /**
Init the library Init the library

View file

@ -2964,16 +2964,20 @@ static int can_read(int fd)
return select(fd + 1, &fds, 0, 0, &can_read_timeout) == 1; return select(fd + 1, &fds, 0, 0, &can_read_timeout) == 1;
} }
/** // Test if the specified character is in a range that fish uses internally to
Test if the specified character is in the private use area that // store special tokens.
fish uses to store internal characters //
// NOTE: This is used when tokenizing the input. It is also used when reading
Note: Allow U+F8FF because that's the Apple symbol, which is in the // input, before tokenization, to replace such chars with REPLACEMENT_WCHAR if
OS X US keyboard layout. // they're not part of a quoted string. We don't want external input to be able
*/ // to feed reserved characters into our lexer/parser or code evaluator.
//
// TODO: Actually implement the replacement as documented above.
static int wchar_private(wchar_t c) static int wchar_private(wchar_t c)
{ {
return ((c >= 0xe000) && (c < 0xf8ff)); return ((c >= RESERVED_CHAR_BASE && c < RESERVED_CHAR_END) ||
(c >= ENCODE_DIRECT_BASE && c < ENCODE_DIRECT_END) ||
(c >= INPUT_COMMON_BASE && c < INPUT_COMMON_END));
} }
/** /**

View file

@ -632,8 +632,8 @@ void tokenizer_t::tok_next()
/*fwprintf( stderr, L"End of string\n" );*/ /*fwprintf( stderr, L"End of string\n" );*/
this->has_next = false; this->has_next = false;
break; break;
case 13: // carriage return case L'\r': // carriage-return
case L'\n': case L'\n': // newline
case L';': case L';':
this->last_type = TOK_END; this->last_type = TOK_END;
this->buff++; this->buff++;

View file

@ -18,27 +18,19 @@
#include "expand.h" #include "expand.h"
#include "complete.h" #include "complete.h"
/* // Enumeration of all wildcard types
Use unencoded private-use keycodes for internal characters
*/
#define WILDCARD_RESERVED 0xf400
/**
Enumeration of all wildcard types
*/
enum enum
{ {
/** Character representing any character except '/' */ // Character representing any character except '/' (slash).
ANY_CHAR = WILDCARD_RESERVED, ANY_CHAR = WILDCARD_RESERVED_BASE,
// Character representing any character string not containing '/' (slash).
/** Character representing any character string not containing '/' (A slash) */
ANY_STRING, ANY_STRING,
// Character representing any character string.
/** Character representing any character string */
ANY_STRING_RECURSIVE, ANY_STRING_RECURSIVE,
} // This is a special pseudo-char that is not used other than to mark the
; // end of the special characters so we can sanity check the enum range.
ANY_SENTINAL
};
/** /**
Expand the wildcard by matching against the filesystem. Expand the wildcard by matching against the filesystem.