reduce number of Unicode private-use characters

This narrows the range of Unicode codepoints fish reserves for its own
use from U+E000 thru U+F8FE (6399 codepoints) to U+F600 thru U+F73F (320
codepoints). This is still not ideal since fish shouldn't be using any
Unicode private-use codepoints but it's a step in the right direction.

This partially addresses issue #2684.
This commit is contained in:
Kurtis Rader 2016-01-21 19:56:39 -08:00
parent b41b962336
commit f2246dfb34
10 changed files with 184 additions and 186 deletions

View file

@ -215,6 +215,11 @@ cd /usr/local/bin
rm -f fish fish_indent
\endfish
<hr>
\section faq-reserved-chars Unicode private-use characters reserved by fish
Fish reserves the <a href="http://www.unicode.org/faq/private_use.html">Unicode private-use character range</a> from U+F600 thru U+F73F for internal use. Any attempt to feed characters in that range to fish will result in them being replaced by the Unicode "replacement character" U+FFFD. This includes both interactive input and any file read by fish (but not programs run by fish).
\htmlonly[block]
</div>
\endhtmlonly

View file

@ -35,34 +35,59 @@
typedef std::wstring wcstring;
typedef std::vector<wcstring> wcstring_list_t;
/**
Maximum number of bytes used by a single utf-8 character
*/
// Maximum number of bytes used by a single utf-8 character.
#define MAX_UTF8_BYTES 6
/**
This is in the unicode private use area.
*/
#define ENCODE_DIRECT_BASE 0xf100
/**
Highest legal ascii value
*/
// Highest legal ASCII value.
#define ASCII_MAX 127u
/**
Highest legal 16-bit unicode value
*/
#define UCS2_MAX 0xffffu
// Highest legal 16-bit Unicode value.
#define UCS2_MAX 0xFFFFu
/**
Highest legal byte value
*/
#define BYTE_MAX 0xffu
// Highest legal byte value.
#define BYTE_MAX 0xFFu
/** BOM value */
// Unicode BOM value.
#define UTF8_BOM_WCHAR 0xFEFFu
// Unicode replacement character.
#define REPLACEMENT_WCHAR 0xFFFDu
// Use Unicode "noncharacters" for internal characters as much as we can. This
// gives us 32 "characters" for internal use that we can guarantee should not
// appear in our input stream. See http://www.unicode.org/faq/private_use.html.
#define RESERVED_CHAR_BASE 0xFDD0u
#define RESERVED_CHAR_END 0xFDF0u
// Split the available noncharacter values into two ranges to ensure there are
// no conflicts among the places we use these special characters.
#define EXPAND_RESERVED_BASE RESERVED_CHAR_BASE
#define EXPAND_RESERVED_END (EXPAND_RESERVED_BASE + 16)
#define WILDCARD_RESERVED_BASE EXPAND_RESERVED_END
#define WILDCARD_RESERVED_END (WILDCARD_RESERVED_BASE + 16)
// Make sure the ranges defined above don't exceed the range for noncharacters.
// This is to make sure we didn't do something stupid in subdividing the
// Unicode range for our needs.
#if WILDCARD_RESERVED_END > RESERVED_CHAR_END
#error
#endif
// These are in the Unicode private-use range. We really shouldn't use this
// range but have little choice in the matter given how our lexer/parser works.
// We can't use non-characters for these two ranges because there are only 66 of
// them and we need at least 256 + 64.
//
// If sizeof(wchar_t) == 4 we could avoid using private-use chars; however, that
// would result in fish having different behavior on machines with 16 versus 32
// bit wchar_t. It's better that fish behave the same on both types of systems.
//
// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know
// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF)
// on Mac OS X. See http://www.unicode.org/faq/private_use.html.
#define ENCODE_DIRECT_BASE 0xF600u
#define ENCODE_DIRECT_END (ENCODE_DIRECT_BASE + 256)
#define INPUT_COMMON_BASE 0xF700u
#define INPUT_COMMON_END (INPUT_COMMON_BASE + 64)
/* Flags for unescape_string functions */
enum
{

View file

@ -26,92 +26,64 @@
// Bit flags that modify how an expansion is performed. Each enumerator
// occupies its own bit (1 << n), so several flags can be OR-ed together into
// a single integer value.
enum
{
// Flag specifying that cmdsubst expansion should be skipped.
EXPAND_SKIP_CMDSUBST = 1 << 0,
// Flag specifying that variable expansion should be skipped.
EXPAND_SKIP_VARIABLES = 1 << 1,
// Flag specifying that wildcard expansion should be skipped.
EXPAND_SKIP_WILDCARDS = 1 << 2,
// The expansion is being done for tab or auto completions. Returned
// completions may have the wildcard as a prefix instead of a match.
EXPAND_FOR_COMPLETIONS = 1 << 3,
// Only match files that are executable by the current user. Only
// applicable together with ACCEPT_INCOMPLETE.
EXECUTABLES_ONLY = 1 << 4,
// Only match directories. Only applicable together with ACCEPT_INCOMPLETE.
DIRECTORIES_ONLY = 1 << 5,
// Don't generate descriptions.
EXPAND_NO_DESCRIPTIONS = 1 << 6,
// Don't expand jobs (but you can still expand processes). This is because
// job expansion is not thread safe.
EXPAND_SKIP_JOBS = 1 << 7,
// Don't expand home directories.
EXPAND_SKIP_HOME_DIRECTORIES = 1 << 8,
// Allow fuzzy matching.
EXPAND_FUZZY_MATCH = 1 << 9,
// Disallow directory abbreviations like /u/l/b for /usr/local/bin. Only
// applicable if EXPAND_FUZZY_MATCH is set.
EXPAND_NO_FUZZY_DIRECTORIES = 1 << 10,
// Do expansions specifically to support cd (CDPATH, etc).
EXPAND_SPECIAL_CD = 1 << 11
};
typedef int expand_flags_t;
/**
Use unencoded private-use keycodes for internal characters
*/
#define EXPAND_RESERVED 0xf000
/**
End of range reserved for expand
*/
#define EXPAND_RESERVED_END 0xf000f
class completion_t;
enum
{
/** Character represeting a home directory */
HOME_DIRECTORY = EXPAND_RESERVED,
/** Character represeting process expansion */
// Character representing a home directory.
HOME_DIRECTORY = EXPAND_RESERVED_BASE,
// Character representing process expansion.
PROCESS_EXPAND,
/** Character representing variable expansion */
// Character representing variable expansion.
VARIABLE_EXPAND,
/** Character rpresenting variable expansion into a single element*/
// Character representing variable expansion into a single element.
VARIABLE_EXPAND_SINGLE,
/** Character representing the start of a bracket expansion */
// Character representing the start of a bracket expansion.
BRACKET_BEGIN,
/** Character representing the end of a bracket expansion */
// Character representing the end of a bracket expansion.
BRACKET_END,
/** Character representing separation between two bracket elements */
// Character representing separation between two bracket elements.
BRACKET_SEP,
/**
Separate subtokens in a token with this character.
*/
// Separate subtokens in a token with this character.
INTERNAL_SEPARATOR,
/**
Character representing an empty variable expansion.
Only used transitively while expanding variables.
*/
// Character representing an empty variable expansion. Only used
// transitively while expanding variables.
VARIABLE_EXPAND_EMPTY,
}
;
// This is a special pseudo-char that is not used other than to mark the
// end of the special characters so we can sanity check the enum range.
EXPAND_SENTINAL
};
/** These are the possible return values for expand_string. Note how zero value is the only error. */
enum expand_error_t

View file

@ -22,6 +22,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
#include "config.h"
#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
@ -63,6 +64,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
#include "input.h"
#include "io.h"
#include "fish_version.h"
#include "input_common.h"
#include "wildcard.h"
/* PATH_MAX may not exist */
#ifndef PATH_MAX
@ -484,6 +487,14 @@ int main(int argc, char **argv)
int res=1;
int my_optind=0;
// We can't do this at compile time due to the use of enum symbols.
assert(EXPAND_SENTINAL >= EXPAND_RESERVED_BASE &&
EXPAND_SENTINAL <= EXPAND_RESERVED_END);
assert(ANY_SENTINAL >= WILDCARD_RESERVED_BASE &&
ANY_SENTINAL <= WILDCARD_RESERVED_END);
assert(R_SENTINAL >= INPUT_COMMON_BASE &&
R_SENTINAL <= INPUT_COMMON_END);
set_main_thread();
setup_fork_guards();

View file

@ -335,7 +335,7 @@ static bool has_expand_reserved(const wcstring &str)
for (size_t i=0; i < str.size(); i++)
{
wchar_t wc = str.at(i);
if (wc >= EXPAND_RESERVED && wc <= EXPAND_RESERVED_END)
if (wc >= EXPAND_RESERVED_BASE && wc <= EXPAND_RESERVED_END)
{
result = true;
break;

View file

@ -16,77 +16,11 @@ inputrc information for key bindings.
#include "env.h"
#include "input_common.h"
#define DEFAULT_BIND_MODE L"default"
#define FISH_BIND_MODE_VAR L"fish_bind_mode"
/**
Key codes for inputrc-style keyboard functions that are passed on
to the caller of input_read().

The first enumerator is pinned to R_NULL+10 and every later one takes the
next consecutive value.

NOTE: IF YOU MODIFY THIS YOU MUST UPDATE THE name_arr AND code_arr VARIABLES TO MATCH!
*/
enum
{
R_BEGINNING_OF_LINE = R_NULL+10, /* This gives input_common ten slots for low-level keycodes */
R_END_OF_LINE,
R_FORWARD_CHAR,
R_BACKWARD_CHAR,
R_FORWARD_WORD,
R_BACKWARD_WORD,
R_FORWARD_BIGWORD,
R_BACKWARD_BIGWORD,
R_HISTORY_SEARCH_BACKWARD,
R_HISTORY_SEARCH_FORWARD,
R_DELETE_CHAR,
R_BACKWARD_DELETE_CHAR,
R_KILL_LINE,
R_YANK,
R_YANK_POP,
R_COMPLETE,
R_COMPLETE_AND_SEARCH,
R_BEGINNING_OF_HISTORY,
R_END_OF_HISTORY,
R_BACKWARD_KILL_LINE,
R_KILL_WHOLE_LINE,
R_KILL_WORD,
R_KILL_BIGWORD,
R_BACKWARD_KILL_WORD,
R_BACKWARD_KILL_PATH_COMPONENT,
R_BACKWARD_KILL_BIGWORD,
R_HISTORY_TOKEN_SEARCH_BACKWARD,
R_HISTORY_TOKEN_SEARCH_FORWARD,
R_SELF_INSERT,
R_TRANSPOSE_CHARS,
R_TRANSPOSE_WORDS,
R_UPCASE_WORD,
R_DOWNCASE_WORD,
R_CAPITALIZE_WORD,
R_VI_ARG_DIGIT,
R_VI_DELETE_TO,
R_EXECUTE,
R_BEGINNING_OF_BUFFER,
R_END_OF_BUFFER,
R_REPAINT,
R_FORCE_REPAINT,
R_UP_LINE,
R_DOWN_LINE,
R_SUPPRESS_AUTOSUGGESTION,
R_ACCEPT_AUTOSUGGESTION,
R_BEGIN_SELECTION,
R_END_SELECTION,
R_KILL_SELECTION,
R_FORWARD_JUMP,
R_BACKWARD_JUMP,
R_AND,
R_CANCEL
};
wcstring describe_char(wint_t c);
#define R_MIN R_NULL
#define R_MAX R_CANCEL
/**
Initialize the terminal by calling setupterm, and set up arrays
used by readch to detect escape sequences for special keys.

View file

@ -8,22 +8,77 @@ Header file for the low level input library
#include <stddef.h>
/**
Use unencoded private-use keycodes for internal characters
*/
#define INPUT_COMMON_RESERVED 0xe000
#include "common.h"
enum
{
/**
R_NULL is sometimes returned by the input when a character was
requested but none could be delivered, or when an exception
happened.
*/
R_NULL = INPUT_COMMON_RESERVED,
R_EOF
}
;
R_MIN = INPUT_COMMON_BASE,
// R_NULL is sometimes returned by the input when a character was requested
// but none could be delivered, or when an exception happened.
R_NULL = R_MIN,
R_EOF,
// Key codes for inputrc-style keyboard functions that are passed on
// to the caller of input_read().
//
// NOTE: If you modify this sequence of symbols you must update the
// name_arr, code_arr and desc_arr variables in input.cpp to match!
R_BEGINNING_OF_LINE,
R_END_OF_LINE,
R_FORWARD_CHAR,
R_BACKWARD_CHAR,
R_FORWARD_WORD,
R_BACKWARD_WORD,
R_FORWARD_BIGWORD,
R_BACKWARD_BIGWORD,
R_HISTORY_SEARCH_BACKWARD,
R_HISTORY_SEARCH_FORWARD,
R_DELETE_CHAR,
R_BACKWARD_DELETE_CHAR,
R_KILL_LINE,
R_YANK,
R_YANK_POP,
R_COMPLETE,
R_COMPLETE_AND_SEARCH,
R_BEGINNING_OF_HISTORY,
R_END_OF_HISTORY,
R_BACKWARD_KILL_LINE,
R_KILL_WHOLE_LINE,
R_KILL_WORD,
R_KILL_BIGWORD,
R_BACKWARD_KILL_WORD,
R_BACKWARD_KILL_PATH_COMPONENT,
R_BACKWARD_KILL_BIGWORD,
R_HISTORY_TOKEN_SEARCH_BACKWARD,
R_HISTORY_TOKEN_SEARCH_FORWARD,
R_SELF_INSERT,
R_TRANSPOSE_CHARS,
R_TRANSPOSE_WORDS,
R_UPCASE_WORD,
R_DOWNCASE_WORD,
R_CAPITALIZE_WORD,
R_VI_ARG_DIGIT,
R_VI_DELETE_TO,
R_EXECUTE,
R_BEGINNING_OF_BUFFER,
R_END_OF_BUFFER,
R_REPAINT,
R_FORCE_REPAINT,
R_UP_LINE,
R_DOWN_LINE,
R_SUPPRESS_AUTOSUGGESTION,
R_ACCEPT_AUTOSUGGESTION,
R_BEGIN_SELECTION,
R_END_SELECTION,
R_KILL_SELECTION,
R_FORWARD_JUMP,
R_BACKWARD_JUMP,
R_AND,
R_CANCEL,
R_MAX = R_CANCEL,
// This is a special pseudo-char that is not used other than to mark the
// end of the special characters so we can sanity check the enum range.
R_SENTINAL
};
/**
Init the library

View file

@ -2964,16 +2964,20 @@ static int can_read(int fd)
return select(fd + 1, &fds, 0, 0, &can_read_timeout) == 1;
}
// Test if the specified character is in a range that fish uses internally to
// store special tokens. Returns non-zero if `c` falls in any of these
// half-open ranges, zero otherwise:
//
//   [RESERVED_CHAR_BASE, RESERVED_CHAR_END)
//   [ENCODE_DIRECT_BASE, ENCODE_DIRECT_END)
//   [INPUT_COMMON_BASE,  INPUT_COMMON_END)
//
// The ranges come from named constants rather than a hard-coded span so that
// this check tracks the reserved-range definitions. U+F8FF must keep being
// allowed because that's the Apple symbol, which is in the OS X US keyboard
// layout.
//
// NOTE: This is used when tokenizing the input. It is also used when reading
// input, before tokenization, to replace such chars with REPLACEMENT_WCHAR if
// they're not part of a quoted string. We don't want external input to be able
// to feed reserved characters into our lexer/parser or code evaluator.
//
// TODO: Actually implement the replacement as documented above.
static int wchar_private(wchar_t c)
{
    return ((c >= RESERVED_CHAR_BASE && c < RESERVED_CHAR_END) ||
            (c >= ENCODE_DIRECT_BASE && c < ENCODE_DIRECT_END) ||
            (c >= INPUT_COMMON_BASE && c < INPUT_COMMON_END));
}
/**

View file

@ -632,8 +632,8 @@ void tokenizer_t::tok_next()
/*fwprintf( stderr, L"End of string\n" );*/
this->has_next = false;
break;
case 13: // carriage return
case L'\n':
case L'\r': // carriage-return
case L'\n': // newline
case L';':
this->last_type = TOK_END;
this->buff++;

View file

@ -18,27 +18,19 @@
#include "expand.h"
#include "complete.h"
/*
Use unencoded private-use keycodes for internal characters
*/
#define WILDCARD_RESERVED 0xf400
/**
Enumeration of all wildcard types
*/
// Enumeration of all wildcard types
enum
{
/** Character representing any character except '/' */
ANY_CHAR = WILDCARD_RESERVED,
/** Character representing any character string not containing '/' (A slash) */
// Character representing any character except '/' (slash).
ANY_CHAR = WILDCARD_RESERVED_BASE,
// Character representing any character string not containing '/' (slash).
ANY_STRING,
/** Character representing any character string */
// Character representing any character string.
ANY_STRING_RECURSIVE,
}
;
// This is a special pseudo-char that is not used other than to mark the
// end of the special characters so we can sanity check the enum range.
ANY_SENTINAL
};
/**
Expand the wildcard by matching against the filesystem.