fix handling of non-ASCII chars in C locale

The relevant standards allow the mbtowc/mbrtowc functions to reject
non-ASCII characters (i.e., chars with the high bit set) when the locale
is C or POSIX.  The BSD libraries (e.g., on OS X) don't do this but
the GNU libraries (e.g., on Linux) do. Like most programs we need the
C/POSIX locales to allow arbitrary bytes. So explicitly check whether we're
in a single-byte locale (which would also include ISO-8859 variants)
and simply pass the chars through without encoding or decoding.

Fixes #2802.
This commit is contained in:
Kurtis Rader 2016-03-10 18:17:39 -08:00
parent fb0921249f
commit c2f1df1d4a
14 changed files with 215 additions and 165 deletions

View file

@ -1907,18 +1907,18 @@ static int builtin_echo(parser_t &parser, io_streams_t &streams, wchar_t **argv)
return STATUS_BUILTIN_OK; return STATUS_BUILTIN_OK;
} }
/** The pwd builtin. We don't respect -P to resolve symbolic links because we try to always resolve them. */ // The pwd builtin. We don't respect -P to resolve symbolic links because we
// try to always resolve them.
static int builtin_pwd(parser_t &parser, io_streams_t &streams, wchar_t **argv) static int builtin_pwd(parser_t &parser, io_streams_t &streams, wchar_t **argv)
{ {
wchar_t dir_path[4096]; wcstring res = wgetcwd();
wchar_t *res = wgetcwd(dir_path, 4096); if (res.empty())
if (res == NULL)
{ {
return STATUS_BUILTIN_ERROR; return STATUS_BUILTIN_ERROR;
} }
else else
{ {
streams.out.append(dir_path); streams.out.append(res);
streams.out.push_back(L'\n'); streams.out.push_back(L'\n');
return STATUS_BUILTIN_OK; return STATUS_BUILTIN_OK;
} }
@ -2699,9 +2699,8 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
while (1) while (1)
{ {
int finished=0; int finished = 0;
wchar_t res = 0;
wchar_t res=0;
mbstate_t state = {}; mbstate_t state = {};
while (!finished) while (!finished)
@ -2713,24 +2712,26 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
break; break;
} }
size_t sz = mbrtowc(&res, &b, 1, &state); if (MB_CUR_MAX == 1) // single-byte locale
switch (sz)
{ {
case (size_t)(-1): res = (unsigned char)b;
memset(&state, '\0', sizeof(state)); finished = 1;
break; }
else {
size_t sz = mbrtowc(&res, &b, 1, &state);
switch (sz)
{
case (size_t)-1:
memset(&state, 0, sizeof(state));
break;
case (size_t)(-2): case (size_t)-2:
break; break;
case 0:
finished = 1;
break;
default:
finished=1;
break;
default:
finished = 1;
break;
}
} }
} }

View file

@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f)
{ {
errno=0; errno=0;
c = getwc(f); c = fgetwc(f);
if (errno == EILSEQ || errno == EINTR) if (errno == EILSEQ || errno == EINTR)
{ {
continue; continue;
@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
wcstring result; wcstring result;
result.reserve(in_len); result.reserve(in_len);
mbstate_t state = {};
size_t in_pos = 0; size_t in_pos = 0;
if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
{
while (in_pos < in_len)
{
result.push_back((unsigned char)in[in_pos]);
in_pos++;
}
return result;
}
mbstate_t state = {};
while (in_pos < in_len) while (in_pos < in_len)
{ {
wchar_t wc = 0; wchar_t wc = 0;
@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
{ {
use_encode_direct = true; use_encode_direct = true;
} }
else if (ret == (size_t)(-2)) else if (ret == (size_t)-2)
{ {
/* Incomplete sequence */ /* Incomplete sequence */
use_encode_direct = true; use_encode_direct = true;
} }
else if (ret == (size_t)(-1)) else if (ret == (size_t)-1)
{ {
/* Invalid data */ /* Invalid data */
use_encode_direct = true; use_encode_direct = true;
@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input)
std::string result; std::string result;
result.reserve(input.size()); result.reserve(input.size());
mbstate_t state; mbstate_t state = {};
memset(&state, 0, sizeof(state));
char converted[MB_LEN_MAX + 1]; char converted[MB_LEN_MAX + 1];
for (size_t i=0; i < input.size(); i++) for (size_t i=0; i < input.size(); i++)
@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input)
wchar_t wc = input[i]; wchar_t wc = input[i];
if (wc == INTERNAL_SEPARATOR) if (wc == INTERNAL_SEPARATOR)
{ {
// Do nothing.
} }
else if ((wc >= ENCODE_DIRECT_BASE) && else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256)
(wc < ENCODE_DIRECT_BASE+256))
{ {
result.push_back(wc - ENCODE_DIRECT_BASE); result.push_back(wc - ENCODE_DIRECT_BASE);
} }
else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{
// If `wc` contains a wide character we emit a question-mark.
if (wc & ~0xFF)
{
wc = '?';
}
converted[0] = wc;
result.append(converted, 1);
}
else else
{ {
memset(converted, 0, sizeof converted); memset(converted, 0, sizeof converted);
@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input)
*/ */
static char *wcs2str_internal(const wchar_t *in, char *out) static char *wcs2str_internal(const wchar_t *in, char *out)
{ {
size_t res=0;
size_t in_pos=0;
size_t out_pos = 0;
mbstate_t state;
CHECK(in, 0); CHECK(in, 0);
CHECK(out, 0); CHECK(out, 0);
memset(&state, 0, sizeof(state)); size_t in_pos = 0;
size_t out_pos = 0;
mbstate_t state = {};
while (in[in_pos]) while (in[in_pos])
{ {
if (in[in_pos] == INTERNAL_SEPARATOR) if (in[in_pos] == INTERNAL_SEPARATOR)
{ {
// Do nothing.
} }
else if ((in[in_pos] >= ENCODE_DIRECT_BASE) && else if (in[in_pos] >= ENCODE_DIRECT_BASE &&
(in[in_pos] < ENCODE_DIRECT_BASE+256)) in[in_pos] < ENCODE_DIRECT_BASE + 256)
{ {
out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE; out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE;
} }
else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{
// If `in[in_pos]` does not fit in a single byte we emit a question-mark.
if (in[in_pos] & ~0xFF)
{
out[out_pos++] = '?';
}
else
{
out[out_pos++] = (unsigned char)in[in_pos];
}
}
else else
{ {
res = wcrtomb(&out[out_pos], in[in_pos], &state); size_t len = wcrtomb(&out[out_pos], in[in_pos], &state);
if (len == (size_t)-1)
if (res == (size_t)(-1))
{ {
debug(1, L"Wide character %d has no narrow representation", in[in_pos]); debug(1, L"Wide character %d has no narrow representation", in[in_pos]);
memset(&state, 0, sizeof(state)); memset(&state, 0, sizeof(state));
} }
else else
{ {
out_pos += res; out_pos += len;
} }
} }
in_pos++; in_pos++;

View file

@ -377,14 +377,13 @@ static void setup_path()
int env_set_pwd() int env_set_pwd()
{ {
wchar_t dir_path[4096]; wcstring res = wgetcwd();
wchar_t *res = wgetcwd(dir_path, 4096); if (res.empty())
if (!res)
{ {
debug(0, _(L"Could not determine current working directory. Is your locale set correctly?")); debug(0, _(L"Could not determine current working directory. Is your locale set correctly?"));
return 0; return 0;
} }
env_set(L"PWD", dir_path, ENV_EXPORT | ENV_GLOBAL); env_set(L"PWD", res.c_str(), ENV_EXPORT | ENV_GLOBAL);
return 1; return 1;
} }

View file

@ -606,7 +606,7 @@ static FILE *fw_data;
static void fw_writer(wchar_t c) static void fw_writer(wchar_t c)
{ {
putwc(c, fw_data); fputwc(c, fw_data);
} }
/* /*
@ -648,33 +648,30 @@ int wprintf(const wchar_t *filter, ...)
#endif #endif
#ifndef HAVE_FGETWC #ifndef HAVE_FGETWC
wint_t fgetwc(FILE *stream) wint_t fgetwc(FILE *stream)
{ {
wchar_t res=0; wchar_t res;
mbstate_t state; mbstate_t state = {};
memset(&state, '\0', sizeof(state));
while (1) while (1)
{ {
int b = fgetc(stream); int b = fgetc(stream);
char bb;
int sz;
if (b == EOF) if (b == EOF)
{
return WEOF; return WEOF;
}
bb=b; if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
{
sz = mbrtowc(&res, &bb, 1, &state); return b;
}
char bb = b;
size_t sz = mbrtowc(&res, &bb, 1, &state);
switch (sz) switch (sz)
{ {
case -1: case -1:
memset(&state, '\0', sizeof(state));
return WEOF; return WEOF;
case -2: case -2:
break; break;
case 0: case 0:
@ -683,35 +680,40 @@ wint_t fgetwc(FILE *stream)
return res; return res;
} }
} }
} }
wint_t getwc(FILE *stream)
{
return fgetwc(stream);
}
#endif #endif
#ifndef HAVE_FPUTWC #ifndef HAVE_FPUTWC
wint_t fputwc(wchar_t wc, FILE *stream) wint_t fputwc(wchar_t wc, FILE *stream)
{ {
int res; int res = 0;
char s[MB_CUR_MAX+1]; mbstate_t state = {};
memset(s, 0, MB_CUR_MAX+1); char s[MB_CUR_MAX + 1] = {};
wctomb(s, wc);
res = fputs(s, stream);
return res==EOF?WEOF:wc;
}
wint_t putwc(wchar_t wc, FILE *stream) if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{ {
return fputwc(wc, stream); // If `wc` contains a wide character we emit a question-mark.
} if (wc & ~0xFF)
{
wc = '?';
}
s[0] = (char)wc;
res = fputs(s, stream);
}
else
{
size_t len = wcrtomb(s, wc, &state);
if (len == (size_t)-1)
{
debug(1, L"Wide character %d has no narrow representation", wc);
}
else {
res = fputs(s, stream);
}
}
return res == EOF ? WEOF : wc;
}
#endif #endif
#ifndef HAVE_WCSTOK #ifndef HAVE_WCSTOK

View file

@ -158,29 +158,13 @@ int vswprintf(wchar_t *out, size_t n, const wchar_t *filter, va_list va);
#endif #endif
#ifndef HAVE_FGETWC #ifndef HAVE_FGETWC
/** // Fallback implementation of fgetwc.
Fallback implementation of fgetwc
*/
wint_t fgetwc(FILE *stream); wint_t fgetwc(FILE *stream);
/**
Fallback implementation of getwc
*/
wint_t getwc(FILE *stream);
#endif #endif
#ifndef HAVE_FPUTWC #ifndef HAVE_FPUTWC
// Fallback implementation of fputwc.
/**
Fallback implementation of fputwc
*/
wint_t fputwc(wchar_t wc, FILE *stream); wint_t fputwc(wchar_t wc, FILE *stream);
/**
Fallback implementation of putwc
*/
wint_t putwc(wchar_t wc, FILE *stream);
#endif #endif
#ifndef HAVE_WCSTOK #ifndef HAVE_WCSTOK

View file

@ -926,10 +926,9 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
{ {
const char *end = begin + length; const char *end = begin + length;
const char *pos=begin; const char *pos = begin;
bool was_backslash = 0;
wcstring out; wcstring out;
bool was_backslash = false;
bool first_char = true; bool first_char = true;
bool timestamp_mode = false; bool timestamp_mode = false;
time_t timestamp = 0; time_t timestamp = 0;
@ -937,12 +936,18 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
while (1) while (1)
{ {
wchar_t c; wchar_t c;
mbstate_t state;
size_t res; size_t res;
mbstate_t state = {};
memset(&state, 0, sizeof(state)); if (MB_CUR_MAX == 1) // single-byte locale
{
res = mbrtowc(&c, pos, end-pos, &state); c = (unsigned char)*pos;
res = 1;
}
else
{
res = mbrtowc(&c, pos, end - pos, &state);
}
if (res == (size_t)-1) if (res == (size_t)-1)
{ {

View file

@ -263,16 +263,17 @@ wchar_t input_common_readch(int timed)
while (1) while (1)
{ {
wint_t b = readb(); wint_t b = readb();
char bb;
size_t sz; if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
{
return (unsigned char)b;
}
if ((b >= R_NULL) && (b < R_NULL + 1000)) if ((b >= R_NULL) && (b < R_NULL + 1000))
return b; return b;
bb=b; char bb = b;
size_t sz = mbrtowc(&res, &bb, 1, &state);
sz = mbrtowc(&res, &bb, 1, &state);
switch (sz) switch (sz)
{ {

View file

@ -386,32 +386,35 @@ int writeb(tputs_arg_t b)
int writech(wint_t ch) int writech(wint_t ch)
{ {
mbstate_t state;
size_t i;
char buff[MB_LEN_MAX+1]; char buff[MB_LEN_MAX+1];
size_t bytes; size_t len;
if ((ch >= ENCODE_DIRECT_BASE) && if (ch >= ENCODE_DIRECT_BASE && ch < ENCODE_DIRECT_BASE + 256)
(ch < ENCODE_DIRECT_BASE+256))
{ {
buff[0] = ch - ENCODE_DIRECT_BASE; buff[0] = ch - ENCODE_DIRECT_BASE;
bytes=1; len = 1;
}
else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
{
// If `ch` does not fit in a single byte we emit a question-mark.
if (ch & ~0xFF)
{
ch = '?';
}
buff[0] = ch;
len = 1;
} }
else else
{ {
memset(&state, 0, sizeof(state)); mbstate_t state = {};
bytes= wcrtomb(buff, ch, &state); len = wcrtomb(buff, ch, &state);
if (len == (size_t)-1)
switch (bytes)
{ {
case (size_t)(-1): return 1;
{
return 1;
}
} }
} }
for (i=0; i<bytes; i++) for (size_t i = 0; i < len; i++)
{ {
out(buff[i]); out(buff[i]);
} }
@ -420,29 +423,26 @@ int writech(wint_t ch)
void writestr(const wchar_t *str) void writestr(const wchar_t *str)
{ {
char *pos;
CHECK(str,); CHECK(str,);
// while( *str ) if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
// writech( *str++ ); {
while( *str )
/* {
Check amount of needed space writech( *str++ );
*/ }
size_t len = wcstombs(0, str, 0); return;
}
size_t len = wcstombs(0, str, 0); // figure amount of space needed
if (len == (size_t)-1) if (len == (size_t)-1)
{ {
debug(1, L"Tried to print invalid wide character string"); debug(1, L"Tried to print invalid wide character string");
return; return;
} }
// Convert the string.
len++; len++;
/*
Convert
*/
char *buffer, static_buffer[256]; char *buffer, static_buffer[256];
if (len <= sizeof static_buffer) if (len <= sizeof static_buffer)
buffer = static_buffer; buffer = static_buffer;
@ -456,7 +456,7 @@ void writestr(const wchar_t *str)
/* /*
Write Write
*/ */
for (pos = buffer; *pos; pos++) for (char *pos = buffer; *pos; pos++)
{ {
out(*pos); out(*pos);
} }

View file

@ -145,30 +145,23 @@ bool wreaddir_for_dirs(DIR *dir, wcstring *out_name)
} }
wchar_t *wgetcwd(wchar_t *buff, size_t sz) const wcstring wgetcwd()
{ {
char *buffc = (char *)malloc(sz*MAX_UTF8_BYTES); wcstring retval;
char *res;
wchar_t *ret = 0;
if (!buffc) char *res = getcwd(NULL, 0);
{
errno = ENOMEM;
return 0;
}
res = getcwd(buffc, sz*MAX_UTF8_BYTES);
if (res) if (res)
{ {
if ((size_t)-1 != mbstowcs(buff, buffc, sizeof(wchar_t) * sz)) retval = str2wcstring(res);
{ free(res);
ret = buff; }
} else
{
debug(0, _(L"getcwd() failed with errno %d/%s"), errno, strerror(errno));
retval = wcstring();
} }
free(buffc); return retval;
return ret;
} }
int wchdir(const wcstring &dir) int wchdir(const wcstring &dir)

View file

@ -70,10 +70,8 @@ void safe_perror(const char *message);
*/ */
const char *safe_strerror(int err); const char *safe_strerror(int err);
/** // Wide character version of getcwd().
Wide character version of getcwd(). const wcstring wgetcwd();
*/
wchar_t *wgetcwd(wchar_t *buff, size_t sz);
/** /**
Wide character version of chdir() Wide character version of chdir()

0
tests/c-locale.err Normal file
View file

35
tests/c-locale.in Normal file
View file

@ -0,0 +1,35 @@
# Verify that fish can pass through non-ASCII characters in the C/POSIX
# locale. This is to prevent regression of
# https://github.com/fish-shell/fish-shell/issues/2802.
#
# These tests are needed because the relevant standards allow the functions
# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid
# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid.
# Other libc implementations (e.g., BSD) treat them as valid. We want fish to
# always treat those bytes as valid.
# The fish in the middle of the pipeline should be receiving a UTF-8 encoded
# version of the Unicode character from the echo. It should pass those bytes
# through literally since it is in the C locale. We verify this by first
# passing the echo output directly to the `xxd` program, then via a fish
# instance. The output should be "58c3bb58" for the first statement and
# "58c3bc58" for the second.
echo -n X\u00fbX | \
xxd --plain
echo X\u00fcX | env LC_ALL=C ../test/root/bin/fish -c 'read foo; echo -n $foo' | \
xxd --plain
# This test is subtle. Despite the presence of the \u00fc Unicode char (a "u"
# with an umlaut), the fact that the locale is C/POSIX will cause the \xfc
# byte to be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's
# because the few single-byte Unicode chars (that are not ASCII) are generally
# in the ISO-8859-1 char set, which is encompassed by the C locale. The output
# should be "59fc59".
env LC_ALL=C ../test/root/bin/fish -c 'echo -n Y\u00fcY' | \
xxd --plain
# The user can specify a wide Unicode character (one requiring more than a
# single byte). In the C/POSIX locales, we substitute a question-mark for the
# unencodable wide char. The output should be "543f54".
env LC_ALL=C ../test/root/bin/fish -c 'echo -n T\u01fdT' | \
xxd --plain

4
tests/c-locale.out Normal file
View file

@ -0,0 +1,4 @@
58c3bb58
58c3bc58
59fc59
543f54

1
tests/c-locale.status Normal file
View file

@ -0,0 +1 @@
0