fix handling of non-ASCII chars in C locale

The relevant standards allow the mbtowc/mbrtowc functions to reject non-ASCII characters (i.e., chars with the high bit set) when the locale is C or POSIX. The BSD libraries (e.g., on OS X) don't do this but the GNU libraries (e.g., on Linux) do. Like most programs we need the C/POSIX locales to allow arbitrary bytes. So explicitly check if we're in a single-byte locale (which would also include ISO-8859 variants) and simply pass-thru the chars without encoding or decoding. Fixes #2802.
2025-01-13 05:28:49 +00:00 · 2016-03-10 18:17:39 -08:00 · 2016-03-10 18:17:39 -08:00 · c2f1df1d4a
commit c2f1df1d4a
parent fb0921249f
14 changed files with 215 additions and 165 deletions
--- a/src/builtin.cpp
+++ b/src/builtin.cpp
@ -1907,18 +1907,18 @@ static int builtin_echo(parser_t &parser, io_streams_t &streams, wchar_t **argv)
    return STATUS_BUILTIN_OK;
 }
-/** The pwd builtin. We don't respect -P to resolve symbolic links because we try to always resolve them. */
+// The pwd builtin. We don't respect -P to resolve symbolic links because we
 // try to always resolve them.
 static int builtin_pwd(parser_t &parser, io_streams_t &streams, wchar_t **argv)
 {
-    wchar_t dir_path[4096];
+    wcstring res = wgetcwd();
-    wchar_t *res = wgetcwd(dir_path, 4096);
+    if (res.empty())
    if (res == NULL)
    {
        return STATUS_BUILTIN_ERROR;
    }
    else
    {
-        streams.out.append(dir_path);
+        streams.out.append(res);
        streams.out.push_back(L'\n');
        return STATUS_BUILTIN_OK;
    }
@ -2700,7 +2700,6 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
        while (1)
        {
            int finished = 0;
            wchar_t res = 0;
            mbstate_t state = {};
@ -2713,24 +2712,26 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
                    break;
                }
                if (MB_CUR_MAX == 1) // single-byte locale
                {
                    res = (unsigned char)b;
                    finished = 1;
                }
                else {
                    size_t sz = mbrtowc(&res, &b, 1, &state);
                    switch (sz)
                    {
-                    case (size_t)(-1):
+                        case (size_t)-1:
-                        memset(&state, '\0', sizeof(state));
+                            memset(&state, 0, sizeof(state));
                            break;
-                    case (size_t)(-2):
+                        case (size_t)-2:
                        break;
                    case 0:
                        finished = 1;
                            break;
                        default:
                            finished = 1;
                            break;
-
+                    }
                }
            }
--- a/src/common.cpp
+++ b/src/common.cpp
@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f)
    {
        errno=0;
-        c = getwc(f);
+        c = fgetwc(f);
        if (errno == EILSEQ || errno == EINTR)
        {
            continue;
@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
    wcstring result;
    result.reserve(in_len);
    mbstate_t state = {};
    size_t in_pos = 0;
    if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
    {
        while (in_pos < in_len)
        {
            result.push_back((unsigned char)in[in_pos]);
            in_pos++;
        }
        return result;
    }
    mbstate_t state = {};
    while (in_pos < in_len)
    {
        wchar_t wc = 0;
@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
        {
            use_encode_direct = true;
        }
-        else if (ret == (size_t)(-2))
+        else if (ret == (size_t)-2)
        {
            /* Incomplete sequence */
            use_encode_direct = true;
        }
-        else if (ret == (size_t)(-1))
+        else if (ret == (size_t)-1)
        {
            /* Invalid data */
            use_encode_direct = true;
@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input)
    std::string result;
    result.reserve(input.size());
-    mbstate_t state;
+    mbstate_t state = {};
    memset(&state, 0, sizeof(state));
    char converted[MB_LEN_MAX + 1];
    for (size_t i=0; i < input.size(); i++)
@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input)
        wchar_t wc = input[i];
        if (wc == INTERNAL_SEPARATOR)
        {
            // Do nothing.
        }
-        else if ((wc >= ENCODE_DIRECT_BASE) &&
+        else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256)
                 (wc < ENCODE_DIRECT_BASE+256))
        {
            result.push_back(wc - ENCODE_DIRECT_BASE);
        }
        else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
        {
            // If `wc` contains a wide character we emit a question-mark.
            if (wc & ~0xFF)
            {
                wc = '?';
            }
            converted[0] = wc;
            result.append(converted, 1);
        }
        else
        {
            memset(converted, 0, sizeof converted);
@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input)
 */
 static char *wcs2str_internal(const wchar_t *in, char *out)
 {
    size_t res=0;
    size_t in_pos=0;
    size_t out_pos = 0;
    mbstate_t state;
    CHECK(in, 0);
    CHECK(out, 0);
-    memset(&state, 0, sizeof(state));
+    size_t in_pos = 0;
    size_t out_pos = 0;
    mbstate_t state = {};
    while (in[in_pos])
    {
        if (in[in_pos] == INTERNAL_SEPARATOR)
        {
            // Do nothing.
        }
-        else if ((in[in_pos] >= ENCODE_DIRECT_BASE) &&
+        else if (in[in_pos] >= ENCODE_DIRECT_BASE &&
-                 (in[in_pos] < ENCODE_DIRECT_BASE+256))
+                 in[in_pos] < ENCODE_DIRECT_BASE + 256)
        {
            out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE;
        }
        else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
        {
            // If `wc` contains a wide character we emit a question-mark.
            if (in[in_pos] & ~0xFF)
            {
                out[out_pos++] = '?';
            }
            else
            {
-            res = wcrtomb(&out[out_pos], in[in_pos], &state);
+                out[out_pos++] = (unsigned char)in[in_pos];
-
+            }
-            if (res == (size_t)(-1))
+        }
        else
        {
            size_t len = wcrtomb(&out[out_pos], in[in_pos], &state);
            if (len == (size_t)-1)
            {
                debug(1, L"Wide character %d has no narrow representation", in[in_pos]);
                memset(&state, 0, sizeof(state));
            }
            else
            {
-                out_pos += res;
+                out_pos += len;
            }
        }
        in_pos++;
--- a/src/env.cpp
+++ b/src/env.cpp
@ -377,14 +377,13 @@ static void setup_path()
 int env_set_pwd()
 {
-    wchar_t dir_path[4096];
+    wcstring res = wgetcwd();
-    wchar_t *res = wgetcwd(dir_path, 4096);
+    if (res.empty())
    if (!res)
    {
        debug(0, _(L"Could not determine current working directory. Is your locale set correctly?"));
        return 0;
    }
-    env_set(L"PWD", dir_path, ENV_EXPORT | ENV_GLOBAL);
+    env_set(L"PWD", res.c_str(), ENV_EXPORT | ENV_GLOBAL);
    return 1;
 }
--- a/src/fallback.cpp
+++ b/src/fallback.cpp
@ -606,7 +606,7 @@ static FILE *fw_data;
 static void fw_writer(wchar_t c)
 {
-    putwc(c, fw_data);
+    fputwc(c, fw_data);
 }
 /*
@ -648,33 +648,30 @@ int wprintf(const wchar_t *filter, ...)
 #endif
 #ifndef HAVE_FGETWC
 wint_t fgetwc(FILE *stream)
 {
-    wchar_t res=0;
+    wchar_t res;
-    mbstate_t state;
+    mbstate_t state = {};
    memset(&state, '\0', sizeof(state));
    while (1)
    {
        int b = fgetc(stream);
        char bb;
        int sz;
        if (b == EOF)
        {
            return WEOF;
        }
-        bb=b;
+        if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
-
+        {
-        sz = mbrtowc(&res, &bb, 1, &state);
+            return b;
        }
        char bb = b;
        size_t sz = mbrtowc(&res, &bb, 1, &state);
        switch (sz)
        {
            case -1:
                memset(&state, '\0', sizeof(state));
                return WEOF;
            case -2:
                break;
            case 0:
@ -683,35 +680,40 @@ wint_t fgetwc(FILE *stream)
                return res;
        }
    }
 }
 wint_t getwc(FILE *stream)
 {
    return fgetwc(stream);
 }
 #endif
 #ifndef HAVE_FPUTWC
 wint_t fputwc(wchar_t wc, FILE *stream)
 {
-    int res;
+    int res = 0;
-    char s[MB_CUR_MAX+1];
+    mbstate_t state = {};
-    memset(s, 0, MB_CUR_MAX+1);
+    char s[MB_CUR_MAX + 1] = {};
-    wctomb(s, wc);
+
    if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
    {
        // If `wc` contains a wide character we emit a question-mark.
        if (wc & ~0xFF)
        {
            wc = '?';
        }
        s[0] = (char)wc;
        res = fputs(s, stream);
    }
    else
    {
        size_t len = wcrtomb(s, wc, &state);
        if (len == (size_t)-1)
        {
            debug(1, L"Wide character %d has no narrow representation", wc);
        }
        else {
            res = fputs(s, stream);
        }
    }
    return res == EOF ? WEOF : wc;
 }
 wint_t putwc(wchar_t wc, FILE *stream)
 {
    return fputwc(wc, stream);
 }
 #endif
 #ifndef HAVE_WCSTOK
--- a/src/fallback.h
+++ b/src/fallback.h
@ -158,29 +158,13 @@ int vswprintf(wchar_t *out, size_t n, const wchar_t *filter, va_list va);
 #endif
 #ifndef HAVE_FGETWC
-/**
+// Fallback implementation of fgetwc.
   Fallback implementation of fgetwc
 */
 wint_t fgetwc(FILE *stream);
 /**
   Fallback implementation of getwc
 */
 wint_t getwc(FILE *stream);
 #endif
 #ifndef HAVE_FPUTWC
-
+// Fallback implementation of fputwc.
 /**
   Fallback implementation of fputwc
 */
 wint_t fputwc(wchar_t wc, FILE *stream);
 /**
   Fallback implementation of putwc
 */
 wint_t putwc(wchar_t wc, FILE *stream);
 #endif
 #ifndef HAVE_WCSTOK
--- a/src/history.cpp
+++ b/src/history.cpp
@ -927,9 +927,8 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
    const char *end = begin + length;
    const char *pos = begin;
    bool was_backslash = 0;
    wcstring out;
    bool was_backslash = false;
    bool first_char = true;
    bool timestamp_mode = false;
    time_t timestamp = 0;
@ -937,12 +936,18 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
    while (1)
    {
        wchar_t c;
        mbstate_t state;
        size_t res;
        mbstate_t state = {};
-        memset(&state, 0, sizeof(state));
+        if (MB_CUR_MAX == 1) // single-byte locale
-
+        {
            c = (unsigned char)*pos;
            res = 1;
        }
        else
        {
            res = mbrtowc(&c, pos, end - pos, &state);
        }
        if (res == (size_t)-1)
        {
--- a/src/input_common.cpp
+++ b/src/input_common.cpp
@ -263,16 +263,17 @@ wchar_t input_common_readch(int timed)
        while (1)
        {
            wint_t b = readb();
            char bb;
-            size_t sz;
+            if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
            {
                return (unsigned char)b;
            }
            if ((b >= R_NULL) && (b < R_NULL + 1000))
                return b;
-            bb=b;
+            char bb = b;
-
+            size_t sz = mbrtowc(&res, &bb, 1, &state);
            sz = mbrtowc(&res, &bb, 1, &state);
            switch (sz)
            {
--- a/src/output.cpp
+++ b/src/output.cpp
@ -386,32 +386,35 @@ int writeb(tputs_arg_t b)
 int writech(wint_t ch)
 {
    mbstate_t state;
    size_t i;
    char buff[MB_LEN_MAX+1];
-    size_t bytes;
+    size_t len;
-    if ((ch >= ENCODE_DIRECT_BASE) &&
+    if (ch >= ENCODE_DIRECT_BASE && ch < ENCODE_DIRECT_BASE + 256)
            (ch < ENCODE_DIRECT_BASE+256))
    {
        buff[0] = ch - ENCODE_DIRECT_BASE;
-        bytes=1;
+        len = 1;
    }
    else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
    {
        // If `wc` contains a wide character we emit a question-mark.
        if (ch & ~0xFF)
        {
            ch = '?';
        }
        buff[0] = ch;
        len = 1;
    }
    else
    {
-        memset(&state, 0, sizeof(state));
+        mbstate_t state = {};
-        bytes= wcrtomb(buff, ch, &state);
+        len = wcrtomb(buff, ch, &state);
-
+        if (len == (size_t)-1)
        switch (bytes)
        {
            case (size_t)(-1):
        {
            return 1;
        }
    }
    }
-    for (i=0; i<bytes; i++)
+    for (size_t i = 0; i < len; i++)
    {
        out(buff[i]);
    }
@ -420,29 +423,26 @@ int writech(wint_t ch)
 void writestr(const wchar_t *str)
 {
    char *pos;
    CHECK(str,);
-    //  while( *str )
+    if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
-    //    writech( *str++ );
+    {
-
+        while( *str )
-    /*
+        {
-       Check amount of needed space
+            writech( *str++ );
-       */
+        }
-    size_t len = wcstombs(0, str, 0);
+        return;
    }
    size_t len = wcstombs(0, str, 0);  // figure amount of space needed
    if (len == (size_t)-1)
    {
        debug(1, L"Tried to print invalid wide character string");
        return;
    }
    // Convert the string.
    len++;
    /*
       Convert
       */
    char *buffer, static_buffer[256];
    if (len <= sizeof static_buffer)
        buffer = static_buffer;
@ -456,7 +456,7 @@ void writestr(const wchar_t *str)
    /*
       Write
       */
-    for (pos = buffer; *pos; pos++)
+    for (char *pos = buffer; *pos; pos++)
    {
        out(*pos);
    }
--- a/src/wutil.cpp
+++ b/src/wutil.cpp
@ -145,30 +145,23 @@ bool wreaddir_for_dirs(DIR *dir, wcstring *out_name)
 }
-wchar_t *wgetcwd(wchar_t *buff, size_t sz)
+const wcstring wgetcwd()
 {
-    char *buffc = (char *)malloc(sz*MAX_UTF8_BYTES);
+    wcstring retval;
    char *res;
    wchar_t *ret = 0;
-    if (!buffc)
+    char *res = getcwd(NULL, 0);
    {
        errno = ENOMEM;
        return 0;
    }
    res = getcwd(buffc, sz*MAX_UTF8_BYTES);
    if (res)
    {
-        if ((size_t)-1 != mbstowcs(buff, buffc, sizeof(wchar_t) * sz))
+        retval = str2wcstring(res);
        free(res);
    }
    else
    {
-            ret = buff;
+        debug(0, _(L"getcwd() failed with errno %d/%s"), errno, strerror(errno));
-        }
+        retval = wcstring();
    }
-    free(buffc);
+    return retval;
    return ret;
 }
 int wchdir(const wcstring &dir)
--- a/src/wutil.h
+++ b/src/wutil.h
@ -70,10 +70,8 @@ void safe_perror(const char *message);
 */
 const char *safe_strerror(int err);
-/**
+// Wide character version of getcwd().
-   Wide character version of getcwd().
+const wcstring wgetcwd();
 */
 wchar_t *wgetcwd(wchar_t *buff, size_t sz);
 /**
   Wide character version of chdir()
--- a/tests/c-locale.err
+++ b/tests/c-locale.err
--- a/tests/c-locale.in
+++ b/tests/c-locale.in
@ -0,0 +1,35 @@
 # Verify that fish can pass through non-ASCII characters in the C/POSIX
 # locale. This is to prevent regression of
 # https://github.com/fish-shell/fish-shell/issues/2802.
 #
 # These tests are needed because the relevant standards allow the functions
 # mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid
 # or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid.
 # Other libc implementations (e.g., BSD) treat them as valid. We want fish to
 # always treat those bytes as valid.
 # The fish in the middle of the pipeline should be receiving a UTF-8 encoded
 # version of the unicode from the echo. It should pass those bytes thru
 # literally since it is in the C locale. We verify this by first passing the
 # echo output directly to the `xxd` program then via a fish instance. The
 # output should be "58c3bb58" for the first statement and "58c3bc58" for the
 # second.
 echo -n X\u00fbX | \
  xxd --plain
 echo X\u00fcX | env LC_ALL=C ../test/root/bin/fish -c 'read foo; echo -n $foo' | \
  xxd --plain
 # This test is subtle. Despite the presence of the \u00fc unicode char (a "u"
 # with an umlaut) the fact the locale is C/POSIX will cause the \xfc byte to
 # be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because the
 # few single-byte unicode chars (that are not ASCII) are generally in the
 # ISO-8859-1 char set which is encompased by the C locale. The output should
 # be "59fc59".
 env LC_ALL=C ../test/root/bin/fish -c 'echo -n Y\u00fcY' | \
  xxd --plain
 # The user can specify a wide unicode character (one requiring more than a
 # single byte). In the C/POSIX locales we substitute a question-mark for the
 # unencodable wide char. The output should be "543f54".
 env LC_ALL=C ../test/root/bin/fish -c 'echo -n T\u01fdT' | \
  xxd --plain
--- a/tests/c-locale.out
+++ b/tests/c-locale.out
@ -0,0 +1,4 @@
 58c3bb58
 58c3bc58
 59fc59
 543f54
--- a/tests/c-locale.status
+++ b/tests/c-locale.status
@ -0,0 +1 @@
 0
+c3bb58
+c3bc58
+fc59
+f54