fix handling of non-ASCII chars in C locale

The relevant standards allow the mbtowc/mbrtowc functions to reject non-ASCII characters (i.e., chars with the high bit set) when the locale is C or POSIX. The BSD libraries (e.g., on OS X) don't do this but the GNU libraries (e.g., on Linux) do. Like most programs we need the C/POSIX locales to allow arbitrary bytes. So explicitly check if we're in a single-byte locale (which would also include ISO-8859 variants) and simply pass-thru the chars without encoding or decoding. Fixes #2802.
2025-01-12 21:18:53 +00:00 · 2016-03-10 18:17:39 -08:00 · 2016-03-10 18:17:39 -08:00 · c2f1df1d4a
commit c2f1df1d4a
parent fb0921249f
14 changed files with 215 additions and 165 deletions
--- a/src/builtin.cpp
+++ b/src/builtin.cpp
@ -1907,18 +1907,18 @@ static int builtin_echo(parser_t &parser, io_streams_t &streams, wchar_t **argv)
    return STATUS_BUILTIN_OK;
 }

-/** The pwd builtin. We don't respect -P to resolve symbolic links because we try to always resolve them. */
+// The pwd builtin. We don't respect -P to resolve symbolic links because we
+// try to always resolve them.
 static int builtin_pwd(parser_t &parser, io_streams_t &streams, wchar_t **argv)
 {
-    wchar_t dir_path[4096];
-    wchar_t *res = wgetcwd(dir_path, 4096);
-    if (res == NULL)
+    wcstring res = wgetcwd();
+    if (res.empty())
    {
        return STATUS_BUILTIN_ERROR;
    }
    else
    {
-        streams.out.append(dir_path);
+        streams.out.append(res);
        streams.out.push_back(L'\n');
        return STATUS_BUILTIN_OK;
    }
@ -2699,9 +2699,8 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)

        while (1)
        {
-            int finished=0;
-
-            wchar_t res=0;
+            int finished = 0;
+            wchar_t res = 0;
            mbstate_t state = {};

            while (!finished)
@ -2713,24 +2712,26 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv)
                    break;
                }

-                size_t sz = mbrtowc(&res, &b, 1, &state);
-
-                switch (sz)
+                if (MB_CUR_MAX == 1) // single-byte locale
                {
-                    case (size_t)(-1):
-                        memset(&state, '\0', sizeof(state));
-                        break;
+                    res = (unsigned char)b;
+                    finished = 1;
+                }
+                else {
+                    size_t sz = mbrtowc(&res, &b, 1, &state);
+                    switch (sz)
+                    {
+                        case (size_t)-1:
+                            memset(&state, 0, sizeof(state));
+                            break;

-                    case (size_t)(-2):
-                        break;
-                    case 0:
-                        finished = 1;
-                        break;
-
-                    default:
-                        finished=1;
-                        break;
+                        case (size_t)-2:
+                            break;

+                        default:
+                            finished = 1;
+                            break;
+                    }
                }
            }

--- a/src/common.cpp
+++ b/src/common.cpp
@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f)
    {
        errno=0;

-        c = getwc(f);
-
+        c = fgetwc(f);
        if (errno == EILSEQ || errno == EINTR)
        {
            continue;
@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)

    wcstring result;
    result.reserve(in_len);
-    mbstate_t state = {};
    size_t in_pos = 0;
+
+    if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
+    {
+        while (in_pos < in_len)
+        {
+            result.push_back((unsigned char)in[in_pos]);
+            in_pos++;
+        }
+        return result;
+    }
+
+    mbstate_t state = {};
    while (in_pos < in_len)
    {
        wchar_t wc = 0;
@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len)
        {
            use_encode_direct = true;
        }
-        else if (ret == (size_t)(-2))
+        else if (ret == (size_t)-2)
        {
            /* Incomplete sequence */
            use_encode_direct = true;
        }
-        else if (ret == (size_t)(-1))
+        else if (ret == (size_t)-1)
        {
            /* Invalid data */
            use_encode_direct = true;
@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input)
    std::string result;
    result.reserve(input.size());

-    mbstate_t state;
-    memset(&state, 0, sizeof(state));
-
+    mbstate_t state = {};
    char converted[MB_LEN_MAX + 1];

    for (size_t i=0; i < input.size(); i++)
@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input)
        wchar_t wc = input[i];
        if (wc == INTERNAL_SEPARATOR)
        {
+            // Do nothing.
        }
-        else if ((wc >= ENCODE_DIRECT_BASE) &&
-                 (wc < ENCODE_DIRECT_BASE+256))
+        else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256)
        {
            result.push_back(wc - ENCODE_DIRECT_BASE);
        }
+        else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+        {
+            // If `wc` contains a wide character we emit a question-mark.
+            if (wc & ~0xFF)
+            {
+                wc = '?';
+            }
+            converted[0] = wc;
+            result.append(converted, 1);
+        }
        else
        {
            memset(converted, 0, sizeof converted);
@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input)
 */
 static char *wcs2str_internal(const wchar_t *in, char *out)
 {
-    size_t res=0;
-    size_t in_pos=0;
-    size_t out_pos = 0;
-    mbstate_t state;
-
    CHECK(in, 0);
    CHECK(out, 0);

-    memset(&state, 0, sizeof(state));
+    size_t in_pos = 0;
+    size_t out_pos = 0;
+    mbstate_t state = {};

    while (in[in_pos])
    {
        if (in[in_pos] == INTERNAL_SEPARATOR)
        {
+            // Do nothing.
        }
-        else if ((in[in_pos] >= ENCODE_DIRECT_BASE) &&
-                 (in[in_pos] < ENCODE_DIRECT_BASE+256))
+        else if (in[in_pos] >= ENCODE_DIRECT_BASE &&
+                 in[in_pos] < ENCODE_DIRECT_BASE + 256)
        {
            out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE;
        }
+        else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+        {
+            // If the character does not fit in a single byte we emit a question-mark.
+            if (in[in_pos] & ~0xFF)
+            {
+                out[out_pos++] = '?';
+            }
+            else
+            {
+                out[out_pos++] = (unsigned char)in[in_pos];
+            }
+        }
        else
        {
-            res = wcrtomb(&out[out_pos], in[in_pos], &state);
-
-            if (res == (size_t)(-1))
+            size_t len = wcrtomb(&out[out_pos], in[in_pos], &state);
+            if (len == (size_t)-1)
            {
                debug(1, L"Wide character %d has no narrow representation", in[in_pos]);
                memset(&state, 0, sizeof(state));
            }
            else
            {
-                out_pos += res;
+                out_pos += len;
            }
        }
        in_pos++;
--- a/src/env.cpp
+++ b/src/env.cpp
@ -377,14 +377,13 @@ static void setup_path()

 int env_set_pwd()
 {
-    wchar_t dir_path[4096];
-    wchar_t *res = wgetcwd(dir_path, 4096);
-    if (!res)
+    wcstring res = wgetcwd();
+    if (res.empty())
    {
        debug(0, _(L"Could not determine current working directory. Is your locale set correctly?"));
        return 0;
    }
-    env_set(L"PWD", dir_path, ENV_EXPORT | ENV_GLOBAL);
+    env_set(L"PWD", res.c_str(), ENV_EXPORT | ENV_GLOBAL);
    return 1;
 }

--- a/src/fallback.cpp
+++ b/src/fallback.cpp
@ -606,7 +606,7 @@ static FILE *fw_data;

 static void fw_writer(wchar_t c)
 {
-    putwc(c, fw_data);
+    fputwc(c, fw_data);
 }

 /*
@ -648,33 +648,30 @@ int wprintf(const wchar_t *filter, ...)
 #endif

 #ifndef HAVE_FGETWC
-
 wint_t fgetwc(FILE *stream)
 {
-    wchar_t res=0;
-    mbstate_t state;
-    memset(&state, '\0', sizeof(state));
+    wchar_t res;
+    mbstate_t state = {};

    while (1)
    {
        int b = fgetc(stream);
-        char bb;
-
-        int sz;
-
        if (b == EOF)
+        {
            return WEOF;
+        }

-        bb=b;
-
-        sz = mbrtowc(&res, &bb, 1, &state);
+        if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
+        {
+            return b;
+        }

+        char bb = b;
+        size_t sz = mbrtowc(&res, &bb, 1, &state);
        switch (sz)
        {
            case -1:
-                memset(&state, '\0', sizeof(state));
                return WEOF;
-
            case -2:
                break;
            case 0:
@ -683,35 +680,40 @@ wint_t fgetwc(FILE *stream)
                return res;
        }
    }
-
 }
-
-
-wint_t getwc(FILE *stream)
-{
-    return fgetwc(stream);
-}
-
-
 #endif

 #ifndef HAVE_FPUTWC
-
 wint_t fputwc(wchar_t wc, FILE *stream)
 {
-    int res;
-    char s[MB_CUR_MAX+1];
-    memset(s, 0, MB_CUR_MAX+1);
-    wctomb(s, wc);
-    res = fputs(s, stream);
-    return res==EOF?WEOF:wc;
-}
+    int res = 0;
+    mbstate_t state = {};
+    char s[MB_CUR_MAX + 1] = {};

-wint_t putwc(wchar_t wc, FILE *stream)
-{
-    return fputwc(wc, stream);
-}
+    if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+    {
+        // If `wc` contains a wide character we emit a question-mark.
+        if (wc & ~0xFF)
+        {
+            wc = '?';
+        }
+        s[0] = (char)wc;
+        res = fputs(s, stream);
+    }
+    else
+    {
+        size_t len = wcrtomb(s, wc, &state);
+        if (len == (size_t)-1)
+        {
+            debug(1, L"Wide character %d has no narrow representation", wc);
+        }
+        else {
+            res = fputs(s, stream);
+        }
+    }

+    return res == EOF ? WEOF : wc;
+}
 #endif

 #ifndef HAVE_WCSTOK
--- a/src/fallback.h
+++ b/src/fallback.h
@ -158,29 +158,13 @@ int vswprintf(wchar_t *out, size_t n, const wchar_t *filter, va_list va);
 #endif

 #ifndef HAVE_FGETWC
-/**
-   Fallback implementation of fgetwc
-*/
+// Fallback implementation of fgetwc.
 wint_t fgetwc(FILE *stream);
-
-/**
-   Fallback implementation of getwc
-*/
-wint_t getwc(FILE *stream);
-
 #endif

 #ifndef HAVE_FPUTWC
-
-/**
-   Fallback implementation of fputwc
-*/
+// Fallback implementation of fputwc.
 wint_t fputwc(wchar_t wc, FILE *stream);
-/**
-   Fallback implementation of putwc
-*/
-wint_t putwc(wchar_t wc, FILE *stream);
-
 #endif

 #ifndef HAVE_WCSTOK
--- a/src/history.cpp
+++ b/src/history.cpp
@ -926,10 +926,9 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
 {

    const char *end = begin + length;
-    const char *pos=begin;
-
-    bool was_backslash = 0;
+    const char *pos = begin;
    wcstring out;
+    bool was_backslash = false;
    bool first_char = true;
    bool timestamp_mode = false;
    time_t timestamp = 0;
@ -937,12 +936,18 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length)
    while (1)
    {
        wchar_t c;
-        mbstate_t state;
        size_t res;
+        mbstate_t state = {};

-        memset(&state, 0, sizeof(state));
-
-        res = mbrtowc(&c, pos, end-pos, &state);
+        if (MB_CUR_MAX == 1) // single-byte locale
+        {
+            c = (unsigned char)*pos;
+            res = 1;
+        }
+        else
+        {
+            res = mbrtowc(&c, pos, end - pos, &state);
+        }

        if (res == (size_t)-1)
        {
--- a/src/input_common.cpp
+++ b/src/input_common.cpp
@ -263,16 +263,17 @@ wchar_t input_common_readch(int timed)
        while (1)
        {
            wint_t b = readb();
-            char bb;

-            size_t sz;
+            if (MB_CUR_MAX == 1) // single-byte locale, all values are legal
+            {
+                return (unsigned char)b;
+            }

            if ((b >= R_NULL) && (b < R_NULL + 1000))
                return b;

-            bb=b;
-
-            sz = mbrtowc(&res, &bb, 1, &state);
+            char bb = b;
+            size_t sz = mbrtowc(&res, &bb, 1, &state);

            switch (sz)
            {
--- a/src/output.cpp
+++ b/src/output.cpp
@ -386,32 +386,35 @@ int writeb(tputs_arg_t b)

 int writech(wint_t ch)
 {
-    mbstate_t state;
-    size_t i;
    char buff[MB_LEN_MAX+1];
-    size_t bytes;
+    size_t len;

-    if ((ch >= ENCODE_DIRECT_BASE) &&
-            (ch < ENCODE_DIRECT_BASE+256))
+    if (ch >= ENCODE_DIRECT_BASE && ch < ENCODE_DIRECT_BASE + 256)
    {
        buff[0] = ch - ENCODE_DIRECT_BASE;
-        bytes=1;
+        len = 1;
+    }
+    else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+    {
+        // If `ch` contains a wide character we emit a question-mark.
+        if (ch & ~0xFF)
+        {
+            ch = '?';
+        }
+        buff[0] = ch;
+        len = 1;
    }
    else
    {
-        memset(&state, 0, sizeof(state));
-        bytes= wcrtomb(buff, ch, &state);
-
-        switch (bytes)
+        mbstate_t state = {};
+        len = wcrtomb(buff, ch, &state);
+        if (len == (size_t)-1)
        {
-            case (size_t)(-1):
-            {
-                return 1;
-            }
+            return 1;
        }
    }

-    for (i=0; i<bytes; i++)
+    for (size_t i = 0; i < len; i++)
    {
        out(buff[i]);
    }
@ -420,29 +423,26 @@ int writech(wint_t ch)

 void writestr(const wchar_t *str)
 {
-    char *pos;
-
    CHECK(str,);

-    //  while( *str )
-    //    writech( *str++ );
-
-    /*
-       Check amount of needed space
-       */
-    size_t len = wcstombs(0, str, 0);
+    if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859)
+    {
+        while( *str )
+        {
+            writech( *str++ );
+        }
+        return;
+    }

+    size_t len = wcstombs(0, str, 0);  // figure amount of space needed
    if (len == (size_t)-1)
    {
        debug(1, L"Tried to print invalid wide character string");
        return;
    }

+    // Convert the string.
    len++;
-
-    /*
-       Convert
-       */
    char *buffer, static_buffer[256];
    if (len <= sizeof static_buffer)
        buffer = static_buffer;
@ -456,7 +456,7 @@ void writestr(const wchar_t *str)
    /*
       Write
       */
-    for (pos = buffer; *pos; pos++)
+    for (char *pos = buffer; *pos; pos++)
    {
        out(*pos);
    }
--- a/src/wutil.cpp
+++ b/src/wutil.cpp
@ -145,30 +145,23 @@ bool wreaddir_for_dirs(DIR *dir, wcstring *out_name)
 }


-wchar_t *wgetcwd(wchar_t *buff, size_t sz)
+const wcstring wgetcwd()
 {
-    char *buffc = (char *)malloc(sz*MAX_UTF8_BYTES);
-    char *res;
-    wchar_t *ret = 0;
+    wcstring retval;

-    if (!buffc)
-    {
-        errno = ENOMEM;
-        return 0;
-    }
-
-    res = getcwd(buffc, sz*MAX_UTF8_BYTES);
+    char *res = getcwd(NULL, 0);
    if (res)
    {
-        if ((size_t)-1 != mbstowcs(buff, buffc, sizeof(wchar_t) * sz))
-        {
-            ret = buff;
-        }
+        retval = str2wcstring(res);
+        free(res);
+    }
+    else
+    {
+        debug(0, _(L"getcwd() failed with errno %d/%s"), errno, strerror(errno));
+        retval = wcstring();
    }

-    free(buffc);
-
-    return ret;
+    return retval;
 }

 int wchdir(const wcstring &dir)
--- a/src/wutil.h
+++ b/src/wutil.h
@ -70,10 +70,8 @@ void safe_perror(const char *message);
 */
 const char *safe_strerror(int err);

-/**
-   Wide character version of getcwd().
-*/
-wchar_t *wgetcwd(wchar_t *buff, size_t sz);
+// Wide character version of getcwd().
+const wcstring wgetcwd();

 /**
   Wide character version of chdir()
--- a/tests/c-locale.err
+++ b/tests/c-locale.err
--- a/tests/c-locale.in
+++ b/tests/c-locale.in
@ -0,0 +1,35 @@
+# Verify that fish can pass through non-ASCII characters in the C/POSIX
+# locale. This is to prevent regression of
+# https://github.com/fish-shell/fish-shell/issues/2802.
+#
+# These tests are needed because the relevant standards allow the functions
+# mbrtowc() and wcrtomb() to treat bytes with the high bit set as either valid
+# or invalid in the C/POSIX locales. GNU libc treats those bytes as invalid.
+# Other libc implementations (e.g., BSD) treat them as valid. We want fish to
+# always treat those bytes as valid.
+
+# The fish in the middle of the pipeline should be receiving a UTF-8 encoded
+# version of the unicode from the echo. It should pass those bytes thru
+# literally since it is in the C locale. We verify this by first passing the
+# echo output directly to the `xxd` program then via a fish instance. The
+# output should be "58c3bb58" for the first statement and "58c3bc58" for the
+# second.
+echo -n X\u00fbX | \
+  xxd --plain
+echo X\u00fcX | env LC_ALL=C ../test/root/bin/fish -c 'read foo; echo -n $foo' | \
+  xxd --plain
+
+# This test is subtle. Despite the presence of the \u00fc unicode char (a "u"
+# with an umlaut), the fact that the locale is C/POSIX will cause the \xfc byte
+# to be emitted rather than the usual UTF-8 sequence \xc3\xbc. That's because
+# the few single-byte unicode chars (that are not ASCII) are generally in the
+# ISO-8859-1 char set, which is encompassed by the C locale. The output should
+# be "59fc59".
+env LC_ALL=C ../test/root/bin/fish -c 'echo -n Y\u00fcY' | \
+  xxd --plain
+
+# The user can specify a wide unicode character (one requiring more than a
+# single byte). In the C/POSIX locales we substitute a question-mark for the
+# unencodable wide char. The output should be "543f54".
+env LC_ALL=C ../test/root/bin/fish -c 'echo -n T\u01fdT' | \
+  xxd --plain
--- a/tests/c-locale.out
+++ b/tests/c-locale.out
@ -0,0 +1,4 @@
+58c3bb58
+58c3bc58
+59fc59
+543f54
--- a/tests/c-locale.status
+++ b/tests/c-locale.status
@ -0,0 +1 @@
+0