Decode multibyte escapes immediately

We forgot to decode "byte_literal" escape sequences (i.e. turn them into nice wchar_t codepoints). This meant that e.g.

```fish
string match ö \Xc3\Xb6
math 5 \X2b 5
```

didn't work, even though `math 5 \x2b 5` did. The `\X` form would print the wonderful error:

```
math: Error: Missing operator
'5 + 5'
   ^
```

So, instead, we decode eagerly.
2025-01-14 14:03:58 +00:00 · 2022-09-29 16:53:16 +02:00 · 2022-09-29 16:53:16 +02:00 · 396e276286
commit 396e276286
parent 62794446b7
2 changed files with 173 additions and 140 deletions
--- a/src/common.cpp
+++ b/src/common.cpp
@ -1149,6 +1149,10 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
    bool errored = false;
    size_t in_pos = 1;  // in_pos always tracks the next character to read (and therefore the number
                        // of characters read so far)
+
+    // For multibyte \X sequences.
+    std::string byte_buff;
+    while (!errored) {
        const wchar_t c = input[in_pos++];
        switch (c) {
                // A null character after a backslash is an error.
@ -1207,8 +1211,8 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
                    default: {
                        base = 8;
                        chars = 3;
-                    // Note that in_pos currently is just after the first post-backslash character;
-                    // we want to start our escape from there.
+                        // Note that in_pos currently is just after the first post-backslash
+                        // character; we want to start our escape from there.
                        assert(in_pos > 0);
                        in_pos--;
                        break;
@ -1228,8 +1232,20 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
                }

                if (!errored && res <= max_val) {
-                result_char_or_none =
-                    static_cast<wchar_t>((byte_literal ? ENCODE_DIRECT_BASE : 0) + res);
+                    if (byte_literal) {
+                        // Multibyte encodings necessitate that we keep adjacent byte escapes.
+                        // - `\Xc3\Xb6` is "ö", but only together.
+                        // (this assumes a valid codepoint can't consist of multiple bytes
+                        // that are valid on their own, which is true for UTF-8)
+                        byte_buff.push_back(static_cast<char>(res));
+                        result_char_or_none = none();
+                        if (input[in_pos] == L'\\' && input[in_pos + 1] == L'X') {
+                            in_pos++;
+                            continue;
+                        }
+                    } else {
+                        result_char_or_none = static_cast<wchar_t>(res);
+                    }
                } else {
                    errored = true;
                }
@ -1300,10 +1316,18 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
            }
        }

-    if (!errored && result_char_or_none.has_value()) {
+        if (errored) return none();
+
+        if (!byte_buff.empty()) {
+            result->append(str2wcstring(byte_buff));
+        }
+
+        break;
+    }
+
+    if (result_char_or_none.has_value()) {
        result->push_back(*result_char_or_none);
    }
-    if (errored) return none();

    return in_pos;
 }
--- a/tests/checks/locale.fish
+++ b/tests/checks/locale.fish
@ -88,3 +88,12 @@ env LC_ALL=C $fish -c 'echo -n Y\u00FCY' | display_bytes
 env LC_ALL=C $fish -c 'echo -n T\u01FDT' | display_bytes
 #CHECK: 0000000 124 077 124
 #CHECK: 0000003
+
+string match ö \Xc3\Xb6
+#CHECK: ö
+
+math 5 \X2b 5
+#CHECK: 10
+
+math 7 \x2b 7
+#CHECK: 14