Decode multibyte escapes immediately

We forgot to decode "byte_literal" escape sequences (the `\X` form),
i.e. turn them into nice wchar_t codepoints. This meant that e.g.

```fish
string match ö \Xc3\Xb6

math 5 \X2b 5
```

didn't work, even though `math 5 \x2b 5` did. The `\X` variant would
instead print the wonderful error:

```
math: Error: Missing operator
'5 + 5'
   ^
```

So, instead, we decode eagerly.
This commit is contained in:
Fabian Boehm 2022-09-29 16:53:16 +02:00
parent 62794446b7
commit 396e276286
2 changed files with 173 additions and 140 deletions

View file

@ -1149,6 +1149,10 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
bool errored = false; bool errored = false;
size_t in_pos = 1; // in_pos always tracks the next character to read (and therefore the number size_t in_pos = 1; // in_pos always tracks the next character to read (and therefore the number
// of characters read so far) // of characters read so far)
// For multibyte \X sequences.
std::string byte_buff;
while (!errored) {
const wchar_t c = input[in_pos++]; const wchar_t c = input[in_pos++];
switch (c) { switch (c) {
// A null character after a backslash is an error. // A null character after a backslash is an error.
@ -1207,8 +1211,8 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
default: { default: {
base = 8; base = 8;
chars = 3; chars = 3;
// Note that in_pos currently is just after the first post-backslash character; // Note that in_pos currently is just after the first post-backslash
// we want to start our escape from there. // character; we want to start our escape from there.
assert(in_pos > 0); assert(in_pos > 0);
in_pos--; in_pos--;
break; break;
@ -1228,8 +1232,20 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
} }
if (!errored && res <= max_val) { if (!errored && res <= max_val) {
result_char_or_none = if (byte_literal) {
static_cast<wchar_t>((byte_literal ? ENCODE_DIRECT_BASE : 0) + res); // Multibyte encodings necessitate that we keep adjacent byte escapes.
// - `\Xc3\Xb6` is "ö", but only together.
// (this assumes a valid codepoint can't consist of multiple bytes
// that are valid on their own, which is true for UTF-8)
byte_buff.push_back(static_cast<char>(res));
result_char_or_none = none();
if (input[in_pos] == L'\\' && input[in_pos + 1] == L'X') {
in_pos++;
continue;
}
} else {
result_char_or_none = static_cast<wchar_t>(res);
}
} else { } else {
errored = true; errored = true;
} }
@ -1300,10 +1316,18 @@ maybe_t<size_t> read_unquoted_escape(const wchar_t *input, wcstring *result, boo
} }
} }
if (!errored && result_char_or_none.has_value()) { if (errored) return none();
if (!byte_buff.empty()) {
result->append(str2wcstring(byte_buff));
}
break;
}
if (result_char_or_none.has_value()) {
result->push_back(*result_char_or_none); result->push_back(*result_char_or_none);
} }
if (errored) return none();
return in_pos; return in_pos;
} }

View file

@ -88,3 +88,12 @@ env LC_ALL=C $fish -c 'echo -n Y\u00FCY' | display_bytes
env LC_ALL=C $fish -c 'echo -n T\u01FDT' | display_bytes env LC_ALL=C $fish -c 'echo -n T\u01FDT' | display_bytes
#CHECK: 0000000 124 077 124 #CHECK: 0000000 124 077 124
#CHECK: 0000003 #CHECK: 0000003
string match ö \Xc3\Xb6
#CHECK: ö
math 5 \X2b 5
#CHECK: 10
math 7 \x2b 7
#CHECK: 14