Fix comparison warnings on UTF-16 platforms

Without true handling of UTF-16 surrogate pairs, all we can do is
properly detect the BMP range in UTF-16 environments and bail if the
input is in a non-BMP region.

There isn't much else we can do, as it is incorrect to encode the
surrogate pairs themselves (fish doesn't know what to do with them and
they're illegal in both UTF-8 and UTF-32).

(I'm not aware of fish being used on any UTF-16 platforms other than
Cygwin.)
This commit is contained in:
Mahmoud Al-Qudsi 2021-02-22 15:03:49 -06:00
parent 215df7eec6
commit 1305a0899c

View file

@ -146,11 +146,34 @@ static wchar_t *char_to_symbol(wchar_t wc, bool bind_friendly) {
del_to_symbol(buf, sizeof(buf) / sizeof(*buf), wc, bind_friendly); del_to_symbol(buf, sizeof(buf) / sizeof(*buf), wc, bind_friendly);
} else if (wc < 0x80) { // ASCII characters that are not control characters } else if (wc < 0x80) { // ASCII characters that are not control characters
ascii_printable_to_symbol(buf, sizeof(buf) / sizeof(*buf), wc, bind_friendly); ascii_printable_to_symbol(buf, sizeof(buf) / sizeof(*buf), wc, bind_friendly);
} else if (wc <= 0xFFFF) { // BMP Unicode chararacter }
// Conditional handling of BMP Unicode characters depends on the encoding. Assume width of wchar_t
// corresponds to the encoding, i.e. WCHAR_T_BITS == 16 implies UTF-16 and WCHAR_T_BITS == 32
// implies UTF-32, because there's no other sane way of handling the input.
#if WCHAR_T_BITS == 16
else if (wc <= 0xD7FF || (wc >= 0xE000 && wc <= 0xFFFD)) {
// UTF-16 encoding of Unicode character in BMP range
std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\u%04X", wc);
} else {
// Our support for UTF-16 surrogate pairs is non-existent.
// See https://github.com/fish-shell/fish-shell/issues/6585#issuecomment-783669903 for what
// correct handling of surrogate pairs would look like - except it would need to be done
// everywhere.
// 0xFFFD is the Unicode replacement character (i.e. "symbol doesn't exist in codepage") and is
// the most correct thing we can do given the byte-by-byte parsing without any support for
// surrogate pairs.
std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\uFFFD");
}
#elif WCHAR_T_BITS == 32
else if (wc <= 0xFFFF) { // BMP Unicode chararacter
std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\u%04X", wc); std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\u%04X", wc);
} else { // Non-BMP Unicode chararacter } else { // Non-BMP Unicode chararacter
std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\U%06X", wc); std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\U%06X", wc);
} }
#else
static_assert(false, "Unsupported WCHAR_T size; unknown encoding!");
#endif
return buf; return buf;
} }