From 1305a0899cae1cc895928d20bd88b50c4f6ad43e Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Mon, 22 Feb 2021 15:03:49 -0600 Subject: [PATCH] Fix comparison warnings on UTF-16 platforms Without true handling of UTF-16 surrogate pairs, all we can do is properly detect the BMP range in UTF-16 environments and bail if the input is in a non-BMP region. There isn't much else we can do as it is incorrect to encode the surrogate pairs themselves (fish doesn't know what to do with them and they're illegal in both UTF-8 and UTF-32). (I'm not aware of fish being used on any UTF-16 platforms other than Cygwin.) --- src/fish_key_reader.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/fish_key_reader.cpp b/src/fish_key_reader.cpp index 39c7bfe5e..7116e3bd7 100644 --- a/src/fish_key_reader.cpp +++ b/src/fish_key_reader.cpp @@ -146,11 +146,34 @@ static wchar_t *char_to_symbol(wchar_t wc, bool bind_friendly) { del_to_symbol(buf, sizeof(buf) / sizeof(*buf), wc, bind_friendly); } else if (wc < 0x80) { // ASCII characters that are not control characters ascii_printable_to_symbol(buf, sizeof(buf) / sizeof(*buf), wc, bind_friendly); - } else if (wc <= 0xFFFF) { // BMP Unicode chararacter + } +// Conditional handling of BMP Unicode characters depends on the encoding. Assume width of wchar_t +// corresponds to the encoding, i.e. WCHAR_T_BITS == 16 implies UTF-16 and WCHAR_T_BITS == 32 +// implies UTF-32, because there's no other sane way of handling the input. +#if WCHAR_T_BITS == 16 + else if (wc <= 0xD7FF || (wc >= 0xE000 && wc <= 0xFFFD)) { + // UTF-16 encoding of Unicode character in BMP range + std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\u%04X", wc); + } else { + // Our support for UTF-16 surrogate pairs is non-existent. + // See https://github.com/fish-shell/fish-shell/issues/6585#issuecomment-783669903 for what + // correct handling of surrogate pairs would look like - except it would need to be done + // everywhere. 
+ + // 0xFFFD is the Unicode replacement character ("symbol doesn't exist in codepage") and is + // the most correct thing we can do given the byte-by-byte parsing without any support for + // surrogate pairs. + std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\uFFFD"); + } +#elif WCHAR_T_BITS == 32 + else if (wc <= 0xFFFF) { // BMP Unicode character std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\u%04X", wc); } else { // Non-BMP Unicode chararacter std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\U%06X", wc); } +#else + static_assert(false, "Unsupported WCHAR_T size; unknown encoding!"); +#endif return buf; }