From 1305a0899cae1cc895928d20bd88b50c4f6ad43e Mon Sep 17 00:00:00 2001
From: Mahmoud Al-Qudsi <mqudsi@neosmart.net>
Date: Mon, 22 Feb 2021 15:03:49 -0600
Subject: [PATCH] Fix comparison warnings on UTF-16 platforms

Without true handling of UTF-16 surrogate pairs, all we can do is
properly detect the BMP range in UTF-16 environments and bail if the
input is in a non-BMP region.

There isn't much else we can do, as it is incorrect to encode the
surrogate pairs themselves (fish doesn't know what to do with them and
they're illegal in both UTF-8 and UTF-32).

(I'm not aware of fish being used on any UTF-16 platform other than
Cygwin.)
---
 src/fish_key_reader.cpp | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/fish_key_reader.cpp b/src/fish_key_reader.cpp
index 39c7bfe5e..7116e3bd7 100644
--- a/src/fish_key_reader.cpp
+++ b/src/fish_key_reader.cpp
@@ -146,11 +146,34 @@ static wchar_t *char_to_symbol(wchar_t wc, bool bind_friendly) {
         del_to_symbol(buf, sizeof(buf) / sizeof(*buf), wc, bind_friendly);
     } else if (wc < 0x80) {  // ASCII characters that are not control characters
         ascii_printable_to_symbol(buf, sizeof(buf) / sizeof(*buf), wc, bind_friendly);
-    } else if (wc <= 0xFFFF) {  // BMP Unicode chararacter
+    }
+// Conditional handling of BMP Unicode characters depends on the encoding. Assume width of wchar_t
+// corresponds to the encoding, i.e. WCHAR_T_BITS == 16 implies UTF-16 and WCHAR_T_BITS == 32
+// implies UTF-32, because there's no other sane way of handling the input.
+#if WCHAR_T_BITS == 16
+    else if (wc <= 0xD7FF || (wc >= 0xE000 && wc <= 0xFFFD)) {
+        // UTF-16 encoding of Unicode character in BMP range
+        std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\u%04X", wc);
+    } else {
+        // Our support for UTF-16 surrogate pairs is non-existent.
+        // See https://github.com/fish-shell/fish-shell/issues/6585#issuecomment-783669903 for what
+        // correct handling of surrogate pairs would look like - except it would need to be done
+        // everywhere.
+
+        // 0xFFFD is the Unicode replacement character (used for input that cannot be represented)
+        // and is the most correct thing we can do given the byte-by-byte parsing without any
+        // support for surrogate pairs.
+        std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\uFFFD");
+    }
+#elif WCHAR_T_BITS == 32
+    else if (wc <= 0xFFFF) {  // BMP Unicode chararacter
         std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\u%04X", wc);
     } else {  // Non-BMP Unicode chararacter
         std::swprintf(buf, sizeof(buf) / sizeof(*buf), L"\\U%06X", wc);
     }
+#else
+    static_assert(false, "Unsupported WCHAR_T size; unknown encoding!");
+#endif
 
     return buf;
 }