From 77dfd7b78389c718dca29d43e446dc888b2dbaff Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Thu, 28 Mar 2013 17:45:49 -0700 Subject: [PATCH] Fix printf's Unicode support https://github.com/fish-shell/fish-shell/issues/625 --- builtin_printf.cpp | 48 ++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/builtin_printf.cpp b/builtin_printf.cpp index bdd1de097..eb7700dd9 100644 --- a/builtin_printf.cpp +++ b/builtin_printf.cpp @@ -342,29 +342,39 @@ long builtin_printf_state_t::print_esc(const wchar_t *escstart, bool octal_0) else if (*p == L'u' || *p == L'U') { wchar_t esc_char = *p; - unsigned int uni_value; - - uni_value = 0; - for (esc_length = (esc_char == L'u' ? 4 : 8), ++p; - esc_length > 0; - --esc_length, ++p) + p++; + uint32_t uni_value = 0; + for (size_t esc_length = 0; esc_length < (esc_char == L'u' ? 4 : 8); esc_length++) { if (! is_hex_digit(*p)) - this->fatal_error(_(L"missing hexadecimal number in escape")); + { + /* Escape sequence must be done. Complain if we didn't get anything */ + if (esc_length == 0) + { + this->fatal_error(_(L"Missing hexadecimal number in Unicode escape")); + } + break; + } uni_value = uni_value * 16 + hex_to_bin(*p); + p++; } - - /* A universal character name shall not specify a character short - identifier in the range 00000000 through 00000020, 0000007F through - 0000009F, or 0000D800 through 0000DFFF inclusive. A universal - character name shall not designate a character in the required - character set. */ - if ((uni_value <= 0x9f - && uni_value != 0x24 && uni_value != 0x40 && uni_value != 0x60) - || (uni_value >= 0xd800 && uni_value <= 0xdfff)) - this->fatal_error(_(L"invalid universal character name \\%c%0*x"), - esc_char, (esc_char == L'u' ? 4 : 8), uni_value); - this->append_format_output(L"%lc", uni_value); + + /* PCA GNU printf respects the limitations described in ISO N717, about which universal characters "shall not" be specified. I believe this limitation is for the benefit of compilers; I see no reason to impose it in builtin_printf. + + If __STDC_ISO_10646__ is defined, then it means wchar_t can and does hold Unicode code points, so just use that. If not defined, use the %lc printf conversion; this probably won't do anything good if your wide character set is not Unicode, but such platforms are exceedingly rare. + */ + if (uni_value > 0x10FFFF) + { + this->fatal_error(_(L"Unicode character out of range: \\%c%0*x"), esc_char, (esc_char == L'u' ? 4 : 8), uni_value); + } + else + { +#if defined(__STDC_ISO_10646__) + this->append_output(uni_value); +#else + this->append_format_output(L"%lc", uni_value); + } +#endif } else {