From b77d1d0e2bebf4b2f6b28acf701d4c74c112e98e Mon Sep 17 00:00:00 2001 From: Johannes Altmanninger Date: Tue, 27 Feb 2024 22:25:54 +0100 Subject: [PATCH] Stop crashing on invalid Unicode input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike C++, Rust requires "char" to be a valid Unicode code point. As a workaround, we take the raw (probably UTF-8-encoded) input and convert each input byte to a char representation from the private use area (see commit 3b15e995e (str2wcs: encode invalid Unicode characters in the private use area, 2023-04-01)). We convert back whenever we output the string, which is correct as long as the encoding didn't change since the data was input. We also need to convert keyboard input; do that. Quick testing shows that our reader drops PUA characters. Since this patch converts both invalid Unicode input as well as PUA input into a safe PUA representation, there's no longer a reason to not add PUA characters to the commandline, so let's do that to restore traditional behavior. Render them as � (REPLACEMENT CHARACTER); unfortunately we show one per input byte instead of one per code point. To fix this we probably need our own char type. While at it, remove some special cases that try to prevent insertion of control characters. I don't think they are necessary. Could be wrong.. --- src/input_common.rs | 27 ++++++++++++++++++++++----- src/reader.rs | 28 ++++++---------------------- src/screen.rs | 5 ++++- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/input_common.rs b/src/input_common.rs index caa34c7c4..4686c635a 100644 --- a/src/input_common.rs +++ b/src/input_common.rs @@ -1,11 +1,11 @@ -use crate::common::{is_windows_subsystem_for_linux, read_blocked}; +use crate::common::{fish_reserved_codepoint, is_windows_subsystem_for_linux, read_blocked}; use crate::env::{EnvStack, Environment}; use crate::fd_readable_set::FdReadableSet; use crate::flog::FLOG; use crate::reader::reader_current_data; use crate::threads::{iothread_port, iothread_service_main}; use crate::universal_notifier::default_notifier; -use crate::wchar::prelude::*; +use crate::wchar::{encode_byte_to_char, prelude::*}; use crate::wutil::encoding::{mbrtowc, zero_mbstate}; use crate::wutil::fish_wcstol; use std::collections::VecDeque; @@ -374,6 +374,8 @@ pub trait InputEventQueuer { fn readch(&mut self) -> CharEvent { let mut res: char = '\0'; let mut state = zero_mbstate(); + let mut bytes = [0; 64 * 16]; + let mut num_bytes = 0; loop { // Do we have something enqueued already? // Note this may be initially true, or it may become true through calls to @@ -415,9 +417,10 @@ pub trait InputEventQueuer { res = read_byte.into(); return CharEvent::from_char(res); } + let mut codepoint = u32::from(res); let sz = unsafe { mbrtowc( - std::ptr::addr_of_mut!(res).cast(), + std::ptr::addr_of_mut!(codepoint).cast(), std::ptr::addr_of!(read_byte).cast(), 1, &mut state, @@ -430,16 +433,30 @@ pub trait InputEventQueuer { } -2 => { // Sequence not yet complete. + bytes[num_bytes] = read_byte; + num_bytes += 1; + continue; } 0 => { // Actual nul char. return CharEvent::from_char('\0'); } - _ => { - // Sequence complete. + _ => (), + } + if let Some(res) = char::from_u32(codepoint) { + // Sequence complete. + if !fish_reserved_codepoint(res) { return CharEvent::from_char(res); } } + bytes[num_bytes] = read_byte; + num_bytes += 1; + for &b in &bytes[1..num_bytes] { + let c = CharEvent::from_char(encode_byte_to_char(b)); + self.push_back(c); + } + let res = CharEvent::from_char(encode_byte_to_char(bytes[0])); + return res; } } } diff --git a/src/reader.rs b/src/reader.rs index 2dc9d3f95..43f0f0b6c 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -39,10 +39,10 @@ use crate::ast::{self, Ast, Category, Traversal}; use crate::builtins::shared::STATUS_CMD_OK; use crate::color::RgbColor; use crate::common::{ - escape, escape_string, exit_without_destructors, fish_reserved_codepoint, get_ellipsis_char, - get_obfuscation_read_char, redirect_tty_output, scoped_push_replacer, scoped_push_replacer_ctx, - shell_modes, str2wcstring, wcs2string, write_loop, EscapeFlags, EscapeStringStyle, ScopeGuard, - PROGRAM_NAME, UTF8_BOM_WCHAR, + escape, escape_string, exit_without_destructors, get_ellipsis_char, get_obfuscation_read_char, + redirect_tty_output, scoped_push_replacer, scoped_push_replacer_ctx, shell_modes, str2wcstring, + wcs2string, write_loop, EscapeFlags, EscapeStringStyle, ScopeGuard, PROGRAM_NAME, + UTF8_BOM_WCHAR, }; use crate::complete::{ complete, complete_load, sort_and_prioritize, CompleteFlags, Completion, CompletionList, @@ -1844,10 +1844,7 @@ impl ReaderData { && zelf.active_edit_line().1.position() == 0 { // This character is skipped. - } else if !fish_reserved_codepoint(c) - && (c >= ' ' || c == '\n' || c == '\r') - && c != '\x7F' - { + } else { // Regular character. let (elt, _el) = zelf.active_edit_line(); zelf.insert_char(elt, c); @@ -1857,10 +1854,6 @@ impl ReaderData { // We end history search. We could instead update the search string. zelf.history_search.reset(); } - } else { - // This can happen if the user presses a control char we don't recognize. No - // reason to report this to the user unless they've enabled debugging output. - FLOG!(reader, wgettext_fmt!("Unknown key binding 0x%X", c)); } rls.last_cmd = None; } @@ -1976,7 +1969,7 @@ impl ReaderData { self.inputter .read_char(allow_commands.then_some(&mut command_handler)) }; - if !event_is_normal_char(&evt) || !poll_fd_readable(self.conf.inputfd) { + if !evt.is_char() || !poll_fd_readable(self.conf.inputfd) { event_needing_handling = Some(evt); break; } else if evt.input_style == CharInputStyle::NotFirst @@ -5334,12 +5327,3 @@ impl ReaderData { self.set_buffer_maintaining_pager(&new_command_line, cursor, false); } } - -/// \return true if an event is a normal character that should be inserted into the buffer. -fn event_is_normal_char(evt: &CharEvent) -> bool { - if !evt.is_char() { - return false; - } - let c = evt.get_char(); - !fish_reserved_codepoint(c) && c > char::from(31) && c != char::from(127) -} diff --git a/src/screen.rs b/src/screen.rs index ac264b74b..4ed5b51f0 100644 --- a/src/screen.rs +++ b/src/screen.rs @@ -17,7 +17,7 @@ use std::sync::Mutex; use libc::{ONLCR, STDERR_FILENO, STDOUT_FILENO}; use crate::common::{ - get_ellipsis_char, get_omitted_newline_str, get_omitted_newline_width, + fish_reserved_codepoint, get_ellipsis_char, get_omitted_newline_str, get_omitted_newline_width, has_working_tty_timestamps, shell_modes, str2wcstring, wcs2string, write_loop, ScopeGuard, ScopeGuarding, }; @@ -1825,6 +1825,9 @@ fn compute_layout( // \n. // See https://unicode-table.com/en/blocks/control-pictures/ fn rendered_character(c: char) -> char { + if fish_reserved_codepoint(c) { + return '�'; // replacement character + } if c <= '\x1F' { char::from_u32(u32::from(c) + 0x2400).unwrap() } else {