Stop crashing on invalid Unicode input

Unlike C++, Rust requires "char" to be a valid Unicode code point.  As a
workaround, we take the raw (probably UTF-8-encoded) input and convert each
input byte to a char representation from the private use area (see commit
3b15e995e (str2wcs: encode invalid Unicode characters in the private use
area, 2023-04-01)).  We convert back whenever we output the string, which
is correct as long as the encoding didn't change since the data was input.

We also need to convert keyboard input; do that.

Quick testing shows that our reader drops PUA characters.  Since this patch
converts both invalid Unicode input as well as PUA input into a safe PUA
representation, there's no longer a reason to not add PUA characters to
the commandline, so let's do that to restore traditional behavior.

Render them as � (REPLACEMENT CHARACTER); unfortunately we show one per
input byte instead of one per code point. To fix this we probably need our
own char type.

While at it, remove some special cases that try to prevent insertion of
control characters. I don't think they are necessary. Could be wrong..
This commit is contained in:
Johannes Altmanninger 2024-02-27 22:25:54 +01:00
parent f60b6e6cd4
commit b77d1d0e2b
3 changed files with 32 additions and 28 deletions

View file

@ -1,11 +1,11 @@
use crate::common::{is_windows_subsystem_for_linux, read_blocked}; use crate::common::{fish_reserved_codepoint, is_windows_subsystem_for_linux, read_blocked};
use crate::env::{EnvStack, Environment}; use crate::env::{EnvStack, Environment};
use crate::fd_readable_set::FdReadableSet; use crate::fd_readable_set::FdReadableSet;
use crate::flog::FLOG; use crate::flog::FLOG;
use crate::reader::reader_current_data; use crate::reader::reader_current_data;
use crate::threads::{iothread_port, iothread_service_main}; use crate::threads::{iothread_port, iothread_service_main};
use crate::universal_notifier::default_notifier; use crate::universal_notifier::default_notifier;
use crate::wchar::prelude::*; use crate::wchar::{encode_byte_to_char, prelude::*};
use crate::wutil::encoding::{mbrtowc, zero_mbstate}; use crate::wutil::encoding::{mbrtowc, zero_mbstate};
use crate::wutil::fish_wcstol; use crate::wutil::fish_wcstol;
use std::collections::VecDeque; use std::collections::VecDeque;
@ -374,6 +374,8 @@ pub trait InputEventQueuer {
fn readch(&mut self) -> CharEvent { fn readch(&mut self) -> CharEvent {
let mut res: char = '\0'; let mut res: char = '\0';
let mut state = zero_mbstate(); let mut state = zero_mbstate();
let mut bytes = [0; 64 * 16];
let mut num_bytes = 0;
loop { loop {
// Do we have something enqueued already? // Do we have something enqueued already?
// Note this may be initially true, or it may become true through calls to // Note this may be initially true, or it may become true through calls to
@ -415,9 +417,10 @@ pub trait InputEventQueuer {
res = read_byte.into(); res = read_byte.into();
return CharEvent::from_char(res); return CharEvent::from_char(res);
} }
let mut codepoint = u32::from(res);
let sz = unsafe { let sz = unsafe {
mbrtowc( mbrtowc(
std::ptr::addr_of_mut!(res).cast(), std::ptr::addr_of_mut!(codepoint).cast(),
std::ptr::addr_of!(read_byte).cast(), std::ptr::addr_of!(read_byte).cast(),
1, 1,
&mut state, &mut state,
@ -430,16 +433,30 @@ pub trait InputEventQueuer {
} }
-2 => { -2 => {
// Sequence not yet complete. // Sequence not yet complete.
bytes[num_bytes] = read_byte;
num_bytes += 1;
continue;
} }
0 => { 0 => {
// Actual nul char. // Actual nul char.
return CharEvent::from_char('\0'); return CharEvent::from_char('\0');
} }
_ => { _ => (),
// Sequence complete. }
if let Some(res) = char::from_u32(codepoint) {
// Sequence complete.
if !fish_reserved_codepoint(res) {
return CharEvent::from_char(res); return CharEvent::from_char(res);
} }
} }
bytes[num_bytes] = read_byte;
num_bytes += 1;
for &b in &bytes[1..num_bytes] {
let c = CharEvent::from_char(encode_byte_to_char(b));
self.push_back(c);
}
let res = CharEvent::from_char(encode_byte_to_char(bytes[0]));
return res;
} }
} }
} }

View file

@ -39,10 +39,10 @@ use crate::ast::{self, Ast, Category, Traversal};
use crate::builtins::shared::STATUS_CMD_OK; use crate::builtins::shared::STATUS_CMD_OK;
use crate::color::RgbColor; use crate::color::RgbColor;
use crate::common::{ use crate::common::{
escape, escape_string, exit_without_destructors, fish_reserved_codepoint, get_ellipsis_char, escape, escape_string, exit_without_destructors, get_ellipsis_char, get_obfuscation_read_char,
get_obfuscation_read_char, redirect_tty_output, scoped_push_replacer, scoped_push_replacer_ctx, redirect_tty_output, scoped_push_replacer, scoped_push_replacer_ctx, shell_modes, str2wcstring,
shell_modes, str2wcstring, wcs2string, write_loop, EscapeFlags, EscapeStringStyle, ScopeGuard, wcs2string, write_loop, EscapeFlags, EscapeStringStyle, ScopeGuard, PROGRAM_NAME,
PROGRAM_NAME, UTF8_BOM_WCHAR, UTF8_BOM_WCHAR,
}; };
use crate::complete::{ use crate::complete::{
complete, complete_load, sort_and_prioritize, CompleteFlags, Completion, CompletionList, complete, complete_load, sort_and_prioritize, CompleteFlags, Completion, CompletionList,
@ -1844,10 +1844,7 @@ impl ReaderData {
&& zelf.active_edit_line().1.position() == 0 && zelf.active_edit_line().1.position() == 0
{ {
// This character is skipped. // This character is skipped.
} else if !fish_reserved_codepoint(c) } else {
&& (c >= ' ' || c == '\n' || c == '\r')
&& c != '\x7F'
{
// Regular character. // Regular character.
let (elt, _el) = zelf.active_edit_line(); let (elt, _el) = zelf.active_edit_line();
zelf.insert_char(elt, c); zelf.insert_char(elt, c);
@ -1857,10 +1854,6 @@ impl ReaderData {
// We end history search. We could instead update the search string. // We end history search. We could instead update the search string.
zelf.history_search.reset(); zelf.history_search.reset();
} }
} else {
// This can happen if the user presses a control char we don't recognize. No
// reason to report this to the user unless they've enabled debugging output.
FLOG!(reader, wgettext_fmt!("Unknown key binding 0x%X", c));
} }
rls.last_cmd = None; rls.last_cmd = None;
} }
@ -1976,7 +1969,7 @@ impl ReaderData {
self.inputter self.inputter
.read_char(allow_commands.then_some(&mut command_handler)) .read_char(allow_commands.then_some(&mut command_handler))
}; };
if !event_is_normal_char(&evt) || !poll_fd_readable(self.conf.inputfd) { if !evt.is_char() || !poll_fd_readable(self.conf.inputfd) {
event_needing_handling = Some(evt); event_needing_handling = Some(evt);
break; break;
} else if evt.input_style == CharInputStyle::NotFirst } else if evt.input_style == CharInputStyle::NotFirst
@ -5334,12 +5327,3 @@ impl ReaderData {
self.set_buffer_maintaining_pager(&new_command_line, cursor, false); self.set_buffer_maintaining_pager(&new_command_line, cursor, false);
} }
} }
/// \return true if an event is a normal character that should be inserted into the buffer.
fn event_is_normal_char(evt: &CharEvent) -> bool {
if !evt.is_char() {
return false;
}
let c = evt.get_char();
!fish_reserved_codepoint(c) && c > char::from(31) && c != char::from(127)
}

View file

@ -17,7 +17,7 @@ use std::sync::Mutex;
use libc::{ONLCR, STDERR_FILENO, STDOUT_FILENO}; use libc::{ONLCR, STDERR_FILENO, STDOUT_FILENO};
use crate::common::{ use crate::common::{
get_ellipsis_char, get_omitted_newline_str, get_omitted_newline_width, fish_reserved_codepoint, get_ellipsis_char, get_omitted_newline_str, get_omitted_newline_width,
has_working_tty_timestamps, shell_modes, str2wcstring, wcs2string, write_loop, ScopeGuard, has_working_tty_timestamps, shell_modes, str2wcstring, wcs2string, write_loop, ScopeGuard,
ScopeGuarding, ScopeGuarding,
}; };
@ -1825,6 +1825,9 @@ fn compute_layout(
// \n. // \n.
// See https://unicode-table.com/en/blocks/control-pictures/ // See https://unicode-table.com/en/blocks/control-pictures/
fn rendered_character(c: char) -> char { fn rendered_character(c: char) -> char {
if fish_reserved_codepoint(c) {
return '<27>'; // replacement character
}
if c <= '\x1F' { if c <= '\x1F' {
char::from_u32(u32::from(c) + 0x2400).unwrap() char::from_u32(u32::from(c) + 0x2400).unwrap()
} else { } else {