Stop crashing on invalid Unicode input

Unlike C++, Rust requires "char" to be a valid Unicode code point.  As a
workaround, we take the raw (probably UTF-8-encoded) input and convert each
input byte to a char representation from the private use area (see commit
3b15e995e (str2wcs: encode invalid Unicode characters in the private use
area, 2023-04-01)).  We convert back whenever we output the string, which
is correct as long as the encoding didn't change since the data was input.

We also need to convert keyboard input; do that.

Quick testing shows that our reader drops PUA characters.  Since this patch
converts both invalid Unicode input as well as PUA input into a safe PUA
representation, there's no longer a reason to not add PUA characters to
the commandline, so let's do that to restore traditional behavior.

Render them as � (REPLACEMENT CHARACTER); unfortunately we show one per
input byte instead of one per code point. To fix this we probably need our
own char type.

While at it, remove some special cases that try to prevent insertion of
control characters. I don't think they are necessary. Could be wrong..
This commit is contained in:
Johannes Altmanninger 2024-02-27 22:25:54 +01:00
parent f60b6e6cd4
commit b77d1d0e2b
3 changed files with 32 additions and 28 deletions

View file

@ -1,11 +1,11 @@
use crate::common::{is_windows_subsystem_for_linux, read_blocked};
use crate::common::{fish_reserved_codepoint, is_windows_subsystem_for_linux, read_blocked};
use crate::env::{EnvStack, Environment};
use crate::fd_readable_set::FdReadableSet;
use crate::flog::FLOG;
use crate::reader::reader_current_data;
use crate::threads::{iothread_port, iothread_service_main};
use crate::universal_notifier::default_notifier;
use crate::wchar::prelude::*;
use crate::wchar::{encode_byte_to_char, prelude::*};
use crate::wutil::encoding::{mbrtowc, zero_mbstate};
use crate::wutil::fish_wcstol;
use std::collections::VecDeque;
@ -374,6 +374,8 @@ pub trait InputEventQueuer {
fn readch(&mut self) -> CharEvent {
let mut res: char = '\0';
let mut state = zero_mbstate();
let mut bytes = [0; 64 * 16];
let mut num_bytes = 0;
loop {
// Do we have something enqueued already?
// Note this may be initially true, or it may become true through calls to
@ -415,9 +417,10 @@ pub trait InputEventQueuer {
res = read_byte.into();
return CharEvent::from_char(res);
}
let mut codepoint = u32::from(res);
let sz = unsafe {
mbrtowc(
std::ptr::addr_of_mut!(res).cast(),
std::ptr::addr_of_mut!(codepoint).cast(),
std::ptr::addr_of!(read_byte).cast(),
1,
&mut state,
@ -430,16 +433,30 @@ pub trait InputEventQueuer {
}
-2 => {
// Sequence not yet complete.
bytes[num_bytes] = read_byte;
num_bytes += 1;
continue;
}
0 => {
// Actual nul char.
return CharEvent::from_char('\0');
}
_ => {
// Sequence complete.
_ => (),
}
if let Some(res) = char::from_u32(codepoint) {
// Sequence complete.
if !fish_reserved_codepoint(res) {
return CharEvent::from_char(res);
}
}
bytes[num_bytes] = read_byte;
num_bytes += 1;
for &b in &bytes[1..num_bytes] {
let c = CharEvent::from_char(encode_byte_to_char(b));
self.push_back(c);
}
let res = CharEvent::from_char(encode_byte_to_char(bytes[0]));
return res;
}
}
}

View file

@ -39,10 +39,10 @@ use crate::ast::{self, Ast, Category, Traversal};
use crate::builtins::shared::STATUS_CMD_OK;
use crate::color::RgbColor;
use crate::common::{
escape, escape_string, exit_without_destructors, fish_reserved_codepoint, get_ellipsis_char,
get_obfuscation_read_char, redirect_tty_output, scoped_push_replacer, scoped_push_replacer_ctx,
shell_modes, str2wcstring, wcs2string, write_loop, EscapeFlags, EscapeStringStyle, ScopeGuard,
PROGRAM_NAME, UTF8_BOM_WCHAR,
escape, escape_string, exit_without_destructors, get_ellipsis_char, get_obfuscation_read_char,
redirect_tty_output, scoped_push_replacer, scoped_push_replacer_ctx, shell_modes, str2wcstring,
wcs2string, write_loop, EscapeFlags, EscapeStringStyle, ScopeGuard, PROGRAM_NAME,
UTF8_BOM_WCHAR,
};
use crate::complete::{
complete, complete_load, sort_and_prioritize, CompleteFlags, Completion, CompletionList,
@ -1844,10 +1844,7 @@ impl ReaderData {
&& zelf.active_edit_line().1.position() == 0
{
// This character is skipped.
} else if !fish_reserved_codepoint(c)
&& (c >= ' ' || c == '\n' || c == '\r')
&& c != '\x7F'
{
} else {
// Regular character.
let (elt, _el) = zelf.active_edit_line();
zelf.insert_char(elt, c);
@ -1857,10 +1854,6 @@ impl ReaderData {
// We end history search. We could instead update the search string.
zelf.history_search.reset();
}
} else {
// This can happen if the user presses a control char we don't recognize. No
// reason to report this to the user unless they've enabled debugging output.
FLOG!(reader, wgettext_fmt!("Unknown key binding 0x%X", c));
}
rls.last_cmd = None;
}
@ -1976,7 +1969,7 @@ impl ReaderData {
self.inputter
.read_char(allow_commands.then_some(&mut command_handler))
};
if !event_is_normal_char(&evt) || !poll_fd_readable(self.conf.inputfd) {
if !evt.is_char() || !poll_fd_readable(self.conf.inputfd) {
event_needing_handling = Some(evt);
break;
} else if evt.input_style == CharInputStyle::NotFirst
@ -5334,12 +5327,3 @@ impl ReaderData {
self.set_buffer_maintaining_pager(&new_command_line, cursor, false);
}
}
/// \return true if an event is a normal character that should be inserted into the buffer.
fn event_is_normal_char(evt: &CharEvent) -> bool {
if !evt.is_char() {
return false;
}
let c = evt.get_char();
!fish_reserved_codepoint(c) && c > char::from(31) && c != char::from(127)
}

View file

@ -17,7 +17,7 @@ use std::sync::Mutex;
use libc::{ONLCR, STDERR_FILENO, STDOUT_FILENO};
use crate::common::{
get_ellipsis_char, get_omitted_newline_str, get_omitted_newline_width,
fish_reserved_codepoint, get_ellipsis_char, get_omitted_newline_str, get_omitted_newline_width,
has_working_tty_timestamps, shell_modes, str2wcstring, wcs2string, write_loop, ScopeGuard,
ScopeGuarding,
};
@ -1825,6 +1825,9 @@ fn compute_layout(
// \n.
// See https://unicode-table.com/en/blocks/control-pictures/
fn rendered_character(c: char) -> char {
if fish_reserved_codepoint(c) {
return '<27>'; // replacement character
}
if c <= '\x1F' {
char::from_u32(u32::from(c) + 0x2400).unwrap()
} else {