diff --git a/fish-rust/Cargo.lock b/fish-rust/Cargo.lock index 059017816..cbde7ffb6 100644 --- a/fish-rust/Cargo.lock +++ b/fish-rust/Cargo.lock @@ -368,6 +368,7 @@ dependencies = [ "autocxx", "autocxx-build", "bitflags", + "cc", "cxx", "cxx-build", "cxx-gen", diff --git a/fish-rust/Cargo.toml b/fish-rust/Cargo.toml index 24f803e47..1511d1637 100644 --- a/fish-rust/Cargo.toml +++ b/fish-rust/Cargo.toml @@ -26,6 +26,7 @@ widestring = "1.0.2" [build-dependencies] autocxx-build = "0.23.1" +cc = { git = "https://github.com/mqudsi/cc-rs", branch = "fish" } cxx-build = { git = "https://github.com/fish-shell/cxx", branch = "fish" } cxx-gen = { git = "https://github.com/fish-shell/cxx", branch = "fish" } miette = { version = "5", features = ["fancy"] } diff --git a/fish-rust/build.rs b/fish-rust/build.rs index 5ffbbabd1..4d2edfee5 100644 --- a/fish-rust/build.rs +++ b/fish-rust/build.rs @@ -1,6 +1,8 @@ use miette::miette; fn main() -> miette::Result<()> { + cc::Build::new().file("src/compat.c").compile("libcompat.a"); + let rust_dir = std::env::var("CARGO_MANIFEST_DIR").expect("Env var CARGO_MANIFEST_DIR missing"); let target_dir = std::env::var("FISH_RUST_TARGET_DIR").unwrap_or(format!("{}/{}", rust_dir, "target/")); @@ -25,6 +27,7 @@ fn main() -> miette::Result<()> { let source_files = vec![ "src/abbrs.rs", "src/event.rs", + "src/common.rs", "src/fd_monitor.rs", "src/fd_readable_set.rs", "src/fds.rs", diff --git a/fish-rust/src/common.rs b/fish-rust/src/common.rs index 48a7cf622..75780987d 100644 --- a/fish-rust/src/common.rs +++ b/fish-rust/src/common.rs @@ -1,13 +1,81 @@ -use crate::ffi; -use crate::wchar::{wstr, WString}; +//! Prototypes for various functions, mostly string utilities, that are used by most parts of fish. 
+ +use crate::expand::{ + BRACE_BEGIN, BRACE_END, BRACE_SEP, BRACE_SPACE, HOME_DIRECTORY, INTERNAL_SEPARATOR, + PROCESS_EXPAND_SELF, PROCESS_EXPAND_SELF_STR, VARIABLE_EXPAND, VARIABLE_EXPAND_SINGLE, +}; +use crate::ffi::{self, fish_wcwidth}; +use crate::future_feature_flags::{feature_test, FeatureFlag}; +use crate::global_safety::RelaxedAtomicBool; +use crate::termsize::Termsize; +use crate::wchar::{encode_byte_to_char, wstr, WString, L}; use crate::wchar_ext::WExt; -use crate::wchar_ffi::c_str; -use crate::wchar_ffi::WCharFromFFI; +use crate::wchar_ffi::{c_str, WCharFromFFI, WCharToFFI}; +use crate::wcstringutil::wcs2string_callback; +use crate::wildcard::{ANY_CHAR, ANY_STRING, ANY_STRING_RECURSIVE}; +use crate::wutil::encoding::{mbrtowc, wcrtomb, zero_mbstate, AT_LEAST_MB_LEN_MAX}; +use crate::wutil::{fish_iswalnum, sprintf, wgettext}; use bitflags::bitflags; -use std::mem; -use std::mem::ManuallyDrop; +use core::slice; +use cxx::{CxxWString, UniquePtr}; +use libc::{EINTR, EIO, O_WRONLY, SIGTTOU, SIG_IGN, STDERR_FILENO, STDIN_FILENO, STDOUT_FILENO}; +use once_cell::sync::Lazy; +use std::cell::RefCell; +use std::env; +use std::ffi::CString; +use std::mem::{self, ManuallyDrop}; use std::ops::{Deref, DerefMut}; use std::os::fd::AsRawFd; +use std::path::PathBuf; +use std::rc::Rc; +use std::str::FromStr; +use std::sync::atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}; +use std::sync::Mutex; +use std::time; +use widestring_suffix::widestrs; + +// Highest legal ASCII value. +pub const ASCII_MAX: char = 127 as char; + +// Highest legal 16-bit Unicode value. +pub const UCS2_MAX: char = '\u{FFFF}'; + +// Highest legal byte value. +pub const BYTE_MAX: char = 0xFF as char; + +// Unicode BOM value. +pub const UTF8_BOM_WCHAR: char = '\u{FEFF}'; + +// Use Unicode "non-characters" for internal characters as much as we can. This +// gives us 32 "characters" for internal use that we can guarantee should not +// appear in our input stream. 
See http://www.unicode.org/faq/private_use.html. +pub const RESERVED_CHAR_BASE: char = '\u{FDD0}'; +pub const RESERVED_CHAR_END: char = '\u{FDF0}'; +// Split the available non-character values into two ranges to ensure there are +// no conflicts among the places we use these special characters. +pub const EXPAND_RESERVED_BASE: char = RESERVED_CHAR_BASE; +pub const EXPAND_RESERVED_END: char = char_offset(EXPAND_RESERVED_BASE, 16); +pub const WILDCARD_RESERVED_BASE: char = EXPAND_RESERVED_END; +pub const WILDCARD_RESERVED_END: char = char_offset(WILDCARD_RESERVED_BASE, 16); +// Make sure the ranges defined above don't exceed the range for non-characters. +// This is to make sure we didn't do something stupid in subdividing the +// Unicode range for our needs. +const _: () = assert!(WILDCARD_RESERVED_END <= RESERVED_CHAR_END); + +// These are in the Unicode private-use range. We really shouldn't use this +// range but have little choice in the matter given how our lexer/parser works. +// We can't use non-characters for these two ranges because there are only 66 of +// them and we need at least 256 + 64. +// +// If sizeof(wchar_t))==4 we could avoid using private-use chars; however, that +// would result in fish having different behavior on machines with 16 versus 32 +// bit wchar_t. It's better that fish behave the same on both types of systems. +// +// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know +// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF) +// on Mac OS X. See http://www.unicode.org/faq/private_use.html. +pub const ENCODE_DIRECT_BASE: char = '\u{F600}'; +pub const ENCODE_DIRECT_END: char = char_offset(ENCODE_DIRECT_BASE, 256); #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum EscapeStringStyle { @@ -41,6 +109,34 @@ bitflags! 
{ } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum UnescapeStringStyle { + Script(UnescapeFlags), + Url, + Var, +} + +impl Default for UnescapeStringStyle { + fn default() -> Self { + Self::Script(UnescapeFlags::default()) + } +} + +bitflags! { + /// Flags for unescape_string functions. + #[derive(Default)] + pub struct UnescapeFlags: u32 { + /// default behavior + const DEFAULT = 0; + /// escape special fish syntax characters like the semicolon + const SPECIAL = 1 << 0; + /// allow incomplete escape sequences + const INCOMPLETE = 1 << 1; + /// don't handle backslash escapes + const NO_BACKSLASHES = 1 << 2; + } +} + /// Replace special characters with backslash escape sequences. Newline is replaced with `\n`, etc. pub fn escape_string(s: &wstr, style: EscapeStringStyle) -> WString { let (style, flags) = match style { @@ -64,6 +160,1042 @@ pub fn escape_string(s: &wstr, style: EscapeStringStyle) -> WString { ffi::escape_string(c_str!(s), flags.bits().into(), style).from_ffi() } +/// Escape a string so that it may be inserted into a double-quoted string. +/// This permits ownership transfer. +pub fn escape_string_for_double_quotes(input: &wstr) -> WString { + // We need to escape backslashes, double quotes, and dollars only. + let mut result = input.to_owned(); + let mut idx = result.len(); + while idx > 0 { + idx -= 1; + if ['\\', '$', '"'].contains(&result.char_at(idx)) { + result.insert(idx, '\\'); + } + } + result +} + +pub fn unescape_string(input: &wstr, style: UnescapeStringStyle) -> Option { + match style { + UnescapeStringStyle::Script(flags) => unescape_string_internal(input, flags), + UnescapeStringStyle::Url => unescape_string_url(input), + UnescapeStringStyle::Var => unescape_string_var(input), + } +} + +// TODO Delete this. 
+pub fn unescape_string_in_place(s: &mut WString, style: UnescapeStringStyle) -> bool { + unescape_string(s, style) + .map(|unescaped| *s = unescaped) + .is_some() +} + +/// Returns the unescaped version of input, or None on error. +fn unescape_string_internal(input: &wstr, flags: UnescapeFlags) -> Option { + let mut result = WString::new(); + result.reserve(input.len()); + + let unescape_special = flags.contains(UnescapeFlags::SPECIAL); + let allow_incomplete = flags.contains(UnescapeFlags::INCOMPLETE); + let ignore_backslashes = flags.contains(UnescapeFlags::NO_BACKSLASHES); + + // The positions of open braces. + let mut braces = vec![]; + // The positions of variable expansions or brace ","s. + // We only read braces as expanders if there's a variable expansion or "," in them. + let mut vars_or_seps = vec![]; + let mut brace_count = 0; + + let mut errored = false; + #[derive(PartialEq, Eq)] + enum Mode { + Unquoted, + SingleQuotes, + DoubleQuotes, + } + let mut mode = Mode::Unquoted; + + let mut input_position = 0; + while input_position < input.len() && !errored { + let c = input.char_at(input_position); + // Here's the character we'll append to result, or none() to suppress it. + let mut to_append_or_none = Some(c); + if mode == Mode::Unquoted { + match c { + '\\' => { + if !ignore_backslashes { + // Backslashes (escapes) are complicated and may result in errors, or + // appending INTERNAL_SEPARATORs, so we have to handle them specially. + if let Some(escape_chars) = read_unquoted_escape( + &input[input_position..], + &mut result, + allow_incomplete, + unescape_special, + ) { + // Skip over the characters we read, minus one because the outer loop + // will increment it. + assert!(escape_chars > 0); + input_position += escape_chars - 1; + } else { + // A none() return indicates an error. + errored = true; + } + // We've already appended, don't append anything else. 
+ to_append_or_none = None; + } + } + '~' => { + if unescape_special && input_position == 0 { + to_append_or_none = Some(HOME_DIRECTORY); + } + } + '%' => { + // Note that this only recognizes %self if the string is literally %self. + // %self/foo will NOT match this. + if unescape_special && input_position == 0 && input == PROCESS_EXPAND_SELF_STR { + to_append_or_none = Some(PROCESS_EXPAND_SELF); + input_position += PROCESS_EXPAND_SELF_STR.len() - 1; // skip over 'self's + } + } + '*' => { + if unescape_special { + // In general, this is ANY_STRING. But as a hack, if the last appended char + // is ANY_STRING, delete the last char and store ANY_STRING_RECURSIVE to + // reflect the fact that ** is the recursive wildcard. + if result.chars().last() == Some(ANY_STRING) { + assert!(!result.is_empty()); + result.truncate(result.len() - 1); + to_append_or_none = Some(ANY_STRING_RECURSIVE); + } else { + to_append_or_none = Some(ANY_STRING); + } + } + } + '?' => { + if unescape_special && !feature_test(FeatureFlag::qmark_noglob) { + to_append_or_none = Some(ANY_CHAR); + } + } + '$' => { + if unescape_special { + let is_cmdsub = input_position + 1 < input.len() + && input.char_at(input_position + 1) == '('; + if !is_cmdsub { + to_append_or_none = Some(VARIABLE_EXPAND); + vars_or_seps.push(input_position); + } + } + } + '{' => { + if unescape_special { + brace_count += 1; + to_append_or_none = Some(BRACE_BEGIN); + // We need to store where the brace *ends up* in the output. + braces.push(result.len()); + } + } + '}' => { + if unescape_special { + // HACK: The completion machinery sometimes hands us partial tokens. + // We can't parse them properly, but it shouldn't hurt, + // so we don't assert here. + // See #4954. 
+ // assert(brace_count > 0 && "imbalanced brackets are a tokenizer error, we + // shouldn't be able to get here"); + brace_count -= 1; + to_append_or_none = Some(BRACE_END); + if let Some(brace) = braces.pop() { + // HACK: To reduce accidental use of brace expansion, treat a brace + // with zero or one items as literal input. See #4632. (The hack is + // doing it here and like this.) + if vars_or_seps.last().map(|i| *i < brace).unwrap_or(true) { + result.as_char_slice_mut()[brace] = '{'; + // We also need to turn all spaces back. + for i in brace + 1..result.len() { + if result.char_at(i) == BRACE_SPACE { + result.as_char_slice_mut()[i] = ' '; + } + } + to_append_or_none = Some('}'); + } + // Remove all seps inside the current brace pair, so if we have a + // surrounding pair we only get seps inside *that*. + if !vars_or_seps.is_empty() { + while vars_or_seps.last().map(|i| *i > brace).unwrap_or_default() { + vars_or_seps.pop(); + } + } + } + } + } + ',' => { + if unescape_special && brace_count > 0 { + to_append_or_none = Some(BRACE_SEP); + vars_or_seps.push(input_position); + } + } + ' ' => { + if unescape_special && brace_count > 0 { + to_append_or_none = Some(BRACE_SPACE); + } + } + '\'' => { + mode = Mode::SingleQuotes; + to_append_or_none = if unescape_special { + Some(INTERNAL_SEPARATOR) + } else { + None + }; + } + '"' => { + mode = Mode::DoubleQuotes; + to_append_or_none = if unescape_special { + Some(INTERNAL_SEPARATOR) + } else { + None + }; + } + _ => (), + } + } else if mode == Mode::SingleQuotes { + if c == '\\' { + // A backslash may or may not escape something in single quotes. 
+ match input.char_at(input_position + 1) { + '\\' | '\'' => { + to_append_or_none = Some(input.char_at(input_position + 1)); + input_position += 1; // skip over the backslash + } + '\0' => { + if !allow_incomplete { + errored = true; + } else { + // PCA this line had the following cryptic comment: 'We may ever escape + // a NULL character, but still appending a \ in case I am wrong.' Not + // sure what it means or the importance of this. + input_position += 1; /* Skip over the backslash */ + to_append_or_none = Some('\\'); + } + } + _ => { + // Literal backslash that doesn't escape anything! Leave things alone; we'll + // append the backslash itself. + } + } + } else if c == '\'' { + to_append_or_none = if unescape_special { + Some(INTERNAL_SEPARATOR) + } else { + None + }; + mode = Mode::Unquoted; + } + } else if mode == Mode::DoubleQuotes { + match c { + '"' => { + mode = Mode::Unquoted; + to_append_or_none = if unescape_special { + Some(INTERNAL_SEPARATOR) + } else { + None + }; + } + '\\' => { + match input.char_at(input_position + 1) { + '\0' => { + if !allow_incomplete { + errored = true; + } else { + to_append_or_none = Some('\0'); + } + } + '\\' | '$' | '"' => { + to_append_or_none = Some(input.char_at(input_position + 1)); + input_position += 1; /* Skip over the backslash */ + } + '\n' => { + /* Swallow newline */ + to_append_or_none = None; + input_position += 1; /* Skip over the backslash */ + } + _ => { + /* Literal backslash that doesn't escape anything! Leave things alone; + * we'll append the backslash itself */ + } + } + } + '$' => { + if unescape_special { + to_append_or_none = Some(VARIABLE_EXPAND_SINGLE); + vars_or_seps.push(input_position); + } + } + _ => (), + } + } + + // Now maybe append the char. + if let Some(c) = to_append_or_none { + result.push(c); + } + input_position += 1; + } + + // Return the string by reference, and then success. 
+ if errored { + return None; + } + Some(result) +} + +/// Reverse the effects of `escape_string_url()`. By definition the string has consist of just ASCII +/// chars. +fn unescape_string_url(input: &wstr) -> Option { + let mut result: Vec = vec![]; + let mut i = 0; + while i < input.len() { + let c = input.char_at(i); + if c > '\u{7F}' { + return None; // invalid character means we can't decode the string + } + if c == '%' { + let c1 = input.char_at(i + 1); + if c1 == '\0' { + return None; + } else if c1 == '%' { + result.push(b'%'); + i += 1; + } else { + let c2 = input.char_at(i + 2); + if c2 == '\0' { + return None; // string ended prematurely + } + let d1 = c1.to_digit(16)?; + let d2 = c2.to_digit(16)?; + result.push((16 * d1 + d2) as u8); + i += 2; + } + } else { + result.push(c as u8); + } + i += 1 + } + + Some(str2wcstring(&result)) +} + +/// Reverse the effects of `escape_string_var()`. By definition the string has consist of just ASCII +/// chars. +fn unescape_string_var(input: &wstr) -> Option { + let mut result: Vec = vec![]; + let mut prev_was_hex_encoded = false; + let mut i = 0; + while i < input.len() { + let c = input.char_at(i); + if c > '\u{7F}' { + return None; // invalid character means we can't decode the string + } + if c == '_' { + let c1 = input.char_at(i + 1); + if c1 == '\0' { + if prev_was_hex_encoded { + break; + } + return None; // found unexpected escape char at end of string + } + if c1 == '_' { + result.push(b'_'); + i += 1; + } else if ('0'..='9').contains(&c1) || ('A'..='F').contains(&c1) { + let c2 = input.char_at(i + 2); + if c2 == '\0' { + return None; // string ended prematurely + } + let d1 = convert_hex_digit(c1)?; + let d2 = convert_hex_digit(c2)?; + result.push((16 * d1 + d2) as u8); + i += 2; + prev_was_hex_encoded = true; + } + // No "else" clause because if the first char after an underscore is not another + // underscore or a valid hex character then the underscore is there to improve + // readability after we've 
encoded a character not valid in a var name. + } else { + result.push(c as u8); + } + i += 1; + } + + Some(str2wcstring(&result)) +} + +/// Given a string starting with a backslash, read the escape as if it is unquoted, appending +/// to result. Return the number of characters consumed, or none on error. +pub fn read_unquoted_escape( + input: &wstr, + result: &mut WString, + allow_incomplete: bool, + unescape_special: bool, +) -> Option { + assert!(input.char_at(0) == '\\', "not an escape"); + + // Here's the character we'll ultimately append, or none. Note that '\0' is a + // valid thing to append. + let mut result_char_or_none: Option = None; + + let mut errored = false; + let mut in_pos = 1; // in_pos always tracks the next character to read (and therefore the number + // of characters read so far) + + // For multibyte \X sequences. + let mut byte_buff: Vec = vec![]; + + loop { + let c = input.char_at(in_pos); + in_pos += 1; + match c { + // A null character after a backslash is an error. + '\0' => { + // Adjust in_pos to only include the backslash. + assert!(in_pos > 0); + in_pos -= 1; + + // It's an error, unless we're allowing incomplete escapes. + if !allow_incomplete { + errored = true; + } + } + // Numeric escape sequences. No prefix means octal escape, otherwise hexadecimal. + '0'..='7' | 'u' | 'U' | 'x' | 'X' => { + let mut res: u64 = 0; + let mut chars = 2; + let mut base = 16; + let mut byte_literal = false; + let mut max_val = ASCII_MAX; + + match c { + 'u' => { + chars = 4; + max_val = UCS2_MAX; + } + 'U' => { + chars = 8; + // Don't exceed the largest Unicode code point - see #1107. + max_val = char::MAX; + } + 'x' | 'X' => { + byte_literal = true; + max_val = BYTE_MAX; + } + _ => { + base = 8; + chars = 3; + // Note that in_pos currently is just after the first post-backslash + // character; we want to start our escape from there. 
+ assert!(in_pos > 0); + in_pos -= 1; + } + } + + for i in 0..chars { + let Some(d) = input.char_at(in_pos).to_digit(base) else { + // If we have no digit, this is a tokenizer error. + if i == 0 { + errored = true; + } + break; + }; + + res = (res * u64::from(base)) + u64::from(d); + in_pos += 1; + } + + if !errored && res <= u64::from(max_val) { + if byte_literal { + // Multibyte encodings necessitate that we keep adjacent byte escapes. + // - `\Xc3\Xb6` is "ö", but only together. + // (this assumes a valid codepoint can't consist of multiple bytes + // that are valid on their own, which is true for UTF-8) + byte_buff.push(res.try_into().unwrap()); + result_char_or_none = None; + if input[in_pos..].starts_with("\\X") || input[in_pos..].starts_with("\\x") + { + in_pos += 1; + continue; + } + } else { + result_char_or_none = + Some(char::from_u32(res.try_into().unwrap()).unwrap_or('\u{FFFD}')); + } + } else { + errored = true; + } + } + // \a means bell (alert). + 'a' => { + result_char_or_none = Some('\x07'); + } + // \b means backspace. + 'b' => { + result_char_or_none = Some('\x08'); + } + // \cX means control sequence X. + 'c' => { + let sequence_char = u32::from(input.char_at(in_pos)); + in_pos += 1; + if sequence_char >= u32::from('a') && sequence_char <= u32::from('a') + 32 { + result_char_or_none = + Some(char::from_u32(sequence_char - u32::from('a') + 1).unwrap()); + } else if sequence_char >= u32::from('A') && sequence_char <= u32::from('A') + 32 { + result_char_or_none = + Some(char::from_u32(sequence_char - u32::from('A') + 1).unwrap()); + } else { + errored = true; + } + } + // \x1B means escape. + 'e' => { + result_char_or_none = Some('\x1B'); + } + // \f means form feed. + 'f' => { + result_char_or_none = Some('\x0C'); + } + // \n means newline. + 'n' => { + result_char_or_none = Some('\n'); + } + // \r means carriage return. + 'r' => { + result_char_or_none = Some('\x0D'); + } + // \t means tab. 
/// Converts a single uppercase hexadecimal digit to its numeric value.
///
/// This is a specialization of `char::to_digit()` that only handles base 16 and only uppercase:
/// `0`-`9` map to 0-9 and `A`-`F` map to 10-15. Every other character, including lowercase hex
/// digits and the letters `G`-`Z`, yields `None`.
fn convert_hex_digit(d: char) -> Option<u32> {
    match d {
        '0'..='9' => Some(u32::from(d) - u32::from('0')),
        // Only A-F are hex digits; accepting the rest of the alphabet would produce values >= 16.
        'A'..='F' => Some(10 + u32::from(d) - u32::from('A')),
        _ => None,
    }
}

/// Returns the character that lies `offset` code points after `base`.
///
/// This is a `const fn` so it can be used to define character constants.
///
/// # Panics
///
/// Panics if `base + offset` is not a valid `char` (a surrogate or a value above `char::MAX`).
pub const fn char_offset(base: char, offset: u32) -> char {
    match char::from_u32(base as u32 + offset) {
        Some(c) => c,
        None => panic!("not a valid char"),
    }
}
+pub fn get_ellipsis_char() -> char { + char::from_u32(ELLIPSIS_CHAR.load(Ordering::Relaxed)).unwrap() +} + +static ELLIPSIS_CHAR: AtomicU32 = AtomicU32::new(0); + +/// The character or string to use where text has been truncated (ellipsis if possible, otherwise +/// ...) +pub static mut ELLIPSIS_STRING: Lazy<&'static wstr> = Lazy::new(|| L!("")); + +/// Character representing an omitted newline at the end of text. +pub fn get_omitted_newline_str() -> &'static wstr { + unsafe { &OMITTED_NEWLINE_STR } +} + +static mut OMITTED_NEWLINE_STR: Lazy<&'static wstr> = Lazy::new(|| L!("")); + +pub fn get_omitted_newline_width() -> usize { + unsafe { OMITTED_NEWLINE_STR.len() } +} + +static OBFUSCATION_READ_CHAR: AtomicU32 = AtomicU32::new(0); + +pub fn get_obfuscation_read_char() -> char { + char::from_u32(OBFUSCATION_READ_CHAR.load(Ordering::Relaxed)).unwrap() +} + +/// Profiling flag. True if commands should be profiled. +pub static G_PROFILING_ACTIVE: RelaxedAtomicBool = RelaxedAtomicBool::new(false); + +/// Name of the current program. Should be set at startup. Used by the debug function. +pub static mut PROGRAM_NAME: Lazy<&'static wstr> = Lazy::new(|| L!("")); + +#[cfg(windows)] +/// Set to false if it's been determined we can't trust the last modified timestamp on the tty. +pub const HAS_WORKING_TTY_TIMESTAMPS: bool = false; +#[cfg(not(windows))] +/// Set to false if it's been determined we can't trust the last modified timestamp on the tty. +pub const HAS_WORKING_TTY_TIMESTAMPS: bool = true; + +/// A global, empty string. This is useful for functions which wish to return a reference to an +/// empty string. +pub static G_EMPTY_STRING: WString = WString::new(); + +/// A global, empty wcstring_list_t. This is useful for functions which wish to return a reference +/// to an empty string. +pub static G_EMPTY_STRING_LIST: Vec = vec![]; + +/// A function type to check for cancellation. +/// \return true if execution should cancel. 
+pub type CancelChecker = dyn Fn() -> bool; + +/// Converts the narrow character string \c in into its wide equivalent, and return it. +/// +/// The string may contain embedded nulls. +/// +/// This function encodes illegal character sequences in a reversible way using the private use +/// area. +pub fn str2wcstring(inp: &[u8]) -> WString { + if inp.is_empty() { + return WString::new(); + } + + let mut result = WString::new(); + result.reserve(inp.len()); + let mut pos = 0; + let mut state = zero_mbstate(); + while pos < inp.len() { + // Append any initial sequence of ascii characters. + // Note we do not support character sets which are not supersets of ASCII. + let ascii_prefix_length = count_ascii_prefix(&inp[pos..]); + result.push_str(std::str::from_utf8(&inp[pos..pos + ascii_prefix_length]).unwrap()); + pos += ascii_prefix_length; + assert!(pos <= inp.len(), "Position overflowed length"); + if pos == inp.len() { + break; + } + + // We have found a non-ASCII character. + let mut ret = 0; + let mut c = '\0'; + + let use_encode_direct = if inp[pos] & 0xF8 == 0xF8 { + // Protect against broken mbrtowc() implementations which attempt to encode UTF-8 + // sequences longer than four bytes (e.g., OS X Snow Leopard). + // TODO This check used to be conditionally compiled only on affected platforms. + true + } else { + const _: () = assert!(mem::size_of::() == mem::size_of::()); + let mut codepoint = u32::from(c); + ret = unsafe { + mbrtowc( + std::ptr::addr_of_mut!(codepoint).cast(), + std::ptr::addr_of!(inp[pos]).cast(), + inp.len() - pos, + std::ptr::addr_of_mut!(state), + ) + }; + match char::from_u32(codepoint) { + Some(codepoint) => { + c = codepoint; + // Determine whether to encode this character with our crazy scheme. + (c >= ENCODE_DIRECT_BASE && c < ENCODE_DIRECT_END) + || + c == INTERNAL_SEPARATOR + || + // Incomplete sequence. + ret == 0_usize.wrapping_sub(2) + || + // Invalid data. + ret == 0_usize.wrapping_sub(1) + || + // Other error codes? 
Terrifying, should never happen. + ret > inp.len() - pos + } + None => true, + } + }; + + if use_encode_direct { + c = encode_byte_to_char(inp[pos]); + result.push(c); + pos += 1; + state = zero_mbstate(); + } else if ret == 0 { + // embedded null byte! + result.push('\0'); + pos += 1; + state = zero_mbstate(); + } else { + // normal case + result.push(c); + pos += ret; + } + } + result +} + +/// Returns a newly allocated multibyte character string equivalent of the specified wide character +/// string. +/// +/// This function decodes illegal character sequences in a reversible way using the private use +/// area. +pub fn wcs2string(input: &wstr) -> Vec { + if input.is_empty() { + return vec![]; + } + + let mut result = vec![]; + wcs2string_appending(&mut result, input); + result +} + +pub fn wcs2zstring(input: &wstr) -> CString { + if input.is_empty() { + return CString::default(); + } + + let mut result = vec![]; + // result.reserve(input.len()); + wcs2string_callback(input, |buff| { + result.extend_from_slice(buff); + true + }); + let until_nul = match result.iter().position(|c| *c == b'\0') { + Some(pos) => &result[..pos], + None => &result[..], + }; + CString::new(until_nul).unwrap() +} + +/// Like wcs2string, but appends to \p receiver instead of returning a new string. +pub fn wcs2string_appending(output: &mut Vec, input: &wstr) { + output.reserve(input.len()); + wcs2string_callback(input, |buff| { + output.extend_from_slice(buff); + true + }); +} + +/// \return the count of initial characters in \p in which are ASCII. +fn count_ascii_prefix(inp: &[u8]) -> usize { + // The C++ version had manual vectorization. + inp.iter().take_while(|c| c.is_ascii()).count() +} + +// Check if we are running in the test mode, where we should suppress error output +#[widestrs] +pub const TESTS_PROGRAM_NAME: &wstr = "(ignore)"L; + +/// Hack to not print error messages in the tests. Do not call this from functions in this module +/// like `debug()`. 
It is only intended to suppress diagnostic noise from testing things like the +/// fish parser where we expect a lot of diagnostic messages due to testing error conditions. +pub fn should_suppress_stderr_for_tests() -> bool { + unsafe { !PROGRAM_NAME.is_empty() && *PROGRAM_NAME != TESTS_PROGRAM_NAME } +} + +fn assert_is_main_thread() { + assert!(is_main_thread() || THREAD_ASSERTS_CFG_FOR_TESTING.load()); +} + +fn assert_is_background_thread() { + assert!(!is_main_thread() || THREAD_ASSERTS_CFG_FOR_TESTING.load()); +} + +static THREAD_ASSERTS_CFG_FOR_TESTING: RelaxedAtomicBool = RelaxedAtomicBool::new(false); + +thread_local! { + static TL_TID: RefCell = RefCell::new(0); +} + +static S_LAST_THREAD_ID: AtomicU64 = AtomicU64::new(0); +fn next_thread_id() -> u64 { + // Note 0 is an invalid thread id. + // Note fetch_add is a CAS which returns the value *before* the modification. + 1 + S_LAST_THREAD_ID.fetch_add(1, Ordering::Relaxed) +} + +fn thread_id() -> u64 { + TL_TID.with(|tid| { + if *tid.borrow() == 0 { + *tid.borrow_mut() = next_thread_id() + } + *tid.borrow() + }) +} + +/// Format the specified size (in bytes, kilobytes, etc.) into the specified stringbuffer. +#[widestrs] +fn format_size(mut sz: i64) -> WString { + let mut result = WString::new(); + const sz_names: [&wstr; 8] = ["kB"L, "MB"L, "GB"L, "TB"L, "PB"L, "EB"L, "ZB"L, "YB"L]; + if sz < 0 { + result += "unknown"L; + } else if sz == 0 { + result += wgettext!("empty"); + } else if sz < 1024 { + result += &sprintf!("%lldB"L, sz)[..]; + } else { + for (i, sz_name) in sz_names.iter().enumerate() { + if sz < (1024 * 1024) || i == sz_names.len() - 1 { + let isz = sz / 1024; + if isz > 9 { + result += &sprintf!("%ld%ls"L, isz, *sz_name)[..]; + } else { + result += &sprintf!("%.1f%ls"L, sz as f64 / 1024.0, *sz_name)[..]; + } + break; + } + sz /= 1024; + } + } + + result +} + +/// Version of format_size that does not allocate memory. 
/// Writes a human-readable rendering of `sz` bytes into `buff` as a NUL-terminated ASCII string.
///
/// The buffer is zeroed first and at most `buff.len() - 1` bytes are written, so the result is
/// always NUL-terminated. The output is `empty` for 0, `<n>B` below 1024, and otherwise the value
/// scaled by powers of 1024 followed by a unit from kB to YB. Scaled values of 9 or less get one
/// fractional digit (truncated, not rounded) when the division is inexact: 1536 gives `1.5kB`.
fn format_size_safe(buff: &mut [u8; 128], mut sz: u64) {
    const SZ_NAMES: [&str; 8] = ["kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"];
    const DIGITS: &str = "0123456789";

    let max_len = buff.len() - 1; // need to leave room for a null terminator
    buff.fill(0);
    let mut idx = 0;
    if sz == 0 {
        append_str(buff, "empty", &mut idx, max_len);
    } else if sz < 1024 {
        append_ull(buff, &mut sz, &mut idx, max_len);
        append_str(buff, "B", &mut idx, max_len);
    } else {
        for (i, sz_name) in SZ_NAMES.iter().enumerate() {
            // Stop at the first unit where the value fits, or at the largest unit we know.
            if sz < (1024 * 1024) || i == SZ_NAMES.len() - 1 {
                let mut isz = sz / 1024;
                append_ull(buff, &mut isz, &mut idx, max_len);
                if isz <= 9 {
                    // Maybe append a single fraction digit.
                    let remainder = sz % 1024;
                    if remainder > 0 {
                        // remainder < 1024, so this is always in 0..=9.
                        let tenths = (remainder * 10 / 1024) as usize;
                        append_str(buff, ".", &mut idx, max_len);
                        append_str(buff, &DIGITS[tenths..=tenths], &mut idx, max_len);
                    }
                }
                append_str(buff, sz_name, &mut idx, max_len);
                break;
            }
            sz /= 1024;
        }
    }
}

/// Writes the decimal form of `val` into `buff`, followed by a NUL terminator.
///
/// Negative values are prefixed with `-`. No memory is allocated.
pub fn format_llong_safe<CharT: From<u8>>(buff: &mut [CharT; 64], val: i64) {
    let uval = val.unsigned_abs();
    if val >= 0 {
        format_safe_impl(buff, 64, uval);
    } else {
        buff[0] = CharT::from(b'-');
        format_safe_impl(&mut buff[1..], 63, uval);
    }
}

/// Writes the decimal form of `val` into `buff`, followed by a NUL terminator.
pub fn format_ullong_safe<CharT: From<u8>>(buff: &mut [CharT; 64], val: u64) {
    format_safe_impl(buff, 64, val);
}

/// Shared implementation of the `format_*_safe` functions.
///
/// Writes the decimal digits of `val` starting at `buff[0]` and terminates them with a NUL.
///
/// # Panics
///
/// Panics if the digits plus the terminator do not fit in `size` elements (or in `buff`).
fn format_safe_impl<CharT: From<u8>>(buff: &mut [CharT], size: usize, mut val: u64) {
    let mut idx = 0;
    if val == 0 {
        buff[idx] = CharT::from(b'0');
        idx += 1;
    } else {
        // Generate the string backwards, then reverse it.
        while val != 0 {
            buff[idx] = CharT::from((val % 10) as u8 + b'0');
            idx += 1;
            val /= 10;
        }
        buff[..idx].reverse();
    }
    buff[idx] = CharT::from(b'\0');
    idx += 1;
    assert!(idx <= size, "Buffer overflowed");
}

/// Appends the decimal digits of `*val` to `buff` starting at `*inout_idx`.
///
/// Writing stops once `*inout_idx` reaches `max_len`; `*inout_idx` is advanced past the bytes
/// written. Zero is written as a single `0`. `*val` is not modified; it is taken by mutable
/// reference only to keep the existing signature.
fn append_ull(buff: &mut [u8], val: &mut u64, inout_idx: &mut usize, max_len: usize) {
    // u64::MAX has 20 decimal digits. Collect them least-significant first.
    let mut digits = [0_u8; 20];
    let mut count = 0;
    let mut rest = *val;
    loop {
        digits[count] = b'0' + (rest % 10) as u8;
        count += 1;
        rest /= 10;
        if rest == 0 {
            break;
        }
    }

    let mut idx = *inout_idx;
    for &digit in digits[..count].iter().rev() {
        if idx >= max_len {
            break;
        }
        buff[idx] = digit;
        idx += 1;
    }
    *inout_idx = idx;
}

/// Appends the bytes of `s` to `buff` starting at `*inout_idx`.
///
/// Writing stops once `*inout_idx` reaches `max_len`; `*inout_idx` is advanced past the bytes
/// written.
fn append_str(buff: &mut [u8], s: &str, inout_idx: &mut usize, max_len: usize) {
    let mut idx = *inout_idx;
    for &byte in s.as_bytes() {
        if idx >= max_len {
            break;
        }
        buff[idx] = byte;
        idx += 1;
    }
    *inout_idx = idx;
}

/// Crappy function to extract the most significant digit of an unsigned long long value.
///
/// Returns that digit as an ASCII byte and subtracts its place value from `*xp`, so 907 returns
/// `b'9'` and leaves 7 behind. Note that interior zero digits are skipped by repeated calls.
// No longer called by the formatting helpers above; kept in case it is used elsewhere in the file.
#[allow(dead_code)]
fn extract_most_significant_digit(xp: &mut u64) -> u8 {
    let mut place_value = 1;
    let mut x = *xp;
    while x >= 10 {
        x /= 10;
        place_value *= 10;
    }
    *xp -= place_value * x;
    x as u8 + b'0'
}
+ if can_be_encoded('\u{2026}') { + ELLIPSIS_CHAR.store(u32::from('\u{2026}'), Ordering::Relaxed); + unsafe { + ELLIPSIS_STRING = Lazy::new(|| "\u{2026}"L); + } + } else { + ELLIPSIS_CHAR.store(u32::from('$'), Ordering::Relaxed); // "horizontal ellipsis" + unsafe { + ELLIPSIS_STRING = Lazy::new(|| "..."L); + } + } + + if is_windows_subsystem_for_linux() { + // neither of \u23CE and \u25CF can be displayed in the default fonts on Windows, though + // they can be *encoded* just fine. Use alternative glyphs. + unsafe { + OMITTED_NEWLINE_STR = Lazy::new(|| "\u{00b6}"L); // "pilcrow" + } + OBFUSCATION_READ_CHAR.store(u32::from('\u{2022}'), Ordering::Relaxed); // "bullet" + } else if is_console_session() { + unsafe { + OMITTED_NEWLINE_STR = Lazy::new(|| "^J"L); + } + OBFUSCATION_READ_CHAR.store(u32::from('*'), Ordering::Relaxed); + } else { + if can_be_encoded('\u{23CE}') { + unsafe { + OMITTED_NEWLINE_STR = Lazy::new(|| "\u{23CE}"L); // "return symbol" (⏎) + } + } else { + unsafe { + OMITTED_NEWLINE_STR = Lazy::new(|| "^J"L); + } + } + OBFUSCATION_READ_CHAR.store( + u32::from(if can_be_encoded('\u{25CF}') { + '\u{25CF}' // "black circle" + } else { + '#' + }), + Ordering::Relaxed, + ); + } + G_PROFILING_ACTIVE.store(true); +} + +/// Test if the character can be encoded using the current locale. +fn can_be_encoded(wc: char) -> bool { + let mut converted = [0_i8; AT_LEAST_MB_LEN_MAX]; + let mut state = zero_mbstate(); + unsafe { + wcrtomb( + std::ptr::addr_of_mut!(converted[0]), + wc as libc::wchar_t, + std::ptr::addr_of_mut!(state), + ) != 0_usize.wrapping_sub(1) + } +} + +/// Call read, blocking and repeating on EINTR. Exits on EAGAIN. +/// \return the number of bytes read, or 0 on EOF. On EAGAIN, returns -1 if nothing was read. 
+pub fn read_blocked(fd: i32, buf: &mut [u8]) -> isize {
+    loop {
+        // Hand read(2) the buffer itself. Taking the address of the local `buf` binding would
+        // point at the slice reference on our stack rather than at the bytes it refers to.
+        let res = unsafe { libc::read(fd, buf.as_mut_ptr().cast(), buf.len()) };
+        if res < 0 && errno::errno().0 == EINTR {
+            continue;
+        }
+        return res;
+    }
+}
+
 /// Test if the string is a valid function name.
 pub fn valid_func_name(name: &wstr) -> bool {
     if name.is_empty() {
@@ -123,6 +1255,235 @@ pub fn read_loop<Fd: AsRawFd>(fd: &Fd, buf: &mut [u8]) -> std::io::Result<usize>
     }
 }
 
+/// Write the given paragraph of output, redoing linebreaks to fit \p termsize.
+/// A width of zero disables wrapping. The result always ends with a newline.
+#[widestrs]
+fn reformat_for_screen(msg: &wstr, termsize: &Termsize) -> WString {
+    let mut buff = WString::new();
+
+    let screen_width = termsize.width;
+    if screen_width != 0 {
+        let mut start = 0;
+        let mut pos = start;
+        let mut line_width = 0;
+        while pos < msg.len() {
+            let mut overflow = false;
+            let mut tok_width = 0;
+
+            // Tokenize on whitespace, and also calculate the width of the token.
+            while pos < msg.len() && ![' ', '\n', '\r', '\t'].contains(&msg.char_at(pos)) {
+                // Check is token is wider than one line. If so we mark it as an overflow and break
+                // the token.
+                let width = fish_wcwidth(msg.char_at(pos).into()).0 as isize;
+                if (tok_width + width) > (screen_width - 1) {
+                    overflow = true;
+                    break;
+                }
+                tok_width += width;
+                pos += 1;
+            }
+
+            // If token is zero character long, we don't do anything.
+            if pos == start {
+                pos += 1;
+            } else if overflow {
+                // In case of overflow, we print a newline, except if we already are at position 0.
+                let token = &msg[start..pos];
+                if line_width != 0 {
+                    buff.push('\n');
+                }
+                buff += &sprintf!("%ls-\n"L, token)[..];
+                line_width = 0;
+            } else {
+                // Print the token.
+                let token = &msg[start..pos];
+                let separator_width = if line_width != 0 { 1 } else { 0 };
+                if (line_width + separator_width + tok_width) > screen_width {
+                    buff.push('\n');
+                    line_width = 0;
+                }
+                // Decide on the separator only after the possible line break above, so a token
+                // that starts a fresh line is neither preceded by nor charged for a space.
+                if line_width != 0 {
+                    buff += " "L;
+                    line_width += 1;
+                }
+                buff += token;
+                line_width += tok_width;
+            }
+
+            start = pos;
+        }
+    } else {
+        buff += msg;
+    }
+    buff.push('\n');
+    buff
+}
+
+pub type Timepoint = f64;
+
+/// Return the number of seconds from the UNIX epoch, with subsecond precision. The value is
+/// negative if the system clock is set before the epoch.
+fn timef() -> Timepoint {
+    match time::SystemTime::now().duration_since(time::UNIX_EPOCH) {
+        Ok(difference) => difference.as_secs_f64(),
+        Err(until_epoch) => -until_epoch.duration().as_secs_f64(),
+    }
+}
+
+/// Call the following function early in main to set the main thread. This is our replacement for
+/// pthread_main_np().
+pub fn set_main_thread() {
+    // Just call thread_id() once to force increment of thread_id.
+    let tid = thread_id();
+    assert!(tid == 1, "main thread should have thread ID 1");
+}
+
+/// Return true if the calling thread is the one that has thread ID 1.
+pub fn is_main_thread() -> bool {
+    thread_id() == 1
+}
+
+pub fn configure_thread_assertions_for_testing() {
+    THREAD_ASSERTS_CFG_FOR_TESTING.store(true)
+}
+
+/// This allows us to notice when we've forked.
+static IS_FORKED_PROC: RelaxedAtomicBool = RelaxedAtomicBool::new(false);
+
+/// Clear the forked flag and, the first time this is called, register a pthread_atfork() child
+/// handler that sets it again in any process we fork.
+pub fn setup_fork_guards() {
+    /// Runs in the child process after fork().
+    unsafe extern "C" fn mark_forked_child() {
+        IS_FORKED_PROC.store(true);
+    }
+    static REGISTER_ATFORK: std::sync::Once = std::sync::Once::new();
+
+    IS_FORKED_PROC.store(false);
+    REGISTER_ATFORK.call_once(|| {
+        // SAFETY: the handler only stores to an atomic, and as a plain function it stays valid
+        // for the life of the process.
+        unsafe {
+            libc::pthread_atfork(None, None, Some(mark_forked_child));
+        }
+    });
+}
+
+/// Return true if this process was forked after setup_fork_guards() was called.
+pub fn is_forked_child() -> bool {
+    IS_FORKED_PROC.load()
+}
+
+/// Be able to restore the term's foreground process group.
+/// This is set during startup and not modified after.
+static INITIAL_FG_PROCESS_GROUP: AtomicI32 = AtomicI32::new(-1); // HACK, should be pid_t
+const _: () = assert!(mem::size_of::<i32>() >= mem::size_of::<libc::pid_t>());
+
+/// Save the value of tcgetpgrp so we can restore it on exit.
+pub fn save_term_foreground_process_group() {
+    INITIAL_FG_PROCESS_GROUP.store(unsafe { libc::tcgetpgrp(STDIN_FILENO) }, Ordering::Relaxed);
+}
+
+/// Hand the terminal back to the process group recorded by
+/// save_term_foreground_process_group(), if one was recorded and it is not our own.
+pub fn restore_term_foreground_process_group_for_exit() {
+    // We wish to restore the tty to the initial owner. There's two ways this can go wrong:
+    //  1. We may steal the tty from someone else (#7060).
+    //  2. The call to tcsetpgrp may deliver SIGSTOP to us, and we will not exit.
+    // Hanging on exit seems worse, so ensure that SIGTTOU is ignored so we do not get SIGSTOP.
+    // Note initial_fg_process_group == 0 is possible with Linux pid namespaces.
+    // This is called during shutdown and from a signal handler. We don't bother to complain on
+    // failure because doing so is unlikely to be noticed.
+    let initial_fg_process_group = INITIAL_FG_PROCESS_GROUP.load(Ordering::Relaxed);
+    if initial_fg_process_group > 0 && initial_fg_process_group != unsafe { libc::getpgrp() } {
+        unsafe {
+            libc::signal(SIGTTOU, SIG_IGN);
+            libc::tcsetpgrp(STDIN_FILENO, initial_fg_process_group);
+        }
+    }
+}
+
+/// Determines if we are running under Microsoft's Windows Subsystem for Linux to work around
+/// some known limitations and/or bugs.
+/// See https://github.com/Microsoft/WSL/issues/423 and Microsoft/WSL#2997
+pub fn is_windows_subsystem_for_linux() -> bool {
+    // The probe runs once, on first use; afterwards this is just a read of the cached answer.
+    *IS_WINDOWS_SUBSYSTEM_FOR_LINUX
+}
+
+/// Return true if `b` occurs as a contiguous run inside `a`. An empty `b` is always contained.
+fn slice_contains_slice<T: Eq>(a: &[T], b: &[T]) -> bool {
+    // `windows(0)` panics, so answer the empty-needle case up front.
+    b.is_empty() || a.windows(b.len()).any(|aw| aw == b)
+}
+
+// WSL presents itself as a Linux system, so the uname(2) probe is only meaningful (and only
+// built) for Linux targets.
+#[cfg(not(target_os = "linux"))]
+static IS_WINDOWS_SUBSYSTEM_FOR_LINUX: Lazy<bool> = Lazy::new(|| false);
+#[cfg(target_os = "linux")]
+static IS_WINDOWS_SUBSYSTEM_FOR_LINUX: Lazy<bool> = Lazy::new(|| {
+    let mut info: libc::utsname = unsafe { mem::zeroed() };
+    if unsafe { libc::uname(std::ptr::addr_of_mut!(info)) } != 0 {
+        return false;
+    }
+    // utsname.release is a nul-terminated C string; look only at the bytes before the nul.
+    let release: Vec<u8> = info
+        .release
+        .iter()
+        .map(|&c| c as u8)
+        .take_while(|&b| b != 0)
+        .collect();
+
+    // Sample utsname.release under WSL, testing for something like `4.4.0-17763-Microsoft`
+    if !slice_contains_slice(&release[..], &b"Microsoft"[..]) {
+        return false;
+    }
+
+    // The Windows build number is the run of digits following the first dash.
+    let build_is_known_good = release
+        .iter()
+        .position(|&b| b == b'-')
+        .map_or(false, |dash| {
+            let build = release[dash + 1..]
+                .iter()
+                .take_while(|b| b.is_ascii_digit())
+                .fold(0_u64, |acc, &b| {
+                    acc.saturating_mul(10).saturating_add(u64::from(b - b'0'))
+                });
+            build >= 17763
+        });
+
+    // #5298, #5661: There are acknowledged, published, and (later) fixed issues with
+    // job control under early WSL releases that prevent fish from running correctly,
+    // with unexpected failures when piping. Fish 3.0 nightly builds worked around this
+    // issue with some needlessly complicated code that was later stripped from the
+    // fish 3.0 release, so we just bail. Note that fish 2.0 was also broken, but we
+    // just didn't warn about it.
+
+    // #6038 & 5101bde: It's been requested that there be some sort of way to disable
+    // this check: if the environment variable FISH_NO_WSL_CHECK is present, this test
+    // is bypassed. We intentionally do not include this in the error message because
+    // it'll only allow fish to run but not to actually work. Here be dragons!
+    if !build_is_known_good && env::var("FISH_NO_WSL_CHECK") == Err(env::VarError::NotPresent) {
+        crate::flog::FLOG!(
+            error,
+            "This version of WSL has known bugs that prevent fish from working. \
+             Please upgrade to Windows 10 1809 (17763) or higher to use fish!"
+        );
+    }
+    // Any release string mentioning Microsoft is WSL, whether or not the build is a good one.
+    true
+});
+
+/// Return true if the character is in a range reserved for fish's private use.
+///
+/// NOTE: This is used when tokenizing the input. It is also used when reading input, before
+/// tokenization, to replace such chars with REPLACEMENT_WCHAR if they're not part of a quoted
+/// string. We don't want external input to be able to feed reserved characters into our
+/// lexer/parser or code evaluator.
+//
+// TODO: Actually implement the replacement as documented above.
+pub fn fish_reserved_codepoint(c: char) -> bool {
+    (c >= RESERVED_CHAR_BASE && c < RESERVED_CHAR_END)
+        || (c >= ENCODE_DIRECT_BASE && c < ENCODE_DIRECT_END)
+}
+
+/// Point each of stdin, stdout and stderr at /dev/null if querying its terminal attributes
+/// fails with EIO.
+pub fn redirect_tty_output() {
+    unsafe {
+        let mut t: libc::termios = mem::zeroed();
+        let s = CString::new("/dev/null").unwrap();
+        let fd = libc::open(s.as_ptr(), O_WRONLY);
+        assert!(fd != -1, "Could not open /dev/null!");
+        for stdfd in [STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO] {
+            if libc::tcgetattr(stdfd, std::ptr::addr_of_mut!(t)) == -1 && errno::errno().0 == EIO {
+                libc::dup2(fd, stdfd);
+            }
+        }
+        // The duplicates (if any) keep /dev/null open; don't leak the descriptor we opened.
+        libc::close(fd);
+    }
+}
+
+/// Test if the given char is valid in a variable name.
+pub fn valid_var_name_char(chr: char) -> bool {
+    fish_iswalnum(chr) || chr == '_'
+}
+
+/// Test if the given string is a valid variable name.
+fn valid_var_name(s: &wstr) -> bool {
+    // Note do not use c_str(), we want to fail on embedded nul bytes.
+    !s.is_empty() && s.chars().all(valid_var_name_char)
+}
+
+/// Get the absolute path to the fish executable itself
+/// Falls back to `argv0` as given if the current executable cannot be determined.
+fn get_executable_path(argv0: &str) -> PathBuf {
+    std::env::current_exe().unwrap_or_else(|_| PathBuf::from_str(argv0).unwrap())
+}
+
 /// Like [`std::mem::replace()`] but provides a reference to the old value in a callback to obtain
 /// the replacement value. Useful to avoid errors about multiple references (`&mut T` for `old` then
 /// `&T` again in the `new` expression).
@@ -131,6 +1492,8 @@ pub fn replace_with<T, F: FnOnce(&T) -> T>(old: &mut T, with: F) -> T {
     std::mem::replace(old, new)
 }
 
+pub type Cleanup<T, F, C> = ScopeGuard<T, F, C>;
+
 /// A RAII cleanup object. Unlike in C++ where there is no borrow checker, we can't just provide a
 /// callback that modifies live objects willy-nilly because then there would be two &mut references
 /// to the same object - the original variables we keep around to use and their captured references
@@ -260,6 +1623,46 @@ pub const fn assert_send<T: Send>() {}
 
 pub const fn assert_sync<T: Sync>() {}
 
+/// This function attempts to distinguish between a console session (at the actual login vty) and a
+/// session within a terminal emulator inside a desktop environment or over SSH. Unfortunately
+/// there are few values of $TERM that we can interpret as being exclusively console sessions, and
+/// most common operating systems do not use them. The value is cached for the duration of the fish
+/// session. We err on the side of assuming it's not a console session. This approach isn't
+/// bullet-proof and that's OK.
+fn is_console_session() -> bool {
+    *CONSOLE_SESSION
+}
+
+static CONSOLE_SESSION: Lazy<bool> = Lazy::new(|| {
+    const PATH_MAX: usize = libc::PATH_MAX as _;
+    let mut tty_name: [u8; PATH_MAX] = [0; PATH_MAX];
+    if unsafe {
+        libc::ttyname_r(
+            STDIN_FILENO,
+            std::ptr::addr_of_mut!(tty_name).cast(),
+            PATH_MAX,
+        )
+    } != 0
+    {
+        return false;
+    }
+    // Test that the tty matches /dev/(console|dcons|tty[uv\d])
+    let len = "/dev/tty".len();
+    (
+        (
+            tty_name.starts_with(b"/dev/tty") &&
+                ([b'u', b'v'].contains(&tty_name[len]) || tty_name[len].is_ascii_digit())
+        ) ||
+        tty_name.starts_with(b"/dev/dcons\0") ||
+        tty_name.starts_with(b"/dev/console\0"))
+    // and that $TERM is simple, e.g. `xterm` or `vt100`, not `xterm-something`
+    && match env::var("TERM") {
+        Ok(term) => !term.contains('-') || term == "sun-color",
+        Err(env::VarError::NotPresent) => true,
+        Err(_) => false,
+    }
+});
+
 /// Asserts that a slice is alphabetically sorted by a [`&wstr`] `name` field.
 ///
 /// Mainly useful for static asserts/const eval.
@@ -320,11 +1723,15 @@ macro_rules! assert_sorted_by_name {
         assert_sorted_by_name!($slice, name);
     };
 }
+
 mod tests {
-    use crate::{
-        common::{escape_string, EscapeStringStyle},
-        wchar::widestrs,
+    use crate::common::{
+        escape_string, str2wcstring, wcs2string, EscapeStringStyle, ENCODE_DIRECT_BASE,
+        ENCODE_DIRECT_END,
     };
+    use crate::wchar::widestrs;
+    use crate::wutil::encoding::{wcrtomb, zero_mbstate, AT_LEAST_MB_LEN_MAX};
+    use rand::random;
 
     #[widestrs]
     pub fn test_escape_string() {
@@ -333,8 +1740,8 @@ mod tests {
         // plain text should not be needlessly escaped
         assert_eq!(regex("hello world!"L), "hello world!"L);
 
-        // all the following are intended to be ultimately matched literally - even if they don't look
-        // like that's the intent - so we escape them.
+        // all the following are intended to be ultimately matched literally - even if they
+        // don't look like that's the intent - so we escape them.
         assert_eq!(regex(".ext"L), "\\.ext"L);
         assert_eq!(regex("{word}"L), "\\{word\\}"L);
         assert_eq!(regex("hola-mundo"L), "hola\\-mundo"L);
@@ -347,6 +1754,150 @@ mod tests {
             "not really escaped\\\\\\?"L
         );
     }
+
+    /// The number of tests to run.
+    const ESCAPE_TEST_COUNT: usize = 100000;
+    /// The average length of strings to unescape.
+    const ESCAPE_TEST_LENGTH: usize = 100;
+    /// The highest character number of character to try and escape.
+    const ESCAPE_TEST_CHAR: usize = 4000;
+
+    /// Helper to convert a narrow string to a sequence of hex digits.
+    fn str2hex(input: &[u8]) -> String {
+        let mut output = "".to_string();
+        for byte in input {
+            // Zero-pad so every byte renders as exactly two hex digits.
+            output += &format!("0x{:02X} ", *byte);
+        }
+        output
+    }
+
+    /// Test wide/narrow conversion by creating random strings and verifying that the original
+    /// string comes back through double conversion.
+ pub fn test_convert() { + for _ in 0..ESCAPE_TEST_COUNT { + let mut origin: Vec = vec![]; + while (random::() % ESCAPE_TEST_LENGTH) != 0 { + let byte = random(); + origin.push(byte); + } + + let w = str2wcstring(&origin[..]); + let n = wcs2string(&w); + assert_eq!( + origin, + n, + "Conversion cycle of string:\n{:4} chars: {}\n\ + produced different string:\n\ + {:4} chars: {}", + origin.len(), + &str2hex(&origin), + n.len(), + &str2hex(&n) + ); + } + } + + /// Verify that ASCII narrow->wide conversions are correct. + pub fn test_convert_ascii() { + let mut s = vec![b'\0'; 4096]; + for (i, c) in s.iter_mut().enumerate() { + *c = u8::try_from(i % 10).unwrap() + b'0'; + } + + // Test a variety of alignments. + for left in 0..16 { + for right in 0..16 { + let len = s.len() - left - right; + let input = &s[left..left + len]; + let wide = str2wcstring(input); + let narrow = wcs2string(&wide); + assert_eq!(narrow, input); + } + } + + // Put some non-ASCII bytes in and ensure it all still works. + for i in 0..s.len() { + let saved = s[i]; + s[i] = 0xF7; + assert_eq!(wcs2string(&str2wcstring(&s)), s); + s[i] = saved; + } + } + /// fish uses the private-use range to encode bytes that could not be decoded using the + /// user's locale. If the input could be decoded, but decoded to private-use codepoints, + /// then fish should also use the direct encoding for those bytes. Verify that characters + /// in the private use area are correctly round-tripped. See #7723. + pub fn test_convert_private_use() { + for c in ENCODE_DIRECT_BASE..ENCODE_DIRECT_END { + // Encode the char via the locale. Do not use fish functions which interpret these + // specially. + let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX]; + let mut state = zero_mbstate(); + let len = unsafe { + wcrtomb( + std::ptr::addr_of_mut!(converted[0]).cast(), + c as libc::wchar_t, + std::ptr::addr_of_mut!(state), + ) + }; + if len == 0_usize.wrapping_sub(1) { + // Could not be encoded in this locale. 
+ continue; + } + let s = &converted[..len]; + + // Ask fish to decode this via str2wcstring. + // str2wcstring should notice that the decoded form collides with its private use + // and encode it directly. + let ws = str2wcstring(s); + + // Each byte should be encoded directly, and round tripping should work. + assert_eq!(ws.len(), s.len()); + assert_eq!(wcs2string(&ws), s); + } + } } crate::ffi_tests::add_test!("escape_string", tests::test_escape_string); +crate::ffi_tests::add_test!("escape_string", tests::test_convert); +crate::ffi_tests::add_test!("escape_string", tests::test_convert_ascii); +crate::ffi_tests::add_test!("escape_string", tests::test_convert_private_use); + +#[cxx::bridge] +mod common_ffi { + extern "C++" { + include!("wutil.h"); + include!("common.h"); + type escape_string_style_t = crate::ffi::escape_string_style_t; + } + extern "Rust" { + fn rust_unescape_string( + input: *const wchar_t, + len: usize, + escape_special: u32, + style: escape_string_style_t, + ) -> UniquePtr; + } +} + +fn rust_unescape_string( + input: *const ffi::wchar_t, + len: usize, + escape_special: u32, + style: ffi::escape_string_style_t, +) -> UniquePtr { + let style = match style { + ffi::escape_string_style_t::STRING_STYLE_SCRIPT => { + UnescapeStringStyle::Script(UnescapeFlags::from_bits(escape_special).unwrap()) + } + ffi::escape_string_style_t::STRING_STYLE_URL => UnescapeStringStyle::Url, + ffi::escape_string_style_t::STRING_STYLE_VAR => UnescapeStringStyle::Var, + _ => panic!(), + }; + let input = unsafe { slice::from_raw_parts(input, len) }; + let input = wstr::from_slice(input).unwrap(); + match unescape_string(input, style) { + Some(result) => result.to_ffi(), + None => UniquePtr::null(), + } +} diff --git a/fish-rust/src/compat.c b/fish-rust/src/compat.c new file mode 100644 index 000000000..a32885dde --- /dev/null +++ b/fish-rust/src/compat.c @@ -0,0 +1,3 @@ +#include + +size_t C_MB_CUR_MAX() { return MB_CUR_MAX; } diff --git a/fish-rust/src/compat.rs 
b/fish-rust/src/compat.rs new file mode 100644 index 000000000..32cec77ba --- /dev/null +++ b/fish-rust/src/compat.rs @@ -0,0 +1,8 @@ +#[allow(non_snake_case)] +pub fn MB_CUR_MAX() -> usize { + unsafe { C_MB_CUR_MAX() } +} + +extern "C" { + fn C_MB_CUR_MAX() -> usize; +} diff --git a/fish-rust/src/env.rs b/fish-rust/src/env.rs index 38a3b18bf..2b76043b9 100644 --- a/fish-rust/src/env.rs +++ b/fish-rust/src/env.rs @@ -38,6 +38,11 @@ pub mod flags { c_int(i32::from(val.bits())) } } + impl From for u16 { + fn from(val: EnvMode) -> Self { + val.bits() + } + } } /// Return values for `env_stack_t::set()`. diff --git a/fish-rust/src/expand.rs b/fish-rust/src/expand.rs index 1d8e136bf..2546e3468 100644 --- a/fish-rust/src/expand.rs +++ b/fish-rust/src/expand.rs @@ -1,39 +1,34 @@ -use crate::wchar::{EXPAND_RESERVED_BASE, EXPAND_RESERVED_END}; +use crate::common::{char_offset, EXPAND_RESERVED_BASE, EXPAND_RESERVED_END}; +use crate::wchar::wstr; +use widestring_suffix::widestrs; -/// Private use area characters used in expansions -#[repr(u32)] -pub enum ExpandChars { - /// Character representing a home directory. - HomeDirectory = EXPAND_RESERVED_BASE as u32, - /// Character representing process expansion for %self. - ProcessExpandSelf, - /// Character representing variable expansion. - VariableExpand, - /// Character representing variable expansion into a single element. - VariableExpandSingle, - /// Character representing the start of a bracket expansion. - BraceBegin, - /// Character representing the end of a bracket expansion. - BraceEnd, - /// Character representing separation between two bracket elements. - BraceSep, - /// Character that takes the place of any whitespace within non-quoted text in braces - BraceSpace, - /// Separate subtokens in a token with this character. - InternalSeparator, - /// Character representing an empty variable expansion. Only used transitively while expanding - /// variables. 
- VariableExpandEmpty, -} +/// Character representing a home directory. +pub const HOME_DIRECTORY: char = char_offset(EXPAND_RESERVED_BASE, 0); +/// Character representing process expansion for %self. +pub const PROCESS_EXPAND_SELF: char = char_offset(EXPAND_RESERVED_BASE, 1); +/// Character representing variable expansion. +pub const VARIABLE_EXPAND: char = char_offset(EXPAND_RESERVED_BASE, 2); +/// Character representing variable expansion into a single element. +pub const VARIABLE_EXPAND_SINGLE: char = char_offset(EXPAND_RESERVED_BASE, 3); +/// Character representing the start of a bracket expansion. +pub const BRACE_BEGIN: char = char_offset(EXPAND_RESERVED_BASE, 4); +/// Character representing the end of a bracket expansion. +pub const BRACE_END: char = char_offset(EXPAND_RESERVED_BASE, 5); +/// Character representing separation between two bracket elements. +pub const BRACE_SEP: char = char_offset(EXPAND_RESERVED_BASE, 6); +/// Character that takes the place of any whitespace within non-quoted text in braces +pub const BRACE_SPACE: char = char_offset(EXPAND_RESERVED_BASE, 7); +/// Separate subtokens in a token with this character. +pub const INTERNAL_SEPARATOR: char = char_offset(EXPAND_RESERVED_BASE, 8); +/// Character representing an empty variable expansion. Only used transitively while expanding +/// variables. 
+pub const VARIABLE_EXPAND_EMPTY: char = char_offset(EXPAND_RESERVED_BASE, 9); const _: () = assert!( - EXPAND_RESERVED_END as u32 > ExpandChars::VariableExpandEmpty as u32, + EXPAND_RESERVED_END as u32 > VARIABLE_EXPAND_EMPTY as u32, "Characters used in expansions must stay within private use area" ); -impl From for char { - fn from(val: ExpandChars) -> Self { - // We know this is safe because we limit the the range of this enum - unsafe { char::from_u32_unchecked(val as _) } - } -} +/// The string represented by PROCESS_EXPAND_SELF +#[widestrs] +pub const PROCESS_EXPAND_SELF_STR: &wstr = "%self"L; diff --git a/fish-rust/src/ffi.rs b/fish-rust/src/ffi.rs index 0c648de05..acdc56169 100644 --- a/fish-rust/src/ffi.rs +++ b/fish-rust/src/ffi.rs @@ -53,8 +53,6 @@ include_cpp! { generate!("env_var_t") generate!("make_pipes_ffi") - generate!("valid_var_name_char") - generate!("get_flog_file_fd") generate!("log_extra_to_flog_file") @@ -100,9 +98,6 @@ include_cpp! { generate!("re::regex_t") generate!("re::regex_result_ffi") generate!("re::try_compile_ffi") - generate!("wcs2string") - generate!("wcs2zstring") - generate!("str2wcstring") generate!("signal_handle") generate!("signal_check_cancel") diff --git a/fish-rust/src/flog.rs b/fish-rust/src/flog.rs index cc1d002ed..4c65458fb 100644 --- a/fish-rust/src/flog.rs +++ b/fish-rust/src/flog.rs @@ -188,7 +188,15 @@ macro_rules! FLOG { } }; } -pub(crate) use FLOG; + +// TODO implement. +macro_rules! FLOGF { + ($category:ident, $($elem:expr),+) => { + crate::flog::FLOG!($category, $($elem),*); + } +} + +pub(crate) use {FLOG, FLOGF}; /// For each category, if its name matches the wildcard, set its enabled to the given sense. 
fn apply_one_wildcard(wc_esc: &wstr, sense: bool) { diff --git a/fish-rust/src/lib.rs b/fish-rust/src/lib.rs index 09c26a2ec..74fd34615 100644 --- a/fish-rust/src/lib.rs +++ b/fish-rust/src/lib.rs @@ -12,6 +12,7 @@ mod common; mod abbrs; mod builtins; mod color; +mod compat; mod env; mod event; mod expand; @@ -51,6 +52,7 @@ mod wchar_ext; mod wchar_ffi; mod wcstringutil; mod wgetopt; +mod wildcard; mod wutil; // Don't use `#[cfg(test)]` here to make sure ffi tests are built and tested diff --git a/fish-rust/src/path.rs b/fish-rust/src/path.rs index 934df4007..383ba250b 100644 --- a/fish-rust/src/path.rs +++ b/fish-rust/src/path.rs @@ -1,5 +1,5 @@ use crate::{ - expand::ExpandChars::HomeDirectory, + expand::HOME_DIRECTORY, wchar::{wstr, WExt, WString, L}, }; @@ -12,7 +12,7 @@ pub fn path_apply_working_directory(path: &wstr, working_directory: &wstr) -> WS // We're going to make sure that if we want to prepend the wd, that the string has no leading // "/". - let prepend_wd = path.char_at(0) != '/' && path.char_at(0) != HomeDirectory.into(); + let prepend_wd = path.char_at(0) != '/' && path.char_at(0) != HOME_DIRECTORY; if !prepend_wd { // No need to prepend the wd, so just return the path we were given. diff --git a/fish-rust/src/tokenizer.rs b/fish-rust/src/tokenizer.rs index 56f5ac72d..10d7fb16e 100644 --- a/fish-rust/src/tokenizer.rs +++ b/fish-rust/src/tokenizer.rs @@ -1,7 +1,8 @@ //! A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be //! extended to support marks, tokenizing multiple strings and disposing of unused string segments. 
-use crate::ffi::{valid_var_name_char, wcharz_t}; +use crate::common::valid_var_name_char; +use crate::ffi::wcharz_t; use crate::future_feature_flags::{feature_test, FeatureFlag}; use crate::parse_constants::SOURCE_OFFSET_INVALID; use crate::redirection::RedirectionMode; @@ -1357,7 +1358,7 @@ pub fn variable_assignment_equals_pos(txt: &wstr) -> Option { // TODO bracket indexing for (i, c) in txt.chars().enumerate() { if !found_potential_variable { - if !valid_var_name_char(c as wchar_t) { + if !valid_var_name_char(c) { return None; } found_potential_variable = true; @@ -1365,7 +1366,7 @@ pub fn variable_assignment_equals_pos(txt: &wstr) -> Option { if c == '=' { return Some(i); } - if !valid_var_name_char(c as wchar_t) { + if !valid_var_name_char(c) { return None; } } diff --git a/fish-rust/src/wchar.rs b/fish-rust/src/wchar.rs index 7f723e4f0..c3d366a8d 100644 --- a/fish-rust/src/wchar.rs +++ b/fish-rust/src/wchar.rs @@ -4,6 +4,7 @@ //! - wstr: a string slice without a nul terminator. Like `&str` but wide chars. //! - WString: an owning string without a nul terminator. Like `String` but wide chars. +use crate::common::{ENCODE_DIRECT_BASE, ENCODE_DIRECT_END}; pub use widestring::{Utf32Str as wstr, Utf32String as WString}; /// Pull in our extensions. @@ -30,43 +31,6 @@ pub(crate) use L; /// Note: the resulting string is NOT nul-terminated. pub use widestring_suffix::widestrs; -// Use Unicode "non-characters" for internal characters as much as we can. This -// gives us 32 "characters" for internal use that we can guarantee should not -// appear in our input stream. See http://www.unicode.org/faq/private_use.html. -pub const RESERVED_CHAR_BASE: char = '\u{FDD0}'; -pub const RESERVED_CHAR_END: char = '\u{FDF0}'; -// Split the available non-character values into two ranges to ensure there are -// no conflicts among the places we use these special characters. 
-pub const EXPAND_RESERVED_BASE: char = RESERVED_CHAR_BASE; -pub const EXPAND_RESERVED_END: char = match char::from_u32(EXPAND_RESERVED_BASE as u32 + 16u32) { - Some(c) => c, - None => panic!("private use codepoint in expansion region should be valid char"), -}; -pub const WILDCARD_RESERVED_BASE: char = EXPAND_RESERVED_END; -pub const WILDCARD_RESERVED_END: char = match char::from_u32(WILDCARD_RESERVED_BASE as u32 + 16u32) -{ - Some(c) => c, - None => panic!("private use codepoint in wildcard region should be valid char"), -}; - -// These are in the Unicode private-use range. We really shouldn't use this -// range but have little choice in the matter given how our lexer/parser works. -// We can't use non-characters for these two ranges because there are only 66 of -// them and we need at least 256 + 64. -// -// If sizeof(wchar_t)==4 we could avoid using private-use chars; however, that -// would result in fish having different behavior on machines with 16 versus 32 -// bit wchar_t. It's better that fish behave the same on both types of systems. -// -// Note: We don't use the highest 8 bit range (0xF800 - 0xF8FF) because we know -// of at least one use of a codepoint in that range: the Apple symbol (0xF8FF) -// on Mac OS X. See http://www.unicode.org/faq/private_use.html. -pub const ENCODE_DIRECT_BASE: char = '\u{F600}'; -pub const ENCODE_DIRECT_END: char = match char::from_u32(ENCODE_DIRECT_BASE as u32 + 256) { - Some(c) => c, - None => panic!("private use codepoint in encode direct region should be valid char"), -}; - /// Encode a literal byte in a UTF-32 character. This is required for e.g. the echo builtin, whose /// escape sequences can be used to construct raw byte sequences which are then interpreted as e.g. /// UTF-8 by the terminal. 
If we were to interpret each of those bytes as a codepoint and encode it @@ -78,3 +42,16 @@ pub fn encode_byte_to_char(byte: u8) -> char { char::from_u32(u32::from(ENCODE_DIRECT_BASE) + u32::from(byte)) .expect("private-use codepoint should be valid char") } + +/// Decode a literal byte from a UTF-32 character. +pub fn decode_byte_from_char(c: char) -> Option { + if c >= ENCODE_DIRECT_BASE && c < ENCODE_DIRECT_END { + Some( + (u32::from(c) - u32::from(ENCODE_DIRECT_BASE)) + .try_into() + .unwrap(), + ) + } else { + None + } +} diff --git a/fish-rust/src/wcstringutil.rs b/fish-rust/src/wcstringutil.rs index 0fa5f820e..384cc7d40 100644 --- a/fish-rust/src/wcstringutil.rs +++ b/fish-rust/src/wcstringutil.rs @@ -1,6 +1,66 @@ //! Helper functions for working with wcstring. -use crate::wchar::{wstr, WString}; +use crate::compat::MB_CUR_MAX; +use crate::expand::INTERNAL_SEPARATOR; +use crate::flog::FLOGF; +use crate::wchar::{decode_byte_from_char, wstr, WString, L}; +use crate::wutil::encoding::{wcrtomb, zero_mbstate, AT_LEAST_MB_LEN_MAX}; + +/// Implementation of wcs2string that accepts a callback. +/// This invokes \p func with (const char*, size_t) pairs. +/// If \p func returns false, it stops; otherwise it continues. +/// \return false if the callback returned false, otherwise true. +pub fn wcs2string_callback(input: &wstr, mut func: impl FnMut(&[u8]) -> bool) -> bool { + let mut state = zero_mbstate(); + let mut converted = [0_u8; AT_LEAST_MB_LEN_MAX]; + + for mut c in input.chars() { + // TODO: this doesn't seem sound. + if c == INTERNAL_SEPARATOR { + // do nothing + } else if let Some(byte) = decode_byte_from_char(c) { + converted[0] = byte; + if !func(&converted[..1]) { + return false; + } + } else if MB_CUR_MAX() == 1 { + // single-byte locale (C/POSIX/ISO-8859) + // If `c` contains a wide character we emit a question-mark. 
+ if u32::from(c) & !0xFF != 0 { + c = '?'; + } + + converted[0] = c as u8; + if !func(&converted[..1]) { + return false; + } + } else { + converted = [0; AT_LEAST_MB_LEN_MAX]; + let len = unsafe { + wcrtomb( + std::ptr::addr_of_mut!(converted[0]).cast(), + c as libc::wchar_t, + std::ptr::addr_of_mut!(state), + ) + }; + if len == 0_usize.wrapping_sub(1) { + wcs2string_bad_char(c); + state = zero_mbstate(); + } else if !func(&converted[..len]) { + return false; + } + } + } + true +} + +fn wcs2string_bad_char(c: char) { + FLOGF!( + char_encoding, + L!("Wide character U+%4X has no narrow representation"), + c + ); +} /// Joins strings with a separator. pub fn join_strings(strs: &[&wstr], sep: char) -> WString { diff --git a/fish-rust/src/wildcard.rs b/fish-rust/src/wildcard.rs new file mode 100644 index 000000000..00b773743 --- /dev/null +++ b/fish-rust/src/wildcard.rs @@ -0,0 +1,13 @@ +// Enumeration of all wildcard types. + +use crate::common::{char_offset, WILDCARD_RESERVED_BASE}; + +/// Character representing any character except '/' (slash). +pub const ANY_CHAR: char = char_offset(WILDCARD_RESERVED_BASE, 0); +/// Character representing any character string not containing '/' (slash). +pub const ANY_STRING: char = char_offset(WILDCARD_RESERVED_BASE, 1); +/// Character representing any character string. +pub const ANY_STRING_RECURSIVE: char = char_offset(WILDCARD_RESERVED_BASE, 2); +/// This is a special pseudo-char that is not used other than to mark the +/// end of the the special characters so we can sanity check the enum range. 
+pub const ANY_SENTINEL: char = char_offset(WILDCARD_RESERVED_BASE, 3);
diff --git a/fish-rust/src/wutil/encoding.rs b/fish-rust/src/wutil/encoding.rs
new file mode 100644
index 000000000..a3661661e
--- /dev/null
+++ b/fish-rust/src/wutil/encoding.rs
@@ -0,0 +1,19 @@
+extern "C" {
+    pub fn wcrtomb(s: *mut libc::c_char, wc: libc::wchar_t, ps: *mut mbstate_t) -> usize;
+    pub fn mbrtowc(
+        pwc: *mut libc::wchar_t,
+        s: *const libc::c_char,
+        n: usize,
+        p: *mut mbstate_t,
+    ) -> usize;
+}
+
+// HACK This should be mbstate_t from libc but that's not exposed. Since it's only written by
+// libc, we define it as opaque type that should be large enough for all implementations.
+pub type mbstate_t = [u64; 16];
+pub fn zero_mbstate() -> mbstate_t {
+    [0; 16]
+}
+
+// HACK This should be the MB_LEN_MAX macro from libc but that's not easy to get.
+pub const AT_LEAST_MB_LEN_MAX: usize = 32;
diff --git a/fish-rust/src/wutil/mod.rs b/fish-rust/src/wutil/mod.rs
index 2da5179ea..358c9add7 100644
--- a/fish-rust/src/wutil/mod.rs
+++ b/fish-rust/src/wutil/mod.rs
@@ -1,3 +1,4 @@
+pub mod encoding;
 pub mod errors;
 pub mod gettext;
 mod normalize_path;
@@ -6,6 +7,7 @@ pub mod wcstod;
 pub mod wcstoi;
 mod wrealpath;
 
+use crate::common::fish_reserved_codepoint;
 pub(crate) use gettext::{wgettext, wgettext_fmt};
 pub use normalize_path::*;
 pub(crate) use printf::sprintf;
@@ -28,3 +30,26 @@ pub fn perror(s: &str) {
     let _ = stderr.write_all(slice);
     let _ = stderr.write_all(b"\n");
 }
+
+const PUA1_START: char = '\u{E000}';
+const PUA1_END: char = '\u{F900}';
+const PUA2_START: char = '\u{F0000}';
+const PUA2_END: char = '\u{FFFFE}';
+const PUA3_START: char = '\u{100000}';
+const PUA3_END: char = '\u{10FFFE}';
+
+/// Return true if the code point is in one of the Unicode private use areas.
+///
+/// All three ranges delimited by the `PUA*_START`/`PUA*_END` constants are checked. Each range
+/// is half-open, so the `*_END` values themselves are not treated as private use.
+fn fish_is_pua(c: char) -> bool {
+    (PUA1_START..PUA1_END).contains(&c)
+        || (PUA2_START..PUA2_END).contains(&c)
+        || (PUA3_START..PUA3_END).contains(&c)
+}
+
+/// We need this because there are too many implementations that don't return the proper answer for
+/// some code points. See issue #3050.
+pub fn fish_iswalnum(c: char) -> bool { + !fish_reserved_codepoint(c) && !fish_is_pua(c) && c.is_alphanumeric() +} diff --git a/fish-rust/src/wutil/wrealpath.rs b/fish-rust/src/wutil/wrealpath.rs index f4d155d6e..04f86404f 100644 --- a/fish-rust/src/wutil/wrealpath.rs +++ b/fish-rust/src/wutil/wrealpath.rs @@ -4,13 +4,8 @@ use std::{ os::unix::prelude::{OsStrExt, OsStringExt}, }; -use cxx::let_cxx_string; - -use crate::{ - ffi::{str2wcstring, wcs2zstring}, - wchar::{wstr, WString}, - wchar_ffi::{WCharFromFFI, WCharToFFI}, -}; +use crate::common::{str2wcstring, wcs2zstring}; +use crate::wchar::{wstr, WString}; /// Wide character realpath. The last path component does not need to be valid. If an error occurs, /// `wrealpath()` returns `None` @@ -19,7 +14,7 @@ pub fn wrealpath(pathname: &wstr) -> Option { return None; } - let mut narrow_path: Vec = wcs2zstring(&pathname.to_ffi()).from_ffi(); + let mut narrow_path: Vec = wcs2zstring(pathname).into(); // Strip trailing slashes. This is treats "/a//" as equivalent to "/a" if /a is a non-directory. 
while narrow_path.len() > 1 && narrow_path[narrow_path.len() - 1] == b'/' { @@ -68,7 +63,5 @@ pub fn wrealpath(pathname: &wstr) -> Option { } }; - let_cxx_string!(s = real_path); - - Some(str2wcstring(&s).from_ffi()) + Some(str2wcstring(&real_path)) } diff --git a/src/ast.cpp b/src/ast.cpp index bd5d0b23b..0ee6bd1ee 100644 --- a/src/ast.cpp +++ b/src/ast.cpp @@ -67,9 +67,8 @@ static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token if (!needs_expand) { result = keyword_with_name(token); } else { - wcstring storage; - if (unescape_string(token, &storage, 0)) { - result = keyword_with_name(storage); + if (auto unescaped = unescape_string(token, 0)) { + result = keyword_with_name(*unescaped); } } } diff --git a/src/builtins/complete.cpp b/src/builtins/complete.cpp index 8b781a16d..5d7edd3fd 100644 --- a/src/builtins/complete.cpp +++ b/src/builtins/complete.cpp @@ -204,12 +204,11 @@ maybe_t builtin_complete(parser_t &parser, io_streams_t &streams, const wch } case 'p': case 'c': { - wcstring tmp; - if (unescape_string(w.woptarg, &tmp, UNESCAPE_SPECIAL)) { + if (auto tmp = unescape_string(w.woptarg, UNESCAPE_SPECIAL)) { if (opt == 'p') - path.push_back(tmp); + path.push_back(*tmp); else - cmd_to_complete.push_back(tmp); + cmd_to_complete.push_back(*tmp); } else { streams.err.append_format(_(L"%ls: Invalid token '%ls'\n"), cmd, w.woptarg); return STATUS_INVALID_ARGS; diff --git a/src/builtins/read.cpp b/src/builtins/read.cpp index ba16d0aa2..11ddcec1c 100644 --- a/src/builtins/read.cpp +++ b/src/builtins/read.cpp @@ -531,14 +531,13 @@ maybe_t builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t if (opts.tokenize) { auto tok = new_tokenizer(buff.c_str(), TOK_ACCEPT_UNFINISHED); - wcstring out; if (opts.array) { // Array mode: assign each token as a separate element of the sole var. 
wcstring_list_t tokens; while (auto t = tok->next()) { auto text = *tok->text_of(*t); - if (unescape_string(text, &out, UNESCAPE_DEFAULT)) { - tokens.push_back(out); + if (auto out = unescape_string(text, UNESCAPE_DEFAULT)) { + tokens.push_back(*out); } else { tokens.push_back(text); } @@ -549,8 +548,8 @@ maybe_t builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t std::unique_ptr t; while ((vars_left() - 1 > 0) && (t = tok->next())) { auto text = *tok->text_of(*t); - if (unescape_string(text, &out, UNESCAPE_DEFAULT)) { - parser.set_var_and_fire(*var_ptr++, opts.place, out); + if (auto out = unescape_string(text, UNESCAPE_DEFAULT)) { + parser.set_var_and_fire(*var_ptr++, opts.place, *out); } else { parser.set_var_and_fire(*var_ptr++, opts.place, text); } diff --git a/src/builtins/string.cpp b/src/builtins/string.cpp index 424dd2afe..bb993ade3 100644 --- a/src/builtins/string.cpp +++ b/src/builtins/string.cpp @@ -737,10 +737,9 @@ static int string_unescape(parser_t &parser, io_streams_t &streams, int argc, arg_iterator_t aiter(argv, optind, streams); while (const wcstring *arg = aiter.nextstr()) { - wcstring result; wcstring sep = aiter.want_newline() ? L"\n" : L""; - if (unescape_string(*arg, &result, flags, opts.escape_style)) { - streams.out.append(result + sep); + if (auto result = unescape_string(*arg, flags, opts.escape_style)) { + streams.out.append(*result + sep); nesc++; } } diff --git a/src/common.cpp b/src/common.cpp index a67bd6fa9..1e348f63c 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -33,6 +33,7 @@ #include #include "common.h" +#include "common.rs.h" #include "expand.h" #include "fallback.h" // IWYU pragma: keep #include "flog.h" @@ -119,17 +120,6 @@ long convert_digit(wchar_t d, int base) { /// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions. 
static bool is_hex_digit(int c) { return std::strchr("0123456789ABCDEF", c) != nullptr; } -/// This is a specialization of `convert_digit()` that only handles base 16 and only uppercase. -static long convert_hex_digit(wchar_t d) { - if ((d <= L'9') && (d >= L'0')) { - return d - L'0'; - } else if ((d <= L'Z') && (d >= L'A')) { - return 10 + d - L'A'; - } - - return -1; -} - bool is_windows_subsystem_for_linux() { #if defined(WSL) return true; @@ -749,38 +739,6 @@ static void escape_string_url(const wcstring &in, wcstring &out) { } } -/// Reverse the effects of `escape_string_url()`. By definition the string has consist of just ASCII -/// chars. -static bool unescape_string_url(const wchar_t *in, wcstring *out) { - std::string result; - result.reserve(out->size()); - for (wchar_t c = *in; c; c = *++in) { - if (c > 0x7F) return false; // invalid character means we can't decode the string - if (c == '%') { - int c1 = in[1]; - if (c1 == 0) return false; // found unexpected end of string - if (c1 == '%') { - result.push_back('%'); - in++; - } else { - int c2 = in[2]; - if (c2 == 0) return false; // string ended prematurely - long d1 = convert_digit(c1, 16); - if (d1 < 0) return false; - long d2 = convert_digit(c2, 16); - if (d2 < 0) return false; - result.push_back(16 * d1 + d2); - in += 2; - } - } else { - result.push_back(c); - } - } - - *out = str2wcstring(result); - return true; -} - /// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str. static void escape_string_var(const wcstring &in, wcstring &out) { bool prev_was_hex_encoded = false; @@ -812,46 +770,6 @@ static void escape_string_var(const wcstring &in, wcstring &out) { } } -/// Reverse the effects of `escape_string_var()`. By definition the string has consist of just ASCII -/// chars. 
-static bool unescape_string_var(const wchar_t *in, wcstring *out) { - std::string result; - result.reserve(out->size()); - bool prev_was_hex_encoded = false; - for (wchar_t c = *in; c; c = *++in) { - if (c > 0x7F) return false; // invalid character means we can't decode the string - if (c == '_') { - int c1 = in[1]; - if (c1 == 0) { - if (prev_was_hex_encoded) break; - return false; // found unexpected escape char at end of string - } - if (c1 == '_') { - result.push_back('_'); - in++; - } else if (is_hex_digit(c1)) { - int c2 = in[2]; - if (c2 == 0) return false; // string ended prematurely - long d1 = convert_hex_digit(c1); - if (d1 < 0) return false; - long d2 = convert_hex_digit(c2); - if (d2 < 0) return false; - result.push_back(16 * d1 + d2); - in += 2; - prev_was_hex_encoded = true; - } - // No "else" clause because if the first char after an underscore is not another - // underscore or a valid hex character then the underscore is there to improve - // readability after we've encoded a character not valid in a var name. - } else { - result.push_back(c); - } - } - - *out = str2wcstring(result); - return true; -} - wcstring escape_string_for_double_quotes(wcstring in) { // We need to escape backslashes, double quotes, and dollars only. wcstring result = std::move(in); @@ -1130,12 +1048,6 @@ wcstring escape_string(const wcstring &in, escape_flags_t flags, escape_string_s return result; } -/// Helper to return the last character in a string, or none. -static maybe_t string_last_char(const wcstring &str) { - if (str.empty()) return none(); - return str.back(); -} - /// Given a null terminated string starting with a backslash, read the escape as if it is unquoted, /// appending to result. Return the number of characters consumed, or none on error. 
maybe_t read_unquoted_escape(const wchar_t *input, wcstring *result, bool allow_incomplete, @@ -1329,320 +1241,30 @@ maybe_t read_unquoted_escape(const wchar_t *input, wcstring *result, boo return in_pos; } -/// Returns the unescaped version of input_str into output_str (by reference). Returns true if -/// successful. If false, the contents of output_str are unchanged. -static bool unescape_string_internal(const wchar_t *const input, const size_t input_len, - wcstring *output_str, unescape_flags_t flags) { - // Set up result string, which we'll swap with the output on success. - wcstring result; - result.reserve(input_len); - - const bool unescape_special = static_cast(flags & UNESCAPE_SPECIAL); - const bool allow_incomplete = static_cast(flags & UNESCAPE_INCOMPLETE); - const bool ignore_backslashes = static_cast(flags & UNESCAPE_NO_BACKSLASHES); - - // The positions of open braces. - std::vector braces; - // The positions of variable expansions or brace ","s. - // We only read braces as expanders if there's a variable expansion or "," in them. - std::vector vars_or_seps; - int brace_count = 0; - - bool errored = false; - enum { - mode_unquoted, - mode_single_quotes, - mode_double_quotes, - } mode = mode_unquoted; - - for (size_t input_position = 0; input_position < input_len && !errored; input_position++) { - const wchar_t c = input[input_position]; - // Here's the character we'll append to result, or none() to suppress it. - maybe_t to_append_or_none = c; - if (mode == mode_unquoted) { - switch (c) { - case L'\\': { - if (!ignore_backslashes) { - // Backslashes (escapes) are complicated and may result in errors, or - // appending INTERNAL_SEPARATORs, so we have to handle them specially. - auto escape_chars = read_unquoted_escape( - input + input_position, &result, allow_incomplete, unescape_special); - if (!escape_chars.has_value()) { - // A none() return indicates an error. 
- errored = true; - } else { - // Skip over the characters we read, minus one because the outer loop - // will increment it. - assert(*escape_chars > 0); - input_position += *escape_chars - 1; - } - // We've already appended, don't append anything else. - to_append_or_none = none(); - } - break; - } - case L'~': { - if (unescape_special && (input_position == 0)) { - to_append_or_none = HOME_DIRECTORY; - } - break; - } - case L'%': { - // Note that this only recognizes %self if the string is literally %self. - // %self/foo will NOT match this. - if (unescape_special && input_position == 0 && - !std::wcscmp(input, PROCESS_EXPAND_SELF_STR)) { - to_append_or_none = PROCESS_EXPAND_SELF; - input_position += PROCESS_EXPAND_SELF_STR_LEN - 1; // skip over 'self's - } - break; - } - case L'*': { - if (unescape_special) { - // In general, this is ANY_STRING. But as a hack, if the last appended char - // is ANY_STRING, delete the last char and store ANY_STRING_RECURSIVE to - // reflect the fact that ** is the recursive wildcard. - if (string_last_char(result) == ANY_STRING) { - assert(!result.empty()); - result.resize(result.size() - 1); - to_append_or_none = ANY_STRING_RECURSIVE; - } else { - to_append_or_none = ANY_STRING; - } - } - break; - } - case L'?': { - if (unescape_special && !feature_test(feature_flag_t::qmark_noglob)) { - to_append_or_none = ANY_CHAR; - } - break; - } - case L'$': { - if (unescape_special) { - bool is_cmdsub = - input_position + 1 < input_len && input[input_position + 1] == L'('; - if (!is_cmdsub) { - to_append_or_none = VARIABLE_EXPAND; - vars_or_seps.push_back(input_position); - } - } - break; - } - case L'{': { - if (unescape_special) { - brace_count++; - to_append_or_none = BRACE_BEGIN; - // We need to store where the brace *ends up* in the output. - braces.push_back(result.size()); - } - break; - } - case L'}': { - if (unescape_special) { - // HACK: The completion machinery sometimes hands us partial tokens. 
- // We can't parse them properly, but it shouldn't hurt, - // so we don't assert here. - // See #4954. - // assert(brace_count > 0 && "imbalanced brackets are a tokenizer error, we - // shouldn't be able to get here"); - brace_count--; - to_append_or_none = BRACE_END; - if (!braces.empty()) { - // HACK: To reduce accidental use of brace expansion, treat a brace - // with zero or one items as literal input. See #4632. (The hack is - // doing it here and like this.) - if (vars_or_seps.empty() || vars_or_seps.back() < braces.back()) { - result[braces.back()] = L'{'; - // We also need to turn all spaces back. - for (size_t i = braces.back() + 1; i < result.size(); i++) { - if (result[i] == BRACE_SPACE) result[i] = L' '; - } - to_append_or_none = L'}'; - } - - // Remove all seps inside the current brace pair, so if we have a - // surrounding pair we only get seps inside *that*. - if (!vars_or_seps.empty()) { - while (!vars_or_seps.empty() && vars_or_seps.back() > braces.back()) - vars_or_seps.pop_back(); - } - braces.pop_back(); - } - } - break; - } - case L',': { - if (unescape_special && brace_count > 0) { - to_append_or_none = BRACE_SEP; - vars_or_seps.push_back(input_position); - } - break; - } - case L' ': { - if (unescape_special && brace_count > 0) { - to_append_or_none = BRACE_SPACE; - } - break; - } - case L'\'': { - mode = mode_single_quotes; - to_append_or_none = - unescape_special ? maybe_t(INTERNAL_SEPARATOR) : none(); - break; - } - case L'\"': { - mode = mode_double_quotes; - to_append_or_none = - unescape_special ? maybe_t(INTERNAL_SEPARATOR) : none(); - break; - } - default: { - break; - } - } - } else if (mode == mode_single_quotes) { - if (c == L'\\') { - // A backslash may or may not escape something in single quotes. 
- switch (input[input_position + 1]) { - case '\\': - case L'\'': { - to_append_or_none = input[input_position + 1]; - input_position += 1; // skip over the backslash - break; - } - case L'\0': { - if (!allow_incomplete) { - errored = true; - } else { - // PCA this line had the following cryptic comment: 'We may ever escape - // a NULL character, but still appending a \ in case I am wrong.' Not - // sure what it means or the importance of this. - input_position += 1; /* Skip over the backslash */ - to_append_or_none = L'\\'; - } - break; - } - default: { - // Literal backslash that doesn't escape anything! Leave things alone; we'll - // append the backslash itself. - break; - } - } - } else if (c == L'\'') { - to_append_or_none = - unescape_special ? maybe_t(INTERNAL_SEPARATOR) : none(); - mode = mode_unquoted; - } - } else if (mode == mode_double_quotes) { - switch (c) { - case L'"': { - mode = mode_unquoted; - to_append_or_none = - unescape_special ? maybe_t(INTERNAL_SEPARATOR) : none(); - break; - } - case '\\': { - switch (input[input_position + 1]) { - case L'\0': { - if (!allow_incomplete) { - errored = true; - } else { - to_append_or_none = L'\0'; - } - break; - } - case '\\': - case L'$': - case '"': { - to_append_or_none = input[input_position + 1]; - input_position += 1; /* Skip over the backslash */ - break; - } - case '\n': { - /* Swallow newline */ - to_append_or_none = none(); - input_position += 1; /* Skip over the backslash */ - break; - } - default: { - /* Literal backslash that doesn't escape anything! Leave things alone; - * we'll append the backslash itself */ - break; - } - } - break; - } - case '$': { - if (unescape_special) { - to_append_or_none = VARIABLE_EXPAND_SINGLE; - vars_or_seps.push_back(input_position); - } - break; - } - default: { - break; - } - } - } - - // Now maybe append the char. - if (to_append_or_none.has_value()) { - result.push_back(*to_append_or_none); - } - } - - // Return the string by reference, and then success. 
- if (!errored) { - *output_str = std::move(result); - } - return !errored; -} - bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special) { assert(str != nullptr); wcstring output; - bool success = unescape_string_internal(str->c_str(), str->size(), &output, escape_special); - if (success) { - *str = std::move(output); + if (auto unescaped = unescape_string(str->c_str(), str->size(), escape_special)) { + *str = *unescaped; + return true; } - return success; + return false; } -bool unescape_string(const wchar_t *input, size_t len, wcstring *output, - unescape_flags_t escape_special, escape_string_style_t style) { - bool success = false; - switch (style) { - case STRING_STYLE_SCRIPT: { - success = unescape_string_internal(input, len, output, escape_special); - break; - } - case STRING_STYLE_URL: { - success = unescape_string_url(input, output); - break; - } - case STRING_STYLE_VAR: { - success = unescape_string_var(input, output); - break; - } - case STRING_STYLE_REGEX: { - // unescaping PCRE2 is not needed/supported, the PCRE2 engine is responsible for that - success = false; - break; - } - } - if (!success) output->clear(); - return success; +std::unique_ptr unescape_string(const wchar_t *input, unescape_flags_t escape_special, + escape_string_style_t style) { + return unescape_string(input, std::wcslen(input), escape_special, style); } -bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special, - escape_string_style_t style) { - return unescape_string(input, std::wcslen(input), output, escape_special, style); +std::unique_ptr unescape_string(const wchar_t *input, size_t len, + unescape_flags_t escape_special, + escape_string_style_t style) { + return rust_unescape_string(input, len, escape_special, style); } -bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special, - escape_string_style_t style) { - return unescape_string(input.c_str(), input.size(), output, 
escape_special, style); +std::unique_ptr unescape_string(const wcstring &input, unescape_flags_t escape_special, + escape_string_style_t style) { + return unescape_string(input.c_str(), input.size(), escape_special, style); } wcstring format_size(long long sz) { diff --git a/src/common.h b/src/common.h index e329370b7..7ca0394ef 100644 --- a/src/common.h +++ b/src/common.h @@ -521,15 +521,15 @@ bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special); /// Reverse the effects of calling `escape_string`. Returns the unescaped value by reference. On /// failure, the output is set to an empty string. -bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special, - escape_string_style_t style = STRING_STYLE_SCRIPT); +std::unique_ptr unescape_string(const wchar_t *input, unescape_flags_t escape_special, + escape_string_style_t style = STRING_STYLE_SCRIPT); -bool unescape_string(const wchar_t *input, size_t len, wcstring *output, - unescape_flags_t escape_special, - escape_string_style_t style = STRING_STYLE_SCRIPT); +std::unique_ptr unescape_string(const wchar_t *input, size_t len, + unescape_flags_t escape_special, + escape_string_style_t style = STRING_STYLE_SCRIPT); -bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special, - escape_string_style_t style = STRING_STYLE_SCRIPT); +std::unique_ptr unescape_string(const wcstring &input, unescape_flags_t escape_special, + escape_string_style_t style = STRING_STYLE_SCRIPT); /// Write the given paragraph of output, redoing linebreaks to fit \p termsize. 
wcstring reformat_for_screen(const wcstring &msg, const termsize_t &termsize); diff --git a/src/complete.cpp b/src/complete.cpp index 522879a21..7dd34b4fe 100644 --- a/src/complete.cpp +++ b/src/complete.cpp @@ -1469,8 +1469,8 @@ void completer_t::escape_opening_brackets(const wcstring &argument) { if (!have_unquoted_unescaped_bracket) return; // Since completion_apply_to_command_line will escape the completion, we need to provide an // unescaped version. - wcstring unescaped_argument; - if (!unescape_string(argument, &unescaped_argument, UNESCAPE_INCOMPLETE)) return; + auto unescaped_argument = unescape_string(argument, UNESCAPE_INCOMPLETE); + if (!unescaped_argument) return; for (completion_t &comp : completions.get_list()) { if (comp.flags & COMPLETE_REPLACES_TOKEN) continue; comp.flags |= COMPLETE_REPLACES_TOKEN; @@ -1482,7 +1482,7 @@ void completer_t::escape_opening_brackets(const wcstring &argument) { if (comp.flags & COMPLETE_DONT_ESCAPE) { FLOG(warning, L"unexpected completion flag"); } - comp.completion = unescaped_argument + comp.completion; + comp.completion = *unescaped_argument + comp.completion; } } @@ -1494,9 +1494,8 @@ void completer_t::mark_completions_duplicating_arguments(const wcstring &cmd, wcstring_list_t arg_strs; for (const auto &arg : args) { wcstring argstr = *arg.get_source(cmd); - wcstring argstr_unesc; - if (unescape_string(argstr, &argstr_unesc, UNESCAPE_DEFAULT)) { - arg_strs.push_back(std::move(argstr_unesc)); + if (auto argstr_unesc = unescape_string(argstr, UNESCAPE_DEFAULT)) { + arg_strs.push_back(std::move(*argstr_unesc)); } } std::sort(arg_strs.begin(), arg_strs.end()); @@ -1668,11 +1667,14 @@ void completer_t::perform_for_commandline(wcstring cmdline) { source_range_t command_range = {cmd_tok.offset - bias, cmd_tok.length}; wcstring exp_command = *cmd_tok.get_source(cmdline); - bool unescaped = - expand_command_token(ctx, exp_command) && - unescape_string(previous_argument, &arg_data.previous_argument, UNESCAPE_DEFAULT) && - 
unescape_string(current_argument, &arg_data.current_argument, UNESCAPE_INCOMPLETE); + std::unique_ptr prev; + std::unique_ptr cur; + bool unescaped = expand_command_token(ctx, exp_command) && + (prev = unescape_string(previous_argument, UNESCAPE_DEFAULT)) && + (cur = unescape_string(current_argument, UNESCAPE_INCOMPLETE)); if (unescaped) { + arg_data.previous_argument = *prev; + arg_data.current_argument = *cur; // Have to walk over the command and its entire wrap chain. If any command // disables do_file, then they all do. walk_wrap_chain(exp_command, *effective_cmdline, command_range, &arg_data); diff --git a/src/env.cpp b/src/env.cpp index 8bacb4e01..b5e889856 100644 --- a/src/env.cpp +++ b/src/env.cpp @@ -472,11 +472,11 @@ void env_init(const struct config_paths_t *paths, bool do_uvars, bool default_pa for (const auto &kv : table) { if (string_prefixes_string(prefix, kv.first)) { wcstring escaped_name = kv.first.substr(prefix_len); - wcstring name; - if (unescape_string(escaped_name, &name, unescape_flags_t{}, STRING_STYLE_VAR)) { - wcstring key = name; + if (auto name = + unescape_string(escaped_name, unescape_flags_t{}, STRING_STYLE_VAR)) { + wcstring key = *name; wcstring replacement = join_strings(kv.second.as_list(), L' '); - abbrs->add(std::move(name), std::move(key), std::move(replacement), + abbrs->add(std::move(*name), std::move(key), std::move(replacement), abbrs_position_t::command, from_universal); } } diff --git a/src/env_universal_common.cpp b/src/env_universal_common.cpp index 0159f9efd..fc5cd1e0d 100644 --- a/src/env_universal_common.cpp +++ b/src/env_universal_common.cpp @@ -800,9 +800,11 @@ bool env_universal_t::populate_1_variable(const wchar_t *input, env_var_t::env_v // Parse out the value into storage, and decode it into a variable. 
storage->clear(); - if (!unescape_string(colon + 1, storage, 0)) { + auto unescaped = unescape_string(colon + 1, 0); + if (!unescaped) { return false; } + *storage = *unescaped; env_var_t var{decode_serialized(*storage), flags}; // Parse out the key and write into the map. diff --git a/src/expand.cpp b/src/expand.cpp index 7ffa34acd..74e0bb650 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -971,7 +971,8 @@ expand_result_t expander_t::stage_variables(wcstring input, completion_receiver_ // We accept incomplete strings here, since complete uses expand_string to expand incomplete // strings from the commandline. wcstring next; - unescape_string(input, &next, UNESCAPE_SPECIAL | UNESCAPE_INCOMPLETE); + if (auto unescaped = unescape_string(input, UNESCAPE_SPECIAL | UNESCAPE_INCOMPLETE)) + next = *unescaped; if (flags & expand_flag::skip_variables) { for (auto &i : next) { diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 864ecce68..c03af9a59 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -376,27 +376,26 @@ static void test_unescape_sane() { {L"\"abcd\\n\"", L"abcd\\n"}, {L"\\143", L"c"}, {L"'\\143'", L"\\143"}, {L"\\n", L"\n"} // \n normally becomes newline }; - wcstring output; for (const auto &test : tests) { - bool ret = unescape_string(test.input, &output, UNESCAPE_DEFAULT); - if (!ret) { + auto output = unescape_string(test.input, UNESCAPE_DEFAULT); + if (!output) { err(L"Failed to unescape '%ls'\n", test.input); - } else if (output != test.expected) { + } else if (*output != test.expected) { err(L"In unescaping '%ls', expected '%ls' but got '%ls'\n", test.input, test.expected, - output.c_str()); + output->c_str()); } } // Test for overflow. 
- if (unescape_string(L"echo \\UFFFFFF", &output, UNESCAPE_DEFAULT)) { + if (unescape_string(L"echo \\UFFFFFF", UNESCAPE_DEFAULT)) { err(L"Should not have been able to unescape \\UFFFFFF\n"); } - if (unescape_string(L"echo \\U110000", &output, UNESCAPE_DEFAULT)) { + if (unescape_string(L"echo \\U110000", UNESCAPE_DEFAULT)) { err(L"Should not have been able to unescape \\U110000\n"); } #if WCHAR_MAX != 0xffff // TODO: Make this work on MS Windows. - if (!unescape_string(L"echo \\U10FFFF", &output, UNESCAPE_DEFAULT)) { + if (!unescape_string(L"echo \\U10FFFF", UNESCAPE_DEFAULT)) { err(L"Should have been able to unescape \\U10FFFF\n"); } #endif @@ -408,8 +407,6 @@ static void test_escape_crazy() { say(L"Testing escaping and unescaping"); wcstring random_string; wcstring escaped_string; - wcstring unescaped_string; - bool unescaped_success; for (size_t i = 0; i < ESCAPE_TEST_COUNT; i++) { random_string.clear(); while (random() % ESCAPE_TEST_LENGTH) { @@ -417,14 +414,14 @@ static void test_escape_crazy() { } escaped_string = escape_string(random_string); - unescaped_success = unescape_string(escaped_string, &unescaped_string, UNESCAPE_DEFAULT); + auto unescaped_string = unescape_string(escaped_string, UNESCAPE_DEFAULT); - if (!unescaped_success) { + if (!unescaped_string) { err(L"Failed to unescape string <%ls>", escaped_string.c_str()); break; - } else if (unescaped_string != random_string) { + } else if (*unescaped_string != random_string) { err(L"Escaped and then unescaped string '%ls', but got back a different string '%ls'", - random_string.c_str(), unescaped_string.c_str()); + random_string.c_str(), unescaped_string->c_str()); break; } } @@ -432,12 +429,12 @@ static void test_escape_crazy() { // Verify that ESCAPE_NO_PRINTABLES also escapes backslashes so we don't regress on issue #3892. 
random_string = L"line 1\\n\nline 2"; escaped_string = escape_string(random_string, ESCAPE_NO_PRINTABLES | ESCAPE_NO_QUOTED); - unescaped_success = unescape_string(escaped_string, &unescaped_string, UNESCAPE_DEFAULT); - if (!unescaped_success) { + auto unescaped_string = unescape_string(escaped_string, UNESCAPE_DEFAULT); + if (!unescaped_string) { err(L"Failed to unescape string <%ls>", escaped_string.c_str()); - } else if (unescaped_string != random_string) { + } else if (*unescaped_string != random_string) { err(L"Escaped and then unescaped string '%ls', but got back a different string '%ls'", - random_string.c_str(), unescaped_string.c_str()); + random_string.c_str(), unescaped_string->c_str()); } } diff --git a/src/parse_util.cpp b/src/parse_util.cpp index 404819742..c8bde9860 100644 --- a/src/parse_util.cpp +++ b/src/parse_util.cpp @@ -960,8 +960,8 @@ parser_test_error_bits_t parse_util_detect_errors_in_argument(const ast::argumen parser_test_error_bits_t err = 0; auto check_subtoken = [&arg_src, &out_errors, source_start](size_t begin, size_t end) -> int { - wcstring unesc; - if (!unescape_string(arg_src.c_str() + begin, end - begin, &unesc, UNESCAPE_SPECIAL)) { + auto maybe_unesc = unescape_string(arg_src.c_str() + begin, end - begin, UNESCAPE_SPECIAL); + if (!maybe_unesc) { if (out_errors) { const wchar_t *fmt = L"Invalid token '%ls'"; if (arg_src.length() == 2 && arg_src[0] == L'\\' && @@ -975,6 +975,7 @@ parser_test_error_bits_t parse_util_detect_errors_in_argument(const ast::argumen } return 1; } + const wcstring &unesc = *maybe_unesc; parser_test_error_bits_t err = 0; // Check for invalid variable expansions. 
diff --git a/src/wildcard.cpp b/src/wildcard.cpp index 9dc9c55c5..6f6258379 100644 --- a/src/wildcard.cpp +++ b/src/wildcard.cpp @@ -60,7 +60,9 @@ bool wildcard_has(const wchar_t *str, size_t len) { return false; } wcstring unescaped; - unescape_string(str, len, &unescaped, UNESCAPE_SPECIAL); + if (auto tmp = unescape_string(wcstring{str, len}, UNESCAPE_SPECIAL)) { + unescaped = *tmp; + } return wildcard_has_internal(unescaped); } diff --git a/tests/checks/basic.fish b/tests/checks/basic.fish index 60a4e18a2..3d94ad038 100644 --- a/tests/checks/basic.fish +++ b/tests/checks/basic.fish @@ -158,6 +158,9 @@ echo -e 'abc\x211def' #CHECK: abc!def #CHECK: abc!1def +echo \UDE01 +#CHECK: � + # Comments allowed in between lines (#1987) echo before comment \ # comment