From 39f3c894d7fe9e5f40055496692608155471672b Mon Sep 17 00:00:00 2001
From: Johannes Altmanninger
Date: Sun, 5 Feb 2023 14:06:11 +0100
Subject: [PATCH] Port tokenizer.cpp to Rust

In hindsight, I should probably have split this into three different commits.
---
 CMakeLists.txt               |    2 +-
 fish-rust/src/tokenizer.rs   | 1335 +++++++++++++++++++++++++++++++++-
 src/ast.cpp                  |   19 +-
 src/builtins/commandline.cpp |    8 +-
 src/builtins/fg.cpp          |    2 +-
 src/builtins/read.cpp        |   17 +-
 src/complete.cpp             |   48 +-
 src/fish_indent.cpp          |   12 +-
 src/fish_tests.cpp           |  157 ++--
 src/highlight.cpp            |    8 +-
 src/parse_execution.cpp      |    6 +-
 src/parse_util.cpp           |   28 +-
 src/parse_util.h             |    3 +-
 src/reader.cpp               |   57 +-
 src/tokenizer.cpp            |  887 ----------------------
 src/tokenizer.h              |  202 +----
 16 files changed, 1552 insertions(+), 1239 deletions(-)
 delete mode 100644 src/tokenizer.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61b23e689..2a991b887 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,7 +126,7 @@ set(FISH_SRCS
     src/parser.cpp src/parser_keywords.cpp src/path.cpp src/postfork.cpp
     src/proc.cpp src/re.cpp src/reader.cpp src/screen.cpp src/signals.cpp
     src/termsize.cpp src/timer.cpp src/tinyexpr.cpp
-    src/tokenizer.cpp src/trace.cpp src/utf8.cpp
+    src/trace.cpp src/utf8.cpp
     src/wait_handle.cpp src/wcstringutil.cpp src/wgetopt.cpp src/wildcard.cpp
     src/wutil.cpp src/fds.cpp src/rustffi.cpp
 )
diff --git a/fish-rust/src/tokenizer.rs b/fish-rust/src/tokenizer.rs
index 39114a3ef..fc0e094e1 100644
--- a/fish-rust/src/tokenizer.rs
+++ b/fish-rust/src/tokenizer.rs
@@ -1,18 +1,1345 @@
 //! A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be
 //! extended to support marks, tokenizing multiple strings and disposing of unused string segments.
-use crate::ffi::{valid_var_name_char, wchar_t};
-use crate::wchar::wstr;
-use crate::wchar_ffi::WCharFromFFI;
-use cxx::{CxxWString, SharedPtr};
+
+use crate::ffi::{valid_var_name_char, wcharz_t};
+use crate::future_feature_flags::{feature_test, FeatureFlag};
+use crate::parse_constants::SOURCE_OFFSET_INVALID;
+use crate::redirection::RedirectionMode;
+use crate::wchar::{WExt, L};
+use crate::wchar_ffi::{wchar_t, wstr, WCharFromFFI, WCharToFFI, WString};
+use crate::wutil::wgettext;
+use cxx::{CxxWString, SharedPtr, UniquePtr};
+use libc::{c_int, STDIN_FILENO, STDOUT_FILENO};
+use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, Not};
+use widestring_suffix::widestrs;
 
 #[cxx::bridge]
 mod tokenizer_ffi {
+    extern "C++" {
+        include!("wutil.h");
+        include!("redirection.h");
+        type wcharz_t = super::wcharz_t;
+        type RedirectionMode = super::RedirectionMode;
+    }
+
+    /// Token types. XXX Why this isn't ParseTokenType, I'm not really sure.
+    enum TokenType {
+        /// Error reading token
+        error,
+        /// String token
+        string,
+        /// Pipe token
+        pipe,
+        /// && token
+        andand,
+        /// || token
+        oror,
+        /// End token (semicolon or newline, not literal end)
+        end,
+        /// redirection token
+        redirect,
+        /// send job to bg token
+        background,
+        /// comment token
+        comment,
+    }
+
+    enum TokenizerError {
+        none,
+        unterminated_quote,
+        unterminated_subshell,
+        unterminated_slice,
+        unterminated_escape,
+        invalid_redirect,
+        invalid_pipe,
+        invalid_pipe_ampersand,
+        closing_unopened_subshell,
+        illegal_slice,
+        closing_unopened_brace,
+        unterminated_brace,
+        expected_pclose_found_bclose,
+        expected_bclose_found_pclose,
+    }
+
+    extern "Rust" {
+        fn tokenizer_get_error_message(err: TokenizerError) -> UniquePtr<CxxWString>;
+    }
+
+    struct Tok {
+        // Offset of the token.
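+        // Offsets and lengths are in source characters; a token's text is the
+        // slice `src[offset..offset + length]` of the tokenized string, which is
+        // exactly what get_source below computes.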
+        offset: u32,
+        // Length of the token.
+        length: u32,
+
+        // If an error, this is the offset of the error within the token. A value of 0 means it occurred
+        // at 'offset'.
+        error_offset_within_token: u32,
+        error_length: u32,
+
+        // If an error, this is the error code.
+        error: TokenizerError,
+
+        // The type of the token.
+        type_: TokenType,
+    }
+    // TODO static_assert(sizeof(Tok) <= 32, "Tok expected to be 32 bytes or less");
+
+    extern "Rust" {
+        fn location_in_or_at_end_of_source_range(self: &Tok, loc: usize) -> bool;
+        #[cxx_name = "get_source"]
+        fn get_source_ffi(self: &Tok, str: &CxxWString) -> UniquePtr<CxxWString>;
+    }
+
+    extern "Rust" {
+        type Tokenizer;
+        fn new_tokenizer(start: wcharz_t, flags: u8) -> Box<Tokenizer>;
+        #[cxx_name = "next"]
+        fn next_ffi(self: &mut Tokenizer) -> UniquePtr<Tok>;
+        #[cxx_name = "text_of"]
+        fn text_of_ffi(self: &Tokenizer, tok: &Tok) -> UniquePtr<CxxWString>;
+        #[cxx_name = "is_token_delimiter"]
+        fn is_token_delimiter_ffi(c: wchar_t, next: SharedPtr<wchar_t>) -> bool;
+    }
+
+    extern "Rust" {
+        #[cxx_name = "tok_command"]
+        fn tok_command_ffi(str: &CxxWString) -> UniquePtr<CxxWString>;
+    }
+
+    /// Struct wrapping up a parsed pipe or redirection.
+    struct PipeOrRedir {
+        // The redirected fd, or -1 on overflow.
+        // In the common case of a pipe, this is 1 (STDOUT_FILENO).
+        // For example, in the case of "3>&1" this will be 3.
+        fd: i32,
+
+        // Whether we are a pipe (true) or redirection (false).
+        is_pipe: bool,
+
+        // The redirection mode if the type is redirect.
+        // Ignored for pipes.
+        mode: RedirectionMode,
+
+        // Whether, in addition to this redirection, stderr should also be dup'd to stdout
+        // For example &| or &>
+        stderr_merge: bool,
+
+        // Number of characters consumed when parsing the string.
+        consumed: usize,
+    }
+
+    extern "Rust" {
+        fn pipe_or_redir_from_string(buff: wcharz_t) -> UniquePtr<PipeOrRedir>;
+        fn is_valid(self: &PipeOrRedir) -> bool;
+        fn oflags(self: &PipeOrRedir) -> i32;
+        fn token_type(self: &PipeOrRedir) -> TokenType;
+    }
+
+    enum MoveWordStyle {
+        move_word_style_punctuation,     // stops at punctuation
+        move_word_style_path_components, // stops at path components
+        move_word_style_whitespace,      // stops at whitespace
+    }
+
+    /// Our state machine that implements "one word" movement or erasure.
+    struct MoveWordStateMachine {
+        state: u8,
+        style: MoveWordStyle,
+    }
+
+    extern "Rust" {
+        fn new_move_word_state_machine(syl: MoveWordStyle) -> Box<MoveWordStateMachine>;
+        #[cxx_name = "consume_char"]
+        fn consume_char_ffi(self: &mut MoveWordStateMachine, c: wchar_t) -> bool;
+        fn reset(self: &mut MoveWordStateMachine);
+    }
+
     extern "Rust" {
         #[cxx_name = "variable_assignment_equals_pos"]
         fn variable_assignment_equals_pos_ffi(txt: &CxxWString) -> SharedPtr<usize>;
     }
 }
+
+pub use tokenizer_ffi::{
+    MoveWordStateMachine, MoveWordStyle, PipeOrRedir, Tok, TokenType, TokenizerError,
+};
+
+#[derive(Clone, Copy)]
+pub struct TokFlags(u8);
+
+impl BitAnd for TokFlags {
+    type Output = bool;
+    fn bitand(self, rhs: Self) -> Self::Output {
+        (self.0 & rhs.0) != 0
+    }
+}
+impl BitOr for TokFlags {
+    type Output = Self;
+    fn bitor(self, rhs: Self) -> Self::Output {
+        Self(self.0 | rhs.0)
+    }
+}
+
+/// Flag telling the tokenizer to accept incomplete parameters, i.e. parameters with mismatched
+/// parentheses, etc. This is useful for tab-completion.
+pub const TOK_ACCEPT_UNFINISHED: TokFlags = TokFlags(1);
+
+/// Flag telling the tokenizer not to remove comments. Useful for syntax highlighting.
+pub const TOK_SHOW_COMMENTS: TokFlags = TokFlags(2);
+
+/// Ordinarily, the tokenizer ignores newlines following a newline, or a semicolon. This flag tells
+/// the tokenizer to return each of them as a separate END.
+pub const TOK_SHOW_BLANK_LINES: TokFlags = TokFlags(4);
+
+/// Make an effort to continue after an error.
+pub const TOK_CONTINUE_AFTER_ERROR: TokFlags = TokFlags(8);
+
+/// Get the error message for an error \p err.
+fn tokenizer_get_error_message(err: TokenizerError) -> UniquePtr<CxxWString> {
+    let s: &'static wstr = err.into();
+    s.to_ffi()
+}
+
+impl From<TokenizerError> for &'static wstr {
+    #[widestrs]
+    fn from(err: TokenizerError) -> Self {
+        match err {
+            TokenizerError::none => ""L,
+            TokenizerError::unterminated_quote => {
+                wgettext!("Unexpected end of string, quotes are not balanced")
+            }
+            TokenizerError::unterminated_subshell => {
+                wgettext!("Unexpected end of string, expecting ')'")
+            }
+            TokenizerError::unterminated_slice => {
+                wgettext!("Unexpected end of string, square brackets do not match")
+            }
+            TokenizerError::unterminated_escape => {
+                wgettext!("Unexpected end of string, incomplete escape sequence")
+            }
+            TokenizerError::invalid_redirect => {
+                wgettext!("Invalid input/output redirection")
+            }
+            TokenizerError::invalid_pipe => {
+                wgettext!("Cannot use stdin (fd 0) as pipe output")
+            }
+            TokenizerError::invalid_pipe_ampersand => {
+                wgettext!("|& is not valid. In fish, use &| to pipe both stdout and stderr.")
+            }
+            TokenizerError::closing_unopened_subshell => {
+                wgettext!("Unexpected ')' for unopened parenthesis")
+            }
+            TokenizerError::illegal_slice => {
+                wgettext!("Unexpected '[' at this location")
+            }
+            TokenizerError::closing_unopened_brace => {
+                wgettext!("Unexpected '}' for unopened brace expansion")
+            }
+            TokenizerError::unterminated_brace => {
+                wgettext!("Unexpected end of string, incomplete parameter expansion")
+            }
+            TokenizerError::expected_pclose_found_bclose => {
+                wgettext!("Unexpected '}' found, expecting ')'")
+            }
+            TokenizerError::expected_bclose_found_pclose => {
+                wgettext!("Unexpected ')' found, expecting '}'")
+            }
+            _ => {
+                panic!("Unexpected tokenizer error");
+            }
+        }
+    }
+}
+
+impl Tok {
+    fn new(r#type: TokenType) -> Tok {
+        Tok {
+            offset: 0,
+            length: 0,
+            error_offset_within_token: SOURCE_OFFSET_INVALID,
+            error_length: 0,
+            error: TokenizerError::none,
+            type_: r#type,
+        }
+    }
+    pub fn location_in_or_at_end_of_source_range(self: &Tok, loc: usize) -> bool {
+        let loc = loc as u32;
+        self.offset <= loc && loc - self.offset <= self.length
+    }
+    pub fn get_source<'a, 'b>(self: &'a Tok, str: &'b wstr) -> &'b wstr {
+        &str[self.offset as usize..(self.offset + self.length) as usize]
+    }
+    fn get_source_ffi(self: &Tok, str: &CxxWString) -> UniquePtr<CxxWString> {
+        self.get_source(&str.from_ffi()).to_ffi()
+    }
+}
+
+/// The tokenizer struct.
+pub struct Tokenizer {
+    /// An index into the original string, showing where the next token begins.
+    token_cursor: usize,
+    /// The start of the original string.
+    start: WString, // TODO Avoid copying once we drop the FFI.
+    /// Whether we have additional tokens.
+    has_next: bool,
+    /// Whether incomplete tokens are accepted.
+    accept_unfinished: bool,
+    /// Whether comments should be returned.
+    show_comments: bool,
+    /// Whether all blank lines are returned.
+    show_blank_lines: bool,
+    /// Whether to attempt to continue after an error.
+    continue_after_error: bool,
+    /// Whether to continue the previous line after the comment.
+    continue_line_after_comment: bool,
+}
+
+impl Tokenizer {
+    /// Constructor for a tokenizer. `start` is the string that is to be tokenized. It is
+    /// copied into the tokenizer (see the TODO on `start` about dropping the copy once
+    /// the FFI is gone), so the caller need not keep it alive after construction.
+    ///
+    /// \param start The string to tokenize
+    /// \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
+    /// to accept incomplete tokens, such as a subshell without a closing parenthesis, as a valid
+    /// token. Setting TOK_SHOW_COMMENTS will return comments as tokens
+    fn new(start: &wstr, flags: TokFlags) -> Self {
+        Tokenizer {
+            token_cursor: 0,
+            start: start.to_owned(),
+            has_next: true,
+            accept_unfinished: flags & TOK_ACCEPT_UNFINISHED,
+            show_comments: flags & TOK_SHOW_COMMENTS,
+            show_blank_lines: flags & TOK_SHOW_BLANK_LINES,
+            continue_after_error: flags & TOK_CONTINUE_AFTER_ERROR,
+            continue_line_after_comment: false,
+        }
+    }
+}
+
+fn new_tokenizer(start: wcharz_t, flags: u8) -> Box<Tokenizer> {
+    Box::new(Tokenizer::new(start.into(), TokFlags(flags)))
+}
+
+impl Tokenizer {
+    /// Returns the next token, or none if we are at the end.
+    pub fn next(&mut self) -> Option<Tok> {
+        // TODO Implement IntoIterator.
+        if !self.has_next {
+            return None;
+        }
+
+        // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past
+        // it.
+        loop {
+            let i = self.token_cursor;
+            if self.start.get(i..i + 2) == Some(L!("\\\n")) {
+                self.token_cursor += 2;
+                self.continue_line_after_comment = true;
+            } else if i < self.start.len() && iswspace_not_nl(self.start.char_at(i)) {
+                self.token_cursor += 1;
+            } else {
+                break;
+            }
+        }
+
+        while self.start.char_at(self.token_cursor) == '#' {
+            // We have a comment, walk over the comment.
+            let comment_start = self.token_cursor;
+            self.token_cursor = comment_end(&self.start, self.token_cursor);
+            let comment_len = self.token_cursor - comment_start;
+
+            // If we are going to continue after the comment, skip any trailing newline.
+            if self.start.as_char_slice().get(self.token_cursor) == Some(&'\n')
+                && self.continue_line_after_comment
+            {
+                self.token_cursor += 1;
+            }
+
+            // Maybe return the comment.
+            if self.show_comments {
+                let mut result = Tok::new(TokenType::comment);
+                result.offset = comment_start as u32;
+                result.length = comment_len as u32;
+                return Some(result);
+            }
+
+            while self.token_cursor < self.start.len()
+                && iswspace_not_nl(self.start.char_at(self.token_cursor))
+            {
+                self.token_cursor += 1;
+            }
+        }
+
+        // We made it past the comments and ate any trailing newlines we wanted to ignore.
+        self.continue_line_after_comment = false;
+        let start_pos = self.token_cursor;
+
+        let this_char = self.start.char_at(self.token_cursor);
+        let next_char = self
+            .start
+            .as_char_slice()
+            .get(self.token_cursor + 1)
+            .copied();
+        let buff = &self.start[self.token_cursor..];
+        match this_char {
+            '\0' => {
+                self.has_next = false;
+                None
+            }
+            '\r' | // carriage-return
+            '\n' | // newline
+            ';' => {
+                let mut result = Tok::new(TokenType::end);
+                result.offset = start_pos as u32;
+                result.length = 1;
+                self.token_cursor += 1;
+                // Hack: when we get a newline, swallow as many as we can. This compresses multiple
+                // subsequent newlines into a single one.
+                if !self.show_blank_lines {
+                    while self.token_cursor < self.start.len() {
+                        let c = self.start.char_at(self.token_cursor);
+                        if c != '\n' && c != '\r' && c != ' ' && c != '\t' {
+                            break;
+                        }
+                        self.token_cursor += 1;
+                    }
+                }
+                Some(result)
+            }
+            '&' => {
+                if next_char == Some('&') {
+                    // && is and.
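+                    // A two-character operator token: consume both '&'s. A single
+                    // '&' (backgrounding) and the '&>'/'&|' redirections are
+                    // handled in the other branches of this arm.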
+                    let mut result = Tok::new(TokenType::andand);
+                    result.offset = start_pos as u32;
+                    result.length = 2;
+                    self.token_cursor += 2;
+                    Some(result)
+                } else if next_char == Some('>') || next_char == Some('|') {
+                    // &> and &| redirect both stdout and stderr.
+                    let redir = PipeOrRedir::try_from(buff)
+                        .expect("Should always succeed to parse a &> or &| redirection");
+                    let mut result = Tok::new(redir.token_type());
+                    result.offset = start_pos as u32;
+                    result.length = redir.consumed as u32;
+                    self.token_cursor += redir.consumed;
+                    Some(result)
+                } else {
+                    let mut result = Tok::new(TokenType::background);
+                    result.offset = start_pos as u32;
+                    result.length = 1;
+                    self.token_cursor += 1;
+                    Some(result)
+                }
+            }
+            '|' => {
+                if next_char == Some('|') {
+                    // || is or.
+                    let mut result = Tok::new(TokenType::oror);
+                    result.offset = start_pos as u32;
+                    result.length = 2;
+                    self.token_cursor += 2;
+                    Some(result)
+                } else if next_char == Some('&') {
+                    // |& is a bashism; in fish it's &|.
+                    Some(self.call_error(TokenizerError::invalid_pipe_ampersand,
+                                         self.token_cursor, self.token_cursor, Some(2), 2))
+                } else {
+                    let pipe = PipeOrRedir::try_from(buff)
+                        .expect("Should always succeed to parse a | pipe");
+                    let mut result = Tok::new(pipe.token_type());
+                    result.offset = start_pos as u32;
+                    result.length = pipe.consumed as u32;
+                    self.token_cursor += pipe.consumed;
+                    Some(result)
+                }
+            }
+            '>' | '<' => {
+                // There's some duplication with the code in the default case below. The key
+                // difference here is that we must never parse these as a string; a failed
+                // redirection is an error!
+                match PipeOrRedir::try_from(buff) {
+                    Ok(redir_or_pipe) => {
+                        if redir_or_pipe.fd < 0 {
+                            Some(self.call_error(TokenizerError::invalid_redirect, self.token_cursor,
+                                                 self.token_cursor,
+                                                 Some(redir_or_pipe.consumed),
+                                                 redir_or_pipe.consumed))
+                        } else {
+                            let mut result = Tok::new(redir_or_pipe.token_type());
+                            result.offset = start_pos as u32;
+                            result.length = redir_or_pipe.consumed as u32;
+                            self.token_cursor += redir_or_pipe.consumed;
+                            Some(result)
+                        }
+                    }
+                    Err(()) => Some(self.call_error(TokenizerError::invalid_redirect, self.token_cursor,
+                                                    self.token_cursor,
+                                                    Some(0),
+                                                    0))
+                }
+            }
+            _ => {
+                // Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string.
+                let error_location = self.token_cursor;
+                let redir_or_pipe = if this_char.is_ascii_digit() {
+                    PipeOrRedir::try_from(buff).ok()
+                } else {
+                    None
+                };
+
+                match redir_or_pipe {
+                    Some(redir_or_pipe) => {
+                        // It looks like a redirection or a pipe. But we don't support piping fd 0. Note
+                        // that fd 0 may be -1, indicating overflow; but we don't treat that as a
+                        // tokenizer error.
+                        if redir_or_pipe.is_pipe && redir_or_pipe.fd == 0 {
+                            Some(self.call_error(TokenizerError::invalid_pipe, error_location,
+                                                 error_location, Some(redir_or_pipe.consumed),
+                                                 redir_or_pipe.consumed))
+                        } else {
+                            let mut result = Tok::new(redir_or_pipe.token_type());
+                            result.offset = start_pos as u32;
+                            result.length = redir_or_pipe.consumed as u32;
+                            self.token_cursor += redir_or_pipe.consumed;
+                            Some(result)
+                        }
+                    }
+                    None => {
+                        // Not a redirection or pipe, so just a string.
+                        Some(self.read_string())
+                    }
+                }
+            }
+        }
+    }
+    fn next_ffi(&mut self) -> UniquePtr<Tok> {
+        match self.next() {
+            Some(tok) => UniquePtr::new(tok),
+            None => UniquePtr::null(),
+        }
+    }
+}
+
+/// Test if a character is whitespace. Differs from iswspace in that it does not consider a
+/// newline to be whitespace.
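+/// In fish a newline terminates a command, so the tokenizer must report it as an
+/// end token (like ';') rather than skip it as inter-token padding.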
+fn iswspace_not_nl(c: char) -> bool {
+    match c {
+        ' ' | '\t' | '\r' => true,
+        '\n' => false,
+        _ => c.is_whitespace(),
+    }
+}
+
+impl Tokenizer {
+    /// Returns the text of a token, as a string.
+    pub fn text_of(&self, tok: &Tok) -> &wstr {
+        tok.get_source(&self.start)
+    }
+    fn text_of_ffi(&self, tok: &Tok) -> UniquePtr<CxxWString> {
+        self.text_of(tok).to_ffi()
+    }
+
+    /// Return an error token and mark that we no longer have a next token.
+    fn call_error(
+        &mut self,
+        error_type: TokenizerError,
+        token_start: usize,
+        error_loc: usize,
+        token_length: Option<usize>,
+        error_len: usize,
+    ) -> Tok {
+        assert!(
+            error_type != TokenizerError::none,
+            "TokenizerError::none passed to call_error"
+        );
+        assert!(error_loc >= token_start, "Invalid error location");
+        assert!(self.token_cursor >= token_start, "Invalid buff location");
+
+        // If continue_after_error is set and we have a real token length, then skip past it.
+        // Otherwise give up.
+        match token_length {
+            Some(token_length) if self.continue_after_error => {
+                assert!(
+                    self.token_cursor < error_loc + token_length,
+                    "Unable to continue past error"
+                );
+                self.token_cursor = error_loc + token_length;
+            }
+            _ => self.has_next = false,
+        }
+
+        Tok {
+            offset: token_start as u32,
+            length: token_length.unwrap_or(self.token_cursor - token_start) as u32,
+            error_offset_within_token: (error_loc - token_start) as u32,
+            error_length: error_len as u32,
+            error: error_type,
+            type_: TokenType::error,
+        }
+    }
+}
+
+impl Tokenizer {
+    /// Read the next token as a string.
+    fn read_string(&mut self) -> Tok {
+        let mut mode = TOK_MODE_REGULAR_TEXT;
+        let mut paran_offsets = vec![];
+        let mut brace_offsets = vec![];
+        let mut expecting = vec![];
+        let mut quoted_cmdsubs = vec![];
+        let mut slice_offset = 0;
+        let buff_start = self.token_cursor;
+        let mut is_token_begin = true;
+
+        fn process_opening_quote(
+            this: &mut Tokenizer,
+            quoted_cmdsubs: &mut Vec<usize>,
+            paran_offsets: &mut Vec<usize>,
+            quote: char,
+        ) -> Result<(), usize> {
+            if let Some(end) = quote_end(&this.start, this.token_cursor, quote) {
+                if this.start.char_at(end) == '$' {
+                    quoted_cmdsubs.push(paran_offsets.len());
+                }
+                this.token_cursor = end;
+                Ok(())
+            } else {
+                let error_loc = this.token_cursor;
+                this.token_cursor = this.start.len();
+                Err(error_loc)
+            }
+        }
+
+        while self.token_cursor != self.start.len() {
+            let c = self.start.char_at(self.token_cursor);
+
+            // Make sure this character isn't being escaped before anything else
+            if mode & TOK_MODE_CHAR_ESCAPE {
+                mode &= !TOK_MODE_CHAR_ESCAPE;
+                // and do nothing more
+            } else if myal(c) {
+                // Early exit optimization in case the character is just a letter,
+                // which has no special meaning to the tokenizer, i.e. the same mode continues.
+            }
+            // Now proceed with the evaluation of the token, first checking to see if the token
+            // has been explicitly ignored (escaped).
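+            // An unescaped backslash only sets the escape flag; the check at the
+            // top of the loop then clears it and treats the next character
+            // literally.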
+            else if c == '\\' {
+                mode |= TOK_MODE_CHAR_ESCAPE;
+            } else if c == '#' && is_token_begin {
+                self.token_cursor = comment_end(&self.start, self.token_cursor) - 1;
+            } else if c == '(' {
+                paran_offsets.push(self.token_cursor);
+                expecting.push(')');
+                mode |= TOK_MODE_SUBSHELL;
+            } else if c == '{' {
+                brace_offsets.push(self.token_cursor);
+                expecting.push('}');
+                mode |= TOK_MODE_CURLY_BRACES;
+            } else if c == ')' {
+                if expecting.last() == Some(&'}') {
+                    return self.call_error(
+                        TokenizerError::expected_bclose_found_pclose,
+                        self.token_cursor,
+                        self.token_cursor,
+                        Some(1),
+                        1,
+                    );
+                }
+                if paran_offsets.is_empty() {
+                    return self.call_error(
+                        TokenizerError::closing_unopened_subshell,
+                        self.token_cursor,
+                        self.token_cursor,
+                        Some(1),
+                        1,
+                    );
+                }
+                paran_offsets.pop();
+                if paran_offsets.is_empty() {
+                    mode &= !TOK_MODE_SUBSHELL;
+                }
+                expecting.pop();
+                // Check if the ) completed a quoted command substitution.
+                if quoted_cmdsubs.last() == Some(&paran_offsets.len()) {
+                    quoted_cmdsubs.pop();
+                    // The "$(" part of a quoted command substitution closes double quotes. To keep
+                    // quotes balanced, act as if there was an invisible double quote after the ")".
+                    if let Err(error_loc) =
+                        process_opening_quote(self, &mut quoted_cmdsubs, &mut paran_offsets, '"')
+                    {
+                        if !self.accept_unfinished {
+                            return self.call_error(
+                                TokenizerError::unterminated_quote,
+                                buff_start,
+                                error_loc,
+                                None,
+                                0,
+                            );
+                        }
+                        break;
+                    }
+                }
+            } else if c == '}' {
+                if expecting.last() == Some(&')') {
+                    return self.call_error(
+                        TokenizerError::expected_pclose_found_bclose,
+                        self.token_cursor,
+                        self.token_cursor,
+                        Some(1),
+                        1,
+                    );
+                }
+                if brace_offsets.is_empty() {
+                    return self.call_error(
+                        TokenizerError::closing_unopened_brace,
+                        self.token_cursor,
+                        self.start.len(),
+                        None,
+                        0,
+                    );
+                }
+                brace_offsets.pop();
+                if brace_offsets.is_empty() {
+                    mode &= !TOK_MODE_CURLY_BRACES;
+                }
+                expecting.pop();
+            } else if c == '[' {
+                if self.token_cursor != buff_start {
+                    mode |= TOK_MODE_ARRAY_BRACKETS;
+                    slice_offset = self.token_cursor;
+                } else {
+                    // This is actually allowed so the test operator `[` can be used as the head of a
+                    // command
+                }
+            }
+            // Only exit bracket mode if we are in bracket mode.
+            // Reason: `]` can be a parameter, e.g. last parameter to `[` test alias.
+            // e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket
+            else if c == ']' && (mode & TOK_MODE_ARRAY_BRACKETS) {
+                mode &= !TOK_MODE_ARRAY_BRACKETS;
+            } else if c == '\'' || c == '"' {
+                if let Err(error_loc) =
+                    process_opening_quote(self, &mut quoted_cmdsubs, &mut paran_offsets, c)
+                {
+                    if !self.accept_unfinished {
+                        return self.call_error(
+                            TokenizerError::unterminated_quote,
+                            buff_start,
+                            error_loc,
+                            None,
+                            1,
+                        );
+                    }
+                    break;
+                }
+            } else if mode == TOK_MODE_REGULAR_TEXT
+                && !tok_is_string_character(
+                    c,
+                    self.start
+                        .as_char_slice()
+                        .get(self.token_cursor + 1)
+                        .copied(),
+                )
+            {
+                break;
+            }
+
+            let next = self
+                .start
+                .as_char_slice()
+                .get(self.token_cursor + 1)
+                .copied();
+            is_token_begin = is_token_delimiter(c, next);
+            self.token_cursor += 1;
+        }
+
+        if !self.accept_unfinished && mode != TOK_MODE_REGULAR_TEXT {
+            // These are all "unterminated", so the only char we can mark as an error
+            // is the opener (the closing char could be anywhere!)
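+            // For example, tokenizing `echo (true` reports unterminated_subshell
+            // with the error located at the opening '('.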
+            //
+            // (except for TOK_MODE_CHAR_ESCAPE, which is one character long by definition)
+            if mode & TOK_MODE_CHAR_ESCAPE {
+                return self.call_error(
+                    TokenizerError::unterminated_escape,
+                    buff_start,
+                    self.token_cursor - 1,
+                    None,
+                    1,
+                );
+            } else if mode & TOK_MODE_ARRAY_BRACKETS {
+                return self.call_error(
+                    TokenizerError::unterminated_slice,
+                    buff_start,
+                    slice_offset,
+                    None,
+                    1,
+                );
+            } else if mode & TOK_MODE_SUBSHELL {
+                assert!(!paran_offsets.is_empty());
+                let offset_of_open_paran = *paran_offsets.last().unwrap();
+
+                return self.call_error(
+                    TokenizerError::unterminated_subshell,
+                    buff_start,
+                    offset_of_open_paran,
+                    None,
+                    1,
+                );
+            } else if mode & TOK_MODE_CURLY_BRACES {
+                assert!(!brace_offsets.is_empty());
+                let offset_of_open_brace = *brace_offsets.last().unwrap();
+
+                return self.call_error(
+                    TokenizerError::unterminated_brace,
+                    buff_start,
+                    offset_of_open_brace,
+                    None,
+                    1,
+                );
+            } else {
+                panic!("Unknown non-regular-text mode");
+            }
+        }
+
+        let mut result = Tok::new(TokenType::string);
+        result.offset = buff_start as u32;
+        result.length = (self.token_cursor - buff_start) as u32;
+        result
+    }
+}
+
+pub fn quote_end(s: &wstr, mut pos: usize, quote: char) -> Option<usize> {
+    loop {
+        pos += 1;
+
+        if pos == s.len() {
+            return None;
+        }
+
+        let c = s.char_at(pos);
+        if c == '\\' {
+            pos += 1;
+            if pos == s.len() {
+                return None;
+            }
+        } else if c == quote ||
+            // Command substitutions also end a double quoted string. This is how we
+            // support command substitutions inside double quotes.
+            (quote == '"' && c == '$' && s.as_char_slice().get(pos + 1) == Some(&'('))
+        {
+            return Some(pos);
+        }
+    }
+}
+
+pub fn comment_end(s: &wstr, mut pos: usize) -> usize {
+    loop {
+        pos += 1;
+        if pos == s.len() || s.char_at(pos) == '\n' {
+            return pos;
+        }
+    }
+}
+
+/// Tests if this character can be a part of a string. Hash (#) starts a comment if it's the first
+/// character in a token; otherwise it is considered a string character. See issue #953.
+fn tok_is_string_character(c: char, next: Option<char>) -> bool {
+    match c {
+        // Unconditional separators.
+        '\0' | ' ' | '\n' | '|' | '\t' | ';' | '\r' | '<' | '>' => false,
+        '&' => {
+            if feature_test(FeatureFlag::ampersand_nobg_in_token) {
+                // Unlike in other shells, '&' is not special if followed by a string character.
+                next.map(|nc| tok_is_string_character(nc, None))
+                    .unwrap_or(false)
+            } else {
+                false
+            }
+        }
+        _ => true,
+    }
+}
+
+/// Quick test to catch the most common 'non-magical' characters, makes read_string slightly faster
+/// by adding a fast path for the most common characters. This is obviously not a suitable
+/// replacement for iswalpha.
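+/// (The test is equivalent to `c.is_ascii_alphabetic()`.)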
+fn myal(c: char) -> bool {
+    ('a'..='z').contains(&c) || ('A'..='Z').contains(&c)
+}
+
+#[derive(Clone, Copy, PartialEq, Eq)]
+struct TokModes(u8);
+
+const TOK_MODE_REGULAR_TEXT: TokModes = TokModes(0); // regular text
+const TOK_MODE_SUBSHELL: TokModes = TokModes(1 << 0); // inside of subshell parentheses
+const TOK_MODE_ARRAY_BRACKETS: TokModes = TokModes(1 << 1); // inside of array brackets
+const TOK_MODE_CURLY_BRACES: TokModes = TokModes(1 << 2);
+const TOK_MODE_CHAR_ESCAPE: TokModes = TokModes(1 << 3);
+
+impl BitAnd for TokModes {
+    type Output = bool;
+    fn bitand(self, rhs: Self) -> Self::Output {
+        (self.0 & rhs.0) != 0
+    }
+}
+impl BitAndAssign for TokModes {
+    fn bitand_assign(&mut self, rhs: Self) {
+        self.0 &= rhs.0
+    }
+}
+impl BitOrAssign for TokModes {
+    fn bitor_assign(&mut self, rhs: Self) {
+        self.0 |= rhs.0
+    }
+}
+impl Not for TokModes {
+    type Output = TokModes;
+    fn not(self) -> Self::Output {
+        TokModes(!self.0)
+    }
+}
+
+/// Tests if this character can delimit tokens.
+pub fn is_token_delimiter(c: char, next: Option<char>) -> bool {
+    c == '(' || !tok_is_string_character(c, next)
+}
+
+fn is_token_delimiter_ffi(c: wchar_t, next: SharedPtr<wchar_t>) -> bool {
+    is_token_delimiter(
+        c.try_into().unwrap(),
+        next.as_ref().map(|c| (*c).try_into().unwrap()),
+    )
+}
+
+/// \return the first token from the string, skipping variable assignments like A=B.
+pub fn tok_command(str: &wstr) -> WString {
+    let mut t = Tokenizer::new(str, TokFlags(0));
+    while let Some(token) = t.next() {
+        if token.type_ != TokenType::string {
+            return WString::new();
+        }
+        let text = t.text_of(&token);
+        if variable_assignment_equals_pos(text).is_some() {
+            continue;
+        }
+        return text.to_owned();
+    }
+    WString::new()
+}
+fn tok_command_ffi(str: &CxxWString) -> UniquePtr<CxxWString> {
+    tok_command(&str.from_ffi()).to_ffi()
+}
+
+impl TryFrom<&wstr> for PipeOrRedir {
+    type Error = ();
+
+    /// Examples of supported syntaxes.
+    /// Note we are only responsible for parsing the redirection part, not 'cmd' or 'file'.
+    ///
+    ///  cmd | cmd        normal pipe
+    ///  cmd &| cmd       normal pipe plus stderr-merge
+    ///  cmd >| cmd       pipe with explicit fd
+    ///  cmd 2>| cmd      pipe with explicit fd
+    ///  cmd < file       stdin redirection
+    ///  cmd > file       redirection
+    ///  cmd >> file      appending redirection
+    ///  cmd >? file      noclobber redirection
+    ///  cmd >>? file     appending noclobber redirection
+    ///  cmd 2> file      file redirection with explicit fd
+    ///  cmd >&2          fd redirection with no explicit src fd (stdout is used)
+    ///  cmd 1>&2         fd redirection with an explicit src fd
+    ///  cmd <&2          fd redirection with no explicit src fd (stdin is used)
+    ///  cmd 3<&0         fd redirection with an explicit src fd
+    ///  cmd &> file      redirection with stderr merge
+    ///  cmd ^ file       caret (stderr) redirection, perhaps disabled via feature flags
+    ///  cmd ^^ file      caret (stderr) redirection, perhaps disabled via feature flags
+    fn try_from(buff: &wstr) -> Result<Self, Self::Error> {
+        // Extract a range of leading fd.
+        let mut cursor = buff.chars().take_while(|c| c.is_ascii_digit()).count();
+        let fd_buff = &buff[..cursor];
+        let has_fd = !fd_buff.is_empty();
+
+        // Try consuming a given character.
+        // Return true if consumed. On success, advances cursor.
+        let try_consume = |cursor: &mut usize, c| -> bool {
+            if buff.char_at(*cursor) != c {
+                false
+            } else {
+                *cursor += 1;
+                true
+            }
+        };
+
+        // Like try_consume, but asserts on failure.
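+        // For example, parsing ">>?": consume takes the first '>', try_consume
+        // takes the second '>' (append mode), and the trailing '?' later switches
+        // the mode to noclobber.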
+        let consume = |cursor: &mut usize, c| {
+            assert!(buff.char_at(*cursor) == c, "Failed to consume char");
+            *cursor += 1;
+        };
+
+        let c = buff.char_at(cursor);
+        let mut result = PipeOrRedir {
+            fd: -1,
+            is_pipe: false,
+            mode: RedirectionMode::overwrite,
+            stderr_merge: false,
+            consumed: 0,
+        };
+        match c {
+            '|' => {
+                if has_fd {
+                    // Like 123|
+                    return Err(());
+                }
+                consume(&mut cursor, '|');
+                assert!(
+                    buff.char_at(cursor) != '|',
+                    "|| passed as redirection, this should have been handled as 'or' by the caller"
+                );
+                result.fd = STDOUT_FILENO;
+                result.is_pipe = true;
+            }
+            '>' => {
+                consume(&mut cursor, '>');
+                if try_consume(&mut cursor, '>') {
+                    result.mode = RedirectionMode::append;
+                }
+                if try_consume(&mut cursor, '|') {
+                    // Note we differ from bash here.
+                    // Consider `echo foo 2>| bar`
+                    // In fish, this is a *pipe*. Run bar as a command and attach foo's stderr to bar's
+                    // stdin, while leaving stdout as tty.
+                    // In bash, this is a *redirection* to bar as a file. It is like > but ignores
+                    // noclobber.
+                    result.is_pipe = true;
+                    result.fd = if has_fd {
+                        parse_fd(fd_buff) // like 2>|
+                    } else {
+                        STDOUT_FILENO // like >|
+                    };
+                } else if try_consume(&mut cursor, '&') {
+                    // This is a redirection to an fd.
+                    // Note that we allow ">>&", but it's still just writing to the fd - "appending" to
+                    // it doesn't make sense.
+                    result.mode = RedirectionMode::fd;
+                    result.fd = if has_fd {
+                        parse_fd(fd_buff) // like 1>&2
+                    } else {
+                        STDOUT_FILENO // like >&2
+                    };
+                } else {
+                    // This is a redirection to a file.
+                    result.fd = if has_fd {
+                        parse_fd(fd_buff) // like 1> file.txt
+                    } else {
+                        STDOUT_FILENO // like > file.txt
+                    };
+                    if result.mode != RedirectionMode::append {
+                        result.mode = RedirectionMode::overwrite;
+                    }
+                    // Note 'echo abc >>? file' is valid: it means append and noclobber.
+                    // But here "noclobber" means the file must not exist, so appending
+                    // can be ignored.
+                    if try_consume(&mut cursor, '?') {
+                        result.mode = RedirectionMode::noclob;
+                    }
+                }
+            }
+            '<' => {
+                consume(&mut cursor, '<');
+                if try_consume(&mut cursor, '&') {
+                    result.mode = RedirectionMode::fd;
+                } else {
+                    result.mode = RedirectionMode::input;
+                }
+                result.fd = if has_fd {
+                    parse_fd(fd_buff) // like 1<&3 or 1< /tmp/file.txt
+                } else {
+                    STDIN_FILENO // like <&3 or < /tmp/file.txt
+                };
+            }
+            '&' => {
+                consume(&mut cursor, '&');
+                if try_consume(&mut cursor, '|') {
+                    // &| is pipe with stderr merge.
+                    result.fd = STDOUT_FILENO;
+                    result.is_pipe = true;
+                    result.stderr_merge = true;
+                } else if try_consume(&mut cursor, '>') {
+                    result.fd = STDOUT_FILENO;
+                    result.stderr_merge = true;
+                    result.mode = RedirectionMode::overwrite;
+                    if try_consume(&mut cursor, '>') {
+                        result.mode = RedirectionMode::append; // like &>>
+                    }
+                    if try_consume(&mut cursor, '?') {
+                        result.mode = RedirectionMode::noclob; // like &>? or &>>?
+                    }
+                } else {
+                    return Err(());
+                }
+            }
+            _ => {
+                // Not a redirection.
+                return Err(());
+            }
+        }
+
+        result.consumed = cursor;
+        assert!(
+            result.consumed > 0,
+            "Should have consumed at least one character on success"
+        );
+        Ok(result)
+    }
+}
+
+fn pipe_or_redir_from_string(buff: wcharz_t) -> UniquePtr<PipeOrRedir> {
+    match PipeOrRedir::try_from(Into::<&wstr>::into(buff)) {
+        Ok(p) => UniquePtr::new(p),
+        Err(()) => UniquePtr::null(),
+    }
+}
+
+impl PipeOrRedir {
+    /// \return the oflags (as in open(2)) for this redirection.
+    pub fn oflags(&self) -> c_int {
+        self.mode.oflags().unwrap_or(-1)
+    }
+
+    // \return if we are "valid". Here "valid" means only that the source fd did not overflow.
Here "valid" means only that the source fd did not overflow. + // For example 99999999999> is invalid. + fn is_valid(&self) -> bool { + self.fd >= 0 + } + + // \return the token type for this redirection. + fn token_type(&self) -> TokenType { + if self.is_pipe { + TokenType::pipe + } else { + TokenType::redirect + } + } +} + +// Parse an fd from the non-empty string [start, end), all of which are digits. +// Return the fd, or -1 on overflow. +fn parse_fd(s: &wstr) -> i32 { + assert!(!s.is_empty()); + let mut big_fd: usize = 0; + for c in s.chars() { + assert!(c.is_ascii_digit()); + big_fd = big_fd * 10 + (c.to_digit(10).unwrap() as usize); + if big_fd > (i32::MAX as usize) { + return -1; + } + } + assert!(big_fd <= (i32::MAX as usize), "big_fd should be in range"); + big_fd as i32 +} + +fn new_move_word_state_machine(syl: MoveWordStyle) -> Box { + Box::new(MoveWordStateMachine::new(syl)) +} + +impl MoveWordStateMachine { + pub fn new(style: MoveWordStyle) -> Self { + MoveWordStateMachine { state: 0, style } + } + + pub fn consume_char(&mut self, c: char) -> bool { + match self.style { + MoveWordStyle::move_word_style_punctuation => self.consume_char_punctuation(c), + MoveWordStyle::move_word_style_path_components => self.consume_char_path_components(c), + MoveWordStyle::move_word_style_whitespace => self.consume_char_whitespace(c), + _ => panic!(), + } + } + pub fn consume_char_ffi(&mut self, c: wchar_t) -> bool { + self.consume_char(c.try_into().unwrap()) + } + + pub fn reset(&mut self) { + self.state = 0; + } + + fn consume_char_punctuation(&mut self, c: char) -> bool { + const S_ALWAYS_ONE: u8 = 0; + const S_REST: u8 = 1; + const S_WHITESPACE_REST: u8 = 2; + const S_WHITESPACE: u8 = 3; + const S_ALPHANUMERIC: u8 = 4; + const S_END: u8 = 5; + + let mut consumed = false; + while self.state != S_END && !consumed { + match self.state { + S_ALWAYS_ONE => { + // Always consume the first character. + consumed = true; + if c.is_whitespace() { + self.state = S_WHITESPACE; + } else if c.is_alphanumeric() { + self.state = S_ALPHANUMERIC; + } else { + // Don't allow switching type (ws->nonws) after non-whitespace and + // non-alphanumeric. + self.state = S_REST; + } + } + S_REST => { + if c.is_whitespace() { + // Consume only trailing whitespace. + self.state = S_WHITESPACE_REST; + } else if c.is_alphanumeric() { + // Consume only alnums. + self.state = S_ALPHANUMERIC; + } else { + consumed = false; + self.state = S_END; + } + } + S_WHITESPACE_REST | S_WHITESPACE => { + // "whitespace" consumes whitespace and switches to alnums, + // "whitespace_rest" only consumes whitespace. + if c.is_whitespace() { + // Consumed whitespace. 
+ consumed = true; + } else { + self.state = if self.state == S_WHITESPACE { + S_ALPHANUMERIC + } else { + S_END + }; + } + } + S_ALPHANUMERIC => { + if c.is_alphanumeric() { + consumed = true; // consumed alphanumeric + } else { + self.state = S_END; + } + } + _ => {} + } + } + consumed + } + + fn consume_char_path_components(&mut self, c: char) -> bool { + const S_INITIAL_PUNCTUATION: u8 = 0; + const S_WHITESPACE: u8 = 1; + const S_SEPARATOR: u8 = 2; + const S_SLASH: u8 = 3; + const S_PATH_COMPONENT_CHARACTERS: u8 = 4; + const S_INITIAL_SEPARATOR: u8 = 5; + const S_END: u8 = 6; + + let mut consumed = false; + while self.state != S_END && !consumed { + match self.state { + S_INITIAL_PUNCTUATION => { + if !is_path_component_character(c) && !c.is_whitespace() { + self.state = S_INITIAL_SEPARATOR; + } else { + if !is_path_component_character(c) { + consumed = true; + } + self.state = S_WHITESPACE; + } + } + S_WHITESPACE => { + if c.is_whitespace() { + consumed = true; // consumed whitespace + } else if c == '/' || is_path_component_character(c) { + self.state = S_SLASH; // path component + } else { + self.state = S_SEPARATOR; // path separator + } + } + S_SEPARATOR => { + if !c.is_whitespace() && !is_path_component_character(c) { + consumed = true; // consumed separator + } else { + self.state = S_END; + } + } + S_SLASH => { + if c == '/' { + consumed = true; // consumed slash + } else { + self.state = S_PATH_COMPONENT_CHARACTERS; + } + } + S_PATH_COMPONENT_CHARACTERS => { + if is_path_component_character(c) { + consumed = true; // consumed string character except slash + } else { + self.state = S_END; + } + } + S_INITIAL_SEPARATOR => { + if is_path_component_character(c) { + consumed = true; + self.state = S_PATH_COMPONENT_CHARACTERS; + } else if c.is_whitespace() { + self.state = S_END; + } else { + consumed = true; + } + } + _ => {} + } + } + consumed + } + + fn consume_char_whitespace(&mut self, c: char) -> bool { + // Consume a "word" of printable characters plus any leading whitespace. + const S_ALWAYS_ONE: u8 = 0; + const S_BLANK: u8 = 1; + const S_GRAPH: u8 = 2; + const S_END: u8 = 3; + + let mut consumed = false; + while self.state != S_END && !consumed { + match self.state { + S_ALWAYS_ONE => { + consumed = true; // always consume the first character + // If it's not whitespace, only consume those from here. + if !c.is_whitespace() { + self.state = S_GRAPH; + } else { + // If it's whitespace, keep consuming whitespace until the graphs. + self.state = S_BLANK; + } + } + S_BLANK => { + if c.is_whitespace() { + consumed = true; // consumed whitespace + } else { + self.state = S_GRAPH; + } + } + S_GRAPH => { + if !c.is_whitespace() { + consumed = true; // consumed printable non-space + } else { + self.state = S_END; + } + } + _ => {} + } + } + consumed + } +} + +fn is_path_component_character(c: char) -> bool { + tok_is_string_character(c, None) && !L!("/={,}'\":@").as_char_slice().contains(&c) +} + /// The position of the equal sign in a variable assignment like foo=bar. /// /// Return the location of the equals sign, or none if the string does diff --git a/src/ast.cpp b/src/ast.cpp index f14bf3e7b..bd5d0b23b 100644 --- a/src/ast.cpp +++ b/src/ast.cpp @@ -77,8 +77,7 @@ static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token } /// Convert from tokenizer_t's token type to a parse_token_t type. 
-static parse_token_type_t parse_token_type_from_tokenizer_token(
-    enum token_type_t tokenizer_token_type) {
+static parse_token_type_t parse_token_type_from_tokenizer_token(token_type_t tokenizer_token_type) {
     switch (tokenizer_token_type) {
         case token_type_t::string:
             return parse_token_type_t::string;
@@ -111,7 +110,7 @@ class token_stream_t {
     explicit token_stream_t(const wcstring &src, parse_tree_flags_t flags,
                             std::vector<source_range_t> &comments)
         : src_(src),
-          tok_(src_.c_str(), tokenizer_flags_from_parse_flags(flags)),
+          tok_(new_tokenizer(src_.c_str(), tokenizer_flags_from_parse_flags(flags))),
           comment_ranges(comments) {}
 
     /// \return the token at the given index, without popping it. If the token stream is exhausted,
@@ -161,8 +160,8 @@ class token_stream_t {
     /// \return a new parse token, advancing the tokenizer.
     /// This returns comments.
     parse_token_t advance_1() {
-        auto mtoken = tok_.next();
-        if (!mtoken.has_value()) {
+        auto mtoken = tok_->next();
+        if (!mtoken) {
             return parse_token_t{parse_token_type_t::terminate};
         }
        const tok_t &token = *mtoken;
@@ -171,9 +170,9 @@ class token_stream_t {
         // `builtin --names` lists builtins, but `builtin "--names"` attempts to run --names as a
         // command. Amazingly as of this writing (10/12/13) nobody seems to have noticed this.
         // Squint at it really hard and it even starts to look like a feature.
-        parse_token_t result{parse_token_type_from_tokenizer_token(token.type)};
-        const wcstring &text = tok_.copy_text_of(token, &storage_);
-        result.keyword = keyword_for_token(token.type, text);
+        parse_token_t result{parse_token_type_from_tokenizer_token(token.type_)};
+        const wcstring &text = storage_ = *tok_->text_of(token);
+        result.keyword = keyword_for_token(token.type_, text);
         result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
         result.is_help_argument = (text == L"-h" || text == L"--help");
         result.is_newline = (result.type == parse_token_type_t::end && text == L"\n");
@@ -222,7 +221,7 @@ class token_stream_t {
     const wcstring &src_;
 
     // The tokenizer to generate new tokens.
-    tokenizer_t tok_;
+    rust::Box<Tokenizer> tok_;
 
     /// Any comment nodes are collected here.
     /// These are only collected if parse_flag_include_comments is set.
@@ -749,7 +748,7 @@ struct populator_t {
             case parse_token_type_t::tokenizer_error:
                 parse_error(tok, parse_error_from_tokenizer_error(tok.tok_error), L"%ls",
-                            tokenizer_get_error_message(tok.tok_error));
+                            tokenizer_get_error_message(tok.tok_error)->c_str());
                 break;
 
             case parse_token_type_t::end:
diff --git a/src/builtins/commandline.cpp b/src/builtins/commandline.cpp
index 51bf17f26..5dc33a65d 100644
--- a/src/builtins/commandline.cpp
+++ b/src/builtins/commandline.cpp
@@ -103,12 +103,12 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
     // std::fwprintf( stderr, L"Subshell: %ls, end char %lc\n", buff, *end );
     wcstring out;
     wcstring buff(begin, end - begin);
-    tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED);
-    while (auto token = tok.next()) {
+    auto tok = new_tokenizer(buff.c_str(), TOK_ACCEPT_UNFINISHED);
+    while (auto token = tok->next()) {
         if ((cut_at_cursor) && (token->offset + token->length >= pos)) break;
 
-        if (token->type == token_type_t::string) {
-            wcstring tmp = tok.text_of(*token);
+        if (token->type_ == token_type_t::string) {
+            wcstring tmp = *tok->text_of(*token);
             unescape_string_in_place(&tmp, UNESCAPE_INCOMPLETE);
             out.append(tmp);
             out.push_back(L'\n');
diff --git a/src/builtins/fg.cpp b/src/builtins/fg.cpp
index f9a51e67d..73caca9f1 100644
--- a/src/builtins/fg.cpp
+++ b/src/builtins/fg.cpp
@@ -107,7 +107,7 @@ maybe_t<int> builtin_fg(parser_t &parser, io_streams_t &streams, const wchar_t *
         std::fwprintf(stderr, FG_MSG, job->job_id(), job->command_wcstr());
     }
 
-    wcstring ft = tok_command(job->command());
+    wcstring ft = *tok_command(job->command());
     if (!ft.empty()) {
         // Provide value for `status current-command`
         parser.libdata().status_vars.command = ft;
diff --git a/src/builtins/read.cpp b/src/builtins/read.cpp
index 72b176af8..ba16d0aa2 100644
--- a/src/builtins/read.cpp
+++ b/src/builtins/read.cpp
@@ -425,7 +425,8 @@ static int validate_read_args(const wchar_t *cmd, read_cmd_opts_t &opts, int arg
         return STATUS_INVALID_ARGS;
     }
     if (env_var_t::flags_for(argv[i]) & env_var_t::flag_read_only) {
-        streams.err.append_format(_(L"%ls: %ls: cannot overwrite read-only variable"), cmd, argv[i]);
+        streams.err.append_format(_(L"%ls: %ls: cannot overwrite read-only variable"), cmd,
+                                  argv[i]);
         builtin_print_error_trailer(parser, streams.err, cmd);
         return STATUS_INVALID_ARGS;
     }
@@ -529,13 +530,13 @@ maybe_t<int> builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t
     }
 
     if (opts.tokenize) {
-        tokenizer_t tok{buff.c_str(), TOK_ACCEPT_UNFINISHED};
+        auto tok = new_tokenizer(buff.c_str(), TOK_ACCEPT_UNFINISHED);
         wcstring out;
         if (opts.array) {
             // Array mode: assign each token as a separate element of the sole var.
             wcstring_list_t tokens;
-            while (auto t = tok.next()) {
-                auto text = tok.text_of(*t);
+            while (auto t = tok->next()) {
+                auto text = *tok->text_of(*t);
                 if (unescape_string(text, &out, UNESCAPE_DEFAULT)) {
                     tokens.push_back(out);
                 } else {
@@ -545,9 +546,9 @@ maybe_t<int> builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t
 
             parser.set_var_and_fire(*var_ptr++, opts.place, std::move(tokens));
         } else {
-            maybe_t<tok_t> t;
-            while ((vars_left() - 1 > 0) && (t = tok.next())) {
-                auto text = tok.text_of(*t);
+            std::unique_ptr<tok_t> t;
+            while ((vars_left() - 1 > 0) && (t = tok->next())) {
+                auto text = *tok->text_of(*t);
                 if (unescape_string(text, &out, UNESCAPE_DEFAULT)) {
                     parser.set_var_and_fire(*var_ptr++, opts.place, out);
                 } else {
@@ -556,7 +557,7 @@ maybe_t<int> builtin_read(parser_t &parser, io_streams_t &streams, const wchar_t
             }
 
             // If we still have tokens, set the last variable to them.
-            if ((t = tok.next())) {
+            if ((t = tok->next())) {
                 wcstring rest = wcstring(buff, t->offset);
                 parser.set_var_and_fire(*var_ptr++, opts.place, std::move(rest));
             }
diff --git a/src/complete.cpp b/src/complete.cpp
index 96e7445a0..95d91c6da 100644
--- a/src/complete.cpp
+++ b/src/complete.cpp
@@ -857,7 +857,7 @@ bool completer_t::complete_param_for_command(const wcstring &cmd_orig, const wcs
         if (wildcard_match(match, key.first)) {
             // Copy all of their options into our list. Oof, this is a lot of copying.
             // We have to copy them in reverse order to preserve legacy behavior (#9221).
-            const auto& options = kv.second.get_options();
+            const auto &options = kv.second.get_options();
             all_options.emplace_back(options.rbegin(), options.rend());
         }
     }
@@ -887,7 +887,8 @@ bool completer_t::complete_param_for_command(const wcstring &cmd_orig, const wcs
 
         if (this->conditions_test(o.conditions)) {
             if (o.type == option_type_short) {
-                // Only override a true last_option_requires_param value with a false one
+                // Only override a true last_option_requires_param value with a false
+                // one
                 if (last_option_requires_param.has_value()) {
                     last_option_requires_param =
                         *last_option_requires_param && o.result_mode.requires_param;
@@ -1402,10 +1403,10 @@ void completer_t::walk_wrap_chain(const wcstring &cmd, const wcstring &cmdline,
 
     // Separate the wrap target into any variable assignments VAR=... and the command itself.
     wcstring wrapped_command;
-    tokenizer_t tokenizer(wt.c_str(), 0);
+    auto tokenizer = new_tokenizer(wt.c_str(), 0);
     size_t wrapped_command_offset_in_wt = wcstring::npos;
-    while (auto tok = tokenizer.next()) {
-        wcstring tok_src = tok->get_source(wt);
+    while (auto tok = tokenizer->next()) {
+        wcstring tok_src = *tok->get_source(wt);
         if (variable_assignment_equals_pos(tok_src)) {
             ad->var_assignments->push_back(std::move(tok_src));
         } else {
@@ -1485,7 +1486,7 @@ void completer_t::mark_completions_duplicating_arguments(const wcstring &cmd,
     // Get all the arguments, unescaped, into an array that we're going to bsearch.
wcstring_list_t arg_strs; for (const auto &arg : args) { - wcstring argstr = arg.get_source(cmd); + wcstring argstr = *arg.get_source(cmd); wcstring argstr_unesc; if (unescape_string(argstr, &argstr_unesc, UNESCAPE_DEFAULT)) { arg_strs.push_back(std::move(argstr_unesc)); @@ -1542,7 +1543,7 @@ void completer_t::perform_for_commandline(wcstring cmdline) { tokens.erase( std::remove_if(tokens.begin(), tokens.end(), [&cmdline](const tok_t &token) { - return parser_keywords_is_subcommand(token.get_source(cmdline)); + return parser_keywords_is_subcommand(*token.get_source(cmdline)); }), tokens.end()); } @@ -1552,7 +1553,7 @@ void completer_t::perform_for_commandline(wcstring cmdline) { wcstring_list_t var_assignments; for (const tok_t &tok : tokens) { if (tok.location_in_or_at_end_of_source_range(cursor_pos)) break; - wcstring tok_src = tok.get_source(cmdline); + wcstring tok_src = *tok.get_source(cmdline); if (!variable_assignment_equals_pos(tok_src)) break; var_assignments.push_back(std::move(tok_src)); } @@ -1576,26 +1577,27 @@ void completer_t::perform_for_commandline(wcstring cmdline) { effective_cmdline = &effective_cmdline_buf; } - if (tokens.back().type == token_type_t::comment) { + if (tokens.back().type_ == token_type_t::comment) { return; } - tokens.erase(std::remove_if(tokens.begin(), tokens.end(), - [](const tok_t &tok) { return tok.type == token_type_t::comment; }), - tokens.end()); + tokens.erase( + std::remove_if(tokens.begin(), tokens.end(), + [](const tok_t &tok) { return tok.type_ == token_type_t::comment; }), + tokens.end()); assert(!tokens.empty()); const tok_t &cmd_tok = tokens.front(); const tok_t &cur_tok = tokens.back(); // Since fish does not currently support redirect in command position, we return here. - if (cmd_tok.type != token_type_t::string) return; - if (cur_tok.type == token_type_t::error) return; + if (cmd_tok.type_ != token_type_t::string) return; + if (cur_tok.type_ == token_type_t::error) return; for (const auto &tok : tokens) { // If there was an error, it was in the last token. - assert(tok.type == token_type_t::string || tok.type == token_type_t::redirect); + assert(tok.type_ == token_type_t::string || tok.type_ == token_type_t::redirect); } // If we are completing a variable name or a tilde expansion user name, we do that and // return. No need for any other completions. - const wcstring current_token = cur_tok.get_source(cmdline); + const wcstring current_token = *cur_tok.get_source(cmdline); if (cur_tok.location_in_or_at_end_of_source_range(cursor_pos)) { if (try_complete_variable(current_token) || try_complete_user(current_token)) { return; @@ -1614,11 +1616,11 @@ void completer_t::perform_for_commandline(wcstring cmdline) { return; } // See whether we are in an argument, in a redirection or in the whitespace in between. - bool in_redirection = cur_tok.type == token_type_t::redirect; + bool in_redirection = cur_tok.type_ == token_type_t::redirect; bool had_ddash = false; wcstring current_argument, previous_argument; - if (cur_tok.type == token_type_t::string && + if (cur_tok.type_ == token_type_t::string && cur_tok.location_in_or_at_end_of_source_range(position_in_statement)) { // If the cursor is in whitespace, then the "current" argument is empty and the // previous argument is the matching one. 
But if the cursor was in or at the end
@@ -1632,15 +1634,15 @@ void completer_t::perform_for_commandline(wcstring cmdline) {
         current_argument = current_token;
         if (tokens.size() >= 2) {
             tok_t prev_tok = tokens.at(tokens.size() - 2);
-            if (prev_tok.type == token_type_t::string)
-                previous_argument = prev_tok.get_source(cmdline);
-            in_redirection = prev_tok.type == token_type_t::redirect;
+            if (prev_tok.type_ == token_type_t::string)
+                previous_argument = *prev_tok.get_source(cmdline);
+            in_redirection = prev_tok.type_ == token_type_t::redirect;
         }
     }
 
     // Check to see if we have a preceding double-dash.
     for (size_t i = 0; i < tokens.size() - 1; i++) {
-        if (tokens.at(i).get_source(cmdline) == L"--") {
+        if (*tokens.at(i).get_source(cmdline) == L"--") {
             had_ddash = true;
             break;
         }
@@ -1658,7 +1660,7 @@ void completer_t::perform_for_commandline(wcstring cmdline) {
         source_offset_t bias = cmdline.size() - effective_cmdline->size();
         source_range_t command_range = {cmd_tok.offset - bias, cmd_tok.length};
 
-        wcstring exp_command = cmd_tok.get_source(cmdline);
+        wcstring exp_command = *cmd_tok.get_source(cmdline);
         bool unescaped =
             expand_command_token(ctx, exp_command) &&
             unescape_string(previous_argument, &arg_data.previous_argument, UNESCAPE_DEFAULT) &&
diff --git a/src/fish_indent.cpp b/src/fish_indent.cpp
index a39f1aae6..a146efbf0 100644
--- a/src/fish_indent.cpp
+++ b/src/fish_indent.cpp
@@ -420,9 +420,9 @@ struct pretty_printer_t {
         // always emit one.
         bool needs_nl = false;
 
-        tokenizer_t tokenizer(gap_text.c_str(), TOK_SHOW_COMMENTS | TOK_SHOW_BLANK_LINES);
-        while (maybe_t<tok_t> tok = tokenizer.next()) {
-            wcstring tok_text = tokenizer.text_of(*tok);
+        auto tokenizer = new_tokenizer(gap_text.c_str(), TOK_SHOW_COMMENTS | TOK_SHOW_BLANK_LINES);
+        while (auto tok = tokenizer->next()) {
+            wcstring tok_text = *tokenizer->text_of(*tok);
             if (needs_nl) {
                 emit_newline();
@@ -434,11 +434,11 @@ struct pretty_printer_t {
                 if (tok_text == L"\n") continue;
             }
 
-            if (tok->type == token_type_t::comment) {
+            if (tok->type_ == token_type_t::comment) {
                 emit_space_or_indent();
                 output.append(tok_text);
                 needs_nl = true;
-            } else if (tok->type == token_type_t::end) {
+            } else if (tok->type_ == token_type_t::end) {
                 // This may be either a newline or semicolon.
                 // Semicolons found here are not part of the ast and can simply be removed.
                 // Newlines are preserved unless mask_newline is set.
@@ -449,7 +449,7 @@ struct pretty_printer_t {
                 fprintf(stderr,
                         "Gap text should only have comments and newlines - instead found token "
                         "type %d with text: %ls\n",
-                        (int)tok->type, tok_text.c_str());
+                        (int)tok->type_, tok_text.c_str());
                 DIE("Gap text should only have comments and newlines");
             }
         }
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 3257ffced..034a50b29 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -640,25 +640,25 @@ static void test_tokenizer() {
     say(L"Testing tokenizer");
     {
         const wchar_t *str = L"alpha beta";
-        tokenizer_t t(str, 0);
-        maybe_t<tok_t> token{};
+        auto t = new_tokenizer(str, 0);
+        std::unique_ptr<tok_t> token{};
 
-        token = t.next();  // alpha
-        do_test(token.has_value());
-        do_test(token->type == token_type_t::string);
+        token = t->next();  // alpha
+        do_test(token);
+        do_test(token->type_ == token_type_t::string);
         do_test(token->offset == 0);
         do_test(token->length == 5);
-        do_test(t.text_of(*token) == L"alpha");
+        do_test(*t->text_of(*token) == L"alpha");
 
-        token = t.next();  // beta
-        do_test(token.has_value());
-        do_test(token->type == token_type_t::string);
+        token = t->next();  // beta
+        do_test(token);
+        do_test(token->type_ == token_type_t::string);
         do_test(token->offset == 6);
         do_test(token->length == 4);
-        do_test(t.text_of(*token) == L"beta");
+        do_test(*t->text_of(*token) == L"beta");
 
-        token = t.next();
-        do_test(!token.has_value());
+        token = t->next();
+        do_test(!token);
     }
 
     const wchar_t *str =
@@ -678,21 +678,21 @@ static void test_tokenizer() {
     say(L"Test correct tokenization");
 
     {
-        tokenizer_t t(str, 0);
+        auto t = new_tokenizer(str, 0);
         size_t i = 0;
-        while (auto token = t.next()) {
+        while (auto token = t->next()) {
             if (i >= sizeof types / sizeof *types) {
                 err(L"Too many tokens returned from tokenizer");
-                std::fwprintf(stdout, L"Got excess token type %ld\n", (long)token->type);
+                std::fwprintf(stdout, L"Got excess token type %ld\n", (long)token->type_);
                 break;
             }
-            if (types[i] != token->type) {
+            if (types[i] != token->type_) {
                 err(L"Tokenization error:");
                 std::fwprintf(
                     stdout,
                     L"Token number %zu of string \n'%ls'\n, expected type %ld, got token type "
                     L"%ld\n",
-                    i + 1, str, (long)types[i], (long)token->type);
+                    i + 1, str, (long)types[i], (long)token->type_);
             }
             i++;
         }
@@ -703,50 +703,50 @@ static void test_tokenizer() {
 
     // Test some errors.
     {
-        tokenizer_t t(L"abc\\", 0);
-        auto token = t.next();
-        do_test(token.has_value());
-        do_test(token->type == token_type_t::error);
+        auto t = new_tokenizer(L"abc\\", 0);
+        auto token = t->next();
+        do_test(token);
+        do_test(token->type_ == token_type_t::error);
         do_test(token->error == tokenizer_error_t::unterminated_escape);
         do_test(token->error_offset_within_token == 3);
     }
 
     {
-        tokenizer_t t(L"abc )defg(hij", 0);
-        auto token = t.next();
-        do_test(token.has_value());
-        token = t.next();
-        do_test(token.has_value());
-        do_test(token->type == token_type_t::error);
+        auto t = new_tokenizer(L"abc )defg(hij", 0);
+        auto token = t->next();
+        do_test(token);
+        token = t->next();
+        do_test(token);
+        do_test(token->type_ == token_type_t::error);
         do_test(token->error == tokenizer_error_t::closing_unopened_subshell);
         do_test(token->offset == 4);
         do_test(token->error_offset_within_token == 0);
     }
 
     {
-        tokenizer_t t(L"abc defg(hij (klm)", 0);
-        auto token = t.next();
-        do_test(token.has_value());
-        token = t.next();
-        do_test(token.has_value());
-        do_test(token->type == token_type_t::error);
+        auto t = new_tokenizer(L"abc defg(hij (klm)", 0);
+        auto token = t->next();
+        do_test(token);
+        token = t->next();
+        do_test(token);
+        do_test(token->type_ == token_type_t::error);
         do_test(token->error == tokenizer_error_t::unterminated_subshell);
         do_test(token->error_offset_within_token == 4);
     }
 
     {
-        tokenizer_t t(L"abc defg[hij (klm)", 0);
-        auto token = t.next();
-        do_test(token.has_value());
-        token = t.next();
-        do_test(token.has_value());
-        do_test(token->type == token_type_t::error);
+        auto t = new_tokenizer(L"abc defg[hij (klm)", 0);
+        auto token = t->next();
+        do_test(token);
+        token = t->next();
+        do_test(token);
+        do_test(token->type_ == token_type_t::error);
         do_test(token->error == tokenizer_error_t::unterminated_slice);
         do_test(token->error_offset_within_token == 4);
     }
 
     // Test some redirection parsing.
-    auto pipe_or_redir = [](const wchar_t *s) { return pipe_or_redir_t::from_string(s); };
+    auto pipe_or_redir = [](const wchar_t *s) { return pipe_or_redir_from_string(s); };
     do_test(pipe_or_redir(L"|")->is_pipe);
     do_test(pipe_or_redir(L"0>|")->is_pipe);
     do_test(pipe_or_redir(L"0>|")->fd == 0);
@@ -770,7 +770,7 @@ static void test_tokenizer() {
     do_test(pipe_or_redir(L"&>?")->stderr_merge);
 
     auto get_redir_mode = [](const wchar_t *s) -> maybe_t<redirection_mode_t> {
-        if (auto redir = pipe_or_redir_t::from_string(s)) {
+        if (auto redir = pipe_or_redir_from_string(s)) {
             return redir->mode;
         }
         return none();
@@ -1520,6 +1520,12 @@ static void test_indents() {
         0, "\nend"  //
     );
+    tests.clear();
+    add_test(&tests,  //
+             0, "echo 'continuation line' \\",  //
+             1, "\ncont",  //
+             0, "\n"  //
+    );
     int test_idx = 0;
     for (const indent_test_t &test : tests) {
         // Construct the input text and expected indents.
@@ -2740,11 +2746,11 @@ static void test_1_word_motion(word_motion_t motion, move_word_style_t style,
         }
         stops.erase(idx);
     }
 
-    move_word_state_machine_t sm(style);
+    auto sm = new_move_word_state_machine(style);
     while (idx != end) {
         size_t char_idx = (motion == word_motion_left ?
idx - 1 : idx); wchar_t wc = command.at(char_idx); - bool will_stop = !sm.consume_char(wc); + bool will_stop = !sm->consume_char(wc); // std::fwprintf(stdout, L"idx %lu, looking at %lu (%c): %d\n", idx, char_idx, (char)wc, // will_stop); bool expected_stop = (stops.count(idx) > 0); @@ -2765,7 +2771,7 @@ static void test_1_word_motion(word_motion_t motion, move_word_style_t style, stops.erase(idx); } if (will_stop) { - sm.reset(); + sm->reset(); } else { idx += (motion == word_motion_left ? -1 : 1); } @@ -2775,36 +2781,51 @@ static void test_1_word_motion(word_motion_t motion, move_word_style_t style, /// Test word motion (forward-word, etc.). Carets represent cursor stops. static void test_word_motion() { say(L"Testing word motion"); - test_1_word_motion(word_motion_left, move_word_style_punctuation, L"^echo ^hello_^world.^txt^"); - test_1_word_motion(word_motion_right, move_word_style_punctuation, + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_punctuation, + L"^echo ^hello_^world.^txt^"); + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_punctuation, L"^echo^ hello^_world^.txt^"); - test_1_word_motion(word_motion_left, move_word_style_punctuation, + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_punctuation, L"echo ^foo_^foo_^foo/^/^/^/^/^ ^"); - test_1_word_motion(word_motion_right, move_word_style_punctuation, + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_punctuation, L"^echo^ foo^_foo^_foo^/^/^/^/^/ ^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, L"^/^foo/^bar/^baz/^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, L"^echo ^--foo ^--bar^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, + L"^/^foo/^bar/^baz/^"); + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, + L"^echo ^--foo ^--bar^"); + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, L"^echo ^hi ^> ^/^dev/^null^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, L"^echo ^/^foo/^bar{^aaa,^bbb,^ccc}^bak/^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, L"^echo ^bak ^///^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, L"^aaa ^@ ^@^aaa^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, L"^aaa ^a ^@^aaa^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, L"^aaa ^@@@ ^@@^aa^"); - test_1_word_motion(word_motion_left, move_word_style_path_components, L"^aa^@@ ^aa@@^a^"); + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, + L"^echo ^bak ^///^"); + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, + L"^aaa ^@ ^@^aaa^"); + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, + L"^aaa ^a ^@^aaa^"); + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, + L"^aaa ^@@@ ^@@^aa^"); + test_1_word_motion(word_motion_left, move_word_style_t::move_word_style_path_components, + L"^aa^@@ ^aa@@^a^"); - test_1_word_motion(word_motion_right, move_word_style_punctuation, L"^a^ bcd^"); - test_1_word_motion(word_motion_right, 
move_word_style_punctuation, L"a^b^ cde^"); - test_1_word_motion(word_motion_right, move_word_style_punctuation, L"^ab^ cde^"); - test_1_word_motion(word_motion_right, move_word_style_punctuation, L"^ab^&cd^ ^& ^e^ f^&"); + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_punctuation, + L"^a^ bcd^"); + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_punctuation, + L"a^b^ cde^"); + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_punctuation, + L"^ab^ cde^"); + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_punctuation, + L"^ab^&cd^ ^& ^e^ f^&"); - test_1_word_motion(word_motion_right, move_word_style_whitespace, L"^^a-b-c^ d-e-f"); - test_1_word_motion(word_motion_right, move_word_style_whitespace, L"^a-b-c^\n d-e-f^ "); - test_1_word_motion(word_motion_right, move_word_style_whitespace, L"^a-b-c^\n\nd-e-f^ "); + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_whitespace, + L"^^a-b-c^ d-e-f"); + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_whitespace, + L"^a-b-c^\n d-e-f^ "); + test_1_word_motion(word_motion_right, move_word_style_t::move_word_style_whitespace, + L"^a-b-c^\n\nd-e-f^ "); } /// Test is_potential_path. @@ -5694,6 +5715,14 @@ static void test_highlighting() { {L"\\U110000", highlight_role_t::error}, }); #endif + + highlight_tests.clear(); + highlight_tests.push_back({ + {L"echo", highlight_role_t::command}, + {L"stuff", highlight_role_t::param}, + {L"# comment", highlight_role_t::comment}, + }); + bool saved_flag = feature_test(feature_flag_t::ampersand_nobg_in_token); mutable_fish_features()->set(feature_flag_t::ampersand_nobg_in_token, true); for (const highlight_component_list_t &components : highlight_tests) { diff --git a/src/highlight.cpp b/src/highlight.cpp index f424c3057..bfa053d62 100644 --- a/src/highlight.cpp +++ b/src/highlight.cpp @@ -1158,12 +1158,10 @@ static bool contains_pending_variable(const std::vector &pending_varia } void highlighter_t::visit(const ast::redirection_t &redir) { - maybe_t oper = - pipe_or_redir_t::from_string(redir.oper.source(this->buff)); // like 2> - wcstring target = redir.target.source(this->buff); // like &1 or file path + auto oper = pipe_or_redir_from_string(redir.oper.source(this->buff).c_str()); // like 2> + wcstring target = redir.target.source(this->buff); // like &1 or file path - assert(oper.has_value() && - "Should have successfully parsed a pipe_or_redir_t since it was in our ast"); + assert(oper && "Should have successfully parsed a pipe_or_redir_t since it was in our ast"); // Color the > part. // It may have parsed successfully yet still be invalid (e.g. 9999999999999>&1) diff --git a/src/parse_execution.cpp b/src/parse_execution.cpp index 6d0ee614f..b0a1e74a4 100644 --- a/src/parse_execution.cpp +++ b/src/parse_execution.cpp @@ -1005,7 +1005,7 @@ end_execution_reason_t parse_execution_context_t::determine_redirections( if (!arg_or_redir.is_redirection()) continue; const ast::redirection_t &redir_node = arg_or_redir.redirection(); - maybe_t oper = pipe_or_redir_t::from_string(get_source(redir_node.oper)); + auto oper = pipe_or_redir_from_string(get_source(redir_node.oper).c_str()); if (!oper || !oper->is_valid()) { // TODO: figure out if this can ever happen. If so, improve this error message. 
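The same null-as-none convention drives the pipe_or_redir_from_string call sites here and in highlight.cpp above: the old maybe_t<pipe_or_redir_t> checks (oper.has_value()) become plain pointer tests (oper). A sketch of the Rust side, where PipeOrRedir::try_parse is a placeholder name for the ported parser and the wcharz_t-to-WString conversion is assumed:

    // Sketch: a failed parse becomes a null UniquePtr, which C++ callers
    // test with `if (!oper ...)`. try_parse is a hypothetical name.
    fn pipe_or_redir_from_string(buff: wcharz_t) -> UniquePtr<PipeOrRedir> {
        let buff: WString = buff.into(); // conversion assumed
        match PipeOrRedir::try_parse(&buff) {
            Some(parsed) => UniquePtr::new(parsed),
            None => UniquePtr::null(),
        }
    }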
return report_error(STATUS_INVALID_ARGS, redir_node, _(L"Invalid redirection: %ls"), @@ -1202,8 +1202,8 @@ end_execution_reason_t parse_execution_context_t::populate_job_from_job_node( break; } // Handle the pipe, whose fd may not be the obvious stdout. - auto parsed_pipe = pipe_or_redir_t::from_string(get_source(jc.pipe)); - assert(parsed_pipe.has_value() && parsed_pipe->is_pipe && "Failed to parse valid pipe"); + auto parsed_pipe = pipe_or_redir_from_string(get_source(jc.pipe).c_str()); + assert(parsed_pipe && parsed_pipe->is_pipe && "Failed to parse valid pipe"); if (!parsed_pipe->is_valid()) { result = report_error(STATUS_INVALID_ARGS, jc.pipe, ILLEGAL_FD_ERR_MSG, get_source(jc.pipe).c_str()); diff --git a/src/parse_util.cpp b/src/parse_util.cpp index 6573b0a63..404819742 100644 --- a/src/parse_util.cpp +++ b/src/parse_util.cpp @@ -178,7 +178,7 @@ static int parse_util_locate_cmdsub(const wchar_t *in, const wchar_t **begin, co } } } - is_token_begin = is_token_delimiter(pos[0], pos[1]); + is_token_begin = is_token_delimiter(pos[0], std::make_shared(pos[1])); } else { escaped = false; is_token_begin = false; @@ -367,12 +367,12 @@ static void job_or_process_extent(bool process, const wchar_t *buff, size_t curs if (b) *b = end; const wcstring buffcpy(begin, end); - tokenizer_t tok(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS); - maybe_t token{}; - while ((token = tok.next()) && !finished) { + auto tok = new_tokenizer(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS); + std::unique_ptr token{}; + while ((token = tok->next()) && !finished) { size_t tok_begin = token->offset; - switch (token->type) { + switch (token->type_) { case token_type_t::pipe: { if (!process) { break; @@ -440,13 +440,13 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar const wcstring buffcpy = wcstring(cmdsubst_begin, cmdsubst_end - cmdsubst_begin); - tokenizer_t tok(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED); - while (maybe_t token = tok.next()) { + auto tok = new_tokenizer(buffcpy.c_str(), TOK_ACCEPT_UNFINISHED); + while (std::unique_ptr token = tok->next()) { size_t tok_begin = token->offset; size_t tok_end = tok_begin; // Calculate end of token. - if (token->type == token_type_t::string) { + if (token->type_ == token_type_t::string) { tok_end += token->length; } @@ -459,14 +459,14 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar // If cursor is inside the token, this is the token we are looking for. If so, set a and b // and break. - if (token->type == token_type_t::string && tok_end >= offset_within_cmdsubst) { + if (token->type_ == token_type_t::string && tok_end >= offset_within_cmdsubst) { a = cmdsubst_begin + token->offset; b = a + token->length; break; } // Remember previous string token. 
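The std::make_shared<wchar_t>(pos[1]) above looks odd until you see the bridge signature: is_token_delimiter took a maybe_t<wchar_t> lookahead in C++, and since maybe_t cannot cross cxx, the optional next character now travels as a nullable SharedPtr<wchar_t>. A sketch of the Rust-side unwrap; both the inner is_token_delimiter taking Option<char> and SharedPtr::as_ref are assumptions:

    // Sketch: SharedPtr<wchar_t> stands in for maybe_t<wchar_t>;
    // a null pointer means "no lookahead character".
    fn is_token_delimiter_ffi(c: wchar_t, next: SharedPtr<wchar_t>) -> bool {
        let next: Option<char> = next.as_ref().and_then(|w| char::from_u32(*w as u32));
        is_token_delimiter(char::from_u32(c as u32).unwrap_or('\0'), next)
    }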
- if (token->type == token_type_t::string) { + if (token->type_ == token_type_t::string) { pa = cmdsubst_begin + token->offset; pb = pa + token->length; } @@ -541,11 +541,11 @@ static wchar_t get_quote(const wcstring &cmd_str, size_t len) { } wchar_t parse_util_get_quote_type(const wcstring &cmd, size_t pos) { - tokenizer_t tok(cmd.c_str(), TOK_ACCEPT_UNFINISHED); - while (auto token = tok.next()) { - if (token->type == token_type_t::string && + auto tok = new_tokenizer(cmd.c_str(), TOK_ACCEPT_UNFINISHED); + while (auto token = tok->next()) { + if (token->type_ == token_type_t::string && token->location_in_or_at_end_of_source_range(pos)) { - return get_quote(tok.text_of(*token), pos - token->offset); + return get_quote(*tok->text_of(*token), pos - token->offset); } } return L'\0'; diff --git a/src/parse_util.h b/src/parse_util.h index bd318566b..54f492378 100644 --- a/src/parse_util.h +++ b/src/parse_util.h @@ -14,7 +14,8 @@ namespace ast { struct argument_t; class ast_t; } // namespace ast -struct tok_t; +struct Tok; +using tok_t = Tok; /// Handles slices: the square brackets in an expression like $foo[5..4] /// \return the length of the slice starting at \p in, or 0 if there is no slice, or -1 on error. diff --git a/src/reader.cpp b/src/reader.cpp index 4b7dc81a1..1d2a14bfc 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -432,12 +432,12 @@ class reader_history_search_t { assert(offset != wcstring::npos && "Should have found a match in the search result"); add_if_new({std::move(text), offset}); } else if (mode_ == token) { - tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED); + auto tok = new_tokenizer(text.c_str(), TOK_ACCEPT_UNFINISHED); std::vector local_tokens; - while (auto token = tok.next()) { - if (token->type != token_type_t::string) continue; - wcstring text = tok.text_of(*token); + while (auto token = tok->next()) { + if (token->type_ != token_type_t::string) continue; + wcstring text = *tok->text_of(*token); size_t offset = find(text, needle); if (offset != wcstring::npos) { local_tokens.push_back({std::move(text), offset}); @@ -865,7 +865,7 @@ class reader_data_t : public std::enable_shared_from_this { /// try expanding it as a wildcard, populating \p result with the expanded string. expand_result_t::result_t try_expand_wildcard(wcstring wc, size_t pos, wcstring *result); - void move_word(editable_line_t *el, bool move_right, bool erase, enum move_word_style_t style, + void move_word(editable_line_t *el, bool move_right, bool erase, move_word_style_t style, bool newv); void run_input_command_scripts(const wcstring_list_t &cmds); @@ -898,8 +898,9 @@ class reader_data_t : public std::enable_shared_from_this { bool can_autosuggest() const; void autosuggest_completed(autosuggestion_t result); void update_autosuggestion(); - void accept_autosuggestion(bool full, bool single = false, - move_word_style_t style = move_word_style_punctuation); + void accept_autosuggestion( + bool full, bool single = false, + move_word_style_t style = move_word_style_t::move_word_style_punctuation); void super_highlight_me_plenty(); /// Finish up any outstanding syntax highlighting, before execution. @@ -2115,11 +2116,11 @@ void reader_data_t::accept_autosuggestion(bool full, bool single, move_word_styl autosuggestion.text.substr(command_line.size(), 1)); } else { // Accept characters according to the specified style. 
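text_of similarly moves from returning wcstring by value to returning UniquePtr<CxxWString>, which is why the call sites above gain a leading `*`. The shim is little more than a slice-and-copy; a sketch, assuming the tokenizer stores the input as `self.start: WString` and that WCharToFFI's to_ffi() produces the owned CxxWString:

    // Sketch: slice the original input by the token's offset/length and
    // copy it into a CxxWString that C++ owns.
    impl Tokenizer {
        fn text_of_ffi(&self, tok: &Tok) -> UniquePtr<CxxWString> {
            let start = tok.offset as usize;
            let end = start + tok.length as usize;
            self.start[start..end].to_ffi()
        }
    }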
- move_word_state_machine_t state(style); + auto state = new_move_word_state_machine(style); size_t want; for (want = command_line.size(); want < autosuggestion.text.size(); want++) { wchar_t wc = autosuggestion.text.at(want); - if (!state.consume_char(wc)) break; + if (!state->consume_char(wc)) break; } size_t have = command_line.size(); replace_substring(&command_line, command_line.size(), 0, @@ -2648,13 +2649,13 @@ enum move_word_dir_t { MOVE_DIR_LEFT, MOVE_DIR_RIGHT }; /// \param erase Whether to erase the characters along the way or only move past them. /// \param newv if the new kill item should be appended to the previous kill item or not. void reader_data_t::move_word(editable_line_t *el, bool move_right, bool erase, - enum move_word_style_t style, bool newv) { + move_word_style_t style, bool newv) { // Return if we are already at the edge. const size_t boundary = move_right ? el->size() : 0; if (el->position() == boundary) return; // When moving left, a value of 1 means the character at index 0. - move_word_state_machine_t state(style); + auto state = new_move_word_state_machine(style); const wchar_t *const command_line = el->text().c_str(); const size_t start_buff_pos = el->position(); @@ -2662,7 +2663,7 @@ void reader_data_t::move_word(editable_line_t *el, bool move_right, bool erase, while (buff_pos != boundary) { size_t idx = (move_right ? buff_pos : buff_pos - 1); wchar_t c = command_line[idx]; - if (!state.consume_char(c)) break; + if (!state->consume_char(c)) break; buff_pos = (move_right ? buff_pos + 1 : buff_pos - 1); } @@ -2710,7 +2711,7 @@ void reader_data_t::set_buffer_maintaining_pager(const wcstring &b, size_t pos, /// Run the specified command with the correct terminal modes, and while taking care to perform job /// notification, set the title, etc. static eval_res_t reader_run_command(parser_t &parser, const wcstring &cmd) { - wcstring ft = tok_command(cmd); + wcstring ft = *tok_command(cmd); // Provide values for `status current-command` and `status current-commandline` if (!ft.empty()) { @@ -3303,10 +3304,10 @@ static wchar_t unescaped_quote(const wcstring &str, size_t pos) { /// Returns true if the last token is a comment. static bool text_ends_in_comment(const wcstring &text) { - tokenizer_t tok(text.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS); + auto tok = new_tokenizer(text.c_str(), TOK_ACCEPT_UNFINISHED | TOK_SHOW_COMMENTS); bool is_comment = false; - while (auto token = tok.next()) { - is_comment = token->type == token_type_t::comment; + while (auto token = tok->next()) { + is_comment = token->type_ == token_type_t::comment; } return is_comment; } @@ -3799,9 +3800,10 @@ void reader_data_t::handle_readline_command(readline_cmd_t c, readline_loop_stat case rl::backward_kill_path_component: case rl::backward_kill_bigword: { move_word_style_t style = - (c == rl::backward_kill_bigword ? move_word_style_whitespace - : c == rl::backward_kill_path_component ? move_word_style_path_components - : move_word_style_punctuation); + (c == rl::backward_kill_bigword ? move_word_style_t::move_word_style_whitespace + : c == rl::backward_kill_path_component + ? move_word_style_t::move_word_style_path_components + : move_word_style_t::move_word_style_punctuation); // Is this the same killring item as the last kill? 
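move_word_state_machine_t construction now goes through the bridge factory new_move_word_state_machine; because the shared struct carries its entire state (state: u8 plus the style), the factory mirrors the C++ constructor one-for-one. A sketch:

    // Sketch: state 0 is the initial state for all three styles,
    // matching the C++ constructor `state(0), style(syl)`.
    fn new_move_word_state_machine(syl: MoveWordStyle) -> Box<MoveWordStateMachine> {
        Box::new(MoveWordStateMachine { state: 0, style: syl })
    }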
bool newv = (rls.last_cmd != rl::backward_kill_word && rls.last_cmd != rl::backward_kill_path_component && @@ -3813,8 +3815,8 @@ void reader_data_t::handle_readline_command(readline_cmd_t c, readline_loop_stat case rl::kill_bigword: { // The "bigword" functions differ only in that they move to the next whitespace, not // punctuation. - auto move_style = - (c == rl::kill_word) ? move_word_style_punctuation : move_word_style_whitespace; + auto move_style = (c == rl::kill_word) ? move_word_style_t::move_word_style_punctuation + : move_word_style_t::move_word_style_whitespace; move_word(active_edit_line(), MOVE_DIR_RIGHT, true /* erase */, move_style, rls.last_cmd != c /* same kill item if same movement */); break; @@ -3831,8 +3833,9 @@ void reader_data_t::handle_readline_command(readline_cmd_t c, readline_loop_stat break; } - auto move_style = (c != rl::backward_bigword) ? move_word_style_punctuation - : move_word_style_whitespace; + auto move_style = (c != rl::backward_bigword) + ? move_word_style_t::move_word_style_punctuation + : move_word_style_t::move_word_style_whitespace; move_word(active_edit_line(), MOVE_DIR_LEFT, false /* do not erase */, move_style, false); break; @@ -3849,8 +3852,9 @@ void reader_data_t::handle_readline_command(readline_cmd_t c, readline_loop_stat break; } - auto move_style = (c != rl::forward_bigword) ? move_word_style_punctuation - : move_word_style_whitespace; + auto move_style = (c != rl::forward_bigword) + ? move_word_style_t::move_word_style_punctuation + : move_word_style_t::move_word_style_whitespace; editable_line_t *el = active_edit_line(); if (el->position() < el->size()) { move_word(el, MOVE_DIR_RIGHT, false /* do not erase */, move_style, false); @@ -4072,7 +4076,8 @@ void reader_data_t::handle_readline_command(readline_cmd_t c, readline_loop_stat // We apply the operation from the current location to the end of the word. size_t pos = el->position(); size_t init_pos = pos; - move_word(el, MOVE_DIR_RIGHT, false, move_word_style_punctuation, false); + move_word(el, MOVE_DIR_RIGHT, false, move_word_style_t::move_word_style_punctuation, + false); wcstring replacement; for (; pos < el->position(); pos++) { wchar_t chr = el->text().at(pos); diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp deleted file mode 100644 index 568407897..000000000 --- a/src/tokenizer.cpp +++ /dev/null @@ -1,887 +0,0 @@ -// A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be -// extended to support marks, tokenizing multiple strings and disposing of unused string segments. 
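The first thing the deleted file defines below is the error-message table; its replacement is the bridged tokenizer_get_error_message declared at the top of tokenizer.rs. A sketch of its likely shape, assuming wgettext accepts a wide string literal and returns a &wstr; cxx shared enums are non-exhaustive in Rust, so a wildcard arm replaces the C++ assert(0):

    // Sketch: same messages as the C++ switch below, returned as an owned
    // CxxWString so C++ can keep calling .c_str() on the result.
    fn tokenizer_get_error_message(err: TokenizerError) -> UniquePtr<CxxWString> {
        let s: &wstr = match err {
            TokenizerError::none => L!(""),
            TokenizerError::unterminated_quote => {
                wgettext(L!("Unexpected end of string, quotes are not balanced"))
            }
            TokenizerError::unterminated_subshell => {
                wgettext(L!("Unexpected end of string, expecting ')'"))
            }
            // ...the remaining variants transcribe the C++ switch verbatim...
            _ => panic!("Unexpected tokenizer error"),
        };
        s.to_ffi()
    }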
-#include "config.h" // IWYU pragma: keep - -#include "tokenizer.h" - -#include -#include -#include -#include - -#include -#include -#include - -#include "common.h" -#include "fallback.h" // IWYU pragma: keep -#include "future_feature_flags.h" -#include "wutil.h" // IWYU pragma: keep - -// _(s) is already wgettext(s).c_str(), so let's not convert back to wcstring -const wchar_t *tokenizer_get_error_message(tokenizer_error_t err) { - switch (err) { - case tokenizer_error_t::none: - return L""; - case tokenizer_error_t::unterminated_quote: - return _(L"Unexpected end of string, quotes are not balanced"); - case tokenizer_error_t::unterminated_subshell: - return _(L"Unexpected end of string, expecting ')'"); - case tokenizer_error_t::unterminated_slice: - return _(L"Unexpected end of string, square brackets do not match"); - case tokenizer_error_t::unterminated_escape: - return _(L"Unexpected end of string, incomplete escape sequence"); - case tokenizer_error_t::invalid_redirect: - return _(L"Invalid input/output redirection"); - case tokenizer_error_t::invalid_pipe: - return _(L"Cannot use stdin (fd 0) as pipe output"); - case tokenizer_error_t::invalid_pipe_ampersand: - return _(L"|& is not valid. In fish, use &| to pipe both stdout and stderr."); - case tokenizer_error_t::closing_unopened_subshell: - return _(L"Unexpected ')' for unopened parenthesis"); - case tokenizer_error_t::illegal_slice: - return _(L"Unexpected '[' at this location"); - case tokenizer_error_t::closing_unopened_brace: - return _(L"Unexpected '}' for unopened brace expansion"); - case tokenizer_error_t::unterminated_brace: - return _(L"Unexpected end of string, incomplete parameter expansion"); - case tokenizer_error_t::expected_pclose_found_bclose: - return _(L"Unexpected '}' found, expecting ')'"); - case tokenizer_error_t::expected_bclose_found_pclose: - return _(L"Unexpected ')' found, expecting '}'"); - } - assert(0 && "Unexpected tokenizer error"); - return nullptr; -} - -/// Return an error token and mark that we no longer have a next token. -tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token_start, - const wchar_t *error_loc, maybe_t token_length, - size_t error_len) { - assert(error_type != tokenizer_error_t::none && "tokenizer_error_t::none passed to call_error"); - assert(error_loc >= token_start && "Invalid error location"); - assert(this->token_cursor >= token_start && "Invalid buff location"); - - // If continue_after_error is set and we have a real token length, then skip past it. - // Otherwise give up. - if (token_length.has_value() && continue_after_error) { - assert(this->token_cursor < error_loc + *token_length && "Unable to continue past error"); - this->token_cursor = error_loc + *token_length; - } else { - this->has_next = false; - } - - tok_t result{token_type_t::error}; - result.error = error_type; - result.offset = token_start - this->start; - // If we are passed a token_length, then use it; otherwise infer it from the buffer. - result.length = token_length.has_value() ? 
*token_length : this->token_cursor - token_start; - result.error_offset_within_token = error_loc - token_start; - result.error_length = error_len; - return result; -} - -tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) - : token_cursor(start), start(start) { - assert(start != nullptr && "Invalid start"); - - this->accept_unfinished = static_cast(flags & TOK_ACCEPT_UNFINISHED); - this->show_comments = static_cast(flags & TOK_SHOW_COMMENTS); - this->show_blank_lines = static_cast(flags & TOK_SHOW_BLANK_LINES); - this->continue_after_error = static_cast(flags & TOK_CONTINUE_AFTER_ERROR); -} - -tok_t::tok_t(token_type_t type) : type(type) {} - -/// Tests if this character can be a part of a string. Hash (#) starts a comment if it's the first -/// character in a token; otherwise it is considered a string character. See issue #953. -static bool tok_is_string_character(wchar_t c, maybe_t next) { - switch (c) { - case L'\0': - case L' ': - case L'\n': - case L'|': - case L'\t': - case L';': - case L'\r': - case L'<': - case L'>': { - // Unconditional separators. - return false; - } - case L'&': { - if (!feature_test(feature_flag_t::ampersand_nobg_in_token)) return false; - bool next_is_string = next.has_value() && tok_is_string_character(*next, none()); - // Unlike in other shells, '&' is not special if followed by a string character. - return next_is_string; - } - default: { - return true; - } - } -} - -/// Quick test to catch the most common 'non-magical' characters, makes read_string slightly faster -/// by adding a fast path for the most common characters. This is obviously not a suitable -/// replacement for iswalpha. -static inline int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); } - -namespace tok_modes { -enum { - regular_text = 0, // regular text - subshell = 1 << 0, // inside of subshell parentheses - array_brackets = 1 << 1, // inside of array brackets - curly_braces = 1 << 2, - char_escape = 1 << 3, -}; -} // namespace tok_modes -using tok_mode_t = uint32_t; - -/// Read the next token as a string. -tok_t tokenizer_t::read_string() { - tok_mode_t mode{tok_modes::regular_text}; - std::vector paran_offsets; - std::vector brace_offsets; - std::vector expecting; - std::vector quoted_cmdsubs; - int slice_offset = 0; - const wchar_t *const buff_start = this->token_cursor; - bool is_token_begin = true; - - auto process_opening_quote = [&](wchar_t quote) -> const wchar_t * { - const wchar_t *end = quote_end(this->token_cursor, quote); - if (end) { - if (*end == L'$') quoted_cmdsubs.push_back(paran_offsets.size()); - this->token_cursor = end; - return nullptr; - } else { - const wchar_t *error_loc = this->token_cursor; - this->token_cursor += std::wcslen(this->token_cursor); - return error_loc; - } - }; - - while (true) { - wchar_t c = *this->token_cursor; -#if false - wcstring msg = L"Handling 0x%x (%lc)"; - tok_mode mode_begin = mode; -#endif - - if (c == L'\0') { - break; - } - - // Make sure this character isn't being escaped before anything else - if ((mode & tok_modes::char_escape) == tok_modes::char_escape) { - mode &= ~(tok_modes::char_escape); - // and do nothing more - } else if (myal(c)) { - // Early exit optimization in case the character is just a letter, - // which has no special meaning to the tokenizer, i.e. the same mode continues. - } - - // Now proceed with the evaluation of the token, first checking to see if the token - // has been explicitly ignored (escaped). 
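tok_is_string_character above is worth pausing on: it is the tokenizer's core character classifier and the only place the ampersand_nobg_in_token feature flag is consulted. Its Rust translation is nearly mechanical; feature_test and FeatureFlag are imported at the top of tokenizer.rs, and the variant name is assumed to mirror the C++ flag:

    // Sketch: '&' counts as a string character only when the flag is on
    // and the following character is itself a string character (#953).
    fn tok_is_string_character(c: char, next: Option<char>) -> bool {
        match c {
            '\0' | ' ' | '\n' | '|' | '\t' | ';' | '\r' | '<' | '>' => false,
            '&' => {
                feature_test(FeatureFlag::ampersand_nobg_in_token)
                    && next.map_or(false, |n| tok_is_string_character(n, None))
            }
            _ => true,
        }
    }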
- else if (c == L'\\') { - mode |= tok_modes::char_escape; - } else if (c == L'#' && is_token_begin) { - this->token_cursor = comment_end(this->token_cursor) - 1; - } else if (c == L'(') { - paran_offsets.push_back(this->token_cursor - this->start); - expecting.push_back(L')'); - mode |= tok_modes::subshell; - } else if (c == L'{') { - brace_offsets.push_back(this->token_cursor - this->start); - expecting.push_back(L'}'); - mode |= tok_modes::curly_braces; - } else if (c == L')') { - if (!expecting.empty() && expecting.back() == L'}') { - return this->call_error(tokenizer_error_t::expected_bclose_found_pclose, - this->token_cursor, this->token_cursor, 1, 1); - } - if (paran_offsets.empty()) { - return this->call_error(tokenizer_error_t::closing_unopened_subshell, - this->token_cursor, this->token_cursor, 1, 1); - } - paran_offsets.pop_back(); - if (paran_offsets.empty()) { - mode &= ~(tok_modes::subshell); - } - expecting.pop_back(); - // Check if the ) completed a quoted command substitution. - if (!quoted_cmdsubs.empty() && quoted_cmdsubs.back() == paran_offsets.size()) { - quoted_cmdsubs.pop_back(); - // The "$(" part of a quoted command substitution closes double quotes. To keep - // quotes balanced, act as if there was an invisible double quote after the ")". - if (const wchar_t *error_loc = process_opening_quote(L'"')) { - if (!this->accept_unfinished) { - return this->call_error(tokenizer_error_t::unterminated_quote, buff_start, - error_loc); - } - break; - } - } - } else if (c == L'}') { - if (!expecting.empty() && expecting.back() == L')') { - return this->call_error(tokenizer_error_t::expected_pclose_found_bclose, - this->token_cursor, this->token_cursor, 1, 1); - } - if (brace_offsets.empty()) { - return this->call_error(tokenizer_error_t::closing_unopened_brace, - this->token_cursor, - this->token_cursor + wcslen(this->token_cursor)); - } - brace_offsets.pop_back(); - if (brace_offsets.empty()) { - mode &= ~(tok_modes::curly_braces); - } - expecting.pop_back(); - } else if (c == L'[') { - if (this->token_cursor != buff_start) { - mode |= tok_modes::array_brackets; - slice_offset = this->token_cursor - this->start; - } else { - // This is actually allowed so the test operator `[` can be used as the head of a - // command - } - } - // Only exit bracket mode if we are in bracket mode. - // Reason: `]` can be a parameter, e.g. last parameter to `[` test alias. - // e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket - else if (c == L']' && ((mode & tok_modes::array_brackets) == tok_modes::array_brackets)) { - mode &= ~(tok_modes::array_brackets); - } else if (c == L'\'' || c == L'"') { - if (const wchar_t *error_loc = process_opening_quote(c)) { - if (!this->accept_unfinished) { - return this->call_error(tokenizer_error_t::unterminated_quote, buff_start, - error_loc, none(), 1); - } - break; - } - } else if (mode == tok_modes::regular_text && - !tok_is_string_character(c, this->token_cursor[1])) { - break; - } - -#if false - if (mode != mode_begin) { - msg.append(L": mode 0x%x -> 0x%x\n"); - } else { - msg.push_back(L'\n'); - } - FLOGF(error, msg.c_str(), c, c, int(mode_begin), int(mode)); -#endif - - is_token_begin = is_token_delimiter(this->token_cursor[0], this->token_cursor[1]); - this->token_cursor++; - } - - if (!this->accept_unfinished && (mode != tok_modes::regular_text)) { - // These are all "unterminated", so the only char we can mark as an error - // is the opener (the closing char could be anywhere!) 
- // - // (except for char_escape, which is one long by definition) - if (mode & tok_modes::char_escape) { - return this->call_error(tokenizer_error_t::unterminated_escape, buff_start, - this->token_cursor - 1, none(), 1); - } else if (mode & tok_modes::array_brackets) { - return this->call_error(tokenizer_error_t::unterminated_slice, buff_start, - this->start + slice_offset, none(), 1); - } else if (mode & tok_modes::subshell) { - assert(!paran_offsets.empty()); - size_t offset_of_open_paran = paran_offsets.back(); - - return this->call_error(tokenizer_error_t::unterminated_subshell, buff_start, - this->start + offset_of_open_paran, none(), 1); - } else if (mode & tok_modes::curly_braces) { - assert(!brace_offsets.empty()); - size_t offset_of_open_brace = brace_offsets.back(); - - return this->call_error(tokenizer_error_t::unterminated_brace, buff_start, - this->start + offset_of_open_brace, none(), 1); - } else { - DIE("Unknown non-regular-text mode"); - } - } - - tok_t result(token_type_t::string); - result.offset = buff_start - this->start; - result.length = this->token_cursor - buff_start; - return result; -} - -// Parse an fd from the non-empty string [start, end), all of which are digits. -// Return the fd, or -1 on overflow. -static int parse_fd(const wchar_t *start, const wchar_t *end) { - assert(start < end && "String cannot be empty"); - long long big_fd = 0; - for (const wchar_t *cursor = start; cursor < end; ++cursor) { - assert(L'0' <= *cursor && *cursor <= L'9' && "Not a digit"); - big_fd = big_fd * 10 + (*cursor - L'0'); - if (big_fd > INT_MAX) return -1; - } - assert(big_fd <= INT_MAX && "big_fd should be in range"); - return static_cast(big_fd); -} - -pipe_or_redir_t::pipe_or_redir_t() = default; - -maybe_t pipe_or_redir_t::from_string(const wchar_t *buff) { - pipe_or_redir_t result{}; - - /* Examples of supported syntaxes. - Note we are only responsible for parsing the redirection part, not 'cmd' or 'file'. - - cmd | cmd normal pipe - cmd &| cmd normal pipe plus stderr-merge - cmd >| cmd pipe with explicit fd - cmd 2>| cmd pipe with explicit fd - cmd < file stdin redirection - cmd > file redirection - cmd >> file appending redirection - cmd >? file noclobber redirection - cmd >>? file appending noclobber redirection - cmd 2> file file redirection with explicit fd - cmd >&2 fd redirection with no explicit src fd (stdout is used) - cmd 1>&2 fd redirection with an explicit src fd - cmd <&2 fd redirection with no explicit src fd (stdin is used) - cmd 3<&0 fd redirection with an explicit src fd - cmd &> file redirection with stderr merge - cmd ^ file caret (stderr) redirection, perhaps disabled via feature flags - cmd ^^ file caret (stderr) redirection, perhaps disabled via feature flags - */ - - const wchar_t *cursor = buff; - - // Extract a range of leading fd. - const wchar_t *fd_start = cursor; - while (iswdigit(*cursor)) cursor++; - const wchar_t *fd_end = cursor; - bool has_fd = (fd_end > fd_start); - - // Try consuming a given character. - // Return true if consumed. On success, advances cursor. - auto try_consume = [&cursor](wchar_t c) -> bool { - if (*cursor != c) return false; - cursor++; - return true; - }; - - // Like try_consume, but asserts on failure. 
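parse_fd in the deleted code above is an example of a function that ports without ceremony: accumulate into a wider integer and bail out at the first overflow. A sketch; the digits-only precondition remains the caller's contract, as in C++:

    // Sketch: returns the parsed fd, or -1 as soon as the running value
    // exceeds i32::MAX -- the early return keeps big_fd itself from
    // ever overflowing the i64.
    fn parse_fd(s: &wstr) -> i32 {
        assert!(!s.is_empty(), "String cannot be empty");
        let mut big_fd: i64 = 0;
        for c in s.chars() {
            let digit = c.to_digit(10).expect("Not a digit");
            big_fd = big_fd * 10 + i64::from(digit);
            if big_fd > i64::from(i32::MAX) {
                return -1;
            }
        }
        big_fd as i32
    }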
- auto consume = [&](wchar_t c) { - assert(*cursor == c && "Failed to consume char"); - cursor++; - }; - - switch (*cursor) { - case L'|': { - if (has_fd) { - // Like 123| - return none(); - } - consume(L'|'); - assert(*cursor != L'|' && - "|| passed as redirection, this should have been handled as 'or' by the caller"); - result.fd = STDOUT_FILENO; - result.is_pipe = true; - break; - } - case L'>': { - consume(L'>'); - if (try_consume(L'>')) result.mode = redirection_mode_t::append; - if (try_consume(L'|')) { - // Note we differ from bash here. - // Consider `echo foo 2>| bar` - // In fish, this is a *pipe*. Run bar as a command and attach foo's stderr to bar's - // stdin, while leaving stdout as tty. - // In bash, this is a *redirection* to bar as a file. It is like > but ignores - // noclobber. - result.is_pipe = true; - result.fd = has_fd ? parse_fd(fd_start, fd_end) // like 2>| - : STDOUT_FILENO; // like >| - } else if (try_consume(L'&')) { - // This is a redirection to an fd. - // Note that we allow ">>&", but it's still just writing to the fd - "appending" to - // it doesn't make sense. - result.mode = redirection_mode_t::fd; - result.fd = has_fd ? parse_fd(fd_start, fd_end) // like 1>&2 - : STDOUT_FILENO; // like >&2 - } else { - // This is a redirection to a file. - result.fd = has_fd ? parse_fd(fd_start, fd_end) // like 1> file.txt - : STDOUT_FILENO; // like > file.txt - if (result.mode != redirection_mode_t::append) - result.mode = redirection_mode_t::overwrite; - // Note 'echo abc >>? file' is valid: it means append and noclobber. - // But here "noclobber" means the file must not exist, so appending - // can be ignored. - if (try_consume(L'?')) result.mode = redirection_mode_t::noclob; - } - break; - } - case L'<': { - consume(L'<'); - if (try_consume('&')) { - result.mode = redirection_mode_t::fd; - } else { - result.mode = redirection_mode_t::input; - } - result.fd = has_fd ? parse_fd(fd_start, fd_end) // like 1<&3 or 1< /tmp/file.txt - : STDIN_FILENO; // like <&3 or < /tmp/file.txt - break; - } - case L'&': { - consume(L'&'); - if (try_consume(L'|')) { - // &| is pipe with stderr merge. - result.fd = STDOUT_FILENO; - result.is_pipe = true; - result.stderr_merge = true; - } else if (try_consume(L'>')) { - result.fd = STDOUT_FILENO; - result.stderr_merge = true; - result.mode = redirection_mode_t::overwrite; - if (try_consume(L'>')) result.mode = redirection_mode_t::append; // like &>> - if (try_consume(L'?')) - result.mode = redirection_mode_t::noclob; // like &>? or &>>? - } else { - return none(); - } - break; - } - default: { - // Not a redirection. - return none(); - } - } - - result.consumed = (cursor - buff); - assert(result.consumed > 0 && "Should have consumed at least one character on success"); - return result; -} - -int pipe_or_redir_t::oflags() const { - switch (mode) { - case redirection_mode_t::append: { - return O_CREAT | O_APPEND | O_WRONLY; - } - case redirection_mode_t::overwrite: { - return O_CREAT | O_WRONLY | O_TRUNC; - } - case redirection_mode_t::noclob: { - return O_CREAT | O_EXCL | O_WRONLY; - } - case redirection_mode_t::input: { - return O_RDONLY; - } - case redirection_mode_t::fd: - default: { - return -1; - } - } -} - -/// Test if a character is whitespace. Differs from iswspace in that it does not consider a -/// newline to be whitespace. 
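oflags() above is a pure mode-to-flags mapping, so the port is a single match; PipeOrRedir::oflags is declared in the bridge at the top of the file. A sketch, with the RedirectionMode variant names assumed to mirror redirection_mode_t:

    // Sketch: same open(2) flags as the C++ switch; fd-mode redirections
    // have no meaningful oflags and keep returning -1.
    impl PipeOrRedir {
        fn oflags(&self) -> i32 {
            match self.mode {
                RedirectionMode::append => libc::O_CREAT | libc::O_APPEND | libc::O_WRONLY,
                RedirectionMode::overwrite => libc::O_CREAT | libc::O_WRONLY | libc::O_TRUNC,
                RedirectionMode::noclob => libc::O_CREAT | libc::O_EXCL | libc::O_WRONLY,
                RedirectionMode::input => libc::O_RDONLY,
                _ => -1, // RedirectionMode::fd and any future variants
            }
        }
    }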
-static bool iswspace_not_nl(wchar_t c) { - switch (c) { - case L' ': - case L'\t': - case L'\r': - return true; - case L'\n': - return false; - default: - return iswspace(c); - } -} - -maybe_t tokenizer_t::next() { - if (!this->has_next) { - return none(); - } - - // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past - // it. - for (;;) { - if (this->token_cursor[0] == L'\\' && this->token_cursor[1] == L'\n') { - this->token_cursor += 2; - this->continue_line_after_comment = true; - } else if (iswspace_not_nl(this->token_cursor[0])) { - this->token_cursor++; - } else { - break; - } - } - - while (*this->token_cursor == L'#') { - // We have a comment, walk over the comment. - const wchar_t *comment_start = this->token_cursor; - this->token_cursor = comment_end(this->token_cursor); - size_t comment_len = this->token_cursor - comment_start; - - // If we are going to continue after the comment, skip any trailing newline. - if (this->token_cursor[0] == L'\n' && this->continue_line_after_comment) - this->token_cursor++; - - // Maybe return the comment. - if (this->show_comments) { - tok_t result(token_type_t::comment); - result.offset = comment_start - this->start; - result.length = comment_len; - return result; - } - while (iswspace_not_nl(this->token_cursor[0])) this->token_cursor++; - } - - // We made it past the comments and ate any trailing newlines we wanted to ignore. - this->continue_line_after_comment = false; - const size_t start_pos = this->token_cursor - this->start; - - maybe_t result{}; - switch (*this->token_cursor) { - case L'\0': { - this->has_next = false; - return none(); - } - case L'\r': // carriage-return - case L'\n': // newline - case L';': { - result.emplace(token_type_t::end); - result->offset = start_pos; - result->length = 1; - this->token_cursor++; - // Hack: when we get a newline, swallow as many as we can. This compresses multiple - // subsequent newlines into a single one. - if (!this->show_blank_lines) { - while (*this->token_cursor == L'\n' || *this->token_cursor == 13 /* CR */ || - *this->token_cursor == ' ' || *this->token_cursor == '\t') { - this->token_cursor++; - } - } - break; - } - case L'&': { - if (this->token_cursor[1] == L'&') { - // && is and. - result.emplace(token_type_t::andand); - result->offset = start_pos; - result->length = 2; - this->token_cursor += 2; - } else if (this->token_cursor[1] == L'>' || this->token_cursor[1] == L'|') { - // &> and &| redirect both stdout and stderr. - auto redir = pipe_or_redir_t::from_string(this->token_cursor); - assert(redir.has_value() && - "Should always succeed to parse a &> or &| redirection"); - result.emplace(redir->token_type()); - result->offset = start_pos; - result->length = redir->consumed; - this->token_cursor += redir->consumed; - } else { - result.emplace(token_type_t::background); - result->offset = start_pos; - result->length = 1; - this->token_cursor++; - } - break; - } - case L'|': { - if (this->token_cursor[1] == L'|') { - // || is or. - result.emplace(token_type_t::oror); - result->offset = start_pos; - result->length = 2; - this->token_cursor += 2; - } else if (this->token_cursor[1] == L'&') { - // |& is a bashism; in fish it's &|. 
- return this->call_error(tokenizer_error_t::invalid_pipe_ampersand, - this->token_cursor, this->token_cursor, 2, 2); - } else { - auto pipe = pipe_or_redir_t::from_string(this->token_cursor); - assert(pipe.has_value() && pipe->is_pipe && - "Should always succeed to parse a | pipe"); - result.emplace(pipe->token_type()); - result->offset = start_pos; - result->length = pipe->consumed; - this->token_cursor += pipe->consumed; - } - break; - } - case L'>': - case L'<': { - // There's some duplication with the code in the default case below. The key - // difference here is that we must never parse these as a string; a failed - // redirection is an error! - auto redir_or_pipe = pipe_or_redir_t::from_string(this->token_cursor); - if (!redir_or_pipe || redir_or_pipe->fd < 0) { - return this->call_error(tokenizer_error_t::invalid_redirect, this->token_cursor, - this->token_cursor, - redir_or_pipe ? redir_or_pipe->consumed : 0, - redir_or_pipe ? redir_or_pipe->consumed : 0); - } - result.emplace(redir_or_pipe->token_type()); - result->offset = start_pos; - result->length = redir_or_pipe->consumed; - this->token_cursor += redir_or_pipe->consumed; - break; - } - default: { - // Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string. - const wchar_t *error_location = this->token_cursor; - maybe_t redir_or_pipe{}; - if (iswdigit(*this->token_cursor)) { - redir_or_pipe = pipe_or_redir_t::from_string(this->token_cursor); - } - - if (redir_or_pipe) { - // It looks like a redirection or a pipe. But we don't support piping fd 0. Note - // that fd 0 may be -1, indicating overflow; but we don't treat that as a - // tokenizer error. - if (redir_or_pipe->is_pipe && redir_or_pipe->fd == 0) { - return this->call_error(tokenizer_error_t::invalid_pipe, error_location, - error_location, redir_or_pipe->consumed, - redir_or_pipe->consumed); - } - result.emplace(redir_or_pipe->token_type()); - result->offset = start_pos; - result->length = redir_or_pipe->consumed; - this->token_cursor += redir_or_pipe->consumed; - } else { - // Not a redirection or pipe, so just a string. - result = this->read_string(); - } - break; - } - } - assert(result.has_value() && "Should have a token"); - return result; -} - -bool is_token_delimiter(wchar_t c, maybe_t next) { - return c == L'(' || !tok_is_string_character(c, std::move(next)); -} - -wcstring tok_command(const wcstring &str) { - tokenizer_t t(str.c_str(), 0); - while (auto token = t.next()) { - if (token->type != token_type_t::string) { - return {}; - } - wcstring text = t.text_of(*token); - if (variable_assignment_equals_pos(text)) { - continue; - } - return text; - } - return {}; -} - -bool move_word_state_machine_t::consume_char_punctuation(wchar_t c) { - enum { s_always_one = 0, s_rest, s_whitespace_rest, s_whitespace, s_alphanumeric, s_end }; - - bool consumed = false; - while (state != s_end && !consumed) { - switch (state) { - case s_always_one: { - // Always consume the first character. - consumed = true; - if (iswspace(c)) { - state = s_whitespace; - } else if (iswalnum(c)) { - state = s_alphanumeric; - } else { - // Don't allow switching type (ws->nonws) after non-whitespace and - // non-alphanumeric. - state = s_rest; - } - break; - } - case s_rest: { - if (iswspace(c)) { - // Consume only trailing whitespace. - state = s_whitespace_rest; - } else if (iswalnum(c)) { - // Consume only alnums. 
- state = s_alphanumeric; - } else { - consumed = false; - state = s_end; - } - break; - } - case s_whitespace_rest: - case s_whitespace: { - // "whitespace" consumes whitespace and switches to alnums, - // "whitespace_rest" only consumes whitespace. - if (iswspace(c)) { - // Consumed whitespace. - consumed = true; - } else { - state = state == s_whitespace ? s_alphanumeric : s_end; - } - break; - } - case s_alphanumeric: { - if (iswalnum(c)) { - consumed = true; // consumed alphanumeric - } else { - state = s_end; - } - break; - } - case s_end: - default: { - break; - } - } - } - return consumed; -} - -bool move_word_state_machine_t::is_path_component_character(wchar_t c) { - return tok_is_string_character(c, none()) && !std::wcschr(L"/={,}'\":@", c); -} - -bool move_word_state_machine_t::consume_char_path_components(wchar_t c) { - enum { - s_initial_punctuation, - s_whitespace, - s_separator, - s_slash, - s_path_component_characters, - s_initial_separator, - s_end - }; - - bool consumed = false; - while (state != s_end && !consumed) { - switch (state) { - case s_initial_punctuation: { - if (!is_path_component_character(c) && !iswspace(c)) { - state = s_initial_separator; - } else { - if (!is_path_component_character(c)) { - consumed = true; - } - state = s_whitespace; - } - break; - } - case s_whitespace: { - if (iswspace(c)) { - consumed = true; // consumed whitespace - } else if (c == L'/' || is_path_component_character(c)) { - state = s_slash; // path component - } else { - state = s_separator; // path separator - } - break; - } - case s_separator: { - if (!iswspace(c) && !is_path_component_character(c)) { - consumed = true; // consumed separator - } else { - state = s_end; - } - break; - } - case s_slash: { - if (c == L'/') { - consumed = true; // consumed slash - } else { - state = s_path_component_characters; - } - break; - } - case s_path_component_characters: { - if (is_path_component_character(c)) { - consumed = true; // consumed string character except slash - } else { - state = s_end; - } - break; - } - case s_initial_separator: { - if (is_path_component_character(c)) { - consumed = true; - state = s_path_component_characters; - } else if (iswspace(c)) { - state = s_end; - } else { - consumed = true; - } - break; - } - case s_end: - default: { - break; - } - } - } - return consumed; -} - -bool move_word_state_machine_t::consume_char_whitespace(wchar_t c) { - // Consume a "word" of printable characters plus any leading whitespace. - enum { s_always_one = 0, s_blank, s_graph, s_end }; - - bool consumed = false; - while (state != s_end && !consumed) { - switch (state) { - case s_always_one: { - consumed = true; // always consume the first character - // If it's not whitespace, only consume those from here. - if (!iswspace(c)) { - state = s_graph; - } else { - // If it's whitespace, keep consuming whitespace until the graphs. 
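is_path_component_character above shrinks to a one-liner on top of the string-character test; a sketch, reusing the tok_is_string_character port sketched earlier:

    // Sketch: a path component character is any string character that is
    // not one of fish's separator-ish characters.
    fn is_path_component_character(c: char) -> bool {
        tok_is_string_character(c, None) && !"/={,}'\":@".contains(c)
    }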
- state = s_blank; - } - break; - } - case s_blank: { - if (iswspace(c)) { - consumed = true; // consumed whitespace - } else { - state = s_graph; - } - break; - } - case s_graph: { - if (!iswspace(c)) { - consumed = true; // consumed printable non-space - } else { - state = s_end; - } - break; - } - case s_end: - default: { - break; - } - } - } - return consumed; -} - -bool move_word_state_machine_t::consume_char(wchar_t c) { - switch (style) { - case move_word_style_punctuation: { - return consume_char_punctuation(c); - } - case move_word_style_path_components: { - return consume_char_path_components(c); - } - case move_word_style_whitespace: { - return consume_char_whitespace(c); - } - } - - DIE("should not reach this statement"); // silence some compiler errors about not returning -} - -move_word_state_machine_t::move_word_state_machine_t(move_word_style_t syl) - : state(0), style(syl) {} - -void move_word_state_machine_t::reset() { state = 0; } diff --git a/src/tokenizer.h b/src/tokenizer.h index 475247614..5ad2ff3de 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -1,5 +1,3 @@ -// A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be -// extended to support marks, tokenizing multiple strings and disposing of unused string segments. #ifndef FISH_TOKENIZER_H #define FISH_TOKENIZER_H @@ -10,39 +8,28 @@ #include "maybe.h" #include "parse_constants.h" #include "redirection.h" -#if INCLUDE_RUST_HEADERS -#include "tokenizer.rs.h" -#endif - -/// Token types. XXX Why this isn't parse_token_type_t, I'm not really sure. -enum class token_type_t : uint8_t { - error, /// Error reading token - string, /// String token - pipe, /// Pipe token - andand, /// && token - oror, /// || token - end, /// End token (semicolon or newline, not literal end) - redirect, /// redirection token - background, /// send job to bg token - comment, /// comment token -}; - -/// Flag telling the tokenizer to accept incomplete parameters, i.e. parameters with mismatching -/// parenthesis, etc. This is useful for tab-completion. -#define TOK_ACCEPT_UNFINISHED 1 - -/// Flag telling the tokenizer not to remove comments. Useful for syntax highlighting. -#define TOK_SHOW_COMMENTS 2 - -/// Ordinarily, the tokenizer ignores newlines following a newline, or a semicolon. This flag tells -/// the tokenizer to return each of them as a separate END. -#define TOK_SHOW_BLANK_LINES 4 - -/// Make an effort to continue after an error. -#define TOK_CONTINUE_AFTER_ERROR 8 using tok_flags_t = unsigned int; +#define TOK_ACCEPT_UNFINISHED 1 +#define TOK_SHOW_COMMENTS 2 +#define TOK_SHOW_BLANK_LINES 4 +#define TOK_CONTINUE_AFTER_ERROR 8 + +#if INCLUDE_RUST_HEADERS + +#include "tokenizer.rs.h" +using token_type_t = TokenType; +using tokenizer_error_t = TokenizerError; +using tok_t = Tok; +using tokenizer_t = Tokenizer; +using pipe_or_redir_t = PipeOrRedir; +using move_word_state_machine_t = MoveWordStateMachine; +using move_word_style_t = MoveWordStyle; + +#else + +// Hacks to allow us to compile without Rust headers. enum class tokenizer_error_t : uint8_t { none, unterminated_quote, @@ -60,155 +47,6 @@ enum class tokenizer_error_t : uint8_t { expected_bclose_found_pclose, }; -/// Get the error message for an error \p err. -const wchar_t *tokenizer_get_error_message(tokenizer_error_t err); - -struct tok_t { - // Offset of the token. - source_offset_t offset{0}; - // Length of the token. - source_offset_t length{0}; - - // If an error, this is the offset of the error within the token. 
A value of 0 means it occurred - // at 'offset'. - source_offset_t error_offset_within_token{SOURCE_OFFSET_INVALID}; - source_offset_t error_length{0}; - - // If an error, this is the error code. - tokenizer_error_t error{tokenizer_error_t::none}; - - // The type of the token. - token_type_t type; - - // Construct from a token type. - explicit tok_t(token_type_t type); - - /// Returns whether the given location is within the source range or at its end. - bool location_in_or_at_end_of_source_range(size_t loc) const { - return offset <= loc && loc - offset <= length; - } - /// Gets source for the token, or the empty string if it has no source. - wcstring get_source(const wcstring &str) const { return wcstring(str, offset, length); } -}; -static_assert(sizeof(tok_t) <= 32, "tok_t expected to be 32 bytes or less"); - -/// The tokenizer struct. -class tokenizer_t : noncopyable_t { - /// A pointer into the original string, showing where the next token begins. - const wchar_t *token_cursor; - /// The start of the original string. - const wchar_t *const start; - /// Whether we have additional tokens. - bool has_next{true}; - /// Whether incomplete tokens are accepted. - bool accept_unfinished{false}; - /// Whether comments should be returned. - bool show_comments{false}; - /// Whether all blank lines are returned. - bool show_blank_lines{false}; - /// Whether to attempt to continue after an error. - bool continue_after_error{false}; - /// Whether to continue the previous line after the comment. - bool continue_line_after_comment{false}; - - tok_t call_error(tokenizer_error_t error_type, const wchar_t *token_start, - const wchar_t *error_loc, maybe_t token_length = {}, - size_t error_len = 0); - tok_t read_string(); - - public: - /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and - /// should not be freed by the caller until after the tokenizer is destroyed. - /// - /// \param b The string to tokenize - /// \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer - /// to accept incomplete tokens, such as a subshell without a closing parenthesis, as a valid - /// token. Setting TOK_SHOW_COMMENTS will return comments as tokens - tokenizer_t(const wchar_t *start, tok_flags_t flags); - - /// Returns the next token, or none() if we are at the end. - maybe_t next(); - - /// Returns the text of a token, as a string. - wcstring text_of(const tok_t &tok) const { return wcstring(start + tok.offset, tok.length); } - - /// Copies a token's text into a string. This is useful for reusing storage. - /// Returns a reference to the string. - const wcstring ©_text_of(const tok_t &tok, wcstring *result) { - return result->assign(start + tok.offset, tok.length); - } -}; - -/// Tests if this character can delimit tokens. -bool is_token_delimiter(wchar_t c, maybe_t next); - -/// \return the first token from the string, skipping variable assignments like A=B. -wcstring tok_command(const wcstring &str); - -/// Struct wrapping up a parsed pipe or redirection. -struct pipe_or_redir_t { - // The redirected fd, or -1 on overflow. - // In the common case of a pipe, this is 1 (STDOUT_FILENO). - // For example, in the case of "3>&1" this will be 3. - int fd{-1}; - - // Whether we are a pipe (true) or redirection (false). - bool is_pipe{false}; - - // The redirection mode if the type is redirect. - // Ignored for pipes. 
- redirection_mode_t mode{redirection_mode_t::overwrite}; - - // Whether, in addition to this redirection, stderr should also be dup'd to stdout - // For example &| or &> - bool stderr_merge{false}; - - // Number of characters consumed when parsing the string. - size_t consumed{0}; - - // Construct from a string. - static maybe_t from_string(const wchar_t *buff); - static maybe_t from_string(const wcstring &buff) { - return from_string(buff.c_str()); - } - - // \return the oflags (as in open(2)) for this redirection. - int oflags() const; - - // \return if we are "valid". Here "valid" means only that the source fd did not overflow. - // For example 99999999999> is invalid. - bool is_valid() const { return fd >= 0; } - - // \return the token type for this redirection. - token_type_t token_type() const { - return is_pipe ? token_type_t::pipe : token_type_t::redirect; - } - - private: - pipe_or_redir_t(); -}; - -enum move_word_style_t { - move_word_style_punctuation, // stop at punctuation - move_word_style_path_components, // stops at path components - move_word_style_whitespace // stops at whitespace -}; - -/// Our state machine that implements "one word" movement or erasure. -class move_word_state_machine_t { - private: - bool consume_char_punctuation(wchar_t c); - bool consume_char_path_components(wchar_t c); - bool is_path_component_character(wchar_t c); - bool consume_char_whitespace(wchar_t c); - - int state; - move_word_style_t style; - - public: - explicit move_word_state_machine_t(move_word_style_t syl); - bool consume_char(wchar_t c); - void reset(); -}; +#endif #endif
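One last detail worth recording: the two inline tok_t methods deleted from the header above survive as bridged methods on Tok, declared near the top of tokenizer.rs. Their bodies transcribe directly; a sketch, assuming WCharFromFFI's from_ffi() converts &CxxWString into a WString:

    // Sketch: offsets are u32 in the shared struct, so widen before
    // comparing; the logic matches the deleted inline C++ exactly.
    impl Tok {
        fn location_in_or_at_end_of_source_range(&self, loc: usize) -> bool {
            let loc = loc as u32;
            self.offset <= loc && loc - self.offset <= self.length
        }
        fn get_source_ffi(&self, s: &CxxWString) -> UniquePtr<CxxWString> {
            let s: WString = s.from_ffi(); // conversion assumed
            let start = self.offset as usize;
            let end = start + self.length as usize;
            s[start..end].to_ffi()
        }
    }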