mirror of
https://github.com/fish-shell/fish-shell
synced 2025-01-08 11:08:53 +00:00
1412 lines
52 KiB
Rust
1412 lines
52 KiB
Rust
//! A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be
|
|
//! extended to support marks, tokenizing multiple strings and disposing of unused string segments.
|
|
|
|
use crate::common::valid_var_name_char;
|
|
use crate::ffi::wcharz_t;
|
|
use crate::future_feature_flags::{feature_test, FeatureFlag};
|
|
use crate::parse_constants::SOURCE_OFFSET_INVALID;
|
|
use crate::redirection::RedirectionMode;
|
|
use crate::wchar::prelude::*;
|
|
use crate::wchar_ffi::{wchar_t, AsWstr, WCharToFFI};
|
|
use cxx::{CxxWString, SharedPtr, UniquePtr};
|
|
use libc::{c_int, STDIN_FILENO, STDOUT_FILENO};
|
|
use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, Not};
|
|
use std::os::fd::RawFd;
|
|
|
|
#[cxx::bridge]
|
|
mod tokenizer_ffi {
|
|
extern "C++" {
|
|
include!("wutil.h");
|
|
include!("redirection.h");
|
|
type wcharz_t = super::wcharz_t;
|
|
type RedirectionMode = super::RedirectionMode;
|
|
}
|
|
|
|
/// Token types. XXX Why this isn't ParseTokenType, I'm not really sure.
|
|
enum TokenType {
|
|
/// Error reading token
|
|
error,
|
|
/// String token
|
|
string,
|
|
/// Pipe token
|
|
pipe,
|
|
/// && token
|
|
andand,
|
|
/// || token
|
|
oror,
|
|
/// End token (semicolon or newline, not literal end)
|
|
end,
|
|
/// redirection token
|
|
redirect,
|
|
/// send job to bg token
|
|
background,
|
|
/// comment token
|
|
comment,
|
|
}
|
|
|
|
enum TokenizerError {
|
|
none,
|
|
unterminated_quote,
|
|
unterminated_subshell,
|
|
unterminated_slice,
|
|
unterminated_escape,
|
|
invalid_redirect,
|
|
invalid_pipe,
|
|
invalid_pipe_ampersand,
|
|
closing_unopened_subshell,
|
|
illegal_slice,
|
|
closing_unopened_brace,
|
|
unterminated_brace,
|
|
expected_pclose_found_bclose,
|
|
expected_bclose_found_pclose,
|
|
}
|
|
|
|
extern "Rust" {
|
|
fn tokenizer_get_error_message(err: TokenizerError) -> UniquePtr<CxxWString>;
|
|
}
|
|
|
|
struct Tok {
|
|
// Offset of the token.
|
|
offset: u32,
|
|
// Length of the token.
|
|
length: u32,
|
|
|
|
// If an error, this is the offset of the error within the token. A value of 0 means it occurred
|
|
// at 'offset'.
|
|
error_offset_within_token: u32,
|
|
error_length: u32,
|
|
|
|
// If an error, this is the error code.
|
|
error: TokenizerError,
|
|
|
|
// The type of the token.
|
|
type_: TokenType,
|
|
}
|
|
// TODO static_assert(sizeof(Tok) <= 32, "Tok expected to be 32 bytes or less");
|
|
|
|
extern "Rust" {
|
|
fn location_in_or_at_end_of_source_range(self: &Tok, loc: usize) -> bool;
|
|
#[cxx_name = "get_source"]
|
|
fn get_source_ffi(self: &Tok, str: &CxxWString) -> UniquePtr<CxxWString>;
|
|
}
|
|
|
|
extern "Rust" {
|
|
type Tokenizer;
|
|
fn new_tokenizer(start: wcharz_t, flags: u8) -> Box<Tokenizer>;
|
|
#[cxx_name = "next"]
|
|
fn next_ffi(self: &mut Tokenizer) -> UniquePtr<Tok>;
|
|
#[cxx_name = "text_of"]
|
|
fn text_of_ffi(self: &Tokenizer, tok: &Tok) -> UniquePtr<CxxWString>;
|
|
#[cxx_name = "is_token_delimiter"]
|
|
fn is_token_delimiter_ffi(c: wchar_t, next: SharedPtr<wchar_t>) -> bool;
|
|
}
|
|
|
|
extern "Rust" {
|
|
#[cxx_name = "tok_command"]
|
|
fn tok_command_ffi(str: &CxxWString) -> UniquePtr<CxxWString>;
|
|
}
|
|
|
|
/// Struct wrapping up a parsed pipe or redirection.
|
|
struct PipeOrRedir {
|
|
// The redirected fd, or -1 on overflow.
|
|
// In the common case of a pipe, this is 1 (STDOUT_FILENO).
|
|
// For example, in the case of "3>&1" this will be 3.
|
|
fd: i32,
|
|
|
|
// Whether we are a pipe (true) or redirection (false).
|
|
is_pipe: bool,
|
|
|
|
// The redirection mode if the type is redirect.
|
|
// Ignored for pipes.
|
|
mode: RedirectionMode,
|
|
|
|
// Whether, in addition to this redirection, stderr should also be dup'd to stdout
|
|
// For example &| or &>
|
|
stderr_merge: bool,
|
|
|
|
// Number of characters consumed when parsing the string.
|
|
consumed: usize,
|
|
}
|
|
|
|
extern "Rust" {
|
|
fn pipe_or_redir_from_string(buff: wcharz_t) -> UniquePtr<PipeOrRedir>;
|
|
fn is_valid(self: &PipeOrRedir) -> bool;
|
|
fn oflags(self: &PipeOrRedir) -> i32;
|
|
fn token_type(self: &PipeOrRedir) -> TokenType;
|
|
}
|
|
|
|
enum MoveWordStyle {
|
|
move_word_style_punctuation, // stop at punctuation
|
|
move_word_style_path_components, // stops at path components
|
|
move_word_style_whitespace, // stops at whitespace
|
|
}
|
|
|
|
/// Our state machine that implements "one word" movement or erasure.
|
|
struct MoveWordStateMachine {
|
|
state: u8,
|
|
style: MoveWordStyle,
|
|
}
|
|
|
|
extern "Rust" {
|
|
fn new_move_word_state_machine(syl: MoveWordStyle) -> Box<MoveWordStateMachine>;
|
|
#[cxx_name = "consume_char"]
|
|
fn consume_char_ffi(self: &mut MoveWordStateMachine, c: wchar_t) -> bool;
|
|
fn reset(self: &mut MoveWordStateMachine);
|
|
}
|
|
|
|
extern "Rust" {
|
|
#[cxx_name = "variable_assignment_equals_pos"]
|
|
fn variable_assignment_equals_pos_ffi(txt: &CxxWString) -> SharedPtr<usize>;
|
|
}
|
|
}
|
|
|
|
pub use tokenizer_ffi::{
|
|
MoveWordStateMachine, MoveWordStyle, PipeOrRedir, Tok, TokenType, TokenizerError,
|
|
};
|
|
|
|
/// Bit set of options controlling the tokenizer; combine the `TOK_*` constants with `|`.
#[derive(Clone, Copy)]
pub struct TokFlags(pub u8);

/// `a & b` is a membership test: it is true when the two flag sets share at least one bit.
impl BitAnd for TokFlags {
    type Output = bool;
    fn bitand(self, rhs: Self) -> Self::Output {
        let shared_bits = self.0 & rhs.0;
        shared_bits != 0
    }
}

/// `a | b` is the union of both flag sets.
impl BitOr for TokFlags {
    type Output = Self;
    fn bitor(self, rhs: Self) -> Self::Output {
        let TokFlags(lhs_bits) = self;
        let TokFlags(rhs_bits) = rhs;
        TokFlags(lhs_bits | rhs_bits)
    }
}

/// `a |= b` adds the bits of `b` to `a`.
impl BitOrAssign for TokFlags {
    fn bitor_assign(&mut self, rhs: Self) {
        self.0 = self.0 | rhs.0;
    }
}

/// Accept incomplete parameters, e.g. parameters with mismatched parentheses. Useful for
/// tab-completion.
pub const TOK_ACCEPT_UNFINISHED: TokFlags = TokFlags(1 << 0);

/// Do not strip comments; return them as tokens. Useful for syntax highlighting.
pub const TOK_SHOW_COMMENTS: TokFlags = TokFlags(1 << 1);

/// Normally newlines that follow a newline or a semicolon are ignored. With this flag each of
/// them is returned as its own END token.
pub const TOK_SHOW_BLANK_LINES: TokFlags = TokFlags(1 << 2);

/// Try to keep tokenizing after an error.
pub const TOK_CONTINUE_AFTER_ERROR: TokFlags = TokFlags(1 << 3);
|
|
|
|
/// Get the error message for an error \p err.
|
|
pub fn tokenizer_get_error_message(err: TokenizerError) -> UniquePtr<CxxWString> {
|
|
let s: &'static wstr = err.into();
|
|
s.to_ffi()
|
|
}
|
|
|
|
impl From<TokenizerError> for &'static wstr {
|
|
#[widestrs]
|
|
fn from(err: TokenizerError) -> Self {
|
|
match err {
|
|
TokenizerError::none => ""L,
|
|
TokenizerError::unterminated_quote => {
|
|
wgettext!("Unexpected end of string, quotes are not balanced")
|
|
}
|
|
TokenizerError::unterminated_subshell => {
|
|
wgettext!("Unexpected end of string, expecting ')'")
|
|
}
|
|
TokenizerError::unterminated_slice => {
|
|
wgettext!("Unexpected end of string, square brackets do not match")
|
|
}
|
|
TokenizerError::unterminated_escape => {
|
|
wgettext!("Unexpected end of string, incomplete escape sequence")
|
|
}
|
|
TokenizerError::invalid_redirect => {
|
|
wgettext!("Invalid input/output redirection")
|
|
}
|
|
TokenizerError::invalid_pipe => {
|
|
wgettext!("Cannot use stdin (fd 0) as pipe output")
|
|
}
|
|
TokenizerError::invalid_pipe_ampersand => {
|
|
wgettext!("|& is not valid. In fish, use &| to pipe both stdout and stderr.")
|
|
}
|
|
TokenizerError::closing_unopened_subshell => {
|
|
wgettext!("Unexpected ')' for unopened parenthesis")
|
|
}
|
|
TokenizerError::illegal_slice => {
|
|
wgettext!("Unexpected '[' at this location")
|
|
}
|
|
TokenizerError::closing_unopened_brace => {
|
|
wgettext!("Unexpected '}' for unopened brace expansion")
|
|
}
|
|
TokenizerError::unterminated_brace => {
|
|
wgettext!("Unexpected end of string, incomplete parameter expansion")
|
|
}
|
|
TokenizerError::expected_pclose_found_bclose => {
|
|
wgettext!("Unexpected '}' found, expecting ')'")
|
|
}
|
|
TokenizerError::expected_bclose_found_pclose => {
|
|
wgettext!("Unexpected ')' found, expecting '}'")
|
|
}
|
|
_ => {
|
|
panic!("Unexpected tokenizer error");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Lets a `TokenizerError` be passed as a printf-style string argument: the error is rendered
/// as its message text.
impl printf_compat::args::ToArg<'static> for TokenizerError {
    fn to_arg(self) -> printf_compat::args::Arg<'static> {
        use printf_compat::args::Arg;
        Arg::Str(self.into())
    }
}
|
|
|
|
impl Tok {
|
|
fn new(r#type: TokenType) -> Tok {
|
|
Tok {
|
|
offset: 0,
|
|
length: 0,
|
|
error_offset_within_token: SOURCE_OFFSET_INVALID.try_into().unwrap(),
|
|
error_length: 0,
|
|
error: TokenizerError::none,
|
|
type_: r#type,
|
|
}
|
|
}
|
|
pub fn location_in_or_at_end_of_source_range(self: &Tok, loc: usize) -> bool {
|
|
let loc = loc as u32;
|
|
self.offset <= loc && loc - self.offset <= self.length
|
|
}
|
|
pub fn get_source<'a, 'b>(self: &'a Tok, str: &'b wstr) -> &'b wstr {
|
|
&str[self.offset as usize..(self.offset + self.length) as usize]
|
|
}
|
|
fn get_source_ffi(self: &Tok, str: &CxxWString) -> UniquePtr<CxxWString> {
|
|
self.get_source(str.as_wstr()).to_ffi()
|
|
}
|
|
pub fn set_offset(&mut self, value: usize) {
|
|
self.offset = value.try_into().unwrap();
|
|
}
|
|
pub fn offset(&self) -> usize {
|
|
self.offset.try_into().unwrap()
|
|
}
|
|
pub fn length(&self) -> usize {
|
|
self.length.try_into().unwrap()
|
|
}
|
|
pub fn set_length(&mut self, value: usize) {
|
|
self.length = value.try_into().unwrap();
|
|
}
|
|
pub fn set_error_offset_within_token(&mut self, value: usize) {
|
|
self.error_offset_within_token = value.try_into().unwrap();
|
|
}
|
|
pub fn error_offset_within_token(&self) -> usize {
|
|
self.error_offset_within_token.try_into().unwrap()
|
|
}
|
|
pub fn error_length(&self) -> usize {
|
|
self.error_length.try_into().unwrap()
|
|
}
|
|
pub fn set_error_length(&mut self, value: usize) {
|
|
self.error_length = value.try_into().unwrap();
|
|
}
|
|
}
|
|
|
|
/// The tokenizer struct.
|
|
pub struct Tokenizer {
|
|
/// A pointer into the original string, showing where the next token begins.
|
|
token_cursor: usize,
|
|
/// The start of the original string.
|
|
start: WString, // TODO Avoid copying once we drop the FFI.
|
|
/// Whether we have additional tokens.
|
|
has_next: bool,
|
|
/// Whether incomplete tokens are accepted.
|
|
accept_unfinished: bool,
|
|
/// Whether comments should be returned.
|
|
show_comments: bool,
|
|
/// Whether all blank lines are returned.
|
|
show_blank_lines: bool,
|
|
/// Whether to attempt to continue after an error.
|
|
continue_after_error: bool,
|
|
/// Whether to continue the previous line after the comment.
|
|
continue_line_after_comment: bool,
|
|
}
|
|
|
|
impl Tokenizer {
|
|
/// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
|
|
/// should not be freed by the caller until after the tokenizer is destroyed.
|
|
///
|
|
/// \param start The string to tokenize
|
|
/// \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
|
|
/// to accept incomplete tokens, such as a subshell without a closing parenthesis, as a valid
|
|
/// token. Setting TOK_SHOW_COMMENTS will return comments as tokens
|
|
pub fn new(start: &wstr, flags: TokFlags) -> Self {
|
|
Tokenizer {
|
|
token_cursor: 0,
|
|
start: start.to_owned(),
|
|
has_next: true,
|
|
accept_unfinished: flags & TOK_ACCEPT_UNFINISHED,
|
|
show_comments: flags & TOK_SHOW_COMMENTS,
|
|
show_blank_lines: flags & TOK_SHOW_BLANK_LINES,
|
|
continue_after_error: flags & TOK_CONTINUE_AFTER_ERROR,
|
|
continue_line_after_comment: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
fn new_tokenizer(start: wcharz_t, flags: u8) -> Box<Tokenizer> {
|
|
Box::new(Tokenizer::new(start.into(), TokFlags(flags)))
|
|
}
|
|
|
|
impl Iterator for Tokenizer {
|
|
type Item = Tok;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
if !self.has_next {
|
|
return None;
|
|
}
|
|
|
|
// Consume non-newline whitespace. If we get an escaped newline, mark it and continue past
|
|
// it.
|
|
loop {
|
|
let i = self.token_cursor;
|
|
if self.start.get(i..i + 2) == Some(L!("\\\n")) {
|
|
self.token_cursor += 2;
|
|
self.continue_line_after_comment = true;
|
|
} else if i < self.start.len() && iswspace_not_nl(self.start.char_at(i)) {
|
|
self.token_cursor += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
while self.start.char_at(self.token_cursor) == '#' {
|
|
// We have a comment, walk over the comment.
|
|
let comment_start = self.token_cursor;
|
|
self.token_cursor = comment_end(&self.start, self.token_cursor);
|
|
let comment_len = self.token_cursor - comment_start;
|
|
|
|
// If we are going to continue after the comment, skip any trailing newline.
|
|
if self.start.as_char_slice().get(self.token_cursor) == Some(&'\n')
|
|
&& self.continue_line_after_comment
|
|
{
|
|
self.token_cursor += 1;
|
|
}
|
|
|
|
// Maybe return the comment.
|
|
if self.show_comments {
|
|
let mut result = Tok::new(TokenType::comment);
|
|
result.offset = comment_start as u32;
|
|
result.length = comment_len as u32;
|
|
return Some(result);
|
|
}
|
|
|
|
while self.token_cursor < self.start.len()
|
|
&& iswspace_not_nl(self.start.char_at(self.token_cursor))
|
|
{
|
|
self.token_cursor += 1;
|
|
}
|
|
}
|
|
|
|
// We made it past the comments and ate any trailing newlines we wanted to ignore.
|
|
self.continue_line_after_comment = false;
|
|
let start_pos = self.token_cursor;
|
|
|
|
let this_char = self.start.char_at(self.token_cursor);
|
|
let next_char = self
|
|
.start
|
|
.as_char_slice()
|
|
.get(self.token_cursor + 1)
|
|
.copied();
|
|
let buff = &self.start[self.token_cursor..];
|
|
match this_char {
|
|
'\0'=> {
|
|
self.has_next = false;
|
|
None
|
|
}
|
|
'\r'| // carriage-return
|
|
'\n'| // newline
|
|
';'=> {
|
|
let mut result = Tok::new(TokenType::end);
|
|
result.offset = start_pos as u32;
|
|
result.length = 1;
|
|
self.token_cursor+=1;
|
|
// Hack: when we get a newline, swallow as many as we can. This compresses multiple
|
|
// subsequent newlines into a single one.
|
|
if !self.show_blank_lines {
|
|
while self.token_cursor < self.start.len() {
|
|
let c = self.start.char_at(self.token_cursor);
|
|
if c != '\n' && c != '\r' && c != ' ' && c != '\t' {
|
|
break
|
|
}
|
|
self.token_cursor+=1;
|
|
}
|
|
}
|
|
Some(result)
|
|
}
|
|
'&'=> {
|
|
if next_char == Some('&') {
|
|
// && is and.
|
|
let mut result = Tok::new(TokenType::andand);
|
|
result.offset = start_pos as u32;
|
|
result.length = 2;
|
|
self.token_cursor += 2;
|
|
Some(result)
|
|
} else if next_char == Some('>') || next_char == Some('|') {
|
|
// &> and &| redirect both stdout and stderr.
|
|
let redir = PipeOrRedir::try_from(buff).
|
|
expect("Should always succeed to parse a &> or &| redirection");
|
|
let mut result = Tok::new(redir.token_type());
|
|
result.offset = start_pos as u32;
|
|
result.length = redir.consumed as u32;
|
|
self.token_cursor += redir.consumed;
|
|
Some(result)
|
|
} else {
|
|
let mut result = Tok::new(TokenType::background);
|
|
result.offset = start_pos as u32;
|
|
result.length = 1;
|
|
self.token_cursor+=1;
|
|
Some(result)
|
|
}
|
|
}
|
|
'|'=> {
|
|
if next_char == Some('|') {
|
|
// || is or.
|
|
let mut result=Tok::new(TokenType::oror);
|
|
result.offset = start_pos as u32;
|
|
result.length = 2;
|
|
self.token_cursor += 2;
|
|
Some(result)
|
|
} else if next_char == Some('&') {
|
|
// |& is a bashism; in fish it's &|.
|
|
Some(self.call_error(TokenizerError::invalid_pipe_ampersand,
|
|
self.token_cursor, self.token_cursor, Some(2), 2))
|
|
} else {
|
|
let pipe = PipeOrRedir::try_from(buff).
|
|
expect("Should always succeed to parse a | pipe");
|
|
let mut result = Tok::new(pipe.token_type());
|
|
result.offset = start_pos as u32;
|
|
result.length = pipe.consumed as u32;
|
|
self.token_cursor += pipe.consumed;
|
|
Some(result)
|
|
}
|
|
}
|
|
'>'| '<' => {
|
|
// There's some duplication with the code in the default case below. The key
|
|
// difference here is that we must never parse these as a string; a failed
|
|
// redirection is an error!
|
|
match PipeOrRedir::try_from(buff) {
|
|
Ok(redir_or_pipe) => {
|
|
if redir_or_pipe.fd < 0 {
|
|
Some(self.call_error(TokenizerError::invalid_redirect, self.token_cursor,
|
|
self.token_cursor,
|
|
Some(redir_or_pipe.consumed),
|
|
redir_or_pipe.consumed))
|
|
} else {
|
|
let mut result = Tok::new(redir_or_pipe.token_type());
|
|
result.offset = start_pos as u32;
|
|
result.length = redir_or_pipe.consumed as u32;
|
|
self.token_cursor += redir_or_pipe.consumed;
|
|
Some(result)
|
|
}
|
|
}
|
|
Err(()) => Some(self.call_error(TokenizerError::invalid_redirect, self.token_cursor,
|
|
self.token_cursor,
|
|
Some(0),
|
|
0))
|
|
}
|
|
}
|
|
_ => {
|
|
// Maybe a redirection like '2>&1', maybe a pipe like 2>|, maybe just a string.
|
|
let error_location = self.token_cursor;
|
|
let redir_or_pipe = if this_char.is_ascii_digit() {
|
|
PipeOrRedir::try_from(buff).ok()
|
|
} else {
|
|
None
|
|
};
|
|
|
|
match redir_or_pipe {
|
|
Some(redir_or_pipe) => {
|
|
// It looks like a redirection or a pipe. But we don't support piping fd 0. Note
|
|
// tSome(hat fd 0 may be -1, indicating overflow; but we don't treat that as a
|
|
// tokenizer error.
|
|
if redir_or_pipe.is_pipe && redir_or_pipe.fd == 0 {
|
|
Some(self.call_error(TokenizerError::invalid_pipe, error_location,
|
|
error_location, Some(redir_or_pipe.consumed),
|
|
redir_or_pipe.consumed))
|
|
}
|
|
else {
|
|
let mut result = Tok::new(redir_or_pipe.token_type());
|
|
result.offset = start_pos as u32;
|
|
result.length = redir_or_pipe.consumed as u32;
|
|
self.token_cursor += redir_or_pipe.consumed;
|
|
Some(result)
|
|
}
|
|
}
|
|
None => {
|
|
// Not a redirection or pipe, so just a string.
|
|
Some(self.read_string())
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
impl Tokenizer {
|
|
fn next_ffi(&mut self) -> UniquePtr<Tok> {
|
|
match self.next() {
|
|
Some(tok) => UniquePtr::new(tok),
|
|
None => UniquePtr::null(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Return whether `c` is whitespace other than a newline. Unlike a plain whitespace test, a
/// newline is never treated as whitespace here.
fn iswspace_not_nl(c: char) -> bool {
    if c == '\n' {
        return false;
    }
    // Space, tab and carriage return are all covered by `is_whitespace`.
    c.is_whitespace()
}
|
|
|
|
impl Tokenizer {
|
|
/// Returns the text of a token, as a string.
|
|
pub fn text_of(&self, tok: &Tok) -> &wstr {
|
|
tok.get_source(&self.start)
|
|
}
|
|
fn text_of_ffi(&self, tok: &Tok) -> UniquePtr<CxxWString> {
|
|
self.text_of(tok).to_ffi()
|
|
}
|
|
|
|
/// Return an error token and mark that we no longer have a next token.
|
|
fn call_error(
|
|
&mut self,
|
|
error_type: TokenizerError,
|
|
token_start: usize,
|
|
error_loc: usize,
|
|
token_length: Option<usize>,
|
|
error_len: usize,
|
|
) -> Tok {
|
|
assert!(
|
|
error_type != TokenizerError::none,
|
|
"TokenizerError::none passed to call_error"
|
|
);
|
|
assert!(error_loc >= token_start, "Invalid error location");
|
|
assert!(self.token_cursor >= token_start, "Invalid buff location");
|
|
|
|
// If continue_after_error is set and we have a real token length, then skip past it.
|
|
// Otherwise give up.
|
|
match token_length {
|
|
Some(token_length) if self.continue_after_error => {
|
|
assert!(
|
|
self.token_cursor < error_loc + token_length,
|
|
"Unable to continue past error"
|
|
);
|
|
self.token_cursor = error_loc + token_length;
|
|
}
|
|
_ => self.has_next = false,
|
|
}
|
|
|
|
Tok {
|
|
offset: token_start as u32,
|
|
length: token_length.unwrap_or(self.token_cursor - token_start) as u32,
|
|
error_offset_within_token: (error_loc - token_start) as u32,
|
|
error_length: error_len as u32,
|
|
error: error_type,
|
|
type_: TokenType::error,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Tokenizer {
|
|
/// Read the next token as a string.
|
|
fn read_string(&mut self) -> Tok {
|
|
let mut mode = TOK_MODE_REGULAR_TEXT;
|
|
let mut paran_offsets = vec![];
|
|
let mut brace_offsets = vec![];
|
|
let mut expecting = vec![];
|
|
let mut quoted_cmdsubs = vec![];
|
|
let mut slice_offset = 0;
|
|
let buff_start = self.token_cursor;
|
|
let mut is_token_begin = true;
|
|
|
|
fn process_opening_quote(
|
|
this: &mut Tokenizer,
|
|
quoted_cmdsubs: &mut Vec<usize>,
|
|
paran_offsets: &mut Vec<usize>,
|
|
quote: char,
|
|
) -> Result<(), usize> {
|
|
if let Some(end) = quote_end(&this.start, this.token_cursor, quote) {
|
|
if this.start.char_at(end) == '$' {
|
|
quoted_cmdsubs.push(paran_offsets.len());
|
|
}
|
|
this.token_cursor = end;
|
|
Ok(())
|
|
} else {
|
|
let error_loc = this.token_cursor;
|
|
this.token_cursor = this.start.len();
|
|
Err(error_loc)
|
|
}
|
|
}
|
|
|
|
while self.token_cursor != self.start.len() {
|
|
let c = self.start.char_at(self.token_cursor);
|
|
|
|
// Make sure this character isn't being escaped before anything else
|
|
if mode & TOK_MODE_CHAR_ESCAPE {
|
|
mode &= !TOK_MODE_CHAR_ESCAPE;
|
|
// and do nothing more
|
|
} else if myal(c) {
|
|
// Early exit optimization in case the character is just a letter,
|
|
// which has no special meaning to the tokenizer, i.e. the same mode continues.
|
|
}
|
|
// Now proceed with the evaluation of the token, first checking to see if the token
|
|
// has been explicitly ignored (escaped).
|
|
else if c == '\\' {
|
|
mode |= TOK_MODE_CHAR_ESCAPE;
|
|
} else if c == '#' && is_token_begin {
|
|
self.token_cursor = comment_end(&self.start, self.token_cursor) - 1;
|
|
} else if c == '(' {
|
|
paran_offsets.push(self.token_cursor);
|
|
expecting.push(')');
|
|
mode |= TOK_MODE_SUBSHELL;
|
|
} else if c == '{' {
|
|
brace_offsets.push(self.token_cursor);
|
|
expecting.push('}');
|
|
mode |= TOK_MODE_CURLY_BRACES;
|
|
} else if c == ')' {
|
|
if expecting.last() == Some(&'}') {
|
|
return self.call_error(
|
|
TokenizerError::expected_bclose_found_pclose,
|
|
self.token_cursor,
|
|
self.token_cursor,
|
|
Some(1),
|
|
1,
|
|
);
|
|
}
|
|
if paran_offsets.is_empty() {
|
|
return self.call_error(
|
|
TokenizerError::closing_unopened_subshell,
|
|
self.token_cursor,
|
|
self.token_cursor,
|
|
Some(1),
|
|
1,
|
|
);
|
|
}
|
|
paran_offsets.pop();
|
|
if paran_offsets.is_empty() {
|
|
mode &= !TOK_MODE_SUBSHELL;
|
|
}
|
|
expecting.pop();
|
|
// Check if the ) completed a quoted command substitution.
|
|
if quoted_cmdsubs.last() == Some(¶n_offsets.len()) {
|
|
quoted_cmdsubs.pop();
|
|
// The "$(" part of a quoted command substitution closes double quotes. To keep
|
|
// quotes balanced, act as if there was an invisible double quote after the ")".
|
|
if let Err(error_loc) =
|
|
process_opening_quote(self, &mut quoted_cmdsubs, &mut paran_offsets, '"')
|
|
{
|
|
if !self.accept_unfinished {
|
|
return self.call_error(
|
|
TokenizerError::unterminated_quote,
|
|
buff_start,
|
|
error_loc,
|
|
None,
|
|
0,
|
|
);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
} else if c == '}' {
|
|
if expecting.last() == Some(&')') {
|
|
return self.call_error(
|
|
TokenizerError::expected_pclose_found_bclose,
|
|
self.token_cursor,
|
|
self.token_cursor,
|
|
Some(1),
|
|
1,
|
|
);
|
|
}
|
|
if brace_offsets.is_empty() {
|
|
return self.call_error(
|
|
TokenizerError::closing_unopened_brace,
|
|
self.token_cursor,
|
|
self.start.len(),
|
|
None,
|
|
0,
|
|
);
|
|
}
|
|
brace_offsets.pop();
|
|
if brace_offsets.is_empty() {
|
|
mode &= !TOK_MODE_CURLY_BRACES;
|
|
}
|
|
expecting.pop();
|
|
} else if c == '[' {
|
|
if self.token_cursor != buff_start {
|
|
mode |= TOK_MODE_ARRAY_BRACKETS;
|
|
slice_offset = self.token_cursor;
|
|
} else {
|
|
// This is actually allowed so the test operator `[` can be used as the head of a
|
|
// command
|
|
}
|
|
}
|
|
// Only exit bracket mode if we are in bracket mode.
|
|
// Reason: `]` can be a parameter, e.g. last parameter to `[` test alias.
|
|
// e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket
|
|
else if c == ']' && (mode & TOK_MODE_ARRAY_BRACKETS) {
|
|
mode &= !TOK_MODE_ARRAY_BRACKETS;
|
|
} else if c == '\'' || c == '"' {
|
|
if let Err(error_loc) =
|
|
process_opening_quote(self, &mut quoted_cmdsubs, &mut paran_offsets, c)
|
|
{
|
|
if !self.accept_unfinished {
|
|
return self.call_error(
|
|
TokenizerError::unterminated_quote,
|
|
buff_start,
|
|
error_loc,
|
|
None,
|
|
1,
|
|
);
|
|
}
|
|
break;
|
|
}
|
|
} else if mode == TOK_MODE_REGULAR_TEXT
|
|
&& !tok_is_string_character(
|
|
c,
|
|
self.start
|
|
.as_char_slice()
|
|
.get(self.token_cursor + 1)
|
|
.copied(),
|
|
)
|
|
{
|
|
break;
|
|
}
|
|
|
|
let next = self
|
|
.start
|
|
.as_char_slice()
|
|
.get(self.token_cursor + 1)
|
|
.copied();
|
|
is_token_begin = is_token_delimiter(c, next);
|
|
self.token_cursor += 1;
|
|
}
|
|
|
|
if !self.accept_unfinished && mode != TOK_MODE_REGULAR_TEXT {
|
|
// These are all "unterminated", so the only char we can mark as an error
|
|
// is the opener (the closing char could be anywhere!)
|
|
//
|
|
// (except for TOK_MODE_CHAR_ESCAPE, which is one long by definition)
|
|
if mode & TOK_MODE_CHAR_ESCAPE {
|
|
return self.call_error(
|
|
TokenizerError::unterminated_escape,
|
|
buff_start,
|
|
self.token_cursor - 1,
|
|
None,
|
|
1,
|
|
);
|
|
} else if mode & TOK_MODE_ARRAY_BRACKETS {
|
|
return self.call_error(
|
|
TokenizerError::unterminated_slice,
|
|
buff_start,
|
|
slice_offset,
|
|
None,
|
|
1,
|
|
);
|
|
} else if mode & TOK_MODE_SUBSHELL {
|
|
assert!(!paran_offsets.is_empty());
|
|
let offset_of_open_paran = *paran_offsets.last().unwrap();
|
|
|
|
return self.call_error(
|
|
TokenizerError::unterminated_subshell,
|
|
buff_start,
|
|
offset_of_open_paran,
|
|
None,
|
|
1,
|
|
);
|
|
} else if mode & TOK_MODE_CURLY_BRACES {
|
|
assert!(!brace_offsets.is_empty());
|
|
let offset_of_open_brace = *brace_offsets.last().unwrap();
|
|
|
|
return self.call_error(
|
|
TokenizerError::unterminated_brace,
|
|
buff_start,
|
|
offset_of_open_brace,
|
|
None,
|
|
1,
|
|
);
|
|
} else {
|
|
panic!("Unknown non-regular-text mode");
|
|
}
|
|
}
|
|
|
|
let mut result = Tok::new(TokenType::string);
|
|
result.set_offset(buff_start);
|
|
result.set_length(self.token_cursor - buff_start);
|
|
result
|
|
}
|
|
}
|
|
|
|
pub fn quote_end(s: &wstr, mut pos: usize, quote: char) -> Option<usize> {
|
|
loop {
|
|
pos += 1;
|
|
|
|
if pos == s.len() {
|
|
return None;
|
|
}
|
|
|
|
let c = s.char_at(pos);
|
|
if c == '\\' {
|
|
pos += 1;
|
|
if pos == s.len() {
|
|
return None;
|
|
}
|
|
} else if c == quote ||
|
|
// Command substitutions also end a double quoted string. This is how we
|
|
// support command substitutions inside double quotes.
|
|
(quote == '"' && c == '$' && s.as_char_slice().get(pos+1) == Some(&'('))
|
|
{
|
|
return Some(pos);
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn comment_end(s: &wstr, mut pos: usize) -> usize {
|
|
loop {
|
|
pos += 1;
|
|
if pos == s.len() || s.char_at(pos) == '\n' {
|
|
return pos;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Tests if this character can be a part of a string. Hash (#) starts a comment if it's the first
|
|
/// character in a token; otherwise it is considered a string character. See issue #953.
|
|
fn tok_is_string_character(c: char, next: Option<char>) -> bool {
|
|
match c {
|
|
// Unconditional separators.
|
|
'\0' | ' ' | '\n' | '|' | '\t' | ';' | '\r' | '<' | '>' => false,
|
|
'&' => {
|
|
if feature_test(FeatureFlag::ampersand_nobg_in_token) {
|
|
// Unlike in other shells, '&' is not special if followed by a string character.
|
|
next.map(|nc| tok_is_string_character(nc, None))
|
|
.unwrap_or(false)
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
_ => true,
|
|
}
|
|
}
|
|
|
|
/// Cheap check for the most common characters with no special meaning to the tokenizer: the
/// ASCII letters. It gives read_string a fast path and is deliberately not a replacement for
/// iswalpha.
fn myal(c: char) -> bool {
    c.is_ascii_alphabetic()
}
|
|
|
|
/// Bit set describing which constructs the string reader is currently inside of.
#[derive(Clone, Copy, PartialEq, Eq)]
struct TokModes(u8);

/// Regular text: no mode bit is set.
const TOK_MODE_REGULAR_TEXT: TokModes = TokModes(0);
/// Inside of subshell parentheses.
const TOK_MODE_SUBSHELL: TokModes = TokModes(0b0001);
/// Inside of array brackets.
const TOK_MODE_ARRAY_BRACKETS: TokModes = TokModes(0b0010);
/// Inside of curly braces.
const TOK_MODE_CURLY_BRACES: TokModes = TokModes(0b0100);
/// The next character is escaped.
const TOK_MODE_CHAR_ESCAPE: TokModes = TokModes(0b1000);

/// `a & b` is a membership test: true when the two mode sets share at least one bit.
impl BitAnd for TokModes {
    type Output = bool;
    fn bitand(self, rhs: Self) -> Self::Output {
        let shared_bits = self.0 & rhs.0;
        shared_bits != 0
    }
}

/// `a &= b` keeps only the bits that are also set in `b`.
impl BitAndAssign for TokModes {
    fn bitand_assign(&mut self, rhs: Self) {
        self.0 = self.0 & rhs.0;
    }
}

/// `a |= b` adds the bits of `b` to `a`.
impl BitOrAssign for TokModes {
    fn bitor_assign(&mut self, rhs: Self) {
        self.0 = self.0 | rhs.0;
    }
}

/// `!a` flips every bit, which is how a mode is cleared: `mode &= !SOME_MODE`.
impl Not for TokModes {
    type Output = TokModes;
    fn not(self) -> Self::Output {
        let TokModes(bits) = self;
        TokModes(!bits)
    }
}
|
|
|
|
/// Tests if this character can delimit tokens.
|
|
pub fn is_token_delimiter(c: char, next: Option<char>) -> bool {
|
|
c == '(' || !tok_is_string_character(c, next)
|
|
}
|
|
|
|
fn is_token_delimiter_ffi(c: wchar_t, next: SharedPtr<wchar_t>) -> bool {
|
|
is_token_delimiter(
|
|
c.try_into().unwrap(),
|
|
next.as_ref().map(|c| (*c).try_into().unwrap()),
|
|
)
|
|
}
|
|
|
|
/// \return the_ffi first token from the string, skipping variable assignments like A=B.
|
|
pub fn tok_command(str: &wstr) -> WString {
|
|
let mut t = Tokenizer::new(str, TokFlags(0));
|
|
while let Some(token) = t.next() {
|
|
if token.type_ != TokenType::string {
|
|
return WString::new();
|
|
}
|
|
let text = t.text_of(&token);
|
|
if variable_assignment_equals_pos(text).is_some() {
|
|
continue;
|
|
}
|
|
return text.to_owned();
|
|
}
|
|
WString::new()
|
|
}
|
|
fn tok_command_ffi(str: &CxxWString) -> UniquePtr<CxxWString> {
|
|
tok_command(str.as_wstr()).to_ffi()
|
|
}
|
|
|
|
impl TryFrom<&wstr> for PipeOrRedir {
|
|
type Error = ();
|
|
|
|
/// Examples of supported syntaxes.
|
|
/// Note we are only responsible for parsing the redirection part, not 'cmd' or 'file'.
|
|
///
|
|
/// cmd | cmd normal pipe
|
|
/// cmd &| cmd normal pipe plus stderr-merge
|
|
/// cmd >| cmd pipe with explicit fd
|
|
/// cmd 2>| cmd pipe with explicit fd
|
|
/// cmd < file stdin redirection
|
|
/// cmd > file redirection
|
|
/// cmd >> file appending redirection
|
|
/// cmd >? file noclobber redirection
|
|
/// cmd >>? file appending noclobber redirection
|
|
/// cmd 2> file file redirection with explicit fd
|
|
/// cmd >&2 fd redirection with no explicit src fd (stdout is used)
|
|
/// cmd 1>&2 fd redirection with an explicit src fd
|
|
/// cmd <&2 fd redirection with no explicit src fd (stdin is used)
|
|
/// cmd 3<&0 fd redirection with an explicit src fd
|
|
/// cmd &> file redirection with stderr merge
|
|
/// cmd ^ file caret (stderr) redirection, perhaps disabled via feature flags
|
|
/// cmd ^^ file caret (stderr) redirection, perhaps disabled via feature flags
|
|
fn try_from(buff: &wstr) -> Result<PipeOrRedir, ()> {
|
|
// Extract a range of leading fd.
|
|
let mut cursor = buff.chars().take_while(|c| c.is_ascii_digit()).count();
|
|
let fd_buff = &buff[..cursor];
|
|
let has_fd = !fd_buff.is_empty();
|
|
|
|
// Try consuming a given character.
|
|
// Return true if consumed. On success, advances cursor.
|
|
let try_consume = |cursor: &mut usize, c| -> bool {
|
|
if buff.char_at(*cursor) != c {
|
|
false
|
|
} else {
|
|
*cursor += 1;
|
|
true
|
|
}
|
|
};
|
|
|
|
// Like try_consume, but asserts on failure.
|
|
let consume = |cursor: &mut usize, c| {
|
|
assert!(buff.char_at(*cursor) == c, "Failed to consume char");
|
|
*cursor += 1;
|
|
};
|
|
|
|
let c = buff.char_at(cursor);
|
|
let mut result = PipeOrRedir {
|
|
fd: -1,
|
|
is_pipe: false,
|
|
mode: RedirectionMode::overwrite,
|
|
stderr_merge: false,
|
|
consumed: 0,
|
|
};
|
|
match c {
|
|
'|' => {
|
|
if has_fd {
|
|
// Like 123|
|
|
return Err(());
|
|
}
|
|
consume(&mut cursor, '|');
|
|
assert!(
|
|
buff.char_at(cursor) != '|',
|
|
"|| passed as redirection, this should have been handled as 'or' by the caller"
|
|
);
|
|
result.fd = STDOUT_FILENO;
|
|
result.is_pipe = true;
|
|
}
|
|
'>' => {
|
|
consume(&mut cursor, '>');
|
|
if try_consume(&mut cursor, '>') {
|
|
result.mode = RedirectionMode::append;
|
|
}
|
|
if try_consume(&mut cursor, '|') {
|
|
// Note we differ from bash here.
|
|
// Consider `echo foo 2>| bar`
|
|
// In fish, this is a *pipe*. Run bar as a command and attach foo's stderr to bar's
|
|
// stdin, while leaving stdout as tty.
|
|
// In bash, this is a *redirection* to bar as a file. It is like > but ignores
|
|
// noclobber.
|
|
result.is_pipe = true;
|
|
result.fd = if has_fd {
|
|
parse_fd(fd_buff) // like 2>|
|
|
} else {
|
|
STDOUT_FILENO
|
|
}; // like >|
|
|
} else if try_consume(&mut cursor, '&') {
|
|
// This is a redirection to an fd.
|
|
// Note that we allow ">>&", but it's still just writing to the fd - "appending" to
|
|
// it doesn't make sense.
|
|
result.mode = RedirectionMode::fd;
|
|
result.fd = if has_fd {
|
|
parse_fd(fd_buff) // like 1>&2
|
|
} else {
|
|
STDOUT_FILENO // like >&2
|
|
};
|
|
} else {
|
|
// This is a redirection to a file.
|
|
result.fd = if has_fd {
|
|
parse_fd(fd_buff) // like 1> file.txt
|
|
} else {
|
|
STDOUT_FILENO // like > file.txt
|
|
};
|
|
if result.mode != RedirectionMode::append {
|
|
result.mode = RedirectionMode::overwrite;
|
|
}
|
|
// Note 'echo abc >>? file' is valid: it means append and noclobber.
|
|
// But here "noclobber" means the file must not exist, so appending
|
|
// can be ignored.
|
|
if try_consume(&mut cursor, '?') {
|
|
result.mode = RedirectionMode::noclob;
|
|
}
|
|
}
|
|
}
|
|
'<' => {
|
|
consume(&mut cursor, '<');
|
|
if try_consume(&mut cursor, '&') {
|
|
result.mode = RedirectionMode::fd;
|
|
} else {
|
|
result.mode = RedirectionMode::input;
|
|
}
|
|
result.fd = if has_fd {
|
|
parse_fd(fd_buff) // like 1<&3 or 1< /tmp/file.txt
|
|
} else {
|
|
STDIN_FILENO // like <&3 or < /tmp/file.txt
|
|
};
|
|
}
|
|
'&' => {
|
|
consume(&mut cursor, '&');
|
|
if try_consume(&mut cursor, '|') {
|
|
// &| is pipe with stderr merge.
|
|
result.fd = STDOUT_FILENO;
|
|
result.is_pipe = true;
|
|
result.stderr_merge = true;
|
|
} else if try_consume(&mut cursor, '>') {
|
|
result.fd = STDOUT_FILENO;
|
|
result.stderr_merge = true;
|
|
result.mode = RedirectionMode::overwrite;
|
|
if try_consume(&mut cursor, '>') {
|
|
result.mode = RedirectionMode::append; // like &>>
|
|
}
|
|
if try_consume(&mut cursor, '?') {
|
|
result.mode = RedirectionMode::noclob; // like &>? or &>>?
|
|
}
|
|
} else {
|
|
return Err(());
|
|
}
|
|
}
|
|
_ => {
|
|
// Not a redirection.
|
|
return Err(());
|
|
}
|
|
}
|
|
|
|
result.consumed = cursor;
|
|
assert!(
|
|
result.consumed > 0,
|
|
"Should have consumed at least one character on success"
|
|
);
|
|
Ok(result)
|
|
}
|
|
}
|
|
|
|
fn pipe_or_redir_from_string(buff: wcharz_t) -> UniquePtr<PipeOrRedir> {
|
|
match PipeOrRedir::try_from(Into::<&wstr>::into(buff)) {
|
|
Ok(p) => UniquePtr::new(p),
|
|
Err(()) => UniquePtr::null(),
|
|
}
|
|
}
|
|
|
|
impl PipeOrRedir {
|
|
/// \return the oflags (as in open(2)) for this redirection.
|
|
pub fn oflags(&self) -> c_int {
|
|
self.mode.oflags().unwrap_or(-1)
|
|
}
|
|
|
|
// \return if we are "valid". Here "valid" means only that the source fd did not overflow.
|
|
// For example 99999999999> is invalid.
|
|
fn is_valid(&self) -> bool {
|
|
self.fd >= 0
|
|
}
|
|
|
|
// \return the token type for this redirection.
|
|
fn token_type(&self) -> TokenType {
|
|
if self.is_pipe {
|
|
TokenType::pipe
|
|
} else {
|
|
TokenType::redirect
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parse an fd from the non-empty string [start, end), all of which are digits.
|
|
// Return the fd, or -1 on overflow.
|
|
fn parse_fd(s: &wstr) -> RawFd {
|
|
assert!(!s.is_empty());
|
|
let chars: Vec<u8> = s
|
|
.chars()
|
|
.map(|c| {
|
|
assert!(c.is_ascii_digit());
|
|
c as u8
|
|
})
|
|
.collect();
|
|
let s = std::str::from_utf8(chars.as_slice()).unwrap();
|
|
s.parse().unwrap_or(-1)
|
|
}
|
|
|
|
fn new_move_word_state_machine(syl: MoveWordStyle) -> Box<MoveWordStateMachine> {
|
|
Box::new(MoveWordStateMachine::new(syl))
|
|
}
|
|
|
|
impl MoveWordStateMachine {
|
|
pub fn new(style: MoveWordStyle) -> Self {
|
|
MoveWordStateMachine { state: 0, style }
|
|
}
|
|
|
|
pub fn consume_char(&mut self, c: char) -> bool {
|
|
match self.style {
|
|
MoveWordStyle::move_word_style_punctuation => self.consume_char_punctuation(c),
|
|
MoveWordStyle::move_word_style_path_components => self.consume_char_path_components(c),
|
|
MoveWordStyle::move_word_style_whitespace => self.consume_char_whitespace(c),
|
|
_ => panic!(),
|
|
}
|
|
}
|
|
pub fn consume_char_ffi(&mut self, c: wchar_t) -> bool {
|
|
self.consume_char(c.try_into().unwrap())
|
|
}
|
|
|
|
pub fn reset(&mut self) {
|
|
self.state = 0;
|
|
}
|
|
|
|
fn consume_char_punctuation(&mut self, c: char) -> bool {
|
|
const S_ALWAYS_ONE: u8 = 0;
|
|
const S_REST: u8 = 1;
|
|
const S_WHITESPACE_REST: u8 = 2;
|
|
const S_WHITESPACE: u8 = 3;
|
|
const S_ALPHANUMERIC: u8 = 4;
|
|
const S_END: u8 = 5;
|
|
|
|
let mut consumed = false;
|
|
while self.state != S_END && !consumed {
|
|
match self.state {
|
|
S_ALWAYS_ONE => {
|
|
// Always consume the first character.
|
|
consumed = true;
|
|
if c.is_whitespace() {
|
|
self.state = S_WHITESPACE;
|
|
} else if c.is_alphanumeric() {
|
|
self.state = S_ALPHANUMERIC;
|
|
} else {
|
|
// Don't allow switching type (ws->nonws) after non-whitespace and
|
|
// non-alphanumeric.
|
|
self.state = S_REST;
|
|
}
|
|
}
|
|
S_REST => {
|
|
if c.is_whitespace() {
|
|
// Consume only trailing whitespace.
|
|
self.state = S_WHITESPACE_REST;
|
|
} else if c.is_alphanumeric() {
|
|
// Consume only alnums.
|
|
self.state = S_ALPHANUMERIC;
|
|
} else {
|
|
consumed = false;
|
|
self.state = S_END;
|
|
}
|
|
}
|
|
S_WHITESPACE_REST | S_WHITESPACE => {
|
|
// "whitespace" consumes whitespace and switches to alnums,
|
|
// "whitespace_rest" only consumes whitespace.
|
|
if c.is_whitespace() {
|
|
// Consumed whitespace.
|
|
consumed = true;
|
|
} else {
|
|
self.state = if self.state == S_WHITESPACE {
|
|
S_ALPHANUMERIC
|
|
} else {
|
|
S_END
|
|
};
|
|
}
|
|
}
|
|
S_ALPHANUMERIC => {
|
|
if c.is_alphanumeric() {
|
|
consumed = true; // consumed alphanumeric
|
|
} else {
|
|
self.state = S_END;
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
consumed
|
|
}
|
|
|
|
fn consume_char_path_components(&mut self, c: char) -> bool {
|
|
const S_INITIAL_PUNCTUATION: u8 = 0;
|
|
const S_WHITESPACE: u8 = 1;
|
|
const S_SEPARATOR: u8 = 2;
|
|
const S_SLASH: u8 = 3;
|
|
const S_PATH_COMPONENT_CHARACTERS: u8 = 4;
|
|
const S_INITIAL_SEPARATOR: u8 = 5;
|
|
const S_END: u8 = 6;
|
|
|
|
let mut consumed = false;
|
|
while self.state != S_END && !consumed {
|
|
match self.state {
|
|
S_INITIAL_PUNCTUATION => {
|
|
if !is_path_component_character(c) && !c.is_whitespace() {
|
|
self.state = S_INITIAL_SEPARATOR;
|
|
} else {
|
|
if !is_path_component_character(c) {
|
|
consumed = true;
|
|
}
|
|
self.state = S_WHITESPACE;
|
|
}
|
|
}
|
|
S_WHITESPACE => {
|
|
if c.is_whitespace() {
|
|
consumed = true; // consumed whitespace
|
|
} else if c == '/' || is_path_component_character(c) {
|
|
self.state = S_SLASH; // path component
|
|
} else {
|
|
self.state = S_SEPARATOR; // path separator
|
|
}
|
|
}
|
|
S_SEPARATOR => {
|
|
if !c.is_whitespace() && !is_path_component_character(c) {
|
|
consumed = true; // consumed separator
|
|
} else {
|
|
self.state = S_END;
|
|
}
|
|
}
|
|
S_SLASH => {
|
|
if c == '/' {
|
|
consumed = true; // consumed slash
|
|
} else {
|
|
self.state = S_PATH_COMPONENT_CHARACTERS;
|
|
}
|
|
}
|
|
S_PATH_COMPONENT_CHARACTERS => {
|
|
if is_path_component_character(c) {
|
|
consumed = true; // consumed string character except slash
|
|
} else {
|
|
self.state = S_END;
|
|
}
|
|
}
|
|
S_INITIAL_SEPARATOR => {
|
|
if is_path_component_character(c) {
|
|
consumed = true;
|
|
self.state = S_PATH_COMPONENT_CHARACTERS;
|
|
} else if c.is_whitespace() {
|
|
self.state = S_END;
|
|
} else {
|
|
consumed = true;
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
consumed
|
|
}
|
|
|
|
fn consume_char_whitespace(&mut self, c: char) -> bool {
|
|
// Consume a "word" of printable characters plus any leading whitespace.
|
|
const S_ALWAYS_ONE: u8 = 0;
|
|
const S_BLANK: u8 = 1;
|
|
const S_GRAPH: u8 = 2;
|
|
const S_END: u8 = 3;
|
|
|
|
let mut consumed = false;
|
|
while self.state != S_END && !consumed {
|
|
match self.state {
|
|
S_ALWAYS_ONE => {
|
|
consumed = true; // always consume the first character
|
|
// If it's not whitespace, only consume those from here.
|
|
if !c.is_whitespace() {
|
|
self.state = S_GRAPH;
|
|
} else {
|
|
// If it's whitespace, keep consuming whitespace until the graphs.
|
|
self.state = S_BLANK;
|
|
}
|
|
}
|
|
S_BLANK => {
|
|
if c.is_whitespace() {
|
|
consumed = true; // consumed whitespace
|
|
} else {
|
|
self.state = S_GRAPH;
|
|
}
|
|
}
|
|
S_GRAPH => {
|
|
if !c.is_whitespace() {
|
|
consumed = true; // consumed printable non-space
|
|
} else {
|
|
self.state = S_END;
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
consumed
|
|
}
|
|
}
|
|
|
|
fn is_path_component_character(c: char) -> bool {
|
|
tok_is_string_character(c, None) && !L!("/={,}'\":@").as_char_slice().contains(&c)
|
|
}
|
|
|
|
/// The position of the equal sign in a variable assignment like foo=bar.
|
|
///
|
|
/// Return the location of the equals sign, or none if the string does
|
|
/// not look like a variable assignment like FOO=bar. The detection
|
|
/// works similar as in some POSIX shells: only letters and numbers qre
|
|
/// allowed on the left hand side, no quotes or escaping.
|
|
pub fn variable_assignment_equals_pos(txt: &wstr) -> Option<usize> {
|
|
let mut found_potential_variable = false;
|
|
|
|
// TODO bracket indexing
|
|
for (i, c) in txt.chars().enumerate() {
|
|
if !found_potential_variable {
|
|
if !valid_var_name_char(c) {
|
|
return None;
|
|
}
|
|
found_potential_variable = true;
|
|
} else {
|
|
if c == '=' {
|
|
return Some(i);
|
|
}
|
|
if !valid_var_name_char(c) {
|
|
return None;
|
|
}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
fn variable_assignment_equals_pos_ffi(txt: &CxxWString) -> SharedPtr<usize> {
|
|
match variable_assignment_equals_pos(txt.as_wstr()) {
|
|
Some(p) => SharedPtr::new(p),
|
|
None => SharedPtr::null(),
|
|
}
|
|
}
|