2016-05-03 21:35:12 +00:00
// A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be
// extended to support marks, tokenizing multiple strings and disposing of unused string segments.
2016-05-18 22:30:21 +00:00
# include "config.h" // IWYU pragma: keep
2016-05-03 21:35:12 +00:00
# include <fcntl.h>
# include <limits.h>
# include <unistd.h>
2005-09-20 13:26:39 +00:00
# include <wchar.h>
# include <wctype.h>
2017-02-11 02:47:02 +00:00
2015-07-25 15:14:25 +00:00
# include <string>
2017-02-11 02:47:02 +00:00
# include <type_traits>
2006-02-28 13:17:16 +00:00
2005-09-20 13:26:39 +00:00
# include "common.h"
2016-05-03 21:35:12 +00:00
# include "fallback.h" // IWYU pragma: keep
2015-07-25 15:14:25 +00:00
# include "tokenizer.h"
2016-05-03 21:35:12 +00:00
# include "wutil.h" // IWYU pragma: keep
2006-07-19 22:55:49 +00:00
2018-03-12 00:36:10 +00:00
// Shared tokenizer error descriptors. Each pairs a message with an optional parser error code
// (omitted where no dedicated code is passed). Error tokens refer to these objects by pointer.
tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L"");

// Errors for constructs that were still open when the input ended.
tokenizer_error *TOK_UNTERMINATED_QUOTE =
    new tokenizer_error(L"Unexpected end of string, quotes are not balanced",
                        parse_error_tokenizer_unterminated_quote);
tokenizer_error *TOK_UNTERMINATED_SUBSHELL =
    new tokenizer_error(L"Unexpected end of string, expecting ')'",
                        parse_error_tokenizer_unterminated_subshell);
tokenizer_error *TOK_UNTERMINATED_SLICE =
    new tokenizer_error(L"Unexpected end of string, square brackets do not match",
                        parse_error_tokenizer_unterminated_slice);
tokenizer_error *TOK_UNTERMINATED_ESCAPE =
    new tokenizer_error(L"Unexpected end of string, incomplete escape sequence",
                        parse_error_tokenizer_unterminated_escape);

// Errors without a dedicated parser error code.
tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error(L"Invalid input/output redirection");
tokenizer_error *TOK_INVALID_PIPE =
    new tokenizer_error(L"Cannot use stdin (fd 0) as pipe output");
tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL =
    new tokenizer_error(L"Unexpected ')' for unopened parenthesis");
tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error(L"Unexpected '[' at this location");
tokenizer_error *TOK_CLOSING_UNOPENED_BRACE =
    new tokenizer_error(L"Unexpected '}' for unopened brace expansion");
tokenizer_error *TOK_UNTERMINATED_BRACE =
    new tokenizer_error(L"Unexpected end of string, incomplete parameter expansion");
tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE =
    new tokenizer_error(L"Unexpected '}' found, expecting ')'");
tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE =
    new tokenizer_error(L"Unexpected ')' found, expecting '}'");
const wchar_t * tokenizer_error : : Message ( ) const {
return _ ( _message ) ;
}
2018-02-24 01:28:12 +00:00
2018-02-23 22:30:15 +00:00
/// Return an error token and mark that we no longer have a next token.
2018-03-12 00:36:10 +00:00
tok_t tokenizer_t : : call_error ( tokenizer_error * error_type , const wchar_t * token_start ,
2018-02-23 22:30:15 +00:00
const wchar_t * error_loc ) {
2018-02-20 00:31:39 +00:00
assert ( error_type ! = TOK_ERROR_NONE & & " TOK_ERROR_NONE passed to call_error " ) ;
2018-02-23 22:30:15 +00:00
assert ( error_loc > = token_start & & " Invalid error location " ) ;
assert ( this - > buff > = token_start & & " Invalid buff location " ) ;
2018-02-19 23:10:10 +00:00
this - > has_next = false ;
2018-02-23 22:30:15 +00:00
tok_t result ;
result . type = TOK_ERROR ;
result . error = error_type ;
result . offset = token_start - this - > start ;
result . length = this - > buff - token_start ;
result . error_offset = error_loc - token_start ;
return result ;
2005-09-20 13:26:39 +00:00
}
2018-02-19 23:10:10 +00:00
/// Creates a tokenizer over the string beginning at `start` (must not be null).
/// `flags` is a bitmask of TOK_ACCEPT_UNFINISHED, TOK_SHOW_COMMENTS and TOK_SHOW_BLANK_LINES;
/// each bit is copied into the corresponding boolean member.
tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
    assert(start != nullptr && "Invalid start");

    accept_unfinished = (flags & TOK_ACCEPT_UNFINISHED) != 0;
    show_comments = (flags & TOK_SHOW_COMMENTS) != 0;
    show_blank_lines = (flags & TOK_SHOW_BLANK_LINES) != 0;
}
2016-05-03 21:35:12 +00:00
bool tokenizer_t : : next ( struct tok_t * result ) {
2015-07-26 06:05:47 +00:00
assert ( result ! = NULL ) ;
2018-02-23 22:30:15 +00:00
maybe_t < tok_t > tok = this - > tok_next ( ) ;
if ( ! tok ) {
2015-07-26 06:05:47 +00:00
return false ;
}
2018-02-23 22:30:15 +00:00
* result = std : : move ( * tok ) ;
2015-07-26 06:05:47 +00:00
return true ;
}
2018-04-01 20:43:05 +00:00
/// Tests whether `c` may appear inside an unquoted string token.
/// Returns false for the token separators: space, tab, newline, carriage return, '|', ';',
/// '<', '>', '&', and the terminating NUL. Returns true for everything else.
static bool tok_is_string_character(wchar_t c) {
    // wcschr() also matches the terminating L'\0' of the set, so NUL is rejected as well.
    return wcschr(L" \n|\t;\r<>&", c) == NULL;
}
2005-09-20 13:26:39 +00:00
2016-05-03 21:35:12 +00:00
/// Fast path used by read_string: reports whether `c` is an ASCII letter (a-z or A-Z). Such
/// characters never have special meaning to the tokenizer. This is deliberately not a
/// replacement for iswalpha.
/// \return 1 if `c` is an ASCII letter, 0 otherwise.
static inline int myal(wchar_t c) {
    const int is_lower = (c >= L'a' && c <= L'z');
    const int is_upper = (c >= L'A' && c <= L'Z');
    return is_lower || is_upper;
}
// Bit flags describing which constructs read_string() is currently inside of. Several flags may
// be set at once; regular_text means none are.
ENUM_FLAGS(tok_mode){
    regular_text = 0,         // plain text, no construct open
    subshell = 1 << 0,        // inside subshell parentheses
    array_brackets = 1 << 1,  // inside array (slice) brackets
    curly_braces = 1 << 2,    // inside curly braces
    char_escape = 1 << 3,     // the previous character was a backslash
};
2005-09-20 13:26:39 +00:00
2016-05-03 21:35:12 +00:00
/// Read the next token as a string.
2018-02-23 22:30:15 +00:00
tok_t tokenizer_t : : read_string ( ) {
2018-03-11 17:13:55 +00:00
tok_mode mode { tok_mode : : regular_text } ;
2018-03-11 00:42:56 +00:00
std : : vector < int > paran_offsets ;
2018-03-12 01:06:45 +00:00
std : : vector < int > brace_offsets ;
2018-03-12 00:36:10 +00:00
std : : vector < char > expecting ;
2018-03-11 00:42:56 +00:00
int slice_offset = 0 ;
2018-02-19 23:10:10 +00:00
const wchar_t * const buff_start = this - > buff ;
2012-11-18 10:23:22 +00:00
2018-03-11 00:42:56 +00:00
while ( true ) {
wchar_t c = * this - > buff ;
# if false
wcstring msg = L " Handling 0x%x (%lc) " ;
tok_mode mode_begin = mode ;
# endif
2018-03-11 22:10:16 +00:00
if ( c = = L ' \0 ' ) {
break ;
}
2018-03-11 00:42:56 +00:00
// Make sure this character isn't being escaped before anything else
if ( ( mode & tok_mode : : char_escape ) = = tok_mode : : char_escape ) {
mode & = ~ ( tok_mode : : char_escape ) ;
// and do nothing more
}
2018-03-11 17:13:55 +00:00
else if ( myal ( c ) ) {
// Early exit optimization in case the character is just a letter,
// which has no special meaning to the tokenizer, i.e. the same mode continues.
}
// Now proceed with the evaluation of the token, first checking to see if the token
// has been explicitly ignored (escaped).
else if ( c = = L ' \\ ' ) {
2018-03-11 22:10:16 +00:00
mode | = tok_mode : : char_escape ;
}
else if ( c = = L ' ( ' ) {
paran_offsets . push_back ( this - > buff - this - > start ) ;
2018-03-12 00:36:10 +00:00
expecting . push_back ( L ' ) ' ) ;
2018-03-11 22:10:16 +00:00
mode | = tok_mode : : subshell ;
}
2018-03-12 00:36:10 +00:00
else if ( c = = L ' { ' ) {
2018-03-12 01:06:45 +00:00
brace_offsets . push_back ( this - > buff - this - > start ) ;
2018-03-12 00:36:10 +00:00
expecting . push_back ( L ' } ' ) ;
mode | = tok_mode : : curly_braces ;
}
2018-03-11 22:10:16 +00:00
else if ( c = = L ' ) ' ) {
2018-03-12 00:36:10 +00:00
if ( expecting . size ( ) > 0 & & expecting . back ( ) = = L ' } ' ) {
return this - > call_error ( TOK_EXPECTED_BCLOSE_FOUND_PCLOSE , this - > start , this - > buff ) ;
}
2018-03-11 22:10:16 +00:00
switch ( paran_offsets . size ( ) ) {
case 0 :
2018-03-11 22:16:53 +00:00
return this - > call_error ( TOK_CLOSING_UNOPENED_SUBSHELL , this - > start , this - > buff ) ;
2018-03-11 22:10:16 +00:00
case 1 :
mode & = ~ ( tok_mode : : subshell ) ;
default :
paran_offsets . pop_back ( ) ;
2012-11-19 00:30:30 +00:00
}
2018-03-12 01:06:45 +00:00
expecting . pop_back ( ) ;
2018-03-11 22:10:16 +00:00
}
2018-03-12 00:36:10 +00:00
else if ( c = = L ' } ' ) {
if ( expecting . size ( ) > 0 & & expecting . back ( ) = = L ' ) ' ) {
return this - > call_error ( TOK_EXPECTED_PCLOSE_FOUND_BCLOSE , this - > start , this - > buff ) ;
}
2018-03-12 01:06:45 +00:00
switch ( brace_offsets . size ( ) ) {
2018-03-12 00:36:10 +00:00
case 0 :
return this - > call_error ( TOK_CLOSING_UNOPENED_BRACE , this - > start , this - > buff ) ;
case 1 :
mode & = ~ ( tok_mode : : curly_braces ) ;
default :
2018-03-12 01:06:45 +00:00
brace_offsets . pop_back ( ) ;
2018-03-12 00:36:10 +00:00
}
2018-03-12 01:06:45 +00:00
expecting . pop_back ( ) ;
2018-03-12 00:36:10 +00:00
}
2018-03-11 22:10:16 +00:00
else if ( c = = L ' [ ' ) {
if ( this - > buff ! = buff_start ) {
if ( ( mode & tok_mode : : array_brackets ) = = tok_mode : : array_brackets ) {
// Nested brackets should not overwrite the existing slice_offset
//mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell
//prints an error message with the caret pointing at token_start,
//not err_loc, making the TOK_ILLEGAL_SLICE message misleading.
// return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff);
2018-03-11 22:16:53 +00:00
return this - > call_error ( TOK_UNTERMINATED_SLICE , this - > start , this - > buff ) ;
2015-08-11 01:30:44 +00:00
}
2018-03-11 22:10:16 +00:00
slice_offset = this - > buff - this - > start ;
mode | = tok_mode : : array_brackets ;
2018-03-11 00:42:56 +00:00
}
2018-03-11 22:10:16 +00:00
else {
// This is actually allowed so the test operator `[` can be used as the head of a command
2018-03-11 00:42:56 +00:00
}
2018-03-11 22:10:16 +00:00
}
// Only exit bracket mode if we are in bracket mode.
// Reason: `]` can be a parameter, e.g. last parameter to `[` test alias.
// e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket
else if ( c = = L ' ] ' & & ( ( mode & tok_mode : : array_brackets ) = = tok_mode : : array_brackets ) ) {
mode & = ~ ( tok_mode : : array_brackets ) ;
}
else if ( c = = L ' \' ' | | c = = L ' " ' ) {
const wchar_t * end = quote_end ( this - > buff ) ;
if ( end ) {
this - > buff = end ;
} else {
const wchar_t * error_loc = this - > buff ;
this - > buff + = wcslen ( this - > buff ) ;
if ( ( ! this - > accept_unfinished ) ) {
return this - > call_error ( TOK_UNTERMINATED_QUOTE , buff_start , error_loc ) ;
2015-08-11 01:30:44 +00:00
}
2018-03-11 00:42:56 +00:00
break ;
}
2018-03-11 22:10:16 +00:00
}
2018-04-01 20:43:05 +00:00
else if ( mode = = tok_mode : : regular_text & & ! tok_is_string_character ( c ) ) {
2018-03-11 22:10:16 +00:00
break ;
}
2012-11-18 10:23:22 +00:00
2018-03-11 00:42:56 +00:00
# if false
if ( mode ! = mode_begin ) {
msg . append ( L " : mode 0x%x -> 0x%x \n " ) ;
} else {
msg . push_back ( L ' \n ' ) ;
}
debug ( 0 , msg . c_str ( ) , c , c , int ( mode_begin ) , int ( mode ) ) ;
# endif
2012-11-18 10:23:22 +00:00
2015-07-26 07:58:32 +00:00
this - > buff + + ;
2012-11-19 00:30:30 +00:00
}
2005-09-20 13:26:39 +00:00
2018-03-11 00:42:56 +00:00
if ( ( ! this - > accept_unfinished ) & & ( mode ! = tok_mode : : regular_text ) ) {
2018-02-23 22:30:15 +00:00
tok_t error ;
2018-03-11 00:42:56 +00:00
if ( ( mode & tok_mode : : char_escape ) = = tok_mode : : char_escape ) {
error = this - > call_error ( TOK_UNTERMINATED_ESCAPE , buff_start ,
2018-03-11 22:10:16 +00:00
this - > buff - 1 ) ;
2018-03-11 00:42:56 +00:00
}
else if ( ( mode & tok_mode : : array_brackets ) = = tok_mode : : array_brackets ) {
error = this - > call_error ( TOK_UNTERMINATED_SLICE , buff_start ,
this - > start + slice_offset ) ;
}
else if ( ( mode & tok_mode : : subshell ) = = tok_mode : : subshell ) {
assert ( paran_offsets . size ( ) > 0 ) ;
size_t offset_of_open_paran = paran_offsets . back ( ) ;
2016-05-03 21:35:12 +00:00
2018-03-11 00:42:56 +00:00
error = this - > call_error ( TOK_UNTERMINATED_SUBSHELL , buff_start ,
this - > start + offset_of_open_paran ) ;
2013-09-11 21:22:16 +00:00
}
2018-03-12 00:36:10 +00:00
else if ( ( mode & tok_mode : : curly_braces ) = = tok_mode : : curly_braces ) {
2018-03-12 01:06:45 +00:00
assert ( brace_offsets . size ( ) > 0 ) ;
size_t offset_of_open_brace = brace_offsets . back ( ) ;
2018-03-12 00:36:10 +00:00
error = this - > call_error ( TOK_UNTERMINATED_BRACE , buff_start ,
this - > start + offset_of_open_brace ) ;
}
2018-02-23 22:30:15 +00:00
return error ;
2012-11-19 00:30:30 +00:00
}
2005-09-20 13:26:39 +00:00
2018-02-23 22:30:15 +00:00
tok_t result ;
result . type = TOK_STRING ;
result . offset = buff_start - this - > start ;
result . length = this - > buff - buff_start ;
return result ;
2005-09-20 13:26:39 +00:00
}
2018-02-23 23:19:58 +00:00
// Reads a redirection or an "fd pipe" (like 2>|) from a string.
// Returns the parsed pipe or redirection, or none() on error.
// Result of parsing a redirection (like '2>&1') or an "fd pipe" (like '2>|').
struct parsed_redir_or_pipe_t {
    // How many characters of the input were consumed.
    size_t consumed = 0;
    // Kind of token parsed: always TOK_PIPE or TOK_REDIRECT.
    token_type type = TOK_REDIRECT;
    // Redirection mode; meaningful when type is TOK_REDIRECT.
    redirection_type_t redirection_mode = redirection_type_t::overwrite;
    // The fd being redirected, or -1 if the number overflowed.
    int fd = 0;
};
2012-11-18 10:23:22 +00:00
2018-02-23 23:19:58 +00:00
/// Parses a redirection or fd pipe at the start of `buff`.
///
/// Accepted shape: optional decimal fd, then '>' (optionally doubled for append) or '<', then an
/// optional '&' (fd redirection), '?' (noclobber) or '|' (pipe).
///
/// \return the parsed description, or none() if `buff` does not start with such a construct. An
///         fd too large for an int is reported as fd == -1 rather than as a parse failure.
static maybe_t<parsed_redir_or_pipe_t> read_redirection_or_fd_pipe(const wchar_t *buff) {
    parsed_redir_or_pipe_t parsed;
    size_t idx = 0;

    // Parse an optional leading fd. All digits must be consumed even after overflow, so the
    // accumulator simply stops growing once it exceeds INT_MAX.
    long long big_fd = 0;
    while (iswdigit(buff[idx])) {
        if (big_fd <= INT_MAX) big_fd = big_fd * 10 + (buff[idx] - L'0');
        idx++;
    }
    parsed.fd = (big_fd > INT_MAX) ? -1 : static_cast<int>(big_fd);
    const bool has_explicit_fd = (idx > 0);

    // We must now be at the redirection character itself. Without an explicit fd, the fd is
    // implied by the direction.
    const wchar_t redirect_char = buff[idx++];
    switch (redirect_char) {
        case L'>': {
            if (!has_explicit_fd) parsed.fd = STDOUT_FILENO;
            parsed.redirection_mode = redirection_type_t::overwrite;
            if (buff[idx] == L'>') {
                // Doubled up like '>>': append.
                parsed.redirection_mode = redirection_type_t::append;
                idx++;
            }
            break;
        }
        case L'<': {
            if (!has_explicit_fd) parsed.fd = STDIN_FILENO;
            parsed.redirection_mode = redirection_type_t::input;
            break;
        }
        default: {
            // Not a redirection at all.
            return none();
        }
    }

    // Optional trailing modifier.
    switch (buff[idx]) {
        case L'&': {
            parsed.redirection_mode = redirection_type_t::fd;
            idx++;
            break;
        }
        case L'?': {
            parsed.redirection_mode = redirection_type_t::noclob;
            idx++;
            break;
        }
        case L'|': {
            // Something like '2>|': this is a pipe, not a redirection.
            parsed.type = TOK_PIPE;
            idx++;
            break;
        }
        default: {
            break;
        }
    }

    parsed.consumed = idx;
    return parsed;
}
2018-02-23 23:19:58 +00:00
/// Interprets `str` as a redirection (pipes are rejected).
/// \param out_fd if non-null, receives the redirected fd on success.
/// \return the redirection mode, or none() if `str` is not a redirection, is an fd pipe, or
///         names an fd that overflowed.
maybe_t<redirection_type_t> redirection_type_for_string(const wcstring &str, int *out_fd) {
    auto parsed = read_redirection_or_fd_pipe(str.c_str());
    const bool is_valid_redirect = parsed && parsed->type == TOK_REDIRECT && parsed->fd >= 0;
    if (!is_valid_redirect) {
        return none();
    }
    if (out_fd != nullptr) {
        *out_fd = parsed->fd;
    }
    return parsed->redirection_mode;
}
2013-12-29 00:18:38 +00:00
2016-05-03 21:35:12 +00:00
int fd_redirected_by_pipe ( const wcstring & str ) {
// Hack for the common case.
if ( str = = L " | " ) {
2013-12-29 00:18:38 +00:00
return STDOUT_FILENO ;
}
2018-02-23 23:19:58 +00:00
auto v = read_redirection_or_fd_pipe ( str . c_str ( ) ) ;
return ( v & & v - > type = = TOK_PIPE ) ? v - > fd : - 1 ;
2013-12-29 00:18:38 +00:00
}
2013-10-13 23:58:40 +00:00
2018-02-23 23:19:58 +00:00
/// Maps a redirection type to the open() flags used for its target file.
/// \return the flag set, or -1 for types that have no associated flags (e.g. fd redirections).
int oflags_for_redirection_type(redirection_type_t type) {
    int oflags = -1;
    switch (type) {
        case redirection_type_t::append:
            oflags = O_CREAT | O_APPEND | O_WRONLY;
            break;
        case redirection_type_t::overwrite:
            oflags = O_CREAT | O_WRONLY | O_TRUNC;
            break;
        case redirection_type_t::noclob:
            oflags = O_CREAT | O_EXCL | O_WRONLY;
            break;
        case redirection_type_t::input:
            oflags = O_RDONLY;
            break;
        default:
            break;
    }
    return oflags;
}
2016-05-03 21:35:12 +00:00
/// Like iswspace(), except that a newline is never considered whitespace. Space, tab and
/// carriage return are answered directly; everything else is delegated to iswspace().
static bool iswspace_not_nl(wchar_t c) {
    if (c == L'\n') {
        return false;
    }
    if (c == L' ' || c == L'\t' || c == L'\r') {
        return true;
    }
    return iswspace(c);
}
2005-09-20 13:26:39 +00:00
2018-02-23 22:30:15 +00:00
/// Produces the next token, or none() once the input is exhausted or an error token has been
/// returned.
///
/// Leading whitespace (other than newlines) and escaped newlines are skipped first. Comments are
/// skipped too, unless show_comments is set, in which case each comment is returned as a
/// TOK_COMMENT token. The character at the resulting position then selects the token type.
maybe_t<tok_t> tokenizer_t::tok_next() {
    if (!this->has_next) {
        return none();
    }

    // Skip whitespace other than newlines. A backslash-newline pair is a line continuation:
    // step over it and remember that the line continues.
    for (;;) {
        const bool escaped_newline = (this->buff[0] == L'\\' && this->buff[1] == L'\n');
        if (escaped_newline) {
            this->buff += 2;
            this->continue_line_after_comment = true;
            continue;
        }
        if (!iswspace_not_nl(this->buff[0])) {
            break;
        }
        this->buff++;
    }

    while (*this->buff == L'#') {
        // Walk over the comment up to (not including) the newline or end of input.
        const wchar_t *comment_start = this->buff;
        while (this->buff[0] != L'\n' && this->buff[0] != L'\0') {
            this->buff++;
        }
        size_t comment_len = this->buff - comment_start;

        // If the line continues after the comment, also swallow the comment's newline.
        if (this->buff[0] == L'\n' && this->continue_line_after_comment) {
            this->buff++;
        }

        if (this->show_comments) {
            tok_t comment_tok;
            comment_tok.type = TOK_COMMENT;
            comment_tok.offset = comment_start - this->start;
            comment_tok.length = comment_len;
            return comment_tok;
        }
        while (iswspace_not_nl(this->buff[0])) {
            this->buff++;
        }
    }

    // We are past the comments and any newlines we wanted to ignore.
    this->continue_line_after_comment = false;

    size_t start_pos = this->buff - this->start;

    tok_t result;
    result.offset = start_pos;

    // Copies a parsed redirection/pipe into `result` and advances past it.
    auto emit_redir_or_pipe = [&](const parsed_redir_or_pipe_t &parsed) {
        result.type = parsed.type;
        result.redirected_fd = parsed.fd;
        result.length = parsed.consumed;
        this->buff += parsed.consumed;
    };

    switch (*this->buff) {
        case L'\0': {
            this->has_next = false;
            return none();
        }
        case L'\r':  // carriage-return
        case L'\n':  // newline
        case L';': {
            result.type = TOK_END;
            result.length = 1;
            this->buff++;
            // Hack: unless blank lines were requested, swallow all following newlines (and
            // the whitespace between them) so that runs of newlines yield a single token.
            if (!this->show_blank_lines) {
                while (*this->buff == L'\n' || *this->buff == L'\r' || *this->buff == L' ' ||
                       *this->buff == L'\t') {
                    this->buff++;
                }
            }
            break;
        }
        case L'&': {
            const bool doubled = (this->buff[1] == L'&');
            result.type = doubled ? TOK_ANDAND : TOK_BACKGROUND;
            result.length = doubled ? 2 : 1;
            this->buff += doubled ? 2 : 1;
            break;
        }
        case L'|': {
            if (this->buff[1] == L'|') {
                result.type = TOK_OROR;
                result.length = 2;
                this->buff += 2;
            } else {
                result.type = TOK_PIPE;
                result.redirected_fd = 1;
                result.length = 1;
                this->buff++;
            }
            break;
        }
        case L'>':
        case L'<': {
            // Unlike the default case below, these must never be parsed as a string: a failed
            // redirection is an error.
            auto redir_or_pipe = read_redirection_or_fd_pipe(this->buff);
            if (!redir_or_pipe || redir_or_pipe->fd < 0) {
                return this->call_error(TOK_INVALID_REDIRECT, this->buff, this->buff);
            }
            emit_redir_or_pipe(*redir_or_pipe);
            break;
        }
        default: {
            // Maybe a redirection like '2>&1', maybe a pipe like '2>|', maybe just a string.
            const wchar_t *error_location = this->buff;
            maybe_t<parsed_redir_or_pipe_t> redir_or_pipe;
            if (iswdigit(*this->buff)) {
                redir_or_pipe = read_redirection_or_fd_pipe(this->buff);
            }

            if (redir_or_pipe && redir_or_pipe->consumed > 0) {
                // It looks like a redirection or a pipe, but piping fd 0 is not supported.
                // The fd may be -1 here (overflow); that is not treated as a tokenizer error.
                if (redir_or_pipe->type == TOK_PIPE && redir_or_pipe->fd == 0) {
                    return this->call_error(TOK_INVALID_PIPE, error_location, error_location);
                }
                emit_redir_or_pipe(*redir_or_pipe);
            } else {
                // Not a redirection or pipe, so just a string.
                result = this->read_string();
            }
            break;
        }
    }
    return result;
}
2016-05-03 21:35:12 +00:00
/// Returns the text of the first token of `str` if that token is a string; otherwise (no tokens,
/// or a first token of another type) returns an empty string.
wcstring tok_first(const wcstring &str) {
    tokenizer_t t(str.c_str(), 0);
    tok_t token;
    if (!t.next(&token)) {
        return wcstring();
    }
    if (token.type != TOK_STRING) {
        return wcstring();
    }
    return t.text_of(token);
}
2016-05-03 21:35:12 +00:00
bool move_word_state_machine_t : : consume_char_punctuation ( wchar_t c ) {
2018-02-25 15:30:15 +00:00
enum { s_always_one = 0 , s_rest , s_whitespace_rest , s_whitespace , s_alphanumeric , s_end } ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
bool consumed = false ;
2016-05-03 21:35:12 +00:00
while ( state ! = s_end & & ! consumed ) {
switch ( state ) {
case s_always_one : {
// Always consume the first character.
2012-12-21 01:37:09 +00:00
consumed = true ;
2018-02-25 15:30:15 +00:00
if ( iswspace ( c ) ) {
state = s_whitespace ;
} else {
// Don't allow switching type (ws->nonws) after non-whitespace.
state = s_rest ;
}
2012-12-21 01:37:09 +00:00
break ;
2016-05-03 21:35:12 +00:00
}
2018-02-25 15:30:15 +00:00
case s_rest : {
if ( iswspace ( c ) ) {
// Consume only trailing whitespace.
state = s_whitespace_rest ;
} else if ( iswalnum ( c ) ) {
// Consume only alnums.
state = s_alphanumeric ;
} else {
consumed = false ;
state = s_end ;
}
break ;
}
case s_whitespace_rest :
2016-05-03 21:35:12 +00:00
case s_whitespace : {
2018-02-25 15:30:15 +00:00
// "whitespace" consumes whitespace and switches to alnums,
// "whitespace_rest" only consumes whitespace.
2016-05-03 21:35:12 +00:00
if ( iswspace ( c ) ) {
// Consumed whitespace.
2012-12-21 01:37:09 +00:00
consumed = true ;
2016-05-03 21:35:12 +00:00
} else {
2018-02-25 15:30:15 +00:00
state = state = = s_whitespace ? s_alphanumeric : s_end ;
2012-12-21 01:37:09 +00:00
}
break ;
2016-05-03 21:35:12 +00:00
}
case s_alphanumeric : {
if ( iswalnum ( c ) ) {
consumed = true ; // consumed alphanumeric
} else {
2012-12-21 01:37:09 +00:00
state = s_end ;
}
break ;
2016-05-03 21:35:12 +00:00
}
2012-12-21 01:37:09 +00:00
case s_end :
2016-05-03 21:35:12 +00:00
default : { break ; }
2012-12-21 01:37:09 +00:00
}
}
return consumed ;
}
2005-09-20 13:26:39 +00:00
2016-05-03 21:35:12 +00:00
bool move_word_state_machine_t : : is_path_component_character ( wchar_t c ) {
2018-04-01 20:43:05 +00:00
return tok_is_string_character ( c ) & & ! wcschr ( L " /= { , } ' \ " " , c ) ;
2012-12-11 00:23:08 +00:00
}
2016-05-03 21:35:12 +00:00
bool move_word_state_machine_t : : consume_char_path_components ( wchar_t c ) {
enum {
2012-12-21 01:37:09 +00:00
s_initial_punctuation ,
s_whitespace ,
s_separator ,
s_slash ,
s_path_component_characters ,
s_end
} ;
2012-12-22 20:21:31 +00:00
2017-01-14 04:34:15 +00:00
// fwprintf(stdout, L"state %d, consume '%lc'\n", state, c);
2012-12-11 00:23:08 +00:00
bool consumed = false ;
2016-05-03 21:35:12 +00:00
while ( state ! = s_end & & ! consumed ) {
switch ( state ) {
case s_initial_punctuation : {
if ( ! is_path_component_character ( c ) ) {
2012-12-21 01:37:09 +00:00
consumed = true ;
}
state = s_whitespace ;
break ;
2016-05-03 21:35:12 +00:00
}
case s_whitespace : {
if ( iswspace ( c ) ) {
consumed = true ; // consumed whitespace
} else if ( c = = L ' / ' | | is_path_component_character ( c ) ) {
state = s_slash ; // path component
} else {
state = s_separator ; // path separator
2012-12-11 00:23:08 +00:00
}
break ;
2016-05-03 21:35:12 +00:00
}
case s_separator : {
if ( ! iswspace ( c ) & & ! is_path_component_character ( c ) ) {
consumed = true ; // consumed separator
2016-05-03 23:23:30 +00:00
} else {
2012-12-11 00:23:08 +00:00
state = s_end ;
}
break ;
2016-05-03 21:35:12 +00:00
}
case s_slash : {
if ( c = = L ' / ' ) {
consumed = true ; // consumed slash
} else {
2012-12-21 01:37:09 +00:00
state = s_path_component_characters ;
2012-12-11 00:23:08 +00:00
}
break ;
2016-05-03 21:35:12 +00:00
}
case s_path_component_characters : {
if ( is_path_component_character ( c ) ) {
consumed = true ; // consumed string character except slash
} else {
2012-12-11 00:23:08 +00:00
state = s_end ;
}
break ;
2016-05-03 21:35:12 +00:00
}
2012-12-11 00:23:08 +00:00
case s_end :
2016-11-03 01:29:14 +00:00
default : { break ; }
2012-12-11 00:23:08 +00:00
}
}
return consumed ;
}
2016-05-03 21:35:12 +00:00
bool move_word_state_machine_t : : consume_char_whitespace ( wchar_t c ) {
enum { s_always_one = 0 , s_blank , s_graph , s_end } ;
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
bool consumed = false ;
2016-05-03 21:35:12 +00:00
while ( state ! = s_end & & ! consumed ) {
switch ( state ) {
case s_always_one : {
consumed = true ; // always consume the first character
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
state = s_blank ;
break ;
2016-05-03 21:35:12 +00:00
}
case s_blank : {
if ( iswblank ( c ) ) {
consumed = true ; // consumed whitespace
} else {
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
state = s_graph ;
}
break ;
2016-05-03 21:35:12 +00:00
}
case s_graph : {
if ( iswgraph ( c ) ) {
consumed = true ; // consumed printable non-space
} else {
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
state = s_end ;
}
break ;
2016-05-03 21:35:12 +00:00
}
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
case s_end :
2016-05-03 21:35:12 +00:00
default : { break ; }
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
}
}
return consumed ;
}
2016-05-03 21:35:12 +00:00
bool move_word_state_machine_t : : consume_char ( wchar_t c ) {
switch ( style ) {
case move_word_style_punctuation : {
2012-12-22 20:21:31 +00:00
return consume_char_punctuation ( c ) ;
2016-05-03 21:35:12 +00:00
}
case move_word_style_path_components : {
2012-12-22 20:21:31 +00:00
return consume_char_path_components ( c ) ;
2016-05-03 21:35:12 +00:00
}
case move_word_style_whitespace : {
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
return consume_char_whitespace ( c ) ;
2016-05-03 21:35:12 +00:00
}
2012-12-21 01:37:09 +00:00
}
2016-11-07 01:48:26 +00:00
DIE ( " should not reach this statement " ) ; // silence some compiler errors about not returning
2012-12-21 01:37:09 +00:00
}
2016-05-03 21:35:12 +00:00
/// Creates a state machine for the given word style, starting in state 0 (the initial state of
/// every consume_char_* method).
move_word_state_machine_t::move_word_state_machine_t(move_word_style_t syl)
    : state(0), style(syl) {
}
2012-12-21 01:37:09 +00:00
2016-05-03 21:35:12 +00:00
void move_word_state_machine_t : : reset ( ) { state = 0 ; }