2005-09-20 13:26:39 +00:00
/** \file tokenizer.c
2006-10-01 15:59:18 +00:00
A specialized tokenizer for tokenizing the fish language. In the
future, the tokenizer should be extended to support marks,
tokenizing multiple strings and disposing of unused string
segments.
2005-09-20 13:26:39 +00:00
*/
# include "config.h"
# include <stdlib.h>
# include <stdio.h>
# include <wchar.h>
# include <wctype.h>
# include <string.h>
# include <unistd.h>
2006-02-28 13:17:16 +00:00
# include "fallback.h"
2005-09-20 13:26:39 +00:00
# include "util.h"
2006-02-28 13:17:16 +00:00
2005-09-20 13:26:39 +00:00
# include "wutil.h"
# include "tokenizer.h"
# include "common.h"
2006-07-19 22:55:49 +00:00
2012-02-17 23:55:54 +00:00
/* Report a tokenizer error on tokenizer t with error code e by calling
   tok_call_error(). If t->squash_errors is set, an empty string is passed
   instead of the message x, and x is then never evaluated. Note that t is
   evaluated twice. The do { } while (0) wrapper makes the expansion behave
   as a single statement. */
# define TOK_CALL_ERROR(t, e, x) do { tok_call_error((t), (e), (t)->squash_errors ? L"" : (x)); } while (0)
2005-09-20 13:26:39 +00:00
/**
Error string for unexpected end of string
*/
2006-11-17 14:59:05 +00:00
# define QUOTE_ERROR _( L"Unexpected end of string, quotes are not balanced" )
2006-10-07 00:56:25 +00:00
2005-09-20 13:26:39 +00:00
/**
Error string for mismatched parentheses
*/
2006-11-17 14:59:05 +00:00
# define PARAN_ERROR _( L"Unexpected end of string, parenthesis do not match" )
2006-06-21 00:48:36 +00:00
2013-09-11 21:22:16 +00:00
/**
Error string for mismatched square brackets
*/
# define SQUARE_BRACKET_ERROR _( L"Unexpected end of string, square brackets do not match" )
2005-09-20 13:26:39 +00:00
/**
Error string for invalid redirections
*/
2006-11-17 14:59:05 +00:00
# define REDIRECT_ERROR _( L"Invalid input / output redirection" )
2005-09-20 13:26:39 +00:00
2005-10-07 14:08:57 +00:00
/**
Error string for when trying to pipe from fd 0
*/
2013-10-13 20:26:52 +00:00
# define PIPE_ERROR _( L"Cannot use stdin (fd 0) as pipe output" )
2005-10-07 14:08:57 +00:00
2005-09-20 13:26:39 +00:00
/**
2006-10-01 15:59:18 +00:00
Characters that separate tokens. They are ordered by frequency of occurrence to increase parsing speed.
2005-09-20 13:26:39 +00:00
*/
2005-10-26 10:51:02 +00:00
# define SEP L" \n|\t;#\r<>^&"
2005-10-08 02:00:08 +00:00
2006-01-23 20:40:14 +00:00
/**
Descriptions of all tokenizer errors
*/
2006-01-04 12:51:02 +00:00
static const wchar_t * tok_desc [ ] =
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
N_ ( L " Tokenizer not yet initialized " ) ,
N_ ( L " Tokenizer error " ) ,
N_ ( L " String " ) ,
N_ ( L " Pipe " ) ,
N_ ( L " End of command " ) ,
N_ ( L " Redirect output to file " ) ,
N_ ( L " Append output to file " ) ,
N_ ( L " Redirect input to file " ) ,
N_ ( L " Redirect to file descriptor " ) ,
N_ ( L " Redirect output to file if file does not exist " ) ,
N_ ( L " Run job in background " ) ,
N_ ( L " Comment " )
2012-11-22 06:09:35 +00:00
} ;
2005-09-20 13:26:39 +00:00
2013-12-09 02:16:55 +00:00
2005-09-20 13:26:39 +00:00
/**
Set the latest tokens string to be the specified error message
*/
2012-11-22 01:48:35 +00:00
static void tok_call_error ( tokenizer_t * tok , int error_type , const wchar_t * error_message )
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
tok - > last_type = TOK_ERROR ;
tok - > error = error_type ;
2012-11-22 06:09:35 +00:00
tok - > last_token = error_message ;
2005-09-20 13:26:39 +00:00
}
2012-11-22 01:48:35 +00:00
/**
   Return the error code stored in the tokenizer (the value last written
   by tok_call_error, or the constructor's initial value).
*/
int tok_get_error(tokenizer_t *tok)
{
    const int code = tok->error;
    return code;
}
2013-09-30 20:57:36 +00:00
/**
   Construct a tokenizer over the string b and immediately read the first
   token. The string is not copied; the tokenizer keeps pointers into it.
   flags may combine TOK_ACCEPT_UNFINISHED, TOK_SHOW_COMMENTS and
   TOK_SQUASH_ERRORS.
*/
tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) :
    buff(NULL),
    orig_buff(NULL),
    last_type(TOK_NONE),
    last_pos(0),
    has_next(false),
    accept_unfinished(false),
    show_comments(false),
    last_quote(0),
    error(0),
    squash_errors(false),
    cached_lineno_offset(0),
    cached_lineno_count(0)
{
    CHECK(b,);

    /* Decode the option flags */
    accept_unfinished = (flags & TOK_ACCEPT_UNFINISHED) != 0;
    show_comments = (flags & TOK_SHOW_COMMENTS) != 0;
    squash_errors = (flags & TOK_SQUASH_ERRORS) != 0;

    /* Point at the start of the input; an empty string has no tokens */
    buff = b;
    orig_buff = b;
    has_next = (b[0] != L'\0');

    /* Prime the tokenizer with the first token */
    tok_next(this);
}
2013-09-30 20:57:36 +00:00
/**
   Return the type of the most recently read token. Yields TOK_ERROR if
   either the tokenizer or its buffer fails the CHECK.
*/
enum token_type tok_last_type(tokenizer_t *tok)
{
    CHECK(tok, TOK_ERROR);
    CHECK(tok->buff, TOK_ERROR);

    const enum token_type type = tok->last_type;
    return type;
}
2012-11-22 06:09:35 +00:00
/**
   Return the text of the most recently read token. The pointer refers to
   storage owned by the tokenizer's last_token string.
*/
const wchar_t *tok_last(tokenizer_t *tok)
{
    CHECK(tok, 0);

    const wchar_t *text = tok->last_token.c_str();
    return text;
}
2012-11-22 01:48:35 +00:00
/**
   Return non-zero if the tokenizer reports more tokens. If the tokenizer
   or its buffer fails the CHECK, 1 is returned.
*/
int tok_has_next(tokenizer_t *tok)
{
    /* Return 1 on broken tokenizer */
    CHECK(tok, 1);
    CHECK(tok->buff, 1);

    return tok->has_next ? 1 : 0;
}
2012-11-22 01:48:35 +00:00
int tokenizer_t : : line_number_of_character_at_offset ( size_t offset )
2012-08-05 00:44:14 +00:00
{
// we want to return (one plus) the number of newlines at offsets less than the given offset
// cached_lineno_count is the number of newlines at indexes less than cached_lineno_offset
const wchar_t * str = orig_buff ;
2012-11-19 00:30:30 +00:00
if ( ! str )
return 0 ;
2012-11-18 10:23:22 +00:00
2012-08-05 00:44:14 +00:00
// easy hack to handle 0
if ( offset = = 0 )
return 1 ;
2012-11-18 10:23:22 +00:00
2012-08-05 00:44:14 +00:00
size_t i ;
if ( offset > cached_lineno_offset )
{
2012-11-19 00:30:30 +00:00
for ( i = cached_lineno_offset ; str [ i ] & & i < offset ; i + + )
2012-08-05 00:44:14 +00:00
{
/* Add one for every newline we find in the range [cached_lineno_offset, offset) */
2012-11-19 00:30:30 +00:00
if ( str [ i ] = = L ' \n ' )
2012-08-05 00:44:14 +00:00
cached_lineno_count + + ;
}
cached_lineno_offset = i ; //note: i, not offset, in case offset is beyond the length of the string
}
else if ( offset < cached_lineno_offset )
{
/* Subtract one for every newline we find in the range [offset, cached_lineno_offset) */
for ( i = offset ; i < cached_lineno_offset ; i + + )
{
if ( str [ i ] = = L ' \n ' )
cached_lineno_count - - ;
}
cached_lineno_offset = offset ;
}
2012-11-19 00:30:30 +00:00
return cached_lineno_count + 1 ;
2012-08-05 00:44:14 +00:00
}
2005-09-20 13:26:39 +00:00
/**
   Tests if this character can be a part of a string. The redirect ^ is
   allowed unless it's the first character.
*/
bool tok_is_string_character(wchar_t c, bool is_first)
{
    /* Conditional separator */
    if (c == L'^')
    {
        return !is_first;
    }

    /* Unconditional separators. wcschr also matches the terminating NUL
       of the set, so L'\0' is rejected as well. */
    return wcschr(L" \n|\t;#\r<>&", c) == NULL;
}
2005-09-20 13:26:39 +00:00
2005-10-26 10:51:02 +00:00
/**
   Cheap test for the unaccented Latin letters a-z and A-Z. read_string
   uses it as a fast path for the most common non-special characters.
   It is not a replacement for iswalpha.
*/
static int myal(wchar_t c)
{
    const int is_lower = (L'a' <= c && c <= L'z');
    const int is_upper = (L'A' <= c && c <= L'Z');
    return is_lower || is_upper;
}
/**
Read the next token as a string
*/
2012-11-22 01:48:35 +00:00
static void read_string ( tokenizer_t * tok )
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
const wchar_t * start ;
long len ;
int do_loop = 1 ;
int paran_count = 0 ;
2005-09-20 13:26:39 +00:00
2012-11-19 00:30:30 +00:00
start = tok - > buff ;
2012-07-11 03:30:54 +00:00
bool is_first = true ;
2012-11-18 10:23:22 +00:00
2012-11-23 19:12:22 +00:00
enum tok_mode_t
{
2012-11-22 01:48:35 +00:00
mode_regular_text = 0 , // regular text
mode_subshell = 1 , // inside of subshell
mode_array_brackets = 2 , // inside of array brackets
mode_array_brackets_and_subshell = 3 // inside of array brackets and subshell, like in '$foo[(ech'
} mode = mode_regular_text ;
2012-11-19 00:30:30 +00:00
while ( 1 )
2012-11-18 10:23:22 +00:00
{
2012-11-19 00:30:30 +00:00
if ( ! myal ( * tok - > buff ) )
2012-11-18 10:23:22 +00:00
{
2012-11-19 00:30:30 +00:00
if ( * tok - > buff = = L ' \\ ' )
{
tok - > buff + + ;
if ( * tok - > buff = = L ' \0 ' )
{
if ( ( ! tok - > accept_unfinished ) )
{
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_ESCAPE , QUOTE_ERROR ) ;
return ;
}
else
{
2012-11-20 22:51:30 +00:00
/* Since we are about to increment tok->buff, decrement it first so the increment doesn't go past the end of the buffer. https://github.com/fish-shell/fish-shell/issues/389 */
tok - > buff - - ;
2012-11-22 01:48:35 +00:00
do_loop = 0 ;
2012-11-19 00:30:30 +00:00
}
}
2012-11-18 10:23:22 +00:00
2012-11-19 00:30:30 +00:00
tok - > buff + + ;
continue ;
}
2012-11-23 19:12:22 +00:00
2012-11-19 00:30:30 +00:00
switch ( mode )
2012-11-18 10:23:22 +00:00
{
2012-11-22 01:48:35 +00:00
case mode_regular_text :
2012-11-19 00:30:30 +00:00
{
2012-11-19 08:31:03 +00:00
switch ( * tok - > buff )
{
case L ' ( ' :
{
paran_count = 1 ;
2012-11-22 01:48:35 +00:00
mode = mode_subshell ;
2012-11-19 08:31:03 +00:00
break ;
}
2012-11-18 10:23:22 +00:00
2012-11-19 08:31:03 +00:00
case L ' [ ' :
{
if ( tok - > buff ! = start )
2012-11-22 01:48:35 +00:00
mode = mode_array_brackets ;
2012-11-19 08:31:03 +00:00
break ;
}
2012-11-18 10:23:22 +00:00
2012-11-19 08:31:03 +00:00
case L ' \' ' :
case L ' " ' :
{
2012-11-18 10:23:22 +00:00
2012-11-19 08:31:03 +00:00
const wchar_t * end = quote_end ( tok - > buff ) ;
tok - > last_quote = * tok - > buff ;
if ( end )
{
tok - > buff = ( wchar_t * ) end ;
}
else
{
tok - > buff + = wcslen ( tok - > buff ) ;
2012-11-22 01:48:35 +00:00
if ( ! tok - > accept_unfinished )
2012-11-19 08:31:03 +00:00
{
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_QUOTE , QUOTE_ERROR ) ;
return ;
}
do_loop = 0 ;
}
break ;
}
2012-11-19 00:30:30 +00:00
2012-11-19 08:31:03 +00:00
default :
2012-11-19 00:30:30 +00:00
{
2012-11-22 01:48:35 +00:00
if ( ! tok_is_string_character ( * ( tok - > buff ) , is_first ) )
2012-11-19 08:31:03 +00:00
{
do_loop = 0 ;
}
2012-11-19 00:30:30 +00:00
}
}
break ;
}
2012-11-22 01:48:35 +00:00
case mode_array_brackets_and_subshell :
case mode_subshell :
2012-11-19 08:31:03 +00:00
switch ( * tok - > buff )
2012-11-19 00:30:30 +00:00
{
2012-11-19 08:31:03 +00:00
case L ' \' ' :
case L ' \" ' :
2012-11-19 00:30:30 +00:00
{
2012-11-19 08:31:03 +00:00
const wchar_t * end = quote_end ( tok - > buff ) ;
if ( end )
{
tok - > buff = ( wchar_t * ) end ;
}
else
{
tok - > buff + = wcslen ( tok - > buff ) ;
if ( ( ! tok - > accept_unfinished ) )
{
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_QUOTE , QUOTE_ERROR ) ;
return ;
}
do_loop = 0 ;
}
break ;
2012-11-19 00:30:30 +00:00
}
2012-11-18 10:23:22 +00:00
2012-11-19 08:31:03 +00:00
case L ' ( ' :
paran_count + + ;
break ;
case L ' ) ' :
paran_count - - ;
if ( paran_count = = 0 )
{
2012-11-22 01:48:35 +00:00
mode = ( mode = = mode_array_brackets_and_subshell ? mode_array_brackets : mode_regular_text ) ;
2012-11-19 08:31:03 +00:00
}
break ;
case L ' \0 ' :
do_loop = 0 ;
break ;
}
2012-11-19 00:30:30 +00:00
break ;
2012-11-23 19:12:22 +00:00
2012-11-22 01:48:35 +00:00
case mode_array_brackets :
2012-11-19 08:31:03 +00:00
switch ( * tok - > buff )
2012-11-19 00:30:30 +00:00
{
2012-11-19 08:31:03 +00:00
case L ' ( ' :
paran_count = 1 ;
2012-11-22 01:48:35 +00:00
mode = mode_array_brackets_and_subshell ;
2012-11-19 08:31:03 +00:00
break ;
case L ' ] ' :
2012-11-22 01:48:35 +00:00
mode = mode_regular_text ;
2012-11-19 08:31:03 +00:00
break ;
case L ' \0 ' :
do_loop = 0 ;
break ;
2012-11-19 00:30:30 +00:00
}
break ;
2012-11-18 10:23:22 +00:00
}
2012-11-19 00:30:30 +00:00
}
2012-11-18 10:23:22 +00:00
2012-11-19 00:30:30 +00:00
if ( ! do_loop )
break ;
2012-11-18 10:23:22 +00:00
2012-11-19 00:30:30 +00:00
tok - > buff + + ;
2012-07-11 03:30:54 +00:00
is_first = false ;
2012-11-19 00:30:30 +00:00
}
2005-09-20 13:26:39 +00:00
2012-11-22 01:48:35 +00:00
if ( ( ! tok - > accept_unfinished ) & & ( mode ! = mode_regular_text ) )
2012-11-19 00:30:30 +00:00
{
2013-09-11 21:22:16 +00:00
switch ( mode )
{
case mode_subshell :
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_SUBSHELL , PARAN_ERROR ) ;
break ;
case mode_array_brackets :
case mode_array_brackets_and_subshell :
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_SUBSHELL , SQUARE_BRACKET_ERROR ) ; // TOK_UNTERMINATED_SUBSHELL is a lie but nobody actually looks at it
break ;
default :
assert ( 0 & & " Unexpected mode in read_string " ) ;
break ;
}
2012-11-19 00:30:30 +00:00
return ;
}
2005-09-20 13:26:39 +00:00
2012-11-19 00:30:30 +00:00
len = tok - > buff - start ;
2005-09-20 13:26:39 +00:00
2012-11-22 06:09:35 +00:00
tok - > last_token . assign ( start , len ) ;
2012-11-19 00:30:30 +00:00
tok - > last_type = TOK_STRING ;
2005-09-20 13:26:39 +00:00
}
/**
Read the next token as a comment .
*/
2012-11-22 01:48:35 +00:00
static void read_comment ( tokenizer_t * tok )
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
const wchar_t * start ;
2005-09-20 13:26:39 +00:00
2012-11-19 00:30:30 +00:00
start = tok - > buff ;
while ( * ( tok - > buff ) ! = L ' \n ' & & * ( tok - > buff ) ! = L ' \0 ' )
tok - > buff + + ;
2012-11-23 19:12:22 +00:00
2005-09-20 13:26:39 +00:00
2012-11-19 00:30:30 +00:00
size_t len = tok - > buff - start ;
2012-11-22 06:09:35 +00:00
tok - > last_token . assign ( start , len ) ;
2012-11-19 00:30:30 +00:00
tok - > last_type = TOK_COMMENT ;
2005-09-20 13:26:39 +00:00
}
2013-10-13 23:58:40 +00:00
2013-10-13 20:26:52 +00:00
/* Reads a redirection or an "fd pipe" (like 2>|) from a string. Returns how many characters were consumed. If zero, then this string was not a redirection.
2013-10-13 23:58:40 +00:00
Also returns by reference the redirection mode , and the fd to redirection . If there is overflow , * out_fd is set to - 1.
2005-09-20 13:26:39 +00:00
*/
2013-10-13 20:26:52 +00:00
static size_t read_redirection_or_fd_pipe ( const wchar_t * buff , enum token_type * out_redirection_mode , int * out_fd )
2005-09-20 13:26:39 +00:00
{
2013-10-13 20:26:52 +00:00
bool errored = false ;
int fd = 0 ;
2013-09-30 20:57:36 +00:00
enum token_type redirection_mode = TOK_NONE ;
2012-11-18 10:23:22 +00:00
2013-10-13 20:26:52 +00:00
size_t idx = 0 ;
2013-10-13 23:58:40 +00:00
/* Determine the fd. This may be specified as a prefix like '2>...' or it may be implicit like '>' or '^'. Try parsing out a number; if we did not get any digits then infer it from the first character. Watch out for overflow. */
long long big_fd = 0 ;
2013-10-13 20:26:52 +00:00
for ( ; iswdigit ( buff [ idx ] ) ; idx + + )
2012-11-18 10:23:22 +00:00
{
2013-10-13 23:58:40 +00:00
/* Note that it's important we consume all the digits here, even if it overflows. */
if ( big_fd < = INT_MAX )
big_fd = big_fd * 10 + ( buff [ idx ] - L ' 0 ' ) ;
2013-10-13 20:26:52 +00:00
}
2013-10-13 23:58:40 +00:00
fd = ( big_fd > INT_MAX ? - 1 : static_cast < int > ( big_fd ) ) ;
2013-10-13 20:26:52 +00:00
if ( idx = = 0 )
{
/* We did not find a leading digit, so there's no explicit fd. Infer it from the type */
switch ( buff [ idx ] )
2012-11-19 00:30:30 +00:00
{
2013-10-13 20:26:52 +00:00
case L ' > ' : fd = STDOUT_FILENO ; break ;
case L ' < ' : fd = STDIN_FILENO ; break ;
case L ' ^ ' : fd = STDERR_FILENO ; break ;
default : errored = true ; break ;
2012-11-19 00:30:30 +00:00
}
2013-10-13 20:26:52 +00:00
}
/* Either way we should have ended on the redirection character itself like '>' */
wchar_t redirect_char = buff [ idx + + ] ; //note increment of idx
if ( redirect_char = = L ' > ' | | redirect_char = = L ' ^ ' )
{
redirection_mode = TOK_REDIRECT_OUT ;
if ( buff [ idx ] = = redirect_char )
2012-11-19 00:30:30 +00:00
{
2013-10-13 20:26:52 +00:00
/* Doubled up like ^^ or >>. That means append */
redirection_mode = TOK_REDIRECT_APPEND ;
idx + + ;
2012-11-19 00:30:30 +00:00
}
}
2013-10-13 20:26:52 +00:00
else if ( redirect_char = = L ' < ' )
2012-11-19 00:30:30 +00:00
{
2013-09-30 20:57:36 +00:00
redirection_mode = TOK_REDIRECT_IN ;
2012-11-18 10:23:22 +00:00
}
else
{
2013-10-13 20:26:52 +00:00
/* Something else */
errored = true ;
2012-11-18 10:23:22 +00:00
}
2013-10-13 20:26:52 +00:00
/* Optional characters like & or ?, or the pipe char | */
wchar_t opt_char = buff [ idx ] ;
if ( opt_char = = L ' & ' )
2012-11-19 00:30:30 +00:00
{
2013-10-13 20:26:52 +00:00
redirection_mode = TOK_REDIRECT_FD ;
idx + + ;
2012-11-19 00:30:30 +00:00
}
2013-10-13 20:26:52 +00:00
else if ( opt_char = = L ' ? ' )
2012-11-19 00:30:30 +00:00
{
2013-10-13 20:26:52 +00:00
redirection_mode = TOK_REDIRECT_NOCLOB ;
idx + + ;
2012-11-19 00:30:30 +00:00
}
2013-10-13 20:26:52 +00:00
else if ( opt_char = = L ' | ' )
{
/* So the string looked like '2>|'. This is not a redirection - it's a pipe! That gets handled elsewhere. */
redirection_mode = TOK_PIPE ;
idx + + ;
}
/* Don't return valid-looking stuff on error */
if ( errored )
2012-11-19 00:30:30 +00:00
{
2013-10-13 20:26:52 +00:00
idx = 0 ;
redirection_mode = TOK_NONE ;
2012-11-19 00:30:30 +00:00
}
2013-10-13 20:26:52 +00:00
/* Return stuff */
if ( out_redirection_mode ! = NULL )
* out_redirection_mode = redirection_mode ;
if ( out_fd ! = NULL )
* out_fd = fd ;
return idx ;
2005-09-20 13:26:39 +00:00
}
2013-10-13 23:58:40 +00:00
/**
   Return the redirection type that str begins with, or TOK_NONE if it
   does not begin with a redirection. Fd pipes (like 2>|) and fds that
   overflow an int are reported as TOK_NONE.
*/
enum token_type redirection_type_for_string(const wcstring &str)
{
    enum token_type parsed_mode = TOK_NONE;
    int parsed_fd = 0;
    read_redirection_or_fd_pipe(str.c_str(), &parsed_mode, &parsed_fd);

    /* Redirections only, no pipes */
    const bool is_plain_redirection = (parsed_mode != TOK_PIPE && parsed_fd >= 0);
    return is_plain_redirection ? parsed_mode : TOK_NONE;
}
2012-11-22 01:48:35 +00:00
/**
   Return the quote character stored in tok->last_quote, which
   read_string sets when it meets a quote outside of a subshell.
   The constructor initializes it to 0.
*/
wchar_t tok_last_quote(tokenizer_t *tok)
{
    CHECK(tok, 0);

    const wchar_t quote = tok->last_quote;
    return quote;
}
/**
   Test if a character is whitespace. Differs from iswspace in that it
   does not consider a newline to be whitespace.
*/
static bool my_iswspace(wchar_t c)
{
    if (c == L'\n')
    {
        return false;
    }
    return iswspace(c) != 0;
}
2012-11-19 00:30:30 +00:00
const wchar_t * tok_get_desc ( int type )
2005-09-20 13:26:39 +00:00
{
2013-12-09 02:16:55 +00:00
if ( type < 0 | | ( size_t ) type > = ( sizeof tok_desc / sizeof * tok_desc ) )
2012-11-19 00:30:30 +00:00
{
return _ ( L " Invalid token type " ) ;
}
return _ ( tok_desc [ type ] ) ;
2005-09-20 13:26:39 +00:00
}
2012-11-22 01:48:35 +00:00
/**
   Advance the tokenizer to the next token.

   Once an error token has been produced, or the input is exhausted, this
   only updates has_next / last_type and reads nothing. Otherwise leading
   whitespace, backslash-newline continuations and (unless show_comments
   is set) a comment are skipped, last_pos is set to the offset of the
   token start, and the token is classified from its first character.
*/
void tok_next(tokenizer_t *tok)
{
    CHECK(tok,);
    CHECK(tok->buff,);

    /* An error is sticky: no further tokens are produced */
    if (tok_last_type(tok) == TOK_ERROR)
    {
        tok->has_next = false;
        return;
    }

    if (!tok->has_next)
    {
        tok->last_type = TOK_END;
        return;
    }

    /* Skip whitespace (but not newlines) and escaped newlines */
    for (;;)
    {
        if (tok->buff[0] == L'\\' && tok->buff[1] == L'\n')
        {
            tok->buff += 2;
            continue;
        }
        if (!my_iswspace(tok->buff[0]))
        {
            break;
        }
        tok->buff++;
    }

    if (*tok->buff == L'#')
    {
        if (tok->show_comments)
        {
            /* Hand the comment out as a token of its own */
            tok->last_pos = tok->buff - tok->orig_buff;
            read_comment(tok);
            return;
        }

        /* Discard the comment up to the newline or end of input, then any whitespace after it */
        tok->buff += wcscspn(tok->buff, L"\n");
        while (my_iswspace(*tok->buff))
        {
            tok->buff++;
        }
    }

    tok->last_pos = tok->buff - tok->orig_buff;

    const wchar_t first = *tok->buff;
    if (first == L'\0')
    {
        tok->last_type = TOK_END;
        tok->has_next = false;
    }
    else if (first == L'\r' || first == L'\n' || first == L';')
    {
        tok->last_type = TOK_END;
        tok->buff++;
    }
    else if (first == L'&')
    {
        tok->last_type = TOK_BACKGROUND;
        tok->buff++;
    }
    else if (first == L'|')
    {
        /* A bare pipe carries "1" as its token text */
        tok->last_token = L"1";
        tok->last_type = TOK_PIPE;
        tok->buff++;
    }
    else
    {
        /* Tokens starting with a redirection character must never be parsed as a string; a failed redirection is an error. Tokens starting with a digit may be a redirection like '2>&1', a pipe like 2>|, or just a string. */
        const bool must_be_redirection = (first == L'>' || first == L'<' || first == L'^');

        enum token_type mode = TOK_NONE;
        int fd = -1;
        size_t consumed = 0;
        if (must_be_redirection || iswdigit(first))
        {
            consumed = read_redirection_or_fd_pipe(tok->buff, &mode, &fd);
        }

        if (must_be_redirection)
        {
            if (consumed == 0 || fd < 0)
            {
                TOK_CALL_ERROR(tok, TOK_OTHER, REDIRECT_ERROR);
            }
            else
            {
                tok->buff += consumed;
                tok->last_type = mode;
                tok->last_token = to_string(fd);
            }
        }
        else if (consumed == 0)
        {
            /* Not a redirection or pipe, so just a string */
            read_string(tok);
        }
        else if (mode == TOK_PIPE && fd == 0)
        {
            /* It looks like a pipe, but we don't support piping fd 0 */
            TOK_CALL_ERROR(tok, TOK_OTHER, PIPE_ERROR);
        }
        else
        {
            /* Note that fd may be -1, indicating overflow; but we don't treat that as a tokenizer error here. */
            tok->buff += consumed;
            tok->last_type = mode;
            tok->last_token = to_string(fd);
        }
    }
}
2013-09-30 21:55:25 +00:00
/**
   Return the type of the token after the current one. If
   out_next_string is not NULL it is cleared and then set to that
   token's text. Returns TOK_END if tok_has_next() reports no more
   tokens.

   This advances the tokenizer by one token and then calls tok_set_pos()
   with the previously current token's position.
*/
enum token_type tok_peek_next(tokenizer_t *tok, wcstring *out_next_string)
{
    if (out_next_string != NULL)
    {
        out_next_string->clear();
    }

    if (!tok_has_next(tok))
    {
        return TOK_END;
    }

    /* Remember where the current token starts, then step forward */
    const int saved_pos = tok_get_pos(tok);
    tok_next(tok);
    const enum token_type peeked_type = tok_last_type(tok);

    if (out_next_string != NULL)
    {
        const wchar_t *text = tok_last(tok);
        out_next_string->assign(text ? text : L"");
    }

    /* Go back to the token we started on */
    tok_set_pos(tok, saved_pos);
    return peeked_type;
}
2005-09-20 13:26:39 +00:00
2012-11-22 01:48:35 +00:00
/**
   Return the original string being tokenized, or a null pointer if tok
   is null.
*/
const wchar_t *tok_string(tokenizer_t *tok)
{
    if (!tok)
    {
        return 0;
    }
    return tok->orig_buff;
}
2012-11-22 06:23:48 +00:00
/**
   Return the first token of str if it is a string token. Returns an
   empty string if str is null or the first token has any other type.
   Tokenizer error messages are squashed.
*/
wcstring tok_first(const wchar_t *str)
{
    wcstring result;
    if (str == NULL)
    {
        return result;
    }

    tokenizer_t t(str, TOK_SQUASH_ERRORS);
    if (tok_last_type(&t) == TOK_STRING)
    {
        const wchar_t *text = tok_last(&t);
        if (text != NULL)
        {
            result = text;
        }
    }
    return result;
}
2013-07-23 01:26:15 +00:00
int tok_get_pos ( const tokenizer_t * tok )
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
CHECK ( tok , 0 ) ;
return ( int ) tok - > last_pos ;
2005-09-20 13:26:39 +00:00
}
2013-07-23 01:26:15 +00:00
size_t tok_get_extent ( const tokenizer_t * tok )
{
CHECK ( tok , 0 ) ;
size_t current_pos = tok - > buff - tok - > orig_buff ;
return current_pos > tok - > last_pos ? current_pos - tok - > last_pos : 0 ;
}
2005-09-20 13:26:39 +00:00
2012-11-22 01:48:35 +00:00
/**
   Move the tokenizer to the given offset in the original string and
   read the token found there. has_next is set to true before the read.
*/
void tok_set_pos(tokenizer_t *tok, int pos)
{
    CHECK(tok,);

    tok->has_next = true;
    tok->buff = tok->orig_buff + pos;
    tok_next(tok);
}
2012-12-21 01:37:09 +00:00
bool move_word_state_machine_t : : consume_char_punctuation ( wchar_t c )
{
enum
{
s_always_one = 0 ,
s_whitespace ,
s_alphanumeric ,
s_end
} ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
bool consumed = false ;
while ( state ! = s_end & & ! consumed )
{
switch ( state )
{
case s_always_one :
/* Always consume the first character */
consumed = true ;
state = s_whitespace ;
break ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
case s_whitespace :
if ( iswspace ( c ) )
{
/* Consumed whitespace */
consumed = true ;
}
else
{
state = s_alphanumeric ;
}
break ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
case s_alphanumeric :
if ( iswalnum ( c ) )
{
/* Consumed alphanumeric */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
case s_end :
default :
break ;
}
}
return consumed ;
}
2005-09-20 13:26:39 +00:00
2012-12-21 01:37:09 +00:00
bool move_word_state_machine_t : : is_path_component_character ( wchar_t c )
2012-12-11 00:23:08 +00:00
{
2012-12-21 01:37:09 +00:00
/* Always treat separators as first. All this does is ensure that we treat ^ as a string character instead of as stderr redirection, which I hypothesize is usually what is desired. */
return tok_is_string_character ( c , true ) & & ! wcschr ( L " /= { , } ' \ " " , c ) ;
2012-12-11 00:23:08 +00:00
}
2012-12-21 01:37:09 +00:00
bool move_word_state_machine_t : : consume_char_path_components ( wchar_t c )
2012-12-11 00:23:08 +00:00
{
2012-12-21 01:37:09 +00:00
enum
{
s_initial_punctuation ,
s_whitespace ,
s_separator ,
s_slash ,
s_path_component_characters ,
s_end
} ;
2012-12-22 20:21:31 +00:00
2012-12-11 00:23:08 +00:00
//printf("state %d, consume '%lc'\n", state, c);
bool consumed = false ;
while ( state ! = s_end & & ! consumed )
{
switch ( state )
{
2012-12-21 01:37:09 +00:00
case s_initial_punctuation :
if ( ! is_path_component_character ( c ) )
{
consumed = true ;
}
state = s_whitespace ;
break ;
2012-12-22 20:21:31 +00:00
2012-12-11 00:23:08 +00:00
case s_whitespace :
if ( iswspace ( c ) )
{
/* Consumed whitespace */
consumed = true ;
}
2012-12-21 01:37:09 +00:00
else if ( c = = L ' / ' | | is_path_component_character ( c ) )
2012-12-11 00:23:08 +00:00
{
2012-12-21 01:37:09 +00:00
/* Path component */
2012-12-11 00:23:08 +00:00
state = s_slash ;
}
else
{
2012-12-21 01:37:09 +00:00
/* Path separator */
2012-12-11 00:23:08 +00:00
state = s_separator ;
}
break ;
case s_separator :
2012-12-21 01:37:09 +00:00
if ( ! iswspace ( c ) & & ! is_path_component_character ( c ) )
2012-12-11 00:23:08 +00:00
{
/* Consumed separator */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
case s_slash :
if ( c = = L ' / ' )
{
/* Consumed slash */
consumed = true ;
}
else
{
2012-12-21 01:37:09 +00:00
state = s_path_component_characters ;
2012-12-11 00:23:08 +00:00
}
break ;
2012-12-21 01:37:09 +00:00
case s_path_component_characters :
if ( is_path_component_character ( c ) )
2012-12-11 00:23:08 +00:00
{
/* Consumed string character except slash */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
/* We won't get here, but keep the compiler happy */
case s_end :
default :
break ;
}
}
return consumed ;
}
2012-12-21 01:37:09 +00:00
bool move_word_state_machine_t : : consume_char ( wchar_t c )
{
switch ( style )
{
2012-12-22 20:21:31 +00:00
case move_word_style_punctuation :
return consume_char_punctuation ( c ) ;
case move_word_style_path_components :
return consume_char_path_components ( c ) ;
default :
return false ;
2012-12-21 01:37:09 +00:00
}
}
/**
   Create a word state machine of the given style, starting in state 0
   (the first state of either style's machine).
*/
move_word_state_machine_t::move_word_state_machine_t(move_word_style_t syl) :
    state(0),
    style(syl)
{
}
void move_word_state_machine_t : : reset ( )
{
state = 0 ;
}