2005-09-20 13:26:39 +00:00
/** \file tokenizer.c
2006-10-01 15:59:18 +00:00
A specialized tokenizer for tokenizing the fish language. In the
future, the tokenizer should be extended to support marks,
tokenizing multiple strings and disposing of unused string
segments.
2005-09-20 13:26:39 +00:00
*/
# include "config.h"
# include <stdlib.h>
# include <stdio.h>
# include <wchar.h>
# include <wctype.h>
# include <string.h>
# include <unistd.h>
2006-02-28 13:17:16 +00:00
# include "fallback.h"
2005-09-20 13:26:39 +00:00
# include "util.h"
2006-02-28 13:17:16 +00:00
2005-09-20 13:26:39 +00:00
# include "wutil.h"
# include "tokenizer.h"
# include "common.h"
2006-07-19 22:55:49 +00:00
2012-02-17 23:55:54 +00:00
/*
   Report an error on tokenizer t with error code e and message x by calling
   tok_call_error(). When t->squash_errors is set, the empty string is passed
   instead; because x sits in the unselected arm of ?:, it is then never
   evaluated. Presumably this is what keeps the translation lookup in the
   *_ERROR macros off non-main threads (see the thread-safety comment in the
   tokenizer_t constructor) - confirm.

   Note that t is expanded twice, so it must not have side effects.
*/
# define TOK_CALL_ERROR(t, e, x) do { tok_call_error((t), (e), (t)->squash_errors ? L"" : (x)); } while (0)
2005-09-20 13:26:39 +00:00
/**
   Error string for unexpected end of string: used by read_string() both for
   an unterminated quote and for a backslash at the very end of the input.
*/
2006-11-17 14:59:05 +00:00
# define QUOTE_ERROR _( L"Unexpected end of string, quotes are not balanced" )
2006-10-07 00:56:25 +00:00
2005-09-20 13:26:39 +00:00
/**
   Error string for mismatched parentheses (an unterminated subshell).
*/
2006-11-17 14:59:05 +00:00
# define PARAN_ERROR _( L"Unexpected end of string, parenthesis do not match" )
2006-06-21 00:48:36 +00:00
2013-09-11 21:22:16 +00:00
/**
   Error string for mismatched square brackets.
*/
# define SQUARE_BRACKET_ERROR _( L"Unexpected end of string, square brackets do not match" )
2005-09-20 13:26:39 +00:00
/**
   Error string for invalid redirections.
*/
2006-11-17 14:59:05 +00:00
# define REDIRECT_ERROR _( L"Invalid input / output redirection" )
2005-09-20 13:26:39 +00:00
2005-10-07 14:08:57 +00:00
/**
   Error string for when trying to pipe from fd 0.
*/
2006-01-04 12:51:02 +00:00
# define PIPE_ERROR _( L"Can not use fd 0 as pipe output" )
2005-10-07 14:08:57 +00:00
2005-09-20 13:26:39 +00:00
/**
   Characters that separate tokens. They are ordered by frequency of
   occurrence to increase parsing speed.

   NOTE(review): nothing in the visible part of this file references SEP;
   tok_is_string_character() hard-codes the same character set.
*/
2005-10-26 10:51:02 +00:00
# define SEP L" \n|\t;#\r<>^&"
2005-10-08 02:00:08 +00:00
2006-01-23 20:40:14 +00:00
/**
Descriptions of all tokenizer errors
*/
2006-01-04 12:51:02 +00:00
static const wchar_t * tok_desc [ ] =
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
N_ ( L " Tokenizer not yet initialized " ) ,
N_ ( L " Tokenizer error " ) ,
N_ ( L " Invalid token " ) ,
N_ ( L " String " ) ,
N_ ( L " Pipe " ) ,
N_ ( L " End of command " ) ,
N_ ( L " Redirect output to file " ) ,
N_ ( L " Append output to file " ) ,
N_ ( L " Redirect input to file " ) ,
N_ ( L " Redirect to file descriptor " ) ,
N_ ( L " Redirect output to file if file does not exist " ) ,
N_ ( L " Run job in background " ) ,
N_ ( L " Comment " )
2012-11-22 06:09:35 +00:00
} ;
2005-09-20 13:26:39 +00:00
/**
Set the latest tokens string to be the specified error message
*/
2012-11-22 01:48:35 +00:00
static void tok_call_error ( tokenizer_t * tok , int error_type , const wchar_t * error_message )
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
tok - > last_type = TOK_ERROR ;
tok - > error = error_type ;
2012-11-22 06:09:35 +00:00
tok - > last_token = error_message ;
2005-09-20 13:26:39 +00:00
}
2012-11-22 01:48:35 +00:00
/**
   Return the error code stored in the tokenizer. The field starts at 0 and
   is overwritten by tok_call_error() each time an error is reported.
*/
int tok_get_error(tokenizer_t *tok)
{
    return tok->error;
}
2013-09-30 20:57:36 +00:00
/**
   Construct a tokenizer over the string \c b and immediately read the first
   token, so tok_last_type() / tok_last() are valid right after construction.

   The string is not copied: buff and orig_buff point into \c b.

   \param b the string to tokenize; if it fails the CHECK below, the
            tokenizer keeps the defaults from the initializer list
   \param flags bitwise combination of TOK_ACCEPT_UNFINISHED,
                TOK_SHOW_COMMENTS and TOK_SQUASH_ERRORS
*/
tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) :
    buff(NULL),
    orig_buff(NULL),
    last_type(TOK_NONE),
    last_pos(0),
    has_next(false),
    accept_unfinished(false),
    show_comments(false),
    last_quote(0),
    error(0),
    squash_errors(false),
    cached_lineno_offset(0),
    cached_lineno_count(0)
{
    const bool wants_squashed_errors = !!(flags & TOK_SQUASH_ERRORS);

    /* We can only generate error messages on the main thread due to
       wgettext() thread safety issues, so a tokenizer that does not squash
       its errors must be created there. */
    if (!wants_squashed_errors)
    {
        ASSERT_IS_MAIN_THREAD();
    }

    CHECK(b,);

    accept_unfinished = !!(flags & TOK_ACCEPT_UNFINISHED);
    show_comments = !!(flags & TOK_SHOW_COMMENTS);
    squash_errors = wants_squashed_errors;

    has_next = (*b != L'\0');
    buff = b;
    orig_buff = b;

    /* The line number cache was already zeroed by the initializer list. */
    tok_next(this);
}
2013-09-30 20:57:36 +00:00
/**
   Return the type of the token most recently read by tok_next().
   TOK_ERROR is the fallback value handed to CHECK for a null tokenizer or
   a tokenizer without a buffer.
*/
enum token_type tok_last_type(tokenizer_t *tok)
{
    CHECK(tok, TOK_ERROR);
    CHECK(tok->buff, TOK_ERROR);

    return tok->last_type;
}
2012-11-22 06:09:35 +00:00
/**
   Return the text of the token most recently read. The pointer refers to
   storage owned by the tokenizer's last_token string, so it is only good
   until that string is next modified.
*/
const wchar_t *tok_last(tokenizer_t *tok)
{
    CHECK(tok, 0);

    return tok->last_token.c_str();
}
2012-11-22 01:48:35 +00:00
/**
   Return non-zero if the tokenizer has more tokens to deliver.

   For a broken tokenizer (null, or without a buffer) the fallback value
   handed to CHECK is 1 rather than 0.
*/
int tok_has_next(tokenizer_t *tok)
{
    CHECK(tok, 1);
    CHECK(tok->buff, 1);

    return tok->has_next;
}
2012-11-22 01:48:35 +00:00
int tokenizer_t : : line_number_of_character_at_offset ( size_t offset )
2012-08-05 00:44:14 +00:00
{
// we want to return (one plus) the number of newlines at offsets less than the given offset
// cached_lineno_count is the number of newlines at indexes less than cached_lineno_offset
const wchar_t * str = orig_buff ;
2012-11-19 00:30:30 +00:00
if ( ! str )
return 0 ;
2012-11-18 10:23:22 +00:00
2012-08-05 00:44:14 +00:00
// easy hack to handle 0
if ( offset = = 0 )
return 1 ;
2012-11-18 10:23:22 +00:00
2012-08-05 00:44:14 +00:00
size_t i ;
if ( offset > cached_lineno_offset )
{
2012-11-19 00:30:30 +00:00
for ( i = cached_lineno_offset ; str [ i ] & & i < offset ; i + + )
2012-08-05 00:44:14 +00:00
{
/* Add one for every newline we find in the range [cached_lineno_offset, offset) */
2012-11-19 00:30:30 +00:00
if ( str [ i ] = = L ' \n ' )
2012-08-05 00:44:14 +00:00
cached_lineno_count + + ;
}
cached_lineno_offset = i ; //note: i, not offset, in case offset is beyond the length of the string
}
else if ( offset < cached_lineno_offset )
{
/* Subtract one for every newline we find in the range [offset, cached_lineno_offset) */
for ( i = offset ; i < cached_lineno_offset ; i + + )
{
if ( str [ i ] = = L ' \n ' )
cached_lineno_count - - ;
}
cached_lineno_offset = offset ;
}
2012-11-19 00:30:30 +00:00
return cached_lineno_count + 1 ;
2012-08-05 00:44:14 +00:00
}
2005-09-20 13:26:39 +00:00
/**
   Tests if this character can be a part of a string.

   The end of string and the separators space, newline, '|', tab, ';', '#',
   carriage return, '<', '>' and '&' never can. The redirect character '^'
   is allowed unless it is the first character of the string.

   \param c the character to test
   \param is_first whether c would be the first character of the string
   \return true if c may be part of a string token
*/
bool tok_is_string_character(wchar_t c, bool is_first)
{
    /* Handle NUL explicitly: wcschr() would "find" it at the terminator. */
    if (c == L'\0')
    {
        return false;
    }

    /* Conditional separator */
    if (c == L'^')
    {
        return !is_first;
    }

    /* Unconditional separators */
    return wcschr(L" \n|\t;#\r<>&", c) == NULL;
}
2005-09-20 13:26:39 +00:00
2005-10-26 10:51:02 +00:00
/**
   Quick test for the unaccented ASCII letters a-z and A-Z. read_string()
   uses it as a fast path: these characters never need special handling,
   so the more expensive per-character logic can be skipped for them.
   This is obviously not a suitable replacement for iswalpha.

   \return 1 if c is in a-z or A-Z, 0 otherwise
*/
static int myal(wchar_t c)
{
    if (c >= L'a' && c <= L'z')
    {
        return 1;
    }
    if (c >= L'A' && c <= L'Z')
    {
        return 1;
    }
    return 0;
}
/**
Read the next token as a string
*/
2012-11-22 01:48:35 +00:00
static void read_string ( tokenizer_t * tok )
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
const wchar_t * start ;
long len ;
int do_loop = 1 ;
int paran_count = 0 ;
2005-09-20 13:26:39 +00:00
2012-11-19 00:30:30 +00:00
start = tok - > buff ;
2012-07-11 03:30:54 +00:00
bool is_first = true ;
2012-11-18 10:23:22 +00:00
2012-11-23 19:12:22 +00:00
enum tok_mode_t
{
2012-11-22 01:48:35 +00:00
mode_regular_text = 0 , // regular text
mode_subshell = 1 , // inside of subshell
mode_array_brackets = 2 , // inside of array brackets
mode_array_brackets_and_subshell = 3 // inside of array brackets and subshell, like in '$foo[(ech'
} mode = mode_regular_text ;
2012-11-19 00:30:30 +00:00
while ( 1 )
2012-11-18 10:23:22 +00:00
{
2012-11-19 00:30:30 +00:00
if ( ! myal ( * tok - > buff ) )
2012-11-18 10:23:22 +00:00
{
2012-11-19 00:30:30 +00:00
if ( * tok - > buff = = L ' \\ ' )
{
tok - > buff + + ;
if ( * tok - > buff = = L ' \0 ' )
{
if ( ( ! tok - > accept_unfinished ) )
{
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_ESCAPE , QUOTE_ERROR ) ;
return ;
}
else
{
2012-11-20 22:51:30 +00:00
/* Since we are about to increment tok->buff, decrement it first so the increment doesn't go past the end of the buffer. https://github.com/fish-shell/fish-shell/issues/389 */
tok - > buff - - ;
2012-11-22 01:48:35 +00:00
do_loop = 0 ;
2012-11-19 00:30:30 +00:00
}
}
2012-11-18 10:23:22 +00:00
2012-11-19 00:30:30 +00:00
tok - > buff + + ;
continue ;
}
2012-11-23 19:12:22 +00:00
2012-11-19 00:30:30 +00:00
switch ( mode )
2012-11-18 10:23:22 +00:00
{
2012-11-22 01:48:35 +00:00
case mode_regular_text :
2012-11-19 00:30:30 +00:00
{
2012-11-19 08:31:03 +00:00
switch ( * tok - > buff )
{
case L ' ( ' :
{
paran_count = 1 ;
2012-11-22 01:48:35 +00:00
mode = mode_subshell ;
2012-11-19 08:31:03 +00:00
break ;
}
2012-11-18 10:23:22 +00:00
2012-11-19 08:31:03 +00:00
case L ' [ ' :
{
if ( tok - > buff ! = start )
2012-11-22 01:48:35 +00:00
mode = mode_array_brackets ;
2012-11-19 08:31:03 +00:00
break ;
}
2012-11-18 10:23:22 +00:00
2012-11-19 08:31:03 +00:00
case L ' \' ' :
case L ' " ' :
{
2012-11-18 10:23:22 +00:00
2012-11-19 08:31:03 +00:00
const wchar_t * end = quote_end ( tok - > buff ) ;
tok - > last_quote = * tok - > buff ;
if ( end )
{
tok - > buff = ( wchar_t * ) end ;
}
else
{
tok - > buff + = wcslen ( tok - > buff ) ;
2012-11-22 01:48:35 +00:00
if ( ! tok - > accept_unfinished )
2012-11-19 08:31:03 +00:00
{
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_QUOTE , QUOTE_ERROR ) ;
return ;
}
do_loop = 0 ;
}
break ;
}
2012-11-19 00:30:30 +00:00
2012-11-19 08:31:03 +00:00
default :
2012-11-19 00:30:30 +00:00
{
2012-11-22 01:48:35 +00:00
if ( ! tok_is_string_character ( * ( tok - > buff ) , is_first ) )
2012-11-19 08:31:03 +00:00
{
do_loop = 0 ;
}
2012-11-19 00:30:30 +00:00
}
}
break ;
}
2012-11-22 01:48:35 +00:00
case mode_array_brackets_and_subshell :
case mode_subshell :
2012-11-19 08:31:03 +00:00
switch ( * tok - > buff )
2012-11-19 00:30:30 +00:00
{
2012-11-19 08:31:03 +00:00
case L ' \' ' :
case L ' \" ' :
2012-11-19 00:30:30 +00:00
{
2012-11-19 08:31:03 +00:00
const wchar_t * end = quote_end ( tok - > buff ) ;
if ( end )
{
tok - > buff = ( wchar_t * ) end ;
}
else
{
tok - > buff + = wcslen ( tok - > buff ) ;
if ( ( ! tok - > accept_unfinished ) )
{
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_QUOTE , QUOTE_ERROR ) ;
return ;
}
do_loop = 0 ;
}
break ;
2012-11-19 00:30:30 +00:00
}
2012-11-18 10:23:22 +00:00
2012-11-19 08:31:03 +00:00
case L ' ( ' :
paran_count + + ;
break ;
case L ' ) ' :
paran_count - - ;
if ( paran_count = = 0 )
{
2012-11-22 01:48:35 +00:00
mode = ( mode = = mode_array_brackets_and_subshell ? mode_array_brackets : mode_regular_text ) ;
2012-11-19 08:31:03 +00:00
}
break ;
case L ' \0 ' :
do_loop = 0 ;
break ;
}
2012-11-19 00:30:30 +00:00
break ;
2012-11-23 19:12:22 +00:00
2012-11-22 01:48:35 +00:00
case mode_array_brackets :
2012-11-19 08:31:03 +00:00
switch ( * tok - > buff )
2012-11-19 00:30:30 +00:00
{
2012-11-19 08:31:03 +00:00
case L ' ( ' :
paran_count = 1 ;
2012-11-22 01:48:35 +00:00
mode = mode_array_brackets_and_subshell ;
2012-11-19 08:31:03 +00:00
break ;
case L ' ] ' :
2012-11-22 01:48:35 +00:00
mode = mode_regular_text ;
2012-11-19 08:31:03 +00:00
break ;
case L ' \0 ' :
do_loop = 0 ;
break ;
2012-11-19 00:30:30 +00:00
}
break ;
2012-11-18 10:23:22 +00:00
}
2012-11-19 00:30:30 +00:00
}
2012-11-18 10:23:22 +00:00
2012-11-19 00:30:30 +00:00
if ( ! do_loop )
break ;
2012-11-18 10:23:22 +00:00
2012-11-19 00:30:30 +00:00
tok - > buff + + ;
2012-07-11 03:30:54 +00:00
is_first = false ;
2012-11-19 00:30:30 +00:00
}
2005-09-20 13:26:39 +00:00
2012-11-22 01:48:35 +00:00
if ( ( ! tok - > accept_unfinished ) & & ( mode ! = mode_regular_text ) )
2012-11-19 00:30:30 +00:00
{
2013-09-11 21:22:16 +00:00
switch ( mode )
{
case mode_subshell :
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_SUBSHELL , PARAN_ERROR ) ;
break ;
case mode_array_brackets :
case mode_array_brackets_and_subshell :
TOK_CALL_ERROR ( tok , TOK_UNTERMINATED_SUBSHELL , SQUARE_BRACKET_ERROR ) ; // TOK_UNTERMINATED_SUBSHELL is a lie but nobody actually looks at it
break ;
default :
assert ( 0 & & " Unexpected mode in read_string " ) ;
break ;
}
2012-11-19 00:30:30 +00:00
return ;
}
2005-09-20 13:26:39 +00:00
2012-11-19 00:30:30 +00:00
len = tok - > buff - start ;
2005-09-20 13:26:39 +00:00
2012-11-22 06:09:35 +00:00
tok - > last_token . assign ( start , len ) ;
2012-11-19 00:30:30 +00:00
tok - > last_type = TOK_STRING ;
2005-09-20 13:26:39 +00:00
}
/**
Read the next token as a comment .
*/
2012-11-22 01:48:35 +00:00
static void read_comment ( tokenizer_t * tok )
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
const wchar_t * start ;
2005-09-20 13:26:39 +00:00
2012-11-19 00:30:30 +00:00
start = tok - > buff ;
while ( * ( tok - > buff ) ! = L ' \n ' & & * ( tok - > buff ) ! = L ' \0 ' )
tok - > buff + + ;
2012-11-23 19:12:22 +00:00
2005-09-20 13:26:39 +00:00
2012-11-19 00:30:30 +00:00
size_t len = tok - > buff - start ;
2012-11-22 06:09:35 +00:00
tok - > last_token . assign ( start , len ) ;
2012-11-19 00:30:30 +00:00
tok - > last_type = TOK_COMMENT ;
2005-09-20 13:26:39 +00:00
}
/**
2006-02-19 01:54:38 +00:00
Read a FD redirection .
2005-09-20 13:26:39 +00:00
*/
2012-11-22 01:48:35 +00:00
static void read_redirect ( tokenizer_t * tok , int fd )
2005-09-20 13:26:39 +00:00
{
2013-09-30 20:57:36 +00:00
enum token_type redirection_mode = TOK_NONE ;
2012-11-18 10:23:22 +00:00
2012-11-19 00:30:30 +00:00
if ( ( * tok - > buff = = L ' > ' ) | |
( * tok - > buff = = L ' ^ ' ) )
2012-11-18 10:23:22 +00:00
{
2012-11-19 00:30:30 +00:00
tok - > buff + + ;
if ( * tok - > buff = = * ( tok - > buff - 1 ) )
{
tok - > buff + + ;
2013-09-30 20:57:36 +00:00
redirection_mode = TOK_REDIRECT_APPEND ;
2012-11-19 00:30:30 +00:00
}
else
{
2013-09-30 20:57:36 +00:00
redirection_mode = TOK_REDIRECT_OUT ;
2012-11-19 00:30:30 +00:00
}
if ( * tok - > buff = = L ' | ' )
{
if ( fd = = 0 )
{
TOK_CALL_ERROR ( tok , TOK_OTHER , PIPE_ERROR ) ;
return ;
}
tok - > buff + + ;
2012-11-22 06:09:35 +00:00
tok - > last_token = to_string < int > ( fd ) ;
2012-11-19 00:30:30 +00:00
tok - > last_type = TOK_PIPE ;
return ;
}
}
else if ( * tok - > buff = = L ' < ' )
{
tok - > buff + + ;
2013-09-30 20:57:36 +00:00
redirection_mode = TOK_REDIRECT_IN ;
2012-11-18 10:23:22 +00:00
}
else
{
2012-11-19 00:30:30 +00:00
TOK_CALL_ERROR ( tok , TOK_OTHER , REDIRECT_ERROR ) ;
2012-11-18 10:23:22 +00:00
}
2012-11-22 06:09:35 +00:00
tok - > last_token = to_string ( fd ) ;
2012-11-19 00:30:30 +00:00
if ( * tok - > buff = = L ' & ' )
{
tok - > buff + + ;
tok - > last_type = TOK_REDIRECT_FD ;
}
else if ( * tok - > buff = = L ' ? ' )
{
tok - > buff + + ;
tok - > last_type = TOK_REDIRECT_NOCLOB ;
}
else
{
2013-09-30 20:57:36 +00:00
tok - > last_type = redirection_mode ;
2012-11-19 00:30:30 +00:00
}
2005-09-20 13:26:39 +00:00
}
2012-11-22 01:48:35 +00:00
/**
   Return the quote character recorded by read_string() for the most recent
   quoted section it saw in regular text, or 0 if none has been recorded
   since construction.
*/
wchar_t tok_last_quote(tokenizer_t *tok)
{
    CHECK(tok, 0);

    return tok->last_quote;
}
/**
   Test if a character is whitespace. Differs from iswspace in that a
   newline is never treated as whitespace, since a newline ends a command
   in the tokenizer.
*/
static bool my_iswspace(wchar_t c)
{
    if (c == L'\n')
    {
        return false;
    }
    return iswspace(c) != 0;
}
2012-11-19 00:30:30 +00:00
const wchar_t * tok_get_desc ( int type )
2005-09-20 13:26:39 +00:00
{
2012-11-19 00:30:30 +00:00
if ( type < 0 | | ( size_t ) type > = sizeof ( tok_desc ) )
{
return _ ( L " Invalid token type " ) ;
}
return _ ( tok_desc [ type ] ) ;
2005-09-20 13:26:39 +00:00
}
2012-11-22 01:48:35 +00:00
/**
   Advance the tokenizer to the next token.

   After an error no further tokens are produced: has_next is cleared and
   the error token stays current. Once the input is exhausted the token
   type is TOK_END.

   Otherwise whitespace (not newlines) and backslash-newline continuations
   are skipped, a comment is either returned as a token (show_comments) or
   skipped, last_pos is set to the offset of the token start, and the token
   is read according to its first character.
*/
void tok_next(tokenizer_t *tok)
{
    CHECK(tok,);
    CHECK(tok->buff,);

    if (tok_last_type(tok) == TOK_ERROR)
    {
        tok->has_next = false;
        return;
    }

    if (!tok->has_next)
    {
        tok->last_type = TOK_END;
        return;
    }

    /* Skip whitespace and escaped newlines (line continuations) */
    for (;;)
    {
        if (tok->buff[0] == L'\\' && tok->buff[1] == L'\n')
        {
            tok->buff += 2;
            continue;
        }
        if (!my_iswspace(tok->buff[0]))
        {
            break;
        }
        tok->buff++;
    }

    if (*tok->buff == L'#')
    {
        if (tok->show_comments)
        {
            tok->last_pos = tok->buff - tok->orig_buff;
            read_comment(tok);
            return;
        }

        /* Comments are not wanted: skip to the end of the line */
        while (*tok->buff != L'\n' && *tok->buff != L'\0')
        {
            tok->buff++;
        }

        while (my_iswspace(*tok->buff))
        {
            tok->buff++;
        }
    }

    tok->last_pos = tok->buff - tok->orig_buff;

    switch (*tok->buff)
    {
        case L'\0':
            tok->last_type = TOK_END;
            tok->has_next = false;
            break;

        case 13: /* carriage return */
        case L'\n':
        case L';':
            tok->last_type = TOK_END;
            tok->buff++;
            break;

        case L'&':
            tok->last_type = TOK_BACKGROUND;
            tok->buff++;
            break;

        case L'|':
            /* A plain pipe carries fd 1 as its token text */
            tok->last_token = L"1";
            tok->last_type = TOK_PIPE;
            tok->buff++;
            break;

        case L'>':
            read_redirect(tok, 1);
            return;

        case L'<':
            read_redirect(tok, 0);
            return;

        case L'^':
            read_redirect(tok, 2);
            return;

        default:
        {
            if (iswdigit(*tok->buff))
            {
                /* Digits directly followed by a redirection character name
                   the fd to redirect; otherwise they are ordinary text.
                   NOTE(review): a very long digit run overflows the int
                   accumulator - consider bounding it. */
                const wchar_t *const token_start = tok->buff;
                int fd = 0;
                for (; iswdigit(*tok->buff); tok->buff++)
                {
                    fd = (fd * 10) + (*tok->buff - L'0');
                }

                const wchar_t after_digits = *tok->buff;
                if (after_digits == L'^' || after_digits == L'>' || after_digits == L'<')
                {
                    read_redirect(tok, fd);
                    return;
                }

                /* Not a redirection: rewind and read the digits as a string */
                tok->buff = token_start;
            }
            read_string(tok);
        }
    }
}
2013-09-30 21:55:25 +00:00
/**
   Look at the token after the current one without permanently advancing.

   The tokenizer is moved forward with tok_next(), the type (and optionally
   the text) of that token is captured, and the tokenizer is then put back
   with tok_set_pos() at the position it had before.

   \param tok the tokenizer
   \param out_next_string if not NULL, cleared and then filled with the text
                          of the next token
   \return the type of the next token, or TOK_END if there is none
*/
enum token_type tok_peek_next(tokenizer_t *tok, wcstring *out_next_string)
{
    if (out_next_string != NULL)
    {
        out_next_string->clear();
    }

    if (!tok_has_next(tok))
    {
        return TOK_END;
    }

    const int saved = tok_get_pos(tok);
    tok_next(tok);

    const enum token_type result = tok_last_type(tok);
    if (out_next_string != NULL)
    {
        const wchar_t *last = tok_last(tok);
        out_next_string->assign(last ? last : L"");
    }

    tok_set_pos(tok, saved);
    return result;
}
2005-09-20 13:26:39 +00:00
2012-11-22 01:48:35 +00:00
/**
   Return the original string being tokenized, or a null pointer if tok is
   null.
*/
const wchar_t *tok_string(tokenizer_t *tok)
{
    if (!tok)
    {
        return 0;
    }
    return tok->orig_buff;
}
2012-11-22 06:23:48 +00:00
/**
   Return the first token of \c str if that token is a string, and an empty
   string otherwise (including when \c str is null).

   The tokenizer is created with TOK_SQUASH_ERRORS, so no error messages
   are generated for malformed input.
*/
wcstring tok_first(const wchar_t *str)
{
    wcstring result;
    if (!str)
    {
        return result;
    }

    tokenizer_t t(str, TOK_SQUASH_ERRORS);
    if (tok_last_type(&t) == TOK_STRING)
    {
        const wchar_t *tmp = tok_last(&t);
        if (tmp != NULL)
        {
            result = tmp;
        }
    }
    return result;
}
2012-11-22 01:48:35 +00:00
/**
   Return the offset, from the start of the original string, at which the
   current token begins (as recorded in last_pos by tok_next()).
*/
int tok_get_pos(tokenizer_t *tok)
{
    CHECK(tok, 0);

    return (int)tok->last_pos;
}
2012-11-22 01:48:35 +00:00
/**
   Move the tokenizer to offset \c pos in the original string and read the
   token found there.

   has_next is forced to true before tok_next() is called so that a
   tokenizer which had reached the end can be rewound.

   \param tok the tokenizer
   \param pos offset from the start of the original string
*/
void tok_set_pos(tokenizer_t *tok, int pos)
{
    CHECK(tok,);

    tok->buff = tok->orig_buff + pos;
    tok->has_next = true;
    tok_next(tok);
}
2012-12-21 01:37:09 +00:00
bool move_word_state_machine_t : : consume_char_punctuation ( wchar_t c )
{
enum
{
s_always_one = 0 ,
s_whitespace ,
s_alphanumeric ,
s_end
} ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
bool consumed = false ;
while ( state ! = s_end & & ! consumed )
{
switch ( state )
{
case s_always_one :
/* Always consume the first character */
consumed = true ;
state = s_whitespace ;
break ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
case s_whitespace :
if ( iswspace ( c ) )
{
/* Consumed whitespace */
consumed = true ;
}
else
{
state = s_alphanumeric ;
}
break ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
case s_alphanumeric :
if ( iswalnum ( c ) )
{
/* Consumed alphanumeric */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
case s_end :
default :
break ;
}
}
return consumed ;
}
2005-09-20 13:26:39 +00:00
2012-12-21 01:37:09 +00:00
bool move_word_state_machine_t : : is_path_component_character ( wchar_t c )
2012-12-11 00:23:08 +00:00
{
2012-12-21 01:37:09 +00:00
/* Always treat separators as first. All this does is ensure that we treat ^ as a string character instead of as stderr redirection, which I hypothesize is usually what is desired. */
return tok_is_string_character ( c , true ) & & ! wcschr ( L " /= { , } ' \ " " , c ) ;
2012-12-11 00:23:08 +00:00
}
2012-12-21 01:37:09 +00:00
bool move_word_state_machine_t : : consume_char_path_components ( wchar_t c )
2012-12-11 00:23:08 +00:00
{
2012-12-21 01:37:09 +00:00
enum
{
s_initial_punctuation ,
s_whitespace ,
s_separator ,
s_slash ,
s_path_component_characters ,
s_end
} ;
2012-12-22 20:21:31 +00:00
2012-12-11 00:23:08 +00:00
//printf("state %d, consume '%lc'\n", state, c);
bool consumed = false ;
while ( state ! = s_end & & ! consumed )
{
switch ( state )
{
2012-12-21 01:37:09 +00:00
case s_initial_punctuation :
if ( ! is_path_component_character ( c ) )
{
consumed = true ;
}
state = s_whitespace ;
break ;
2012-12-22 20:21:31 +00:00
2012-12-11 00:23:08 +00:00
case s_whitespace :
if ( iswspace ( c ) )
{
/* Consumed whitespace */
consumed = true ;
}
2012-12-21 01:37:09 +00:00
else if ( c = = L ' / ' | | is_path_component_character ( c ) )
2012-12-11 00:23:08 +00:00
{
2012-12-21 01:37:09 +00:00
/* Path component */
2012-12-11 00:23:08 +00:00
state = s_slash ;
}
else
{
2012-12-21 01:37:09 +00:00
/* Path separator */
2012-12-11 00:23:08 +00:00
state = s_separator ;
}
break ;
case s_separator :
2012-12-21 01:37:09 +00:00
if ( ! iswspace ( c ) & & ! is_path_component_character ( c ) )
2012-12-11 00:23:08 +00:00
{
/* Consumed separator */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
case s_slash :
if ( c = = L ' / ' )
{
/* Consumed slash */
consumed = true ;
}
else
{
2012-12-21 01:37:09 +00:00
state = s_path_component_characters ;
2012-12-11 00:23:08 +00:00
}
break ;
2012-12-21 01:37:09 +00:00
case s_path_component_characters :
if ( is_path_component_character ( c ) )
2012-12-11 00:23:08 +00:00
{
/* Consumed string character except slash */
consumed = true ;
}
else
{
state = s_end ;
}
break ;
/* We won't get here, but keep the compiler happy */
case s_end :
default :
break ;
}
}
return consumed ;
}
2012-12-21 01:37:09 +00:00
bool move_word_state_machine_t : : consume_char ( wchar_t c )
{
switch ( style )
{
2012-12-22 20:21:31 +00:00
case move_word_style_punctuation :
return consume_char_punctuation ( c ) ;
case move_word_style_path_components :
return consume_char_path_components ( c ) ;
default :
return false ;
2012-12-21 01:37:09 +00:00
}
}
/**
   Create a word-motion state machine in its initial state (0) for the
   given word style.
*/
move_word_state_machine_t::move_word_state_machine_t(move_word_style_t syl) :
    state(0),
    style(syl)
{
}
void move_word_state_machine_t : : reset ( )
{
state = 0 ;
}