2012-11-18 10:23:22 +00:00
/** \file tokenizer.h

    A specialized tokenizer for tokenizing the fish language. In the
    future, the tokenizer should be extended to support marks,
    tokenizing multiple strings and disposing of unused string
    segments.
*/
2005-10-04 15:11:39 +00:00
# ifndef FISH_TOKENIZER_H
# define FISH_TOKENIZER_H
2015-07-25 15:14:25 +00:00
# include <stddef.h>
2012-11-22 06:09:35 +00:00
# include "common.h"
2005-10-04 15:11:39 +00:00
2005-09-20 13:26:39 +00:00
/**
   Token types.

   The enumerators carry no explicit values, so they are numbered in
   declaration order starting at 0 (TOK_NONE).
*/
enum token_type
{
    TOK_NONE,            /**< Tokenizer not yet constructed */
    TOK_ERROR,           /**< Error reading token */
    TOK_STRING,          /**< String token */
    TOK_PIPE,            /**< Pipe token */
    TOK_END,             /**< End token (semicolon or newline, not literal end) */
    TOK_REDIRECT_OUT,    /**< redirection token */
    TOK_REDIRECT_APPEND, /**< redirection append token */
    TOK_REDIRECT_IN,     /**< input redirection token */
    TOK_REDIRECT_FD,     /**< redirection to new fd token */
    TOK_REDIRECT_NOCLOB, /**< "?" redirection token (no-clobber, going by the name; TODO confirm) */
    TOK_BACKGROUND,      /**< send job to bg token */
    TOK_COMMENT          /**< comment token */
};
2006-10-07 00:56:25 +00:00
/**
   Tokenizer error types.

   The enumerators carry no explicit values, so they are numbered in
   declaration order starting at 0 (TOK_ERROR_NONE). The per-value notes
   below are derived from the enumerator names only.
*/
enum tokenizer_error
{
    TOK_ERROR_NONE,            /**< No error; the value a default-constructed tok_t starts with */
    TOK_UNTERMINATED_QUOTE,    /**< Unterminated quote */
    TOK_UNTERMINATED_SUBSHELL, /**< Unterminated subshell */
    TOK_UNTERMINATED_SLICE,    /**< Unterminated slice */
    TOK_UNTERMINATED_ESCAPE,   /**< Unterminated escape */
    TOK_OTHER                  /**< Presumably any error not covered above; TODO confirm */
};
2006-10-07 00:56:25 +00:00
2005-09-20 13:26:39 +00:00
/**
   Flag telling the tokenizer to accept incomplete parameters,
   i.e. parameters with mismatching parentheses, etc. This is useful
   for tab-completion.
*/
#define TOK_ACCEPT_UNFINISHED 1

/**
   Flag telling the tokenizer not to remove comments. Useful for
   syntax highlighting.
*/
#define TOK_SHOW_COMMENTS 2

/**
   Flag telling the tokenizer to not generate error messages, which we
   need to do when tokenizing off of the main thread (since wgettext is
   not thread safe).
*/
#define TOK_SQUASH_ERRORS 4

/**
   Ordinarily, the tokenizer ignores newlines following a newline, or a
   semicolon. This flag tells the tokenizer to return each of them as a
   separate END.
*/
#define TOK_SHOW_BLANK_LINES 8

/**
   Type of the flags argument taken by the tokenizer constructor. The
   flag values above are distinct bits, so they can be combined with
   bitwise OR.
*/
typedef unsigned int tok_flags_t;
2005-09-20 13:26:39 +00:00
2015-07-26 06:05:47 +00:00
struct tok_t
{
/* The text of the token, or an error message for type error */
wcstring text ;
/* The type of the token */
token_type type ;
2015-07-26 07:12:36 +00:00
/* If an error, this is the error code */
enum tokenizer_error error ;
2015-08-11 01:30:44 +00:00
/* If an error, this is the offset of the error within the token. A value of 0 means it occurred at 'offset' */
size_t error_offset ;
2015-07-26 06:05:47 +00:00
/* Offset of the token */
size_t offset ;
/* Length of the token */
size_t length ;
2015-08-11 01:30:44 +00:00
tok_t ( ) : type ( TOK_NONE ) , error ( TOK_ERROR_NONE ) , error_offset ( - 1 ) , offset ( - 1 ) , length ( - 1 ) { }
2015-07-26 06:05:47 +00:00
} ;
2005-09-20 13:26:39 +00:00
/**
2012-11-18 10:23:22 +00:00
The tokenizer struct .
2005-09-20 13:26:39 +00:00
*/
2015-07-26 07:58:32 +00:00
class tokenizer_t
2005-09-20 13:26:39 +00:00
{
2015-07-26 07:58:32 +00:00
/* No copying, etc. */
tokenizer_t ( const tokenizer_t & ) ;
void operator = ( const tokenizer_t & ) ;
2012-11-19 00:30:30 +00:00
/** A pointer into the original string, showing where the next token begins */
const wchar_t * buff ;
/** A copy of the original string */
const wchar_t * orig_buff ;
2012-11-22 06:09:35 +00:00
/** The last token */
wcstring last_token ;
2012-11-19 00:30:30 +00:00
/** Type of last token*/
2013-09-30 20:57:36 +00:00
enum token_type last_type ;
2012-11-22 06:09:35 +00:00
2012-11-19 00:30:30 +00:00
/** Offset of last token*/
size_t last_pos ;
/** Whether there are more tokens*/
bool has_next ;
/** Whether incomplete tokens are accepted*/
bool accept_unfinished ;
2014-11-25 18:43:03 +00:00
/** Whether comments should be returned*/
2012-11-19 00:30:30 +00:00
bool show_comments ;
2014-11-25 18:43:03 +00:00
/** Whether all blank lines are returned */
bool show_blank_lines ;
2012-11-19 00:30:30 +00:00
/** Last error */
2015-07-26 06:05:47 +00:00
tokenizer_error error ;
2015-08-11 01:30:44 +00:00
/** Last error offset, in "global" coordinates (relative to orig_buff) */
size_t global_error_offset ;
2012-02-17 23:55:54 +00:00
/* Whether we are squashing errors */
bool squash_errors ;
2012-08-05 00:44:14 +00:00
2015-03-13 12:05:22 +00:00
/* Whether to continue the previous line after the comment */
bool continue_line_after_comment ;
2015-07-26 07:58:32 +00:00
2015-08-11 01:30:44 +00:00
void call_error ( enum tokenizer_error error_type , const wchar_t * where , const wchar_t * error_message ) ;
2015-07-26 07:58:32 +00:00
void read_string ( ) ;
void read_comment ( ) ;
void tok_next ( ) ;
public :
2012-11-22 01:48:35 +00:00
/**
Constructor for a tokenizer . b is the string that is to be
tokenized . It is not copied , and should not be freed by the caller
until after the tokenizer is destroyed .
2005-09-20 13:26:39 +00:00
2012-11-22 01:48:35 +00:00
\ param b The string to tokenize
\ param flags Flags to the tokenizer . Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
to accept incomplete tokens , such as a subshell without a closing
parenthesis , as a valid token . Setting TOK_SHOW_COMMENTS will return comments as tokens
2012-11-18 10:23:22 +00:00
2012-11-22 01:48:35 +00:00
*/
tokenizer_t ( const wchar_t * b , tok_flags_t flags ) ;
2015-07-26 06:05:47 +00:00
/** Returns the next token by reference. Returns true if we got one, false if we're at the end. */
bool next ( struct tok_t * result ) ;
2012-11-22 01:48:35 +00:00
} ;
2005-09-20 13:26:39 +00:00
/**
Returns only the first token from the specified string . This is a
convenience function , used to retrieve the first token of a
string . This can be useful for error messages , etc .
2012-11-22 06:23:48 +00:00
On failure , returns the empty string .
2005-09-20 13:26:39 +00:00
*/
2015-07-26 07:58:32 +00:00
wcstring tok_first ( const wcstring & str ) ;
2005-09-20 13:26:39 +00:00
2013-12-23 22:53:56 +00:00
/* Helper function to determine redirection type from a string, or TOK_NONE if the redirection is invalid.
   Also returns the fd by reference through out_fd, which defaults to NULL. */
enum token_type redirection_type_for_string(const wcstring &str, int *out_fd = NULL);

/* Helper function to determine which fd is redirected by a pipe */
int fd_redirected_by_pipe(const wcstring &str);

/* Helper function to return oflags (as in open(2)) for a redirection type */
int oflags_for_redirection_type(enum token_type type);
2013-10-13 23:58:40 +00:00
2012-12-21 01:37:09 +00:00
/* Word-boundary styles accepted by the move_word_state_machine_t
   constructor. Enumerators are numbered in declaration order from 0. */
enum move_word_style_t
{
    move_word_style_punctuation,     // stop at punctuation
    move_word_style_path_components, // stops at path components
    move_word_style_whitespace       // stops at whitespace
};
2006-10-07 00:56:25 +00:00
2012-12-11 00:23:08 +00:00
/* Our state machine that implements "one word" movement or erasure. */
class move_word_state_machine_t
{
2012-12-21 01:37:09 +00:00
private :
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
bool consume_char_punctuation ( wchar_t c ) ;
bool consume_char_path_components ( wchar_t c ) ;
bool is_path_component_character ( wchar_t c ) ;
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
bool consume_char_whitespace ( wchar_t c ) ;
2012-12-22 20:21:31 +00:00
2012-12-21 01:37:09 +00:00
int state ;
move_word_style_t style ;
2012-12-11 00:23:08 +00:00
public :
2012-12-21 01:37:09 +00:00
2016-02-28 03:38:15 +00:00
explicit move_word_state_machine_t ( move_word_style_t st ) ;
2012-12-11 00:23:08 +00:00
bool consume_char ( wchar_t c ) ;
2012-12-21 01:37:09 +00:00
void reset ( ) ;
2012-12-11 00:23:08 +00:00
} ;
2005-10-04 15:11:39 +00:00
# endif