2016-05-03 21:35:12 +00:00
|
|
|
// A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be
// extended to support marks, tokenizing multiple strings and disposing of unused string segments.
|
2005-10-04 15:11:39 +00:00
|
|
|
#ifndef FISH_TOKENIZER_H
|
|
|
|
#define FISH_TOKENIZER_H
|
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
#include <stddef.h>
|
2016-04-21 06:00:54 +00:00
|
|
|
|
2012-11-22 06:09:35 +00:00
|
|
|
#include "common.h"
|
2005-10-04 15:11:39 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Token types.
///
/// The enumerators keep their original order, so their numeric values are unchanged.
enum token_type {
    TOK_NONE,             ///< Tokenizer not yet constructed
    TOK_ERROR,            ///< Error reading token
    TOK_STRING,           ///< String token
    TOK_PIPE,             ///< Pipe token
    TOK_END,              ///< End token (semicolon or newline, not literal end)
    TOK_REDIRECT_OUT,     ///< output redirection token
    TOK_REDIRECT_APPEND,  ///< redirection append token
    TOK_REDIRECT_IN,      ///< input redirection token
    TOK_REDIRECT_FD,      ///< redirection to new fd token
    TOK_REDIRECT_NOCLOB,  ///< redirection token, no-clobber variant (going by the name; confirm
                          ///< the exact semantics against the implementation)
    TOK_BACKGROUND,       ///< send job to bg token
    TOK_COMMENT           ///< comment token
};
|
2006-10-07 00:56:25 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Tokenizer error types.
///
/// TOK_ERROR_NONE is the first enumerator (value 0); the remaining enumerators keep their
/// original order, so their numeric values are unchanged.
enum tokenizer_error {
    TOK_ERROR_NONE,
    TOK_UNTERMINATED_QUOTE,
    TOK_UNTERMINATED_SUBSHELL,
    TOK_UNTERMINATED_SLICE,
    TOK_UNTERMINATED_ESCAPE,
    TOK_INVALID_REDIRECT,
    TOK_INVALID_PIPE
};
|
2005-09-20 13:26:39 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Flag telling the tokenizer to accept incomplete parameters, i.e. parameters with mismatching
/// parentheses, etc. This is useful for tab-completion.
#define TOK_ACCEPT_UNFINISHED 1

/// Flag telling the tokenizer not to remove comments. Useful for syntax highlighting.
#define TOK_SHOW_COMMENTS 2

/// Flag telling the tokenizer to not generate error messages, which we need to do when tokenizing
/// off of the main thread (since wgettext is not thread safe).
#define TOK_SQUASH_ERRORS 4

/// Ordinarily, the tokenizer ignores newlines following a newline, or a semicolon. This flag tells
/// the tokenizer to return each of them as a separate END.
#define TOK_SHOW_BLANK_LINES 8

/// Type used to pass the TOK_* flags above to the tokenizer. Each flag is a distinct bit, so
/// several flags can be combined with bitwise OR.
typedef unsigned int tok_flags_t;
|
2005-09-20 13:26:39 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
struct tok_t {
|
|
|
|
// The text of the token, or an error message for type error.
|
2015-07-26 06:05:47 +00:00
|
|
|
wcstring text;
|
2016-05-03 21:35:12 +00:00
|
|
|
// The type of the token.
|
2015-07-26 06:05:47 +00:00
|
|
|
token_type type;
|
2016-05-03 21:35:12 +00:00
|
|
|
// If an error, this is the error code.
|
2015-07-26 07:12:36 +00:00
|
|
|
enum tokenizer_error error;
|
2016-05-03 21:35:12 +00:00
|
|
|
// If an error, this is the offset of the error within the token. A value of 0 means it occurred
|
|
|
|
// at 'offset'.
|
2015-08-11 01:30:44 +00:00
|
|
|
size_t error_offset;
|
2016-05-03 21:35:12 +00:00
|
|
|
// Offset of the token.
|
2015-07-26 06:05:47 +00:00
|
|
|
size_t offset;
|
2016-05-03 21:35:12 +00:00
|
|
|
// Length of the token.
|
2015-07-26 06:05:47 +00:00
|
|
|
size_t length;
|
2016-05-03 21:35:12 +00:00
|
|
|
|
2015-08-11 01:30:44 +00:00
|
|
|
tok_t() : type(TOK_NONE), error(TOK_ERROR_NONE), error_offset(-1), offset(-1), length(-1) {}
|
2015-07-26 06:05:47 +00:00
|
|
|
};
|
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// The tokenizer struct.
|
|
|
|
class tokenizer_t {
|
|
|
|
// No copying, etc.
|
2018-02-19 23:10:10 +00:00
|
|
|
tokenizer_t(const tokenizer_t &) = delete;
|
|
|
|
void operator=(const tokenizer_t &) = delete;
|
2015-07-26 07:58:32 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// A pointer into the original string, showing where the next token begins.
|
2012-11-19 00:30:30 +00:00
|
|
|
const wchar_t *buff;
|
2018-02-19 23:10:10 +00:00
|
|
|
/// The start of the original string.
|
|
|
|
const wchar_t *const start;
|
2016-05-03 21:35:12 +00:00
|
|
|
/// The last token.
|
2012-11-22 06:09:35 +00:00
|
|
|
wcstring last_token;
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Type of last token.
|
2018-02-19 23:10:10 +00:00
|
|
|
enum token_type last_type { TOK_NONE };
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Offset of last token.
|
2018-02-19 23:10:10 +00:00
|
|
|
size_t last_pos{0};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether there are more tokens.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool has_next{true};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether incomplete tokens are accepted.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool accept_unfinished{false};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether comments should be returned.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool show_comments{false};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether all blank lines are returned.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool show_blank_lines{false};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Last error.
|
2018-02-19 23:10:10 +00:00
|
|
|
tokenizer_error error{TOK_ERROR_NONE};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Last error offset, in "global" coordinates (relative to orig_buff).
|
2018-02-19 23:10:10 +00:00
|
|
|
size_t global_error_offset{size_t(-1)};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether we are squashing errors.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool squash_errors{false};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether to continue the previous line after the comment.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool continue_line_after_comment{false};
|
2016-05-03 21:35:12 +00:00
|
|
|
|
2018-02-20 00:31:39 +00:00
|
|
|
void call_error(enum tokenizer_error error_type, const wchar_t *where);
|
2015-07-26 07:58:32 +00:00
|
|
|
void read_string();
|
2018-02-19 23:10:10 +00:00
|
|
|
bool tok_next();
|
2016-05-03 21:35:12 +00:00
|
|
|
|
|
|
|
public:
|
|
|
|
/// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
|
|
|
|
/// should not be freed by the caller until after the tokenizer is destroyed.
|
|
|
|
///
|
|
|
|
/// \param b The string to tokenize
|
|
|
|
/// \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
|
|
|
|
/// to accept incomplete tokens, such as a subshell without a closing parenthesis, as a valid
|
|
|
|
/// token. Setting TOK_SHOW_COMMENTS will return comments as tokens
|
2012-11-22 01:48:35 +00:00
|
|
|
tokenizer_t(const wchar_t *b, tok_flags_t flags);
|
2016-05-03 21:35:12 +00:00
|
|
|
|
|
|
|
/// Returns the next token by reference. Returns true if we got one, false if we're at the end.
|
2015-07-26 06:05:47 +00:00
|
|
|
bool next(struct tok_t *result);
|
2012-11-22 01:48:35 +00:00
|
|
|
};
|
2005-09-20 13:26:39 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Returns only the first token from the specified string. This is a convenience function, used to
|
|
|
|
/// retrieve the first token of a string. This can be useful for error messages, etc. On failure,
|
|
|
|
/// returns the empty string.
|
2015-07-26 07:58:32 +00:00
|
|
|
wcstring tok_first(const wcstring &str);
|
2005-09-20 13:26:39 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Helper function to determine redirection type from a string, or TOK_NONE if the redirection is
|
|
|
|
/// invalid. Also returns the fd by reference.
|
2013-12-23 22:53:56 +00:00
|
|
|
enum token_type redirection_type_for_string(const wcstring &str, int *out_fd = NULL);
|
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Helper function to determine which fd is redirected by a pipe.
|
2013-12-29 00:18:38 +00:00
|
|
|
int fd_redirected_by_pipe(const wcstring &str);
|
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Helper function to return oflags (as in open(2)) for a redirection type.
|
2013-12-23 22:53:56 +00:00
|
|
|
int oflags_for_redirection_type(enum token_type type);
|
2013-10-13 23:58:40 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// The word-boundary styles understood by move_word_state_machine_t; one is chosen when the state
/// machine is constructed.
enum move_word_style_t {
    move_word_style_punctuation,      // stop at punctuation
    move_word_style_path_components,  // stops at path components
    move_word_style_whitespace        // stops at whitespace
};
|
2006-10-07 00:56:25 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Our state machine that implements "one word" movement or erasure.
|
|
|
|
class move_word_state_machine_t {
|
|
|
|
private:
|
2012-12-21 01:37:09 +00:00
|
|
|
bool consume_char_punctuation(wchar_t c);
|
|
|
|
bool consume_char_path_components(wchar_t c);
|
|
|
|
bool is_path_component_character(wchar_t c);
|
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
|
|
|
bool consume_char_whitespace(wchar_t c);
|
2012-12-22 20:21:31 +00:00
|
|
|
|
2012-12-21 01:37:09 +00:00
|
|
|
int state;
|
|
|
|
move_word_style_t style;
|
2012-12-11 00:23:08 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
public:
|
2016-02-28 03:38:15 +00:00
|
|
|
explicit move_word_state_machine_t(move_word_style_t st);
|
2012-12-11 00:23:08 +00:00
|
|
|
bool consume_char(wchar_t c);
|
2012-12-21 01:37:09 +00:00
|
|
|
void reset();
|
2012-12-11 00:23:08 +00:00
|
|
|
};
|
|
|
|
|
2005-10-04 15:11:39 +00:00
|
|
|
#endif
|