2016-05-03 21:35:12 +00:00
|
|
|
// A specialized tokenizer for tokenizing the fish language. In the future, the tokenizer should be
// extended to support marks, tokenizing multiple strings and disposing of unused string segments.
|
2005-10-04 15:11:39 +00:00
|
|
|
#ifndef FISH_TOKENIZER_H
|
|
|
|
#define FISH_TOKENIZER_H
|
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
#include <stddef.h>
|
2022-08-21 06:14:48 +00:00
|
|
|
#include <stdint.h>
|
2016-04-21 06:00:54 +00:00
|
|
|
|
2012-11-22 06:09:35 +00:00
|
|
|
#include "common.h"
|
2018-02-23 22:30:15 +00:00
|
|
|
#include "maybe.h"
|
2018-03-12 00:36:10 +00:00
|
|
|
#include "parse_constants.h"
|
2019-12-13 00:44:24 +00:00
|
|
|
#include "redirection.h"
|
2023-02-05 08:35:06 +00:00
|
|
|
#if INCLUDE_RUST_HEADERS
|
|
|
|
#include "tokenizer.rs.h"
|
|
|
|
#endif
|
2005-10-04 15:11:39 +00:00
|
|
|
|
2021-12-21 10:26:41 +00:00
|
|
|
/// Token types. XXX Why this isn't parse_token_type_t, I'm not really sure.
/// Stored in a single byte (underlying type uint8_t).
enum class token_type_t : uint8_t {
    error,       ///< Error reading token
    string,      ///< String token
    pipe,        ///< Pipe token
    andand,      ///< && token
    oror,        ///< || token
    end,         ///< End token (semicolon or newline, not literal end)
    redirect,    ///< redirection token
    background,  ///< send job to bg token
    comment,     ///< comment token
};
|
2006-10-07 00:56:25 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Flag telling the tokenizer to accept incomplete parameters, i.e. parameters with mismatched
/// parentheses, etc. This is useful for tab-completion.
#define TOK_ACCEPT_UNFINISHED 1

/// Flag telling the tokenizer not to remove comments. Useful for syntax highlighting.
#define TOK_SHOW_COMMENTS 2

/// Ordinarily, the tokenizer ignores newlines following a newline, or a semicolon. This flag tells
/// the tokenizer to return each of them as a separate END.
#define TOK_SHOW_BLANK_LINES 4

/// Make an effort to continue after an error.
#define TOK_CONTINUE_AFTER_ERROR 8

/// Integer type carrying the TOK_* flags above. Each flag occupies its own bit, so several flags
/// can be combined in one value.
using tok_flags_t = unsigned int;
|
2005-09-20 13:26:39 +00:00
|
|
|
|
2021-12-21 10:26:41 +00:00
|
|
|
/// Error codes that can be attached to a token (see tok_t::error). `none` is the value used when
/// there is no error. Stored in a single byte (underlying type uint8_t).
enum class tokenizer_error_t : uint8_t {
    none,
    unterminated_quote,
    unterminated_subshell,
    unterminated_slice,
    unterminated_escape,
    invalid_redirect,
    invalid_pipe,
    invalid_pipe_ampersand,
    closing_unopened_subshell,
    illegal_slice,
    closing_unopened_brace,
    unterminated_brace,
    expected_pclose_found_bclose,
    expected_bclose_found_pclose,
};
|
|
|
|
|
|
|
|
/// Get the error message for an error \p err.
|
2019-03-14 22:12:14 +00:00
|
|
|
const wchar_t *tokenizer_get_error_message(tokenizer_error_t err);
|
2018-09-28 01:25:49 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
struct tok_t {
|
2018-02-23 22:30:15 +00:00
|
|
|
// Offset of the token.
|
2021-12-21 10:26:41 +00:00
|
|
|
source_offset_t offset{0};
|
2018-02-23 22:30:15 +00:00
|
|
|
// Length of the token.
|
2021-12-21 10:26:41 +00:00
|
|
|
source_offset_t length{0};
|
2018-02-23 22:30:15 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
// If an error, this is the offset of the error within the token. A value of 0 means it occurred
|
|
|
|
// at 'offset'.
|
2021-12-21 10:26:41 +00:00
|
|
|
source_offset_t error_offset_within_token{SOURCE_OFFSET_INVALID};
|
2022-08-09 17:33:40 +00:00
|
|
|
source_offset_t error_length{0};
|
2021-12-21 09:27:17 +00:00
|
|
|
|
|
|
|
// If an error, this is the error code.
|
|
|
|
tokenizer_error_t error{tokenizer_error_t::none};
|
|
|
|
|
2021-12-21 10:26:41 +00:00
|
|
|
// The type of the token.
|
|
|
|
token_type_t type;
|
|
|
|
|
2019-10-13 23:06:16 +00:00
|
|
|
// Construct from a token type.
|
|
|
|
explicit tok_t(token_type_t type);
|
2019-10-29 12:32:26 +00:00
|
|
|
|
|
|
|
/// Returns whether the given location is within the source range or at its end.
|
|
|
|
bool location_in_or_at_end_of_source_range(size_t loc) const {
|
|
|
|
return offset <= loc && loc - offset <= length;
|
|
|
|
}
|
|
|
|
/// Gets source for the token, or the empty string if it has no source.
|
2020-09-27 00:21:22 +00:00
|
|
|
wcstring get_source(const wcstring &str) const { return wcstring(str, offset, length); }
|
2015-07-26 06:05:47 +00:00
|
|
|
};
|
2021-12-21 09:27:17 +00:00
|
|
|
static_assert(sizeof(tok_t) <= 32, "tok_t expected to be 32 bytes or less");
|
2015-07-26 06:05:47 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// The tokenizer struct.
|
2021-07-22 17:43:25 +00:00
|
|
|
class tokenizer_t : noncopyable_t {
|
2016-05-03 21:35:12 +00:00
|
|
|
/// A pointer into the original string, showing where the next token begins.
|
2019-11-09 00:40:15 +00:00
|
|
|
const wchar_t *token_cursor;
|
2018-02-19 23:10:10 +00:00
|
|
|
/// The start of the original string.
|
|
|
|
const wchar_t *const start;
|
2018-02-23 22:30:15 +00:00
|
|
|
/// Whether we have additional tokens.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool has_next{true};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether incomplete tokens are accepted.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool accept_unfinished{false};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether comments should be returned.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool show_comments{false};
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Whether all blank lines are returned.
|
2018-02-19 23:10:10 +00:00
|
|
|
bool show_blank_lines{false};
|
2019-10-27 23:08:49 +00:00
|
|
|
/// Whether to attempt to continue after an error.
|
|
|
|
bool continue_after_error{false};
|
2018-03-12 13:35:09 +00:00
|
|
|
/// Whether to continue the previous line after the comment.
|
|
|
|
bool continue_line_after_comment{false};
|
2016-05-03 21:35:12 +00:00
|
|
|
|
2018-09-28 01:25:49 +00:00
|
|
|
tok_t call_error(tokenizer_error_t error_type, const wchar_t *token_start,
|
2022-08-21 21:51:33 +00:00
|
|
|
const wchar_t *error_loc, maybe_t<size_t> token_length = {},
|
|
|
|
size_t error_len = 0);
|
2018-02-23 22:30:15 +00:00
|
|
|
tok_t read_string();
|
2016-05-03 21:35:12 +00:00
|
|
|
|
|
|
|
public:
|
|
|
|
/// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
|
|
|
|
/// should not be freed by the caller until after the tokenizer is destroyed.
|
|
|
|
///
|
|
|
|
/// \param b The string to tokenize
|
|
|
|
/// \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
|
|
|
|
/// to accept incomplete tokens, such as a subshell without a closing parenthesis, as a valid
|
|
|
|
/// token. Setting TOK_SHOW_COMMENTS will return comments as tokens
|
2019-11-19 00:54:36 +00:00
|
|
|
tokenizer_t(const wchar_t *start, tok_flags_t flags);
|
2016-05-03 21:35:12 +00:00
|
|
|
|
2019-10-13 23:06:16 +00:00
|
|
|
/// Returns the next token, or none() if we are at the end.
|
|
|
|
maybe_t<tok_t> next();
|
2018-02-23 22:30:15 +00:00
|
|
|
|
|
|
|
/// Returns the text of a token, as a string.
|
|
|
|
wcstring text_of(const tok_t &tok) const { return wcstring(start + tok.offset, tok.length); }
|
2018-02-23 23:58:13 +00:00
|
|
|
|
|
|
|
/// Copies a token's text into a string. This is useful for reusing storage.
|
|
|
|
/// Returns a reference to the string.
|
|
|
|
const wcstring ©_text_of(const tok_t &tok, wcstring *result) {
|
|
|
|
return result->assign(start + tok.offset, tok.length);
|
|
|
|
}
|
2012-11-22 01:48:35 +00:00
|
|
|
};
|
2005-09-20 13:26:39 +00:00
|
|
|
|
2022-02-04 20:44:45 +00:00
|
|
|
/// Tests if this character can delimit tokens.
|
2022-04-16 17:45:38 +00:00
|
|
|
bool is_token_delimiter(wchar_t c, maybe_t<wchar_t> next);
|
2022-02-04 20:44:45 +00:00
|
|
|
|
2022-03-20 21:48:44 +00:00
|
|
|
/// \return the first token from the string, skipping variable assignments like A=B.
|
2020-02-23 23:16:12 +00:00
|
|
|
wcstring tok_command(const wcstring &str);
|
|
|
|
|
2019-10-14 20:20:31 +00:00
|
|
|
/// Struct wrapping up a parsed pipe or redirection.
|
|
|
|
struct pipe_or_redir_t {
|
|
|
|
// The redirected fd, or -1 on overflow.
|
2019-12-11 00:14:34 +00:00
|
|
|
// In the common case of a pipe, this is 1 (STDOUT_FILENO).
|
2019-10-14 20:20:31 +00:00
|
|
|
// For example, in the case of "3>&1" this will be 3.
|
2019-12-11 00:14:34 +00:00
|
|
|
int fd{-1};
|
2013-12-23 22:53:56 +00:00
|
|
|
|
2019-10-14 20:20:31 +00:00
|
|
|
// Whether we are a pipe (true) or redirection (false).
|
|
|
|
bool is_pipe{false};
|
2013-12-29 00:18:38 +00:00
|
|
|
|
2019-10-14 20:20:31 +00:00
|
|
|
// The redirection mode if the type is redirect.
|
|
|
|
// Ignored for pipes.
|
|
|
|
redirection_mode_t mode{redirection_mode_t::overwrite};
|
|
|
|
|
2019-10-14 22:45:40 +00:00
|
|
|
// Whether, in addition to this redirection, stderr should also be dup'd to stdout
|
|
|
|
// For example &| or &>
|
|
|
|
bool stderr_merge{false};
|
|
|
|
|
2019-10-14 20:20:31 +00:00
|
|
|
// Number of characters consumed when parsing the string.
|
|
|
|
size_t consumed{0};
|
|
|
|
|
|
|
|
// Construct from a string.
|
|
|
|
static maybe_t<pipe_or_redir_t> from_string(const wchar_t *buff);
|
|
|
|
static maybe_t<pipe_or_redir_t> from_string(const wcstring &buff) {
|
|
|
|
return from_string(buff.c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
// \return the oflags (as in open(2)) for this redirection.
|
|
|
|
int oflags() const;
|
|
|
|
|
|
|
|
// \return if we are "valid". Here "valid" means only that the source fd did not overflow.
|
|
|
|
// For example 99999999999> is invalid.
|
|
|
|
bool is_valid() const { return fd >= 0; }
|
|
|
|
|
|
|
|
// \return the token type for this redirection.
|
|
|
|
token_type_t token_type() const {
|
|
|
|
return is_pipe ? token_type_t::pipe : token_type_t::redirect;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
pipe_or_redir_t();
|
|
|
|
};
|
2013-10-13 23:58:40 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// The styles of word movement; selects how move_word_state_machine_t treats characters.
enum move_word_style_t {
    move_word_style_punctuation,      // stop at punctuation
    move_word_style_path_components,  // stops at path components
    move_word_style_whitespace        // stops at whitespace
};
|
2006-10-07 00:56:25 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
/// Our state machine that implements "one word" movement or erasure.
|
|
|
|
class move_word_state_machine_t {
|
|
|
|
private:
|
2012-12-21 01:37:09 +00:00
|
|
|
bool consume_char_punctuation(wchar_t c);
|
|
|
|
bool consume_char_path_components(wchar_t c);
|
|
|
|
bool is_path_component_character(wchar_t c);
|
Add 'bigword' vi key bindings
- Add four new functions: forward-bigword, backward-bigword,
kill-bigword, backward-kill-bigword
- Add new enum move_word_style_whitespace and related state machine
method
- Change vi key bindings to operate on bigwords: B, gE, W, E, dW, diW,
daW, dE, dB, dgE, cW, ciW, caW, cE, cB, cgE, yW, yiW, yaW, yE, yB,
ygE
2015-05-30 22:44:25 +00:00
|
|
|
bool consume_char_whitespace(wchar_t c);
|
2012-12-22 20:21:31 +00:00
|
|
|
|
2012-12-21 01:37:09 +00:00
|
|
|
int state;
|
|
|
|
move_word_style_t style;
|
2012-12-11 00:23:08 +00:00
|
|
|
|
2016-05-03 21:35:12 +00:00
|
|
|
public:
|
2019-11-19 00:54:36 +00:00
|
|
|
explicit move_word_state_machine_t(move_word_style_t syl);
|
2012-12-11 00:23:08 +00:00
|
|
|
bool consume_char(wchar_t c);
|
2012-12-21 01:37:09 +00:00
|
|
|
void reset();
|
2012-12-11 00:23:08 +00:00
|
|
|
};
|
|
|
|
|
2005-10-04 15:11:39 +00:00
|
|
|
#endif
|