Clean up tokenizer implementation

Rather than storing a bunch of "next_foo" fields, simply populate the
tok_t directly.
ridiculousfish 2018-02-23 14:30:15 -08:00
parent e9a4875a6b
commit 6673fe5457
7 changed files with 122 additions and 130 deletions
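
The shape of the change, as a self-contained sketch (a toy whitespace tokenizer, not the actual fish code): the tokenizer used to stage each token in member fields that next() copied out; now each helper returns a fully populated tok_t, and token text is derived on demand from the source buffer via offset/length.

    #include <cstddef>
    #include <cwctype>
    #include <iostream>
    #include <string>

    struct tok_t {
        size_t offset{0};  // where the token starts in the source buffer
        size_t length{0};  // how many characters it spans
    };

    class toy_tokenizer {
        const wchar_t *const start;  // not copied; the caller keeps it alive, as in fish
        const wchar_t *buff;         // current scan position

       public:
        explicit toy_tokenizer(const wchar_t *s) : start(s), buff(s) {}

        // Populate *result directly; return false at end of input.
        bool next(tok_t *result) {
            while (*buff && std::iswspace(*buff)) buff++;
            if (!*buff) return false;
            const wchar_t *tok_start = buff;
            while (*buff && !std::iswspace(*buff)) buff++;
            result->offset = tok_start - start;
            result->length = buff - tok_start;
            return true;
        }

        // Token text is a slice of the original buffer, not stored per token.
        std::wstring text_of(const tok_t &tok) const {
            return std::wstring(start + tok.offset, tok.length);
        }
    };

    int main() {
        toy_tokenizer t(L"echo hello world");
        tok_t token;
        while (t.next(&token)) std::wcout << t.text_of(token) << L"\n";
    }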

src/builtin_commandline.cpp

@@ -143,10 +143,10 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
     tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED);
     tok_t token;
     while (tok.next(&token)) {
-        if ((cut_at_cursor) && (token.offset + token.text.size() >= pos)) break;
+        if ((cut_at_cursor) && (token.offset + token.length >= pos)) break;

         if (token.type == TOK_STRING) {
-            wcstring tmp = token.text;
+            wcstring tmp = tok.text_of(token);
             unescape_string_in_place(&tmp, UNESCAPE_INCOMPLETE);
             out.append(tmp);
             out.push_back(L'\n');

src/fish_tests.cpp

@@ -519,14 +519,14 @@ static void test_tokenizer() {
     do_test(token.type == TOK_STRING);
     do_test(token.offset == 0);
     do_test(token.length == 5);
-    do_test(token.text == L"alpha");
+    do_test(t.text_of(token) == L"alpha");

     got = t.next(&token);  // beta
     do_test(got);
     do_test(token.type == TOK_STRING);
     do_test(token.offset == 6);
     do_test(token.length == 4);
-    do_test(token.text == L"beta");
+    do_test(t.text_of(token) == L"beta");

     got = t.next(&token);
     do_test(!got);

src/parse_tree.cpp

@@ -477,7 +477,7 @@ class parse_ll_t {
     void accept_tokens(parse_token_t token1, parse_token_t token2);

     /// Report tokenizer errors.
-    void report_tokenizer_error(const tok_t &tok);
+    void report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok);

     /// Indicate if we hit a fatal error.
     bool has_fatal_error() const { return this->fatal_errored; }
@@ -711,7 +711,7 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta
     }
 }

-void parse_ll_t::report_tokenizer_error(const tok_t &tok) {
+void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) {
     parse_error_code_t parse_error_code;
     switch (tok.error) {
         case TOK_UNTERMINATED_QUOTE: {
@@ -738,7 +738,7 @@ void parse_ll_t::report_tokenizer_error(const tok_t &tok) {
         }
     }
     this->parse_error_at_location(tok.offset + tok.error_offset, parse_error_code, L"%ls",
-                                  tok.text.c_str());
+                                  tokenizer.text_of(tok).c_str());
 }

 void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {
@@ -1067,10 +1067,11 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token) {
     // this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it
     // even starts to look like a feature.
     result.type = parse_token_type_from_tokenizer_token(token->type);
-    result.keyword = keyword_for_token(token->type, token->text);
-    result.has_dash_prefix = !token->text.empty() && token->text.at(0) == L'-';
-    result.is_help_argument = result.has_dash_prefix && is_help_argument(token->text);
-    result.is_newline = (result.type == parse_token_type_end && token->text == L"\n");
+    wcstring text = tok->text_of(*token);
+    result.keyword = keyword_for_token(token->type, text);
+    result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
+    result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
+    result.is_newline = (result.type == parse_token_type_end && text == L"\n");

     // These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
     // uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
@@ -1128,7 +1129,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
         // Handle tokenizer errors. This is a hack because really the parser should report this for
         // itself; but it has no way of getting the tokenizer message.
         if (queue[1].type == parse_special_type_tokenizer_error) {
-            parser.report_tokenizer_error(tokenizer_token);
+            parser.report_tokenizer_error(tok, tokenizer_token);
         }

         if (!parser.has_fatal_error()) {

src/parse_util.cpp

@@ -379,7 +379,7 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
         // Calculate end of token.
         if (token.type == TOK_STRING) {
-            tok_end += token.text.size();
+            tok_end += token.length;
         }

         // Cursor was before beginning of this token, means that the cursor is between two tokens,
@@ -393,14 +393,14 @@ void parse_util_token_extent(const wchar_t *buff, size_t cursor_pos, const wchar
         // and break.
         if (token.type == TOK_STRING && tok_end >= offset_within_cmdsubst) {
             a = cmdsubst_begin + token.offset;
-            b = a + token.text.size();
+            b = a + token.length;
             break;
         }

         // Remember previous string token.
         if (token.type == TOK_STRING) {
             pa = cmdsubst_begin + token.offset;
-            pb = pa + token.text.size();
+            pb = pa + token.length;
         }
     }
@@ -479,7 +479,8 @@ void parse_util_get_parameter_info(const wcstring &cmd, const size_t pos, wchar_
     while (tok.next(&token)) {
         if (token.offset > pos) break;
-        if (token.type == TOK_STRING) last_quote = get_quote(token.text, pos - token.offset);
+        if (token.type == TOK_STRING)
+            last_quote = get_quote(tok.text_of(token), pos - token.offset);

         if (out_type != NULL) *out_type = token.type;

src/reader.cpp

@@ -1744,13 +1744,14 @@ static void handle_token_history(history_search_direction_t dir, bool reset = fa
         tok_t token;
         while (tok.next(&token)) {
             if (token.type != TOK_STRING) continue;
-            if (token.text.find(data->search_buff) == wcstring::npos) continue;
+            wcstring text = tok.text_of(token);
+            if (text.find(data->search_buff) == wcstring::npos) continue;
             if (token.offset >= current_pos) continue;

-            auto found = find(data->search_prev.begin(), data->search_prev.end(), token.text);
+            auto found = find(data->search_prev.begin(), data->search_prev.end(), text);
             if (found == data->search_prev.end()) {
                 data->token_history_pos = token.offset;
-                str = token.text;
+                str = text;
             }
         }
     }

src/tokenizer.cpp

@@ -34,39 +34,46 @@
 /// Error string for when trying to pipe from fd 0.
 #define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output")

-/// Set the latest tokens string to be the specified error message.
-void tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *where) {
+/// Return an error token and mark that we no longer have a next token.
+tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start,
+                              const wchar_t *error_loc) {
     assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error");
-    this->last_type = TOK_ERROR;
-    this->error = error_type;
+    assert(error_loc >= token_start && "Invalid error location");
+    assert(this->buff >= token_start && "Invalid buff location");
     this->has_next = false;
-    this->global_error_offset = where ? where - this->start : 0;
-    if (this->squash_errors) {
-        this->last_token.clear();
-    } else {
+
+    tok_t result;
+    result.type = TOK_ERROR;
+    result.error = error_type;
+    result.offset = token_start - this->start;
+    result.length = this->buff - token_start;
+    result.error_offset = error_loc - token_start;
+    if (!this->squash_errors) {
         switch (error_type) {
             case TOK_UNTERMINATED_QUOTE:
-                this->last_token = QUOTE_ERROR;
+                result.error_text = QUOTE_ERROR;
                 break;
             case TOK_UNTERMINATED_SUBSHELL:
-                this->last_token = PARAN_ERROR;
+                result.error_text = PARAN_ERROR;
                 break;
             case TOK_UNTERMINATED_SLICE:
-                this->last_token = SQUARE_BRACKET_ERROR;
+                result.error_text = SQUARE_BRACKET_ERROR;
                 break;
             case TOK_UNTERMINATED_ESCAPE:
-                this->last_token = UNTERMINATED_ESCAPE_ERROR;
+                result.error_text = UNTERMINATED_ESCAPE_ERROR;
                 break;
             case TOK_INVALID_REDIRECT:
-                this->last_token = REDIRECT_ERROR;
+                result.error_text = REDIRECT_ERROR;
                 break;
             case TOK_INVALID_PIPE:
-                this->last_token = PIPE_ERROR;
+                result.error_text = PIPE_ERROR;
                 break;
             default:
                 assert(0 && "Unknown error type");
         }
     }
+    return result;
 }

 tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
@@ -80,34 +87,11 @@ tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start),
 bool tokenizer_t::next(struct tok_t *result) {
     assert(result != NULL);
-    if (!this->tok_next()) {
+    maybe_t<tok_t> tok = this->tok_next();
+    if (!tok) {
         return false;
     }
-
-    const size_t current_pos = this->buff - this->start;
-
-    // We want to copy our last_token into result->text. If we just do this naively via =, we are
-    // liable to trigger std::string's CoW implementation: result->text's storage will be
-    // deallocated and instead will acquire a reference to last_token's storage. But last_token will
-    // be overwritten soon, which will trigger a new allocation and a copy. So our attempt to re-use
-    // result->text's storage will have failed. To ensure that doesn't happen, use assign() with
-    // wchar_t.
-    result->text.assign(this->last_token.data(), this->last_token.size());
-    result->type = this->last_type;
-    result->offset = this->last_pos;
-    result->error = this->last_type == TOK_ERROR ? this->error : TOK_ERROR_NONE;
-    assert(this->buff >= this->start);
-
-    // Compute error offset.
-    result->error_offset = 0;
-    if (this->last_type == TOK_ERROR && this->global_error_offset >= this->last_pos &&
-        this->global_error_offset < current_pos) {
-        result->error_offset = this->global_error_offset - this->last_pos;
-    }
-
-    assert(this->buff >= this->start);
-    result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
-
+    *result = std::move(*tok);
     return true;
 }
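
fish's maybe_t is a small optional-like type, so the new next()/tok_next() split reduces to the pattern below (a sketch with std::optional standing in for maybe_t):

    #include <optional>
    #include <utility>

    // Drain an optional-returning producer into an out-parameter, mirroring
    // next() wrapping tok_next() above.
    template <typename T, typename Producer>
    bool next_into(T *result, Producer produce) {
        std::optional<T> value = produce();
        if (!value) return false;     // no more tokens
        *result = std::move(*value);  // move the populated token out
        return true;
    }
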
@@ -143,9 +127,8 @@ static bool tok_is_string_character(wchar_t c, bool is_first) {
 static int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }

 /// Read the next token as a string.
-void tokenizer_t::read_string() {
-    long len;
-    int do_loop = 1;
+tok_t tokenizer_t::read_string() {
+    bool do_loop = true;
     size_t paran_count = 0;
     // Up to 96 open parens, before we give up on good error reporting.
     const size_t paran_offsets_max = 96;
@@ -170,8 +153,8 @@ void tokenizer_t::read_string() {
                 this->buff++;
                 if (*this->buff == L'\0') {
                     if ((!this->accept_unfinished)) {
-                        this->call_error(TOK_UNTERMINATED_ESCAPE, error_location);
-                        return;
+                        return this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
+                                                error_location);
                     }
                     // Since we are about to increment tok->buff, decrement it first so the
                     // increment doesn't go past the end of the buffer. See issue #389.
@@ -209,8 +192,8 @@ void tokenizer_t::read_string() {
                     this->buff += wcslen(this->buff);

                     if (!this->accept_unfinished) {
-                        this->call_error(TOK_UNTERMINATED_QUOTE, error_loc);
-                        return;
+                        return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
+                                                error_loc);
                     }
                     do_loop = 0;
                 }
@@ -238,8 +221,8 @@ void tokenizer_t::read_string() {
                     const wchar_t *error_loc = this->buff;
                     this->buff += wcslen(this->buff);
                     if ((!this->accept_unfinished)) {
-                        this->call_error(TOK_UNTERMINATED_QUOTE, error_loc);
-                        return;
+                        return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
+                                                error_loc);
                     }
                     do_loop = 0;
                 }
@@ -305,6 +288,7 @@ void tokenizer_t::read_string() {
     }

     if ((!this->accept_unfinished) && (mode != mode_regular_text)) {
+        tok_t error;
         switch (mode) {
             case mode_subshell: {
                 // Determine the innermost opening paran offset by interrogating paran_offsets.
@@ -314,12 +298,14 @@ void tokenizer_t::read_string() {
                     offset_of_open_paran = paran_offsets[paran_count - 1];
                 }
-                this->call_error(TOK_UNTERMINATED_SUBSHELL, this->start + offset_of_open_paran);
+                error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
+                                         this->start + offset_of_open_paran);
                 break;
             }
             case mode_array_brackets:
             case mode_array_brackets_and_subshell: {
-                this->call_error(TOK_UNTERMINATED_SLICE, this->start + offset_of_bracket);
+                error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
+                                         this->start + offset_of_bracket);
                 break;
             }
             default: {
@@ -327,13 +313,14 @@ void tokenizer_t::read_string() {
                 break;
             }
         }
-        return;
+        return error;
     }

-    len = this->buff - buff_start;
-    this->last_token.assign(buff_start, len);
-    this->last_type = TOK_STRING;
+    tok_t result;
+    result.type = TOK_STRING;
+    result.offset = buff_start - this->start;
+    result.length = this->buff - buff_start;
+    return result;
 }

 /// Reads a redirection or an "fd pipe" (like 2>|) from a string. Returns how many characters were
@@ -482,9 +469,9 @@ static bool iswspace_not_nl(wchar_t c) {
     }
 }

-bool tokenizer_t::tok_next() {
+maybe_t<tok_t> tokenizer_t::tok_next() {
     if (!this->has_next) {
-        return false;
+        return none();
     }

     // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past it.
@@ -510,30 +497,31 @@ bool tokenizer_t::tok_next() {
         // Maybe return the comment.
         if (this->show_comments) {
-            this->last_pos = comment_start - this->start;
-            this->last_token.assign(comment_start, comment_len);
-            this->last_type = TOK_COMMENT;
-            return true;
+            tok_t result;
+            result.type = TOK_COMMENT;
+            result.offset = comment_start - this->start;
+            result.length = comment_len;
+            return result;
         }
         while (iswspace_not_nl(this->buff[0])) this->buff++;
     }

     // We made it past the comments and ate any trailing newlines we wanted to ignore.
     this->continue_line_after_comment = false;

-    this->last_pos = this->buff - this->start;
+    size_t start_pos = this->buff - this->start;
+    tok_t result;
+    result.offset = start_pos;
     switch (*this->buff) {
         case L'\0': {
-            this->last_type = TOK_END;
             this->has_next = false;
-            this->last_token.clear();
-            return false;
+            return none();
         }
         case L'\r':  // carriage-return
         case L'\n':  // newline
         case L';': {
-            this->last_type = TOK_END;
-            this->last_token.assign(1, *this->buff);
+            result.type = TOK_END;
+            result.length = 1;
             this->buff++;
             // Hack: when we get a newline, swallow as many as we can. This compresses multiple
             // subsequent newlines into a single one.
@@ -546,13 +534,15 @@ bool tokenizer_t::tok_next() {
             break;
         }
         case L'&': {
-            this->last_type = TOK_BACKGROUND;
+            result.type = TOK_BACKGROUND;
+            result.length = 1;
             this->buff++;
             break;
         }
         case L'|': {
-            this->last_token = L"1";
-            this->last_type = TOK_PIPE;
+            result.type = TOK_PIPE;
+            result.redirected_fd = 1;
+            result.length = 1;
             this->buff++;
             break;
         }
@@ -565,12 +555,12 @@ bool tokenizer_t::tok_next() {
             int fd = -1;
             size_t consumed = read_redirection_or_fd_pipe(this->buff, &mode, &fd);
             if (consumed == 0 || fd < 0) {
-                this->call_error(TOK_INVALID_REDIRECT, this->buff);
-            } else {
-                this->buff += consumed;
-                this->last_type = mode;
-                this->last_token = to_string(fd);
+                return this->call_error(TOK_INVALID_REDIRECT, this->buff, this->buff);
             }
+            result.type = mode;
+            result.redirected_fd = fd;
+            result.length = consumed;
+            this->buff += consumed;
             break;
         }
         default: {
@@ -588,30 +578,29 @@ bool tokenizer_t::tok_next() {
                 // that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer
                 // error.
                 if (mode == TOK_PIPE && fd == 0) {
-                    this->call_error(TOK_INVALID_PIPE, error_location);
-                } else {
-                    this->buff += consumed;
-                    this->last_type = mode;
-                    this->last_token = to_string(fd);
+                    return this->call_error(TOK_INVALID_PIPE, error_location, error_location);
                 }
+                result.type = mode;
+                result.redirected_fd = fd;
+                result.length = consumed;
+                this->buff += consumed;
             } else {
                 // Not a redirection or pipe, so just a string.
-                this->read_string();
+                result = this->read_string();
             }
             break;
         }
     }
-    return true;
+    return result;
 }

 wcstring tok_first(const wcstring &str) {
-    wcstring result;
-    tokenizer_t t(str.data(), TOK_SQUASH_ERRORS);
+    tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS);
     tok_t token;
     if (t.next(&token) && token.type == TOK_STRING) {
-        result = std::move(token.text);
+        return t.text_of(token);
     }
-    return result;
+    return {};
 }

 bool move_word_state_machine_t::consume_char_punctuation(wchar_t c) {
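
Expected behavior of the rewritten tok_first, as a usage sketch (illustrative inputs; the results follow from the code above):

    wcstring a = tok_first(L"echo hello | cat");  // L"echo"
    wcstring b = tok_first(L"| cat");             // empty: first token is a pipe, not a string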

src/tokenizer.h

@@ -6,6 +6,7 @@
 #include <stddef.h>

 #include "common.h"
+#include "maybe.h"

 /// Token types.
 enum token_type {
@@ -52,21 +53,26 @@ enum tokenizer_error {
 typedef unsigned int tok_flags_t;

 struct tok_t {
-    // The text of the token, or an error message for type error.
-    wcstring text;
     // The type of the token.
-    token_type type;
+    token_type type{TOK_NONE};
+
+    // Offset of the token.
+    size_t offset{0};
+    // Length of the token.
+    size_t length{0};
+
+    // If the token represents a redirection, the redirected fd.
+    maybe_t<int> redirected_fd{};

     // If an error, this is the error code.
-    enum tokenizer_error error;
+    enum tokenizer_error error { TOK_ERROR_NONE };
     // If an error, this is the offset of the error within the token. A value of 0 means it occurred
     // at 'offset'.
-    size_t error_offset;
-    // Offset of the token.
-    size_t offset;
-    // Length of the token.
-    size_t length;
+    size_t error_offset{size_t(-1)};
+    // If there is an error, the text of the error; otherwise empty.
+    wcstring error_text{};

-    tok_t() : type(TOK_NONE), error(TOK_ERROR_NONE), error_offset(-1), offset(-1), length(-1) {}
+    tok_t() = default;
 };

 /// The tokenizer struct.
@@ -79,13 +85,7 @@ class tokenizer_t {
     const wchar_t *buff;
     /// The start of the original string.
     const wchar_t *const start;
-    /// The last token.
-    wcstring last_token;
-    /// Type of last token.
-    enum token_type last_type { TOK_NONE };
-    /// Offset of last token.
-    size_t last_pos{0};
-    /// Whether there are more tokens.
+    /// Whether we have additional tokens.
     bool has_next{true};
     /// Whether incomplete tokens are accepted.
     bool accept_unfinished{false};
@@ -93,18 +93,15 @@ class tokenizer_t {
     bool show_comments{false};
     /// Whether all blank lines are returned.
     bool show_blank_lines{false};
-    /// Last error.
-    tokenizer_error error{TOK_ERROR_NONE};
-    /// Last error offset, in "global" coordinates (relative to orig_buff).
-    size_t global_error_offset{size_t(-1)};
     /// Whether we are squashing errors.
     bool squash_errors{false};
     /// Whether to continue the previous line after the comment.
     bool continue_line_after_comment{false};

-    void call_error(enum tokenizer_error error_type, const wchar_t *where);
-    void read_string();
-    bool tok_next();
+    tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start,
+                     const wchar_t *error_loc);
+    tok_t read_string();
+    maybe_t<tok_t> tok_next();

    public:
     /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
@@ -118,6 +115,9 @@ class tokenizer_t {
     /// Returns the next token by reference. Returns true if we got one, false if we're at the end.
     bool next(struct tok_t *result);
+
+    /// Returns the text of a token, as a string.
+    wcstring text_of(const tok_t &tok) const { return wcstring(start + tok.offset, tok.length); }
 };

 /// Returns only the first token from the specified string. This is a convenience function, used to
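
With tok_t reduced to offset and length, text_of is the one place where token text is materialized. A usage sketch against the class as declared above (hypothetical input):

    tokenizer_t tok(L"echo hi", 0);
    tok_t token;
    while (tok.next(&token)) {
        wcstring text = tok.text_of(token);  // L"echo", then L"hi"
    }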