From 5ef457cfd37438f03806024df7f9f1aea890fc0f Mon Sep 17 00:00:00 2001 From: Fabian Boehm Date: Tue, 9 Aug 2022 19:33:40 +0200 Subject: [PATCH] Make tokenizer delimiter errors one long This makes the awkward case fish: Unexpected end of string, square brackets do not match echo f[oo # not valid, no matching ] ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ (that `]` is simply the last character on the line, it's firmly in a comment) less awkward by only marking the starting brace. The implementation here is awkward mostly because the tok_t communicates two things: The error location and how to carry on. So we need to store the error length separately, and this is the first time we've done so. It's possible we can make this simpler. --- src/ast.cpp | 2 +- src/tokenizer.cpp | 34 ++++++++++++++++++++++------------ src/tokenizer.h | 3 ++- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/ast.cpp b/src/ast.cpp index 96a3f4cc9..2cb1389f6 100644 --- a/src/ast.cpp +++ b/src/ast.cpp @@ -188,7 +188,7 @@ class token_stream_t { // Skip invalid tokens that have a zero length, especially if they are at EOF. if (subtoken_offset < result.source_length) { result.source_start += subtoken_offset; - result.source_length -= subtoken_offset; + result.source_length = token.error_length; } } diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 11e6f2948..c96d4a846 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -57,7 +57,7 @@ const wchar_t *tokenizer_get_error_message(tokenizer_error_t err) { /// Return an error token and mark that we no longer have a next token. 
tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token_start, - const wchar_t *error_loc, maybe_t<size_t> token_length) { + const wchar_t *error_loc, maybe_t<size_t> token_length, size_t error_len) { assert(error_type != tokenizer_error_t::none && "tokenizer_error_t::none passed to call_error"); assert(error_loc >= token_start && "Invalid error location"); assert(this->token_cursor >= token_start && "Invalid buff location"); @@ -77,6 +77,7 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token // If we are passed a token_length, then use it; otherwise infer it from the buffer. result.length = token_length ? *token_length : this->token_cursor - token_start; result.error_offset_within_token = error_loc - token_start; + result.error_length = error_len; return result; } @@ -197,11 +198,11 @@ tok_t tokenizer_t::read_string() { } else if (c == L')') { if (!expecting.empty() && expecting.back() == L'}') { return this->call_error(tokenizer_error_t::expected_bclose_found_pclose, - this->token_cursor, this->token_cursor, 1); + this->token_cursor, this->token_cursor, 1, 1); } if (paran_offsets.empty()) { return this->call_error(tokenizer_error_t::closing_unopened_subshell, - this->token_cursor, this->token_cursor, 1); + this->token_cursor, this->token_cursor, 1, 1); } paran_offsets.pop_back(); if (paran_offsets.empty()) { @@ -224,7 +225,7 @@ tok_t tokenizer_t::read_string() { } else if (c == L'}') { if (!expecting.empty() && expecting.back() == L')') { return this->call_error(tokenizer_error_t::expected_pclose_found_bclose, - this->token_cursor, this->token_cursor, 1); + this->token_cursor, this->token_cursor, 1, 1); } if (brace_offsets.empty()) { return this->call_error(tokenizer_error_t::closing_unopened_brace, @@ -254,7 +255,7 @@ tok_t tokenizer_t::read_string() { if (const wchar_t *error_loc = process_opening_quote(c)) { if (!this->accept_unfinished) { return this->call_error(tokenizer_error_t::unterminated_quote, buff_start, - 
error_loc); + error_loc, none(), 1); } break; } @@ -277,24 +278,28 @@ tok_t tokenizer_t::read_string() { } if (!this->accept_unfinished && (mode != tok_modes::regular_text)) { + // These are all "unterminated", so the only char we can mark as an error + // is the opener (the closing char could be anywhere!) + // + // (except for char_escape, which is one long by definition) if (mode & tok_modes::char_escape) { return this->call_error(tokenizer_error_t::unterminated_escape, buff_start, - this->token_cursor - 1, 1); + this->token_cursor - 1, none(), 1); } else if (mode & tok_modes::array_brackets) { return this->call_error(tokenizer_error_t::unterminated_slice, buff_start, - this->start + slice_offset); + this->start + slice_offset, none(), 1); } else if (mode & tok_modes::subshell) { assert(!paran_offsets.empty()); size_t offset_of_open_paran = paran_offsets.back(); return this->call_error(tokenizer_error_t::unterminated_subshell, buff_start, - this->start + offset_of_open_paran); + this->start + offset_of_open_paran, none(), 1); } else if (mode & tok_modes::curly_braces) { assert(!brace_offsets.empty()); size_t offset_of_open_brace = brace_offsets.back(); return this->call_error(tokenizer_error_t::unterminated_brace, buff_start, - this->start + offset_of_open_brace); + this->start + offset_of_open_brace, none(), 1); } else { DIE("Unknown non-regular-text mode"); } @@ -591,7 +596,7 @@ maybe_t<tok_t> tokenizer_t::next() { } else if (this->token_cursor[1] == L'&') { // |& is a bashism; in fish it's &|. 
return this->call_error(tokenizer_error_t::invalid_pipe_ampersand, - this->token_cursor, this->token_cursor, 2); + this->token_cursor, this->token_cursor, 2, 2); } else { auto pipe = pipe_or_redir_t::from_string(this->token_cursor); assert(pipe.has_value() && pipe->is_pipe && @@ -612,7 +617,9 @@ maybe_t<tok_t> tokenizer_t::next() { if (!redir_or_pipe || redir_or_pipe->fd < 0) { return this->call_error(tokenizer_error_t::invalid_redirect, this->token_cursor, this->token_cursor, - redir_or_pipe ? redir_or_pipe->consumed : 0); + redir_or_pipe ? redir_or_pipe->consumed : 0, + redir_or_pipe ? redir_or_pipe->consumed : 0 + ); } result.emplace(redir_or_pipe->token_type()); result->offset = start_pos; @@ -634,7 +641,10 @@ maybe_t<tok_t> tokenizer_t::next() { // tokenizer error. if (redir_or_pipe->is_pipe && redir_or_pipe->fd == 0) { return this->call_error(tokenizer_error_t::invalid_pipe, error_location, - error_location, redir_or_pipe->consumed); + error_location, + redir_or_pipe->consumed, + redir_or_pipe->consumed + ); } result.emplace(redir_or_pipe->token_type()); result->offset = start_pos; diff --git a/src/tokenizer.h b/src/tokenizer.h index 8dcc2bddb..a86ef1849 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -68,6 +68,7 @@ struct tok_t { // If an error, this is the offset of the error within the token. A value of 0 means it occurred // at 'offset'. source_offset_t error_offset_within_token{SOURCE_OFFSET_INVALID}; + source_offset_t error_length{0}; // If an error, this is the error code. tokenizer_error_t error{tokenizer_error_t::none}; @@ -107,7 +108,7 @@ class tokenizer_t : noncopyable_t { bool continue_line_after_comment{false}; tok_t call_error(tokenizer_error_t error_type, const wchar_t *token_start, - const wchar_t *error_loc, maybe_t<size_t> token_length = {}); + const wchar_t *error_loc, maybe_t<size_t> token_length = {}, size_t error_len = 0); tok_t read_string(); public: