Make tokenizer delimiter errors one long

This makes the awkward case

	    fish: Unexpected end of string, square brackets do not match
	    echo f[oo # not valid, no matching ]
	          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~^

(that `]` is simply the last character on the line, it's firmly in a comment)

less awkward by only marking the starting brace.

The implementation here is awkward mostly because the tok_t
communicates two things: The error location and how to carry on.

So we need to store the error length separately, and this is the first
time we've done so.

It's possible we can make this simpler.
This commit is contained in:
Fabian Boehm 2022-08-09 19:33:40 +02:00
parent bf47d469d4
commit 5ef457cfd3
3 changed files with 25 additions and 14 deletions

View file

@ -188,7 +188,7 @@ class token_stream_t {
// Skip invalid tokens that have a zero length, especially if they are at EOF.
if (subtoken_offset < result.source_length) {
result.source_start += subtoken_offset;
result.source_length -= subtoken_offset;
result.source_length = token.error_length;
}
}

View file

@ -57,7 +57,7 @@ const wchar_t *tokenizer_get_error_message(tokenizer_error_t err) {
/// Return an error token and mark that we no longer have a next token.
tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token_start,
const wchar_t *error_loc, maybe_t<size_t> token_length) {
const wchar_t *error_loc, maybe_t<size_t> token_length, size_t error_len) {
assert(error_type != tokenizer_error_t::none && "tokenizer_error_t::none passed to call_error");
assert(error_loc >= token_start && "Invalid error location");
assert(this->token_cursor >= token_start && "Invalid buff location");
@ -77,6 +77,7 @@ tok_t tokenizer_t::call_error(tokenizer_error_t error_type, const wchar_t *token
// If we are passed a token_length, then use it; otherwise infer it from the buffer.
result.length = token_length ? *token_length : this->token_cursor - token_start;
result.error_offset_within_token = error_loc - token_start;
result.error_length = error_len;
return result;
}
@ -197,11 +198,11 @@ tok_t tokenizer_t::read_string() {
} else if (c == L')') {
if (!expecting.empty() && expecting.back() == L'}') {
return this->call_error(tokenizer_error_t::expected_bclose_found_pclose,
this->token_cursor, this->token_cursor, 1);
this->token_cursor, this->token_cursor, 1, 1);
}
if (paran_offsets.empty()) {
return this->call_error(tokenizer_error_t::closing_unopened_subshell,
this->token_cursor, this->token_cursor, 1);
this->token_cursor, this->token_cursor, 1, 1);
}
paran_offsets.pop_back();
if (paran_offsets.empty()) {
@ -224,7 +225,7 @@ tok_t tokenizer_t::read_string() {
} else if (c == L'}') {
if (!expecting.empty() && expecting.back() == L')') {
return this->call_error(tokenizer_error_t::expected_pclose_found_bclose,
this->token_cursor, this->token_cursor, 1);
this->token_cursor, this->token_cursor, 1, 1);
}
if (brace_offsets.empty()) {
return this->call_error(tokenizer_error_t::closing_unopened_brace,
@ -254,7 +255,7 @@ tok_t tokenizer_t::read_string() {
if (const wchar_t *error_loc = process_opening_quote(c)) {
if (!this->accept_unfinished) {
return this->call_error(tokenizer_error_t::unterminated_quote, buff_start,
error_loc);
error_loc, none(), 1);
}
break;
}
@ -277,24 +278,28 @@ tok_t tokenizer_t::read_string() {
}
if (!this->accept_unfinished && (mode != tok_modes::regular_text)) {
// These are all "unterminated", so the only char we can mark as an error
// is the opener (the closing char could be anywhere!)
//
// (except for char_escape, which is one long by definition)
if (mode & tok_modes::char_escape) {
return this->call_error(tokenizer_error_t::unterminated_escape, buff_start,
this->token_cursor - 1, 1);
this->token_cursor - 1, none(), 1);
} else if (mode & tok_modes::array_brackets) {
return this->call_error(tokenizer_error_t::unterminated_slice, buff_start,
this->start + slice_offset);
this->start + slice_offset, none(), 1);
} else if (mode & tok_modes::subshell) {
assert(!paran_offsets.empty());
size_t offset_of_open_paran = paran_offsets.back();
return this->call_error(tokenizer_error_t::unterminated_subshell, buff_start,
this->start + offset_of_open_paran);
this->start + offset_of_open_paran, none(), 1);
} else if (mode & tok_modes::curly_braces) {
assert(!brace_offsets.empty());
size_t offset_of_open_brace = brace_offsets.back();
return this->call_error(tokenizer_error_t::unterminated_brace, buff_start,
this->start + offset_of_open_brace);
this->start + offset_of_open_brace, none(), 1);
} else {
DIE("Unknown non-regular-text mode");
}
@ -591,7 +596,7 @@ maybe_t<tok_t> tokenizer_t::next() {
} else if (this->token_cursor[1] == L'&') {
// |& is a bashism; in fish it's &|.
return this->call_error(tokenizer_error_t::invalid_pipe_ampersand,
this->token_cursor, this->token_cursor, 2);
this->token_cursor, this->token_cursor, 2, 2);
} else {
auto pipe = pipe_or_redir_t::from_string(this->token_cursor);
assert(pipe.has_value() && pipe->is_pipe &&
@ -612,7 +617,9 @@ maybe_t<tok_t> tokenizer_t::next() {
if (!redir_or_pipe || redir_or_pipe->fd < 0) {
return this->call_error(tokenizer_error_t::invalid_redirect, this->token_cursor,
this->token_cursor,
redir_or_pipe ? redir_or_pipe->consumed : 0);
redir_or_pipe ? redir_or_pipe->consumed : 0,
redir_or_pipe ? redir_or_pipe->consumed : 0
);
}
result.emplace(redir_or_pipe->token_type());
result->offset = start_pos;
@ -634,7 +641,10 @@ maybe_t<tok_t> tokenizer_t::next() {
// tokenizer error.
if (redir_or_pipe->is_pipe && redir_or_pipe->fd == 0) {
return this->call_error(tokenizer_error_t::invalid_pipe, error_location,
error_location, redir_or_pipe->consumed);
error_location,
redir_or_pipe->consumed,
redir_or_pipe->consumed
);
}
result.emplace(redir_or_pipe->token_type());
result->offset = start_pos;

View file

@ -68,6 +68,7 @@ struct tok_t {
// If an error, this is the offset of the error within the token. A value of 0 means it occurred
// at 'offset'.
source_offset_t error_offset_within_token{SOURCE_OFFSET_INVALID};
source_offset_t error_length{0};
// If an error, this is the error code.
tokenizer_error_t error{tokenizer_error_t::none};
@ -107,7 +108,7 @@ class tokenizer_t : noncopyable_t {
bool continue_line_after_comment{false};
tok_t call_error(tokenizer_error_t error_type, const wchar_t *token_start,
const wchar_t *error_loc, maybe_t<size_t> token_length = {});
const wchar_t *error_loc, maybe_t<size_t> token_length = {}, size_t error_len = 0);
tok_t read_string();
public: