diff --git a/src/common.cpp b/src/common.cpp index 84d2cdd8e..f3c91df30 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1288,10 +1288,11 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in const bool unescape_special = static_cast(flags & UNESCAPE_SPECIAL); const bool allow_incomplete = static_cast(flags & UNESCAPE_INCOMPLETE); - int bracket_count = 0; + bool brace_text_start = false; + int brace_count = 0; bool errored = false; - enum { mode_unquoted, mode_single_quotes, mode_double_quotes } mode = mode_unquoted; + enum { mode_unquoted, mode_single_quotes, mode_double_quotes, mode_braces } mode = mode_unquoted; for (size_t input_position = 0; input_position < input_len && !errored; input_position++) { const wchar_t c = input[input_position]; @@ -1352,21 +1353,32 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in } case L'{': { if (unescape_special) { - bracket_count++; - to_append_or_none = BRACKET_BEGIN; + brace_count++; + to_append_or_none = BRACE_BEGIN; } break; } case L'}': { if (unescape_special) { - bracket_count--; - to_append_or_none = BRACKET_END; + assert(brace_count > 0 && "imbalanced brackets are a tokenizer error, we shouldn't be able to get here"); + brace_count--; + brace_text_start = brace_text_start && brace_count > 0; + to_append_or_none = BRACE_END; } break; } case L',': { - if (unescape_special && bracket_count > 0) { - to_append_or_none = BRACKET_SEP; + if (unescape_special && brace_count > 0) { + to_append_or_none = BRACE_SEP; + brace_text_start = false; + } + break; + } + case L'\n': + case L'\t': + case L' ': { + if (unescape_special && brace_count > 0) { + to_append_or_none = brace_text_start ? BRACE_SPACE : NOT_A_WCHAR; } break; } @@ -1380,7 +1392,12 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in to_append_or_none = unescape_special ? wint_t(INTERNAL_SEPARATOR) : NOT_A_WCHAR; break; } - default: { break; } + default: { + if (unescape_special && brace_count > 0) { + brace_text_start = true; + } + break; + } } } else if (mode == mode_single_quotes) { if (c == L'\\') { diff --git a/src/common.h b/src/common.h index a5b4bb18e..8047018b5 100644 --- a/src/common.h +++ b/src/common.h @@ -807,6 +807,19 @@ struct enum_map { const wchar_t *const str; }; + +/// Use for scoped enums (i.e. `enum class`) with bitwise operations +#define ENUM_FLAG_OPERATOR(T,X,Y) \ +inline T operator X (T lhs, T rhs) { return (T) (static_cast::type>(lhs) X static_cast::type>(rhs)); } \ +inline T operator Y (T &lhs, T rhs) { return lhs = (T) (static_cast::type>(lhs) X static_cast::type>(rhs)); } +#define ENUM_FLAGS(T) \ +enum class T; \ +inline T operator ~ (T t) { return (T) (~static_cast::type>(t)); } \ +ENUM_FLAG_OPERATOR(T,|,|=) \ +ENUM_FLAG_OPERATOR(T,^,^=) \ +ENUM_FLAG_OPERATOR(T,&,&=) \ +enum class T + /// Given a string return the matching enum. Return the sentinal enum if no match is made. The map /// must be sorted by the `str` member. A binary search is twice as fast as a linear search with 16 /// elements in the map. diff --git a/src/expand.cpp b/src/expand.cpp index 7605ed353..141d4158f 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -47,6 +47,7 @@ #include "proc.h" #include "reader.h" #include "wildcard.h" +#include "wcstringutil.h" #include "wutil.h" // IWYU pragma: keep #ifdef KERN_PROCARGS2 #else @@ -570,7 +571,7 @@ static void find_process(const wchar_t *proc, expand_flags_t flags, static size_t parse_slice(const wchar_t *in, wchar_t **end_ptr, std::vector &idx, std::vector &source_positions, size_t array_size) { const long size = (long)array_size; - size_t pos = 1; // skip past the opening square bracket + size_t pos = 1; // skip past the opening square brace while (1) { while (iswspace(in[pos]) || (in[pos] == INTERNAL_SEPARATOR)) pos++; @@ -846,39 +847,39 @@ static bool expand_variables(const wcstring &instr, std::vector *o return true; } -/// Perform bracket expansion. -static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flags, +/// Perform brace expansion. +static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, std::vector *out, parse_error_list_t *errors) { bool syntax_error = false; - int bracket_count = 0; + int brace_count = 0; - const wchar_t *bracket_begin = NULL, *bracket_end = NULL; + const wchar_t *brace_begin = NULL, *brace_end = NULL; const wchar_t *last_sep = NULL; const wchar_t *item_begin; - size_t length_preceding_brackets, length_following_brackets, tot_len; + size_t length_preceding_braces, length_following_braces, tot_len; const wchar_t *const in = instr.c_str(); - // Locate the first non-nested bracket pair. + // Locate the first non-nested brace pair. for (const wchar_t *pos = in; (*pos) && !syntax_error; pos++) { switch (*pos) { - case BRACKET_BEGIN: { - if (bracket_count == 0) bracket_begin = pos; - bracket_count++; + case BRACE_BEGIN: { + if (brace_count == 0) brace_begin = pos; + brace_count++; break; } - case BRACKET_END: { - bracket_count--; - if (bracket_count < 0) { + case BRACE_END: { + brace_count--; + if (brace_count < 0) { syntax_error = true; - } else if (bracket_count == 0) { - bracket_end = pos; + } else if (brace_count == 0) { + brace_end = pos; } break; } - case BRACKET_SEP: { - if (bracket_count == 1) last_sep = pos; + case BRACE_SEP: { + if (brace_count == 1) last_sep = pos; break; } default: { @@ -887,72 +888,80 @@ static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flag } } - if (bracket_count > 0) { + if (brace_count > 0) { if (!(flags & EXPAND_FOR_COMPLETIONS)) { syntax_error = true; } else { - // The user hasn't typed an end bracket yet; make one up and append it, then expand + // The user hasn't typed an end brace yet; make one up and append it, then expand // that. wcstring mod; if (last_sep) { - mod.append(in, bracket_begin - in + 1); + mod.append(in, brace_begin - in + 1); mod.append(last_sep + 1); - mod.push_back(BRACKET_END); + mod.push_back(BRACE_END); } else { mod.append(in); - mod.push_back(BRACKET_END); + mod.push_back(BRACE_END); } // Note: this code looks very fishy, apparently it has never worked. - return expand_brackets(mod, 1, out, errors); + return expand_braces(mod, 1, out, errors); } } // Expand a literal "{}" to itself because it is useless otherwise, // and this eases e.g. `find -exec {}`. See #1109. - if (bracket_begin + 1 == bracket_end) { + if (brace_begin + 1 == brace_end) { wcstring newstr = instr; - newstr.at(bracket_begin - in) = L'{'; - newstr.at(bracket_end - in) = L'}'; - return expand_brackets(newstr, flags, out, errors); + newstr.at(brace_begin - in) = L'{'; + newstr.at(brace_end - in) = L'}'; + return expand_braces(newstr, flags, out, errors); } if (syntax_error) { - append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched brackets")); + append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched braces")); return EXPAND_ERROR; } - if (bracket_begin == NULL) { + if (brace_begin == NULL) { append_completion(out, instr); return EXPAND_OK; } - length_preceding_brackets = (bracket_begin - in); - length_following_brackets = wcslen(bracket_end) - 1; - tot_len = length_preceding_brackets + length_following_brackets; - item_begin = bracket_begin + 1; - for (const wchar_t *pos = (bracket_begin + 1); true; pos++) { - if (bracket_count == 0 && ((*pos == BRACKET_SEP) || (pos == bracket_end))) { + length_preceding_braces = (brace_begin - in); + length_following_braces = wcslen(brace_end) - 1; + tot_len = length_preceding_braces + length_following_braces; + item_begin = brace_begin + 1; + for (const wchar_t *pos = (brace_begin + 1); true; pos++) { + if (brace_count == 0 && ((*pos == BRACE_SEP) || (pos == brace_end))) { assert(pos >= item_begin); size_t item_len = pos - item_begin; + wcstring item = wcstring(item_begin, item_len); + item = trim(item, (const wchar_t[]) { BRACE_SPACE }); + for (auto &c : item) { + if (c == BRACE_SPACE) { + c = ' '; + } + } wcstring whole_item; whole_item.reserve(tot_len + item_len + 2); - whole_item.append(in, length_preceding_brackets); - whole_item.append(item_begin, item_len); - whole_item.append(bracket_end + 1); - expand_brackets(whole_item, flags, out, errors); + whole_item.append(in, length_preceding_braces); + whole_item.append(item.begin(), item.end()); + whole_item.append(brace_end + 1); + whole_item = trim(whole_item, (const wchar_t[]) { BRACE_SPACE }); + expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; - if (pos == bracket_end) break; + if (pos == brace_end) break; } - if (*pos == BRACKET_BEGIN) { - bracket_count++; + if (*pos == BRACE_BEGIN) { + brace_count++; } - if (*pos == BRACKET_END) { - bracket_count--; + if (*pos == BRACE_END) { + brace_count--; } } return EXPAND_OK; @@ -1274,9 +1283,9 @@ static expand_error_t expand_stage_variables(const wcstring &input, std::vector< return EXPAND_OK; } -static expand_error_t expand_stage_brackets(const wcstring &input, std::vector *out, +static expand_error_t expand_stage_braces(const wcstring &input, std::vector *out, expand_flags_t flags, parse_error_list_t *errors) { - return expand_brackets(input, flags, out, errors); + return expand_braces(input, flags, out, errors); } static expand_error_t expand_stage_home(const wcstring &input, @@ -1393,7 +1402,7 @@ expand_error_t expand_string(const wcstring &input, std::vector *o // Our expansion stages. const expand_stage_t stages[] = {expand_stage_cmdsubst, expand_stage_variables, - expand_stage_brackets, expand_stage_home, + expand_stage_braces, expand_stage_home, expand_stage_wildcards}; // Load up our single initial completion. diff --git a/src/expand.h b/src/expand.h index 771d8773b..890a6407b 100644 --- a/src/expand.h +++ b/src/expand.h @@ -65,11 +65,13 @@ enum { /// Character representing variable expansion into a single element. VARIABLE_EXPAND_SINGLE, /// Character representing the start of a bracket expansion. - BRACKET_BEGIN, + BRACE_BEGIN, /// Character representing the end of a bracket expansion. - BRACKET_END, + BRACE_END, /// Character representing separation between two bracket elements. - BRACKET_SEP, + BRACE_SEP, + /// Character that takes the place of any whitespace within non-quoted text in braces + BRACE_SPACE, /// Separate subtokens in a token with this character. INTERNAL_SEPARATOR, /// Character representing an empty variable expansion. Only used transitively while expanding diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 3ccd7ad25..71ee33a94 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -578,6 +578,15 @@ static void test_tokenizer() { do_test(token.error_offset == 3); } + { + tokenizer_t t(L"abc )defg(hij", 0); + do_test(t.next(&token)); + do_test(t.next(&token)); + do_test(token.type == TOK_ERROR); + do_test(token.error == TOK_CLOSING_UNOPENED_SUBSHELL); + do_test(token.error_offset == 4); + } + { tokenizer_t t(L"abc defg(hij (klm)", 0); do_test(t.next(&token)); @@ -4420,10 +4429,11 @@ static void test_illegal_command_exit_code() { const command_result_tuple_t tests[] = { {L"echo -n", STATUS_CMD_OK}, {L"pwd", STATUS_CMD_OK}, - {L")", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}, + // a `)` without a matching `(` is now a tokenizer error, and cannot be executed even as an illegal command + // {L")", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD} {L"*", STATUS_ILLEGAL_CMD}, {L"**", STATUS_ILLEGAL_CMD}, {L"?", STATUS_ILLEGAL_CMD}, {L"abc?def", STATUS_ILLEGAL_CMD}, - {L") ", STATUS_ILLEGAL_CMD}}; + }; int res = 0; const io_chain_t empty_ios; diff --git a/src/highlight.cpp b/src/highlight.cpp index c0366c7d5..1b94e87fa 100644 --- a/src/highlight.cpp +++ b/src/highlight.cpp @@ -122,9 +122,9 @@ bool is_potential_path(const wcstring &potential_path_fragment, const wcstring_l switch (c) { case VARIABLE_EXPAND: case VARIABLE_EXPAND_SINGLE: - case BRACKET_BEGIN: - case BRACKET_END: - case BRACKET_SEP: + case BRACE_BEGIN: + case BRACE_END: + case BRACE_SEP: case ANY_CHAR: case ANY_STRING: case ANY_STRING_RECURSIVE: { diff --git a/src/parse_constants.h b/src/parse_constants.h index 05700fc4d..b84de73e6 100644 --- a/src/parse_constants.h +++ b/src/parse_constants.h @@ -169,6 +169,7 @@ enum parse_error_code_t { parse_error_tokenizer_unterminated_subshell, parse_error_tokenizer_unterminated_slice, parse_error_tokenizer_unterminated_escape, + parse_error_tokenizer_nested_slice, parse_error_tokenizer_other, parse_error_unbalancing_end, // end outside of block diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp index 9c51025f4..9ddcebd09 100644 --- a/src/parse_tree.cpp +++ b/src/parse_tree.cpp @@ -668,35 +668,10 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta } void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) { - parse_error_code_t parse_error_code; - switch (tok.error) { - case TOK_UNTERMINATED_QUOTE: { - parse_error_code = parse_error_tokenizer_unterminated_quote; - break; - } - case TOK_UNTERMINATED_SUBSHELL: { - parse_error_code = parse_error_tokenizer_unterminated_subshell; - break; - } - case TOK_UNTERMINATED_SLICE: { - parse_error_code = parse_error_tokenizer_unterminated_slice; - break; - } - case TOK_UNTERMINATED_ESCAPE: { - parse_error_code = parse_error_tokenizer_unterminated_escape; - break; - } - case TOK_INVALID_REDIRECT: - case TOK_INVALID_PIPE: - default: { - parse_error_code = parse_error_tokenizer_other; - break; - } - } - + parse_error_code_t parse_error_code = tok.error->parser_error; this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset, parse_error_code, L"%ls", - error_message_for_code(tok.error).c_str()); + tok.error->Message); } void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) { diff --git a/src/parse_util.cpp b/src/parse_util.cpp index 2927763ee..2fb17e94a 100644 --- a/src/parse_util.cpp +++ b/src/parse_util.cpp @@ -834,14 +834,14 @@ void parse_util_expand_variable_error(const wcstring &token, size_t global_token wchar_t char_after_dollar = dollar_pos + 1 >= token.size() ? 0 : token.at(dollar_pos + 1); switch (char_after_dollar) { - case BRACKET_BEGIN: + case BRACE_BEGIN: case L'{': { - // The BRACKET_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible + // The BRACE_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible // quoted) ${. See if we have a }, and the stuff in between is variable material. If so, // report a bracket error. Otherwise just complain about the ${. bool looks_like_variable = false; size_t closing_bracket = - token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACKET_END), dollar_pos + 2); + token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACE_END), dollar_pos + 2); wcstring var_name; if (closing_bracket != wcstring::npos) { size_t var_start = dollar_pos + 2, var_end = closing_bracket; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index eb49cf263..65e262204 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -16,46 +16,22 @@ #include "tokenizer.h" #include "wutil.h" // IWYU pragma: keep -/// Error string for unexpected end of string. -#define QUOTE_ERROR _(L"Unexpected end of string, quotes are not balanced") - -/// Error string for mismatched parenthesis. -#define PARAN_ERROR _(L"Unexpected end of string, parenthesis do not match") - -/// Error string for mismatched square brackets. -#define SQUARE_BRACKET_ERROR _(L"Unexpected end of string, square brackets do not match") - -/// Error string for unterminated escape (backslash without continuation). -#define UNTERMINATED_ESCAPE_ERROR _(L"Unexpected end of string, incomplete escape sequence") - -/// Error string for invalid redirections. -#define REDIRECT_ERROR _(L"Invalid input/output redirection") - -/// Error string for when trying to pipe from fd 0. -#define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output") - -wcstring error_message_for_code(tokenizer_error err) { - switch (err) { - case TOK_UNTERMINATED_QUOTE: - return QUOTE_ERROR; - case TOK_UNTERMINATED_SUBSHELL: - return PARAN_ERROR; - case TOK_UNTERMINATED_SLICE: - return SQUARE_BRACKET_ERROR; - case TOK_UNTERMINATED_ESCAPE: - return UNTERMINATED_ESCAPE_ERROR; - case TOK_INVALID_REDIRECT: - return REDIRECT_ERROR; - case TOK_INVALID_PIPE: - return PIPE_ERROR; - default: - assert(0 && "Unknown error type"); - return {}; - } -} +tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L""); +tokenizer_error *TOK_UNTERMINATED_QUOTE = new tokenizer_error((L"Unexpected end of string, quotes are not balanced"), parse_error_tokenizer_unterminated_quote); +tokenizer_error *TOK_UNTERMINATED_SUBSHELL = new tokenizer_error((L"Unexpected end of string, expecting ')'"), parse_error_tokenizer_unterminated_subshell); +tokenizer_error *TOK_UNTERMINATED_SLICE = new tokenizer_error((L"Unexpected end of string, square brackets do not match"), parse_error_tokenizer_unterminated_slice); +tokenizer_error *TOK_UNTERMINATED_ESCAPE = new tokenizer_error((L"Unexpected end of string, incomplete escape sequence"), parse_error_tokenizer_unterminated_escape); +tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error((L"Invalid input/output redirection")); +tokenizer_error *TOK_INVALID_PIPE = new tokenizer_error((L"Cannot use stdin (fd 0) as pipe output")); +tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL = new tokenizer_error((L"Unexpected ')' for unopened parenthesis")); +tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error((L"Unexpected '[' at this location")); +tokenizer_error *TOK_CLOSING_UNOPENED_BRACE = new tokenizer_error((L"Unexpected '}' for unopened brace expansion")); +tokenizer_error *TOK_UNTERMINATED_BRACE = new tokenizer_error((L"Unexpected end of string, incomplete parameter expansion")); +tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE = new tokenizer_error((L"Unexpected '}' found, expecting ')'")); +tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE = new tokenizer_error((L"Unexpected ')' found, expecting '}'")); /// Return an error token and mark that we no longer have a next token. -tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start, +tok_t tokenizer_t::call_error(tokenizer_error *error_type, const wchar_t *token_start, const wchar_t *error_loc) { assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error"); assert(error_loc >= token_start && "Invalid error location"); @@ -119,194 +95,166 @@ static bool tok_is_string_character(wchar_t c, bool is_first) { /// Quick test to catch the most common 'non-magical' characters, makes read_string slightly faster /// by adding a fast path for the most common characters. This is obviously not a suitable /// replacement for iswalpha. -static int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); } +static inline int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); } + +ENUM_FLAGS(tok_mode) { + regular_text = 0, // regular text + subshell = 1 << 0, // inside of subshell parentheses + array_brackets = 1 << 1, // inside of array brackets + curly_braces = 1 << 2, + char_escape = 1 << 3, +}; /// Read the next token as a string. tok_t tokenizer_t::read_string() { - bool do_loop = true; - size_t paran_count = 0; - // Up to 96 open parens, before we give up on good error reporting. - const size_t paran_offsets_max = 96; - size_t paran_offsets[paran_offsets_max]; - // Where the open bracket is. - size_t offset_of_bracket = 0; + tok_mode mode { tok_mode::regular_text }; + std::vector paran_offsets; + std::vector brace_offsets; + std::vector expecting; + int slice_offset = 0; const wchar_t *const buff_start = this->buff; bool is_first = true; - enum tok_mode_t { - mode_regular_text = 0, // regular text - mode_subshell = 1, // inside of subshell - mode_array_brackets = 2, // inside of array brackets - mode_array_brackets_and_subshell = - 3 // inside of array brackets and subshell, like in '$foo[(ech' - } mode = mode_regular_text; + while (true) { + wchar_t c = *this->buff; +#if false + wcstring msg = L"Handling 0x%x (%lc)"; + tok_mode mode_begin = mode; +#endif - while (1) { - if (!myal(*this->buff)) { - if (*this->buff == L'\\') { - const wchar_t *error_location = this->buff; - this->buff++; - if (*this->buff == L'\0') { - if ((!this->accept_unfinished)) { - return this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start, - error_location); - } - // Since we are about to increment tok->buff, decrement it first so the - // increment doesn't go past the end of the buffer. See issue #389. - this->buff--; - do_loop = 0; - } - - this->buff++; - continue; - } - - switch (mode) { - case mode_regular_text: { - switch (*this->buff) { - case L'(': { - paran_count = 1; - paran_offsets[0] = this->buff - this->start; - mode = mode_subshell; - break; - } - case L'[': { - if (this->buff != buff_start) { - mode = mode_array_brackets; - offset_of_bracket = this->buff - this->start; - } - break; - } - case L'\'': - case L'"': { - const wchar_t *end = quote_end(this->buff); - if (end) { - this->buff = end; - } else { - const wchar_t *error_loc = this->buff; - this->buff += wcslen(this->buff); - - if (!this->accept_unfinished) { - return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, - error_loc); - } - do_loop = 0; - } - break; - } - default: { - if (!tok_is_string_character(*(this->buff), is_first)) { - do_loop = 0; - } - break; - } - } - break; - } - - case mode_array_brackets_and_subshell: - case mode_subshell: { - switch (*this->buff) { - case L'\'': - case L'\"': { - const wchar_t *end = quote_end(this->buff); - if (end) { - this->buff = end; - } else { - const wchar_t *error_loc = this->buff; - this->buff += wcslen(this->buff); - if ((!this->accept_unfinished)) { - return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, - error_loc); - } - do_loop = 0; - } - break; - } - case L'(': { - if (paran_count < paran_offsets_max) { - paran_offsets[paran_count] = this->buff - this->start; - } - paran_count++; - break; - } - case L')': { - assert(paran_count > 0); - paran_count--; - if (paran_count == 0) { - mode = - (mode == mode_array_brackets_and_subshell ? mode_array_brackets - : mode_regular_text); - } - break; - } - case L'\0': { - do_loop = 0; - break; - } - default: { - break; // ignore other chars - } - } - break; - } - - case mode_array_brackets: { - switch (*this->buff) { - case L'(': { - paran_count = 1; - paran_offsets[0] = this->buff - this->start; - mode = mode_array_brackets_and_subshell; - break; - } - case L']': { - mode = mode_regular_text; - break; - } - case L'\0': { - do_loop = 0; - break; - } - default: { - break; // ignore other chars - } - } - break; - } - } + if (c == L'\0') { + break; } - if (!do_loop) break; + // Make sure this character isn't being escaped before anything else + if ((mode & tok_mode::char_escape) == tok_mode::char_escape) { + mode &= ~(tok_mode::char_escape); + // and do nothing more + } + else if (myal(c)) { + // Early exit optimization in case the character is just a letter, + // which has no special meaning to the tokenizer, i.e. the same mode continues. + } + + // Now proceed with the evaluation of the token, first checking to see if the token + // has been explicitly ignored (escaped). + else if (c == L'\\') { + mode |= tok_mode::char_escape; + } + else if (c == L'(') { + paran_offsets.push_back(this->buff - this->start); + expecting.push_back(L')'); + mode |= tok_mode::subshell; + } + else if (c == L'{') { + brace_offsets.push_back(this->buff - this->start); + expecting.push_back(L'}'); + mode |= tok_mode::curly_braces; + } + else if (c == L')') { + if (expecting.size() > 0 && expecting.back() == L'}') { + return this->call_error(TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, this->start, this->buff); + } + switch (paran_offsets.size()) { + case 0: + return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff); + case 1: + mode &= ~(tok_mode::subshell); + default: + paran_offsets.pop_back(); + } + expecting.pop_back(); + } + else if (c == L'}') { + if (expecting.size() > 0 && expecting.back() == L')') { + return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff); + } + switch (brace_offsets.size()) { + case 0: + return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff); + case 1: + mode &= ~(tok_mode::curly_braces); + default: + brace_offsets.pop_back(); + } + expecting.pop_back(); + } + else if (c == L'[') { + if (this->buff != buff_start) { + if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { + // Nested brackets should not overwrite the existing slice_offset + //mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell + //prints an error message with the caret pointing at token_start, + //not err_loc, making the TOK_ILLEGAL_SLICE message misleading. + // return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff); + return this->call_error(TOK_UNTERMINATED_SLICE, this->start, this->buff); + } + slice_offset = this->buff - this->start; + mode |= tok_mode::array_brackets; + } + else { + // This is actually allowed so the test operator `[` can be used as the head of a command + } + } + // Only exit bracket mode if we are in bracket mode. + // Reason: `]` can be a parameter, e.g. last parameter to `[` test alias. + // e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket + else if (c == L']' && ((mode & tok_mode::array_brackets) == tok_mode::array_brackets)) { + mode &= ~(tok_mode::array_brackets); + } + else if (c == L'\'' || c == L'"') { + const wchar_t *end = quote_end(this->buff); + if (end) { + this->buff = end; + } else { + const wchar_t *error_loc = this->buff; + this->buff += wcslen(this->buff); + if ((!this->accept_unfinished)) { + return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, error_loc); + } + break; + } + } + else if (mode == tok_mode::regular_text && !tok_is_string_character(c, is_first)) { + break; + } + +#if false + if (mode != mode_begin) { + msg.append(L": mode 0x%x -> 0x%x\n"); + } else { + msg.push_back(L'\n'); + } + debug(0, msg.c_str(), c, c, int(mode_begin), int(mode)); +#endif this->buff++; is_first = false; } - if ((!this->accept_unfinished) && (mode != mode_regular_text)) { + if ((!this->accept_unfinished) && (mode != tok_mode::regular_text)) { tok_t error; - switch (mode) { - case mode_subshell: { - // Determine the innermost opening paran offset by interrogating paran_offsets. - assert(paran_count > 0); - size_t offset_of_open_paran = 0; - if (paran_count <= paran_offsets_max) { - offset_of_open_paran = paran_offsets[paran_count - 1]; - } + if ((mode & tok_mode::char_escape) == tok_mode::char_escape) { + error = this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start, + this->buff - 1); + } + else if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { + error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start, + this->start + slice_offset); + } + else if ((mode & tok_mode::subshell) == tok_mode::subshell) { + assert(paran_offsets.size() > 0); + size_t offset_of_open_paran = paran_offsets.back(); - error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start, - this->start + offset_of_open_paran); - break; - } - case mode_array_brackets: - case mode_array_brackets_and_subshell: { - error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start, - this->start + offset_of_bracket); - break; - } - default: { - DIE("unexpected mode in read_string"); - break; - } + error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start, + this->start + offset_of_open_paran); + } + else if ((mode & tok_mode::curly_braces) == tok_mode::curly_braces) { + assert(brace_offsets.size() > 0); + size_t offset_of_open_brace = brace_offsets.back(); + + error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start, + this->start + offset_of_open_brace); } return error; } diff --git a/src/tokenizer.h b/src/tokenizer.h index e0aa58b50..8ce6618a7 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -7,6 +7,7 @@ #include "common.h" #include "maybe.h" +#include "parse_constants.h" /// Token types. enum token_type { @@ -22,17 +23,26 @@ enum token_type { TOK_COMMENT /// comment token }; -/// Tokenizer error types. -enum tokenizer_error { - TOK_ERROR_NONE, - TOK_UNTERMINATED_QUOTE, - TOK_UNTERMINATED_SUBSHELL, - TOK_UNTERMINATED_SLICE, - TOK_UNTERMINATED_ESCAPE, - TOK_INVALID_REDIRECT, - TOK_INVALID_PIPE +struct tokenizer_error { + const wchar_t *Message; + enum parse_error_code_t parser_error; //the parser error associated with this tokenizer error + tokenizer_error(const wchar_t *msg, enum parse_error_code_t perr = parse_error_tokenizer_other) + : Message(msg), parser_error(perr) {} + tokenizer_error(const tokenizer_error&) = delete; }; +extern tokenizer_error *TOK_ERROR_NONE; +extern tokenizer_error *TOK_UNTERMINATED_QUOTE; +extern tokenizer_error *TOK_UNTERMINATED_SUBSHELL; +extern tokenizer_error *TOK_UNTERMINATED_SLICE; +extern tokenizer_error *TOK_UNTERMINATED_ESCAPE; +extern tokenizer_error *TOK_UNTERMINATED_BRACE; +extern tokenizer_error *TOK_INVALID_REDIRECT; +extern tokenizer_error *TOK_INVALID_PIPE; +extern tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL; +extern tokenizer_error *TOK_CLOSING_UNOPENED_BRACE; +extern tokenizer_error *TOK_ILLEGAL_SLICE; + enum class redirection_type_t { overwrite, // normal redirection: > file.txt append, // appending redirection: >> file.txt @@ -67,7 +77,7 @@ struct tok_t { maybe_t redirected_fd{}; // If an error, this is the error code. - enum tokenizer_error error { TOK_ERROR_NONE }; + tokenizer_error *error { TOK_ERROR_NONE }; // If an error, this is the offset of the error within the token. A value of 0 means it occurred // at 'offset'. @@ -97,7 +107,7 @@ class tokenizer_t { /// Whether to continue the previous line after the comment. bool continue_line_after_comment{false}; - tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start, + tok_t call_error(tokenizer_error *error_type, const wchar_t *token_start, const wchar_t *error_loc); tok_t read_string(); maybe_t tok_next(); diff --git a/src/wcstringutil.cpp b/src/wcstringutil.cpp index 79209c1c5..348257443 100644 --- a/src/wcstringutil.cpp +++ b/src/wcstringutil.cpp @@ -45,3 +45,14 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) { output.push_back(ellipsis_char); return output; } + +wcstring trim(const wcstring &input, const wchar_t *any_of) { + auto begin_offset = input.find_first_not_of(any_of); + if (begin_offset == wcstring::npos) { + return wcstring{}; + } + auto end = input.cbegin() + input.find_last_not_of(any_of); + + wcstring result(input.begin() + begin_offset, end + 1); + return result; +} diff --git a/src/wcstringutil.h b/src/wcstringutil.h index 878771f25..75351e38f 100644 --- a/src/wcstringutil.h +++ b/src/wcstringutil.h @@ -59,5 +59,6 @@ enum class ellipsis_type { }; wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest); +wcstring trim(const wcstring &input, const wchar_t *any_of); #endif diff --git a/tests/parameter_expansion.err b/tests/parameter_expansion.err new file mode 100644 index 000000000..e69de29bb diff --git a/tests/parameter_expansion.in b/tests/parameter_expansion.in new file mode 100644 index 000000000..89ba2b2d9 --- /dev/null +++ b/tests/parameter_expansion.in @@ -0,0 +1,34 @@ +# basic expansion test +echo {} +echo {apple} +echo {apple,orange} + +# expansion tests with spaces +echo {apple, orange} +echo { apple, orange, banana } + +# expansion with spaces and cartesian products +echo \'{ hello , world }\' + +# expansion with escapes +for phrase in {good\,, beautiful ,morning}; echo -n "$phrase "; end | string trim; +for phrase in {goodbye\,,\ cruel\ ,world\n}; echo -n $phrase; end; + +# whitespace within entries converted to spaces in a single entry +for foo in { hello +world } + echo \'$foo\' +end + +# dual expansion cartesian product +echo { alpha, beta }\ {lambda, gamma }, | sed -r 's/(.*),/\1/' + +# expansion with subshells +for name in { (echo Meg), (echo Jo) } + echo $name +end + +# subshells with expansion +for name in (for name in {Beth, Amy}; printf "$name\n"; end); printf "$name\n"; end + +# vim: set ft=fish: diff --git a/tests/parameter_expansion.out b/tests/parameter_expansion.out new file mode 100644 index 000000000..d89373285 --- /dev/null +++ b/tests/parameter_expansion.out @@ -0,0 +1,14 @@ +{} +apple +apple orange +apple orange +apple orange banana +'hello' 'world' +good, beautiful morning +goodbye, cruel world +'hello world' +alpha lambda, beta lambda, alpha gamma, beta gamma +Meg +Jo +Beth +Amy