From c98ed6d07abeb7b4dc41d92f631a433e37d6f9ea Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sat, 10 Mar 2018 13:16:07 -0600 Subject: [PATCH 01/11] Rename BRACKET to BRACE per #3802 --- src/common.cpp | 6 +-- src/expand.cpp | 93 +++++++++++++++++++++++----------------------- src/expand.h | 6 +-- src/highlight.cpp | 6 +-- src/parse_util.cpp | 6 +-- 5 files changed, 59 insertions(+), 58 deletions(-) diff --git a/src/common.cpp b/src/common.cpp index 84d2cdd8e..b3a96db18 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1353,20 +1353,20 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in case L'{': { if (unescape_special) { bracket_count++; - to_append_or_none = BRACKET_BEGIN; + to_append_or_none = BRACE_BEGIN; } break; } case L'}': { if (unescape_special) { bracket_count--; - to_append_or_none = BRACKET_END; + to_append_or_none = BRACE_END; } break; } case L',': { if (unescape_special && bracket_count > 0) { - to_append_or_none = BRACKET_SEP; + to_append_or_none = BRACE_SEP; } break; } diff --git a/src/expand.cpp b/src/expand.cpp index 7605ed353..3ead4819a 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -570,7 +570,7 @@ static void find_process(const wchar_t *proc, expand_flags_t flags, static size_t parse_slice(const wchar_t *in, wchar_t **end_ptr, std::vector &idx, std::vector &source_positions, size_t array_size) { const long size = (long)array_size; - size_t pos = 1; // skip past the opening square bracket + size_t pos = 1; // skip past the opening square brace while (1) { while (iswspace(in[pos]) || (in[pos] == INTERNAL_SEPARATOR)) pos++; @@ -846,39 +846,39 @@ static bool expand_variables(const wcstring &instr, std::vector *o return true; } -/// Perform bracket expansion. -static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flags, +/// Perform brace expansion. +static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, std::vector *out, parse_error_list_t *errors) { bool syntax_error = false; - int bracket_count = 0; + int brace_count = 0; - const wchar_t *bracket_begin = NULL, *bracket_end = NULL; + const wchar_t *brace_begin = NULL, *brace_end = NULL; const wchar_t *last_sep = NULL; const wchar_t *item_begin; - size_t length_preceding_brackets, length_following_brackets, tot_len; + size_t length_preceding_braces, length_following_braces, tot_len; const wchar_t *const in = instr.c_str(); - // Locate the first non-nested bracket pair. + // Locate the first non-nested brace pair. for (const wchar_t *pos = in; (*pos) && !syntax_error; pos++) { switch (*pos) { - case BRACKET_BEGIN: { - if (bracket_count == 0) bracket_begin = pos; - bracket_count++; + case BRACE_BEGIN: { + if (brace_count == 0) brace_begin = pos; + brace_count++; break; } - case BRACKET_END: { - bracket_count--; - if (bracket_count < 0) { + case BRACE_END: { + brace_count--; + if (brace_count < 0) { syntax_error = true; - } else if (bracket_count == 0) { - bracket_end = pos; + } else if (brace_count == 0) { + brace_end = pos; } break; } - case BRACKET_SEP: { - if (bracket_count == 1) last_sep = pos; + case BRACE_SEP: { + if (brace_count == 1) last_sep = pos; break; } default: { @@ -887,72 +887,73 @@ static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flag } } - if (bracket_count > 0) { + if (brace_count > 0) { if (!(flags & EXPAND_FOR_COMPLETIONS)) { syntax_error = true; } else { - // The user hasn't typed an end bracket yet; make one up and append it, then expand + // The user hasn't typed an end brace yet; make one up and append it, then expand // that. wcstring mod; if (last_sep) { - mod.append(in, bracket_begin - in + 1); + mod.append(in, brace_begin - in + 1); mod.append(last_sep + 1); - mod.push_back(BRACKET_END); + mod.push_back(BRACE_END); } else { mod.append(in); - mod.push_back(BRACKET_END); + mod.push_back(BRACE_END); } // Note: this code looks very fishy, apparently it has never worked. - return expand_brackets(mod, 1, out, errors); + return expand_braces(mod, 1, out, errors); } } // Expand a literal "{}" to itself because it is useless otherwise, // and this eases e.g. `find -exec {}`. See #1109. - if (bracket_begin + 1 == bracket_end) { + if (brace_begin + 1 == brace_end) { wcstring newstr = instr; - newstr.at(bracket_begin - in) = L'{'; - newstr.at(bracket_end - in) = L'}'; - return expand_brackets(newstr, flags, out, errors); + newstr.at(brace_begin - in) = L'{'; + newstr.at(brace_end - in) = L'}'; + return expand_braces(newstr, flags, out, errors); } if (syntax_error) { - append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched brackets")); + append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched braces")); return EXPAND_ERROR; } - if (bracket_begin == NULL) { + if (brace_begin == NULL) { append_completion(out, instr); return EXPAND_OK; } - length_preceding_brackets = (bracket_begin - in); - length_following_brackets = wcslen(bracket_end) - 1; - tot_len = length_preceding_brackets + length_following_brackets; - item_begin = bracket_begin + 1; - for (const wchar_t *pos = (bracket_begin + 1); true; pos++) { - if (bracket_count == 0 && ((*pos == BRACKET_SEP) || (pos == bracket_end))) { + length_preceding_braces = (brace_begin - in); + length_following_braces = wcslen(brace_end) - 1; + tot_len = length_preceding_braces + length_following_braces; + item_begin = brace_begin + 1; + for (const wchar_t *pos = (brace_begin + 1); true; pos++) { + if (brace_count == 0 && ((*pos == BRACE_SEP) || (pos == brace_end))) { assert(pos >= item_begin); size_t item_len = pos - item_begin; wcstring whole_item; whole_item.reserve(tot_len + item_len + 2); - whole_item.append(in, length_preceding_brackets); + whole_item.append(in, length_preceding_braces); whole_item.append(item_begin, item_len); - whole_item.append(bracket_end + 1); - expand_brackets(whole_item, flags, out, errors); + whole_item.append(brace_end + 1); + debug(0, L"Found brace item: %ls\n", whole_item.c_str()); + expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; - if (pos == bracket_end) break; + if (pos == brace_end) break; } - if (*pos == BRACKET_BEGIN) { - bracket_count++; + if (*pos == BRACE_BEGIN) { + brace_count++; } - if (*pos == BRACKET_END) { - bracket_count--; + if (*pos == BRACE_END) { + brace_count--; } } return EXPAND_OK; @@ -1274,9 +1275,9 @@ static expand_error_t expand_stage_variables(const wcstring &input, std::vector< return EXPAND_OK; } -static expand_error_t expand_stage_brackets(const wcstring &input, std::vector *out, +static expand_error_t expand_stage_braces(const wcstring &input, std::vector *out, expand_flags_t flags, parse_error_list_t *errors) { - return expand_brackets(input, flags, out, errors); + return expand_braces(input, flags, out, errors); } static expand_error_t expand_stage_home(const wcstring &input, @@ -1393,7 +1394,7 @@ expand_error_t expand_string(const wcstring &input, std::vector *o // Our expansion stages. const expand_stage_t stages[] = {expand_stage_cmdsubst, expand_stage_variables, - expand_stage_brackets, expand_stage_home, + expand_stage_braces, expand_stage_home, expand_stage_wildcards}; // Load up our single initial completion. diff --git a/src/expand.h b/src/expand.h index 771d8773b..62bd6a482 100644 --- a/src/expand.h +++ b/src/expand.h @@ -65,11 +65,11 @@ enum { /// Character representing variable expansion into a single element. VARIABLE_EXPAND_SINGLE, /// Character representing the start of a bracket expansion. - BRACKET_BEGIN, + BRACE_BEGIN, /// Character representing the end of a bracket expansion. - BRACKET_END, + BRACE_END, /// Character representing separation between two bracket elements. - BRACKET_SEP, + BRACE_SEP, /// Separate subtokens in a token with this character. INTERNAL_SEPARATOR, /// Character representing an empty variable expansion. Only used transitively while expanding diff --git a/src/highlight.cpp b/src/highlight.cpp index c0366c7d5..1b94e87fa 100644 --- a/src/highlight.cpp +++ b/src/highlight.cpp @@ -122,9 +122,9 @@ bool is_potential_path(const wcstring &potential_path_fragment, const wcstring_l switch (c) { case VARIABLE_EXPAND: case VARIABLE_EXPAND_SINGLE: - case BRACKET_BEGIN: - case BRACKET_END: - case BRACKET_SEP: + case BRACE_BEGIN: + case BRACE_END: + case BRACE_SEP: case ANY_CHAR: case ANY_STRING: case ANY_STRING_RECURSIVE: { diff --git a/src/parse_util.cpp b/src/parse_util.cpp index 2927763ee..2fb17e94a 100644 --- a/src/parse_util.cpp +++ b/src/parse_util.cpp @@ -834,14 +834,14 @@ void parse_util_expand_variable_error(const wcstring &token, size_t global_token wchar_t char_after_dollar = dollar_pos + 1 >= token.size() ? 0 : token.at(dollar_pos + 1); switch (char_after_dollar) { - case BRACKET_BEGIN: + case BRACE_BEGIN: case L'{': { - // The BRACKET_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible + // The BRACE_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible // quoted) ${. See if we have a }, and the stuff in between is variable material. If so, // report a bracket error. Otherwise just complain about the ${. bool looks_like_variable = false; size_t closing_bracket = - token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACKET_END), dollar_pos + 2); + token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACE_END), dollar_pos + 2); wcstring var_name; if (closing_bracket != wcstring::npos) { size_t var_start = dollar_pos + 2, var_end = closing_bracket; From 65a03c86cb2fbe86db17251682dc22df5119dfc3 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sat, 10 Mar 2018 13:16:07 -0600 Subject: [PATCH 02/11] Rename BRACKET in reference to `{` to BRACE instead per #3802 This `{` is a curly brace. This `[` is a square bracket. --- src/common.cpp | 6 +-- src/expand.cpp | 93 +++++++++++++++++++++++----------------------- src/expand.h | 6 +-- src/highlight.cpp | 6 +-- src/parse_util.cpp | 6 +-- 5 files changed, 59 insertions(+), 58 deletions(-) diff --git a/src/common.cpp b/src/common.cpp index 84d2cdd8e..b3a96db18 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1353,20 +1353,20 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in case L'{': { if (unescape_special) { bracket_count++; - to_append_or_none = BRACKET_BEGIN; + to_append_or_none = BRACE_BEGIN; } break; } case L'}': { if (unescape_special) { bracket_count--; - to_append_or_none = BRACKET_END; + to_append_or_none = BRACE_END; } break; } case L',': { if (unescape_special && bracket_count > 0) { - to_append_or_none = BRACKET_SEP; + to_append_or_none = BRACE_SEP; } break; } diff --git a/src/expand.cpp b/src/expand.cpp index 7605ed353..3ead4819a 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -570,7 +570,7 @@ static void find_process(const wchar_t *proc, expand_flags_t flags, static size_t parse_slice(const wchar_t *in, wchar_t **end_ptr, std::vector &idx, std::vector &source_positions, size_t array_size) { const long size = (long)array_size; - size_t pos = 1; // skip past the opening square bracket + size_t pos = 1; // skip past the opening square brace while (1) { while (iswspace(in[pos]) || (in[pos] == INTERNAL_SEPARATOR)) pos++; @@ -846,39 +846,39 @@ static bool expand_variables(const wcstring &instr, std::vector *o return true; } -/// Perform bracket expansion. -static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flags, +/// Perform brace expansion. +static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, std::vector *out, parse_error_list_t *errors) { bool syntax_error = false; - int bracket_count = 0; + int brace_count = 0; - const wchar_t *bracket_begin = NULL, *bracket_end = NULL; + const wchar_t *brace_begin = NULL, *brace_end = NULL; const wchar_t *last_sep = NULL; const wchar_t *item_begin; - size_t length_preceding_brackets, length_following_brackets, tot_len; + size_t length_preceding_braces, length_following_braces, tot_len; const wchar_t *const in = instr.c_str(); - // Locate the first non-nested bracket pair. + // Locate the first non-nested brace pair. for (const wchar_t *pos = in; (*pos) && !syntax_error; pos++) { switch (*pos) { - case BRACKET_BEGIN: { - if (bracket_count == 0) bracket_begin = pos; - bracket_count++; + case BRACE_BEGIN: { + if (brace_count == 0) brace_begin = pos; + brace_count++; break; } - case BRACKET_END: { - bracket_count--; - if (bracket_count < 0) { + case BRACE_END: { + brace_count--; + if (brace_count < 0) { syntax_error = true; - } else if (bracket_count == 0) { - bracket_end = pos; + } else if (brace_count == 0) { + brace_end = pos; } break; } - case BRACKET_SEP: { - if (bracket_count == 1) last_sep = pos; + case BRACE_SEP: { + if (brace_count == 1) last_sep = pos; break; } default: { @@ -887,72 +887,73 @@ static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flag } } - if (bracket_count > 0) { + if (brace_count > 0) { if (!(flags & EXPAND_FOR_COMPLETIONS)) { syntax_error = true; } else { - // The user hasn't typed an end bracket yet; make one up and append it, then expand + // The user hasn't typed an end brace yet; make one up and append it, then expand // that. wcstring mod; if (last_sep) { - mod.append(in, bracket_begin - in + 1); + mod.append(in, brace_begin - in + 1); mod.append(last_sep + 1); - mod.push_back(BRACKET_END); + mod.push_back(BRACE_END); } else { mod.append(in); - mod.push_back(BRACKET_END); + mod.push_back(BRACE_END); } // Note: this code looks very fishy, apparently it has never worked. - return expand_brackets(mod, 1, out, errors); + return expand_braces(mod, 1, out, errors); } } // Expand a literal "{}" to itself because it is useless otherwise, // and this eases e.g. `find -exec {}`. See #1109. - if (bracket_begin + 1 == bracket_end) { + if (brace_begin + 1 == brace_end) { wcstring newstr = instr; - newstr.at(bracket_begin - in) = L'{'; - newstr.at(bracket_end - in) = L'}'; - return expand_brackets(newstr, flags, out, errors); + newstr.at(brace_begin - in) = L'{'; + newstr.at(brace_end - in) = L'}'; + return expand_braces(newstr, flags, out, errors); } if (syntax_error) { - append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched brackets")); + append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched braces")); return EXPAND_ERROR; } - if (bracket_begin == NULL) { + if (brace_begin == NULL) { append_completion(out, instr); return EXPAND_OK; } - length_preceding_brackets = (bracket_begin - in); - length_following_brackets = wcslen(bracket_end) - 1; - tot_len = length_preceding_brackets + length_following_brackets; - item_begin = bracket_begin + 1; - for (const wchar_t *pos = (bracket_begin + 1); true; pos++) { - if (bracket_count == 0 && ((*pos == BRACKET_SEP) || (pos == bracket_end))) { + length_preceding_braces = (brace_begin - in); + length_following_braces = wcslen(brace_end) - 1; + tot_len = length_preceding_braces + length_following_braces; + item_begin = brace_begin + 1; + for (const wchar_t *pos = (brace_begin + 1); true; pos++) { + if (brace_count == 0 && ((*pos == BRACE_SEP) || (pos == brace_end))) { assert(pos >= item_begin); size_t item_len = pos - item_begin; wcstring whole_item; whole_item.reserve(tot_len + item_len + 2); - whole_item.append(in, length_preceding_brackets); + whole_item.append(in, length_preceding_braces); whole_item.append(item_begin, item_len); - whole_item.append(bracket_end + 1); - expand_brackets(whole_item, flags, out, errors); + whole_item.append(brace_end + 1); + debug(0, L"Found brace item: %ls\n", whole_item.c_str()); + expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; - if (pos == bracket_end) break; + if (pos == brace_end) break; } - if (*pos == BRACKET_BEGIN) { - bracket_count++; + if (*pos == BRACE_BEGIN) { + brace_count++; } - if (*pos == BRACKET_END) { - bracket_count--; + if (*pos == BRACE_END) { + brace_count--; } } return EXPAND_OK; @@ -1274,9 +1275,9 @@ static expand_error_t expand_stage_variables(const wcstring &input, std::vector< return EXPAND_OK; } -static expand_error_t expand_stage_brackets(const wcstring &input, std::vector *out, +static expand_error_t expand_stage_braces(const wcstring &input, std::vector *out, expand_flags_t flags, parse_error_list_t *errors) { - return expand_brackets(input, flags, out, errors); + return expand_braces(input, flags, out, errors); } static expand_error_t expand_stage_home(const wcstring &input, @@ -1393,7 +1394,7 @@ expand_error_t expand_string(const wcstring &input, std::vector *o // Our expansion stages. const expand_stage_t stages[] = {expand_stage_cmdsubst, expand_stage_variables, - expand_stage_brackets, expand_stage_home, + expand_stage_braces, expand_stage_home, expand_stage_wildcards}; // Load up our single initial completion. diff --git a/src/expand.h b/src/expand.h index 771d8773b..62bd6a482 100644 --- a/src/expand.h +++ b/src/expand.h @@ -65,11 +65,11 @@ enum { /// Character representing variable expansion into a single element. VARIABLE_EXPAND_SINGLE, /// Character representing the start of a bracket expansion. - BRACKET_BEGIN, + BRACE_BEGIN, /// Character representing the end of a bracket expansion. - BRACKET_END, + BRACE_END, /// Character representing separation between two bracket elements. - BRACKET_SEP, + BRACE_SEP, /// Separate subtokens in a token with this character. INTERNAL_SEPARATOR, /// Character representing an empty variable expansion. Only used transitively while expanding diff --git a/src/highlight.cpp b/src/highlight.cpp index c0366c7d5..1b94e87fa 100644 --- a/src/highlight.cpp +++ b/src/highlight.cpp @@ -122,9 +122,9 @@ bool is_potential_path(const wcstring &potential_path_fragment, const wcstring_l switch (c) { case VARIABLE_EXPAND: case VARIABLE_EXPAND_SINGLE: - case BRACKET_BEGIN: - case BRACKET_END: - case BRACKET_SEP: + case BRACE_BEGIN: + case BRACE_END: + case BRACE_SEP: case ANY_CHAR: case ANY_STRING: case ANY_STRING_RECURSIVE: { diff --git a/src/parse_util.cpp b/src/parse_util.cpp index 2927763ee..2fb17e94a 100644 --- a/src/parse_util.cpp +++ b/src/parse_util.cpp @@ -834,14 +834,14 @@ void parse_util_expand_variable_error(const wcstring &token, size_t global_token wchar_t char_after_dollar = dollar_pos + 1 >= token.size() ? 0 : token.at(dollar_pos + 1); switch (char_after_dollar) { - case BRACKET_BEGIN: + case BRACE_BEGIN: case L'{': { - // The BRACKET_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible + // The BRACE_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible // quoted) ${. See if we have a }, and the stuff in between is variable material. If so, // report a bracket error. Otherwise just complain about the ${. bool looks_like_variable = false; size_t closing_bracket = - token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACKET_END), dollar_pos + 2); + token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACE_END), dollar_pos + 2); wcstring var_name; if (closing_bracket != wcstring::npos) { size_t var_start = dollar_pos + 2, var_end = closing_bracket; From f508a1f274b9ba8599cd1be7704d86bc2b75d7f2 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 12:13:55 -0500 Subject: [PATCH 03/11] Reset tokenizer state on start and improve slice error detection --- src/tokenizer.cpp | 39 +++++++++++++++++++++++++++++++-------- src/tokenizer.h | 1 + 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 31c719016..4530d51b6 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -37,6 +37,9 @@ /// Error for when ) is encountered with no matching ( #define ERROR_CLOSING_UNOPENED_PARENTHESIS _(L"Unexpected ')' for unopened parenthesis") +/// Error for when [ is encountered while already in bracket mode +#define ERROR_UNEXPECTED_BRACKET _(L"Unexpected '[' at this location") + wcstring error_message_for_code(tokenizer_error err) { switch (err) { case TOK_UNTERMINATED_QUOTE: @@ -53,6 +56,8 @@ wcstring error_message_for_code(tokenizer_error err) { return PIPE_ERROR; case TOK_CLOSING_UNOPENED_SUBSHELL: return ERROR_CLOSING_UNOPENED_PARENTHESIS; + case TOK_ILLEGAL_SLICE: + return ERROR_UNEXPECTED_BRACKET; default: assert(0 && "Unknown error type"); return {}; @@ -132,10 +137,11 @@ ENUM_FLAGS(tok_mode) { array_brackets = 1 << 1, // inside of array brackets curly_braces = 1 << 2, char_escape = 1 << 3, -} mode = tok_mode::regular_text; +}; /// Read the next token as a string. tok_t tokenizer_t::read_string() { + tok_mode mode { tok_mode::regular_text }; std::vector paran_offsets; int slice_offset = 0; const wchar_t *const buff_start = this->buff; @@ -153,11 +159,18 @@ tok_t tokenizer_t::read_string() { mode &= ~(tok_mode::char_escape); // and do nothing more } - else if (!myal(c)) { - if (c == L'\0') { - break; - } - else if (c == L'\\') { + else if (myal(c)) { + // Early exit optimization in case the character is just a letter, + // which has no special meaning to the tokenizer, i.e. the same mode continues. + } + // This check has to be after the char_escape check above + else if (c == L'\0') { + break; + } + + // Now proceed with the evaluation of the token, first checking to see if the token + // has been explicitly ignored (escaped). + else if (c == L'\\') { mode |= tok_mode::char_escape; } else if (c == L'(') { @@ -176,13 +189,24 @@ tok_t tokenizer_t::read_string() { } else if (c == L'[') { if (this->buff != buff_start) { - mode |= tok_mode::array_brackets; + if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { + // Nested brackets should not overwrite the existing slice_offset + //mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell + //prints an error message with the caret pointing at token_start, + //not err_loc, making the TOK_ILLEGAL_SLICE message misleading. + // return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff); + return call_error(TOK_UNTERMINATED_SLICE, buff_start, this->buff); + } slice_offset = this->buff - this->start; + mode |= tok_mode::array_brackets; } else { // This is actually allowed so the test operator `[` can be used as the head of a command } } + // Only exit bracket mode if we are in bracket mode. + // Reason: `]` can be a parameter, e.g. last parameter to `[` test alias. + // e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket else if (c == L']' && ((mode & tok_mode::array_brackets) == tok_mode::array_brackets)) { mode &= ~(tok_mode::array_brackets); } @@ -202,7 +226,6 @@ tok_t tokenizer_t::read_string() { else if (mode == tok_mode::regular_text && !tok_is_string_character(c, is_first)) { break; } - } #if false if (mode != mode_begin) { diff --git a/src/tokenizer.h b/src/tokenizer.h index 11b71bc0d..1110e86ab 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -32,6 +32,7 @@ enum tokenizer_error { TOK_INVALID_REDIRECT, TOK_INVALID_PIPE, TOK_CLOSING_UNOPENED_SUBSHELL, + TOK_ILLEGAL_SLICE, }; enum class redirection_type_t { From df89d712379308f277ba5ee59db7d001dea34bc5 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 17:10:16 -0500 Subject: [PATCH 04/11] Correct escaping behavior in new tokenizer code --- src/tokenizer.cpp | 108 +++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 4530d51b6..c216e1972 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -154,6 +154,10 @@ tok_t tokenizer_t::read_string() { tok_mode mode_begin = mode; #endif + if (c == L'\0') { + break; + } + // Make sure this character isn't being escaped before anything else if ((mode & tok_mode::char_escape) == tok_mode::char_escape) { mode &= ~(tok_mode::char_escape); @@ -163,69 +167,65 @@ tok_t tokenizer_t::read_string() { // Early exit optimization in case the character is just a letter, // which has no special meaning to the tokenizer, i.e. the same mode continues. } - // This check has to be after the char_escape check above - else if (c == L'\0') { - break; - } // Now proceed with the evaluation of the token, first checking to see if the token // has been explicitly ignored (escaped). else if (c == L'\\') { - mode |= tok_mode::char_escape; + mode |= tok_mode::char_escape; + } + else if (c == L'(') { + paran_offsets.push_back(this->buff - this->start); + mode |= tok_mode::subshell; + } + else if (c == L')') { + switch (paran_offsets.size()) { + case 0: + return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, buff_start, this->buff); + case 1: + mode &= ~(tok_mode::subshell); + default: + paran_offsets.pop_back(); } - else if (c == L'(') { - paran_offsets.push_back(this->buff - this->start); - mode |= tok_mode::subshell; - } - else if (c == L')') { - switch (paran_offsets.size()) { - case 0: - return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, buff_start, this->buff); - case 1: - mode &= ~(tok_mode::subshell); - default: - paran_offsets.pop_back(); + } + else if (c == L'[') { + if (this->buff != buff_start) { + if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { + // Nested brackets should not overwrite the existing slice_offset + //mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell + //prints an error message with the caret pointing at token_start, + //not err_loc, making the TOK_ILLEGAL_SLICE message misleading. + // return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff); + return call_error(TOK_UNTERMINATED_SLICE, buff_start, this->buff); } + slice_offset = this->buff - this->start; + mode |= tok_mode::array_brackets; } - else if (c == L'[') { - if (this->buff != buff_start) { - if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { - // Nested brackets should not overwrite the existing slice_offset - //mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell - //prints an error message with the caret pointing at token_start, - //not err_loc, making the TOK_ILLEGAL_SLICE message misleading. - // return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff); - return call_error(TOK_UNTERMINATED_SLICE, buff_start, this->buff); - } - slice_offset = this->buff - this->start; - mode |= tok_mode::array_brackets; + else { + // This is actually allowed so the test operator `[` can be used as the head of a command + } + } + // Only exit bracket mode if we are in bracket mode. + // Reason: `]` can be a parameter, e.g. last parameter to `[` test alias. + // e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket + else if (c == L']' && ((mode & tok_mode::array_brackets) == tok_mode::array_brackets)) { + mode &= ~(tok_mode::array_brackets); + } + else if (c == L'\'' || c == L'"') { + const wchar_t *end = quote_end(this->buff); + if (end) { + this->buff = end; + } else { + const wchar_t *error_loc = this->buff; + this->buff += wcslen(this->buff); + if ((!this->accept_unfinished)) { + return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, error_loc); } - else { - // This is actually allowed so the test operator `[` can be used as the head of a command - } - } - // Only exit bracket mode if we are in bracket mode. - // Reason: `]` can be a parameter, e.g. last parameter to `[` test alias. - // e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket - else if (c == L']' && ((mode & tok_mode::array_brackets) == tok_mode::array_brackets)) { - mode &= ~(tok_mode::array_brackets); - } - else if (c == L'\'' || c == L'"') { - const wchar_t *end = quote_end(this->buff); - if (end) { - this->buff = end; - } else { - const wchar_t *error_loc = this->buff; - this->buff += wcslen(this->buff); - if ((!this->accept_unfinished)) { - return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, error_loc); - } - break; - } - } - else if (mode == tok_mode::regular_text && !tok_is_string_character(c, is_first)) { break; } + } + else if (mode == tok_mode::regular_text && !tok_is_string_character(c, is_first)) { + break; + } #if false if (mode != mode_begin) { @@ -244,7 +244,7 @@ tok_t tokenizer_t::read_string() { tok_t error; if ((mode & tok_mode::char_escape) == tok_mode::char_escape) { error = this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start, - this->buff); + this->buff - 1); } else if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start, From 74474324713025b86797fdb01844eaa9ccab5092 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 17:16:53 -0500 Subject: [PATCH 05/11] Fix tests for new `)` token error --- src/fish_tests.cpp | 14 ++++++++++++-- src/tokenizer.cpp | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 3ccd7ad25..71ee33a94 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -578,6 +578,15 @@ static void test_tokenizer() { do_test(token.error_offset == 3); } + { + tokenizer_t t(L"abc )defg(hij", 0); + do_test(t.next(&token)); + do_test(t.next(&token)); + do_test(token.type == TOK_ERROR); + do_test(token.error == TOK_CLOSING_UNOPENED_SUBSHELL); + do_test(token.error_offset == 4); + } + { tokenizer_t t(L"abc defg(hij (klm)", 0); do_test(t.next(&token)); @@ -4420,10 +4429,11 @@ static void test_illegal_command_exit_code() { const command_result_tuple_t tests[] = { {L"echo -n", STATUS_CMD_OK}, {L"pwd", STATUS_CMD_OK}, - {L")", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}, + // a `)` without a matching `(` is now a tokenizer error, and cannot be executed even as an illegal command + // {L")", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD} {L"*", STATUS_ILLEGAL_CMD}, {L"**", STATUS_ILLEGAL_CMD}, {L"?", STATUS_ILLEGAL_CMD}, {L"abc?def", STATUS_ILLEGAL_CMD}, - {L") ", STATUS_ILLEGAL_CMD}}; + }; int res = 0; const io_chain_t empty_ios; diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index c216e1972..f2ff61267 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -180,7 +180,7 @@ tok_t tokenizer_t::read_string() { else if (c == L')') { switch (paran_offsets.size()) { case 0: - return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, buff_start, this->buff); + return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff); case 1: mode &= ~(tok_mode::subshell); default: @@ -195,7 +195,7 @@ tok_t tokenizer_t::read_string() { //prints an error message with the caret pointing at token_start, //not err_loc, making the TOK_ILLEGAL_SLICE message misleading. // return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff); - return call_error(TOK_UNTERMINATED_SLICE, buff_start, this->buff); + return this->call_error(TOK_UNTERMINATED_SLICE, this->start, this->buff); } slice_offset = this->buff - this->start; mode |= tok_mode::array_brackets; From 00f95a978e1caade5addda4c368feaeb26e5c8a4 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 19:36:10 -0500 Subject: [PATCH 06/11] Make { and } valid, first-class tokenizer elements --- src/expand.cpp | 4 +- src/parse_tree.cpp | 29 +------------- src/tokenizer.cpp | 92 +++++++++++++++++++++----------------------- src/tokenizer.h | 34 +++++++++------- src/wcstringutil.cpp | 18 +++++++++ src/wcstringutil.h | 1 + 6 files changed, 89 insertions(+), 89 deletions(-) diff --git a/src/expand.cpp b/src/expand.cpp index 3ead4819a..8aa7fc540 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -47,6 +47,7 @@ #include "proc.h" #include "reader.h" #include "wildcard.h" +#include "wcstringutil.h" #include "wutil.h" // IWYU pragma: keep #ifdef KERN_PROCARGS2 #else @@ -941,7 +942,8 @@ static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, whole_item.append(in, length_preceding_braces); whole_item.append(item_begin, item_len); whole_item.append(brace_end + 1); - debug(0, L"Found brace item: %ls\n", whole_item.c_str()); + auto whole_item2 = trim(whole_item); + debug(0, L"Found brace item: %ls\n", whole_item2.c_str()); expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp index 9c51025f4..9ddcebd09 100644 --- a/src/parse_tree.cpp +++ b/src/parse_tree.cpp @@ -668,35 +668,10 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta } void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) { - parse_error_code_t parse_error_code; - switch (tok.error) { - case TOK_UNTERMINATED_QUOTE: { - parse_error_code = parse_error_tokenizer_unterminated_quote; - break; - } - case TOK_UNTERMINATED_SUBSHELL: { - parse_error_code = parse_error_tokenizer_unterminated_subshell; - break; - } - case TOK_UNTERMINATED_SLICE: { - parse_error_code = parse_error_tokenizer_unterminated_slice; - break; - } - case TOK_UNTERMINATED_ESCAPE: { - parse_error_code = parse_error_tokenizer_unterminated_escape; - break; - } - case TOK_INVALID_REDIRECT: - case TOK_INVALID_PIPE: - default: { - parse_error_code = parse_error_tokenizer_other; - break; - } - } - + parse_error_code_t parse_error_code = tok.error->parser_error; this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset, parse_error_code, L"%ls", - error_message_for_code(tok.error).c_str()); + tok.error->Message); } void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) { diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index f2ff61267..ae804bb59 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -16,56 +16,22 @@ #include "tokenizer.h" #include "wutil.h" // IWYU pragma: keep -/// Error string for unexpected end of string. -#define QUOTE_ERROR _(L"Unexpected end of string, quotes are not balanced") - -/// Error string for mismatched parenthesis. -#define PARAN_ERROR _(L"Unexpected end of string, parenthesis do not match") - -/// Error string for mismatched square brackets. -#define SQUARE_BRACKET_ERROR _(L"Unexpected end of string, square brackets do not match") - -/// Error string for unterminated escape (backslash without continuation). -#define UNTERMINATED_ESCAPE_ERROR _(L"Unexpected end of string, incomplete escape sequence") - -/// Error string for invalid redirections. -#define REDIRECT_ERROR _(L"Invalid input/output redirection") - -/// Error string for when trying to pipe from fd 0. -#define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output") - -/// Error for when ) is encountered with no matching ( -#define ERROR_CLOSING_UNOPENED_PARENTHESIS _(L"Unexpected ')' for unopened parenthesis") - -/// Error for when [ is encountered while already in bracket mode -#define ERROR_UNEXPECTED_BRACKET _(L"Unexpected '[' at this location") - -wcstring error_message_for_code(tokenizer_error err) { - switch (err) { - case TOK_UNTERMINATED_QUOTE: - return QUOTE_ERROR; - case TOK_UNTERMINATED_SUBSHELL: - return PARAN_ERROR; - case TOK_UNTERMINATED_SLICE: - return SQUARE_BRACKET_ERROR; - case TOK_UNTERMINATED_ESCAPE: - return UNTERMINATED_ESCAPE_ERROR; - case TOK_INVALID_REDIRECT: - return REDIRECT_ERROR; - case TOK_INVALID_PIPE: - return PIPE_ERROR; - case TOK_CLOSING_UNOPENED_SUBSHELL: - return ERROR_CLOSING_UNOPENED_PARENTHESIS; - case TOK_ILLEGAL_SLICE: - return ERROR_UNEXPECTED_BRACKET; - default: - assert(0 && "Unknown error type"); - return {}; - } -} +tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L""); +tokenizer_error *TOK_UNTERMINATED_QUOTE = new tokenizer_error((L"Unexpected end of string, quotes are not balanced"), parse_error_tokenizer_unterminated_quote); +tokenizer_error *TOK_UNTERMINATED_SUBSHELL = new tokenizer_error((L"Unexpected end of string, expecting ')'"), parse_error_tokenizer_unterminated_subshell); +tokenizer_error *TOK_UNTERMINATED_SLICE = new tokenizer_error((L"Unexpected end of string, square brackets do not match"), parse_error_tokenizer_unterminated_slice); +tokenizer_error *TOK_UNTERMINATED_ESCAPE = new tokenizer_error((L"Unexpected end of string, incomplete escape sequence"), parse_error_tokenizer_unterminated_escape); +tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error((L"Invalid input/output redirection")); +tokenizer_error *TOK_INVALID_PIPE = new tokenizer_error((L"Cannot use stdin (fd 0) as pipe output")); +tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL = new tokenizer_error((L"Unexpected ')' for unopened parenthesis")); +tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error((L"Unexpected '[' at this location")); +tokenizer_error *TOK_CLOSING_UNOPENED_BRACE = new tokenizer_error((L"Unexpected '}' for unopened brace expansion")); +tokenizer_error *TOK_UNTERMINATED_BRACE = new tokenizer_error((L"Unexpected end of string, incomplete parameter expansion")); +tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE = new tokenizer_error((L"Unexpected '}' found, expecting ')'")); +tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE = new tokenizer_error((L"Unexpected ')' found, expecting '}'")); /// Return an error token and mark that we no longer have a next token. -tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start, +tok_t tokenizer_t::call_error(tokenizer_error *error_type, const wchar_t *token_start, const wchar_t *error_loc) { assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error"); assert(error_loc >= token_start && "Invalid error location"); @@ -143,6 +109,7 @@ ENUM_FLAGS(tok_mode) { tok_t tokenizer_t::read_string() { tok_mode mode { tok_mode::regular_text }; std::vector paran_offsets; + std::vector expecting; int slice_offset = 0; const wchar_t *const buff_start = this->buff; bool is_first = true; @@ -175,9 +142,18 @@ tok_t tokenizer_t::read_string() { } else if (c == L'(') { paran_offsets.push_back(this->buff - this->start); + expecting.push_back(L')'); mode |= tok_mode::subshell; } + else if (c == L'{') { + paran_offsets.push_back(this->buff - this->start); + expecting.push_back(L'}'); + mode |= tok_mode::curly_braces; + } else if (c == L')') { + if (expecting.size() > 0 && expecting.back() == L'}') { + return this->call_error(TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, this->start, this->buff); + } switch (paran_offsets.size()) { case 0: return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff); @@ -187,6 +163,19 @@ tok_t tokenizer_t::read_string() { paran_offsets.pop_back(); } } + else if (c == L'}') { + if (expecting.size() > 0 && expecting.back() == L')') { + return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff); + } + switch (paran_offsets.size()) { + case 0: + return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff); + case 1: + mode &= ~(tok_mode::curly_braces); + default: + paran_offsets.pop_back(); + } + } else if (c == L'[') { if (this->buff != buff_start) { if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) { @@ -257,6 +246,13 @@ tok_t tokenizer_t::read_string() { error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start, this->start + offset_of_open_paran); } + else if ((mode & tok_mode::curly_braces) == tok_mode::curly_braces) { + assert(paran_offsets.size() > 0); + size_t offset_of_open_brace = paran_offsets.back(); + + error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start, + this->start + offset_of_open_brace); + } return error; } diff --git a/src/tokenizer.h b/src/tokenizer.h index 1110e86ab..8ce6618a7 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -7,6 +7,7 @@ #include "common.h" #include "maybe.h" +#include "parse_constants.h" /// Token types. enum token_type { @@ -22,19 +23,26 @@ enum token_type { TOK_COMMENT /// comment token }; -/// Tokenizer error types. -enum tokenizer_error { - TOK_ERROR_NONE, - TOK_UNTERMINATED_QUOTE, - TOK_UNTERMINATED_SUBSHELL, - TOK_UNTERMINATED_SLICE, - TOK_UNTERMINATED_ESCAPE, - TOK_INVALID_REDIRECT, - TOK_INVALID_PIPE, - TOK_CLOSING_UNOPENED_SUBSHELL, - TOK_ILLEGAL_SLICE, +struct tokenizer_error { + const wchar_t *Message; + enum parse_error_code_t parser_error; //the parser error associated with this tokenizer error + tokenizer_error(const wchar_t *msg, enum parse_error_code_t perr = parse_error_tokenizer_other) + : Message(msg), parser_error(perr) {} + tokenizer_error(const tokenizer_error&) = delete; }; +extern tokenizer_error *TOK_ERROR_NONE; +extern tokenizer_error *TOK_UNTERMINATED_QUOTE; +extern tokenizer_error *TOK_UNTERMINATED_SUBSHELL; +extern tokenizer_error *TOK_UNTERMINATED_SLICE; +extern tokenizer_error *TOK_UNTERMINATED_ESCAPE; +extern tokenizer_error *TOK_UNTERMINATED_BRACE; +extern tokenizer_error *TOK_INVALID_REDIRECT; +extern tokenizer_error *TOK_INVALID_PIPE; +extern tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL; +extern tokenizer_error *TOK_CLOSING_UNOPENED_BRACE; +extern tokenizer_error *TOK_ILLEGAL_SLICE; + enum class redirection_type_t { overwrite, // normal redirection: > file.txt append, // appending redirection: >> file.txt @@ -69,7 +77,7 @@ struct tok_t { maybe_t redirected_fd{}; // If an error, this is the error code. - enum tokenizer_error error { TOK_ERROR_NONE }; + tokenizer_error *error { TOK_ERROR_NONE }; // If an error, this is the offset of the error within the token. A value of 0 means it occurred // at 'offset'. @@ -99,7 +107,7 @@ class tokenizer_t { /// Whether to continue the previous line after the comment. bool continue_line_after_comment{false}; - tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start, + tok_t call_error(tokenizer_error *error_type, const wchar_t *token_start, const wchar_t *error_loc); tok_t read_string(); maybe_t tok_next(); diff --git a/src/wcstringutil.cpp b/src/wcstringutil.cpp index 79209c1c5..5ed6d7b74 100644 --- a/src/wcstringutil.cpp +++ b/src/wcstringutil.cpp @@ -45,3 +45,21 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) { output.push_back(ellipsis_char); return output; } + +wcstring trim(const wcstring &input) { + debug(0, "trimming '%ls'", input.c_str()); + + // auto begin = input.cbegin(); + // for (begin; *begin == L' '; ++begin); + // auto end = input.cbegin() + input.size(); + // for (end; end > begin && *end == L' '; ++end); + + auto begin_offset = input.find_first_not_of(whitespace); + if (begin_offset == wcstring::npos) { + return wcstring{}; + } + auto end = input.cbegin() + input.find_last_not_of(whitespace); + + wcstring result(input.begin() + begin_offset, end + 1); + return result; +} diff --git a/src/wcstringutil.h b/src/wcstringutil.h index 878771f25..8665c0024 100644 --- a/src/wcstringutil.h +++ b/src/wcstringutil.h @@ -59,5 +59,6 @@ enum class ellipsis_type { }; wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest); +wcstring trim(const wcstring &input); #endif From 1629d339deec25fbe21e7be96818139cf65b5495 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 19:51:54 -0500 Subject: [PATCH 07/11] Properly parse spaces and escaped/quoted spaces in expansion braces --- src/common.cpp | 17 ++++++++++++----- src/expand.cpp | 4 ++-- src/wcstringutil.cpp | 7 ------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/common.cpp b/src/common.cpp index b3a96db18..da7ac03f2 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1288,10 +1288,10 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in const bool unescape_special = static_cast(flags & UNESCAPE_SPECIAL); const bool allow_incomplete = static_cast(flags & UNESCAPE_INCOMPLETE); - int bracket_count = 0; + int brace_count = 0; bool errored = false; - enum { mode_unquoted, mode_single_quotes, mode_double_quotes } mode = mode_unquoted; + enum { mode_unquoted, mode_single_quotes, mode_double_quotes, mode_braces } mode = mode_unquoted; for (size_t input_position = 0; input_position < input_len && !errored; input_position++) { const wchar_t c = input[input_position]; @@ -1352,24 +1352,31 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in } case L'{': { if (unescape_special) { - bracket_count++; + brace_count++; to_append_or_none = BRACE_BEGIN; } break; } case L'}': { if (unescape_special) { - bracket_count--; + brace_count--; to_append_or_none = BRACE_END; } break; } case L',': { - if (unescape_special && bracket_count > 0) { + if (unescape_special && brace_count > 0) { to_append_or_none = BRACE_SEP; } break; } + case L' ': { + //spaces, unless quoted or escaped, are ignored within braces + // if (unescape_special && brace_count > 0) { + // input_position++; //skip the space + // } + break; + } case L'\'': { mode = mode_single_quotes; to_append_or_none = unescape_special ? wint_t(INTERNAL_SEPARATOR) : NOT_A_WCHAR; diff --git a/src/expand.cpp b/src/expand.cpp index 8aa7fc540..2031dc717 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -942,8 +942,8 @@ static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, whole_item.append(in, length_preceding_braces); whole_item.append(item_begin, item_len); whole_item.append(brace_end + 1); - auto whole_item2 = trim(whole_item); - debug(0, L"Found brace item: %ls\n", whole_item2.c_str()); + whole_item = trim(whole_item); + // debug(0, L"Found brace item: '%ls'\n", whole_item.c_str()); expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; diff --git a/src/wcstringutil.cpp b/src/wcstringutil.cpp index 5ed6d7b74..30a3a0111 100644 --- a/src/wcstringutil.cpp +++ b/src/wcstringutil.cpp @@ -47,13 +47,6 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) { } wcstring trim(const wcstring &input) { - debug(0, "trimming '%ls'", input.c_str()); - - // auto begin = input.cbegin(); - // for (begin; *begin == L' '; ++begin); - // auto end = input.cbegin() + input.size(); - // for (end; end > begin && *end == L' '; ++end); - auto begin_offset = input.find_first_not_of(whitespace); if (begin_offset == wcstring::npos) { return wcstring{}; From 0620cdf7114a772e961a8574cd3a5ec20446cbec Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 20:06:45 -0500 Subject: [PATCH 08/11] Fix tokenizer errors for nested, alternating {} and () --- src/tokenizer.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index ae804bb59..65e262204 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -109,6 +109,7 @@ ENUM_FLAGS(tok_mode) { tok_t tokenizer_t::read_string() { tok_mode mode { tok_mode::regular_text }; std::vector paran_offsets; + std::vector brace_offsets; std::vector expecting; int slice_offset = 0; const wchar_t *const buff_start = this->buff; @@ -146,7 +147,7 @@ tok_t tokenizer_t::read_string() { mode |= tok_mode::subshell; } else if (c == L'{') { - paran_offsets.push_back(this->buff - this->start); + brace_offsets.push_back(this->buff - this->start); expecting.push_back(L'}'); mode |= tok_mode::curly_braces; } @@ -162,19 +163,21 @@ tok_t tokenizer_t::read_string() { default: paran_offsets.pop_back(); } + expecting.pop_back(); } else if (c == L'}') { if (expecting.size() > 0 && expecting.back() == L')') { return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff); } - switch (paran_offsets.size()) { + switch (brace_offsets.size()) { case 0: return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff); case 1: mode &= ~(tok_mode::curly_braces); default: - paran_offsets.pop_back(); + brace_offsets.pop_back(); } + expecting.pop_back(); } else if (c == L'[') { if (this->buff != buff_start) { @@ -247,8 +250,8 @@ tok_t tokenizer_t::read_string() { this->start + offset_of_open_paran); } else if ((mode & tok_mode::curly_braces) == tok_mode::curly_braces) { - assert(paran_offsets.size() > 0); - size_t offset_of_open_brace = paran_offsets.back(); + assert(brace_offsets.size() > 0); + size_t offset_of_open_brace = brace_offsets.back(); error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start, this->start + offset_of_open_brace); From 364115f818ef57df60db2893eac41a0ff62d8051 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 20:18:21 -0500 Subject: [PATCH 09/11] fixup! Properly parse spaces and escaped/quoted spaces in expansion braces --- src/expand.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/expand.cpp b/src/expand.cpp index 2031dc717..899ec4653 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -942,8 +942,6 @@ static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, whole_item.append(in, length_preceding_braces); whole_item.append(item_begin, item_len); whole_item.append(brace_end + 1); - whole_item = trim(whole_item); - // debug(0, L"Found brace item: '%ls'\n", whole_item.c_str()); expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; From 24afff1c77c636c1bbc7b1355518313dab9859d1 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 22:02:43 -0500 Subject: [PATCH 10/11] Handle whitespace within parameter expansion tokens From the discussion in #3802, handling spaces within braces more gracefully. Leading and trailing whitespace that isn't quoted or escaped is stripped, whitespace in the middle is preserved. Any whitespace encountered within expansion tokens is treated as a single space, similar to how programming languages that don't hard break tokens/quotes on line endings would. --- src/common.cpp | 20 +++++++++++++++----- src/expand.cpp | 10 +++++++++- src/expand.h | 2 ++ src/wcstringutil.cpp | 6 +++--- src/wcstringutil.h | 2 +- 5 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/common.cpp b/src/common.cpp index da7ac03f2..f3c91df30 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1288,6 +1288,7 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in const bool unescape_special = static_cast(flags & UNESCAPE_SPECIAL); const bool allow_incomplete = static_cast(flags & UNESCAPE_INCOMPLETE); + bool brace_text_start = false; int brace_count = 0; bool errored = false; @@ -1359,7 +1360,9 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in } case L'}': { if (unescape_special) { + assert(brace_count > 0 && "imbalanced brackets are a tokenizer error, we shouldn't be able to get here"); brace_count--; + brace_text_start = brace_text_start && brace_count > 0; to_append_or_none = BRACE_END; } break; @@ -1367,14 +1370,16 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in case L',': { if (unescape_special && brace_count > 0) { to_append_or_none = BRACE_SEP; + brace_text_start = false; } break; } + case L'\n': + case L'\t': case L' ': { - //spaces, unless quoted or escaped, are ignored within braces - // if (unescape_special && brace_count > 0) { - // input_position++; //skip the space - // } + if (unescape_special && brace_count > 0) { + to_append_or_none = brace_text_start ? BRACE_SPACE : NOT_A_WCHAR; + } break; } case L'\'': { @@ -1387,7 +1392,12 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in to_append_or_none = unescape_special ? wint_t(INTERNAL_SEPARATOR) : NOT_A_WCHAR; break; } - default: { break; } + default: { + if (unescape_special && brace_count > 0) { + brace_text_start = true; + } + break; + } } } else if (mode == mode_single_quotes) { if (c == L'\\') { diff --git a/src/expand.cpp b/src/expand.cpp index 899ec4653..141d4158f 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -936,12 +936,20 @@ static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags, if (brace_count == 0 && ((*pos == BRACE_SEP) || (pos == brace_end))) { assert(pos >= item_begin); size_t item_len = pos - item_begin; + wcstring item = wcstring(item_begin, item_len); + item = trim(item, (const wchar_t[]) { BRACE_SPACE }); + for (auto &c : item) { + if (c == BRACE_SPACE) { + c = ' '; + } + } wcstring whole_item; whole_item.reserve(tot_len + item_len + 2); whole_item.append(in, length_preceding_braces); - whole_item.append(item_begin, item_len); + whole_item.append(item.begin(), item.end()); whole_item.append(brace_end + 1); + whole_item = trim(whole_item, (const wchar_t[]) { BRACE_SPACE }); expand_braces(whole_item, flags, out, errors); item_begin = pos + 1; diff --git a/src/expand.h b/src/expand.h index 62bd6a482..890a6407b 100644 --- a/src/expand.h +++ b/src/expand.h @@ -70,6 +70,8 @@ enum { BRACE_END, /// Character representing separation between two bracket elements. BRACE_SEP, + /// Character that takes the place of any whitespace within non-quoted text in braces + BRACE_SPACE, /// Separate subtokens in a token with this character. INTERNAL_SEPARATOR, /// Character representing an empty variable expansion. Only used transitively while expanding diff --git a/src/wcstringutil.cpp b/src/wcstringutil.cpp index 30a3a0111..348257443 100644 --- a/src/wcstringutil.cpp +++ b/src/wcstringutil.cpp @@ -46,12 +46,12 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) { return output; } -wcstring trim(const wcstring &input) { - auto begin_offset = input.find_first_not_of(whitespace); +wcstring trim(const wcstring &input, const wchar_t *any_of) { + auto begin_offset = input.find_first_not_of(any_of); if (begin_offset == wcstring::npos) { return wcstring{}; } - auto end = input.cbegin() + input.find_last_not_of(whitespace); + auto end = input.cbegin() + input.find_last_not_of(any_of); wcstring result(input.begin() + begin_offset, end + 1); return result; diff --git a/src/wcstringutil.h b/src/wcstringutil.h index 8665c0024..75351e38f 100644 --- a/src/wcstringutil.h +++ b/src/wcstringutil.h @@ -59,6 +59,6 @@ enum class ellipsis_type { }; wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest); -wcstring trim(const wcstring &input); +wcstring trim(const wcstring &input, const wchar_t *any_of); #endif From 1e5d7d98a85d024204dd72bc42655316dc7a632a Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sun, 11 Mar 2018 23:34:58 -0500 Subject: [PATCH 11/11] Add tests for new parameter expansion features --- tests/parameter_expansion.err | 0 tests/parameter_expansion.in | 34 ++++++++++++++++++++++++++++++++++ tests/parameter_expansion.out | 14 ++++++++++++++ 3 files changed, 48 insertions(+) create mode 100644 tests/parameter_expansion.err create mode 100644 tests/parameter_expansion.in create mode 100644 tests/parameter_expansion.out diff --git a/tests/parameter_expansion.err b/tests/parameter_expansion.err new file mode 100644 index 000000000..e69de29bb diff --git a/tests/parameter_expansion.in b/tests/parameter_expansion.in new file mode 100644 index 000000000..89ba2b2d9 --- /dev/null +++ b/tests/parameter_expansion.in @@ -0,0 +1,34 @@ +# basic expansion test +echo {} +echo {apple} +echo {apple,orange} + +# expansion tests with spaces +echo {apple, orange} +echo { apple, orange, banana } + +# expansion with spaces and cartesian products +echo \'{ hello , world }\' + +# expansion with escapes +for phrase in {good\,, beautiful ,morning}; echo -n "$phrase "; end | string trim; +for phrase in {goodbye\,,\ cruel\ ,world\n}; echo -n $phrase; end; + +# whitespace within entries converted to spaces in a single entry +for foo in { hello +world } + echo \'$foo\' +end + +# dual expansion cartesian product +echo { alpha, beta }\ {lambda, gamma }, | sed -r 's/(.*),/\1/' + +# expansion with subshells +for name in { (echo Meg), (echo Jo) } + echo $name +end + +# subshells with expansion +for name in (for name in {Beth, Amy}; printf "$name\n"; end); printf "$name\n"; end + +# vim: set ft=fish: diff --git a/tests/parameter_expansion.out b/tests/parameter_expansion.out new file mode 100644 index 000000000..d89373285 --- /dev/null +++ b/tests/parameter_expansion.out @@ -0,0 +1,14 @@ +{} +apple +apple orange +apple orange +apple orange banana +'hello' 'world' +good, beautiful morning +goodbye, cruel world +'hello world' +alpha lambda, beta lambda, alpha gamma, beta gamma +Meg +Jo +Beth +Amy