diff --git a/src/common.h b/src/common.h
index a5b4bb18e..8047018b5 100644
--- a/src/common.h
+++ b/src/common.h
@@ -807,6 +807,25 @@ struct enum_map {
     const wchar_t *const str;
 };
 
+
+/// Use for scoped enums (i.e. `enum class`) with bitwise operations.
+///
+/// ENUM_FLAG_OPERATOR(T,X,Y) defines binary operator X and compound-assignment operator Y for the
+/// scoped enum T by applying X to the operands' underlying integer values.
+#define ENUM_FLAG_OPERATOR(T,X,Y) \
+inline T operator X (T lhs, T rhs) { return static_cast<T>(static_cast<typename std::underlying_type<T>::type>(lhs) X static_cast<typename std::underlying_type<T>::type>(rhs)); } \
+inline T operator Y (T &lhs, T rhs) { return lhs = static_cast<T>(static_cast<typename std::underlying_type<T>::type>(lhs) X static_cast<typename std::underlying_type<T>::type>(rhs)); }
+/// ENUM_FLAGS(T) forward-declares `enum class T` (so T must keep the default underlying type),
+/// defines ~, |, ^, & and |=, ^=, &= for it, and ends in `enum class T` so the enumerator list
+/// follows the invocation: `ENUM_FLAGS(my_flags) { a = 1 << 0, b = 1 << 1 };`
+#define ENUM_FLAGS(T) \
+enum class T; \
+inline T operator ~ (T t) { return static_cast<T>(~static_cast<typename std::underlying_type<T>::type>(t)); } \
+ENUM_FLAG_OPERATOR(T,|,|=) \
+ENUM_FLAG_OPERATOR(T,^,^=) \
+ENUM_FLAG_OPERATOR(T,&,&=) \
+enum class T
+
 /// Given a string return the matching enum. Return the sentinal enum if no match is made. The map
 /// must be sorted by the `str` member. A binary search is twice as fast as a linear search with 16
 /// elements in the map.
diff --git a/src/parse_constants.h b/src/parse_constants.h
index 05700fc4d..b84de73e6 100644
--- a/src/parse_constants.h
+++ b/src/parse_constants.h
@@ -169,6 +169,7 @@ enum parse_error_code_t {
     parse_error_tokenizer_unterminated_subshell,
     parse_error_tokenizer_unterminated_slice,
     parse_error_tokenizer_unterminated_escape,
+    parse_error_tokenizer_nested_slice,
     parse_error_tokenizer_other,
 
     parse_error_unbalancing_end,  // end outside of block
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index eb49cf263..31c719016 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -34,6 +34,9 @@
 /// Error string for when trying to pipe from fd 0.
 #define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output")
 
+/// Error for when ) is encountered with no matching (
+#define ERROR_CLOSING_UNOPENED_PARENTHESIS _(L"Unexpected ')' for unopened parenthesis")
+
 wcstring error_message_for_code(tokenizer_error err) {
     switch (err) {
         case TOK_UNTERMINATED_QUOTE:
@@ -48,6 +51,8 @@ wcstring error_message_for_code(tokenizer_error err) {
             return REDIRECT_ERROR;
         case TOK_INVALID_PIPE:
             return PIPE_ERROR;
+        case TOK_CLOSING_UNOPENED_SUBSHELL:
+            return ERROR_CLOSING_UNOPENED_PARENTHESIS;
         default:
             assert(0 && "Unknown error type");
             return {};
@@ -119,194 +124,115 @@ static bool tok_is_string_character(wchar_t c, bool is_first) {
 /// Quick test to catch the most common 'non-magical' characters, makes read_string slightly faster
 /// by adding a fast path for the most common characters. This is obviously not a suitable
 /// replacement for iswalpha.
-static int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }
+static inline int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }
+
+/// Flags tracking which constructs read_string() is currently inside of while scanning a string
+/// token. Several flags may be set at once (e.g. a subshell inside of array brackets).
+ENUM_FLAGS(tok_mode) {
+    regular_text = 0,         // regular text
+    subshell = 1 << 0,        // inside of subshell parentheses
+    array_brackets = 1 << 1,  // inside of array brackets
+    curly_braces = 1 << 2,    // inside of curly braces (not set by read_string yet)
+    char_escape = 1 << 3,     // the previous character was a backslash
+};
 
 /// Read the next token as a string.
 tok_t tokenizer_t::read_string() {
-    bool do_loop = true;
-    size_t paran_count = 0;
-    // Up to 96 open parens, before we give up on good error reporting.
-    const size_t paran_offsets_max = 96;
-    size_t paran_offsets[paran_offsets_max];
-    // Where the open bracket is.
-    size_t offset_of_bracket = 0;
+    // Scanner state is local so that an early error return cannot leak flags into the next call.
+    tok_mode mode = tok_mode::regular_text;
+    // Offsets (relative to this->start) of every '(' that has not been closed yet.
+    std::vector<size_t> paran_offsets;
+    // Offset (relative to this->start) of the most recent '[' that opened a slice.
+    size_t slice_offset = 0;
     const wchar_t *const buff_start = this->buff;
     bool is_first = true;
 
-    enum tok_mode_t {
-        mode_regular_text = 0,    // regular text
-        mode_subshell = 1,        // inside of subshell
-        mode_array_brackets = 2,  // inside of array brackets
-        mode_array_brackets_and_subshell =
-            3  // inside of array brackets and subshell, like in '$foo[(ech'
-    } mode = mode_regular_text;
+    while (true) {
+        wchar_t c = *this->buff;
 
-    while (1) {
-        if (!myal(*this->buff)) {
-            if (*this->buff == L'\\') {
-                const wchar_t *error_location = this->buff;
-                this->buff++;
-                if (*this->buff == L'\0') {
-                    if ((!this->accept_unfinished)) {
-                        return this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
-                                                error_location);
-                    }
-                    // Since we are about to increment tok->buff, decrement it first so the
-                    // increment doesn't go past the end of the buffer. See issue #389.
-                    this->buff--;
-                    do_loop = 0;
-                }
-
-                this->buff++;
-                continue;
+        // Make sure this character isn't being escaped before anything else
+        if ((mode & tok_mode::char_escape) == tok_mode::char_escape) {
+            if (c == L'\0') {
+                // A trailing backslash escapes nothing. Stop here and leave char_escape set so
+                // that we never advance past the terminating NUL (see issue #389) and the
+                // unterminated escape can be reported below.
+                break;
+            }
+            mode &= ~(tok_mode::char_escape);
+            // and do nothing more
+        }
+        else if (!myal(c)) {
+            if (c == L'\0') {
+                break;
             }
-
-            switch (mode) {
-                case mode_regular_text: {
-                    switch (*this->buff) {
-                        case L'(': {
-                            paran_count = 1;
-                            paran_offsets[0] = this->buff - this->start;
-                            mode = mode_subshell;
-                            break;
-                        }
-                        case L'[': {
-                            if (this->buff != buff_start) {
-                                mode = mode_array_brackets;
-                                offset_of_bracket = this->buff - this->start;
-                            }
-                            break;
-                        }
-                        case L'\'':
-                        case L'"': {
-                            const wchar_t *end = quote_end(this->buff);
-                            if (end) {
-                                this->buff = end;
-                            } else {
-                                const wchar_t *error_loc = this->buff;
-                                this->buff += wcslen(this->buff);
-
-                                if (!this->accept_unfinished) {
-                                    return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
-                                                            error_loc);
-                                }
-                                do_loop = 0;
-                            }
-                            break;
-                        }
-                        default: {
-                            if (!tok_is_string_character(*(this->buff), is_first)) {
-                                do_loop = 0;
-                            }
-                            break;
-                        }
-                    }
-                    break;
-                }
-
-                case mode_array_brackets_and_subshell:
-                case mode_subshell: {
-                    switch (*this->buff) {
-                        case L'\'':
-                        case L'\"': {
-                            const wchar_t *end = quote_end(this->buff);
-                            if (end) {
-                                this->buff = end;
-                            } else {
-                                const wchar_t *error_loc = this->buff;
-                                this->buff += wcslen(this->buff);
-                                if ((!this->accept_unfinished)) {
-                                    return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
-                                                            error_loc);
-                                }
-                                do_loop = 0;
-                            }
-                            break;
-                        }
-                        case L'(': {
-                            if (paran_count < paran_offsets_max) {
-                                paran_offsets[paran_count] = this->buff - this->start;
-                            }
-                            paran_count++;
-                            break;
-                        }
-                        case L')': {
-                            assert(paran_count > 0);
-                            paran_count--;
-                            if (paran_count == 0) {
-                                mode =
-                                    (mode == mode_array_brackets_and_subshell ? mode_array_brackets
-                                                                              : mode_regular_text);
-                            }
-                            break;
-                        }
-                        case L'\0': {
-                            do_loop = 0;
-                            break;
-                        }
-                        default: {
-                            break;  // ignore other chars
-                        }
-                    }
-                    break;
-                }
-
-                case mode_array_brackets: {
-                    switch (*this->buff) {
-                        case L'(': {
-                            paran_count = 1;
-                            paran_offsets[0] = this->buff - this->start;
-                            mode = mode_array_brackets_and_subshell;
-                            break;
-                        }
-                        case L']': {
-                            mode = mode_regular_text;
-                            break;
-                        }
-                        case L'\0': {
-                            do_loop = 0;
-                            break;
-                        }
-                        default: {
-                            break;  // ignore other chars
-                        }
+            else if (c == L'\\') {
+                mode |= tok_mode::char_escape;
+            }
+            else if (c == L'(') {
+                paran_offsets.push_back(this->buff - this->start);
+                mode |= tok_mode::subshell;
+            }
+            else if (c == L')') {
+                if (paran_offsets.empty()) {
+                    return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, buff_start, this->buff);
+                }
+                paran_offsets.pop_back();
+                if (paran_offsets.empty()) {
+                    // That was the outermost parenthesis, so we have left the subshell.
+                    mode &= ~(tok_mode::subshell);
+                }
+            }
+            else if (c == L'[') {
+                if (this->buff != buff_start) {
+                    mode |= tok_mode::array_brackets;
+                    slice_offset = this->buff - this->start;
+                }
+                else {
+                    // This is actually allowed so the test operator `[` can be used as the head of a command
+                }
+            }
+            else if (c == L']' && ((mode & tok_mode::array_brackets) == tok_mode::array_brackets)) {
+                mode &= ~(tok_mode::array_brackets);
+            }
+            else if (c == L'\'' || c == L'"') {
+                const wchar_t *end = quote_end(this->buff);
+                if (end) {
+                    this->buff = end;
+                } else {
+                    const wchar_t *error_loc = this->buff;
+                    this->buff += wcslen(this->buff);
+                    if ((!this->accept_unfinished)) {
+                        return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, error_loc);
                     }
                     break;
                 }
             }
+            else if (mode == tok_mode::regular_text && !tok_is_string_character(c, is_first)) {
+                break;
+            }
         }
 
-        if (!do_loop) break;
-
         this->buff++;
         is_first = false;
     }
 
-    if ((!this->accept_unfinished) && (mode != mode_regular_text)) {
+    if ((!this->accept_unfinished) && (mode != tok_mode::regular_text)) {
         tok_t error;
-        switch (mode) {
-            case mode_subshell: {
-                // Determine the innermost opening paran offset by interrogating paran_offsets.
-                assert(paran_count > 0);
-                size_t offset_of_open_paran = 0;
-                if (paran_count <= paran_offsets_max) {
-                    offset_of_open_paran = paran_offsets[paran_count - 1];
-                }
+        if ((mode & tok_mode::char_escape) == tok_mode::char_escape) {
+            // The loop stopped on the NUL right after the dangling backslash; point at the backslash.
+            error = this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
+                                     this->buff - 1);
+        }
+        else if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) {
+            error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
+                                     this->start + slice_offset);
+        }
+        else if ((mode & tok_mode::subshell) == tok_mode::subshell) {
+            assert(paran_offsets.size() > 0);
+            size_t offset_of_open_paran = paran_offsets.back();
 
-                error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
-                                         this->start + offset_of_open_paran);
-                break;
-            }
-            case mode_array_brackets:
-            case mode_array_brackets_and_subshell: {
-                error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
-                                         this->start + offset_of_bracket);
-                break;
-            }
-            default: {
-                DIE("unexpected mode in read_string");
-                break;
-            }
+            error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
+                                     this->start + offset_of_open_paran);
         }
         return error;
     }
diff --git a/src/tokenizer.h b/src/tokenizer.h
index e0aa58b50..11b71bc0d 100644
--- a/src/tokenizer.h
+++ b/src/tokenizer.h
@@ -30,7 +30,8 @@ enum tokenizer_error {
     TOK_UNTERMINATED_SLICE,
     TOK_UNTERMINATED_ESCAPE,
     TOK_INVALID_REDIRECT,
-    TOK_INVALID_PIPE
+    TOK_INVALID_PIPE,
+    TOK_CLOSING_UNOPENED_SUBSHELL,
 };
 
 enum class redirection_type_t {