Make { and } valid, first-class tokenizer elements

Mahmoud Al-Qudsi 2018-03-11 19:36:10 -05:00
parent 7447432471
commit 00f95a978e
6 changed files with 89 additions and 89 deletions

src/expand.cpp

@@ -47,6 +47,7 @@
#include "proc.h"
#include "reader.h"
#include "wildcard.h"
#include "wcstringutil.h"
#include "wutil.h" // IWYU pragma: keep
#ifdef KERN_PROCARGS2
#else
@@ -941,7 +942,8 @@ static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags,
whole_item.append(in, length_preceding_braces);
whole_item.append(item_begin, item_len);
whole_item.append(brace_end + 1);
debug(0, L"Found brace item: %ls\n", whole_item.c_str());
auto whole_item2 = trim(whole_item);
debug(0, L"Found brace item: %ls\n", whole_item2.c_str());
expand_braces(whole_item, flags, out, errors);
item_begin = pos + 1;
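
The hunk above rebuilds each comma-separated brace alternative as the text before '{', the alternative itself, and the text after '}', then logs a whitespace-trimmed copy next to the raw string; only the untrimmed string is recursively expanded at this point. A standalone sketch of that reassembly, with hypothetical pieces rather than the expander's actual pointers:

    // Standalone illustration, not fish's expand_braces(): reassemble one
    // brace alternative of the token "{ foo , bar }". The pieces below are
    // hypothetical stand-ins for the expander's state.
    #include <cwchar>
    #include <string>

    int main() {
        std::wstring prefix;            // text preceding '{' (empty here)
        std::wstring item = L" foo ";   // one comma-separated alternative
        std::wstring suffix;            // text following '}' (empty here)
        std::wstring whole_item = prefix + item + suffix;  // " foo "
        // The hunk above logs both this raw string and a trim()ed copy ("foo").
        wprintf(L"whole_item: '%ls'\n", whole_item.c_str());
        return 0;
    }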

src/parse_tree.cpp

@@ -668,35 +668,10 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta
}
void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) {
parse_error_code_t parse_error_code;
switch (tok.error) {
case TOK_UNTERMINATED_QUOTE: {
parse_error_code = parse_error_tokenizer_unterminated_quote;
break;
}
case TOK_UNTERMINATED_SUBSHELL: {
parse_error_code = parse_error_tokenizer_unterminated_subshell;
break;
}
case TOK_UNTERMINATED_SLICE: {
parse_error_code = parse_error_tokenizer_unterminated_slice;
break;
}
case TOK_UNTERMINATED_ESCAPE: {
parse_error_code = parse_error_tokenizer_unterminated_escape;
break;
}
case TOK_INVALID_REDIRECT:
case TOK_INVALID_PIPE:
default: {
parse_error_code = parse_error_tokenizer_other;
break;
}
}
parse_error_code_t parse_error_code = tok.error->parser_error;
this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset,
parse_error_code, L"%ls",
error_message_for_code(tok.error).c_str());
tok.error->Message);
}
void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {

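The switch removed above existed only to translate a tokenizer error value into a parse_error_code_t. With the struct introduced later in this commit, each error object carries that code and its message itself, so report_tokenizer_error() shrinks to a pair of field reads. A minimal standalone sketch of the pattern, using hypothetical stand-in types rather than fish's:

    // The error descriptor carries its downstream code and message, so
    // reporting is a field read instead of a switch. toy_* names are
    // hypothetical stand-ins, not fish types.
    #include <cwchar>

    enum toy_parse_code { toy_parse_other, toy_parse_unterminated_quote };

    struct toy_tok_error {
        const wchar_t *message;
        toy_parse_code code;
    };

    // Mirrors the extern tokenizer_error pointers declared in tokenizer.h below.
    static const toy_tok_error unterminated_quote{
        L"Unexpected end of string, quotes are not balanced",
        toy_parse_unterminated_quote};

    static void report(const toy_tok_error *err) {
        // No per-error switch: the descriptor already knows its mapping.
        wprintf(L"parse code %d: %ls\n", (int)err->code, err->message);
    }

    int main() {
        report(&unterminated_quote);
        return 0;
    }
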
src/tokenizer.cpp

@@ -16,56 +16,22 @@
#include "tokenizer.h"
#include "wutil.h" // IWYU pragma: keep
/// Error string for unexpected end of string.
#define QUOTE_ERROR _(L"Unexpected end of string, quotes are not balanced")
/// Error string for mismatched parenthesis.
#define PARAN_ERROR _(L"Unexpected end of string, parenthesis do not match")
/// Error string for mismatched square brackets.
#define SQUARE_BRACKET_ERROR _(L"Unexpected end of string, square brackets do not match")
/// Error string for unterminated escape (backslash without continuation).
#define UNTERMINATED_ESCAPE_ERROR _(L"Unexpected end of string, incomplete escape sequence")
/// Error string for invalid redirections.
#define REDIRECT_ERROR _(L"Invalid input/output redirection")
/// Error string for when trying to pipe from fd 0.
#define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output")
/// Error for when ) is encountered with no matching (
#define ERROR_CLOSING_UNOPENED_PARENTHESIS _(L"Unexpected ')' for unopened parenthesis")
/// Error for when [ is encountered while already in bracket mode
#define ERROR_UNEXPECTED_BRACKET _(L"Unexpected '[' at this location")
wcstring error_message_for_code(tokenizer_error err) {
switch (err) {
case TOK_UNTERMINATED_QUOTE:
return QUOTE_ERROR;
case TOK_UNTERMINATED_SUBSHELL:
return PARAN_ERROR;
case TOK_UNTERMINATED_SLICE:
return SQUARE_BRACKET_ERROR;
case TOK_UNTERMINATED_ESCAPE:
return UNTERMINATED_ESCAPE_ERROR;
case TOK_INVALID_REDIRECT:
return REDIRECT_ERROR;
case TOK_INVALID_PIPE:
return PIPE_ERROR;
case TOK_CLOSING_UNOPENED_SUBSHELL:
return ERROR_CLOSING_UNOPENED_PARENTHESIS;
case TOK_ILLEGAL_SLICE:
return ERROR_UNEXPECTED_BRACKET;
default:
assert(0 && "Unknown error type");
return {};
}
}
tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L"");
tokenizer_error *TOK_UNTERMINATED_QUOTE = new tokenizer_error((L"Unexpected end of string, quotes are not balanced"), parse_error_tokenizer_unterminated_quote);
tokenizer_error *TOK_UNTERMINATED_SUBSHELL = new tokenizer_error((L"Unexpected end of string, expecting ')'"), parse_error_tokenizer_unterminated_subshell);
tokenizer_error *TOK_UNTERMINATED_SLICE = new tokenizer_error((L"Unexpected end of string, square brackets do not match"), parse_error_tokenizer_unterminated_slice);
tokenizer_error *TOK_UNTERMINATED_ESCAPE = new tokenizer_error((L"Unexpected end of string, incomplete escape sequence"), parse_error_tokenizer_unterminated_escape);
tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error((L"Invalid input/output redirection"));
tokenizer_error *TOK_INVALID_PIPE = new tokenizer_error((L"Cannot use stdin (fd 0) as pipe output"));
tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL = new tokenizer_error((L"Unexpected ')' for unopened parenthesis"));
tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error((L"Unexpected '[' at this location"));
tokenizer_error *TOK_CLOSING_UNOPENED_BRACE = new tokenizer_error((L"Unexpected '}' for unopened brace expansion"));
tokenizer_error *TOK_UNTERMINATED_BRACE = new tokenizer_error((L"Unexpected end of string, incomplete parameter expansion"));
tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE = new tokenizer_error((L"Unexpected '}' found, expecting ')'"));
tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE = new tokenizer_error((L"Unexpected ')' found, expecting '}'"));
/// Return an error token and mark that we no longer have a next token.
tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start,
tok_t tokenizer_t::call_error(tokenizer_error *error_type, const wchar_t *token_start,
const wchar_t *error_loc) {
assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error");
assert(error_loc >= token_start && "Invalid error location");
@@ -143,6 +109,7 @@ ENUM_FLAGS(tok_mode) {
tok_t tokenizer_t::read_string() {
tok_mode mode { tok_mode::regular_text };
std::vector<int> paran_offsets;
std::vector<char> expecting;
int slice_offset = 0;
const wchar_t *const buff_start = this->buff;
bool is_first = true;
@@ -175,9 +142,18 @@ tok_t tokenizer_t::read_string() {
}
else if (c == L'(') {
paran_offsets.push_back(this->buff - this->start);
expecting.push_back(L')');
mode |= tok_mode::subshell;
}
else if (c == L'{') {
paran_offsets.push_back(this->buff - this->start);
expecting.push_back(L'}');
mode |= tok_mode::curly_braces;
}
else if (c == L')') {
if (expecting.size() > 0 && expecting.back() == L'}') {
return this->call_error(TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, this->start, this->buff);
}
switch (paran_offsets.size()) {
case 0:
return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff);
@@ -187,6 +163,19 @@ tok_t tokenizer_t::read_string() {
paran_offsets.pop_back();
}
}
else if (c == L'}') {
if (expecting.size() > 0 && expecting.back() == L')') {
return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff);
}
switch (paran_offsets.size()) {
case 0:
return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff);
case 1:
mode &= ~(tok_mode::curly_braces);
default:
paran_offsets.pop_back();
}
}
else if (c == L'[') {
if (this->buff != buff_start) {
if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) {
@@ -257,6 +246,13 @@ tok_t tokenizer_t::read_string() {
error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
this->start + offset_of_open_paran);
}
else if ((mode & tok_mode::curly_braces) == tok_mode::curly_braces) {
assert(paran_offsets.size() > 0);
size_t offset_of_open_brace = paran_offsets.back();
error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start,
this->start + offset_of_open_brace);
}
return error;
}
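
The brace support above reuses the subshell bookkeeping: each '(' or '{' records its offset and pushes the closer it now expects, every ')' or '}' is checked against the top of that stack, so a ')' inside an open brace (or a '}' inside an open parenthesis) is reported as a mismatch, and an unclosed opener is reported at end of string with the offset of the opening character. A standalone sketch of that bookkeeping, separate from fish's tokenizer and with hypothetical messages:

    // Expected-closer stack, as used by read_string() above; not fish code.
    // Returns nullptr on success, or a hypothetical message for the first error.
    #include <cwchar>
    #include <vector>

    static const wchar_t *check_delims(const wchar_t *s) {
        std::vector<wchar_t> expecting;  // closer expected for each open delimiter
        for (; *s; ++s) {
            if (*s == L'(') expecting.push_back(L')');
            else if (*s == L'{') expecting.push_back(L'}');
            else if (*s == L')' || *s == L'}') {
                if (expecting.empty())
                    return *s == L')' ? L"unexpected ')' for unopened parenthesis"
                                      : L"unexpected '}' for unopened brace";
                if (expecting.back() != *s)
                    return expecting.back() == L')' ? L"expected ')', found '}'"
                                                    : L"expected '}', found ')'";
                expecting.pop_back();
            }
        }
        return expecting.empty() ? nullptr : L"unterminated '(' or '{'";
    }

    int main() {
        const wchar_t *inputs[] = {L"echo (a{b,c})", L"echo (a}", L"echo {a,b"};
        for (const wchar_t *in : inputs) {
            const wchar_t *err = check_delims(in);
            wprintf(L"%ls -> %ls\n", in, err ? err : L"ok");
        }
        return 0;
    }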

src/tokenizer.h

@@ -7,6 +7,7 @@
#include "common.h"
#include "maybe.h"
#include "parse_constants.h"
/// Token types.
enum token_type {
@@ -22,19 +23,26 @@ enum token_type {
TOK_COMMENT /// comment token
};
/// Tokenizer error types.
enum tokenizer_error {
TOK_ERROR_NONE,
TOK_UNTERMINATED_QUOTE,
TOK_UNTERMINATED_SUBSHELL,
TOK_UNTERMINATED_SLICE,
TOK_UNTERMINATED_ESCAPE,
TOK_INVALID_REDIRECT,
TOK_INVALID_PIPE,
TOK_CLOSING_UNOPENED_SUBSHELL,
TOK_ILLEGAL_SLICE,
struct tokenizer_error {
const wchar_t *Message;
enum parse_error_code_t parser_error;  // The parser error associated with this tokenizer error
tokenizer_error(const wchar_t *msg, enum parse_error_code_t perr = parse_error_tokenizer_other)
: Message(msg), parser_error(perr) {}
tokenizer_error(const tokenizer_error&) = delete;
};
extern tokenizer_error *TOK_ERROR_NONE;
extern tokenizer_error *TOK_UNTERMINATED_QUOTE;
extern tokenizer_error *TOK_UNTERMINATED_SUBSHELL;
extern tokenizer_error *TOK_UNTERMINATED_SLICE;
extern tokenizer_error *TOK_UNTERMINATED_ESCAPE;
extern tokenizer_error *TOK_UNTERMINATED_BRACE;
extern tokenizer_error *TOK_INVALID_REDIRECT;
extern tokenizer_error *TOK_INVALID_PIPE;
extern tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL;
extern tokenizer_error *TOK_CLOSING_UNOPENED_BRACE;
extern tokenizer_error *TOK_ILLEGAL_SLICE;
enum class redirection_type_t {
overwrite, // normal redirection: > file.txt
append, // appending redirection: >> file.txt
@@ -69,7 +77,7 @@ struct tok_t {
maybe_t<int> redirected_fd{};
// If an error, this is the error code.
enum tokenizer_error error { TOK_ERROR_NONE };
tokenizer_error *error { TOK_ERROR_NONE };
// If an error, this is the offset of the error within the token. A value of 0 means it occurred
// at 'offset'.
@@ -99,7 +107,7 @@ class tokenizer_t {
/// Whether to continue the previous line after the comment.
bool continue_line_after_comment{false};
tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start,
tok_t call_error(tokenizer_error *error_type, const wchar_t *token_start,
const wchar_t *error_loc);
tok_t read_string();
maybe_t<tok_t> tok_next();
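
tok_t::error is now a pointer whose default member initializer is TOK_ERROR_NONE, so "no error" is a specific shared object rather than enum value 0, and the deleted copy constructor keeps each error a single, identity-comparable instance. A small standalone sketch of that convention (toy_* names are stand-ins, not fish's types):

    // Pointer-singleton convention used by the new tok_t::error field.
    #include <cwchar>

    struct toy_error {
        const wchar_t *message;
        explicit toy_error(const wchar_t *m) : message(m) {}
        toy_error(const toy_error &) = delete;  // one instance per error kind
    };

    static toy_error *const ERR_NONE = new toy_error(L"");
    static toy_error *const ERR_UNTERMINATED_BRACE =
        new toy_error(L"Unexpected end of string, incomplete parameter expansion");

    struct toy_tok {
        toy_error *error{ERR_NONE};  // same shape as tok_t's default above
    };

    int main() {
        toy_tok good, bad;
        bad.error = ERR_UNTERMINATED_BRACE;
        toy_tok *toks[] = {&good, &bad};
        for (const toy_tok *t : toks) {
            if (t->error == ERR_NONE)  // identity check, not value check
                wprintf(L"token ok\n");
            else
                wprintf(L"tokenizer error: %ls\n", t->error->message);
        }
        return 0;
    }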

src/wcstringutil.cpp

@@ -45,3 +45,21 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) {
output.push_back(ellipsis_char);
return output;
}
wcstring trim(const wcstring &input) {
debug(0, "trimming '%ls'", input.c_str());
// auto begin = input.cbegin();
// for (begin; *begin == L' '; ++begin);
// auto end = input.cbegin() + input.size();
// for (end; end > begin && *end == L' '; ++end);
auto begin_offset = input.find_first_not_of(whitespace);
if (begin_offset == wcstring::npos) {
return wcstring{};
}
auto end = input.cbegin() + input.find_last_not_of(whitespace);
wcstring result(input.begin() + begin_offset, end + 1);
return result;
}
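
trim() strips leading and trailing characters from the `whitespace` set, which is not shown in this hunk, so the sketch below substitutes a conventional set of its own to illustrate the intended behaviour:

    // Standalone illustration of trim()'s behaviour using the same
    // find_first_not_of / find_last_not_of approach; the whitespace set here
    // is an assumption, since the real `whitespace` constant is not in this hunk.
    #include <cwchar>
    #include <string>

    static std::wstring trim_copy(const std::wstring &input) {
        const wchar_t *ws = L" \t\r\n\v";  // assumed whitespace set
        size_t begin = input.find_first_not_of(ws);
        if (begin == std::wstring::npos) return L"";  // all-whitespace input
        size_t end = input.find_last_not_of(ws);
        return input.substr(begin, end - begin + 1);
    }

    int main() {
        wprintf(L"'%ls'\n", trim_copy(L"  foo bar \t").c_str());  // 'foo bar'
        wprintf(L"'%ls'\n", trim_copy(L"   ").c_str());           // ''
        wprintf(L"'%ls'\n", trim_copy(L"baz").c_str());           // 'baz'
        return 0;
    }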

src/wcstringutil.h

@@ -59,5 +59,6 @@ enum class ellipsis_type {
};
wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest);
wcstring trim(const wcstring &input);
#endif