Introduce a new fish ast

This is the first commit of a series intended to replace the existing
"parse tree" machinery. It adds a new abstract syntax tree and uses a more
normal recursive descent parser.

Initially there are no users of the new ast. The following commits will
replace parse_tree -> ast for all usages.
ridiculousfish 2020-06-20 15:27:10 -07:00
parent 45c9e3b0f1
commit 4d4455007d
11 changed files with 2350 additions and 30 deletions


@@ -121,7 +121,7 @@ set(FISH_SRCS
src/wcstringutil.cpp src/wgetopt.cpp src/wildcard.cpp src/wutil.cpp
src/future_feature_flags.cpp src/redirection.cpp src/topic_monitor.cpp
src/flog.cpp src/trace.cpp src/timer.cpp src/null_terminated_array.cpp
src/operation_context.cpp src/fd_monitor.cpp src/termsize.cpp
src/operation_context.cpp src/fd_monitor.cpp src/termsize.cpp src/ast.cpp
)
# Header files are just globbed.

src/ast.cpp (new file, 1206 lines)
File diff suppressed because it is too large.

src/ast.h (new file, 1018 lines)
File diff suppressed because it is too large.
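The diffs for src/ast.cpp and src/ast.h are suppressed, but the fish_indent hunk further down shows the two entry points the rest of this commit relies on: ast::ast_t::parse() and dump(). A minimal sketch of that surface, assuming the real header also declares the node types from ast_node_types.inc, traversal helpers, and error reporting:

// Hypothetical sketch, not the actual contents of src/ast.h.
#include "common.h"           // wcstring
#include "parse_constants.h"  // parse_tree_flags_t and the parse_flag_* constants

namespace ast {
class ast_t {
   public:
    /// Parse source text into an AST with a recursive descent parser.
    static ast_t parse(const wcstring &src, parse_tree_flags_t flags = parse_flag_none);

    /// Return a textual dump of the tree, for debugging.
    wcstring dump(const wcstring &orig) const;
};
}  // namespace ast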

src/ast_node_types.inc (new file, 60 lines)

@@ -0,0 +1,60 @@
// Define ELEM and optionally ELEMLIST before including this file.
// ELEM is for ordinary nodes.
// ELEMLIST(x, y) marks list nodes and the type they contain.
#ifndef ELEMLIST
#define ELEMLIST(x, y) ELEM(x)
#endif
ELEM(keyword_base)
ELEM(token_base)
ELEM(maybe_newlines)
ELEM(argument)
ELEMLIST(argument_list, argument)
ELEM(redirection)
ELEM(argument_or_redirection)
ELEMLIST(argument_or_redirection_list, argument_or_redirection)
ELEM(variable_assignment)
ELEMLIST(variable_assignment_list, variable_assignment)
ELEM(job)
ELEM(job_conjunction)
// For historical reasons, a job list is a list of job *conjunctions*. This should be fixed.
ELEMLIST(job_list, job_conjunction)
ELEM(job_conjunction_continuation)
ELEMLIST(job_conjunction_continuation_list, job_conjunction_continuation)
ELEM(job_continuation)
ELEMLIST(job_continuation_list, job_continuation)
ELEM(andor_job)
ELEMLIST(andor_job_list, andor_job)
ELEM(statement)
ELEM(not_statement)
ELEM(block_statement)
ELEM(for_header)
ELEM(while_header)
ELEM(function_header)
ELEM(begin_header)
ELEM(if_statement)
ELEM(if_clause)
ELEM(elseif_clause)
ELEMLIST(elseif_clause_list, elseif_clause)
ELEM(else_clause)
ELEM(switch_statement)
ELEM(case_item)
ELEMLIST(case_item_list, case_item)
ELEM(decorated_statement)
ELEM(freestanding_argument_list)
#undef ELEM
#undef ELEMLIST
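The header comment above spells out the X-macro contract: define ELEM (and optionally ELEMLIST) before including the file, and the include expands to one macro invocation per node type, undefining both macros at the end. A small, hypothetical consumer that builds a name table; the actual consumers in ast.h/ast.cpp will look different:

// Illustration only: generate a wide-string name for every AST node type.
// Only ELEM is defined here, so list nodes use the default
// ELEMLIST(x, y) -> ELEM(x) mapping that the .inc file provides.
#define ELEM(T) L"" #T,
static const wchar_t *const ast_node_type_names[] = {
#include "ast_node_types.inc"
};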


@@ -33,6 +33,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
#include <tuple>
#include <vector>
#include "ast.h"
#include "color.h"
#include "common.h"
#include "env.h"
@@ -404,6 +405,12 @@ static wcstring prettify(const wcstring &src, bool do_indent) {
if (dump_parse_tree) {
const wcstring dump = parse_dump_tree(parse_tree, src);
std::fwprintf(stderr, L"%ls\n", dump.c_str());
auto ast =
ast::ast_t::parse(src, parse_flag_leave_unterminated | parse_flag_include_comments |
parse_flag_show_extra_semis);
wcstring ast_dump = ast.dump(src);
std::fwprintf(stderr, L"%ls\n", ast_dump.c_str());
}
// We may have a forest of disconnected trees on a parse failure. We have to handle all nodes


@@ -67,6 +67,7 @@ class category_list_t {
category_t parse_productions{L"parse-productions", L"Resolving tokens"};
category_t parse_productions_chatty{L"parse-productions-chatty",
L"Resolving tokens (chatty messages)"};
category_t ast_construction{L"ast-construction", L"Parsing fish AST"};
category_t proc_job_run{L"proc-job-run", L"Jobs getting started or continued"};


@@ -13,6 +13,17 @@
exit_without_destructors(-1); \
} while (0)
// A range of source code.
struct source_range_t {
uint32_t start;
uint32_t length;
uint32_t end() const {
assert(start + length >= start && "Overflow");
return start + length;
}
};
// IMPORTANT: If the following enum table is modified you must also update token_enum_map below.
enum parse_token_type_t : uint8_t {
token_type_invalid = 1,
@@ -193,6 +204,26 @@ enum parse_error_code_t {
parse_error_andor_in_pipeline, // "and" or "or" after a pipe
};
enum {
parse_flag_none = 0,
/// Attempt to build a "parse tree" no matter what. This may result in a 'forest' of
/// disconnected trees. This is intended to be used by syntax highlighting.
parse_flag_continue_after_error = 1 << 0,
/// Include comment tokens.
parse_flag_include_comments = 1 << 1,
/// Indicate that the tokenizer should accept incomplete tokens */
parse_flag_accept_incomplete_tokens = 1 << 2,
/// Indicate that the parser should not generate the terminate token, allowing an 'unfinished'
/// tree where some nodes may have no productions.
parse_flag_leave_unterminated = 1 << 3,
/// Indicate that the parser should generate job_list entries for blank lines.
parse_flag_show_blank_lines = 1 << 4,
/// Indicate that extra semis should be generated.
parse_flag_show_extra_semis = 1 << 5,
};
typedef unsigned int parse_tree_flags_t;
enum { PARSER_TEST_ERROR = 1, PARSER_TEST_INCOMPLETE = 2 };
typedef unsigned int parser_test_error_bits_t;
@@ -214,6 +245,9 @@ struct parse_error_t {
};
typedef std::vector<parse_error_t> parse_error_list_t;
wcstring token_type_user_presentable_description(parse_token_type_t type,
parse_keyword_t keyword = parse_keyword_t::none);
// Special source_start value that means unknown.
#define SOURCE_LOCATION_UNKNOWN (static_cast<size_t>(-1))
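A short sketch of how the pieces added to this header compose: the flag constants are OR'd into a parse_tree_flags_t mask (the combination below is the one the fish_indent hunk above passes to ast::ast_t::parse), and source_range_t::end() yields the one-past-the-end offset, asserting against uint32_t overflow. Illustration only; the offsets are made up.

#include <cstdint>
#include "parse_constants.h"

static void example() {
    // The same flag mask fish_indent uses when dumping the new AST.
    parse_tree_flags_t flags = parse_flag_leave_unterminated | parse_flag_include_comments |
                               parse_flag_show_extra_semis;
    (void)flags;

    // A token starting at offset 10 that is 4 characters long.
    source_range_t range{10, 4};
    uint32_t one_past_end = range.end();  // 14; end() asserts if start + length overflows
    (void)one_past_end;
}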


@@ -255,7 +255,7 @@ DEF_ALT(variable_assignments) {
// A string token like VAR=value
DEF(variable_assignment) produces_single<tok_string>{BODY(variable_assignment)};
// A statement is a normal command, or an if / while / and etc
// A statement is a normal command, or an if / while / etc
DEF_ALT(statement) {
using nots = single<not_statement>;
using block = single<block_statement>;


@@ -4,6 +4,7 @@
#include <sys/types.h>
#include "ast.h"
#include "parse_constants.h"
struct parse_token_t;


@@ -30,7 +30,7 @@ static bool production_is_empty(const production_element_t *production) {
return *production == token_type_invalid;
}
static parse_error_code_t parse_error_from_tokenizer_error(tokenizer_error_t err) {
parse_error_code_t parse_error_from_tokenizer_error(tokenizer_error_t err) {
switch (err) {
case tokenizer_error_t::none:
return parse_error_none;
@@ -168,8 +168,7 @@ const wchar_t *keyword_description(parse_keyword_t type) {
return L"unknown_keyword";
}
static wcstring token_type_user_presentable_description(
parse_token_type_t type, parse_keyword_t keyword = parse_keyword_t::none) {
wcstring token_type_user_presentable_description(parse_token_type_t type, parse_keyword_t keyword) {
if (keyword != parse_keyword_t::none) {
return format_string(L"keyword '%ls'", keyword_description(keyword));
}
@@ -1078,8 +1077,7 @@ static inline bool is_help_argument(const wcstring &txt) {
}
/// Return a new parse token, advancing the tokenizer.
static inline parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token,
wcstring *storage) {
parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token, wcstring *storage) {
*out_token = tok->next();
if (!out_token->has_value()) {
return kTerminalToken;
@@ -1098,7 +1096,8 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *o
result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
result.is_newline = (result.type == parse_token_type_end && text == L"\n");
result.preceding_escaped_nl = token.preceding_escaped_nl;
result.may_be_variable_assignment = bool(variable_assignment_equals_pos(text));
result.may_be_variable_assignment = variable_assignment_equals_pos(text).has_value();
result.tok_error = token.error;
// These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
// uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
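The bool(...) to .has_value() change above is behavior-preserving; a tiny sketch of the intent, assuming variable_assignment_equals_pos returns fish's maybe_t<size_t> with the position of the '=' when the token looks like a variable assignment (the include locations below are assumptions):

#include "common.h"      // wcstring
#include "maybe.h"       // maybe_t (assumed location)
#include "parse_util.h"  // variable_assignment_equals_pos (assumed location)

static void example(const wcstring &text) {
    maybe_t<size_t> equals_pos = variable_assignment_equals_pos(text);
    bool may_be_variable_assignment = equals_pos.has_value();
    // For text == L"FOO=bar" this is true and *equals_pos is 3.
    (void)may_be_variable_assignment;
}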


@@ -6,6 +6,7 @@
#include <stdint.h>
#include <sys/types.h>
#include <deque>
#include <memory>
#include <vector>
@@ -25,11 +26,6 @@ typedef uint32_t source_offset_t;
constexpr source_offset_t SOURCE_OFFSET_INVALID = static_cast<source_offset_t>(-1);
struct source_range_t {
uint32_t start;
uint32_t length;
};
/// A struct representing the token type that we use internally.
struct parse_token_t {
enum parse_token_type_t type; // The type of the token as represented by the parser
@@ -41,38 +37,36 @@ struct parse_token_t {
bool is_newline{false}; // Hackish: if TOK_END, whether the source is a newline.
bool preceding_escaped_nl{false}; // Whether there was an escaped newline preceding this token.
bool may_be_variable_assignment{false}; // Hackish: whether this token is a string like FOO=bar
tokenizer_error_t tok_error{tokenizer_error_t::none}; // If this is a tokenizer error, that error.
source_offset_t source_start{SOURCE_OFFSET_INVALID};
source_offset_t source_length{0};
/// \return the source range.
source_range_t range() const {
return source_range_t{source_start, source_length};
}
/// \return whether we are a string with the dash prefix set.
bool is_dash_prefix_string() const {
return type == parse_token_type_string && has_dash_prefix;
}
wcstring describe() const;
wcstring user_presentable_description() const;
constexpr parse_token_t(parse_token_type_t type) : type(type) {}
};
enum {
parse_flag_none = 0,
/// Attempt to build a "parse tree" no matter what. This may result in a 'forest' of
/// disconnected trees. This is intended to be used by syntax highlighting.
parse_flag_continue_after_error = 1 << 0,
/// Include comment tokens.
parse_flag_include_comments = 1 << 1,
/// Indicate that the tokenizer should accept incomplete tokens */
parse_flag_accept_incomplete_tokens = 1 << 2,
/// Indicate that the parser should not generate the terminate token, allowing an 'unfinished'
/// tree where some nodes may have no productions.
parse_flag_leave_unterminated = 1 << 3,
/// Indicate that the parser should generate job_list entries for blank lines.
parse_flag_show_blank_lines = 1 << 4
};
typedef unsigned int parse_tree_flags_t;
/// Return a new parse token, advancing the tokenizer.
parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token, wcstring *storage);
wcstring parse_dump_tree(const parse_node_tree_t &nodes, const wcstring &src);
const wchar_t *token_type_description(parse_token_type_t type);
const wchar_t *keyword_description(parse_keyword_t type);
parse_error_code_t parse_error_from_tokenizer_error(tokenizer_error_t err);
// Node flags.
enum {
/// Flag indicating that the node has associated comment nodes.