Introduce a new fish ast

This is the first commit of a series intended to replace the existing
"parse tree" machinery. It adds a new abstract syntax tree and uses a more
normal recursive descent parser.

Initially there are no users of the new ast. The following commits will
replace parse_tree -> ast for all usages.
ridiculousfish 2020-06-20 15:27:10 -07:00
parent 45c9e3b0f1
commit 4d4455007d
11 changed files with 2350 additions and 30 deletions


@@ -121,7 +121,7 @@ set(FISH_SRCS
src/wcstringutil.cpp src/wgetopt.cpp src/wildcard.cpp src/wutil.cpp
src/future_feature_flags.cpp src/redirection.cpp src/topic_monitor.cpp
src/flog.cpp src/trace.cpp src/timer.cpp src/null_terminated_array.cpp
src/operation_context.cpp src/fd_monitor.cpp src/termsize.cpp
src/operation_context.cpp src/fd_monitor.cpp src/termsize.cpp src/ast.cpp
)
# Header files are just globbed.

src/ast.cpp (new file, 1206 lines)
File diff suppressed because it is too large.

src/ast.h (new file, 1018 lines)
File diff suppressed because it is too large.
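The diffs for src/ast.cpp and src/ast.h are suppressed, but the fish_indent hunk further down shows the two entry points the rest of this commit relies on: ast::ast_t::parse() and dump(). A minimal sketch of that surface, assuming the real header also declares the node types from ast_node_types.inc, traversal helpers, and error reporting:

// Hypothetical sketch, not the actual contents of src/ast.h.
#include "common.h"           // wcstring
#include "parse_constants.h"  // parse_tree_flags_t and the parse_flag_* constants

namespace ast {
class ast_t {
   public:
    /// Parse source text into an AST with a recursive descent parser.
    static ast_t parse(const wcstring &src, parse_tree_flags_t flags = parse_flag_none);

    /// Return a textual dump of the tree, for debugging.
    wcstring dump(const wcstring &orig) const;
};
}  // namespace ast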

src/ast_node_types.inc (new file, 60 lines)

@@ -0,0 +1,60 @@
// Define ELEM and optionally ELEMLIST before including this file.
// ELEM is for ordinary nodes.
// ELEMLIST(x, y) marks list nodes and the type they contain.
#ifndef ELEMLIST
#define ELEMLIST(x, y) ELEM(x)
#endif
ELEM(keyword_base)
ELEM(token_base)
ELEM(maybe_newlines)
ELEM(argument)
ELEMLIST(argument_list, argument)
ELEM(redirection)
ELEM(argument_or_redirection)
ELEMLIST(argument_or_redirection_list, argument_or_redirection)
ELEM(variable_assignment)
ELEMLIST(variable_assignment_list, variable_assignment)
ELEM(job)
ELEM(job_conjunction)
// For historical reasons, a job list is a list of job *conjunctions*. This should be fixed.
ELEMLIST(job_list, job_conjunction)
ELEM(job_conjunction_continuation)
ELEMLIST(job_conjunction_continuation_list, job_conjunction_continuation)
ELEM(job_continuation)
ELEMLIST(job_continuation_list, job_continuation)
ELEM(andor_job)
ELEMLIST(andor_job_list, andor_job)
ELEM(statement)
ELEM(not_statement)
ELEM(block_statement)
ELEM(for_header)
ELEM(while_header)
ELEM(function_header)
ELEM(begin_header)
ELEM(if_statement)
ELEM(if_clause)
ELEM(elseif_clause)
ELEMLIST(elseif_clause_list, elseif_clause)
ELEM(else_clause)
ELEM(switch_statement)
ELEM(case_item)
ELEMLIST(case_item_list, case_item)
ELEM(decorated_statement)
ELEM(freestanding_argument_list)
#undef ELEM
#undef ELEMLIST
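The header comment above spells out the X-macro contract: define ELEM (and optionally ELEMLIST) before including the file, and the include expands to one macro invocation per node type, undefining both macros at the end. A small, hypothetical consumer that builds a name table; the actual consumers in ast.h/ast.cpp will look different:

// Illustration only: generate a wide-string name for every AST node type.
// Only ELEM is defined here, so list nodes use the default
// ELEMLIST(x, y) -> ELEM(x) mapping that the .inc file provides.
#define ELEM(T) L"" #T,
static const wchar_t *const ast_node_type_names[] = {
#include "ast_node_types.inc"
};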


@@ -33,6 +33,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
#include <tuple>
#include <vector>
#include "ast.h"
#include "color.h"
#include "common.h"
#include "env.h"
@@ -404,6 +405,12 @@ static wcstring prettify(const wcstring &src, bool do_indent) {
if (dump_parse_tree) {
const wcstring dump = parse_dump_tree(parse_tree, src);
std::fwprintf(stderr, L"%ls\n", dump.c_str());
auto ast =
ast::ast_t::parse(src, parse_flag_leave_unterminated | parse_flag_include_comments |
parse_flag_show_extra_semis);
wcstring ast_dump = ast.dump(src);
std::fwprintf(stderr, L"%ls\n", ast_dump.c_str());
}
// We may have a forest of disconnected trees on a parse failure. We have to handle all nodes


@@ -67,6 +67,7 @@ class category_list_t {
category_t parse_productions{L"parse-productions", L"Resolving tokens"};
category_t parse_productions_chatty{L"parse-productions-chatty",
L"Resolving tokens (chatty messages)"};
category_t ast_construction{L"ast-construction", L"Parsing fish AST"};
category_t proc_job_run{L"proc-job-run", L"Jobs getting started or continued"};


@@ -13,6 +13,17 @@
exit_without_destructors(-1); \
} while (0)
// A range of source code.
struct source_range_t {
uint32_t start;
uint32_t length;
uint32_t end() const {
assert(start + length >= start && "Overflow");
return start + length;
}
};
// IMPORTANT: If the following enum table is modified you must also update token_enum_map below.
enum parse_token_type_t : uint8_t {
token_type_invalid = 1,
@@ -193,6 +204,26 @@ enum parse_error_code_t {
parse_error_andor_in_pipeline, // "and" or "or" after a pipe
};
enum {
parse_flag_none = 0,
/// Attempt to build a "parse tree" no matter what. This may result in a 'forest' of
/// disconnected trees. This is intended to be used by syntax highlighting.
parse_flag_continue_after_error = 1 << 0,
/// Include comment tokens.
parse_flag_include_comments = 1 << 1,
/// Indicate that the tokenizer should accept incomplete tokens */
parse_flag_accept_incomplete_tokens = 1 << 2,
/// Indicate that the parser should not generate the terminate token, allowing an 'unfinished'
/// tree where some nodes may have no productions.
parse_flag_leave_unterminated = 1 << 3,
/// Indicate that the parser should generate job_list entries for blank lines.
parse_flag_show_blank_lines = 1 << 4,
/// Indicate that extra semis should be generated.
parse_flag_show_extra_semis = 1 << 5,
};
typedef unsigned int parse_tree_flags_t;
enum { PARSER_TEST_ERROR = 1, PARSER_TEST_INCOMPLETE = 2 };
typedef unsigned int parser_test_error_bits_t;
@@ -214,6 +245,9 @@ struct parse_error_t {
};
typedef std::vector<parse_error_t> parse_error_list_t;
wcstring token_type_user_presentable_description(parse_token_type_t type,
parse_keyword_t keyword = parse_keyword_t::none);
// Special source_start value that means unknown.
#define SOURCE_LOCATION_UNKNOWN (static_cast<size_t>(-1))
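A short sketch of how the pieces added to this header compose: the flag constants are OR'd into a parse_tree_flags_t mask (the combination below is the one the fish_indent hunk above passes to ast::ast_t::parse), and source_range_t::end() yields the one-past-the-end offset, asserting against uint32_t overflow. Illustration only; the offsets are made up.

#include <cstdint>
#include "parse_constants.h"

static void example() {
    // The same flag mask fish_indent uses when dumping the new AST.
    parse_tree_flags_t flags = parse_flag_leave_unterminated | parse_flag_include_comments |
                               parse_flag_show_extra_semis;
    (void)flags;

    // A token starting at offset 10 that is 4 characters long.
    source_range_t range{10, 4};
    uint32_t one_past_end = range.end();  // 14; end() asserts if start + length overflows
    (void)one_past_end;
}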


@@ -255,7 +255,7 @@ DEF_ALT(variable_assignments) {
// A string token like VAR=value
DEF(variable_assignment) produces_single<tok_string>{BODY(variable_assignment)};
// A statement is a normal command, or an if / while / and etc
// A statement is a normal command, or an if / while / etc
DEF_ALT(statement) {
using nots = single<not_statement>;
using block = single<block_statement>;


@@ -4,6 +4,7 @@
#include <sys/types.h>
#include "ast.h"
#include "parse_constants.h"
struct parse_token_t;


@@ -30,7 +30,7 @@ static bool production_is_empty(const production_element_t *production) {
return *production == token_type_invalid;
}
static parse_error_code_t parse_error_from_tokenizer_error(tokenizer_error_t err) {
parse_error_code_t parse_error_from_tokenizer_error(tokenizer_error_t err) {
switch (err) {
case tokenizer_error_t::none:
return parse_error_none;
@@ -168,8 +168,7 @@ const wchar_t *keyword_description(parse_keyword_t type) {
return L"unknown_keyword";
}
static wcstring token_type_user_presentable_description(
parse_token_type_t type, parse_keyword_t keyword = parse_keyword_t::none) {
wcstring token_type_user_presentable_description(parse_token_type_t type, parse_keyword_t keyword) {
if (keyword != parse_keyword_t::none) {
return format_string(L"keyword '%ls'", keyword_description(keyword));
}
@@ -1078,8 +1077,7 @@ static inline bool is_help_argument(const wcstring &txt) {
}
/// Return a new parse token, advancing the tokenizer.
static inline parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token,
wcstring *storage) {
parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token, wcstring *storage) {
*out_token = tok->next();
if (!out_token->has_value()) {
return kTerminalToken;
@@ -1098,7 +1096,8 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *o
result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
result.is_newline = (result.type == parse_token_type_end && text == L"\n");
result.preceding_escaped_nl = token.preceding_escaped_nl;
result.may_be_variable_assignment = bool(variable_assignment_equals_pos(text));
result.may_be_variable_assignment = variable_assignment_equals_pos(text).has_value();
result.tok_error = token.error;
// These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
// uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
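The bool(...) to .has_value() change above is behavior-preserving; a tiny sketch of the intent, assuming variable_assignment_equals_pos returns fish's maybe_t<size_t> with the position of the '=' when the token looks like a variable assignment (the include locations below are assumptions):

#include "common.h"      // wcstring
#include "maybe.h"       // maybe_t (assumed location)
#include "parse_util.h"  // variable_assignment_equals_pos (assumed location)

static void example(const wcstring &text) {
    maybe_t<size_t> equals_pos = variable_assignment_equals_pos(text);
    bool may_be_variable_assignment = equals_pos.has_value();
    // For text == L"FOO=bar" this is true and *equals_pos is 3.
    (void)may_be_variable_assignment;
}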


@@ -6,6 +6,7 @@
#include <stdint.h>
#include <sys/types.h>
#include <deque>
#include <memory>
#include <vector>
@@ -25,11 +26,6 @@ typedef uint32_t source_offset_t;
constexpr source_offset_t SOURCE_OFFSET_INVALID = static_cast<source_offset_t>(-1);
struct source_range_t {
uint32_t start;
uint32_t length;
};
/// A struct representing the token type that we use internally.
struct parse_token_t {
enum parse_token_type_t type; // The type of the token as represented by the parser
@@ -41,38 +37,36 @@ struct parse_token_t {
bool is_newline{false}; // Hackish: if TOK_END, whether the source is a newline.
bool preceding_escaped_nl{false}; // Whether there was an escaped newline preceding this token.
bool may_be_variable_assignment{false}; // Hackish: whether this token is a string like FOO=bar
tokenizer_error_t tok_error{tokenizer_error_t::none}; // If this is a tokenizer error, that error.
source_offset_t source_start{SOURCE_OFFSET_INVALID};
source_offset_t source_length{0};
/// \return the source range.
source_range_t range() const {
return source_range_t{source_start, source_length};
}
/// \return whether we are a string with the dash prefix set.
bool is_dash_prefix_string() const {
return type == parse_token_type_string && has_dash_prefix;
}
wcstring describe() const;
wcstring user_presentable_description() const;
constexpr parse_token_t(parse_token_type_t type) : type(type) {}
};
enum {
parse_flag_none = 0,
/// Attempt to build a "parse tree" no matter what. This may result in a 'forest' of
/// disconnected trees. This is intended to be used by syntax highlighting.
parse_flag_continue_after_error = 1 << 0,
/// Include comment tokens.
parse_flag_include_comments = 1 << 1,
/// Indicate that the tokenizer should accept incomplete tokens */
parse_flag_accept_incomplete_tokens = 1 << 2,
/// Indicate that the parser should not generate the terminate token, allowing an 'unfinished'
/// tree where some nodes may have no productions.
parse_flag_leave_unterminated = 1 << 3,
/// Indicate that the parser should generate job_list entries for blank lines.
parse_flag_show_blank_lines = 1 << 4
};
typedef unsigned int parse_tree_flags_t;
/// Return a new parse token, advancing the tokenizer.
parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token, wcstring *storage);
wcstring parse_dump_tree(const parse_node_tree_t &nodes, const wcstring &src);
const wchar_t *token_type_description(parse_token_type_t type);
const wchar_t *keyword_description(parse_keyword_t type);
parse_error_code_t parse_error_from_tokenizer_error(tokenizer_error_t err);
// Node flags.
enum {
/// Flag indicating that the node has associated comment nodes.