From 8e07e55c1f928ed8a0da6360e00b2b8139594a63 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Thu, 8 Aug 2013 15:06:46 -0700 Subject: [PATCH] More work on new parser --- builtin.cpp | 2 +- common.cpp | 2 +- common.h | 4 +- fish_tests.cpp | 50 ++++- highlight.cpp | 412 ++++++++++++++++++++++++++++++++++++++++++ highlight.h | 1 + parse_productions.cpp | 26 ++- parse_tree.cpp | 200 ++++++++++++++++++-- parse_tree.h | 43 ++++- 9 files changed, 708 insertions(+), 32 deletions(-) diff --git a/builtin.cpp b/builtin.cpp index d2a80a8c4..4ae9e5b51 100644 --- a/builtin.cpp +++ b/builtin.cpp @@ -4063,7 +4063,7 @@ int builtin_parse(parser_t &parser, wchar_t **argv) parse_node_tree_t parse_tree; parse_error_list_t errors; parse_t parser; - bool success = parser.parse(src, &parse_tree, &errors); + bool success = parser.parse(src, parse_flag_none, &parse_tree, &errors, true); if (! success) { stdout_buffer.append(L"Parsing failed:\n"); diff --git a/common.cpp b/common.cpp index 7a9f7a514..c9a6b2279 100644 --- a/common.cpp +++ b/common.cpp @@ -507,7 +507,7 @@ const wchar_t *wcsfuncname(const wchar_t *str) } -int wcsvarchr(wchar_t chr) +bool wcsvarchr(wchar_t chr) { return iswalnum(chr) || chr == L'_'; } diff --git a/common.h b/common.h index 57fe7fa1a..abbf12f34 100644 --- a/common.h +++ b/common.h @@ -608,10 +608,10 @@ const wchar_t *wcsfuncname(const wchar_t *str); /** Test if the given string is valid in a variable name - \return 1 if this is a valid name, 0 otherwise + \return true if this is a valid name, false otherwise */ -int wcsvarchr(wchar_t chr); +bool wcsvarchr(wchar_t chr); /** diff --git a/fish_tests.cpp b/fish_tests.cpp index 6ebd3d220..dd16deb99 100644 --- a/fish_tests.cpp +++ b/fish_tests.cpp @@ -1801,23 +1801,65 @@ void history_tests_t::test_history_speed(void) delete hist; } +static void test_new_parser_correctness(void) +{ + say(L"Testing new parser!"); + const struct parser_test_t + { + const wchar_t *src; + bool ok; + } + parser_tests[] = + { + {L"; ; ; 
", true}, + {L"if ; end", false}, + {L"if true ; end", true}, + {L"if true; end ; end", false}, + {L"if end; end ; end", false}, + {L"end", false} + }; + + for (size_t i=0; i < sizeof parser_tests / sizeof *parser_tests; i++) + { + const parser_test_t *test = &parser_tests[i]; + + parse_node_tree_t parse_tree; + parse_t parser; + bool success = parser.parse(test->src, parse_flag_none, &parse_tree, NULL); + say(L"%lu / %lu: Parse \"%ls\": %s", i+1, sizeof parser_tests / sizeof *parser_tests, test->src, success ? "yes" : "no"); + if (success && ! test->ok) + { + err(L"\"%ls\" should NOT have parsed, but did", test->src); + } + else if (! success && test->ok) + { + err(L"\"%ls\" should have parsed, but failed", test->src); + } + } + say(L"Parse tests complete"); + +} + +__attribute__((unused)) static void test_new_parser(void) { say(L"Testing new parser!"); const wcstring src = L"echo hello world"; parse_node_tree_t parse_tree; parse_t parser; - bool success = parser.parse(src, &parse_tree, NULL); + bool success = parser.parse(src, parse_flag_none, &parse_tree, NULL); if (! success) { say(L"Parsing failed"); } else { +#if 0 parse_execution_context_t ctx(parse_tree, src); say(L"Simulating execution:"); wcstring simulation = ctx.simulate(); say(simulation.c_str()); +#endif } } @@ -1827,13 +1869,12 @@ static void test_new_parser(void) int main(int argc, char **argv) { setlocale(LC_ALL, ""); - srand(time(0)); + //srand(time(0)); configure_thread_assertions_for_testing(); program_name=L"(ignore)"; say(L"Testing low-level functionality"); - say(L"Lines beginning with '(ignore):' are not errors, they are warning messages\ngenerated by the fish parser library when given broken input, and can be\nignored. 
All actual errors begin with 'Error:'."); set_main_thread(); setup_fork_guards(); //proc_init(); @@ -1843,7 +1884,8 @@ int main(int argc, char **argv) reader_init(); env_init(); - test_new_parser(); + test_new_parser_correctness(); + //test_new_parser(); return 0; test_format(); diff --git a/highlight.cpp b/highlight.cpp index 606604386..3c0838902 100644 --- a/highlight.cpp +++ b/highlight.cpp @@ -34,6 +34,7 @@ #include "wildcard.h" #include "path.h" #include "history.h" +#include "parse_tree.h" /** Number of elements in the highlight_var array @@ -1307,11 +1308,16 @@ static void tokenize(const wchar_t * const buff, std::vector &color, const } } +void highlight_shell_magic(const wcstring &buff, std::vector &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars); // PCA This function does I/O, (calls is_potential_path, path_get_path, maybe others) and so ought to only run on a background thread void highlight_shell(const wcstring &buff, std::vector &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars) { ASSERT_IS_BACKGROUND_THREAD(); + if (1) { + highlight_shell_magic(buff, color, pos, error, vars); + return; + } const size_t length = buff.size(); assert(buff.size() == color.size()); @@ -1440,7 +1446,413 @@ void highlight_shell(const wcstring &buff, std::vector &color, size_t pos, } } +static void color_node(const parse_node_t &node, int color, std::vector &color_array) +{ + // Can only color nodes with valid source ranges + if (! 
node.has_source()) + return; + + // Fill the color array with our color in the corresponding range + size_t source_end = node.source_start + node.source_length; + assert(source_end >= node.source_start); + assert(source_end <= color_array.size()); + + std::fill(color_array.begin() + node.source_start, color_array.begin() + source_end, color); +} +static void color_argument(const wcstring &buffstr, std::vector::iterator colors, int normal_status) +{ + const size_t buff_len = buffstr.size(); + std::fill(colors, colors + buff_len, normal_status); + + enum {e_unquoted, e_single_quoted, e_double_quoted} mode = e_unquoted; + int bracket_count=0; + for (size_t in_pos=0; in_pos < buff_len; in_pos++) + { + const wchar_t c = buffstr.at(in_pos); + switch (mode) + { + case e_unquoted: + { + if (c == L'\\') + { + int fill_color = HIGHLIGHT_ESCAPE; //may be set to HIGHLIGHT_ERROR + const size_t backslash_pos = in_pos; + size_t fill_end = backslash_pos; + + // Move to the escaped character + in_pos++; + const wchar_t escaped_char = (in_pos < buff_len ? buffstr.at(in_pos) : L'\0'); + + if (escaped_char == L'\0') + { + fill_end = in_pos; + fill_color = HIGHLIGHT_ERROR; + } + else if (wcschr(L"~%", escaped_char)) + { + if (in_pos == 1) + { + fill_end = in_pos + 1; + } + } + else if (escaped_char == L',') + { + if (bracket_count) + { + fill_end = in_pos + 1; + } + } + else if (wcschr(L"abefnrtv*?$(){}[]'\"<>^ \\#;|&", escaped_char)) + { + fill_end = in_pos + 1; + } + else if (wcschr(L"c", escaped_char)) + { + // Like \ci. 
So highlight three characters + fill_end = in_pos + 1; + } + else if (wcschr(L"uUxX01234567", escaped_char)) + { + long long res=0; + int chars=2; + int base=16; + + wchar_t max_val = ASCII_MAX; + + switch (escaped_char) + { + case L'u': + { + chars=4; + max_val = UCS2_MAX; + in_pos++; + break; + } + + case L'U': + { + chars=8; + max_val = WCHAR_MAX; + in_pos++; + break; + } + + case L'x': + { + in_pos++; + break; + } + + case L'X': + { + max_val = BYTE_MAX; + in_pos++; + break; + } + + default: + { + // a digit like \12 + base=8; + chars=3; + break; + } + } + + // Consume + for (int i=0; i < chars && in_pos < buff_len; i++) + { + long d = convert_digit(buffstr.at(in_pos), base); + if (d < 0) + break; + res = (res * base) + d; + in_pos++; + } + //in_pos is now at the first character that could not be converted (or buff_len) + assert(in_pos >= backslash_pos && in_pos <= buff_len); + fill_end = in_pos; + + // It's an error if we exceeded the max value + if (res > max_val) + fill_color = HIGHLIGHT_ERROR; + + // Subtract one from in_pos, so that the increment in the loop will move to the next character + in_pos--; + } + assert(fill_end >= backslash_pos); + std::fill(colors + backslash_pos, colors + fill_end, fill_color); + } + else + { + // Not a backslash + switch (c) + { + case L'~': + case L'%': + { + if (in_pos == 0) + { + colors[in_pos] = HIGHLIGHT_OPERATOR; + } + break; + } + + case L'$': + { + assert(in_pos < buff_len); + int dollar_color = HIGHLIGHT_ERROR; + if (in_pos + 1 < buff_len) + { + wchar_t next = buffstr.at(in_pos + 1); + if (next == L'$' || wcsvarchr(next)) + dollar_color = HIGHLIGHT_OPERATOR; + } + colors[in_pos] = dollar_color; + break; + } + + + case L'*': + case L'?': + case L'(': + case L')': + { + colors[in_pos] = HIGHLIGHT_OPERATOR; + break; + } + + case L'{': + { + colors[in_pos] = HIGHLIGHT_OPERATOR; + bracket_count++; + break; + } + + case L'}': + { + colors[in_pos] = HIGHLIGHT_OPERATOR; + bracket_count--; + break; + } + + case L',': + { + 
if (bracket_count > 0) + { + colors[in_pos] = HIGHLIGHT_OPERATOR; + } + + break; + } + + case L'\'': + { + colors[in_pos] = HIGHLIGHT_QUOTE; + mode = e_single_quoted; + break; + } + + case L'\"': + { + colors[in_pos] = HIGHLIGHT_QUOTE; + mode = e_double_quoted; + break; + } + + } + } + break; + } + + /* + Mode 1 means single quoted string, i.e 'foo' + */ + case e_single_quoted: + { + colors[in_pos] = HIGHLIGHT_QUOTE; + if (c == L'\\') + { + // backslash + if (in_pos + 1 < buff_len) + { + const wchar_t escaped_char = buffstr.at(in_pos + 1); + if (escaped_char == L'\\' || escaped_char == L'\'') + { + colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash + colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char + in_pos += 1; //skip over backslash + } + } + } + else if (c == L'\'') + { + mode = e_unquoted; + } + break; + } + + /* + Mode 2 means double quoted string, i.e. "foo" + */ + case e_double_quoted: + { + colors[in_pos] = HIGHLIGHT_QUOTE; + switch (c) + { + case L'"': + { + mode = e_unquoted; + break; + } + + case L'\\': + { + // backslash + if (in_pos + 1 < buff_len) + { + const wchar_t escaped_char = buffstr.at(in_pos + 1); + if (escaped_char == L'\\' || escaped_char == L'\'' || escaped_char == L'$') + { + colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash + colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char + in_pos += 1; //skip over backslash + } + } + break; + } + + case L'$': + { + int dollar_color = HIGHLIGHT_ERROR; + if (in_pos + 1 < buff_len) + { + wchar_t next = buffstr.at(in_pos + 1); + if (next == L'$' || wcsvarchr(next)) + dollar_color = HIGHLIGHT_OPERATOR; + } + colors[in_pos] = dollar_color; + break; + } + + } + break; + } + } + } +} + +// Color all of the arguments of the given command +static void color_arguments(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &parent, std::vector &color_array) +{ + const parse_node_tree_t::parse_node_list_t nodes = tree.find_nodes(parent, symbol_argument); + + wcstring param; + for (node_offset_t 
i=0; i < nodes.size(); i++) + { + const parse_node_t *child = nodes.at(i); + assert(child != NULL && child->type == symbol_argument); + param.assign(src, child->source_start, child->source_length); + color_argument(param, color_array.begin() + child->source_start, HIGHLIGHT_NORMAL); + } +} + +static void color_children(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, int color, std::vector &color_array) +{ + for (node_offset_t idx=0; idx < parent.child_count; idx++) + { + const parse_node_t *child = tree.get_child(parent, idx); + if (child != NULL && child->type == type && child->has_source()) + { + color_node(*child, color, color_array); + } + } +} + +void highlight_shell_magic(const wcstring &buff, std::vector &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars) +{ + ASSERT_IS_BACKGROUND_THREAD(); + + const size_t length = buff.size(); + assert(buff.size() == color.size()); + + if (length == 0) + return; + + std::fill(color.begin(), color.end(), -1); + + /* Do something sucky and get the current working directory on this background thread. This should really be passed in. */ + const wcstring working_directory = env_get_pwd_slash(); + + /* Parse the buffer */ + parse_node_tree_t parse_tree; + parse_t parser; + parser.parse(buff, parse_flag_continue_after_error | parse_flag_include_comments, &parse_tree, NULL); + + /* Walk the node tree */ + for (parse_node_tree_t::const_iterator iter = parse_tree.begin(); iter != parse_tree.end(); ++iter) + { + const parse_node_t &node = *iter; + + switch (node.type) + { + // Color direct string descendants, e.g. 'for' and 'in'. 
+ case symbol_for_header: + case symbol_while_header: + case symbol_begin_header: + case symbol_function_header: + case symbol_if_clause: + case symbol_else_clause: + case symbol_case_item: + case symbol_switch_statement: + case symbol_boolean_statement: + case symbol_decorated_statement: + color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color); + break; + + case symbol_redirection: + color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_REDIRECTION, color); + break; + + case parse_token_type_background: + case parse_token_type_end: + color_node(node, HIGHLIGHT_END, color); + break; + + case symbol_plain_statement: + { + // Color the command + color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color); + + // Color arguments + const parse_node_t *arguments = parse_tree.get_child(node, 1, symbol_arguments_or_redirections_list); + if (arguments != NULL) + { + color_arguments(buff, parse_tree, *arguments, color); + } + } + break; + + + case symbol_arguments_or_redirections_list: + case symbol_argument_list: + /* Nothing, these are handled by their parents */ + break; + + case parse_special_type_parse_error: + case parse_special_type_tokenizer_error: + color_node(node, HIGHLIGHT_ERROR, color); + break; + + case parse_special_type_comment: + color_node(node, HIGHLIGHT_COMMENT, color); + break; + + default: + break; + } + } +} /** Perform quote and parenthesis highlighting on the specified string. diff --git a/highlight.h b/highlight.h index 6747bba51..ea8557918 100644 --- a/highlight.h +++ b/highlight.h @@ -84,6 +84,7 @@ struct file_detection_context_t; \param error a list in which a description of each error will be inserted. May be 0, in whcich case no error descriptions will be generated. 
*/ void highlight_shell(const wcstring &buffstr, std::vector &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars); +void highlight_shell_magic(const wcstring &buffstr, std::vector &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars); /** Perform syntax highlighting for the text in buff. Matching quotes and paranthesis are highlighted. The result is diff --git a/parse_productions.cpp b/parse_productions.cpp index 61f7636de..b5efa11ca 100644 --- a/parse_productions.cpp +++ b/parse_productions.cpp @@ -135,14 +135,12 @@ RESOLVE(statement) return 2; case parse_keyword_else: - //symbol_stack_pop(); return NO_PRODUCTION; case parse_keyword_switch: return 3; case parse_keyword_end: - PARSER_DIE(); //todo return NO_PRODUCTION; // 'in' is only special within a for_header @@ -378,7 +376,7 @@ RESOLVE(arguments_or_redirections_list) PRODUCTIONS(argument_or_redirection) = { - {parse_token_type_string}, + {symbol_argument}, {parse_token_type_redirection} }; RESOLVE(argument_or_redirection) @@ -394,6 +392,18 @@ RESOLVE(argument_or_redirection) } } +PRODUCTIONS(argument) = +{ + {parse_token_type_string} +}; +RESOLVE_ONLY(argument) + +PRODUCTIONS(redirection) = +{ + {parse_token_type_redirection} +}; +RESOLVE_ONLY(redirection) + PRODUCTIONS(optional_background) = { {}, @@ -449,6 +459,8 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n TEST(plain_statement) TEST(arguments_or_redirections_list) TEST(argument_or_redirection) + TEST(argument) + TEST(redirection) TEST(optional_background) case parse_token_type_string: @@ -461,6 +473,14 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n PARSER_DIE(); break; + case parse_special_type_parse_error: + case parse_special_type_tokenizer_error: + case parse_special_type_comment: + fprintf(stderr, "Special type %ls passed to %s\n", token_type_description(node_type).c_str(), __FUNCTION__); + PARSER_DIE(); + break; + + case 
token_type_invalid: fprintf(stderr, "token_type_invalid passed to %s\n", __FUNCTION__); PARSER_DIE(); diff --git a/parse_tree.cpp b/parse_tree.cpp index 7a809167e..bab295042 100644 --- a/parse_tree.cpp +++ b/parse_tree.cpp @@ -101,6 +101,11 @@ wcstring token_type_description(parse_token_type_t type) return L"arguments_or_redirections_list"; case symbol_argument_or_redirection: return L"argument_or_redirection"; + case symbol_argument: + return L"symbol_argument"; + case symbol_redirection: + return L"symbol_redirection"; + case parse_token_type_string: return L"token_string"; @@ -116,6 +121,14 @@ wcstring token_type_description(parse_token_type_t type) return L"token_terminate"; case symbol_optional_background: return L"optional_background"; + + case parse_special_type_parse_error: + return L"parse_error"; + case parse_special_type_tokenizer_error: + return L"tokenizer_error"; + case parse_special_type_comment: + return L"comment"; + } return format_string(L"Unknown token type %ld", static_cast(type)); } @@ -216,6 +229,14 @@ static parse_token_t parse_token_from_tokenizer_token(enum token_type tokenizer_ case TOK_REDIRECT_NOCLOB: result.type = parse_token_type_redirection; break; + + case TOK_ERROR: + result.type = parse_special_type_tokenizer_error; + break; + + case TOK_COMMENT: + result.type = parse_special_type_comment; + break; default: @@ -248,9 +269,16 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring & } if (node.type == parse_token_type_string) { - result->append(L": \""); - result->append(src, node.source_start, node.source_length); - result->append(L"\""); + if (node.source_start == -1) + { + append_format(*result, L" (no source)"); + } + else + { + result->append(L": \""); + result->append(src, node.source_start, node.source_length); + result->append(L"\""); + } } result->push_back(L'\n'); ++*line; @@ -311,20 +339,24 @@ class parse_ll_t // Constructor parse_ll_t() : fatal_errored(false) { - // initial node - 
symbol_stack.push_back(parse_stack_element_t(symbol_job_list, 0)); // goal token - nodes.push_back(parse_node_t(symbol_job_list)); + this->reset(); } bool top_node_match_token(parse_token_t token); void accept_token(parse_token_t token, const wcstring &src); + + // Clear the parse symbol stack (but not the node tree). Add a new job_list_t goal node. + void reset(void); void parse_error(const wchar_t *expected, parse_token_t token); void parse_error(parse_token_t token, const wchar_t *format, ...); void append_error_callout(wcstring &error_message, parse_token_t token); void dump_stack(void) const; + + // Figure out the ranges of intermediate nodes + void determine_node_ranges(); // Get the node corresponding to the top element of the stack parse_node_t &node_for_top_symbol() @@ -453,9 +485,41 @@ void parse_ll_t::dump_stack(void) const } } +// Give each node a source range equal to the union of the ranges of its children +// Terminal nodes already have source ranges (and no children) +// Since children always appear after their parents, we can implement this very simply by walking backwards +void parse_ll_t::determine_node_ranges(void) +{ + const size_t source_start_invalid = -1; + size_t idx = nodes.size(); + while (idx--) + { + parse_node_t *parent = &nodes.at(idx); + + // Skip nodes that already have a source range. These are terminal nodes. + if (parent->source_start != source_start_invalid) + continue; + + // Ok, this node needs a source range. Get all of its children, and then set its range. 
+ size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge + for (node_offset_t i=0; i < parent->child_count; i++) + { + const parse_node_t &child = nodes.at(parent->child_offset(i)); + min_start = std::min(min_start, child.source_start); + max_end = std::max(max_end, child.source_start + child.source_length); + } + + if (min_start != source_start_invalid) { + assert(max_end >= min_start); + parent->source_start = min_start; + parent->source_length = max_end - min_start; + } + } +} + void parse_ll_t::parse_error(parse_token_t token, const wchar_t *fmt, ...) { - this->dump_stack(); + //this->dump_stack(); parse_error_t err; va_list va; @@ -481,8 +545,27 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token) fatal_errored = true; } +void parse_ll_t::reset(void) +{ + // add a new job_list node and then reset our symbol list to point at it + node_offset_t where = nodes.size(); + nodes.push_back(parse_node_t(symbol_job_list)); + + symbol_stack.clear(); + symbol_stack.push_back(parse_stack_element_t(symbol_job_list, where)); // goal token + this->fatal_errored = false; +} + + bool parse_ll_t::top_node_match_token(parse_token_t token) { + if (symbol_stack.empty()) + { + // This can come about with an unbalanced 'end' or 'else', which causes us to terminate the outermost job list. + this->fatal_errored = true; + return false; + } + PARSE_ASSERT(! symbol_stack.empty()); PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE); bool result = false; @@ -520,10 +603,23 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src) fprintf(stderr, "Accept token %ls\n", token.describe().c_str()); } PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE); - PARSE_ASSERT(! symbol_stack.empty()); + bool consumed = false; + + // Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty. 
+ if (token.type == parse_special_type_parse_error || token.type == parse_special_type_tokenizer_error || token.type == parse_special_type_comment) + { + parse_node_t err_node(token.type); + err_node.source_start = token.source_start; + err_node.source_length = token.source_length; + nodes.push_back(err_node); + consumed = true; + } + while (! consumed && ! this->fatal_errored) { + PARSE_ASSERT(! symbol_stack.empty()); + if (top_node_match_token(token)) { if (logit) @@ -534,6 +630,10 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src) break; } + // top_node_match_token may indicate an error if our stack is empty + if (this->fatal_errored) + break; + // Get the production for the top of the stack parse_stack_element_t &stack_elem = symbol_stack.back(); parse_node_t &node = nodes.at(stack_elem.node_idx); @@ -548,6 +648,12 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src) // Manipulate the symbol stack. // Note that stack_elem is invalidated by popping the stack. symbol_stack_pop_push_production(production); + + // If we end up with an empty stack, something bad happened, like an unbalanced end + if (symbol_stack.empty()) + { + this->parse_error(token, L"All symbols removed from symbol stack. 
Likely unbalanced else or end?", stack_elem.describe().c_str(), token.describe().c_str()); + } } } } @@ -556,6 +662,11 @@ parse_t::parse_t() : parser(new parse_ll_t()) { } +parse_t::~parse_t() +{ + delete parser; +} + static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt) { parse_keyword_t result = parse_keyword_none; @@ -597,21 +708,20 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt) return result; } -bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors) +bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it) { - tokenizer_t tok = tokenizer_t(str.c_str(), 0); + tok_flags_t tok_options = TOK_SQUASH_ERRORS; + if (parse_flags & parse_flag_include_comments) + tok_options |= TOK_SHOW_COMMENTS; + + tokenizer_t tok = tokenizer_t(str.c_str(), tok_options); for (; tok_has_next(&tok) && ! this->parser->fatal_errored; tok_next(&tok)) { token_type tok_type = static_cast(tok_last_type(&tok)); const wchar_t *tok_txt = tok_last(&tok); int tok_start = tok_get_pos(&tok); size_t tok_extent = tok_get_extent(&tok); - - if (tok_type == TOK_ERROR) - { - fprintf(stderr, "Tokenizer error\n"); - break; - } + assert(tok_extent < 10000000); //paranoia parse_token_t token = parse_token_from_tokenizer_token(tok_type); token.tokenizer_type = tok_type; @@ -621,12 +731,31 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_ this->parser->accept_token(token, str); if (this->parser->fatal_errored) - break; + { + if (parse_flags & parse_flag_continue_after_error) + { + /* Mark an error and then keep going */ + token.type = parse_special_type_parse_error; + token.keyword = parse_keyword_none; + this->parser->accept_token(token, str); + this->parser->reset(); + } + else + { + /* Bail out */ + break; + } + } } + // Teach each node where its source range is + 
this->parser->determine_node_ranges(); + +#if 0 wcstring result = dump_tree(this->parser->nodes, str); fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str()); fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t)); +#endif if (output != NULL) { @@ -642,3 +771,40 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_ return ! this->parser->fatal_errored; } + +const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type) const +{ + const parse_node_t *result = NULL; + PARSE_ASSERT(which < parent.child_count); + node_offset_t child_offset = parent.child_offset(which); + if (child_offset < this->size()) + { + result = &this->at(child_offset); + } + + // If we are given an expected type, then the node must be null or that type + if (result != NULL) + { + assert(expected_type == token_type_invalid || expected_type == result->type); + } + + return result; +} + +static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, parse_node_tree_t::parse_node_list_t *result) +{ + if (parent.type == type) result->push_back(&parent); + for (size_t i=0; i < parent.child_count; i++) + { + const parse_node_t *child = tree.get_child(parent, i); + assert(child != NULL); + find_nodes_recursive(tree, *child, type, result); + } +} + +parse_node_tree_t::parse_node_list_t parse_node_tree_t::find_nodes(const parse_node_t &parent, parse_token_type_t type) const +{ + parse_node_list_t result; + find_nodes_recursive(*this, parent, type, &result); + return result; +} diff --git a/parse_tree.h b/parse_tree.h index c53864258..25b63a0ca 100644 --- a/parse_tree.h +++ b/parse_tree.h @@ -15,7 +15,7 @@ #include #define PARSE_ASSERT(a) assert(a) -#define PARSER_DIE() exit_without_destructors(-1) +#define PARSER_DIE() do { 
fprintf(stderr, "Parser dying!\n"); exit_without_destructors(-1); } while (0) class parse_node_t; class parse_node_tree_t; @@ -36,6 +36,18 @@ struct parse_error_t }; typedef std::vector parse_error_list_t; +enum +{ + parse_flag_none = 0, + + /* Attempt to build a "parse tree" no matter what. This may result in a 'forest' of disconnected trees. This is intended to be used by syntax highlighting. */ + parse_flag_continue_after_error = 1 << 0, + + /* Include comment tokens */ + parse_flag_include_comments = 1 << 1 +}; +typedef unsigned int parse_tree_flags_t; + class parse_ll_t; class parse_t { @@ -43,7 +55,8 @@ class parse_t public: parse_t(); - bool parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors); + ~parse_t(); + bool parse(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it = false); }; enum parse_token_type_t @@ -80,6 +93,9 @@ enum parse_token_type_t symbol_argument_list_nonempty, symbol_argument_list, + symbol_argument, + symbol_redirection, + symbol_optional_background, // Terminal types @@ -90,6 +106,11 @@ enum parse_token_type_t parse_token_type_end, parse_token_type_terminate, + // Very special terminal types that don't appear in the production list + parse_special_type_parse_error, + parse_special_type_tokenizer_error, + parse_special_type_comment, + LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate, FIRST_PARSE_TOKEN_TYPE = parse_token_type_string }; @@ -145,7 +166,7 @@ public: wcstring describe(void) const; /* Constructor */ - explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(0), source_length(0), child_start(0), child_count(0), tag(0) + explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), child_start(0), child_count(0), tag(0) { } @@ -154,10 +175,23 @@ public: PARSE_ASSERT(which < child_count); return child_start + which; } + + bool has_source() const + { + return source_start != (size_t)(-1); + } }; 
class parse_node_tree_t : public std::vector { + public: + + /* Get the node corresponding to a child of the given node, or NULL if there is no such child. If expected_type is provided, assert that the node has that type. */ + const parse_node_t *get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type = token_type_invalid) const; + + /* Find all the nodes of a given type underneath a given node */ + typedef std::vector parse_node_list_t; + parse_node_list_t find_nodes(const parse_node_t &parent, parse_token_type_t type) const; }; @@ -214,7 +248,8 @@ class parse_node_tree_t : public std::vector arguments_or_redirections_list = <empty> | argument_or_redirection arguments_or_redirections_list - argument_or_redirection = redirection | <TOK_STRING> + argument_or_redirection = argument | redirection + argument = <TOK_STRING> redirection = <TOK_REDIRECTION> terminator = <TOK_END> | <TOK_BACKGROUND>