From 7b86b2e05a011e37bf11bba2675ef5db684bca24 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Wed, 9 Oct 2013 02:03:50 -0700 Subject: [PATCH] Adoption of new parser in abbreviations --- fish_tests.cpp | 20 +++++-- highlight.cpp | 40 ++++++------- parse_productions.cpp | 10 ++-- parse_productions.h | 2 +- parse_tree.cpp | 29 ++++++--- parse_tree.h | 42 ++++++------- reader.cpp | 134 ++++++++++++------------------------------ 7 files changed, 115 insertions(+), 162 deletions(-) diff --git a/fish_tests.cpp b/fish_tests.cpp index 6c77ec08a..99ed6cd34 100644 --- a/fish_tests.cpp +++ b/fish_tests.cpp @@ -61,7 +61,6 @@ #include "signal.h" #include "highlight.h" #include "parse_tree.h" -#include "parse_exec.h" #include "parse_util.h" /** @@ -769,6 +768,11 @@ static void test_abbreviations(void) expanded = reader_expand_abbreviation_in_command(L"of gc", wcslen(L"of gc"), &result); if (expanded) err(L"gc incorrectly expanded on line %ld", (long)__LINE__); + /* others should not be */ + expanded = reader_expand_abbreviation_in_command(L"command gc", wcslen(L"command gc"), &result); + if (expanded) err(L"gc incorrectly expanded on line %ld", (long)__LINE__); + + env_pop(); } @@ -1916,12 +1920,16 @@ static void test_new_parser_fuzzing(void) size_t max = 5; for (size_t len=1; len <= max; len++) { - fprintf(stderr, "%lu / %lu\n", len, max); + fprintf(stderr, "%lu / %lu...", len, max); std::vector tokens(len); + size_t count = 0; + parse_t parser; + parse_node_tree_t parse_tree; do { - parse_t parser; - parse_node_tree_t parse_tree; + parser.clear(); + parse_tree.clear(); + count++; for (size_t i=0; i < len; i++) { const parser_fuzz_token_t &token = tokens[i]; @@ -1931,6 +1939,7 @@ static void test_new_parser_fuzzing(void) // keep going until we wrap } while (! increment(tokens)); + fprintf(stderr, "done (%lu)\n", count); } double end = timef(); say(L"All fuzzed in %f seconds!", end - start); @@ -2108,7 +2117,7 @@ int main(int argc, char **argv) say(L"Testing low-level functionality"); set_main_thread(); setup_fork_guards(); - //proc_init(); + //proc_init(); //disabling this prevents catching SIGINT event_init(); function_init(); builtin_init(); @@ -2116,7 +2125,6 @@ int main(int argc, char **argv) env_init(); test_highlighting(); - return 0; test_new_parser_fuzzing(); test_new_parser_correctness(); test_highlighting(); diff --git a/highlight.cpp b/highlight.cpp index 28e32b7a1..8fe9989b9 100644 --- a/highlight.cpp +++ b/highlight.cpp @@ -332,7 +332,7 @@ static bool is_potential_cd_path(const wcstring &path, const wcstring &working_d } /* Given a plain statement node in a parse tree, get the command and return it, expanded appropriately for commands. If we succeed, return true. */ -static bool plain_statement_get_expanded_command(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &plain_statement, wcstring *out_cmd) +bool plain_statement_get_expanded_command(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &plain_statement, wcstring *out_cmd) { assert(plain_statement.type == symbol_plain_statement); bool result = false; @@ -708,15 +708,15 @@ static bool has_expand_reserved(const wcstring &str) return result; } -/* Parse a command line. Return by reference the last command, its arguments, and the offset in the string of the beginning of the last argument. This is used by autosuggestions */ -static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expanded_command, const parse_node_t **out_last_arg) +/* Parse a command line. Return by reference the last command, and the last argument to that command (as a copied node), if any. This is used by autosuggestions */ +static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expanded_command, parse_node_t *out_last_arg) { bool result = false; /* Parse the buffer */ parse_node_tree_t parse_tree; parse_t parser; - parser.parse(buff, parse_flag_continue_after_error, &parse_tree, NULL); + parser.parse(buff, parse_flag_continue_after_error | parse_flag_accept_incomplete_tokens, &parse_tree, NULL); /* Find the last statement */ const parse_node_t *last_statement = parse_tree.find_last_node_of_type(symbol_plain_statement, NULL); @@ -727,8 +727,12 @@ static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expand /* We got it */ result = true; - /* Find the last argument */ - *out_last_arg = parse_tree.find_last_node_of_type(symbol_plain_statement, last_statement); + /* Find the last argument. If we don't get one, return an invalid node. */ + const parse_node_t *last_arg = parse_tree.find_last_node_of_type(symbol_argument, last_statement); + if (last_arg != NULL) + { + *out_last_arg = *last_arg; + } } } return result; @@ -739,20 +743,20 @@ bool autosuggest_suggest_special(const wcstring &str, const wcstring &working_di { if (str.empty()) return false; - + ASSERT_IS_BACKGROUND_THREAD(); /* Parse the string */ wcstring parsed_command; - const parse_node_t *last_arg_node = NULL; + parse_node_t last_arg_node(token_type_invalid); if (! autosuggest_parse_command(str, &parsed_command, &last_arg_node)) return false; bool result = false; - if (parsed_command == L"cd" && last_arg_node != NULL && last_arg_node->has_source()) + if (parsed_command == L"cd" && last_arg_node.type == symbol_argument && last_arg_node.has_source()) { /* We can possibly handle this specially */ - const wcstring escaped_dir = last_arg_node->get_source(str); + const wcstring escaped_dir = last_arg_node.get_source(str); wcstring suggested_path; /* We always return true because we recognized the command. This prevents us from falling back to dumber algorithms; for example we won't suggest a non-directory for the cd command. */ @@ -771,13 +775,12 @@ bool autosuggest_suggest_special(const wcstring &str, const wcstring &working_di path_flags_t path_flags = (quote == L'\0') ? PATH_EXPAND_TILDE : 0; if (unescaped && is_potential_cd_path(unescaped_dir, working_directory, path_flags, &suggested_path)) { - /* Note: this looks really wrong for strings that have an "unescapable" character in them, e.g. a \t, because parse_util_escape_string_with_quote will insert that character */ wcstring escaped_suggested_path = parse_util_escape_string_with_quote(suggested_path, quote); /* Return it */ out_suggestion = str; - out_suggestion.erase(last_arg_node->source_start); + out_suggestion.erase(last_arg_node.source_start); if (quote != L'\0') out_suggestion.push_back(quote); out_suggestion.append(escaped_suggested_path); if (quote != L'\0') out_suggestion.push_back(quote); @@ -798,14 +801,14 @@ bool autosuggest_validate_from_history(const history_item_t &item, file_detectio /* Parse the string */ wcstring parsed_command; - const parse_node_t *last_arg_node = NULL; + parse_node_t last_arg_node(token_type_invalid); if (! autosuggest_parse_command(item.str(), &parsed_command, &last_arg_node)) return false; - if (parsed_command == L"cd" && last_arg_node != NULL && last_arg_node->has_source()) + if (parsed_command == L"cd" && last_arg_node.type == symbol_argument && last_arg_node.has_source()) { /* We can possibly handle this specially */ - wcstring dir = last_arg_node->get_source(item.str()); + wcstring dir = last_arg_node.get_source(item.str()); if (expand_one(dir, EXPAND_SKIP_CMDSUBST)) { handled = true; @@ -1968,12 +1971,7 @@ const highlighter_t::color_array_t & highlighter_t::highlight() case symbol_plain_statement: { // Get the decoration from the parent - enum parse_statement_decoration_t decoration = parse_statement_decoration_none; - const parse_node_t *decorated_statement = parse_tree.get_parent(node, symbol_decorated_statement); - if (decorated_statement != NULL) - { - decoration = static_cast(decorated_statement->production_idx); - } + enum parse_statement_decoration_t decoration = parse_tree.decoration_for_plain_statement(node); /* Color the command */ const parse_node_t *cmd_node = parse_tree.get_child(node, 0, parse_token_type_string); diff --git a/parse_productions.cpp b/parse_productions.cpp index 0900977f7..38d57ebab 100644 --- a/parse_productions.cpp +++ b/parse_productions.cpp @@ -27,8 +27,8 @@ static bool production_is_valid(const production_options_t production_list, prod } #define PRODUCTIONS(sym) static const production_options_t productions_##sym -#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag) -#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag) { return 0; } +#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword) +#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword) { return 0; } #define KEYWORD(x) ((x) + LAST_TOKEN_OR_SYMBOL + 1) @@ -418,7 +418,7 @@ RESOLVE(optional_background) } #define TEST(sym) case (symbol_##sym): production_list = & productions_ ## sym ; resolver = resolve_ ## sym ; break; -const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_which_production, production_tag_t *out_tag, wcstring *out_error_text) +const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_which_production, wcstring *out_error_text) { bool log_it = false; if (log_it) @@ -428,7 +428,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n /* Fetch the list of productions and the function to resolve them */ const production_options_t *production_list = NULL; - production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag) = NULL; + production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword) = NULL; switch (node_type) { TEST(job_list) @@ -486,7 +486,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n PARSE_ASSERT(resolver != NULL); const production_t *result = NULL; - production_option_idx_t which = resolver(input_type, input_keyword, out_tag); + production_option_idx_t which = resolver(input_type, input_keyword); if (log_it) { diff --git a/parse_productions.h b/parse_productions.h index a0d43f629..7e132d0c4 100644 --- a/parse_productions.h +++ b/parse_productions.h @@ -63,7 +63,7 @@ inline bool production_element_is_valid(production_element_t elem) } /* Fetch a production */ -const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_idx, production_tag_t *out_tag, wcstring *out_error_text); +const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_idx, wcstring *out_error_text); } diff --git a/parse_tree.cpp b/parse_tree.cpp index 30ee6856b..900513f50 100644 --- a/parse_tree.cpp +++ b/parse_tree.cpp @@ -720,7 +720,7 @@ void parse_ll_t::accept_token(parse_token_t token) // Get the production for the top of the stack parse_stack_element_t &stack_elem = symbol_stack.back(); parse_node_t &node = nodes.at(stack_elem.node_idx); - const production_t *production = production_for_token(stack_elem.type, token.type, token.keyword, &node.production_idx, &node.tag, NULL /* error text */); + const production_t *production = production_for_token(stack_elem.type, token.type, token.keyword, &node.production_idx, NULL /* error text */); if (production == NULL) { if (should_generate_error_messages) @@ -804,6 +804,9 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n if (parse_flags & parse_flag_include_comments) tok_options |= TOK_SHOW_COMMENTS; + if (parse_flags & parse_flag_accept_incomplete_tokens) + tok_options |= TOK_ACCEPT_UNFINISHED; + this->parser->set_should_generate_error_messages(errors != NULL); tokenizer_t tok = tokenizer_t(str.c_str(), tok_options); @@ -845,14 +848,14 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n // Tag nodes -#if 0 - wcstring result = dump_tree(this->parser->nodes, str); - fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str()); - fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t)); -#endif - // Acquire the output from the parser this->parser->acquire_output(output, errors); + +#if 0 + //wcstring result = dump_tree(this->parser->nodes, str); + //fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str()); + fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", output->size(), sizeof(parse_node_t), output->size() * sizeof(parse_node_t)); +#endif // Indicate if we had a fatal error return ! this->parser->has_fatal_error(); @@ -992,3 +995,15 @@ bool parse_node_tree_t::argument_list_is_root(const parse_node_t &node) const } return result; } + +enum parse_statement_decoration_t parse_node_tree_t::decoration_for_plain_statement(const parse_node_t &node) const +{ + assert(node.type == symbol_plain_statement); + enum parse_statement_decoration_t decoration = parse_statement_decoration_none; + const parse_node_t *decorated_statement = this->get_parent(node, symbol_decorated_statement); + if (decorated_statement != NULL) + { + decoration = static_cast(decorated_statement->production_idx); + } + return decoration; +} diff --git a/parse_tree.h b/parse_tree.h index b2059914c..945d550c4 100644 --- a/parse_tree.h +++ b/parse_tree.h @@ -125,7 +125,10 @@ enum parse_flag_continue_after_error = 1 << 0, /* Include comment tokens */ - parse_flag_include_comments = 1 << 1 + parse_flag_include_comments = 1 << 1, + + /* Indicate that the tokenizer should accept incomplete tokens */ + parse_flag_accept_incomplete_tokens = 1 << 2 }; typedef unsigned int parse_tree_flags_t; @@ -175,9 +178,6 @@ public: node_offset_t child_start; node_offset_t child_count; - /* Type-dependent data */ - uint32_t tag; - /* Which production was used */ uint8_t production_idx; @@ -185,7 +185,7 @@ public: wcstring describe(void) const; /* Constructor */ - explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0), tag(0) + explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0) { } @@ -211,6 +211,15 @@ public: } }; +/* Statement decorations. This matches the order of productions in decorated_statement */ +enum parse_statement_decoration_t +{ + parse_statement_decoration_none, + parse_statement_decoration_command, + parse_statement_decoration_builtin +}; + + /* The parse tree itself */ class parse_node_tree_t : public std::vector { @@ -232,27 +241,10 @@ public: /* Indicate if the given argument_list or arguments_or_redirections_list is a root list, or has a parent */ bool argument_list_is_root(const parse_node_t &node) const; -}; - - -/* Node type specific data, stored in the tag field */ - -/* Statement decorations, stored in the tag of plain_statement. This matches the order of productions in decorated_statement */ -enum parse_statement_decoration_t -{ - parse_statement_decoration_none, - parse_statement_decoration_command, - parse_statement_decoration_builtin -}; - -/* Argument flags as a bitmask, stored in the tag of argument */ -enum parse_argument_flags_t -{ - /* Indicates that this or a prior argument was --, so this should not be treated as an option */ - parse_argument_no_options = 1 << 0, - /* Indicates that the argument is for a cd command */ - parse_argument_is_for_cd = 1 << 1 + /* Utilities */ + enum parse_statement_decoration_t decoration_for_plain_statement(const parse_node_t &node) const; + }; /* Fish grammar: diff --git a/reader.cpp b/reader.cpp index 228fa9183..0f022c279 100644 --- a/reader.cpp +++ b/reader.cpp @@ -99,6 +99,7 @@ commence. #include "path.h" #include "parse_util.h" #include "parser_keywords.h" +#include "parse_tree.h" /** Maximum length of prefix string when printing completion @@ -659,117 +660,56 @@ bool reader_expand_abbreviation_in_command(const wcstring &cmdline, size_t curso const size_t subcmd_offset = cmdsub_begin - buff; const wcstring subcmd = wcstring(cmdsub_begin, cmdsub_end - cmdsub_begin); - const wchar_t *subcmd_cstr = subcmd.c_str(); - - /* Get the token containing the cursor */ - const wchar_t *subcmd_tok_begin = NULL, *subcmd_tok_end = NULL; - assert(cursor_pos >= subcmd_offset); - size_t subcmd_cursor_pos = cursor_pos - subcmd_offset; - parse_util_token_extent(subcmd_cstr, subcmd_cursor_pos, &subcmd_tok_begin, &subcmd_tok_end, NULL, NULL); - - /* Compute the offset of the token before the cursor within the subcmd */ - assert(subcmd_tok_begin >= subcmd_cstr); - assert(subcmd_tok_end >= subcmd_tok_begin); - const size_t subcmd_tok_begin_offset = subcmd_tok_begin - subcmd_cstr; - const size_t subcmd_tok_length = subcmd_tok_end - subcmd_tok_begin; - - /* Now parse the subcmd, looking for commands */ - bool had_cmd = false, previous_token_is_cmd = false; - tokenizer_t tok(subcmd_cstr, TOK_ACCEPT_UNFINISHED | TOK_SQUASH_ERRORS); - for (; tok_has_next(&tok); tok_next(&tok)) + const size_t subcmd_cursor_pos = cursor_pos - subcmd_offset; + + /* Parse this subcmd */ + parse_node_tree_t parse_tree; + parse_t parser; + parser.parse(subcmd, parse_flag_continue_after_error | parse_flag_accept_incomplete_tokens, &parse_tree, NULL); + + /* Look for plain statements where the cursor is at the end of the command */ + const parse_node_t *matching_cmd_node = NULL; + const size_t len = parse_tree.size(); + for (size_t i=0; i < len; i++) { - size_t tok_pos = static_cast(tok_get_pos(&tok)); - if (tok_pos > subcmd_tok_begin_offset) + const parse_node_t &node = parse_tree.at(i); + + /* Only interested in plain statements with source */ + if (node.type != symbol_plain_statement || ! node.has_source()) + continue; + + /* Skip decorated statements */ + if (parse_tree.decoration_for_plain_statement(node) != parse_statement_decoration_none) + continue; + + /* Get the command node. Skip it if we can't or it has no source */ + const parse_node_t *cmd_node = parse_tree.get_child(node, 0, parse_token_type_string); + if (cmd_node == NULL || ! cmd_node->has_source()) + continue; + + /* Now see if its source range contains our cursor, including at the end */ + if (subcmd_cursor_pos >= cmd_node->source_start && subcmd_cursor_pos <= cmd_node->source_start + cmd_node->source_length) { - /* We've passed the token we're interested in */ + /* Success! */ + matching_cmd_node = cmd_node; break; } - - int last_type = tok_last_type(&tok); - - switch (last_type) - { - case TOK_STRING: - { - if (had_cmd) - { - /* Parameter to the command. */ - } - else - { - const wcstring potential_cmd = tok_last(&tok); - if (parser_keywords_is_subcommand(potential_cmd)) - { - if (potential_cmd == L"command" || potential_cmd == L"builtin") - { - /* 'command' and 'builtin' defeat abbreviation expansion. Skip this command. */ - had_cmd = true; - } - else - { - /* Other subcommand. Pretend it doesn't exist so that we can expand the following command */ - had_cmd = false; - } - } - else - { - /* It's a normal command */ - had_cmd = true; - if (tok_pos == subcmd_tok_begin_offset) - { - /* This is the token we care about! */ - previous_token_is_cmd = true; - } - } - } - break; - } - - case TOK_REDIRECT_NOCLOB: - case TOK_REDIRECT_OUT: - case TOK_REDIRECT_IN: - case TOK_REDIRECT_APPEND: - case TOK_REDIRECT_FD: - { - if (!had_cmd) - { - break; - } - tok_next(&tok); - break; - } - - case TOK_PIPE: - case TOK_BACKGROUND: - case TOK_END: - { - had_cmd = false; - break; - } - - case TOK_COMMENT: - case TOK_ERROR: - default: - { - break; - } - } } - + + /* Now if we found a command node, expand it */ bool result = false; - if (previous_token_is_cmd) + if (matching_cmd_node != NULL) { - /* The token is a command. Try expanding it as an abbreviation. */ - const wcstring token = wcstring(subcmd, subcmd_tok_begin_offset, subcmd_tok_length); + assert(matching_cmd_node->type == parse_token_type_string); + const wcstring token = matching_cmd_node->get_source(subcmd); wcstring abbreviation; if (expand_abbreviation(token, &abbreviation)) { /* There was an abbreviation! Replace the token in the full command. Maintain the relative position of the cursor. */ if (output != NULL) { - size_t cmd_tok_begin_offset = subcmd_tok_begin_offset + subcmd_offset; output->assign(cmdline); - output->replace(cmd_tok_begin_offset, subcmd_tok_length, abbreviation); + output->replace(subcmd_offset + matching_cmd_node->source_start, matching_cmd_node->source_length, abbreviation); } result = true; }