Adoption of new parser in abbreviations

ridiculousfish 2013-10-09 02:03:50 -07:00
parent a51bd03a5c
commit 7b86b2e05a
7 changed files with 115 additions and 162 deletions

View file

@@ -61,7 +61,6 @@
#include "signal.h"
#include "highlight.h"
#include "parse_tree.h"
#include "parse_exec.h"
#include "parse_util.h"
/**
@@ -769,6 +768,11 @@ static void test_abbreviations(void)
expanded = reader_expand_abbreviation_in_command(L"of gc", wcslen(L"of gc"), &result);
if (expanded) err(L"gc incorrectly expanded on line %ld", (long)__LINE__);
/* others should not be */
expanded = reader_expand_abbreviation_in_command(L"command gc", wcslen(L"command gc"), &result);
if (expanded) err(L"gc incorrectly expanded on line %ld", (long)__LINE__);
env_pop();
}
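For contrast with the negative cases above, a positive case would look roughly like the following. This is only a sketch, not part of the commit: it assumes an abbreviation for 'gc' was registered earlier in test_abbreviations() and that result receives the expanded command line.

/* Hypothetical positive case: a bare 'gc' in command position should expand */
expanded = reader_expand_abbreviation_in_command(L"gc", wcslen(L"gc"), &result);
if (! expanded) err(L"gc failed to expand on line %ld", (long)__LINE__);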
@@ -1916,12 +1920,16 @@ static void test_new_parser_fuzzing(void)
size_t max = 5;
for (size_t len=1; len <= max; len++)
{
fprintf(stderr, "%lu / %lu\n", len, max);
fprintf(stderr, "%lu / %lu...", len, max);
std::vector<parser_fuzz_token_t> tokens(len);
size_t count = 0;
parse_t parser;
parse_node_tree_t parse_tree;
do
{
parse_t parser;
parse_node_tree_t parse_tree;
parser.clear();
parse_tree.clear();
count++;
for (size_t i=0; i < len; i++)
{
const parser_fuzz_token_t &token = tokens[i];
@@ -1931,6 +1939,7 @@ static void test_new_parser_fuzzing(void)
// keep going until we wrap
}
while (! increment(tokens));
fprintf(stderr, "done (%lu)\n", count);
}
double end = timef();
say(L"All fuzzed in %f seconds!", end - start);
@@ -2108,7 +2117,7 @@ int main(int argc, char **argv)
say(L"Testing low-level functionality");
set_main_thread();
setup_fork_guards();
//proc_init();
//proc_init(); //disabling this prevents catching SIGINT
event_init();
function_init();
builtin_init();
@@ -2116,7 +2125,6 @@ int main(int argc, char **argv)
env_init();
test_highlighting();
return 0;
test_new_parser_fuzzing();
test_new_parser_correctness();
test_highlighting();

View file

@@ -332,7 +332,7 @@ static bool is_potential_cd_path(const wcstring &path, const wcstring &working_d
}
/* Given a plain statement node in a parse tree, get the command and return it, expanded appropriately for commands. If we succeed, return true. */
static bool plain_statement_get_expanded_command(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &plain_statement, wcstring *out_cmd)
bool plain_statement_get_expanded_command(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &plain_statement, wcstring *out_cmd)
{
assert(plain_statement.type == symbol_plain_statement);
bool result = false;
@@ -708,15 +708,15 @@ static bool has_expand_reserved(const wcstring &str)
return result;
}
/* Parse a command line. Return by reference the last command, its arguments, and the offset in the string of the beginning of the last argument. This is used by autosuggestions */
static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expanded_command, const parse_node_t **out_last_arg)
/* Parse a command line. Return by reference the last command, and the last argument to that command (as a copied node), if any. This is used by autosuggestions */
static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expanded_command, parse_node_t *out_last_arg)
{
bool result = false;
/* Parse the buffer */
parse_node_tree_t parse_tree;
parse_t parser;
parser.parse(buff, parse_flag_continue_after_error, &parse_tree, NULL);
parser.parse(buff, parse_flag_continue_after_error | parse_flag_accept_incomplete_tokens, &parse_tree, NULL);
/* Find the last statement */
const parse_node_t *last_statement = parse_tree.find_last_node_of_type(symbol_plain_statement, NULL);
@@ -727,8 +727,12 @@ static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expand
/* We got it */
result = true;
/* Find the last argument */
*out_last_arg = parse_tree.find_last_node_of_type(symbol_plain_statement, last_statement);
/* Find the last argument. If we don't get one, return an invalid node. */
const parse_node_t *last_arg = parse_tree.find_last_node_of_type(symbol_argument, last_statement);
if (last_arg != NULL)
{
*out_last_arg = *last_arg;
}
}
}
return result;
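The switch from a pointer out-parameter to a copied parse_node_t matters here because parse_tree is a local: the old code handed back a pointer into it, which would dangle once autosuggest_parse_command returns. A sketch of the caller-side pattern the new signature implies (mirroring the call sites below; buff and last_arg_text are illustrative names):

/* The out node starts invalid, so its type doubles as a "was an argument found" flag */
wcstring parsed_command;
parse_node_t last_arg_node(token_type_invalid);
if (autosuggest_parse_command(buff, &parsed_command, &last_arg_node))
{
    if (last_arg_node.type == symbol_argument && last_arg_node.has_source())
    {
        const wcstring last_arg_text = last_arg_node.get_source(buff);
        /* ... operate on the text of the last argument ... */
    }
}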
@@ -739,20 +743,20 @@ bool autosuggest_suggest_special(const wcstring &str, const wcstring &working_di
{
if (str.empty())
return false;
ASSERT_IS_BACKGROUND_THREAD();
/* Parse the string */
wcstring parsed_command;
const parse_node_t *last_arg_node = NULL;
parse_node_t last_arg_node(token_type_invalid);
if (! autosuggest_parse_command(str, &parsed_command, &last_arg_node))
return false;
bool result = false;
if (parsed_command == L"cd" && last_arg_node != NULL && last_arg_node->has_source())
if (parsed_command == L"cd" && last_arg_node.type == symbol_argument && last_arg_node.has_source())
{
/* We can possibly handle this specially */
const wcstring escaped_dir = last_arg_node->get_source(str);
const wcstring escaped_dir = last_arg_node.get_source(str);
wcstring suggested_path;
/* We always return true because we recognized the command. This prevents us from falling back to dumber algorithms; for example we won't suggest a non-directory for the cd command. */
@@ -771,13 +775,12 @@ bool autosuggest_suggest_special(const wcstring &str, const wcstring &working_di
path_flags_t path_flags = (quote == L'\0') ? PATH_EXPAND_TILDE : 0;
if (unescaped && is_potential_cd_path(unescaped_dir, working_directory, path_flags, &suggested_path))
{
/* Note: this looks really wrong for strings that have an "unescapable" character in them, e.g. a \t, because parse_util_escape_string_with_quote will insert that character */
wcstring escaped_suggested_path = parse_util_escape_string_with_quote(suggested_path, quote);
/* Return it */
out_suggestion = str;
out_suggestion.erase(last_arg_node->source_start);
out_suggestion.erase(last_arg_node.source_start);
if (quote != L'\0') out_suggestion.push_back(quote);
out_suggestion.append(escaped_suggested_path);
if (quote != L'\0') out_suggestion.push_back(quote);
@@ -798,14 +801,14 @@ bool autosuggest_validate_from_history(const history_item_t &item, file_detectio
/* Parse the string */
wcstring parsed_command;
const parse_node_t *last_arg_node = NULL;
parse_node_t last_arg_node(token_type_invalid);
if (! autosuggest_parse_command(item.str(), &parsed_command, &last_arg_node))
return false;
if (parsed_command == L"cd" && last_arg_node != NULL && last_arg_node->has_source())
if (parsed_command == L"cd" && last_arg_node.type == symbol_argument && last_arg_node.has_source())
{
/* We can possibly handle this specially */
wcstring dir = last_arg_node->get_source(item.str());
wcstring dir = last_arg_node.get_source(item.str());
if (expand_one(dir, EXPAND_SKIP_CMDSUBST))
{
handled = true;
@@ -1968,12 +1971,7 @@ const highlighter_t::color_array_t & highlighter_t::highlight()
case symbol_plain_statement:
{
// Get the decoration from the parent
enum parse_statement_decoration_t decoration = parse_statement_decoration_none;
const parse_node_t *decorated_statement = parse_tree.get_parent(node, symbol_decorated_statement);
if (decorated_statement != NULL)
{
decoration = static_cast<enum parse_statement_decoration_t>(decorated_statement->production_idx);
}
enum parse_statement_decoration_t decoration = parse_tree.decoration_for_plain_statement(node);
/* Color the command */
const parse_node_t *cmd_node = parse_tree.get_child(node, 0, parse_token_type_string);

View file

@@ -27,8 +27,8 @@ static bool production_is_valid(const production_options_t production_list, prod
}
#define PRODUCTIONS(sym) static const production_options_t productions_##sym
#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag)
#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag) { return 0; }
#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword)
#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword) { return 0; }
#define KEYWORD(x) ((x) + LAST_TOKEN_OR_SYMBOL + 1)
@@ -418,7 +418,7 @@ RESOLVE(optional_background)
}
#define TEST(sym) case (symbol_##sym): production_list = & productions_ ## sym ; resolver = resolve_ ## sym ; break;
const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_which_production, production_tag_t *out_tag, wcstring *out_error_text)
const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_which_production, wcstring *out_error_text)
{
bool log_it = false;
if (log_it)
@@ -428,7 +428,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
/* Fetch the list of productions and the function to resolve them */
const production_options_t *production_list = NULL;
production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag) = NULL;
production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword) = NULL;
switch (node_type)
{
TEST(job_list)
@@ -486,7 +486,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
PARSE_ASSERT(resolver != NULL);
const production_t *result = NULL;
production_option_idx_t which = resolver(input_type, input_keyword, out_tag);
production_option_idx_t which = resolver(input_type, input_keyword);
if (log_it)
{

View file

@@ -63,7 +63,7 @@ inline bool production_element_is_valid(production_element_t elem)
}
/* Fetch a production */
const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_idx, production_tag_t *out_tag, wcstring *out_error_text);
const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_idx, wcstring *out_error_text);
}

View file

@@ -720,7 +720,7 @@ void parse_ll_t::accept_token(parse_token_t token)
// Get the production for the top of the stack
parse_stack_element_t &stack_elem = symbol_stack.back();
parse_node_t &node = nodes.at(stack_elem.node_idx);
const production_t *production = production_for_token(stack_elem.type, token.type, token.keyword, &node.production_idx, &node.tag, NULL /* error text */);
const production_t *production = production_for_token(stack_elem.type, token.type, token.keyword, &node.production_idx, NULL /* error text */);
if (production == NULL)
{
if (should_generate_error_messages)
@@ -804,6 +804,9 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
if (parse_flags & parse_flag_include_comments)
tok_options |= TOK_SHOW_COMMENTS;
if (parse_flags & parse_flag_accept_incomplete_tokens)
tok_options |= TOK_ACCEPT_UNFINISHED;
this->parser->set_should_generate_error_messages(errors != NULL);
tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
@@ -845,14 +848,14 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
// Tag nodes
#if 0
wcstring result = dump_tree(this->parser->nodes, str);
fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t));
#endif
// Acquire the output from the parser
this->parser->acquire_output(output, errors);
#if 0
//wcstring result = dump_tree(this->parser->nodes, str);
//fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", output->size(), sizeof(parse_node_t), output->size() * sizeof(parse_node_t));
#endif
// Indicate if we had a fatal error
return ! this->parser->has_fatal_error();
@@ -992,3 +995,15 @@ bool parse_node_tree_t::argument_list_is_root(const parse_node_t &node) const
}
return result;
}
enum parse_statement_decoration_t parse_node_tree_t::decoration_for_plain_statement(const parse_node_t &node) const
{
assert(node.type == symbol_plain_statement);
enum parse_statement_decoration_t decoration = parse_statement_decoration_none;
const parse_node_t *decorated_statement = this->get_parent(node, symbol_decorated_statement);
if (decorated_statement != NULL)
{
decoration = static_cast<enum parse_statement_decoration_t>(decorated_statement->production_idx);
}
return decoration;
}
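A minimal sketch of how the new helper is intended to be used (the command string and flag choice are illustrative; the calls themselves are the ones introduced or used elsewhere in this commit):

parse_node_tree_t tree;
parse_t parser;
parser.parse(L"builtin echo hi", parse_flag_continue_after_error, &tree, NULL);

const parse_node_t *stmt = tree.find_last_node_of_type(symbol_plain_statement, NULL);
if (stmt != NULL && tree.decoration_for_plain_statement(*stmt) == parse_statement_decoration_builtin)
{
    /* The statement was explicitly prefixed with 'builtin' */
}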

View file

@@ -125,7 +125,10 @@ enum
parse_flag_continue_after_error = 1 << 0,
/* Include comment tokens */
parse_flag_include_comments = 1 << 1
parse_flag_include_comments = 1 << 1,
/* Indicate that the tokenizer should accept incomplete tokens */
parse_flag_accept_incomplete_tokens = 1 << 2
};
typedef unsigned int parse_tree_flags_t;
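To illustrate why the new flag exists (a sketch; the buffer text is made up): callers that operate on command lines still being typed, such as autosuggestions and abbreviation expansion, may see an unfinished final token like an unclosed quote. The flag maps to TOK_ACCEPT_UNFINISHED in parse_t::parse, so the tokenizer returns such tokens as ordinary strings instead of errors.

/* Parse an in-progress command line whose last token is an unterminated quote */
parse_node_tree_t tree;
parse_t parser;
parser.parse(L"echo 'partially typed",
             parse_flag_continue_after_error | parse_flag_accept_incomplete_tokens,
             &tree, NULL);
/* The unfinished token should still appear as an argument node in the tree */
const parse_node_t *last_arg = tree.find_last_node_of_type(symbol_argument, NULL);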
@@ -175,9 +178,6 @@ public:
node_offset_t child_start;
node_offset_t child_count;
/* Type-dependent data */
uint32_t tag;
/* Which production was used */
uint8_t production_idx;
@@ -185,7 +185,7 @@ public:
wcstring describe(void) const;
/* Constructor */
explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0), tag(0)
explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0)
{
}
@@ -211,6 +211,15 @@ public:
}
};
/* Statement decorations. This matches the order of productions in decorated_statement */
enum parse_statement_decoration_t
{
parse_statement_decoration_none,
parse_statement_decoration_command,
parse_statement_decoration_builtin
};
/* The parse tree itself */
class parse_node_tree_t : public std::vector<parse_node_t>
{
@@ -232,27 +241,10 @@ public:
/* Indicate if the given argument_list or arguments_or_redirections_list is a root list, or has a parent */
bool argument_list_is_root(const parse_node_t &node) const;
};
/* Node type specific data, stored in the tag field */
/* Statement decorations, stored in the tag of plain_statement. This matches the order of productions in decorated_statement */
enum parse_statement_decoration_t
{
parse_statement_decoration_none,
parse_statement_decoration_command,
parse_statement_decoration_builtin
};
/* Argument flags as a bitmask, stored in the tag of argument */
enum parse_argument_flags_t
{
/* Indicates that this or a prior argument was --, so this should not be treated as an option */
parse_argument_no_options = 1 << 0,
/* Indicates that the argument is for a cd command */
parse_argument_is_for_cd = 1 << 1
/* Utilities */
enum parse_statement_decoration_t decoration_for_plain_statement(const parse_node_t &node) const;
};
/* Fish grammar:

View file

@@ -99,6 +99,7 @@ commence.
#include "path.h"
#include "parse_util.h"
#include "parser_keywords.h"
#include "parse_tree.h"
/**
Maximum length of prefix string when printing completion
@@ -659,117 +660,56 @@ bool reader_expand_abbreviation_in_command(const wcstring &cmdline, size_t curso
const size_t subcmd_offset = cmdsub_begin - buff;
const wcstring subcmd = wcstring(cmdsub_begin, cmdsub_end - cmdsub_begin);
const wchar_t *subcmd_cstr = subcmd.c_str();
/* Get the token containing the cursor */
const wchar_t *subcmd_tok_begin = NULL, *subcmd_tok_end = NULL;
assert(cursor_pos >= subcmd_offset);
size_t subcmd_cursor_pos = cursor_pos - subcmd_offset;
parse_util_token_extent(subcmd_cstr, subcmd_cursor_pos, &subcmd_tok_begin, &subcmd_tok_end, NULL, NULL);
/* Compute the offset of the token before the cursor within the subcmd */
assert(subcmd_tok_begin >= subcmd_cstr);
assert(subcmd_tok_end >= subcmd_tok_begin);
const size_t subcmd_tok_begin_offset = subcmd_tok_begin - subcmd_cstr;
const size_t subcmd_tok_length = subcmd_tok_end - subcmd_tok_begin;
/* Now parse the subcmd, looking for commands */
bool had_cmd = false, previous_token_is_cmd = false;
tokenizer_t tok(subcmd_cstr, TOK_ACCEPT_UNFINISHED | TOK_SQUASH_ERRORS);
for (; tok_has_next(&tok); tok_next(&tok))
const size_t subcmd_cursor_pos = cursor_pos - subcmd_offset;
/* Parse this subcmd */
parse_node_tree_t parse_tree;
parse_t parser;
parser.parse(subcmd, parse_flag_continue_after_error | parse_flag_accept_incomplete_tokens, &parse_tree, NULL);
/* Look for plain statements where the cursor is at the end of the command */
const parse_node_t *matching_cmd_node = NULL;
const size_t len = parse_tree.size();
for (size_t i=0; i < len; i++)
{
size_t tok_pos = static_cast<size_t>(tok_get_pos(&tok));
if (tok_pos > subcmd_tok_begin_offset)
const parse_node_t &node = parse_tree.at(i);
/* Only interested in plain statements with source */
if (node.type != symbol_plain_statement || ! node.has_source())
continue;
/* Skip decorated statements */
if (parse_tree.decoration_for_plain_statement(node) != parse_statement_decoration_none)
continue;
/* Get the command node. Skip it if we can't or it has no source */
const parse_node_t *cmd_node = parse_tree.get_child(node, 0, parse_token_type_string);
if (cmd_node == NULL || ! cmd_node->has_source())
continue;
/* Now see if its source range contains our cursor, including at the end */
if (subcmd_cursor_pos >= cmd_node->source_start && subcmd_cursor_pos <= cmd_node->source_start + cmd_node->source_length)
{
/* We've passed the token we're interested in */
/* Success! */
matching_cmd_node = cmd_node;
break;
}
int last_type = tok_last_type(&tok);
switch (last_type)
{
case TOK_STRING:
{
if (had_cmd)
{
/* Parameter to the command. */
}
else
{
const wcstring potential_cmd = tok_last(&tok);
if (parser_keywords_is_subcommand(potential_cmd))
{
if (potential_cmd == L"command" || potential_cmd == L"builtin")
{
/* 'command' and 'builtin' defeat abbreviation expansion. Skip this command. */
had_cmd = true;
}
else
{
/* Other subcommand. Pretend it doesn't exist so that we can expand the following command */
had_cmd = false;
}
}
else
{
/* It's a normal command */
had_cmd = true;
if (tok_pos == subcmd_tok_begin_offset)
{
/* This is the token we care about! */
previous_token_is_cmd = true;
}
}
}
break;
}
case TOK_REDIRECT_NOCLOB:
case TOK_REDIRECT_OUT:
case TOK_REDIRECT_IN:
case TOK_REDIRECT_APPEND:
case TOK_REDIRECT_FD:
{
if (!had_cmd)
{
break;
}
tok_next(&tok);
break;
}
case TOK_PIPE:
case TOK_BACKGROUND:
case TOK_END:
{
had_cmd = false;
break;
}
case TOK_COMMENT:
case TOK_ERROR:
default:
{
break;
}
}
}
/* Now if we found a command node, expand it */
bool result = false;
if (previous_token_is_cmd)
if (matching_cmd_node != NULL)
{
/* The token is a command. Try expanding it as an abbreviation. */
const wcstring token = wcstring(subcmd, subcmd_tok_begin_offset, subcmd_tok_length);
assert(matching_cmd_node->type == parse_token_type_string);
const wcstring token = matching_cmd_node->get_source(subcmd);
wcstring abbreviation;
if (expand_abbreviation(token, &abbreviation))
{
/* There was an abbreviation! Replace the token in the full command. Maintain the relative position of the cursor. */
if (output != NULL)
{
size_t cmd_tok_begin_offset = subcmd_tok_begin_offset + subcmd_offset;
output->assign(cmdline);
output->replace(cmd_tok_begin_offset, subcmd_tok_length, abbreviation);
output->replace(subcmd_offset + matching_cmd_node->source_start, matching_cmd_node->source_length, abbreviation);
}
result = true;
}