From 7b86b2e05a011e37bf11bba2675ef5db684bca24 Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Wed, 9 Oct 2013 02:03:50 -0700
Subject: [PATCH] Adoption of new parser in abbreviations

---
 fish_tests.cpp        |  20 +++++--
 highlight.cpp         |  40 ++++++-------
 parse_productions.cpp |  10 ++--
 parse_productions.h   |   2 +-
 parse_tree.cpp        |  29 ++++++---
 parse_tree.h          |  42 ++++++-------
 reader.cpp            | 134 ++++++++++++------------------------------
 7 files changed, 115 insertions(+), 162 deletions(-)

diff --git a/fish_tests.cpp b/fish_tests.cpp
index 6c77ec08a..99ed6cd34 100644
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@@ -61,7 +61,6 @@
 #include "signal.h"
 #include "highlight.h"
 #include "parse_tree.h"
-#include "parse_exec.h"
 #include "parse_util.h"
 
 /**
@@ -769,6 +768,11 @@ static void test_abbreviations(void)
     expanded = reader_expand_abbreviation_in_command(L"of gc", wcslen(L"of gc"), &result);
     if (expanded) err(L"gc incorrectly expanded on line %ld", (long)__LINE__);
 
+    /* a command decorated with 'command' should not be expanded either */
+    expanded = reader_expand_abbreviation_in_command(L"command gc", wcslen(L"command gc"), &result);
+    if (expanded) err(L"gc incorrectly expanded on line %ld", (long)__LINE__);
+
+
     env_pop();
 }
 
@@ -1916,12 +1920,16 @@ static void test_new_parser_fuzzing(void)
     size_t max = 5;
     for (size_t len=1; len <= max; len++)
     {
-        fprintf(stderr, "%lu / %lu\n", len, max);
+        fprintf(stderr, "%lu / %lu...", len, max);
         std::vector<parser_fuzz_token_t> tokens(len);
+        size_t count = 0;
+        parse_t parser;
+        parse_node_tree_t parse_tree;
         do
         {
-            parse_t parser;
-            parse_node_tree_t parse_tree;
+            parser.clear();
+            parse_tree.clear();
+            count++;
             for (size_t i=0; i < len; i++)
             {
                 const parser_fuzz_token_t &token = tokens[i];
@@ -1931,6 +1939,7 @@ static void test_new_parser_fuzzing(void)
             // keep going until we wrap
         }
         while (! increment(tokens));
+        fprintf(stderr, "done (%lu)\n", count);
     }
     double end = timef();
     say(L"All fuzzed in %f seconds!", end - start);
@@ -2108,7 +2117,7 @@ int main(int argc, char **argv)
     say(L"Testing low-level functionality");
     set_main_thread();
     setup_fork_guards();
-    //proc_init();
+    //proc_init(); //disabling this prevents catching SIGINT
     event_init();
     function_init();
     builtin_init();
@@ -2116,7 +2125,6 @@ int main(int argc, char **argv)
     env_init();
 
     test_highlighting();
-    return 0;
     test_new_parser_fuzzing();
     test_new_parser_correctness();
     test_highlighting();
diff --git a/highlight.cpp b/highlight.cpp
index 28e32b7a1..8fe9989b9 100644
--- a/highlight.cpp
+++ b/highlight.cpp
@@ -332,7 +332,7 @@ static bool is_potential_cd_path(const wcstring &path, const wcstring &working_d
 }
 
 /* Given a plain statement node in a parse tree, get the command and return it, expanded appropriately for commands. If we succeed, return true. */
-static bool plain_statement_get_expanded_command(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &plain_statement, wcstring *out_cmd)
+bool plain_statement_get_expanded_command(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &plain_statement, wcstring *out_cmd)
 {
     assert(plain_statement.type == symbol_plain_statement);
     bool result = false;
@@ -708,15 +708,15 @@ static bool has_expand_reserved(const wcstring &str)
     return result;
 }
 
-/* Parse a command line. Return by reference the last command, its arguments, and the offset in the string of the beginning of the last argument. This is used by autosuggestions */
-static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expanded_command, const parse_node_t **out_last_arg)
+/* Parse a command line. Return by reference the last command, and the last argument to that command (as a copied node), if any. This is used by autosuggestions */
+static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expanded_command, parse_node_t *out_last_arg)
 {
     bool result = false;
     
     /* Parse the buffer */
     parse_node_tree_t parse_tree;
     parse_t parser;
-    parser.parse(buff, parse_flag_continue_after_error, &parse_tree, NULL);
+    parser.parse(buff, parse_flag_continue_after_error | parse_flag_accept_incomplete_tokens, &parse_tree, NULL);
     
     /* Find the last statement */
     const parse_node_t *last_statement = parse_tree.find_last_node_of_type(symbol_plain_statement, NULL);
@@ -727,8 +727,12 @@ static bool autosuggest_parse_command(const wcstring &buff, wcstring *out_expand
             /* We got it */
             result = true;
             
-            /* Find the last argument */
-            *out_last_arg = parse_tree.find_last_node_of_type(symbol_plain_statement, last_statement);
+            /* Find the last argument. If we don't get one, return an invalid node. */
+            const parse_node_t *last_arg = parse_tree.find_last_node_of_type(symbol_argument, last_statement);
+            if (last_arg != NULL)
+            {
+                *out_last_arg = *last_arg;
+            }
         }
     }
     return result;
@@ -739,20 +743,20 @@ bool autosuggest_suggest_special(const wcstring &str, const wcstring &working_di
 {
     if (str.empty())
         return false;
-
+    
     ASSERT_IS_BACKGROUND_THREAD();
 
     /* Parse the string */
     wcstring parsed_command;
-    const parse_node_t *last_arg_node = NULL;
+    parse_node_t last_arg_node(token_type_invalid);
     if (! autosuggest_parse_command(str, &parsed_command, &last_arg_node))
         return false;
 
     bool result = false;
-    if (parsed_command == L"cd" && last_arg_node != NULL && last_arg_node->has_source())
+    if (parsed_command == L"cd" && last_arg_node.type == symbol_argument && last_arg_node.has_source())
     {
         /* We can possibly handle this specially */
-        const wcstring escaped_dir = last_arg_node->get_source(str);
+        const wcstring escaped_dir = last_arg_node.get_source(str);
         wcstring suggested_path;
 
         /* We always return true because we recognized the command. This prevents us from falling back to dumber algorithms; for example we won't suggest a non-directory for the cd command. */
@@ -771,13 +775,12 @@ bool autosuggest_suggest_special(const wcstring &str, const wcstring &working_di
         path_flags_t path_flags = (quote == L'\0') ? PATH_EXPAND_TILDE : 0;
         if (unescaped && is_potential_cd_path(unescaped_dir, working_directory, path_flags, &suggested_path))
         {
-
             /* Note: this looks really wrong for strings that have an "unescapable" character in them, e.g. a \t, because parse_util_escape_string_with_quote will insert that character */
             wcstring escaped_suggested_path = parse_util_escape_string_with_quote(suggested_path, quote);
 
             /* Return it */
             out_suggestion = str;
-            out_suggestion.erase(last_arg_node->source_start);
+            out_suggestion.erase(last_arg_node.source_start);
             if (quote != L'\0') out_suggestion.push_back(quote);
             out_suggestion.append(escaped_suggested_path);
             if (quote != L'\0') out_suggestion.push_back(quote);
@@ -798,14 +801,14 @@ bool autosuggest_validate_from_history(const history_item_t &item, file_detectio
 
     /* Parse the string */
     wcstring parsed_command;
-    const parse_node_t *last_arg_node = NULL;
+    parse_node_t last_arg_node(token_type_invalid);
     if (! autosuggest_parse_command(item.str(), &parsed_command, &last_arg_node))
         return false;
 
-    if (parsed_command == L"cd" && last_arg_node != NULL && last_arg_node->has_source())
+    if (parsed_command == L"cd" && last_arg_node.type == symbol_argument && last_arg_node.has_source())
     {
         /* We can possibly handle this specially */
-        wcstring dir = last_arg_node->get_source(item.str());
+        wcstring dir = last_arg_node.get_source(item.str());
         if (expand_one(dir, EXPAND_SKIP_CMDSUBST))
         {
             handled = true;
@@ -1968,12 +1971,7 @@ const highlighter_t::color_array_t & highlighter_t::highlight()
             case symbol_plain_statement:
             {
                 // Get the decoration from the parent
-                enum parse_statement_decoration_t decoration = parse_statement_decoration_none;
-                const parse_node_t *decorated_statement = parse_tree.get_parent(node, symbol_decorated_statement);
-                if (decorated_statement != NULL)
-                {
-                    decoration = static_cast<enum parse_statement_decoration_t>(decorated_statement->production_idx);
-                }
+                enum parse_statement_decoration_t decoration = parse_tree.decoration_for_plain_statement(node);
 
                 /* Color the command */
                 const parse_node_t *cmd_node = parse_tree.get_child(node, 0, parse_token_type_string);
diff --git a/parse_productions.cpp b/parse_productions.cpp
index 0900977f7..38d57ebab 100644
--- a/parse_productions.cpp
+++ b/parse_productions.cpp
@@ -27,8 +27,8 @@ static bool production_is_valid(const production_options_t production_list, prod
 }
 
 #define PRODUCTIONS(sym) static const production_options_t productions_##sym
-#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag)
-#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag) { return 0; }
+#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword)
+#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword) { return 0; }
 
 #define KEYWORD(x) ((x) + LAST_TOKEN_OR_SYMBOL + 1)
 
@@ -418,7 +418,7 @@ RESOLVE(optional_background)
 }
 
 #define TEST(sym) case (symbol_##sym): production_list = & productions_ ## sym ; resolver = resolve_ ## sym ; break;
-const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_which_production, production_tag_t *out_tag, wcstring *out_error_text)
+const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_which_production, wcstring *out_error_text)
 {
     bool log_it = false;
     if (log_it)
@@ -428,7 +428,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
 
     /* Fetch the list of productions and the function to resolve them */
     const production_options_t *production_list = NULL;
-    production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag) = NULL;
+    production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword) = NULL;
     switch (node_type)
     {
             TEST(job_list)
@@ -486,7 +486,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
     PARSE_ASSERT(resolver != NULL);
 
     const production_t *result = NULL;
-    production_option_idx_t which = resolver(input_type, input_keyword, out_tag);
+    production_option_idx_t which = resolver(input_type, input_keyword);
 
     if (log_it)
     {
diff --git a/parse_productions.h b/parse_productions.h
index a0d43f629..7e132d0c4 100644
--- a/parse_productions.h
+++ b/parse_productions.h
@@ -63,7 +63,7 @@ inline bool production_element_is_valid(production_element_t elem)
 }
 
 /* Fetch a production */
-const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_idx, production_tag_t *out_tag, wcstring *out_error_text);
+const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_idx, wcstring *out_error_text);
 
 }
 
diff --git a/parse_tree.cpp b/parse_tree.cpp
index 30ee6856b..900513f50 100644
--- a/parse_tree.cpp
+++ b/parse_tree.cpp
@@ -720,7 +720,7 @@ void parse_ll_t::accept_token(parse_token_t token)
         // Get the production for the top of the stack
         parse_stack_element_t &stack_elem = symbol_stack.back();
         parse_node_t &node = nodes.at(stack_elem.node_idx);
-        const production_t *production = production_for_token(stack_elem.type, token.type, token.keyword, &node.production_idx, &node.tag, NULL /* error text */);
+        const production_t *production = production_for_token(stack_elem.type, token.type, token.keyword, &node.production_idx, NULL /* error text */);
         if (production == NULL)
         {
             if (should_generate_error_messages)
@@ -804,6 +804,9 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
     if (parse_flags & parse_flag_include_comments)
         tok_options |= TOK_SHOW_COMMENTS;
     
+    if (parse_flags & parse_flag_accept_incomplete_tokens)
+        tok_options |= TOK_ACCEPT_UNFINISHED;
+    
     this->parser->set_should_generate_error_messages(errors != NULL);
 
     tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
@@ -845,14 +848,14 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
     
     // Tag nodes
     
-#if 0
-    wcstring result = dump_tree(this->parser->nodes, str);
-    fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
-    fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t));
-#endif
-
     // Acquire the output from the parser
     this->parser->acquire_output(output, errors);
+
+#if 0
+    //wcstring result = dump_tree(this->parser->nodes, str);
+    //fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
+    fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", output->size(), sizeof(parse_node_t), output->size() * sizeof(parse_node_t));
+#endif
     
     // Indicate if we had a fatal error
     return ! this->parser->has_fatal_error();
@@ -992,3 +995,15 @@ bool parse_node_tree_t::argument_list_is_root(const parse_node_t &node) const
     }
     return result;
 }
+
+/* Return the decoration applied to the given plain statement. The decoration is taken from the
+   production index of the statement's symbol_decorated_statement parent; when get_parent() finds
+   no such parent, the statement is reported as undecorated. */
+enum parse_statement_decoration_t parse_node_tree_t::decoration_for_plain_statement(const parse_node_t &node) const
+{
+    assert(node.type == symbol_plain_statement);
+    const parse_node_t *const owner = this->get_parent(node, symbol_decorated_statement);
+    if (owner == NULL)
+        return parse_statement_decoration_none;
+    return static_cast<enum parse_statement_decoration_t>(owner->production_idx);
+}
diff --git a/parse_tree.h b/parse_tree.h
index b2059914c..945d550c4 100644
--- a/parse_tree.h
+++ b/parse_tree.h
@@ -125,7 +125,10 @@ enum
     parse_flag_continue_after_error = 1 << 0,
     
     /* Include comment tokens */
-    parse_flag_include_comments = 1 << 1
+    parse_flag_include_comments = 1 << 1,
+    
+    /* Indicate that the tokenizer should accept incomplete tokens */
+    parse_flag_accept_incomplete_tokens = 1 << 2
 };
 typedef unsigned int parse_tree_flags_t;
 
@@ -175,9 +178,6 @@ public:
     node_offset_t child_start;
     node_offset_t child_count;
 
-    /* Type-dependent data */
-    uint32_t tag;
-
     /* Which production was used */
     uint8_t production_idx;
 
@@ -185,7 +185,7 @@ public:
     wcstring describe(void) const;
 
     /* Constructor */
-    explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0), tag(0)
+    explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), parent(NODE_OFFSET_INVALID), child_start(0), child_count(0), production_idx(0) /* zeroed so nodes constructed outside the parser never hold an indeterminate production index */
     {
     }
 
@@ -211,6 +211,15 @@ public:
     }
 };
 
+/* Decorations that can apply to a plain statement. decoration_for_plain_statement() casts a decorated_statement's production_idx directly to this type, so the enumerators must stay in the same order as the productions of decorated_statement. */
+enum parse_statement_decoration_t
+{
+    parse_statement_decoration_none,
+    parse_statement_decoration_command,
+    parse_statement_decoration_builtin
+};
+
+
 /* The parse tree itself */
 class parse_node_tree_t : public std::vector<parse_node_t>
 {
@@ -232,27 +241,10 @@ public:
     
     /* Indicate if the given argument_list or arguments_or_redirections_list is a root list, or has a parent */
     bool argument_list_is_root(const parse_node_t &node) const;
-};
-
-
-/* Node type specific data, stored in the tag field */
-
-/* Statement decorations, stored in the tag of plain_statement. This matches the order of productions in decorated_statement */
-enum parse_statement_decoration_t
-{
-    parse_statement_decoration_none,
-    parse_statement_decoration_command,
-    parse_statement_decoration_builtin
-};
-
-/* Argument flags as a bitmask, stored in the tag of argument */
-enum parse_argument_flags_t
-{
-    /* Indicates that this or a prior argument was --, so this should not be treated as an option */
-    parse_argument_no_options = 1 << 0,
     
-    /* Indicates that the argument is for a cd command */
-    parse_argument_is_for_cd = 1 << 1
+    /* Utilities */
+    enum parse_statement_decoration_t decoration_for_plain_statement(const parse_node_t &node) const;
+
 };
 
 /* Fish grammar:
diff --git a/reader.cpp b/reader.cpp
index 228fa9183..0f022c279 100644
--- a/reader.cpp
+++ b/reader.cpp
@@ -99,6 +99,7 @@ commence.
 #include "path.h"
 #include "parse_util.h"
 #include "parser_keywords.h"
+#include "parse_tree.h"
 
 /**
    Maximum length of prefix string when printing completion
@@ -659,117 +660,56 @@ bool reader_expand_abbreviation_in_command(const wcstring &cmdline, size_t curso
     const size_t subcmd_offset = cmdsub_begin - buff;
 
     const wcstring subcmd = wcstring(cmdsub_begin, cmdsub_end - cmdsub_begin);
-    const wchar_t *subcmd_cstr = subcmd.c_str();
-
-    /* Get the token containing the cursor */
-    const wchar_t *subcmd_tok_begin = NULL, *subcmd_tok_end = NULL;
-    assert(cursor_pos >= subcmd_offset);
-    size_t subcmd_cursor_pos = cursor_pos - subcmd_offset;
-    parse_util_token_extent(subcmd_cstr, subcmd_cursor_pos, &subcmd_tok_begin, &subcmd_tok_end, NULL, NULL);
-
-    /* Compute the offset of the token before the cursor within the subcmd */
-    assert(subcmd_tok_begin >= subcmd_cstr);
-    assert(subcmd_tok_end >= subcmd_tok_begin);
-    const size_t subcmd_tok_begin_offset = subcmd_tok_begin - subcmd_cstr;
-    const size_t subcmd_tok_length = subcmd_tok_end - subcmd_tok_begin;
-
-    /* Now parse the subcmd, looking for commands */
-    bool had_cmd = false, previous_token_is_cmd = false;
-    tokenizer_t tok(subcmd_cstr, TOK_ACCEPT_UNFINISHED | TOK_SQUASH_ERRORS);
-    for (; tok_has_next(&tok); tok_next(&tok))
+    const size_t subcmd_cursor_pos = cursor_pos - subcmd_offset;
+    
+    /* Parse this subcmd */
+    parse_node_tree_t parse_tree;
+    parse_t parser;
+    parser.parse(subcmd, parse_flag_continue_after_error | parse_flag_accept_incomplete_tokens, &parse_tree, NULL);
+    
+    /* Look for an undecorated plain statement whose command token contains the cursor (the end of the token counts) */
+    const parse_node_t *matching_cmd_node = NULL;
+    const size_t len = parse_tree.size();
+    for (size_t i=0; i < len; i++)
     {
-        size_t tok_pos = static_cast<size_t>(tok_get_pos(&tok));
-        if (tok_pos > subcmd_tok_begin_offset)
+        const parse_node_t &node = parse_tree.at(i);
+        
+        /* Only interested in plain statements with source */
+        if (node.type != symbol_plain_statement || ! node.has_source())
+            continue;
+        
+        /* Skip decorated statements */
+        if (parse_tree.decoration_for_plain_statement(node) != parse_statement_decoration_none)
+            continue;
+        
+        /* Get the command node. Skip it if we can't or it has no source */
+        const parse_node_t *cmd_node = parse_tree.get_child(node, 0, parse_token_type_string);
+        if (cmd_node == NULL || ! cmd_node->has_source())
+            continue;
+        
+        /* Now see if its source range contains our cursor, including at the end */
+        if (subcmd_cursor_pos >= cmd_node->source_start && subcmd_cursor_pos <= cmd_node->source_start + cmd_node->source_length)
         {
-            /* We've passed the token we're interested in */
+            /* Success! */
+            matching_cmd_node = cmd_node;
             break;
         }
-
-        int last_type = tok_last_type(&tok);
-
-        switch (last_type)
-        {
-            case TOK_STRING:
-            {
-                if (had_cmd)
-                {
-                    /* Parameter to the command. */
-                }
-                else
-                {
-                    const wcstring potential_cmd = tok_last(&tok);
-                    if (parser_keywords_is_subcommand(potential_cmd))
-                    {
-                        if (potential_cmd == L"command" || potential_cmd == L"builtin")
-                        {
-                            /* 'command' and 'builtin' defeat abbreviation expansion. Skip this command. */
-                            had_cmd = true;
-                        }
-                        else
-                        {
-                            /* Other subcommand. Pretend it doesn't exist so that we can expand the following command */
-                            had_cmd = false;
-                        }
-                    }
-                    else
-                    {
-                        /* It's a normal command */
-                        had_cmd = true;
-                        if (tok_pos == subcmd_tok_begin_offset)
-                        {
-                            /* This is the token we care about! */
-                            previous_token_is_cmd = true;
-                        }
-                    }
-                }
-                break;
-            }
-
-            case TOK_REDIRECT_NOCLOB:
-            case TOK_REDIRECT_OUT:
-            case TOK_REDIRECT_IN:
-            case TOK_REDIRECT_APPEND:
-            case TOK_REDIRECT_FD:
-            {
-                if (!had_cmd)
-                {
-                    break;
-                }
-                tok_next(&tok);
-                break;
-            }
-
-            case TOK_PIPE:
-            case TOK_BACKGROUND:
-            case TOK_END:
-            {
-                had_cmd = false;
-                break;
-            }
-
-            case TOK_COMMENT:
-            case TOK_ERROR:
-            default:
-            {
-                break;
-            }
-        }
     }
-
+    
+    /* Now if we found a command node, expand it */
     bool result = false;
-    if (previous_token_is_cmd)
+    if (matching_cmd_node != NULL)
     {
-        /* The token is a command. Try expanding it as an abbreviation. */
-        const wcstring token = wcstring(subcmd, subcmd_tok_begin_offset, subcmd_tok_length);
+        assert(matching_cmd_node->type == parse_token_type_string);
+        const wcstring token = matching_cmd_node->get_source(subcmd);
         wcstring abbreviation;
         if (expand_abbreviation(token, &abbreviation))
         {
             /* There was an abbreviation! Replace the token in the full command. Maintain the relative position of the cursor. */
             if (output != NULL)
             {
-                size_t cmd_tok_begin_offset = subcmd_tok_begin_offset + subcmd_offset;
                 output->assign(cmdline);
-                output->replace(cmd_tok_begin_offset, subcmd_tok_length, abbreviation);
+                output->replace(subcmd_offset + matching_cmd_node->source_start, matching_cmd_node->source_length, abbreviation);
             }
             result = true;
         }