From 58447c147f20d55555ed4035e3add1ccafec2998 Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Wed, 9 Oct 2013 15:57:10 -0700
Subject: [PATCH] Make the new parser LL(2). Support for correct handling of
 e.g. 'command --help'

---
 fish_tests.cpp        |  80 +++++++++++++++++++++++++++
 highlight.cpp         |   8 +--
 parse_productions.cpp |  44 +++++++++++++--
 parse_productions.h   |   4 +-
 parse_tree.cpp        | 125 +++++++++++++++++++++++++++++-------------
 parse_tree.h          |  11 +++-
 6 files changed, 219 insertions(+), 53 deletions(-)

diff --git a/fish_tests.cpp b/fish_tests.cpp
index 99ed6cd34..40a8d7db4 100644
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@@ -1945,6 +1945,85 @@ static void test_new_parser_fuzzing(void)
     say(L"All fuzzed in %f seconds!", end - start);
 }
 
+/* Test helper: parse src and report the command, the arguments (joined by single spaces), and the decoration of its plain statement through the out parameters. All outputs are reset first. Returns true if successful; returns false if the parse fails or if src does not contain exactly one plain statement. */
+static bool test_1_parse_ll2(const wcstring &src, wcstring *out_cmd, wcstring *out_joined_args, enum parse_statement_decoration_t *out_deco)
+{
+    out_cmd->clear();
+    out_joined_args->clear();
+    *out_deco = parse_statement_decoration_none;
+    
+    bool result = false;
+    parse_node_tree_t tree;
+    parse_t parser;
+    if (parser.parse(src, parse_flag_none, &tree, NULL))
+    {
+        /* Get the statement. Should only have one. The count is a size_t, so it is cast to unsigned long to match the %lu conversion below */
+        const parse_node_tree_t::parse_node_list_t stmt_nodes = tree.find_nodes(tree.at(0), symbol_plain_statement);
+        if (stmt_nodes.size() != 1)
+        {
+            say(L"Unexpected number of statements (%lu) found in '%ls'", (unsigned long)stmt_nodes.size(), src.c_str());
+            return false;
+        }
+        const parse_node_t &stmt = *stmt_nodes.at(0);
+        
+        /* Return its decoration */
+        *out_deco = tree.decoration_for_plain_statement(stmt);
+        
+        /* Return its command (command_for_plain_statement clears out_cmd if it cannot find one) */
+        tree.command_for_plain_statement(stmt, src, out_cmd);
+        
+        /* Return arguments separated by spaces */
+        const parse_node_tree_t::parse_node_list_t arg_nodes = tree.find_nodes(stmt, symbol_argument);
+        for (size_t i=0; i < arg_nodes.size(); i++)
+        {
+            if (i > 0) out_joined_args->push_back(L' ');
+            out_joined_args->append(arg_nodes.at(i)->get_source(src));
+        }
+        result = true;
+    }
+    return result;
+}
+
+/* Test the LL2 (two token lookahead) nature of the parser by exercising the special builtin and command handling. In particular, 'command foo' should be a decorated statement 'foo' but 'command --help' should be an undecorated statement 'command' with argument '--help', and NOT attempt to run a command called '--help'. Each table row gives the source to parse followed by the expected command, expected space-joined arguments, and expected decoration; every mismatch is reported through err(). */
+static void test_new_parser_ll2(void)
+{
+    say(L"Testing parser two-token lookahead");
+    
+    const struct
+    {
+        wcstring src;
+        wcstring cmd;
+        wcstring args;
+        enum parse_statement_decoration_t deco;
+    } tests[] =
+    {
+        {L"echo hello", L"echo", L"hello", parse_statement_decoration_none},
+        {L"command echo hello", L"echo", L"hello", parse_statement_decoration_command},
+        {L"command command hello", L"command", L"hello", parse_statement_decoration_command},
+        {L"builtin command hello", L"command", L"hello", parse_statement_decoration_builtin},
+        {L"command --help", L"command", L"--help", parse_statement_decoration_none},
+        {L"command -h", L"command", L"-h", parse_statement_decoration_none},
+        {L"command", L"command", L"", parse_statement_decoration_none},
+        {L"function", L"function", L"", parse_statement_decoration_none},
+        {L"function --help", L"function", L"--help", parse_statement_decoration_none}
+    };
+    
+    for (size_t i=0; i < sizeof tests / sizeof *tests; i++)
+    {
+        wcstring cmd, args;
+        enum parse_statement_decoration_t deco = parse_statement_decoration_none;
+        bool success = test_1_parse_ll2(tests[i].src, &cmd, &args, &deco);
+        if (! success)
+            err(L"Parse of '%ls' failed on line %ld", tests[i].src.c_str(), (long)__LINE__);
+        if (cmd != tests[i].cmd)
+            err(L"When parsing '%ls', expected command '%ls' but got '%ls' on line %ld", tests[i].src.c_str(), tests[i].cmd.c_str(), cmd.c_str(), (long)__LINE__);
+        if (args != tests[i].args)
+            err(L"When parsing '%ls', expected args '%ls' but got '%ls' on line %ld", tests[i].src.c_str(), tests[i].args.c_str(), args.c_str(), (long)__LINE__);
+        if (deco != tests[i].deco)
+            err(L"When parsing '%ls', expected decoration %d but got %d on line %ld", tests[i].src.c_str(), (int)tests[i].deco, (int)deco, (long)__LINE__);
+    }
+}
+
 __attribute__((unused))
 static void test_new_parser(void)
 {
@@ -2125,6 +2204,7 @@ int main(int argc, char **argv)
     env_init();
 
     test_highlighting();
+    test_new_parser_ll2();
     test_new_parser_fuzzing();
     test_new_parser_correctness();
     test_highlighting();
diff --git a/highlight.cpp b/highlight.cpp
index 8fe9989b9..ffd5953c6 100644
--- a/highlight.cpp
+++ b/highlight.cpp
@@ -337,12 +337,10 @@ bool plain_statement_get_expanded_command(const wcstring &src, const parse_node_
     assert(plain_statement.type == symbol_plain_statement);
     bool result = false;
     
-    // Get the command
-    const parse_node_t *cmd_node = tree.get_child(plain_statement, 0, parse_token_type_string);
-    if (cmd_node != NULL && cmd_node->has_source())
+    /* Get the command */
+    wcstring cmd;
+    if (tree.command_for_plain_statement(plain_statement, src, &cmd))
     {
-        wcstring cmd(src, cmd_node->source_start, cmd_node->source_length);
-        
         /* Try expanding it. If we cannot, it's an error. */
         if (expand_one(cmd, EXPAND_SKIP_CMDSUBST | EXPAND_SKIP_VARIABLES | EXPAND_SKIP_JOBS))
         {
diff --git a/parse_productions.cpp b/parse_productions.cpp
index 38d57ebab..90e4a99b8 100644
--- a/parse_productions.cpp
+++ b/parse_productions.cpp
@@ -8,7 +8,7 @@ static bool production_is_empty(const production_t production)
     return production[0] == token_type_invalid;
 }
 
-// Empty productions are allowed but must be first. Validate that the given production is in the valid range, i.e. it is either not empty or there is a non-empty production after it
+/* Empty productions are allowed but must be first. Validate that the given production is in the valid range, i.e. it is either not empty or there is a non-empty production after it */
 static bool production_is_valid(const production_options_t production_list, production_option_idx_t which)
 {
     if (which < 0 || which >= MAX_PRODUCTIONS)
@@ -26,9 +26,24 @@ static bool production_is_valid(const production_options_t production_list, prod
     return nonempty_found;
 }
 
+/* Decide whether a lookahead token (typically the second token of a statement) is a request for help, so that e.g. 'command --help' is treated as "invoke the 'command' builtin with --help" rather than "run a command named --help".
+
+    When naked_invocation_invokes_help is true, any token whose type is not a string (including an invalid type) also counts as help; this covers the user running e.g. 'command' with no arguments.
+*/
+static inline bool token_means_help(parse_token_type_t type, parse_keyword_t keyword, bool naked_invocation_invokes_help)
+{
+    /* An explicit -h or --help always means help */
+    const bool explicit_help = (keyword == parse_keyword_dash_h || keyword == parse_keyword_dashdash_help);
+
+    /* Otherwise a non-string token means help only when the caller opted in */
+    const bool implied_help = (naked_invocation_invokes_help && type != parse_token_type_string);
+
+    return explicit_help || implied_help;
+}
+
 #define PRODUCTIONS(sym) static const production_options_t productions_##sym
-#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword)
-#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword) { return 0; }
+#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, parse_token_type_t token_type2, parse_keyword_t token_keyword2)
+#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, parse_token_type_t token_type2, parse_keyword_t token_keyword2) { return 0; }
 
 #define KEYWORD(x) ((x) + LAST_TOKEN_OR_SYMBOL + 1)
 
@@ -115,6 +130,17 @@ PRODUCTIONS(statement) =
 };
 RESOLVE(statement)
 {
+    // Go to decorated statements if the subsequent token looks like '--help'
+    // If we are 'begin', then we expect to be invoked with no arguments. But if we are anything else, we require an argument, so do the same thing if the subsequent token is a line end.
+    if (token_type == parse_token_type_string)
+    {
+        bool naked_invocation_invokes_help = (token_keyword != parse_keyword_begin && token_keyword != parse_keyword_end);
+        if (token_means_help(token_type2, token_keyword2, naked_invocation_invokes_help))
+        {
+            return 4; //decorated statement
+        }
+    }
+
     switch (token_type)
     {
         case parse_token_type_string:
@@ -149,6 +175,8 @@ RESOLVE(statement)
                 case parse_keyword_command:
                 case parse_keyword_builtin:
                 case parse_keyword_case:
+                case parse_keyword_dash_h:
+                case parse_keyword_dashdash_help:
                     return 4;
             }
             break;
@@ -336,6 +364,10 @@ PRODUCTIONS(decorated_statement) =
 };
 RESOLVE(decorated_statement)
 {
+    /* If this is e.g. 'command --help' then the command is 'command' and not a decoration */
+    if (token_means_help(token_type2, token_keyword2, true /* naked_invocation_is_help */))
+        return 0;
+    
     switch (token_keyword)
     {
         default:
@@ -418,7 +450,7 @@ RESOLVE(optional_background)
 }
 
 #define TEST(sym) case (symbol_##sym): production_list = & productions_ ## sym ; resolver = resolve_ ## sym ; break;
-const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_which_production, wcstring *out_error_text)
+const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, parse_token_type_t input_type2, parse_keyword_t input_keyword2, production_option_idx_t *out_which_production, wcstring *out_error_text)
 {
     bool log_it = false;
     if (log_it)
@@ -428,7 +460,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
 
     /* Fetch the list of productions and the function to resolve them */
     const production_options_t *production_list = NULL;
-    production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword) = NULL;
+    production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword, parse_token_type_t token_type2, parse_keyword_t token_keyword2) = NULL;
     switch (node_type)
     {
             TEST(job_list)
@@ -486,7 +518,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
     PARSE_ASSERT(resolver != NULL);
 
     const production_t *result = NULL;
-    production_option_idx_t which = resolver(input_type, input_keyword);
+    production_option_idx_t which = resolver(input_type, input_keyword, input_type2, input_keyword2);
 
     if (log_it)
     {
diff --git a/parse_productions.h b/parse_productions.h
index 7e132d0c4..298be0b1c 100644
--- a/parse_productions.h
+++ b/parse_productions.h
@@ -62,8 +62,8 @@ inline bool production_element_is_valid(production_element_t elem)
     return elem != token_type_invalid;
 }
 
-/* Fetch a production */
-const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, production_option_idx_t *out_idx, wcstring *out_error_text);
+/* Fetch a production. We are passed two input tokens. The first input token is guaranteed to not be invalid; the second token may be invalid if there's no more tokens. */
+const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, parse_token_type_t input_type2, parse_keyword_t input_keyword2, production_option_idx_t *out_idx, wcstring *out_error_text);
 
 }
 
diff --git a/parse_tree.cpp b/parse_tree.cpp
index 900513f50..2066b8246 100644
--- a/parse_tree.cpp
+++ b/parse_tree.cpp
@@ -199,7 +199,7 @@ struct parse_token_t
 };
 
 /* Convert from tokenizer_t's token type to a parse_token_t type */
-static parse_token_type_t parse_token_type_from_tokenizer_token(enum token_type tokenizer_token_type)
+static inline parse_token_type_t parse_token_type_from_tokenizer_token(enum token_type tokenizer_token_type)
 {
     parse_token_type_t result = token_type_invalid;
     switch (tokenizer_token_type)
@@ -447,7 +447,7 @@ class parse_ll_t
     }
 
     /* Input */
-    void accept_token(parse_token_t token);
+    void accept_tokens(parse_token_t token1, parse_token_t token2);
     
     /* Indicate if we hit a fatal error */
     bool has_fatal_error(void) const
@@ -678,23 +678,23 @@ bool parse_ll_t::top_node_handle_terminal_types(parse_token_t token)
     return handled;
 }
 
-void parse_ll_t::accept_token(parse_token_t token)
+void parse_ll_t::accept_tokens(parse_token_t token1, parse_token_t token2)
 {
     bool logit = false;
     if (logit)
     {
-        fprintf(stderr, "Accept token %ls\n", token.describe().c_str());
+        fprintf(stderr, "Accept token %ls\n", token1.describe().c_str());
     }
-    PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
+    PARSE_ASSERT(token1.type >= FIRST_PARSE_TOKEN_TYPE);
 
     bool consumed = false;
 
     // Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty.
-    if (token.type == parse_special_type_parse_error || token.type == parse_special_type_tokenizer_error || token.type == parse_special_type_comment)
+    if (token1.type == parse_special_type_parse_error || token1.type == parse_special_type_tokenizer_error || token1.type == parse_special_type_comment)
     {
-        parse_node_t err_node(token.type);
-        err_node.source_start = token.source_start;
-        err_node.source_length = token.source_length;
+        parse_node_t err_node(token1.type);
+        err_node.source_start = token1.source_start;
+        err_node.source_length = token1.source_length;
         nodes.push_back(err_node);
         consumed = true;
     }
@@ -703,11 +703,11 @@ void parse_ll_t::accept_token(parse_token_t token)
     {
         PARSE_ASSERT(! symbol_stack.empty());
 
-        if (top_node_handle_terminal_types(token))
+        if (top_node_handle_terminal_types(token1))
         {
             if (logit)
             {
-                fprintf(stderr, "Consumed token %ls\n", token.describe().c_str());
+                fprintf(stderr, "Consumed token %ls\n", token1.describe().c_str());
             }
             consumed = true;
             break;
@@ -720,16 +720,16 @@ void parse_ll_t::accept_token(parse_token_t token)
         // Get the production for the top of the stack
         parse_stack_element_t &stack_elem = symbol_stack.back();
         parse_node_t &node = nodes.at(stack_elem.node_idx);
-        const production_t *production = production_for_token(stack_elem.type, token.type, token.keyword, &node.production_idx, NULL /* error text */);
+        const production_t *production = production_for_token(stack_elem.type, token1.type, token1.keyword, token2.type, token2.keyword, &node.production_idx, NULL /* error text */);
         if (production == NULL)
         {
             if (should_generate_error_messages)
             {
-                this->parse_error(token, L"Unable to produce a '%ls' from input '%ls'", stack_elem.describe().c_str(), token.describe().c_str());
+                this->parse_error(token1, L"Unable to produce a '%ls' from input '%ls'", stack_elem.describe().c_str(), token1.describe().c_str());
             }
             else
             {
-                this->parse_error(token, NULL);
+                this->parse_error(token1, NULL);
             }
             // parse_error sets fatal_errored, which ends the loop
         }
@@ -742,7 +742,7 @@ void parse_ll_t::accept_token(parse_token_t token)
             // If we end up with an empty stack, something bad happened, like an unbalanced end
             if (symbol_stack.empty())
             {
-                this->parse_error(token, L"All symbols removed from symbol stack. Likely unbalanced else or end?");
+                this->parse_error(token1, L"All symbols removed from symbol stack. Likely unbalanced else or end?");
             }
         }
     }
@@ -783,7 +783,9 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
             {L"or", parse_keyword_or},
             {L"not", parse_keyword_not},
             {L"command", parse_keyword_command},
-            {L"builtin", parse_keyword_builtin}
+            {L"builtin", parse_keyword_builtin},
+            {L"-h", parse_keyword_dash_h},
+            {L"--help", parse_keyword_dashdash_help}
         };
 
         for (size_t i=0; i < sizeof keywords / sizeof *keywords; i++)
@@ -798,8 +800,38 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
     return result;
 }
 
+/* Placeholder invalid token. Both source range fields are set to (size_t)-1, written with explicit casts so the brace initializer does not depend on an implicit negative-to-unsigned conversion. NOTE(review): assumes source_start/source_length are size_t, as the assignments in next_parse_token suggest - confirm against the struct definition. */
+static const parse_token_t kInvalidToken = {token_type_invalid, parse_keyword_none, (size_t)-1, (size_t)-1};
+
+/* Build a parse token from the tokenizer's current token, then advance the tokenizer. If the tokenizer has no more tokens, returns kInvalidToken without advancing. */
+static inline parse_token_t next_parse_token(tokenizer_t *tok)
+{
+    if (! tok_has_next(tok))
+        return kInvalidToken;
+
+    /* Read everything we need from the tokenizer before moving it forward */
+    const token_type tok_kind = static_cast<token_type>(tok_last_type(tok));
+    const int tok_offset = tok_get_pos(tok);
+    const size_t tok_extent = tok_get_extent(tok);
+    assert(tok_extent < 10000000); //paranoia
+    const wchar_t * const tok_text = tok_last(tok);
+
+    /* Translate the tokenizer's view into the parser's token representation */
+    parse_token_t token;
+    token.type = parse_token_type_from_tokenizer_token(tok_kind);
+    token.keyword = keyword_for_token(tok_kind, tok_text);
+    token.source_start = (size_t)tok_offset;
+    token.source_length = tok_extent;
+
+    tok_next(tok);
+    return token;
+}
+
 bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it)
 {
+    this->parser->set_should_generate_error_messages(errors != NULL);
+
+    /* Construct the tokenizer */
     tok_flags_t tok_options = TOK_SQUASH_ERRORS;
     if (parse_flags & parse_flag_include_comments)
         tok_options |= TOK_SHOW_COMMENTS;
@@ -807,32 +839,29 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
     if (parse_flags & parse_flag_accept_incomplete_tokens)
         tok_options |= TOK_ACCEPT_UNFINISHED;
     
-    this->parser->set_should_generate_error_messages(errors != NULL);
-
     tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
-    for (; tok_has_next(&tok) && ! this->parser->has_fatal_error(); tok_next(&tok))
+    
+    /* We are an LL(2) parser. We pass two tokens at a time. New tokens come in at index 1. Seed our queue with an initial token at index 1. */
+    parse_token_t queue[2] = {kInvalidToken, next_parse_token(&tok)};
+    
+    /* Go until the most recently added token is invalid. Note this may mean we don't process anything if there were no tokens. */
+    while (queue[1].type != token_type_invalid)
     {
-        token_type tok_type = static_cast<token_type>(tok_last_type(&tok));
-        const wchar_t *tok_txt = tok_last(&tok);
-        int tok_start = tok_get_pos(&tok);
-        size_t tok_extent = tok_get_extent(&tok);
-        assert(tok_extent < 10000000); //paranoia
-
-        parse_token_t token;
-        token.type = parse_token_type_from_tokenizer_token(tok_type);
-        token.source_start = (size_t)tok_start;
-        token.source_length = tok_extent;
-        token.keyword = keyword_for_token(tok_type, tok_txt);
-        this->parser->accept_token(token);
-
+        /* Push a new token onto the queue */
+        queue[0] = queue[1];
+        queue[1] = next_parse_token(&tok);
+        
+        /* Pass these two tokens. We know that queue[0] is valid; queue[1] may be invalid. */
+        this->parser->accept_tokens(queue[0], queue[1]);
+        
+        /* Handle errors */
         if (this->parser->has_fatal_error())
         {
             if (parse_flags & parse_flag_continue_after_error)
             {
-                /* Mark an error and then keep going */
-                token.type = parse_special_type_parse_error;
-                token.keyword = parse_keyword_none;
-                this->parser->accept_token(token);
+                /* Mark a special error token, and then keep going */
+                const parse_token_t token = {parse_special_type_parse_error, parse_keyword_none, -1, -1};
+                this->parser->accept_tokens(token, kInvalidToken);
                 this->parser->reset_symbols();
             }
             else
@@ -843,11 +872,10 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
         }
     }
 
+
     // Teach each node where its source range is
     this->parser->determine_node_ranges();
     
-    // Tag nodes
-    
     // Acquire the output from the parser
     this->parser->acquire_output(output, errors);
 
@@ -863,6 +891,8 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
 
 bool parse_t::parse_1_token(parse_token_type_t token_type, parse_keyword_t keyword, parse_node_tree_t *output, parse_error_list_t *errors)
 {
+    const parse_token_t invalid_token = {token_type_invalid, parse_keyword_none, -1, -1};
+    
     // Only strings can have keywords. So if we have a keyword, the type must be a string
     assert(keyword == parse_keyword_none || token_type == parse_token_type_string);
 
@@ -875,7 +905,7 @@ bool parse_t::parse_1_token(parse_token_type_t token_type, parse_keyword_t keywo
     bool wants_errors = (errors != NULL);
     this->parser->set_should_generate_error_messages(wants_errors);
 
-    this->parser->accept_token(token);
+    this->parser->accept_tokens(token, invalid_token);
 
     return ! this->parser->has_fatal_error();
 }
@@ -1007,3 +1037,20 @@ enum parse_statement_decoration_t parse_node_tree_t::decoration_for_plain_statem
     }
     return decoration;
 }
+
+bool parse_node_tree_t::command_for_plain_statement(const parse_node_t &node, const wcstring &src, wcstring *out_cmd) const
+{
+    /* Only plain statements are accepted; the assert enforces this */
+    assert(node.type == symbol_plain_statement);
+    /* Look up child 0 of the statement, requesting a string token; it is usable only if it has source */
+    const parse_node_t *command_node = this->get_child(node, 0, parse_token_type_string);
+    const bool have_command = (command_node != NULL && command_node->has_source());
+    if (! have_command)
+    {
+        /* No usable command: leave the output empty and report failure */
+        out_cmd->clear();
+        return false;
+    }
+    out_cmd->assign(src, command_node->source_start, command_node->source_length);
+    return true;
+}
diff --git a/parse_tree.h b/parse_tree.h
index b83e47abc..941ddd4e2 100644
--- a/parse_tree.h
+++ b/parse_tree.h
@@ -112,8 +112,12 @@ enum parse_keyword_t
     parse_keyword_not,
     parse_keyword_command,
     parse_keyword_builtin,
+    
+    /* The following are not really keywords but are necessary for e.g. "command --help" to work */
+    parse_keyword_dash_h,
+    parse_keyword_dashdash_help,
 
-    LAST_KEYWORD = parse_keyword_builtin
+    LAST_KEYWORD = parse_keyword_dashdash_help
 };
 
 
@@ -243,7 +247,12 @@ public:
     bool argument_list_is_root(const parse_node_t &node) const;
     
     /* Utilities */
+    
+    /* Given a plain statement, get the decoration (from the parent node), or none if there is no decoration */
     enum parse_statement_decoration_t decoration_for_plain_statement(const parse_node_t &node) const;
+    
+    /* Given a plain statement, get the command by reference (from the child node). Returns true if successful. Clears the command on failure. */
+    bool command_for_plain_statement(const parse_node_t &node, const wcstring &src, wcstring *out_cmd) const;
 
 };