More work on new parser

2024-12-26 12:53:13 +00:00 · 2013-08-08 15:06:46 -07:00 · 2013-08-08 15:06:46 -07:00 · 8e07e55c1f
commit 8e07e55c1f
parent 6a6593335d
9 changed files with 708 additions and 32 deletions
--- a/builtin.cpp
+++ b/builtin.cpp
@ -4063,7 +4063,7 @@ int builtin_parse(parser_t &parser, wchar_t **argv)
        parse_node_tree_t parse_tree;
        parse_error_list_t errors;
        parse_t parser;
-        bool success = parser.parse(src, &parse_tree, &errors);
+        bool success = parser.parse(src, parse_flag_none, &parse_tree, &errors, true);
        if (! success)
        {
            stdout_buffer.append(L"Parsing failed:\n");
--- a/common.cpp
+++ b/common.cpp
@ -507,7 +507,7 @@ const wchar_t *wcsfuncname(const wchar_t *str)
 }


-int wcsvarchr(wchar_t chr)
+bool wcsvarchr(wchar_t chr) // Reports whether chr may appear in a variable name
 {
    return iswalnum(chr) || chr == L'_'; // accepted: anything iswalnum() accepts, plus underscore
 }
--- a/common.h
+++ b/common.h
@ -608,10 +608,10 @@ const wchar_t *wcsfuncname(const wchar_t *str);
 /**
   Test if the given character is valid in a variable name

-   \return 1 if this is a valid name, 0 otherwise
+   \return true if this is a valid name, false otherwise
 */

-int wcsvarchr(wchar_t chr);
+bool wcsvarchr(wchar_t chr);


 /**
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@ -1801,23 +1801,65 @@ void history_tests_t::test_history_speed(void)
    delete hist;
 }

+/* Runs the new parser over a small table of inputs and checks only whether each one parses or fails as expected. */
+static void test_new_parser_correctness(void)
+{
+    say(L"Testing new parser!");
+
+    // Each entry pairs a source string with whether it is expected to parse
+    struct parser_test_t
+    {
+        const wchar_t *src;
+        bool ok;
+    };
+    const parser_test_t parser_tests[] =
+    {
+        {L"; ; ; ", true},
+        {L"if ; end", false},
+        {L"if true ; end", true},
+        {L"if true; end ; end", false},
+        {L"if end; end ; end", false},
+        {L"end", false}
+    };
+    const size_t test_count = sizeof parser_tests / sizeof *parser_tests;
+
+    for (size_t idx=0; idx < test_count; idx++)
+    {
+        const parser_test_t &test = parser_tests[idx];
+
+        // A fresh parser and tree per case, so no state leaks between inputs
+        parse_node_tree_t parse_tree;
+        parse_t parser;
+        const bool parsed = parser.parse(test.src, parse_flag_none, &parse_tree, NULL);
+        say(L"%lu / %lu: Parse \"%ls\": %s", idx+1, test_count, test.src, parsed ? "yes" : "no");
+
+        // Nothing to report when the outcome matches the expectation
+        if (parsed == test.ok)
+            continue;
+
+        if (parsed)
+        {
+            err(L"\"%ls\" should NOT have parsed, but did", test.src);
+        }
+        else
+        {
+            err(L"\"%ls\" should have parsed, but failed", test.src);
+        }
+    }
+    say(L"Parse tests complete");
+
+}
+
+__attribute__((unused)) // main() in this diff has its call to this test commented out, so the function is currently unreferenced
 static void test_new_parser(void)
 {
    say(L"Testing new parser!");
    const wcstring src = L"echo hello world";
    parse_node_tree_t parse_tree;
    parse_t parser;
-    bool success = parser.parse(src, &parse_tree, NULL);
+    bool success = parser.parse(src, parse_flag_none, &parse_tree, NULL); // no special parse flags, no error list requested
    if (! success)
    {
        say(L"Parsing failed");
    }
    else
    {
+#if 0 // the execution simulation below is compiled out
        parse_execution_context_t ctx(parse_tree, src);
        say(L"Simulating execution:");
        wcstring simulation = ctx.simulate();
        say(simulation.c_str());
+#endif
    }
 }

@ -1827,13 +1869,12 @@ static void test_new_parser(void)
 int main(int argc, char **argv)
 {
    setlocale(LC_ALL, "");
-    srand(time(0));
+    //srand(time(0));
    configure_thread_assertions_for_testing();

    program_name=L"(ignore)";

    say(L"Testing low-level functionality");
-    say(L"Lines beginning with '(ignore):' are not errors, they are warning messages\ngenerated by the fish parser library when given broken input, and can be\nignored. All actual errors begin with 'Error:'.");
    set_main_thread();
    setup_fork_guards();
    //proc_init();
@ -1843,7 +1884,8 @@ int main(int argc, char **argv)
    reader_init();
    env_init();

-    test_new_parser();
+    test_new_parser_correctness();
+    //test_new_parser();
    return 0;

    test_format();
--- a/highlight.cpp
+++ b/highlight.cpp
@ -34,6 +34,7 @@
 #include "wildcard.h"
 #include "path.h"
 #include "history.h"
+#include "parse_tree.h"

 /**
   Number of elements in the highlight_var array
@ -1307,11 +1308,16 @@ static void tokenize(const wchar_t * const buff, std::vector<int> &color, const
    }
 }

+void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);

 // PCA This function does I/O, (calls is_potential_path, path_get_path, maybe others) and so ought to only run on a background thread
 void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
 {
    ASSERT_IS_BACKGROUND_THREAD();
+    if (1) {
+        highlight_shell_magic(buff, color, pos, error, vars);
+        return;
+    }

    const size_t length = buff.size();
    assert(buff.size() == color.size());
@ -1440,7 +1446,413 @@ void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos,
    }
 }

+/* Paint the source range covered by the given node with a single color. A node without a source range is left untouched. */
+static void color_node(const parse_node_t &node, int color, std::vector<int> &color_array)
+{
+    if (node.has_source())
+    {
+        // Work out the half-open range [range_begin, range_end) and check that it fits inside the color array
+        const size_t range_begin = node.source_start;
+        const size_t range_end = range_begin + node.source_length;
+        assert(range_end >= range_begin);
+        assert(range_end <= color_array.size());
+
+        std::fill(color_array.begin() + range_begin, color_array.begin() + range_end, color);
+    }
+}
+
+/**
+   Color the characters of a single argument: backslash escapes, variable and wildcard operators, and quoted runs.
+
+   \param buffstr the text of the argument
+   \param colors iterator to the color slot of the argument's first character; buffstr.size() slots starting there are written
+   \param normal_status the color given to every character that is not assigned a more specific color
+*/
+static void color_argument(const wcstring &buffstr, std::vector<int>::iterator colors, int normal_status)
+{
+    const size_t buff_len = buffstr.size();
+    std::fill(colors, colors + buff_len, normal_status);
+
+    enum {e_unquoted, e_single_quoted, e_double_quoted} mode = e_unquoted;
+    // Nesting depth of unquoted braces; may go negative on an unbalanced '}'
+    int bracket_count=0;
+    for (size_t in_pos=0; in_pos < buff_len; in_pos++)
+    {
+        const wchar_t c = buffstr.at(in_pos);
+        switch (mode)
+        {
+            case e_unquoted:
+            {
+                if (c == L'\\')
+                {
+                    int fill_color = HIGHLIGHT_ESCAPE; //may be set to HIGHLIGHT_ERROR
+                    const size_t backslash_pos = in_pos;
+                    // Escape colors cover [backslash_pos, fill_end); leaving fill_end here colors nothing
+                    size_t fill_end = backslash_pos;
+
+                    // Move to the escaped character
+                    in_pos++;
+                    const wchar_t escaped_char = (in_pos < buff_len ? buffstr.at(in_pos) : L'\0');
+
+                    if (escaped_char == L'\0')
+                    {
+                        // Nothing follows the backslash: flag the backslash itself as an error
+                        fill_end = in_pos;
+                        fill_color = HIGHLIGHT_ERROR;
+                    }
+                    else if (wcschr(L"~%", escaped_char))
+                    {
+                        // Only colored as an escape when the backslash is the argument's first character
+                        if (in_pos == 1)
+                        {
+                            fill_end = in_pos + 1;
+                        }
+                    }
+                    else if (escaped_char == L',')
+                    {
+                        // A comma is only special inside braces; uses the same test as the bare ',' case below
+                        if (bracket_count > 0)
+                        {
+                            fill_end = in_pos + 1;
+                        }
+                    }
+                    else if (wcschr(L"abefnrtv*?$(){}[]'\"<>^ \\#;|&", escaped_char))
+                    {
+                        fill_end = in_pos + 1;
+                    }
+                    else if (wcschr(L"c", escaped_char))
+                    {
+                        // Like \ci.
+                        // NOTE(review): this colors the backslash and the 'c' only, not the character after it - confirm whether three characters were intended
+                        fill_end = in_pos + 1;
+                    }
+                    else if (wcschr(L"uUxX01234567", escaped_char))
+                    {
+                        // Numeric escape: accumulate up to 'chars' digits in the given base
+                        long long res=0;
+                        int chars=2;
+                        int base=16;
+
+                        wchar_t max_val = ASCII_MAX;
+
+                        switch (escaped_char)
+                        {
+                            case L'u':
+                            {
+                                chars=4;
+                                max_val = UCS2_MAX;
+                                in_pos++;
+                                break;
+                            }
+
+                            case L'U':
+                            {
+                                chars=8;
+                                max_val = WCHAR_MAX;
+                                in_pos++;
+                                break;
+                            }
+
+                            case L'x':
+                            {
+                                in_pos++;
+                                break;
+                            }
+
+                            case L'X':
+                            {
+                                max_val = BYTE_MAX;
+                                in_pos++;
+                                break;
+                            }
+
+                            default:
+                            {
+                                // a digit like \12; in_pos stays on the digit because it is part of the number
+                                base=8;
+                                chars=3;
+                                break;
+                            }
+                        }
+
+                        // Consume
+                        for (int i=0; i < chars && in_pos < buff_len; i++)
+                        {
+                            long d = convert_digit(buffstr.at(in_pos), base);
+                            if (d < 0)
+                                break;
+                            res = (res * base) + d;
+                            in_pos++;
+                        }
+                        //in_pos is now at the first character that could not be converted (or buff_len)
+                        assert(in_pos >= backslash_pos && in_pos <= buff_len);
+                        fill_end = in_pos;
+
+                        // It's an error if we exceeded the max value
+                        if (res > max_val)
+                            fill_color = HIGHLIGHT_ERROR;
+
+                        // Subtract one from in_pos, so that the increment in the loop will move to the next character
+                        in_pos--;
+                    }
+                    assert(fill_end >= backslash_pos);
+                    std::fill(colors + backslash_pos, colors + fill_end, fill_color);
+                }
+                else
+                {
+                    // Not a backslash
+                    switch (c)
+                    {
+                        case L'~':
+                        case L'%':
+                        {
+                            // Only an operator as the very first character of the argument
+                            if (in_pos == 0)
+                            {
+                                colors[in_pos] = HIGHLIGHT_OPERATOR;
+                            }
+                            break;
+                        }
+
+                        case L'$':
+                        {
+                            assert(in_pos < buff_len);
+                            // A dollar is an error unless followed by another dollar or a variable-name character
+                            int dollar_color = HIGHLIGHT_ERROR;
+                            if (in_pos + 1 < buff_len)
+                            {
+                                wchar_t next = buffstr.at(in_pos + 1);
+                                if (next == L'$' || wcsvarchr(next))
+                                    dollar_color = HIGHLIGHT_OPERATOR;
+                            }
+                            colors[in_pos] = dollar_color;
+                            break;
+                        }
+
+
+                        case L'*':
+                        case L'?':
+                        case L'(':
+                        case L')':
+                        {
+                            colors[in_pos] = HIGHLIGHT_OPERATOR;
+                            break;
+                        }
+
+                        case L'{':
+                        {
+                            colors[in_pos] = HIGHLIGHT_OPERATOR;
+                            bracket_count++;
+                            break;
+                        }
+
+                        case L'}':
+                        {
+                            colors[in_pos] = HIGHLIGHT_OPERATOR;
+                            bracket_count--;
+                            break;
+                        }
+
+                        case L',':
+                        {
+                            if (bracket_count > 0)
+                            {
+                                colors[in_pos] = HIGHLIGHT_OPERATOR;
+                            }
+
+                            break;
+                        }
+
+                        case L'\'':
+                        {
+                            colors[in_pos] = HIGHLIGHT_QUOTE;
+                            mode = e_single_quoted;
+                            break;
+                        }
+
+                        case L'\"':
+                        {
+                            colors[in_pos] = HIGHLIGHT_QUOTE;
+                            mode = e_double_quoted;
+                            break;
+                        }
+
+                    }
+                }
+                break;
+            }
+
+            /*
+             Inside a single quoted string, i.e 'foo'
+             */
+            case e_single_quoted:
+            {
+                colors[in_pos] = HIGHLIGHT_QUOTE;
+                if (c == L'\\')
+                {
+                    // backslash: only \\ and \' are escapes here
+                    if (in_pos + 1 < buff_len)
+                    {
+                        const wchar_t escaped_char = buffstr.at(in_pos + 1);
+                        if (escaped_char == L'\\' || escaped_char == L'\'')
+                        {
+                            colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash
+                            colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char
+                            in_pos += 1; //skip the escaped char so it cannot end the quote
+                        }
+                    }
+                }
+                else if (c == L'\'')
+                {
+                    mode = e_unquoted;
+                }
+                break;
+            }
+
+            /*
+             Inside a double quoted string, i.e. "foo"
+             */
+            case e_double_quoted:
+            {
+                colors[in_pos] = HIGHLIGHT_QUOTE;
+                switch (c)
+                {
+                    case L'"':
+                    {
+                        mode = e_unquoted;
+                        break;
+                    }
+
+                    case L'\\':
+                    {
+                        // backslash: inside double quotes the escapable characters are \, " and $.
+                        // The double quote must be recognized here, otherwise \" would be taken as the end of the string.
+                        if (in_pos + 1 < buff_len)
+                        {
+                            const wchar_t escaped_char = buffstr.at(in_pos + 1);
+                            if (escaped_char == L'\\' || escaped_char == L'"' || escaped_char == L'$')
+                            {
+                                colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash
+                                colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char
+                                in_pos += 1; //skip the escaped char so it cannot end the quote
+                            }
+                        }
+                        break;
+                    }
+
+                    case L'$':
+                    {
+                        // Variables still expand inside double quotes; same validity rule as in unquoted mode
+                        int dollar_color = HIGHLIGHT_ERROR;
+                        if (in_pos + 1 < buff_len)
+                        {
+                            wchar_t next = buffstr.at(in_pos + 1);
+                            if (next == L'$' || wcsvarchr(next))
+                                dollar_color = HIGHLIGHT_OPERATOR;
+                        }
+                        colors[in_pos] = dollar_color;
+                        break;
+                    }
+
+                }
+                break;
+            }
+        }
+    }
+}
+
+// Color all of the arguments of the given command
+// Run argument coloring over every argument node found beneath the given parent
+static void color_arguments(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &parent, std::vector<int> &color_array)
+{
+    const parse_node_tree_t::parse_node_list_t arg_nodes = tree.find_nodes(parent, symbol_argument);
+
+    // Scratch string, reused to hold the text of each argument in turn
+    wcstring arg_text;
+    node_offset_t idx = 0;
+    while (idx < arg_nodes.size())
+    {
+        const parse_node_t *arg = arg_nodes.at(idx);
+        assert(arg != NULL && arg->type == symbol_argument);
+
+        // Copy out the argument's text, then color the matching slice of the color array
+        arg_text.assign(src, arg->source_start, arg->source_length);
+        color_argument(arg_text, color_array.begin() + arg->source_start, HIGHLIGHT_NORMAL);
+        idx++;
+    }
+}
+
+// Color every direct child of parent that has the requested type and a source range
+static void color_children(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, int color, std::vector<int> &color_array)
+{
+    node_offset_t which = 0;
+    while (which < parent.child_count)
+    {
+        const parse_node_t *kid = tree.get_child(parent, which++);
+
+        // Skip missing children, children of another type, and children with nothing to color
+        if (kid == NULL || kid->type != type || ! kid->has_source())
+            continue;
+
+        color_node(*kid, color, color_array);
+    }
+}
+
+/**
+   Highlight the buffer by parsing it with the new parser and coloring source ranges according to the resulting nodes.
+   The pos, error and vars parameters are not used by this implementation.
+*/
+void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
+{
+    ASSERT_IS_BACKGROUND_THREAD();
+
+    // One color slot per character is required
+    assert(buff.size() == color.size());
+    if (buff.empty())
+        return;
+
+    // Every slot starts out as -1 and keeps that value unless a node below colors it
+    std::fill(color.begin(), color.end(), -1);
+
+    /* The working directory is fetched on this background thread but is not consumed below. This should really be passed in. */
+    const wcstring working_directory = env_get_pwd_slash();
+
+    /* Parse the buffer, continuing past errors and keeping comments so that both can be colored */
+    parse_node_tree_t parse_tree;
+    parse_t parser;
+    parser.parse(buff, parse_flag_continue_after_error | parse_flag_include_comments, &parse_tree, NULL);
+
+    /* Visit every node and color it according to its type */
+    parse_node_tree_t::const_iterator cursor = parse_tree.begin();
+    for (; cursor != parse_tree.end(); ++cursor)
+    {
+        const parse_node_t &current = *cursor;
+
+        switch (current.type)
+        {
+            // Error and comment nodes are colored over their whole range
+            case parse_special_type_parse_error:
+            case parse_special_type_tokenizer_error:
+                color_node(current, HIGHLIGHT_ERROR, color);
+                break;
+
+            case parse_special_type_comment:
+                color_node(current, HIGHLIGHT_COMMENT, color);
+                break;
+
+            // Statement terminators
+            case parse_token_type_background:
+            case parse_token_type_end:
+                color_node(current, HIGHLIGHT_END, color);
+                break;
+
+            case symbol_redirection:
+                color_children(parse_tree, current, parse_token_type_string, HIGHLIGHT_REDIRECTION, color);
+                break;
+
+            // Direct string children of these nodes are colored as commands, e.g. 'for' and 'in'
+            case symbol_for_header:
+            case symbol_while_header:
+            case symbol_begin_header:
+            case symbol_function_header:
+            case symbol_if_clause:
+            case symbol_else_clause:
+            case symbol_case_item:
+            case symbol_switch_statement:
+            case symbol_boolean_statement:
+            case symbol_decorated_statement:
+                color_children(parse_tree, current, parse_token_type_string, HIGHLIGHT_COMMAND, color);
+                break;
+
+            case symbol_plain_statement:
+            {
+                // The command itself
+                color_children(parse_tree, current, parse_token_type_string, HIGHLIGHT_COMMAND, color);
+
+                // Its arguments live under child 1, the arguments_or_redirections_list
+                const parse_node_t *arg_list = parse_tree.get_child(current, 1, symbol_arguments_or_redirections_list);
+                if (arg_list != NULL)
+                {
+                    color_arguments(buff, parse_tree, *arg_list, color);
+                }
+                break;
+            }
+
+            case symbol_arguments_or_redirections_list:
+            case symbol_argument_list:
+                /* Nothing to do: the parent statement colors these */
+                break;
+
+            default:
+                break;
+        }
+    }
+}

 /**
   Perform quote and parenthesis highlighting on the specified string.
--- a/highlight.h
+++ b/highlight.h
@ -84,6 +84,7 @@ struct file_detection_context_t;
   \param error a list in which a description of each error will be inserted. May be 0, in which case no error descriptions will be generated.
 */
 void highlight_shell(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
+void highlight_shell_magic(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);

 /**
   Perform syntax highlighting for the text in buff. Matching quotes and parentheses are highlighted. The result is
--- a/parse_productions.cpp
+++ b/parse_productions.cpp
@ -135,14 +135,12 @@ RESOLVE(statement)
                    return 2;

                case parse_keyword_else:
-                    //symbol_stack_pop();
                    return NO_PRODUCTION;

                case parse_keyword_switch:
                    return 3;

                case parse_keyword_end:
-                    PARSER_DIE(); //todo
                    return NO_PRODUCTION;

                    // 'in' is only special within a for_header
@ -378,7 +376,7 @@ RESOLVE(arguments_or_redirections_list)

 PRODUCTIONS(argument_or_redirection) =
 {
-    {parse_token_type_string},
+    {symbol_argument},
    {parse_token_type_redirection}
 };
 RESOLVE(argument_or_redirection)
@ -394,6 +392,18 @@ RESOLVE(argument_or_redirection)
    }
 }

+PRODUCTIONS(argument) = // an argument has exactly one production: a single string token
+{
+    {parse_token_type_string}
+};
+RESOLVE_ONLY(argument) // NOTE(review): presumably always selects the sole production above - confirm against the macro definition
+
+PRODUCTIONS(redirection) = // a redirection has exactly one production: a single redirection token
+{
+    {parse_token_type_redirection}
+};
+RESOLVE_ONLY(redirection) // NOTE(review): argument_or_redirection as shown in this diff still lists parse_token_type_redirection directly - confirm what pushes symbol_redirection
+
+
 PRODUCTIONS(optional_background) =
 {
    {},
@ -449,6 +459,8 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
        TEST(plain_statement)
        TEST(arguments_or_redirections_list)
        TEST(argument_or_redirection)
+        TEST(argument)
+        TEST(redirection)
        TEST(optional_background)
        
        case parse_token_type_string:
@ -461,6 +473,14 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
            PARSER_DIE();
            break;
            
+        case parse_special_type_parse_error:
+        case parse_special_type_tokenizer_error:
+        case parse_special_type_comment:
+            fprintf(stderr, "Special type %ls passed to %s\n", token_type_description(node_type).c_str(), __FUNCTION__);
+            PARSER_DIE();
+            break;
+            
+            
        case token_type_invalid:
            fprintf(stderr, "token_type_invalid passed to %s\n", __FUNCTION__);
            PARSER_DIE();
--- a/parse_tree.cpp
+++ b/parse_tree.cpp
@ -101,6 +101,11 @@ wcstring token_type_description(parse_token_type_t type)
            return L"arguments_or_redirections_list";
        case symbol_argument_or_redirection:
            return L"argument_or_redirection";
+        case symbol_argument:
+            return L"symbol_argument";
+        case symbol_redirection:
+            return L"symbol_redirection";
+

        case parse_token_type_string:
            return L"token_string";
@ -116,6 +121,14 @@ wcstring token_type_description(parse_token_type_t type)
            return L"token_terminate";
        case symbol_optional_background:
            return L"optional_background";
+        
+        case parse_special_type_parse_error:
+            return L"parse_error";
+        case parse_special_type_tokenizer_error:
+            return L"tokenizer_error";
+        case parse_special_type_comment:
+            return L"comment";
+
    }
    return format_string(L"Unknown token type %ld", static_cast<long>(type));
 }
@ -217,6 +230,14 @@ static parse_token_t parse_token_from_tokenizer_token(enum token_type tokenizer_
            result.type = parse_token_type_redirection;
            break;
            
+        case TOK_ERROR:
+            result.type = parse_special_type_tokenizer_error;
+            break;
+            
+        case TOK_COMMENT:
+            result.type = parse_special_type_comment;
+            break;
+

        default:
            fprintf(stderr, "Bad token type %d passed to %s\n", (int)tokenizer_token_type, __FUNCTION__);
@ -247,11 +268,18 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
        append_format(*result, L" <%lu children>", node.child_count);
    }
    if (node.type == parse_token_type_string)
+    {
+        if (node.source_start == -1)
+        {
+            append_format(*result, L" (no source)");
+        }
+        else
        {
            result->append(L": \"");
            result->append(src, node.source_start, node.source_length);
            result->append(L"\"");
        }
+    }
    result->push_back(L'\n');
    ++*line;
    for (size_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++)
@ -311,21 +339,25 @@ class parse_ll_t
    // Constructor
    parse_ll_t() : fatal_errored(false)
    {
-        // initial node
-        symbol_stack.push_back(parse_stack_element_t(symbol_job_list, 0)); // goal token
-        nodes.push_back(parse_node_t(symbol_job_list));
+        this->reset();
    }

    bool top_node_match_token(parse_token_t token);

    void accept_token(parse_token_t token, const wcstring &src);
    
+    // Clear the parse symbol stack (but not the node tree). Add a new job_list_t goal node.
+    void reset(void);
+
    void parse_error(const wchar_t *expected, parse_token_t token);
    void parse_error(parse_token_t token, const wchar_t *format, ...);
    void append_error_callout(wcstring &error_message, parse_token_t token);

    void dump_stack(void) const;
    
+    // Figure out the ranges of intermediate nodes
+    void determine_node_ranges();
+
    // Get the node corresponding to the top element of the stack
    parse_node_t &node_for_top_symbol()
    {
@ -453,9 +485,41 @@ void parse_ll_t::dump_stack(void) const
    }
 }

+// Give each node a source range equal to the union of the ranges of its children
+// Terminal nodes already have source ranges (and no children)
+// Since children always appear after their parents, we can implement this very simply by walking backwards
+// Children that have no source range themselves do not contribute to the union;
+// a node whose children all lack a source range is left without one
+void parse_ll_t::determine_node_ranges(void)
+{
+    // Marker value stored in source_start for "no source range"
+    const size_t source_start_invalid = -1;
+    size_t idx = nodes.size();
+    while (idx--)
+    {
+        parse_node_t *parent = &nodes.at(idx);
+
+        // Skip nodes that already have a source range. These are terminal nodes.
+        if (parent->source_start != source_start_invalid)
+            continue;
+
+        // Ok, this node needs a source range. Get all of its children, and then set its range.
+        size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge
+        for (node_offset_t i=0; i < parent->child_count; i++)
+        {
+            const parse_node_t &child = nodes.at(parent->child_offset(i));
+
+            // A child without a source range must be ignored: adding its length to the invalid
+            // start marker wraps around and would corrupt max_end (and so the parent's length)
+            if (child.source_start == source_start_invalid)
+                continue;
+
+            min_start = std::min(min_start, child.source_start);
+            max_end = std::max(max_end, child.source_start + child.source_length);
+        }
+
+        // min_start is still the marker if no child contributed a range
+        if (min_start != source_start_invalid) {
+            assert(max_end >= min_start);
+            parent->source_start = min_start;
+            parent->source_length = max_end - min_start;
+        }
+    }
+}
+
 void parse_ll_t::parse_error(parse_token_t token, const wchar_t *fmt, ...)
 {
-    this->dump_stack();
+    //this->dump_stack();
    parse_error_t err;
    
    va_list va;
@ -481,8 +545,27 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token)
    fatal_errored = true;
 }

+// Start over with a fresh goal: append a new job_list node to the tree (existing nodes are kept),
+// replace the symbol stack with that single goal symbol, and clear the fatal error flag
+void parse_ll_t::reset(void)
+{
+    // The new goal node lands at the current end of the node list
+    const node_offset_t goal_idx = this->nodes.size();
+    this->nodes.push_back(parse_node_t(symbol_job_list));
+
+    // Drop whatever symbols remain and leave only the goal, pointing at the node just added
+    this->symbol_stack.clear();
+    this->symbol_stack.push_back(parse_stack_element_t(symbol_job_list, goal_idx));
+
+    fatal_errored = false;
+}
+
+
 bool parse_ll_t::top_node_match_token(parse_token_t token)
 {
+    if (symbol_stack.empty())
+    {
+        // This can come about with an unbalanced 'end' or 'else', which causes us to terminate the outermost job list.
+        this->fatal_errored = true;
+        return false;
+    }
+    
    PARSE_ASSERT(! symbol_stack.empty());
    PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
    bool result = false;
@ -520,10 +603,23 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
        fprintf(stderr, "Accept token %ls\n", token.describe().c_str());
    }
    PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
-    PARSE_ASSERT(! symbol_stack.empty());
+    
    bool consumed = false;
+    
+    // Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty.
+    if (token.type == parse_special_type_parse_error || token.type == parse_special_type_tokenizer_error || token.type == parse_special_type_comment)
+    {
+        parse_node_t err_node(token.type);
+        err_node.source_start = token.source_start;
+        err_node.source_length = token.source_length;
+        nodes.push_back(err_node);
+        consumed = true;
+    }
+
    while (! consumed && ! this->fatal_errored)
    {
+        PARSE_ASSERT(! symbol_stack.empty());
+        
        if (top_node_match_token(token))
        {
            if (logit)
@ -534,6 +630,10 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
            break;
        }
        
+        // top_node_match_token may indicate an error if our stack is empty
+        if (this->fatal_errored)
+            break;
+        
        // Get the production for the top of the stack
        parse_stack_element_t &stack_elem = symbol_stack.back();
        parse_node_t &node = nodes.at(stack_elem.node_idx);
@ -548,6 +648,12 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
            // Manipulate the symbol stack.
            // Note that stack_elem is invalidated by popping the stack.
            symbol_stack_pop_push_production(production);
+            
+            // If we end up with an empty stack, something bad happened, like an unbalanced end
+            if (symbol_stack.empty())
+            {
+                this->parse_error(token, L"All symbols removed from symbol stack. Likely unbalanced else or end?", stack_elem.describe().c_str(), token.describe().c_str());
+            }
        }
    }
 }
@ -556,6 +662,11 @@ parse_t::parse_t() : parser(new parse_ll_t())
 {
 }

+// Release the parse_ll_t that the constructor allocated with new
+parse_t::~parse_t()
+{
+    delete this->parser;
+}
+
 static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
 {
    parse_keyword_t result = parse_keyword_none;
@ -597,21 +708,20 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
    return result;
 }

-bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors)
+bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it)
 {
-    tokenizer_t tok = tokenizer_t(str.c_str(), 0);
+    tok_flags_t tok_options = TOK_SQUASH_ERRORS;
+    if (parse_flags & parse_flag_include_comments)
+        tok_options |= TOK_SHOW_COMMENTS;
+        
+    tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
    for (; tok_has_next(&tok) && ! this->parser->fatal_errored; tok_next(&tok))
    {
        token_type tok_type = static_cast<token_type>(tok_last_type(&tok));
        const wchar_t *tok_txt = tok_last(&tok);
        int tok_start = tok_get_pos(&tok);
        size_t tok_extent = tok_get_extent(&tok);
-
-        if (tok_type == TOK_ERROR)
-        {
-            fprintf(stderr, "Tokenizer error\n");
-            break;
-        }
+        assert(tok_extent < 10000000); //paranoia

        parse_token_t token = parse_token_from_tokenizer_token(tok_type);
        token.tokenizer_type = tok_type;
@ -621,12 +731,31 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_
        this->parser->accept_token(token, str);
        
        if (this->parser->fatal_errored)
+        {
+            if (parse_flags & parse_flag_continue_after_error)
+            {
+                /* Mark an error and then keep going */
+                token.type = parse_special_type_parse_error;
+                token.keyword = parse_keyword_none;
+                this->parser->accept_token(token, str);
+                this->parser->reset();
+            }
+            else
+            {
+                /* Bail out */
                break;
            }
+        }
+    }

+    // Teach each node where its source range is
+    this->parser->determine_node_ranges();
+
+#if 0
    wcstring result = dump_tree(this->parser->nodes, str);
    fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
    fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t));
+#endif

    if (output != NULL)
    {
@ -642,3 +771,40 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_

    return ! this->parser->fatal_errored;
 }
+
+/* Return the which'th child of parent, or NULL if that child's offset lies beyond the end of this tree. When expected_type is not token_type_invalid, assert that the child has exactly that type. */
+const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type) const
+{
+    PARSE_ASSERT(which < parent.child_count);
+    const node_offset_t idx = parent.child_offset(which);
+    
+    // An offset past the end of the node list means there is no such child
+    if (idx >= this->size())
+    {
+        return NULL;
+    }
+    
+    // We have a node; if a type was requested it must match
+    const parse_node_t *child = &this->at(idx);
+    assert(expected_type == token_type_invalid || expected_type == child->type);
+    return child;
+}
+
+/* Append to result a pointer to every node of the given type in the subtree rooted at parent. The parent itself is tested first, then each child subtree in order. */
+static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, parse_node_tree_t::parse_node_list_t *result)
+{
+    if (parent.type == type)
+    {
+        result->push_back(&parent);
+    }
+    
+    const size_t count = parent.child_count;
+    for (size_t idx = 0; idx < count; idx++)
+    {
+        // Every child of a node reachable here is expected to exist in the tree
+        const parse_node_t *kid = tree.get_child(parent, idx);
+        assert(kid != NULL);
+        find_nodes_recursive(tree, *kid, type, result);
+    }
+}
+
+/* Collect pointers to all nodes of the given type at or below parent. The returned pointers refer to nodes stored in this tree. */
+parse_node_tree_t::parse_node_list_t parse_node_tree_t::find_nodes(const parse_node_t &parent, parse_token_type_t type) const
+{
+    parse_node_list_t matches;
+    find_nodes_recursive(*this, parent, type, &matches);
+    return matches;
+}
--- a/parse_tree.h
+++ b/parse_tree.h
@ -15,7 +15,7 @@
 #include <vector>

 #define PARSE_ASSERT(a) assert(a)
-#define PARSER_DIE() exit_without_destructors(-1)
+#define PARSER_DIE() do { fprintf(stderr, "Parser dying!\n"); exit_without_destructors(-1); } while (0)

 class parse_node_t;
 class parse_node_tree_t;
@ -36,6 +36,18 @@ struct parse_error_t
 };
 typedef std::vector<parse_error_t> parse_error_list_t;

+enum
+{
+    parse_flag_none = 0,
+    
+    /* Attempt to build a "parse tree" no matter what. When the parser hits a fatal error, parse_t::parse feeds it a parse_special_type_parse_error token and resets it instead of bailing out, so the result may be a 'forest' of disconnected trees. This is intended to be used by syntax highlighting. */
+    parse_flag_continue_after_error = 1 << 0,
+    
+    /* Include comment tokens: parse_t::parse adds TOK_SHOW_COMMENTS to the tokenizer options when this is set */
+    parse_flag_include_comments = 1 << 1
+};
+typedef unsigned int parse_tree_flags_t; /* Bitwise OR of the parse_flag_* values above */
+
 class parse_ll_t;
 class parse_t
 {
@ -43,7 +55,8 @@ class parse_t

 public:
    parse_t();
-    bool parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors);
+    ~parse_t();
+    bool parse(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it = false);
 };

 enum parse_token_type_t
@ -80,6 +93,9 @@ enum parse_token_type_t
    symbol_argument_list_nonempty,
    symbol_argument_list,
    
+    symbol_argument,
+    symbol_redirection,
+    
    symbol_optional_background,

    // Terminal types
@ -90,6 +106,11 @@ enum parse_token_type_t
    parse_token_type_end,
    parse_token_type_terminate,
    
+    // Very special terminal types that don't appear in the production list
+    parse_special_type_parse_error,
+    parse_special_type_tokenizer_error,
+    parse_special_type_comment,
+    
    LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate,
    FIRST_PARSE_TOKEN_TYPE = parse_token_type_string
 };
@ -145,7 +166,7 @@ public:
    wcstring describe(void) const;

    /* Constructor */
-    explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(0), source_length(0), child_start(0), child_count(0), tag(0)
+    explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), child_start(0), child_count(0), tag(0)
    {
    }

@ -154,10 +175,23 @@ public:
        PARSE_ASSERT(which < child_count);
        return child_start + which;
    }
+    
+    /* Returns false only while source_start holds the sentinel (size_t)(-1), which is the value the constructor initializes it with */
+    bool has_source() const
+    {
+        const size_t no_source = static_cast<size_t>(-1);
+        return source_start != no_source;
+    }
 };

 class parse_node_tree_t : public std::vector<parse_node_t>
 {
+    public:
+    
+    /* Get the node corresponding to the which'th child of the given node (which must be less than parent.child_count), or NULL if that child's offset is past the end of the tree. If expected_type is provided (i.e. is not token_type_invalid), assert that the node has that type. */
+    const parse_node_t *get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type = token_type_invalid) const;
+    
+    /* Find all the nodes of a given type at or underneath a given node: the parent itself is included if it matches, followed by matches from each child subtree in order */
+    typedef std::vector<const parse_node_t *> parse_node_list_t;
+    parse_node_list_t find_nodes(const parse_node_t &parent, parse_token_type_t type) const;
 };


@ -214,7 +248,8 @@ class parse_node_tree_t : public std::vector<parse_node_t>

    arguments_or_redirections_list = <empty> |
                                     argument_or_redirection arguments_or_redirections_list
-    argument_or_redirection = redirection | <TOK_STRING>
+    argument_or_redirection = argument | redirection
+    argument = <TOK_STRING>
    redirection = <TOK_REDIRECTION>
    
    terminator = <TOK_END> | <TOK_BACKGROUND>