More work on new parser

2024-12-27 05:13:10 +00:00 · 2013-08-08 15:06:46 -07:00 · 2013-08-08 15:06:46 -07:00 · 8e07e55c1f
commit 8e07e55c1f
parent 6a6593335d
9 changed files with 708 additions and 32 deletions
--- a/builtin.cpp
+++ b/builtin.cpp
@ -4063,7 +4063,7 @@ int builtin_parse(parser_t &parser, wchar_t **argv)
        parse_node_tree_t parse_tree;
        parse_error_list_t errors;
        parse_t parser;
-        bool success = parser.parse(src, &parse_tree, &errors);
+        bool success = parser.parse(src, parse_flag_none, &parse_tree, &errors, true);
        if (! success)
        {
            stdout_buffer.append(L"Parsing failed:\n");
--- a/common.cpp
+++ b/common.cpp
@ -507,7 +507,7 @@ const wchar_t *wcsfuncname(const wchar_t *str)
 }
-int wcsvarchr(wchar_t chr)
+bool wcsvarchr(wchar_t chr)
 {
    return iswalnum(chr) || chr == L'_';
 }
--- a/common.h
+++ b/common.h
@ -608,10 +608,10 @@ const wchar_t *wcsfuncname(const wchar_t *str);
 /**
   Test if the given string is valid in a variable name
-   \return 1 if this is a valid name, 0 otherwise
+   \return true if this is a valid name, false otherwise
 */
-int wcsvarchr(wchar_t chr);
+bool wcsvarchr(wchar_t chr);
 /**
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@ -1801,23 +1801,65 @@ void history_tests_t::test_history_speed(void)
    delete hist;
 }
 static void test_new_parser_correctness(void)
 {
    say(L"Testing new parser!");
    const struct parser_test_t
    {
        const wchar_t *src;
        bool ok;
    }
    parser_tests[] =
    {
        {L"; ; ; ", true},
        {L"if ; end", false},
        {L"if true ; end", true},
        {L"if true; end ; end", false},
        {L"if end; end ; end", false},
        {L"end", false}
    };
    for (size_t i=0; i < sizeof parser_tests / sizeof *parser_tests; i++)
    {
        const parser_test_t *test = &parser_tests[i];
        parse_node_tree_t parse_tree;
        parse_t parser;
        bool success = parser.parse(test->src, parse_flag_none, &parse_tree, NULL);
        say(L"%lu / %lu: Parse \"%ls\": %s", i+1, sizeof parser_tests / sizeof *parser_tests, test->src, success ? "yes" : "no");
        if (success && ! test->ok)
        {
            err(L"\"%ls\" should NOT have parsed, but did", test->src);
        }
        else if (! success && test->ok)
        {
            err(L"\"%ls\" should have parsed, but failed", test->src);
        }
    }
    say(L"Parse tests complete");
 }
 __attribute__((unused))
 static void test_new_parser(void)
 {
    say(L"Testing new parser!");
    const wcstring src = L"echo hello world";
    parse_node_tree_t parse_tree;
    parse_t parser;
-    bool success = parser.parse(src, &parse_tree, NULL);
+    bool success = parser.parse(src, parse_flag_none, &parse_tree, NULL);
    if (! success)
    {
        say(L"Parsing failed");
    }
    else
    {
 #if 0
        parse_execution_context_t ctx(parse_tree, src);
        say(L"Simulating execution:");
        wcstring simulation = ctx.simulate();
        say(simulation.c_str());
 #endif
    }
 }
@ -1827,13 +1869,12 @@ static void test_new_parser(void)
 int main(int argc, char **argv)
 {
    setlocale(LC_ALL, "");
-    srand(time(0));
+    //srand(time(0));
    configure_thread_assertions_for_testing();
    program_name=L"(ignore)";
    say(L"Testing low-level functionality");
    say(L"Lines beginning with '(ignore):' are not errors, they are warning messages\ngenerated by the fish parser library when given broken input, and can be\nignored. All actual errors begin with 'Error:'.");
    set_main_thread();
    setup_fork_guards();
    //proc_init();
@ -1843,7 +1884,8 @@ int main(int argc, char **argv)
    reader_init();
    env_init();
-    test_new_parser();
+    test_new_parser_correctness();
    //test_new_parser();
    return 0;
    test_format();
--- a/highlight.cpp
+++ b/highlight.cpp
@ -34,6 +34,7 @@
 #include "wildcard.h"
 #include "path.h"
 #include "history.h"
 #include "parse_tree.h"
 /**
   Number of elements in the highlight_var array
@ -1307,11 +1308,16 @@ static void tokenize(const wchar_t * const buff, std::vector<int> &color, const
    }
 }
 void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
 // PCA This function does I/O, (calls is_potential_path, path_get_path, maybe others) and so ought to only run on a background thread
 void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
 {
    ASSERT_IS_BACKGROUND_THREAD();
    if (1) {
        highlight_shell_magic(buff, color, pos, error, vars);
        return;
    }
    const size_t length = buff.size();
    assert(buff.size() == color.size());
@ -1440,7 +1446,413 @@ void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos,
    }
 }
 /**
   Paint the source range covered by \c node with \c color in \c color_array.
   Nodes that report no source range are left untouched.
 */
 static void color_node(const parse_node_t &node, int color, std::vector<int> &color_array)
 {
    if (node.has_source())
    {
        const size_t range_begin = node.source_start;
        const size_t range_end = range_begin + node.source_length;
        // Guard against wraparound and against ranges that run past the color buffer
        assert(range_end >= range_begin);
        assert(range_end <= color_array.size());
        std::fill(color_array.begin() + range_begin, color_array.begin() + range_end, color);
    }
 }
 /**
   Color a single argument whose text is passed on its own in \c buffstr.

   \param buffstr the text of the argument
   \param colors iterator to the first color slot belonging to this argument; buffstr.size() entries starting here are written
   \param normal_status the color given to every character that is not otherwise highlighted

   The argument is scanned once, tracking whether we are unquoted, inside single quotes or
   inside double quotes, and escapes, operators, quotes and variable sigils are colored accordingly.
 */
 static void color_argument(const wcstring &buffstr, std::vector<int>::iterator colors, int normal_status)
 {
    const size_t buff_len = buffstr.size();
    std::fill(colors, colors + buff_len, normal_status);
    enum {e_unquoted, e_single_quoted, e_double_quoted} mode = e_unquoted;
    // Nesting depth of unquoted '{' ... '}'; a comma is only an operator inside brackets
    int bracket_count=0;
    for (size_t in_pos=0; in_pos < buff_len; in_pos++)
    {
        const wchar_t c = buffstr.at(in_pos);
        switch (mode)
        {
            case e_unquoted:
            {
                if (c == L'\\')
                {
                    int fill_color = HIGHLIGHT_ESCAPE; //may be set to HIGHLIGHT_ERROR
                    const size_t backslash_pos = in_pos;
                    // The range [backslash_pos, fill_end) receives fill_color; empty unless an escape is recognized
                    size_t fill_end = backslash_pos;
                    // Move to the escaped character
                    in_pos++;
                    const wchar_t escaped_char = (in_pos < buff_len ? buffstr.at(in_pos) : L'\0');
                    if (escaped_char == L'\0')
                    {
                        // Backslash with nothing after it: flag the backslash itself as an error
                        fill_end = in_pos;
                        fill_color = HIGHLIGHT_ERROR;
                    }
                    else if (wcschr(L"~%", escaped_char))
                    {
                        // Only highlighted as an escape when the backslash is the first character of the argument
                        if (in_pos == 1)
                        {
                            fill_end = in_pos + 1;
                        }
                    }
                    else if (escaped_char == L',')
                    {
                        if (bracket_count)
                        {
                            fill_end = in_pos + 1;
                        }
                    }
                    else if (wcschr(L"abefnrtv*?$(){}[]'\"<>^ \\#;|&", escaped_char))
                    {
                        fill_end = in_pos + 1;
                    }
                    else if (wcschr(L"c", escaped_char))
                    {
                        // Control escape like \ci. Only the backslash and the 'c' are highlighted here.
                        // NOTE(review): an earlier comment said three characters should be highlighted; confirm which is intended.
                        fill_end = in_pos + 1;
                    }
                    else if (wcschr(L"uUxX01234567", escaped_char))
                    {
                        long long res=0;
                        int chars=2;
                        int base=16;
                        wchar_t max_val = ASCII_MAX;
                        switch (escaped_char)
                        {
                            case L'u':
                            {
                                chars=4;
                                max_val = UCS2_MAX;
                                in_pos++;
                                break;
                            }
                            case L'U':
                            {
                                chars=8;
                                max_val = WCHAR_MAX;
                                in_pos++;
                                break;
                            }
                            case L'x':
                            {
                                in_pos++;
                                break;
                            }
                            case L'X':
                            {
                                max_val = BYTE_MAX;
                                in_pos++;
                                break;
                            }
                            default:
                            {
                                // a digit like \12
                                base=8;
                                chars=3;
                                break;
                            }
                        }
                        // Consume
                        for (int i=0; i < chars && in_pos < buff_len; i++)
                        {
                            long d = convert_digit(buffstr.at(in_pos), base);
                            if (d < 0)
                                break;
                            res = (res * base) + d;
                            in_pos++;
                        }
                        //in_pos is now at the first character that could not be converted (or buff_len)
                        assert(in_pos >= backslash_pos && in_pos <= buff_len);
                        fill_end = in_pos;
                        // It's an error if we exceeded the max value
                        if (res > max_val)
                            fill_color = HIGHLIGHT_ERROR;
                        // Subtract one from in_pos, so that the increment in the loop will move to the next character
                        in_pos--;
                    }
                    assert(fill_end >= backslash_pos);
                    std::fill(colors + backslash_pos, colors + fill_end, fill_color);
                }
                else
                {
                    // Not a backslash
                    switch (c)
                    {
                        case L'~':
                        case L'%':
                        {
                            // Only an operator as the very first character of the argument
                            if (in_pos == 0)
                            {
                                colors[in_pos] = HIGHLIGHT_OPERATOR;
                            }
                            break;
                        }
                        case L'$':
                        {
                            assert(in_pos < buff_len);
                            // A '$' is an error unless followed by another '$' or a variable-name character
                            int dollar_color = HIGHLIGHT_ERROR;
                            if (in_pos + 1 < buff_len)
                            {
                                wchar_t next = buffstr.at(in_pos + 1);
                                if (next == L'$' || wcsvarchr(next))
                                    dollar_color = HIGHLIGHT_OPERATOR;
                            }
                            colors[in_pos] = dollar_color;
                            break;
                        }
                        case L'*':
                        case L'?':
                        case L'(':
                        case L')':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            break;
                        }
                        case L'{':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            bracket_count++;
                            break;
                        }
                        case L'}':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            bracket_count--;
                            break;
                        }
                        case L',':
                        {
                            if (bracket_count > 0)
                            {
                                colors[in_pos] = HIGHLIGHT_OPERATOR;
                            }
                            break;
                        }
                        case L'\'':
                        {
                            colors[in_pos] = HIGHLIGHT_QUOTE;
                            mode = e_single_quoted;
                            break;
                        }
                        case L'\"':
                        {
                            colors[in_pos] = HIGHLIGHT_QUOTE;
                            mode = e_double_quoted;
                            break;
                        }
                    }
                }
                break;
            }
            /*
             Inside a single quoted string, i.e. 'foo'
             */
            case e_single_quoted:
            {
                colors[in_pos] = HIGHLIGHT_QUOTE;
                if (c == L'\\')
                {
                    // backslash: only \\ and \' are escapes inside single quotes
                    if (in_pos + 1 < buff_len)
                    {
                        const wchar_t escaped_char = buffstr.at(in_pos + 1);
                        if (escaped_char == L'\\' || escaped_char == L'\'')
                        {
                            colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash
                            colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char
                            in_pos += 1; //skip over the escaped char so it is not interpreted again
                        }
                    }
                }
                else if (c == L'\'')
                {
                    mode = e_unquoted;
                }
                break;
            }
            /*
             Inside a double quoted string, i.e. "foo"
             */
            case e_double_quoted:
            {
                colors[in_pos] = HIGHLIGHT_QUOTE;
                switch (c)
                {
                    case L'"':
                    {
                        mode = e_unquoted;
                        break;
                    }
                    case L'\\':
                    {
                        // backslash: inside double quotes the escapable characters are \, " and $.
                        // Checking for the double quote (not the single quote) matters: otherwise the
                        // quote in \" would be seen by the next iteration and end the string early.
                        if (in_pos + 1 < buff_len)
                        {
                            const wchar_t escaped_char = buffstr.at(in_pos + 1);
                            if (escaped_char == L'\\' || escaped_char == L'"' || escaped_char == L'$')
                            {
                                colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash
                                colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char
                                in_pos += 1; //skip over the escaped char so it is not interpreted again
                            }
                        }
                        break;
                    }
                    case L'$':
                    {
                        // Variables still expand inside double quotes; same validity rule as when unquoted
                        int dollar_color = HIGHLIGHT_ERROR;
                        if (in_pos + 1 < buff_len)
                        {
                            wchar_t next = buffstr.at(in_pos + 1);
                            if (next == L'$' || wcsvarchr(next))
                                dollar_color = HIGHLIGHT_OPERATOR;
                        }
                        colors[in_pos] = dollar_color;
                        break;
                    }
                }
                break;
            }
        }
    }
 }
 // Color all of the arguments of the given command
 static void color_arguments(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &parent, std::vector<int> &color_array)
 {
    const parse_node_tree_t::parse_node_list_t nodes = tree.find_nodes(parent, symbol_argument);
    wcstring param;
    for (node_offset_t i=0; i < nodes.size(); i++)
    {
        const parse_node_t *child = nodes.at(i);
        assert(child != NULL && child->type == symbol_argument);
        param.assign(src, child->source_start, child->source_length);
        color_argument(param, color_array.begin() + child->source_start, HIGHLIGHT_NORMAL);
    }
 }
 static void color_children(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, int color, std::vector<int> &color_array)
 {
    for (node_offset_t idx=0; idx < parent.child_count; idx++)
    {
        const parse_node_t *child = tree.get_child(parent, idx);
        if (child != NULL && child->type == type && child->has_source())
        {
            color_node(*child, color, color_array);
        }
    }
 }
 /**
   Highlight \c buff by parsing it with the new parser and coloring the resulting nodes.

   \param buff the text to highlight
   \param color the output colors; must have the same size as \c buff (asserted). Every entry is first reset to -1.
   \param pos not used by this implementation
   \param error not used by this implementation
   \param vars not used by this implementation

   Must be called on a background thread (asserted).
 */
 void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
 {
    ASSERT_IS_BACKGROUND_THREAD();
    const size_t length = buff.size();
    assert(buff.size() == color.size());
    // Nothing to color for an empty buffer
    if (length == 0)
        return;
    // -1 marks positions that no node below assigns a color to
    std::fill(color.begin(), color.end(), -1);
    /* Do something sucky and get the current working directory on this background thread. This should really be passed in. */
    // NOTE(review): working_directory is not referenced again in this function
    const wcstring working_directory = env_get_pwd_slash();
    /* Parse the buffer */
    // Keep parsing past errors and keep comments, so that both show up as nodes that can be colored.
    // The return value is ignored: a failed parse still yields nodes to walk.
    parse_node_tree_t parse_tree;
    parse_t parser;
    parser.parse(buff, parse_flag_continue_after_error | parse_flag_include_comments, &parse_tree, NULL);
    /* Walk the node tree */
    // Every node is visited in storage order; each case colors only what that node type is responsible for
    for (parse_node_tree_t::const_iterator iter = parse_tree.begin(); iter != parse_tree.end(); ++iter)
    {
        const parse_node_t &node = *iter;
        switch (node.type)
        {
            // Color direct string descendants, e.g. 'for' and 'in'.
            case symbol_for_header:
            case symbol_while_header:
            case symbol_begin_header:
            case symbol_function_header:
            case symbol_if_clause:
            case symbol_else_clause:
            case symbol_case_item:
            case symbol_switch_statement:
            case symbol_boolean_statement:
            case symbol_decorated_statement:
                color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
                break;
            // Direct string children of a redirection get the redirection color
            case symbol_redirection:
                color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_REDIRECTION, color);
                break;
            // Background and end tokens are colored as a whole
            case parse_token_type_background:
            case parse_token_type_end:
                color_node(node, HIGHLIGHT_END, color);
                break;
            case symbol_plain_statement:
            {
                // Color the command
                color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
                // Color arguments
                // Child 1 is expected to be the arguments/redirections list; get_child asserts the type
                const parse_node_t *arguments = parse_tree.get_child(node, 1, symbol_arguments_or_redirections_list);
                if (arguments != NULL)
                {
                    color_arguments(buff, parse_tree, *arguments, color);
                }
            }
            break;
            case symbol_arguments_or_redirections_list:
            case symbol_argument_list:
                /* Nothing, these are handled by their parents */
                break;
            // Error nodes of either kind are colored as errors
            case parse_special_type_parse_error:
            case parse_special_type_tokenizer_error:
                color_node(node, HIGHLIGHT_ERROR, color);
                break;
            case parse_special_type_comment:
                color_node(node, HIGHLIGHT_COMMENT, color);
                break;
            // All other node types are left uncolored here
            default:
                break;
        }
    }
 }
 /**
   Perform quote and parenthesis highlighting on the specified string.
--- a/highlight.h
+++ b/highlight.h
@ -84,6 +84,7 @@ struct file_detection_context_t;
   \param error a list in which a description of each error will be inserted. May be 0, in which case no error descriptions will be generated.
 */
 void highlight_shell(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
 void highlight_shell_magic(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
 /**
   Perform syntax highlighting for the text in buff. Matching quotes and parentheses are highlighted. The result is
--- a/parse_productions.cpp
+++ b/parse_productions.cpp
@ -135,14 +135,12 @@ RESOLVE(statement)
                    return 2;
                case parse_keyword_else:
                    //symbol_stack_pop();
                    return NO_PRODUCTION;
                case parse_keyword_switch:
                    return 3;
                case parse_keyword_end:
                    PARSER_DIE(); //todo
                    return NO_PRODUCTION;
                    // 'in' is only special within a for_header
@ -378,7 +376,7 @@ RESOLVE(arguments_or_redirections_list)
 PRODUCTIONS(argument_or_redirection) =
 {
-    {parse_token_type_string},
+    {symbol_argument},
    {parse_token_type_redirection}
 };
 RESOLVE(argument_or_redirection)
@ -394,6 +392,18 @@ RESOLVE(argument_or_redirection)
    }
 }
 // An argument has a single production: one string token.
 // NOTE(review): RESOLVE_ONLY presumably selects that sole production unconditionally - confirm against the macro definition.
 PRODUCTIONS(argument) =
 {
    {parse_token_type_string}
 };
 RESOLVE_ONLY(argument)
 // A redirection likewise has a single production: one redirection token.
 PRODUCTIONS(redirection) =
 {
    {parse_token_type_redirection}
 };
 RESOLVE_ONLY(redirection)
 PRODUCTIONS(optional_background) =
 {
    {},
@ -449,6 +459,8 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
        TEST(plain_statement)
        TEST(arguments_or_redirections_list)
        TEST(argument_or_redirection)
        TEST(argument)
        TEST(redirection)
        TEST(optional_background)
        case parse_token_type_string:
@ -461,6 +473,14 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
            PARSER_DIE();
            break;
        case parse_special_type_parse_error:
        case parse_special_type_tokenizer_error:
        case parse_special_type_comment:
            fprintf(stderr, "Special type %ls passed to %s\n", token_type_description(node_type).c_str(), __FUNCTION__);
            PARSER_DIE();
            break;
        case token_type_invalid:
            fprintf(stderr, "token_type_invalid passed to %s\n", __FUNCTION__);
            PARSER_DIE();
--- a/parse_tree.cpp
+++ b/parse_tree.cpp
@ -101,6 +101,11 @@ wcstring token_type_description(parse_token_type_t type)
            return L"arguments_or_redirections_list";
        case symbol_argument_or_redirection:
            return L"argument_or_redirection";
        case symbol_argument:
            return L"symbol_argument";
        case symbol_redirection:
            return L"symbol_redirection";
        case parse_token_type_string:
            return L"token_string";
@ -116,6 +121,14 @@ wcstring token_type_description(parse_token_type_t type)
            return L"token_terminate";
        case symbol_optional_background:
            return L"optional_background";
        case parse_special_type_parse_error:
            return L"parse_error";
        case parse_special_type_tokenizer_error:
            return L"tokenizer_error";
        case parse_special_type_comment:
            return L"comment";
    }
    return format_string(L"Unknown token type %ld", static_cast<long>(type));
 }
@ -217,6 +230,14 @@ static parse_token_t parse_token_from_tokenizer_token(enum token_type tokenizer_
            result.type = parse_token_type_redirection;
            break;
        case TOK_ERROR:
            result.type = parse_special_type_tokenizer_error;
            break;
        case TOK_COMMENT:
            result.type = parse_special_type_comment;
            break;
        default:
            fprintf(stderr, "Bad token type %d passed to %s\n", (int)tokenizer_token_type, __FUNCTION__);
@ -247,11 +268,18 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
        append_format(*result, L" <%lu children>", node.child_count);
    }
    if (node.type == parse_token_type_string)
    {
        if (node.source_start == -1)
        {
            append_format(*result, L" (no source)");
        }
        else
        {
            result->append(L": \"");
            result->append(src, node.source_start, node.source_length);
            result->append(L"\"");
        }
    }
    result->push_back(L'\n');
    ++*line;
    for (size_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++)
@ -311,21 +339,25 @@ class parse_ll_t
    // Constructor
    parse_ll_t() : fatal_errored(false)
    {
-        // initial node
+        this->reset();
        symbol_stack.push_back(parse_stack_element_t(symbol_job_list, 0)); // goal token
        nodes.push_back(parse_node_t(symbol_job_list));
    }
    bool top_node_match_token(parse_token_t token);
    void accept_token(parse_token_t token, const wcstring &src);
    // Clear the parse symbol stack (but not the node tree). Add a new job_list_t goal node.
    void reset(void);
    void parse_error(const wchar_t *expected, parse_token_t token);
    void parse_error(parse_token_t token, const wchar_t *format, ...);
    void append_error_callout(wcstring &error_message, parse_token_t token);
    void dump_stack(void) const;
    // Figure out the ranges of intermediate nodes
    void determine_node_ranges();
    // Get the node corresponding to the top element of the stack
    parse_node_t &node_for_top_symbol()
    {
@ -453,9 +485,41 @@ void parse_ll_t::dump_stack(void) const
    }
 }
 // Give each node a source range equal to the union of the ranges of its children
 // Nodes that already have a source range (e.g. terminal nodes) are left as they are
 // Children that have no source range themselves do not contribute to the union
 // A node whose children all lack a source range keeps an invalid range
 // Since children always appear after their parents, we can implement this very simply by walking backwards
 void parse_ll_t::determine_node_ranges(void)
 {
    const size_t source_start_invalid = -1;
    size_t idx = nodes.size();
    while (idx--)
    {
        parse_node_t *parent = &nodes.at(idx);
        // Skip nodes that already have a source range. These are terminal nodes.
        if (parent->source_start != source_start_invalid)
            continue;
        // Ok, this node needs a source range. Get all of its children, and then set its range.
        size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge
        for (node_offset_t i=0; i < parent->child_count; i++)
        {
            const parse_node_t &child = nodes.at(parent->child_offset(i));
            // A child without a source range must be ignored: its source_start is the huge invalid
            // marker, so folding it into max_end would give the parent a bogus, enormous range
            if (child.source_start == source_start_invalid)
                continue;
            min_start = std::min(min_start, child.source_start);
            max_end = std::max(max_end, child.source_start + child.source_length);
        }
        // Only assign a range if at least one child contributed one
        if (min_start != source_start_invalid) {
            assert(max_end >= min_start);
            parent->source_start = min_start;
            parent->source_length = max_end - min_start;
        }
    }
 }
 void parse_ll_t::parse_error(parse_token_t token, const wchar_t *fmt, ...)
 {
-    this->dump_stack();
+    //this->dump_stack();
    parse_error_t err;
    va_list va;
@ -481,8 +545,27 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token)
    fatal_errored = true;
 }
 void parse_ll_t::reset(void)
 {
    // add a new job_list node and then reset our symbol list to point at it
    node_offset_t where = nodes.size();
    nodes.push_back(parse_node_t(symbol_job_list));
    symbol_stack.clear();
    symbol_stack.push_back(parse_stack_element_t(symbol_job_list, where)); // goal token
    this->fatal_errored = false;
 }
 bool parse_ll_t::top_node_match_token(parse_token_t token)
 {
    if (symbol_stack.empty())
    {
        // This can come about with an unbalanced 'end' or 'else', which causes us to terminate the outermost job list.
        this->fatal_errored = true;
        return false;
    }
    PARSE_ASSERT(! symbol_stack.empty());
    PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
    bool result = false;
@ -520,10 +603,23 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
        fprintf(stderr, "Accept token %ls\n", token.describe().c_str());
    }
    PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
-    PARSE_ASSERT(! symbol_stack.empty());
+    
    bool consumed = false;
    // Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty.
    if (token.type == parse_special_type_parse_error || token.type == parse_special_type_tokenizer_error || token.type == parse_special_type_comment)
    {
        parse_node_t err_node(token.type);
        err_node.source_start = token.source_start;
        err_node.source_length = token.source_length;
        nodes.push_back(err_node);
        consumed = true;
    }
    while (! consumed && ! this->fatal_errored)
    {
        PARSE_ASSERT(! symbol_stack.empty());
        if (top_node_match_token(token))
        {
            if (logit)
@ -534,6 +630,10 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
            break;
        }
        // top_node_match_token may indicate an error if our stack is empty
        if (this->fatal_errored)
            break;
        // Get the production for the top of the stack
        parse_stack_element_t &stack_elem = symbol_stack.back();
        parse_node_t &node = nodes.at(stack_elem.node_idx);
@ -548,6 +648,12 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
            // Manipulate the symbol stack.
            // Note that stack_elem is invalidated by popping the stack.
            symbol_stack_pop_push_production(production);
            // If we end up with an empty stack, something bad happened, like an unbalanced end
            if (symbol_stack.empty())
            {
                this->parse_error(token, L"All symbols removed from symbol stack. Likely unbalanced else or end?", stack_elem.describe().c_str(), token.describe().c_str());
            }
        }
    }
 }
@ -556,6 +662,11 @@ parse_t::parse_t() : parser(new parse_ll_t())
 {
 }
 // Release the parser object held in the 'parser' member
 parse_t::~parse_t()
 {
    delete parser;
 }
 static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
 {
    parse_keyword_t result = parse_keyword_none;
@ -597,21 +708,20 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
    return result;
 }
-bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors)
+bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it)
 {
    /* Tokenize str and feed each token to the underlying parser (this->parser). The return value is true unless the parser is left in the fatal_errored state. */
-    tokenizer_t tok = tokenizer_t(str.c_str(), 0);
    /* TOK_SQUASH_ERRORS is always passed to the tokenizer; TOK_SHOW_COMMENTS is added only when the caller set parse_flag_include_comments */
+    tok_flags_t tok_options = TOK_SQUASH_ERRORS;
    if (parse_flags & parse_flag_include_comments)
        tok_options |= TOK_SHOW_COMMENTS;
    tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
    /* Keep going until the tokenizer runs out of tokens or the parser reports a fatal error */
    for (; tok_has_next(&tok) && ! this->parser->fatal_errored; tok_next(&tok))
    {
        token_type tok_type = static_cast<token_type>(tok_last_type(&tok));
        const wchar_t *tok_txt = tok_last(&tok);
        int tok_start = tok_get_pos(&tok);
        size_t tok_extent = tok_get_extent(&tok);
-
+        assert(tok_extent < 10000000); //paranoia
        /* A tokenizer error stops the loop after printing a message to stderr. NOTE(review): nothing visible here sets fatal_errored on this path, so the function may still return true afterwards - confirm that is intended. */
        if (tok_type == TOK_ERROR)
        {
            fprintf(stderr, "Tokenizer error\n");
            break;
        }
        /* Translate the tokenizer's token into the parser's token representation, remembering the original tokenizer type */
        parse_token_t token = parse_token_from_tokenizer_token(tok_type);
        token.tokenizer_type = tok_type;
@ -621,12 +731,31 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_
        this->parser->accept_token(token, str);
        /* The token just accepted put the parser into the fatal error state: either recover (when asked to) or stop */
        if (this->parser->fatal_errored)
        {
            if (parse_flags & parse_flag_continue_after_error)
            {
                /* Mark an error and then keep going */
                /* The same token is re-submitted as a parse_special_type_parse_error with no keyword, and only then is the parser reset */
                token.type = parse_special_type_parse_error;
                token.keyword = parse_keyword_none;
                this->parser->accept_token(token, str);
                this->parser->reset();
            }
            else
            {
                /* Bail out */
                break;
            }
        }
    }
    // Teach each node where its source range is
    this->parser->determine_node_ranges();
    /* Disabled debugging aid: dumps the tree and its memory footprint to stderr */
 #if 0
    wcstring result = dump_tree(this->parser->nodes, str);
    fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
    fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t));
 #endif
    /* The output tree is optional; it is only touched when the caller supplied one */
    if (output != NULL)
    {
@ -642,3 +771,40 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_
    /* Success means the parser did not end in the fatal error state */
    return ! this->parser->fatal_errored;
 }
 /* Return the which'th child of parent, or NULL when that child's offset lies beyond the end of this tree. If expected_type is anything other than token_type_invalid, the child found must have exactly that type. */
 const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type) const
 {
    PARSE_ASSERT(which < parent.child_count);

    /* An offset past the end of the node list means there is no such child */
    node_offset_t offset_in_tree = parent.child_offset(which);
    if (! (offset_in_tree < this->size()))
    {
        return NULL;
    }

    /* We have a node; enforce the caller's expected type, if one was given */
    const parse_node_t *child = &this->at(offset_in_tree);
    assert(expected_type == token_type_invalid || expected_type == child->type);
    return child;
 }
 static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, parse_node_tree_t::parse_node_list_t *result)
 {
    if (parent.type == type) result->push_back(&parent);
    for (size_t i=0; i < parent.child_count; i++)
    {
        const parse_node_t *child = tree.get_child(parent, i);
        assert(child != NULL);
        find_nodes_recursive(tree, *child, type, result);
    }
 }
 /* Collect pointers to every node of the given type at or below parent; the search itself is done by find_nodes_recursive */
 parse_node_tree_t::parse_node_list_t parse_node_tree_t::find_nodes(const parse_node_t &parent, parse_token_type_t type) const
 {
    parse_node_list_t matches;
    find_nodes_recursive(*this, parent, type, &matches);
    return matches;
 }
--- a/parse_tree.h
+++ b/parse_tree.h
@ -15,7 +15,7 @@
 #include <vector>
 #define PARSE_ASSERT(a) assert(a)
-#define PARSER_DIE() exit_without_destructors(-1)
+#define PARSER_DIE() do { fprintf(stderr, "Parser dying!\n"); exit_without_destructors(-1); } while (0)
 class parse_node_t;
 class parse_node_tree_t;
@ -36,6 +36,18 @@ struct parse_error_t
 };
 typedef std::vector<parse_error_t> parse_error_list_t;
 /* Bitwise-OR combination of the parse_flag_* values below */
 typedef unsigned int parse_tree_flags_t;

 enum
 {
    /* No flags set */
    parse_flag_none = 0,

    /* Attempt to build a "parse tree" no matter what.
       This may result in a 'forest' of disconnected trees.
       This is intended to be used by syntax highlighting. */
    parse_flag_continue_after_error = 1 << 0,

    /* Include comment tokens */
    parse_flag_include_comments = 1 << 1
 };
 class parse_ll_t;
 class parse_t
 {
@ -43,7 +55,8 @@ class parse_t
 public:
    parse_t();
-    bool parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors);
+    ~parse_t();
    bool parse(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it = false);
 };
 enum parse_token_type_t
@ -80,6 +93,9 @@ enum parse_token_type_t
    symbol_argument_list_nonempty,
    symbol_argument_list,
    symbol_argument,
    symbol_redirection,
    symbol_optional_background,
    // Terminal types
@ -90,6 +106,11 @@ enum parse_token_type_t
    parse_token_type_end,
    parse_token_type_terminate,
    // Very special terminal types that don't appear in the production list
    parse_special_type_parse_error,
    parse_special_type_tokenizer_error,
    parse_special_type_comment,
    LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate,
    FIRST_PARSE_TOKEN_TYPE = parse_token_type_string
 };
@ -145,7 +166,7 @@ public:
    wcstring describe(void) const;
    /* Constructor */
-    explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(0), source_length(0), child_start(0), child_count(0), tag(0)
+    explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), child_start(0), child_count(0), tag(0)
    {
        /* source_start begins at -1: that is the sentinel has_source() compares against, so a freshly constructed node reports that it has no source */
    }
@ -154,10 +175,23 @@ public:
        PARSE_ASSERT(which < child_count);
        return child_start + which;
    }
    /* Whether this node has a source range, i.e. source_start is not the all-ones "no source" sentinel */
    bool has_source() const
    {
        const size_t no_source = static_cast<size_t>(-1);
        return source_start != no_source;
    }
 };
 /* The parse tree: a vector of nodes, plus helpers for walking from a node to the nodes beneath it */
 class parse_node_tree_t : public std::vector<parse_node_t>
 {
 public:
    /* List of node pointers, as produced by find_nodes */
    typedef std::vector<const parse_node_t *> parse_node_list_t;

    /* Look up child number 'which' of the given node. Returns NULL if there is no such child.
       When expected_type is supplied, it is asserted that the child has that type. */
    const parse_node_t *get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type = token_type_invalid) const;

    /* Gather every node of the given type found underneath the given node */
    parse_node_list_t find_nodes(const parse_node_t &parent, parse_token_type_t type) const;
 };
@ -214,7 +248,8 @@ class parse_node_tree_t : public std::vector<parse_node_t>
    arguments_or_redirections_list = <empty> |
                                     argument_or_redirection arguments_or_redirections_list
-    argument_or_redirection = redirection | <TOK_STRING>
+    argument_or_redirection = argument | redirection
    argument = <TOK_STRING>
    redirection = <TOK_REDIRECTION>
    terminator = <TOK_END> | <TOK_BACKGROUND>