More work on new parser

This commit is contained in:
ridiculousfish 2013-08-08 15:06:46 -07:00
parent 6a6593335d
commit 8e07e55c1f
9 changed files with 708 additions and 32 deletions

View file

@ -4063,7 +4063,7 @@ int builtin_parse(parser_t &parser, wchar_t **argv)
parse_node_tree_t parse_tree;
parse_error_list_t errors;
parse_t parser;
bool success = parser.parse(src, &parse_tree, &errors);
bool success = parser.parse(src, parse_flag_none, &parse_tree, &errors, true);
if (! success)
{
stdout_buffer.append(L"Parsing failed:\n");

View file

@ -507,7 +507,7 @@ const wchar_t *wcsfuncname(const wchar_t *str)
}
int wcsvarchr(wchar_t chr)
bool wcsvarchr(wchar_t chr)
{
return iswalnum(chr) || chr == L'_';
}

View file

@ -608,10 +608,10 @@ const wchar_t *wcsfuncname(const wchar_t *str);
/**
Test if the given string is valid in a variable name
\return 1 if this is a valid name, 0 otherwise
\return true if this is a valid name, false otherwise
*/
int wcsvarchr(wchar_t chr);
bool wcsvarchr(wchar_t chr);
/**

View file

@ -1801,23 +1801,65 @@ void history_tests_t::test_history_speed(void)
delete hist;
}
/**
   Run the new parser over a fixed table of sources and check that each one is
   accepted or rejected as the table expects. Mismatches are reported through err().
*/
static void test_new_parser_correctness(void)
{
    say(L"Testing new parser!");
    const struct parser_test_t
    {
        const wchar_t *src;  // source text handed to the parser
        bool ok;             // whether the parse is expected to succeed
    }
    parser_tests[] =
    {
        {L"; ; ; ", true},
        {L"if ; end", false},
        {L"if true ; end", true},
        {L"if true; end ; end", false},
        {L"if end; end ; end", false},
        {L"end", false}
    };

    const size_t test_count = sizeof parser_tests / sizeof *parser_tests;
    for (size_t i=0; i < test_count; i++)
    {
        const parser_test_t *test = &parser_tests[i];

        // Use a fresh tree and parser for every entry
        parse_node_tree_t parse_tree;
        parse_t parser;
        bool success = parser.parse(test->src, parse_flag_none, &parse_tree, NULL);

        // %lu takes an unsigned long; size_t is not guaranteed to be that type, so convert explicitly
        say(L"%lu / %lu: Parse \"%ls\": %s", static_cast<unsigned long>(i+1), static_cast<unsigned long>(test_count), test->src, success ? "yes" : "no");
        if (success && ! test->ok)
        {
            err(L"\"%ls\" should NOT have parsed, but did", test->src);
        }
        else if (! success && test->ok)
        {
            err(L"\"%ls\" should have parsed, but failed", test->src);
        }
    }
    say(L"Parse tests complete");
}
__attribute__((unused))
static void test_new_parser(void)
{
say(L"Testing new parser!");
const wcstring src = L"echo hello world";
parse_node_tree_t parse_tree;
parse_t parser;
bool success = parser.parse(src, &parse_tree, NULL);
bool success = parser.parse(src, parse_flag_none, &parse_tree, NULL);
if (! success)
{
say(L"Parsing failed");
}
else
{
#if 0
parse_execution_context_t ctx(parse_tree, src);
say(L"Simulating execution:");
wcstring simulation = ctx.simulate();
say(simulation.c_str());
#endif
}
}
@ -1827,13 +1869,12 @@ static void test_new_parser(void)
int main(int argc, char **argv)
{
setlocale(LC_ALL, "");
srand(time(0));
//srand(time(0));
configure_thread_assertions_for_testing();
program_name=L"(ignore)";
say(L"Testing low-level functionality");
say(L"Lines beginning with '(ignore):' are not errors, they are warning messages\ngenerated by the fish parser library when given broken input, and can be\nignored. All actual errors begin with 'Error:'.");
set_main_thread();
setup_fork_guards();
//proc_init();
@ -1843,7 +1884,8 @@ int main(int argc, char **argv)
reader_init();
env_init();
test_new_parser();
test_new_parser_correctness();
//test_new_parser();
return 0;
test_format();

View file

@ -34,6 +34,7 @@
#include "wildcard.h"
#include "path.h"
#include "history.h"
#include "parse_tree.h"
/**
Number of elements in the highlight_var array
@ -1307,11 +1308,16 @@ static void tokenize(const wchar_t * const buff, std::vector<int> &color, const
}
}
void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
// PCA This function does I/O, (calls is_potential_path, path_get_path, maybe others) and so ought to only run on a background thread
void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
{
ASSERT_IS_BACKGROUND_THREAD();
if (1) {
highlight_shell_magic(buff, color, pos, error, vars);
return;
}
const size_t length = buff.size();
assert(buff.size() == color.size());
@ -1440,7 +1446,413 @@ void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos,
}
}
/* Paint the source range covered by 'node' with 'color'. Nodes that have no source range are left untouched. */
static void color_node(const parse_node_t &node, int color, std::vector<int> &color_array)
{
    if (node.has_source())
    {
        // One past the last character covered by the node
        const size_t range_end = node.source_start + node.source_length;
        assert(range_end >= node.source_start);
        assert(range_end <= color_array.size());

        std::vector<int>::iterator first = color_array.begin() + node.source_start;
        std::vector<int>::iterator last = color_array.begin() + range_end;
        std::fill(first, last, color);
    }
}
/**
   Color the text of one argument, character by character.

   Every slot is first set to normal_status. Backslash escapes, quote characters and
   quoted text, and the characters $ * ? ( ) { } , ~ % are then recolored according
   to the quoting mode they appear in.

   \param buffstr the argument text
   \param colors iterator to the color slot for the first character of buffstr; buffstr.size() slots starting there are written
   \param normal_status the color given to characters that get no special treatment
*/
static void color_argument(const wcstring &buffstr, std::vector<int>::iterator colors, int normal_status)
{
    const size_t buff_len = buffstr.size();
    std::fill(colors, colors + buff_len, normal_status);

    enum {e_unquoted, e_single_quoted, e_double_quoted} mode = e_unquoted;
    int bracket_count=0;
    for (size_t in_pos=0; in_pos < buff_len; in_pos++)
    {
        const wchar_t c = buffstr.at(in_pos);
        switch (mode)
        {
            case e_unquoted:
            {
                if (c == L'\\')
                {
                    int fill_color = HIGHLIGHT_ESCAPE; //may be set to HIGHLIGHT_ERROR
                    const size_t backslash_pos = in_pos;
                    size_t fill_end = backslash_pos;

                    // Move to the escaped character
                    in_pos++;
                    const wchar_t escaped_char = (in_pos < buff_len ? buffstr.at(in_pos) : L'\0');

                    if (escaped_char == L'\0')
                    {
                        // Nothing usable follows the backslash
                        fill_end = in_pos;
                        fill_color = HIGHLIGHT_ERROR;
                    }
                    else if (wcschr(L"~%", escaped_char))
                    {
                        // Only an escape when it is the first character of the argument (backslash at index 0)
                        if (in_pos == 1)
                        {
                            fill_end = in_pos + 1;
                        }
                    }
                    else if (escaped_char == L',')
                    {
                        // Only an escape inside braces. Test for > 0 so that a stray '}' (negative count)
                        // is not mistaken for being inside braces, matching the bare ',' case below.
                        if (bracket_count > 0)
                        {
                            fill_end = in_pos + 1;
                        }
                    }
                    else if (wcschr(L"abefnrtv*?$(){}[]'\"<>^ \\#;|&", escaped_char))
                    {
                        fill_end = in_pos + 1;
                    }
                    else if (wcschr(L"c", escaped_char))
                    {
                        // Like \ci.
                        // NOTE(review): only the backslash and the 'c' are colored here, not the character after it - confirm intent
                        fill_end = in_pos + 1;
                    }
                    else if (wcschr(L"uUxX01234567", escaped_char))
                    {
                        long long res=0;
                        int chars=2;
                        int base=16;

                        wchar_t max_val = ASCII_MAX;

                        switch (escaped_char)
                        {
                            case L'u':
                            {
                                chars=4;
                                max_val = UCS2_MAX;
                                in_pos++;
                                break;
                            }

                            case L'U':
                            {
                                chars=8;
                                max_val = WCHAR_MAX;
                                in_pos++;
                                break;
                            }

                            case L'x':
                            {
                                in_pos++;
                                break;
                            }

                            case L'X':
                            {
                                max_val = BYTE_MAX;
                                in_pos++;
                                break;
                            }

                            default:
                            {
                                // a digit like \12; in_pos stays on the first digit
                                base=8;
                                chars=3;
                                break;
                            }
                        }

                        // Consume up to 'chars' digits in the chosen base
                        for (int i=0; i < chars && in_pos < buff_len; i++)
                        {
                            long d = convert_digit(buffstr.at(in_pos), base);
                            if (d < 0)
                                break;
                            res = (res * base) + d;
                            in_pos++;
                        }
                        //in_pos is now at the first character that could not be converted (or buff_len)
                        assert(in_pos >= backslash_pos && in_pos <= buff_len);
                        fill_end = in_pos;

                        // It's an error if we exceeded the max value
                        if (res > max_val)
                            fill_color = HIGHLIGHT_ERROR;

                        // Subtract one from in_pos, so that the increment in the loop will move to the next character
                        in_pos--;
                    }
                    assert(fill_end >= backslash_pos);
                    std::fill(colors + backslash_pos, colors + fill_end, fill_color);
                }
                else
                {
                    // Not a backslash
                    switch (c)
                    {
                        case L'~':
                        case L'%':
                        {
                            // Only special as the very first character
                            if (in_pos == 0)
                            {
                                colors[in_pos] = HIGHLIGHT_OPERATOR;
                            }
                            break;
                        }

                        case L'$':
                        {
                            assert(in_pos < buff_len);
                            // An error unless followed by another '$' or a variable-name character
                            int dollar_color = HIGHLIGHT_ERROR;
                            if (in_pos + 1 < buff_len)
                            {
                                wchar_t next = buffstr.at(in_pos + 1);
                                if (next == L'$' || wcsvarchr(next))
                                    dollar_color = HIGHLIGHT_OPERATOR;
                            }
                            colors[in_pos] = dollar_color;
                            break;
                        }

                        case L'*':
                        case L'?':
                        case L'(':
                        case L')':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            break;
                        }

                        case L'{':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            bracket_count++;
                            break;
                        }

                        case L'}':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            bracket_count--;
                            break;
                        }

                        case L',':
                        {
                            if (bracket_count > 0)
                            {
                                colors[in_pos] = HIGHLIGHT_OPERATOR;
                            }
                            break;
                        }

                        case L'\'':
                        {
                            colors[in_pos] = HIGHLIGHT_QUOTE;
                            mode = e_single_quoted;
                            break;
                        }

                        case L'\"':
                        {
                            colors[in_pos] = HIGHLIGHT_QUOTE;
                            mode = e_double_quoted;
                            break;
                        }
                    }
                }
                break;
            }

            /*
             Single quoted string, i.e 'foo'
             */
            case e_single_quoted:
            {
                colors[in_pos] = HIGHLIGHT_QUOTE;
                if (c == L'\\')
                {
                    // Only backslash and single quote can be escaped here
                    if (in_pos + 1 < buff_len)
                    {
                        const wchar_t escaped_char = buffstr.at(in_pos + 1);
                        if (escaped_char == L'\\' || escaped_char == L'\'')
                        {
                            colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash
                            colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char
                            in_pos += 1; //skip over backslash
                        }
                    }
                }
                else if (c == L'\'')
                {
                    mode = e_unquoted;
                }
                break;
            }

            /*
             Double quoted string, i.e. "foo"
             */
            case e_double_quoted:
            {
                colors[in_pos] = HIGHLIGHT_QUOTE;
                switch (c)
                {
                    case L'"':
                    {
                        mode = e_unquoted;
                        break;
                    }

                    case L'\\':
                    {
                        // Backslash, double quote and dollar can be escaped here. The escaped character is
                        // skipped, so an escaped double quote does not end the quoted string.
                        if (in_pos + 1 < buff_len)
                        {
                            const wchar_t escaped_char = buffstr.at(in_pos + 1);
                            if (escaped_char == L'\\' || escaped_char == L'"' || escaped_char == L'$')
                            {
                                colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash
                                colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char
                                in_pos += 1; //skip over backslash
                            }
                        }
                        break;
                    }

                    case L'$':
                    {
                        // Same rule as in unquoted text
                        int dollar_color = HIGHLIGHT_ERROR;
                        if (in_pos + 1 < buff_len)
                        {
                            wchar_t next = buffstr.at(in_pos + 1);
                            if (next == L'$' || wcsvarchr(next))
                                dollar_color = HIGHLIGHT_OPERATOR;
                        }
                        colors[in_pos] = dollar_color;
                        break;
                    }

                }
                break;
            }
        }
    }
}
// Color all of the arguments of the given command
static void color_arguments(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &parent, std::vector<int> &color_array)
{
const parse_node_tree_t::parse_node_list_t nodes = tree.find_nodes(parent, symbol_argument);
wcstring param;
for (node_offset_t i=0; i < nodes.size(); i++)
{
const parse_node_t *child = nodes.at(i);
assert(child != NULL && child->type == symbol_argument);
param.assign(src, child->source_start, child->source_length);
color_argument(param, color_array.begin() + child->source_start, HIGHLIGHT_NORMAL);
}
}
static void color_children(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, int color, std::vector<int> &color_array)
{
for (node_offset_t idx=0; idx < parent.child_count; idx++)
{
const parse_node_t *child = tree.get_child(parent, idx);
if (child != NULL && child->type == type && child->has_source())
{
color_node(*child, color, color_array);
}
}
}
void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
{
ASSERT_IS_BACKGROUND_THREAD();
const size_t length = buff.size();
assert(buff.size() == color.size());
if (length == 0)
return;
std::fill(color.begin(), color.end(), -1);
/* Do something sucky and get the current working directory on this background thread. This should really be passed in. */
const wcstring working_directory = env_get_pwd_slash();
/* Parse the buffer */
parse_node_tree_t parse_tree;
parse_t parser;
parser.parse(buff, parse_flag_continue_after_error | parse_flag_include_comments, &parse_tree, NULL);
/* Walk the node tree */
for (parse_node_tree_t::const_iterator iter = parse_tree.begin(); iter != parse_tree.end(); ++iter)
{
const parse_node_t &node = *iter;
switch (node.type)
{
// Color direct string descendants, e.g. 'for' and 'in'.
case symbol_for_header:
case symbol_while_header:
case symbol_begin_header:
case symbol_function_header:
case symbol_if_clause:
case symbol_else_clause:
case symbol_case_item:
case symbol_switch_statement:
case symbol_boolean_statement:
case symbol_decorated_statement:
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
break;
case symbol_redirection:
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_REDIRECTION, color);
break;
case parse_token_type_background:
case parse_token_type_end:
color_node(node, HIGHLIGHT_END, color);
break;
case symbol_plain_statement:
{
// Color the command
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
// Color arguments
const parse_node_t *arguments = parse_tree.get_child(node, 1, symbol_arguments_or_redirections_list);
if (arguments != NULL)
{
color_arguments(buff, parse_tree, *arguments, color);
}
}
break;
case symbol_arguments_or_redirections_list:
case symbol_argument_list:
/* Nothing, these are handled by their parents */
break;
case parse_special_type_parse_error:
case parse_special_type_tokenizer_error:
color_node(node, HIGHLIGHT_ERROR, color);
break;
case parse_special_type_comment:
color_node(node, HIGHLIGHT_COMMENT, color);
break;
default:
break;
}
}
}
/**
Perform quote and parenthesis highlighting on the specified string.

View file

@ -84,6 +84,7 @@ struct file_detection_context_t;
\param error a list in which a description of each error will be inserted. May be 0, in which case no error descriptions will be generated.
*/
void highlight_shell(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
void highlight_shell_magic(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
/**
Perform syntax highlighting for the text in buff. Matching quotes and parentheses are highlighted. The result is

View file

@ -135,14 +135,12 @@ RESOLVE(statement)
return 2;
case parse_keyword_else:
//symbol_stack_pop();
return NO_PRODUCTION;
case parse_keyword_switch:
return 3;
case parse_keyword_end:
PARSER_DIE(); //todo
return NO_PRODUCTION;
// 'in' is only special within a for_header
@ -378,7 +376,7 @@ RESOLVE(arguments_or_redirections_list)
PRODUCTIONS(argument_or_redirection) =
{
{parse_token_type_string},
{symbol_argument},
{parse_token_type_redirection}
};
RESOLVE(argument_or_redirection)
@ -394,6 +392,18 @@ RESOLVE(argument_or_redirection)
}
}
// Production table for the 'argument' symbol: its single production is one string token.
PRODUCTIONS(argument) =
{
{parse_token_type_string}
};
// There is only one production, so no hand-written RESOLVE body is given.
// NOTE(review): presumably RESOLVE_ONLY always selects that sole production - confirm against the macro definition.
RESOLVE_ONLY(argument)
// Production table for the 'redirection' symbol: its single production is one redirection token.
PRODUCTIONS(redirection) =
{
{parse_token_type_redirection}
};
// There is only one production, so no hand-written RESOLVE body is given.
// NOTE(review): presumably RESOLVE_ONLY always selects that sole production - confirm against the macro definition.
RESOLVE_ONLY(redirection)
PRODUCTIONS(optional_background) =
{
{},
@ -449,6 +459,8 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
TEST(plain_statement)
TEST(arguments_or_redirections_list)
TEST(argument_or_redirection)
TEST(argument)
TEST(redirection)
TEST(optional_background)
case parse_token_type_string:
@ -461,6 +473,14 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
PARSER_DIE();
break;
case parse_special_type_parse_error:
case parse_special_type_tokenizer_error:
case parse_special_type_comment:
fprintf(stderr, "Special type %ls passed to %s\n", token_type_description(node_type).c_str(), __FUNCTION__);
PARSER_DIE();
break;
case token_type_invalid:
fprintf(stderr, "token_type_invalid passed to %s\n", __FUNCTION__);
PARSER_DIE();

View file

@ -101,6 +101,11 @@ wcstring token_type_description(parse_token_type_t type)
return L"arguments_or_redirections_list";
case symbol_argument_or_redirection:
return L"argument_or_redirection";
case symbol_argument:
return L"symbol_argument";
case symbol_redirection:
return L"symbol_redirection";
case parse_token_type_string:
return L"token_string";
@ -116,6 +121,14 @@ wcstring token_type_description(parse_token_type_t type)
return L"token_terminate";
case symbol_optional_background:
return L"optional_background";
case parse_special_type_parse_error:
return L"parse_error";
case parse_special_type_tokenizer_error:
return L"tokenizer_error";
case parse_special_type_comment:
return L"comment";
}
return format_string(L"Unknown token type %ld", static_cast<long>(type));
}
@ -217,6 +230,14 @@ static parse_token_t parse_token_from_tokenizer_token(enum token_type tokenizer_
result.type = parse_token_type_redirection;
break;
case TOK_ERROR:
result.type = parse_special_type_tokenizer_error;
break;
case TOK_COMMENT:
result.type = parse_special_type_comment;
break;
default:
fprintf(stderr, "Bad token type %d passed to %s\n", (int)tokenizer_token_type, __FUNCTION__);
@ -247,11 +268,18 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
append_format(*result, L" <%lu children>", node.child_count);
}
if (node.type == parse_token_type_string)
{
if (node.source_start == -1)
{
append_format(*result, L" (no source)");
}
else
{
result->append(L": \"");
result->append(src, node.source_start, node.source_length);
result->append(L"\"");
}
}
result->push_back(L'\n');
++*line;
for (size_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++)
@ -311,21 +339,25 @@ class parse_ll_t
// Constructor
parse_ll_t() : fatal_errored(false)
{
// initial node
symbol_stack.push_back(parse_stack_element_t(symbol_job_list, 0)); // goal token
nodes.push_back(parse_node_t(symbol_job_list));
this->reset();
}
bool top_node_match_token(parse_token_t token);
void accept_token(parse_token_t token, const wcstring &src);
// Clear the parse symbol stack (but not the node tree). Add a new job_list_t goal node.
void reset(void);
void parse_error(const wchar_t *expected, parse_token_t token);
void parse_error(parse_token_t token, const wchar_t *format, ...);
void append_error_callout(wcstring &error_message, parse_token_t token);
void dump_stack(void) const;
// Figure out the ranges of intermediate nodes
void determine_node_ranges();
// Get the node corresponding to the top element of the stack
parse_node_t &node_for_top_symbol()
{
@ -453,9 +485,41 @@ void parse_ll_t::dump_stack(void) const
}
}
// Give each node a source range equal to the union of the ranges of its children
// Terminal nodes already have source ranges (and no children)
// Since children always appear after their parents, we can implement this very simply by walking backwards
// Children that have no source range themselves are ignored; a node whose children all lack a range keeps none
void parse_ll_t::determine_node_ranges(void)
{
    const size_t source_start_invalid = -1;
    size_t idx = nodes.size();
    while (idx--)
    {
        parse_node_t *parent = &nodes.at(idx);

        // Skip nodes that already have a source range. These are terminal nodes.
        if (parent->source_start != source_start_invalid)
            continue;

        // Ok, this node needs a source range. Get all of its children, and then set its range.
        size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge
        for (node_offset_t i=0; i < parent->child_count; i++)
        {
            const parse_node_t &child = nodes.at(parent->child_offset(i));

            // A child without a range must not contribute: its start is the invalid marker, so
            // start + length would be a huge bogus end that corrupts max_end
            if (child.source_start == source_start_invalid)
                continue;

            min_start = std::min(min_start, child.source_start);
            max_end = std::max(max_end, child.source_start + child.source_length);
        }

        // min_start is still the invalid marker if no child had a range; leave the parent without one
        if (min_start != source_start_invalid)
        {
            assert(max_end >= min_start);
            parent->source_start = min_start;
            parent->source_length = max_end - min_start;
        }
    }
}
void parse_ll_t::parse_error(parse_token_t token, const wchar_t *fmt, ...)
{
this->dump_stack();
//this->dump_stack();
parse_error_t err;
va_list va;
@ -481,8 +545,27 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token)
fatal_errored = true;
}
void parse_ll_t::reset(void)
{
// add a new job_list node and then reset our symbol list to point at it
node_offset_t where = nodes.size();
nodes.push_back(parse_node_t(symbol_job_list));
symbol_stack.clear();
symbol_stack.push_back(parse_stack_element_t(symbol_job_list, where)); // goal token
this->fatal_errored = false;
}
bool parse_ll_t::top_node_match_token(parse_token_t token)
{
if (symbol_stack.empty())
{
// This can come about with an unbalanced 'end' or 'else', which causes us to terminate the outermost job list.
this->fatal_errored = true;
return false;
}
PARSE_ASSERT(! symbol_stack.empty());
PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
bool result = false;
@ -520,10 +603,23 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
fprintf(stderr, "Accept token %ls\n", token.describe().c_str());
}
PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
PARSE_ASSERT(! symbol_stack.empty());
bool consumed = false;
// Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty.
if (token.type == parse_special_type_parse_error || token.type == parse_special_type_tokenizer_error || token.type == parse_special_type_comment)
{
parse_node_t err_node(token.type);
err_node.source_start = token.source_start;
err_node.source_length = token.source_length;
nodes.push_back(err_node);
consumed = true;
}
while (! consumed && ! this->fatal_errored)
{
PARSE_ASSERT(! symbol_stack.empty());
if (top_node_match_token(token))
{
if (logit)
@ -534,6 +630,10 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
break;
}
// top_node_match_token may indicate an error if our stack is empty
if (this->fatal_errored)
break;
// Get the production for the top of the stack
parse_stack_element_t &stack_elem = symbol_stack.back();
parse_node_t &node = nodes.at(stack_elem.node_idx);
@ -548,6 +648,12 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
// Manipulate the symbol stack.
// Note that stack_elem is invalidated by popping the stack.
symbol_stack_pop_push_production(production);
// If we end up with an empty stack, something bad happened, like an unbalanced end
if (symbol_stack.empty())
{
this->parse_error(token, L"All symbols removed from symbol stack. Likely unbalanced else or end?", stack_elem.describe().c_str(), token.describe().c_str());
}
}
}
}
@ -556,6 +662,11 @@ parse_t::parse_t() : parser(new parse_ll_t())
{
}
/* Free the parse_ll_t that the constructor allocated with new */
parse_t::~parse_t()
{
    delete this->parser;
}
static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
{
parse_keyword_t result = parse_keyword_none;
@ -597,21 +708,20 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
return result;
}
bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors)
bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it)
{
tokenizer_t tok = tokenizer_t(str.c_str(), 0);
tok_flags_t tok_options = TOK_SQUASH_ERRORS;
if (parse_flags & parse_flag_include_comments)
tok_options |= TOK_SHOW_COMMENTS;
tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
for (; tok_has_next(&tok) && ! this->parser->fatal_errored; tok_next(&tok))
{
token_type tok_type = static_cast<token_type>(tok_last_type(&tok));
const wchar_t *tok_txt = tok_last(&tok);
int tok_start = tok_get_pos(&tok);
size_t tok_extent = tok_get_extent(&tok);
if (tok_type == TOK_ERROR)
{
fprintf(stderr, "Tokenizer error\n");
break;
}
assert(tok_extent < 10000000); //paranoia
parse_token_t token = parse_token_from_tokenizer_token(tok_type);
token.tokenizer_type = tok_type;
@ -621,12 +731,31 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_
this->parser->accept_token(token, str);
if (this->parser->fatal_errored)
{
if (parse_flags & parse_flag_continue_after_error)
{
/* Mark an error and then keep going */
token.type = parse_special_type_parse_error;
token.keyword = parse_keyword_none;
this->parser->accept_token(token, str);
this->parser->reset();
}
else
{
/* Bail out */
break;
}
}
}
// Teach each node where its source range is
this->parser->determine_node_ranges();
#if 0
wcstring result = dump_tree(this->parser->nodes, str);
fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t));
#endif
if (output != NULL)
{
@ -642,3 +771,40 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_
return ! this->parser->fatal_errored;
}
/* Return the which'th child of parent, or NULL if that child's offset lies beyond the end of the tree.
   which must be less than parent.child_count. Unless expected_type is token_type_invalid, the child
   found is asserted to have that type. */
const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type) const
{
    PARSE_ASSERT(which < parent.child_count);

    const node_offset_t offset_in_tree = parent.child_offset(which);
    if (offset_in_tree >= this->size())
    {
        // The parent refers to a node this tree does not contain
        return NULL;
    }

    const parse_node_t *child = &this->at(offset_in_tree);
    assert(expected_type == token_type_invalid || expected_type == child->type);
    return child;
}
static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, parse_node_tree_t::parse_node_list_t *result)
{
if (parent.type == type) result->push_back(&parent);
for (size_t i=0; i < parent.child_count; i++)
{
const parse_node_t *child = tree.get_child(parent, i);
assert(child != NULL);
find_nodes_recursive(tree, *child, type, result);
}
}
/* Collect pointers to every node of the given type in the subtree rooted at parent (parent itself included) */
parse_node_tree_t::parse_node_list_t parse_node_tree_t::find_nodes(const parse_node_t &parent, parse_token_type_t type) const
{
    parse_node_list_t matches;
    find_nodes_recursive(*this, parent, type, &matches);
    return matches;
}

View file

@ -15,7 +15,7 @@
#include <vector>
#define PARSE_ASSERT(a) assert(a)
#define PARSER_DIE() exit_without_destructors(-1)
#define PARSER_DIE() do { fprintf(stderr, "Parser dying!\n"); exit_without_destructors(-1); } while (0)
class parse_node_t;
class parse_node_tree_t;
@ -36,6 +36,18 @@ struct parse_error_t
};
typedef std::vector<parse_error_t> parse_error_list_t;
/* Flags controlling parse_t::parse. Each non-zero flag is a distinct bit, so flags may be OR'd together into a parse_tree_flags_t. */
enum
{
/* No special behavior */
parse_flag_none = 0,
/* Attempt to build a "parse tree" no matter what: after a parse error, record an error node and keep parsing. This may result in a 'forest' of disconnected trees. This is intended to be used by syntax highlighting. */
parse_flag_continue_after_error = 1 << 0,
/* Ask the tokenizer for comment tokens too, so they show up as comment nodes */
parse_flag_include_comments = 1 << 1
};
/* Type holding an OR'd combination of the parse_flag_ values above */
typedef unsigned int parse_tree_flags_t;
class parse_ll_t;
class parse_t
{
@ -43,7 +55,8 @@ class parse_t
public:
parse_t();
bool parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors);
~parse_t();
bool parse(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it = false);
};
enum parse_token_type_t
@ -80,6 +93,9 @@ enum parse_token_type_t
symbol_argument_list_nonempty,
symbol_argument_list,
symbol_argument,
symbol_redirection,
symbol_optional_background,
// Terminal types
@ -90,6 +106,11 @@ enum parse_token_type_t
parse_token_type_end,
parse_token_type_terminate,
// Very special terminal types that don't appear in the production list
parse_special_type_parse_error,
parse_special_type_tokenizer_error,
parse_special_type_comment,
LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate,
FIRST_PARSE_TOKEN_TYPE = parse_token_type_string
};
@ -145,7 +166,7 @@ public:
wcstring describe(void) const;
/* Constructor */
explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(0), source_length(0), child_start(0), child_count(0), tag(0)
explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), child_start(0), child_count(0), tag(0)
{
}
@ -154,10 +175,23 @@ public:
PARSE_ASSERT(which < child_count);
return child_start + which;
}
/* Whether this node has a source range. A source_start of (size_t)(-1) marks a node without one. */
bool has_source() const
{
    const size_t no_source_marker = static_cast<size_t>(-1);
    return source_start != no_source_marker;
}
};
/* The parse tree, stored as a flat vector of nodes. */
class parse_node_tree_t : public std::vector<parse_node_t>
{
public:
/* Get the node corresponding to a child of the given node. 'which' must be less than the parent's child count. Returns NULL if the child's offset lies beyond the end of this tree. If expected_type is provided, assert that the node has that type. */
const parse_node_t *get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type = token_type_invalid) const;
/* List of pointers to nodes in this tree, as returned by find_nodes */
typedef std::vector<const parse_node_t *> parse_node_list_t;
/* Find all the nodes of a given type at or underneath a given node */
parse_node_list_t find_nodes(const parse_node_t &parent, parse_token_type_t type) const;
};
@ -214,7 +248,8 @@ class parse_node_tree_t : public std::vector<parse_node_t>
arguments_or_redirections_list = <empty> |
argument_or_redirection arguments_or_redirections_list
argument_or_redirection = redirection | <TOK_STRING>
argument_or_redirection = argument | redirection
argument = <TOK_STRING>
redirection = <TOK_REDIRECTION>
terminator = <TOK_END> | <TOK_BACKGROUND>