More work on new parser

This commit is contained in:
ridiculousfish 2013-08-11 00:35:00 -07:00
parent 8e07e55c1f
commit e58b73179f
6 changed files with 532 additions and 215 deletions

View file

@ -4075,7 +4075,10 @@ int builtin_parse(parser_t &parser, wchar_t **argv)
}
else
{
if (0) {
const wcstring dump = parse_dump_tree(parse_tree, src);
fprintf(stderr, "%ls", dump.c_str());
if (0)
{
parse_execution_context_t ctx(parse_tree, src);
parse_execution_simulator_t sim;
sim.context = &ctx;

View file

@ -1816,13 +1816,16 @@ static void test_new_parser_correctness(void)
{L"if true ; end", true},
{L"if true; end ; end", false},
{L"if end; end ; end", false},
{L"end", false}
{L"if end", false},
{L"end", false},
{L"for i i", false},
{L"for i in a b c ; end", true}
};
for (size_t i=0; i < sizeof parser_tests / sizeof *parser_tests; i++)
{
const parser_test_t *test = &parser_tests[i];
parse_node_tree_t parse_tree;
parse_t parser;
bool success = parser.parse(test->src, parse_flag_none, &parse_tree, NULL);
@ -1837,7 +1840,87 @@ static void test_new_parser_correctness(void)
}
}
say(L"Parse tests complete");
}
/* A single fuzz token for the parser fuzzer: a terminal token type plus an
   optional keyword. A vector of these represents one candidate token sequence. */
struct parser_fuzz_token_t
{
parse_token_type_t token_type;
parse_keyword_t keyword;
// Start at the first terminal type with no keyword, so that increment()
// enumerates the full space from the beginning
parser_fuzz_token_t() : token_type(FIRST_TERMINAL_TYPE), keyword(parse_keyword_none)
{
}
};
/* Advances the fuzz token sequence to its next combination, odometer-style:
   position 0 is the fastest-changing digit. For string tokens, all keywords are
   cycled through before the token type itself advances. Returns true when every
   position has wrapped around, i.e. the entire space of sequences of this
   length has been enumerated. */
static bool increment(std::vector<parser_fuzz_token_t> &tokens)
{
size_t i, end = tokens.size();
for (i=0; i < end; i++)
{
// Set if this position rolled over, which carries into the next position
bool wrapped = false;
struct parser_fuzz_token_t &token = tokens[i];
bool incremented_in_keyword = false;
if (token.token_type == parse_token_type_string)
{
// try incrementing the keyword
// NOTE(review): built-in ++ is not defined for scoped-less enums in standard
// C++ unless the type is integer-backed or an operator is provided - confirm
token.keyword++;
if (token.keyword <= LAST_KEYWORD)
{
incremented_in_keyword = true;
}
else
{
// Keywords exhausted: reset and fall through to advancing the token type
token.keyword = parse_keyword_none;
incremented_in_keyword = false;
}
}
if (! incremented_in_keyword)
{
token.token_type++;
if (token.token_type > LAST_TERMINAL_TYPE)
{
// This position rolled over; carry into the next position
token.token_type = FIRST_TERMINAL_TYPE;
wrapped = true;
}
}
if (! wrapped)
{
// No carry, so the increment is complete
break;
}
}
// If i == end, every position wrapped: enumeration is finished
return i == end;
}
/* Fuzz the parser: feed it every possible sequence of terminal tokens up to a
   fixed length, and verify that nothing crashes. Tree correctness is not
   checked here - only that parsing terminates cleanly on arbitrary input. */
static void test_new_parser_fuzzing(void)
{
say(L"Fuzzing parser (node size: %lu)", sizeof(parse_node_t));
double start = timef();
// ensure nothing crashes
// Maximum sequence length; the enumeration grows exponentially with this
size_t max = 5;
for (size_t len=1; len <= max; len++)
{
// Progress indicator, since longer lengths take substantially longer
fprintf(stderr, "%lu / %lu\n", len, max);
std::vector<parser_fuzz_token_t> tokens(len);
do
{
// Fresh parser state for every candidate token sequence
parse_t parser;
parse_node_tree_t parse_tree;
parse_error_list_t errors;
for (size_t i=0; i < len; i++)
{
const parser_fuzz_token_t &token = tokens[i];
parser.parse_1_token(token.token_type, token.keyword, &parse_tree, &errors);
}
// keep going until we wrap
}
while (! increment(tokens));
}
double end = timef();
say(L"All fuzzed in %f seconds!", end - start);
}
__attribute__((unused))
@ -1863,6 +1946,104 @@ static void test_new_parser(void)
}
}
/**
   Test of syntax highlighting. Builds a handful of command lines out of
   labeled components, runs highlight_shell() over each, and verifies that
   every character receives the color its component expects.
*/
static void test_highlighting(void)
{
    say(L"Testing syntax highlighting");
    // Create some files on disk so that valid-path highlighting has something real to find
    if (system("mkdir -p /tmp/fish_highlight_test/")) err(L"mkdir failed");
    if (system("touch /tmp/fish_highlight_test/foo")) err(L"touch failed");
    if (system("touch /tmp/fish_highlight_test/bar")) err(L"touch failed");

    // Here are the components of our source and the colors we expect those to be
    struct highlight_component_t
    {
        const wchar_t *txt; // component text; NULL terminates the component list
        int color;          // expected color for every non-space character of txt
    };

    const highlight_component_t components1[] =
    {
        {L"echo", HIGHLIGHT_COMMAND},
        {L"/tmp/fish_highlight_test/foo", HIGHLIGHT_PARAM | HIGHLIGHT_VALID_PATH},
        {L"&", HIGHLIGHT_END},
        {NULL, -1}
    };

    const highlight_component_t components2[] =
    {
        {L"command", HIGHLIGHT_COMMAND},
        {L"echo", HIGHLIGHT_COMMAND},
        {L"abc", HIGHLIGHT_PARAM},
        {L"/tmp/fish_highlight_test/foo", HIGHLIGHT_PARAM | HIGHLIGHT_VALID_PATH},
        {L"&", HIGHLIGHT_END},
        {NULL, -1}
    };

    const highlight_component_t components3[] =
    {
        {L"if command ls", HIGHLIGHT_COMMAND},
        {L"; ", HIGHLIGHT_END},
        {L"echo", HIGHLIGHT_COMMAND},
        {L"abc", HIGHLIGHT_PARAM},
        {L"; ", HIGHLIGHT_END},
        {L"/bin/definitely_not_a_command", HIGHLIGHT_ERROR},
        {L"; ", HIGHLIGHT_END},
        {L"end", HIGHLIGHT_COMMAND},
        {NULL, -1}
    };

    const highlight_component_t *tests[] = {components1, components2, components3};
    for (size_t which = 0; which < sizeof tests / sizeof *tests; which++)
    {
        const highlight_component_t *components = tests[which];
        // Count how many we have
        size_t component_count = 0;
        while (components[component_count].txt != NULL)
        {
            component_count++;
        }

        // Generate the text, and the per-character colors we expect
        wcstring text;
        std::vector<int> expected_colors;
        for (size_t i=0; i < component_count; i++)
        {
            if (i > 0)
            {
                // Separator space between components is expected to be uncolored
                text.push_back(L' ');
                expected_colors.push_back(0);
            }
            text.append(components[i].txt);

            // hackish space handling: spaces inside a component are also expected to be uncolored
            const size_t text_len = wcslen(components[i].txt);
            for (size_t j=0; j < text_len; j++)
            {
                bool is_space = (components[i].txt[j] == L' ');
                expected_colors.push_back(is_space ? 0 : components[i].color);
            }
        }
        assert(expected_colors.size() == text.size());

        std::vector<int> colors(text.size());
        // The '20' is a position within the text - presumably the cursor position; confirm
        highlight_shell(text, colors, 20, NULL, env_vars_snapshot_t());

        if (expected_colors.size() != colors.size())
        {
            err(L"Color vector has wrong size! Expected %lu, actual %lu", expected_colors.size(), colors.size());
        }
        assert(expected_colors.size() == colors.size());
        for (size_t i=0; i < text.size(); i++)
        {
            if (expected_colors.at(i) != colors.at(i))
            {
                // Print the text with a caret under the offending character
                const wcstring spaces(i, L' ');
                err(L"Wrong color at index %lu in text (expected %d, actual %d):\n%ls\n%ls^", i, expected_colors.at(i), colors.at(i), text.c_str(), spaces.c_str());
            }
        }
    }

    // Clean up. Check the result, consistent with the mkdir/touch calls above.
    if (system("rm -Rf /tmp/fish_highlight_test")) err(L"rm failed");
}
/**
Main test
*/
@ -1884,9 +2065,10 @@ int main(int argc, char **argv)
reader_init();
env_init();
test_new_parser_correctness();
//test_new_parser_fuzzing();
//test_new_parser_correctness();
//test_highlighting();
//test_new_parser();
return 0;
test_format();
test_escape();

View file

@ -1314,7 +1314,8 @@ void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t
void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
{
ASSERT_IS_BACKGROUND_THREAD();
if (1) {
if (0)
{
highlight_shell_magic(buff, color, pos, error, vars);
return;
}
@ -1451,12 +1452,12 @@ static void color_node(const parse_node_t &node, int color, std::vector<int> &co
// Can only color nodes with valid source ranges
if (! node.has_source())
return;
// Fill the color array with our color in the corresponding range
size_t source_end = node.source_start + node.source_length;
assert(source_end >= node.source_start);
assert(source_end <= color_array.size());
std::fill(color_array.begin() + node.source_start, color_array.begin() + source_end, color);
}
@ -1464,7 +1465,7 @@ static void color_argument(const wcstring &buffstr, std::vector<int>::iterator c
{
const size_t buff_len = buffstr.size();
std::fill(colors, colors + buff_len, normal_status);
enum {e_unquoted, e_single_quoted, e_double_quoted} mode = e_unquoted;
int bracket_count=0;
for (size_t in_pos=0; in_pos < buff_len; in_pos++)
@ -1479,11 +1480,11 @@ static void color_argument(const wcstring &buffstr, std::vector<int>::iterator c
int fill_color = HIGHLIGHT_ESCAPE; //may be set to HIGHLIGHT_ERROR
const size_t backslash_pos = in_pos;
size_t fill_end = backslash_pos;
// Move to the escaped character
in_pos++;
const wchar_t escaped_char = (in_pos < buff_len ? buffstr.at(in_pos) : L'\0');
if (escaped_char == L'\0')
{
fill_end = in_pos;
@ -1559,7 +1560,7 @@ static void color_argument(const wcstring &buffstr, std::vector<int>::iterator c
break;
}
}
// Consume
for (int i=0; i < chars && in_pos < buff_len; i++)
{
@ -1572,11 +1573,11 @@ static void color_argument(const wcstring &buffstr, std::vector<int>::iterator c
//in_pos is now at the first character that could not be converted (or buff_len)
assert(in_pos >= backslash_pos && in_pos <= buff_len);
fill_end = in_pos;
// It's an error if we exceeded the max value
if (res > max_val)
fill_color = HIGHLIGHT_ERROR;
// Subtract one from in_pos, so that the increment in the loop will move to the next character
in_pos--;
}
@ -1746,7 +1747,7 @@ static void color_argument(const wcstring &buffstr, std::vector<int>::iterator c
static void color_arguments(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &parent, std::vector<int> &color_array)
{
const parse_node_tree_t::parse_node_list_t nodes = tree.find_nodes(parent, symbol_argument);
wcstring param;
for (node_offset_t i=0; i < nodes.size(); i++)
{
@ -1783,20 +1784,20 @@ void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t
/* Do something sucky and get the current working directory on this background thread. This should really be passed in. */
const wcstring working_directory = env_get_pwd_slash();
/* Parse the buffer */
parse_node_tree_t parse_tree;
parse_t parser;
parser.parse(buff, parse_flag_continue_after_error | parse_flag_include_comments, &parse_tree, NULL);
/* Walk the node tree */
for (parse_node_tree_t::const_iterator iter = parse_tree.begin(); iter != parse_tree.end(); ++iter)
{
const parse_node_t &node = *iter;
switch (node.type)
{
// Color direct string descendants, e.g. 'for' and 'in'.
// Color direct string descendants, e.g. 'for' and 'in'.
case symbol_for_header:
case symbol_while_header:
case symbol_begin_header:
@ -1809,21 +1810,35 @@ void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t
case symbol_decorated_statement:
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
break;
case symbol_if_statement:
{
// Color the 'end'
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
// Color arguments and redirections
const parse_node_t *arguments = parse_tree.get_child(node, 3, symbol_arguments_or_redirections_list);
if (arguments != NULL)
{
color_arguments(buff, parse_tree, *arguments, color);
}
}
break;
case symbol_redirection:
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_REDIRECTION, color);
break;
case parse_token_type_background:
case parse_token_type_end:
color_node(node, HIGHLIGHT_END, color);
break;
case symbol_plain_statement:
{
// Color the command
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
// Color arguments
const parse_node_t *arguments = parse_tree.get_child(node, 1, symbol_arguments_or_redirections_list);
if (arguments != NULL)
@ -1832,22 +1847,22 @@ void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t
}
}
break;
case symbol_arguments_or_redirections_list:
case symbol_argument_list:
/* Nothing, these are handled by their parents */
break;
case parse_special_type_parse_error:
case parse_special_type_tokenizer_error:
color_node(node, HIGHLIGHT_ERROR, color);
break;
case parse_special_type_comment:
color_node(node, HIGHLIGHT_COMMENT, color);
break;
default:
break;
}

View file

@ -13,7 +13,7 @@ static bool production_is_valid(const production_options_t production_list, prod
{
if (which < 0 || which >= MAX_PRODUCTIONS)
return false;
bool nonempty_found = false;
for (int i=which; i < MAX_PRODUCTIONS; i++)
{
@ -249,8 +249,10 @@ RESOLVE(argument_list)
{
switch (token_type)
{
case parse_token_type_string: return 1;
default: return 0;
case parse_token_type_string:
return 1;
default:
return 0;
}
}
@ -429,40 +431,40 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
{
fprintf(stderr, "Resolving production for %ls with input type %ls <%ls>\n", token_type_description(node_type).c_str(), token_type_description(input_type).c_str(), keyword_description(input_keyword).c_str());
}
/* Fetch the list of productions and the function to resolve them */
const production_options_t *production_list = NULL;
production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword, production_tag_t *tag) = NULL;
switch (node_type)
{
TEST(job_list)
TEST(job)
TEST(statement)
TEST(job_continuation)
TEST(boolean_statement)
TEST(block_statement)
TEST(if_statement)
TEST(if_clause)
TEST(else_clause)
TEST(else_continuation)
TEST(switch_statement)
TEST(decorated_statement)
TEST(case_item_list)
TEST(case_item)
TEST(argument_list_nonempty)
TEST(argument_list)
TEST(block_header)
TEST(for_header)
TEST(while_header)
TEST(begin_header)
TEST(function_header)
TEST(plain_statement)
TEST(arguments_or_redirections_list)
TEST(argument_or_redirection)
TEST(argument)
TEST(redirection)
TEST(optional_background)
TEST(job_list)
TEST(job)
TEST(statement)
TEST(job_continuation)
TEST(boolean_statement)
TEST(block_statement)
TEST(if_statement)
TEST(if_clause)
TEST(else_clause)
TEST(else_continuation)
TEST(switch_statement)
TEST(decorated_statement)
TEST(case_item_list)
TEST(case_item)
TEST(argument_list_nonempty)
TEST(argument_list)
TEST(block_header)
TEST(for_header)
TEST(while_header)
TEST(begin_header)
TEST(function_header)
TEST(plain_statement)
TEST(arguments_or_redirections_list)
TEST(argument_or_redirection)
TEST(argument)
TEST(redirection)
TEST(optional_background)
case parse_token_type_string:
case parse_token_type_pipe:
case parse_token_type_redirection:
@ -472,33 +474,33 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
fprintf(stderr, "Terminal token type %ls passed to %s\n", token_type_description(node_type).c_str(), __FUNCTION__);
PARSER_DIE();
break;
case parse_special_type_parse_error:
case parse_special_type_tokenizer_error:
case parse_special_type_comment:
fprintf(stderr, "Special type %ls passed to %s\n", token_type_description(node_type).c_str(), __FUNCTION__);
PARSER_DIE();
break;
case token_type_invalid:
fprintf(stderr, "token_type_invalid passed to %s\n", __FUNCTION__);
PARSER_DIE();
break;
}
PARSE_ASSERT(production_list != NULL);
PARSE_ASSERT(resolver != NULL);
const production_t *result = NULL;
production_option_idx_t which = resolver(input_type, input_keyword, out_tag);
if (log_it)
{
fprintf(stderr, "\tresolved to %u\n", (unsigned)which);
}
if (which == NO_PRODUCTION)
{
if (log_it)

View file

@ -4,6 +4,7 @@
using namespace parse_productions;
/** Returns a string description of this parse error */
wcstring parse_error_t::describe(const wcstring &src) const
{
wcstring result = text;
@ -41,6 +42,7 @@ wcstring parse_error_t::describe(const wcstring &src) const
return result;
}
/** Returns a string description of the given token type */
wcstring token_type_description(parse_token_type_t type)
{
switch (type)
@ -121,7 +123,7 @@ wcstring token_type_description(parse_token_type_t type)
return L"token_terminate";
case symbol_optional_background:
return L"optional_background";
case parse_special_type_parse_error:
return L"parse_error";
case parse_special_type_tokenizer_error:
@ -172,70 +174,68 @@ wcstring keyword_description(parse_keyword_t k)
}
}
/** Returns a string description of the given parse node */
wcstring parse_node_t::describe(void) const
{
wcstring result = token_type_description(type);
return result;
}
/** A struct representing the token type passed to */
struct parse_token_t
{
enum parse_token_type_t type; // The type of the token as represented by the parser
enum token_type tokenizer_type; // The type of the token as represented by the tokenizer
enum parse_keyword_t keyword; // Any keyword represented by this parser
size_t source_start;
size_t source_length;
wcstring describe() const;
wcstring describe() const
{
wcstring result = token_type_description(type);
if (keyword != parse_keyword_none)
{
append_format(result, L" <%ls>", keyword_description(keyword).c_str());
}
return result;
}
};
wcstring parse_token_t::describe(void) const
/* Convert from tokenizer_t's token type to a parse_token_t type */
static parse_token_type_t parse_token_type_from_tokenizer_token(enum token_type tokenizer_token_type)
{
wcstring result = token_type_description(type);
if (keyword != parse_keyword_none)
{
append_format(result, L" <%ls>", keyword_description(keyword).c_str());
}
return result;
}
// Convert from tokenizer_t's token type to our token
static parse_token_t parse_token_from_tokenizer_token(enum token_type tokenizer_token_type)
{
parse_token_t result = {};
result.tokenizer_type = tokenizer_token_type;
parse_token_type_t result = token_type_invalid;
switch (tokenizer_token_type)
{
case TOK_STRING:
result.type = parse_token_type_string;
result = parse_token_type_string;
break;
case TOK_PIPE:
result.type = parse_token_type_pipe;
result = parse_token_type_pipe;
break;
case TOK_END:
result.type = parse_token_type_end;
result = parse_token_type_end;
break;
case TOK_BACKGROUND:
result.type = parse_token_type_background;
result = parse_token_type_background;
break;
case TOK_REDIRECT_OUT:
case TOK_REDIRECT_APPEND:
case TOK_REDIRECT_IN:
case TOK_REDIRECT_FD:
case TOK_REDIRECT_NOCLOB:
result.type = parse_token_type_redirection;
result = parse_token_type_redirection;
break;
case TOK_ERROR:
result.type = parse_special_type_tokenizer_error;
result = parse_special_type_tokenizer_error;
break;
case TOK_COMMENT:
result.type = parse_special_type_comment;
result = parse_special_type_comment;
break;
@ -247,6 +247,7 @@ static parse_token_t parse_token_from_tokenizer_token(enum token_type tokenizer_
return result;
}
/* Helper function for dump_tree */
static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &src, size_t start, size_t indent, wcstring *result, size_t *line)
{
assert(start < nodes.size());
@ -288,8 +289,8 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
}
}
__attribute__((unused))
static wcstring dump_tree(const parse_node_tree_t &nodes, const wcstring &src)
/* Gives a debugging textual description of a parse tree */
wcstring parse_dump_tree(const parse_node_tree_t &nodes, const wcstring &src)
{
if (nodes.empty())
return L"(empty!)";
@ -300,6 +301,7 @@ static wcstring dump_tree(const parse_node_tree_t &nodes, const wcstring &src)
return result;
}
/* Struct representing elements of the symbol stack, used in the internal state of the LL parser */
struct parse_stack_element_t
{
enum parse_token_type_t type;
@ -309,7 +311,7 @@ struct parse_stack_element_t
explicit parse_stack_element_t(parse_token_type_t t, node_offset_t idx) : type(t), keyword(parse_keyword_none), node_idx(idx)
{
}
explicit parse_stack_element_t(production_element_t e, node_offset_t idx) : type(production_element_type(e)), keyword(production_element_keyword(e)), node_idx(idx)
{
}
@ -323,40 +325,31 @@ struct parse_stack_element_t
}
return result;
}
};
/* The parser itself, private implementation of class parse_t. This is a hand-coded table-driven LL parser. Most hand-coded LL parsers are recursive descent, but recursive descent parsers are difficult to "pause", unlike table-driven parsers. */
class parse_ll_t
{
friend class parse_t;
std::vector<parse_stack_element_t> symbol_stack; // LL parser stack
/* Traditional symbol stack of the LL parser */
std::vector<parse_stack_element_t> symbol_stack;
/* Parser output. This is a parse tree, but stored in an array. */
parse_node_tree_t nodes;
/* Whether we ran into a fatal error, including parse errors or tokenizer errors */
bool fatal_errored;
/* List of errors we have encountered */
parse_error_list_t errors;
// Constructor
parse_ll_t() : fatal_errored(false)
{
this->reset();
}
bool top_node_match_token(parse_token_t token);
void accept_token(parse_token_t token, const wcstring &src);
// Clear the parse symbol stack (but not the node tree). Add a new job_list_t goal node.
void reset(void);
/* The symbol stack can contain terminal types or symbols. Symbols go on to do productions, but terminal types are just matched against input tokens. */
bool top_node_handle_terminal_types(parse_token_t token);
void parse_error(const wchar_t *expected, parse_token_t token);
void parse_error(parse_token_t token, const wchar_t *format, ...);
void append_error_callout(wcstring &error_message, parse_token_t token);
void dump_stack(void) const;
// Figure out the ranges of intermediate nodes
void determine_node_ranges();
// Get the node corresponding to the top element of the stack
parse_node_t &node_for_top_symbol()
@ -413,33 +406,38 @@ class parse_ll_t
if (! count) fprintf(stderr, "\t<empty>\n");
}
// Add the children. Confusingly, we want our nodes to be in forwards order (last token last, so dumps look nice), but the symbols should be reverse order (last token first, so it's lowest on the stack)
const size_t child_start = nodes.size();
size_t child_count = 0;
for (size_t i=0; i < MAX_SYMBOLS_PER_PRODUCTION; i++)
{
production_element_t elem = (*production)[i];
if (production_element_is_valid(elem))
if (!production_element_is_valid(elem))
{
// All done, bail out
break;
}
else
{
// Generate the parse node. Note that this push_back may invalidate node.
parse_token_type_t child_type = production_element_type(elem);
nodes.push_back(parse_node_t(child_type));
child_count++;
parse_token_type_t child_type = production_element_type(elem);
nodes.push_back(parse_node_t(child_type));
child_count++;
}
}
// Update the parent
const size_t parent_node_idx = symbol_stack.back().node_idx;
parse_node_t &parent_node = nodes.at(parent_node_idx);
// Should have no children yet
PARSE_ASSERT(parent_node.child_count == 0);
// Tell the node about its children
parent_node.child_start = child_start;
parent_node.child_count = child_count;
// Replace the top of the stack with new stack elements corresponding to our new nodes. Note that these go in reverse order.
symbol_stack.pop_back();
symbol_stack.reserve(symbol_stack.size() + child_count);
@ -452,6 +450,36 @@ class parse_ll_t
}
}
public:
/* Constructor */
parse_ll_t() : fatal_errored(false)
{
this->symbol_stack.reserve(16);
this->nodes.reserve(64);
this->reset_symbols_and_nodes();
}
/* Input */
void accept_token(parse_token_t token);
/* Indicate if we hit a fatal error */
bool has_fatal_error(void) const
{
return this->fatal_errored;
}
/* Clear the parse symbol stack (but not the node tree). Add a new job_list_t goal node. This is called from the constructor */
void reset_symbols(void);
/* Clear the parse symbol stack and the node tree. Add a new job_list_t goal node. This is called from the constructor. */
void reset_symbols_and_nodes(void);
/* Once parsing is complete, determine the ranges of intermediate nodes */
void determine_node_ranges();
/* Acquire output after parsing. This transfers directly from within self */
void acquire_output(parse_node_tree_t *output, parse_error_list_t *errors);
};
void parse_ll_t::dump_stack(void) const
@ -495,11 +523,11 @@ void parse_ll_t::determine_node_ranges(void)
while (idx--)
{
parse_node_t *parent = &nodes.at(idx);
// Skip nodes that already have a source range. These are terminal nodes.
if (parent->source_start != source_start_invalid)
continue;
// Ok, this node needs a source range. Get all of its children, and then set its range.
size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge
for (node_offset_t i=0; i < parent->child_count; i++)
@ -508,8 +536,9 @@ void parse_ll_t::determine_node_ranges(void)
min_start = std::min(min_start, child.source_start);
max_end = std::max(max_end, child.source_start + child.source_length);
}
if (min_start != source_start_invalid) {
if (min_start != source_start_invalid)
{
assert(max_end >= min_start);
parent->source_start = min_start;
parent->source_length = max_end - min_start;
@ -517,11 +546,27 @@ void parse_ll_t::determine_node_ranges(void)
}
}
/* Transfer parser output to the caller. Swaps the node tree into *output and
   the error list into *errors; either may be NULL to discard that result. The
   parser's own nodes, errors, and symbol stack are always cleared afterwards -
   the symbol stack holds indices into the (now surrendered) node tree, so it
   must not outlive it. */
void parse_ll_t::acquire_output(parse_node_tree_t *output, parse_error_list_t *errors)
{
if (output != NULL)
{
// Swap rather than copy: O(1) transfer of the whole tree
std::swap(*output, this->nodes);
}
this->nodes.clear();
if (errors != NULL)
{
std::swap(*errors, this->errors);
}
this->errors.clear();
this->symbol_stack.clear();
}
void parse_ll_t::parse_error(parse_token_t token, const wchar_t *fmt, ...)
{
//this->dump_stack();
parse_error_t err;
va_list va;
va_start(va, fmt);
err.text = vformat_string(fmt, va);
@ -545,19 +590,42 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token)
fatal_errored = true;
}
void parse_ll_t::reset(void)
void parse_ll_t::reset_symbols(void)
{
// add a new job_list node and then reset our symbol list to point at it
/* Add a new job_list node, and then reset our symbol list to point at it */
node_offset_t where = nodes.size();
nodes.push_back(parse_node_t(symbol_job_list));
symbol_stack.clear();
symbol_stack.push_back(parse_stack_element_t(symbol_job_list, where)); // goal token
this->fatal_errored = false;
}
/* Reset both symbols and nodes */
void parse_ll_t::reset_symbols_and_nodes(void)
{
nodes.clear();
this->reset_symbols();
}
bool parse_ll_t::top_node_match_token(parse_token_t token)
/* Returns true if the given type is a terminal token type, i.e. one that is
   matched directly against input tokens rather than expanded via productions */
static bool type_is_terminal_type(parse_token_type_t type)
{
    return type == parse_token_type_string ||
           type == parse_token_type_pipe ||
           type == parse_token_type_redirection ||
           type == parse_token_type_background ||
           type == parse_token_type_end ||
           type == parse_token_type_terminate;
}
bool parse_ll_t::top_node_handle_terminal_types(parse_token_t token)
{
if (symbol_stack.empty())
{
@ -565,47 +633,64 @@ bool parse_ll_t::top_node_match_token(parse_token_t token)
this->fatal_errored = true;
return false;
}
PARSE_ASSERT(! symbol_stack.empty());
PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
bool result = false;
bool handled = false;
parse_stack_element_t &stack_top = symbol_stack.back();
if (stack_top.type == token.type)
if (type_is_terminal_type(stack_top.type))
{
// So far so good. See if we need a particular keyword.
if (stack_top.keyword == parse_keyword_none || stack_top.keyword == token.keyword)
// The top of the stack is terminal. We are going to handle this (because we can't produce from a terminal type)
handled = true;
// Now see if we actually matched
bool matched = false;
if (stack_top.type == token.type)
{
switch (stack_top.type)
{
case parse_token_type_string:
// We matched if the keywords match, or no keyword was required
matched = (stack_top.keyword == parse_keyword_none || stack_top.keyword == token.keyword);
break;
default:
// For other types, we only require that the types match
matched = true;
break;
}
}
if (matched)
{
// Success. Tell the node that it matched this token
parse_node_t &node = node_for_top_symbol();
node.source_start = token.source_start;
node.source_length = token.source_length;
// We consumed this symbol
symbol_stack.pop_back();
result = true;
}
else if (token.type == parse_token_type_pipe)
else
{
// Pipes are primitive
symbol_stack.pop_back();
result = true;
// Failure
this->fatal_errored = true;
}
// We handled the token, so pop the symbol stack
symbol_stack.pop_back();
}
return result;
return handled;
}
void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
void parse_ll_t::accept_token(parse_token_t token)
{
bool logit = false;
if (logit)
{
const wcstring txt = wcstring(src, token.source_start, token.source_length);
fprintf(stderr, "Accept token %ls\n", token.describe().c_str());
}
PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
bool consumed = false;
// Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty.
if (token.type == parse_special_type_parse_error || token.type == parse_special_type_tokenizer_error || token.type == parse_special_type_comment)
{
@ -619,8 +704,8 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
while (! consumed && ! this->fatal_errored)
{
PARSE_ASSERT(! symbol_stack.empty());
if (top_node_match_token(token))
if (top_node_handle_terminal_types(token))
{
if (logit)
{
@ -629,11 +714,11 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
consumed = true;
break;
}
// top_node_match_token may indicate an error if our stack is empty
if (this->fatal_errored)
break;
// Get the production for the top of the stack
parse_stack_element_t &stack_elem = symbol_stack.back();
parse_node_t &node = nodes.at(stack_elem.node_idx);
@ -648,7 +733,7 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
// Manipulate the symbol stack.
// Note that stack_elem is invalidated by popping the stack.
symbol_stack_pop_push_production(production);
// If we end up with an empty stack, something bad happened, like an unbalanced end
if (symbol_stack.empty())
{
@ -713,9 +798,9 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
tok_flags_t tok_options = TOK_SQUASH_ERRORS;
if (parse_flags & parse_flag_include_comments)
tok_options |= TOK_SHOW_COMMENTS;
tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
for (; tok_has_next(&tok) && ! this->parser->fatal_errored; tok_next(&tok))
for (; tok_has_next(&tok) && ! this->parser->has_fatal_error(); tok_next(&tok))
{
token_type tok_type = static_cast<token_type>(tok_last_type(&tok));
const wchar_t *tok_txt = tok_last(&tok);
@ -723,22 +808,22 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
size_t tok_extent = tok_get_extent(&tok);
assert(tok_extent < 10000000); //paranoia
parse_token_t token = parse_token_from_tokenizer_token(tok_type);
token.tokenizer_type = tok_type;
parse_token_t token;
token.type = parse_token_type_from_tokenizer_token(tok_type);
token.source_start = (size_t)tok_start;
token.source_length = tok_extent;
token.keyword = keyword_for_token(tok_type, tok_txt);
this->parser->accept_token(token, str);
if (this->parser->fatal_errored)
this->parser->accept_token(token);
if (this->parser->has_fatal_error())
{
if (parse_flags & parse_flag_continue_after_error)
{
/* Mark an error and then keep going */
token.type = parse_special_type_parse_error;
token.keyword = parse_keyword_none;
this->parser->accept_token(token, str);
this->parser->reset();
this->parser->accept_token(token);
this->parser->reset_symbols();
}
else
{
@ -757,19 +842,32 @@ bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_n
fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t));
#endif
if (output != NULL)
{
output->swap(this->parser->nodes);
this->parser->nodes.clear();
}
// Acquire the output from the parser
this->parser->acquire_output(output, errors);
// Indicate if we had a fatal error
return ! this->parser->has_fatal_error();
}
if (errors != NULL)
{
errors->swap(this->parser->errors);
this->parser->errors.clear();
}
/* Feed a single synthetic token (type plus optional keyword) to the parser.
   Used by the fuzzer, which generates tokens directly instead of tokenizing
   source text. Returns true if the parser has not hit a fatal error.
   NOTE(review): the output and errors parameters are accepted but never read
   in this body - presumably results are retrieved separately via the parser's
   output-acquisition path; confirm. */
bool parse_t::parse_1_token(parse_token_type_t token_type, parse_keyword_t keyword, parse_node_tree_t *output, parse_error_list_t *errors)
{
// Only strings can have keywords. So if we have a keyword, the type must be a string
assert(keyword == parse_keyword_none || token_type == parse_token_type_string);
parse_token_t token;
token.type = token_type;
token.keyword = keyword;
// Sentinel: -1 wraps to the maximum size_t, marking the node as having no source range
token.source_start = -1;
token.source_length = 0;
this->parser->accept_token(token);
return ! this->parser->has_fatal_error();
}
/* Reset the parser so it is ready to parse something new. Clears both the
   symbol stack and the accumulated node tree. */
void parse_t::clear()
{
this->parser->reset_symbols_and_nodes();
}
const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type) const
@ -781,13 +879,13 @@ const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, nod
{
result = &this->at(child_offset);
}
// If we are given an expected type, then the node must be null or that type
if (result != NULL)
{
assert(expected_type == token_type_invalid || expected_type == result->type);
}
return result;
}

View file

@ -36,29 +36,6 @@ struct parse_error_t
};
typedef std::vector<parse_error_t> parse_error_list_t;
enum
{
parse_flag_none = 0,
/* Attempt to build a "parse tree" no matter what. This may result in a 'forest' of disconnected trees. This is intended to be used by syntax highlighting. */
parse_flag_continue_after_error = 1 << 0,
/* Include comment tokens */
parse_flag_include_comments = 1 << 1
};
typedef unsigned int parse_tree_flags_t;
class parse_ll_t;
class parse_t
{
parse_ll_t * const parser;
public:
parse_t();
~parse_t();
bool parse(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it = false);
};
enum parse_token_type_t
{
token_type_invalid,
@ -92,10 +69,10 @@ enum parse_token_type_t
symbol_argument_list_nonempty,
symbol_argument_list,
symbol_argument,
symbol_redirection,
symbol_optional_background,
// Terminal types
@ -105,12 +82,15 @@ enum parse_token_type_t
parse_token_type_background,
parse_token_type_end,
parse_token_type_terminate,
// Very special terminal types that don't appear in the production list
parse_special_type_parse_error,
parse_special_type_tokenizer_error,
parse_special_type_comment,
FIRST_TERMINAL_TYPE = parse_token_type_string,
LAST_TERMINAL_TYPE = parse_token_type_terminate,
LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate,
FIRST_PARSE_TOKEN_TYPE = parse_token_type_string
};
@ -132,9 +112,46 @@ enum parse_keyword_t
parse_keyword_or,
parse_keyword_not,
parse_keyword_command,
parse_keyword_builtin
parse_keyword_builtin,
LAST_KEYWORD = parse_keyword_builtin
};
enum
{
parse_flag_none = 0,
/* Attempt to build a "parse tree" no matter what. This may result in a 'forest' of disconnected trees. This is intended to be used by syntax highlighting. */
parse_flag_continue_after_error = 1 << 0,
/* Include comment tokens */
parse_flag_include_comments = 1 << 1
};
typedef unsigned int parse_tree_flags_t;
class parse_ll_t;
class parse_t
{
parse_ll_t * const parser;
public:
parse_t();
~parse_t();
/* Parse a string */
bool parse(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it = false);
/* Parse a single token */
bool parse_1_token(parse_token_type_t token, parse_keyword_t keyword, parse_node_tree_t *output, parse_error_list_t *errors);
/* Reset, ready to parse something else */
void clear();
};
wcstring parse_dump_tree(const parse_node_tree_t &tree, const wcstring &src);
wcstring token_type_description(parse_token_type_t type);
wcstring keyword_description(parse_keyword_t type);
@ -158,7 +175,7 @@ public:
/* Type-dependent data */
uint32_t tag;
/* Which production was used */
uint8_t production_idx;
@ -175,7 +192,7 @@ public:
PARSE_ASSERT(which < child_count);
return child_start + which;
}
bool has_source() const
{
return source_start != (size_t)(-1);
@ -184,11 +201,11 @@ public:
class parse_node_tree_t : public std::vector<parse_node_t>
{
public:
public:
/* Get the node corresponding to a child of the given node, or NULL if there is no such child. If expected_type is provided, assert that the node has that type. */
const parse_node_t *get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type = token_type_invalid) const;
/* Find all the nodes of a given type underneath a given node */
typedef std::vector<const parse_node_t *> parse_node_list_t;
parse_node_list_t find_nodes(const parse_node_t &parent, parse_token_type_t type) const;
@ -200,8 +217,8 @@ class parse_node_tree_t : public std::vector<parse_node_t>
# A job_list is a list of jobs, separated by semicolons or newlines
job_list = <empty> |
<TOK_END> job_list |
job job_list
<TOK_END> job_list
# A job is a non-empty list of statements, separated by pipes. (Non-empty is useful for cases like if statements, where we require a command). To represent "non-empty", we require a statement, followed by a possibly empty job_continuation
@ -251,9 +268,9 @@ class parse_node_tree_t : public std::vector<parse_node_t>
argument_or_redirection = argument | redirection
argument = <TOK_STRING>
redirection = <TOK_REDIRECTION>
terminator = <TOK_END> | <TOK_BACKGROUND>
optional_background = <empty> | <TOK_BACKGROUND>
*/