More work on new parser

This commit is contained in:
ridiculousfish 2013-08-08 15:06:46 -07:00
parent 6a6593335d
commit 8e07e55c1f
9 changed files with 708 additions and 32 deletions

View file

@ -4063,7 +4063,7 @@ int builtin_parse(parser_t &parser, wchar_t **argv)
parse_node_tree_t parse_tree; parse_node_tree_t parse_tree;
parse_error_list_t errors; parse_error_list_t errors;
parse_t parser; parse_t parser;
bool success = parser.parse(src, &parse_tree, &errors); bool success = parser.parse(src, parse_flag_none, &parse_tree, &errors, true);
if (! success) if (! success)
{ {
stdout_buffer.append(L"Parsing failed:\n"); stdout_buffer.append(L"Parsing failed:\n");

View file

@ -507,7 +507,7 @@ const wchar_t *wcsfuncname(const wchar_t *str)
} }
int wcsvarchr(wchar_t chr) bool wcsvarchr(wchar_t chr)
{ {
return iswalnum(chr) || chr == L'_'; return iswalnum(chr) || chr == L'_';
} }

View file

@ -608,10 +608,10 @@ const wchar_t *wcsfuncname(const wchar_t *str);
/** /**
Test if the given string is valid in a variable name Test if the given string is valid in a variable name
\return 1 if this is a valid name, 0 otherwise \return true if this is a valid name, false otherwise
*/ */
int wcsvarchr(wchar_t chr); bool wcsvarchr(wchar_t chr);
/** /**

View file

@ -1801,23 +1801,65 @@ void history_tests_t::test_history_speed(void)
delete hist; delete hist;
} }
/**
   Run the new parser over a fixed table of small source snippets and check
   that each one is accepted or rejected as the table expects.

   Every snippet is parsed with a fresh parse_t using parse_flag_none. A
   mismatch between the parser's result and the expected result is reported
   through err(); progress is reported through say().
*/
static void test_new_parser_correctness(void)
{
    say(L"Testing new parser!");
    const struct parser_test_t
    {
        const wchar_t *src; // source text handed to the parser
        bool ok;            // whether the parse is expected to succeed
    }
    parser_tests[] =
    {
        {L"; ; ; ", true},
        {L"if ; end", false},
        {L"if true ; end", true},
        {L"if true; end ; end", false},
        {L"if end; end ; end", false},
        {L"end", false}
    };

    const size_t test_count = sizeof parser_tests / sizeof *parser_tests;
    for (size_t i=0; i < test_count; i++)
    {
        const parser_test_t *test = &parser_tests[i];
        parse_node_tree_t parse_tree;
        parse_t parser;
        bool success = parser.parse(test->src, parse_flag_none, &parse_tree, NULL);

        // %lu expects unsigned long; size_t is not guaranteed to be that type, so cast explicitly
        say(L"%lu / %lu: Parse \"%ls\": %s", static_cast<unsigned long>(i+1), static_cast<unsigned long>(test_count), test->src, success ? "yes" : "no");
        if (success && ! test->ok)
        {
            err(L"\"%ls\" should NOT have parsed, but did", test->src);
        }
        else if (! success && test->ok)
        {
            err(L"\"%ls\" should have parsed, but failed", test->src);
        }
    }
    say(L"Parse tests complete");
}
__attribute__((unused))
static void test_new_parser(void) static void test_new_parser(void)
{ {
say(L"Testing new parser!"); say(L"Testing new parser!");
const wcstring src = L"echo hello world"; const wcstring src = L"echo hello world";
parse_node_tree_t parse_tree; parse_node_tree_t parse_tree;
parse_t parser; parse_t parser;
bool success = parser.parse(src, &parse_tree, NULL); bool success = parser.parse(src, parse_flag_none, &parse_tree, NULL);
if (! success) if (! success)
{ {
say(L"Parsing failed"); say(L"Parsing failed");
} }
else else
{ {
#if 0
parse_execution_context_t ctx(parse_tree, src); parse_execution_context_t ctx(parse_tree, src);
say(L"Simulating execution:"); say(L"Simulating execution:");
wcstring simulation = ctx.simulate(); wcstring simulation = ctx.simulate();
say(simulation.c_str()); say(simulation.c_str());
#endif
} }
} }
@ -1827,13 +1869,12 @@ static void test_new_parser(void)
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
setlocale(LC_ALL, ""); setlocale(LC_ALL, "");
srand(time(0)); //srand(time(0));
configure_thread_assertions_for_testing(); configure_thread_assertions_for_testing();
program_name=L"(ignore)"; program_name=L"(ignore)";
say(L"Testing low-level functionality"); say(L"Testing low-level functionality");
say(L"Lines beginning with '(ignore):' are not errors, they are warning messages\ngenerated by the fish parser library when given broken input, and can be\nignored. All actual errors begin with 'Error:'.");
set_main_thread(); set_main_thread();
setup_fork_guards(); setup_fork_guards();
//proc_init(); //proc_init();
@ -1843,7 +1884,8 @@ int main(int argc, char **argv)
reader_init(); reader_init();
env_init(); env_init();
test_new_parser(); test_new_parser_correctness();
//test_new_parser();
return 0; return 0;
test_format(); test_format();

View file

@ -34,6 +34,7 @@
#include "wildcard.h" #include "wildcard.h"
#include "path.h" #include "path.h"
#include "history.h" #include "history.h"
#include "parse_tree.h"
/** /**
Number of elements in the highlight_var array Number of elements in the highlight_var array
@ -1307,11 +1308,16 @@ static void tokenize(const wchar_t * const buff, std::vector<int> &color, const
} }
} }
void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
// PCA This function does I/O, (calls is_potential_path, path_get_path, maybe others) and so ought to only run on a background thread // PCA This function does I/O, (calls is_potential_path, path_get_path, maybe others) and so ought to only run on a background thread
void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars) void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
{ {
ASSERT_IS_BACKGROUND_THREAD(); ASSERT_IS_BACKGROUND_THREAD();
if (1) {
highlight_shell_magic(buff, color, pos, error, vars);
return;
}
const size_t length = buff.size(); const size_t length = buff.size();
assert(buff.size() == color.size()); assert(buff.size() == color.size());
@ -1440,7 +1446,413 @@ void highlight_shell(const wcstring &buff, std::vector<int> &color, size_t pos,
} }
} }
/**
   Paint the whole source range of a node with one color.

   \param node the node whose range is colored; nodes without a source range are ignored
   \param color the color value to store
   \param color_array per-character color array that receives the color
*/
static void color_node(const parse_node_t &node, int color, std::vector<int> &color_array)
{
    if (node.has_source())
    {
        const size_t range_begin = node.source_start;
        const size_t range_end = range_begin + node.source_length;

        // The range must not wrap around, and must lie within the color array
        assert(range_end >= range_begin);
        assert(range_end <= color_array.size());

        std::fill(color_array.begin() + range_begin, color_array.begin() + range_end, color);
    }
}
/**
   Color the text of a single argument.

   Every character first receives normal_status. The text is then scanned
   once, tracking whether we are unquoted, inside single quotes or inside
   double quotes, and escapes, quotes, variable sigils, wildcards, braces and
   similar operators are recolored.

   \param buffstr the text of the argument
   \param colors iterator to the color slot of the first character of the argument; the slots [colors, colors + buffstr.size()) are written
   \param normal_status the color given to characters that are not otherwise highlighted
*/
static void color_argument(const wcstring &buffstr, std::vector<int>::iterator colors, int normal_status)
{
    const size_t buff_len = buffstr.size();
    std::fill(colors, colors + buff_len, normal_status);

    enum {e_unquoted, e_single_quoted, e_double_quoted} mode = e_unquoted;
    int bracket_count=0; // number of currently open '{', used to decide whether ',' is special
    for (size_t in_pos=0; in_pos < buff_len; in_pos++)
    {
        const wchar_t c = buffstr.at(in_pos);
        switch (mode)
        {
            case e_unquoted:
            {
                if (c == L'\\')
                {
                    int fill_color = HIGHLIGHT_ESCAPE; //may be set to HIGHLIGHT_ERROR
                    const size_t backslash_pos = in_pos;
                    // One past the last character to color. Equal to backslash_pos means "color nothing".
                    size_t fill_end = backslash_pos;

                    // Move to the escaped character
                    in_pos++;
                    const wchar_t escaped_char = (in_pos < buff_len ? buffstr.at(in_pos) : L'\0');

                    if (escaped_char == L'\0')
                    {
                        // Backslash with nothing after it
                        fill_end = in_pos;
                        fill_color = HIGHLIGHT_ERROR;
                    }
                    else if (wcschr(L"~%", escaped_char))
                    {
                        // Only highlighted when the backslash is the very first character
                        if (in_pos == 1)
                        {
                            fill_end = in_pos + 1;
                        }
                    }
                    else if (escaped_char == L',')
                    {
                        // An escaped comma is only highlighted inside braces.
                        // Use the same "> 0" test as the unescaped comma case below, so that
                        // a stray '}' (negative count) does not make commas look special.
                        if (bracket_count > 0)
                        {
                            fill_end = in_pos + 1;
                        }
                    }
                    else if (wcschr(L"abefnrtv*?$(){}[]'\"<>^ \\#;|&", escaped_char))
                    {
                        fill_end = in_pos + 1;
                    }
                    else if (wcschr(L"c", escaped_char))
                    {
                        // Like \ci. Only the backslash and the 'c' are colored here;
                        // the control letter that follows keeps whatever color it gets on its own.
                        fill_end = in_pos + 1;
                    }
                    else if (wcschr(L"uUxX01234567", escaped_char))
                    {
                        // Numeric escape: hex (\x, \X, \u, \U) or octal (\NNN)
                        long long res=0;
                        int chars=2;
                        int base=16;

                        wchar_t max_val = ASCII_MAX;

                        switch (escaped_char)
                        {
                            case L'u':
                            {
                                chars=4;
                                max_val = UCS2_MAX;
                                in_pos++;
                                break;
                            }

                            case L'U':
                            {
                                chars=8;
                                max_val = WCHAR_MAX;
                                in_pos++;
                                break;
                            }

                            case L'x':
                            {
                                in_pos++;
                                break;
                            }

                            case L'X':
                            {
                                max_val = BYTE_MAX;
                                in_pos++;
                                break;
                            }

                            default:
                            {
                                // a digit like \12
                                base=8;
                                chars=3;
                                break;
                            }
                        }

                        // Consume up to 'chars' digits
                        for (int i=0; i < chars && in_pos < buff_len; i++)
                        {
                            long d = convert_digit(buffstr.at(in_pos), base);
                            if (d < 0)
                                break;
                            res = (res * base) + d;
                            in_pos++;
                        }
                        //in_pos is now at the first character that could not be converted (or buff_len)
                        assert(in_pos >= backslash_pos && in_pos <= buff_len);
                        fill_end = in_pos;

                        // It's an error if we exceeded the max value
                        if (res > max_val)
                            fill_color = HIGHLIGHT_ERROR;

                        // Subtract one from in_pos, so that the increment in the loop will move to the next character
                        in_pos--;
                    }
                    assert(fill_end >= backslash_pos);
                    std::fill(colors + backslash_pos, colors + fill_end, fill_color);
                }
                else
                {
                    // Not a backslash
                    switch (c)
                    {
                        case L'~':
                        case L'%':
                        {
                            // Only special as the first character of the argument
                            if (in_pos == 0)
                            {
                                colors[in_pos] = HIGHLIGHT_OPERATOR;
                            }
                            break;
                        }

                        case L'$':
                        {
                            assert(in_pos < buff_len);
                            // A '$' is an error unless followed by another '$' or a variable-name character
                            int dollar_color = HIGHLIGHT_ERROR;
                            if (in_pos + 1 < buff_len)
                            {
                                wchar_t next = buffstr.at(in_pos + 1);
                                if (next == L'$' || wcsvarchr(next))
                                    dollar_color = HIGHLIGHT_OPERATOR;
                            }
                            colors[in_pos] = dollar_color;
                            break;
                        }

                        case L'*':
                        case L'?':
                        case L'(':
                        case L')':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            break;
                        }

                        case L'{':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            bracket_count++;
                            break;
                        }

                        case L'}':
                        {
                            colors[in_pos] = HIGHLIGHT_OPERATOR;
                            bracket_count--;
                            break;
                        }

                        case L',':
                        {
                            if (bracket_count > 0)
                            {
                                colors[in_pos] = HIGHLIGHT_OPERATOR;
                            }
                            break;
                        }

                        case L'\'':
                        {
                            colors[in_pos] = HIGHLIGHT_QUOTE;
                            mode = e_single_quoted;
                            break;
                        }

                        case L'\"':
                        {
                            colors[in_pos] = HIGHLIGHT_QUOTE;
                            mode = e_double_quoted;
                            break;
                        }
                    }
                }
                break;
            }

            /*
               Inside a single quoted string, i.e. 'foo'
            */
            case e_single_quoted:
            {
                colors[in_pos] = HIGHLIGHT_QUOTE;
                if (c == L'\\')
                {
                    // Only \\ and \' are escapes inside single quotes
                    if (in_pos + 1 < buff_len)
                    {
                        const wchar_t escaped_char = buffstr.at(in_pos + 1);
                        if (escaped_char == L'\\' || escaped_char == L'\'')
                        {
                            colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash
                            colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char
                            in_pos += 1; //skip over backslash
                        }
                    }
                }
                else if (c == L'\'')
                {
                    mode = e_unquoted;
                }
                break;
            }

            /*
               Inside a double quoted string, i.e. "foo"
            */
            case e_double_quoted:
            {
                colors[in_pos] = HIGHLIGHT_QUOTE;
                switch (c)
                {
                    case L'"':
                    {
                        mode = e_unquoted;
                        break;
                    }

                    case L'\\':
                    {
                        // Inside double quotes the escapes are \\, \" and \$.
                        // Recognizing \" here matters: otherwise the escaped quote would be
                        // seen on the next iteration and wrongly terminate the string.
                        if (in_pos + 1 < buff_len)
                        {
                            const wchar_t escaped_char = buffstr.at(in_pos + 1);
                            if (escaped_char == L'\\' || escaped_char == L'"' || escaped_char == L'$')
                            {
                                colors[in_pos] = HIGHLIGHT_ESCAPE; //backslash
                                colors[in_pos + 1] = HIGHLIGHT_ESCAPE; //escaped char
                                in_pos += 1; //skip over backslash
                            }
                        }
                        break;
                    }

                    case L'$':
                    {
                        // Same rule as in unquoted text: '$' needs a following '$' or variable-name character
                        int dollar_color = HIGHLIGHT_ERROR;
                        if (in_pos + 1 < buff_len)
                        {
                            wchar_t next = buffstr.at(in_pos + 1);
                            if (next == L'$' || wcsvarchr(next))
                                dollar_color = HIGHLIGHT_OPERATOR;
                        }
                        colors[in_pos] = dollar_color;
                        break;
                    }
                }
                break;
            }
        }
    }
}
// Color all of the arguments of the given command
static void color_arguments(const wcstring &src, const parse_node_tree_t &tree, const parse_node_t &parent, std::vector<int> &color_array)
{
const parse_node_tree_t::parse_node_list_t nodes = tree.find_nodes(parent, symbol_argument);
wcstring param;
for (node_offset_t i=0; i < nodes.size(); i++)
{
const parse_node_t *child = nodes.at(i);
assert(child != NULL && child->type == symbol_argument);
param.assign(src, child->source_start, child->source_length);
color_argument(param, color_array.begin() + child->source_start, HIGHLIGHT_NORMAL);
}
}
static void color_children(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, int color, std::vector<int> &color_array)
{
for (node_offset_t idx=0; idx < parent.child_count; idx++)
{
const parse_node_t *child = tree.get_child(parent, idx);
if (child != NULL && child->type == type && child->has_source())
{
color_node(*child, color, color_array);
}
}
}
void highlight_shell_magic(const wcstring &buff, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars)
{
ASSERT_IS_BACKGROUND_THREAD();
const size_t length = buff.size();
assert(buff.size() == color.size());
if (length == 0)
return;
std::fill(color.begin(), color.end(), -1);
/* Do something sucky and get the current working directory on this background thread. This should really be passed in. */
const wcstring working_directory = env_get_pwd_slash();
/* Parse the buffer */
parse_node_tree_t parse_tree;
parse_t parser;
parser.parse(buff, parse_flag_continue_after_error | parse_flag_include_comments, &parse_tree, NULL);
/* Walk the node tree */
for (parse_node_tree_t::const_iterator iter = parse_tree.begin(); iter != parse_tree.end(); ++iter)
{
const parse_node_t &node = *iter;
switch (node.type)
{
// Color direct string descendants, e.g. 'for' and 'in'.
case symbol_for_header:
case symbol_while_header:
case symbol_begin_header:
case symbol_function_header:
case symbol_if_clause:
case symbol_else_clause:
case symbol_case_item:
case symbol_switch_statement:
case symbol_boolean_statement:
case symbol_decorated_statement:
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
break;
case symbol_redirection:
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_REDIRECTION, color);
break;
case parse_token_type_background:
case parse_token_type_end:
color_node(node, HIGHLIGHT_END, color);
break;
case symbol_plain_statement:
{
// Color the command
color_children(parse_tree, node, parse_token_type_string, HIGHLIGHT_COMMAND, color);
// Color arguments
const parse_node_t *arguments = parse_tree.get_child(node, 1, symbol_arguments_or_redirections_list);
if (arguments != NULL)
{
color_arguments(buff, parse_tree, *arguments, color);
}
}
break;
case symbol_arguments_or_redirections_list:
case symbol_argument_list:
/* Nothing, these are handled by their parents */
break;
case parse_special_type_parse_error:
case parse_special_type_tokenizer_error:
color_node(node, HIGHLIGHT_ERROR, color);
break;
case parse_special_type_comment:
color_node(node, HIGHLIGHT_COMMENT, color);
break;
default:
break;
}
}
}
/** /**
Perform quote and parenthesis highlighting on the specified string. Perform quote and parenthesis highlighting on the specified string.

View file

@ -84,6 +84,7 @@ struct file_detection_context_t;
\param error a list in which a description of each error will be inserted. May be 0, in which case no error descriptions will be generated. \param error a list in which a description of each error will be inserted. May be 0, in which case no error descriptions will be generated.
*/ */
void highlight_shell(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars); void highlight_shell(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
void highlight_shell_magic(const wcstring &buffstr, std::vector<int> &color, size_t pos, wcstring_list_t *error, const env_vars_snapshot_t &vars);
/** /**
Perform syntax highlighting for the text in buff. Matching quotes and parentheses are highlighted. The result is Perform syntax highlighting for the text in buff. Matching quotes and parentheses are highlighted. The result is

View file

@ -135,14 +135,12 @@ RESOLVE(statement)
return 2; return 2;
case parse_keyword_else: case parse_keyword_else:
//symbol_stack_pop();
return NO_PRODUCTION; return NO_PRODUCTION;
case parse_keyword_switch: case parse_keyword_switch:
return 3; return 3;
case parse_keyword_end: case parse_keyword_end:
PARSER_DIE(); //todo
return NO_PRODUCTION; return NO_PRODUCTION;
// 'in' is only special within a for_header // 'in' is only special within a for_header
@ -378,7 +376,7 @@ RESOLVE(arguments_or_redirections_list)
PRODUCTIONS(argument_or_redirection) = PRODUCTIONS(argument_or_redirection) =
{ {
{parse_token_type_string}, {symbol_argument},
{parse_token_type_redirection} {parse_token_type_redirection}
}; };
RESOLVE(argument_or_redirection) RESOLVE(argument_or_redirection)
@ -394,6 +392,18 @@ RESOLVE(argument_or_redirection)
} }
} }
// Productions for the 'argument' symbol. There is exactly one: a single string token.
PRODUCTIONS(argument) =
{
{parse_token_type_string}
};
// NOTE(review): RESOLVE_ONLY presumably generates a resolver that always picks the sole production above -- confirm against the macro definition.
RESOLVE_ONLY(argument)
// Productions for the 'redirection' symbol. There is exactly one: a single redirection token.
PRODUCTIONS(redirection) =
{
{parse_token_type_redirection}
};
// NOTE(review): as for 'argument', presumably resolves unconditionally to the only production -- confirm.
RESOLVE_ONLY(redirection)
PRODUCTIONS(optional_background) = PRODUCTIONS(optional_background) =
{ {
{}, {},
@ -449,6 +459,8 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
TEST(plain_statement) TEST(plain_statement)
TEST(arguments_or_redirections_list) TEST(arguments_or_redirections_list)
TEST(argument_or_redirection) TEST(argument_or_redirection)
TEST(argument)
TEST(redirection)
TEST(optional_background) TEST(optional_background)
case parse_token_type_string: case parse_token_type_string:
@ -461,6 +473,14 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
PARSER_DIE(); PARSER_DIE();
break; break;
case parse_special_type_parse_error:
case parse_special_type_tokenizer_error:
case parse_special_type_comment:
fprintf(stderr, "Special type %ls passed to %s\n", token_type_description(node_type).c_str(), __FUNCTION__);
PARSER_DIE();
break;
case token_type_invalid: case token_type_invalid:
fprintf(stderr, "token_type_invalid passed to %s\n", __FUNCTION__); fprintf(stderr, "token_type_invalid passed to %s\n", __FUNCTION__);
PARSER_DIE(); PARSER_DIE();

View file

@ -101,6 +101,11 @@ wcstring token_type_description(parse_token_type_t type)
return L"arguments_or_redirections_list"; return L"arguments_or_redirections_list";
case symbol_argument_or_redirection: case symbol_argument_or_redirection:
return L"argument_or_redirection"; return L"argument_or_redirection";
case symbol_argument:
return L"symbol_argument";
case symbol_redirection:
return L"symbol_redirection";
case parse_token_type_string: case parse_token_type_string:
return L"token_string"; return L"token_string";
@ -116,6 +121,14 @@ wcstring token_type_description(parse_token_type_t type)
return L"token_terminate"; return L"token_terminate";
case symbol_optional_background: case symbol_optional_background:
return L"optional_background"; return L"optional_background";
case parse_special_type_parse_error:
return L"parse_error";
case parse_special_type_tokenizer_error:
return L"tokenizer_error";
case parse_special_type_comment:
return L"comment";
} }
return format_string(L"Unknown token type %ld", static_cast<long>(type)); return format_string(L"Unknown token type %ld", static_cast<long>(type));
} }
@ -217,6 +230,14 @@ static parse_token_t parse_token_from_tokenizer_token(enum token_type tokenizer_
result.type = parse_token_type_redirection; result.type = parse_token_type_redirection;
break; break;
case TOK_ERROR:
result.type = parse_special_type_tokenizer_error;
break;
case TOK_COMMENT:
result.type = parse_special_type_comment;
break;
default: default:
fprintf(stderr, "Bad token type %d passed to %s\n", (int)tokenizer_token_type, __FUNCTION__); fprintf(stderr, "Bad token type %d passed to %s\n", (int)tokenizer_token_type, __FUNCTION__);
@ -247,11 +268,18 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
append_format(*result, L" <%lu children>", node.child_count); append_format(*result, L" <%lu children>", node.child_count);
} }
if (node.type == parse_token_type_string) if (node.type == parse_token_type_string)
{
if (node.source_start == -1)
{
append_format(*result, L" (no source)");
}
else
{ {
result->append(L": \""); result->append(L": \"");
result->append(src, node.source_start, node.source_length); result->append(src, node.source_start, node.source_length);
result->append(L"\""); result->append(L"\"");
} }
}
result->push_back(L'\n'); result->push_back(L'\n');
++*line; ++*line;
for (size_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++) for (size_t child_idx = node.child_start; child_idx < node.child_start + node.child_count; child_idx++)
@ -311,21 +339,25 @@ class parse_ll_t
// Constructor // Constructor
parse_ll_t() : fatal_errored(false) parse_ll_t() : fatal_errored(false)
{ {
// initial node this->reset();
symbol_stack.push_back(parse_stack_element_t(symbol_job_list, 0)); // goal token
nodes.push_back(parse_node_t(symbol_job_list));
} }
bool top_node_match_token(parse_token_t token); bool top_node_match_token(parse_token_t token);
void accept_token(parse_token_t token, const wcstring &src); void accept_token(parse_token_t token, const wcstring &src);
// Clear the parse symbol stack (but not the node tree). Add a new job_list_t goal node.
void reset(void);
void parse_error(const wchar_t *expected, parse_token_t token); void parse_error(const wchar_t *expected, parse_token_t token);
void parse_error(parse_token_t token, const wchar_t *format, ...); void parse_error(parse_token_t token, const wchar_t *format, ...);
void append_error_callout(wcstring &error_message, parse_token_t token); void append_error_callout(wcstring &error_message, parse_token_t token);
void dump_stack(void) const; void dump_stack(void) const;
// Figure out the ranges of intermediate nodes
void determine_node_ranges();
// Get the node corresponding to the top element of the stack // Get the node corresponding to the top element of the stack
parse_node_t &node_for_top_symbol() parse_node_t &node_for_top_symbol()
{ {
@ -453,9 +485,41 @@ void parse_ll_t::dump_stack(void) const
} }
} }
// Give each node a source range equal to the union of the ranges of its children.
// Terminal nodes already have source ranges (and no children).
// Since children always appear after their parents, we can implement this very simply by walking backwards:
// by the time a parent is visited, all of its children have already been given their final range.
// Children that have no source range (source_start is still -1) are ignored.
// A node whose children all lack a range is itself left without one.
void parse_ll_t::determine_node_ranges(void)
{
    const size_t source_start_invalid = -1;
    size_t idx = nodes.size();
    while (idx--)
    {
        parse_node_t *parent = &nodes.at(idx);

        // Skip nodes that already have a source range. These are terminal nodes.
        if (parent->source_start != source_start_invalid)
            continue;

        // Ok, this node needs a source range. Get all of its children, and then set its range.
        size_t min_start = source_start_invalid, max_end = 0; //note source_start_invalid is huge
        for (node_offset_t i=0; i < parent->child_count; i++)
        {
            const parse_node_t &child = nodes.at(parent->child_offset(i));

            // A child without a range must not contribute: its start is the huge invalid value,
            // so start + length would push max_end to a bogus huge value as well.
            if (child.source_start == source_start_invalid)
                continue;

            min_start = std::min(min_start, child.source_start);
            max_end = std::max(max_end, child.source_start + child.source_length);
        }

        // Only set a range if at least one child supplied one
        if (min_start != source_start_invalid)
        {
            assert(max_end >= min_start);
            parent->source_start = min_start;
            parent->source_length = max_end - min_start;
        }
    }
}
void parse_ll_t::parse_error(parse_token_t token, const wchar_t *fmt, ...) void parse_ll_t::parse_error(parse_token_t token, const wchar_t *fmt, ...)
{ {
this->dump_stack(); //this->dump_stack();
parse_error_t err; parse_error_t err;
va_list va; va_list va;
@ -481,8 +545,27 @@ void parse_ll_t::parse_error(const wchar_t *expected, parse_token_t token)
fatal_errored = true; fatal_errored = true;
} }
void parse_ll_t::reset(void)
{
// add a new job_list node and then reset our symbol list to point at it
node_offset_t where = nodes.size();
nodes.push_back(parse_node_t(symbol_job_list));
symbol_stack.clear();
symbol_stack.push_back(parse_stack_element_t(symbol_job_list, where)); // goal token
this->fatal_errored = false;
}
bool parse_ll_t::top_node_match_token(parse_token_t token) bool parse_ll_t::top_node_match_token(parse_token_t token)
{ {
if (symbol_stack.empty())
{
// This can come about with an unbalanced 'end' or 'else', which causes us to terminate the outermost job list.
this->fatal_errored = true;
return false;
}
PARSE_ASSERT(! symbol_stack.empty()); PARSE_ASSERT(! symbol_stack.empty());
PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE); PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
bool result = false; bool result = false;
@ -520,10 +603,23 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
fprintf(stderr, "Accept token %ls\n", token.describe().c_str()); fprintf(stderr, "Accept token %ls\n", token.describe().c_str());
} }
PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE); PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
PARSE_ASSERT(! symbol_stack.empty());
bool consumed = false; bool consumed = false;
// Handle special types specially. Note that these are the only types that can be pushed if the symbol stack is empty.
if (token.type == parse_special_type_parse_error || token.type == parse_special_type_tokenizer_error || token.type == parse_special_type_comment)
{
parse_node_t err_node(token.type);
err_node.source_start = token.source_start;
err_node.source_length = token.source_length;
nodes.push_back(err_node);
consumed = true;
}
while (! consumed && ! this->fatal_errored) while (! consumed && ! this->fatal_errored)
{ {
PARSE_ASSERT(! symbol_stack.empty());
if (top_node_match_token(token)) if (top_node_match_token(token))
{ {
if (logit) if (logit)
@ -534,6 +630,10 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
break; break;
} }
// top_node_match_token may indicate an error if our stack is empty
if (this->fatal_errored)
break;
// Get the production for the top of the stack // Get the production for the top of the stack
parse_stack_element_t &stack_elem = symbol_stack.back(); parse_stack_element_t &stack_elem = symbol_stack.back();
parse_node_t &node = nodes.at(stack_elem.node_idx); parse_node_t &node = nodes.at(stack_elem.node_idx);
@ -548,6 +648,12 @@ void parse_ll_t::accept_token(parse_token_t token, const wcstring &src)
// Manipulate the symbol stack. // Manipulate the symbol stack.
// Note that stack_elem is invalidated by popping the stack. // Note that stack_elem is invalidated by popping the stack.
symbol_stack_pop_push_production(production); symbol_stack_pop_push_production(production);
// If we end up with an empty stack, something bad happened, like an unbalanced end
if (symbol_stack.empty())
{
this->parse_error(token, L"All symbols removed from symbol stack. Likely unbalanced else or end?", stack_elem.describe().c_str(), token.describe().c_str());
}
} }
} }
} }
@ -556,6 +662,11 @@ parse_t::parse_t() : parser(new parse_ll_t())
{ {
} }
// Destroys the parse_ll_t that this object holds in 'parser'.
parse_t::~parse_t()
{
    delete this->parser;
}
static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt) static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
{ {
parse_keyword_t result = parse_keyword_none; parse_keyword_t result = parse_keyword_none;
@ -597,21 +708,20 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
return result; return result;
} }
bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors) bool parse_t::parse(const wcstring &str, parse_tree_flags_t parse_flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it)
{ {
tokenizer_t tok = tokenizer_t(str.c_str(), 0); tok_flags_t tok_options = TOK_SQUASH_ERRORS;
if (parse_flags & parse_flag_include_comments)
tok_options |= TOK_SHOW_COMMENTS;
tokenizer_t tok = tokenizer_t(str.c_str(), tok_options);
for (; tok_has_next(&tok) && ! this->parser->fatal_errored; tok_next(&tok)) for (; tok_has_next(&tok) && ! this->parser->fatal_errored; tok_next(&tok))
{ {
token_type tok_type = static_cast<token_type>(tok_last_type(&tok)); token_type tok_type = static_cast<token_type>(tok_last_type(&tok));
const wchar_t *tok_txt = tok_last(&tok); const wchar_t *tok_txt = tok_last(&tok);
int tok_start = tok_get_pos(&tok); int tok_start = tok_get_pos(&tok);
size_t tok_extent = tok_get_extent(&tok); size_t tok_extent = tok_get_extent(&tok);
assert(tok_extent < 10000000); //paranoia
if (tok_type == TOK_ERROR)
{
fprintf(stderr, "Tokenizer error\n");
break;
}
parse_token_t token = parse_token_from_tokenizer_token(tok_type); parse_token_t token = parse_token_from_tokenizer_token(tok_type);
token.tokenizer_type = tok_type; token.tokenizer_type = tok_type;
@ -621,12 +731,31 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_
this->parser->accept_token(token, str); this->parser->accept_token(token, str);
if (this->parser->fatal_errored) if (this->parser->fatal_errored)
{
if (parse_flags & parse_flag_continue_after_error)
{
/* Mark an error and then keep going */
token.type = parse_special_type_parse_error;
token.keyword = parse_keyword_none;
this->parser->accept_token(token, str);
this->parser->reset();
}
else
{
/* Bail out */
break; break;
} }
}
}
// Teach each node where its source range is
this->parser->determine_node_ranges();
#if 0
wcstring result = dump_tree(this->parser->nodes, str); wcstring result = dump_tree(this->parser->nodes, str);
fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str()); fprintf(stderr, "Tree (%ld nodes):\n%ls", this->parser->nodes.size(), result.c_str());
fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t)); fprintf(stderr, "%lu nodes, node size %lu, %lu bytes\n", this->parser->nodes.size(), sizeof(parse_node_t), this->parser->nodes.size() * sizeof(parse_node_t));
#endif
if (output != NULL) if (output != NULL)
{ {
@ -642,3 +771,40 @@ bool parse_t::parse(const wcstring &str, parse_node_tree_t *output, parse_error_
return ! this->parser->fatal_errored; return ! this->parser->fatal_errored;
} }
// Returns the which'th child of the given node, or NULL if that child's offset lies beyond the end of the tree.
// 'which' must be less than the parent's child count (asserted).
// If expected_type is anything other than token_type_invalid, a returned child is asserted to have that type.
const parse_node_t *parse_node_tree_t::get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type) const
{
    PARSE_ASSERT(which < parent.child_count);

    const node_offset_t child_offset = parent.child_offset(which);
    if (! (child_offset < this->size()))
    {
        // No node is stored at that offset
        return NULL;
    }

    const parse_node_t *child = &this->at(child_offset);

    // If we were given an expected type, then the node must have it
    assert(expected_type == token_type_invalid || expected_type == child->type);
    return child;
}
static void find_nodes_recursive(const parse_node_tree_t &tree, const parse_node_t &parent, parse_token_type_t type, parse_node_tree_t::parse_node_list_t *result)
{
if (parent.type == type) result->push_back(&parent);
for (size_t i=0; i < parent.child_count; i++)
{
const parse_node_t *child = tree.get_child(parent, i);
assert(child != NULL);
find_nodes_recursive(tree, *child, type, result);
}
}
// Collects all nodes of the given type in the subtree rooted at 'parent' (the root included, if it matches).
parse_node_tree_t::parse_node_list_t parse_node_tree_t::find_nodes(const parse_node_t &parent, parse_token_type_t type) const
{
    parse_node_list_t matches;
    find_nodes_recursive(*this, parent, type, &matches);
    return matches;
}

View file

@ -15,7 +15,7 @@
#include <vector> #include <vector>
#define PARSE_ASSERT(a) assert(a) #define PARSE_ASSERT(a) assert(a)
#define PARSER_DIE() exit_without_destructors(-1) #define PARSER_DIE() do { fprintf(stderr, "Parser dying!\n"); exit_without_destructors(-1); } while (0)
class parse_node_t; class parse_node_t;
class parse_node_tree_t; class parse_node_tree_t;
@ -36,6 +36,18 @@ struct parse_error_t
}; };
typedef std::vector<parse_error_t> parse_error_list_t; typedef std::vector<parse_error_t> parse_error_list_t;
/* Option bits for parsing. Each flag is a distinct bit, so flags can be OR-ed together into a parse_tree_flags_t. */
enum
{
    /* No options */
    parse_flag_none = 0x0,

    /* Attempt to build a "parse tree" no matter what. This may result in a 'forest' of disconnected trees. This is intended to be used by syntax highlighting. */
    parse_flag_continue_after_error = 0x1,

    /* Include comment tokens */
    parse_flag_include_comments = 0x2
};

/* Holds a bitwise OR of the parse_flag_* values above */
typedef unsigned int parse_tree_flags_t;
class parse_ll_t; class parse_ll_t;
class parse_t class parse_t
{ {
@ -43,7 +55,8 @@ class parse_t
public: public:
parse_t(); parse_t();
bool parse(const wcstring &str, parse_node_tree_t *output, parse_error_list_t *errors); ~parse_t();
bool parse(const wcstring &str, parse_tree_flags_t flags, parse_node_tree_t *output, parse_error_list_t *errors, bool log_it = false);
}; };
enum parse_token_type_t enum parse_token_type_t
@ -80,6 +93,9 @@ enum parse_token_type_t
symbol_argument_list_nonempty, symbol_argument_list_nonempty,
symbol_argument_list, symbol_argument_list,
symbol_argument,
symbol_redirection,
symbol_optional_background, symbol_optional_background,
// Terminal types // Terminal types
@ -90,6 +106,11 @@ enum parse_token_type_t
parse_token_type_end, parse_token_type_end,
parse_token_type_terminate, parse_token_type_terminate,
// Very special terminal types that don't appear in the production list
parse_special_type_parse_error,
parse_special_type_tokenizer_error,
parse_special_type_comment,
LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate, LAST_TOKEN_OR_SYMBOL = parse_token_type_terminate,
FIRST_PARSE_TOKEN_TYPE = parse_token_type_string FIRST_PARSE_TOKEN_TYPE = parse_token_type_string
}; };
@ -145,7 +166,7 @@ public:
wcstring describe(void) const; wcstring describe(void) const;
/* Constructor */ /* Constructor */
explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(0), source_length(0), child_start(0), child_count(0), tag(0) explicit parse_node_t(parse_token_type_t ty) : type(ty), source_start(-1), source_length(0), child_start(0), child_count(0), tag(0)
{ {
} }
@ -154,10 +175,23 @@ public:
PARSE_ASSERT(which < child_count); PARSE_ASSERT(which < child_count);
return child_start + which; return child_start + which;
} }
/* Whether this node has a source range. A source_start of -1 (converted to size_t) means "no source". */
bool has_source() const
{
    const size_t no_source = static_cast<size_t>(-1);
    return source_start != no_source;
}
}; };
class parse_node_tree_t : public std::vector<parse_node_t> class parse_node_tree_t : public std::vector<parse_node_t>
{ {
public:
/* Get the node corresponding to a child of the given node, or NULL if there is no such child.
   'which' is the index of the child within the parent and must be less than the parent's child count.
   If expected_type is provided, assert that the node has that type. */
const parse_node_t *get_child(const parse_node_t &parent, node_offset_t which, parse_token_type_t expected_type = token_type_invalid) const;
/* Find all the nodes of a given type underneath a given node. The given node itself is included if it has that type.
   The returned pointers refer to nodes stored in this tree. */
typedef std::vector<const parse_node_t *> parse_node_list_t;
parse_node_list_t find_nodes(const parse_node_t &parent, parse_token_type_t type) const;
}; };
@ -214,7 +248,8 @@ class parse_node_tree_t : public std::vector<parse_node_t>
arguments_or_redirections_list = <empty> | arguments_or_redirections_list = <empty> |
argument_or_redirection arguments_or_redirections_list argument_or_redirection arguments_or_redirections_list
argument_or_redirection = redirection | <TOK_STRING> argument_or_redirection = argument | redirection
argument = <TOK_STRING>
redirection = <TOK_REDIRECTION> redirection = <TOK_REDIRECTION>
terminator = <TOK_END> | <TOK_BACKGROUND> terminator = <TOK_END> | <TOK_BACKGROUND>