Rework decision process for whether to interpret keywords as structural

or as commands (for LL parser). This will allow 'builtin --' to parse as a
plain statement running 'builtin', instead of as a decorated statement running '--'.
This commit is contained in:
ridiculousfish 2013-10-12 02:46:49 -07:00
parent 77e358a001
commit ddec870d25
5 changed files with 72 additions and 93 deletions

View file

@ -2004,6 +2004,7 @@ static void test_new_parser_ll2(void)
{L"command", L"command", L"", parse_statement_decoration_none},
{L"command -", L"command", L"-", parse_statement_decoration_none},
{L"command --", L"command", L"--", parse_statement_decoration_none},
{L"builtin --names", L"builtin", L"--names", parse_statement_decoration_none},
{L"function", L"function", L"", parse_statement_decoration_none},
{L"function --help", L"function", L"--help", parse_statement_decoration_none}
};

View file

@ -26,37 +26,9 @@ static bool production_is_valid(const production_options_t production_list, prod
return nonempty_found;
}
/* Decides whether a token (typically the second token) means that the token before it should be run as an ordinary command instead of being given its special role. For example, 'command --help' should invoke the 'command' builtin with '--help', rather than run a command named '--help'.

   type and keyword describe the following token. If naked_invocation_invokes_help is true, then a following token that is anything other than a string (including an invalid token) also counts; this is the case where the user ran e.g. 'command' with no arguments.
*/
static inline bool token_implies_previous_keyword_is_command(parse_token_type_t type, parse_keyword_t keyword, bool naked_invocation_invokes_help)
{
    /* One of the option-like keywords always marks the previous token as a command */
    switch (keyword)
    {
        case parse_keyword_dash:
        case parse_keyword_dashdash:
        case parse_keyword_dash_h:
        case parse_keyword_dashdash_help:
            return true;

        default:
            break;
    }

    /* Otherwise only a naked invocation (no string argument follows) qualifies */
    return naked_invocation_invokes_help && type != parse_token_type_string;
}
#define PRODUCTIONS(sym) static const production_options_t productions_##sym
#define RESOLVE(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, parse_token_type_t token_type2, parse_keyword_t token_keyword2)
#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (parse_token_type_t token_type, parse_keyword_t token_keyword, parse_token_type_t token_type2, parse_keyword_t token_keyword2) { return 0; }
#define RESOLVE(sym) static production_option_idx_t resolve_##sym (const parse_token_t &token1, const parse_token_t &token2)
#define RESOLVE_ONLY(sym) static production_option_idx_t resolve_##sym (const parse_token_t &input1, const parse_token_t &input2) { return 0; }
#define KEYWORD(x) ((x) + LAST_TOKEN_OR_SYMBOL + 1)
@ -71,11 +43,11 @@ PRODUCTIONS(job_list) =
RESOLVE(job_list)
{
switch (token_type)
switch (token1.type)
{
case parse_token_type_string:
// 'end' is special
switch (token_keyword)
switch (token1.keyword)
{
case parse_keyword_end:
case parse_keyword_else:
@ -120,7 +92,7 @@ PRODUCTIONS(job_continuation) =
};
RESOLVE(job_continuation)
{
switch (token_type)
switch (token1.type)
{
case parse_token_type_pipe:
// Pipe, continuation
@ -143,21 +115,29 @@ PRODUCTIONS(statement) =
};
RESOLVE(statement)
{
// Go to decorated statements if the subsequent token looks like '--help'
// Go to decorated statements if the subsequent token looks like '--'
// If we are 'begin', then we expect to be invoked with no arguments. But if we are anything else, we require an argument, so do the same thing if the subsequent token is a line end.
if (token_type == parse_token_type_string)
if (token1.type == parse_token_type_string)
{
bool naked_invocation_invokes_help = (token_keyword != parse_keyword_begin && token_keyword != parse_keyword_end);
if (token_implies_previous_keyword_is_command(token_type2, token_keyword2, naked_invocation_invokes_help))
// If the next token looks like an option (starts with a dash), then parse it as a decorated statement
if (token2.has_dash_prefix)
{
return 4; //decorated statement
}
return 4;
}
switch (token_type)
// Likewise if the next token doesn't look like an argument at all. This corresponds to e.g. a "naked if".
bool naked_invocation_invokes_help = (token1.keyword != parse_keyword_begin && token1.keyword != parse_keyword_end);
if (naked_invocation_invokes_help && token2.type != parse_token_type_string)
{
return 4;
}
}
switch (token1.type)
{
case parse_token_type_string:
switch (token_keyword)
switch (token1.keyword)
{
case parse_keyword_and:
case parse_keyword_or:
@ -188,10 +168,6 @@ RESOLVE(statement)
case parse_keyword_command:
case parse_keyword_builtin:
case parse_keyword_case:
case parse_keyword_dash:
case parse_keyword_dashdash:
case parse_keyword_dash_h:
case parse_keyword_dashdash_help:
return 4;
}
break;
@ -227,7 +203,7 @@ PRODUCTIONS(else_clause) =
};
RESOLVE(else_clause)
{
switch (token_keyword)
switch (token1.keyword)
{
case parse_keyword_else:
return 1;
@ -243,7 +219,7 @@ PRODUCTIONS(else_continuation) =
};
RESOLVE(else_continuation)
{
switch (token_keyword)
switch (token1.keyword)
{
case parse_keyword_if:
return 0;
@ -266,8 +242,8 @@ PRODUCTIONS(case_item_list) =
};
RESOLVE(case_item_list)
{
if (token_keyword == parse_keyword_case) return 1;
else if (token_type == parse_token_type_end) return 2; //empty line
if (token1.keyword == parse_keyword_case) return 1;
else if (token1.type == parse_token_type_end) return 2; //empty line
else return 0;
}
@ -284,7 +260,7 @@ PRODUCTIONS(argument_list) =
};
RESOLVE(argument_list)
{
switch (token_type)
switch (token1.type)
{
case parse_token_type_string:
return 1;
@ -308,7 +284,7 @@ PRODUCTIONS(block_header) =
};
RESOLVE(block_header)
{
switch (token_keyword)
switch (token1.keyword)
{
case parse_keyword_else:
return NO_PRODUCTION;
@ -358,7 +334,7 @@ PRODUCTIONS(boolean_statement) =
};
RESOLVE(boolean_statement)
{
switch (token_keyword)
switch (token1.keyword)
{
case parse_keyword_and:
return 0;
@ -379,11 +355,13 @@ PRODUCTIONS(decorated_statement) =
};
RESOLVE(decorated_statement)
{
/* If this is e.g. 'command --help' then the command is 'command' and not a decoration */
if (token_implies_previous_keyword_is_command(token_type2, token_keyword2, true /* naked_invocation_is_help */))
/* If this is e.g. 'command --help' then the command is 'command' and not a decoration. If the second token is not a string, then this is a naked 'command' and we should execute it as undecorated. */
if (token2.type != parse_token_type_string || token2.has_dash_prefix)
{
return 0;
}
switch (token_keyword)
switch (token1.keyword)
{
default:
return 0;
@ -407,7 +385,7 @@ PRODUCTIONS(arguments_or_redirections_list) =
};
RESOLVE(arguments_or_redirections_list)
{
switch (token_type)
switch (token1.type)
{
case parse_token_type_string:
case parse_token_type_redirection:
@ -424,7 +402,7 @@ PRODUCTIONS(argument_or_redirection) =
};
RESOLVE(argument_or_redirection)
{
switch (token_type)
switch (token1.type)
{
case parse_token_type_string:
return 0;
@ -455,7 +433,7 @@ PRODUCTIONS(optional_background) =
RESOLVE(optional_background)
{
switch (token_type)
switch (token1.type)
{
case parse_token_type_background:
return 1;
@ -465,17 +443,17 @@ RESOLVE(optional_background)
}
#define TEST(sym) case (symbol_##sym): production_list = & productions_ ## sym ; resolver = resolve_ ## sym ; break;
const production_t *parse_productions::production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, parse_token_type_t input_type2, parse_keyword_t input_keyword2, production_option_idx_t *out_which_production, wcstring *out_error_text)
const production_t *parse_productions::production_for_token(parse_token_type_t node_type, const parse_token_t &input1, const parse_token_t &input2, production_option_idx_t *out_which_production, wcstring *out_error_text)
{
bool log_it = false;
if (log_it)
{
fprintf(stderr, "Resolving production for %ls with input type %ls <%ls>\n", token_type_description(node_type).c_str(), token_type_description(input_type).c_str(), keyword_description(input_keyword).c_str());
fprintf(stderr, "Resolving production for %ls with input token <%ls>\n", token_type_description(node_type).c_str(), input1.describe().c_str());
}
/* Fetch the list of productions and the function to resolve them */
const production_options_t *production_list = NULL;
production_option_idx_t (*resolver)(parse_token_type_t token_type, parse_keyword_t token_keyword, parse_token_type_t token_type2, parse_keyword_t token_keyword2) = NULL;
production_option_idx_t (*resolver)(const parse_token_t &input1, const parse_token_t &input2) = NULL;
switch (node_type)
{
TEST(job_list)
@ -533,7 +511,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
PARSE_ASSERT(resolver != NULL);
const production_t *result = NULL;
production_option_idx_t which = resolver(input_type, input_keyword, input_type2, input_keyword2);
production_option_idx_t which = resolver(input1, input2);
if (log_it)
{
@ -545,7 +523,7 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
{
if (log_it)
{
fprintf(stderr, "Token type '%ls' has no production for input type '%ls', keyword '%ls' (in %s)\n", token_type_description(node_type).c_str(), token_type_description(input_type).c_str(), keyword_description(input_keyword).c_str(), __FUNCTION__);
fprintf(stderr, "Node type '%ls' has no production for input '%ls' (in %s)\n", token_type_description(node_type).c_str(), input1.describe().c_str(), __FUNCTION__);
}
result = NULL;
}
@ -557,3 +535,4 @@ const production_t *parse_productions::production_for_token(parse_token_type_t n
*out_which_production = which;
return result;
}

View file

@ -14,7 +14,6 @@ namespace parse_productions
#define MAX_PRODUCTIONS 5
#define MAX_SYMBOLS_PER_PRODUCTION 5
typedef uint32_t production_tag_t;
/* A production is an array of unsigned char. Symbols are encoded directly as their symbol value. Keywords are encoded with an offset of LAST_TOKEN_OR_SYMBOL + 1. So essentially we glom together keywords and symbols. */
@ -63,7 +62,7 @@ inline bool production_element_is_valid(production_element_t elem)
}
/* Fetch a production. We are passed two input tokens. The first input token is guaranteed to not be invalid; the second token may be invalid if there are no more tokens. */
const production_t *production_for_token(parse_token_type_t node_type, parse_token_type_t input_type, parse_keyword_t input_keyword, parse_token_type_t input_type2, parse_keyword_t input_keyword2, production_option_idx_t *out_idx, wcstring *out_error_text);
const production_t *production_for_token(parse_token_type_t node_type, const parse_token_t &input1, const parse_token_t &input2, production_option_idx_t *out_which_production, wcstring *out_error_text);
}

View file

@ -179,24 +179,18 @@ wcstring parse_node_t::describe(void) const
return result;
}
/** A struct representing the token type passed to */
struct parse_token_t
{
enum parse_token_type_t type; // The type of the token as represented by the parser
enum parse_keyword_t keyword; // Any keyword represented by this parser
size_t source_start;
size_t source_length;
wcstring describe() const
{
/** Returns a string description of the given parse token */
wcstring parse_token_t::describe() const
{
wcstring result = token_type_description(type);
if (keyword != parse_keyword_none)
{
append_format(result, L" <%ls>", keyword_description(keyword).c_str());
}
return result;
}
};
}
/* Convert from tokenizer_t's token type to a parse_token_t type */
static inline parse_token_type_t parse_token_type_from_tokenizer_token(enum token_type tokenizer_token_type)
@ -720,7 +714,7 @@ void parse_ll_t::accept_tokens(parse_token_t token1, parse_token_t token2)
// Get the production for the top of the stack
parse_stack_element_t &stack_elem = symbol_stack.back();
parse_node_t &node = nodes.at(stack_elem.node_idx);
const production_t *production = production_for_token(stack_elem.type, token1.type, token1.keyword, token2.type, token2.keyword, &node.production_idx, NULL /* error text */);
const production_t *production = production_for_token(stack_elem.type, token1, token2, &node.production_idx, NULL /* error text */);
if (production == NULL)
{
if (should_generate_error_messages)
@ -783,11 +777,7 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
{L"or", parse_keyword_or},
{L"not", parse_keyword_not},
{L"command", parse_keyword_command},
{L"builtin", parse_keyword_builtin},
{L"-", parse_keyword_dash},
{L"--", parse_keyword_dashdash},
{L"-h", parse_keyword_dash_h},
{L"--help", parse_keyword_dashdash_help}
{L"builtin", parse_keyword_builtin}
};
for (size_t i=0; i < sizeof keywords / sizeof *keywords; i++)
@ -803,7 +793,7 @@ static parse_keyword_t keyword_for_token(token_type tok, const wchar_t *tok_txt)
}
/* Placeholder invalid token */
static const parse_token_t kInvalidToken = {token_type_invalid, parse_keyword_none, -1, -1};
static const parse_token_t kInvalidToken = {token_type_invalid, parse_keyword_none, false, -1, -1};
/* Return a new parse token, advancing the tokenizer */
static inline parse_token_t next_parse_token(tokenizer_t *tok)
@ -820,10 +810,13 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok)
const wchar_t *tok_txt = tok_last(tok);
parse_token_t result;
/* Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy, because it ignores quotes. This is the historical behavior. For example, `builtin --names` lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as of this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it even starts to look like a feature. */
result.type = parse_token_type_from_tokenizer_token(tok_type);
result.keyword = keyword_for_token(tok_type, tok_txt);
result.has_dash_prefix = (tok_txt[0] == L'-');
result.source_start = (size_t)tok_start;
result.source_length = tok_extent;
result.keyword = keyword_for_token(tok_type, tok_txt);
tok_next(tok);
return result;

View file

@ -113,13 +113,20 @@ enum parse_keyword_t
parse_keyword_command,
parse_keyword_builtin,
/* The following are not really keywords but are necessary for e.g. "command --help" to work */
parse_keyword_dash,
parse_keyword_dashdash,
parse_keyword_dash_h,
parse_keyword_dashdash_help,
LAST_KEYWORD = parse_keyword_builtin
};
LAST_KEYWORD = parse_keyword_dashdash_help
/** A struct representing the token type that we use internally. It carries the token's type, any keyword it spells, a flag for a leading dash, and the token's location in the source. Note that the field order matters to any aggregate initializer of this struct. */
struct parse_token_t
{
enum parse_token_type_t type; // The type of the token as represented by the parser
enum parse_keyword_t keyword; // Any keyword represented by this token; parse_keyword_none if there is none
bool has_dash_prefix; // Hackish: whether the source contains a dash prefix, i.e. the token text starts with '-'. This looks at the raw text, so it ignores quoting.
size_t source_start; // Where the token starts in the source (presumably an offset into the source text; confirm against the tokenizer)
size_t source_length; // The extent of the token in the source (presumably in characters; confirm against the tokenizer)
wcstring describe() const; // Returns a string description of this token: its type, followed by its keyword in angle brackets if it has one
};