Migrate next_parse_token into token_stream_t

Cleaning up parse_tree.cpp with an eye to removing it.
ridiculousfish 2020-07-07 14:01:01 -07:00
parent 72e35af381
commit 5308223212
3 changed files with 114 additions and 124 deletions


@ -23,6 +23,84 @@ static tok_flags_t tokenizer_flags_from_parse_flags(parse_tree_flags_t flags) {
    return tok_flags;
}

// Given an expanded string, returns any keyword it matches.
static parse_keyword_t keyword_with_name(const wchar_t *name) {
    return str_to_enum(name, keyword_enum_map, keyword_enum_map_len);
}

static bool is_keyword_char(wchar_t c) {
    return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') || (c >= L'0' && c <= L'9') ||
           c == L'\'' || c == L'"' || c == L'\\' || c == L'\n' || c == L'!';
}
/// Given a token, returns the keyword it matches, or parse_keyword_t::none.
static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token) {
    // Only strings can be keywords.
    if (tok != token_type_t::string) {
        return parse_keyword_t::none;
    }

    // If tok_txt is clean (which most are), we can compare it directly. Otherwise we have to
    // expand it. We only expand quotes, and we don't want to do expensive expansions like tilde
    // expansions. So we do our own "cleanliness" check; if we find a character not in our allowed
    // set we know it's not a keyword, and if we never find a quote we don't have to expand! Note
    // that this lowercase set could be shrunk to be just the characters that are in keywords.
    parse_keyword_t result = parse_keyword_t::none;
    bool needs_expand = false, all_chars_valid = true;
    const wchar_t *tok_txt = token.c_str();
    for (size_t i = 0; tok_txt[i] != L'\0'; i++) {
        wchar_t c = tok_txt[i];
        if (!is_keyword_char(c)) {
            all_chars_valid = false;
            break;
        }
        // If we encounter a quote, we need expansion.
        needs_expand = needs_expand || c == L'"' || c == L'\'' || c == L'\\';
    }
    if (all_chars_valid) {
        // Expand if necessary.
        if (!needs_expand) {
            result = keyword_with_name(tok_txt);
        } else {
            wcstring storage;
            if (unescape_string(tok_txt, &storage, 0)) {
                result = keyword_with_name(storage.c_str());
            }
        }
    }
    return result;
}
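To make the two paths concrete, here is an illustrative set of calls (not part of the diff; only parse_keyword_t::none is named above, so the other enumerators are left implicit):

    // Fast path: `if` contains only keyword characters and no quotes, so the
    // token text is handed to keyword_with_name() directly.
    parse_keyword_t k1 = keyword_for_token(token_type_t::string, L"if");
    // Slow path: `"if"` contains quotes, so it is unescaped to `if` first;
    // k2 should compare equal to k1.
    parse_keyword_t k2 = keyword_for_token(token_type_t::string, L"\"if\"");
    // `-` is outside the allowed character set, so the cleanliness check fails
    // immediately and neither expansion nor lookup happens: k3 is none.
    parse_keyword_t k3 = keyword_for_token(token_type_t::string, L"--help");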
/// Convert from tokenizer_t's token type to a parse_token_t type.
static parse_token_type_t parse_token_type_from_tokenizer_token(
    enum token_type_t tokenizer_token_type) {
    switch (tokenizer_token_type) {
        case token_type_t::string:
            return parse_token_type_string;
        case token_type_t::pipe:
            return parse_token_type_pipe;
        case token_type_t::andand:
            return parse_token_type_andand;
        case token_type_t::oror:
            return parse_token_type_oror;
        case token_type_t::end:
            return parse_token_type_end;
        case token_type_t::background:
            return parse_token_type_background;
        case token_type_t::redirect:
            return parse_token_type_redirection;
        case token_type_t::error:
            return parse_special_type_tokenizer_error;
        case token_type_t::comment:
            return parse_special_type_comment;
    }
    FLOGF(error, L"Bad token type %d passed to %s", static_cast<int>(tokenizer_token_type),
          __FUNCTION__);
    DIE("bad token type");
    return token_type_invalid;
}

/// A token stream generates a sequence of parser tokens, permitting arbitrary lookahead.
class token_stream_t {
   public:
@ -64,10 +142,11 @@ class token_stream_t {
    // Helper to mask our circular buffer.
    static constexpr size_t mask(size_t idx) { return idx % kMaxLookahead; }

    /// \return the next parse token from the tokenizer.
    /// This consumes and stores comments.
    parse_token_t next_from_tok() {
        for (;;) {
            parse_token_t res = advance_1();
            if (res.type == parse_special_type_comment) {
                comment_ranges.push_back(res.range());
                continue;
@ -76,6 +155,39 @@
            }
            return res;
        }
    }
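To make the comment handling concrete, an illustrative trace over an assumed input (not part of the diff):

    // Over the source `echo hi  # trailing`, successive next_from_tok() calls
    // return the tokens for `echo` and `hi` and then a terminate token; the
    // comment token is consumed internally and only its source range survives,
    // appended to comment_ranges.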
    /// \return a new parse token, advancing the tokenizer.
    /// This returns comments.
    parse_token_t advance_1() {
        auto mtoken = tok_.next();
        if (!mtoken.has_value()) {
            return parse_token_t{parse_token_type_terminate};
        }
        const tok_t &token = *mtoken;

        // Set the type, keyword, and whether there's a dash prefix. Note that this is quite
        // sketchy, because it ignores quotes. This is the historical behavior. For example,
        // `builtin --names` lists builtins, but `builtin "--names"` attempts to run --names as a
        // command. Amazingly as of this writing (10/12/13) nobody seems to have noticed this.
        // Squint at it really hard and it even starts to look like a feature.
        parse_token_t result{parse_token_type_from_tokenizer_token(token.type)};
        const wcstring &text = tok_.copy_text_of(token, &storage_);
        result.keyword = keyword_for_token(token.type, text);
        result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
        result.is_help_argument = (text == L"-h" || text == L"--help");
        result.is_newline = (result.type == parse_token_type_end && text == L"\n");
        result.may_be_variable_assignment = variable_assignment_equals_pos(text).has_value();
        result.tok_error = token.error;

        // These assertions are totally bogus. Basically our tokenizer works in size_t but we
        // work in uint32_t to save some space. If we have a source file larger than 4 GB, we'll
        // probably just crash.
        assert(token.offset < SOURCE_OFFSET_INVALID);
        result.source_start = static_cast<source_offset_t>(token.offset);
        assert(token.length <= SOURCE_OFFSET_INVALID);
        result.source_length = static_cast<source_offset_t>(token.length);
        return result;
    }
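For a sense of the fields advance_1() fills in, consider the source `echo --help &`; successive calls would yield roughly the following (an illustrative trace, not actual fish output):

    // { type: parse_token_type_string,     text: "echo" }     // keyword: none
    // { type: parse_token_type_string,     text: "--help",
    //   has_dash_prefix: true, is_help_argument: true }
    // { type: parse_token_type_background, text: "&" }
    // { type: parse_token_type_terminate }                    // tokenizer exhausted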
    // The maximum number of lookahead tokens supported.
    static constexpr size_t kMaxLookahead = 2;
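The mask() helper and kMaxLookahead above implement a tiny ring buffer. Here is a self-contained sketch of that pattern, with hypothetical member names and a plain int standing in for parse_token_t (the real class buffers parse_token_t values produced by next_from_tok()):

    #include <cassert>
    #include <cstddef>

    class lookahead_stream_t {
       public:
        // Return the next token, consuming it.
        int pop() {
            if (count_ == 0) return next_from_source();
            int result = buffer_[mask(start_)];
            start_++;
            count_--;
            return result;
        }

        // Return the token `idx` positions ahead, without consuming anything.
        int peek(size_t idx = 0) {
            assert(idx < kMaxLookahead && "Trying to look too far ahead");
            while (idx >= count_) {
                buffer_[mask(start_ + count_)] = next_from_source();
                count_++;
            }
            return buffer_[mask(start_ + idx)];
        }

       private:
        // The maximum lookahead; mask() wraps indices into the buffer.
        static constexpr size_t kMaxLookahead = 2;
        static constexpr size_t mask(size_t idx) { return idx % kMaxLookahead; }

        // Stand-in for next_from_tok(); here just an incrementing counter.
        int next_from_source() { return counter_++; }

        int buffer_[kMaxLookahead] = {};
        size_t start_ = 0;  // index of the oldest buffered token
        size_t count_ = 0;  // number of buffered tokens
        int counter_ = 0;
    };

Because peek(1) may buffer two tokens before anything is consumed, the buffer needs exactly kMaxLookahead slots; the modulo in mask() lets start_ grow without bound while indices stay in range.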


@ -202,125 +202,6 @@ wcstring parse_token_t::user_presentable_description() const {
    return token_type_user_presentable_description(type, keyword);
}

/// Convert from tokenizer_t's token type to a parse_token_t type.
static inline parse_token_type_t parse_token_type_from_tokenizer_token(
    enum token_type_t tokenizer_token_type) {
    switch (tokenizer_token_type) {
        case token_type_t::string:
            return parse_token_type_string;
        case token_type_t::pipe:
            return parse_token_type_pipe;
        case token_type_t::andand:
            return parse_token_type_andand;
        case token_type_t::oror:
            return parse_token_type_oror;
        case token_type_t::end:
            return parse_token_type_end;
        case token_type_t::background:
            return parse_token_type_background;
        case token_type_t::redirect:
            return parse_token_type_redirection;
        case token_type_t::error:
            return parse_special_type_tokenizer_error;
        case token_type_t::comment:
            return parse_special_type_comment;
    }
    FLOGF(error, L"Bad token type %d passed to %s", static_cast<int>(tokenizer_token_type),
          __FUNCTION__);
    DIE("bad token type");
    return token_type_invalid;
}
// Given an expanded string, returns any keyword it matches.
static inline parse_keyword_t keyword_with_name(const wchar_t *name) {
    return str_to_enum(name, keyword_enum_map, keyword_enum_map_len);
}

static bool is_keyword_char(wchar_t c) {
    return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') || (c >= L'0' && c <= L'9') ||
           c == L'\'' || c == L'"' || c == L'\\' || c == L'\n' || c == L'!';
}
/// Given a token, returns the keyword it matches, or parse_keyword_t::none.
static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token) {
    // Only strings can be keywords.
    if (tok != token_type_t::string) {
        return parse_keyword_t::none;
    }

    // If tok_txt is clean (which most are), we can compare it directly. Otherwise we have to
    // expand it. We only expand quotes, and we don't want to do expensive expansions like tilde
    // expansions. So we do our own "cleanliness" check; if we find a character not in our allowed
    // set we know it's not a keyword, and if we never find a quote we don't have to expand! Note
    // that this lowercase set could be shrunk to be just the characters that are in keywords.
    parse_keyword_t result = parse_keyword_t::none;
    bool needs_expand = false, all_chars_valid = true;
    const wchar_t *tok_txt = token.c_str();
    for (size_t i = 0; tok_txt[i] != L'\0'; i++) {
        wchar_t c = tok_txt[i];
        if (!is_keyword_char(c)) {
            all_chars_valid = false;
            break;
        }
        // If we encounter a quote, we need expansion.
        needs_expand = needs_expand || c == L'"' || c == L'\'' || c == L'\\';
    }
    if (all_chars_valid) {
        // Expand if necessary.
        if (!needs_expand) {
            result = keyword_with_name(tok_txt);
        } else {
            wcstring storage;
            if (unescape_string(tok_txt, &storage, 0)) {
                result = keyword_with_name(storage.c_str());
            }
        }
    }
    return result;
}
/// Terminal token.
static constexpr parse_token_t kTerminalToken = {parse_token_type_terminate};

static inline bool is_help_argument(const wcstring &txt) {
    return txt == L"-h" || txt == L"--help";
}
/// Return a new parse token, advancing the tokenizer.
parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token, wcstring *storage) {
    *out_token = tok->next();
    if (!out_token->has_value()) {
        return kTerminalToken;
    }
    const tok_t &token = **out_token;

    // Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy,
    // because it ignores quotes. This is the historical behavior. For example, `builtin --names`
    // lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as
    // of this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and
    // it even starts to look like a feature.
    parse_token_t result{parse_token_type_from_tokenizer_token(token.type)};
    const wcstring &text = tok->copy_text_of(token, storage);
    result.keyword = keyword_for_token(token.type, text);
    result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
    result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
    result.is_newline = (result.type == parse_token_type_end && text == L"\n");
    result.may_be_variable_assignment = variable_assignment_equals_pos(text).has_value();
    result.tok_error = token.error;

    // These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
    // uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
    // crash.
    assert(token.offset < SOURCE_OFFSET_INVALID);
    result.source_start = static_cast<source_offset_t>(token.offset);
    assert(token.length <= SOURCE_OFFSET_INVALID);
    result.source_length = static_cast<source_offset_t>(token.length);
    return result;
}
parsed_source_t::parsed_source_t(wcstring s, ast::ast_t &&ast)
    : src(std::move(s)), ast(make_unique<ast::ast_t>(std::move(ast))) {}


@ -50,9 +50,6 @@ struct parse_token_t {
    constexpr parse_token_t(parse_token_type_t type) : type(type) {}
};
/// Return a new parse token, advancing the tokenizer.
parse_token_t next_parse_token(tokenizer_t *tok, maybe_t<tok_t> *out_token, wcstring *storage);

const wchar_t *token_type_description(parse_token_type_t type);
const wchar_t *keyword_description(parse_keyword_t type);