#include "config.h" // IWYU pragma: keep #include "ast.h" #include #include #include #include #include #include "common.h" #include "enum_map.h" #include "flog.h" #include "parse_constants.h" #include "parse_tree.h" #include "tokenizer.h" #include "wutil.h" // IWYU pragma: keep namespace { /// \return tokenizer flags corresponding to parse tree flags. static tok_flags_t tokenizer_flags_from_parse_flags(parse_tree_flags_t flags) { tok_flags_t tok_flags = 0; // Note we do not need to respect parse_flag_show_blank_lines, no clients are interested in // them. if (flags & parse_flag_include_comments) tok_flags |= TOK_SHOW_COMMENTS; if (flags & parse_flag_accept_incomplete_tokens) tok_flags |= TOK_ACCEPT_UNFINISHED; if (flags & parse_flag_continue_after_error) tok_flags |= TOK_CONTINUE_AFTER_ERROR; return tok_flags; } // Given an expanded string, returns any keyword it matches. static parse_keyword_t keyword_with_name(const wcstring &name) { return str_to_enum(name.c_str(), keyword_enum_map, keyword_enum_map_len); } static bool is_keyword_char(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') || (c >= L'0' && c <= L'9') || c == L'\'' || c == L'"' || c == L'\\' || c == '\n' || c == L'!'; } /// Given a token, returns the keyword it matches, or parse_keyword_t::none. static parse_keyword_t keyword_for_token(token_type_t tok, const wcstring &token) { /* Only strings can be keywords */ if (tok != token_type_t::string) { return parse_keyword_t::none; } // If token is clean (which most are), we can compare it directly. Otherwise we have to expand // it. We only expand quotes, and we don't want to do expensive expansions like tilde // expansions. So we do our own "cleanliness" check; if we find a character not in our allowed // set we know it's not a keyword, and if we never find a quote we don't have to expand! Note // that this lowercase set could be shrunk to be just the characters that are in keywords. parse_keyword_t result = parse_keyword_t::none; bool needs_expand = false, all_chars_valid = true; for (wchar_t c : token) { if (!is_keyword_char(c)) { all_chars_valid = false; break; } // If we encounter a quote, we need expansion. needs_expand = needs_expand || c == L'"' || c == L'\'' || c == L'\\'; } if (all_chars_valid) { // Expand if necessary. if (!needs_expand) { result = keyword_with_name(token); } else { wcstring storage; if (unescape_string(token, &storage, 0)) { result = keyword_with_name(storage); } } } return result; } /// Convert from tokenizer_t's token type to a parse_token_t type. static parse_token_type_t parse_token_type_from_tokenizer_token( enum token_type_t tokenizer_token_type) { switch (tokenizer_token_type) { case token_type_t::string: return parse_token_type_t::string; case token_type_t::pipe: return parse_token_type_t::pipe; case token_type_t::andand: return parse_token_type_t::andand; case token_type_t::oror: return parse_token_type_t::oror; case token_type_t::end: return parse_token_type_t::end; case token_type_t::background: return parse_token_type_t::background; case token_type_t::redirect: return parse_token_type_t::redirection; case token_type_t::error: return parse_token_type_t::tokenizer_error; case token_type_t::comment: return parse_token_type_t::comment; } FLOGF(error, L"Bad token type %d passed to %s", static_cast(tokenizer_token_type), __FUNCTION__); DIE("bad token type"); return parse_token_type_t::invalid; } /// A token stream generates a sequence of parser tokens, permitting arbitrary lookahead. 
/// A token stream generates a sequence of parser tokens, permitting arbitrary lookahead.
class token_stream_t {
   public:
    explicit token_stream_t(const wcstring &src, parse_tree_flags_t flags,
                            std::vector<source_range_t> &comments)
        : src_(src),
          tok_(src_.c_str(), tokenizer_flags_from_parse_flags(flags)),
          comment_ranges(comments) {}

    /// \return the token at the given index, without popping it. If the token stream is exhausted,
    /// it will have parse_token_type_t::terminate. idx = 0 means the next token, idx = 1 means the
    /// next-next token, and so forth.
    /// We must have that idx < kMaxLookahead.
    const parse_token_t &peek(size_t idx = 0) {
        assert(idx < kMaxLookahead && "Trying to look too far ahead");
        while (idx >= count_) {
            lookahead_.at(mask(start_ + count_)) = next_from_tok();
            count_ += 1;
        }
        return lookahead_.at(mask(start_ + idx));
    }

    /// Pop the next token.
    parse_token_t pop() {
        if (count_ == 0) {
            return next_from_tok();
        }
        parse_token_t result = lookahead_[start_];
        start_ = mask(start_ + 1);
        count_ -= 1;
        return result;
    }

    /// Provide the original source code.
    const wcstring &source() const { return src_; }

   private:
    // Helper to mask our circular buffer.
    static constexpr size_t mask(size_t idx) { return idx % kMaxLookahead; }

    /// \return the next parse token from the tokenizer.
    /// This consumes and stores comments.
    parse_token_t next_from_tok() {
        for (;;) {
            parse_token_t res = advance_1();
            if (res.type == parse_token_type_t::comment) {
                comment_ranges.push_back(res.range());
                continue;
            }
            return res;
        }
    }

    /// \return a new parse token, advancing the tokenizer.
    /// This returns comments.
    parse_token_t advance_1() {
        auto mtoken = tok_.next();
        if (!mtoken.has_value()) {
            return parse_token_t{parse_token_type_t::terminate};
        }
        const tok_t &token = *mtoken;

        // Set the type, keyword, and whether there's a dash prefix. Note that this is quite
        // sketchy, because it ignores quotes. This is the historical behavior. For example,
        // `builtin --names` lists builtins, but `builtin "--names"` attempts to run --names as a
        // command. Amazingly as of this writing (10/12/13) nobody seems to have noticed this.
        // Squint at it really hard and it even starts to look like a feature.
        parse_token_t result{parse_token_type_from_tokenizer_token(token.type)};
        const wcstring &text = tok_.copy_text_of(token, &storage_);
        result.keyword = keyword_for_token(token.type, text);
        result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
        result.is_help_argument = (text == L"-h" || text == L"--help");
        result.is_newline = (result.type == parse_token_type_t::end && text == L"\n");
        result.may_be_variable_assignment = variable_assignment_equals_pos(text).has_value();
        result.tok_error = token.error;

        // These assertions are totally bogus. Basically our tokenizer works in size_t but we work
        // in uint32_t to save some space. If we have a source file larger than 4 GB, we'll
        // probably just crash.
        assert(token.offset < SOURCE_OFFSET_INVALID);
        result.source_start = static_cast<source_offset_t>(token.offset);
        assert(token.length <= SOURCE_OFFSET_INVALID);
        result.source_length = static_cast<source_offset_t>(token.length);

        if (token.error != tokenizer_error_t::none) {
            auto subtoken_offset = static_cast<source_offset_t>(token.error_offset_within_token);
            // Skip invalid tokens that have a zero length, especially if they are at EOF.
            if (subtoken_offset < result.source_length) {
                result.source_start += subtoken_offset;
                result.source_length = token.error_length;
            }
        }
        return result;
    }

    // The maximum lookahead supported.
    static constexpr size_t kMaxLookahead = 2;
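    // Illustrative walk-through of the lookahead window (hypothetical call sequence, not part
    // of the implementation):
    //   peek(0); peek(1);  // fills lookahead_[0] and lookahead_[1]
    //   pop();             // returns lookahead_[0]; start_ becomes 1
    //   peek(1);           // wraps: stored at lookahead_[mask(1 + 1)] == lookahead_[0]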
    // We implement a queue with a simple circular buffer.
    // Note that peek() returns an address, so we must not move elements which are peek'd.
    // This prevents using vector (which may reallocate).
    // Deque would work but is too heavyweight for just 2 items.
    std::array<parse_token_t, kMaxLookahead> lookahead_ = {
        {parse_token_type_t::invalid, parse_token_type_t::invalid}};

    // Starting index in our lookahead.
    // The "first" token is at this index.
    size_t start_ = 0;

    // Number of items in our lookahead.
    size_t count_ = 0;

    // A reference to the original source.
    const wcstring &src_;

    // The tokenizer to generate new tokens.
    tokenizer_t tok_;

    /// Any comment nodes are collected here.
    /// These are only collected if parse_flag_include_comments is set.
    std::vector<source_range_t> &comment_ranges;

    // Temporary storage.
    wcstring storage_;
};

}  // namespace

namespace ast {

/// Given a node which we believe to be some sort of block statement, attempt to return a source
/// range for the block's keyword (for, if, etc) and a user-presentable description. This is used
/// to provide better error messages. \return {source_range_t{}, nullptr} if we couldn't find it.
/// Note at this point the parse tree is incomplete; in particular parent nodes are not set.
static std::pair<source_range_t, const wchar_t *> find_block_open_keyword(const node_t *node) {
    const node_t *cursor = node;
    while (cursor != nullptr) {
        switch (cursor->type) {
            case type_t::block_statement:
                cursor = cursor->as<block_statement_t>()->header.contents.get();
                break;
            case type_t::for_header: {
                const auto *h = cursor->as<for_header_t>();
                return {h->kw_for.range, L"for loop"};
            }
            case type_t::while_header: {
                const auto *h = cursor->as<while_header_t>();
                return {h->kw_while.range, L"while loop"};
            }
            case type_t::function_header: {
                const auto *h = cursor->as<function_header_t>();
                return {h->kw_function.range, L"function definition"};
            }
            case type_t::begin_header: {
                const auto *h = cursor->as<begin_header_t>();
                return {h->kw_begin.range, L"begin"};
            }
            case type_t::if_statement: {
                const auto *h = cursor->as<if_statement_t>();
                return {h->if_clause.kw_if.range, L"if statement"};
            }
            case type_t::switch_statement: {
                const auto *h = cursor->as<switch_statement_t>();
                return {h->kw_switch.range, L"switch statement"};
            }
            default:
                return {source_range_t{}, nullptr};
        }
    }
    return {source_range_t{}, nullptr};
}

/// \return the decoration for this statement.
statement_decoration_t decorated_statement_t::decoration() const {
    if (!opt_decoration) {
        return statement_decoration_t::none;
    }
    switch (opt_decoration->kw) {
        case parse_keyword_t::kw_command:
            return statement_decoration_t::command;
        case parse_keyword_t::kw_builtin:
            return statement_decoration_t::builtin;
        case parse_keyword_t::kw_exec:
            return statement_decoration_t::exec;
        default:
            assert(0 && "Unexpected keyword in statement decoration");
            return statement_decoration_t::none;
    }
}

/// \return a string literal name for an ast type.
const wchar_t *ast_type_to_string(type_t type) {
    switch (type) {
#define ELEM(T)     \
    case type_t::T: \
        return L"" #T;
#include "ast_node_types.inc"
    }
    assert(0 && "unreachable");
    return L"(unknown)";
}

/// Delete an untyped node.
void node_deleter_t::operator()(node_t *n) {
    if (!n) return;
    switch (n->type) {
#define ELEM(T)                \
    case type_t::T:            \
        delete n->as<T##_t>(); \
        break;
#include "ast_node_types.inc"
    }
}

wcstring node_t::describe() const {
    wcstring res = ast_type_to_string(this->type);
    if (const auto *n = this->try_as<token_base_t>()) {
        append_format(res, L" '%ls'", token_type_description(n->type));
    } else if (const auto *n = this->try_as<keyword_base_t>()) {
        append_format(res, L" '%ls'", keyword_description(n->kw));
    }
    return res;
}
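// Illustrative: for the unbalanced input `while true`, find_block_open_keyword() above yields
// the range of `while` plus the description "while loop", which the keyword field visitor
// below turns into "Missing end to balance this while loop".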
/// From C++14.
template <bool B, typename T = void>
using enable_if_t = typename std::enable_if<B, T>::type;

namespace {
struct source_range_visitor_t {
    template <typename Node>
    enable_if_t<Node::Category == category_t::leaf> visit(const Node &node) {
        if (node.unsourced) any_unsourced = true;
        // Union with our range.
        if (node.range.length > 0) {
            if (total.length == 0) {
                total = node.range;
            } else {
                auto end =
                    std::max(total.start + total.length, node.range.start + node.range.length);
                total.start = std::min(total.start, node.range.start);
                total.length = end - total.start;
            }
        }
        return;
    }

    // Other node types recurse.
    template <typename Node>
    enable_if_t<Node::Category != category_t::leaf> visit(const Node &node) {
        node_visitor(*this).accept_children_of(node);
    }

    // Total range we have encountered.
    source_range_t total{0, 0};

    // Whether any node was found to be unsourced.
    bool any_unsourced{false};
};
}  // namespace

maybe_t<source_range_t> node_t::try_source_range() const {
    source_range_visitor_t v;
    node_visitor(v).accept(this);
    if (v.any_unsourced) return none();
    return v.total;
}

// Helper to describe a list of keywords.
// TODO: these need to be localized properly.
static wcstring keywords_user_presentable_description(std::initializer_list<parse_keyword_t> kws) {
    assert(kws.size() > 0 && "Should not be empty list");
    if (kws.size() == 1) {
        return format_string(L"keyword '%ls'", keyword_description(*kws.begin()));
    }
    size_t idx = 0;
    wcstring res = L"keywords ";
    for (parse_keyword_t kw : kws) {
        const wchar_t *optor = (idx++ ? L" or " : L"");
        append_format(res, L"%ls'%ls'", optor, keyword_description(kw));
    }
    return res;
}

// Helper to describe a list of token types.
// TODO: these need to be localized properly.
static wcstring token_types_user_presentable_description(
    std::initializer_list<parse_token_type_t> types) {
    assert(types.size() > 0 && "Should not be empty list");
    if (types.size() == 1) {
        return token_type_user_presentable_description(*types.begin());
    }
    size_t idx = 0;
    wcstring res;
    for (parse_token_type_t type : types) {
        const wchar_t *optor = (idx++ ? L" or " : L"");
        append_format(res, L"%ls%ls", optor, token_type_user_presentable_description(type).c_str());
    }
    return res;
}
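// Worked example of the range union above (hypothetical offsets): for `echo hi`, leaf ranges
// {0, 4} and {5, 2} union to start = min(0, 5) = 0, end = max(0 + 4, 5 + 2) = 7, hence
// total = {0, 7}.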
namespace {
using namespace ast;

struct populator_t {
    template <typename T>
    using unique_ptr = std::unique_ptr<T>;

    // Construct from a source, flags, top type, and out_errors, which may be null.
    populator_t(const wcstring &src, parse_tree_flags_t flags, type_t top_type,
                parse_error_list_t *out_errors)
        : flags_(flags),
          tokens_(src, flags, extras_.comments),
          top_type_(top_type),
          out_errors_(out_errors) {}

    // Given a node type, allocate it and invoke its default constructor.
    // \return the resulting Node pointer. It is never null.
    template <typename Node>
    unique_ptr<Node> allocate() {
        unique_ptr<Node> node = make_unique<Node>();
        FLOGF(ast_construction, L"%*smake %ls %p", spaces(), "", ast_type_to_string(Node::AstType),
              node.get());
        return node;
    }

    // Given a node type, allocate it, invoke its default constructor,
    // and then visit it as a field.
    // \return the resulting Node pointer. It is never null.
    template <typename Node>
    unique_ptr<Node> allocate_visit() {
        unique_ptr<Node> node = allocate<Node>();
        this->visit_node_field(*node);
        return node;
    }

    /// Helper for FLOGF. This returns a number of spaces appropriate for a '%*c' format.
    int spaces() const { return static_cast<int>(visit_stack_.size() * 2); }

    /// The status of our parser.
    enum class status_t {
        // Parsing is going just fine, thanks for asking.
        ok,

        // We have exhausted the token stream, but the caller was OK with an incomplete parse
        // tree. All further leaf nodes should have the unsourced flag set.
        unsourcing,

        // We encountered a parse error and are "unwinding."
        // Do not consume any tokens until we get back to a list type which stops unwinding.
        unwinding,
    };

    /// \return the parser's status.
    status_t status() {
        if (unwinding_) {
            return status_t::unwinding;
        } else if ((flags_ & parse_flag_leave_unterminated) &&
                   peek_type() == parse_token_type_t::terminate) {
            return status_t::unsourcing;
        }
        return status_t::ok;
    }

    /// \return whether the status is unwinding.
    /// This is more efficient than checking the status directly.
    bool is_unwinding() const { return unwinding_; }

    /// \return whether any leaf nodes we visit should be marked as unsourced.
    bool unsource_leaves() {
        status_t s = status();
        return s == status_t::unsourcing || s == status_t::unwinding;
    }

    /// \return whether we permit an incomplete parse tree.
    bool allow_incomplete() const { return flags_ & parse_flag_leave_unterminated; }

    /// This indicates a bug in fish code.
    void internal_error(const char *func, const wchar_t *fmt, ...) const {
        va_list va;
        va_start(va, fmt);
        wcstring msg = vformat_string(fmt, va);
        va_end(va);

        FLOG(debug, "Internal parse error from", func, "- this indicates a bug in fish.", msg);
        FLOG(debug, "Encountered while parsing:<<<\n%ls\n>>>", tokens_.source().c_str());
        abort();
    }

    /// \return whether a list type \p type allows arbitrary newlines in it.
    bool list_type_chomps_newlines(type_t type) const {
        switch (type) {
            case type_t::argument_list:
                // Hackish. If we are producing a freestanding argument list, then it allows
                // semicolons, for hysterical raisins.
                return top_type_ == type_t::freestanding_argument_list;

            case type_t::argument_or_redirection_list:
                // No newlines inside arguments.
                return false;

            case type_t::variable_assignment_list:
                // No newlines inside variable assignment lists.
                return false;

            case type_t::job_list:
                // Like echo a \n \n echo b
                return true;

            case type_t::case_item_list:
                // Like switch foo \n \n \n case a \n end
                return true;

            case type_t::andor_job_list:
                // Like while true ; \n \n and true ; end
                return true;

            case type_t::elseif_clause_list:
                // Like if true ; \n \n else if false; end
                return true;

            case type_t::job_conjunction_continuation_list:
                // This would be like echo a && echo b \n && echo c
                // We could conceivably support this but do not now.
                return false;

            case type_t::job_continuation_list:
                // This would be like echo a \n | echo b
                // We could conceivably support this but do not now.
                return false;

            default:
                internal_error(__FUNCTION__, L"Type %ls not handled", ast_type_to_string(type));
                return false;
        }
    }

    /// \return whether a list type \p type allows arbitrary semicolons in it.
    bool list_type_chomps_semis(type_t type) const {
        switch (type) {
            case type_t::argument_list:
                // Hackish. If we are producing a freestanding argument list, then it allows
                // semicolons, for hysterical raisins.
                // That is, this is OK: complete -c foo -a 'x ; y ; z'
                // But this is not: foo x ; y ; z
                return top_type_ == type_t::freestanding_argument_list;

            case type_t::argument_or_redirection_list:
            case type_t::variable_assignment_list:
                return false;

            case type_t::job_list:
                // Like echo a ; ; echo b
                return true;

            case type_t::case_item_list:
                // Like switch foo ; ; ; case a \n end
                // This is historically allowed.
                return true;

            case type_t::andor_job_list:
                // Like while true ; ; ; and true ; end
                return true;

            case type_t::elseif_clause_list:
                // Like if true ; ; ; else if false; end
                return false;

            case type_t::job_conjunction_continuation_list:
                // Like echo a ; ; && echo b. Not supported.
                return false;

            case type_t::job_continuation_list:
                // This would be like echo a ; | echo b
                // Not supported.
                // We could conceivably support this but do not now.
                return false;

            default:
                internal_error(__FUNCTION__, L"Type %ls not handled", ast_type_to_string(type));
                return false;
        }
    }
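    // Illustrative contrast of the two chomp predicates above: in
    // `switch x ; ; ; case a ; end`, the case_item_list chomps the extra semicolons
    // (historically allowed), while a `;` inside an argument_or_redirection_list is never
    // chomped and instead ends the job.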
    // Chomp extra comments, semicolons, etc. for a given list type.
    void chomp_extras(type_t type) {
        bool chomp_semis = list_type_chomps_semis(type);
        bool chomp_newlines = list_type_chomps_newlines(type);
        for (;;) {
            const auto &peek = this->tokens_.peek();
            if (chomp_newlines && peek.type == parse_token_type_t::end && peek.is_newline) {
                // Just skip this newline, no need to save it.
                this->tokens_.pop();
            } else if (chomp_semis && peek.type == parse_token_type_t::end && !peek.is_newline) {
                auto tok = this->tokens_.pop();
                // Perhaps save this extra semi.
                if (flags_ & parse_flag_show_extra_semis) {
                    extras_.semis.push_back(tok.range());
                }
            } else {
                break;
            }
        }
    }

    /// \return whether a list type should recover from errors.
    /// That is, whether we should stop unwinding when we encounter this type.
    bool list_type_stops_unwind(type_t type) const {
        return type == type_t::job_list && (flags_ & parse_flag_continue_after_error);
    }

    /// Report an error based on \p fmt for the source range \p range.
    void parse_error_impl(source_range_t range, parse_error_code_t code, const wchar_t *fmt,
                          va_list va) {
        any_error_ = true;

        // Ignore additional parse errors while unwinding.
        // These may come about e.g. from `true | and`.
        if (unwinding_) return;
        unwinding_ = true;

        FLOGF(ast_construction, L"%*sparse error - begin unwinding", spaces(), "");

        // TODO: can store this conditionally dependent on flags.
        if (range.start != SOURCE_OFFSET_INVALID) {
            extras_.errors.push_back(range);
        }

        if (out_errors_) {
            parse_error_t err;
            err.text = vformat_string(fmt, va);
            err.code = code;
            err.source_start = range.start;
            err.source_length = range.length;
            out_errors_->push_back(std::move(err));
        }
    }

    /// Report an error based on \p fmt for the source range \p range.
    void parse_error(source_range_t range, parse_error_code_t code, const wchar_t *fmt, ...) {
        va_list va;
        va_start(va, fmt);
        parse_error_impl(range, code, fmt, va);
        va_end(va);
    }

    /// Report an error based on \p fmt for the source range of \p token.
    void parse_error(const parse_token_t &token, parse_error_code_t code, const wchar_t *fmt,
                     ...) {
        va_list va;
        va_start(va, fmt);
        parse_error_impl(token.range(), code, fmt, va);
        va_end(va);
    }

    // \return a reference to a non-comment token at index \p idx.
    const parse_token_t &peek_token(size_t idx = 0) { return tokens_.peek(idx); }

    // \return the type of a non-comment token.
    parse_token_type_t peek_type(size_t idx = 0) { return peek_token(idx).type; }

    // Consume the next token, chomping any comments.
    // It is an error to call this unless we know there is a non-terminate token available.
    // \return the token.
    parse_token_t consume_any_token() {
        parse_token_t tok = tokens_.pop();
        assert(tok.type != parse_token_type_t::comment && "Should not be a comment");
        assert(tok.type != parse_token_type_t::terminate &&
               "Cannot consume terminate token, caller should check status first");
        return tok;
    }
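    // Illustrative: parsing `true | and` raises parse_error_andor_in_pipeline for `and`; any
    // further errors produced while the parser unwinds afterwards are swallowed by the
    // unwinding_ check in parse_error_impl() above.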
    // Consume the next token which is expected to be of the given type.
    source_range_t consume_token_type(parse_token_type_t type) {
        assert(type != parse_token_type_t::terminate &&
               "Should not attempt to consume terminate token");
        auto tok = consume_any_token();
        if (tok.type != type) {
            parse_error(tok, parse_error_generic, _(L"Expected %ls, but found %ls"),
                        token_type_user_presentable_description(type).c_str(),
                        tok.user_presentable_description().c_str());
            return source_range_t{0, 0};
        }
        return tok.range();
    }

    // The next token could not be parsed at the top level.
    // For example a trailing end like `begin ; end ; end`
    // Or an unexpected redirection like `>`
    // Consume it and add an error.
    void consume_excess_token_generating_error() {
        auto tok = consume_any_token();

        // In the rare case that we are parsing a freestanding argument list and not a job list,
        // generate a generic error.
        // TODO: this is a crummy message if we get a tokenizer error, for example:
        //   complete -c foo -a "'abc
        if (this->top_type_ == type_t::freestanding_argument_list) {
            this->parse_error(
                tok, parse_error_generic, _(L"Expected %ls, but found %ls"),
                token_type_user_presentable_description(parse_token_type_t::string).c_str(),
                tok.user_presentable_description().c_str());
            return;
        }

        assert(this->top_type_ == type_t::job_list);
        switch (tok.type) {
            case parse_token_type_t::string:
                // There are three keywords which end a job list.
                switch (tok.keyword) {
                    case parse_keyword_t::kw_end:
                        this->parse_error(tok, parse_error_unbalancing_end,
                                          _(L"'end' outside of a block"));
                        break;
                    case parse_keyword_t::kw_else:
                        this->parse_error(tok, parse_error_unbalancing_else,
                                          _(L"'else' builtin not inside of if block"));
                        break;
                    case parse_keyword_t::kw_case:
                        this->parse_error(tok, parse_error_unbalancing_case,
                                          _(L"'case' builtin not inside of switch block"));
                        break;
                    default:
                        internal_error(__FUNCTION__,
                                       L"Token %ls should not have prevented parsing a job list",
                                       tok.user_presentable_description().c_str());
                        break;
                }
                break;
            case parse_token_type_t::pipe:
            case parse_token_type_t::redirection:
            case parse_token_type_t::background:
            case parse_token_type_t::andand:
            case parse_token_type_t::oror:
                parse_error(tok, parse_error_generic, _(L"Expected a string, but found %ls"),
                            tok.user_presentable_description().c_str());
                break;
            case parse_token_type_t::tokenizer_error:
                parse_error(tok, parse_error_from_tokenizer_error(tok.tok_error), L"%ls",
                            tokenizer_get_error_message(tok.tok_error));
                break;
            case parse_token_type_t::end:
                internal_error(__FUNCTION__, L"End token should never be excess");
                break;
            case parse_token_type_t::terminate:
                internal_error(__FUNCTION__, L"Terminate token should never be excess");
                break;
            default:
                internal_error(__FUNCTION__, L"Unexpected excess token type: %ls",
                               tok.user_presentable_description().c_str());
                break;
        }
    }
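    // Illustrative inputs that reach consume_excess_token_generating_error():
    //   begin ; end ; end   -> trailing kw_end yields "'end' outside of a block"
    //   echo hi ; case x    -> kw_case yields "'case' builtin not inside of switch block"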
    // Our can_parse implementations are for optional values and for lists.
    // A true return means we should descend into the production, false means stop.
    // Note that the argument is always nullptr and should be ignored. It is provided strictly
    // for overloading purposes.

    bool can_parse(job_conjunction_t *) {
        const auto &token = peek_token();
        if (token.type != parse_token_type_t::string) return false;
        switch (peek_token().keyword) {
            case parse_keyword_t::kw_end:
            case parse_keyword_t::kw_else:
            case parse_keyword_t::kw_case:
                // These end a job list.
                return false;
            case parse_keyword_t::none:
            default:
                return true;
        }
    }

    bool can_parse(argument_t *) { return peek_type() == parse_token_type_t::string; }

    bool can_parse(redirection_t *) { return peek_type() == parse_token_type_t::redirection; }

    bool can_parse(argument_or_redirection_t *) {
        return can_parse((argument_t *)nullptr) || can_parse((redirection_t *)nullptr);
    }

    bool can_parse(variable_assignment_t *) {
        // Do we have a variable assignment at all?
        if (!peek_token(0).may_be_variable_assignment) return false;

        // What is the token after it?
        switch (peek_type(1)) {
            case parse_token_type_t::string:
                // We have `a= cmd` and should treat it as a variable assignment.
                return true;
            case parse_token_type_t::terminate:
                // We have `a=` which is OK if we are allowing incomplete, an error otherwise.
                return allow_incomplete();
            default:
                // We have e.g. `a= >` which is an error.
                // Note that we do not produce an error here. Instead we return false so that the
                // token will be seen by allocate_populate_statement_contents.
                return false;
        }
    }

    template <parse_token_type_t... Toks>
    bool can_parse(token_t<Toks...> *tok) {
        return tok->allows_token(peek_token().type);
    }

    // Note we have specific overloads for our keyword nodes, as they need custom logic.
    bool can_parse(job_conjunction_t::decorator_t *) {
        // This is for a job conjunction like `and stuff`
        // But if it's `and --help` then we treat it as an ordinary command.
        return job_conjunction_t::decorator_t::allows_keyword(peek_token(0).keyword) &&
               !peek_token(1).is_help_argument;
    }

    bool can_parse(decorated_statement_t::decorator_t *) {
        // Here the keyword is 'command' or 'builtin' or 'exec'.
        // `command stuff` executes a command called stuff.
        // `command -n` passes the -n argument to the 'command' builtin.
        // `command` by itself is a command.
        if (!decorated_statement_t::decorator_t::allows_keyword(peek_token(0).keyword)) {
            return false;
        }
        // Is it like `command --stuff` or `command` by itself?
        auto tok1 = peek_token(1);
        return tok1.type == parse_token_type_t::string && !tok1.is_dash_prefix_string();
    }

    bool can_parse(keyword_t<parse_keyword_t::kw_time> *) {
        // Time keyword is only the time builtin if the next argument doesn't have a dash.
        return keyword_t<parse_keyword_t::kw_time>::allows_keyword(peek_token(0).keyword) &&
               !peek_token(1).is_dash_prefix_string();
    }

    bool can_parse(job_continuation_t *) { return peek_type() == parse_token_type_t::pipe; }

    bool can_parse(job_conjunction_continuation_t *) {
        auto type = peek_type();
        return type == parse_token_type_t::andand || type == parse_token_type_t::oror;
    }

    bool can_parse(andor_job_t *) {
        switch (peek_token().keyword) {
            case parse_keyword_t::kw_and:
            case parse_keyword_t::kw_or: {
                // Check that the argument to and/or is a string that's not help. Otherwise it's
                // either 'and --help' or a naked 'and', and not part of this list.
                const auto &nexttok = peek_token(1);
                return nexttok.type == parse_token_type_t::string && !nexttok.is_help_argument;
            }
            default:
                return false;
        }
    }

    bool can_parse(elseif_clause_t *) {
        return peek_token(0).keyword == parse_keyword_t::kw_else &&
               peek_token(1).keyword == parse_keyword_t::kw_if;
    }

    bool can_parse(else_clause_t *) { return peek_token().keyword == parse_keyword_t::kw_else; }

    bool can_parse(case_item_t *) { return peek_token().keyword == parse_keyword_t::kw_case; }
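    // Illustrative: in `command ls`, kw_command is parsed as a decoration of `ls`; in
    // `command -n`, the dash prefix makes can_parse() return false, so `command` itself
    // becomes the command and `-n` is passed to it as an argument.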
    // Given that we are a list of type ListType, whose contents type is ContentsNode, populate
    // as many elements as we can.
    // If exhaust_stream is set, then keep going until we get parse_token_type_t::terminate.
    template <type_t ListType, typename ContentsNode>
    void populate_list(list_t<ListType, ContentsNode> &list, bool exhaust_stream = false) {
        assert(list.contents == nullptr && "List is not initially empty");

        // Do not attempt to parse a list if we are unwinding.
        if (is_unwinding()) {
            assert(!exhaust_stream &&
                   "exhaust_stream should only be set at top level, and so we should not be "
                   "unwinding");
            // Mark in the list that it was unwound.
            FLOGF(ast_construction, L"%*sunwinding %ls", spaces(), "",
                  ast_type_to_string(ListType));
            assert(list.empty() && "Should be an empty list");
            return;
        }

        // We're going to populate a vector with our nodes.
        // Later on we will copy this to the heap with a single allocation.
        std::vector<unique_ptr<ContentsNode>> contents;

        for (;;) {
            // If we are unwinding, then either we recover or we break the loop, dependent on the
            // loop type.
            if (is_unwinding()) {
                if (!list_type_stops_unwind(ListType)) {
                    break;
                }
                // We are going to stop unwinding.
                // Rather hackish. Just chomp until we get to a string or end node.
                for (auto type = peek_type();
                     type != parse_token_type_t::string && type != parse_token_type_t::terminate &&
                     type != parse_token_type_t::end;
                     type = peek_type()) {
                    parse_token_t tok = tokens_.pop();
                    extras_.errors.push_back(tok.range());
                    FLOGF(ast_construction, L"%*schomping range %u-%u", spaces(), "",
                          tok.source_start, tok.source_length);
                }
                FLOGF(ast_construction, L"%*sdone unwinding", spaces(), "");
                unwinding_ = false;
            }

            // Chomp semis and newlines.
            chomp_extras(ListType);

            // Now try parsing a node.
            if (auto node = this->try_parse<ContentsNode>()) {
                // #7201: Minimize reallocations of contents vector
                if (contents.empty()) {
                    contents.reserve(64);
                }
                contents.emplace_back(std::move(node));
            } else if (exhaust_stream && peek_type() != parse_token_type_t::terminate) {
                // We aren't allowed to stop. Produce an error and keep going.
                consume_excess_token_generating_error();
            } else {
                // We either stop once we can't parse any more of this contents node, or we
                // exhausted the stream as requested.
                break;
            }
        }

        // Populate our list from our contents.
        if (!contents.empty()) {
            assert(contents.size() <= UINT32_MAX && "Contents size out of bounds");
            assert(list.contents == nullptr && "List should still be empty");

            // We're going to heap-allocate our array.
            using contents_ptr_t = typename list_t<ListType, ContentsNode>::contents_ptr_t;
            auto *array = new contents_ptr_t[contents.size()];
            std::move(contents.begin(), contents.end(), array);
            list.length = static_cast<uint32_t>(contents.size());
            list.contents = array;
        }
        FLOGF(ast_construction, L"%*s%ls size: %lu", spaces(), "", ast_type_to_string(ListType),
              (unsigned long)list.count());
    }
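    // Illustrative recovery flow for populate_list(): when a parse error occurs mid-job and
    // parse_flag_continue_after_error is set, the top-level job_list stops the unwind; tokens
    // are chomped up to the next string, end, or terminate token, and population resumes with
    // the following job.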
    /// Allocate and populate a statement contents pointer.
    /// This must never return null.
    statement_t::contents_ptr_t allocate_populate_statement_contents() {
        // In case we get a parse error, we still need to return something non-null. Use a
        // decorated statement; all of its leaf nodes will end up unsourced.
        auto got_error = [this] {
            assert(unwinding_ && "Should have produced an error");
            return this->allocate_visit<decorated_statement_t>();
        };

        using pkt = parse_keyword_t;
        const auto &token1 = peek_token(0);
        if (token1.type == parse_token_type_t::terminate && allow_incomplete()) {
            // This may happen if we just have a 'time' prefix.
            // Construct a decorated statement, which will be unsourced.
            return this->allocate_visit<decorated_statement_t>();
        } else if (token1.type != parse_token_type_t::string) {
            // We may be unwinding already; do not produce another error.
            // For example in `true | and`.
            parse_error(token1, parse_error_generic, _(L"Expected a command, but found %ls"),
                        token1.user_presentable_description().c_str());
            return got_error();
        } else if (token1.may_be_variable_assignment) {
            // Here we have a variable assignment which we chose to not parse as a variable
            // assignment because there was no string after it.
            // Ensure we consume the token, so we don't get back here again at the same place.
            parse_error(consume_any_token(), parse_error_bare_variable_assignment, L"");
            return got_error();
        }

        // The only block-like builtin that takes any parameters is 'function'. So go to
        // decorated statements if the subsequent token looks like '--'. The logic here is
        // subtle:
        //
        // If we are 'begin', then we expect to be invoked with no arguments.
        // If we are 'function', then we are a non-block if we are invoked with -h or --help
        // If we are anything else, we require an argument, so do the same thing if the
        // subsequent token is a statement terminator.
        if (token1.type == parse_token_type_t::string) {
            const auto &token2 = peek_token(1);
            // If we are a function, then look for help arguments. Otherwise, if the next token
            // looks like an option (starts with a dash), then parse it as a decorated statement.
            if (token1.keyword == pkt::kw_function && token2.is_help_argument) {
                return allocate_visit<decorated_statement_t>();
            } else if (token1.keyword != pkt::kw_function && token2.has_dash_prefix) {
                return allocate_visit<decorated_statement_t>();
            }

            // Likewise if the next token doesn't look like an argument at all. This corresponds
            // to e.g. a "naked if".
            bool naked_invocation_invokes_help =
                (token1.keyword != pkt::kw_begin && token1.keyword != pkt::kw_end);
            if (naked_invocation_invokes_help && (token2.type == parse_token_type_t::end ||
                                                  token2.type == parse_token_type_t::terminate)) {
                return allocate_visit<decorated_statement_t>();
            }
        }

        switch (token1.keyword) {
            case pkt::kw_not:
            case pkt::kw_exclam:
                return allocate_visit<not_statement_t>();

            case pkt::kw_for:
            case pkt::kw_while:
            case pkt::kw_function:
            case pkt::kw_begin:
                return allocate_visit<block_statement_t>();

            case pkt::kw_if:
                return allocate_visit<if_statement_t>();

            case pkt::kw_switch:
                return allocate_visit<switch_statement_t>();

            case pkt::kw_end:
                // 'end' is forbidden as a command.
                // For example, `if end` or `while end` will produce this error.
                // We still have to descend into the decorated statement because
                // we can't leave our pointer as null.
                parse_error(token1, parse_error_generic, _(L"Expected a command, but found %ls"),
                            token1.user_presentable_description().c_str());
                return got_error();

            default:
                return allocate_visit<decorated_statement_t>();
        }
    }
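    // Illustrative dispatch for allocate_populate_statement_contents():
    //   if true ; end    -> if_statement_t        (kw_if)
    //   function -h      -> decorated_statement_t (help argument)
    //   begin ; end      -> block_statement_t     (kw_begin)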
    /// Allocate and populate a block statement header.
    /// This must never return null.
    block_statement_t::header_ptr_t allocate_populate_block_header() {
        switch (peek_token().keyword) {
            case parse_keyword_t::kw_for:
                return allocate_visit<for_header_t>();
            case parse_keyword_t::kw_while:
                return allocate_visit<while_header_t>();
            case parse_keyword_t::kw_function:
                return allocate_visit<function_header_t>();
            case parse_keyword_t::kw_begin:
                return allocate_visit<begin_header_t>();
            default:
                internal_error(__FUNCTION__, L"should not have descended into block_header");
                DIE("Unreachable");
        }
    }

    template <typename AstNode>
    unique_ptr<AstNode> try_parse() {
        if (!can_parse((AstNode *)nullptr)) return nullptr;
        return allocate_visit<AstNode>();
    }

    void visit_node_field(argument_t &arg) {
        if (unsource_leaves()) {
            arg.unsourced = true;
            return;
        }
        arg.range = consume_token_type(parse_token_type_t::string);
    }

    void visit_node_field(variable_assignment_t &varas) {
        if (unsource_leaves()) {
            varas.unsourced = true;
            return;
        }
        if (!peek_token().may_be_variable_assignment) {
            internal_error(__FUNCTION__,
                           L"Should not have created variable_assignment_t from this token");
        }
        varas.range = consume_token_type(parse_token_type_t::string);
    }

    void visit_node_field(job_continuation_t &node) {
        // Special error handling to catch 'and' and 'or' in pipelines, like `true | and false`.
        const auto &tok = peek_token(1);
        if (tok.keyword == parse_keyword_t::kw_and || tok.keyword == parse_keyword_t::kw_or) {
            const wchar_t *cmdname = (tok.keyword == parse_keyword_t::kw_and ? L"and" : L"or");
            parse_error(tok, parse_error_andor_in_pipeline, INVALID_PIPELINE_CMD_ERR_MSG, cmdname);
        }
        node.accept(*this);
    }

    // Visit branch nodes by just calling accept() to visit their fields.
    template <typename Node>
    enable_if_t<Node::Category == category_t::branch> visit_node_field(Node &node) {
        // This field is a direct embedding of an AST value.
        node.accept(*this);
        return;
    }

    // Overload for token fields.
    template <parse_token_type_t... TokTypes>
    void visit_node_field(token_t<TokTypes...> &token) {
        if (unsource_leaves()) {
            token.unsourced = true;
            return;
        }

        if (!token.allows_token(peek_token().type)) {
            const auto &peek = peek_token();
            if ((flags_ & parse_flag_leave_unterminated) &&
                (peek.tok_error == tokenizer_error_t::unterminated_quote ||
                 peek.tok_error == tokenizer_error_t::unterminated_subshell)) {
                return;
            }
            parse_error(peek, parse_error_generic, L"Expected %ls, but found %ls",
                        token_types_user_presentable_description({TokTypes...}).c_str(),
                        peek.user_presentable_description().c_str());
            token.unsourced = true;
            return;
        }
        parse_token_t tok = consume_any_token();
        token.type = tok.type;
        token.range = tok.range();
    }
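    // Illustrative: with parse_flag_leave_unterminated, a trailing unterminated quote such as
    // `echo "abc` does not produce an "Expected ..." error from the field visitors here; the
    // tokenizer's unterminated_quote error is surfaced through other paths.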
    // Overload for keyword fields.
    template <parse_keyword_t... KWs>
    void visit_node_field(keyword_t<KWs...> &keyword) {
        if (unsource_leaves()) {
            keyword.unsourced = true;
            return;
        }

        if (!keyword.allows_keyword(peek_token().keyword)) {
            keyword.unsourced = true;
            const auto &peek = peek_token();
            if ((flags_ & parse_flag_leave_unterminated) &&
                (peek.tok_error == tokenizer_error_t::unterminated_quote ||
                 peek.tok_error == tokenizer_error_t::unterminated_subshell)) {
                return;
            }

            // Special error reporting for keyword_t<kw_end>.
            std::array<parse_keyword_t, sizeof...(KWs)> allowed = {{KWs...}};
            if (allowed.size() == 1 && allowed[0] == parse_keyword_t::kw_end) {
                assert(!visit_stack_.empty() && "Visit stack should not be empty");
                auto p = find_block_open_keyword(visit_stack_.back());
                source_range_t kw_range = p.first;
                const wchar_t *kw_name = p.second;
                if (kw_name) {
                    this->parse_error(kw_range, parse_error_generic,
                                      L"Missing end to balance this %ls", kw_name);
                }
            }
            parse_error(peek, parse_error_generic, L"Expected %ls, but found %ls",
                        keywords_user_presentable_description({KWs...}).c_str(),
                        peek.user_presentable_description().c_str());
            return;
        }
        parse_token_t tok = consume_any_token();
        keyword.kw = tok.keyword;
        keyword.range = tok.range();
    }

    // Overload for maybe_newlines.
    void visit_node_field(maybe_newlines_t &nls) {
        if (unsource_leaves()) {
            nls.unsourced = true;
            return;
        }
        // TODO: it would be nice to have the start offset be the current position in the token
        // stream, even if there are no newlines.
        nls.range = {0, 0};
        while (peek_token().is_newline) {
            auto r = consume_token_type(parse_token_type_t::end);
            if (nls.range.length == 0) {
                nls.range = r;
            } else {
                nls.range.length = r.start + r.length - nls.range.start;
            }
        }
    }

    template <typename AstNode>
    void visit_optional_field(optional_t<AstNode> &ptr) {
        // This field is an optional node.
        ptr.contents = this->try_parse<AstNode>();
    }

    template <type_t ListType, typename ContentsNode>
    void visit_list_field(list_t<ListType, ContentsNode> &list) {
        // This field is an embedding of an array of (pointers to) ContentsNode.
        // Parse as many as we can.
        populate_list(list);
    }

    // We currently only have a handful of union pointer types.
    // Handle them directly.
    void visit_union_field(statement_t::contents_ptr_t &ptr) {
        ptr = this->allocate_populate_statement_contents();
        assert(ptr && "Statement contents must never be null");
    }

    void visit_union_field(argument_or_redirection_t::contents_ptr_t &contents) {
        if (auto arg = try_parse<argument_t>()) {
            contents = std::move(arg);
        } else if (auto redir = try_parse<redirection_t>()) {
            contents = std::move(redir);
        } else {
            internal_error(__FUNCTION__, L"Unable to parse argument or redirection");
        }
        assert(contents && "Statement contents must never be null");
    }

    void visit_union_field(block_statement_t::header_ptr_t &ptr) {
        ptr = this->allocate_populate_block_header();
        assert(ptr && "Header pointer must never be null");
    }

    void will_visit_fields_of(const node_t &node) {
        FLOGF(ast_construction, L"%*swill_visit %ls %p", spaces(), "", node.describe().c_str(),
              (const void *)&node);
        visit_stack_.push_back(&node);
    }

    void did_visit_fields_of(const node_t &node) {
        assert(!visit_stack_.empty() && visit_stack_.back() == &node &&
               "Node was not at the top of the visit stack");
        visit_stack_.pop_back();
    }

    /// Flags controlling parsing.
    parse_tree_flags_t flags_{};

    /// Extra stuff like comment ranges.
    ast_t::extras_t extras_{};

    /// Stream of tokens which we consume.
    token_stream_t tokens_;

    /// The type which we are attempting to parse, typically job_list but may be
    /// freestanding_argument_list.
    const type_t top_type_;

    /// If set, we are unwinding due to error recovery.
    bool unwinding_{false};

    /// If set, we have encountered an error.
    bool any_error_{false};

    /// A stack containing the nodes whose fields we are visiting.
    std::vector<const node_t *> visit_stack_{};

    // If non-null, populate with errors.
    parse_error_list_t *out_errors_{};
};
}  // namespace
// Set the parent fields of all nodes in the tree rooted at \p node.
static void set_parents(const node_t *top) {
    struct parent_setter_t {
        void visit(const node_t &node) {
            const_cast<node_t &>(node).parent = parent_;
            const node_t *saved = parent_;
            parent_ = &node;
            node_visitor(*this).accept_children_of(&node);
            parent_ = saved;
        }

        const node_t *parent_{nullptr};
    };
    struct parent_setter_t ps;
    node_visitor(ps).accept(top);
}

// static
ast_t ast_t::parse_from_top(const wcstring &src, parse_tree_flags_t parse_flags,
                            parse_error_list_t *out_errors, type_t top_type) {
    assert((top_type == type_t::job_list || top_type == type_t::freestanding_argument_list) &&
           "Invalid top type");
    ast_t ast;

    populator_t pops(src, parse_flags, top_type, out_errors);
    if (top_type == type_t::job_list) {
        std::unique_ptr<job_list_t> list = pops.allocate<job_list_t>();
        pops.populate_list(*list, true /* exhaust_stream */);
        ast.top_.reset(list.release());
    } else {
        std::unique_ptr<freestanding_argument_list_t> list =
            pops.allocate<freestanding_argument_list_t>();
        pops.populate_list(list->arguments, true /* exhaust_stream */);
        ast.top_.reset(list.release());
    }
    // Chomp trailing extras, etc.
    pops.chomp_extras(type_t::job_list);

    ast.any_error_ = pops.any_error_;
    ast.extras_ = std::move(pops.extras_);

    // Set all parent nodes.
    // It turns out to be more convenient to do this after the parse phase.
    set_parents(ast.top());

    return ast;
}

// static
ast_t ast_t::parse(const wcstring &src, parse_tree_flags_t flags, parse_error_list_t *out_errors) {
    return parse_from_top(src, flags, out_errors, type_t::job_list);
}

// static
ast_t ast_t::parse_argument_list(const wcstring &src, parse_tree_flags_t flags,
                                 parse_error_list_t *out_errors) {
    return parse_from_top(src, flags, out_errors, type_t::freestanding_argument_list);
}

// \return the depth of a node, i.e. number of parent links.
static int get_depth(const node_t *node) {
    int result = 0;
    for (const node_t *cursor = node->parent; cursor; cursor = cursor->parent) {
        result += 1;
    }
    return result;
}

wcstring ast_t::dump(const wcstring &orig) const {
    wcstring result;

    // Return a string that repeats "| " \p amt times.
    auto pipespace = [](int amt) {
        std::string result;
        result.reserve(amt * 2);
        for (int i = 0; i < amt; i++) result.append("| ");
        return result;
    };

    traversal_t tv = this->walk();
    while (const auto *node = tv.next()) {
        int depth = get_depth(node);
        // dot-| padding
        append_format(result, L"%s", pipespace(depth).c_str());
        if (const auto *n = node->try_as<argument_t>()) {
            append_format(result, L"argument");
            if (auto argsrc = n->try_source(orig)) {
                append_format(result, L": '%ls'", argsrc->c_str());
            }
        } else if (const auto *n = node->try_as<keyword_base_t>()) {
            append_format(result, L"keyword: %ls", keyword_description(n->kw));
        } else if (const auto *n = node->try_as<token_base_t>()) {
            wcstring desc;
            switch (n->type) {
                case parse_token_type_t::string:
                    desc = format_string(L"string");
                    if (auto strsource = n->try_source(orig)) {
                        append_format(desc, L": '%ls'", strsource->c_str());
                    }
                    break;
                case parse_token_type_t::redirection:
                    desc = L"redirection";
                    if (auto strsource = n->try_source(orig)) {
                        append_format(desc, L": '%ls'", strsource->c_str());
                    }
                    break;
                case parse_token_type_t::end:
                    desc = L"<;>";
                    break;
                case parse_token_type_t::invalid:
                    // This may occur with errors, e.g. we expected to see a string but saw a
                    // redirection.
                    desc = L"";
                    break;
                default:
                    desc = token_type_user_presentable_description(n->type);
                    break;
            }
            append_format(result, L"%ls", desc.c_str());
        } else {
            append_format(result, L"%ls", node->describe().c_str());
        }
        append_format(result, L"\n");
    }
    return result;
}
}  // namespace ast
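// Illustrative usage of the public entry points above (assuming parse_flag_none from
// parse_constants.h):
//   parse_error_list_t errors;
//   ast_t ast = ast_t::parse(L"echo hello | cat", parse_flag_none, &errors);
//   // On error, `errors` describes what went wrong; ast.dump(src) pretty-prints the tree.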