diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp index 72ab0549d..5293544e3 100644 --- a/src/parse_tree.cpp +++ b/src/parse_tree.cpp @@ -270,7 +270,10 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring & append_format(*result, L" <%lu children>", node.child_count); } if (node.has_comments()) { - append_format(*result, L" <has_comments>", node.child_count); + append_format(*result, L" <has_comments>"); + } + if (node.has_preceding_escaped_newline()) { + append_format(*result, L" <escaped_nl>"); } if (node.has_source() && node.type == parse_token_type_string) { @@ -357,7 +360,7 @@ class parse_ll_t { parse_error_list_t errors; // The symbol stack can contain terminal types or symbols. Symbols go on to do productions, but // terminal types are just matched against input tokens. - bool top_node_handle_terminal_types(parse_token_t token); + bool top_node_handle_terminal_types(const parse_token_t &token); void parse_error_unexpected_token(const wchar_t *expected, parse_token_t token); void parse_error(parse_token_t token, parse_error_code_t code, const wchar_t *format, ...); @@ -758,7 +761,7 @@ bool parse_ll_t::report_error_for_unclosed_block() { return reported_error; } -bool parse_ll_t::top_node_handle_terminal_types(parse_token_t token) { +bool parse_ll_t::top_node_handle_terminal_types(const parse_token_t &token) { PARSE_ASSERT(!symbol_stack.empty()); //!OCLINT(multiple unary operator) PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE); parse_stack_element_t &stack_top = symbol_stack.back(); @@ -791,6 +794,8 @@ bool parse_ll_t::top_node_handle_terminal_types(parse_token_t token) { node.keyword = token.keyword; node.source_start = token.source_start; node.source_length = token.source_length; + if (token.preceding_escaped_nl) + node.flags |= parse_node_flag_preceding_escaped_nl; } else { // Failure if (stack_top.type == parse_token_type_string && token.type == parse_token_type_string) { @@ -858,6 +863,8 @@ void parse_ll_t::accept_tokens(parse_token_t token1, parse_token_t 
token2) { special_node.parent = symbol_stack.back().node_idx; special_node.source_start = token1.source_start; special_node.source_length = token1.source_length; + if (token1.preceding_escaped_nl) + special_node.flags |= parse_node_flag_preceding_escaped_nl; nodes.push_back(special_node); // Mark special flags. @@ -969,12 +976,10 @@ static parse_keyword_t keyword_for_token(token_type tok, const wcstring &token) } /// Placeholder invalid token. -static constexpr parse_token_t kInvalidToken = { - token_type_invalid, parse_keyword_none, false, false, false, SOURCE_OFFSET_INVALID, 0}; +static constexpr parse_token_t kInvalidToken{token_type_invalid}; /// Terminal token. -static constexpr parse_token_t kTerminalToken = { - parse_token_type_terminate, parse_keyword_none, false, false, false, SOURCE_OFFSET_INVALID, 0}; +static constexpr parse_token_t kTerminalToken = {parse_token_type_terminate}; static inline bool is_help_argument(const wcstring &txt) { return txt == L"-h" || txt == L"--help"; @@ -986,19 +991,18 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token, wcs return kTerminalToken; } - parse_token_t result; - // Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy, // because it ignores quotes. This is the historical behavior. For example, `builtin --names` // lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as of // this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it // even starts to look like a feature. 
- result.type = parse_token_type_from_tokenizer_token(token->type); + parse_token_t result{parse_token_type_from_tokenizer_token(token->type)}; const wcstring &text = tok->copy_text_of(*token, storage); result.keyword = keyword_for_token(token->type, text); result.has_dash_prefix = !text.empty() && text.at(0) == L'-'; result.is_help_argument = result.has_dash_prefix && is_help_argument(text); result.is_newline = (result.type == parse_token_type_end && text == L"\n"); + result.preceding_escaped_nl = token->preceding_escaped_nl; // These assertions are totally bogus. Basically our tokenizer works in size_t but we work in // uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just @@ -1079,13 +1083,9 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags, } // Mark a special error token, and then keep going. - const parse_token_t token = {parse_special_type_parse_error, - parse_keyword_none, - false, - false, - false, - queue[error_token_idx].source_start, - queue[error_token_idx].source_length}; + parse_token_t token = {parse_special_type_parse_error}; + token.source_start = queue[error_token_idx].source_start; + token.source_length = queue[error_token_idx].source_length; parser.accept_tokens(token, kInvalidToken); parser.reset_symbols(goal); } diff --git a/src/parse_tree.h b/src/parse_tree.h index 184037f1c..9e032f11d 100644 --- a/src/parse_tree.h +++ b/src/parse_tree.h @@ -28,15 +28,18 @@ constexpr source_offset_t SOURCE_OFFSET_INVALID = static_cast<source_offset_t>(- /// A struct representing the token type that we use internally. 
struct parse_token_t { enum parse_token_type_t type; // The type of the token as represented by the parser - enum parse_keyword_t keyword; // Any keyword represented by this token - bool has_dash_prefix; // Hackish: whether the source contains a dash prefix - bool is_help_argument; // Hackish: whether the source looks like '-h' or '--help' - bool is_newline; // Hackish: if TOK_END, whether the source is a newline. - source_offset_t source_start; - source_offset_t source_length; + enum parse_keyword_t keyword{parse_keyword_none}; // Any keyword represented by this token + bool has_dash_prefix{false}; // Hackish: whether the source contains a dash prefix + bool is_help_argument{false}; // Hackish: whether the source looks like '-h' or '--help' + bool is_newline{false}; // Hackish: if TOK_END, whether the source is a newline. + bool preceding_escaped_nl{false}; // Whether there was an escaped newline preceding this token. + source_offset_t source_start{SOURCE_OFFSET_INVALID}; + source_offset_t source_length{0}; wcstring describe() const; wcstring user_presentable_description() const; + + constexpr parse_token_t(parse_token_type_t type) : type(type) {} }; enum { @@ -66,6 +69,11 @@ const wchar_t *keyword_description(parse_keyword_t type); enum { /// Flag indicating that the node has associated comment nodes. parse_node_flag_has_comments = 1 << 0, + + /// Flag indicating that the token was preceded by an escaped newline, e.g. + /// echo abc | \ + /// cat + parse_node_flag_preceding_escaped_nl = 1 << 1, }; typedef uint8_t parse_node_flags_t; @@ -123,7 +131,12 @@ class parse_node_t { /// Indicate if the node has comment nodes. bool has_comments() const { - return static_cast<bool>(this->flags & parse_node_flag_has_comments); + return this->flags & parse_node_flag_has_comments; } + + /// Indicates if we have a preceding escaped newline. 
+ bool has_preceding_escaped_newline() const { + return this->flags & parse_node_flag_preceding_escaped_nl; } /// Gets source for the node, or the empty string if it has no source. diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 7f1f86a20..fe9d229b9 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -428,10 +428,12 @@ maybe_t<tok_t> tokenizer_t::tok_next() { } // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past it. + bool preceding_escaped_nl = false; for (;;) { if (this->buff[0] == L'\\' && this->buff[1] == L'\n') { this->buff += 2; this->continue_line_after_comment = true; + preceding_escaped_nl = true; } else if (iswspace_not_nl(this->buff[0])) { this->buff++; } else { @@ -454,6 +456,7 @@ result.type = TOK_COMMENT; result.offset = comment_start - this->start; result.length = comment_len; + result.preceding_escaped_nl = preceding_escaped_nl; return result; } while (iswspace_not_nl(this->buff[0])) this->buff++; @@ -551,6 +554,7 @@ maybe_t<tok_t> tokenizer_t::tok_next() { break; } } + result.preceding_escaped_nl = preceding_escaped_nl; return result; } diff --git a/src/tokenizer.h b/src/tokenizer.h index f44ea2815..10f600737 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -82,6 +82,9 @@ struct tok_t { // If an error, this is the error code. tokenizer_error *error { TOK_ERROR_NONE }; + // Whether the token was preceded by an escaped newline. + bool preceding_escaped_nl{false}; + // If an error, this is the offset of the error within the token. A value of 0 means it occurred // at 'offset'. size_t error_offset{size_t(-1)};