Teach the tokenizer to report escaped newlines
Add fields and flags so that escaped newlines can be reported, for the benefit of fish_indent.
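To illustrate the construct being tracked (this example is adapted from the doc comment the commit adds for parse_node_flag_preceding_escaped_nl), a trailing backslash escapes the newline and continues the command onto the next line:

    echo abc | \
    cat

Before this change the tokenizer skipped the backslash-newline silently, so the resulting token stream was indistinguishable from the single-line `echo abc | cat`, and fish_indent had no way to preserve the user's line break.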
commit 6f57fef8f8 (parent 678fd86107)

4 changed files with 44 additions and 24 deletions
src/parse_tree.cpp

@@ -270,7 +270,10 @@ static void dump_tree_recursive(const parse_node_tree_t &nodes, const wcstring &
         append_format(*result, L" <%lu children>", node.child_count);
     }
     if (node.has_comments()) {
-        append_format(*result, L" <has_comments>", node.child_count);
+        append_format(*result, L" <has_comments>");
+    }
+    if (node.has_preceding_escaped_newline()) {
+        append_format(*result, L" <preceding_esc_nl>");
     }
 
     if (node.has_source() && node.type == parse_token_type_string) {
@@ -357,7 +360,7 @@ class parse_ll_t {
     parse_error_list_t errors;
     // The symbol stack can contain terminal types or symbols. Symbols go on to do productions, but
     // terminal types are just matched against input tokens.
-    bool top_node_handle_terminal_types(parse_token_t token);
+    bool top_node_handle_terminal_types(const parse_token_t &token);
 
     void parse_error_unexpected_token(const wchar_t *expected, parse_token_t token);
     void parse_error(parse_token_t token, parse_error_code_t code, const wchar_t *format, ...);
@@ -758,7 +761,7 @@ bool parse_ll_t::report_error_for_unclosed_block() {
    return reported_error;
}

-bool parse_ll_t::top_node_handle_terminal_types(parse_token_t token) {
+bool parse_ll_t::top_node_handle_terminal_types(const parse_token_t &token) {
    PARSE_ASSERT(!symbol_stack.empty());  //!OCLINT(multiple unary operator)
    PARSE_ASSERT(token.type >= FIRST_PARSE_TOKEN_TYPE);
    parse_stack_element_t &stack_top = symbol_stack.back();
@@ -791,6 +794,8 @@ bool parse_ll_t::top_node_handle_terminal_types(parse_token_t token) {
        node.keyword = token.keyword;
        node.source_start = token.source_start;
        node.source_length = token.source_length;
+        if (token.preceding_escaped_nl)
+            node.flags |= parse_node_flag_preceding_escaped_nl;
    } else {
        // Failure
        if (stack_top.type == parse_token_type_string && token.type == parse_token_type_string) {
@@ -858,6 +863,8 @@ void parse_ll_t::accept_tokens(parse_token_t token1, parse_token_t token2) {
        special_node.parent = symbol_stack.back().node_idx;
        special_node.source_start = token1.source_start;
        special_node.source_length = token1.source_length;
+        if (token1.preceding_escaped_nl)
+            special_node.flags |= parse_node_flag_preceding_escaped_nl;
        nodes.push_back(special_node);

        // Mark special flags.
@@ -969,12 +976,10 @@ static parse_keyword_t keyword_for_token(token_type tok, const wcstring &token)
}

/// Placeholder invalid token.
-static constexpr parse_token_t kInvalidToken = {
-    token_type_invalid, parse_keyword_none, false, false, false, SOURCE_OFFSET_INVALID, 0};
+static constexpr parse_token_t kInvalidToken{token_type_invalid};

/// Terminal token.
-static constexpr parse_token_t kTerminalToken = {
-    parse_token_type_terminate, parse_keyword_none, false, false, false, SOURCE_OFFSET_INVALID, 0};
+static constexpr parse_token_t kTerminalToken = {parse_token_type_terminate};

static inline bool is_help_argument(const wcstring &txt) {
    return txt == L"-h" || txt == L"--help";
@@ -986,19 +991,18 @@ static inline parse_token_t next_parse_token(tokenizer_t *tok, tok_t *token, wcs
        return kTerminalToken;
    }

-    parse_token_t result;
-
    // Set the type, keyword, and whether there's a dash prefix. Note that this is quite sketchy,
    // because it ignores quotes. This is the historical behavior. For example, `builtin --names`
    // lists builtins, but `builtin "--names"` attempts to run --names as a command. Amazingly as of
    // this writing (10/12/13) nobody seems to have noticed this. Squint at it really hard and it
    // even starts to look like a feature.
-    result.type = parse_token_type_from_tokenizer_token(token->type);
+    parse_token_t result{parse_token_type_from_tokenizer_token(token->type)};
    const wcstring &text = tok->copy_text_of(*token, storage);
    result.keyword = keyword_for_token(token->type, text);
    result.has_dash_prefix = !text.empty() && text.at(0) == L'-';
    result.is_help_argument = result.has_dash_prefix && is_help_argument(text);
    result.is_newline = (result.type == parse_token_type_end && text == L"\n");
+    result.preceding_escaped_nl = token->preceding_escaped_nl;

    // These assertions are totally bogus. Basically our tokenizer works in size_t but we work in
    // uint32_t to save some space. If we have a source file larger than 4 GB, we'll probably just
@@ -1079,13 +1083,9 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
        }

        // Mark a special error token, and then keep going.
-        const parse_token_t token = {parse_special_type_parse_error,
-                                     parse_keyword_none,
-                                     false,
-                                     false,
-                                     false,
-                                     queue[error_token_idx].source_start,
-                                     queue[error_token_idx].source_length};
+        parse_token_t token = {parse_special_type_parse_error};
+        token.source_start = queue[error_token_idx].source_start;
+        token.source_length = queue[error_token_idx].source_length;
        parser.accept_tokens(token, kInvalidToken);
        parser.reset_symbols(goal);
    }
src/parse_tree.h

@@ -28,15 +28,18 @@ constexpr source_offset_t SOURCE_OFFSET_INVALID = static_cast<source_offset_t>(-
/// A struct representing the token type that we use internally.
struct parse_token_t {
    enum parse_token_type_t type;  // The type of the token as represented by the parser
-    enum parse_keyword_t keyword;  // Any keyword represented by this token
-    bool has_dash_prefix;          // Hackish: whether the source contains a dash prefix
-    bool is_help_argument;         // Hackish: whether the source looks like '-h' or '--help'
-    bool is_newline;               // Hackish: if TOK_END, whether the source is a newline.
-    source_offset_t source_start;
-    source_offset_t source_length;
+    enum parse_keyword_t keyword{parse_keyword_none};  // Any keyword represented by this token
+    bool has_dash_prefix{false};   // Hackish: whether the source contains a dash prefix
+    bool is_help_argument{false};  // Hackish: whether the source looks like '-h' or '--help'
+    bool is_newline{false};        // Hackish: if TOK_END, whether the source is a newline.
+    bool preceding_escaped_nl{false};  // Whether there was an escaped newline preceding this token.
+    source_offset_t source_start{SOURCE_OFFSET_INVALID};
+    source_offset_t source_length{0};

    wcstring describe() const;
    wcstring user_presentable_description() const;
+
+    constexpr parse_token_t(parse_token_type_t type) : type(type) {}
};

enum {
@@ -66,6 +69,11 @@ const wchar_t *keyword_description(parse_keyword_t type);
enum {
    /// Flag indicating that the node has associated comment nodes.
    parse_node_flag_has_comments = 1 << 0,
+
+    /// Flag indicating that the token was preceded by an escaped newline, e.g.
+    ///   echo abc | \
+    ///   cat
+    parse_node_flag_preceding_escaped_nl = 1 << 1,
};
typedef uint8_t parse_node_flags_t;

@@ -123,7 +131,12 @@ class parse_node_t {

    /// Indicate if the node has comment nodes.
    bool has_comments() const {
-        return static_cast<bool>(this->flags & parse_node_flag_has_comments);
+        return this->flags & parse_node_flag_has_comments;
+    }
+
+    /// Indicates if we have a preceding escaped newline.
+    bool has_preceding_escaped_newline() const {
+        return this->flags & parse_node_flag_preceding_escaped_nl;
    }

    /// Gets source for the node, or the empty string if it has no source.
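As a sketch of the intended consumer, a hypothetical fish_indent-style helper (emit_space_before is an invented name; it is not part of this commit) might use the new accessor like so:

    // Hypothetical sketch: when re-emitting the whitespace before a node,
    // keep the user's backslash-newline instead of collapsing it to a space.
    static void emit_space_before(const parse_node_t &node, wcstring *output) {
        if (node.has_preceding_escaped_newline()) {
            output->append(L" \\\n");  // preserve the escaped newline
        } else {
            output->append(L" ");  // ordinary separating whitespace
        }
    }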
src/tokenizer.cpp

@@ -428,10 +428,12 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
    }

    // Consume non-newline whitespace. If we get an escaped newline, mark it and continue past it.
+    bool preceding_escaped_nl = false;
    for (;;) {
        if (this->buff[0] == L'\\' && this->buff[1] == L'\n') {
            this->buff += 2;
            this->continue_line_after_comment = true;
+            preceding_escaped_nl = true;
        } else if (iswspace_not_nl(this->buff[0])) {
            this->buff++;
        } else {
@@ -454,6 +456,7 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
            result.type = TOK_COMMENT;
            result.offset = comment_start - this->start;
            result.length = comment_len;
+            result.preceding_escaped_nl = preceding_escaped_nl;
            return result;
        }
        while (iswspace_not_nl(this->buff[0])) this->buff++;
@@ -551,6 +554,7 @@ maybe_t<tok_t> tokenizer_t::tok_next() {
            break;
        }
    }
+    result.preceding_escaped_nl = preceding_escaped_nl;
    return result;
}

src/tokenizer.h

@@ -82,6 +82,9 @@ struct tok_t {
    // If an error, this is the error code.
    tokenizer_error *error { TOK_ERROR_NONE };

+    // Whether the token was preceded by an escaped newline.
+    bool preceding_escaped_nl{false};
+
    // If an error, this is the offset of the error within the token. A value of 0 means it occurred
    // at 'offset'.
    size_t error_offset{size_t(-1)};
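And a minimal driver-side sketch, assuming the tokenizer_t interface whose tok_next() appears above (the (string, flags) constructor signature here is an assumption):

    // Hypothetical walk over the tokens of a continued command; the `cat`
    // token should come back with preceding_escaped_nl set.
    tokenizer_t tok(L"echo abc | \\\ncat", 0);
    while (maybe_t<tok_t> t = tok.tok_next()) {
        if (t->preceding_escaped_nl) {
            // A reformatter would reinsert "\\\n" before this token's source.
        }
    }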