Merge branch 'fix_brace_parsing'

Closes #3802 and improves tokenizer handling of invalid expressions
involving braces, parentheses, and brackets.
This commit is contained in:
Mahmoud Al-Qudsi 2018-03-12 07:05:27 -05:00
commit d385248cc8
16 changed files with 360 additions and 315 deletions

View file

@ -1288,10 +1288,11 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in
const bool unescape_special = static_cast<bool>(flags & UNESCAPE_SPECIAL); const bool unescape_special = static_cast<bool>(flags & UNESCAPE_SPECIAL);
const bool allow_incomplete = static_cast<bool>(flags & UNESCAPE_INCOMPLETE); const bool allow_incomplete = static_cast<bool>(flags & UNESCAPE_INCOMPLETE);
int bracket_count = 0; bool brace_text_start = false;
int brace_count = 0;
bool errored = false; bool errored = false;
enum { mode_unquoted, mode_single_quotes, mode_double_quotes } mode = mode_unquoted; enum { mode_unquoted, mode_single_quotes, mode_double_quotes, mode_braces } mode = mode_unquoted;
for (size_t input_position = 0; input_position < input_len && !errored; input_position++) { for (size_t input_position = 0; input_position < input_len && !errored; input_position++) {
const wchar_t c = input[input_position]; const wchar_t c = input[input_position];
@ -1352,21 +1353,32 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in
} }
case L'{': { case L'{': {
if (unescape_special) { if (unescape_special) {
bracket_count++; brace_count++;
to_append_or_none = BRACKET_BEGIN; to_append_or_none = BRACE_BEGIN;
} }
break; break;
} }
case L'}': { case L'}': {
if (unescape_special) { if (unescape_special) {
bracket_count--; assert(brace_count > 0 && "imbalanced brackets are a tokenizer error, we shouldn't be able to get here");
to_append_or_none = BRACKET_END; brace_count--;
brace_text_start = brace_text_start && brace_count > 0;
to_append_or_none = BRACE_END;
} }
break; break;
} }
case L',': { case L',': {
if (unescape_special && bracket_count > 0) { if (unescape_special && brace_count > 0) {
to_append_or_none = BRACKET_SEP; to_append_or_none = BRACE_SEP;
brace_text_start = false;
}
break;
}
case L'\n':
case L'\t':
case L' ': {
if (unescape_special && brace_count > 0) {
to_append_or_none = brace_text_start ? BRACE_SPACE : NOT_A_WCHAR;
} }
break; break;
} }
@ -1380,7 +1392,12 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in
to_append_or_none = unescape_special ? wint_t(INTERNAL_SEPARATOR) : NOT_A_WCHAR; to_append_or_none = unescape_special ? wint_t(INTERNAL_SEPARATOR) : NOT_A_WCHAR;
break; break;
} }
default: { break; } default: {
if (unescape_special && brace_count > 0) {
brace_text_start = true;
}
break;
}
} }
} else if (mode == mode_single_quotes) { } else if (mode == mode_single_quotes) {
if (c == L'\\') { if (c == L'\\') {

View file

@ -807,6 +807,19 @@ struct enum_map {
const wchar_t *const str; const wchar_t *const str;
}; };
/// Helpers that give a scoped enum (i.e. `enum class`) the bitwise operators needed to use it as a
/// set of flags. Both macros rely on std::underlying_type, so <type_traits> must be in scope.
///
/// ENUM_FLAG_OPERATOR(T, X, Y) defines, for enum type T, the binary operator X and the matching
/// compound-assignment operator Y. Each casts both operands to T's underlying integer type,
/// applies X, and casts the result back to T; Y additionally stores the result in its left operand.
/// NOTE(review): operator Y returns T by value rather than T&, unlike the built-in compound
/// assignments, so its result cannot be bound to a non-const reference or chained as an lvalue.
#define ENUM_FLAG_OPERATOR(T,X,Y) \
inline T operator X (T lhs, T rhs) { return (T) (static_cast<std::underlying_type<T>::type>(lhs) X static_cast<std::underlying_type<T>::type>(rhs)); } \
inline T operator Y (T &lhs, T rhs) { return lhs = (T) (static_cast<std::underlying_type<T>::type>(lhs) X static_cast<std::underlying_type<T>::type>(rhs)); }
/// ENUM_FLAGS(T) forward-declares `enum class T`, defines operator~ plus the |, |=, ^, ^=, & and &=
/// operators for it, and ends with `enum class T` so that the enumerator list can follow the macro
/// invocation directly, e.g. `ENUM_FLAGS(my_flags) { a = 1 << 0, b = 1 << 1 };`.
#define ENUM_FLAGS(T) \
enum class T; \
inline T operator ~ (T t) { return (T) (~static_cast<std::underlying_type<T>::type>(t)); } \
ENUM_FLAG_OPERATOR(T,|,|=) \
ENUM_FLAG_OPERATOR(T,^,^=) \
ENUM_FLAG_OPERATOR(T,&,&=) \
enum class T
/// Given a string return the matching enum. Return the sentinal enum if no match is made. The map /// Given a string return the matching enum. Return the sentinal enum if no match is made. The map
/// must be sorted by the `str` member. A binary search is twice as fast as a linear search with 16 /// must be sorted by the `str` member. A binary search is twice as fast as a linear search with 16
/// elements in the map. /// elements in the map.

View file

@ -47,6 +47,7 @@
#include "proc.h" #include "proc.h"
#include "reader.h" #include "reader.h"
#include "wildcard.h" #include "wildcard.h"
#include "wcstringutil.h"
#include "wutil.h" // IWYU pragma: keep #include "wutil.h" // IWYU pragma: keep
#ifdef KERN_PROCARGS2 #ifdef KERN_PROCARGS2
#else #else
@ -570,7 +571,7 @@ static void find_process(const wchar_t *proc, expand_flags_t flags,
static size_t parse_slice(const wchar_t *in, wchar_t **end_ptr, std::vector<long> &idx, static size_t parse_slice(const wchar_t *in, wchar_t **end_ptr, std::vector<long> &idx,
std::vector<size_t> &source_positions, size_t array_size) { std::vector<size_t> &source_positions, size_t array_size) {
const long size = (long)array_size; const long size = (long)array_size;
size_t pos = 1; // skip past the opening square bracket size_t pos = 1; // skip past the opening square brace
while (1) { while (1) {
while (iswspace(in[pos]) || (in[pos] == INTERNAL_SEPARATOR)) pos++; while (iswspace(in[pos]) || (in[pos] == INTERNAL_SEPARATOR)) pos++;
@ -846,39 +847,39 @@ static bool expand_variables(const wcstring &instr, std::vector<completion_t> *o
return true; return true;
} }
/// Perform bracket expansion. /// Perform brace expansion.
static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flags, static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags,
std::vector<completion_t> *out, parse_error_list_t *errors) { std::vector<completion_t> *out, parse_error_list_t *errors) {
bool syntax_error = false; bool syntax_error = false;
int bracket_count = 0; int brace_count = 0;
const wchar_t *bracket_begin = NULL, *bracket_end = NULL; const wchar_t *brace_begin = NULL, *brace_end = NULL;
const wchar_t *last_sep = NULL; const wchar_t *last_sep = NULL;
const wchar_t *item_begin; const wchar_t *item_begin;
size_t length_preceding_brackets, length_following_brackets, tot_len; size_t length_preceding_braces, length_following_braces, tot_len;
const wchar_t *const in = instr.c_str(); const wchar_t *const in = instr.c_str();
// Locate the first non-nested bracket pair. // Locate the first non-nested brace pair.
for (const wchar_t *pos = in; (*pos) && !syntax_error; pos++) { for (const wchar_t *pos = in; (*pos) && !syntax_error; pos++) {
switch (*pos) { switch (*pos) {
case BRACKET_BEGIN: { case BRACE_BEGIN: {
if (bracket_count == 0) bracket_begin = pos; if (brace_count == 0) brace_begin = pos;
bracket_count++; brace_count++;
break; break;
} }
case BRACKET_END: { case BRACE_END: {
bracket_count--; brace_count--;
if (bracket_count < 0) { if (brace_count < 0) {
syntax_error = true; syntax_error = true;
} else if (bracket_count == 0) { } else if (brace_count == 0) {
bracket_end = pos; brace_end = pos;
} }
break; break;
} }
case BRACKET_SEP: { case BRACE_SEP: {
if (bracket_count == 1) last_sep = pos; if (brace_count == 1) last_sep = pos;
break; break;
} }
default: { default: {
@ -887,72 +888,80 @@ static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flag
} }
} }
if (bracket_count > 0) { if (brace_count > 0) {
if (!(flags & EXPAND_FOR_COMPLETIONS)) { if (!(flags & EXPAND_FOR_COMPLETIONS)) {
syntax_error = true; syntax_error = true;
} else { } else {
// The user hasn't typed an end bracket yet; make one up and append it, then expand // The user hasn't typed an end brace yet; make one up and append it, then expand
// that. // that.
wcstring mod; wcstring mod;
if (last_sep) { if (last_sep) {
mod.append(in, bracket_begin - in + 1); mod.append(in, brace_begin - in + 1);
mod.append(last_sep + 1); mod.append(last_sep + 1);
mod.push_back(BRACKET_END); mod.push_back(BRACE_END);
} else { } else {
mod.append(in); mod.append(in);
mod.push_back(BRACKET_END); mod.push_back(BRACE_END);
} }
// Note: this code looks very fishy, apparently it has never worked. // Note: this code looks very fishy, apparently it has never worked.
return expand_brackets(mod, 1, out, errors); return expand_braces(mod, 1, out, errors);
} }
} }
// Expand a literal "{}" to itself because it is useless otherwise, // Expand a literal "{}" to itself because it is useless otherwise,
// and this eases e.g. `find -exec {}`. See #1109. // and this eases e.g. `find -exec {}`. See #1109.
if (bracket_begin + 1 == bracket_end) { if (brace_begin + 1 == brace_end) {
wcstring newstr = instr; wcstring newstr = instr;
newstr.at(bracket_begin - in) = L'{'; newstr.at(brace_begin - in) = L'{';
newstr.at(bracket_end - in) = L'}'; newstr.at(brace_end - in) = L'}';
return expand_brackets(newstr, flags, out, errors); return expand_braces(newstr, flags, out, errors);
} }
if (syntax_error) { if (syntax_error) {
append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched brackets")); append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched braces"));
return EXPAND_ERROR; return EXPAND_ERROR;
} }
if (bracket_begin == NULL) { if (brace_begin == NULL) {
append_completion(out, instr); append_completion(out, instr);
return EXPAND_OK; return EXPAND_OK;
} }
length_preceding_brackets = (bracket_begin - in); length_preceding_braces = (brace_begin - in);
length_following_brackets = wcslen(bracket_end) - 1; length_following_braces = wcslen(brace_end) - 1;
tot_len = length_preceding_brackets + length_following_brackets; tot_len = length_preceding_braces + length_following_braces;
item_begin = bracket_begin + 1; item_begin = brace_begin + 1;
for (const wchar_t *pos = (bracket_begin + 1); true; pos++) { for (const wchar_t *pos = (brace_begin + 1); true; pos++) {
if (bracket_count == 0 && ((*pos == BRACKET_SEP) || (pos == bracket_end))) { if (brace_count == 0 && ((*pos == BRACE_SEP) || (pos == brace_end))) {
assert(pos >= item_begin); assert(pos >= item_begin);
size_t item_len = pos - item_begin; size_t item_len = pos - item_begin;
wcstring item = wcstring(item_begin, item_len);
item = trim(item, (const wchar_t[]) { BRACE_SPACE });
for (auto &c : item) {
if (c == BRACE_SPACE) {
c = ' ';
}
}
wcstring whole_item; wcstring whole_item;
whole_item.reserve(tot_len + item_len + 2); whole_item.reserve(tot_len + item_len + 2);
whole_item.append(in, length_preceding_brackets); whole_item.append(in, length_preceding_braces);
whole_item.append(item_begin, item_len); whole_item.append(item.begin(), item.end());
whole_item.append(bracket_end + 1); whole_item.append(brace_end + 1);
expand_brackets(whole_item, flags, out, errors); whole_item = trim(whole_item, (const wchar_t[]) { BRACE_SPACE });
expand_braces(whole_item, flags, out, errors);
item_begin = pos + 1; item_begin = pos + 1;
if (pos == bracket_end) break; if (pos == brace_end) break;
} }
if (*pos == BRACKET_BEGIN) { if (*pos == BRACE_BEGIN) {
bracket_count++; brace_count++;
} }
if (*pos == BRACKET_END) { if (*pos == BRACE_END) {
bracket_count--; brace_count--;
} }
} }
return EXPAND_OK; return EXPAND_OK;
@ -1274,9 +1283,9 @@ static expand_error_t expand_stage_variables(const wcstring &input, std::vector<
return EXPAND_OK; return EXPAND_OK;
} }
static expand_error_t expand_stage_brackets(const wcstring &input, std::vector<completion_t> *out, static expand_error_t expand_stage_braces(const wcstring &input, std::vector<completion_t> *out,
expand_flags_t flags, parse_error_list_t *errors) { expand_flags_t flags, parse_error_list_t *errors) {
return expand_brackets(input, flags, out, errors); return expand_braces(input, flags, out, errors);
} }
static expand_error_t expand_stage_home(const wcstring &input, static expand_error_t expand_stage_home(const wcstring &input,
@ -1393,7 +1402,7 @@ expand_error_t expand_string(const wcstring &input, std::vector<completion_t> *o
// Our expansion stages. // Our expansion stages.
const expand_stage_t stages[] = {expand_stage_cmdsubst, expand_stage_variables, const expand_stage_t stages[] = {expand_stage_cmdsubst, expand_stage_variables,
expand_stage_brackets, expand_stage_home, expand_stage_braces, expand_stage_home,
expand_stage_wildcards}; expand_stage_wildcards};
// Load up our single initial completion. // Load up our single initial completion.

View file

@ -65,11 +65,13 @@ enum {
/// Character representing variable expansion into a single element. /// Character representing variable expansion into a single element.
VARIABLE_EXPAND_SINGLE, VARIABLE_EXPAND_SINGLE,
/// Character representing the start of a bracket expansion. /// Character representing the start of a bracket expansion.
BRACKET_BEGIN, BRACE_BEGIN,
/// Character representing the end of a bracket expansion. /// Character representing the end of a bracket expansion.
BRACKET_END, BRACE_END,
/// Character representing separation between two bracket elements. /// Character representing separation between two bracket elements.
BRACKET_SEP, BRACE_SEP,
/// Character that takes the place of any whitespace within non-quoted text in braces
BRACE_SPACE,
/// Separate subtokens in a token with this character. /// Separate subtokens in a token with this character.
INTERNAL_SEPARATOR, INTERNAL_SEPARATOR,
/// Character representing an empty variable expansion. Only used transitively while expanding /// Character representing an empty variable expansion. Only used transitively while expanding

View file

@ -578,6 +578,15 @@ static void test_tokenizer() {
do_test(token.error_offset == 3); do_test(token.error_offset == 3);
} }
{
tokenizer_t t(L"abc )defg(hij", 0);
do_test(t.next(&token));
do_test(t.next(&token));
do_test(token.type == TOK_ERROR);
do_test(token.error == TOK_CLOSING_UNOPENED_SUBSHELL);
do_test(token.error_offset == 4);
}
{ {
tokenizer_t t(L"abc defg(hij (klm)", 0); tokenizer_t t(L"abc defg(hij (klm)", 0);
do_test(t.next(&token)); do_test(t.next(&token));
@ -4420,10 +4429,11 @@ static void test_illegal_command_exit_code() {
const command_result_tuple_t tests[] = { const command_result_tuple_t tests[] = {
{L"echo -n", STATUS_CMD_OK}, {L"pwd", STATUS_CMD_OK}, {L"echo -n", STATUS_CMD_OK}, {L"pwd", STATUS_CMD_OK},
{L")", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}, // a `)` without a matching `(` is now a tokenizer error, and cannot be executed even as an illegal command
// {L")", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}
{L"*", STATUS_ILLEGAL_CMD}, {L"**", STATUS_ILLEGAL_CMD}, {L"*", STATUS_ILLEGAL_CMD}, {L"**", STATUS_ILLEGAL_CMD},
{L"?", STATUS_ILLEGAL_CMD}, {L"abc?def", STATUS_ILLEGAL_CMD}, {L"?", STATUS_ILLEGAL_CMD}, {L"abc?def", STATUS_ILLEGAL_CMD},
{L") ", STATUS_ILLEGAL_CMD}}; };
int res = 0; int res = 0;
const io_chain_t empty_ios; const io_chain_t empty_ios;

View file

@ -122,9 +122,9 @@ bool is_potential_path(const wcstring &potential_path_fragment, const wcstring_l
switch (c) { switch (c) {
case VARIABLE_EXPAND: case VARIABLE_EXPAND:
case VARIABLE_EXPAND_SINGLE: case VARIABLE_EXPAND_SINGLE:
case BRACKET_BEGIN: case BRACE_BEGIN:
case BRACKET_END: case BRACE_END:
case BRACKET_SEP: case BRACE_SEP:
case ANY_CHAR: case ANY_CHAR:
case ANY_STRING: case ANY_STRING:
case ANY_STRING_RECURSIVE: { case ANY_STRING_RECURSIVE: {

View file

@ -169,6 +169,7 @@ enum parse_error_code_t {
parse_error_tokenizer_unterminated_subshell, parse_error_tokenizer_unterminated_subshell,
parse_error_tokenizer_unterminated_slice, parse_error_tokenizer_unterminated_slice,
parse_error_tokenizer_unterminated_escape, parse_error_tokenizer_unterminated_escape,
parse_error_tokenizer_nested_slice,
parse_error_tokenizer_other, parse_error_tokenizer_other,
parse_error_unbalancing_end, // end outside of block parse_error_unbalancing_end, // end outside of block

View file

@ -668,35 +668,10 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta
} }
void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) { void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) {
parse_error_code_t parse_error_code; parse_error_code_t parse_error_code = tok.error->parser_error;
switch (tok.error) {
case TOK_UNTERMINATED_QUOTE: {
parse_error_code = parse_error_tokenizer_unterminated_quote;
break;
}
case TOK_UNTERMINATED_SUBSHELL: {
parse_error_code = parse_error_tokenizer_unterminated_subshell;
break;
}
case TOK_UNTERMINATED_SLICE: {
parse_error_code = parse_error_tokenizer_unterminated_slice;
break;
}
case TOK_UNTERMINATED_ESCAPE: {
parse_error_code = parse_error_tokenizer_unterminated_escape;
break;
}
case TOK_INVALID_REDIRECT:
case TOK_INVALID_PIPE:
default: {
parse_error_code = parse_error_tokenizer_other;
break;
}
}
this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset, this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset,
parse_error_code, L"%ls", parse_error_code, L"%ls",
error_message_for_code(tok.error).c_str()); tok.error->Message);
} }
void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) { void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {

View file

@ -834,14 +834,14 @@ void parse_util_expand_variable_error(const wcstring &token, size_t global_token
wchar_t char_after_dollar = dollar_pos + 1 >= token.size() ? 0 : token.at(dollar_pos + 1); wchar_t char_after_dollar = dollar_pos + 1 >= token.size() ? 0 : token.at(dollar_pos + 1);
switch (char_after_dollar) { switch (char_after_dollar) {
case BRACKET_BEGIN: case BRACE_BEGIN:
case L'{': { case L'{': {
// The BRACKET_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible // The BRACE_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible
// quoted) ${. See if we have a }, and the stuff in between is variable material. If so, // quoted) ${. See if we have a }, and the stuff in between is variable material. If so,
// report a bracket error. Otherwise just complain about the ${. // report a bracket error. Otherwise just complain about the ${.
bool looks_like_variable = false; bool looks_like_variable = false;
size_t closing_bracket = size_t closing_bracket =
token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACKET_END), dollar_pos + 2); token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACE_END), dollar_pos + 2);
wcstring var_name; wcstring var_name;
if (closing_bracket != wcstring::npos) { if (closing_bracket != wcstring::npos) {
size_t var_start = dollar_pos + 2, var_end = closing_bracket; size_t var_start = dollar_pos + 2, var_end = closing_bracket;

View file

@ -16,46 +16,22 @@
#include "tokenizer.h" #include "tokenizer.h"
#include "wutil.h" // IWYU pragma: keep #include "wutil.h" // IWYU pragma: keep
/// Error string for unexpected end of string. tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L"");
#define QUOTE_ERROR _(L"Unexpected end of string, quotes are not balanced") tokenizer_error *TOK_UNTERMINATED_QUOTE = new tokenizer_error((L"Unexpected end of string, quotes are not balanced"), parse_error_tokenizer_unterminated_quote);
tokenizer_error *TOK_UNTERMINATED_SUBSHELL = new tokenizer_error((L"Unexpected end of string, expecting ')'"), parse_error_tokenizer_unterminated_subshell);
/// Error string for mismatched parenthesis. tokenizer_error *TOK_UNTERMINATED_SLICE = new tokenizer_error((L"Unexpected end of string, square brackets do not match"), parse_error_tokenizer_unterminated_slice);
#define PARAN_ERROR _(L"Unexpected end of string, parenthesis do not match") tokenizer_error *TOK_UNTERMINATED_ESCAPE = new tokenizer_error((L"Unexpected end of string, incomplete escape sequence"), parse_error_tokenizer_unterminated_escape);
tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error((L"Invalid input/output redirection"));
/// Error string for mismatched square brackets. tokenizer_error *TOK_INVALID_PIPE = new tokenizer_error((L"Cannot use stdin (fd 0) as pipe output"));
#define SQUARE_BRACKET_ERROR _(L"Unexpected end of string, square brackets do not match") tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL = new tokenizer_error((L"Unexpected ')' for unopened parenthesis"));
tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error((L"Unexpected '[' at this location"));
/// Error string for unterminated escape (backslash without continuation). tokenizer_error *TOK_CLOSING_UNOPENED_BRACE = new tokenizer_error((L"Unexpected '}' for unopened brace expansion"));
#define UNTERMINATED_ESCAPE_ERROR _(L"Unexpected end of string, incomplete escape sequence") tokenizer_error *TOK_UNTERMINATED_BRACE = new tokenizer_error((L"Unexpected end of string, incomplete parameter expansion"));
tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE = new tokenizer_error((L"Unexpected '}' found, expecting ')'"));
/// Error string for invalid redirections. tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE = new tokenizer_error((L"Unexpected ')' found, expecting '}'"));
#define REDIRECT_ERROR _(L"Invalid input/output redirection")
/// Error string for when trying to pipe from fd 0.
#define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output")
wcstring error_message_for_code(tokenizer_error err) {
switch (err) {
case TOK_UNTERMINATED_QUOTE:
return QUOTE_ERROR;
case TOK_UNTERMINATED_SUBSHELL:
return PARAN_ERROR;
case TOK_UNTERMINATED_SLICE:
return SQUARE_BRACKET_ERROR;
case TOK_UNTERMINATED_ESCAPE:
return UNTERMINATED_ESCAPE_ERROR;
case TOK_INVALID_REDIRECT:
return REDIRECT_ERROR;
case TOK_INVALID_PIPE:
return PIPE_ERROR;
default:
assert(0 && "Unknown error type");
return {};
}
}
/// Return an error token and mark that we no longer have a next token. /// Return an error token and mark that we no longer have a next token.
tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start, tok_t tokenizer_t::call_error(tokenizer_error *error_type, const wchar_t *token_start,
const wchar_t *error_loc) { const wchar_t *error_loc) {
assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error"); assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error");
assert(error_loc >= token_start && "Invalid error location"); assert(error_loc >= token_start && "Invalid error location");
@ -119,194 +95,166 @@ static bool tok_is_string_character(wchar_t c, bool is_first) {
/// Quick test to catch the most common 'non-magical' characters, makes read_string slightly faster /// Quick test to catch the most common 'non-magical' characters, makes read_string slightly faster
/// by adding a fast path for the most common characters. This is obviously not a suitable /// by adding a fast path for the most common characters. This is obviously not a suitable
/// replacement for iswalpha. /// replacement for iswalpha.
static int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); } static inline int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }
/// Flags describing what the string reader is currently inside of. The values are distinct bits
/// and are combined with the bitwise operators generated by ENUM_FLAGS, so several can be set at
/// the same time.
ENUM_FLAGS(tok_mode) {
regular_text = 0, // regular text: no flag set
subshell = 1 << 0, // inside of subshell parentheses
array_brackets = 1 << 1, // inside of array brackets
curly_braces = 1 << 2, // inside of curly braces
char_escape = 1 << 3, // the previous character was a backslash; the next character is skipped over
};
/// Read the next token as a string. /// Read the next token as a string.
tok_t tokenizer_t::read_string() { tok_t tokenizer_t::read_string() {
bool do_loop = true; tok_mode mode { tok_mode::regular_text };
size_t paran_count = 0; std::vector<int> paran_offsets;
// Up to 96 open parens, before we give up on good error reporting. std::vector<int> brace_offsets;
const size_t paran_offsets_max = 96; std::vector<char> expecting;
size_t paran_offsets[paran_offsets_max]; int slice_offset = 0;
// Where the open bracket is.
size_t offset_of_bracket = 0;
const wchar_t *const buff_start = this->buff; const wchar_t *const buff_start = this->buff;
bool is_first = true; bool is_first = true;
enum tok_mode_t { while (true) {
mode_regular_text = 0, // regular text wchar_t c = *this->buff;
mode_subshell = 1, // inside of subshell #if false
mode_array_brackets = 2, // inside of array brackets wcstring msg = L"Handling 0x%x (%lc)";
mode_array_brackets_and_subshell = tok_mode mode_begin = mode;
3 // inside of array brackets and subshell, like in '$foo[(ech' #endif
} mode = mode_regular_text;
while (1) { if (c == L'\0') {
if (!myal(*this->buff)) { break;
if (*this->buff == L'\\') {
const wchar_t *error_location = this->buff;
this->buff++;
if (*this->buff == L'\0') {
if ((!this->accept_unfinished)) {
return this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
error_location);
}
// Since we are about to increment tok->buff, decrement it first so the
// increment doesn't go past the end of the buffer. See issue #389.
this->buff--;
do_loop = 0;
}
this->buff++;
continue;
}
switch (mode) {
case mode_regular_text: {
switch (*this->buff) {
case L'(': {
paran_count = 1;
paran_offsets[0] = this->buff - this->start;
mode = mode_subshell;
break;
}
case L'[': {
if (this->buff != buff_start) {
mode = mode_array_brackets;
offset_of_bracket = this->buff - this->start;
}
break;
}
case L'\'':
case L'"': {
const wchar_t *end = quote_end(this->buff);
if (end) {
this->buff = end;
} else {
const wchar_t *error_loc = this->buff;
this->buff += wcslen(this->buff);
if (!this->accept_unfinished) {
return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
error_loc);
}
do_loop = 0;
}
break;
}
default: {
if (!tok_is_string_character(*(this->buff), is_first)) {
do_loop = 0;
}
break;
}
}
break;
}
case mode_array_brackets_and_subshell:
case mode_subshell: {
switch (*this->buff) {
case L'\'':
case L'\"': {
const wchar_t *end = quote_end(this->buff);
if (end) {
this->buff = end;
} else {
const wchar_t *error_loc = this->buff;
this->buff += wcslen(this->buff);
if ((!this->accept_unfinished)) {
return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
error_loc);
}
do_loop = 0;
}
break;
}
case L'(': {
if (paran_count < paran_offsets_max) {
paran_offsets[paran_count] = this->buff - this->start;
}
paran_count++;
break;
}
case L')': {
assert(paran_count > 0);
paran_count--;
if (paran_count == 0) {
mode =
(mode == mode_array_brackets_and_subshell ? mode_array_brackets
: mode_regular_text);
}
break;
}
case L'\0': {
do_loop = 0;
break;
}
default: {
break; // ignore other chars
}
}
break;
}
case mode_array_brackets: {
switch (*this->buff) {
case L'(': {
paran_count = 1;
paran_offsets[0] = this->buff - this->start;
mode = mode_array_brackets_and_subshell;
break;
}
case L']': {
mode = mode_regular_text;
break;
}
case L'\0': {
do_loop = 0;
break;
}
default: {
break; // ignore other chars
}
}
break;
}
}
} }
if (!do_loop) break; // Make sure this character isn't being escaped before anything else
if ((mode & tok_mode::char_escape) == tok_mode::char_escape) {
mode &= ~(tok_mode::char_escape);
// and do nothing more
}
else if (myal(c)) {
// Early exit optimization in case the character is just a letter,
// which has no special meaning to the tokenizer, i.e. the same mode continues.
}
// Now proceed with the evaluation of the token, first checking to see if the token
// has been explicitly ignored (escaped).
else if (c == L'\\') {
mode |= tok_mode::char_escape;
}
else if (c == L'(') {
paran_offsets.push_back(this->buff - this->start);
expecting.push_back(L')');
mode |= tok_mode::subshell;
}
else if (c == L'{') {
brace_offsets.push_back(this->buff - this->start);
expecting.push_back(L'}');
mode |= tok_mode::curly_braces;
}
else if (c == L')') {
if (expecting.size() > 0 && expecting.back() == L'}') {
return this->call_error(TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, this->start, this->buff);
}
switch (paran_offsets.size()) {
case 0:
return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff);
case 1:
mode &= ~(tok_mode::subshell);
default:
paran_offsets.pop_back();
}
expecting.pop_back();
}
else if (c == L'}') {
if (expecting.size() > 0 && expecting.back() == L')') {
return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff);
}
switch (brace_offsets.size()) {
case 0:
return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff);
case 1:
mode &= ~(tok_mode::curly_braces);
default:
brace_offsets.pop_back();
}
expecting.pop_back();
}
else if (c == L'[') {
if (this->buff != buff_start) {
if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) {
// Nested brackets should not overwrite the existing slice_offset
//mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell
//prints an error message with the caret pointing at token_start,
//not err_loc, making the TOK_ILLEGAL_SLICE message misleading.
// return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff);
return this->call_error(TOK_UNTERMINATED_SLICE, this->start, this->buff);
}
slice_offset = this->buff - this->start;
mode |= tok_mode::array_brackets;
}
else {
// This is actually allowed so the test operator `[` can be used as the head of a command
}
}
// Only exit bracket mode if we are in bracket mode.
// Reason: `]` can be a parameter, e.g. last parameter to `[` test alias.
// e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket
else if (c == L']' && ((mode & tok_mode::array_brackets) == tok_mode::array_brackets)) {
mode &= ~(tok_mode::array_brackets);
}
else if (c == L'\'' || c == L'"') {
const wchar_t *end = quote_end(this->buff);
if (end) {
this->buff = end;
} else {
const wchar_t *error_loc = this->buff;
this->buff += wcslen(this->buff);
if ((!this->accept_unfinished)) {
return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, error_loc);
}
break;
}
}
else if (mode == tok_mode::regular_text && !tok_is_string_character(c, is_first)) {
break;
}
#if false
if (mode != mode_begin) {
msg.append(L": mode 0x%x -> 0x%x\n");
} else {
msg.push_back(L'\n');
}
debug(0, msg.c_str(), c, c, int(mode_begin), int(mode));
#endif
this->buff++; this->buff++;
is_first = false; is_first = false;
} }
if ((!this->accept_unfinished) && (mode != mode_regular_text)) { if ((!this->accept_unfinished) && (mode != tok_mode::regular_text)) {
tok_t error; tok_t error;
switch (mode) { if ((mode & tok_mode::char_escape) == tok_mode::char_escape) {
case mode_subshell: { error = this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
// Determine the innermost opening paran offset by interrogating paran_offsets. this->buff - 1);
assert(paran_count > 0); }
size_t offset_of_open_paran = 0; else if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) {
if (paran_count <= paran_offsets_max) { error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
offset_of_open_paran = paran_offsets[paran_count - 1]; this->start + slice_offset);
} }
else if ((mode & tok_mode::subshell) == tok_mode::subshell) {
assert(paran_offsets.size() > 0);
size_t offset_of_open_paran = paran_offsets.back();
error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start, error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
this->start + offset_of_open_paran); this->start + offset_of_open_paran);
break; }
} else if ((mode & tok_mode::curly_braces) == tok_mode::curly_braces) {
case mode_array_brackets: assert(brace_offsets.size() > 0);
case mode_array_brackets_and_subshell: { size_t offset_of_open_brace = brace_offsets.back();
error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
this->start + offset_of_bracket); error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start,
break; this->start + offset_of_open_brace);
}
default: {
DIE("unexpected mode in read_string");
break;
}
} }
return error; return error;
} }

View file

@ -7,6 +7,7 @@
#include "common.h" #include "common.h"
#include "maybe.h" #include "maybe.h"
#include "parse_constants.h"
/// Token types. /// Token types.
enum token_type { enum token_type {
@ -22,17 +23,26 @@ enum token_type {
TOK_COMMENT /// comment token TOK_COMMENT /// comment token
}; };
/// Tokenizer error types. struct tokenizer_error {
enum tokenizer_error { const wchar_t *Message;
TOK_ERROR_NONE, enum parse_error_code_t parser_error; //the parser error associated with this tokenizer error
TOK_UNTERMINATED_QUOTE, tokenizer_error(const wchar_t *msg, enum parse_error_code_t perr = parse_error_tokenizer_other)
TOK_UNTERMINATED_SUBSHELL, : Message(msg), parser_error(perr) {}
TOK_UNTERMINATED_SLICE, tokenizer_error(const tokenizer_error&) = delete;
TOK_UNTERMINATED_ESCAPE,
TOK_INVALID_REDIRECT,
TOK_INVALID_PIPE
}; };
/// Pointers to the tokenizer error descriptors. A token's `error` member points at one of these;
/// TOK_ERROR_NONE is the value used when there is no error.
/// NOTE(review): the tokenizer implementation also defines TOK_EXPECTED_PCLOSE_FOUND_BCLOSE and
/// TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, which are not declared in this list — confirm whether they are
/// meant to be visible outside that file.
extern tokenizer_error *TOK_ERROR_NONE;
extern tokenizer_error *TOK_UNTERMINATED_QUOTE;
extern tokenizer_error *TOK_UNTERMINATED_SUBSHELL;
extern tokenizer_error *TOK_UNTERMINATED_SLICE;
extern tokenizer_error *TOK_UNTERMINATED_ESCAPE;
extern tokenizer_error *TOK_UNTERMINATED_BRACE;
extern tokenizer_error *TOK_INVALID_REDIRECT;
extern tokenizer_error *TOK_INVALID_PIPE;
extern tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL;
extern tokenizer_error *TOK_CLOSING_UNOPENED_BRACE;
extern tokenizer_error *TOK_ILLEGAL_SLICE;
enum class redirection_type_t { enum class redirection_type_t {
overwrite, // normal redirection: > file.txt overwrite, // normal redirection: > file.txt
append, // appending redirection: >> file.txt append, // appending redirection: >> file.txt
@ -67,7 +77,7 @@ struct tok_t {
maybe_t<int> redirected_fd{}; maybe_t<int> redirected_fd{};
// If an error, this is the error code. // If an error, this is the error code.
enum tokenizer_error error { TOK_ERROR_NONE }; tokenizer_error *error { TOK_ERROR_NONE };
// If an error, this is the offset of the error within the token. A value of 0 means it occurred // If an error, this is the offset of the error within the token. A value of 0 means it occurred
// at 'offset'. // at 'offset'.
@ -97,7 +107,7 @@ class tokenizer_t {
/// Whether to continue the previous line after the comment. /// Whether to continue the previous line after the comment.
bool continue_line_after_comment{false}; bool continue_line_after_comment{false};
tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start, tok_t call_error(tokenizer_error *error_type, const wchar_t *token_start,
const wchar_t *error_loc); const wchar_t *error_loc);
tok_t read_string(); tok_t read_string();
maybe_t<tok_t> tok_next(); maybe_t<tok_t> tok_next();

View file

@ -45,3 +45,14 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) {
output.push_back(ellipsis_char); output.push_back(ellipsis_char);
return output; return output;
} }
/// Return a copy of \p input with every leading and trailing character that occurs in \p any_of
/// removed. Characters from \p any_of that appear between two kept characters are left in place.
/// If \p input is empty or consists solely of characters from \p any_of, the result is empty.
///
/// \p any_of is handed to find_first_not_of / find_last_not_of as a plain character pointer, so it
/// has to be a null-terminated string.
wcstring trim(const wcstring &input, const wchar_t *any_of) {
    const auto first_kept = input.find_first_not_of(any_of);
    if (first_kept != wcstring::npos) {
        // At least one character survives, so the backwards search is guaranteed to succeed and
        // to land at or after first_kept.
        const auto last_kept = input.find_last_not_of(any_of);
        return input.substr(first_kept, last_kept - first_kept + 1);
    }
    return wcstring();
}

View file

@ -59,5 +59,6 @@ enum class ellipsis_type {
}; };
wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest); wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest);
wcstring trim(const wcstring &input, const wchar_t *any_of);
#endif #endif

View file

View file

@ -0,0 +1,34 @@
# Brace expansion tests; the output of this script is compared against the expected-output file.
# basic expansion test: empty braces stay literal, a single item expands to itself
echo {}
echo {apple}
echo {apple,orange}
# expansion tests with spaces: unquoted whitespace around the items is dropped
echo {apple, orange}
echo { apple, orange, banana }
# expansion with spaces and cartesian products: the quotes are attached to every item
echo \'{ hello , world }\'
# expansion with escapes: escaped commas and spaces stay part of the item
for phrase in {good\,, beautiful ,morning}; echo -n "$phrase "; end | string trim;
for phrase in {goodbye\,,\ cruel\ ,world\n}; echo -n $phrase; end;
# whitespace within entries converted to spaces in a single entry
for foo in { hello
world }
echo \'$foo\'
end
# dual expansion cartesian product; sed strips the trailing comma left by the second brace group
# NOTE(review): `sed -r` is the GNU spelling for extended regexps — confirm it is accepted by sed
# on every platform the test suite runs on.
echo { alpha, beta }\ {lambda, gamma }, | sed -r 's/(.*),/\1/'
# expansion with subshells: command substitutions are allowed as brace items
for name in { (echo Meg), (echo Jo) }
echo $name
end
# subshells with expansion: brace expansion inside a command substitution
for name in (for name in {Beth, Amy}; printf "$name\n"; end); printf "$name\n"; end
# vim: set ft=fish:

View file

@ -0,0 +1,14 @@
{}
apple
apple orange
apple orange
apple orange banana
'hello' 'world'
good, beautiful morning
goodbye, cruel world
'hello world'
alpha lambda, beta lambda, alpha gamma, beta gamma
Meg
Jo
Beth
Amy