Mirror of https://github.com/fish-shell/fish-shell
Merge branch 'fix_brace_parsing'
Closes #3802 and improves tokenizer handling of invalid expressions involving braces, parentheses, and brackets.
Commit d385248cc8
16 changed files with 360 additions and 315 deletions
@@ -1288,10 +1288,11 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in
     const bool unescape_special = static_cast<bool>(flags & UNESCAPE_SPECIAL);
     const bool allow_incomplete = static_cast<bool>(flags & UNESCAPE_INCOMPLETE);
 
-    int bracket_count = 0;
+    bool brace_text_start = false;
+    int brace_count = 0;
 
     bool errored = false;
-    enum { mode_unquoted, mode_single_quotes, mode_double_quotes } mode = mode_unquoted;
+    enum { mode_unquoted, mode_single_quotes, mode_double_quotes, mode_braces } mode = mode_unquoted;
 
     for (size_t input_position = 0; input_position < input_len && !errored; input_position++) {
         const wchar_t c = input[input_position];
@@ -1352,21 +1353,32 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in
                 }
                 case L'{': {
                     if (unescape_special) {
-                        bracket_count++;
-                        to_append_or_none = BRACKET_BEGIN;
+                        brace_count++;
+                        to_append_or_none = BRACE_BEGIN;
                     }
                     break;
                 }
                 case L'}': {
                     if (unescape_special) {
-                        bracket_count--;
-                        to_append_or_none = BRACKET_END;
+                        assert(brace_count > 0 && "imbalanced brackets are a tokenizer error, we shouldn't be able to get here");
+                        brace_count--;
+                        brace_text_start = brace_text_start && brace_count > 0;
+                        to_append_or_none = BRACE_END;
                     }
                     break;
                 }
                 case L',': {
-                    if (unescape_special && bracket_count > 0) {
-                        to_append_or_none = BRACKET_SEP;
+                    if (unescape_special && brace_count > 0) {
+                        to_append_or_none = BRACE_SEP;
+                        brace_text_start = false;
                     }
                     break;
                 }
+                case L'\n':
+                case L'\t':
+                case L' ': {
+                    if (unescape_special && brace_count > 0) {
+                        to_append_or_none = brace_text_start ? BRACE_SPACE : NOT_A_WCHAR;
+                    }
+                    break;
+                }
@@ -1380,7 +1392,12 @@ static bool unescape_string_internal(const wchar_t *const input, const size_t in
                     to_append_or_none = unescape_special ? wint_t(INTERNAL_SEPARATOR) : NOT_A_WCHAR;
                     break;
                 }
-                default: { break; }
+                default: {
+                    if (unescape_special && brace_count > 0) {
+                        brace_text_start = true;
+                    }
+                    break;
+                }
             }
         } else if (mode == mode_single_quotes) {
             if (c == L'\\') {
src/common.h (13 changes)
@@ -807,6 +807,19 @@ struct enum_map {
     const wchar_t *const str;
 };
 
+
+/// Use for scoped enums (i.e. `enum class`) with bitwise operations
+#define ENUM_FLAG_OPERATOR(T,X,Y) \
+inline T operator X (T lhs, T rhs) { return (T) (static_cast<std::underlying_type<T>::type>(lhs) X static_cast<std::underlying_type<T>::type>(rhs)); } \
+inline T operator Y (T &lhs, T rhs) { return lhs = (T) (static_cast<std::underlying_type<T>::type>(lhs) X static_cast<std::underlying_type<T>::type>(rhs)); }
+#define ENUM_FLAGS(T) \
+enum class T; \
+inline T operator ~ (T t) { return (T) (~static_cast<std::underlying_type<T>::type>(t)); } \
+ENUM_FLAG_OPERATOR(T,|,|=) \
+ENUM_FLAG_OPERATOR(T,^,^=) \
+ENUM_FLAG_OPERATOR(T,&,&=) \
+enum class T
+
 /// Given a string return the matching enum. Return the sentinal enum if no match is made. The map
 /// must be sorted by the `str` member. A binary search is twice as fast as a linear search with 16
 /// elements in the map.
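For illustration only (not part of this commit): the ENUM_FLAGS macro above forward-declares a scoped enum and generates ~, |, ^, & and the matching compound assignments for it, so flag values can be combined and tested without manual casts. A minimal sketch of how it might be used; the tok_mode values are copied from the tokenizer hunk later in this diff, while the helper functions and the explicit <type_traits> include are assumptions:

// Illustrative sketch only; not part of this commit.
// Assumes the ENUM_FLAGS/ENUM_FLAG_OPERATOR macros from src/common.h above.
#include <type_traits>
#include "common.h"

ENUM_FLAGS(tok_mode) {
    regular_text = 0,
    subshell = 1 << 0,
    array_brackets = 1 << 1,
    curly_braces = 1 << 2,
    char_escape = 1 << 3,
};

// Bitwise AND works directly on the scoped enum because ENUM_FLAGS generated operator&.
static bool in_subshell(tok_mode mode) {
    return (mode & tok_mode::subshell) == tok_mode::subshell;
}

// operator|= also comes from the macro, so flags can be set without casts.
static tok_mode enter_braces(tok_mode mode) {
    mode |= tok_mode::curly_braces;
    return mode;
}

The test `(mode & tok_mode::subshell) == tok_mode::subshell` is the same pattern the rewritten read_string uses further down in this diff.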
src/expand.cpp (103 changes)
@@ -47,6 +47,7 @@
 #include "proc.h"
 #include "reader.h"
 #include "wildcard.h"
+#include "wcstringutil.h"
 #include "wutil.h" // IWYU pragma: keep
 #ifdef KERN_PROCARGS2
 #else
@@ -570,7 +571,7 @@ static void find_process(const wchar_t *proc, expand_flags_t flags,
 static size_t parse_slice(const wchar_t *in, wchar_t **end_ptr, std::vector<long> &idx,
                           std::vector<size_t> &source_positions, size_t array_size) {
     const long size = (long)array_size;
-    size_t pos = 1; // skip past the opening square bracket
+    size_t pos = 1; // skip past the opening square brace
 
     while (1) {
         while (iswspace(in[pos]) || (in[pos] == INTERNAL_SEPARATOR)) pos++;
@@ -846,39 +847,39 @@ static bool expand_variables(const wcstring &instr, std::vector<completion_t> *o
     return true;
 }
 
-/// Perform bracket expansion.
-static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flags,
+/// Perform brace expansion.
+static expand_error_t expand_braces(const wcstring &instr, expand_flags_t flags,
                                     std::vector<completion_t> *out, parse_error_list_t *errors) {
     bool syntax_error = false;
-    int bracket_count = 0;
+    int brace_count = 0;
 
-    const wchar_t *bracket_begin = NULL, *bracket_end = NULL;
+    const wchar_t *brace_begin = NULL, *brace_end = NULL;
     const wchar_t *last_sep = NULL;
 
     const wchar_t *item_begin;
-    size_t length_preceding_brackets, length_following_brackets, tot_len;
+    size_t length_preceding_braces, length_following_braces, tot_len;
 
     const wchar_t *const in = instr.c_str();
 
-    // Locate the first non-nested bracket pair.
+    // Locate the first non-nested brace pair.
     for (const wchar_t *pos = in; (*pos) && !syntax_error; pos++) {
         switch (*pos) {
-            case BRACKET_BEGIN: {
-                if (bracket_count == 0) bracket_begin = pos;
-                bracket_count++;
+            case BRACE_BEGIN: {
+                if (brace_count == 0) brace_begin = pos;
+                brace_count++;
                 break;
             }
-            case BRACKET_END: {
-                bracket_count--;
-                if (bracket_count < 0) {
+            case BRACE_END: {
+                brace_count--;
+                if (brace_count < 0) {
                     syntax_error = true;
-                } else if (bracket_count == 0) {
-                    bracket_end = pos;
+                } else if (brace_count == 0) {
+                    brace_end = pos;
                 }
                 break;
             }
-            case BRACKET_SEP: {
-                if (bracket_count == 1) last_sep = pos;
+            case BRACE_SEP: {
+                if (brace_count == 1) last_sep = pos;
                 break;
             }
             default: {
@@ -887,72 +888,80 @@ static expand_error_t expand_brackets(const wcstring &instr, expand_flags_t flag
             }
         }
 
-    if (bracket_count > 0) {
+    if (brace_count > 0) {
         if (!(flags & EXPAND_FOR_COMPLETIONS)) {
             syntax_error = true;
         } else {
-            // The user hasn't typed an end bracket yet; make one up and append it, then expand
+            // The user hasn't typed an end brace yet; make one up and append it, then expand
             // that.
             wcstring mod;
             if (last_sep) {
-                mod.append(in, bracket_begin - in + 1);
+                mod.append(in, brace_begin - in + 1);
                 mod.append(last_sep + 1);
-                mod.push_back(BRACKET_END);
+                mod.push_back(BRACE_END);
             } else {
                 mod.append(in);
-                mod.push_back(BRACKET_END);
+                mod.push_back(BRACE_END);
             }
 
             // Note: this code looks very fishy, apparently it has never worked.
-            return expand_brackets(mod, 1, out, errors);
+            return expand_braces(mod, 1, out, errors);
         }
     }
 
     // Expand a literal "{}" to itself because it is useless otherwise,
     // and this eases e.g. `find -exec {}`. See #1109.
-    if (bracket_begin + 1 == bracket_end) {
+    if (brace_begin + 1 == brace_end) {
         wcstring newstr = instr;
-        newstr.at(bracket_begin - in) = L'{';
-        newstr.at(bracket_end - in) = L'}';
-        return expand_brackets(newstr, flags, out, errors);
+        newstr.at(brace_begin - in) = L'{';
+        newstr.at(brace_end - in) = L'}';
+        return expand_braces(newstr, flags, out, errors);
     }
 
     if (syntax_error) {
-        append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched brackets"));
+        append_syntax_error(errors, SOURCE_LOCATION_UNKNOWN, _(L"Mismatched braces"));
         return EXPAND_ERROR;
     }
 
-    if (bracket_begin == NULL) {
+    if (brace_begin == NULL) {
         append_completion(out, instr);
         return EXPAND_OK;
     }
 
-    length_preceding_brackets = (bracket_begin - in);
-    length_following_brackets = wcslen(bracket_end) - 1;
-    tot_len = length_preceding_brackets + length_following_brackets;
-    item_begin = bracket_begin + 1;
-    for (const wchar_t *pos = (bracket_begin + 1); true; pos++) {
-        if (bracket_count == 0 && ((*pos == BRACKET_SEP) || (pos == bracket_end))) {
+    length_preceding_braces = (brace_begin - in);
+    length_following_braces = wcslen(brace_end) - 1;
+    tot_len = length_preceding_braces + length_following_braces;
+    item_begin = brace_begin + 1;
+    for (const wchar_t *pos = (brace_begin + 1); true; pos++) {
+        if (brace_count == 0 && ((*pos == BRACE_SEP) || (pos == brace_end))) {
             assert(pos >= item_begin);
             size_t item_len = pos - item_begin;
+            wcstring item = wcstring(item_begin, item_len);
+            item = trim(item, (const wchar_t[]) { BRACE_SPACE });
+            for (auto &c : item) {
+                if (c == BRACE_SPACE) {
+                    c = ' ';
+                }
+            }
 
             wcstring whole_item;
             whole_item.reserve(tot_len + item_len + 2);
-            whole_item.append(in, length_preceding_brackets);
-            whole_item.append(item_begin, item_len);
-            whole_item.append(bracket_end + 1);
-            expand_brackets(whole_item, flags, out, errors);
+            whole_item.append(in, length_preceding_braces);
+            whole_item.append(item.begin(), item.end());
+            whole_item.append(brace_end + 1);
+            whole_item = trim(whole_item, (const wchar_t[]) { BRACE_SPACE });
+            expand_braces(whole_item, flags, out, errors);
 
             item_begin = pos + 1;
-            if (pos == bracket_end) break;
+            if (pos == brace_end) break;
         }
 
-        if (*pos == BRACKET_BEGIN) {
-            bracket_count++;
+        if (*pos == BRACE_BEGIN) {
+            brace_count++;
        }
 
-        if (*pos == BRACKET_END) {
-            bracket_count--;
+        if (*pos == BRACE_END) {
+            brace_count--;
        }
     }
     return EXPAND_OK;
@@ -1274,9 +1283,9 @@ static expand_error_t expand_stage_variables(const wcstring &input, std::vector<
     return EXPAND_OK;
 }
 
-static expand_error_t expand_stage_brackets(const wcstring &input, std::vector<completion_t> *out,
+static expand_error_t expand_stage_braces(const wcstring &input, std::vector<completion_t> *out,
                                             expand_flags_t flags, parse_error_list_t *errors) {
-    return expand_brackets(input, flags, out, errors);
+    return expand_braces(input, flags, out, errors);
 }
 
 static expand_error_t expand_stage_home(const wcstring &input,
@@ -1393,7 +1402,7 @@ expand_error_t expand_string(const wcstring &input, std::vector<completion_t> *o
 
     // Our expansion stages.
     const expand_stage_t stages[] = {expand_stage_cmdsubst, expand_stage_variables,
-                                     expand_stage_brackets, expand_stage_home,
+                                     expand_stage_braces, expand_stage_home,
                                      expand_stage_wildcards};
 
     // Load up our single initial completion.
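The per-item handling in expand_braces above trims the reserved BRACE_SPACE character from both ends of each element and rewrites any remaining occurrences as ordinary spaces, which is what lets an expression like { apple, orange } expand to plain "apple" and "orange". A standalone sketch of that normalization; the placeholder value and function names here are hypothetical, and the real code uses the trim() helper added elsewhere in this commit:

// Standalone sketch of the whitespace normalization above; not the fish implementation.
// BRACE_SPACE_PLACEHOLDER is a hypothetical stand-in for the reserved BRACE_SPACE character.
#include <string>

static const wchar_t BRACE_SPACE_PLACEHOLDER = L'\x1f';

// Drop placeholders from both ends, the way expand_braces trims each item.
static std::wstring trim_placeholder(const std::wstring &input) {
    const size_t begin = input.find_first_not_of(BRACE_SPACE_PLACEHOLDER);
    if (begin == std::wstring::npos) return {};
    const size_t end = input.find_last_not_of(BRACE_SPACE_PLACEHOLDER);
    return input.substr(begin, end - begin + 1);
}

// An item such as "<p>hello<p>" (where <p> is the placeholder) becomes "hello";
// any placeholder left in the middle is written out as a regular space.
static std::wstring normalize_brace_item(std::wstring item) {
    item = trim_placeholder(item);
    for (auto &c : item) {
        if (c == BRACE_SPACE_PLACEHOLDER) c = L' ';
    }
    return item;
}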
@@ -65,11 +65,13 @@ enum {
     /// Character representing variable expansion into a single element.
     VARIABLE_EXPAND_SINGLE,
     /// Character representing the start of a bracket expansion.
-    BRACKET_BEGIN,
+    BRACE_BEGIN,
     /// Character representing the end of a bracket expansion.
-    BRACKET_END,
+    BRACE_END,
     /// Character representing separation between two bracket elements.
-    BRACKET_SEP,
+    BRACE_SEP,
+    /// Character that takes the place of any whitespace within non-quoted text in braces
+    BRACE_SPACE,
     /// Separate subtokens in a token with this character.
     INTERNAL_SEPARATOR,
     /// Character representing an empty variable expansion. Only used transitively while expanding
@@ -578,6 +578,15 @@ static void test_tokenizer() {
         do_test(token.error_offset == 3);
     }
 
+    {
+        tokenizer_t t(L"abc )defg(hij", 0);
+        do_test(t.next(&token));
+        do_test(t.next(&token));
+        do_test(token.type == TOK_ERROR);
+        do_test(token.error == TOK_CLOSING_UNOPENED_SUBSHELL);
+        do_test(token.error_offset == 4);
+    }
+
     {
         tokenizer_t t(L"abc defg(hij (klm)", 0);
         do_test(t.next(&token));
@@ -4420,10 +4429,11 @@ static void test_illegal_command_exit_code() {
 
     const command_result_tuple_t tests[] = {
         {L"echo -n", STATUS_CMD_OK}, {L"pwd", STATUS_CMD_OK},
-        {L")", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD},
+        // a `)` without a matching `(` is now a tokenizer error, and cannot be executed even as an illegal command
+        // {L")", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}, {L") ", STATUS_ILLEGAL_CMD}
         {L"*", STATUS_ILLEGAL_CMD}, {L"**", STATUS_ILLEGAL_CMD},
         {L"?", STATUS_ILLEGAL_CMD}, {L"abc?def", STATUS_ILLEGAL_CMD},
-        {L") ", STATUS_ILLEGAL_CMD}};
+    };
 
     int res = 0;
     const io_chain_t empty_ios;
@@ -122,9 +122,9 @@ bool is_potential_path(const wcstring &potential_path_fragment, const wcstring_l
         switch (c) {
             case VARIABLE_EXPAND:
             case VARIABLE_EXPAND_SINGLE:
-            case BRACKET_BEGIN:
-            case BRACKET_END:
-            case BRACKET_SEP:
+            case BRACE_BEGIN:
+            case BRACE_END:
+            case BRACE_SEP:
             case ANY_CHAR:
             case ANY_STRING:
             case ANY_STRING_RECURSIVE: {
@@ -169,6 +169,7 @@ enum parse_error_code_t {
     parse_error_tokenizer_unterminated_subshell,
     parse_error_tokenizer_unterminated_slice,
     parse_error_tokenizer_unterminated_escape,
+    parse_error_tokenizer_nested_slice,
     parse_error_tokenizer_other,
 
     parse_error_unbalancing_end, // end outside of block
@@ -668,35 +668,10 @@ void parse_ll_t::parse_error_failed_production(struct parse_stack_element_t &sta
 }
 
 void parse_ll_t::report_tokenizer_error(const tokenizer_t &tokenizer, const tok_t &tok) {
-    parse_error_code_t parse_error_code;
-    switch (tok.error) {
-        case TOK_UNTERMINATED_QUOTE: {
-            parse_error_code = parse_error_tokenizer_unterminated_quote;
-            break;
-        }
-        case TOK_UNTERMINATED_SUBSHELL: {
-            parse_error_code = parse_error_tokenizer_unterminated_subshell;
-            break;
-        }
-        case TOK_UNTERMINATED_SLICE: {
-            parse_error_code = parse_error_tokenizer_unterminated_slice;
-            break;
-        }
-        case TOK_UNTERMINATED_ESCAPE: {
-            parse_error_code = parse_error_tokenizer_unterminated_escape;
-            break;
-        }
-        case TOK_INVALID_REDIRECT:
-        case TOK_INVALID_PIPE:
-        default: {
-            parse_error_code = parse_error_tokenizer_other;
-            break;
-        }
-    }
-
+    parse_error_code_t parse_error_code = tok.error->parser_error;
     this->parse_error_at_location(tok.offset, tok.length, tok.offset + tok.error_offset,
                                   parse_error_code, L"%ls",
-                                  error_message_for_code(tok.error).c_str());
+                                  tok.error->Message);
 }
 
 void parse_ll_t::parse_error_unexpected_token(const wchar_t *expected, parse_token_t token) {
@@ -834,14 +834,14 @@ void parse_util_expand_variable_error(const wcstring &token, size_t global_token
     wchar_t char_after_dollar = dollar_pos + 1 >= token.size() ? 0 : token.at(dollar_pos + 1);
 
     switch (char_after_dollar) {
-        case BRACKET_BEGIN:
+        case BRACE_BEGIN:
         case L'{': {
-            // The BRACKET_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible
+            // The BRACE_BEGIN is for unquoted, the { is for quoted. Anyways we have (possible
             // quoted) ${. See if we have a }, and the stuff in between is variable material. If so,
             // report a bracket error. Otherwise just complain about the ${.
             bool looks_like_variable = false;
             size_t closing_bracket =
-                token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACKET_END), dollar_pos + 2);
+                token.find(char_after_dollar == L'{' ? L'}' : wchar_t(BRACE_END), dollar_pos + 2);
             wcstring var_name;
             if (closing_bracket != wcstring::npos) {
                 size_t var_start = dollar_pos + 2, var_end = closing_bracket;
@@ -16,46 +16,22 @@
 #include "tokenizer.h"
 #include "wutil.h" // IWYU pragma: keep
 
-/// Error string for unexpected end of string.
-#define QUOTE_ERROR _(L"Unexpected end of string, quotes are not balanced")
-
-/// Error string for mismatched parenthesis.
-#define PARAN_ERROR _(L"Unexpected end of string, parenthesis do not match")
-
-/// Error string for mismatched square brackets.
-#define SQUARE_BRACKET_ERROR _(L"Unexpected end of string, square brackets do not match")
-
-/// Error string for unterminated escape (backslash without continuation).
-#define UNTERMINATED_ESCAPE_ERROR _(L"Unexpected end of string, incomplete escape sequence")
-
-/// Error string for invalid redirections.
-#define REDIRECT_ERROR _(L"Invalid input/output redirection")
-
-/// Error string for when trying to pipe from fd 0.
-#define PIPE_ERROR _(L"Cannot use stdin (fd 0) as pipe output")
-
-wcstring error_message_for_code(tokenizer_error err) {
-    switch (err) {
-        case TOK_UNTERMINATED_QUOTE:
-            return QUOTE_ERROR;
-        case TOK_UNTERMINATED_SUBSHELL:
-            return PARAN_ERROR;
-        case TOK_UNTERMINATED_SLICE:
-            return SQUARE_BRACKET_ERROR;
-        case TOK_UNTERMINATED_ESCAPE:
-            return UNTERMINATED_ESCAPE_ERROR;
-        case TOK_INVALID_REDIRECT:
-            return REDIRECT_ERROR;
-        case TOK_INVALID_PIPE:
-            return PIPE_ERROR;
-        default:
-            assert(0 && "Unknown error type");
-            return {};
-    }
-}
+tokenizer_error *TOK_ERROR_NONE = new tokenizer_error(L"");
+tokenizer_error *TOK_UNTERMINATED_QUOTE = new tokenizer_error((L"Unexpected end of string, quotes are not balanced"), parse_error_tokenizer_unterminated_quote);
+tokenizer_error *TOK_UNTERMINATED_SUBSHELL = new tokenizer_error((L"Unexpected end of string, expecting ')'"), parse_error_tokenizer_unterminated_subshell);
+tokenizer_error *TOK_UNTERMINATED_SLICE = new tokenizer_error((L"Unexpected end of string, square brackets do not match"), parse_error_tokenizer_unterminated_slice);
+tokenizer_error *TOK_UNTERMINATED_ESCAPE = new tokenizer_error((L"Unexpected end of string, incomplete escape sequence"), parse_error_tokenizer_unterminated_escape);
+tokenizer_error *TOK_INVALID_REDIRECT = new tokenizer_error((L"Invalid input/output redirection"));
+tokenizer_error *TOK_INVALID_PIPE = new tokenizer_error((L"Cannot use stdin (fd 0) as pipe output"));
+tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL = new tokenizer_error((L"Unexpected ')' for unopened parenthesis"));
+tokenizer_error *TOK_ILLEGAL_SLICE = new tokenizer_error((L"Unexpected '[' at this location"));
+tokenizer_error *TOK_CLOSING_UNOPENED_BRACE = new tokenizer_error((L"Unexpected '}' for unopened brace expansion"));
+tokenizer_error *TOK_UNTERMINATED_BRACE = new tokenizer_error((L"Unexpected end of string, incomplete parameter expansion"));
+tokenizer_error *TOK_EXPECTED_PCLOSE_FOUND_BCLOSE = new tokenizer_error((L"Unexpected '}' found, expecting ')'"));
+tokenizer_error *TOK_EXPECTED_BCLOSE_FOUND_PCLOSE = new tokenizer_error((L"Unexpected ')' found, expecting '}'"));
 
 /// Return an error token and mark that we no longer have a next token.
-tok_t tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *token_start,
+tok_t tokenizer_t::call_error(tokenizer_error *error_type, const wchar_t *token_start,
                               const wchar_t *error_loc) {
     assert(error_type != TOK_ERROR_NONE && "TOK_ERROR_NONE passed to call_error");
     assert(error_loc >= token_start && "Invalid error location");
@@ -119,194 +95,166 @@ static bool tok_is_string_character(wchar_t c, bool is_first) {
 /// Quick test to catch the most common 'non-magical' characters, makes read_string slightly faster
 /// by adding a fast path for the most common characters. This is obviously not a suitable
 /// replacement for iswalpha.
-static int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }
+static inline int myal(wchar_t c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); }
 
+ENUM_FLAGS(tok_mode) {
+    regular_text = 0,        // regular text
+    subshell = 1 << 0,       // inside of subshell parentheses
+    array_brackets = 1 << 1, // inside of array brackets
+    curly_braces = 1 << 2,
+    char_escape = 1 << 3,
+};
+
 /// Read the next token as a string.
 tok_t tokenizer_t::read_string() {
-    bool do_loop = true;
-    size_t paran_count = 0;
-    // Up to 96 open parens, before we give up on good error reporting.
-    const size_t paran_offsets_max = 96;
-    size_t paran_offsets[paran_offsets_max];
-    // Where the open bracket is.
-    size_t offset_of_bracket = 0;
+    tok_mode mode { tok_mode::regular_text };
+    std::vector<int> paran_offsets;
+    std::vector<int> brace_offsets;
+    std::vector<char> expecting;
+    int slice_offset = 0;
     const wchar_t *const buff_start = this->buff;
     bool is_first = true;
 
-    enum tok_mode_t {
-        mode_regular_text = 0,    // regular text
-        mode_subshell = 1,        // inside of subshell
-        mode_array_brackets = 2,  // inside of array brackets
-        mode_array_brackets_and_subshell =
-            3  // inside of array brackets and subshell, like in '$foo[(ech'
-    } mode = mode_regular_text;
+    while (true) {
+        wchar_t c = *this->buff;
+#if false
+        wcstring msg = L"Handling 0x%x (%lc)";
+        tok_mode mode_begin = mode;
+#endif
 
-    while (1) {
-        if (!myal(*this->buff)) {
-            if (*this->buff == L'\\') {
-                const wchar_t *error_location = this->buff;
-                this->buff++;
-                if (*this->buff == L'\0') {
-                    if ((!this->accept_unfinished)) {
-                        return this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
-                                                error_location);
-                    }
-                    // Since we are about to increment tok->buff, decrement it first so the
-                    // increment doesn't go past the end of the buffer. See issue #389.
-                    this->buff--;
-                    do_loop = 0;
-                }
-
-                this->buff++;
-                continue;
-            }
-
-            switch (mode) {
-                case mode_regular_text: {
-                    switch (*this->buff) {
-                        case L'(': {
-                            paran_count = 1;
-                            paran_offsets[0] = this->buff - this->start;
-                            mode = mode_subshell;
-                            break;
-                        }
-                        case L'[': {
-                            if (this->buff != buff_start) {
-                                mode = mode_array_brackets;
-                                offset_of_bracket = this->buff - this->start;
-                            }
-                            break;
-                        }
-                        case L'\'':
-                        case L'"': {
-                            const wchar_t *end = quote_end(this->buff);
-                            if (end) {
-                                this->buff = end;
-                            } else {
-                                const wchar_t *error_loc = this->buff;
-                                this->buff += wcslen(this->buff);
-
-                                if (!this->accept_unfinished) {
-                                    return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
-                                                            error_loc);
-                                }
-                                do_loop = 0;
-                            }
-                            break;
-                        }
-                        default: {
-                            if (!tok_is_string_character(*(this->buff), is_first)) {
-                                do_loop = 0;
-                            }
-                            break;
-                        }
-                    }
-                    break;
-                }
-
-                case mode_array_brackets_and_subshell:
-                case mode_subshell: {
-                    switch (*this->buff) {
-                        case L'\'':
-                        case L'\"': {
-                            const wchar_t *end = quote_end(this->buff);
-                            if (end) {
-                                this->buff = end;
-                            } else {
-                                const wchar_t *error_loc = this->buff;
-                                this->buff += wcslen(this->buff);
-                                if ((!this->accept_unfinished)) {
-                                    return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start,
-                                                            error_loc);
-                                }
-                                do_loop = 0;
-                            }
-                            break;
-                        }
-                        case L'(': {
-                            if (paran_count < paran_offsets_max) {
-                                paran_offsets[paran_count] = this->buff - this->start;
-                            }
-                            paran_count++;
-                            break;
-                        }
-                        case L')': {
-                            assert(paran_count > 0);
-                            paran_count--;
-                            if (paran_count == 0) {
-                                mode =
-                                    (mode == mode_array_brackets_and_subshell ? mode_array_brackets
-                                                                              : mode_regular_text);
-                            }
-                            break;
-                        }
-                        case L'\0': {
-                            do_loop = 0;
-                            break;
-                        }
-                        default: {
-                            break; // ignore other chars
-                        }
-                    }
-                    break;
-                }
-
-                case mode_array_brackets: {
-                    switch (*this->buff) {
-                        case L'(': {
-                            paran_count = 1;
-                            paran_offsets[0] = this->buff - this->start;
-                            mode = mode_array_brackets_and_subshell;
-                            break;
-                        }
-                        case L']': {
-                            mode = mode_regular_text;
-                            break;
-                        }
-                        case L'\0': {
-                            do_loop = 0;
-                            break;
-                        }
-                        default: {
-                            break; // ignore other chars
-                        }
-                    }
-                    break;
-                }
-            }
-        }
+        if (c == L'\0') {
+            break;
+        }
 
-        if (!do_loop) break;
+        // Make sure this character isn't being escaped before anything else
+        if ((mode & tok_mode::char_escape) == tok_mode::char_escape) {
+            mode &= ~(tok_mode::char_escape);
+            // and do nothing more
+        }
+        else if (myal(c)) {
+            // Early exit optimization in case the character is just a letter,
+            // which has no special meaning to the tokenizer, i.e. the same mode continues.
+        }
 
+        // Now proceed with the evaluation of the token, first checking to see if the token
+        // has been explicitly ignored (escaped).
+        else if (c == L'\\') {
+            mode |= tok_mode::char_escape;
+        }
+        else if (c == L'(') {
+            paran_offsets.push_back(this->buff - this->start);
+            expecting.push_back(L')');
+            mode |= tok_mode::subshell;
+        }
+        else if (c == L'{') {
+            brace_offsets.push_back(this->buff - this->start);
+            expecting.push_back(L'}');
+            mode |= tok_mode::curly_braces;
+        }
+        else if (c == L')') {
+            if (expecting.size() > 0 && expecting.back() == L'}') {
+                return this->call_error(TOK_EXPECTED_BCLOSE_FOUND_PCLOSE, this->start, this->buff);
+            }
+            switch (paran_offsets.size()) {
+                case 0:
+                    return this->call_error(TOK_CLOSING_UNOPENED_SUBSHELL, this->start, this->buff);
+                case 1:
+                    mode &= ~(tok_mode::subshell);
+                default:
+                    paran_offsets.pop_back();
+            }
+            expecting.pop_back();
+        }
+        else if (c == L'}') {
+            if (expecting.size() > 0 && expecting.back() == L')') {
+                return this->call_error(TOK_EXPECTED_PCLOSE_FOUND_BCLOSE, this->start, this->buff);
+            }
+            switch (brace_offsets.size()) {
+                case 0:
+                    return this->call_error(TOK_CLOSING_UNOPENED_BRACE, this->start, this->buff);
+                case 1:
+                    mode &= ~(tok_mode::curly_braces);
+                default:
+                    brace_offsets.pop_back();
+            }
+            expecting.pop_back();
+        }
+        else if (c == L'[') {
+            if (this->buff != buff_start) {
+                if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) {
+                    // Nested brackets should not overwrite the existing slice_offset
+                    //mqudsi: TOK_ILLEGAL_SLICE is the right error here, but the shell
+                    //prints an error message with the caret pointing at token_start,
+                    //not err_loc, making the TOK_ILLEGAL_SLICE message misleading.
+                    // return call_error(TOK_ILLEGAL_SLICE, buff_start, this->buff);
+                    return this->call_error(TOK_UNTERMINATED_SLICE, this->start, this->buff);
+                }
+                slice_offset = this->buff - this->start;
+                mode |= tok_mode::array_brackets;
+            }
+            else {
+                // This is actually allowed so the test operator `[` can be used as the head of a command
+            }
+        }
+        // Only exit bracket mode if we are in bracket mode.
+        // Reason: `]` can be a parameter, e.g. last parameter to `[` test alias.
+        // e.g. echo $argv[([ $x -eq $y ])] # must not end bracket mode on first bracket
+        else if (c == L']' && ((mode & tok_mode::array_brackets) == tok_mode::array_brackets)) {
+            mode &= ~(tok_mode::array_brackets);
+        }
+        else if (c == L'\'' || c == L'"') {
+            const wchar_t *end = quote_end(this->buff);
+            if (end) {
+                this->buff = end;
+            } else {
+                const wchar_t *error_loc = this->buff;
+                this->buff += wcslen(this->buff);
+                if ((!this->accept_unfinished)) {
+                    return this->call_error(TOK_UNTERMINATED_QUOTE, buff_start, error_loc);
+                }
+                break;
+            }
+        }
+        else if (mode == tok_mode::regular_text && !tok_is_string_character(c, is_first)) {
+            break;
+        }
+
+#if false
+        if (mode != mode_begin) {
+            msg.append(L": mode 0x%x -> 0x%x\n");
+        } else {
+            msg.push_back(L'\n');
+        }
+        debug(0, msg.c_str(), c, c, int(mode_begin), int(mode));
+#endif
+
         this->buff++;
         is_first = false;
     }
 
-    if ((!this->accept_unfinished) && (mode != mode_regular_text)) {
+    if ((!this->accept_unfinished) && (mode != tok_mode::regular_text)) {
         tok_t error;
-        switch (mode) {
-            case mode_subshell: {
-                // Determine the innermost opening paran offset by interrogating paran_offsets.
-                assert(paran_count > 0);
-                size_t offset_of_open_paran = 0;
-                if (paran_count <= paran_offsets_max) {
-                    offset_of_open_paran = paran_offsets[paran_count - 1];
-                }
-                error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
-                                         this->start + offset_of_open_paran);
-                break;
-            }
-            case mode_array_brackets:
-            case mode_array_brackets_and_subshell: {
-                error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
-                                         this->start + offset_of_bracket);
-                break;
-            }
-            default: {
-                DIE("unexpected mode in read_string");
-                break;
-            }
-        }
+
+        if ((mode & tok_mode::char_escape) == tok_mode::char_escape) {
+            error = this->call_error(TOK_UNTERMINATED_ESCAPE, buff_start,
+                                     this->buff - 1);
+        }
+        else if ((mode & tok_mode::array_brackets) == tok_mode::array_brackets) {
+            error = this->call_error(TOK_UNTERMINATED_SLICE, buff_start,
+                                     this->start + slice_offset);
+        }
+        else if ((mode & tok_mode::subshell) == tok_mode::subshell) {
+            assert(paran_offsets.size() > 0);
+            size_t offset_of_open_paran = paran_offsets.back();
+
+            error = this->call_error(TOK_UNTERMINATED_SUBSHELL, buff_start,
+                                     this->start + offset_of_open_paran);
+        }
+        else if ((mode & tok_mode::curly_braces) == tok_mode::curly_braces) {
+            assert(brace_offsets.size() > 0);
+            size_t offset_of_open_brace = brace_offsets.back();
+
+            error = this->call_error(TOK_UNTERMINATED_BRACE, buff_start,
+                                     this->start + offset_of_open_brace);
+        }
        return error;
     }
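The rewritten read_string above pairs every ( or { with the closer it expects on a stack, which is how a } that arrives while a ) is still pending, or a closer with no opener at all, gets its own precise error. A condensed sketch of just that bookkeeping, detached from the tokenizer: it ignores quoting, escaping and slices (which the real loop handles first) and reports plain strings instead of tokenizer_error objects. The unopened/unterminated messages are copied from the new error definitions above; the mismatch message is paraphrased:

// Minimal sketch of the delimiter bookkeeping in read_string; not the fish code.
#include <string>
#include <vector>

static std::wstring check_delimiters(const std::wstring &input) {
    std::vector<wchar_t> expecting;  // closers we still owe, innermost last
    for (wchar_t c : input) {
        if (c == L'(') {
            expecting.push_back(L')');
        } else if (c == L'{') {
            expecting.push_back(L'}');
        } else if (c == L')' || c == L'}') {
            if (expecting.empty()) {
                return c == L')' ? L"Unexpected ')' for unopened parenthesis"
                                 : L"Unexpected '}' for unopened brace expansion";
            }
            if (expecting.back() != c) {
                // e.g. "(a}" : a '}' arrived while a ')' was still expected
                return L"closing delimiter does not match the innermost opener";
            }
            expecting.pop_back();
        }
    }
    if (!expecting.empty()) {
        return expecting.back() == L')'
                   ? L"Unexpected end of string, expecting ')'"
                   : L"Unexpected end of string, incomplete parameter expansion";
    }
    return {};  // balanced
}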
@@ -7,6 +7,7 @@
 
 #include "common.h"
 #include "maybe.h"
+#include "parse_constants.h"
 
 /// Token types.
 enum token_type {
@@ -22,17 +23,26 @@ enum token_type {
     TOK_COMMENT /// comment token
 };
 
-/// Tokenizer error types.
-enum tokenizer_error {
-    TOK_ERROR_NONE,
-    TOK_UNTERMINATED_QUOTE,
-    TOK_UNTERMINATED_SUBSHELL,
-    TOK_UNTERMINATED_SLICE,
-    TOK_UNTERMINATED_ESCAPE,
-    TOK_INVALID_REDIRECT,
-    TOK_INVALID_PIPE
+struct tokenizer_error {
+    const wchar_t *Message;
+    enum parse_error_code_t parser_error; //the parser error associated with this tokenizer error
+    tokenizer_error(const wchar_t *msg, enum parse_error_code_t perr = parse_error_tokenizer_other)
+        : Message(msg), parser_error(perr) {}
+    tokenizer_error(const tokenizer_error&) = delete;
 };
 
+extern tokenizer_error *TOK_ERROR_NONE;
+extern tokenizer_error *TOK_UNTERMINATED_QUOTE;
+extern tokenizer_error *TOK_UNTERMINATED_SUBSHELL;
+extern tokenizer_error *TOK_UNTERMINATED_SLICE;
+extern tokenizer_error *TOK_UNTERMINATED_ESCAPE;
+extern tokenizer_error *TOK_UNTERMINATED_BRACE;
+extern tokenizer_error *TOK_INVALID_REDIRECT;
+extern tokenizer_error *TOK_INVALID_PIPE;
+extern tokenizer_error *TOK_CLOSING_UNOPENED_SUBSHELL;
+extern tokenizer_error *TOK_CLOSING_UNOPENED_BRACE;
+extern tokenizer_error *TOK_ILLEGAL_SLICE;
+
 enum class redirection_type_t {
     overwrite, // normal redirection: > file.txt
     append, // appending redirection: >> file.txt
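With tokenizer errors now represented as shared tokenizer_error objects instead of enum values, a caller compares the pointer against the extern singletons and reads the message and mapped parser error straight from the object, as the fish_tests.cpp and parse_tree.cpp hunks in this diff do. A small illustrative sketch of that calling pattern; handle_error and its output format are not from the commit:

// Illustrative only: how a tok_t error is consumed after this change.
#include <cstdio>
#include "tokenizer.h"

static void handle_error(const tok_t &token) {
    if (token.type != TOK_ERROR) return;
    // The error objects are singletons, so identity comparison is enough.
    if (token.error == TOK_CLOSING_UNOPENED_SUBSHELL) {
        // e.g. the "abc )defg(hij" input from the new tokenizer test in this commit
    }
    // The message and the parse_error_code_t it maps to travel with the object.
    fwprintf(stderr, L"tokenizer error: %ls (parser error code %d)\n",
             token.error->Message, (int)token.error->parser_error);
}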
@@ -67,7 +77,7 @@ struct tok_t {
     maybe_t<int> redirected_fd{};
 
     // If an error, this is the error code.
-    enum tokenizer_error error { TOK_ERROR_NONE };
+    tokenizer_error *error { TOK_ERROR_NONE };
 
     // If an error, this is the offset of the error within the token. A value of 0 means it occurred
     // at 'offset'.
@@ -97,7 +107,7 @@ class tokenizer_t {
     /// Whether to continue the previous line after the comment.
     bool continue_line_after_comment{false};
 
-    tok_t call_error(enum tokenizer_error error_type, const wchar_t *token_start,
+    tok_t call_error(tokenizer_error *error_type, const wchar_t *token_start,
                      const wchar_t *error_loc);
     tok_t read_string();
     maybe_t<tok_t> tok_next();
@@ -45,3 +45,14 @@ wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype) {
     output.push_back(ellipsis_char);
     return output;
 }
+
+wcstring trim(const wcstring &input, const wchar_t *any_of) {
+    auto begin_offset = input.find_first_not_of(any_of);
+    if (begin_offset == wcstring::npos) {
+        return wcstring{};
+    }
+    auto end = input.cbegin() + input.find_last_not_of(any_of);
+
+    wcstring result(input.begin() + begin_offset, end + 1);
+    return result;
+}
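The trim() helper added above strips any of the given characters from both ends of its input and returns an empty string when nothing else remains; its declaration follows in the header hunk below. A usage sketch with hypothetical inputs:

// Usage sketch for the trim() helper above; inputs are hypothetical.
#include "wcstringutil.h"

void trim_examples() {
    wcstring a = trim(L"  hello world  ", L" ");  // -> L"hello world"
    wcstring b = trim(L"xxyzyxx", L"x");          // -> L"yzy"
    wcstring c = trim(L"   ", L" ");              // -> L"" (nothing left after trimming)
    (void)a;
    (void)b;
    (void)c;
}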
@@ -59,5 +59,6 @@ enum class ellipsis_type {
 };
 
 wcstring truncate(const wcstring &input, int max_len, ellipsis_type etype = ellipsis_type::Prettiest);
+wcstring trim(const wcstring &input, const wchar_t *any_of);
 
 #endif
tests/parameter_expansion.err (new file, empty)
tests/parameter_expansion.in (new file, 34 lines)
@@ -0,0 +1,34 @@
+# basic expansion test
+echo {}
+echo {apple}
+echo {apple,orange}
+
+# expansion tests with spaces
+echo {apple, orange}
+echo { apple, orange, banana }
+
+# expansion with spaces and cartesian products
+echo \'{ hello , world }\'
+
+# expansion with escapes
+for phrase in {good\,, beautiful ,morning}; echo -n "$phrase "; end | string trim;
+for phrase in {goodbye\,,\ cruel\ ,world\n}; echo -n $phrase; end;
+
+# whitespace within entries converted to spaces in a single entry
+for foo in { hello
+world }
+echo \'$foo\'
+end
+
+# dual expansion cartesian product
+echo { alpha, beta }\ {lambda, gamma }, | sed -r 's/(.*),/\1/'
+
+# expansion with subshells
+for name in { (echo Meg), (echo Jo) }
+echo $name
+end
+
+# subshells with expansion
+for name in (for name in {Beth, Amy}; printf "$name\n"; end); printf "$name\n"; end
+
+# vim: set ft=fish:
tests/parameter_expansion.out (new file, 14 lines)
@@ -0,0 +1,14 @@
+{}
+apple
+apple orange
+apple orange
+apple orange banana
+'hello' 'world'
+good, beautiful morning
+goodbye, cruel world
+'hello world'
+alpha lambda, beta lambda, alpha gamma, beta gamma
+Meg
+Jo
+Beth
+Amy