Mirror of https://github.com/fish-shell/fish-shell (synced 2025-01-13 21:44:16 +00:00)
Some early cleanup of tokenizer
Prior to this, the tokenizer ran "one ahead": tokenizer_t::next() would in fact return the last-parsed token, not the next one. Switch to parsing on demand instead of running one ahead; this is simpler and prepares for tokenizer changes.
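To make the change concrete, here is a minimal, self-contained sketch of the on-demand design this commit moves to. It is illustrative only: mini_tok is a hypothetical stand-in, and fish's real scanning lives in read_string()/tok_next().

    #include <cassert>
    #include <string>

    // On-demand tokenizing: next() does the parsing itself and reports
    // exhaustion via its return value. The old design instead pre-parsed a
    // token in the constructor, and next() handed back the previous round's
    // result before lexing one more ("running one ahead").
    struct mini_tok {
        const char *cursor;  // plays the role of tokenizer_t::buff

        explicit mini_tok(const char *s) : cursor(s) {}  // no eager parse

        bool next(std::string *result) {
            while (*cursor == ' ') ++cursor;    // skip separators
            if (*cursor == '\0') return false;  // nothing left to parse
            const char *tok_start = cursor;
            while (*cursor != '\0' && *cursor != ' ') ++cursor;
            result->assign(tok_start, cursor - tok_start);
            return true;
        }
    };

    int main() {
        mini_tok t("alpha beta");
        std::string tok;
        assert(t.next(&tok) && tok == "alpha");
        assert(t.next(&tok) && tok == "beta");
        assert(!t.next(&tok));  // exhausted; no stale one-ahead state to flush
    }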
This commit is contained in:
parent 04c399c5a9
commit b13ee818d2
4 changed files with 74 additions and 72 deletions
@@ -137,10 +137,10 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
     size_t pos = get_cursor_pos() - (begin - get_buffer());
 
     if (tokenize) {
-        wchar_t *buff = wcsndup(begin, end - begin);
         // fwprintf( stderr, L"Subshell: %ls, end char %lc\n", buff, *end );
         wcstring out;
-        tokenizer_t tok(buff, TOK_ACCEPT_UNFINISHED);
+        wcstring buff(begin, end - begin);
+        tokenizer_t tok(buff.c_str(), TOK_ACCEPT_UNFINISHED);
         tok_t token;
         while (tok.next(&token)) {
             if ((cut_at_cursor) && (token.offset + token.text.size() >= pos)) break;
@@ -154,7 +154,6 @@ static void write_part(const wchar_t *begin, const wchar_t *end, int cut_at_curs
         }
 
         streams.out.append(out);
-        free(buff);
     } else {
         if (cut_at_cursor) {
             streams.out.append(begin, pos);
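Review note on the hunk above: the manually managed wcsndup()/free() pair becomes a wcstring that owns its storage, so the early break inside the loop can no longer leak the buffer. The pattern in isolation (a sketch, not code from this commit):

    #include <string>

    // Before: wchar_t *buff = wcsndup(begin, end - begin); ... free(buff);
    // After: the string releases its storage on every exit path.
    std::wstring copy_range(const wchar_t *begin, const wchar_t *end) {
        return std::wstring(begin, end - begin);
    }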
@@ -509,6 +509,29 @@ static void test_tokenizer() {
     say(L"Testing tokenizer");
+    tok_t token;
+
+    {
+        bool got = false;
+        const wchar_t *str = L"alpha beta";
+        tokenizer_t t(str, 0);
+
+        got = t.next(&token);  // alpha
+        do_test(got);
+        do_test(token.type == TOK_STRING);
+        do_test(token.offset == 0);
+        do_test(token.length == 5);
+        do_test(token.text == L"alpha");
+
+        got = t.next(&token);  // beta
+        do_test(got);
+        do_test(token.type == TOK_STRING);
+        do_test(token.offset == 6);
+        do_test(token.length == 4);
+        do_test(token.text == L"beta");
+
+        got = t.next(&token);
+        do_test(!got);
+    }
+
     const wchar_t *str =
         L"string <redirection 2>&1 'nested \"quoted\" '(string containing subshells "
         L"){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect "
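The new test pins down the tok_t contract: type classifies the token, offset and length locate it in the input, and text carries its contents. For orientation, a hypothetical consumer written against that same interface (dump_tokens is illustrative and assumes the fish headers; it is not part of this commit):

    void dump_tokens(const wchar_t *cmd) {
        tokenizer_t t(cmd, TOK_ACCEPT_UNFINISHED);
        tok_t token;
        while (t.next(&token)) {
            // token.offset/token.length index into cmd; token.text is a copy.
            fwprintf(stdout, L"[%zu, %zu) %ls\n", token.offset,
                     token.offset + token.length, token.text.c_str());
        }
    }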
@@ -524,14 +547,17 @@ static void test_tokenizer() {
     tokenizer_t t(str, 0);
     size_t i = 0;
     while (t.next(&token)) {
-        if (i > sizeof types / sizeof *types) {
+        if (i >= sizeof types / sizeof *types) {
             err(L"Too many tokens returned from tokenizer");
+            fwprintf(stdout, L"Got excess token type %ld\n", (long)token.type);
             break;
         }
         if (types[i] != token.type) {
             err(L"Tokenization error:");
-            fwprintf(stdout, L"Token number %zu of string \n'%ls'\n, got token type %ld\n",
-                     i + 1, str, (long)token.type);
+            fwprintf(stdout,
+                     L"Token number %zu of string \n'%ls'\n, expected type %ld, got token type "
+                     L"%ld\n",
+                     i + 1, str, (long)types[i], (long)token.type);
         }
         i++;
     }
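Folded into the hunk above is a genuine bounds fix: the guard changes from i > count to i >= count. Since sizeof types / sizeof *types yields the element count, the last valid index is count - 1, and the old check allowed one read past the end before err() fired. A standalone illustration (not fish code):

    #include <cstddef>
    #include <cstdio>

    int main() {
        int types[] = {10, 20, 30};
        const std::size_t count = sizeof types / sizeof *types;  // 3
        for (std::size_t i = 0; i <= count; i++) {
            // 'i >= count' stops at i == 3; 'i > count' would read types[3].
            if (i >= count) break;
            std::printf("%d\n", types[i]);
        }
    }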
@@ -45,41 +45,27 @@ void tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *whe
                              const wchar_t *error_message) {
     this->last_type = TOK_ERROR;
     this->error = error_type;
-    this->global_error_offset = where ? where - this->orig_buff : 0;
+    this->global_error_offset = where ? where - this->start : 0;
     this->last_token = error_message;
     this->has_next = false;
 }
 
-tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags)
-    : buff(b),
-      orig_buff(b),
-      last_type(TOK_NONE),
-      last_pos(0),
-      has_next(false),
-      accept_unfinished(false),
-      show_comments(false),
-      show_blank_lines(false),
-      error(TOK_ERROR_NONE),
-      global_error_offset(-1),
-      squash_errors(false),
-      continue_line_after_comment(false) {
-    assert(b != NULL);
+tokenizer_t::tokenizer_t(const wchar_t *start, tok_flags_t flags) : buff(start), start(start) {
+    assert(start != nullptr && "Invalid start");
 
     this->accept_unfinished = static_cast<bool>(flags & TOK_ACCEPT_UNFINISHED);
     this->show_comments = static_cast<bool>(flags & TOK_SHOW_COMMENTS);
     this->squash_errors = static_cast<bool>(flags & TOK_SQUASH_ERRORS);
     this->show_blank_lines = static_cast<bool>(flags & TOK_SHOW_BLANK_LINES);
-
-    this->has_next = (*b != L'\0');
-    this->tok_next();
 }
 
 bool tokenizer_t::next(struct tok_t *result) {
     assert(result != NULL);
-    if (!this->has_next) {
+    if (!this->tok_next()) {
         return false;
     }
 
-    const size_t current_pos = this->buff - this->orig_buff;
+    const size_t current_pos = this->buff - this->start;
 
     // We want to copy our last_token into result->text. If we just do this naively via =, we are
     // liable to trigger std::string's CoW implementation: result->text's storage will be
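Two things happen in the constructor above. First, the 13-line initializer list collapses because the defaults move into the class definition as C++11 default member initializers (see the tokenizer.h hunk below), leaving only the two pointers that vary per instance. Second, orig_buff becomes start, which is also why read_string() below renames its local start to buff_start: the local would otherwise shadow the new member. The constructor pattern reduced to its core (illustrative, not fish's class):

    class scanner {
        const char *cursor;             // varies per instance
        const char *const begin;        // varies per instance
        bool accept_unfinished{false};  // defaulted in-class, once
        bool show_comments{false};

      public:
        explicit scanner(const char *s) : cursor(s), begin(s) {}
    };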
@@ -92,7 +78,7 @@ bool tokenizer_t::next(struct tok_t *result) {
     result->type = this->last_type;
     result->offset = this->last_pos;
     result->error = this->last_type == TOK_ERROR ? this->error : TOK_ERROR_NONE;
-    assert(this->buff >= this->orig_buff);
+    assert(this->buff >= this->start);
 
     // Compute error offset.
     result->error_offset = 0;
@@ -101,10 +87,8 @@ bool tokenizer_t::next(struct tok_t *result) {
         result->error_offset = this->global_error_offset - this->last_pos;
     }
 
-    assert(this->buff >= this->orig_buff);
+    assert(this->buff >= this->start);
     result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
 
-    this->tok_next();
     return true;
 }
@@ -149,7 +133,7 @@ void tokenizer_t::read_string() {
     size_t paran_offsets[paran_offsets_max];
     // Where the open bracket is.
     size_t offset_of_bracket = 0;
-    const wchar_t *const start = this->buff;
+    const wchar_t *const buff_start = this->buff;
     bool is_first = true;
 
     enum tok_mode_t {
@@ -186,14 +170,14 @@ void tokenizer_t::read_string() {
            switch (*this->buff) {
                case L'(': {
                    paran_count = 1;
-                    paran_offsets[0] = this->buff - this->orig_buff;
+                    paran_offsets[0] = this->buff - this->start;
                    mode = mode_subshell;
                    break;
                }
                case L'[': {
-                    if (this->buff != start) {
+                    if (this->buff != buff_start) {
                        mode = mode_array_brackets;
-                        offset_of_bracket = this->buff - this->orig_buff;
+                        offset_of_bracket = this->buff - this->start;
                    }
                    break;
                }
@@ -247,7 +231,7 @@ void tokenizer_t::read_string() {
                }
                case L'(': {
                    if (paran_count < paran_offsets_max) {
-                        paran_offsets[paran_count] = this->buff - this->orig_buff;
+                        paran_offsets[paran_count] = this->buff - this->start;
                    }
                    paran_count++;
                    break;
@@ -277,7 +261,7 @@ void tokenizer_t::read_string() {
            switch (*this->buff) {
                case L'(': {
                    paran_count = 1;
-                    paran_offsets[0] = this->buff - this->orig_buff;
+                    paran_offsets[0] = this->buff - this->start;
                    mode = mode_array_brackets_and_subshell;
                    break;
                }
@@ -315,13 +299,13 @@ void tokenizer_t::read_string() {
                }
 
                TOK_CALL_ERROR(this, TOK_UNTERMINATED_SUBSHELL, PARAN_ERROR,
-                               this->orig_buff + offset_of_open_paran);
+                               this->start + offset_of_open_paran);
                break;
            }
            case mode_array_brackets:
            case mode_array_brackets_and_subshell: {
                TOK_CALL_ERROR(this, TOK_UNTERMINATED_SLICE, SQUARE_BRACKET_ERROR,
-                               this->orig_buff + offset_of_bracket);
+                               this->start + offset_of_bracket);
                break;
            }
            default: {
@@ -332,9 +316,9 @@ void tokenizer_t::read_string() {
        return;
    }
 
-    len = this->buff - start;
+    len = this->buff - buff_start;
 
-    this->last_token.assign(start, len);
+    this->last_token.assign(buff_start, len);
    this->last_type = TOK_STRING;
 }
@@ -483,16 +467,9 @@ int oflags_for_redirection_type(enum token_type type) {
 /// to be whitespace.
 static bool my_iswspace(wchar_t c) { return c != L'\n' && iswspace(c); }
 
-void tokenizer_t::tok_next() {
-    if (this->last_type == TOK_ERROR) {
-        this->has_next = false;
-        return;
-    }
-
+bool tokenizer_t::tok_next() {
     if (!this->has_next) {
        // fwprintf(stdout, L"EOL\n" );
        this->last_type = TOK_END;
-        return;
+        return false;
    }
 
    while (1) {
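With tok_next() returning whether it actually produced a token, the old TOK_ERROR/has_next bookkeeping at the top of the function disappears: next() simply stops at the first false instead of inspecting sentinel members afterward. The shape of that contract, reduced to a toy (assumed shapes, not the commit's code):

    #include <cassert>

    struct counter {
        int remaining;
        explicit counter(int n) : remaining(n) {}
        // New shape: success is the return value, not a flag to re-check.
        bool tok_next() {
            if (remaining == 0) return false;
            --remaining;
            return true;
        }
    };

    int main() {
        counter c(2);
        assert(c.tok_next());
        assert(c.tok_next());
        assert(!c.tok_next());  // caller stops here; no stale state to consult
    }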
@@ -508,29 +485,28 @@ void tokenizer_t::tok_next() {
 
     while (*this->buff == L'#') {
         if (this->show_comments) {
-            this->last_pos = this->buff - this->orig_buff;
+            this->last_pos = this->buff - this->start;
             this->read_comment();
 
             if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
-            return;
+            return true;
         }
 
-        while (*(this->buff) != L'\n' && *(this->buff) != L'\0') this->buff++;
+        while (*this->buff != L'\n' && *this->buff != L'\0') this->buff++;
         if (this->buff[0] == L'\n' && this->continue_line_after_comment) this->buff++;
-        while (my_iswspace(*(this->buff))) this->buff++;
+        while (my_iswspace(*this->buff)) this->buff++;
     }
 
     this->continue_line_after_comment = false;
 
-    this->last_pos = this->buff - this->orig_buff;
+    this->last_pos = this->buff - this->start;
 
     switch (*this->buff) {
         case L'\0': {
             this->last_type = TOK_END;
             // fwprintf( stderr, L"End of string\n" );
             this->has_next = false;
             this->last_token.clear();
-            break;
+            return false;
         }
         case L'\r': // carriage-return
         case L'\n': // newline
@@ -604,11 +580,12 @@ void tokenizer_t::tok_next() {
             break;
         }
     }
+    return true;
 }
 
 wcstring tok_first(const wcstring &str) {
     wcstring result;
-    tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS);
+    tokenizer_t t(str.data(), TOK_SQUASH_ERRORS);
     tok_t token;
     if (t.next(&token) && token.type == TOK_STRING) {
         result = std::move(token.text);
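One incidental change above: tok_first() now passes str.data() instead of str.c_str(). Since C++11 the two are interchangeable on std::basic_string (both return the same null-terminated buffer), so this is stylistic, not behavioral. For the record:

    #include <string>

    // data() and c_str() return the same pointer in C++11 and later.
    const wchar_t *first_char(const std::wstring &s) {
        return s.data();  // identical to s.c_str()
    }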
@@ -71,41 +71,41 @@ struct tok_t {
 /// The tokenizer struct.
 class tokenizer_t {
     // No copying, etc.
-    tokenizer_t(const tokenizer_t &);
-    void operator=(const tokenizer_t &);
+    tokenizer_t(const tokenizer_t &) = delete;
+    void operator=(const tokenizer_t &) = delete;
 
     /// A pointer into the original string, showing where the next token begins.
     const wchar_t *buff;
-    /// A copy of the original string.
-    const wchar_t *orig_buff;
+    /// The start of the original string.
+    const wchar_t *const start;
     /// The last token.
     wcstring last_token;
     /// Type of last token.
-    enum token_type last_type;
+    enum token_type last_type { TOK_NONE };
     /// Offset of last token.
-    size_t last_pos;
+    size_t last_pos{0};
     /// Whether there are more tokens.
-    bool has_next;
+    bool has_next{true};
     /// Whether incomplete tokens are accepted.
-    bool accept_unfinished;
+    bool accept_unfinished{false};
     /// Whether comments should be returned.
-    bool show_comments;
+    bool show_comments{false};
     /// Whether all blank lines are returned.
-    bool show_blank_lines;
+    bool show_blank_lines{false};
     /// Last error.
-    tokenizer_error error;
+    tokenizer_error error{TOK_ERROR_NONE};
     /// Last error offset, in "global" coordinates (relative to orig_buff).
-    size_t global_error_offset;
+    size_t global_error_offset{size_t(-1)};
     /// Whether we are squashing errors.
-    bool squash_errors;
+    bool squash_errors{false};
     /// Whether to continue the previous line after the comment.
-    bool continue_line_after_comment;
+    bool continue_line_after_comment{false};
 
     void call_error(enum tokenizer_error error_type, const wchar_t *where,
                     const wchar_t *error_message);
     void read_string();
     void read_comment();
-    void tok_next();
+    bool tok_next();
 
   public:
     /// Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and
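Two idioms worth noting in the header hunk: the copy operations become explicitly deleted rather than declared private and left undefined, turning accidental copies into compile-time errors instead of link-time ones, and every member gains a brace default so no future constructor can forget to initialize one. In miniature (illustrative only):

    class noncopyable_counter {
        int value{0};  // in-class default: every constructor starts sane

      public:
        noncopyable_counter() = default;
        noncopyable_counter(const noncopyable_counter &) = delete;
        noncopyable_counter &operator=(const noncopyable_counter &) = delete;
    };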