Continue to refine tokenizer interface

Migrate some functions into tokenizer instance methods
This commit is contained in:
ridiculousfish 2015-07-26 00:58:32 -07:00
parent 4ebaa7b6bd
commit c9349f2ef6
7 changed files with 122 additions and 150 deletions

View file

@ -3620,7 +3620,7 @@ static int builtin_fg(parser_t &parser, wchar_t **argv)
j->command_wcstr()); j->command_wcstr());
} }
const wcstring ft = tok_first(j->command_wcstr()); const wcstring ft = tok_first(j->command());
if (! ft.empty()) if (! ft.empty())
env_set(L"_", ft.c_str(), ENV_EXPORT); env_set(L"_", ft.c_str(), ENV_EXPORT);
reader_write_title(j->command()); reader_write_title(j->command());

View file

@ -535,7 +535,7 @@ bool process_iterator_t::next_process(wcstring *out_str, pid_t *out_pid)
fgetws2(&full_command_line, cmdfile); fgetws2(&full_command_line, cmdfile);
/* The command line needs to be escaped */ /* The command line needs to be escaped */
cmd = tok_first(full_command_line.c_str()); cmd = tok_first(full_command_line);
} }
#ifdef SunOS #ifdef SunOS
else if ((cmdfile=wfopen(path + L"/psinfo", "r"))) else if ((cmdfile=wfopen(path + L"/psinfo", "r")))

View file

@ -1299,7 +1299,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags,
if (errors == NULL) if (errors == NULL)
tok_options |= TOK_SQUASH_ERRORS; tok_options |= TOK_SQUASH_ERRORS;
tokenizer_t tok = tokenizer_t(str.c_str(), tok_options); tokenizer_t tok(str.c_str(), tok_options);
/* We are an LL(2) parser. We pass two tokens at a time. New tokens come in at index 1. Seed our queue with an initial token at index 1. */ /* We are an LL(2) parser. We pass two tokens at a time. New tokens come in at index 1. Seed our queue with an initial token at index 1. */
parse_token_t queue[2] = {kInvalidToken, kInvalidToken}; parse_token_t queue[2] = {kInvalidToken, kInvalidToken};

View file

@ -1169,7 +1169,7 @@ void parse_util_expand_variable_error(const wcstring &token, size_t global_token
&cmdsub_end, &cmdsub_end,
true) > 0) true) > 0)
{ {
token_after_parens = tok_first(paren_text.c_str()); token_after_parens = tok_first(paren_text);
} }
/* Make sure we always show something */ /* Make sure we always show something */
@ -1233,7 +1233,7 @@ static parser_test_error_bits_t detect_dollar_cmdsub_errors(size_t arg_src_offse
result_bits |= PARSER_TEST_ERROR; result_bits |= PARSER_TEST_ERROR;
if (out_errors != NULL) if (out_errors != NULL)
{ {
wcstring subcommand_first_token = tok_first(cmdsubst_src.c_str()); wcstring subcommand_first_token = tok_first(cmdsubst_src);
if (subcommand_first_token.empty()) if (subcommand_first_token.empty())
{ {
// e.g. $(). Report something. // e.g. $(). Report something.

View file

@ -2538,7 +2538,7 @@ void reader_run_command(parser_t &parser, const wcstring &cmd)
struct timeval time_before, time_after; struct timeval time_before, time_after;
wcstring ft = tok_first(cmd.c_str()); wcstring ft = tok_first(cmd);
if (! ft.empty()) if (! ft.empty())
env_set(L"_", ft.c_str(), ENV_GLOBAL); env_set(L"_", ft.c_str(), ENV_GLOBAL);

View file

@ -24,7 +24,7 @@ segments.
#include "common.h" #include "common.h"
/* Wow what a hack */ /* Wow what a hack */
#define TOK_CALL_ERROR(t, e, x) do { tok_call_error((t), (e), (t)->squash_errors ? L"" : (x)); } while (0) #define TOK_CALL_ERROR(t, e, x) do { (t)->call_error((e), (t)->squash_errors ? L"" : (x)); } while (0)
/** /**
Error string for unexpected end of string Error string for unexpected end of string
@ -52,23 +52,19 @@ segments.
*/ */
#define PIPE_ERROR _( L"Cannot use stdin (fd 0) as pipe output" ) #define PIPE_ERROR _( L"Cannot use stdin (fd 0) as pipe output" )
static void tok_next(tokenizer_t *tok);
static enum token_type tok_last_type(tokenizer_t *tok);
static const wchar_t *tok_last(tokenizer_t *tok);
/** /**
Set the latest tokens string to be the specified error message Set the latest tokens string to be the specified error message
*/ */
static void tok_call_error(tokenizer_t *tok, enum tokenizer_error error_type, const wchar_t *error_message) void tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *error_message)
{ {
tok->last_type = TOK_ERROR; this->last_type = TOK_ERROR;
tok->error = error_type; this->error = error_type;
tok->last_token = error_message; this->last_token = error_message;
} }
tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(TOK_ERROR_NONE), squash_errors(false), continue_line_after_comment(false) tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(b), orig_buff(b), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(TOK_ERROR_NONE), squash_errors(false), continue_line_after_comment(false)
{ {
CHECK(b,); assert(b != NULL);
this->accept_unfinished = !!(flags & TOK_ACCEPT_UNFINISHED); this->accept_unfinished = !!(flags & TOK_ACCEPT_UNFINISHED);
this->show_comments = !!(flags & TOK_SHOW_COMMENTS); this->show_comments = !!(flags & TOK_SHOW_COMMENTS);
@ -76,8 +72,7 @@ tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig
this->show_blank_lines = !!(flags & TOK_SHOW_BLANK_LINES); this->show_blank_lines = !!(flags & TOK_SHOW_BLANK_LINES);
this->has_next = (*b != L'\0'); this->has_next = (*b != L'\0');
this->orig_buff = this->buff = b; this->tok_next();
tok_next(this);
} }
bool tokenizer_t::next(struct tok_t *result) bool tokenizer_t::next(struct tok_t *result)
@ -97,25 +92,10 @@ bool tokenizer_t::next(struct tok_t *result)
size_t current_pos = this->buff - this->orig_buff; size_t current_pos = this->buff - this->orig_buff;
result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0; result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0;
tok_next(this); this->tok_next();
return true; return true;
} }
static enum token_type tok_last_type(tokenizer_t *tok)
{
CHECK(tok, TOK_ERROR);
CHECK(tok->buff, TOK_ERROR);
return tok->last_type;
}
static const wchar_t *tok_last(tokenizer_t *tok)
{
CHECK(tok, 0);
return tok->last_token.c_str();
}
/** /**
Tests if this character can be a part of a string. The redirect ^ is allowed unless it's the first character. Tests if this character can be a part of a string. The redirect ^ is allowed unless it's the first character.
Hash (#) starts a comment if it's the first character in a token; otherwise it is considered a string character. Hash (#) starts a comment if it's the first character in a token; otherwise it is considered a string character.
@ -161,14 +141,14 @@ static int myal(wchar_t c)
/** /**
Read the next token as a string Read the next token as a string
*/ */
static void read_string(tokenizer_t *tok) void tokenizer_t::read_string()
{ {
const wchar_t *start; const wchar_t *start;
long len; long len;
int do_loop=1; int do_loop=1;
int paran_count=0; int paran_count=0;
start = tok->buff; start = this->buff;
bool is_first = true; bool is_first = true;
enum tok_mode_t enum tok_mode_t
@ -181,27 +161,27 @@ static void read_string(tokenizer_t *tok)
while (1) while (1)
{ {
if (!myal(*tok->buff)) if (!myal(*this->buff))
{ {
if (*tok->buff == L'\\') if (*this->buff == L'\\')
{ {
tok->buff++; this->buff++;
if (*tok->buff == L'\0') if (*this->buff == L'\0')
{ {
if ((!tok->accept_unfinished)) if ((!this->accept_unfinished))
{ {
TOK_CALL_ERROR(tok, TOK_UNTERMINATED_ESCAPE, QUOTE_ERROR); TOK_CALL_ERROR(this, TOK_UNTERMINATED_ESCAPE, QUOTE_ERROR);
return; return;
} }
else else
{ {
/* Since we are about to increment tok->buff, decrement it first so the increment doesn't go past the end of the buffer. https://github.com/fish-shell/fish-shell/issues/389 */ /* Since we are about to increment tok->buff, decrement it first so the increment doesn't go past the end of the buffer. https://github.com/fish-shell/fish-shell/issues/389 */
tok->buff--; this->buff--;
do_loop = 0; do_loop = 0;
} }
} }
tok->buff++; this->buff++;
continue; continue;
} }
@ -209,7 +189,7 @@ static void read_string(tokenizer_t *tok)
{ {
case mode_regular_text: case mode_regular_text:
{ {
switch (*tok->buff) switch (*this->buff)
{ {
case L'(': case L'(':
{ {
@ -220,7 +200,7 @@ static void read_string(tokenizer_t *tok)
case L'[': case L'[':
{ {
if (tok->buff != start) if (this->buff != start)
mode = mode_array_brackets; mode = mode_array_brackets;
break; break;
} }
@ -229,18 +209,18 @@ static void read_string(tokenizer_t *tok)
case L'"': case L'"':
{ {
const wchar_t *end = quote_end(tok->buff); const wchar_t *end = quote_end(this->buff);
if (end) if (end)
{ {
tok->buff=(wchar_t *)end; this->buff=end;
} }
else else
{ {
tok->buff += wcslen(tok->buff); this->buff += wcslen(this->buff);
if (! tok->accept_unfinished) if (! this->accept_unfinished)
{ {
TOK_CALL_ERROR(tok, TOK_UNTERMINATED_QUOTE, QUOTE_ERROR); TOK_CALL_ERROR(this, TOK_UNTERMINATED_QUOTE, QUOTE_ERROR);
return; return;
} }
do_loop = 0; do_loop = 0;
@ -251,7 +231,7 @@ static void read_string(tokenizer_t *tok)
default: default:
{ {
if (! tok_is_string_character(*(tok->buff), is_first)) if (! tok_is_string_character(*(this->buff), is_first))
{ {
do_loop=0; do_loop=0;
} }
@ -262,22 +242,22 @@ static void read_string(tokenizer_t *tok)
case mode_array_brackets_and_subshell: case mode_array_brackets_and_subshell:
case mode_subshell: case mode_subshell:
switch (*tok->buff) switch (*this->buff)
{ {
case L'\'': case L'\'':
case L'\"': case L'\"':
{ {
const wchar_t *end = quote_end(tok->buff); const wchar_t *end = quote_end(this->buff);
if (end) if (end)
{ {
tok->buff=(wchar_t *)end; this->buff = end;
} }
else else
{ {
tok->buff += wcslen(tok->buff); this->buff += wcslen(this->buff);
if ((!tok->accept_unfinished)) if ((!this->accept_unfinished))
{ {
TOK_CALL_ERROR(tok, TOK_UNTERMINATED_QUOTE, QUOTE_ERROR); TOK_CALL_ERROR(this, TOK_UNTERMINATED_QUOTE, QUOTE_ERROR);
return; return;
} }
do_loop = 0; do_loop = 0;
@ -303,7 +283,7 @@ static void read_string(tokenizer_t *tok)
break; break;
case mode_array_brackets: case mode_array_brackets:
switch (*tok->buff) switch (*this->buff)
{ {
case L'(': case L'(':
paran_count=1; paran_count=1;
@ -326,20 +306,20 @@ static void read_string(tokenizer_t *tok)
if (!do_loop) if (!do_loop)
break; break;
tok->buff++; this->buff++;
is_first = false; is_first = false;
} }
if ((!tok->accept_unfinished) && (mode != mode_regular_text)) if ((!this->accept_unfinished) && (mode != mode_regular_text))
{ {
switch (mode) switch (mode)
{ {
case mode_subshell: case mode_subshell:
TOK_CALL_ERROR(tok, TOK_UNTERMINATED_SUBSHELL, PARAN_ERROR); TOK_CALL_ERROR(this, TOK_UNTERMINATED_SUBSHELL, PARAN_ERROR);
break; break;
case mode_array_brackets: case mode_array_brackets:
case mode_array_brackets_and_subshell: case mode_array_brackets_and_subshell:
TOK_CALL_ERROR(tok, TOK_UNTERMINATED_SUBSHELL, SQUARE_BRACKET_ERROR); // TOK_UNTERMINATED_SUBSHELL is a lie but nobody actually looks at it TOK_CALL_ERROR(this, TOK_UNTERMINATED_SUBSHELL, SQUARE_BRACKET_ERROR); // TOK_UNTERMINATED_SUBSHELL is a lie but nobody actually looks at it
break; break;
default: default:
assert(0 && "Unexpected mode in read_string"); assert(0 && "Unexpected mode in read_string");
@ -349,27 +329,24 @@ static void read_string(tokenizer_t *tok)
} }
len = tok->buff - start; len = this->buff - start;
tok->last_token.assign(start, len); this->last_token.assign(start, len);
tok->last_type = TOK_STRING; this->last_type = TOK_STRING;
} }
/** /**
Read the next token as a comment. Read the next token as a comment.
*/ */
static void read_comment(tokenizer_t *tok) void tokenizer_t::read_comment()
{ {
const wchar_t *start; const wchar_t *start = this->buff;
while (*(this->buff)!= L'\n' && *(this->buff)!= L'\0')
this->buff++;
start = tok->buff; size_t len = this->buff - start;
while (*(tok->buff)!= L'\n' && *(tok->buff)!= L'\0') this->last_token.assign(start, len);
tok->buff++; this->last_type = TOK_COMMENT;
size_t len = tok->buff - start;
tok->last_token.assign(start, len);
tok->last_type = TOK_COMMENT;
} }
@ -531,35 +508,31 @@ static bool my_iswspace(wchar_t c)
return c != L'\n' && iswspace(c); return c != L'\n' && iswspace(c);
} }
static void tok_next(tokenizer_t *tok) void tokenizer_t::tok_next()
{ {
if (this->last_type == TOK_ERROR)
CHECK(tok,);
CHECK(tok->buff,);
if (tok_last_type(tok) == TOK_ERROR)
{ {
tok->has_next=false; this->has_next=false;
return; return;
} }
if (!tok->has_next) if (!this->has_next)
{ {
/* wprintf( L"EOL\n" );*/ /* wprintf( L"EOL\n" );*/
tok->last_type = TOK_END; this->last_type = TOK_END;
return; return;
} }
while (1) while (1)
{ {
if (tok->buff[0] == L'\\' && tok->buff[1] == L'\n') if (this->buff[0] == L'\\' && this->buff[1] == L'\n')
{ {
tok->buff += 2; this->buff += 2;
tok->continue_line_after_comment = true; this->continue_line_after_comment = true;
} }
else if (my_iswspace(tok->buff[0])) else if (my_iswspace(this->buff[0]))
{ {
tok->buff++; this->buff++;
} }
else else
{ {
@ -568,68 +541,68 @@ static void tok_next(tokenizer_t *tok)
} }
while (*tok->buff == L'#') while (*this->buff == L'#')
{ {
if (tok->show_comments) if (this->show_comments)
{ {
tok->last_pos = tok->buff - tok->orig_buff; this->last_pos = this->buff - this->orig_buff;
read_comment(tok); this->read_comment();
if (tok->buff[0] == L'\n' && tok->continue_line_after_comment) if (this->buff[0] == L'\n' && this->continue_line_after_comment)
tok->buff++; this->buff++;
return; return;
} }
else else
{ {
while (*(tok->buff)!= L'\n' && *(tok->buff)!= L'\0') while (*(this->buff)!= L'\n' && *(this->buff)!= L'\0')
tok->buff++; this->buff++;
if (tok->buff[0] == L'\n' && tok->continue_line_after_comment) if (this->buff[0] == L'\n' && this->continue_line_after_comment)
tok->buff++; this->buff++;
} }
while (my_iswspace(*(tok->buff))) { while (my_iswspace(*(this->buff))) {
tok->buff++; this->buff++;
} }
} }
tok->continue_line_after_comment = false; this->continue_line_after_comment = false;
tok->last_pos = tok->buff - tok->orig_buff; this->last_pos = this->buff - this->orig_buff;
switch (*tok->buff) switch (*this->buff)
{ {
case L'\0': case L'\0':
tok->last_type = TOK_END; this->last_type = TOK_END;
/*fwprintf( stderr, L"End of string\n" );*/ /*fwprintf( stderr, L"End of string\n" );*/
tok->has_next = false; this->has_next = false;
break; break;
case 13: // carriage return case 13: // carriage return
case L'\n': case L'\n':
case L';': case L';':
tok->last_type = TOK_END; this->last_type = TOK_END;
tok->buff++; this->buff++;
// Hack: when we get a newline, swallow as many as we can // Hack: when we get a newline, swallow as many as we can
// This compresses multiple subsequent newlines into a single one // This compresses multiple subsequent newlines into a single one
if (! tok->show_blank_lines) if (! this->show_blank_lines)
{ {
while (*tok->buff == L'\n' || *tok->buff == 13 /* CR */ || *tok->buff == ' ' || *tok->buff == '\t') while (*this->buff == L'\n' || *this->buff == 13 /* CR */ || *this->buff == ' ' || *this->buff == '\t')
{ {
tok->buff++; this->buff++;
} }
} }
tok->last_token.clear(); this->last_token.clear();
break; break;
case L'&': case L'&':
tok->last_type = TOK_BACKGROUND; this->last_type = TOK_BACKGROUND;
tok->buff++; this->buff++;
break; break;
case L'|': case L'|':
tok->last_token = L"1"; this->last_token = L"1";
tok->last_type = TOK_PIPE; this->last_type = TOK_PIPE;
tok->buff++; this->buff++;
break; break;
case L'>': case L'>':
@ -639,16 +612,16 @@ static void tok_next(tokenizer_t *tok)
/* There's some duplication with the code in the default case below. The key difference here is that we must never parse these as a string; a failed redirection is an error! */ /* There's some duplication with the code in the default case below. The key difference here is that we must never parse these as a string; a failed redirection is an error! */
enum token_type mode = TOK_NONE; enum token_type mode = TOK_NONE;
int fd = -1; int fd = -1;
size_t consumed = read_redirection_or_fd_pipe(tok->buff, &mode, &fd); size_t consumed = read_redirection_or_fd_pipe(this->buff, &mode, &fd);
if (consumed == 0 || fd < 0) if (consumed == 0 || fd < 0)
{ {
TOK_CALL_ERROR(tok, TOK_OTHER, REDIRECT_ERROR); TOK_CALL_ERROR(this, TOK_OTHER, REDIRECT_ERROR);
} }
else else
{ {
tok->buff += consumed; this->buff += consumed;
tok->last_type = mode; this->last_type = mode;
tok->last_token = to_string(fd); this->last_token = to_string(fd);
} }
} }
break; break;
@ -659,53 +632,42 @@ static void tok_next(tokenizer_t *tok)
size_t consumed = 0; size_t consumed = 0;
enum token_type mode = TOK_NONE; enum token_type mode = TOK_NONE;
int fd = -1; int fd = -1;
if (iswdigit(*tok->buff)) if (iswdigit(*this->buff))
consumed = read_redirection_or_fd_pipe(tok->buff, &mode, &fd); consumed = read_redirection_or_fd_pipe(this->buff, &mode, &fd);
if (consumed > 0) if (consumed > 0)
{ {
/* It looks like a redirection or a pipe. But we don't support piping fd 0. Note that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer error. */ /* It looks like a redirection or a pipe. But we don't support piping fd 0. Note that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer error. */
if (mode == TOK_PIPE && fd == 0) if (mode == TOK_PIPE && fd == 0)
{ {
TOK_CALL_ERROR(tok, TOK_OTHER, PIPE_ERROR); TOK_CALL_ERROR(this, TOK_OTHER, PIPE_ERROR);
} }
else else
{ {
tok->buff += consumed; this->buff += consumed;
tok->last_type = mode; this->last_type = mode;
tok->last_token = to_string(fd); this->last_token = to_string(fd);
} }
} }
else else
{ {
/* Not a redirection or pipe, so just a string */ /* Not a redirection or pipe, so just a string */
read_string(tok); this->read_string();
} }
} }
break; break;
} }
} }
wcstring tok_first(const wchar_t *str) wcstring tok_first(const wcstring &str)
{ {
wcstring result; wcstring result;
if (str) tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS);
tok_t token;
if (t.next(&token) && token.type == TOK_STRING)
{ {
tokenizer_t t(str, TOK_SQUASH_ERRORS); result.swap(token.text);
switch (tok_last_type(&t))
{
case TOK_STRING:
{
const wchar_t *tmp = tok_last(&t);
if (tmp != NULL)
result = tmp;
break;
}
default:
break;
}
} }
return result; return result;
} }

View file

@ -91,8 +91,12 @@ struct tok_t
/** /**
The tokenizer struct. The tokenizer struct.
*/ */
struct tokenizer_t class tokenizer_t
{ {
/* No copying, etc. */
tokenizer_t(const tokenizer_t&);
void operator=(const tokenizer_t&);
/** A pointer into the original string, showing where the next token begins */ /** A pointer into the original string, showing where the next token begins */
const wchar_t *buff; const wchar_t *buff;
/** A copy of the original string */ /** A copy of the original string */
@ -120,7 +124,13 @@ struct tokenizer_t
/* Whether to continue the previous line after the comment */ /* Whether to continue the previous line after the comment */
bool continue_line_after_comment; bool continue_line_after_comment;
void call_error(enum tokenizer_error error_type, const wchar_t *error_message);
void read_string();
void read_comment();
void tok_next();
public:
/** /**
Constructor for a tokenizer. b is the string that is to be Constructor for a tokenizer. b is the string that is to be
tokenized. It is not copied, and should not be freed by the caller tokenized. It is not copied, and should not be freed by the caller
@ -146,7 +156,7 @@ struct tokenizer_t
On failure, returns the empty string. On failure, returns the empty string.
*/ */
wcstring tok_first(const wchar_t *str); wcstring tok_first(const wcstring &str);
/* Helper function to determine redirection type from a string, or TOK_NONE if the redirection is invalid. Also returns the fd by reference. */ /* Helper function to determine redirection type from a string, or TOK_NONE if the redirection is invalid. Also returns the fd by reference. */
enum token_type redirection_type_for_string(const wcstring &str, int *out_fd = NULL); enum token_type redirection_type_for_string(const wcstring &str, int *out_fd = NULL);