diff --git a/src/builtin.cpp b/src/builtin.cpp index 3381996a7..b31fb6a11 100644 --- a/src/builtin.cpp +++ b/src/builtin.cpp @@ -3620,7 +3620,7 @@ static int builtin_fg(parser_t &parser, wchar_t **argv) j->command_wcstr()); } - const wcstring ft = tok_first(j->command_wcstr()); + const wcstring ft = tok_first(j->command()); if (! ft.empty()) env_set(L"_", ft.c_str(), ENV_EXPORT); reader_write_title(j->command()); diff --git a/src/expand.cpp b/src/expand.cpp index 1d541a246..29cc410f3 100644 --- a/src/expand.cpp +++ b/src/expand.cpp @@ -535,7 +535,7 @@ bool process_iterator_t::next_process(wcstring *out_str, pid_t *out_pid) fgetws2(&full_command_line, cmdfile); /* The command line needs to be escaped */ - cmd = tok_first(full_command_line.c_str()); + cmd = tok_first(full_command_line); } #ifdef SunOS else if ((cmdfile=wfopen(path + L"/psinfo", "r"))) diff --git a/src/parse_tree.cpp b/src/parse_tree.cpp index c884f6670..1583b8045 100644 --- a/src/parse_tree.cpp +++ b/src/parse_tree.cpp @@ -1299,7 +1299,7 @@ bool parse_tree_from_string(const wcstring &str, parse_tree_flags_t parse_flags, if (errors == NULL) tok_options |= TOK_SQUASH_ERRORS; - tokenizer_t tok = tokenizer_t(str.c_str(), tok_options); + tokenizer_t tok(str.c_str(), tok_options); /* We are an LL(2) parser. We pass two tokens at a time. New tokens come in at index 1. Seed our queue with an initial token at index 1. */ parse_token_t queue[2] = {kInvalidToken, kInvalidToken}; diff --git a/src/parse_util.cpp b/src/parse_util.cpp index ac6cb8ca5..bbee5d844 100644 --- a/src/parse_util.cpp +++ b/src/parse_util.cpp @@ -1169,7 +1169,7 @@ void parse_util_expand_variable_error(const wcstring &token, size_t global_token &cmdsub_end, true) > 0) { - token_after_parens = tok_first(paren_text.c_str()); + token_after_parens = tok_first(paren_text); } /* Make sure we always show something */ @@ -1233,7 +1233,7 @@ static parser_test_error_bits_t detect_dollar_cmdsub_errors(size_t arg_src_offse result_bits |= PARSER_TEST_ERROR; if (out_errors != NULL) { - wcstring subcommand_first_token = tok_first(cmdsubst_src.c_str()); + wcstring subcommand_first_token = tok_first(cmdsubst_src); if (subcommand_first_token.empty()) { // e.g. $(). Report somthing. diff --git a/src/reader.cpp b/src/reader.cpp index 0ee2746c5..33376284e 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -2538,7 +2538,7 @@ void reader_run_command(parser_t &parser, const wcstring &cmd) struct timeval time_before, time_after; - wcstring ft = tok_first(cmd.c_str()); + wcstring ft = tok_first(cmd); if (! ft.empty()) env_set(L"_", ft.c_str(), ENV_GLOBAL); diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 18a617b18..c06638766 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -24,7 +24,7 @@ segments. #include "common.h" /* Wow what a hack */ -#define TOK_CALL_ERROR(t, e, x) do { tok_call_error((t), (e), (t)->squash_errors ? L"" : (x)); } while (0) +#define TOK_CALL_ERROR(t, e, x) do { (t)->call_error((e), (t)->squash_errors ? L"" : (x)); } while (0) /** Error string for unexpected end of string @@ -52,23 +52,19 @@ segments. */ #define PIPE_ERROR _( L"Cannot use stdin (fd 0) as pipe output" ) -static void tok_next(tokenizer_t *tok); -static enum token_type tok_last_type(tokenizer_t *tok); -static const wchar_t *tok_last(tokenizer_t *tok); - /** Set the latest tokens string to be the specified error message */ -static void tok_call_error(tokenizer_t *tok, enum tokenizer_error error_type, const wchar_t *error_message) +void tokenizer_t::call_error(enum tokenizer_error error_type, const wchar_t *error_message) { - tok->last_type = TOK_ERROR; - tok->error = error_type; - tok->last_token = error_message; + this->last_type = TOK_ERROR; + this->error = error_type; + this->last_token = error_message; } -tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(TOK_ERROR_NONE), squash_errors(false), continue_line_after_comment(false) +tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(b), orig_buff(b), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(TOK_ERROR_NONE), squash_errors(false), continue_line_after_comment(false) { - CHECK(b,); + assert(b != NULL); this->accept_unfinished = !!(flags & TOK_ACCEPT_UNFINISHED); this->show_comments = !!(flags & TOK_SHOW_COMMENTS); @@ -76,8 +72,7 @@ tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig this->show_blank_lines = !!(flags & TOK_SHOW_BLANK_LINES); this->has_next = (*b != L'\0'); - this->orig_buff = this->buff = b; - tok_next(this); + this->tok_next(); } bool tokenizer_t::next(struct tok_t *result) @@ -97,25 +92,10 @@ bool tokenizer_t::next(struct tok_t *result) size_t current_pos = this->buff - this->orig_buff; result->length = current_pos >= this->last_pos ? current_pos - this->last_pos : 0; - tok_next(this); + this->tok_next(); return true; } -static enum token_type tok_last_type(tokenizer_t *tok) -{ - CHECK(tok, TOK_ERROR); - CHECK(tok->buff, TOK_ERROR); - - return tok->last_type; -} - -static const wchar_t *tok_last(tokenizer_t *tok) -{ - CHECK(tok, 0); - - return tok->last_token.c_str(); -} - /** Tests if this character can be a part of a string. The redirect ^ is allowed unless it's the first character. Hash (#) starts a comment if it's the first character in a token; otherwise it is considered a string character. @@ -161,14 +141,14 @@ static int myal(wchar_t c) /** Read the next token as a string */ -static void read_string(tokenizer_t *tok) +void tokenizer_t::read_string() { const wchar_t *start; long len; int do_loop=1; int paran_count=0; - start = tok->buff; + start = this->buff; bool is_first = true; enum tok_mode_t @@ -181,27 +161,27 @@ static void read_string(tokenizer_t *tok) while (1) { - if (!myal(*tok->buff)) + if (!myal(*this->buff)) { - if (*tok->buff == L'\\') + if (*this->buff == L'\\') { - tok->buff++; - if (*tok->buff == L'\0') + this->buff++; + if (*this->buff == L'\0') { - if ((!tok->accept_unfinished)) + if ((!this->accept_unfinished)) { - TOK_CALL_ERROR(tok, TOK_UNTERMINATED_ESCAPE, QUOTE_ERROR); + TOK_CALL_ERROR(this, TOK_UNTERMINATED_ESCAPE, QUOTE_ERROR); return; } else { /* Since we are about to increment tok->buff, decrement it first so the increment doesn't go past the end of the buffer. https://github.com/fish-shell/fish-shell/issues/389 */ - tok->buff--; + this->buff--; do_loop = 0; } } - tok->buff++; + this->buff++; continue; } @@ -209,7 +189,7 @@ static void read_string(tokenizer_t *tok) { case mode_regular_text: { - switch (*tok->buff) + switch (*this->buff) { case L'(': { @@ -220,7 +200,7 @@ static void read_string(tokenizer_t *tok) case L'[': { - if (tok->buff != start) + if (this->buff != start) mode = mode_array_brackets; break; } @@ -229,18 +209,18 @@ static void read_string(tokenizer_t *tok) case L'"': { - const wchar_t *end = quote_end(tok->buff); + const wchar_t *end = quote_end(this->buff); if (end) { - tok->buff=(wchar_t *)end; + this->buff=end; } else { - tok->buff += wcslen(tok->buff); + this->buff += wcslen(this->buff); - if (! tok->accept_unfinished) + if (! this->accept_unfinished) { - TOK_CALL_ERROR(tok, TOK_UNTERMINATED_QUOTE, QUOTE_ERROR); + TOK_CALL_ERROR(this, TOK_UNTERMINATED_QUOTE, QUOTE_ERROR); return; } do_loop = 0; @@ -251,7 +231,7 @@ static void read_string(tokenizer_t *tok) default: { - if (! tok_is_string_character(*(tok->buff), is_first)) + if (! tok_is_string_character(*(this->buff), is_first)) { do_loop=0; } @@ -262,22 +242,22 @@ static void read_string(tokenizer_t *tok) case mode_array_brackets_and_subshell: case mode_subshell: - switch (*tok->buff) + switch (*this->buff) { case L'\'': case L'\"': { - const wchar_t *end = quote_end(tok->buff); + const wchar_t *end = quote_end(this->buff); if (end) { - tok->buff=(wchar_t *)end; + this->buff = end; } else { - tok->buff += wcslen(tok->buff); - if ((!tok->accept_unfinished)) + this->buff += wcslen(this->buff); + if ((!this->accept_unfinished)) { - TOK_CALL_ERROR(tok, TOK_UNTERMINATED_QUOTE, QUOTE_ERROR); + TOK_CALL_ERROR(this, TOK_UNTERMINATED_QUOTE, QUOTE_ERROR); return; } do_loop = 0; @@ -303,7 +283,7 @@ static void read_string(tokenizer_t *tok) break; case mode_array_brackets: - switch (*tok->buff) + switch (*this->buff) { case L'(': paran_count=1; @@ -326,20 +306,20 @@ static void read_string(tokenizer_t *tok) if (!do_loop) break; - tok->buff++; + this->buff++; is_first = false; } - if ((!tok->accept_unfinished) && (mode != mode_regular_text)) + if ((!this->accept_unfinished) && (mode != mode_regular_text)) { switch (mode) { case mode_subshell: - TOK_CALL_ERROR(tok, TOK_UNTERMINATED_SUBSHELL, PARAN_ERROR); + TOK_CALL_ERROR(this, TOK_UNTERMINATED_SUBSHELL, PARAN_ERROR); break; case mode_array_brackets: case mode_array_brackets_and_subshell: - TOK_CALL_ERROR(tok, TOK_UNTERMINATED_SUBSHELL, SQUARE_BRACKET_ERROR); // TOK_UNTERMINATED_SUBSHELL is a lie but nobody actually looks at it + TOK_CALL_ERROR(this, TOK_UNTERMINATED_SUBSHELL, SQUARE_BRACKET_ERROR); // TOK_UNTERMINATED_SUBSHELL is a lie but nobody actually looks at it break; default: assert(0 && "Unexpected mode in read_string"); @@ -349,27 +329,24 @@ static void read_string(tokenizer_t *tok) } - len = tok->buff - start; + len = this->buff - start; - tok->last_token.assign(start, len); - tok->last_type = TOK_STRING; + this->last_token.assign(start, len); + this->last_type = TOK_STRING; } /** Read the next token as a comment. */ -static void read_comment(tokenizer_t *tok) +void tokenizer_t::read_comment() { - const wchar_t *start; + const wchar_t *start = this->buff; + while (*(this->buff)!= L'\n' && *(this->buff)!= L'\0') + this->buff++; - start = tok->buff; - while (*(tok->buff)!= L'\n' && *(tok->buff)!= L'\0') - tok->buff++; - - - size_t len = tok->buff - start; - tok->last_token.assign(start, len); - tok->last_type = TOK_COMMENT; + size_t len = this->buff - start; + this->last_token.assign(start, len); + this->last_type = TOK_COMMENT; } @@ -531,35 +508,31 @@ static bool my_iswspace(wchar_t c) return c != L'\n' && iswspace(c); } -static void tok_next(tokenizer_t *tok) +void tokenizer_t::tok_next() { - - CHECK(tok,); - CHECK(tok->buff,); - - if (tok_last_type(tok) == TOK_ERROR) + if (this->last_type == TOK_ERROR) { - tok->has_next=false; + this->has_next=false; return; } - if (!tok->has_next) + if (!this->has_next) { /* wprintf( L"EOL\n" );*/ - tok->last_type = TOK_END; + this->last_type = TOK_END; return; } while (1) { - if (tok->buff[0] == L'\\' && tok->buff[1] == L'\n') + if (this->buff[0] == L'\\' && this->buff[1] == L'\n') { - tok->buff += 2; - tok->continue_line_after_comment = true; + this->buff += 2; + this->continue_line_after_comment = true; } - else if (my_iswspace(tok->buff[0])) + else if (my_iswspace(this->buff[0])) { - tok->buff++; + this->buff++; } else { @@ -568,68 +541,68 @@ static void tok_next(tokenizer_t *tok) } - while (*tok->buff == L'#') + while (*this->buff == L'#') { - if (tok->show_comments) + if (this->show_comments) { - tok->last_pos = tok->buff - tok->orig_buff; - read_comment(tok); + this->last_pos = this->buff - this->orig_buff; + this->read_comment(); - if (tok->buff[0] == L'\n' && tok->continue_line_after_comment) - tok->buff++; + if (this->buff[0] == L'\n' && this->continue_line_after_comment) + this->buff++; return; } else { - while (*(tok->buff)!= L'\n' && *(tok->buff)!= L'\0') - tok->buff++; + while (*(this->buff)!= L'\n' && *(this->buff)!= L'\0') + this->buff++; - if (tok->buff[0] == L'\n' && tok->continue_line_after_comment) - tok->buff++; + if (this->buff[0] == L'\n' && this->continue_line_after_comment) + this->buff++; } - while (my_iswspace(*(tok->buff))) { - tok->buff++; + while (my_iswspace(*(this->buff))) { + this->buff++; } } - tok->continue_line_after_comment = false; + this->continue_line_after_comment = false; - tok->last_pos = tok->buff - tok->orig_buff; + this->last_pos = this->buff - this->orig_buff; - switch (*tok->buff) + switch (*this->buff) { case L'\0': - tok->last_type = TOK_END; + this->last_type = TOK_END; /*fwprintf( stderr, L"End of string\n" );*/ - tok->has_next = false; + this->has_next = false; break; case 13: // carriage return case L'\n': case L';': - tok->last_type = TOK_END; - tok->buff++; + this->last_type = TOK_END; + this->buff++; // Hack: when we get a newline, swallow as many as we can // This compresses multiple subsequent newlines into a single one - if (! tok->show_blank_lines) + if (! this->show_blank_lines) { - while (*tok->buff == L'\n' || *tok->buff == 13 /* CR */ || *tok->buff == ' ' || *tok->buff == '\t') + while (*this->buff == L'\n' || *this->buff == 13 /* CR */ || *this->buff == ' ' || *this->buff == '\t') { - tok->buff++; + this->buff++; } } - tok->last_token.clear(); + this->last_token.clear(); break; case L'&': - tok->last_type = TOK_BACKGROUND; - tok->buff++; + this->last_type = TOK_BACKGROUND; + this->buff++; break; case L'|': - tok->last_token = L"1"; - tok->last_type = TOK_PIPE; - tok->buff++; + this->last_token = L"1"; + this->last_type = TOK_PIPE; + this->buff++; break; case L'>': @@ -639,16 +612,16 @@ static void tok_next(tokenizer_t *tok) /* There's some duplication with the code in the default case below. The key difference here is that we must never parse these as a string; a failed redirection is an error! */ enum token_type mode = TOK_NONE; int fd = -1; - size_t consumed = read_redirection_or_fd_pipe(tok->buff, &mode, &fd); + size_t consumed = read_redirection_or_fd_pipe(this->buff, &mode, &fd); if (consumed == 0 || fd < 0) { - TOK_CALL_ERROR(tok, TOK_OTHER, REDIRECT_ERROR); + TOK_CALL_ERROR(this, TOK_OTHER, REDIRECT_ERROR); } else { - tok->buff += consumed; - tok->last_type = mode; - tok->last_token = to_string(fd); + this->buff += consumed; + this->last_type = mode; + this->last_token = to_string(fd); } } break; @@ -659,53 +632,42 @@ static void tok_next(tokenizer_t *tok) size_t consumed = 0; enum token_type mode = TOK_NONE; int fd = -1; - if (iswdigit(*tok->buff)) - consumed = read_redirection_or_fd_pipe(tok->buff, &mode, &fd); + if (iswdigit(*this->buff)) + consumed = read_redirection_or_fd_pipe(this->buff, &mode, &fd); if (consumed > 0) { /* It looks like a redirection or a pipe. But we don't support piping fd 0. Note that fd 0 may be -1, indicating overflow; but we don't treat that as a tokenizer error. */ if (mode == TOK_PIPE && fd == 0) { - TOK_CALL_ERROR(tok, TOK_OTHER, PIPE_ERROR); + TOK_CALL_ERROR(this, TOK_OTHER, PIPE_ERROR); } else { - tok->buff += consumed; - tok->last_type = mode; - tok->last_token = to_string(fd); + this->buff += consumed; + this->last_type = mode; + this->last_token = to_string(fd); } } else { /* Not a redirection or pipe, so just a string */ - read_string(tok); + this->read_string(); } } break; - } } -wcstring tok_first(const wchar_t *str) +wcstring tok_first(const wcstring &str) { wcstring result; - if (str) + tokenizer_t t(str.c_str(), TOK_SQUASH_ERRORS); + tok_t token; + if (t.next(&token) && token.type == TOK_STRING) { - tokenizer_t t(str, TOK_SQUASH_ERRORS); - switch (tok_last_type(&t)) - { - case TOK_STRING: - { - const wchar_t *tmp = tok_last(&t); - if (tmp != NULL) - result = tmp; - break; - } - default: - break; - } + result.swap(token.text); } return result; } diff --git a/src/tokenizer.h b/src/tokenizer.h index 7e4003f89..08a771f1b 100644 --- a/src/tokenizer.h +++ b/src/tokenizer.h @@ -91,8 +91,12 @@ struct tok_t /** The tokenizer struct. */ -struct tokenizer_t +class tokenizer_t { + /* No copying, etc. */ + tokenizer_t(const tokenizer_t&); + void operator=(const tokenizer_t&); + /** A pointer into the original string, showing where the next token begins */ const wchar_t *buff; /** A copy of the original string */ @@ -120,7 +124,13 @@ struct tokenizer_t /* Whether to continue the previous line after the comment */ bool continue_line_after_comment; - + + void call_error(enum tokenizer_error error_type, const wchar_t *error_message); + void read_string(); + void read_comment(); + void tok_next(); + +public: /** Constructor for a tokenizer. b is the string that is to be tokenized. It is not copied, and should not be freed by the caller @@ -146,7 +156,7 @@ struct tokenizer_t On failure, returns the empty string. */ -wcstring tok_first(const wchar_t *str); +wcstring tok_first(const wcstring &str); /* Helper function to determine redirection type from a string, or TOK_NONE if the redirection is invalid. Also returns the fd by reference. */ enum token_type redirection_type_for_string(const wcstring &str, int *out_fd = NULL);