Early reworking of tokenizer interface

ridiculousfish 2015-07-25 23:05:47 -07:00
parent 0dbd83ffaf
commit 618896c043
4 changed files with 72 additions and 20 deletions

fish_tests.cpp

@@ -468,22 +468,34 @@ static void test_tok()
     const wchar_t *str = L"string <redirection 2>&1 'nested \"quoted\" '(string containing subshells ){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect Compress_Newlines\n \n\t\n \nInto_Just_One";
     const int types[] =
     {
-        TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_STRING, TOK_END, TOK_STRING, TOK_END
+        TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_STRING, TOK_END, TOK_STRING
     };

     say(L"Test correct tokenization");

     tokenizer_t t(str, 0);
-    for (size_t i=0; i < sizeof types / sizeof *types; i++, tok_next(&t))
+    tok_t token;
+    size_t i = 0;
+    while (t.next(&token))
     {
-        if (types[i] != tok_last_type(&t))
+        if (i > sizeof types / sizeof *types)
+        {
+            err(L"Too many tokens returned from tokenizer");
+            break;
+        }
+        if (types[i] != token.type)
         {
             err(L"Tokenization error:");
-            wprintf(L"Token number %d of string \n'%ls'\n, got token '%ls'\n",
+            wprintf(L"Token number %d of string \n'%ls'\n, got token type %ld\n",
                     i+1,
                     str,
-                    tok_last(&t));
+                    (long)token.type);
         }
+        i++;
     }
+
+    if (i < sizeof types / sizeof *types)
+    {
+        err(L"Too few tokens returned from tokenizer");
+    }
 }
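
The rewritten test shows the shape of the new interface: instead of polling tok_last_type()/tok_last() between tok_next() calls, callers pull tokens out by value until next() returns false. A minimal sketch of the consuming pattern, assuming only the declarations this commit adds to tokenizer.h:

#include <cwchar>
#include "tokenizer.h"

// Illustrative helper (not part of the commit): dump each token's type,
// offset, and text using the new value-style interface.
static void dump_tokens(const wchar_t *cmd)
{
    tokenizer_t t(cmd, 0);
    tok_t token;
    while (t.next(&token))  // copies the current token out, then advances
    {
        wprintf(L"type=%d offset=%zu text='%ls'\n",
                (int)token.type, token.offset, token.text.c_str());
    }
}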

reader.cpp

@@ -246,7 +246,7 @@ public:
     /**
        Saved position used by token history search
     */
-    int token_history_pos;
+    size_t token_history_pos;

     /**
        Saved search string for token history search. Not handled by command_line_changed.
@@ -2256,7 +2256,7 @@ static void handle_token_history(int forward, int reset)
         return;

     wcstring str;
-    long current_pos;
+    size_t current_pos;

     if (reset)
     {
@@ -2292,7 +2292,7 @@ static void handle_token_history(int forward, int reset)
     }
     else
     {
-        if (current_pos == -1)
+        if (current_pos == size_t(-1))
         {
             data->token_history_buff.clear();
@@ -2330,26 +2330,26 @@ static void handle_token_history(int forward, int reset)
         //debug( 3, L"new '%ls'", data->token_history_buff.c_str() );

         tokenizer_t tok(data->token_history_buff.c_str(), TOK_ACCEPT_UNFINISHED);
-        for (; tok_has_next(&tok); tok_next(&tok))
+        tok_t token;
+        while (tok.next(&token))
         {
-            switch (tok_last_type(&tok))
+            switch (token.type)
             {
                 case TOK_STRING:
                 {
-                    if (wcsstr(tok_last(&tok), data->search_buff.c_str()))
-                    {
+                    if (token.text.find(data->search_buff) != wcstring::npos)
+                    {
                         //debug( 3, L"Found token at pos %d\n", tok_get_pos( &tok ) );
-                        if (tok_get_pos(&tok) >= current_pos)
+                        if (token.offset >= current_pos)
                         {
                             break;
                         }
                         //debug( 3, L"ok pos" );
-                        const wcstring last_tok = tok_last(&tok);
-                        if (find(data->search_prev.begin(), data->search_prev.end(), last_tok) == data->search_prev.end())
+                        if (find(data->search_prev.begin(), data->search_prev.end(), token.text) == data->search_prev.end())
                         {
-                            data->token_history_pos = tok_get_pos(&tok);
-                            str = tok_last(&tok);
+                            data->token_history_pos = token.offset;
+                            str = token.text;
                         }
                     }
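
Two idioms recur in this file's hunks: positions become unsigned, so the "not set" sentinel is written size_t(-1) rather than -1, and the substring test moves from C's wcsstr to wcstring::find against npos. A standalone illustration (these helper names are ours, not the commit's):

#include <string>

typedef std::wstring wcstring;  // matches fish's typedef in common.h

static bool contains(const wcstring &haystack, const wcstring &needle)
{
    // wcstring::find returns wcstring::npos, i.e. size_t(-1), when the
    // needle is absent; no c_str() round trip needed.
    return haystack.find(needle) != wcstring::npos;
}

static bool pos_is_unset(size_t pos)
{
    // With an unsigned position the sentinel must be spelled size_t(-1);
    // a bare `pos == -1` still works via implicit conversion but draws
    // sign-compare warnings.
    return pos == size_t(-1);
}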

tokenizer.cpp

@@ -55,7 +55,7 @@ segments.
 /**
    Set the latest tokens string to be the specified error message
 */
-static void tok_call_error(tokenizer_t *tok, int error_type, const wchar_t *error_message)
+static void tok_call_error(tokenizer_t *tok, enum tokenizer_error error_type, const wchar_t *error_message)
 {
     tok->last_type = TOK_ERROR;
     tok->error = error_type;
@@ -67,7 +67,7 @@ int tok_get_error(tokenizer_t *tok)
     return tok->error;
 }

-tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(0), squash_errors(false), continue_line_after_comment(false)
+tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(TOK_ERROR_NONE), squash_errors(false), continue_line_after_comment(false)
 {
     CHECK(b,);
@@ -81,6 +81,22 @@ tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig
     tok_next(this);
 }

+bool tokenizer_t::next(struct tok_t *result)
+{
+    assert(result != NULL);
+    if (! this->has_next)
+    {
+        return false;
+    }
+    result->text = this->last_token;
+    result->type = this->last_type;
+    result->offset = last_pos;
+    assert(this->buff >= this->orig_buff);
+    result->length = this->buff - this->orig_buff;
+    tok_next(this);
+    return true;
+}
+
 enum token_type tok_last_type(tokenizer_t *tok)
 {
     CHECK(tok, TOK_ERROR);
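
Note the ordering inside next(): it snapshots the token the tokenizer currently holds (last_token, last_type, last_pos) and only then calls tok_next() to advance. Because the text is copied into the tok_t rather than handed back as a pointer into the tokenizer (as tok_last() is), results stay valid after the tokenizer moves on. A sketch under that assumption:

#include <vector>
#include "tokenizer.h"

// Illustrative, not part of the commit: collect every token up front.
static std::vector<tok_t> collect_tokens(const wchar_t *cmd)
{
    std::vector<tok_t> result;
    tokenizer_t t(cmd, TOK_ACCEPT_UNFINISHED);
    tok_t token;
    while (t.next(&token))
    {
        result.push_back(token);  // safe: each tok_t owns its text
    }
    return result;
}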

tokenizer.h

@@ -36,6 +36,7 @@ enum token_type
 */
 enum tokenizer_error
 {
+    TOK_ERROR_NONE,
     TOK_UNTERMINATED_QUOTE,
     TOK_UNTERMINATED_SUBSHELL,
     TOK_UNTERMINATED_ESCAPE,
@@ -67,6 +68,26 @@ enum tokenizer_error
 typedef unsigned int tok_flags_t;

+struct tok_t
+{
+    /* The text of the token, or an error message for type error */
+    wcstring text;
+
+    /* The type of the token */
+    token_type type;
+
+    /* Offset of the token */
+    size_t offset;
+
+    /* Length of the token */
+    size_t length;
+
+    /* If an error, this is the error code */
+    enum tokenizer_error error;
+
+    tok_t() : type(TOK_NONE), offset(-1), length(-1), error(TOK_ERROR_NONE) {}
+};
+
 /**
    The tokenizer struct.
 */
@@ -93,7 +114,7 @@ struct tokenizer_t
     /** Whether all blank lines are returned */
     bool show_blank_lines;

     /** Last error */
-    int error;
+    tokenizer_error error;

     /* Whether we are squashing errors */
     bool squash_errors;
@@ -112,6 +133,9 @@
     */
     tokenizer_t(const wchar_t *b, tok_flags_t flags);

+    /** Returns the next token by reference. Returns true if we got one, false if we're at the end. */
+    bool next(struct tok_t *result);
 };

 /**
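
tok_t reserves an error field with TOK_ERROR_NONE as its zero value, though next() as committed does not yet populate it; a caller can still detect errors per token through the type and text fields (per the struct comments, for a TOK_ERROR token the text holds the error message). A hedged sketch, with the checking function ours:

#include <cstdio>
#include <cwchar>
#include "tokenizer.h"

// Illustrative: scan a command line and report the first tokenizer error.
static bool check_for_errors(const wchar_t *cmd)
{
    tokenizer_t t(cmd, 0);
    tok_t token;
    while (t.next(&token))
    {
        if (token.type == TOK_ERROR)
        {
            fwprintf(stderr, L"tokenizer error at offset %zu: %ls\n",
                     token.offset, token.text.c_str());
            return false;
        }
    }
    return true;
}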