Early reworking of tokenizer interface

ridiculousfish 2015-07-25 23:05:47 -07:00
parent 0dbd83ffaf
commit 618896c043
4 changed files with 72 additions and 20 deletions

fish_tests.cpp

@@ -468,22 +468,34 @@ static void test_tok()
     const wchar_t *str = L"string <redirection 2>&1 'nested \"quoted\" '(string containing subshells ){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect Compress_Newlines\n \n\t\n \nInto_Just_One";
     const int types[] =
     {
-        TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_STRING, TOK_END, TOK_STRING, TOK_END
+        TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_STRING, TOK_END, TOK_STRING
     };

     say(L"Test correct tokenization");

     tokenizer_t t(str, 0);
-    for (size_t i=0; i < sizeof types / sizeof *types; i++, tok_next(&t))
+    tok_t token;
+    size_t i = 0;
+    while (t.next(&token))
     {
-        if (types[i] != tok_last_type(&t))
+        if (i > sizeof types / sizeof *types)
+        {
+            err(L"Too many tokens returned from tokenizer");
+            break;
+        }
+        if (types[i] != token.type)
         {
             err(L"Tokenization error:");
-            wprintf(L"Token number %d of string \n'%ls'\n, got token '%ls'\n",
+            wprintf(L"Token number %d of string \n'%ls'\n, got token type %ld\n",
                     i+1,
                     str,
-                    tok_last(&t));
+                    (long)token.type);
         }
+        i++;
     }
+
+    if (i < sizeof types / sizeof *types)
+    {
+        err(L"Too few tokens returned from tokenizer");
+    }
 }
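
The rewritten test shows the shape of the new interface: instead of polling tok_last_type()/tok_last() between tok_next() calls, callers pull tokens out by value until next() returns false. A minimal sketch of the consuming pattern, assuming only the declarations this commit adds to tokenizer.h:

#include <cwchar>
#include "tokenizer.h"

// Illustrative helper (not part of the commit): dump each token's type,
// offset, and text using the new value-style interface.
static void dump_tokens(const wchar_t *cmd)
{
    tokenizer_t t(cmd, 0);
    tok_t token;
    while (t.next(&token))  // copies the current token out, then advances
    {
        wprintf(L"type=%d offset=%zu text='%ls'\n",
                (int)token.type, token.offset, token.text.c_str());
    }
}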

reader.cpp

@@ -246,7 +246,7 @@ public:
     /**
        Saved position used by token history search
     */
-    int token_history_pos;
+    size_t token_history_pos;

     /**
        Saved search string for token history search. Not handled by command_line_changed.
@@ -2256,7 +2256,7 @@ static void handle_token_history(int forward, int reset)
         return;

     wcstring str;
-    long current_pos;
+    size_t current_pos;

     if (reset)
     {
@@ -2292,7 +2292,7 @@ static void handle_token_history(int forward, int reset)
     }
     else
     {
-        if (current_pos == -1)
+        if (current_pos == size_t(-1))
         {
             data->token_history_buff.clear();
@@ -2330,26 +2330,26 @@ static void handle_token_history(int forward, int reset)
         //debug( 3, L"new '%ls'", data->token_history_buff.c_str() );

         tokenizer_t tok(data->token_history_buff.c_str(), TOK_ACCEPT_UNFINISHED);
-        for (; tok_has_next(&tok); tok_next(&tok))
+        tok_t token;
+        while (tok.next(&token))
         {
-            switch (tok_last_type(&tok))
+            switch (token.type)
             {
                 case TOK_STRING:
                 {
-                    if (wcsstr(tok_last(&tok), data->search_buff.c_str()))
-                    {
+                    if (token.text.find(data->search_buff) != wcstring::npos)
+                    {
                         //debug( 3, L"Found token at pos %d\n", tok_get_pos( &tok ) );
-                        if (tok_get_pos(&tok) >= current_pos)
+                        if (token.offset >= current_pos)
                         {
                             break;
                         }
                         //debug( 3, L"ok pos" );
-                        const wcstring last_tok = tok_last(&tok);
-                        if (find(data->search_prev.begin(), data->search_prev.end(), last_tok) == data->search_prev.end())
+                        if (find(data->search_prev.begin(), data->search_prev.end(), token.text) == data->search_prev.end())
                         {
-                            data->token_history_pos = tok_get_pos(&tok);
-                            str = tok_last(&tok);
+                            data->token_history_pos = token.offset;
+                            str = token.text;
                         }
                     }
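
Two idioms recur in this file's hunks: positions become unsigned, so the "not set" sentinel is written size_t(-1) rather than -1, and the substring test moves from C's wcsstr to wcstring::find against npos. A standalone illustration (these helper names are ours, not the commit's):

#include <string>

typedef std::wstring wcstring;  // matches fish's typedef in common.h

static bool contains(const wcstring &haystack, const wcstring &needle)
{
    // wcstring::find returns wcstring::npos, i.e. size_t(-1), when the
    // needle is absent; no c_str() round trip needed.
    return haystack.find(needle) != wcstring::npos;
}

static bool pos_is_unset(size_t pos)
{
    // With an unsigned position the sentinel must be spelled size_t(-1);
    // a bare `pos == -1` still works via implicit conversion but draws
    // sign-compare warnings.
    return pos == size_t(-1);
}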

tokenizer.cpp

@@ -55,7 +55,7 @@ segments.
 /**
    Set the latest tokens string to be the specified error message
 */
-static void tok_call_error(tokenizer_t *tok, int error_type, const wchar_t *error_message)
+static void tok_call_error(tokenizer_t *tok, enum tokenizer_error error_type, const wchar_t *error_message)
 {
     tok->last_type = TOK_ERROR;
     tok->error = error_type;
@@ -67,7 +67,7 @@ int tok_get_error(tokenizer_t *tok)
     return tok->error;
 }

-tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(0), squash_errors(false), continue_line_after_comment(false)
+tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig_buff(NULL), last_type(TOK_NONE), last_pos(0), has_next(false), accept_unfinished(false), show_comments(false), show_blank_lines(false), error(TOK_ERROR_NONE), squash_errors(false), continue_line_after_comment(false)
 {
     CHECK(b,);
@@ -81,6 +81,22 @@ tokenizer_t::tokenizer_t(const wchar_t *b, tok_flags_t flags) : buff(NULL), orig
     tok_next(this);
 }

+bool tokenizer_t::next(struct tok_t *result)
+{
+    assert(result != NULL);
+    if (! this->has_next)
+    {
+        return false;
+    }
+    result->text = this->last_token;
+    result->type = this->last_type;
+    result->offset = last_pos;
+    assert(this->buff >= this->orig_buff);
+    result->length = this->buff - this->orig_buff;
+    tok_next(this);
+    return true;
+}
+
 enum token_type tok_last_type(tokenizer_t *tok)
 {
     CHECK(tok, TOK_ERROR);
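
Note the ordering inside next(): it snapshots the token the tokenizer currently holds (last_token, last_type, last_pos) and only then calls tok_next() to advance. Because the text is copied into the tok_t rather than handed back as a pointer into the tokenizer (as tok_last() is), results stay valid after the tokenizer moves on. A sketch under that assumption:

#include <vector>
#include "tokenizer.h"

// Illustrative, not part of the commit: collect every token up front.
static std::vector<tok_t> collect_tokens(const wchar_t *cmd)
{
    std::vector<tok_t> result;
    tokenizer_t t(cmd, TOK_ACCEPT_UNFINISHED);
    tok_t token;
    while (t.next(&token))
    {
        result.push_back(token);  // safe: each tok_t owns its text
    }
    return result;
}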

tokenizer.h

@@ -36,6 +36,7 @@ enum token_type
 */
 enum tokenizer_error
 {
+    TOK_ERROR_NONE,
     TOK_UNTERMINATED_QUOTE,
     TOK_UNTERMINATED_SUBSHELL,
     TOK_UNTERMINATED_ESCAPE,
@@ -67,6 +68,26 @@ enum tokenizer_error
 typedef unsigned int tok_flags_t;

+struct tok_t
+{
+    /* The text of the token, or an error message for type error */
+    wcstring text;
+
+    /* The type of the token */
+    token_type type;
+
+    /* Offset of the token */
+    size_t offset;
+
+    /* Length of the token */
+    size_t length;
+
+    /* If an error, this is the error code */
+    enum tokenizer_error error;
+
+    tok_t() : type(TOK_NONE), offset(-1), length(-1), error(TOK_ERROR_NONE) {}
+};
+
 /**
    The tokenizer struct.
 */
@@ -93,7 +114,7 @@ struct tokenizer_t
     /** Whether all blank lines are returned */
     bool show_blank_lines;

     /** Last error */
-    int error;
+    tokenizer_error error;

     /* Whether we are squashing errors */
     bool squash_errors;
@@ -112,6 +133,9 @@
     */
     tokenizer_t(const wchar_t *b, tok_flags_t flags);

+    /** Returns the next token by reference. Returns true if we got one, false if we're at the end. */
+    bool next(struct tok_t *result);
 };

 /**
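
tok_t reserves an error field with TOK_ERROR_NONE as its zero value, though next() as committed does not yet populate it; a caller can still detect errors per token through the type and text fields (per the struct comments, for a TOK_ERROR token the text holds the error message). A hedged sketch, with the checking function ours:

#include <cstdio>
#include <cwchar>
#include "tokenizer.h"

// Illustrative: scan a command line and report the first tokenizer error.
static bool check_for_errors(const wchar_t *cmd)
{
    tokenizer_t t(cmd, 0);
    tok_t token;
    while (t.next(&token))
    {
        if (token.type == TOK_ERROR)
        {
            fwprintf(stderr, L"tokenizer error at offset %zu: %ls\n",
                     token.offset, token.text.c_str());
            return false;
        }
    }
    return true;
}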