Hack the tokenizer to compress multiple adjacent newlines into one

This slightly reduces the size of parse trees and is otherwise a minor optimization.
ridiculousfish 2014-11-24 01:20:57 -08:00
parent 196a7c9d18
commit eafd577629
2 changed files with 12 additions and 4 deletions
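
The heart of the change (shown in the second diff below) is a swallow loop added to tok_next(). As a hypothetical standalone sketch (skip_blank_run, the file name, and the demo walk are made up for illustration, not fish code), the idea is: once a newline or carriage return is seen, consume the whole run of newlines, carriage returns, spaces, and tabs, so a stretch of blank lines yields a single end-of-line token rather than one per line.

    // newline_compress_demo.cpp -- hypothetical, illustrative only
    #include <cstdio>

    // Return a pointer just past a run of newlines, carriage returns,
    // spaces, and tabs -- the same condition as the loop added to tok_next().
    static const wchar_t *skip_blank_run(const wchar_t *p)
    {
        while (*p == L'\n' || *p == L'\r' || *p == L' ' || *p == L'\t')
            p++;
        return p;
    }

    int main()
    {
        // Three blank lines (some holding only spaces/tabs) between two words.
        const wchar_t *str = L"Compress_Newlines\n \n\t\n \nInto_Just_One";
        int end_tokens = 0;
        for (const wchar_t *p = str; *p != L'\0';)
        {
            if (*p == L'\n' || *p == L'\r')
            {
                p = skip_blank_run(p); // the whole blank run is consumed at once
                end_tokens++;
            }
            else
            {
                p++; // inside an ordinary word; irrelevant to the demo
            }
        }
        printf("end-of-line tokens: %d\n", end_tokens); // prints 1, not 4
        return 0;
    }

Without the compression, this input's four newlines would each become an end-of-line token; with it, the parser sees a single separator, which is where the parse-tree savings come from.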

@@ -458,10 +458,10 @@ static void test_tok()
say(L"Test destruction of broken tokenizer");
{
const wchar_t *str = L"string <redirection 2>&1 'nested \"quoted\" '(string containing subshells ){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect";
const wchar_t *str = L"string <redirection 2>&1 'nested \"quoted\" '(string containing subshells ){and,brackets}$as[$well (as variable arrays)] not_a_redirect^ ^ ^^is_a_redirect Compress_Newlines\n \n\t\n \nInto_Just_One";
const int types[] =
{
TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_END
TOK_STRING, TOK_REDIRECT_IN, TOK_STRING, TOK_REDIRECT_FD, TOK_STRING, TOK_STRING, TOK_STRING, TOK_REDIRECT_OUT, TOK_REDIRECT_APPEND, TOK_STRING, TOK_STRING, TOK_END, TOK_STRING, TOK_END
};
say(L"Test correct tokenization");

@@ -621,14 +621,22 @@ void tok_next(tokenizer_t *tok)
     switch (*tok->buff)
     {
         case L'\0':
             tok->last_type = TOK_END;
             /*fwprintf( stderr, L"End of string\n" );*/
             tok->has_next = false;
             break;
-        case 13:
+        case 13: // carriage return
         case L'\n':
+            // Hack: when we get a newline, swallow as many as we can
+            // This compresses multiple subsequent newlines into a single one
+            while (*tok->buff == L'\n' || *tok->buff == 13 || *tok->buff == ' ' || *tok->buff == '\t')
+            {
+                tok->buff++;
+            }
+            tok->last_type = TOK_END;
+            break;
         case L';':
             tok->last_type = TOK_END;
             tok->buff++;
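
Two properties of the swallow loop are worth calling out. First, it also consumes spaces and tabs, so blank lines containing only whitespace (like the " \n\t\n " stretch in the new test string) fold into the same single TOK_END. Second, it only triggers on newlines and carriage returns; the untouched case L';' branch still emits one TOK_END per semicolon. Tracing the new test substring through the new code (trace added here for illustration):

    // Input: L"Compress_Newlines\n \n\t\n \nInto_Just_One"
    // 1. "Compress_Newlines"  -> TOK_STRING
    // 2. '\n' enters the newline case; the while loop eats "\n \n\t\n \n"
    //                          -> a single TOK_END
    // 3. "Into_Just_One"      -> TOK_STRING
    // 4. '\0'                 -> TOK_END, and has_next becomes false

This matches the four token types appended to the expected-types array in the first diff.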