Changes to word recognition per https://github.com/fish-shell/fish-shell/issues/384

Word movement should be very similar to fish 1.x. backward-kill-word remains more liberal, but now stops at any of {,'"=}
2024-12-25 20:33:08 +00:00 · 2012-12-20 17:37:09 -08:00 · 2012-12-20 17:37:09 -08:00 · 0b1e371880
commit 0b1e371880
parent ce15abd577
7 changed files with 224 additions and 51 deletions
--- a/builtin.cpp
+++ b/builtin.cpp
@ -2031,7 +2031,7 @@ static int builtin_function(parser_t &parser, wchar_t **argv)
        {
            const wchar_t *nxt = names.at(i).c_str();
            size_t l = wcslen(nxt + 2);
-            if (chars+l > common_get_width())
+            if (chars+l > (size_t)common_get_width())
            {
                chars = 0;
                stderr_buffer.push_back(L'\n');
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@ -674,6 +674,95 @@ static void test_path()
    }
 }

+enum word_motion_t { // Direction in which test_1_word_motion walks the command string
+    word_motion_left, // start at the end of the command and walk toward index 0
+    word_motion_right // start at index 0 and walk toward the end of the command
+};
+/**
+   Check a single word-motion test case against move_word_state_machine_t.
+
+   \param motion Direction in which to walk the command.
+   \param style Word style handed to the state machine.
+   \param test The command to walk, with a caret (^) at every position where the machine is expected to stop. The carets are stripped before walking.
+
+   Every mismatch between actual and expected stops is reported through err().
+*/
+static void test_1_word_motion(word_motion_t motion, move_word_style_t style, const wcstring &test)
+{
+    const bool moving_left = (motion == word_motion_left);
+
+    wcstring command;
+    std::set<size_t> stops;
+
+    /* Carets represent stops and should be cut out of the command */
+    for (size_t i=0; i < test.size(); i++)
+    {
+        const wchar_t wc = test.at(i);
+        if (wc == L'^')
+        {
+            /* Record the stop at the index it has once the carets are removed */
+            stops.insert(command.size());
+        }
+        else
+        {
+            command.push_back(wc);
+        }
+    }
+
+    size_t idx = moving_left ? command.size() : 0;
+    const size_t end = moving_left ? 0 : command.size();
+
+    move_word_state_machine_t sm(style);
+    while (idx != end)
+    {
+        /* When moving left, the cursor at idx examines the character just before it */
+        const size_t char_idx = moving_left ? idx - 1 : idx;
+        const wchar_t wc = command.at(char_idx);
+        const bool will_stop = ! sm.consume_char(wc);
+        const bool expected_stop = (stops.count(idx) > 0);
+        if (will_stop != expected_stop)
+        {
+            /* Show the offending position by putting a caret back into the command */
+            wcstring tmp = command;
+            tmp.insert(idx, L"^");
+            const char *dir = moving_left ? "left" : "right";
+            if (will_stop)
+            {
+                err(L"Word motion: moving %s, unexpected stop at idx %lu: '%ls'", dir, static_cast<unsigned long>(idx), tmp.c_str());
+            }
+            else
+            {
+                err(L"Word motion: moving %s, should have stopped at idx %lu: '%ls'", dir, static_cast<unsigned long>(idx), tmp.c_str());
+            }
+        }
+        /* We don't expect to stop here next time */
+        if (expected_stop)
+        {
+            stops.erase(idx);
+        }
+        if (will_stop)
+        {
+            /* Stay on the same index and start a new word from it */
+            sm.reset();
+        }
+        else if (moving_left)
+        {
+            idx--;
+        }
+        else
+        {
+            idx++;
+        }
+    }
+}
+
+/** Exercise the word-motion state machine (forward-word, etc.). In every test string a caret marks a position where the cursor is expected to stop. */
+static void test_word_motion()
+{
+    say(L"Testing word motion");
+
+    struct word_motion_case_t
+    {
+        word_motion_t motion;
+        move_word_style_t style;
+        const wchar_t *text;
+    };
+
+    /* Cases run in the order listed */
+    const word_motion_case_t cases[] =
+    {
+        {word_motion_left, move_word_style_punctuation, L"^echo ^hello_^world.^txt"},
+        {word_motion_right, move_word_style_punctuation, L"echo^ hello^_world^.txt^"},
+
+        {word_motion_left, move_word_style_punctuation, L"echo ^foo_^foo_^foo/^/^/^/^/^    "},
+        {word_motion_right, move_word_style_punctuation, L"echo^ foo^_foo^_foo^/^/^/^/^/    ^"},
+
+        {word_motion_left, move_word_style_path_components, L"^/^foo/^bar/^baz/"},
+        {word_motion_left, move_word_style_path_components, L"^echo ^--foo ^--bar"},
+        {word_motion_left, move_word_style_path_components, L"^echo ^hi ^> /^dev/^null"},
+
+        {word_motion_left, move_word_style_path_components, L"^echo /^foo/^bar{^aaa,^bbb,^ccc}^bak/"}
+    };
+
+    const size_t case_count = sizeof cases / sizeof *cases;
+    for (size_t i = 0; i < case_count; i++)
+    {
+        test_1_word_motion(cases[i].motion, cases[i].style, cases[i].text);
+    }
+}
+
 /** Test is_potential_path */
 static void test_is_potential_path()
 {
@ -1489,6 +1578,9 @@ int main(int argc, char **argv)
    builtin_init();
    reader_init();
    env_init();
+    
+    test_word_motion();
+    return 0;

    test_format();
    test_escape();
@ -1501,6 +1593,7 @@ int main(int argc, char **argv)
    test_expand();
    test_test();
    test_path();
+    test_word_motion();
    test_is_potential_path();
    test_colors();
    test_autosuggest_suggest_special();
--- a/highlight.cpp
+++ b/highlight.cpp
@ -1447,7 +1447,7 @@ static void highlight_universal_internal(const wcstring &buffstr, std::vector<in
        */
        if ((buffstr.at(pos) == L'\'') || (buffstr.at(pos) == L'\"'))
        {
-            std::vector<long> lst;
+            std::vector<size_t> lst;

            int level=0;
            wchar_t prev_q=0;
@ -1476,7 +1476,7 @@ static void highlight_universal_internal(const wcstring &buffstr, std::vector<in
                        {
                            if (prev_q == *str)
                            {
-                                long pos1, pos2;
+                                size_t pos1, pos2;

                                level--;
                                pos1 = lst.back();
--- a/parser.cpp
+++ b/parser.cpp
@ -1237,7 +1237,7 @@ int parser_t::is_help(const wchar_t *s, int min_match) const
    min_match = maxi(min_match, 3);

    return (wcscmp(L"-h", s) == 0) ||
-           (len >= min_match && (wcsncmp(L"--help", s, len) == 0));
+           (len >= (size_t)min_match && (wcsncmp(L"--help", s, len) == 0));
 }

 job_t *parser_t::job_create(void)
--- a/reader.cpp
+++ b/reader.cpp
@ -1295,7 +1295,7 @@ static void accept_autosuggestion(bool full)
        else
        {
            /* Accept characters up to a word separator */
-            move_word_state_machine_t state;
+            move_word_state_machine_t state(move_word_style_punctuation);
            for (size_t idx = data->command_line.size(); idx < data->autosuggestion.size(); idx++)
            {
                wchar_t wc = data->autosuggestion.at(idx);
@ -2048,19 +2048,6 @@ static void handle_token_history(int forward, int reset)
   \param dir Direction to move/erase. 0 means move left, 1 means move right.
   \param erase Whether to erase the characters along the way or only move past them.
   \param new if the new kill item should be appended to the previous kill item or not.
-
-   The regex we implement:
-
-      WHITESPACE*
-        (SEPARATOR+)
-      |
-        (SLASH*
-         TOK_STRING_CHARACTERS_EXCEPT_SLASH*)
-
-   Interesting test case:
-     /foo/bar/baz/ -> /foo/bar/ -> /foo/ -> /
-     echo --foo --bar -> echo --foo -> echo
-     echo hi>/dev/null -> echo hi>/dev/ -> echo hi >/ -> echo hi > -> echo hi -> echo
 */
 enum move_word_dir_t
 {
@ -2068,7 +2055,7 @@ enum move_word_dir_t
    MOVE_DIR_RIGHT
 };

-static void move_word(bool move_right, bool erase, bool newv)
+static void move_word(bool move_right, bool erase, enum move_word_style_t style, bool newv)
 {
    /* Return if we are already at the edge */
    const size_t boundary = move_right ? data->command_length() : 0;
@ -2076,7 +2063,7 @@ static void move_word(bool move_right, bool erase, bool newv)
        return;

    /* When moving left, a value of 1 means the character at index 0. */
-    move_word_state_machine_t state;
+    move_word_state_machine_t state(style);
    const wchar_t * const command_line = data->command_line.c_str();
    const size_t start_buff_pos = data->buff_pos;

@ -2696,7 +2683,6 @@ static bool is_backslashed(const wchar_t *str, size_t pos)

 const wchar_t *reader_readline()
 {
-
    wint_t c;
    int last_char=0;
    size_t yank_len=0;
@ -3268,21 +3254,21 @@ const wchar_t *reader_readline()
            /* kill one word left */
            case R_BACKWARD_KILL_WORD:
            {
-                move_word(MOVE_DIR_LEFT, true /* erase */, last_char!=R_BACKWARD_KILL_WORD);
+                move_word(MOVE_DIR_LEFT, true /* erase */, move_word_style_path_components, last_char!=R_BACKWARD_KILL_WORD);
                break;
            }

            /* kill one word right */
            case R_KILL_WORD:
            {
-                move_word(MOVE_DIR_RIGHT, true /* erase */, last_char!=R_KILL_WORD);
+                move_word(MOVE_DIR_RIGHT, true /* erase */, move_word_style_punctuation, last_char!=R_KILL_WORD);
                break;
            }

            /* move one word left*/
            case R_BACKWARD_WORD:
            {
-                move_word(MOVE_DIR_LEFT, false /* do not erase */, false);
+                move_word(MOVE_DIR_LEFT, false /* do not erase */, move_word_style_punctuation, false);
                break;
            }

@ -3291,7 +3277,7 @@ const wchar_t *reader_readline()
            {
                if (data->buff_pos < data->command_length())
                {
-                    move_word(MOVE_DIR_RIGHT, false /* do not erase */, false);
+                    move_word(MOVE_DIR_RIGHT, false /* do not erase */, move_word_style_punctuation, false);
                }
                else
                {
--- a/tokenizer.cpp
+++ b/tokenizer.cpp
@ -668,42 +668,111 @@ void tok_set_pos(tokenizer_t *tok, int pos)
    tok_next(tok);
 }

-
-
-move_word_state_machine_t::move_word_state_machine_t() : state(s_whitespace)
+bool move_word_state_machine_t::consume_char_punctuation(wchar_t c) // Feed one character to the punctuation-style machine; returns true if it belongs to the current word, false once the word has ended
 {
-}
-
-bool move_word_state_machine_t::consume_char(wchar_t c)
-{
-    //printf("state %d, consume '%lc'\n", state, c);
+    enum
+    {
+        s_always_one = 0, // initial state: the constructor and reset() both set state to 0
+        s_whitespace, // consuming a run of whitespace
+        s_alphanumeric, // consuming a run of alphanumeric characters
+        s_end // terminal: every later call returns false until reset()
+    };
+    
    bool consumed = false;
-    /* Always treat separators as first. All this does is ensure that we treat ^ as a string character instead of as stderr redirection, which I hypothesize is usually what is desired. */
-    bool was_first = true;
    while (state != s_end && ! consumed)
    {
        switch (state)
        {
+            case s_always_one:
+                /* Always consume the first character */
+                consumed = true;
+                state = s_whitespace;
+                break;
+            
            case s_whitespace:
                if (iswspace(c))
                {
                    /* Consumed whitespace */
                    consumed = true;
                }
-                else if (tok_is_string_character(c, was_first))
+                else
                {
-                    /* String path */
+                    state = s_alphanumeric; // nothing consumed yet, so the loop re-examines the same character in the new state
+                }
+                break;
+            
+            case s_alphanumeric:
+                if (iswalnum(c))
+                {
+                    /* Consumed alphanumeric */
+                    consumed = true;
+                }
+                else
+                {
+                    state = s_end; // word is over; the loop exits and this character is reported as not consumed
+                }
+                break;
+                
+            case s_end:
+            default:
+                break;
+        }
+    }
+    return consumed;
+}
+
+/** Report whether c can be part of a path component: it must be a tokenizer string character and must not be one of / = { , } ' " */
+bool move_word_state_machine_t::is_path_component_character(wchar_t c)
+{
+    /* Always pass true for the "first" flag so that ^ is treated as a string character instead of as stderr redirection, which is hypothesized to be what is usually desired. */
+    if (! tok_is_string_character(c, true))
+    {
+        return false;
+    }
+
+    /* Characters that end a path component even though they are string characters */
+    const wchar_t * const component_breakers = L"/={,}'\"";
+    return wcschr(component_breakers, c) == NULL;
+}
+
+bool move_word_state_machine_t::consume_char_path_components(wchar_t c)
+{
+    enum
+    {
+        s_initial_punctuation,
+        s_whitespace,
+        s_separator,
+        s_slash,
+        s_path_component_characters,
+        s_end
+    };
+    
+    //printf("state %d, consume '%lc'\n", state, c);
+    bool consumed = false;
+    while (state != s_end && ! consumed)
+    {
+        switch (state)
+        {
+            case s_initial_punctuation:
+                if (! is_path_component_character(c))
+                {
+                    consumed = true;
+                }
+                state = s_whitespace;
+                break;
+            
+            case s_whitespace:
+                if (iswspace(c))
+                {
+                    /* Consumed whitespace */
+                    consumed = true;
+                }
+                else if (c == L'/' || is_path_component_character(c))
+                {
+                    /* Path component */
                    state = s_slash;
                }
                else
                {
-                    /* Separator path */
+                    /* Path separator */
                    state = s_separator;
                }
                break;

            case s_separator:
-                if (! iswspace(c) && ! tok_is_string_character(c, was_first))
+                if (! iswspace(c) && ! is_path_component_character(c))
                {
                    /* Consumed separator */
                    consumed = true;
@ -722,12 +791,12 @@ bool move_word_state_machine_t::consume_char(wchar_t c)
                }
                else
                {
-                    state = s_nonseparators_except_slash;
+                    state = s_path_component_characters;
                }
                break;

-            case s_nonseparators_except_slash:
-                if (c != L'/' && tok_is_string_character(c, was_first))
+            case s_path_component_characters:
+                if (is_path_component_character(c))
                {
                    /* Consumed string character except slash */
                    consumed = true;
@ -747,3 +816,21 @@ bool move_word_state_machine_t::consume_char(wchar_t c)
    return consumed;
 }

+/** Feed one character to the handler for the configured style and return its result. An unrecognized style consumes nothing and returns false. */
+bool move_word_state_machine_t::consume_char(wchar_t c)
+{
+    if (style == move_word_style_punctuation)
+    {
+        return consume_char_punctuation(c);
+    }
+    if (style == move_word_style_path_components)
+    {
+        return consume_char_path_components(c);
+    }
+    return false;
+}
+
+/** Build a machine for the given style. State 0 is the first enumerator of both per-style state enums, so it is the starting state whichever style is chosen. */
+move_word_state_machine_t::move_word_state_machine_t(move_word_style_t st)
+    : state(0),
+      style(st)
+{
+}
+
+/** Return the machine to its starting state (0) so it can recognize another word; the style is left unchanged. */
+void move_word_state_machine_t::reset()
+{
+    this->state = 0;
+}
--- a/tokenizer.h
+++ b/tokenizer.h
@ -183,22 +183,29 @@ const wchar_t *tok_get_desc(int type);
 */
 int tok_get_error(tokenizer_t *tok);

+enum move_word_style_t // Selects which word-boundary rules move_word_state_machine_t applies
+{
+    move_word_style_punctuation, // consume one character unconditionally, then whitespace, then alphanumerics; stop at anything else
+    move_word_style_path_components //stops at path components; / = { , } and quote characters are not component characters
+};

 /* Our state machine that implements "one word" movement or erasure. */
 class move_word_state_machine_t
 {
-    enum
-    {
-        s_whitespace,
-        s_separator,
-        s_slash,
-        s_nonseparators_except_slash,
-        s_end
-    } state;
+private:
+    
+    bool consume_char_punctuation(wchar_t c); // handler used when style is move_word_style_punctuation
+    bool consume_char_path_components(wchar_t c); // handler used when style is move_word_style_path_components
+    bool is_path_component_character(wchar_t c); // true if c is a string character other than / = { , } ' "
+    
+    int state; // current state; each handler interprets it with its own local enum, and 0 is the starting state
+    move_word_style_t style; // set at construction; picks the handler consume_char dispatches to

 public:
-    move_word_state_machine_t();
+
+    move_word_state_machine_t(move_word_style_t st); // starts the machine in state 0 with the given style
    bool consume_char(wchar_t c);
+    void reset(); // sets state back to 0 so another word can be recognized
 };