From 991c900fc6d55de3c11f23d06b5c06393abb1b2d Mon Sep 17 00:00:00 2001
From: ridiculousfish <corydoras@ridiculousfish.com>
Date: Sun, 29 Sep 2013 02:48:35 -0700
Subject: [PATCH] Set of changes to improve detection of escape sequences for
 prompt width computation. Addresses #767

---
 fish_tests.cpp |  11 ++
 screen.cpp     | 333 ++++++++++++++++++++++++++-----------------------
 screen.h       |   2 +
 3 files changed, 192 insertions(+), 154 deletions(-)
diff --git a/fish_tests.cpp b/fish_tests.cpp
index b47ce3a3a..8b79ef3ac 100644
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@@ -550,6 +550,16 @@ static void test_utils()
     if (begin != a + wcslen(L"echo (echo (")) err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
 }
 
+static void test_escape_sequences(void)
+{
+    say(L"Testing escape codes");
+    if (escape_code_length(L"") != 0) err(L"test_escape_sequences failed on line %d\n", __LINE__);
+    if (escape_code_length(L"abcd") != 0) err(L"test_escape_sequences failed on line %d\n", __LINE__);
+    if (escape_code_length(L"\x1b[2J") != 4) err(L"test_escape_sequences failed on line %d\n", __LINE__);
+    if (escape_code_length(L"\x1b[38;5;123mABC") != strlen("\x1b[38;5;123m")) err(L"test_escape_sequences failed on line %d\n", __LINE__);
+    if (escape_code_length(L"\x1b@") != 2) err(L"test_escape_sequences failed on line %d\n", __LINE__);
+}
+
 class lru_node_test_t : public lru_node_t
 {
 public:
@@ -1834,6 +1844,7 @@ int main(int argc, char **argv)
     test_fork();
     test_parser();
     test_utils();
+    test_escape_sequences();
     test_lru();
     test_expand();
     test_fuzzy_match();
diff --git a/screen.cpp b/screen.cpp
index 8c8438346..5ebe8605d 100644
--- a/screen.cpp
+++ b/screen.cpp
@@ -92,11 +92,9 @@ public:
    specified position of the specified wide character string. All of
    \c seq must match, but str may be longer than seq.
 */
-static int try_sequence(const char *seq, const wchar_t *str)
+static size_t try_sequence(const char *seq, const wchar_t *str)
 {
-    int i;
-
-    for (i=0;; i++)
+    for (size_t i=0; ; i++)
     {
         if (!seq[i])
             return i;
@@ -121,29 +119,6 @@ static size_t next_tab_stop(size_t in)
     return ((in/tab_width)+1)*tab_width;
 }
 
-// PCA for term256 support, let's just detect the escape codes directly
-static int is_term256_escape(const wchar_t *str)
-{
-    // An escape code looks like this: \x1b[38;5;<num>m
-    // or like this: \x1b[48;5;<num>m
-
-    // parse out the required prefix
-    int len = try_sequence("\x1b[38;5;", str);
-    if (! len) len = try_sequence("\x1b[48;5;", str);
-    if (! len) return 0;
-
-    // now try parsing out a string of digits
-    // we need at least one
-    if (! iswdigit(str[len])) return 0;
-    while (iswdigit(str[len])) len++;
-
-    // look for the terminating m
-    if (str[len++] != L'm') return 0;
-
-    // success
-    return len;
-}
-
 /* Like fish_wcwidth, but returns 0 for control characters instead of -1 */
 static int fish_wcwidth_min_0(wchar_t wc)
 {
@@ -157,6 +132,178 @@ static bool allow_soft_wrap(void)
     return !! auto_right_margin;
 }
 
+
+/* Returns the number of characters in the escape code starting at 'code' (which should begin with \x1b), or 0 if no escape code is recognized there */
+size_t escape_code_length(const wchar_t *code)
+{
+    assert(code != NULL);
+    
+    /* The only escape codes we recognize start with \x1b */
+    if (code[0] != L'\x1b')
+        return 0;
+    
+    size_t resulting_length = 0;
+    bool found = false;
+    
+    if (cur_term != NULL)
+    {
+        /*
+         Detect these terminfo color escapes with parameter
+         value 0..7, none of which move the cursor
+         */
+        char * const esc[] =
+        {
+            set_a_foreground,
+            set_a_background,
+            set_foreground,
+            set_background,
+        };
+    
+        for (size_t p=0; p < sizeof esc / sizeof *esc && !found; p++)
+        {
+            if (!esc[p])
+                continue;
+            
+            for (size_t k=0; k<8; k++)
+            {
+                size_t len = try_sequence(tparm(esc[p],k), code);
+                if (len)
+                {
+                    resulting_length = len;
+                    found = true;
+                    break;
+                }
+            }
+        }
+    }
+    
+    if (cur_term != NULL)
+    {
+        /*
+         Detect these semi-common terminfo escapes without any
+         parameter values, none of which move the cursor
+         */
+        char * const esc2[] =
+        {
+            enter_bold_mode,
+            exit_attribute_mode,
+            enter_underline_mode,
+            exit_underline_mode,
+            enter_standout_mode,
+            exit_standout_mode,
+            flash_screen,
+            enter_subscript_mode,
+            exit_subscript_mode,
+            enter_superscript_mode,
+            exit_superscript_mode,
+            enter_blink_mode,
+            enter_italics_mode,
+            exit_italics_mode,
+            enter_reverse_mode,
+            enter_shadow_mode,
+            exit_shadow_mode,
+            enter_standout_mode,
+            exit_standout_mode,
+            enter_secure_mode
+        };
+        
+    
+    
+        for (size_t p=0; p < sizeof esc2 / sizeof *esc2 && !found; p++)
+        {
+            if (!esc2[p])
+                continue;
+            /*
+             Test both padded and unpadded version, just to
+             be safe. Most versions of tparm don't actually
+             seem to do anything these days.
+             */
+            size_t len = maxi(try_sequence(tparm(esc2[p]), code), try_sequence(esc2[p], code));
+            if (len)
+            {
+                resulting_length = len;
+                found = true;
+            }
+        }
+    }
+    
+    if (!found)
+    {
+        if (code[1] == L'k')
+        {
+            /* This looks like the escape sequence for setting a screen name */
+            const env_var_t term_name = env_get_string(L"TERM");
+            if (!term_name.missing() && string_prefixes_string(L"screen", term_name))
+            {
+                const wchar_t * const screen_name_end_sentinel = L"\x1b\\";
+                const wchar_t *screen_name_end = wcsstr(&code[2], screen_name_end_sentinel);
+                if (screen_name_end != NULL)
+                {
+                    const wchar_t *escape_sequence_end = screen_name_end + wcslen(screen_name_end_sentinel);
+                    resulting_length = escape_sequence_end - code;
+                }
+                else
+                {
+                    /* Consider just <esc>k to be the code */
+                    resulting_length = 2;
+                }
+                found = true;
+            }
+        }
+    }
+    
+    if (! found)
+    {
+        /* Generic VT100 one byte sequence: CSI followed by something in the range @ through _ */
+        if (code[1] == L'[' && (code[2] >= L'@' && code[2] <= L'_'))
+        {
+            resulting_length = 3;
+            found = true;
+        }
+    }
+    
+    if (! found)
+    {
+        /* Generic VT100 CSI-style sequence: <esc>[, followed by zero or more ASCII characters NOT in the range [@,~], followed by one character in that range */
+        if (code[1] == L'[')
+        {
+            // Start at 2 to skip over <esc>[
+            size_t cursor = 2;
+            for (; code[cursor] != L'\0'; cursor++)
+            {
+                /* Consume a sequence of ASCII characters not in the range [@, ~] */
+                wchar_t c = code[cursor];
+                
+                /* If we're not in ASCII, just stop */
+                if (c > 127)
+                    break;
+                
+                /* If we're the end character, then consume it and then stop */
+                if (c >= L'@' && c <= L'~')
+                {
+                    cursor++;
+                    break;
+                }
+            }
+            /* cursor now indexes just beyond the end of the sequence (or at the terminating zero, or at the first non-ASCII character) */
+            found = true;
+            resulting_length = cursor;
+        }
+    }
+    
+    if (! found)
+    {
+        /* Generic VT100 two byte sequence: <esc> followed by something in the range @ through _ */
+        if (code[1] >= L'@' && code[1] <= L'_')
+        {
+            resulting_length = 2;
+            found = true;
+        }
+    }
+    
+    return resulting_length;
+}
+
 /* Information about a prompt layout */
 struct prompt_layout_t
 {
@@ -178,7 +325,7 @@ struct prompt_layout_t
 static prompt_layout_t calc_prompt_layout(const wchar_t *prompt)
 {
     size_t current_line_width = 0;
-    size_t j, k;
+    size_t j;
 
     prompt_layout_t prompt_layout = {};
     prompt_layout.line_count = 1;
@@ -187,134 +334,12 @@ static prompt_layout_t calc_prompt_layout(const wchar_t *prompt)
     {
         if (prompt[j] == L'\x1b')
         {
-            /*
-             This is the start of an escape code. Try to guess its width.
-             */
-            size_t p;
-            int len=0;
-            bool found = false;
-
-            /*
-             Detect these terminfo color escapes with parameter
-             value 0..7, all of which don't move the cursor
-             */
-            char * const esc[] =
+            /* This is the start of an escape code. Skip over it if it's at least one character long. */
+            size_t escape_len = escape_code_length(&prompt[j]);
+            if (escape_len > 0)
             {
-                set_a_foreground,
-                set_a_background,
-                set_foreground,
-                set_background,
+                j += escape_len - 1;
             }
-            ;
-
-            /*
-             Detect these semi-common terminfo escapes without any
-             parameter values, all of which don't move the cursor
-             */
-            char * const esc2[] =
-            {
-                enter_bold_mode,
-                exit_attribute_mode,
-                enter_underline_mode,
-                exit_underline_mode,
-                enter_standout_mode,
-                exit_standout_mode,
-                flash_screen,
-                enter_subscript_mode,
-                exit_subscript_mode,
-                enter_superscript_mode,
-                exit_superscript_mode,
-                enter_blink_mode,
-                enter_italics_mode,
-                exit_italics_mode,
-                enter_reverse_mode,
-                enter_shadow_mode,
-                exit_shadow_mode,
-                enter_standout_mode,
-                exit_standout_mode,
-                enter_secure_mode
-            }
-            ;
-
-            for (p=0; p < sizeof esc / sizeof *esc && !found; p++)
-            {
-                if (!esc[p])
-                    continue;
-
-                for (k=0; k<8; k++)
-                {
-                    len = try_sequence(tparm(esc[p],k), &prompt[j]);
-                    if (len)
-                    {
-                        j += (len-1);
-                        found = true;
-                        break;
-                    }
-                }
-            }
-
-            /* PCA for term256 support, let's just detect the escape codes directly */
-            if (! found)
-            {
-                len = is_term256_escape(&prompt[j]);
-                if (len)
-                {
-                    j += (len - 1);
-                    found = true;
-                }
-            }
-
-
-            for (p=0; p < (sizeof(esc2)/sizeof(char *)) && !found; p++)
-            {
-                if (!esc2[p])
-                    continue;
-                /*
-                 Test both padded and unpadded version, just to
-                 be safe. Most versions of tparm don't actually
-                 seem to do anything these days.
-                 */
-                len = maxi(try_sequence(tparm(esc2[p]), &prompt[j]),
-                           try_sequence(esc2[p], &prompt[j]));
-
-                if (len)
-                {
-                    j += (len-1);
-                    found = true;
-                }
-            }
-
-            if (!found)
-            {
-                if (prompt[j+1] == L'k')
-                {
-                    const env_var_t term_name = env_get_string(L"TERM");
-                    if (!term_name.missing() && string_prefixes_string(L"screen", term_name))
-                    {
-                        const wchar_t *end;
-                        j+=2;
-                        found = true;
-                        end = wcsstr(&prompt[j], L"\x1b\\");
-                        if (end)
-                        {
-                            /*
-                             You'd thing this should be
-                             '(end-prompt)+2', in order to move j
-                             past the end of the string, but there is
-                             a 'j++' at the end of each lap, so j
-                             should always point to the last menged
-                             character, e.g. +1.
-                             */
-                            j = (end-prompt)+1;
-                        }
-                        else
-                        {
-                            break;
-                        }
-                    }
-                }
-            }
-
         }
         else if (prompt[j] == L'\t')
         {
diff --git a/screen.h b/screen.h
index 0307fdd7d..1d9fde2c2 100644
--- a/screen.h
+++ b/screen.h
@@ -227,5 +227,7 @@ enum screen_reset_mode_t
 
 void s_reset(screen_t *s, screen_reset_mode_t mode);
 
+/* Returns the length of an escape code. Exposed for testing purposes only. */
+size_t escape_code_length(const wchar_t *code);
 
 #endif