implement string unescape

Fixes #3543
2025-01-27 04:05:08 +00:00 · 2017-06-22 20:47:54 -07:00 · 2017-06-22 20:47:54 -07:00 · f3cb625802
commit f3cb625802
parent 60bca14b37
8 changed files with 319 additions and 34 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,7 @@
 - New `status is-breakpoint` command that is true when a prompt is displayed in response to a `breakpoint` command (#1310).
 - Invalid array indexes are now silently ignored (#826, #4127).
 - `string escape` has a new `--style=xxx` flag where `xxx` can be `script`, `var`, or `url` (#4150)
+- `string unescape` has been implemented to reverse the effects of `string escape` (#3543)

 ## Other significant changes

--- a/doc_src/string.txt
+++ b/doc_src/string.txt
@ -18,6 +18,7 @@ string sub [(-s | --start) START] [(-l | --length) LENGTH] [(-q | --quiet)]
           [STRING...]
 string trim [(-l | --left)] [(-r | --right)] [(-c | --chars CHARS)]
            [(-q | --quiet)] [STRING...]
+string unescape [--style=xxx] [STRING...]
 string upper [(-q | --quiet)] [STRING...]
 \endfish

@ -42,6 +43,8 @@ The second is `--style=var` which ensures the string can be used as a variable n

 The third is `--style=url` which ensures the string can be used as a URL by hex encoding any character which is not legal in a URL. The string is first converted to UTF-8 before being encoded.

+`string unescape` performs the inverse of the `string escape` command. A string that is not properly formatted for the requested style is ignored and produces no output. For example, `string unescape --style=var (string escape --style=var $str)` will return the original string. The command succeeds if at least one string was unescaped and fails otherwise.
+
 \subsection string-join "join" subcommand

 `string join` joins its STRING arguments into a single string separated by SEP, which can be an empty string. Exit status: 0 if at least one join was performed, or 1 otherwise.
--- a/src/builtin_string.cpp
+++ b/src/builtin_string.cpp
@ -484,6 +484,64 @@ static int string_escape_var(options_t &opts, int optind, wchar_t **argv, io_str
    return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
 }

+/// Shared implementation of the `string unescape` styles. Unescapes every argument using `style`
+/// and writes each successfully decoded string, followed by a newline, to `streams.out`.
+/// Arguments that cannot be decoded are skipped without producing any output.
+///
+/// Returns STATUS_CMD_OK if at least one argument was unescaped, else STATUS_CMD_ERROR.
+static int string_unescape_with_style(int optind, wchar_t **argv, io_streams_t &streams,
+                                      escape_string_style_t style) {
+    wcstring storage;
+    int nesc = 0;
+    unescape_flags_t flags = 0;
+
+    while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) {
+        wcstring result;
+        if (unescape_string(arg, &result, flags, style)) {
+            streams.out.append(result);
+            streams.out.append(L'\n');
+            nesc++;
+        }
+    }
+
+    return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
+}
+
+/// Unescape a string encoded so it can be used in fish script.
+static int string_unescape_script(options_t &opts, int optind, wchar_t **argv,
+                                  io_streams_t &streams) {
+    UNUSED(opts);
+    return string_unescape_with_style(optind, argv, streams, STRING_STYLE_SCRIPT);
+}
+
+/// Unescape an encoded URL.
+static int string_unescape_url(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) {
+    UNUSED(opts);
+    return string_unescape_with_style(optind, argv, streams, STRING_STYLE_URL);
+}
+
+/// Unescape an encoded var name.
+static int string_unescape_var(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) {
+    UNUSED(opts);
+    return string_unescape_with_style(optind, argv, streams, STRING_STYLE_VAR);
+}
+
+
 static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
    options_t opts;
    opts.no_quoted_valid = true;
@ -507,6 +565,29 @@ static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wcha
    DIE("should never reach this statement");
 }

+/// Entry point for `string unescape`: parse the options, then hand the remaining arguments to the
+/// unescape routine that matches the selected escape style.
+static int string_unescape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
+    options_t opts;
+    opts.no_quoted_valid = true;
+    opts.style_valid = true;
+
+    int optind;
+    const int parse_status = parse_opts(&opts, &optind, 0, argc, argv, parser, streams);
+    if (parse_status != STATUS_CMD_OK) {
+        return parse_status;
+    }
+
+    switch (opts.escape_style) {
+        case STRING_STYLE_SCRIPT:
+            return string_unescape_script(opts, optind, argv, streams);
+        case STRING_STYLE_URL:
+            return string_unescape_url(opts, optind, argv, streams);
+        case STRING_STYLE_VAR:
+            return string_unescape_var(opts, optind, argv, streams);
+    }
+
+    DIE("should never reach this statement");
+}
+
 static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
    options_t opts;
    opts.quiet_valid = true;
@ -1277,11 +1358,19 @@ static const struct string_subcommand {
                   wchar_t **argv);                       //!OCLINT(unused param)
 }

-string_subcommands[] = {
-    {L"escape", &string_escape}, {L"join", &string_join},       {L"length", &string_length},
-    {L"match", &string_match},   {L"replace", &string_replace}, {L"split", &string_split},
-    {L"sub", &string_sub},       {L"trim", &string_trim},       {L"lower", &string_lower},
-    {L"upper", &string_upper},   {L"repeat", &string_repeat},   {NULL, NULL}};
+string_subcommands[] = {{L"escape", &string_escape},
+                        {L"join", &string_join},
+                        {L"length", &string_length},
+                        {L"match", &string_match},
+                        {L"replace", &string_replace},
+                        {L"split", &string_split},
+                        {L"sub", &string_sub},
+                        {L"trim", &string_trim},
+                        {L"lower", &string_lower},
+                        {L"upper", &string_upper},
+                        {L"repeat", &string_repeat},
+                        {L"unescape", &string_unescape},
+                        {NULL, NULL}};

 /// The string builtin, for manipulating strings.
 int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) {
--- a/src/common.cpp
+++ b/src/common.cpp
@ -75,6 +75,38 @@ static void debug_shared(const wchar_t msg_level, const wcstring &msg);

 bool has_working_tty_timestamps = true;

+/// Map a character to its numeric value as a digit in the given base. Decimal digits map to 0-9
+/// and letters of either case map to 10-35.
+/// Return the digit's value, or -1 if the character is not a valid digit for that base.
+long convert_digit(wchar_t d, int base) {
+    long value;
+    if (d >= L'0' && d <= L'9') {
+        value = d - L'0';
+    } else if (d >= L'a' && d <= L'z') {
+        value = d - L'a' + 10;
+    } else if (d >= L'A' && d <= L'Z') {
+        value = d - L'A' + 10;
+    } else {
+        return -1;
+    }
+
+    // A recognized digit is still invalid when its value does not fit in the requested base.
+    return value < base ? value : -1;
+}
+
+/// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions; i.e.,
+/// a decimal digit or an uppercase letter A-F.
+static bool is_hex_digit(int c) {
+    // Range check first: `strchr()` converts its argument to `char`, so a wide character could be
+    // truncated into a false match, and searching for zero matches the string's terminator.
+    if (c <= 0 || c > 0x7F) return false;
+    return strchr("0123456789ABCDEF", c) != NULL;
+}
+
+/// This is a specialization of `convert_digit()` that only handles base 16 and only uppercase.
+/// Return the value of the hex digit (0-15) or -1 if the character is not an uppercase hex digit.
+long convert_hex_digit(wchar_t d) {
+    if ((d <= L'9') && (d >= L'0')) {
+        return d - L'0';
+    } else if ((d <= L'F') && (d >= L'A')) {
+        // Only A-F are hex digits; G-Z must be rejected rather than mapped to 16-35.
+        return 10 + d - L'A';
+    }
+
+    return -1;
+}
+
 #ifdef HAVE_BACKTRACE_SYMBOLS
 // This function produces a stack backtrace with demangled function & method names. It is based on
 // https://gist.github.com/fmela/591333 but adapted to the style of the fish project.
@ -765,12 +797,41 @@ static void escape_string_url(const wchar_t *orig_in, wcstring &out) {
    }
 }

-static bool is_hex_digit(int c) { return strchr("0123456789abcdefABCDEF", c) != NULL; }
+/// Reverse the effects of `escape_string_url()`. By definition the input must consist of only
+/// ASCII chars. A `%` must be followed by either another `%` (a literal percent sign) or two hex
+/// digits of either case that give one byte of the decoded string.
+///
+/// Returns true and stores the decoded bytes, converted by `str2wcstring()`, in `out` on success.
+/// Returns false without modifying `out` if the input is malformed.
+static bool unescape_string_url(const wchar_t *in, wcstring *out) {
+    std::string result;
+    // Each input char yields at most one decoded byte, so size the buffer from the input.
+    result.reserve(wcslen(in));
+    for (wchar_t c = *in; c; c = *++in) {
+        if (c > 0x7F) return false;  // invalid character means we can't decode the string
+        if (c == '%') {
+            int c1 = in[1];
+            if (c1 == 0) return false;  // found unexpected end of string
+            if (c1 == '%') {
+                result.push_back('%');
+                in++;
+            } else {
+                int c2 = in[2];
+                if (c2 == 0) return false;  // string ended prematurely
+                long d1 = convert_digit(c1, 16);
+                if (d1 < 0) return false;
+                long d2 = convert_digit(c2, 16);
+                if (d2 < 0) return false;
+                result.push_back(16 * d1 + d2);
+                in += 2;
+            }
+        } else {
+            result.push_back(c);
+        }
+    }
+
+    *out = str2wcstring(result);
+    return true;
+}

 /// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str.
 static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
    bool prev_was_hex_encoded = false;
-    bool maybe_encode_next_char = false;
    const std::string &in = wcs2string(orig_in);
    for (auto c1 : in) {
        // This silliness is so we get the correct result whether chars are signed or unsigned.
@ -799,6 +860,46 @@ static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
    }
 }

+/// Reverse the effects of `escape_string_var()`. By definition the input must consist of only
+/// ASCII chars. `__` decodes to a literal underscore and `_XX`, where `XX` is two uppercase hex
+/// digits, decodes to the byte with that value. An underscore followed by anything else is
+/// dropped, and a single trailing underscore is accepted only directly after a hex escape.
+///
+/// NOTE(review): a dropped underscore followed by two uppercase hex digits looks exactly like a
+/// hex escape, so such input decodes as one byte -- confirm `escape_string_var()` can't emit it.
+///
+/// Returns true and stores the decoded bytes, converted by `str2wcstring()`, in `out` on success.
+/// Returns false without modifying `out` if the input is malformed.
+static bool unescape_string_var(const wchar_t *in, wcstring *out) {
+    std::string result;
+    // Each input char yields at most one decoded byte, so size the buffer from the input.
+    result.reserve(wcslen(in));
+    bool prev_was_hex_encoded = false;
+    for (wchar_t c = *in; c; c = *++in) {
+        if (c > 0x7F) return false;  // invalid character means we can't decode the string
+        if (c == '_') {
+            int c1 = in[1];
+            if (c1 == 0) {
+                if (prev_was_hex_encoded) break;
+                return false;  // found unexpected escape char at end of string
+            }
+            if (c1 == '_') {
+                result.push_back('_');
+                in++;
+                prev_was_hex_encoded = false;
+            } else if (is_hex_digit(c1)) {
+                int c2 = in[2];
+                if (c2 == 0) return false;  // string ended prematurely
+                // Validate the second digit as strictly as the first before converting it.
+                if (!is_hex_digit(c2)) return false;
+                long d1 = convert_hex_digit(c1);
+                if (d1 < 0) return false;
+                long d2 = convert_hex_digit(c2);
+                if (d2 < 0) return false;
+                result.push_back(16 * d1 + d2);
+                in += 2;
+                prev_was_hex_encoded = true;
+            }
+            // No "else" clause because if the first char after an underscore is not another
+            // underscore or a valid hex character then the underscore is there to improve
+            // readability after we've encoded a character not valid in a var name.
+        } else {
+            result.push_back(c);
+            // A literal char ends the hex escape, so a later lone trailing underscore is invalid.
+            prev_was_hex_encoded = false;
+        }
+    }
+
+    *out = str2wcstring(result);
+    return true;
+}
+
 /// Escape a string in a fashion suitable for using in fish script. Store the result in out_str.
 static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out,
                                 escape_flags_t flags) {
@ -1390,14 +1491,44 @@ bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special) {
    return success;
 }

-bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special) {
-    bool success = unescape_string_internal(input, wcslen(input), output, escape_special);
+bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special,
+                     escape_string_style_t style) {
+    // Start from failure so an unrecognized style value is reported as an error instead of
+    // reading an uninitialized variable.
+    bool success = false;
+    switch (style) {
+        case STRING_STYLE_SCRIPT: {
+            success = unescape_string_internal(input, wcslen(input), output, escape_special);
+            break;
+        }
+        case STRING_STYLE_URL: {
+            success = unescape_string_url(input, output);
+            break;
+        }
+        case STRING_STYLE_VAR: {
+            success = unescape_string_var(input, output);
+            break;
+        }
+    }
    if (!success) output->clear();
    return success;
 }

-bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special) {
-    bool success = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
+bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special,
+                     escape_string_style_t style) {
+    // Start from failure so an unrecognized style value is reported as an error instead of
+    // reading an uninitialized variable.
+    bool success = false;
+    switch (style) {
+        case STRING_STYLE_SCRIPT: {
+            success = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
+            break;
+        }
+        case STRING_STYLE_URL: {
+            success = unescape_string_url(input.c_str(), output);
+            break;
+        }
+        case STRING_STYLE_VAR: {
+            success = unescape_string_var(input.c_str(), output);
+            break;
+        }
+    }
    if (!success) output->clear();
    return success;
 }
@ -2023,22 +2154,6 @@ char **make_null_terminated_array(const std::vector<std::string> &lst) {
    return make_null_terminated_array_helper(lst);
 }

-long convert_digit(wchar_t d, int base) {
-    long res = -1;
-    if ((d <= L'9') && (d >= L'0')) {
-        res = d - L'0';
-    } else if ((d <= L'z') && (d >= L'a')) {
-        res = d + 10 - L'a';
-    } else if ((d <= L'Z') && (d >= L'A')) {
-        res = d + 10 - L'A';
-    }
-    if (res >= base) {
-        res = -1;
-    }
-
-    return res;
-}
-
 /// Test if the specified character is in a range that fish uses interally to store special tokens.
 ///
 /// NOTE: This is used when tokenizing the input. It is also used when reading input, before
--- a/src/common.h
+++ b/src/common.h
@ -715,10 +715,13 @@ size_t read_unquoted_escape(const wchar_t *input, wcstring *result, bool allow_i
 /// indicates the string was unmodified.
 bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special);

-/// Unescapes a string, returning the unescaped value by reference. On failure, the output is set to
-/// an empty string.
-bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special);
-bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special);
+/// Reverse the effects of calling `escape_string`. Returns the unescaped value by reference. On
+/// failure, the output is set to an empty string.
+bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special,
+                     escape_string_style_t style = STRING_STYLE_SCRIPT);
+
+bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special,
+                     escape_string_style_t style = STRING_STYLE_SCRIPT);

 /// Returns the width of the terminal window, so that not all functions that use these values
 /// continually have to keep track of it separately.
--- a/tests/string.err
+++ b/tests/string.err
@ -5,7 +5,7 @@ string match: ^

 # string invalidarg
 string: Subcommand 'invalidarg' is not valid
-Standard input (line 215): 
+Standard input (line 258): 
 string invalidarg; and echo "unexpected exit 0" >&2
 ^

@ -29,6 +29,6 @@ string repeat: Expected argument

 # string repeat -l fakearg 2>&1
 string repeat: Unknown option '-l'
-Standard input (line 316): 
+Standard input (line 359): 
 string repeat -l fakearg
 ^
--- a/tests/string.in
+++ b/tests/string.in
@ -126,6 +126,49 @@ echo
 echo '# string escape --style=var -- -'
 string escape --style=var -- -

+# The following tests verify that we can correctly unescape the same strings
+# we tested escaping above.
+
+echo
+echo '# set x (string unescape (echo \x07 | string escape))'
+set x (string unescape (echo \x07 | string escape))
+test $x = \x07
+and echo success
+
+echo
+echo '# string unescape --style=script (string escape --style=script \'a b#c"\\\'d\')'
+string unescape --style=script (string escape --style=script 'a b#c"\'d')
+
+echo
+echo '# string unescape --style=url (string escape --style=url \'a b#c"\\\'d\')'
+string unescape --style=url (string escape --style=url 'a b#c"\'d')
+
+echo
+echo '# string unescape --style=url (string escape --style=url \na\nb%c~d\n)'
+string unescape --style=url (string escape --style=url \na\nb%c~d\n)
+
+echo
+echo '# string unescape --style=var (string escape --style=var \'a b#c"\\\'d\')'
+string unescape --style=var (string escape --style=var 'a b#c"\'d')
+
+echo
+echo '# string unescape --style=var (string escape --style=var a\nghi_)'
+string unescape --style=var (string escape --style=var a\nghi_)
+
+echo
+echo '# string unescape --style=var (string escape --style=var \'abc\')'
+string unescape --style=var (string escape --style=var 'abc')
+
+echo
+echo '# string unescape --style=var (string escape --style=var \'_a_b_c_\')'
+string unescape --style=var (string escape --style=var '_a_b_c_')
+
+echo
+echo '# string unescape --style=var (string escape --style=var -- -)'
+string unescape --style=var -- (string escape --style=var -- -)
+
+# The following tests verify that we can correctly match strings.
+
 echo
 echo '# string match "?" a'
 string match "?" a
--- a/tests/string.out
+++ b/tests/string.out
@ -84,7 +84,7 @@ a%20b%23c%22%27d
 %0Aa%0Ab%25c~d%0A

 # string escape --style=var 'a b#c"\'d'
-a_20_62_23_63_22_27_64_
+a_20_b_23_c_22_27_d

 # string escape --style=script a\nghi_
 a_0A_ghi__
@ -98,6 +98,37 @@ __a__b__c__
 # string escape --style=var -- -
 _2D_

+# set x (string unescape (echo \x07 | string escape))
+success
+
+# string unescape --style=script (string escape --style=script 'a b#c"\'d')
+a b#c"'d
+
+# string unescape --style=url (string escape --style=url 'a b#c"\'d')
+a b#c"'d
+
+# string unescape --style=url (string escape --style=url \na\nb%c~d\n)
+
+a
+b%c~d
+
+
+# string unescape --style=var (string escape --style=var 'a b#c"\'d')
+a b#c"'d
+
+# string unescape --style=var (string escape --style=var a\nghi_)
+a
+ghi_
+
+# string unescape --style=var (string escape --style=var 'abc')
+abc
+
+# string unescape --style=var (string escape --style=var '_a_b_c_')
+_a_b_c_
+
+# string unescape --style=var (string escape --style=var -- -)
+-
+
 # string match "?" a
 a