diff --git a/CHANGELOG.md b/CHANGELOG.md index adfb6a955..2d160c4d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - The `COLUMNS` and `LINES` env vars are now correctly set the first time `fish_prompt` is run (#4141). - New `status is-breakpoint` command that is true when a prompt is displayed in response to a `breakpoint` command (#1310). - Invalid array indexes are now silently ignored (#826, #4127). +- `string escape` has a new `--style=xxx` flag where `xxx` can be `script`, `var`, or `url` (#4150) ## Other significant changes diff --git a/doc_src/string.txt b/doc_src/string.txt index f893f7fb6..f4d9c6f9b 100644 --- a/doc_src/string.txt +++ b/doc_src/string.txt @@ -2,7 +2,7 @@ \subsection string-synopsis Synopsis \fish{synopsis} -string escape [(-n | --no-quoted)] [STRING...] +string escape [(-n | --no-quoted)] [--style=xxx] [STRING...] string join [(-q | --quiet)] SEP [STRING...] string length [(-q | --quiet)] [STRING...] string lower [(-q | --quiet)] [STRING...] @@ -36,7 +36,11 @@ The following subcommands are available. \subsection string-escape "escape" subcommand -`string escape` escapes each STRING such that it can be passed back to `eval` to produce the original argument again. By default, all special characters are escaped, and quotes are used to simplify the output when possible. If `-n` or `--no-quoted` is given, the simplifying quoted format is not used. Exit status: 0 if at least one string was escaped, or 1 otherwise. +`string escape` escapes each STRING in one of three ways. The first is `--style=script`. This is the default. It alters the string such that it can be passed back to `eval` to produce the original argument again. By default, all special characters are escaped, and quotes are used to simplify the output when possible. If `-n` or `--no-quoted` is given, the simplifying quoted format is not used. Exit status: 0 if at least one string was escaped, or 1 otherwise. 
+ +The second is `--style=var` which ensures the string can be used as a variable name by hex encoding any non-alphanumeric characters. Underscores are doubled, and an alphanumeric character that is itself a hex digit is also hex encoded when it directly follows a hex encoded byte. The string is first converted to UTF-8 before being encoded. + +The third is `--style=url` which ensures the string can be used as a URL by hex encoding any character which is not legal in a URL. The string is first converted to UTF-8 before being encoded. \subsection string-join "join" subcommand @@ -159,6 +163,11 @@ In general, special characters are special by default, so `a+` matches one or mo cg \endfish +\fish{cli-dark} +>_ string escape --style=var 'a1 b2'\u6161 +a1_20_62_32_E6_85_A1_ +\endfish + \subsection string-example-match-glob Match Glob Examples \fish{cli-dark} diff --git a/src/builtin_string.cpp b/src/builtin_string.cpp index d2f8a6c26..fa58bd728 100644 --- a/src/builtin_string.cpp +++ b/src/builtin_string.cpp @@ -116,6 +116,7 @@ typedef struct { //!OCLINT(too many fields) bool regex_valid = false; bool right_valid = false; bool start_valid = false; + bool style_valid = false; bool all = false; bool entire = false; @@ -138,8 +139,34 @@ typedef struct { //!OCLINT(too many fields) const wchar_t *chars_to_trim = L" \f\n\r\t"; const wchar_t *arg1 = NULL; const wchar_t *arg2 = NULL; + + escape_string_style_t escape_style = STRING_STYLE_SCRIPT; } options_t; +/// This handles the `--style=xxx` flag. 
+static int handle_flag_1(wchar_t **argv, parser_t &parser, io_streams_t &streams, wgetopter_t &w, + options_t *opts) { + const wchar_t *cmd = argv[0]; + + if (opts->style_valid) { + if (wcscmp(w.woptarg, L"script") == 0) { + opts->escape_style = STRING_STYLE_SCRIPT; + } else if (wcscmp(w.woptarg, L"url") == 0) { + opts->escape_style = STRING_STYLE_URL; + } else if (wcscmp(w.woptarg, L"var") == 0) { + opts->escape_style = STRING_STYLE_VAR; + } + else { + string_error(streams, _(L"%ls: Invalid escape style '%ls'\n"), cmd, w.woptarg); + return STATUS_INVALID_ARGS; + } + return STATUS_CMD_OK; + } + + string_unknown_option(parser, streams, cmd, argv[w.woptind - 1]); + return STATUS_INVALID_ARGS; +} + static int handle_flag_N(wchar_t **argv, parser_t &parser, io_streams_t &streams, wgetopter_t &w, options_t *opts) { if (opts->no_newline_valid) { @@ -349,13 +376,14 @@ static const struct woption long_options[] = { {L"max", required_argument, NULL, 'm'}, {L"no-newline", no_argument, NULL, 'N'}, {L"no-quoted", no_argument, NULL, 'n'}, {L"quiet", no_argument, NULL, 'q'}, {L"regex", no_argument, NULL, 'r'}, {L"right", no_argument, NULL, 'r'}, - {L"start", required_argument, NULL, 's'}, {NULL, 0, NULL, 0}}; + {L"start", required_argument, NULL, 's'}, {L"style", required_argument, NULL, 1}, + {NULL, 0, NULL, 0}}; static std::map flag_to_function = { {'N', handle_flag_N}, {'a', handle_flag_a}, {'c', handle_flag_c}, {'e', handle_flag_e}, {'f', handle_flag_f}, {'i', handle_flag_i}, {'l', handle_flag_l}, {'m', handle_flag_m}, {'n', handle_flag_n}, {'q', handle_flag_q}, {'r', handle_flag_r}, {'s', handle_flag_s}, - {'v', handle_flag_v}}; + {'v', handle_flag_v}, {1, handle_flag_1}}; /// Parse the arguments for flags recognized by a specific string subcommand. 
static int parse_opts(options_t *opts, int *optind, int n_req_args, int argc, wchar_t **argv, @@ -408,21 +436,15 @@ static int parse_opts(options_t *opts, int *optind, int n_req_args, int argc, wc return STATUS_CMD_OK; } -static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { - options_t opts; - opts.no_quoted_valid = true; - int optind; - int retval = parse_opts(&opts, &optind, 0, argc, argv, parser, streams); - if (retval != STATUS_CMD_OK) return retval; - +/// Escape a string so that it can be used in a fish script without further word splitting. +static int string_escape_script(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) { + wcstring storage; + int nesc = 0; escape_flags_t flags = ESCAPE_ALL; if (opts.no_quoted) flags |= ESCAPE_NO_QUOTED; - int nesc = 0; - wcstring storage; - const wchar_t *arg; - while ((arg = string_get_arg(&optind, argv, &storage, streams)) != 0) { - streams.out.append(escape_string(arg, flags)); + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + streams.out.append(escape_string(arg, flags, STRING_STYLE_SCRIPT)); streams.out.append(L'\n'); nesc++; } @@ -430,6 +452,61 @@ static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wcha return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; } +/// Escape a string so that it can be used as a URL. +static int string_escape_url(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) { + UNUSED(opts); + wcstring storage; + int nesc = 0; + escape_flags_t flags = 0; + + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + streams.out.append(escape_string(arg, flags, STRING_STYLE_URL)); + streams.out.append(L'\n'); + nesc++; + } + + return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; +} + +/// Escape a string so that it can be used as a fish var name. 
+static int string_escape_var(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) { + UNUSED(opts); + wcstring storage; + int nesc = 0; + escape_flags_t flags = 0; + + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + streams.out.append(escape_string(arg, flags, STRING_STYLE_VAR)); + streams.out.append(L'\n'); + nesc++; + } + + return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; +} + +static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { + options_t opts; + opts.no_quoted_valid = true; + opts.style_valid = true; + int optind; + int retval = parse_opts(&opts, &optind, 0, argc, argv, parser, streams); + if (retval != STATUS_CMD_OK) return retval; + + switch (opts.escape_style) { + case STRING_STYLE_SCRIPT: { + return string_escape_script(opts, optind, argv, streams); + } + case STRING_STYLE_URL: { + return string_escape_url(opts, optind, argv, streams); + } + case STRING_STYLE_VAR: { + return string_escape_var(opts, optind, argv, streams); + } + } + + DIE("should never reach this statement"); +} + static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { options_t opts; opts.quiet_valid = true; diff --git a/src/common.cpp b/src/common.cpp index d57654e7c..fb51791de 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1,6 +1,7 @@ // Various functions, mostly string utilities, that are used by most parts of fish. #include "config.h" +#include #include #include #include @@ -745,11 +746,62 @@ wcstring reformat_for_screen(const wcstring &msg) { return buff; } -/// Escape a string, storing the result in out_str. -static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstring *out_str, - escape_flags_t flags) { - assert(orig_in != NULL); +/// Escape a string in a fashion suitable for using as a URL. Store the result in out_str. 
+static void escape_string_url(const wchar_t *orig_in, wcstring &out) { + const std::string &in = wcs2string(orig_in); + for (auto c1 : in) { + // This silliness is so we get the correct result whether chars are signed or unsigned. + unsigned int c2 = (unsigned int)c1 & 0xFF; + if (!(c2 & 0x80) && + (isalnum(c2) || c2 == '/' || c2 == '.' || c2 == '~' || c2 == '-' || c2 == '_')) { + // The above characters don't need to be encoded. + out.push_back((wchar_t)c2); + } else { + // All other chars need to have their UTF-8 representation encoded in hex. + wchar_t buf[4]; + swprintf(buf, sizeof buf / sizeof buf[0], L"%%%02X", c2); + out.append(buf); + } + } +} +static bool is_hex_digit(int c) { return strchr("0123456789abcdefABCDEF", c) != NULL; } + +/// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str. +static void escape_string_var(const wchar_t *orig_in, wcstring &out) { + bool prev_was_hex_encoded = false; + bool maybe_encode_next_char = false; + const std::string &in = wcs2string(orig_in); + for (auto c1 : in) { + // This silliness is so we get the correct result whether chars are signed or unsigned. + unsigned int c2 = (unsigned int)c1 & 0xFF; + if (!(c2 & 0x80) && isalnum(c2) && (!prev_was_hex_encoded || !is_hex_digit(c2))) { + // ASCII alphanumerics don't need to be encoded. + if (prev_was_hex_encoded) { + out.push_back(L'_'); + prev_was_hex_encoded = false; + } + out.push_back((wchar_t)c2); + } else if (c2 == '_') { + // Underscores are encoded by doubling them. + out.append(L"__"); + prev_was_hex_encoded = false; + } else { + // All other chars need to have their UTF-8 representation encoded in hex. + wchar_t buf[4]; + swprintf(buf, sizeof buf / sizeof buf[0], L"_%02X", c2); + out.append(buf); + prev_was_hex_encoded = true; + } + } + if (prev_was_hex_encoded) { + out.push_back(L'_'); + } +} + +/// Escape a string in a fashion suitable for using in fish script. Store the result in out_str. 
+static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out, + escape_flags_t flags) { const wchar_t *in = orig_in; bool escape_all = static_cast<bool>(flags & ESCAPE_ALL); bool no_quoted = static_cast<bool>(flags & ESCAPE_NO_QUOTED); @@ -758,9 +810,6 @@ static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstri int need_escape = 0; int need_complex_escape = 0; - // Avoid dereferencing all over the place. - wcstring &out = *out_str; - if (!no_quoted && in_len == 0) { out.assign(L"''"); return; @@ -903,15 +952,45 @@ static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstri } } -wcstring escape_string(const wchar_t *in, escape_flags_t flags) { +wcstring escape_string(const wchar_t *in, escape_flags_t flags, escape_string_style_t style) { wcstring result; - escape_string_internal(in, wcslen(in), &result, flags); + + switch (style) { + case STRING_STYLE_SCRIPT: { + escape_string_script(in, wcslen(in), result, flags); + break; + } + case STRING_STYLE_URL: { + escape_string_url(in, result); + break; + } + case STRING_STYLE_VAR: { + escape_string_var(in, result); + break; + } + } + return result; } -wcstring escape_string(const wcstring &in, escape_flags_t flags) { +wcstring escape_string(const wcstring &in, escape_flags_t flags, escape_string_style_t style) { wcstring result; - escape_string_internal(in.c_str(), in.size(), &result, flags); + + switch (style) { + case STRING_STYLE_SCRIPT: { + escape_string_script(in.c_str(), in.size(), result, flags); + break; + } + case STRING_STYLE_URL: { + escape_string_url(in.c_str(), result); + break; + } + case STRING_STYLE_VAR: { + escape_string_var(in.c_str(), result); + break; + } + } + return result; } diff --git a/src/common.h b/src/common.h index a1c2e0841..e1ed13772 100644 --- a/src/common.h +++ b/src/common.h @@ -89,6 +89,12 @@ typedef std::vector<wcstring> wcstring_list_t; #define INPUT_COMMON_BASE (wchar_t)0xF700 #define INPUT_COMMON_END (INPUT_COMMON_BASE + 64) +enum 
escape_string_style_t { + STRING_STYLE_SCRIPT, + STRING_STYLE_URL, + STRING_STYLE_VAR +}; + // Flags for unescape_string functions. enum { UNESCAPE_DEFAULT = 0, // default behavior @@ -97,15 +103,14 @@ enum { }; typedef unsigned int unescape_flags_t; -// Flags for the escape_string() and escape_string() functions. +// Flags for the escape_string() functions. These are only applicable when the +// escape style is "script" (i.e., STRING_STYLE_SCRIPT). enum { /// Escape all characters, including magic characters like the semicolon. ESCAPE_ALL = 1 << 0, - /// Do not try to use 'simplified' quoted escapes, and do not use empty quotes as the empty /// string. ESCAPE_NO_QUOTED = 1 << 1, - /// Do not escape tildes. ESCAPE_NO_TILDE = 1 << 2 }; @@ -692,8 +697,10 @@ ssize_t read_loop(int fd, void *buff, size_t count); /// \param in The string to be escaped /// \param flags Flags to control the escaping /// \return The escaped string -wcstring escape_string(const wchar_t *in, escape_flags_t flags); -wcstring escape_string(const wcstring &in, escape_flags_t flags); +wcstring escape_string(const wchar_t *in, escape_flags_t flags, + escape_string_style_t style=STRING_STYLE_SCRIPT); +wcstring escape_string(const wcstring &in, escape_flags_t flags, + escape_string_style_t style=STRING_STYLE_SCRIPT); /// Expand backslashed escapes and substitute them with their unescaped counterparts. 
Also /// optionally change the wildcards, the tilde character and a few more into constants which are diff --git a/tests/string.err b/tests/string.err index 33e9e847f..0c14841dc 100644 --- a/tests/string.err +++ b/tests/string.err @@ -5,7 +5,7 @@ string match: ^ # string invalidarg string: Subcommand 'invalidarg' is not valid -Standard input (line 183): +Standard input (line 215): string invalidarg; and echo "unexpected exit 0" >&2 ^ @@ -29,6 +29,6 @@ string repeat: Expected argument # string repeat -l fakearg 2>&1 string repeat: Unknown option '-l' -Standard input (line 284): +Standard input (line 316): string repeat -l fakearg ^ diff --git a/tests/string.in b/tests/string.in index ef4fa1940..297c77679 100644 --- a/tests/string.in +++ b/tests/string.in @@ -94,6 +94,38 @@ echo echo '# echo \x07 | string escape' echo \x07 | string escape +echo +echo '# string escape --style=script \'a b#c"\\\'d\'' +string escape --style=script 'a b#c"\'d' + +echo +echo '# string escape --style=url \'a b#c"\\\'d\'' +string escape --style=url 'a b#c"\'d' + +echo +echo '# string escape --style=url \\na\\nb%c~d\\n' +string escape --style=url \na\nb%c~d\n + +echo +echo '# string escape --style=var \'a b#c"\\\'d\'' +string escape --style=var 'a b#c"\'d' + +echo +echo '# string escape --style=var a\nghi_' +string escape --style=var a\nghi_ + +echo +echo '# string escape --style=var \'abc\'' +string escape --style=var 'abc' + +echo +echo '# string escape --style=var \'_a_b_c_\'' +string escape --style=var '_a_b_c_' + +echo +echo '# string escape --style=var -- -' +string escape --style=var -- - + echo echo '# string match "?" a' string match "?" 
a diff --git a/tests/string.out b/tests/string.out index 8425591d8..1c24eec48 100644 --- a/tests/string.out +++ b/tests/string.out @@ -74,6 +74,30 @@ zan # echo \x07 | string escape \cg +# string escape --style=script 'a b#c"\'d' +a\ b\#c\"\'d + +# string escape --style=url 'a b#c"\'d' +a%20b%23c%22%27d + +# string escape --style=url \na\nb%c~d\n +%0Aa%0Ab%25c~d%0A + +# string escape --style=var 'a b#c"\'d' +a_20_62_23_63_22_27_64_ + +# string escape --style=var a\nghi_ +a_0A_ghi__ + +# string escape --style=var 'abc' +abc + +# string escape --style=var '_a_b_c_' +__a__b__c__ + +# string escape --style=var -- - +_2D_ + # string match "?" a a