implement string unescape

Fixes #3543
This commit is contained in:
Kurtis Rader 2017-06-22 20:47:54 -07:00
parent 60bca14b37
commit f3cb625802
8 changed files with 319 additions and 34 deletions

View file

@ -5,6 +5,7 @@
- New `status is-breakpoint` command that is true when a prompt is displayed in response to a `breakpoint` command (#1310).
- Invalid array indexes are now silently ignored (#826, #4127).
- `string escape` has a new `--style=xxx` flag where `xxx` can be `script`, `var`, or `url` (#4150)
- `string unescape` has been implemented to reverse the effects of `string escape` (#3543)
## Other significant changes

View file

@ -18,6 +18,7 @@ string sub [(-s | --start) START] [(-l | --length) LENGTH] [(-q | --quiet)]
[STRING...]
string trim [(-l | --left)] [(-r | --right)] [(-c | --chars CHARS)]
[(-q | --quiet)] [STRING...]
string unescape [--style=xxx] [STRING...]
string upper [(-q | --quiet)] [STRING...]
\endfish
@ -42,6 +43,8 @@ The second is `--style=var` which ensures the string can be used as a variable n
The third is `--style=url` which ensures the string can be used as a URL by hex encoding any character which is not legal in a URL. The string is first converted to UTF-8 before being encoded.
`string unescape` performs the inverse of the `string escape` command, using the same `--style` values. For example, doing `string unescape --style=var (string escape --style=var $str)` will return the original string. If a string to be unescaped is not properly formatted for the requested style it is ignored and produces no output.
\subsection string-join "join" subcommand
`string join` joins its STRING arguments into a single string separated by SEP, which can be an empty string. Exit status: 0 if at least one join was performed, or 1 otherwise.

View file

@ -484,6 +484,64 @@ static int string_escape_var(options_t &opts, int optind, wchar_t **argv, io_str
return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR;
}
/// Unescape strings that were encoded so they can be used in fish script. Each argument supplied by
/// `string_get_arg()` is decoded with `STRING_STYLE_SCRIPT` and written to the output stream
/// followed by a newline; arguments that fail to decode are skipped. Returns `STATUS_CMD_OK` if at
/// least one argument was decoded, otherwise `STATUS_CMD_ERROR`.
static int string_unescape_script(options_t &opts, int optind, wchar_t **argv,
                                  io_streams_t &streams) {
    UNUSED(opts);
    unescape_flags_t no_flags = 0;
    wcstring arg_buffer;
    int decoded_count = 0;

    for (;;) {
        const wchar_t *encoded = string_get_arg(&optind, argv, &arg_buffer, streams);
        if (!encoded) break;

        wcstring decoded;
        if (!unescape_string(encoded, &decoded, no_flags, STRING_STYLE_SCRIPT)) continue;
        streams.out.append(decoded);
        streams.out.append(L'\n');
        decoded_count++;
    }

    if (decoded_count > 0) return STATUS_CMD_OK;
    return STATUS_CMD_ERROR;
}
/// Unescape URL-encoded strings. Each argument supplied by `string_get_arg()` is decoded with
/// `STRING_STYLE_URL` and written to the output stream followed by a newline; arguments that fail
/// to decode are skipped. Returns `STATUS_CMD_OK` if at least one argument was decoded, otherwise
/// `STATUS_CMD_ERROR`.
static int string_unescape_url(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) {
    UNUSED(opts);
    unescape_flags_t no_flags = 0;
    wcstring arg_buffer;
    int decoded_count = 0;

    for (;;) {
        const wchar_t *encoded = string_get_arg(&optind, argv, &arg_buffer, streams);
        if (!encoded) break;

        wcstring decoded;
        if (!unescape_string(encoded, &decoded, no_flags, STRING_STYLE_URL)) continue;
        streams.out.append(decoded);
        streams.out.append(L'\n');
        decoded_count++;
    }

    if (decoded_count > 0) return STATUS_CMD_OK;
    return STATUS_CMD_ERROR;
}
/// Unescape strings that were encoded as var names. Each argument supplied by `string_get_arg()` is
/// decoded with `STRING_STYLE_VAR` and written to the output stream followed by a newline;
/// arguments that fail to decode are skipped. Returns `STATUS_CMD_OK` if at least one argument was
/// decoded, otherwise `STATUS_CMD_ERROR`.
static int string_unescape_var(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) {
    UNUSED(opts);
    unescape_flags_t no_flags = 0;
    wcstring arg_buffer;
    int decoded_count = 0;

    for (;;) {
        const wchar_t *encoded = string_get_arg(&optind, argv, &arg_buffer, streams);
        if (!encoded) break;

        wcstring decoded;
        if (!unescape_string(encoded, &decoded, no_flags, STRING_STYLE_VAR)) continue;
        streams.out.append(decoded);
        streams.out.append(L'\n');
        decoded_count++;
    }

    if (decoded_count > 0) return STATUS_CMD_OK;
    return STATUS_CMD_ERROR;
}
static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
options_t opts;
opts.no_quoted_valid = true;
@ -507,6 +565,29 @@ static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wcha
DIE("should never reach this statement");
}
/// Entry point for the `string unescape` subcommand: parse the options, then pass the remaining
/// arguments to the handler for the selected escape style. A non-OK status from option parsing is
/// returned unchanged.
static int string_unescape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
    options_t opts;
    opts.no_quoted_valid = true;
    opts.style_valid = true;

    int optind;
    const int parse_status = parse_opts(&opts, &optind, 0, argc, argv, parser, streams);
    if (parse_status != STATUS_CMD_OK) return parse_status;

    switch (opts.escape_style) {
        case STRING_STYLE_SCRIPT:
            return string_unescape_script(opts, optind, argv, streams);
        case STRING_STYLE_URL:
            return string_unescape_url(opts, optind, argv, streams);
        case STRING_STYLE_VAR:
            return string_unescape_var(opts, optind, argv, streams);
    }

    // Every case above returns, so this is reached only if `escape_style` matched none of them.
    DIE("should never reach this statement");
}
static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) {
options_t opts;
opts.quiet_valid = true;
@ -1277,11 +1358,19 @@ static const struct string_subcommand {
wchar_t **argv); //!OCLINT(unused param)
}
string_subcommands[] = {
{L"escape", &string_escape}, {L"join", &string_join}, {L"length", &string_length},
{L"match", &string_match}, {L"replace", &string_replace}, {L"split", &string_split},
{L"sub", &string_sub}, {L"trim", &string_trim}, {L"lower", &string_lower},
{L"upper", &string_upper}, {L"repeat", &string_repeat}, {NULL, NULL}};
string_subcommands[] = {{L"escape", &string_escape},
{L"join", &string_join},
{L"length", &string_length},
{L"match", &string_match},
{L"replace", &string_replace},
{L"split", &string_split},
{L"sub", &string_sub},
{L"trim", &string_trim},
{L"lower", &string_lower},
{L"upper", &string_upper},
{L"repeat", &string_repeat},
{L"unescape", &string_unescape},
{NULL, NULL}};
/// The string builtin, for manipulating strings.
int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) {

View file

@ -75,6 +75,38 @@ static void debug_shared(const wchar_t msg_level, const wcstring &msg);
bool has_working_tty_timestamps = true;
/// Map the character `d` to its numeric value when it is a digit in the given `base`. The
/// characters `0`-`9` map to 0-9 and the letters `a`-`z` / `A`-`Z` map to 10-35 regardless of case.
/// Returns -1 if `d` is not one of those characters or if its value is not less than `base`.
long convert_digit(wchar_t d, int base) {
    long value;
    if (d >= L'0' && d <= L'9') {
        value = d - L'0';
    } else if (d >= L'a' && d <= L'z') {
        value = d + 10 - L'a';
    } else if (d >= L'A' && d <= L'Z') {
        value = d + 10 - L'A';
    } else {
        return -1;
    }
    return value < base ? value : -1;
}
/// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions. Only
/// `0`-`9` and uppercase `A`-`F` qualify; lowercase letters do not.
static bool is_hex_digit(int c) {
    // Use range checks rather than strchr(): strchr() also matches the terminating NUL of the digit
    // string (so 0 was reported as a hex digit) and converts its argument to char before comparing
    // (so out-of-range values could be truncated into a match).
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F');
}
/// This is a specialization of `convert_digit()` that only handles base 16 and only uppercase.
/// Returns the value (0-15) of `d` when it is one of `0`-`9` or `A`-`F`, otherwise -1.
long convert_hex_digit(wchar_t d) {
    if ((d >= L'0') && (d <= L'9')) {
        return d - L'0';
    }
    // The upper bound has to be `F`: letters beyond it are not hex digits and would otherwise be
    // converted to values of 16 or more instead of being rejected.
    if ((d >= L'A') && (d <= L'F')) {
        return 10 + d - L'A';
    }
    return -1;
}
#ifdef HAVE_BACKTRACE_SYMBOLS
// This function produces a stack backtrace with demangled function & method names. It is based on
// https://gist.github.com/fmela/591333 but adapted to the style of the fish project.
@ -765,12 +797,41 @@ static void escape_string_url(const wchar_t *orig_in, wcstring &out) {
}
}
/// Test whether the char is a hex digit. Both lowercase and uppercase letters are accepted.
static bool is_hex_digit(int c) {
    // Use range checks rather than strchr(): strchr() also matches the terminating NUL of the digit
    // string (so 0 was reported as a hex digit) and converts its argument to char before comparing.
    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}
/// Reverse the effects of `escape_string_url()`. By definition the string has to consist of just
/// ASCII chars. A `%` must be followed either by another `%` (decoded as a literal `%`) or by two
/// hex digits (decoded as the byte they encode). Returns true and stores the decoded string in
/// `out` on success; returns false without modifying `out` if `in` contains a char above 0x7F or a
/// malformed `%` sequence.
static bool unescape_string_url(const wchar_t *in, wcstring *out) {
    std::string result;
    // Size the buffer from the input: every input char produces at most one decoded byte. The
    // current size of `out` says nothing about how much room is needed.
    result.reserve(wcslen(in));
    for (wchar_t c = *in; c; c = *++in) {
        if (c > 0x7F) return false;  // invalid character means we can't decode the string
        if (c == '%') {
            int c1 = in[1];
            if (c1 == 0) return false;  // found unexpected end of string
            if (c1 == '%') {
                result.push_back('%');
                in++;
            } else {
                int c2 = in[2];
                if (c2 == 0) return false;  // string ended prematurely
                long d1 = convert_digit(c1, 16);
                if (d1 < 0) return false;
                long d2 = convert_digit(c2, 16);
                if (d2 < 0) return false;
                result.push_back(static_cast<char>(16 * d1 + d2));
                in += 2;
            }
        } else {
            result.push_back(static_cast<char>(c));
        }
    }

    // The decoded bytes are handed to str2wcstring() to produce the wide string result.
    *out = str2wcstring(result);
    return true;
}
/// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str.
static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
bool prev_was_hex_encoded = false;
bool maybe_encode_next_char = false;
const std::string &in = wcs2string(orig_in);
for (auto c1 : in) {
// This silliness is so we get the correct result whether chars are signed or unsigned.
@ -799,6 +860,46 @@ static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
}
}
/// Reverse the effects of `escape_string_var()`. By definition the string has to consist of just
/// ASCII chars. An underscore introduces either a doubled underscore (decoded as a literal `_`), a
/// two digit uppercase hex escape (decoded as the byte it encodes), or a separator that is simply
/// dropped. Returns true and stores the decoded string in `out` on success; returns false without
/// modifying `out` if `in` contains a char above 0x7F or a malformed escape.
static bool unescape_string_var(const wchar_t *in, wcstring *out) {
    std::string result;
    // Size the buffer from the input: every input char produces at most one decoded byte. The
    // current size of `out` says nothing about how much room is needed.
    result.reserve(wcslen(in));
    // NOTE(review): this flag is never cleared once set, so a lone trailing underscore is accepted
    // anywhere after the first hex escape, not just directly after one. Confirm that is intended.
    bool prev_was_hex_encoded = false;
    for (wchar_t c = *in; c; c = *++in) {
        if (c > 0x7F) return false;  // invalid character means we can't decode the string
        if (c == '_') {
            int c1 = in[1];
            if (c1 == 0) {
                if (prev_was_hex_encoded) break;
                return false;  // found unexpected escape char at end of string
            }
            if (c1 == '_') {
                result.push_back('_');
                in++;
            } else if (is_hex_digit(c1)) {
                int c2 = in[2];
                if (c2 == 0) return false;  // string ended prematurely
                long d1 = convert_hex_digit(c1);
                if (d1 < 0 || d1 > 15) return false;
                // Only the first digit was vetted by is_hex_digit(), so range check the second one
                // explicitly; otherwise a sequence such as `_2Z` could decode to a bogus byte.
                long d2 = convert_hex_digit(c2);
                if (d2 < 0 || d2 > 15) return false;
                result.push_back(static_cast<char>(16 * d1 + d2));
                in += 2;
                prev_was_hex_encoded = true;
            }
            // No "else" clause because if the first char after an underscore is not another
            // underscore or a valid hex character then the underscore is there to improve
            // readability after we've encoded a character not valid in a var name.
        } else {
            result.push_back(static_cast<char>(c));
        }
    }

    // The decoded bytes are handed to str2wcstring() to produce the wide string result.
    *out = str2wcstring(result);
    return true;
}
/// Escape a string in a fashion suitable for using in fish script. Store the result in out_str.
static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out,
escape_flags_t flags) {
@ -1390,14 +1491,44 @@ bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special) {
return success;
}
bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special) {
bool success = unescape_string_internal(input, wcslen(input), output, escape_special);
/// Reverse the effects of `escape_string()` for the requested `style`, storing the decoded text in
/// `output`. `escape_special` is only passed to the `STRING_STYLE_SCRIPT` decoder; the other styles
/// do not use it. Returns true on success. On failure `output` is cleared and false is returned.
bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special,
                     escape_string_style_t style) {
    // Start out pessimistic so that a `style` value matching none of the cases below is reported as
    // a failure rather than reading an uninitialized variable.
    bool success = false;
    switch (style) {
        case STRING_STYLE_SCRIPT: {
            success = unescape_string_internal(input, wcslen(input), output, escape_special);
            break;
        }
        case STRING_STYLE_URL: {
            success = unescape_string_url(input, output);
            break;
        }
        case STRING_STYLE_VAR: {
            success = unescape_string_var(input, output);
            break;
        }
    }
    if (!success) output->clear();
    return success;
}
bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special) {
bool success = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
/// Reverse the effects of `escape_string()` for the requested `style`, storing the decoded text in
/// `output`. `escape_special` is only passed to the `STRING_STYLE_SCRIPT` decoder; the other styles
/// do not use it. Returns true on success. On failure `output` is cleared and false is returned.
bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special,
                     escape_string_style_t style) {
    // Start out pessimistic so that a `style` value matching none of the cases below is reported as
    // a failure rather than reading an uninitialized variable.
    bool success = false;
    switch (style) {
        case STRING_STYLE_SCRIPT: {
            // The script decoder is given the full length of the string, not a NUL-terminated scan.
            success = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
            break;
        }
        case STRING_STYLE_URL: {
            success = unescape_string_url(input.c_str(), output);
            break;
        }
        case STRING_STYLE_VAR: {
            success = unescape_string_var(input.c_str(), output);
            break;
        }
    }
    if (!success) output->clear();
    return success;
}
@ -2023,22 +2154,6 @@ char **make_null_terminated_array(const std::vector<std::string> &lst) {
return make_null_terminated_array_helper(lst);
}
/// Map the character `d` to its numeric value when it is a digit in the given `base`. The
/// characters `0`-`9` map to 0-9 and the letters `a`-`z` / `A`-`Z` map to 10-35 regardless of case.
/// Returns -1 if `d` is not one of those characters or if its value is not less than `base`.
long convert_digit(wchar_t d, int base) {
    long value;
    if (d >= L'0' && d <= L'9') {
        value = d - L'0';
    } else if (d >= L'a' && d <= L'z') {
        value = d + 10 - L'a';
    } else if (d >= L'A' && d <= L'Z') {
        value = d + 10 - L'A';
    } else {
        return -1;
    }
    return value < base ? value : -1;
}
/// Test if the specified character is in a range that fish uses internally to store special tokens.
///
/// NOTE: This is used when tokenizing the input. It is also used when reading input, before

View file

@ -715,10 +715,13 @@ size_t read_unquoted_escape(const wchar_t *input, wcstring *result, bool allow_i
/// indicates the string was unmodified.
bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special);
/// Unescapes a string, returning the unescaped value by reference. On failure, the output is set to
/// an empty string.
bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special);
bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special);
/// Reverse the effects of calling `escape_string`. Returns the unescaped value by reference. On
/// failure, the output is set to an empty string.
///
/// `style` selects which escaping scheme is reversed and defaults to `STRING_STYLE_SCRIPT`.
/// `escape_special` is passed to the script style decoder; the URL and var style decoders do not
/// use it. The return value is true if the input was decoded successfully and false otherwise.
bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special,
escape_string_style_t style = STRING_STYLE_SCRIPT);
bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special,
escape_string_style_t style = STRING_STYLE_SCRIPT);
/// Returns the width of the terminal window, so that not all functions that use these values
/// continually have to keep track of it separately.

View file

@ -5,7 +5,7 @@ string match: ^
# string invalidarg
string: Subcommand 'invalidarg' is not valid
Standard input (line 215):
Standard input (line 258):
string invalidarg; and echo "unexpected exit 0" >&2
^
@ -29,6 +29,6 @@ string repeat: Expected argument
# string repeat -l fakearg 2>&1
string repeat: Unknown option '-l'
Standard input (line 316):
Standard input (line 359):
string repeat -l fakearg
^

View file

@ -126,6 +126,49 @@ echo
echo '# string escape --style=var -- -'
string escape --style=var -- -
# The following tests verify that we can correctly unescape the same strings
# we tested escaping above.
echo
echo '# set x (string unescape (echo \x07 | string escape))'
set x (string unescape (echo \x07 | string escape))
test $x = \x07
and echo success
echo
echo '# string unescape --style=script (string escape --style=script \'a b#c"\\\'d\')'
string unescape --style=script (string escape --style=script 'a b#c"\'d')
echo
echo '# string unescape --style=url (string escape --style=url \'a b#c"\\\'d\')'
string unescape --style=url (string escape --style=url 'a b#c"\'d')
echo
echo '# string unescape --style=url (string escape --style=url \na\nb%c~d\n)'
string unescape --style=url (string escape --style=url \na\nb%c~d\n)
echo
echo '# string unescape --style=var (string escape --style=var \'a b#c"\\\'d\')'
string unescape --style=var (string escape --style=var 'a b#c"\'d')
echo
echo '# string unescape --style=var (string escape --style=var a\nghi_)'
string unescape --style=var (string escape --style=var a\nghi_)
echo
echo '# string unescape --style=var (string escape --style=var \'abc\')'
string unescape --style=var (string escape --style=var 'abc')
echo
echo '# string unescape --style=var (string escape --style=var \'_a_b_c_\')'
string unescape --style=var (string escape --style=var '_a_b_c_')
echo
echo '# string unescape --style=var (string escape --style=var -- -)'
string unescape --style=var -- (string escape --style=var -- -)
# The following tests verify that we can correctly match strings.
echo
echo '# string match "?" a'
string match "?" a

View file

@ -84,7 +84,7 @@ a%20b%23c%22%27d
%0Aa%0Ab%25c~d%0A
# string escape --style=var 'a b#c"\'d'
a_20_62_23_63_22_27_64_
a_20_b_23_c_22_27_d
# string escape --style=script a\nghi_
a_0A_ghi__
@ -98,6 +98,37 @@ __a__b__c__
# string escape --style=var -- -
_2D_
# set x (string unescape (echo \x07 | string escape))
success
# string unescape --style=script (string escape --style=script 'a b#c"\'d')
a b#c"'d
# string unescape --style=url (string escape --style=url 'a b#c"\'d')
a b#c"'d
# string unescape --style=url (string escape --style=url \na\nb%c~d\n)
a
b%c~d
# string unescape --style=var (string escape --style=var 'a b#c"\'d')
a b#c"'d
# string unescape --style=var (string escape --style=var a\nghi_)
a
ghi_
# string unescape --style=var (string escape --style=var 'abc')
abc
# string unescape --style=var (string escape --style=var '_a_b_c_')
_a_b_c_
# string unescape --style=var (string escape --style=var -- -)
-
# string match "?" a
a