Implement PCRE2 escaping

Closes #5309.
This commit is contained in:
Mahmoud Al-Qudsi 2018-11-14 23:30:11 -06:00
parent f56c317bd0
commit e160cde606
7 changed files with 107 additions and 3 deletions

View file

@ -196,6 +196,8 @@ static int handle_flag_1(wchar_t **argv, parser_t &parser, io_streams_t &streams
opts->escape_style = STRING_STYLE_URL;
} else if (wcscmp(w.woptarg, L"var") == 0) {
opts->escape_style = STRING_STYLE_VAR;
} else if (wcscmp(w.woptarg, L"pcre2") == 0) {
opts->escape_style = STRING_STYLE_PCRE2;
} else {
string_error(streams, _(L"%ls: Invalid escape style '%ls'\n"), cmd, w.woptarg);
return STATUS_INVALID_ARGS;

View file

@ -1096,6 +1096,42 @@ static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring
}
}
/// Escapes a string so that it is matched literally when embedded in a PCRE2 pattern.
///
/// Not safe for use with `eval`: only characters reserved by PCRE2 are escaped, i.e. it relies on
/// fish's automatic escaping of subshell output in subsequent concatenation or for use as an
/// argument.
///
/// \param in is the raw string to be searched for literally when substituted in a PCRE2 expression.
/// \return a copy of \p in with a backslash inserted before every reserved character.
static wcstring escape_string_pcre2(const wcstring &in) {
    wcstring out;
    // A wild guess: assume roughly a third of the input needs escaping. Integer arithmetic keeps
    // the size from being round-tripped through a floating-point value.
    out.reserve(in.size() + in.size() / 3);

    for (auto c : in) {
        switch (c) {
            case L'.':
            case L'^':
            case L'$':
            case L'*':
            case L'+':
            case L'(':
            case L')':
            case L'?':
            case L'[':
            case L'{':
            case L'}':
            case L'\\':
            case L'|':
            // These two only *need* to be escaped within a character class, and technically it
            // makes no sense to ever use process substitution output to compose a character
            // class, but...
            case L'-':
            case L']':
                out.push_back(L'\\');
                /* FALLTHROUGH */  // the reserved character itself is still appended below
            default:
                out.push_back(c);
                break;
        }
    }

    return out;
}
wcstring escape_string(const wchar_t *in, escape_flags_t flags, escape_string_style_t style) {
wcstring result;
@ -1112,6 +1148,10 @@ wcstring escape_string(const wchar_t *in, escape_flags_t flags, escape_string_st
escape_string_var(in, result);
break;
}
case STRING_STYLE_PCRE2: {
result = escape_string_pcre2(in);
break;
}
}
return result;
@ -1133,6 +1173,10 @@ wcstring escape_string(const wcstring &in, escape_flags_t flags, escape_string_s
escape_string_var(in, result);
break;
}
case STRING_STYLE_PCRE2: {
result = escape_string_pcre2(in);
break;
}
}
return result;
@ -1617,6 +1661,11 @@ bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t es
success = unescape_string_var(input, output);
break;
}
case STRING_STYLE_PCRE2: {
// Unescaping PCRE2 is not needed/supported; the PCRE2 engine is responsible for that.
success = false;
break;
}
}
if (!success) output->clear();
return success;
@ -1638,6 +1687,11 @@ bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t e
success = unescape_string_var(input.c_str(), output);
break;
}
case STRING_STYLE_PCRE2: {
// Unescaping PCRE2 is not needed/supported; the PCRE2 engine is responsible for that.
success = false;
break;
}
}
if (!success) output->clear();
return success;

View file

@ -118,7 +118,12 @@ static_assert(false, "Neither NAME_MAX nor MAXNAMELEN is defined!");
#endif
#endif
enum escape_string_style_t { STRING_STYLE_SCRIPT, STRING_STYLE_URL, STRING_STYLE_VAR };
/// Selects the escaping scheme applied by escape_string() and reversed by unescape_string().
enum escape_string_style_t {
// NOTE(review): presumably the shell-script quoting style (cf. escape_string_script) - confirm.
STRING_STYLE_SCRIPT,
// NOTE(review): presumably URL-style escaping, going by the name - confirm against the callers.
STRING_STYLE_URL,
// Chosen by `string escape --style=var`; escaping is done by escape_string_var().
STRING_STYLE_VAR,
// Chosen by `string escape --style=pcre2`; escaping is done by escape_string_pcre2().
// Escape-only: unescape_string() reports failure for this style.
STRING_STYLE_PCRE2,
};
// Flags for unescape_string functions.
enum {

View file

@ -4349,6 +4349,33 @@ static void test_wcstring_tok() {
}
}
static void test_pcre2_escape() {
say(L"Testing escaping strings as pcre2 literals");
// plain text should not be needlessly escaped
auto input = L"hello world!";
auto escaped = escape_string(input, 0, STRING_STYLE_PCRE2);
if (escaped != input) {
err(L"Input string %ls unnecessarily PCRE2 escaped as %ls", input, escaped.c_str());
}
// all the following are intended to be ultimately matched literally - even if they don't look
// like that's the intent - so we escape them.
const wchar_t * tests[][2] = {
L".ext", L"\\.ext",
L"{word}", L"\\{word\\}",
L"hola-mundo", L"hola\\-mundo",
L"$17.42 is your total?", L"\\$17\\.42 is your total\\?",
L"not really escaped\\?", L"not really escaped\\\\\\?",
};
for (auto &test : tests) {
auto escaped = escape_string(test[0], 0, STRING_STYLE_PCRE2);
if (escaped != test[1]) {
err(L"pcre2_escape error: pcre2_escape(%ls) -> %ls, expected %ls", test[0], escaped.c_str(), test[1]);
}
}
}
int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv);
static void run_one_string_test(const wchar_t *const *argv, int expected_rc,
const wchar_t *expected_out) {
@ -4961,6 +4988,7 @@ int main(int argc, char **argv) {
if (should_test_function("utf8")) test_utf8();
if (should_test_function("feature_flags")) test_feature_flags();
if (should_test_function("escape_sequences")) test_escape_sequences();
if (should_test_function("pcre2_escape")) test_pcre2_escape();
if (should_test_function("lru")) test_lru();
if (should_test_function("expand")) test_expand();
if (should_test_function("fuzzy_match")) test_fuzzy_match();

View file

@ -92,6 +92,9 @@
####################
# string escape with multibyte chars
####################
# string escape for literal pcre2 searching
####################
# set x (string unescape (echo \x07 | string escape))
@ -182,7 +185,7 @@ string match: ^
####################
# string invalidarg
string: Subcommand 'invalidarg' is not valid
Standard input (line 205):
Standard input (line 211):
string invalidarg; and echo "unexpected exit 0"
^
@ -267,7 +270,7 @@ string repeat: Expected argument
####################
# string repeat -l fakearg 2>&1
string repeat: Unknown option '-l'
Standard input (line 281):
Standard input (line 287):
string repeat -l fakearg
^

View file

@ -101,6 +101,12 @@ string escape --style=var 中
string escape --style=var aöb | string unescape --style=var
string escape --style=var | string unescape --style=var
# test regex escaping
logmsg 'string escape for literal pcre2 searching'
string escape --style=pcre2 ".ext"
string escape --style=pcre2 "bonjour, amigo"
string escape --style=pcre2 "^this is a literal string"
# The following tests verify that we can correctly unescape the same strings
# we tested escaping above.

View file

@ -140,6 +140,12 @@ _E4_B8_AD_
aöb
####################
# string escape for literal pcre2 searching
\.ext
bonjour, amigo
\^this is a literal string
####################
# set x (string unescape (echo \x07 | string escape))
success