fish-shell/src/builtin_string.cpp

1376 lines
34 KiB
C++
Raw Normal View History

Merge new string builtin This adds the new builtin 'string' which supports various string manipulation and matching algorithms, including PCRE based regular expressions. Fixes #2296 Squashed commit of the following: commit 4c3eaeb6e57d76463e9683c327142b0aeafb92b8 Author: ridiculousfish <corydoras@ridiculousfish.com> Date: Sat Sep 12 12:51:30 2015 -0700 Remove testdata and doc dirs from pcre2 source commit b2a8b4b50f2398b204fb72cfe4b5ba77ece2e1ab Merge: 11c8a47 7974aab Author: ridiculousfish <corydoras@ridiculousfish.com> Date: Sat Sep 12 12:32:40 2015 -0700 Merge branch 'string' of git://github.com/msteed/fish-shell into string-test commit 7974aab6d367f999f1140ab34c2535cef5cf3b00 Author: Michael Steed <msteed@saltstack.com> Date: Fri Sep 11 13:00:02 2015 -0600 build pcre2 lib only, no docs commit eb20b43d2d96b7e6d24618158ce71078de83c40b Merge: 1a09e70 5f519cb Author: Michael Steed <msteed68@gmail.com> Date: Thu Sep 10 20:00:47 2015 -0600 Merge branch 'string' of github.com:msteed/fish-shell into string commit 1a09e709d028393c9e9e6dc9a84278f399a15f3d Author: Michael Steed <msteed68@gmail.com> Date: Thu Sep 10 19:58:24 2015 -0600 rebase on master & address the fallout commit a0ec9772cd1a0a548a501a7633be05dab4e5ee46 Author: Michael Steed <msteed68@gmail.com> Date: Thu Sep 10 19:26:45 2015 -0600 use fish's wildcard_match() for glob matching commit 64c25a01e3f7234f220ba13545cf658a7492b1a4 Author: Michael Steed <msteed68@gmail.com> Date: Thu Aug 27 08:19:23 2015 -0600 some fixes from review - string_get_arg_stdin(): simplify and don't discard the argument when the trailing newline is absent - fix calls to pcre2 for e.g. 
string match -r -a 'a*' 'b' - correct test for args coming from stdin commit ece7f35ec5f4093763627d68d671b6c0c876896d Author: Michael Steed <msteed68@gmail.com> Date: Sat Aug 22 19:35:56 2015 -0600 fixes from review - Makefile.in: restore iwyu target - regex_replacer_t::replace_matches(): correct size passed to realloc() commit 9ff7477a926c4572e26171cab3cd42f8086be678 Author: Michael Steed <msteed68@gmail.com> Date: Thu Aug 20 13:08:33 2015 -0600 Minor doc improvements commit baf4e096b22dde3063b85b833795eb570d660ba7 Author: Michael Steed <msteed68@gmail.com> Date: Wed Aug 19 18:29:02 2015 -0600 another attempt to fix the ci build commit 896a2c2b279a419747bea26102229fbe84534a6f Author: Michael Steed <msteed68@gmail.com> Date: Wed Aug 19 18:03:49 2015 -0600 Updates after review comments - make match/replace without -a operate on the first match on each argument - use different exit codes for "no operation performed" and errors, as grep does - refactor regex compile code - use human-friendly error messages from pcre2 - improve error handling & reporting elsewhere - add a few tests - make some doc fixes - some simplification & cleanup - fix ci build failure (I hope) commit efd47dcbda2ca247d58bee56a7774cd75a1062fd Author: Michael Steed <msteed68@gmail.com> Date: Wed Aug 12 00:26:07 2015 -0600 fix dependencies for parallel make commit ed0850e2db467362066a3d94e3ececd17c1756cd Author: Michael Steed <msteed68@gmail.com> Date: Tue Aug 11 23:37:22 2015 -0600 Add missing pcre2 files + .gitignore commit 9492e7a7e929c03554336be1ddf80ca6b37f53c5 Author: Michael Steed <msteed68@gmail.com> Date: Tue Aug 11 22:44:05 2015 -0600 add pcre2-10.20 and update license.hdr commit 1a60b933718feb20c0bf7c9e257b8e495014ea1b Author: Michael Steed <msteed68@gmail.com> Date: Tue Aug 11 22:41:19 2015 -0600 add string builtin files - string builtin source, tests, & docs - changes to configure.ac & Makefile.in commit 5f519cb2a2c05213e0a88a7add7af288bc1c1352 Author: Michael Steed <msteed68@gmail.com> 
Date: Thu Sep 10 19:26:45 2015 -0600 use fish's wildcard_match() for glob matching commit 2ecd24f79500879e2de5bdf1b4c19dd44fc6ac85 Author: Michael Steed <msteed68@gmail.com> Date: Thu Aug 27 08:19:23 2015 -0600 some fixes from review - string_get_arg_stdin(): simplify and don't discard the argument when the trailing newline is absent - fix calls to pcre2 for e.g. string match -r -a 'a*' 'b' - correct test for args coming from stdin commit 45b777e4dc85c05cd4a186f4bdcae543c21aaf08 Author: Michael Steed <msteed68@gmail.com> Date: Sat Aug 22 19:35:56 2015 -0600 fixes from review - Makefile.in: restore iwyu target - regex_replacer_t::replace_matches(): correct size passed to realloc() commit 981cbb6ddf742a5fe8881af916e7b870b7e6422a Author: Michael Steed <msteed68@gmail.com> Date: Thu Aug 20 13:08:33 2015 -0600 Minor doc improvements commit ddb6a2a8fdb6aa31aad41e80d5481bb32c6ed8ff Author: Michael Steed <msteed68@gmail.com> Date: Wed Aug 19 18:29:02 2015 -0600 another attempt to fix the ci build commit 1e34e3191b028162863d263e9868052f75194aa5 Author: Michael Steed <msteed68@gmail.com> Date: Wed Aug 19 18:03:49 2015 -0600 Updates after review comments - make match/replace without -a operate on the first match on each argument - use different exit codes for "no operation performed" and errors, as grep does - refactor regex compile code - use human-friendly error messages from pcre2 - improve error handling & reporting elsewhere - add a few tests - make some doc fixes - some simplification & cleanup - fix ci build failure (I hope) commit 34232e152df17a3cfbf0a094dd51d148a4f04e6f Author: Michael Steed <msteed68@gmail.com> Date: Wed Aug 12 00:26:07 2015 -0600 fix dependencies for parallel make commit 00d7e781697f53454beb91c1d0fc4b2d28d6e034 Author: Michael Steed <msteed68@gmail.com> Date: Tue Aug 11 23:37:22 2015 -0600 Add missing pcre2 files + .gitignore commit 4498aa5f576e09634f7f619443e74d2f33c108e4 Author: Michael Steed <msteed68@gmail.com> Date: Tue Aug 11 22:44:05 2015 
-0600 add pcre2-10.20 and update license.hdr commit 290c58c72e22db644ccf6fa9088051644980ed0a Author: Michael Steed <msteed68@gmail.com> Date: Tue Aug 11 22:41:19 2015 -0600 add string builtin files - string builtin source, tests, & docs - changes to configure.ac & Makefile.in
2015-09-12 19:59:40 +00:00
/** \file builtin_string.cpp
Implementation of the string builtin.
*/
#define PCRE2_CODE_UNIT_WIDTH WCHAR_T_BITS
#ifdef _WIN32
#define PCRE2_STATIC
#endif
#include "pcre2.h"
#include "wildcard.h"
#define MAX_REPLACE_SIZE size_t(1048576) // pcre2_substitute maximum output size in wchar_t
#define STRING_ERR_MISSING _(L"%ls: Expected argument\n")
// Exit statuses returned by the string subcommands in this file.
enum
{
// Returned by the subcommands when at least one operation (match, replacement, split, ...) was counted
BUILTIN_STRING_OK = 0,
// Returned when no error occurred but nothing was counted
BUILTIN_STRING_NONE = 1,
// Returned on invalid usage or when an operation failed
BUILTIN_STRING_ERROR = 2
};
static void string_error(const wchar_t *fmt, ...)
{
va_list va;
va_start(va, fmt);
wcstring errstr = vformat_string(fmt, va);
va_end(va);
stderr_buffer += L"string ";
stderr_buffer += errstr;
}
/**
   Report an unrecognised option for a subcommand, then append the help text
   for the string builtin to stderr_buffer.

   \param parser the parser, passed through to builtin_print_help
   \param subcmd name of the subcommand that was given the option
   \param opt the offending option as it appeared on the command line
*/
static void string_unknown_option(parser_t &parser, const wchar_t *subcmd, const wchar_t *opt)
{
    const wchar_t *const help_topic = L"string";

    string_error(BUILTIN_ERR_UNKNOWN, subcmd, opt);
    builtin_print_help(parser, help_topic, stderr_buffer);
}
/**
   Decide whether arguments are read from standard input instead of argv.

   \return true when the builtin's stdin has been redirected to another
   descriptor, or when it is not a terminal
*/
static bool string_args_from_stdin()
{
    if (builtin_stdin != STDIN_FILENO)
    {
        return true;
    }
    return !isatty(builtin_stdin);
}
static const wchar_t *string_get_arg_stdin()
{
static wcstring warg;
std::string arg;
for (;;)
{
char ch = '\0';
int rc = read_blocked(builtin_stdin, &ch, 1);
if (rc < 0)
{
// failure
return 0;
}
if (rc == 0)
{
// eof
if (arg.empty())
{
return 0;
}
else
{
break;
}
}
if (ch == '\n')
{
break;
}
arg += ch;
}
warg = str2wcstring(arg.c_str(), arg.size());
return warg.c_str();
}
/**
   Fetch the next argument from argv and advance the index past it.

   \param argidx in/out index of the next unread element of argv
   \param argv null-terminated argument vector (may itself be null)
   \return the argument, or 0 when argv is null or exhausted; the index is
   left unchanged in that case
*/
static const wchar_t *string_get_arg_argv(int *argidx, wchar_t **argv)
{
    if (!argv)
    {
        return 0;
    }

    wchar_t *current = argv[*argidx];
    if (!current)
    {
        return 0;
    }

    *argidx += 1;
    return current;
}
/**
   Fetch the next argument from whichever source is active: standard input
   when string_args_from_stdin() says so, otherwise argv.

   \param argidx in/out index into argv (only used when reading from argv)
   \param argv null-terminated argument vector
   \return the next argument, or 0 when there are no more
*/
static const wchar_t *string_get_arg(int *argidx, wchar_t **argv)
{
    return string_args_from_stdin() ? string_get_arg_stdin()
                                    : string_get_arg_argv(argidx, argv);
}
/**
   The "string escape" subcommand: write the escaped form of every argument
   to stdout_buffer, one per line.

   Options: -n / --no-quoted adds ESCAPE_NO_QUOTED to the escape flags.

   \return BUILTIN_STRING_OK if at least one argument was escaped,
   BUILTIN_STRING_NONE if there were none, BUILTIN_STRING_ERROR on bad usage
*/
static int string_escape(parser_t &parser, int argc, wchar_t **argv)
{
    const wchar_t *short_options = L"n";
    const struct woption long_options[] =
    {
        { L"no-quoted", no_argument, 0, 'n' },
        { 0, 0, 0, 0 }
    };

    escape_flags_t flags = ESCAPE_ALL;
    wgetopter_t w;
    int opt;
    while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, 0)) != -1)
    {
        if (opt == 'n')
        {
            flags |= ESCAPE_NO_QUOTED;
        }
        else if (opt == '?')
        {
            string_unknown_option(parser, argv[0], argv[w.woptind - 1]);
            return BUILTIN_STRING_ERROR;
        }
        // Any other value needs no action
    }

    int i = w.woptind;
    // Arguments come from stdin, so none may be given on the command line
    if (string_args_from_stdin() && argc > i)
    {
        string_error(BUILTIN_ERR_TOO_MANY_ARGUMENTS, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    int escaped_count = 0;
    for (const wchar_t *arg = string_get_arg(&i, argv);
         arg != 0;
         arg = string_get_arg(&i, argv))
    {
        stdout_buffer.append(escape(arg, flags));
        stdout_buffer.push_back(L'\n');
        escaped_count++;
    }

    if (escaped_count > 0)
    {
        return BUILTIN_STRING_OK;
    }
    return BUILTIN_STRING_NONE;
}
/**
   The "string join" subcommand: write all arguments to stdout_buffer on one
   line, separated by the separator given as the first positional argument.

   Options: -q / --quiet suppresses output.

   \return BUILTIN_STRING_OK if more than one argument was seen,
   BUILTIN_STRING_NONE otherwise, BUILTIN_STRING_ERROR on bad usage
*/
static int string_join(parser_t &parser, int argc, wchar_t **argv)
{
    const wchar_t *short_options = L"q";
    const struct woption long_options[] =
    {
        { L"quiet", no_argument, 0, 'q'},
        { 0, 0, 0, 0 }
    };

    bool quiet = false;
    wgetopter_t w;
    int opt;
    while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, 0)) != -1)
    {
        if (opt == 'q')
        {
            quiet = true;
        }
        else if (opt == '?')
        {
            string_unknown_option(parser, argv[0], argv[w.woptind - 1]);
            return BUILTIN_STRING_ERROR;
        }
    }

    int i = w.woptind;
    // The separator always comes from the command line, never from stdin
    const wchar_t *sep = string_get_arg_argv(&i, argv);
    if (sep == 0)
    {
        string_error(STRING_ERR_MISSING, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    if (string_args_from_stdin() && argc > i)
    {
        string_error(BUILTIN_ERR_TOO_MANY_ARGUMENTS, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    int nargs = 0;
    for (const wchar_t *arg = string_get_arg(&i, argv);
         arg != 0;
         arg = string_get_arg(&i, argv))
    {
        if (!quiet)
        {
            // Separator goes between arguments, i.e. before every one but the first
            if (nargs > 0)
            {
                stdout_buffer.append(sep);
            }
            stdout_buffer.append(arg);
        }
        nargs++;
    }

    if (!quiet && nargs > 0)
    {
        stdout_buffer.push_back(L'\n');
    }

    // Joining only counts as having happened when there were at least two pieces
    return (nargs > 1) ? BUILTIN_STRING_OK : BUILTIN_STRING_NONE;
}
/**
   The "string length" subcommand: write the length, in characters, of every
   argument to stdout_buffer, one per line.

   Options: -q / --quiet suppresses output.

   \return BUILTIN_STRING_OK if at least one argument was non-empty,
   BUILTIN_STRING_NONE otherwise, BUILTIN_STRING_ERROR on bad usage
*/
static int string_length(parser_t &parser, int argc, wchar_t **argv)
{
    const wchar_t *short_options = L"q";
    const struct woption long_options[] =
    {
        { L"quiet", no_argument, 0, 'q'},
        { 0, 0, 0, 0 }
    };

    bool quiet = false;
    wgetopter_t w;
    int opt;
    while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, 0)) != -1)
    {
        if (opt == 'q')
        {
            quiet = true;
        }
        else if (opt == '?')
        {
            string_unknown_option(parser, argv[0], argv[w.woptind - 1]);
            return BUILTIN_STRING_ERROR;
        }
    }

    int i = w.woptind;
    if (string_args_from_stdin() && argc > i)
    {
        string_error(BUILTIN_ERR_TOO_MANY_ARGUMENTS, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    int nonempty_count = 0;
    for (const wchar_t *arg = string_get_arg(&i, argv);
         arg != 0;
         arg = string_get_arg(&i, argv))
    {
        const size_t len = wcslen(arg);
        nonempty_count += (len > 0) ? 1 : 0;
        if (quiet)
        {
            continue;
        }
        stdout_buffer.append(to_string(int(len)));
        stdout_buffer.push_back(L'\n');
    }

    if (nonempty_count > 0)
    {
        return BUILTIN_STRING_OK;
    }
    return BUILTIN_STRING_NONE;
}
/** Flags collected from the command line of "string match"; all start out false. */
struct match_options_t
{
    bool all;          // -a / --all
    bool ignore_case;  // -i / --ignore-case
    bool index;        // -n / --index
    bool quiet;        // -q / --quiet

    match_options_t()
    {
        all = false;
        ignore_case = false;
        index = false;
        quiet = false;
    }
};
/**
   Common interface of the matchers used by "string match". A matcher is fed
   one argument at a time and keeps a running total of matches.
*/
class string_matcher_t
{
protected:
    match_options_t opts;  // options the matcher was created with
    int total_matched;     // running match total over all arguments so far

public:
    string_matcher_t(const match_options_t &opts_)
        : opts(opts_),
          total_matched(0)
    {
    }

    virtual ~string_matcher_t()
    {
    }

    /** Match one argument and report the result; implemented by subclasses. */
    virtual bool report_matches(const wchar_t *arg) = 0;

    /** Number of matches counted so far. */
    int match_count()
    {
        return total_matched;
    }
};
class wildcard_matcher_t: public string_matcher_t
{
wcstring wcpattern;
public:
wildcard_matcher_t(const wchar_t * /*argv0*/, const wchar_t *pattern, const match_options_t &opts)
: string_matcher_t(opts)
{
wcpattern = parse_util_unescape_wildcards(pattern);
if (opts.ignore_case)
{
for (int i = 0; i < wcpattern.length(); i++)
{
wcpattern[i] = towlower(wcpattern[i]);
}
}
}
virtual ~wildcard_matcher_t() { }
bool report_matches(const wchar_t *arg)
{
// Note: --all is a no-op for glob matching since the pattern is always
// matched against the entire argument
bool match;
if (opts.ignore_case)
{
wcstring s = arg;
for (int i = 0; i < s.length(); i++)
{
s[i] = towlower(s[i]);
}
match = wildcard_match(s, wcpattern, false);
}
else
{
match = wildcard_match(arg, wcpattern, false);
}
if (match)
{
total_matched++;
}
if (!opts.quiet)
{
if (match)
{
if (opts.index)
{
stdout_buffer += L"1 ";
stdout_buffer += to_string(wcslen(arg));
stdout_buffer += L'\n';
}
else
{
stdout_buffer += arg;
stdout_buffer += L'\n';
}
}
}
return true;
}
};
/**
   Look up the text for a PCRE2 error code via pcre2_get_error_message().

   \return a pointer to a function-static buffer of 128 wide characters; the
   contents are overwritten by the next call
*/
static const wchar_t *pcre2_strerror(int err_code)
{
    static wchar_t message[128];
    const size_t capacity = sizeof message / sizeof message[0];

    pcre2_get_error_message(err_code, (PCRE2_UCHAR *)message, capacity);
    return message;
}
/**
   Owns a compiled PCRE2 pattern together with the match data block created
   for it. Both are released by the destructor.

   If compilation fails, an error message (the PCRE2 message, the pattern and
   a caret line) is reported through string_error() and `code` stays 0;
   callers must check `code` before using the object.
*/
struct compiled_regex_t
{
    pcre2_code *code;         // compiled pattern, 0 if compilation failed
    pcre2_match_data *match;  // match data sized for `code`, 0 if compilation failed

    /**
       \param argv0 subcommand name used as the prefix of error messages
       \param pattern the regular expression to compile
       \param ignore_case compile with PCRE2_CASELESS when true
    */
    compiled_regex_t(const wchar_t *argv0, const wchar_t *pattern, bool ignore_case)
        : code(0), match(0)
    {
        // Disable some sequences that can lead to security problems
        uint32_t options = PCRE2_NEVER_UTF;
#if PCRE2_CODE_UNIT_WIDTH < 32
        options |= PCRE2_NEVER_BACKSLASH_C;
#endif

        int err_code = 0;
        PCRE2_SIZE err_offset = 0;

        code = pcre2_compile(
            PCRE2_SPTR(pattern),
            PCRE2_ZERO_TERMINATED,
            options | (ignore_case ? PCRE2_CASELESS : 0),
            &err_code,
            &err_offset,
            0);
        if (code == 0)
        {
            string_error(_(L"%ls: Regular expression compile error: %ls\n"),
                         argv0, pcre2_strerror(err_code));
            string_error(L"%ls: %ls\n", argv0, pattern);
            // A '*' field width consumes an int argument; err_offset is a
            // PCRE2_SIZE, so it must be converted explicitly before being
            // passed through the variadic call.
            string_error(L"%ls: %*ls\n", argv0, static_cast<int>(err_offset), L"^");
            return;
        }

        match = pcre2_match_data_create_from_pattern(code, 0);
        if (match == 0)
        {
            DIE_MEM();
        }
    }

    ~compiled_regex_t()
    {
        if (match != 0)
        {
            pcre2_match_data_free(match);
        }
        if (code != 0)
        {
            pcre2_code_free(code);
        }
    }
};
// Matcher used by "string match --regex": matches each argument against a
// compiled PCRE2 pattern and writes the matched text (or, with --index, the
// position and length) of every set group to stdout_buffer.
class pcre2_matcher_t: public string_matcher_t
{
// Subcommand name used as the prefix of error messages
const wchar_t *argv0;
// Compiled pattern and its match data; regex.code is 0 if compilation failed
compiled_regex_t regex;
// Interpret the return code of one pcre2_match() call made on arg. On a
// successful match, and unless --quiet was given, one line is written per
// ovector pair that is set (pcre2_rc pairs are examined).
int report_match(const wchar_t *arg, int pcre2_rc)
{
// Return values: -1 = error, 0 = no match, 1 = match
if (pcre2_rc == PCRE2_ERROR_NOMATCH)
{
return 0;
}
if (pcre2_rc < 0)
{
string_error(_(L"%ls: Regular expression match error: %ls\n"),
argv0, pcre2_strerror(pcre2_rc));
return -1;
}
if (pcre2_rc == 0)
{
// The output vector wasn't big enough. Should not happen.
string_error(_(L"%ls: Regular expression internal error\n"), argv0);
return -1;
}
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(regex.match);
// Pair j of the ovector holds the [begin, end) offsets of group j
for (int j = 0; j < pcre2_rc; j++)
{
PCRE2_SIZE begin = ovector[2*j];
PCRE2_SIZE end = ovector[2*j + 1];
if (!opts.quiet)
{
// Groups that did not take part in the match are skipped entirely
if (begin != PCRE2_UNSET && end != PCRE2_UNSET)
{
if (opts.index)
{
// --index output: 1-based start position, a space, then the length
stdout_buffer += to_string(begin + 1);
stdout_buffer += ' ';
stdout_buffer += to_string(end - begin);
}
else if (end > begin) // may have end < begin if \K is used
{
stdout_buffer += wcstring(&arg[begin], end - begin);
}
stdout_buffer += L'\n';
}
}
}
return 1;
}
public:
// argv0_ is the subcommand name for error messages; pattern is compiled
// immediately (caselessly when opts.ignore_case is set)
pcre2_matcher_t(const wchar_t *argv0_, const wchar_t *pattern, const match_options_t &opts)
: string_matcher_t(opts),
argv0(argv0_),
regex(argv0_, pattern, opts.ignore_case)
{ }
virtual ~pcre2_matcher_t() { }
// Match arg against the pattern, reporting the first match and, with
// --all, every following match as well.
bool report_matches(const wchar_t *arg)
{
// A return value of true means all is well (even if no matches were
// found), false indicates an unrecoverable error.
if (regex.code == 0)
{
// pcre2_compile() failed
return false;
}
int matched = 0;
// See pcre2demo.c for an explanation of this logic
PCRE2_SIZE arglen = wcslen(arg);
int rc = report_match(arg, pcre2_match(regex.code, PCRE2_SPTR(arg), arglen, 0, 0, regex.match, 0));
if (rc < 0)
{
// pcre2 match error
return false;
}
if (rc == 0)
{
// no match
return true;
}
matched++;
total_matched++;
// Report any additional matches
PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(regex.match);
// matched is already non-zero here, so this loop only runs with --all
while (opts.all || matched == 0)
{
uint32_t options = 0;
PCRE2_SIZE offset = ovector[1]; // Start at end of previous match
if (ovector[0] == ovector[1])
{
// The previous match was empty
if (ovector[0] == arglen)
{
// ... and it was at the end of the argument: nothing left to search
break;
}
// Retry at the same offset, this time requiring a non-empty match
// that starts exactly there
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
rc = report_match(arg, pcre2_match(regex.code, PCRE2_SPTR(arg), arglen, offset, options, regex.match, 0));
if (rc < 0)
{
return false;
}
if (rc == 0)
{
if (options == 0)
{
// All matches found
break;
}
// The anchored retry failed: store offset + 1 as the end of the
// "previous match" so the next iteration starts one character later
ovector[1] = offset + 1;
continue;
}
matched++;
total_matched++;
}
return true;
}
};
/**
   The "string match" subcommand: match every argument against the pattern
   given as the first positional argument.

   Options: -a / --all, -i / --ignore-case, -n / --index, -q / --quiet and
   -r / --regex (use the PCRE2 matcher instead of the wildcard matcher).

   \return BUILTIN_STRING_OK if anything matched, BUILTIN_STRING_NONE if
   nothing did, BUILTIN_STRING_ERROR on bad usage or a matcher failure
*/
static int string_match(parser_t &parser, int argc, wchar_t **argv)
{
    const wchar_t *short_options = L"ainqr";
    const struct woption long_options[] =
    {
        { L"all", no_argument, 0, 'a'},
        { L"ignore-case", no_argument, 0, 'i'},
        { L"index", no_argument, 0, 'n'},
        { L"quiet", no_argument, 0, 'q'},
        { L"regex", no_argument, 0, 'r'},
        { 0, 0, 0, 0 }
    };

    match_options_t opts;
    bool regex = false;
    wgetopter_t w;
    int opt;
    while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, 0)) != -1)
    {
        switch (opt)
        {
            case 'a': opts.all = true;         break;
            case 'i': opts.ignore_case = true; break;
            case 'n': opts.index = true;       break;
            case 'q': opts.quiet = true;       break;
            case 'r': regex = true;            break;

            case '?':
                string_unknown_option(parser, argv[0], argv[w.woptind - 1]);
                return BUILTIN_STRING_ERROR;

            case 0:
                break;
        }
    }

    int i = w.woptind;
    // The pattern always comes from the command line, never from stdin
    const wchar_t *pattern = string_get_arg_argv(&i, argv);
    if (pattern == 0)
    {
        string_error(STRING_ERR_MISSING, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    if (string_args_from_stdin() && argc > i)
    {
        string_error(BUILTIN_ERR_TOO_MANY_ARGUMENTS, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    string_matcher_t *matcher = 0;
    if (!regex)
    {
        matcher = new wildcard_matcher_t(argv[0], pattern, opts);
    }
    else
    {
        matcher = new pcre2_matcher_t(argv[0], pattern, opts);
    }

    // Stop at the first argument the matcher cannot process
    bool failed = false;
    const wchar_t *arg = 0;
    while (!failed && (arg = string_get_arg(&i, argv)) != 0)
    {
        failed = !matcher->report_matches(arg);
    }

    int rc = BUILTIN_STRING_ERROR;
    if (!failed)
    {
        rc = (matcher->match_count() > 0) ? BUILTIN_STRING_OK : BUILTIN_STRING_NONE;
    }
    delete matcher;
    return rc;
}
/** Flags collected from the command line of "string replace"; all start out false. */
struct replace_options_t
{
    bool all;          // -a / --all
    bool ignore_case;  // -i / --ignore-case
    bool quiet;        // -q / --quiet

    replace_options_t()
    {
        all = false;
        ignore_case = false;
        quiet = false;
    }
};
/**
   Common interface of the replacers used by "string replace". A replacer is
   fed one argument at a time and keeps a running total of replacements.
*/
class string_replacer_t
{
protected:
    const wchar_t *argv0;    // subcommand name, used as the prefix of error messages
    replace_options_t opts;  // options the replacer was created with
    int total_replaced;      // running replacement total over all arguments so far

public:
    string_replacer_t(const wchar_t *argv0_, const replace_options_t &opts_)
        : argv0(argv0_),
          opts(opts_),
          total_replaced(0)
    {
    }

    virtual ~string_replacer_t()
    {
    }

    /** Process one argument; implemented by subclasses. */
    virtual bool replace_matches(const wchar_t *arg) = 0;

    /** Number of replacements counted so far. */
    int replace_count()
    {
        return total_replaced;
    }
};
/**
   Replacer used by "string replace" without --regex: substitutes literal
   occurrences of a pattern, either the first one only or, with --all, every
   one, optionally ignoring case.
*/
class literal_replacer_t: public string_replacer_t
{
    const wchar_t *pattern;      // text to search for
    const wchar_t *replacement;  // text to put in its place
    int patlen;                  // length of pattern in characters

    /** Whether the pattern occurs at the very start of text. */
    bool matches_at(const wchar_t *text) const
    {
        const int diff = opts.ignore_case ? wcsncasecmp(text, pattern, patlen)
                                          : wcsncmp(text, pattern, patlen);
        return diff == 0;
    }

public:
    literal_replacer_t(const wchar_t *argv0, const wchar_t *pattern_, const wchar_t *replacement_,
                       const replace_options_t &opts)
        : string_replacer_t(argv0, opts),
          pattern(pattern_),
          replacement(replacement_),
          patlen(wcslen(pattern_))
    {
    }

    virtual ~literal_replacer_t()
    {
    }

    /**
       Build the result for arg and, unless --quiet was given, write it to
       stdout_buffer followed by a newline. An empty pattern leaves the
       argument unchanged. Always returns true.
    */
    bool replace_matches(const wchar_t *arg)
    {
        wcstring result;
        if (patlen != 0)
        {
            bool replaced_once = false;
            const wchar_t *cursor = arg;
            while (*cursor != L'\0')
            {
                // Without --all only the first occurrence is eligible
                const bool eligible = opts.all || !replaced_once;
                if (eligible && matches_at(cursor))
                {
                    result.append(replacement);
                    cursor += patlen;
                    replaced_once = true;
                    total_replaced++;
                    continue;
                }
                result.push_back(*cursor);
                cursor++;
            }
        }
        else
        {
            result.assign(arg);
        }

        if (!opts.quiet)
        {
            stdout_buffer.append(result);
            stdout_buffer.push_back(L'\n');
        }
        return true;
    }
};
class regex_replacer_t: public string_replacer_t
{
compiled_regex_t regex;
wcstring replacement;
wcstring interpret_escapes(const wchar_t *orig)
{
wcstring result;
while (*orig != L'\0')
{
if (*orig == L'\\')
{
orig += read_unquoted_escape(orig, &result, true, false);
}
else
{
result += *orig;
orig++;
}
}
return result;
}
public:
regex_replacer_t(const wchar_t *argv0, const wchar_t *pattern, const wchar_t *replacement_,
const replace_options_t &opts)
: string_replacer_t(argv0, opts),
regex(argv0, pattern, opts.ignore_case),
replacement(interpret_escapes(replacement_))
{ }
virtual ~regex_replacer_t() { }
bool replace_matches(const wchar_t *arg)
{
// A return value of true means all is well (even if no replacements
// were performed), false indicates an unrecoverable error.
if (regex.code == 0)
{
// pcre2_compile() failed
return false;
}
uint32_t options = opts.all ? PCRE2_SUBSTITUTE_GLOBAL : 0;
int arglen = wcslen(arg);
PCRE2_SIZE outlen = (arglen == 0) ? 16 : 2 * arglen;
wchar_t *output = (wchar_t *)malloc(sizeof(wchar_t) * outlen);
if (output == 0)
{
DIE_MEM();
}
int pcre2_rc = 0;
for (;;)
{
pcre2_rc = pcre2_substitute(
regex.code,
PCRE2_SPTR(arg),
arglen,
0, // start offset
options,
regex.match,
0, // match context
PCRE2_SPTR(replacement.c_str()),
PCRE2_ZERO_TERMINATED,
(PCRE2_UCHAR *)output,
&outlen);
if (pcre2_rc == PCRE2_ERROR_NOMEMORY)
{
if (outlen < MAX_REPLACE_SIZE)
{
outlen = std::min(2 * outlen, MAX_REPLACE_SIZE);
output = (wchar_t *)realloc(output, sizeof(wchar_t) * outlen);
if (output == 0)
{
DIE_MEM();
}
continue;
}
string_error(_(L"%ls: Replacement string too large\n"), argv0);
free(output);
return false;
}
break;
}
bool rc = true;
if (pcre2_rc < 0)
{
string_error(_(L"%ls: Regular expression substitute error: %ls\n"),
argv0, pcre2_strerror(pcre2_rc));
rc = false;
}
else
{
if (!opts.quiet)
{
stdout_buffer += output;
stdout_buffer += L'\n';
}
total_replaced += pcre2_rc;
}
free(output);
return rc;
}
};
/**
   The "string replace" subcommand: in every argument, replace occurrences of
   the pattern (first positional argument) with the replacement (second
   positional argument).

   Options: -a / --all, -i / --ignore-case, -q / --quiet and -r / --regex
   (use the PCRE2 replacer instead of the literal replacer).

   \return BUILTIN_STRING_OK if anything was replaced, BUILTIN_STRING_NONE if
   nothing was, BUILTIN_STRING_ERROR on bad usage or a replacer failure
*/
static int string_replace(parser_t &parser, int argc, wchar_t **argv)
{
    const wchar_t *short_options = L"aiqr";
    const struct woption long_options[] =
    {
        { L"all", no_argument, 0, 'a'},
        { L"ignore-case", no_argument, 0, 'i'},
        { L"quiet", no_argument, 0, 'q'},
        { L"regex", no_argument, 0, 'r'},
        { 0, 0, 0, 0 }
    };

    replace_options_t opts;
    bool regex = false;
    wgetopter_t w;
    int opt;
    while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, 0)) != -1)
    {
        switch (opt)
        {
            case 'a': opts.all = true;         break;
            case 'i': opts.ignore_case = true; break;
            case 'q': opts.quiet = true;       break;
            case 'r': regex = true;            break;

            case '?':
                string_unknown_option(parser, argv[0], argv[w.woptind - 1]);
                return BUILTIN_STRING_ERROR;

            case 0:
                break;
        }
    }

    int i = w.woptind;
    // Pattern and replacement always come from the command line, never from stdin
    const wchar_t *pattern = string_get_arg_argv(&i, argv);
    if (pattern == 0)
    {
        string_error(STRING_ERR_MISSING, argv[0]);
        return BUILTIN_STRING_ERROR;
    }
    const wchar_t *replacement = string_get_arg_argv(&i, argv);
    if (replacement == 0)
    {
        string_error(STRING_ERR_MISSING, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    if (string_args_from_stdin() && argc > i)
    {
        string_error(BUILTIN_ERR_TOO_MANY_ARGUMENTS, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    string_replacer_t *replacer = 0;
    if (!regex)
    {
        replacer = new literal_replacer_t(argv[0], pattern, replacement, opts);
    }
    else
    {
        replacer = new regex_replacer_t(argv[0], pattern, replacement, opts);
    }

    // Stop at the first argument the replacer cannot process
    bool failed = false;
    const wchar_t *arg = 0;
    while (!failed && (arg = string_get_arg(&i, argv)) != 0)
    {
        failed = !replacer->replace_matches(arg);
    }

    int rc = BUILTIN_STRING_ERROR;
    if (!failed)
    {
        rc = (replacer->replace_count() > 0) ? BUILTIN_STRING_OK : BUILTIN_STRING_NONE;
    }
    delete replacer;
    return rc;
}
/**
   The "string split" subcommand: split every argument on the separator given
   as the first positional argument and write the resulting fields to
   stdout_buffer, one per line. An empty separator splits into single
   characters.

   Options:
     -m N / --max N  perform at most N splits per argument
     -q / --quiet    suppress output
     -r / --right    split starting from the right end of each argument

   Fields are always written in the order of the arguments they came from,
   and left to right within an argument; --right only changes which
   separators are used when --max limits the number of splits.

   \return BUILTIN_STRING_OK if at least one split was performed,
   BUILTIN_STRING_NONE if none was, BUILTIN_STRING_ERROR on bad usage
*/
static int string_split(parser_t &parser, int argc, wchar_t **argv)
{
    const wchar_t *short_options = L":m:qr";
    const struct woption long_options[] =
    {
        { L"max", required_argument, 0, 'm'},
        { L"quiet", no_argument, 0, 'q'},
        { L"right", no_argument, 0, 'r'},
        { 0, 0, 0, 0 }
    };

    long max = LONG_MAX;
    bool quiet = false;
    bool right = false;
    wgetopter_t w;
    for (;;)
    {
        int c = w.wgetopt_long(argc, argv, short_options, long_options, 0);

        if (c == -1)
        {
            break;
        }
        switch (c)
        {
            case 0:
                break;

            case 'm':
            {
                errno = 0;
                wchar_t *endptr = 0;
                max = wcstol(w.woptarg, &endptr, 10);
                if (*endptr != L'\0' || errno != 0)
                {
                    string_error(BUILTIN_ERR_NOT_NUMBER, argv[0], w.woptarg);
                    return BUILTIN_STRING_ERROR;
                }
                break;
            }

            case 'q':
                quiet = true;
                break;

            case 'r':
                right = true;
                break;

            case ':':
                string_error(STRING_ERR_MISSING, argv[0]);
                return BUILTIN_STRING_ERROR;

            case '?':
                string_unknown_option(parser, argv[0], argv[w.woptind - 1]);
                return BUILTIN_STRING_ERROR;
        }
    }

    int i = w.woptind;
    // The separator always comes from the command line, never from stdin
    const wchar_t *sep;
    if ((sep = string_get_arg_argv(&i, argv)) == 0)
    {
        string_error(STRING_ERR_MISSING, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    if (string_args_from_stdin() && argc > i)
    {
        string_error(BUILTIN_ERR_TOO_MANY_ARGUMENTS, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    std::list<wcstring> splits;
    const long seplen = long(wcslen(sep));
    int nsplit = 0;
    const wchar_t *arg;
    while ((arg = string_get_arg(&i, argv)) != 0)
    {
        // Fields of this argument only. Splitting from the right builds the
        // list back to front, so it is assembled separately and appended to
        // the overall result afterwards; that keeps the arguments themselves
        // in their original order.
        std::list<wcstring> pieces;
        const long arglen = long(wcslen(arg));
        int nargsplit = 0;

        if (right)
        {
            if (seplen == 0)
            {
                // Split to individual characters, peeling them off the end.
                // `end` is the length of the part that is still unsplit.
                long end = arglen;
                while (end > 1 && nargsplit < max)
                {
                    pieces.push_front(wcstring(arg + end - 1, 1));
                    end--;
                    nargsplit++;
                    nsplit++;
                }
                pieces.push_front(wcstring(arg, end));
            }
            else
            {
                // Scan backwards for the separator. Offsets are used instead
                // of pointers so that nothing ever points before the start of
                // the argument.
                long end = arglen;
                long cur = end - seplen;
                while (cur >= 0 && nargsplit < max)
                {
                    if (wcsncmp(arg + cur, sep, seplen) == 0)
                    {
                        pieces.push_front(wcstring(arg + cur + seplen, end - cur - seplen));
                        end = cur;
                        cur -= seplen;
                        nargsplit++;
                        nsplit++;
                    }
                    else
                    {
                        cur--;
                    }
                }
                pieces.push_front(wcstring(arg, end));
            }
        }
        else
        {
            if (seplen == 0)
            {
                // Split to individual characters; the last character (or the
                // remainder once --max is reached) forms the final field
                long pos = 0;
                while (pos + 1 < arglen && nargsplit < max)
                {
                    pieces.push_back(wcstring(arg + pos, 1));
                    pos++;
                    nargsplit++;
                    nsplit++;
                }
                pieces.push_back(arg + pos);
            }
            else
            {
                const wchar_t *cur = arg;
                while (cur != 0)
                {
                    // Once --max splits were made, the rest is one field
                    const wchar_t *ptr = (nargsplit < max) ? wcsstr(cur, sep) : 0;
                    if (ptr == 0)
                    {
                        pieces.push_back(cur);
                        cur = 0;
                    }
                    else
                    {
                        pieces.push_back(wcstring(cur, ptr - cur));
                        cur = ptr + seplen;
                        nargsplit++;
                        nsplit++;
                    }
                }
            }
        }

        splits.splice(splits.end(), pieces);
    }

    if (!quiet)
    {
        for (std::list<wcstring>::const_iterator si = splits.begin(); si != splits.end(); ++si)
        {
            stdout_buffer += *si;
            stdout_buffer += L'\n';
        }
    }

    return (nsplit > 0) ? BUILTIN_STRING_OK : BUILTIN_STRING_NONE;
}
/**
   The "string sub" subcommand: write a substring of every argument to
   stdout_buffer, one per line.

   Options:
     -l N / --length N  number of characters to take (must not be negative);
                        by default everything up to the end is taken
     -q / --quiet       suppress output
     -s N / --start N   1-based start position; a negative value counts from
                        the end of the argument; 0 is rejected

   \return BUILTIN_STRING_OK if at least one argument was processed,
   BUILTIN_STRING_NONE if there were none, BUILTIN_STRING_ERROR on bad usage
*/
static int string_sub(parser_t &parser, int argc, wchar_t **argv)
{
    const wchar_t *short_options = L":l:qs:";
    const struct woption long_options[] =
    {
        { L"length", required_argument, 0, 'l'},
        { L"quiet", no_argument, 0, 'q'},
        { L"start", required_argument, 0, 's'},
        { 0, 0, 0, 0 }
    };

    int start = 0;
    int length = -1;
    bool quiet = false;
    wgetopter_t w;
    int opt;
    while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, 0)) != -1)
    {
        switch (opt)
        {
            case 'l':
            {
                wchar_t *endptr = 0;
                errno = 0;
                length = int(wcstol(w.woptarg, &endptr, 10));
                if (errno != 0 || *endptr != L'\0')
                {
                    string_error(BUILTIN_ERR_NOT_NUMBER, argv[0], w.woptarg);
                    return BUILTIN_STRING_ERROR;
                }
                if (length < 0)
                {
                    string_error(_(L"%ls: Invalid length value '%d'\n"), argv[0], length);
                    return BUILTIN_STRING_ERROR;
                }
                break;
            }

            case 's':
            {
                wchar_t *endptr = 0;
                errno = 0;
                start = int(wcstol(w.woptarg, &endptr, 10));
                if (errno != 0 || *endptr != L'\0')
                {
                    string_error(BUILTIN_ERR_NOT_NUMBER, argv[0], w.woptarg);
                    return BUILTIN_STRING_ERROR;
                }
                if (start == 0)
                {
                    string_error(_(L"%ls: Invalid start value '%d'\n"), argv[0], start);
                    return BUILTIN_STRING_ERROR;
                }
                break;
            }

            case 'q':
                quiet = true;
                break;

            case ':':
                string_error(STRING_ERR_MISSING, argv[0]);
                return BUILTIN_STRING_ERROR;

            case '?':
                string_unknown_option(parser, argv[0], argv[w.woptind - 1]);
                return BUILTIN_STRING_ERROR;

            case 0:
                break;
        }
    }

    int i = w.woptind;
    if (string_args_from_stdin() && argc > i)
    {
        string_error(BUILTIN_ERR_TOO_MANY_ARGUMENTS, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    int processed = 0;
    for (const wchar_t *arg = string_get_arg(&i, argv);
         arg != 0;
         arg = string_get_arg(&i, argv))
    {
        const wcstring text(arg);
        const wcstring::size_type len = text.length();

        // Translate the 1-based (or negative, from-the-end) start into a
        // 0-based offset that never lies beyond the end of the text
        wcstring::size_type pos = 0;
        if (start > 0)
        {
            pos = start - 1;
        }
        else if (start < 0)
        {
            const wcstring::size_type from_end = -start;
            pos = (from_end > len) ? 0 : len - from_end;
        }
        if (pos > len)
        {
            pos = len;
        }

        wcstring::size_type count = (length >= 0) ? wcstring::size_type(length) : wcstring::npos;
        if (pos + count > len)
        {
            count = wcstring::npos;
        }

        if (!quiet)
        {
            stdout_buffer.append(text.substr(pos, count));
            stdout_buffer.push_back(L'\n');
        }
        processed++;
    }

    if (processed > 0)
    {
        return BUILTIN_STRING_OK;
    }
    return BUILTIN_STRING_NONE;
}
/**
   The "string trim" subcommand: strip leading and/or trailing characters
   from every argument and write the result to stdout_buffer, one per line.

   Options:
     -c CHARS / --chars CHARS  set of characters to strip (by default space,
                               form feed, newline, carriage return and tab)
     -l / --left               strip at the start
     -q / --quiet              suppress output
     -r / --right              strip at the end
   When neither --left nor --right is given, both ends are stripped.

   \return BUILTIN_STRING_OK if at least one character was stripped,
   BUILTIN_STRING_NONE if none was, BUILTIN_STRING_ERROR on bad usage
*/
static int string_trim(parser_t &parser, int argc, wchar_t **argv)
{
    const wchar_t *short_options = L":c:lqr";
    const struct woption long_options[] =
    {
        { L"chars", required_argument, 0, 'c'},
        { L"left", no_argument, 0, 'l'},
        { L"quiet", no_argument, 0, 'q'},
        { L"right", no_argument, 0, 'r'},
        { 0, 0, 0, 0 }
    };

    bool left_requested = false;
    bool right_requested = false;
    bool quiet = false;
    wcstring chars = L" \f\n\r\t";
    wgetopter_t w;
    int opt;
    while ((opt = w.wgetopt_long(argc, argv, short_options, long_options, 0)) != -1)
    {
        switch (opt)
        {
            case 'c': chars = w.woptarg;      break;
            case 'l': left_requested = true;  break;
            case 'q': quiet = true;           break;
            case 'r': right_requested = true; break;

            case ':':
                string_error(STRING_ERR_MISSING, argv[0]);
                return BUILTIN_STRING_ERROR;

            case '?':
                string_unknown_option(parser, argv[0], argv[w.woptind - 1]);
                return BUILTIN_STRING_ERROR;

            case 0:
                break;
        }
    }

    int i = w.woptind;
    if (string_args_from_stdin() && argc > i)
    {
        string_error(BUILTIN_ERR_TOO_MANY_ARGUMENTS, argv[0]);
        return BUILTIN_STRING_ERROR;
    }

    // With no explicit side, both sides are trimmed
    const bool no_side_given = !left_requested && !right_requested;
    const bool trim_left = no_side_given || left_requested;
    const bool trim_right = no_side_given || right_requested;

    int ntrim = 0;
    for (const wchar_t *arg = string_get_arg(&i, argv);
         arg != 0;
         arg = string_get_arg(&i, argv))
    {
        const wchar_t *first = arg;
        const wchar_t *last = arg + wcslen(arg);  // one past the final kept character

        if (trim_left)
        {
            for (; first != last && chars.find(*first) != wcstring::npos; first++)
            {
                ntrim++;
            }
        }
        if (trim_right)
        {
            for (; first != last && chars.find(*(last - 1)) != wcstring::npos; last--)
            {
                ntrim++;
            }
        }

        if (!quiet)
        {
            stdout_buffer.append(first, last - first);
            stdout_buffer.push_back(L'\n');
        }
    }

    if (ntrim > 0)
    {
        return BUILTIN_STRING_OK;
    }
    return BUILTIN_STRING_NONE;
}
// Dispatch table mapping each subcommand name to the function implementing it.
static const struct string_subcommand
{
// Subcommand name as typed after "string"
const wchar_t *name;
// Implementation; receives the arguments with argv[0] being the subcommand name
int (*handler)(parser_t &, int argc, wchar_t **argv);
}
string_subcommands[] =
{
{ L"escape", &string_escape },
{ L"join", &string_join },
{ L"length", &string_length },
{ L"match", &string_match },
{ L"replace", &string_replace },
{ L"split", &string_split },
{ L"sub", &string_sub },
{ L"trim", &string_trim },
// Terminator: builtin_string() stops its lookup at the entry with a null name
{ 0, 0 }
};
/**
   The string builtin, for manipulating strings.

   Looks up argv[1] in string_subcommands and hands the remaining arguments
   to the matching handler, so that the handler sees the subcommand name as
   its argv[0]. A missing or unknown subcommand is an error; -h / --help as
   the first argument prints the help text and succeeds.
*/
/*static*/ int builtin_string(parser_t &parser, wchar_t **argv)
{
    const int argc = builtin_count_args(argv);
    if (argc <= 1)
    {
        string_error(STRING_ERR_MISSING, argv[0]);
        builtin_print_help(parser, L"string", stderr_buffer);
        return BUILTIN_STRING_ERROR;
    }

    const wchar_t *const name = argv[1];
    if (wcscmp(name, L"-h") == 0 || wcscmp(name, L"--help") == 0)
    {
        builtin_print_help(parser, L"string", stderr_buffer);
        return BUILTIN_STRING_OK;
    }

    // Linear search; ends either on the match or on the terminating entry
    const string_subcommand *subcmd = string_subcommands;
    for (; subcmd->name != 0; ++subcmd)
    {
        if (wcscmp(subcmd->name, name) == 0)
        {
            break;
        }
    }

    if (subcmd->handler == 0)
    {
        string_error(_(L"%ls: Unknown subcommand '%ls'\n"), argv[0], argv[1]);
        builtin_print_help(parser, L"string", stderr_buffer);
        return BUILTIN_STRING_ERROR;
    }

    // Drop the leading "string" so the subcommand name becomes argv[0]
    return subcmd->handler(parser, argc - 1, argv + 1);
}