From 59e50f77bcca46e7b147dedd83137daaf9afde96 Mon Sep 17 00:00:00 2001 From: Andrey Mishchenko Date: Sun, 13 Mar 2022 03:23:35 -0700 Subject: [PATCH] Allow underscores as separators in the `math` builtin (#8611) * Implement fish_wcstod_underscores * Add fish_wcstod_underscores unit tests * Switch to using fish_wcstod_underscores in tinyexpr * Add tests for math builtin underscore separator functionality * Add documentation for underscore separators for math builtin * Add a changelog entry for underscore numeric separators --- CHANGELOG.rst | 1 + doc_src/cmds/math.rst | 2 ++ src/fish_tests.cpp | 44 +++++++++++++++++++++++++++++++ src/tinyexpr.cpp | 2 +- src/wutil.cpp | 59 ++++++++++++++++++++++++++++++++++++++++++ src/wutil.h | 1 + tests/checks/math.fish | 21 +++++++++++++++ 7 files changed, 129 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e54d8c7e4..da9f07bea 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -69,6 +69,7 @@ Scripting improvements - ``$fish_user_paths`` is now automatically deduplicated to fix a common user error of appending to it in config.fish when it is universal (:issue:`8117`). :ref:`fish_add_path ` remains the recommended way to add to $PATH. - ``return`` can now be used outside functions. In scripts, it does the same thing as ``exit``. In interactive mode,it sets ``$status`` without exiting (:issue:`8148`). - An oversight prevented all syntax checks from running on commands given to ``fish -c`` (:issue:`8171`). This includes checks such as ``exec`` not being allowed in a pipeline, and ``$$`` not being a valid variable. Generally, another error was generated anyway. +- ``math`` now allows you to use underscores as visual separators when writing numbers - for example, ``math 1_000 + 2_000`` returns ``3000`` (:issue:`8496`). - ``fish_indent`` now correctly reformats tokens that end with a backslash followed by a newline (:issue:`8197`). - ``commandline`` gained an ``--is-valid`` option to check if the command line is syntactically valid and complete. This allows basic implementation of transient prompts (:issue:`8142`). - ``commandline`` gained a ``--paging-full-mode`` option to check if the pager is showing all the possible lines (no "7 more rows" message) (:issue:`8485`). diff --git a/doc_src/cmds/math.rst b/doc_src/cmds/math.rst index 1aac3eada..8c73298e7 100644 --- a/doc_src/cmds/math.rst +++ b/doc_src/cmds/math.rst @@ -64,6 +64,8 @@ Syntax For numbers, ``.`` is always the radix character regardless of locale - ``2.5``, not ``2,5``. Scientific notation (``10e5``) and hexadecimal (``0xFF``) are also available. +``math`` allows you to use underscores as visual separators for digit grouping. For example, you can write ``1_000_000``, ``0x_89_AB_CD_EF``, and ``1.234_567_e89``. + Operators --------- diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 94221bf12..3f005e056 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -3000,6 +3000,49 @@ static void test_wcstod() { tod_test(L"nope", "nope"); } +static void test_fish_wcstod_underscores() { + say(L"Testing fish_wcstod_underscores"); + + auto test_case = [](const wchar_t *s, size_t expected_num_consumed) { + wchar_t *endptr = nullptr; + fish_wcstod_underscores(s, &endptr); + size_t num_consumed = (size_t)(endptr - (wchar_t *)s); + do_test(expected_num_consumed == num_consumed); + }; + + test_case(L"123", 3); + test_case(L"1_2.3_4.5_6", 7); + test_case(L"1_2", 3); + test_case(L"1_._2", 5); + test_case(L"1__2", 4); + test_case(L" 1__2 3__4 ", 5); + test_case(L"1_2 3_4", 3); + test_case(L" 1", 2); + test_case(L" 1_", 3); + test_case(L" 1__", 4); + test_case(L" 1___", 5); + test_case(L" 1___ 2___", 5); + test_case(L" _1", 3); + test_case(L"1 ", 1); + test_case(L"infinity_", 8); + test_case(L" -INFINITY", 10); + test_case(L"_infinity", 0); + test_case(L"nan(0)", 6); + test_case(L"nan(0)_", 6); + test_case(L"_nan(0)", 0); + // We don't strip the underscores in this commented-out test case, and the behavior is + // implementation-defined, so we don't actually know how many characters will get consumed. On + // macOS the strtod man page only says what happens with an alphanumeric string passed to nan(), + // but the strtod consumes all of the characters even if there are underscores. + // test_case(L"nan(0_1_2)", 3); + test_case(L" _ 1", 0); + test_case(L"0x_dead_beef", 12); + test_case(L"None", 0); + test_case(L" None", 0); + test_case(L"Also none", 0); + test_case(L" Also none", 0); +} + static void test_dup2s() { using std::make_shared; io_chain_t chain; @@ -6801,6 +6844,7 @@ static const test_t s_tests[]{ {TEST_GROUP("abbreviations"), test_abbreviations}, {TEST_GROUP("builtins/test"), test_test}, {TEST_GROUP("wcstod"), test_wcstod}, + {TEST_GROUP("fish_wcstod_underscores"), test_fish_wcstod_underscores}, {TEST_GROUP("dup2s"), test_dup2s}, {TEST_GROUP("dup2s"), test_dup2s_fd_for_target_fd}, {TEST_GROUP("path"), test_path}, diff --git a/src/tinyexpr.cpp b/src/tinyexpr.cpp index e7d4449fd..d2bf295f2 100644 --- a/src/tinyexpr.cpp +++ b/src/tinyexpr.cpp @@ -272,7 +272,7 @@ static void next_token(state *s) { /* Try reading a number. */ if ((s->next[0] >= '0' && s->next[0] <= '9') || s->next[0] == '.') { - s->value = fish_wcstod(s->next, const_cast(&s->next)); + s->value = fish_wcstod_underscores(s->next, const_cast(&s->next)); s->type = TOK_NUMBER; } else { /* Look for a function call. */ diff --git a/src/wutil.cpp b/src/wutil.cpp index 55c8affef..ccacb24c4 100644 --- a/src/wutil.cpp +++ b/src/wutil.cpp @@ -18,9 +18,11 @@ #include #include +#include #include #include #include +#include #include #include @@ -711,6 +713,63 @@ double fish_wcstod(const wchar_t *str, wchar_t **endptr) { return std::wcstod(str, endptr); } +/// Like wcstod(), but allows underscore separators. Leading, trailing, and multiple underscores are +/// allowed, as are underscores next to decimal (.), exponent (E/e/P/p), and hexadecimal (X/x) +/// delimiters. This consumes trailing underscores -- endptr will point past the last underscore +/// which is legal to include in a parse (according to the above rules). Free-floating leading +/// underscores ("_ 3") are not allowed and will result in a no-parse. Underscores are not allowed +/// before or inside of "infinity" or "nan" input. Trailing underscores after "infinity" or "nan" +/// are not consumed. +double fish_wcstod_underscores(const wchar_t *str, wchar_t **endptr) { + const wchar_t *orig = str; + while (iswspace(*str)) str++; // Skip leading whitespace. + size_t leading_whitespace = size_t(str - orig); + auto is_sign = [](wchar_t c) { return c == L'+' || c == L'-'; }; + auto is_inf_or_nan_char = [](wchar_t c) { + return c == L'i' || c == L'I' || c == L'n' || c == L'N'; + }; + // We don't do any underscore-stripping for infinity/NaN. + if (is_inf_or_nan_char(*str) || (is_sign(*str) && is_inf_or_nan_char(*(str + 1)))) { + return fish_wcstod(orig, endptr); + } + // We build a string to pass to the system wcstod, pruned of underscores. We will take all + // leading alphanumeric characters that can appear in a strtod numeric literal, dots (.), and + // signs (+/-). In order to be more clever, for example to stop earlier in the case of strings + // like "123xxxxx", we would need to do a full parse, because sometimes 'a' is a hex digit and + // sometimes it is the end of the parse, sometimes a dot '.' is a decimal delimiter and + // sometimes it is the end of the valid parse, as in "1_2.3_4.5_6", etc. + wcstring pruned; + // We keep track of the positions *in the pruned string* where there used to be underscores. We + // will pass the pruned version of the input string to the system wcstod, which in turn will + // tell us how many characters it consumed. Then we will set our own endptr based on (1) the + // number of characters consumed from the pruned string, and (2) how many underscores came + // before the last consumed character. The alternative to doing it this way (for example, "only + // deleting the correct underscores") would require actually parsing the input string, so that + // we can know when to stop grabbing characters and dropping underscores, as in "1_2.3_4.5_6". + std::vector underscores; + // If we wanted to future-proof against a strtod from the future that, say, allows octal + // literals using 0o, etc., we could just use iswalnum, instead of iswxdigit and P/p/X/x checks. + while (iswxdigit(*str) || *str == L'P' || *str == L'p' || *str == L'X' || *str == L'x' || + is_sign(*str) || *str == L'.' || *str == L'_') { + if (*str == L'_') underscores.push_back(pruned.length()); + else pruned.push_back(*str); + str++; + } + const wchar_t *pruned_begin = pruned.c_str(); + const wchar_t *pruned_end = nullptr; + double result = fish_wcstod(pruned_begin, (wchar_t **)(&pruned_end)); + if (pruned_end == pruned_begin) { + if (endptr) *endptr = (wchar_t *)orig; + return result; + } + auto consumed_underscores_end = std::upper_bound(underscores.begin(), underscores.end(), + size_t(pruned_end - pruned_begin)); + size_t num_underscores_consumed = std::distance(underscores.begin(), consumed_underscores_end); + if (endptr) *endptr = (wchar_t *)(orig + leading_whitespace + (pruned_end - pruned_begin) + + num_underscores_consumed); + return result; +} + file_id_t file_id_t::from_stat(const struct stat &buf) { file_id_t result = {}; result.device = buf.st_dev; diff --git a/src/wutil.h b/src/wutil.h index 139f5c8aa..dcb7f8345 100644 --- a/src/wutil.h +++ b/src/wutil.h @@ -133,6 +133,7 @@ long long fish_wcstoll(const wchar_t *str, const wchar_t **endptr = nullptr, int unsigned long long fish_wcstoull(const wchar_t *str, const wchar_t **endptr = nullptr, int base = 10); double fish_wcstod(const wchar_t *str, wchar_t **endptr); +double fish_wcstod_underscores(const wchar_t *str, wchar_t **endptr); /// Class for representing a file's inode. We use this to detect and avoid symlink loops, among /// other things. While an inode / dev pair is sufficient to distinguish co-existing files, Linux diff --git a/tests/checks/math.fish b/tests/checks/math.fish index ae3ba8680..455d0fac0 100644 --- a/tests/checks/math.fish +++ b/tests/checks/math.fish @@ -261,3 +261,24 @@ math pow 2 x cos'(-pi)', 2 math 'ncr(0/0, 1)' # CHECKERR: math: Error: Result is infinite # CHECKERR: 'ncr(0/0, 1)' + +math 0_1 +# CHECK: 1 +math 0x0_A +# CHECK: 10 +math 1_000 + 2_000 +# CHECK: 3000 +math 1_0_0_0 +# CHECK: 1000 +math 0_0.5_0 + 0_1.0_0 +# CHECK: 1.5 +math 2e0_0_2 +# CHECK: 200 +math -0_0.5_0_0E0_0_3 +# CHECK: -500 +math 20e-0_1 +# CHECK: 2 +math 0x0_2.0_0_0P0_2 +# CHECK: 8 +math -0x8p-0_3 +# CHECK: -1