From 59e50f77bcca46e7b147dedd83137daaf9afde96 Mon Sep 17 00:00:00 2001
From: Andrey Mishchenko <mishchea@gmail.com>
Date: Sun, 13 Mar 2022 03:23:35 -0700
Subject: [PATCH] Allow underscores as separators in the `math` builtin (#8611)

* Implement fish_wcstod_underscores

* Add fish_wcstod_underscores unit tests

* Switch to using fish_wcstod_underscores in tinyexpr

* Add tests for math builtin underscore separator functionality

* Add documentation for underscore separators for math builtin

* Add a changelog entry for underscore numeric separators
---
 CHANGELOG.rst          |  1 +
 doc_src/cmds/math.rst  |  2 ++
 src/fish_tests.cpp     | 44 +++++++++++++++++++++++++++++++
 src/tinyexpr.cpp       |  2 +-
 src/wutil.cpp          | 59 ++++++++++++++++++++++++++++++++++++++++++
 src/wutil.h            |  1 +
 tests/checks/math.fish | 21 +++++++++++++++
 7 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e54d8c7e4..da9f07bea 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -69,6 +69,7 @@ Scripting improvements
 - ``$fish_user_paths`` is now automatically deduplicated to fix a common user error of appending to it in config.fish when it is universal (:issue:`8117`). :ref:`fish_add_path <cmd-fish_add_path>` remains the recommended way to add to $PATH.
 - ``return`` can now be used outside functions. In scripts, it does the same thing as ``exit``. In interactive mode,it sets ``$status`` without exiting (:issue:`8148`).
 - An oversight prevented all syntax checks from running on commands given to ``fish -c`` (:issue:`8171`). This includes checks such as ``exec`` not being allowed in a pipeline, and ``$$`` not being a valid variable. Generally, another error was generated anyway.
+- ``math`` now allows you to use underscores as visual separators when writing numbers - for example, ``math 1_000 + 2_000`` returns ``3000`` (:issue:`8496`).
 - ``fish_indent`` now correctly reformats tokens that end with a backslash followed by a newline (:issue:`8197`).
 - ``commandline`` gained an ``--is-valid`` option to check if the command line is syntactically valid and complete. This allows basic implementation of transient prompts (:issue:`8142`).
 - ``commandline`` gained a ``--paging-full-mode`` option to check if the pager is showing all the possible lines (no "7 more rows" message) (:issue:`8485`).
diff --git a/doc_src/cmds/math.rst b/doc_src/cmds/math.rst
index 1aac3eada..8c73298e7 100644
--- a/doc_src/cmds/math.rst
+++ b/doc_src/cmds/math.rst
@@ -64,6 +64,8 @@ Syntax
 For numbers, ``.`` is always the radix character regardless of locale - ``2.5``, not ``2,5``.
 Scientific notation (``10e5``) and hexadecimal (``0xFF``) are also available.
 
+``math`` allows you to use underscores as visual separators for digit grouping. For example, you can write ``1_000_000``, ``0x_89_AB_CD_EF``, and ``1.234_567_e89``.
+
 Operators
 ---------
 
diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp
index 94221bf12..3f005e056 100644
--- a/src/fish_tests.cpp
+++ b/src/fish_tests.cpp
@@ -3000,6 +3000,49 @@ static void test_wcstod() {
     tod_test(L"nope", "nope");
 }
 
+static void test_fish_wcstod_underscores() {
+    say(L"Testing fish_wcstod_underscores");
+
+    auto test_case = [](const wchar_t *s, size_t expected_num_consumed) {
+        wchar_t *endptr = nullptr;
+        fish_wcstod_underscores(s, &endptr);
+        size_t num_consumed = (size_t)(endptr - (wchar_t *)s);
+        do_test(expected_num_consumed == num_consumed);
+    };
+
+    test_case(L"123", 3);
+    test_case(L"1_2.3_4.5_6", 7);
+    test_case(L"1_2", 3);
+    test_case(L"1_._2", 5);
+    test_case(L"1__2", 4);
+    test_case(L" 1__2 3__4 ", 5);
+    test_case(L"1_2 3_4", 3);
+    test_case(L" 1", 2);
+    test_case(L" 1_", 3);
+    test_case(L" 1__", 4);
+    test_case(L" 1___", 5);
+    test_case(L" 1___ 2___", 5);
+    test_case(L" _1", 3);
+    test_case(L"1 ", 1);
+    test_case(L"infinity_", 8);
+    test_case(L" -INFINITY", 10);
+    test_case(L"_infinity", 0);
+    test_case(L"nan(0)", 6);
+    test_case(L"nan(0)_", 6);
+    test_case(L"_nan(0)", 0);
+    // We don't strip the underscores in this commented-out test case, and the behavior is
+    // implementation-defined, so we don't actually know how many characters will get consumed. On
+    // macOS the strtod man page only says what happens with an alphanumeric string passed to nan(),
+    // but the strtod consumes all of the characters even if there are underscores.
+    // test_case(L"nan(0_1_2)", 3);
+    test_case(L" _ 1", 0);
+    test_case(L"0x_dead_beef", 12);
+    test_case(L"None", 0);
+    test_case(L" None", 0);
+    test_case(L"Also none", 0);
+    test_case(L" Also none", 0);
+}
+
 static void test_dup2s() {
     using std::make_shared;
     io_chain_t chain;
@@ -6801,6 +6844,7 @@ static const test_t s_tests[]{
     {TEST_GROUP("abbreviations"), test_abbreviations},
     {TEST_GROUP("builtins/test"), test_test},
     {TEST_GROUP("wcstod"), test_wcstod},
+    {TEST_GROUP("fish_wcstod_underscores"), test_fish_wcstod_underscores},
     {TEST_GROUP("dup2s"), test_dup2s},
     {TEST_GROUP("dup2s"), test_dup2s_fd_for_target_fd},
     {TEST_GROUP("path"), test_path},
diff --git a/src/tinyexpr.cpp b/src/tinyexpr.cpp
index e7d4449fd..d2bf295f2 100644
--- a/src/tinyexpr.cpp
+++ b/src/tinyexpr.cpp
@@ -272,7 +272,7 @@ static void next_token(state *s) {
 
         /* Try reading a number. */
         if ((s->next[0] >= '0' && s->next[0] <= '9') || s->next[0] == '.') {
-            s->value = fish_wcstod(s->next, const_cast<wchar_t **>(&s->next));
+            s->value = fish_wcstod_underscores(s->next, const_cast<wchar_t **>(&s->next));
             s->type = TOK_NUMBER;
         } else {
             /* Look for a function call. */
diff --git a/src/wutil.cpp b/src/wutil.cpp
index 55c8affef..ccacb24c4 100644
--- a/src/wutil.cpp
+++ b/src/wutil.cpp
@@ -18,9 +18,11 @@
 #include <unistd.h>
 #include <wctype.h>
 
+#include <algorithm>
 #include <atomic>
 #include <cstring>
 #include <cwchar>
+#include <iterator>
 #include <string>
 #include <unordered_map>
 
@@ -711,6 +713,63 @@ double fish_wcstod(const wchar_t *str, wchar_t **endptr) {
     return std::wcstod(str, endptr);
 }
 
+/// Like wcstod(), but allows underscore separators. Leading, trailing, and multiple underscores are
+/// allowed, as are underscores next to decimal (.), exponent (E/e/P/p), and hexadecimal (X/x)
+/// delimiters. This consumes trailing underscores -- endptr will point past the last underscore
+/// which is legal to include in a parse (according to the above rules). Free-floating leading
+/// underscores ("_ 3") are not allowed and will result in a no-parse. Underscores are not allowed
+/// before or inside of "infinity" or "nan" input. Trailing underscores after "infinity" or "nan"
+/// are not consumed.
+double fish_wcstod_underscores(const wchar_t *str, wchar_t **endptr) {
+    const wchar_t *orig = str;
+    while (iswspace(*str)) str++; // Skip leading whitespace.
+    size_t leading_whitespace = size_t(str - orig);
+    auto is_sign = [](wchar_t c) { return c == L'+' || c == L'-'; };
+    auto is_inf_or_nan_char = [](wchar_t c) {
+        return c == L'i' || c == L'I' || c == L'n' || c == L'N';
+    };
+    // We don't do any underscore-stripping for infinity/NaN.
+    if (is_inf_or_nan_char(*str) || (is_sign(*str) && is_inf_or_nan_char(*(str + 1)))) {
+        return fish_wcstod(orig, endptr);
+    }
+    // We build a string to pass to the system wcstod, pruned of underscores. We will take all
+    // leading alphanumeric characters that can appear in a strtod numeric literal, dots (.), and
+    // signs (+/-). In order to be more clever, for example to stop earlier in the case of strings
+    // like "123xxxxx", we would need to do a full parse, because sometimes 'a' is a hex digit and
+    // sometimes it is the end of the parse, sometimes a dot '.' is a decimal delimiter and
+    // sometimes it is the end of the valid parse, as in "1_2.3_4.5_6", etc.
+    wcstring pruned;
+    // We keep track of the positions *in the pruned string* where there used to be underscores. We
+    // will pass the pruned version of the input string to the system wcstod, which in turn will
+    // tell us how many characters it consumed. Then we will set our own endptr based on (1) the
+    // number of characters consumed from the pruned string, and (2) how many underscores came
+    // before the last consumed character. The alternative to doing it this way (for example, "only
+    // deleting the correct underscores") would require actually parsing the input string, so that
+    // we can know when to stop grabbing characters and dropping underscores, as in "1_2.3_4.5_6".
+    std::vector<size_t> underscores;
+    // If we wanted to future-proof against a strtod from the future that, say, allows octal
+    // literals using 0o, etc., we could just use iswalnum, instead of iswxdigit and P/p/X/x checks.
+    while (iswxdigit(*str) || *str == L'P' || *str == L'p' || *str == L'X' || *str == L'x' ||
+           is_sign(*str) || *str == L'.' || *str == L'_') {
+        if (*str == L'_') underscores.push_back(pruned.length());
+        else pruned.push_back(*str);
+        str++;
+    }
+    const wchar_t *pruned_begin = pruned.c_str();
+    const wchar_t *pruned_end = nullptr;
+    double result = fish_wcstod(pruned_begin, (wchar_t **)(&pruned_end));
+    if (pruned_end == pruned_begin) {
+        if (endptr) *endptr = (wchar_t *)orig;
+        return result;
+    }
+    auto consumed_underscores_end = std::upper_bound(underscores.begin(), underscores.end(),
+                                                     size_t(pruned_end - pruned_begin));
+    size_t num_underscores_consumed = std::distance(underscores.begin(), consumed_underscores_end);
+    if (endptr) *endptr = (wchar_t *)(orig + leading_whitespace + (pruned_end - pruned_begin)
+                                    + num_underscores_consumed);
+    return result;
+}
+
 file_id_t file_id_t::from_stat(const struct stat &buf) {
     file_id_t result = {};
     result.device = buf.st_dev;
diff --git a/src/wutil.h b/src/wutil.h
index 139f5c8aa..dcb7f8345 100644
--- a/src/wutil.h
+++ b/src/wutil.h
@@ -133,6 +133,7 @@ long long fish_wcstoll(const wchar_t *str, const wchar_t **endptr = nullptr, int
 unsigned long long fish_wcstoull(const wchar_t *str, const wchar_t **endptr = nullptr,
                                  int base = 10);
 double fish_wcstod(const wchar_t *str, wchar_t **endptr);
+double fish_wcstod_underscores(const wchar_t *str, wchar_t **endptr);
 
 /// Class for representing a file's inode. We use this to detect and avoid symlink loops, among
 /// other things. While an inode / dev pair is sufficient to distinguish co-existing files, Linux
diff --git a/tests/checks/math.fish b/tests/checks/math.fish
index ae3ba8680..455d0fac0 100644
--- a/tests/checks/math.fish
+++ b/tests/checks/math.fish
@@ -261,3 +261,24 @@ math pow 2 x cos'(-pi)', 2
 math 'ncr(0/0, 1)'
 # CHECKERR: math: Error: Result is infinite
 # CHECKERR: 'ncr(0/0, 1)'
+
+math 0_1
+# CHECK: 1
+math 0x0_A
+# CHECK: 10
+math 1_000 + 2_000
+# CHECK: 3000
+math 1_0_0_0
+# CHECK: 1000
+math 0_0.5_0 + 0_1.0_0
+# CHECK: 1.5
+math 2e0_0_2
+# CHECK: 200
+math -0_0.5_0_0E0_0_3
+# CHECK: -500
+math 20e-0_1
+# CHECK: 2
+math 0x0_2.0_0_0P0_2
+# CHECK: 8
+math -0x8p-0_3
+# CHECK: -1