Allow underscores as separators in the math builtin (#8611)

* Implement fish_wcstod_underscores

* Add fish_wcstod_underscores unit tests

* Switch to using fish_wcstod_underscores in tinyexpr

* Add tests for math builtin underscore separator functionality

* Add documentation for underscore separators for math builtin

* Add a changelog entry for underscore numeric separators
This commit is contained in:
Andrey Mishchenko 2022-03-13 03:23:35 -07:00 committed by GitHub
parent 9ada7d9aad
commit 59e50f77bc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 129 additions and 1 deletions

View file

@ -69,6 +69,7 @@ Scripting improvements
- ``$fish_user_paths`` is now automatically deduplicated to fix a common user error of appending to it in config.fish when it is universal (:issue:`8117`). :ref:`fish_add_path <cmd-fish_add_path>` remains the recommended way to add to $PATH.
- ``return`` can now be used outside functions. In scripts, it does the same thing as ``exit``. In interactive mode,it sets ``$status`` without exiting (:issue:`8148`).
- An oversight prevented all syntax checks from running on commands given to ``fish -c`` (:issue:`8171`). This includes checks such as ``exec`` not being allowed in a pipeline, and ``$$`` not being a valid variable. Generally, another error was generated anyway.
- ``math`` now allows you to use underscores as visual separators when writing numbers - for example, ``math 1_000 + 2_000`` returns ``3000`` (:issue:`8496`).
- ``fish_indent`` now correctly reformats tokens that end with a backslash followed by a newline (:issue:`8197`).
- ``commandline`` gained an ``--is-valid`` option to check if the command line is syntactically valid and complete. This allows basic implementation of transient prompts (:issue:`8142`).
- ``commandline`` gained a ``--paging-full-mode`` option to check if the pager is showing all the possible lines (no "7 more rows" message) (:issue:`8485`).

View file

@ -64,6 +64,8 @@ Syntax
For numbers, ``.`` is always the radix character regardless of locale - ``2.5``, not ``2,5``.
Scientific notation (``10e5``) and hexadecimal (``0xFF``) are also available.
``math`` allows you to use underscores as visual separators for digit grouping. For example, you can write ``1_000_000``, ``0x_89_AB_CD_EF``, and ``1.234_567_e89``.
Operators
---------

View file

@ -3000,6 +3000,49 @@ static void test_wcstod() {
tod_test(L"nope", "nope");
}
static void test_fish_wcstod_underscores() {
say(L"Testing fish_wcstod_underscores");
auto test_case = [](const wchar_t *s, size_t expected_num_consumed) {
wchar_t *endptr = nullptr;
fish_wcstod_underscores(s, &endptr);
size_t num_consumed = (size_t)(endptr - (wchar_t *)s);
do_test(expected_num_consumed == num_consumed);
};
test_case(L"123", 3);
test_case(L"1_2.3_4.5_6", 7);
test_case(L"1_2", 3);
test_case(L"1_._2", 5);
test_case(L"1__2", 4);
test_case(L" 1__2 3__4 ", 5);
test_case(L"1_2 3_4", 3);
test_case(L" 1", 2);
test_case(L" 1_", 3);
test_case(L" 1__", 4);
test_case(L" 1___", 5);
test_case(L" 1___ 2___", 5);
test_case(L" _1", 3);
test_case(L"1 ", 1);
test_case(L"infinity_", 8);
test_case(L" -INFINITY", 10);
test_case(L"_infinity", 0);
test_case(L"nan(0)", 6);
test_case(L"nan(0)_", 6);
test_case(L"_nan(0)", 0);
// We don't strip the underscores in this commented-out test case, and the behavior is
// implementation-defined, so we don't actually know how many characters will get consumed. On
// macOS the strtod man page only says what happens with an alphanumeric string passed to nan(),
// but the strtod consumes all of the characters even if there are underscores.
// test_case(L"nan(0_1_2)", 3);
test_case(L" _ 1", 0);
test_case(L"0x_dead_beef", 12);
test_case(L"None", 0);
test_case(L" None", 0);
test_case(L"Also none", 0);
test_case(L" Also none", 0);
}
static void test_dup2s() {
using std::make_shared;
io_chain_t chain;
@ -6801,6 +6844,7 @@ static const test_t s_tests[]{
{TEST_GROUP("abbreviations"), test_abbreviations},
{TEST_GROUP("builtins/test"), test_test},
{TEST_GROUP("wcstod"), test_wcstod},
{TEST_GROUP("fish_wcstod_underscores"), test_fish_wcstod_underscores},
{TEST_GROUP("dup2s"), test_dup2s},
{TEST_GROUP("dup2s"), test_dup2s_fd_for_target_fd},
{TEST_GROUP("path"), test_path},

View file

@ -272,7 +272,7 @@ static void next_token(state *s) {
/* Try reading a number. */
if ((s->next[0] >= '0' && s->next[0] <= '9') || s->next[0] == '.') {
s->value = fish_wcstod(s->next, const_cast<wchar_t **>(&s->next));
s->value = fish_wcstod_underscores(s->next, const_cast<wchar_t **>(&s->next));
s->type = TOK_NUMBER;
} else {
/* Look for a function call. */

View file

@ -18,9 +18,11 @@
#include <unistd.h>
#include <wctype.h>
#include <algorithm>
#include <atomic>
#include <cstring>
#include <cwchar>
#include <iterator>
#include <string>
#include <unordered_map>
@ -711,6 +713,63 @@ double fish_wcstod(const wchar_t *str, wchar_t **endptr) {
return std::wcstod(str, endptr);
}
/// Like wcstod(), but allows underscore separators. Leading, trailing, and multiple underscores are
/// allowed, as are underscores next to decimal (.), exponent (E/e/P/p), and hexadecimal (X/x)
/// delimiters. This consumes trailing underscores -- endptr will point past the last underscore
/// which is legal to include in a parse (according to the above rules). Free-floating leading
/// underscores ("_ 3") are not allowed and will result in a no-parse. Underscores are not allowed
/// before or inside of "infinity" or "nan" input. Trailing underscores after "infinity" or "nan"
/// are not consumed.
double fish_wcstod_underscores(const wchar_t *str, wchar_t **endptr) {
const wchar_t *orig = str;
while (iswspace(*str)) str++; // Skip leading whitespace.
size_t leading_whitespace = size_t(str - orig);
auto is_sign = [](wchar_t c) { return c == L'+' || c == L'-'; };
auto is_inf_or_nan_char = [](wchar_t c) {
return c == L'i' || c == L'I' || c == L'n' || c == L'N';
};
// We don't do any underscore-stripping for infinity/NaN.
if (is_inf_or_nan_char(*str) || (is_sign(*str) && is_inf_or_nan_char(*(str + 1)))) {
return fish_wcstod(orig, endptr);
}
// We build a string to pass to the system wcstod, pruned of underscores. We will take all
// leading alphanumeric characters that can appear in a strtod numeric literal, dots (.), and
// signs (+/-). In order to be more clever, for example to stop earlier in the case of strings
// like "123xxxxx", we would need to do a full parse, because sometimes 'a' is a hex digit and
// sometimes it is the end of the parse, sometimes a dot '.' is a decimal delimiter and
// sometimes it is the end of the valid parse, as in "1_2.3_4.5_6", etc.
wcstring pruned;
// We keep track of the positions *in the pruned string* where there used to be underscores. We
// will pass the pruned version of the input string to the system wcstod, which in turn will
// tell us how many characters it consumed. Then we will set our own endptr based on (1) the
// number of characters consumed from the pruned string, and (2) how many underscores came
// before the last consumed character. The alternative to doing it this way (for example, "only
// deleting the correct underscores") would require actually parsing the input string, so that
// we can know when to stop grabbing characters and dropping underscores, as in "1_2.3_4.5_6".
std::vector<size_t> underscores;
// If we wanted to future-proof against a strtod from the future that, say, allows octal
// literals using 0o, etc., we could just use iswalnum, instead of iswxdigit and P/p/X/x checks.
while (iswxdigit(*str) || *str == L'P' || *str == L'p' || *str == L'X' || *str == L'x' ||
is_sign(*str) || *str == L'.' || *str == L'_') {
if (*str == L'_') underscores.push_back(pruned.length());
else pruned.push_back(*str);
str++;
}
const wchar_t *pruned_begin = pruned.c_str();
const wchar_t *pruned_end = nullptr;
double result = fish_wcstod(pruned_begin, (wchar_t **)(&pruned_end));
if (pruned_end == pruned_begin) {
if (endptr) *endptr = (wchar_t *)orig;
return result;
}
auto consumed_underscores_end = std::upper_bound(underscores.begin(), underscores.end(),
size_t(pruned_end - pruned_begin));
size_t num_underscores_consumed = std::distance(underscores.begin(), consumed_underscores_end);
if (endptr) *endptr = (wchar_t *)(orig + leading_whitespace + (pruned_end - pruned_begin)
+ num_underscores_consumed);
return result;
}
file_id_t file_id_t::from_stat(const struct stat &buf) {
file_id_t result = {};
result.device = buf.st_dev;

View file

@ -133,6 +133,7 @@ long long fish_wcstoll(const wchar_t *str, const wchar_t **endptr = nullptr, int
unsigned long long fish_wcstoull(const wchar_t *str, const wchar_t **endptr = nullptr,
int base = 10);
double fish_wcstod(const wchar_t *str, wchar_t **endptr);
double fish_wcstod_underscores(const wchar_t *str, wchar_t **endptr);
/// Class for representing a file's inode. We use this to detect and avoid symlink loops, among
/// other things. While an inode / dev pair is sufficient to distinguish co-existing files, Linux

View file

@ -261,3 +261,24 @@ math pow 2 x cos'(-pi)', 2
math 'ncr(0/0, 1)'
# CHECKERR: math: Error: Result is infinite
# CHECKERR: 'ncr(0/0, 1)'
math 0_1
# CHECK: 1
math 0x0_A
# CHECK: 10
math 1_000 + 2_000
# CHECK: 3000
math 1_0_0_0
# CHECK: 1000
math 0_0.5_0 + 0_1.0_0
# CHECK: 1.5
math 2e0_0_2
# CHECK: 200
math -0_0.5_0_0E0_0_3
# CHECK: -500
math 20e-0_1
# CHECK: 2
math 0x0_2.0_0_0P0_2
# CHECK: 8
math -0x8p-0_3
# CHECK: -1