format: new dedicated number parser

The parser can parse integral and floating point numbers as expected by
the coreutils `printf` command.
This commit is contained in:
Samuel Tardieu 2024-01-08 15:02:12 +01:00
parent 5dfeca9ff2
commit 00cd6fa347
2 changed files with 379 additions and 0 deletions

View file

@ -33,6 +33,7 @@
mod argument;
mod escape;
pub mod num_format;
pub mod num_parser;
mod spec;
pub use argument::*;

View file

@ -0,0 +1,378 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
//! Utilities for parsing numbers in various formats
// spell-checker:ignore powf copysign prec inity
#[derive(Clone, Copy, PartialEq)]
pub enum Base {
Binary = 2,
Octal = 8,
Decimal = 10,
Hexadecimal = 16,
}
impl Base {
pub fn digit(&self, c: char) -> Option<u64> {
fn from_decimal(c: char) -> u64 {
u64::from(c) - u64::from('0')
}
match self {
Self::Binary => ('0'..='1').contains(&c).then(|| from_decimal(c)),
Self::Octal => ('0'..='7').contains(&c).then(|| from_decimal(c)),
Self::Decimal => c.is_ascii_digit().then(|| from_decimal(c)),
Self::Hexadecimal => match c.to_ascii_lowercase() {
'0'..='9' => Some(from_decimal(c)),
c @ 'a'..='f' => Some(u64::from(c) - u64::from('a') + 10),
_ => None,
},
}
}
}
/// Type returned if a number could not be parsed in its entirety
#[derive(Debug, PartialEq)]
pub enum ParseError<'a, T> {
/// The input as a whole makes no sense
NotNumeric,
/// The beginning of the input made sense and has been parsed,
/// while the remaining doesn't.
PartialMatch(T, &'a str),
/// The integral part has overflowed the requested type, or
/// has overflowed the `u64` internal storage when parsing the
/// integral part of a floating point number.
Overflow,
}
impl<'a, T> ParseError<'a, T> {
fn map<U>(self, f: impl FnOnce(T, &'a str) -> ParseError<'a, U>) -> ParseError<'a, U> {
match self {
Self::NotNumeric => ParseError::NotNumeric,
Self::Overflow => ParseError::Overflow,
Self::PartialMatch(v, s) => f(v, s),
}
}
}
/// A number parser for binary, octal, decimal, hexadecimal and single characters.
///
/// Internally, in order to get the maximum possible precision and cover the full
/// range of u64 and i64 without losing precision for f64, the returned number is
/// decomposed into:
/// - A `base` value
/// - A `neg` sign bit
/// - A `integral` positive part
/// - A `fractional` positive part
/// - A `precision` representing the number of digits in the fractional part
///
/// If the fractional part cannot be represented on a `u64`, parsing continues
/// silently by ignoring non-significant digits.
pub struct ParsedNumber {
base: Base,
negative: bool,
integral: u64,
fractional: u64,
precision: usize,
}
impl ParsedNumber {
fn into_i64(self) -> Option<i64> {
if self.negative {
i64::try_from(-i128::from(self.integral)).ok()
} else {
i64::try_from(self.integral).ok()
}
}
/// Parse a number as i64. No fractional part is allowed.
pub fn parse_i64(input: &str) -> Result<i64, ParseError<'_, i64>> {
match Self::parse(input, true) {
Ok(v) => v.into_i64().ok_or(ParseError::Overflow),
Err(e) => Err(e.map(|v, rest| {
v.into_i64()
.map(|v| ParseError::PartialMatch(v, rest))
.unwrap_or(ParseError::Overflow)
})),
}
}
/// Parse a number as u64. No fractional part is allowed.
pub fn parse_u64(input: &str) -> Result<u64, ParseError<'_, u64>> {
match Self::parse(input, true) {
Ok(v) | Err(ParseError::PartialMatch(v, _)) if v.negative => {
Err(ParseError::NotNumeric)
}
Ok(v) => Ok(v.integral),
Err(e) => Err(e.map(|v, rest| ParseError::PartialMatch(v.integral, rest))),
}
}
fn into_f64(self) -> f64 {
let n = self.integral as f64
+ (self.fractional as f64) / (self.base as u8 as f64).powf(self.precision as f64);
if self.negative {
-n
} else {
n
}
}
/// Parse a number as f64
pub fn parse_f64(input: &str) -> Result<f64, ParseError<'_, f64>> {
match Self::parse(input, false) {
Ok(v) => Ok(v.into_f64()),
Err(ParseError::NotNumeric) => Self::parse_f64_special_values(input),
Err(e) => Err(e.map(|v, rest| ParseError::PartialMatch(v.into_f64(), rest))),
}
}
fn parse_f64_special_values(input: &str) -> Result<f64, ParseError<'_, f64>> {
let (sign, rest) = if let Some(input) = input.strip_prefix('-') {
(-1.0, input)
} else {
(1.0, input)
};
let prefix = rest
.chars()
.take(3)
.map(|c| c.to_ascii_lowercase())
.collect::<String>();
let special = match prefix.as_str() {
"inf" => f64::INFINITY,
"nan" => f64::NAN,
_ => return Err(ParseError::NotNumeric),
}
.copysign(sign);
if rest.len() == 3 {
Ok(special)
} else {
Err(ParseError::PartialMatch(special, &rest[3..]))
}
}
#[allow(clippy::cognitive_complexity)]
fn parse(input: &str, integral_only: bool) -> Result<Self, ParseError<'_, Self>> {
// Parse the "'" prefix separately
if let Some(rest) = input.strip_prefix('\'') {
let mut chars = rest.char_indices().fuse();
let v = chars.next().map(|(_, c)| Self {
base: Base::Decimal,
negative: false,
integral: u64::from(c),
fractional: 0,
precision: 0,
});
return match (v, chars.next()) {
(Some(v), None) => Ok(v),
(Some(v), Some((i, _))) => Err(ParseError::PartialMatch(v, &rest[i..])),
(None, _) => Err(ParseError::NotNumeric),
};
}
// Initial minus sign
let (negative, unsigned) = if let Some(input) = input.strip_prefix('-') {
(true, input)
} else {
(false, input)
};
// Parse an optional base prefix ("0b" / "0B" / "0" / "0x" / "0X"). "0" is octal unless a
// fractional part is allowed in which case it is an insignificant leading 0. A "0" prefix
// will not be consumed in case the parsable string contains only "0": the leading extra "0"
// will have no influence on the result.
let (base, rest) = if let Some(rest) = unsigned.strip_prefix('0') {
if let Some(rest) = rest.strip_prefix(['b', 'B']) {
(Base::Binary, rest)
} else if let Some(rest) = rest.strip_prefix(['x', 'X']) {
(Base::Hexadecimal, rest)
} else if integral_only {
(Base::Octal, unsigned)
} else {
(Base::Decimal, unsigned)
}
} else {
(Base::Decimal, unsigned)
};
if rest.is_empty() {
return Err(ParseError::NotNumeric);
}
// Parse the integral part of the number
let mut chars = rest.chars().enumerate().fuse().peekable();
let mut integral = 0u64;
while let Some(d) = chars.peek().and_then(|&(_, c)| base.digit(c)) {
chars.next();
integral = integral
.checked_mul(base as u64)
.and_then(|n| n.checked_add(d))
.ok_or(ParseError::Overflow)?;
}
// Parse the fractional part of the number if there can be one and the input contains
// a '.' decimal separator.
let (mut fractional, mut precision) = (0u64, 0);
if matches!(chars.peek(), Some(&(_, '.')))
&& matches!(base, Base::Decimal | Base::Hexadecimal)
&& !integral_only
{
chars.next();
let mut ended = false;
while let Some(d) = chars.peek().and_then(|&(_, c)| base.digit(c)) {
chars.next();
if !ended {
if let Some(f) = fractional
.checked_mul(base as u64)
.and_then(|n| n.checked_add(d))
{
(fractional, precision) = (f, precision + 1);
} else {
ended = true;
}
}
}
}
// If nothing has been parsed, declare the parsing unsuccessful
if let Some((0, _)) = chars.peek() {
return Err(ParseError::NotNumeric);
}
// Return what has been parsed so far. It there are extra characters, mark the
// parsing as a partial match.
let parsed = Self {
base,
negative,
integral,
fractional,
precision,
};
if let Some((first_unparsed, _)) = chars.next() {
Err(ParseError::PartialMatch(parsed, &rest[first_unparsed..]))
} else {
Ok(parsed)
}
}
}
#[cfg(test)]
mod tests {
use super::{ParseError, ParsedNumber};
#[test]
fn test_decimal_u64() {
assert_eq!(Ok(123), ParsedNumber::parse_u64("123"));
assert_eq!(
Ok(u64::MAX),
ParsedNumber::parse_u64(&format!("{}", u64::MAX))
);
assert!(matches!(
ParsedNumber::parse_u64("-123"),
Err(ParseError::NotNumeric)
));
assert!(matches!(
ParsedNumber::parse_u64(""),
Err(ParseError::NotNumeric)
));
assert!(matches!(
ParsedNumber::parse_u64("123.15"),
Err(ParseError::PartialMatch(123, ".15"))
));
}
#[test]
fn test_decimal_i64() {
assert_eq!(Ok(123), ParsedNumber::parse_i64("123"));
assert_eq!(Ok(-123), ParsedNumber::parse_i64("-123"));
assert!(matches!(
ParsedNumber::parse_i64("--123"),
Err(ParseError::NotNumeric)
));
assert_eq!(
Ok(i64::MAX),
ParsedNumber::parse_i64(&format!("{}", i64::MAX))
);
assert_eq!(
Ok(i64::MIN),
ParsedNumber::parse_i64(&format!("{}", i64::MIN))
);
assert!(matches!(
ParsedNumber::parse_i64(&format!("{}", u64::MAX)),
Err(ParseError::Overflow)
));
assert!(matches!(
ParsedNumber::parse_i64(&format!("{}", i64::MAX as u64 + 1)),
Err(ParseError::Overflow)
));
}
#[test]
fn test_decimal_f64() {
assert_eq!(Ok(123.0), ParsedNumber::parse_f64("123"));
assert_eq!(Ok(-123.0), ParsedNumber::parse_f64("-123"));
assert_eq!(Ok(123.0), ParsedNumber::parse_f64("123."));
assert_eq!(Ok(-123.0), ParsedNumber::parse_f64("-123."));
assert_eq!(Ok(123.0), ParsedNumber::parse_f64("123.0"));
assert_eq!(Ok(-123.0), ParsedNumber::parse_f64("-123.0"));
assert_eq!(Ok(123.15), ParsedNumber::parse_f64("123.15"));
assert_eq!(Ok(-123.15), ParsedNumber::parse_f64("-123.15"));
assert_eq!(Ok(0.15), ParsedNumber::parse_f64(".15"));
assert_eq!(Ok(-0.15), ParsedNumber::parse_f64("-.15"));
assert_eq!(
Ok(0.15),
ParsedNumber::parse_f64(".150000000000000000000000000231313")
);
assert!(matches!(ParsedNumber::parse_f64("1.2.3"),
Err(ParseError::PartialMatch(f, ".3")) if f == 1.2));
assert_eq!(Ok(f64::INFINITY), ParsedNumber::parse_f64("inf"));
assert_eq!(Ok(f64::NEG_INFINITY), ParsedNumber::parse_f64("-inf"));
assert!(ParsedNumber::parse_f64("NaN").unwrap().is_nan());
assert!(ParsedNumber::parse_f64("NaN").unwrap().is_sign_positive());
assert!(ParsedNumber::parse_f64("-NaN").unwrap().is_nan());
assert!(ParsedNumber::parse_f64("-NaN").unwrap().is_sign_negative());
assert!(matches!(ParsedNumber::parse_f64("-infinity"),
Err(ParseError::PartialMatch(f, "inity")) if f == f64::NEG_INFINITY));
assert!(ParsedNumber::parse_f64(&format!("{}", u64::MAX)).is_ok());
assert!(ParsedNumber::parse_f64(&format!("{}", i64::MIN)).is_ok());
}
#[test]
fn test_hexadecimal() {
assert_eq!(Ok(0x123), ParsedNumber::parse_u64("0x123"));
assert_eq!(Ok(0x123), ParsedNumber::parse_u64("0X123"));
assert_eq!(Ok(0xfe), ParsedNumber::parse_u64("0xfE"));
assert_eq!(Ok(-0x123), ParsedNumber::parse_i64("-0x123"));
assert_eq!(Ok(0.5), ParsedNumber::parse_f64("0x.8"));
assert_eq!(Ok(0.0625), ParsedNumber::parse_f64("0x.1"));
assert_eq!(Ok(15.0078125), ParsedNumber::parse_f64("0xf.02"));
}
#[test]
fn test_octal() {
assert_eq!(Ok(0), ParsedNumber::parse_u64("0"));
assert_eq!(Ok(0o123), ParsedNumber::parse_u64("0123"));
assert_eq!(Ok(0o123), ParsedNumber::parse_u64("00123"));
assert_eq!(Ok(0), ParsedNumber::parse_u64("00"));
assert!(matches!(
ParsedNumber::parse_u64("008"),
Err(ParseError::PartialMatch(0, "8"))
));
assert!(matches!(
ParsedNumber::parse_u64("08"),
Err(ParseError::PartialMatch(0, "8"))
));
assert!(matches!(
ParsedNumber::parse_u64("0."),
Err(ParseError::PartialMatch(0, "."))
));
}
#[test]
fn test_binary() {
assert_eq!(Ok(0b1011), ParsedNumber::parse_u64("0b1011"));
assert_eq!(Ok(0b1011), ParsedNumber::parse_u64("0B1011"));
}
}