// Provenance: this module was extracted from the following patch. The patch
// header is preserved here as a comment so that no information is lost.
//
//   From 00cd6fa347a1bf147d0cfdc9e01f99c4cb7c2f6f Mon Sep 17 00:00:00 2001
//   From: Samuel Tardieu
//   Date: Mon, 8 Jan 2024 15:02:12 +0100
//   Subject: [PATCH] format: new dedicated number parser
//
//   The parser can parse integral and floating point numbers as expected by
//   the coreutils `printf` command.
//
//   src/uucore/src/lib/features/format/mod.rs        |   1 +
//   .../src/lib/features/format/num_parser.rs        | 378 ++++++++++++++++++
//   2 files changed, 379 insertions(+)
//   create mode 100644 src/uucore/src/lib/features/format/num_parser.rs
//
// The same patch registers this module in
// `src/uucore/src/lib/features/format/mod.rs` by adding `pub mod num_parser;`
// right after `pub mod num_format;`.

// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.

//! Utilities for parsing numbers in various formats

// spell-checker:ignore powf copysign prec inity

// `TryFrom` is part of the 2021 prelude; the explicit import keeps the module
// building on older editions as well.
use std::convert::TryFrom;

/// Numeric bases understood by the parser. The discriminant of each variant is
/// the radix itself, so `base as u64` yields the multiplier for one digit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Base {
    Binary = 2,
    Octal = 8,
    Decimal = 10,
    Hexadecimal = 16,
}

impl Base {
    /// Return the numeric value of `c` if it is a valid digit in this base,
    /// or `None` otherwise. Hexadecimal digits are accepted in both cases.
    pub fn digit(&self, c: char) -> Option<u64> {
        // Only called with characters already known to be in '0'..='9'.
        fn from_decimal(c: char) -> u64 {
            u64::from(c) - u64::from('0')
        }
        match self {
            Self::Binary => ('0'..='1').contains(&c).then(|| from_decimal(c)),
            Self::Octal => ('0'..='7').contains(&c).then(|| from_decimal(c)),
            Self::Decimal => c.is_ascii_digit().then(|| from_decimal(c)),
            Self::Hexadecimal => match c.to_ascii_lowercase() {
                '0'..='9' => Some(from_decimal(c)),
                c @ 'a'..='f' => Some(u64::from(c) - u64::from('a') + 10),
                _ => None,
            },
        }
    }
}

/// Type returned if a number could not be parsed in its entirety
#[derive(Debug, PartialEq)]
pub enum ParseError<'a, T> {
    /// The input as a whole makes no sense
    NotNumeric,
    /// The beginning of the input made sense and has been parsed,
    /// while the remaining doesn't.
    PartialMatch(T, &'a str),
    /// The integral part has overflowed the requested type, or
    /// has overflowed the `u64` internal storage when parsing the
    /// integral part of a floating point number.
    Overflow,
}

impl<'a, T> ParseError<'a, T> {
    /// Convert the payload of a `PartialMatch` with `f`, leaving the
    /// payload-free variants untouched. `f` receives the partially parsed
    /// value and the unparsed remainder and builds the replacement error.
    fn map<U>(self, f: impl FnOnce(T, &'a str) -> ParseError<'a, U>) -> ParseError<'a, U> {
        match self {
            Self::NotNumeric => ParseError::NotNumeric,
            Self::Overflow => ParseError::Overflow,
            Self::PartialMatch(v, s) => f(v, s),
        }
    }
}

/// A number parser for binary, octal, decimal, hexadecimal and single characters.
///
/// Internally, in order to get the maximum possible precision and cover the full
/// range of u64 and i64 without losing precision for f64, the returned number is
/// decomposed into:
///   - A `base` value
///   - A `negative` sign bit
///   - An `integral` positive part
///   - A `fractional` positive part
///   - A `precision` representing the number of digits in the fractional part
///
/// If the fractional part cannot be represented on a `u64`, parsing continues
/// silently by ignoring non-significant digits.
#[derive(Debug)]
pub struct ParsedNumber {
    base: Base,
    negative: bool,
    integral: u64,
    fractional: u64,
    precision: usize,
}

impl ParsedNumber {
    /// Convert the integral part to `i64`, applying the sign. Returns `None`
    /// if the signed value does not fit.
    fn into_i64(self) -> Option<i64> {
        if self.negative {
            // Negate in i128 so that `i64::MIN` (whose magnitude exceeds
            // `i64::MAX`) is still representable.
            i64::try_from(-i128::from(self.integral)).ok()
        } else {
            i64::try_from(self.integral).ok()
        }
    }

    /// Parse a number as i64. No fractional part is allowed.
    ///
    /// # Errors
    ///
    /// - `NotNumeric` if no number could be read at all.
    /// - `PartialMatch(value, rest)` if a number was read but `rest` follows it.
    /// - `Overflow` if the value (fully or partially matched) does not fit in `i64`.
    pub fn parse_i64(input: &str) -> Result<i64, ParseError<'_, i64>> {
        match Self::parse(input, true) {
            Ok(v) => v.into_i64().ok_or(ParseError::Overflow),
            Err(e) => Err(e.map(|v, rest| {
                v.into_i64()
                    .map(|v| ParseError::PartialMatch(v, rest))
                    .unwrap_or(ParseError::Overflow)
            })),
        }
    }

    /// Parse a number as u64. No fractional part is allowed.
    ///
    /// # Errors
    ///
    /// - `NotNumeric` if no number could be read, or if the input starts with
    ///   a minus sign followed by a number.
    /// - `PartialMatch(value, rest)` if a number was read but `rest` follows it.
    /// - `Overflow` if the value does not fit in `u64`.
    pub fn parse_u64(input: &str) -> Result<u64, ParseError<'_, u64>> {
        match Self::parse(input, true) {
            // Any negative number, fully or partially matched, is rejected.
            Ok(v) | Err(ParseError::PartialMatch(v, _)) if v.negative => {
                Err(ParseError::NotNumeric)
            }
            Ok(v) => Ok(v.integral),
            Err(e) => Err(e.map(|v, rest| ParseError::PartialMatch(v.integral, rest))),
        }
    }

    /// Combine the integral and fractional parts into an `f64`:
    /// `integral + fractional / base^precision`, negated if needed.
    fn into_f64(self) -> f64 {
        let n = self.integral as f64
            + (self.fractional as f64) / (self.base as u8 as f64).powf(self.precision as f64);
        if self.negative {
            -n
        } else {
            n
        }
    }

    /// Parse a number as f64
    ///
    /// Besides regular numbers, the case-insensitive special values `inf` and
    /// `nan` (optionally preceded by `-`) are recognized.
    ///
    /// # Errors
    ///
    /// - `NotNumeric` if no number could be read at all.
    /// - `PartialMatch(value, rest)` if a number was read but `rest` follows it.
    /// - `Overflow` if the integral part does not fit the internal `u64` storage.
    pub fn parse_f64(input: &str) -> Result<f64, ParseError<'_, f64>> {
        match Self::parse(input, false) {
            Ok(v) => Ok(v.into_f64()),
            Err(ParseError::NotNumeric) => Self::parse_f64_special_values(input),
            Err(e) => Err(e.map(|v, rest| ParseError::PartialMatch(v.into_f64(), rest))),
        }
    }

    /// Recognize `inf` and `nan` (any letter case, optional leading `-`).
    /// Characters following the three-letter keyword produce a `PartialMatch`.
    fn parse_f64_special_values(input: &str) -> Result<f64, ParseError<'_, f64>> {
        let (sign, rest) = if let Some(input) = input.strip_prefix('-') {
            (-1.0, input)
        } else {
            (1.0, input)
        };
        let prefix = rest
            .chars()
            .take(3)
            .map(|c| c.to_ascii_lowercase())
            .collect::<String>();
        let special = match prefix.as_str() {
            "inf" => f64::INFINITY,
            "nan" => f64::NAN,
            _ => return Err(ParseError::NotNumeric),
        }
        .copysign(sign);
        // Reaching this point means the first three characters are ASCII
        // letters, so byte offset 3 is a valid character boundary in `rest`.
        if rest.len() == 3 {
            Ok(special)
        } else {
            Err(ParseError::PartialMatch(special, &rest[3..]))
        }
    }

    /// Parse `input` into its decomposed form.
    ///
    /// Accepted syntax: `'c` (code point of the character `c`), or an optional
    /// `-`, an optional base prefix (`0b`/`0B`, `0x`/`0X`, or a leading `0`
    /// meaning octal when `integral_only` is set), digits, and, unless
    /// `integral_only` is set, an optional `.` followed by digits for decimal
    /// and hexadecimal numbers.
    ///
    /// # Errors
    ///
    /// - `NotNumeric` if the input contains no digit in the expected position.
    /// - `PartialMatch(parsed, rest)` if unparsed characters follow the number.
    /// - `Overflow` if the integral part does not fit in a `u64`.
    // The parser is deliberately kept as one linear function: each step
    // consumes from the same iterator and splitting it would obscure the flow.
    #[allow(clippy::cognitive_complexity)]
    fn parse(input: &str, integral_only: bool) -> Result<Self, ParseError<'_, Self>> {
        // Parse the "'" prefix separately
        if let Some(rest) = input.strip_prefix('\'') {
            let mut chars = rest.char_indices().fuse();
            let v = chars.next().map(|(_, c)| Self {
                base: Base::Decimal,
                negative: false,
                integral: u64::from(c),
                fractional: 0,
                precision: 0,
            });
            return match (v, chars.next()) {
                (Some(v), None) => Ok(v),
                (Some(v), Some((i, _))) => Err(ParseError::PartialMatch(v, &rest[i..])),
                (None, _) => Err(ParseError::NotNumeric),
            };
        }

        // Initial minus sign
        let (negative, unsigned) = if let Some(input) = input.strip_prefix('-') {
            (true, input)
        } else {
            (false, input)
        };

        // Parse an optional base prefix ("0b" / "0B" / "0" / "0x" / "0X"). "0" is octal unless a
        // fractional part is allowed in which case it is an insignificant leading 0. A "0" prefix
        // will not be consumed in case the parsable string contains only "0": the leading extra "0"
        // will have no influence on the result.
        let (base, rest) = if let Some(rest) = unsigned.strip_prefix('0') {
            if let Some(rest) = rest.strip_prefix(['b', 'B']) {
                (Base::Binary, rest)
            } else if let Some(rest) = rest.strip_prefix(['x', 'X']) {
                (Base::Hexadecimal, rest)
            } else if integral_only {
                (Base::Octal, unsigned)
            } else {
                (Base::Decimal, unsigned)
            }
        } else {
            (Base::Decimal, unsigned)
        };
        if rest.is_empty() {
            return Err(ParseError::NotNumeric);
        }

        // `char_indices` yields byte offsets, which is what slicing `rest`
        // for the unparsed remainder requires.
        let mut chars = rest.char_indices().peekable();
        // Whether at least one digit was consumed, in either part.
        let mut seen_digit = false;

        // Parse the integral part of the number
        let mut integral = 0u64;
        while let Some(d) = chars.peek().and_then(|&(_, c)| base.digit(c)) {
            chars.next();
            seen_digit = true;
            integral = integral
                .checked_mul(base as u64)
                .and_then(|n| n.checked_add(d))
                .ok_or(ParseError::Overflow)?;
        }

        // Parse the fractional part of the number if there can be one and the input contains
        // a '.' decimal separator.
        let (mut fractional, mut precision) = (0u64, 0);
        if matches!(chars.peek(), Some(&(_, '.')))
            && matches!(base, Base::Decimal | Base::Hexadecimal)
            && !integral_only
        {
            chars.next();
            // Once the fractional accumulator would overflow, further digits
            // are consumed but no longer contribute to the value.
            let mut ended = false;
            while let Some(d) = chars.peek().and_then(|&(_, c)| base.digit(c)) {
                chars.next();
                seen_digit = true;
                if !ended {
                    if let Some(f) = fractional
                        .checked_mul(base as u64)
                        .and_then(|n| n.checked_add(d))
                    {
                        (fractional, precision) = (f, precision + 1);
                    } else {
                        ended = true;
                    }
                }
            }
        }

        // If no digit has been parsed, declare the parsing unsuccessful. This
        // also rejects a lone decimal separator such as "." or "-.", which
        // would otherwise be mistaken for zero.
        if !seen_digit {
            return Err(ParseError::NotNumeric);
        }

        // Return what has been parsed so far. If there are extra characters, mark the
        // parsing as a partial match.
        let parsed = Self {
            base,
            negative,
            integral,
            fractional,
            precision,
        };
        if let Some((first_unparsed, _)) = chars.next() {
            Err(ParseError::PartialMatch(parsed, &rest[first_unparsed..]))
        } else {
            Ok(parsed)
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{ParseError, ParsedNumber};

    #[test]
    fn test_decimal_u64() {
        assert_eq!(Ok(123), ParsedNumber::parse_u64("123"));
        assert_eq!(
            Ok(u64::MAX),
            ParsedNumber::parse_u64(&format!("{}", u64::MAX))
        );
        assert!(matches!(
            ParsedNumber::parse_u64("-123"),
            Err(ParseError::NotNumeric)
        ));
        assert!(matches!(
            ParsedNumber::parse_u64(""),
            Err(ParseError::NotNumeric)
        ));
        assert!(matches!(
            ParsedNumber::parse_u64("123.15"),
            Err(ParseError::PartialMatch(123, ".15"))
        ));
    }

    #[test]
    fn test_decimal_i64() {
        assert_eq!(Ok(123), ParsedNumber::parse_i64("123"));
        assert_eq!(Ok(-123), ParsedNumber::parse_i64("-123"));
        assert!(matches!(
            ParsedNumber::parse_i64("--123"),
            Err(ParseError::NotNumeric)
        ));
        assert_eq!(
            Ok(i64::MAX),
            ParsedNumber::parse_i64(&format!("{}", i64::MAX))
        );
        assert_eq!(
            Ok(i64::MIN),
            ParsedNumber::parse_i64(&format!("{}", i64::MIN))
        );
        assert!(matches!(
            ParsedNumber::parse_i64(&format!("{}", u64::MAX)),
            Err(ParseError::Overflow)
        ));
        assert!(matches!(
            ParsedNumber::parse_i64(&format!("{}", i64::MAX as u64 + 1)),
            Err(ParseError::Overflow)
        ));
    }

    #[test]
    fn test_decimal_f64() {
        assert_eq!(Ok(123.0), ParsedNumber::parse_f64("123"));
        assert_eq!(Ok(-123.0), ParsedNumber::parse_f64("-123"));
        assert_eq!(Ok(123.0), ParsedNumber::parse_f64("123."));
        assert_eq!(Ok(-123.0), ParsedNumber::parse_f64("-123."));
        assert_eq!(Ok(123.0), ParsedNumber::parse_f64("123.0"));
        assert_eq!(Ok(-123.0), ParsedNumber::parse_f64("-123.0"));
        assert_eq!(Ok(123.15), ParsedNumber::parse_f64("123.15"));
        assert_eq!(Ok(-123.15), ParsedNumber::parse_f64("-123.15"));
        assert_eq!(Ok(0.15), ParsedNumber::parse_f64(".15"));
        assert_eq!(Ok(-0.15), ParsedNumber::parse_f64("-.15"));
        assert_eq!(
            Ok(0.15),
            ParsedNumber::parse_f64(".150000000000000000000000000231313")
        );
        assert!(matches!(ParsedNumber::parse_f64("1.2.3"),
                         Err(ParseError::PartialMatch(f, ".3")) if f == 1.2));
        assert_eq!(Ok(f64::INFINITY), ParsedNumber::parse_f64("inf"));
        assert_eq!(Ok(f64::NEG_INFINITY), ParsedNumber::parse_f64("-inf"));
        assert!(ParsedNumber::parse_f64("NaN").unwrap().is_nan());
        assert!(ParsedNumber::parse_f64("NaN").unwrap().is_sign_positive());
        assert!(ParsedNumber::parse_f64("-NaN").unwrap().is_nan());
        assert!(ParsedNumber::parse_f64("-NaN").unwrap().is_sign_negative());
        assert!(matches!(ParsedNumber::parse_f64("-infinity"),
                         Err(ParseError::PartialMatch(f, "inity")) if f == f64::NEG_INFINITY));
        assert!(ParsedNumber::parse_f64(&format!("{}", u64::MAX)).is_ok());
        assert!(ParsedNumber::parse_f64(&format!("{}", i64::MIN)).is_ok());
    }

    #[test]
    fn test_lone_decimal_separator() {
        assert_eq!(Err(ParseError::NotNumeric), ParsedNumber::parse_f64("."));
        assert_eq!(Err(ParseError::NotNumeric), ParsedNumber::parse_f64("-."));
        assert_eq!(Err(ParseError::NotNumeric), ParsedNumber::parse_f64(".x"));
    }

    #[test]
    fn test_hexadecimal() {
        assert_eq!(Ok(0x123), ParsedNumber::parse_u64("0x123"));
        assert_eq!(Ok(0x123), ParsedNumber::parse_u64("0X123"));
        assert_eq!(Ok(0xfe), ParsedNumber::parse_u64("0xfE"));
        assert_eq!(Ok(-0x123), ParsedNumber::parse_i64("-0x123"));

        assert_eq!(Ok(0.5), ParsedNumber::parse_f64("0x.8"));
        assert_eq!(Ok(0.0625), ParsedNumber::parse_f64("0x.1"));
        assert_eq!(Ok(15.0078125), ParsedNumber::parse_f64("0xf.02"));
    }

    #[test]
    fn test_octal() {
        assert_eq!(Ok(0), ParsedNumber::parse_u64("0"));
        assert_eq!(Ok(0o123), ParsedNumber::parse_u64("0123"));
        assert_eq!(Ok(0o123), ParsedNumber::parse_u64("00123"));
        assert_eq!(Ok(0), ParsedNumber::parse_u64("00"));
        assert!(matches!(
            ParsedNumber::parse_u64("008"),
            Err(ParseError::PartialMatch(0, "8"))
        ));
        assert!(matches!(
            ParsedNumber::parse_u64("08"),
            Err(ParseError::PartialMatch(0, "8"))
        ));
        assert!(matches!(
            ParsedNumber::parse_u64("0."),
            Err(ParseError::PartialMatch(0, "."))
        ));
    }

    #[test]
    fn test_binary() {
        assert_eq!(Ok(0b1011), ParsedNumber::parse_u64("0b1011"));
        assert_eq!(Ok(0b1011), ParsedNumber::parse_u64("0B1011"));
    }
}