fish-shell/printf/src/printf_impl.rs
Peter Ammon 974ad882fa
Clean up fish-printf in preparation for publishing
Make fish-printf no longer depend on the widestring crate, as other clients
won't use it; instead this is an optional feature.

Make format strings a generic type, so that both narrow and wide strings can
serve. This removes a lot of the complexity around converting from narrow to
wide.

Add a README.md to this crate.
2024-09-21 17:52:11 -07:00

585 lines
18 KiB
Rust

/** Rust printf implementation, based on musl. */
use super::arg::Arg;
use super::fmt_fp::format_float;
use super::locale::Locale;
use std::fmt::{self, Write};
use std::mem;
use std::result::Result;
#[cfg(feature = "widestring")]
use widestring::Utf32Str as wstr;
/// Everything that can go wrong while formatting.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// The format string is malformed.
    BadFormatString,

    /// The format string asks for more arguments than were supplied.
    MissingArg,

    /// Arguments were left over once the format string was exhausted.
    ExtraArg,

    /// An argument's type does not match its format specifier.
    BadArgType,

    /// A numeric quantity (such as a precision) is too large to represent.
    Overflow,

    /// The output stream itself reported an error.
    Fmt(fmt::Error),
}
// Convenience conversion from fmt::Error.
impl From<fmt::Error> for Error {
fn from(err: fmt::Error) -> Error {
Error::Fmt(err)
}
}
/// The set of flag characters that may follow the `%` of a conversion.
/// All flags default to off.
#[derive(Debug, Copy, Clone, Default)]
pub(super) struct ModifierFlags {
    /// `#`
    pub alt_form: bool,
    /// `0`
    pub zero_pad: bool,
    /// `-`, or a negative field width
    pub left_adj: bool,
    /// space: blank before positive numbers
    pub pad_pos: bool,
    /// `+`: sign before positive numbers
    pub mark_pos: bool,
    /// `'`: group indicator
    pub grouped: bool,
}
impl ModifierFlags {
    // Turn on the flag named by `c` and report true; report false (changing nothing)
    // if `c` is not a flag character. Setting the same flag twice is permitted.
    fn try_set(&mut self, c: char) -> bool {
        // Pick the field that corresponds to the character, if any.
        let flag = match c {
            '#' => &mut self.alt_form,
            '0' => &mut self.zero_pad,
            '-' => &mut self.left_adj,
            ' ' => &mut self.pad_pos,
            '+' => &mut self.mark_pos,
            '\'' => &mut self.grouped,
            _ => return false,
        };
        *flag = true;
        true
    }
}
// Length prefixes that may precede a conversion specifier.
// Each variant is named after its literal spelling in the format string;
// `Empty` means no prefix was present.
// Prefixes are mostly ignored: value sizes come from the arguments themselves.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[allow(non_camel_case_types)]
enum ConversionPrefix {
    // No prefix.
    Empty,

    // One- and two-letter spellings of 'h' and 'l'.
    hh,
    h,
    l,
    ll,

    // Remaining single-letter prefixes.
    j,
    t,
    z,
    L,
}
// The conversion specifiers we understand, named after their format characters.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[allow(non_camel_case_types)]
#[rustfmt::skip]
pub(super) enum ConversionSpec {
    // Integer conversions; these accept the prefixes "hh", "h", "l", "ll", "j", "t", "z".
    // '%i' is folded into '%d'.
    d, o, u, x, X,

    // USizeRef receiver; accepts the same prefixes as the integer conversions.
    n,

    // Floating point conversions; these accept the prefixes "l" and "L".
    a, A, e, E, f, F, g, G,

    // Pointer conversion; accepts no prefix.
    p,

    // Character and string conversions; these accept the prefix "l".
    // '%C' is folded into '%c' and '%S' into '%s'.
    c, s,
}
impl ConversionSpec {
    // Reports whether `prefix` may legally precede this conversion specifier.
    // The empty prefix is accepted by every specifier.
    fn supports_prefix(self, prefix: ConversionPrefix) -> bool {
        use ConversionPrefix::*;
        use ConversionSpec::*;
        // The non-empty prefixes accepted by each family of specifiers.
        let allowed: &[ConversionPrefix] = match self {
            d | o | u | x | X | n => &[hh, h, l, ll, j, t, z],
            a | A | e | E | f | F | g | G => &[l, L],
            p => &[],
            c | s => &[l],
        };
        prefix == Empty || allowed.contains(&prefix)
    }

    // Reports whether this specifier is spelled in lowercase,
    // which callers use to choose lowercase or uppercase rendering.
    #[inline]
    pub(super) fn is_lower(self) -> bool {
        use ConversionSpec::*;
        // Only these five specifiers are uppercase; everything else is lowercase.
        !matches!(self, X | A | E | F | G)
    }

    // Maps a format character to its ConversionSpec, or None if the character
    // is not a conversion specifier. 'i', 'C' and 'S' are aliases.
    fn from_char(cc: char) -> Option<Self> {
        use ConversionSpec::*;
        Some(match cc {
            // Integers.
            'd' | 'i' => d,
            'o' => o,
            'u' => u,
            'x' => x,
            'X' => X,
            // Output-length receiver.
            'n' => n,
            // Floats.
            'a' => a,
            'A' => A,
            'e' => e,
            'E' => E,
            'f' => f,
            'F' => F,
            'g' => g,
            'G' => G,
            // Pointer.
            'p' => p,
            // Characters and strings.
            'c' | 'C' => c,
            's' | 'S' => s,
            _ => return None,
        })
    }
}
/// A cursor over a format string, abstracting over the string's representation.
pub trait FormatString {
    /// Returns true if no characters remain.
    fn is_empty(&self) -> bool;

    /// Returns the character `index` characters ahead of the cursor,
    /// or `None` if that is out of bounds.
    /// The index counts characters, not bytes.
    fn at(&self, index: usize) -> Option<char>;

    /// Moves the cursor forward by `n` characters.
    fn advance_by(&mut self, n: usize);

    /// Consumes and returns a run of characters that should be output literally.
    /// A trailing run of `%%` pairs is handled here as well.
    /// Implementations may use `buffer` as storage for the returned text.
    fn take_literal<'a: 'b, 'b>(&'a mut self, buffer: &'b mut String) -> &'b str;
}
// Narrow (UTF-8) format strings. The cursor is the `&str` itself: advancing
// replaces it with a suffix of the original string.
impl FormatString for &str {
    fn is_empty(&self) -> bool {
        // Call the inherent str method explicitly; `self.is_empty()` would recurse.
        str::is_empty(self)
    }

    fn at(&self, index: usize) -> Option<char> {
        self.chars().nth(index)
    }

    fn advance_by(&mut self, n: usize) {
        let mut rest = self.chars();
        if n > 0 {
            // nth(n - 1) consumes exactly n characters, or yields None if there are fewer.
            let last_skipped = rest.nth(n - 1);
            assert!(
                last_skipped.is_some(),
                "FormatString::advance(): index out of bounds"
            );
        }
        *self = rest.as_str();
    }

    fn take_literal<'a: 'b, 'b>(&'a mut self, _buffer: &'b mut String) -> &'b str {
        let text = *self;

        // Byte length of the text preceding the first percent sign (if any).
        let plain_len = text.find('%').unwrap_or(text.len());

        // Length of the run of percent signs which follows. '%' is a single byte in
        // UTF-8, so we may scan bytes. Only complete pairs are consumed; an odd
        // trailing percent begins a conversion and is left for the caller.
        let percent_run = text[plain_len..]
            .bytes()
            .take_while(|&b| b == b'%')
            .count();
        let pairs = percent_run / 2;

        let (head, tail) = text.split_at(plain_len + 2 * pairs);
        *self = tail;

        // Every "%%" is output as one "%": drop one percent per pair from the end.
        &head[..head.len() - pairs]
    }
}
// Wide (UTF-32) format strings, available with the "widestring" feature.
// Here characters can be indexed directly.
#[cfg(feature = "widestring")]
impl FormatString for &wstr {
    fn is_empty(&self) -> bool {
        self.as_char_slice().is_empty()
    }

    fn at(&self, index: usize) -> Option<char> {
        let chars = self.as_char_slice();
        chars.get(index).copied()
    }

    fn advance_by(&mut self, n: usize) {
        // Replace the cursor with its suffix starting n characters in.
        *self = &self[n..];
    }

    fn take_literal<'a: 'b, 'b>(&'a mut self, buffer: &'b mut String) -> &'b str {
        let chars = self.as_char_slice();

        // Number of characters preceding the first percent sign (if any).
        let plain_len = chars
            .iter()
            .position(|&c| c == '%')
            .unwrap_or(chars.len());

        // Only complete pairs of percent signs are consumed; an odd trailing
        // percent begins a conversion and is left for the caller.
        let pairs = chars[plain_len..]
            .iter()
            .take_while(|&&c| c == '%')
            .count()
            / 2;
        *self = &self[plain_len + 2 * pairs..];

        // Transcode into the buffer, emitting one percent for each consumed pair.
        buffer.clear();
        buffer.extend(chars[..plain_len + pairs].iter());
        buffer.as_str()
    }
}
// Read an int from a format string, stopping at the first non-digit.
// Negative values are not supported.
// If there are no digits, return 0.
// Adjust the format string to point to the char after the int.
fn get_int(fmt: &mut impl FormatString) -> Result<usize, Error> {
use Error::Overflow;
let mut i: usize = 0;
while let Some(digit) = fmt.at(0).and_then(|c| c.to_digit(10)) {
i = i.checked_mul(10).ok_or(Overflow)?;
i = i.checked_add(digit as usize).ok_or(Overflow)?;
fmt.advance_by(1);
}
Ok(i)
}
// Parse the (possibly empty) conversion prefix at the cursor and step over it.
fn get_prefix(fmt: &mut impl FormatString) -> ConversionPrefix {
    use ConversionPrefix as CP;
    // Decide the prefix and how many characters it occupies in one step.
    // The doubled forms must be tried before their single-letter counterparts.
    let (prefix, consumed) = match (fmt.at(0), fmt.at(1)) {
        (Some('h'), Some('h')) => (CP::hh, 2),
        (Some('h'), _) => (CP::h, 1),
        (Some('l'), Some('l')) => (CP::ll, 2),
        (Some('l'), _) => (CP::l, 1),
        (Some('j'), _) => (CP::j, 1),
        (Some('t'), _) => (CP::t, 1),
        (Some('z'), _) => (CP::z, 1),
        (Some('L'), _) => (CP::L, 1),
        _ => (CP::Empty, 0),
    };
    fmt.advance_by(consumed);
    prefix
}
// Read an (optionally prefixed) format specifier, such as d, Lf, etc.
// Adjust the cursor to point to the char after the specifier.
fn get_specifier(fmt: &mut impl FormatString) -> Result<ConversionSpec, Error> {
let prefix = get_prefix(fmt);
// Awkwardly placed hack to disallow %lC and %lS, since we otherwise treat
// them as the same.
if prefix != ConversionPrefix::Empty && matches!(fmt.at(0), Some('C' | 'S')) {
return Err(Error::BadFormatString);
}
let spec = fmt
.at(0)
.and_then(ConversionSpec::from_char)
.ok_or(Error::BadFormatString)?;
if !spec.supports_prefix(prefix) {
return Err(Error::BadFormatString);
}
fmt.advance_by(1);
Ok(spec)
}
// Pad output by emitting `c` until `min_width` is reached.
//
// - `f`: the receiver of the padding.
// - `c`: the padding character; must be '0' or ' ' (anything else panics).
// - `min_width`: the width to pad up to.
// - `current_width`: the width already accounted for.
//
// Writes `min_width - current_width` copies of `c`, or nothing if
// `current_width` is already at least `min_width`. Write errors are returned.
pub(super) fn pad(
    f: &mut impl Write,
    c: char,
    min_width: usize,
    current_width: usize,
) -> fmt::Result {
    assert!(c == '0' || c == ' ');
    // Padding is written in chunks sliced from these strings. Both are 16
    // characters wide, so spaces are batched exactly like zeros.
    const ZEROS: &str = "0000000000000000";
    const SPACES: &str = "                ";
    let chunk = if c == '0' { ZEROS } else { SPACES };

    // Saturates to zero when no padding is needed.
    let mut remaining = min_width.saturating_sub(current_width);
    while remaining > 0 {
        let n = remaining.min(chunk.len());
        f.write_str(&chunk[..n])?;
        remaining -= n;
    }
    Ok(())
}
/// Formats a string using the provided format specifiers, arguments, and locale,
/// and writes the output to the given `Write` implementation.
///
/// # Parameters
/// - `f`: The receiver of formatted output.
/// - `fmt`: The format string being parsed.
/// - `locale`: The locale to use for number formatting.
/// - `args`: The arguments to format, consumed in order. A `*` width or `.*`
///   precision consumes one argument ahead of the value it applies to.
///
/// # Returns
/// A `Result` which is `Ok` containing the length of the output on success, or an `Error`.
/// Literal text is counted in characters. Integer, pointer, character and string
/// conversions are counted by their padded field width, which is computed from UTF-8
/// byte lengths. Floating point conversions contribute the length reported by the
/// float formatter. For ASCII output this is the number of bytes written.
///
/// # Errors
/// - `Error::BadFormatString`: a conversion has an unknown specifier or unsupported
///   prefix, or the `'` flag is used with a conversion other than d, i, u, f, F.
/// - `Error::MissingArg`: the format string needs more arguments than were given.
/// - `Error::ExtraArg`: arguments remain after the format string is exhausted.
/// - `Error::Overflow`: a width, precision or the output length is too large.
/// - `Error::Fmt`: `f` reported a write error.
///
/// Errors reported by the argument accessors are propagated unchanged.
/// Output already written to `f` before an error is not rolled back.
///
/// # Example
///
/// ```
/// use fish_printf::{sprintf_locale, ToArg, FormatString, locale};
/// use std::fmt::Write;
///
/// let mut output = String::new();
/// let fmt: &str = "%'0.2f";
/// let mut args = [1234567.89.to_arg()];
///
/// let result = sprintf_locale(&mut output, fmt, &locale::EN_US_LOCALE, &mut args);
///
/// assert!(result == Ok(12));
/// assert_eq!(output, "1,234,567.89");
/// ```
pub fn sprintf_locale(
    f: &mut impl Write,
    fmt: impl FormatString,
    locale: &Locale,
    args: &mut [Arg],
) -> Result<usize, Error> {
    use ConversionSpec as CS;
    let mut s = fmt;
    let mut args = args.iter_mut();
    let mut out_len: usize = 0;

    // Shared storage for the output of the conversion specifier.
    let buf = &mut String::new();
    'main: while !s.is_empty() {
        buf.clear();

        // Handle literal text and %% format specifiers.
        let lit = s.take_literal(buf);
        if !lit.is_empty() {
            f.write_str(lit)?;
            out_len = out_len
                .checked_add(lit.chars().count())
                .ok_or(Error::Overflow)?;
            continue 'main;
        }

        // Consume the % at the start of the format specifier.
        debug_assert!(s.at(0) == Some('%'));
        s.advance_by(1);

        // Read modifier flags.
        let mut flags = ModifierFlags::default();
        while flags.try_set(s.at(0).unwrap_or('\0')) {
            s.advance_by(1);
        }

        // Read field width. We do not support $.
        let width = if s.at(0) == Some('*') {
            let arg_width = args.next().ok_or(Error::MissingArg)?.as_sint()?;
            s.advance_by(1);
            // A negative width means left adjustment with the absolute value as width.
            if arg_width < 0 {
                flags.left_adj = true;
            }
            arg_width
                .unsigned_abs()
                .try_into()
                .map_err(|_| Error::Overflow)?
        } else {
            get_int(&mut s)?
        };

        // '-' and '0' flags are mutually exclusive. This must come after the width is
        // read, because a negative '*' width also requests left adjustment.
        if flags.left_adj {
            flags.zero_pad = false;
        }

        // Optionally read precision. We do not support $.
        let mut prec: Option<usize> = if s.at(0) == Some('.') && s.at(1) == Some('*') {
            // "A negative precision is treated as though it were missing."
            // Here we assume the precision is always signed.
            s.advance_by(2);
            let p = args.next().ok_or(Error::MissingArg)?.as_sint()?;
            p.try_into().ok()
        } else if s.at(0) == Some('.') {
            s.advance_by(1);
            Some(get_int(&mut s)?)
        } else {
            None
        };

        // Disallow precisions larger than i32::MAX, in keeping with C.
        if prec.unwrap_or(0) > i32::MAX as usize {
            return Err(Error::Overflow);
        }

        // Read out the format specifier and arg.
        let conv_spec = get_specifier(&mut s)?;
        let arg = args.next().ok_or(Error::MissingArg)?;
        let mut prefix = "";

        // Thousands grouping only works for d,u,i,f,F.
        // 'i' is mapped to 'd'.
        if flags.grouped && !matches!(conv_spec, CS::d | CS::u | CS::f | CS::F) {
            return Err(Error::BadFormatString);
        }

        // Disable zero-pad if we have an explicit precision.
        // "If a precision is given with a numeric conversion (d, i, o, u, i, x, and X),
        // the 0 flag is ignored." p is included here.
        let spec_is_numeric = matches!(conv_spec, CS::d | CS::u | CS::o | CS::p | CS::x | CS::X);
        if spec_is_numeric && prec.is_some() {
            flags.zero_pad = false;
        }

        // Apply the formatting. Some cases continue the main loop.
        // Note that numeric conversions must leave 'body' empty if the value is 0.
        let body: &str = match conv_spec {
            CS::n => {
                // Store the output length so far into the argument; emits nothing.
                arg.set_count(out_len)?;
                continue 'main;
            }
            CS::e | CS::f | CS::g | CS::a | CS::E | CS::F | CS::G | CS::A => {
                // Floating point types handle output on their own.
                let float = arg.as_float()?;
                let len = format_float(f, float, width, prec, flags, locale, conv_spec, buf)?;
                out_len = out_len.checked_add(len).ok_or(Error::Overflow)?;
                continue 'main;
            }
            CS::p => {
                // An explicit precision is raised to the full pointer width in hex digits.
                const PTR_HEX_DIGITS: usize = 2 * mem::size_of::<*const u8>();
                prec = prec.map(|p| p.max(PTR_HEX_DIGITS));
                let uint = arg.as_uint()?;
                if uint != 0 {
                    prefix = "0x";
                    write!(buf, "{:x}", uint)?;
                }
                buf
            }
            CS::x | CS::X => {
                // If someone passes us a negative value, format it with the width
                // we were given.
                let lower = conv_spec.is_lower();
                let (_, uint) = arg.as_wrapping_sint()?;
                if uint != 0 {
                    if flags.alt_form {
                        prefix = if lower { "0x" } else { "0X" };
                    }
                    if lower {
                        write!(buf, "{:x}", uint)?;
                    } else {
                        write!(buf, "{:X}", uint)?;
                    }
                }
                buf
            }
            CS::o => {
                let uint = arg.as_uint()?;
                if uint != 0 {
                    write!(buf, "{:o}", uint)?;
                }
                // The alternate form guarantees a leading zero, which we get by
                // raising the precision to one more than the number of digits.
                if flags.alt_form && prec.unwrap_or(0) <= buf.len() + 1 {
                    prec = Some(buf.len() + 1);
                }
                buf
            }
            CS::u => {
                let uint = arg.as_uint()?;
                if uint != 0 {
                    write!(buf, "{}", uint)?;
                }
                buf
            }
            CS::d => {
                let arg_i = arg.as_sint()?;
                if arg_i < 0 {
                    prefix = "-";
                } else if flags.mark_pos {
                    prefix = "+";
                } else if flags.pad_pos {
                    prefix = " ";
                }
                if arg_i != 0 {
                    write!(buf, "{}", arg_i.unsigned_abs())?;
                }
                buf
            }
            CS::c => {
                // also 'C'
                flags.zero_pad = false;
                // A precision is meaningless for a character (musl forces it to 1).
                // Drop it: a large precision would otherwise zero-pad the character,
                // and a small one would under-report the width.
                prec = None;
                buf.push(arg.as_char()?);
                buf
            }
            CS::s => {
                // also 'S'
                let text = arg.as_str(buf)?;
                // The precision limits the number of bytes output. Back up to a
                // character boundary so that we never slice (and panic) in the
                // middle of a multi-byte character. Offset 0 is always a boundary.
                let mut end = prec.unwrap_or(text.len()).min(text.len());
                while !text.is_char_boundary(end) {
                    end -= 1;
                }
                prec = Some(end);
                flags.zero_pad = false;
                &text[..end]
            }
        };

        // Numeric output should be empty iff the value is 0.
        if spec_is_numeric && body.is_empty() {
            debug_assert!(arg.as_uint().unwrap() == 0);
        }

        // Decide if we want to apply thousands grouping to the body, and compute its size.
        // Note we have already errored out if grouped is set and this is non-numeric.
        let wants_grouping = flags.grouped && locale.thousands_sep.is_some();
        let body_len = match wants_grouping {
            true => body.len() + locale.separator_count(body.len()),
            false => body.len(),
        };

        // Resolve the precision.
        // A non-numeric conversion without a precision uses the length of its body.
        // A numeric conversion has at least one digit, and never fewer than its body.
        let prec = if !spec_is_numeric {
            prec.unwrap_or(body_len)
        } else {
            prec.unwrap_or(1).max(body_len)
        };

        let prefix_len = prefix.len();
        let unpadded_width = prefix_len.checked_add(prec).ok_or(Error::Overflow)?;
        let width = width.max(unpadded_width);

        // Pad on the left with spaces to the desired width?
        if !flags.left_adj && !flags.zero_pad {
            pad(f, ' ', width, unpadded_width)?;
        }

        // Output any prefix.
        f.write_str(prefix)?;

        // Pad after the prefix with zeros to the desired width?
        if !flags.left_adj && flags.zero_pad {
            pad(f, '0', width, unpadded_width)?;
        }

        // Pad on the left to the given precision?
        pad(f, '0', prec, body_len)?;

        // Output the actual value, perhaps with grouping.
        if wants_grouping {
            f.write_str(&locale.apply_grouping(body))?;
        } else {
            f.write_str(body)?;
        }

        // Pad on the right with spaces if we are left adjusted?
        if flags.left_adj {
            pad(f, ' ', width, unpadded_width)?;
        }

        out_len = out_len.checked_add(width).ok_or(Error::Overflow)?;
    }

    // Too many args?
    if args.next().is_some() {
        return Err(Error::ExtraArg);
    }
    Ok(out_len)
}