printf: basic support for unicode escape sequences

2024-12-13 23:02:38 +00:00 · 2023-11-20 13:45:02 +01:00 · 2023-11-20 13:45:02 +01:00 · 68d036c9a2
commit 68d036c9a2
parent 066d8ba73d
2 changed files with 44 additions and 20 deletions
--- a/src/uucore/src/lib/features/format/escape.rs
+++ b/src/uucore/src/lib/features/format/escape.rs
@ -1,6 +1,7 @@
 #[derive(Debug)]
 pub enum EscapedChar {
-    Char(u8),
+    Byte(u8),
+    Char(char),
    Backslash(u8),
    End,
 }
@ -61,6 +62,24 @@ fn parse_code(input: &mut &[u8], base: Base) -> Option<u8> {
    Some(ret)
 }

+/// Parse the `digits` hex digits following `\u` (4) or `\U` (8) into a `char`
+// TODO: This should print warnings and possibly halt execution when it fails to parse
+// TODO: If the parsed u32 is not a valid `char`, the input should be printed.
+fn parse_unicode(input: &mut &[u8], digits: u8) -> Option<char> {
+    let (c, rest) = input.split_first()?;
+    let mut ret = Base::Hex.to_digit(*c)? as u32;
+    *input = &rest[..];
+
+    for _ in 1..digits {
+        let (c, rest) = input.split_first()?;
+        let n = Base::Hex.to_digit(*c)?;
+        ret = ret.wrapping_mul(Base::Hex as u32).wrapping_add(n as u32);
+        *input = &rest[..];
+    }
+
+    char::from_u32(ret)
+}
+
 pub fn parse_escape_code(rest: &mut &[u8]) -> EscapedChar {
    if let [c, new_rest @ ..] = rest {
        // This is for the \NNN syntax for octal sequences.
@ -68,33 +87,35 @@ pub fn parse_escape_code(rest: &mut &[u8]) -> EscapedChar {
        // would be the \0NNN syntax.
        if let b'1'..=b'7' = c {
            if let Some(parsed) = parse_code(rest, Base::Oct) {
-                return EscapedChar::Char(parsed);
+                return EscapedChar::Byte(parsed);
            }
        }

        *rest = &new_rest[..];
        match c {
-            b'\\' => EscapedChar::Char(b'\\'),
-            b'a' => EscapedChar::Char(b'\x07'),
-            b'b' => EscapedChar::Char(b'\x08'),
+            b'\\' => EscapedChar::Byte(b'\\'),
+            b'a' => EscapedChar::Byte(b'\x07'),
+            b'b' => EscapedChar::Byte(b'\x08'),
            b'c' => return EscapedChar::End,
-            b'e' => EscapedChar::Char(b'\x1b'),
-            b'f' => EscapedChar::Char(b'\x0c'),
-            b'n' => EscapedChar::Char(b'\n'),
-            b'r' => EscapedChar::Char(b'\r'),
-            b't' => EscapedChar::Char(b'\t'),
-            b'v' => EscapedChar::Char(b'\x0b'),
+            b'e' => EscapedChar::Byte(b'\x1b'),
+            b'f' => EscapedChar::Byte(b'\x0c'),
+            b'n' => EscapedChar::Byte(b'\n'),
+            b'r' => EscapedChar::Byte(b'\r'),
+            b't' => EscapedChar::Byte(b'\t'),
+            b'v' => EscapedChar::Byte(b'\x0b'),
            b'x' => {
                if let Some(c) = parse_code(rest, Base::Hex) {
-                    EscapedChar::Char(c)
+                    EscapedChar::Byte(c)
                } else {
                    EscapedChar::Backslash(b'x')
                }
            }
-            b'0' => EscapedChar::Char(parse_code(rest, Base::Oct).unwrap_or(b'\0')),
+            b'0' => EscapedChar::Byte(parse_code(rest, Base::Oct).unwrap_or(b'\0')),
+            b'u' => EscapedChar::Char(parse_unicode(rest, 4).unwrap_or('\0')),
+            b'U' => EscapedChar::Char(parse_unicode(rest, 8).unwrap_or('\0')),
            c => EscapedChar::Backslash(*c),
        }
    } else {
-        EscapedChar::Char(b'\\')
+        EscapedChar::Byte(b'\\')
    }
 }
--- a/src/uucore/src/lib/features/format/mod.rs
+++ b/src/uucore/src/lib/features/format/mod.rs
@ -19,11 +19,12 @@

 // spell-checker:ignore (vars) charf decf floatf intf scif strf Cninety

-mod escape;
 mod argument;
+mod escape;
 pub mod num_format;
 mod spec;

+pub use argument::*;
 use spec::Spec;
 use std::{
    error::Error,
@ -31,7 +32,6 @@ use std::{
    io::{stdout, Write},
    ops::ControlFlow,
 };
-pub use argument::*;

 use crate::error::UError;

@ -91,9 +91,12 @@ impl FormatChar for u8 {
 impl FormatChar for EscapedChar {
    fn write(&self, mut writer: impl Write) -> std::io::Result<ControlFlow<()>> {
        match self {
-            EscapedChar::Char(c) => {
+            EscapedChar::Byte(c) => {
                writer.write(&[*c])?;
            }
+            EscapedChar::Char(c) => {
+                write!(writer, "{c}")?;
+            }
            EscapedChar::Backslash(c) => {
                writer.write(&[b'\\', *c])?;
            }
@ -125,7 +128,7 @@ pub fn parse_spec_and_escape(
        [] => return None,
        [b'%', b'%', rest @ ..] => {
            current = rest;
-            Some(Ok(FormatItem::Char(EscapedChar::Char(b'%'))))
+            Some(Ok(FormatItem::Char(EscapedChar::Byte(b'%'))))
        }
        [b'%', rest @ ..] => {
            current = rest;
@ -141,7 +144,7 @@ pub fn parse_spec_and_escape(
        }
        [c, rest @ ..] => {
            current = rest;
-            Some(Ok(FormatItem::Char(EscapedChar::Char(*c))))
+            Some(Ok(FormatItem::Char(EscapedChar::Byte(*c))))
        }
    })
 }
@ -179,7 +182,7 @@ fn parse_escape_only(fmt: &[u8]) -> impl Iterator<Item = EscapedChar> + '_ {
        }
        [c, rest @ ..] => {
            current = rest;
-            Some(EscapedChar::Char(*c))
+            Some(EscapedChar::Byte(*c))
        }
    })
 }