od: start with multi-byte support

2024-11-16 17:58:06 +00:00 · 2016-08-01 16:39:58 +02:00 · 2016-08-01 16:39:58 +02:00 · bd0424fa0c
commit bd0424fa0c
parent 1164b9e118
4 changed files with 116 additions and 30 deletions
--- a/src/od/formatteriteminfo.rs
+++ b/src/od/formatteriteminfo.rs
@ -1,10 +1,18 @@
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy)]
 pub enum FormatWriter {
    IntWriter(fn(u64, usize, usize) -> String),
    FloatWriter(fn(f64) -> String),
+    MultibyteWriter(fn(&[u8]) -> String),
 }

-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+impl Clone for FormatWriter {
+    #[inline]
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+#[derive(Copy, Clone)]
 pub struct FormatterItemInfo {
    pub byte_size: usize,
    pub print_width: usize,      // including a space in front of the text
--- a/src/od/od.rs
+++ b/src/od/od.rs
@ -208,7 +208,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
                }
            }
        };
-        let min_bytes = formats.iter().fold(2, |max, next| cmp::max(max, next.byte_size));
+        let min_bytes = formats.iter().fold(1, |max, next| cmp::max(max, next.byte_size));
        if line_bytes % min_bytes != 0 {
            show_warning!("invalid width {}; using {} instead", line_bytes, min_bytes);
            line_bytes = min_bytes;
@ -267,6 +267,7 @@ fn odfunc(line_bytes: usize, input_offset_base: Radix, byte_order: ByteOrder,

    loop {
        // print each line data (or multi-format raster of several lines describing the same data).
+        // TODO: we need to read more data in case a multi-byte sequence starts at the end of the line

        match mf.f_read(bytes.as_mut_slice()) {
            Ok(0) => {
@ -358,6 +359,9 @@ fn print_bytes(byte_order: ByteOrder, bytes: &[u8], length: usize, prefix: &str,
                    };
                    output_text.push_str(&func(p));
                }
+                FormatWriter::MultibyteWriter(func) => {
+                    output_text.push_str(&func(&bytes[b..length]));
+                }
            }
            b = nextb;
        }
--- a/src/od/prn_char.rs
+++ b/src/od/prn_char.rs
@ -1,3 +1,4 @@
+use std::str::from_utf8;
 use formatteriteminfo::*;

 pub static FORMAT_ITEM_A: FormatterItemInfo = FormatterItemInfo {
@ -9,13 +10,11 @@ pub static FORMAT_ITEM_A: FormatterItemInfo = FormatterItemInfo {
 pub static FORMAT_ITEM_C: FormatterItemInfo = FormatterItemInfo {
    byte_size: 1,
    print_width: 4,
-    formatter: FormatWriter::IntWriter(format_item_c),
+    formatter: FormatWriter::MultibyteWriter(format_item_c),
 };

-// TODO: multi-byte chars
-// Quoth the man page: Multi-byte characters are displayed in the area corresponding to the first byte of the character. The remaining bytes are shown as `**'.

-static A_CHRS : [&'static str; 160]  =
+static A_CHRS : [&'static str; 128]  =
 ["nul",   "soh",   "stx",   "etx",   "eot",   "enq",   "ack",   "bel",
 "bs",    "ht",   "nl",     "vt",    "ff",    "cr",    "so",    "si",
 "dle",   "dc1",   "dc2",   "dc3",   "dc4",   "nak",   "syn",   "etb",
@ -31,21 +30,17 @@ static A_CHRS : [&'static str; 160]  =
  "`",     "a",     "b",     "c",     "d",     "e",     "f",     "g",
  "h",     "i",     "j",     "k",     "l",     "m",     "n",     "o",
  "p",     "q",     "r",     "s",     "t",     "u",     "v",     "w",
-  "x",     "y",     "z",     "{",     "|",     "}",     "~",   "del",
- "80",    "81",    "82",    "83",    "84",    "85",    "86",    "87",
- "88",    "89",    "8a",    "8b",    "8c",    "8d",    "8e",    "8f",
- "90",    "91",    "92",    "93",    "94",    "95",    "96",    "97",
- "98",    "99",    "9a",    "9b",    "9c",    "9d",    "9e",    "9f"];
+  "x",     "y",     "z",     "{",     "|",     "}",     "~",   "del"];

-pub fn format_item_a(p: u64, _: usize, _: usize) -> String {
+fn format_item_a(p: u64, _: usize, _: usize) -> String {
    // itembytes == 1
-    let b = (p & 0xff) as u8;
-    format!("{:>4}", A_CHRS.get(b as usize).unwrap_or(&"?") // XXX od dose not actually do this, it just prints the byte
+    let b = (p & 0x7f) as u8;
+    format!("{:>4}", A_CHRS.get(b as usize).unwrap_or(&"??")
  )
 }


-static C_CHRS : [&'static str; 127]  = [
+static C_CHRS : [&'static str; 128]  = [
 "\\0",   "001",   "002",   "003",   "004",   "005",   "006",    "\\a",
 "\\b",    "\\t",  "\\n",   "\\v",    "\\f",    "\\r",   "016",   "017",
 "020",   "021",   "022",   "023",   "024",   "025",   "026",   "027",
@ -61,18 +56,94 @@ static C_CHRS : [&'static str; 127]  = [
  "`",     "a",     "b",     "c",     "d",     "e",     "f",     "g",
  "h",     "i",     "j",     "k",     "l",     "m",     "n",     "o",
  "p",     "q",     "r",     "s",     "t",     "u",     "v",     "w",
-  "x",     "y",     "z",     "{",     "|",     "}",     "~" ];
+  "x",     "y",     "z",     "{",     "|",     "}",     "~",    "177"];


-pub fn format_item_c(p: u64, _: usize, _: usize) -> String {
+fn format_item_c(bytes: &[u8]) -> String {
    // itembytes == 1
-    let b = (p & 0xff) as usize;
+    let b = bytes[0];

-    if b < C_CHRS.len() {
+    if b & 0x80 == 0x00 {
        match C_CHRS.get(b as usize) {
            Some(s) => format!("{:>4}", s),
            None => format!("{:>4}", b),
        }
    }
-    else { String::new() }
+    else if (b & 0xc0) == 0x80 {
+        // second or subsequent octet of a utf-8 sequence
+        String::from("  **")
+    }
+    else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) {
+        // start of a 2 octet utf-8 sequence
+        match from_utf8(&bytes[0..2]) {
+            Ok(s) => { format!("{:>4}", s) },
+            Err(_) => { format!(" {:03o}", b) },
+        }
+    }
+    else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) {
+        // start of a 3 octet utf-8 sequence
+        match from_utf8(&bytes[0..3]) {
+            Ok(s) => { format!("{:>4}", s) },
+            Err(_) => { format!(" {:03o}", b) },
+        }
+    }
+    else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) {
+        // start of a 4 octet utf-8 sequence
+        match from_utf8(&bytes[0..4]) {
+            Ok(s) => { format!("{:>4}", s) },
+            Err(_) => { format!(" {:03o}", b) },
+        }
+    }
+    else {
+        // invalid utf-8
+        format!(" {:03o}", b)
+    }
+}
+
+#[test]
+fn test_format_item_a() {
+    assert_eq!(" nul", format_item_a(0x00, 1, 4));
+    assert_eq!(" soh", format_item_a(0x01, 1, 4));
+    assert_eq!("  sp", format_item_a(0x20, 1, 4));
+    assert_eq!("   A", format_item_a(0x41, 1, 4));
+    assert_eq!("   ~", format_item_a(0x7e, 1, 4));
+    assert_eq!(" del", format_item_a(0x7f, 1, 4));
+
+    assert_eq!(" nul", format_item_a(0x80, 1, 4));
+    assert_eq!("   A", format_item_a(0xc1, 1, 4));
+    assert_eq!("   ~", format_item_a(0xfe, 1, 4));
+    assert_eq!(" del", format_item_a(0xff, 1, 4));
+}
+
+#[test]
+fn test_format_item_c() {
+    assert_eq!("  \\0", format_item_c(&[0x00]));
+    assert_eq!(" 001", format_item_c(&[0x01]));
+    assert_eq!("    ", format_item_c(&[0x20]));
+    assert_eq!("   A", format_item_c(&[0x41]));
+    assert_eq!("   ~", format_item_c(&[0x7e]));
+    assert_eq!(" 177", format_item_c(&[0x7f]));
+    assert_eq!("   A", format_item_c(&[0x41, 0x21]));
+
+    assert_eq!("  **", format_item_c(&[0x80]));
+    assert_eq!("  **", format_item_c(&[0x9f]));
+
+    assert_eq!("   ß", format_item_c(&[0xc3, 0x9f]));
+    assert_eq!("   ß", format_item_c(&[0xc3, 0x9f, 0x21]));
+
+    assert_eq!("   \u{1000}", format_item_c(&[0xe1, 0x80, 0x80]));
+    assert_eq!("   \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21]));
+
+    assert_eq!("   \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
+    assert_eq!("   \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21]));
+
+    assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (MUTF-8 null)
+    assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8
+    assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8
+    assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong)
+    assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet)
+    assert_eq!("   \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8
+    assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8
+    assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8
+    assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8
 }
--- a/tests/test_od.rs
+++ b/tests/test_od.rs
@ -243,18 +243,21 @@ fn test_f64(){
    assert_eq!(result.stdout, expected_output);
 }

-// We don't support multibyte chars, so big NEIN to this
-/*
 #[test]
-fn mit_die_umlauten_getesten() {
-    let result = new_ucmd!()
-        .run_piped_stdin("Universität Tübingen".as_bytes());
+fn test_multibyte() {
+
+    // TODO: replace **** with \u{1B000}
+    let result = new_ucmd!().arg("-c").arg("-w12").run_piped_stdin("Universität Tübingen ****".as_bytes());
+
    assert_empty_stderr!(result);
    assert!(result.success);
-    assert_eq!(result.stdout,
-    "0000000    U   n   i   v   e   r   s   i   t   ä  **   t       T   ü  **\n0000020    b   i   n   g   e   n\n0000026")
+    assert_eq!(result.stdout, unindent("
+            0000000   U   n   i   v   e   r   s   i   t   ä  **   t
+            0000014       T   ü  **   b   i   n   g   e   n       *
+            0000030   *   *   *
+            0000033
+            "));
 }
-*/

 #[test]
 fn test_width(){
@ -358,7 +361,7 @@ fn test_alignment_Xxa() {
    let expected_output = unindent("
        0000000        66650d0a        9f9e0067
                   0d0a    6665    0067    9f9e
-                 nl  cr   e   f   g nul  9e  9f
+                 nl  cr   e   f   g nul  rs  us
        0000010
        ");