From bd0424fa0cc1ad25ed5c1d5538d39d1348bbd3de Mon Sep 17 00:00:00 2001
From: Wim Hueskes <rust@wimhueskes.eu>
Date: Mon, 1 Aug 2016 16:39:58 +0200
Subject: [PATCH] od: start with multi-byte support

---
 src/od/formatteriteminfo.rs |  12 +++-
 src/od/od.rs                |   6 +-
 src/od/prn_char.rs          | 107 ++++++++++++++++++++++++++++++------
 tests/test_od.rs            |  21 ++++---
 4 files changed, 116 insertions(+), 30 deletions(-)
diff --git a/src/od/formatteriteminfo.rs b/src/od/formatteriteminfo.rs
index a6bd6f5a6..dae0f63e9 100644
--- a/src/od/formatteriteminfo.rs
+++ b/src/od/formatteriteminfo.rs
@@ -1,10 +1,18 @@
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy)]
 pub enum FormatWriter {
     IntWriter(fn(u64, usize, usize) -> String),
     FloatWriter(fn(f64) -> String),
+    MultibyteWriter(fn(&[u8]) -> String), // formatter that receives raw bytes (a slice starting at the item) instead of a decoded number
 }
 
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+impl Clone for FormatWriter {
+    #[inline]
+    fn clone(&self) -> Self {
+        *self // FormatWriter derives Copy, so dereferencing yields an independent copy
+    }
+}
+
+#[derive(Copy, Clone)]
 pub struct FormatterItemInfo {
     pub byte_size: usize,
     pub print_width: usize,      // including a space in front of the text
diff --git a/src/od/od.rs b/src/od/od.rs
index efe83cd8c..74728bd70 100644
--- a/src/od/od.rs
+++ b/src/od/od.rs
@@ -208,7 +208,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
                 }
             }
         };
-        let min_bytes = formats.iter().fold(2, |max, next| cmp::max(max, next.byte_size));
+        let min_bytes = formats.iter().fold(1, |max, next| cmp::max(max, next.byte_size));
         if line_bytes % min_bytes != 0 {
             show_warning!("invalid width {}; using {} instead", line_bytes, min_bytes);
             line_bytes = min_bytes;
@@ -267,6 +267,7 @@ fn odfunc(line_bytes: usize, input_offset_base: Radix, byte_order: ByteOrder,
 
     loop {
         // print each line data (or multi-format raster of several lines describing the same data).
+        // TODO: read ahead past the end of the line when a multi-byte sequence starts near the end of the line and continues into the next one
 
         match mf.f_read(bytes.as_mut_slice()) {
             Ok(0) => {
@@ -358,6 +359,9 @@ fn print_bytes(byte_order: ByteOrder, bytes: &[u8], length: usize, prefix: &str,
                     };
                     output_text.push_str(&func(p));
                 }
+                FormatWriter::MultibyteWriter(func) => {
+                    output_text.push_str(&func(&bytes[b..length]));
+                }
             }
             b = nextb;
         }
diff --git a/src/od/prn_char.rs b/src/od/prn_char.rs
index d0811f107..f4d096aa8 100644
--- a/src/od/prn_char.rs
+++ b/src/od/prn_char.rs
@@ -1,3 +1,4 @@
+use std::str::from_utf8;
 use formatteriteminfo::*;
 
 pub static FORMAT_ITEM_A: FormatterItemInfo = FormatterItemInfo {
@@ -9,13 +10,11 @@ pub static FORMAT_ITEM_A: FormatterItemInfo = FormatterItemInfo {
 pub static FORMAT_ITEM_C: FormatterItemInfo = FormatterItemInfo {
     byte_size: 1,
     print_width: 4,
-    formatter: FormatWriter::IntWriter(format_item_c),
+    formatter: FormatWriter::MultibyteWriter(format_item_c), // byte-slice writer: format_item_c needs the following bytes to decode UTF-8 sequences
 };
 
-// TODO: multi-byte chars
-// Quoth the man page: Multi-byte characters are displayed in the area corresponding to the first byte of the character. The remaining bytes are shown as `**'.
 
-static A_CHRS : [&'static str; 160]  =
+static A_CHRS : [&'static str; 128]  =
 ["nul",   "soh",   "stx",   "etx",   "eot",   "enq",   "ack",   "bel",
  "bs",    "ht",   "nl",     "vt",    "ff",    "cr",    "so",    "si",
  "dle",   "dc1",   "dc2",   "dc3",   "dc4",   "nak",   "syn",   "etb",
@@ -31,21 +30,17 @@ static A_CHRS : [&'static str; 160]  =
   "`",     "a",     "b",     "c",     "d",     "e",     "f",     "g",
   "h",     "i",     "j",     "k",     "l",     "m",     "n",     "o",
   "p",     "q",     "r",     "s",     "t",     "u",     "v",     "w",
-  "x",     "y",     "z",     "{",     "|",     "}",     "~",   "del",
- "80",    "81",    "82",    "83",    "84",    "85",    "86",    "87",
- "88",    "89",    "8a",    "8b",    "8c",    "8d",    "8e",    "8f",
- "90",    "91",    "92",    "93",    "94",    "95",    "96",    "97",
- "98",    "99",    "9a",    "9b",    "9c",    "9d",    "9e",    "9f"];
+  "x",     "y",     "z",     "{",     "|",     "}",     "~",   "del"];
 
-pub fn format_item_a(p: u64, _: usize, _: usize) -> String {
+fn format_item_a(p: u64, _: usize, _: usize) -> String {
     // itembytes == 1
-    let b = (p & 0xff) as u8;
-    format!("{:>4}", A_CHRS.get(b as usize).unwrap_or(&"?") // XXX od dose not actually do this, it just prints the byte
+    let b = (p & 0x7f) as u8; // keep only the low 7 bits: the high bit is ignored, so 0x80..=0xff format like 0x00..=0x7f
+    format!("{:>4}", A_CHRS.get(b as usize).unwrap_or(&"??") // fallback cannot be hit: b <= 0x7f and A_CHRS has 128 entries
   )
 }
 
 
-static C_CHRS : [&'static str; 127]  = [
+static C_CHRS : [&'static str; 128]  = [
 "\\0",   "001",   "002",   "003",   "004",   "005",   "006",    "\\a",
 "\\b",    "\\t",  "\\n",   "\\v",    "\\f",    "\\r",   "016",   "017",
 "020",   "021",   "022",   "023",   "024",   "025",   "026",   "027",
@@ -61,18 +56,94 @@ static C_CHRS : [&'static str; 127]  = [
   "`",     "a",     "b",     "c",     "d",     "e",     "f",     "g",
   "h",     "i",     "j",     "k",     "l",     "m",     "n",     "o",
   "p",     "q",     "r",     "s",     "t",     "u",     "v",     "w",
-  "x",     "y",     "z",     "{",     "|",     "}",     "~" ];
+  "x",     "y",     "z",     "{",     "|",     "}",     "~",    "177"];
 
 
-pub fn format_item_c(p: u64, _: usize, _: usize) -> String {
+fn format_item_c(bytes: &[u8]) -> String { // formats bytes[0] as a 4-character cell; later bytes are only read to decode a UTF-8 sequence
     // itembytes == 1
-    let b = (p & 0xff) as usize;
+    let b = bytes[0]; // first byte of the item; indexing panics on an empty slice, so callers must pass at least one byte

-    if b < C_CHRS.len() {
+    if b & 0x80 == 0x00 { // high bit clear: 7-bit value, looked up directly in C_CHRS
         match C_CHRS.get(b as usize) {
             Some(s) => format!("{:>4}", s),
             None => format!("{:>4}", b),
         }
     }
-    else { String::new() }
+    else if (b & 0xc0) == 0x80 {
+        // continuation octet (10xxxxxx) of a UTF-8 sequence: always shown as "**", whatever preceded it
+        String::from("  **")
+    }
+    else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) {
+        // lead byte of a 2-octet UTF-8 sequence (110xxxxx); octal fallback if the 2 bytes are not valid UTF-8
+        match from_utf8(&bytes[0..2]) {
+            Ok(s) => { format!("{:>4}", s) },
+            Err(_) => { format!(" {:03o}", b) },
+        }
+    }
+    else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) {
+        // lead byte of a 3-octet UTF-8 sequence (1110xxxx); octal fallback if the 3 bytes are not valid UTF-8
+        match from_utf8(&bytes[0..3]) {
+            Ok(s) => { format!("{:>4}", s) },
+            Err(_) => { format!(" {:03o}", b) },
+        }
+    }
+    else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) {
+        // lead byte of a 4-octet UTF-8 sequence (11110xxx); octal fallback if the 4 bytes are not valid UTF-8
+        match from_utf8(&bytes[0..4]) {
+            Ok(s) => { format!("{:>4}", s) },
+            Err(_) => { format!(" {:03o}", b) },
+        }
+    }
+    else {
+        // not a valid lead byte, or the slice ends before the sequence is complete: print the byte in octal
+        format!(" {:03o}", b)
+    }
+}
+
+#[test]
+fn test_format_item_a() {
+    assert_eq!(" nul", format_item_a(0x00, 1, 4));
+    assert_eq!(" soh", format_item_a(0x01, 1, 4));
+    assert_eq!("  sp", format_item_a(0x20, 1, 4));
+    assert_eq!("   A", format_item_a(0x41, 1, 4));
+    assert_eq!("   ~", format_item_a(0x7e, 1, 4));
+    assert_eq!(" del", format_item_a(0x7f, 1, 4));
+    // bytes with the high bit set must format exactly like their low-7-bit counterparts
+    assert_eq!(" nul", format_item_a(0x80, 1, 4));
+    assert_eq!("   A", format_item_a(0xc1, 1, 4));
+    assert_eq!("   ~", format_item_a(0xfe, 1, 4));
+    assert_eq!(" del", format_item_a(0xff, 1, 4));
+}
+
+#[test]
+fn test_format_item_c() {
+    assert_eq!("  \\0", format_item_c(&[0x00]));
+    assert_eq!(" 001", format_item_c(&[0x01]));
+    assert_eq!("    ", format_item_c(&[0x20]));
+    assert_eq!("   A", format_item_c(&[0x41]));
+    assert_eq!("   ~", format_item_c(&[0x7e]));
+    assert_eq!(" 177", format_item_c(&[0x7f]));
+    assert_eq!("   A", format_item_c(&[0x41, 0x21]));
+    // a continuation octet at the start of the slice is shown as "**"
+    assert_eq!("  **", format_item_c(&[0x80]));
+    assert_eq!("  **", format_item_c(&[0x9f]));
+    // 2-octet sequences; bytes after the sequence are ignored
+    assert_eq!("   ß", format_item_c(&[0xc3, 0x9f]));
+    assert_eq!("   ß", format_item_c(&[0xc3, 0x9f, 0x21]));
+    // 3-octet sequences
+    assert_eq!("   \u{1000}", format_item_c(&[0xe1, 0x80, 0x80]));
+    assert_eq!("   \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21]));
+    // 4-octet sequences
+    assert_eq!("   \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
+    assert_eq!("   \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21]));
+    // edge cases: malformed or truncated input prints the first byte in octal; valid boundary values still decode
+    assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (MUTF-8 null)
+    assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8
+    assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8
+    assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong)
+    assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet)
+    assert_eq!("   \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8
+    assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8
+    assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8
+    assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8
 }
diff --git a/tests/test_od.rs b/tests/test_od.rs
index 96196d9e5..af4eca612 100644
--- a/tests/test_od.rs
+++ b/tests/test_od.rs
@@ -243,18 +243,21 @@ fn test_f64(){
     assert_eq!(result.stdout, expected_output);
 }
 
-// We don't support multibyte chars, so big NEIN to this
-/*
 #[test]
-fn mit_die_umlauten_getesten() {
-    let result = new_ucmd!()
-        .run_piped_stdin("Universität Tübingen".as_bytes());
+fn test_multibyte() {
+    // with -w12 neither "ä" nor "ü" straddles a line boundary in this input, so each shows up with its "**" on one line
+    // TODO: replace the literal "****" at the end of the input with \u{1B000} (a 4-byte UTF-8 character)
+    let result = new_ucmd!().arg("-c").arg("-w12").run_piped_stdin("Universität Tübingen ****".as_bytes());
+
     assert_empty_stderr!(result);
     assert!(result.success);
-    assert_eq!(result.stdout,
-    "0000000    U   n   i   v   e   r   s   i   t   ä  **   t       T   ü  **\n0000020    b   i   n   g   e   n\n0000026")
+    assert_eq!(result.stdout, unindent("
+            0000000   U   n   i   v   e   r   s   i   t   ä  **   t
+            0000014       T   ü  **   b   i   n   g   e   n       *
+            0000030   *   *   *
+            0000033
+            "));
 }
-*/
 
 #[test]
 fn test_width(){
@@ -358,7 +361,7 @@ fn test_alignment_Xxa() {
     let expected_output = unindent("
         0000000        66650d0a        9f9e0067
                    0d0a    6665    0067    9f9e
-                 nl  cr   e   f   g nul  9e  9f
+                 nl  cr   e   f   g nul  rs  us
         0000010
         ");