From bd0424fa0cc1ad25ed5c1d5538d39d1348bbd3de Mon Sep 17 00:00:00 2001 From: Wim Hueskes Date: Mon, 1 Aug 2016 16:39:58 +0200 Subject: [PATCH] od: start with multi-byte support --- src/od/formatteriteminfo.rs | 12 +++- src/od/od.rs | 6 +- src/od/prn_char.rs | 107 ++++++++++++++++++++++++++++++------ tests/test_od.rs | 21 ++++--- 4 files changed, 116 insertions(+), 30 deletions(-) diff --git a/src/od/formatteriteminfo.rs b/src/od/formatteriteminfo.rs index a6bd6f5a6..dae0f63e9 100644 --- a/src/od/formatteriteminfo.rs +++ b/src/od/formatteriteminfo.rs @@ -1,10 +1,18 @@ -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy)] pub enum FormatWriter { IntWriter(fn(u64, usize, usize) -> String), FloatWriter(fn(f64) -> String), + MultibyteWriter(fn(&[u8]) -> String), } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +impl Clone for FormatWriter { + #[inline] + fn clone(&self) -> Self { + *self + } +} + +#[derive(Copy, Clone)] pub struct FormatterItemInfo { pub byte_size: usize, pub print_width: usize, // including a space in front of the text diff --git a/src/od/od.rs b/src/od/od.rs index efe83cd8c..74728bd70 100644 --- a/src/od/od.rs +++ b/src/od/od.rs @@ -208,7 +208,7 @@ pub fn uumain(args: Vec) -> i32 { } } }; - let min_bytes = formats.iter().fold(2, |max, next| cmp::max(max, next.byte_size)); + let min_bytes = formats.iter().fold(1, |max, next| cmp::max(max, next.byte_size)); if line_bytes % min_bytes != 0 { show_warning!("invalid width {}; using {} instead", line_bytes, min_bytes); line_bytes = min_bytes; @@ -267,6 +267,7 @@ fn odfunc(line_bytes: usize, input_offset_base: Radix, byte_order: ByteOrder, loop { // print each line data (or multi-format raster of several lines describing the same data). + // TODO: we need to read more data in case a multi-byte sequence starts at the end of the line match mf.f_read(bytes.as_mut_slice()) { Ok(0) => { @@ -358,6 +359,9 @@ fn print_bytes(byte_order: ByteOrder, bytes: &[u8], length: usize, prefix: &str, }; output_text.push_str(&func(p)); } + FormatWriter::MultibyteWriter(func) => { + output_text.push_str(&func(&bytes[b..length])); + } } b = nextb; } diff --git a/src/od/prn_char.rs b/src/od/prn_char.rs index d0811f107..f4d096aa8 100644 --- a/src/od/prn_char.rs +++ b/src/od/prn_char.rs @@ -1,3 +1,4 @@ +use std::str::from_utf8; use formatteriteminfo::*; pub static FORMAT_ITEM_A: FormatterItemInfo = FormatterItemInfo { @@ -9,13 +10,11 @@ pub static FORMAT_ITEM_A: FormatterItemInfo = FormatterItemInfo { pub static FORMAT_ITEM_C: FormatterItemInfo = FormatterItemInfo { byte_size: 1, print_width: 4, - formatter: FormatWriter::IntWriter(format_item_c), + formatter: FormatWriter::MultibyteWriter(format_item_c), }; -// TODO: multi-byte chars -// Quoth the man page: Multi-byte characters are displayed in the area corresponding to the first byte of the character. The remaining bytes are shown as `**'. -static A_CHRS : [&'static str; 160] = +static A_CHRS : [&'static str; 128] = ["nul", "soh", "stx", "etx", "eot", "enq", "ack", "bel", "bs", "ht", "nl", "vt", "ff", "cr", "so", "si", "dle", "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb", @@ -31,21 +30,17 @@ static A_CHRS : [&'static str; 160] = "`", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "{", "|", "}", "~", "del", - "80", "81", "82", "83", "84", "85", "86", "87", - "88", "89", "8a", "8b", "8c", "8d", "8e", "8f", - "90", "91", "92", "93", "94", "95", "96", "97", - "98", "99", "9a", "9b", "9c", "9d", "9e", "9f"]; + "x", "y", "z", "{", "|", "}", "~", "del"]; -pub fn format_item_a(p: u64, _: usize, _: usize) -> String { +fn format_item_a(p: u64, _: usize, _: usize) -> String { // itembytes == 1 - let b = (p & 0xff) as u8; - format!("{:>4}", A_CHRS.get(b as usize).unwrap_or(&"?") // XXX od dose not actually do this, it just prints the byte + let b = (p & 0x7f) as u8; + format!("{:>4}", A_CHRS.get(b as usize).unwrap_or(&"??") ) } -static C_CHRS : [&'static str; 127] = [ +static C_CHRS : [&'static str; 128] = [ "\\0", "001", "002", "003", "004", "005", "006", "\\a", "\\b", "\\t", "\\n", "\\v", "\\f", "\\r", "016", "017", "020", "021", "022", "023", "024", "025", "026", "027", @@ -61,18 +56,94 @@ static C_CHRS : [&'static str; 127] = [ "`", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "{", "|", "}", "~" ]; + "x", "y", "z", "{", "|", "}", "~", "177"]; -pub fn format_item_c(p: u64, _: usize, _: usize) -> String { +fn format_item_c(bytes: &[u8]) -> String { // itembytes == 1 - let b = (p & 0xff) as usize; + let b = bytes[0]; - if b < C_CHRS.len() { + if b & 0x80 == 0x00 { match C_CHRS.get(b as usize) { Some(s) => format!("{:>4}", s), None => format!("{:>4}", b), } } - else { String::new() } + else if (b & 0xc0) == 0x80 { + // second or subsequent octet of an utf-8 sequence + String::from(" **") + } + else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) { + // start of a 2 octet utf-8 sequence + match from_utf8(&bytes[0..2]) { + Ok(s) => { format!("{:>4}", s) }, + Err(_) => { format!(" {:03o}", b) }, + } + } + else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) { + // start of a 3 octet utf-8 sequence + match from_utf8(&bytes[0..3]) { + Ok(s) => { format!("{:>4}", s) }, + Err(_) => { format!(" {:03o}", b) }, + } + } + else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) { + // start of a 4 octet utf-8 sequence + match from_utf8(&bytes[0..4]) { + Ok(s) => { format!("{:>4}", s) }, + Err(_) => { format!(" {:03o}", b) }, + } + } + else { + // invalid utf-8 + format!(" {:03o}", b) + } +} + +#[test] +fn test_format_item_a() { + assert_eq!(" nul", format_item_a(0x00, 1, 4)); + assert_eq!(" soh", format_item_a(0x01, 1, 4)); + assert_eq!(" sp", format_item_a(0x20, 1, 4)); + assert_eq!(" A", format_item_a(0x41, 1, 4)); + assert_eq!(" ~", format_item_a(0x7e, 1, 4)); + assert_eq!(" del", format_item_a(0x7f, 1, 4)); + + assert_eq!(" nul", format_item_a(0x80, 1, 4)); + assert_eq!(" A", format_item_a(0xc1, 1, 4)); + assert_eq!(" ~", format_item_a(0xfe, 1, 4)); + assert_eq!(" del", format_item_a(0xff, 1, 4)); +} + +#[test] +fn test_format_item_c() { + assert_eq!(" \\0", format_item_c(&[0x00])); + assert_eq!(" 001", format_item_c(&[0x01])); + assert_eq!(" ", format_item_c(&[0x20])); + assert_eq!(" A", format_item_c(&[0x41])); + assert_eq!(" ~", format_item_c(&[0x7e])); + assert_eq!(" 177", format_item_c(&[0x7f])); + assert_eq!(" A", format_item_c(&[0x41, 0x21])); + + assert_eq!(" **", format_item_c(&[0x80])); + assert_eq!(" **", format_item_c(&[0x9f])); + + assert_eq!(" ß", format_item_c(&[0xc3, 0x9f])); + assert_eq!(" ß", format_item_c(&[0xc3, 0x9f, 0x21])); + + assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80])); + assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21])); + + assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96])); + assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21])); + + assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (MUTF-8 null) + assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8 + assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8 + assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong) + assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet) + assert_eq!(" \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 + assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8 + assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8 + assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8 } diff --git a/tests/test_od.rs b/tests/test_od.rs index 96196d9e5..af4eca612 100644 --- a/tests/test_od.rs +++ b/tests/test_od.rs @@ -243,18 +243,21 @@ fn test_f64(){ assert_eq!(result.stdout, expected_output); } -// We don't support multibyte chars, so big NEIN to this -/* #[test] -fn mit_die_umlauten_getesten() { - let result = new_ucmd!() - .run_piped_stdin("Universität Tübingen".as_bytes()); +fn test_multibyte() { + + // TODO: replace **** with \u{1B000} + let result = new_ucmd!().arg("-c").arg("-w12").run_piped_stdin("Universität Tübingen ****".as_bytes()); + assert_empty_stderr!(result); assert!(result.success); - assert_eq!(result.stdout, - "0000000 U n i v e r s i t ä ** t T ü **\n0000020 b i n g e n\n0000026") + assert_eq!(result.stdout, unindent(" + 0000000 U n i v e r s i t ä ** t + 0000014 T ü ** b i n g e n * + 0000030 * * * + 0000033 + ")); } -*/ #[test] fn test_width(){ @@ -358,7 +361,7 @@ fn test_alignment_Xxa() { let expected_output = unindent(" 0000000 66650d0a 9f9e0067 0d0a 6665 0067 9f9e - nl cr e f g nul 9e 9f + nl cr e f g nul rs us 0000010 ");