od: start with multi-byte support

This commit is contained in:
Wim Hueskes 2016-08-01 16:39:58 +02:00
parent 1164b9e118
commit bd0424fa0c
4 changed files with 116 additions and 30 deletions

View file

@ -1,10 +1,18 @@
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[derive(Copy)]
/// Function pointer used to turn one item of the input into its printed text.
///
/// Each variant carries a formatter with a different input shape, so the
/// caller can pick how to hand over the data for the item being printed.
pub enum FormatWriter {
/// Formatter for an integer value passed as `u64`.
/// NOTE(review): the two `usize` arguments look like the item's byte size and
/// print width — confirm against the callers.
IntWriter(fn(u64, usize, usize) -> String),
/// Formatter for a value passed as `f64`.
FloatWriter(fn(f64) -> String),
/// Formatter that receives a byte slice instead of a single decoded value.
/// `print_bytes` hands it `&bytes[b..length]`, i.e. the current byte and
/// everything after it, so the formatter can look at following bytes.
MultibyteWriter(fn(&[u8]) -> String),
}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
// `FormatWriter` is `Copy`, so cloning is simply a copy of the value.
// NOTE(review): presumably written by hand because `#[derive(Clone)]` did not
// cover the function-pointer variants (notably `fn(&[u8]) -> String`) — confirm
// before replacing this with a derive.
impl Clone for FormatWriter {
#[inline]
fn clone(&self) -> Self {
*self
}
}
#[derive(Copy, Clone)]
pub struct FormatterItemInfo {
pub byte_size: usize,
pub print_width: usize, // including a space in front of the text

View file

@ -208,7 +208,7 @@ pub fn uumain(args: Vec<String>) -> i32 {
}
}
};
let min_bytes = formats.iter().fold(2, |max, next| cmp::max(max, next.byte_size));
let min_bytes = formats.iter().fold(1, |max, next| cmp::max(max, next.byte_size));
if line_bytes % min_bytes != 0 {
show_warning!("invalid width {}; using {} instead", line_bytes, min_bytes);
line_bytes = min_bytes;
@ -267,6 +267,7 @@ fn odfunc(line_bytes: usize, input_offset_base: Radix, byte_order: ByteOrder,
loop {
// print each line data (or multi-format raster of several lines describing the same data).
// TODO: we need to read more data in case a multi-byte sequence starts at the end of the line
match mf.f_read(bytes.as_mut_slice()) {
Ok(0) => {
@ -358,6 +359,9 @@ fn print_bytes(byte_order: ByteOrder, bytes: &[u8], length: usize, prefix: &str,
};
output_text.push_str(&func(p));
}
FormatWriter::MultibyteWriter(func) => {
output_text.push_str(&func(&bytes[b..length]));
}
}
b = nextb;
}

View file

@ -1,3 +1,4 @@
use std::str::from_utf8;
use formatteriteminfo::*;
pub static FORMAT_ITEM_A: FormatterItemInfo = FormatterItemInfo {
@ -9,13 +10,11 @@ pub static FORMAT_ITEM_A: FormatterItemInfo = FormatterItemInfo {
pub static FORMAT_ITEM_C: FormatterItemInfo = FormatterItemInfo {
byte_size: 1,
print_width: 4,
formatter: FormatWriter::IntWriter(format_item_c),
formatter: FormatWriter::MultibyteWriter(format_item_c),
};
// TODO: multi-byte chars
// Quoth the man page: Multi-byte characters are displayed in the area corresponding to the first byte of the character. The remaining bytes are shown as `**'.
static A_CHRS : [&'static str; 160] =
static A_CHRS : [&'static str; 128] =
["nul", "soh", "stx", "etx", "eot", "enq", "ack", "bel",
"bs", "ht", "nl", "vt", "ff", "cr", "so", "si",
"dle", "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb",
@ -31,21 +30,17 @@ static A_CHRS : [&'static str; 160] =
"`", "a", "b", "c", "d", "e", "f", "g",
"h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w",
"x", "y", "z", "{", "|", "}", "~", "del",
"80", "81", "82", "83", "84", "85", "86", "87",
"88", "89", "8a", "8b", "8c", "8d", "8e", "8f",
"90", "91", "92", "93", "94", "95", "96", "97",
"98", "99", "9a", "9b", "9c", "9d", "9e", "9f"];
"x", "y", "z", "{", "|", "}", "~", "del"];
pub fn format_item_a(p: u64, _: usize, _: usize) -> String {
fn format_item_a(p: u64, _: usize, _: usize) -> String {
// itembytes == 1
let b = (p & 0xff) as u8;
format!("{:>4}", A_CHRS.get(b as usize).unwrap_or(&"?") // XXX od does not actually do this, it just prints the byte
let b = (p & 0x7f) as u8;
format!("{:>4}", A_CHRS.get(b as usize).unwrap_or(&"??")
)
}
static C_CHRS : [&'static str; 127] = [
static C_CHRS : [&'static str; 128] = [
"\\0", "001", "002", "003", "004", "005", "006", "\\a",
"\\b", "\\t", "\\n", "\\v", "\\f", "\\r", "016", "017",
"020", "021", "022", "023", "024", "025", "026", "027",
@ -61,18 +56,94 @@ static C_CHRS : [&'static str; 127] = [
"`", "a", "b", "c", "d", "e", "f", "g",
"h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w",
"x", "y", "z", "{", "|", "}", "~" ];
"x", "y", "z", "{", "|", "}", "~", "177"];
pub fn format_item_c(p: u64, _: usize, _: usize) -> String {
fn format_item_c(bytes: &[u8]) -> String {
// itembytes == 1
let b = (p & 0xff) as usize;
let b = bytes[0];
if b < C_CHRS.len() {
if b & 0x80 == 0x00 {
match C_CHRS.get(b as usize) {
Some(s) => format!("{:>4}", s),
None => format!("{:>4}", b),
}
}
else { String::new() }
else if (b & 0xc0) == 0x80 {
// second or subsequent octet of a UTF-8 sequence
String::from(" **")
}
else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) {
// start of a 2 octet utf-8 sequence
match from_utf8(&bytes[0..2]) {
Ok(s) => { format!("{:>4}", s) },
Err(_) => { format!(" {:03o}", b) },
}
}
else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) {
// start of a 3 octet utf-8 sequence
match from_utf8(&bytes[0..3]) {
Ok(s) => { format!("{:>4}", s) },
Err(_) => { format!(" {:03o}", b) },
}
}
else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) {
// start of a 4 octet utf-8 sequence
match from_utf8(&bytes[0..4]) {
Ok(s) => { format!("{:>4}", s) },
Err(_) => { format!(" {:03o}", b) },
}
}
else {
// invalid utf-8
format!(" {:03o}", b)
}
}
#[test]
fn test_format_item_a() {
// 7-bit values: control characters print as their name, printable
// characters print as themselves.
assert_eq!(" nul", format_item_a(0x00, 1, 4));
assert_eq!(" soh", format_item_a(0x01, 1, 4));
assert_eq!(" sp", format_item_a(0x20, 1, 4));
assert_eq!(" A", format_item_a(0x41, 1, 4));
assert_eq!(" ~", format_item_a(0x7e, 1, 4));
assert_eq!(" del", format_item_a(0x7f, 1, 4));
// Values with the high bit set are expected to produce the same text as the
// matching 7-bit value (0x80 -> nul, 0xc1 -> A, 0xfe -> ~, 0xff -> del).
assert_eq!(" nul", format_item_a(0x80, 1, 4));
assert_eq!(" A", format_item_a(0xc1, 1, 4));
assert_eq!(" ~", format_item_a(0xfe, 1, 4));
assert_eq!(" del", format_item_a(0xff, 1, 4));
}
#[test]
fn test_format_item_c() {
// Single ASCII bytes: escapes, octal codes and printable characters.
assert_eq!(" \\0", format_item_c(&[0x00]));
assert_eq!(" 001", format_item_c(&[0x01]));
assert_eq!(" ", format_item_c(&[0x20]));
assert_eq!(" A", format_item_c(&[0x41]));
assert_eq!(" ~", format_item_c(&[0x7e]));
assert_eq!(" 177", format_item_c(&[0x7f]));
// Bytes following an ASCII byte do not change the result.
assert_eq!(" A", format_item_c(&[0x41, 0x21]));
// A continuation byte on its own is shown as "**".
assert_eq!(" **", format_item_c(&[0x80]));
assert_eq!(" **", format_item_c(&[0x9f]));
// Valid 2-, 3- and 4-byte sequences print the decoded character, with or
// without an extra byte after the sequence.
assert_eq!(" ß", format_item_c(&[0xc3, 0x9f]));
assert_eq!(" ß", format_item_c(&[0xc3, 0x9f, 0x21]));
assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80]));
assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21]));
assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21]));
// Invalid or incomplete sequences fall back to the octal value of the first byte.
assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (MUTF-8 null)
assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8
assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8
assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong)
assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet)
assert_eq!(" \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8
assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8
assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8
assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8
}

View file

@ -243,18 +243,21 @@ fn test_f64(){
assert_eq!(result.stdout, expected_output);
}
// We don't support multibyte chars, so big NEIN to this
/*
#[test]
fn mit_die_umlauten_getesten() {
let result = new_ucmd!()
.run_piped_stdin("Universität Tübingen".as_bytes());
fn test_multibyte() {
// TODO: replace **** with \u{1B000}
let result = new_ucmd!().arg("-c").arg("-w12").run_piped_stdin("Universität Tübingen ****".as_bytes());
assert_empty_stderr!(result);
assert!(result.success);
assert_eq!(result.stdout,
"0000000 U n i v e r s i t ä ** t T ü **\n0000020 b i n g e n\n0000026")
assert_eq!(result.stdout, unindent("
0000000 U n i v e r s i t ä ** t
0000014 T ü ** b i n g e n *
0000030 * * *
0000033
"));
}
*/
#[test]
fn test_width(){
@ -358,7 +361,7 @@ fn test_alignment_Xxa() {
let expected_output = unindent("
0000000 66650d0a 9f9e0067
0d0a 6665 0067 9f9e
nl cr e f g nul 9e 9f
nl cr e f g nul rs us
0000010
");