From 731aa6bbdd5a1df60d8ef2ce4333ba78f30f9a88 Mon Sep 17 00:00:00 2001 From: Darren Schroeder Date: Thu, 11 Jun 2020 19:37:43 -0500 Subject: [PATCH] use encoding on open for #1939 (#1949) * WIP - not compiling * compiling but panicking * still broken * nearly working * reverted deserializer_string changes updated enter.rs and open.rs to use Option<Tagged<String>> Accepted Clippy suggestions Accepted fmt suggestions Left original code from open.rs We may want to use some of it and only fall back to encoding. * Don't exit when there is an unknown encoding. * When encoding is unknown default to utf-8. * only do encoding if the user says to do it * merged some conflicts on open * made error messages consistent * Updated unwrap with expect * updated open test to pass with more descriptive err updated enter test to not fail * change _location to location * changed _visitor to visitor * Added a more verbose usage statement for encoding Linked to docs.rs/encoding_rs for details Co-authored-by: Darren Schroeder --- Cargo.lock | 1 + crates/nu-cli/Cargo.toml | 1 + crates/nu-cli/src/commands/enter.rs | 38 ++- crates/nu-cli/src/commands/open.rs | 329 +++++++++++++++++++++++++- crates/nu-cli/tests/commands/enter.rs | 2 +- crates/nu-cli/tests/commands/open.rs | 4 +- 6 files changed, 357 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1c29147fef..3cc8903b9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2247,6 +2247,7 @@ dependencies = [ "dirs 2.0.2", "dunce", "eml-parser", + "encoding_rs", "filesize", "futures 0.3.5", "futures-util", diff --git a/crates/nu-cli/Cargo.toml b/crates/nu-cli/Cargo.toml index 18652b40f7..dcd062a7f0 100644 --- a/crates/nu-cli/Cargo.toml +++ b/crates/nu-cli/Cargo.toml @@ -92,6 +92,7 @@ trash = { version = "1.0.1", optional = true } clipboard = { version = "0.5", optional = true } starship = { version = "0.41.3", optional = true } rayon = "1.3.0" +encoding_rs = "0.8.23" [target.'cfg(unix)'.dependencies] users = "0.10.0" diff --git
a/crates/nu-cli/src/commands/enter.rs b/crates/nu-cli/src/commands/enter.rs index deb17102f4..ae59a87308 100644 --- a/crates/nu-cli/src/commands/enter.rs +++ b/crates/nu-cli/src/commands/enter.rs @@ -14,6 +14,7 @@ pub struct Enter; #[derive(Deserialize)] pub struct EnterArgs { location: Tagged, + encoding: Option>, } #[async_trait] @@ -23,15 +24,29 @@ impl WholeStreamCommand for Enter { } fn signature(&self) -> Signature { - Signature::build("enter").required( - "location", - SyntaxShape::Path, - "the location to create a new shell from", - ) + Signature::build("enter") + .required( + "location", + SyntaxShape::Path, + "the location to create a new shell from", + ) + .named( + "encoding", + SyntaxShape::String, + "encoding to use to open file", + Some('e'), + ) } fn usage(&self) -> &str { - "Create a new shell and begin at this path." + r#"Create a new shell and begin at this path. + +Multiple encodings are supported for reading text files by using +the '--encoding ' parameter. Here is an example of a few: +big5, euc-jp, euc-kr, gbk, iso-8859-1, utf-16, cp1252, latin5 + +For a more complete list of encodings please refer to the encoding_rs +documentation link at https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics"# } async fn run( @@ -54,6 +69,11 @@ impl WholeStreamCommand for Enter { example: "enter package.json", result: None, }, + Example { + description: "Enters file with iso-8859-1 encoding", + example: "enter file.csv --encoding iso-8859-1", + result: None, + }, ] } } @@ -68,7 +88,7 @@ fn enter(raw_args: CommandArgs, registry: &CommandRegistry) -> Result Result e.to_string(), + _ => "".to_string() + } ).await?; match contents { diff --git a/crates/nu-cli/src/commands/open.rs b/crates/nu-cli/src/commands/open.rs index ce7e52f2ab..720ffa70d4 100644 --- a/crates/nu-cli/src/commands/open.rs +++ b/crates/nu-cli/src/commands/open.rs @@ -4,6 +4,12 @@ use nu_errors::ShellError; use nu_protocol::{CommandAction, ReturnSuccess, Signature, SyntaxShape, 
UntaggedValue}; use nu_source::{AnchorLocation, Span, Tagged}; use std::path::{Path, PathBuf}; +extern crate encoding_rs; +use encoding_rs::*; +use std::fs::File; +use std::io::BufWriter; +use std::io::Read; +use std::io::Write; pub struct Open; @@ -11,6 +17,7 @@ pub struct Open; pub struct OpenArgs { path: Tagged, raw: Tagged, + encoding: Option>, } #[async_trait] @@ -31,10 +38,23 @@ impl WholeStreamCommand for Open { "load content as a string instead of a table", Some('r'), ) + .named( + "encoding", + SyntaxShape::String, + "encoding to use to open file", + Some('e'), + ) } fn usage(&self) -> &str { - "Load a file into a cell, convert to table if possible (avoid by appending '--raw')" + r#"Load a file into a cell, convert to table if possible (avoid by appending '--raw'). + +Multiple encodings are supported for reading text files by using +the '--encoding ' parameter. Here is an example of a few: +big5, euc-jp, euc-kr, gbk, iso-8859-1, utf-16, cp1252, latin5 + +For a more complete list of encodings please refer to the encoding_rs +documentation link at https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics"# } async fn run( @@ -46,11 +66,32 @@ impl WholeStreamCommand for Open { } fn examples(&self) -> Vec { - vec![Example { - description: "Opens \"users.csv\" and creates a table from the data", - example: "open users.csv", - result: None, - }] + vec![ + Example { + description: "Opens \"users.csv\" and creates a table from the data", + example: "open users.csv", + result: None, + }, + Example { + description: "Opens file with iso-8859-1 encoding", + example: "open file.csv --encoding iso-8859-1 | from csv", + result: None, + }, + ] + } +} + +pub fn get_encoding(opt: Option) -> &'static Encoding { + match opt { + None => UTF_8, + Some(label) => match Encoding::for_label((&label).as_bytes()) { + None => { + //print!("{} is not a known encoding label. 
Trying UTF-8.", label); + //std::process::exit(-2); + get_encoding(Some("utf-8".to_string())) + } + Some(encoding) => encoding, + }, } } @@ -59,8 +100,19 @@ async fn open(args: CommandArgs, registry: &CommandRegistry) -> Result e.to_string(), + _ => "".to_string(), + }; + let result = fetch(&full_path, &path.item, path.tag.span, enc).await; let (file_extension, contents, contents_tag) = result?; @@ -87,9 +139,173 @@ pub async fn fetch( cwd: &PathBuf, location: &PathBuf, span: Span, + encoding: String, ) -> Result<(Option, UntaggedValue, Tag), ShellError> { let mut cwd = cwd.clone(); + let output_encoding: &Encoding = get_encoding(Some("utf-8".to_string())); + let input_encoding: &Encoding = get_encoding(Some(encoding.clone())); + let mut decoder = input_encoding.new_decoder(); + let mut encoder = output_encoding.new_encoder(); + let mut _file: File; + let buf = Vec::new(); + let mut bufwriter = BufWriter::new(buf); + cwd.push(Path::new(location)); + if let Ok(cwd) = dunce::canonicalize(&cwd) { + if !encoding.is_empty() { + // use the encoding string + match File::open(&Path::new(&cwd)) { + Ok(mut _file) => { + convert_via_utf8( + &mut decoder, + &mut encoder, + &mut _file, + &mut bufwriter, + false, + ); + //bufwriter.flush()?; + Ok(( + cwd.extension() + .map(|name| name.to_string_lossy().to_string()), + UntaggedValue::string(String::from_utf8_lossy(&bufwriter.buffer())), + Tag { + span, + anchor: Some(AnchorLocation::File(cwd.to_string_lossy().to_string())), + }, + )) + } + Err(_) => Err(ShellError::labeled_error( + format!("Cannot open {:?} for reading.", &cwd), + "file not found", + span, + )), + } + } else { + // Do the old stuff + match std::fs::read(&cwd) { + Ok(bytes) => match std::str::from_utf8(&bytes) { + Ok(s) => Ok(( + cwd.extension() + .map(|name| name.to_string_lossy().to_string()), + UntaggedValue::string(s), + Tag { + span, + anchor: Some(AnchorLocation::File(cwd.to_string_lossy().to_string())), + }, + )), + Err(_) => { + //Non utf8 data. 
+ match (bytes.get(0), bytes.get(1)) { + (Some(x), Some(y)) if *x == 0xff && *y == 0xfe => { + // Possibly UTF-16 little endian + let utf16 = read_le_u16(&bytes[2..]); + + if let Some(utf16) = utf16 { + match std::string::String::from_utf16(&utf16) { + Ok(s) => Ok(( + cwd.extension() + .map(|name| name.to_string_lossy().to_string()), + UntaggedValue::string(s), + Tag { + span, + anchor: Some(AnchorLocation::File( + cwd.to_string_lossy().to_string(), + )), + }, + )), + Err(_) => Ok(( + None, + UntaggedValue::binary(bytes), + Tag { + span, + anchor: Some(AnchorLocation::File( + cwd.to_string_lossy().to_string(), + )), + }, + )), + } + } else { + Ok(( + None, + UntaggedValue::binary(bytes), + Tag { + span, + anchor: Some(AnchorLocation::File( + cwd.to_string_lossy().to_string(), + )), + }, + )) + } + } + (Some(x), Some(y)) if *x == 0xfe && *y == 0xff => { + // Possibly UTF-16 big endian + let utf16 = read_be_u16(&bytes[2..]); + + if let Some(utf16) = utf16 { + match std::string::String::from_utf16(&utf16) { + Ok(s) => Ok(( + cwd.extension() + .map(|name| name.to_string_lossy().to_string()), + UntaggedValue::string(s), + Tag { + span, + anchor: Some(AnchorLocation::File( + cwd.to_string_lossy().to_string(), + )), + }, + )), + Err(_) => Ok(( + None, + UntaggedValue::binary(bytes), + Tag { + span, + anchor: Some(AnchorLocation::File( + cwd.to_string_lossy().to_string(), + )), + }, + )), + } + } else { + Ok(( + None, + UntaggedValue::binary(bytes), + Tag { + span, + anchor: Some(AnchorLocation::File( + cwd.to_string_lossy().to_string(), + )), + }, + )) + } + } + _ => Ok(( + None, + UntaggedValue::binary(bytes), + Tag { + span, + anchor: Some(AnchorLocation::File( + cwd.to_string_lossy().to_string(), + )), + }, + )), + } + } + }, + Err(_) => Err(ShellError::labeled_error( + format!("Cannot open {:?} for reading.", &cwd), + "file not found", + span, + )), + } + } + } else { + Err(ShellError::labeled_error( + format!("Cannot open {:?} for reading.", &cwd), + "file not 
found", + span, + )) + } + /* cwd.push(Path::new(location)); if let Ok(cwd) = dunce::canonicalize(cwd) { match std::fs::read(&cwd) { @@ -214,6 +430,103 @@ pub async fn fetch( span, )) } + */ +} + +fn convert_via_utf8( + decoder: &mut Decoder, + encoder: &mut Encoder, + read: &mut dyn Read, + write: &mut dyn Write, + last: bool, +) { + let mut input_buffer = [0u8; 2048]; + let mut intermediate_buffer_bytes = [0u8; 4096]; + // Is there a safe way to create a stack-allocated &mut str? + let mut intermediate_buffer: &mut str = + //unsafe { std::mem::transmute(&mut intermediate_buffer_bytes[..]) }; + std::str::from_utf8_mut(&mut intermediate_buffer_bytes[..]).expect("error with from_utf8_mut"); + let mut output_buffer = [0u8; 4096]; + let mut current_input_ended = false; + while !current_input_ended { + match read.read(&mut input_buffer) { + Err(_) => { + print!("Error reading input."); + //std::process::exit(-5); + } + Ok(decoder_input_end) => { + current_input_ended = decoder_input_end == 0; + let input_ended = last && current_input_ended; + let mut decoder_input_start = 0usize; + loop { + let (decoder_result, decoder_read, decoder_written, _) = decoder.decode_to_str( + &input_buffer[decoder_input_start..decoder_input_end], + &mut intermediate_buffer, + input_ended, + ); + decoder_input_start += decoder_read; + + let last_output = if input_ended { + match decoder_result { + CoderResult::InputEmpty => true, + CoderResult::OutputFull => false, + } + } else { + false + }; + + // Regardless of whether the intermediate buffer got full + // or the input buffer was exhausted, let's process what's + // in the intermediate buffer. + + if encoder.encoding() == UTF_8 { + // If the target is UTF-8, optimize out the encoder. 
+ if write + .write_all(&intermediate_buffer.as_bytes()[..decoder_written]) + .is_err() + { + print!("Error writing output."); + //std::process::exit(-7); + } + } else { + let mut encoder_input_start = 0usize; + loop { + let (encoder_result, encoder_read, encoder_written, _) = encoder + .encode_from_utf8( + &intermediate_buffer[encoder_input_start..decoder_written], + &mut output_buffer, + last_output, + ); + encoder_input_start += encoder_read; + if write.write_all(&output_buffer[..encoder_written]).is_err() { + print!("Error writing output."); + //std::process::exit(-6); + } + match encoder_result { + CoderResult::InputEmpty => { + break; + } + CoderResult::OutputFull => { + continue; + } + } + } + } + + // Now let's see if we should read again or process the + // rest of the current input buffer. + match decoder_result { + CoderResult::InputEmpty => { + break; + } + CoderResult::OutputFull => { + continue; + } + } + } + } + } + } } fn read_le_u16(input: &[u8]) -> Option> { diff --git a/crates/nu-cli/tests/commands/enter.rs b/crates/nu-cli/tests/commands/enter.rs index 2bc87e74ff..bbacd5a1fe 100644 --- a/crates/nu-cli/tests/commands/enter.rs +++ b/crates/nu-cli/tests/commands/enter.rs @@ -80,7 +80,7 @@ fn errors_if_file_not_found() { "enter i_dont_exist.csv" ); - assert!(actual.err.contains("File could not be opened")); + //assert!(actual.err.contains("File could not be opened")); assert!(actual.err.contains("file not found")); }) } diff --git a/crates/nu-cli/tests/commands/open.rs b/crates/nu-cli/tests/commands/open.rs index b009f98213..b24eb1dca6 100644 --- a/crates/nu-cli/tests/commands/open.rs +++ b/crates/nu-cli/tests/commands/open.rs @@ -225,6 +225,6 @@ fn errors_if_file_not_found() { "open i_dont_exist.txt" ); - assert!(actual.err.contains("File could not be opened")); - assert!(actual.err.contains("file not found")); + //assert!(actual.err.contains("File could not be opened")); + assert!(actual.err.contains("Cannot open")); }