use encoding on open for #1939 (#1949)

* WIP - not compiling

* compiling but panicking

* still broken

* nearly working

* reverted deserializer_string changes
updated enter.rs and open.rs to use Option<Tagged<String>>
Accepted Clippy suggestions
Accepted fmt suggestions
Left original code from open.rs
 We may want to use some of it and only fall back to encoding.

* Don't exit when there is an unknown encoding.

* When encoding is unknown default to utf-8.

* only do encoding if the user asks for it

* merged some conflicts on open

* made error messages consistent

* Updated unwrap with expect

* updated open test to pass with more descriptive err
updated enter test to not fail

* change _location to location

* changed _visitor to visitor

* Added a more verbose usage statement for encoding
Linked to docs.rs/encoding_rs for details

Co-authored-by: Darren Schroeder <fdncred@hotmail.com>
This commit is contained in:
Darren Schroeder 2020-06-11 19:37:43 -05:00 committed by GitHub
parent a268e825aa
commit 731aa6bbdd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 357 additions and 18 deletions

1
Cargo.lock generated
View file

@ -2247,6 +2247,7 @@ dependencies = [
"dirs 2.0.2",
"dunce",
"eml-parser",
"encoding_rs",
"filesize",
"futures 0.3.5",
"futures-util",

View file

@ -92,6 +92,7 @@ trash = { version = "1.0.1", optional = true }
clipboard = { version = "0.5", optional = true }
starship = { version = "0.41.3", optional = true }
rayon = "1.3.0"
encoding_rs = "0.8.23"
[target.'cfg(unix)'.dependencies]
users = "0.10.0"

View file

@ -14,6 +14,7 @@ pub struct Enter;
#[derive(Deserialize)]
pub struct EnterArgs {
location: Tagged<PathBuf>,
encoding: Option<Tagged<String>>,
}
#[async_trait]
@ -23,15 +24,29 @@ impl WholeStreamCommand for Enter {
}
fn signature(&self) -> Signature {
Signature::build("enter").required(
Signature::build("enter")
.required(
"location",
SyntaxShape::Path,
"the location to create a new shell from",
)
.named(
"encoding",
SyntaxShape::String,
"encoding to use to open file",
Some('e'),
)
}
fn usage(&self) -> &str {
"Create a new shell and begin at this path."
r#"Create a new shell and begin at this path.
Multiple encodings are supported for reading text files by using
the '--encoding <encoding>' parameter. Here is an example of a few:
big5, euc-jp, euc-kr, gbk, iso-8859-1, utf-16, cp1252, latin5
For a more complete list of encodings please refer to the encoding_rs
documentation link at https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics"#
}
async fn run(
@ -54,6 +69,11 @@ impl WholeStreamCommand for Enter {
example: "enter package.json",
result: None,
},
Example {
description: "Enters file with iso-8859-1 encoding",
example: "enter file.csv --encoding iso-8859-1",
result: None,
},
]
}
}
@ -68,7 +88,7 @@ fn enter(raw_args: CommandArgs, registry: &CommandRegistry) -> Result<OutputStre
let current_errors = raw_args.current_errors.clone();
let host = raw_args.host.clone();
let tag = raw_args.call_info.name_tag.clone();
let (EnterArgs { location }, _) = raw_args.process(&registry).await?;
let (EnterArgs { location, encoding }, _) = raw_args.process(&registry).await?;
let location_string = location.display().to_string();
let location_clone = location_string.clone();
@ -103,6 +123,10 @@ fn enter(raw_args: CommandArgs, registry: &CommandRegistry) -> Result<OutputStre
&full_path,
&PathBuf::from(location_clone),
tag.span,
match encoding {
Some(e) => e.to_string(),
_ => "".to_string()
}
).await?;
match contents {

View file

@ -4,6 +4,12 @@ use nu_errors::ShellError;
use nu_protocol::{CommandAction, ReturnSuccess, Signature, SyntaxShape, UntaggedValue};
use nu_source::{AnchorLocation, Span, Tagged};
use std::path::{Path, PathBuf};
extern crate encoding_rs;
use encoding_rs::*;
use std::fs::File;
use std::io::BufWriter;
use std::io::Read;
use std::io::Write;
pub struct Open;
@ -11,6 +17,7 @@ pub struct Open;
pub struct OpenArgs {
path: Tagged<PathBuf>,
raw: Tagged<bool>,
encoding: Option<Tagged<String>>,
}
#[async_trait]
@ -31,10 +38,23 @@ impl WholeStreamCommand for Open {
"load content as a string instead of a table",
Some('r'),
)
.named(
"encoding",
SyntaxShape::String,
"encoding to use to open file",
Some('e'),
)
}
fn usage(&self) -> &str {
"Load a file into a cell, convert to table if possible (avoid by appending '--raw')"
r#"Load a file into a cell, convert to table if possible (avoid by appending '--raw').
Multiple encodings are supported for reading text files by using
the '--encoding <encoding>' parameter. Here is an example of a few:
big5, euc-jp, euc-kr, gbk, iso-8859-1, utf-16, cp1252, latin5
For a more complete list of encodings please refer to the encoding_rs
documentation link at https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics"#
}
async fn run(
@ -46,11 +66,32 @@ impl WholeStreamCommand for Open {
}
fn examples(&self) -> Vec<Example> {
vec![Example {
vec![
Example {
description: "Opens \"users.csv\" and creates a table from the data",
example: "open users.csv",
result: None,
}]
},
Example {
description: "Opens file with iso-8859-1 encoding",
example: "open file.csv --encoding iso-8859-1 | from csv",
result: None,
},
]
}
}
/// Resolves an optional encoding label to an `encoding_rs` encoding.
///
/// The label is looked up with `Encoding::for_label`. When no label is
/// supplied, or the label is not recognised, `UTF_8` is returned instead of
/// reporting an error.
pub fn get_encoding(opt: Option<String>) -> &'static Encoding {
    opt.and_then(|label| Encoding::for_label(label.as_bytes()))
        .unwrap_or(UTF_8)
}
@ -59,8 +100,19 @@ async fn open(args: CommandArgs, registry: &CommandRegistry) -> Result<OutputStr
let full_path = cwd;
let registry = registry.clone();
let (OpenArgs { path, raw }, _) = args.process(&registry).await?;
let result = fetch(&full_path, &path.item, path.tag.span).await;
let (
OpenArgs {
path,
raw,
encoding,
},
_,
) = args.process(&registry).await?;
let enc = match encoding {
Some(e) => e.to_string(),
_ => "".to_string(),
};
let result = fetch(&full_path, &path.item, path.tag.span, enc).await;
let (file_extension, contents, contents_tag) = result?;
@ -87,9 +139,173 @@ pub async fn fetch(
cwd: &PathBuf,
location: &PathBuf,
span: Span,
encoding: String,
) -> Result<(Option<String>, UntaggedValue, Tag), ShellError> {
let mut cwd = cwd.clone();
let output_encoding: &Encoding = get_encoding(Some("utf-8".to_string()));
let input_encoding: &Encoding = get_encoding(Some(encoding.clone()));
let mut decoder = input_encoding.new_decoder();
let mut encoder = output_encoding.new_encoder();
let mut _file: File;
let buf = Vec::new();
let mut bufwriter = BufWriter::new(buf);
cwd.push(Path::new(location));
if let Ok(cwd) = dunce::canonicalize(&cwd) {
if !encoding.is_empty() {
// use the encoding string
match File::open(&Path::new(&cwd)) {
Ok(mut _file) => {
convert_via_utf8(
&mut decoder,
&mut encoder,
&mut _file,
&mut bufwriter,
false,
);
//bufwriter.flush()?;
Ok((
cwd.extension()
.map(|name| name.to_string_lossy().to_string()),
UntaggedValue::string(String::from_utf8_lossy(&bufwriter.buffer())),
Tag {
span,
anchor: Some(AnchorLocation::File(cwd.to_string_lossy().to_string())),
},
))
}
Err(_) => Err(ShellError::labeled_error(
format!("Cannot open {:?} for reading.", &cwd),
"file not found",
span,
)),
}
} else {
// Do the old stuff
match std::fs::read(&cwd) {
Ok(bytes) => match std::str::from_utf8(&bytes) {
Ok(s) => Ok((
cwd.extension()
.map(|name| name.to_string_lossy().to_string()),
UntaggedValue::string(s),
Tag {
span,
anchor: Some(AnchorLocation::File(cwd.to_string_lossy().to_string())),
},
)),
Err(_) => {
//Non utf8 data.
match (bytes.get(0), bytes.get(1)) {
(Some(x), Some(y)) if *x == 0xff && *y == 0xfe => {
// Possibly UTF-16 little endian
let utf16 = read_le_u16(&bytes[2..]);
if let Some(utf16) = utf16 {
match std::string::String::from_utf16(&utf16) {
Ok(s) => Ok((
cwd.extension()
.map(|name| name.to_string_lossy().to_string()),
UntaggedValue::string(s),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
Err(_) => Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
}
} else {
Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
))
}
}
(Some(x), Some(y)) if *x == 0xfe && *y == 0xff => {
// Possibly UTF-16 big endian
let utf16 = read_be_u16(&bytes[2..]);
if let Some(utf16) = utf16 {
match std::string::String::from_utf16(&utf16) {
Ok(s) => Ok((
cwd.extension()
.map(|name| name.to_string_lossy().to_string()),
UntaggedValue::string(s),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
Err(_) => Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
}
} else {
Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
))
}
}
_ => Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
}
}
},
Err(_) => Err(ShellError::labeled_error(
format!("Cannot open {:?} for reading.", &cwd),
"file not found",
span,
)),
}
}
} else {
Err(ShellError::labeled_error(
format!("Cannot open {:?} for reading.", &cwd),
"file not found",
span,
))
}
/*
cwd.push(Path::new(location));
if let Ok(cwd) = dunce::canonicalize(cwd) {
match std::fs::read(&cwd) {
@ -214,6 +430,103 @@ pub async fn fetch(
span,
))
}
*/
}
/// Streams bytes from `read` through `decoder` into UTF-8 text and then
/// through `encoder` into `write`.
///
/// * `decoder` - decoder for the input encoding.
/// * `encoder` - encoder for the output encoding; when its encoding is
///   `UTF_8` the decoded text is written directly and the encoder is skipped.
/// * `read` - source of the encoded bytes; consumed until it reports
///   end of input (a read of zero bytes).
/// * `write` - destination for the re-encoded bytes.
/// * `last` - whether this reader is the final piece of input; only then are
///   the decoder and encoder told that the stream has ended.
///
/// Errors are reported by printing a message, as before. Unlike the previous
/// version, the conversion stops after a read or write failure instead of
/// retrying the failing operation forever; interrupted reads are retried.
fn convert_via_utf8(
    decoder: &mut Decoder,
    encoder: &mut Encoder,
    read: &mut dyn Read,
    write: &mut dyn Write,
    last: bool,
) {
    let mut input_buffer = [0u8; 2048];
    let mut intermediate_buffer_bytes = [0u8; 4096];
    // A zero-filled byte array is valid UTF-8 (all NUL), so this cannot fail.
    let intermediate_buffer: &mut str =
        std::str::from_utf8_mut(&mut intermediate_buffer_bytes[..])
            .expect("error with from_utf8_mut");
    let mut output_buffer = [0u8; 4096];

    loop {
        let decoder_input_end = match read.read(&mut input_buffer) {
            Ok(count) => count,
            // An interrupted read is transient; simply try again.
            Err(ref err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(_) => {
                print!("Error reading input.");
                // A persistent read error (for example, the path is a
                // directory) would otherwise repeat on every iteration.
                return;
            }
        };

        let current_input_ended = decoder_input_end == 0;
        let input_ended = last && current_input_ended;
        let mut decoder_input_start = 0usize;

        loop {
            let (decoder_result, decoder_read, decoder_written, _) = decoder.decode_to_str(
                &input_buffer[decoder_input_start..decoder_input_end],
                &mut *intermediate_buffer,
                input_ended,
            );
            decoder_input_start += decoder_read;

            // The encoder may only be finalised once the decoder has consumed
            // the very last input and produced all of its output.
            let last_output = input_ended
                && match decoder_result {
                    CoderResult::InputEmpty => true,
                    CoderResult::OutputFull => false,
                };

            // Regardless of whether the intermediate buffer got full
            // or the input buffer was exhausted, process what is in the
            // intermediate buffer.
            if encoder.encoding() == UTF_8 {
                // If the target is UTF-8, optimize out the encoder.
                if write
                    .write_all(&intermediate_buffer.as_bytes()[..decoder_written])
                    .is_err()
                {
                    print!("Error writing output.");
                    return;
                }
            } else {
                let mut encoder_input_start = 0usize;
                loop {
                    let (encoder_result, encoder_read, encoder_written, _) = encoder
                        .encode_from_utf8(
                            &intermediate_buffer[encoder_input_start..decoder_written],
                            &mut output_buffer,
                            last_output,
                        );
                    encoder_input_start += encoder_read;
                    if write.write_all(&output_buffer[..encoder_written]).is_err() {
                        print!("Error writing output.");
                        return;
                    }
                    match encoder_result {
                        CoderResult::InputEmpty => break,
                        CoderResult::OutputFull => continue,
                    }
                }
            }

            // Either read more input or keep draining the current chunk.
            match decoder_result {
                CoderResult::InputEmpty => break,
                CoderResult::OutputFull => continue,
            }
        }

        if current_input_ended {
            break;
        }
    }
}
fn read_le_u16(input: &[u8]) -> Option<Vec<u16>> {

View file

@ -80,7 +80,7 @@ fn errors_if_file_not_found() {
"enter i_dont_exist.csv"
);
assert!(actual.err.contains("File could not be opened"));
//assert!(actual.err.contains("File could not be opened"));
assert!(actual.err.contains("file not found"));
})
}

View file

@ -225,6 +225,6 @@ fn errors_if_file_not_found() {
"open i_dont_exist.txt"
);
assert!(actual.err.contains("File could not be opened"));
assert!(actual.err.contains("file not found"));
//assert!(actual.err.contains("File could not be opened"));
assert!(actual.err.contains("Cannot open"));
}