Convert open/fetch to stream (#2028)

* Types lined up for open with stream

* Chunking stream

* Maybe I didn't need most of the Stream stuff after all?

* Some clean-up

* Merge weird cargo.lock

* Start moving some encoding logic to MaybeTextCodec

Will we lose the nice table formatting if we Stream? How do we get it back? Collect the Stream at the end? (See the sketch after this list.)

* Clean-up and small refinements

* Put in auto-convert workaround

* Workaround to make sure bat functionality works

* Handle some easy error cases

* All tests pass

* Remove guessing logic

* Address clippy comments

* Pull latest master and fix MaybeTextCodec usage

* Add tag to enable autoview
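
On the table-formatting question raised in the notes above: one way to get the table back is to collect the stream into a Vec before rendering, so the formatter sees every row before drawing. A minimal sketch using plain futures combinators (an illustration only, not the autoview code from this commit):

use futures::stream::{self, StreamExt};

// Collect a stream of rows into a Vec so a table renderer can size its
// columns against the complete data set before drawing anything.
async fn collect_rows() -> Vec<String> {
    let rows = stream::iter(vec!["row 1".to_string(), "row 2".to_string()]);
    rows.collect::<Vec<String>>().await
}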
Arash Outadi 2020-07-03 12:53:20 -07:00 committed by GitHub
parent 8775991c2d
commit e31e8d1550
10 changed files with 609 additions and 407 deletions


@@ -1,5 +1,5 @@
use crate::commands::classified::block::run_block;
use crate::commands::classified::external::{MaybeTextCodec, StringOrBinary};
use crate::commands::classified::maybe_text_codec::{MaybeTextCodec, StringOrBinary};
use crate::commands::plugin::JsonRpc;
use crate::commands::plugin::{PluginCommand, PluginSink};
use crate::commands::whole_stream_command;
@@ -953,7 +953,7 @@ pub async fn process_line(
let input_stream = if redirect_stdin {
let file = futures::io::AllowStdIo::new(std::io::stdin());
let stream = FramedRead::new(file, MaybeTextCodec).map(|line| {
let stream = FramedRead::new(file, MaybeTextCodec::default()).map(|line| {
if let Ok(line) = line {
match line {
StringOrBinary::String(s) => Ok(Value {


@@ -20,6 +20,7 @@ pub(crate) mod clip;
pub(crate) mod command;
pub(crate) mod compact;
pub(crate) mod config;
pub(crate) mod constants;
pub(crate) mod count;
pub(crate) mod cp;
pub(crate) mod date;


@@ -1,3 +1,4 @@
use crate::commands::classified::maybe_text_codec::{MaybeTextCodec, StringOrBinary};
use crate::evaluate::evaluate_baseline_expr;
use crate::futures::ThreadedReceiver;
use crate::prelude::*;
@@ -7,9 +8,7 @@ use std::ops::Deref;
use std::process::{Command, Stdio};
use std::sync::mpsc;
use bytes::{BufMut, Bytes, BytesMut};
use futures::executor::block_on_stream;
// use futures::stream::StreamExt;
use futures_codec::FramedRead;
use log::trace;
@@ -18,70 +17,6 @@ use nu_protocol::hir::ExternalCommand;
use nu_protocol::{Primitive, Scope, ShellTypeName, UntaggedValue, Value};
use nu_source::Tag;
pub enum StringOrBinary {
String(String),
Binary(Vec<u8>),
}
pub struct MaybeTextCodec;
impl futures_codec::Encoder for MaybeTextCodec {
type Item = StringOrBinary;
type Error = std::io::Error;
fn encode(&mut self, item: Self::Item, dst: &mut BytesMut) -> Result<(), Self::Error> {
match item {
StringOrBinary::String(s) => {
dst.reserve(s.len());
dst.put(s.as_bytes());
Ok(())
}
StringOrBinary::Binary(b) => {
dst.reserve(b.len());
dst.put(Bytes::from(b));
Ok(())
}
}
}
}
impl futures_codec::Decoder for MaybeTextCodec {
type Item = StringOrBinary;
type Error = std::io::Error;
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
let v: Vec<u8> = src.to_vec();
match String::from_utf8(v) {
Ok(s) => {
src.clear();
if s.is_empty() {
Ok(None)
} else {
Ok(Some(StringOrBinary::String(s)))
}
}
Err(err) => {
// Note: the longest UTF-8 character per Unicode spec is currently 6 bytes. If we fail somewhere earlier than the last 6 bytes,
// we know that we're failing to understand the string encoding and not just seeing a partial character. When this happens, let's
// fall back to assuming it's a binary buffer.
if src.is_empty() {
Ok(None)
} else if src.len() > 6 && (src.len() - err.utf8_error().valid_up_to() > 6) {
// Fall back to assuming binary
let buf = src.to_vec();
src.clear();
Ok(Some(StringOrBinary::Binary(buf)))
} else {
// Looks like a utf-8 string, so let's assume that
let buf = src.split_to(err.utf8_error().valid_up_to() + 1);
String::from_utf8(buf.to_vec())
.map(|x| Some(StringOrBinary::String(x)))
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
}
}
}
}
}
pub(crate) async fn run_external_command(
command: ExternalCommand,
context: &mut Context,
@@ -319,7 +254,7 @@ fn spawn(
};
let file = futures::io::AllowStdIo::new(stdout);
let stream = FramedRead::new(file, MaybeTextCodec);
let stream = FramedRead::new(file, MaybeTextCodec::default());
for line in block_on_stream(stream) {
match line {
@@ -373,7 +308,7 @@ fn spawn(
}
let file = futures::io::AllowStdIo::new(stderr);
let err_stream = FramedRead::new(file, MaybeTextCodec);
let err_stream = FramedRead::new(file, MaybeTextCodec::default());
for err_line in block_on_stream(err_stream) {
match err_line {


@@ -0,0 +1,103 @@
use bytes::{BufMut, Bytes, BytesMut};
use nu_errors::ShellError;
extern crate encoding_rs;
use encoding_rs::{CoderResult, Decoder, Encoding, UTF_8};
const OUTPUT_BUFFER_SIZE: usize = 8192;
pub enum StringOrBinary {
String(String),
Binary(Vec<u8>),
}
pub struct MaybeTextCodec {
decoder: Decoder,
}
impl MaybeTextCodec {
// The constructor takes an Option<&'static Encoding>, because an absence of an encoding indicates that we want BOM sniffing enabled
pub fn new(encoding: Option<&'static Encoding>) -> Self {
let decoder = match encoding {
Some(e) => e.new_decoder_with_bom_removal(),
None => UTF_8.new_decoder(),
};
MaybeTextCodec { decoder }
}
}
impl Default for MaybeTextCodec {
// The default MaybeTextCodec uses a UTF_8 decoder
fn default() -> Self {
MaybeTextCodec {
decoder: UTF_8.new_decoder(),
}
}
}
impl futures_codec::Encoder for MaybeTextCodec {
type Item = StringOrBinary;
type Error = std::io::Error;
fn encode(&mut self, item: Self::Item, dst: &mut BytesMut) -> Result<(), Self::Error> {
match item {
StringOrBinary::String(s) => {
dst.reserve(s.len());
dst.put(s.as_bytes());
Ok(())
}
StringOrBinary::Binary(b) => {
dst.reserve(b.len());
dst.put(Bytes::from(b));
Ok(())
}
}
}
}
// TODO: Write some tests
impl futures_codec::Decoder for MaybeTextCodec {
type Item = StringOrBinary;
type Error = ShellError;
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
if src.is_empty() {
return Ok(None);
}
let mut s = String::with_capacity(OUTPUT_BUFFER_SIZE);
let (res, read, replacements) = self.decoder.decode_to_string(src, &mut s, false);
// If we had to make replacements when converting to UTF-8, fall back to binary
if replacements {
return Ok(Some(StringOrBinary::Binary(src.to_vec())));
}
match res {
CoderResult::InputEmpty => {
src.clear();
Ok(Some(StringOrBinary::String(s)))
}
CoderResult::OutputFull => {
// If the original buffer size is too small,
// we continue to allocate new Strings and append them to the result until the input buffer is smaller than the allocated String
let mut starting_index = read;
loop {
let mut more = String::with_capacity(OUTPUT_BUFFER_SIZE);
let (res, read, _replacements) =
self.decoder
.decode_to_string(&src[starting_index..], &mut more, false);
s.push_str(&more);
// Our input buffer is smaller than our allocated String, so we can stop now
if let CoderResult::InputEmpty = res {
break;
}
starting_index += read;
}
src.clear();
Ok(Some(StringOrBinary::String(s)))
}
}
}
}
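
The TODO above asks for tests; a minimal sketch of what they could look like, driving the futures_codec::Decoder impl directly (the test names and inputs are illustrative assumptions, not part of this commit):

#[cfg(test)]
mod tests {
    use super::{MaybeTextCodec, StringOrBinary};
    use bytes::BytesMut;
    use futures_codec::Decoder;

    #[test]
    fn utf8_bytes_decode_to_string() {
        let mut codec = MaybeTextCodec::default();
        let mut src = BytesMut::from(&b"hello nu"[..]);
        match codec.decode(&mut src) {
            Ok(Some(StringOrBinary::String(s))) => assert_eq!(s, "hello nu"),
            _ => panic!("expected valid UTF-8 input to decode to StringOrBinary::String"),
        }
    }

    #[test]
    fn empty_input_decodes_to_none() {
        let mut codec = MaybeTextCodec::default();
        let mut src = BytesMut::new();
        match codec.decode(&mut src) {
            Ok(None) => {}
            _ => panic!("expected empty input to decode to Ok(None)"),
        }
    }
}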


@@ -3,6 +3,7 @@ mod dynamic;
pub(crate) mod expr;
pub(crate) mod external;
pub(crate) mod internal;
pub(crate) mod maybe_text_codec;
#[allow(unused_imports)]
pub(crate) use dynamic::Command as DynamicCommand;


@@ -0,0 +1,358 @@
pub const BAT_LANGUAGES: &[&str] = &[
"as",
"csv",
"tsv",
"applescript",
"script editor",
"s",
"S",
"adoc",
"asciidoc",
"asc",
"asa",
"yasm",
"nasm",
"asm",
"inc",
"mac",
"awk",
"bat",
"cmd",
"bib",
"sh",
"bash",
"zsh",
".bash_aliases",
".bash_completions",
".bash_functions",
".bash_login",
".bash_logout",
".bash_profile",
".bash_variables",
".bashrc",
".profile",
".textmate_init",
".zshrc",
"PKGBUILD",
".ebuild",
".eclass",
"c",
"h",
"cs",
"csx",
"cpp",
"cc",
"cp",
"cxx",
"c++",
"C",
"h",
"hh",
"hpp",
"hxx",
"h++",
"inl",
"ipp",
"cabal",
"clj",
"cljc",
"cljs",
"edn",
"CMakeLists.txt",
"cmake",
"h.in",
"hh.in",
"hpp.in",
"hxx.in",
"h++.in",
"CMakeCache.txt",
"cr",
"css",
"css.erb",
"css.liquid",
"d",
"di",
"dart",
"diff",
"patch",
"Dockerfile",
"dockerfile",
"ex",
"exs",
"elm",
"erl",
"hrl",
"Emakefile",
"emakefile",
"fs",
"fsi",
"fsx",
"fs",
"fsi",
"fsx",
"fish",
"attributes",
"gitattributes",
".gitattributes",
"COMMIT_EDITMSG",
"MERGE_MSG",
"TAG_EDITMSG",
"gitconfig",
".gitconfig",
".gitmodules",
"exclude",
"gitignore",
".gitignore",
".git",
"gitlog",
"git-rebase-todo",
"go",
"dot",
"DOT",
"gv",
"groovy",
"gvy",
"gradle",
"Jenkinsfile",
"hs",
"hs",
"hsc",
"show-nonprintable",
"html",
"htm",
"shtml",
"xhtml",
"asp",
"html.eex",
"yaws",
"rails",
"rhtml",
"erb",
"html.erb",
"adp",
"twig",
"html.twig",
"ini",
"INI",
"INF",
"reg",
"REG",
"lng",
"cfg",
"CFG",
"desktop",
"url",
"URL",
".editorconfig",
".hgrc",
"hgrc",
"java",
"bsh",
"properties",
"jsp",
"js",
"htc",
"js",
"jsx",
"babel",
"es6",
"js.erb",
"json",
"sublime-settings",
"sublime-menu",
"sublime-keymap",
"sublime-mousemap",
"sublime-theme",
"sublime-build",
"sublime-project",
"sublime-completions",
"sublime-commands",
"sublime-macro",
"sublime-color-scheme",
"ipynb",
"Pipfile.lock",
"jsonnet",
"libsonnet",
"libjsonnet",
"jl",
"kt",
"kts",
"tex",
"ltx",
"less",
"css.less",
"lisp",
"cl",
"clisp",
"l",
"mud",
"el",
"scm",
"ss",
"lsp",
"fasl",
"lhs",
"lua",
"make",
"GNUmakefile",
"makefile",
"Makefile",
"makefile.am",
"Makefile.am",
"makefile.in",
"Makefile.in",
"OCamlMakefile",
"mak",
"mk",
"md",
"mdown",
"markdown",
"markdn",
"matlab",
"build",
"nix",
"m",
"h",
"mm",
"M",
"h",
"ml",
"mli",
"mll",
"mly",
"pas",
"p",
"dpr",
"pl",
"pm",
"pod",
"t",
"PL",
"php",
"php3",
"php4",
"php5",
"php7",
"phps",
"phpt",
"phtml",
"txt",
"ps1",
"psm1",
"psd1",
"proto",
"protodevel",
"pb.txt",
"proto.text",
"textpb",
"pbtxt",
"prototxt",
"pp",
"epp",
"purs",
"py",
"py3",
"pyw",
"pyi",
"pyx",
"pyx.in",
"pxd",
"pxd.in",
"pxi",
"pxi.in",
"rpy",
"cpy",
"SConstruct",
"Sconstruct",
"sconstruct",
"SConscript",
"gyp",
"gypi",
"Snakefile",
"wscript",
"R",
"r",
"s",
"S",
"Rprofile",
"rd",
"re",
"rst",
"rest",
"robot",
"rb",
"Appfile",
"Appraisals",
"Berksfile",
"Brewfile",
"capfile",
"cgi",
"Cheffile",
"config.ru",
"Deliverfile",
"Fastfile",
"fcgi",
"Gemfile",
"gemspec",
"Guardfile",
"irbrc",
"jbuilder",
"Podfile",
"podspec",
"prawn",
"rabl",
"rake",
"Rakefile",
"Rantfile",
"rbx",
"rjs",
"ruby.rail",
"Scanfile",
"simplecov",
"Snapfile",
"thor",
"Thorfile",
"Vagrantfile",
"haml",
"sass",
"rxml",
"builder",
"rs",
"scala",
"sbt",
"sql",
"ddl",
"dml",
"erbsql",
"sql.erb",
"swift",
"log",
"tcl",
"tf",
"tfvars",
"hcl",
"sty",
"cls",
"textile",
"toml",
"tml",
"Cargo.lock",
"Gopkg.lock",
"Pipfile",
"ts",
"tsx",
"varlink",
"vim",
".vimrc",
"xml",
"xsd",
"xslt",
"tld",
"dtml",
"rss",
"opml",
"svg",
"yaml",
"yml",
"sublime-syntax",
];


@@ -121,21 +121,16 @@ async fn enter(
let full_path = std::path::PathBuf::from(cwd);
let (file_extension, contents, contents_tag) = crate::commands::open::fetch(
let (file_extension, tagged_contents) = crate::commands::open::fetch(
&full_path,
&PathBuf::from(location_clone),
tag.span,
match encoding {
Some(e) => e.to_string(),
_ => "".to_string(),
},
encoding,
)
.await?;
match contents {
match tagged_contents.value {
UntaggedValue::Primitive(Primitive::String(_)) => {
let tagged_contents = contents.into_value(&contents_tag);
if let Some(extension) = file_extension {
let command_name = format!("from {}", extension);
if let Some(converter) = registry.get_command(&command_name) {
@@ -156,18 +151,18 @@ async fn enter(
scope: scope.clone(),
},
};
let tag = tagged_contents.tag.clone();
let mut result = converter
.run(new_args.with_input(vec![tagged_contents]), &registry)
.await?;
let result_vec: Vec<Result<ReturnSuccess, ShellError>> =
result.drain_vec().await;
Ok(futures::stream::iter(result_vec.into_iter().map(
move |res| match res {
Ok(ReturnSuccess::Value(Value { value, .. })) => Ok(
ReturnSuccess::Action(CommandAction::EnterValueShell(Value {
value,
tag: contents_tag.clone(),
tag: tag.clone(),
})),
),
x => x,
@@ -185,13 +180,9 @@ async fn enter(
)))
}
}
_ => {
let tagged_contents = contents.into_value(contents_tag);
Ok(OutputStream::one(ReturnSuccess::action(
CommandAction::EnterValueShell(tagged_contents),
)))
}
_ => Ok(OutputStream::one(ReturnSuccess::action(
CommandAction::EnterValueShell(tagged_contents),
))),
}
}
}


@@ -1,15 +1,17 @@
use crate::commands::classified::maybe_text_codec::{MaybeTextCodec, StringOrBinary};
use crate::commands::WholeStreamCommand;
use crate::prelude::*;
use futures_codec::FramedRead;
use nu_errors::ShellError;
use nu_protocol::{CommandAction, ReturnSuccess, Signature, SyntaxShape, UntaggedValue};
use nu_protocol::{CommandAction, ReturnSuccess, Signature, SyntaxShape, UntaggedValue, Value};
use nu_source::{AnchorLocation, Span, Tagged};
use std::path::{Path, PathBuf};
use std::path::PathBuf;
extern crate encoding_rs;
use crate::commands::constants::BAT_LANGUAGES;
use encoding_rs::*;
use futures::prelude::*;
use log::debug;
use std::fs::File;
use std::io::BufWriter;
use std::io::Read;
use std::io::Write;
pub struct Open;
@@ -81,23 +83,25 @@ documentation link at https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics"#
}
}
pub fn get_encoding(opt: Option<String>) -> &'static Encoding {
pub fn get_encoding(opt: Option<Tagged<String>>) -> Result<&'static Encoding, ShellError> {
match opt {
None => UTF_8,
Some(label) => match Encoding::for_label((&label).as_bytes()) {
None => {
//print!("{} is not a known encoding label. Trying UTF-8.", label);
//std::process::exit(-2);
get_encoding(Some("utf-8".to_string()))
}
Some(encoding) => encoding,
None => Ok(UTF_8),
Some(label) => match Encoding::for_label((&label.item).as_bytes()) {
None => Err(ShellError::labeled_error(
format!(
r#"{} is not a valid encoding, refer to https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics for a valid list of encodings"#,
label.item
),
"invalid encoding",
label.span(),
)),
Some(encoding) => Ok(encoding),
},
}
}
async fn open(args: CommandArgs, registry: &CommandRegistry) -> Result<OutputStream, ShellError> {
let cwd = PathBuf::from(args.shell_manager.path());
let full_path = cwd;
let registry = registry.clone();
let (
@@ -108,329 +112,135 @@ async fn open(args: CommandArgs, registry: &CommandRegistry) -> Result<OutputStr
},
_,
) = args.process(&registry).await?;
let enc = match encoding {
Some(e) => e.to_string(),
_ => "".to_string(),
};
let result = fetch(&full_path, &path.item, path.tag.span, enc).await;
let (file_extension, contents, contents_tag) = result?;
// TODO: Remove once Streams are supported everywhere!
// As a short-term workaround for getting AutoConvert and bat functionality (those don't currently support Streams)
let file_extension = if raw.item {
// Check if the extension has a "from *" command OR "bat" supports syntax highlighting
// AND the user doesn't want the raw output
// In these cases, we will collect the Stream
let ext = if raw.item {
None
} else {
// If the extension could not be determined via mimetype, try to use the path
// extension. Some file types do not declare their mimetypes (such as bson files).
file_extension.or_else(|| path.extension().map(|x| x.to_string_lossy().to_string()))
path.extension()
.map(|name| name.to_string_lossy().to_string())
};
let tagged_contents = contents.into_value(&contents_tag);
if let Some(extension) = file_extension {
Ok(OutputStream::one(ReturnSuccess::action(
CommandAction::AutoConvert(tagged_contents, extension),
)))
} else {
Ok(OutputStream::one(ReturnSuccess::value(tagged_contents)))
if let Some(ext) = ext {
// Check if we have a conversion command
if let Some(_command) = registry.get_command(&format!("from {}", ext)) {
let (_, tagged_contents) = crate::commands::open::fetch(
&cwd,
&PathBuf::from(&path.item),
path.tag.span,
encoding,
)
.await?;
return Ok(OutputStream::one(ReturnSuccess::action(
CommandAction::AutoConvert(tagged_contents, ext),
)));
}
// Check if bat does syntax highlighting
if BAT_LANGUAGES.contains(&ext.as_ref()) {
let (_, tagged_contents) = crate::commands::open::fetch(
&cwd,
&PathBuf::from(&path.item),
path.tag.span,
encoding,
)
.await?;
return Ok(OutputStream::one(ReturnSuccess::value(tagged_contents)));
}
}
// Normal Streaming operation
let with_encoding = if encoding.is_none() {
None
} else {
Some(get_encoding(encoding)?)
};
let f = File::open(&path).map_err(|e| {
ShellError::labeled_error(
format!("Error opening file: {:?}", e),
"Error opening file",
path.span(),
)
})?;
let async_reader = futures::io::AllowStdIo::new(f);
let sob_stream = FramedRead::new(async_reader, MaybeTextCodec::new(with_encoding))
.map_err(|e| ShellError::unexpected(format!("AsyncRead failed in open function: {:?}", e)))
.into_stream();
let final_stream = sob_stream.map(|x| match x {
Ok(StringOrBinary::String(s)) => {
ReturnSuccess::value(UntaggedValue::string(s).into_untagged_value())
}
Ok(StringOrBinary::Binary(b)) => ReturnSuccess::value(
UntaggedValue::binary(b.into_iter().collect()).into_untagged_value(),
),
Err(se) => Err(se),
});
Ok(OutputStream::new(final_stream))
}
// Note that we do not output a Stream in "fetch", since it is only used by the "enter" command,
// which we expect to use a concrete Value and not a Stream
pub async fn fetch(
cwd: &PathBuf,
location: &PathBuf,
span: Span,
encoding: String,
) -> Result<(Option<String>, UntaggedValue, Tag), ShellError> {
encoding_choice: Option<Tagged<String>>,
) -> Result<(Option<String>, Value), ShellError> {
// TODO: I don't understand the point of this? Maybe for better error reporting
let mut cwd = cwd.clone();
let output_encoding: &Encoding = get_encoding(Some("utf-8".to_string()));
let input_encoding: &Encoding = get_encoding(Some(encoding.clone()));
let mut decoder = input_encoding.new_decoder();
let mut encoder = output_encoding.new_encoder();
let mut _file: File;
let buf = Vec::new();
let mut bufwriter = BufWriter::new(buf);
cwd.push(Path::new(location));
if let Ok(cwd) = dunce::canonicalize(&cwd) {
if !encoding.is_empty() {
// use the encoding string
match File::open(&Path::new(&cwd)) {
Ok(mut _file) => {
convert_via_utf8(
&mut decoder,
&mut encoder,
&mut _file,
&mut bufwriter,
false,
);
//bufwriter.flush()?;
Ok((
cwd.extension()
.map(|name| name.to_string_lossy().to_string()),
UntaggedValue::string(String::from_utf8_lossy(&bufwriter.buffer())),
Tag {
span,
anchor: Some(AnchorLocation::File(cwd.to_string_lossy().to_string())),
},
))
}
Err(_) => Err(ShellError::labeled_error(
format!("Cannot open {:?} for reading.", &cwd),
"file not found",
span,
)),
}
} else {
// Do the old stuff
match std::fs::read(&cwd) {
Ok(bytes) => match std::str::from_utf8(&bytes) {
Ok(s) => Ok((
cwd.extension()
.map(|name| name.to_string_lossy().to_string()),
UntaggedValue::string(s),
Tag {
span,
anchor: Some(AnchorLocation::File(cwd.to_string_lossy().to_string())),
},
)),
Err(_) => {
//Non utf8 data.
match (bytes.get(0), bytes.get(1)) {
(Some(x), Some(y)) if *x == 0xff && *y == 0xfe => {
// Possibly UTF-16 little endian
let utf16 = read_le_u16(&bytes[2..]);
if let Some(utf16) = utf16 {
match std::string::String::from_utf16(&utf16) {
Ok(s) => Ok((
cwd.extension()
.map(|name| name.to_string_lossy().to_string()),
UntaggedValue::string(s),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
Err(_) => Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
}
} else {
Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
))
}
}
(Some(x), Some(y)) if *x == 0xfe && *y == 0xff => {
// Possibly UTF-16 big endian
let utf16 = read_be_u16(&bytes[2..]);
if let Some(utf16) = utf16 {
match std::string::String::from_utf16(&utf16) {
Ok(s) => Ok((
cwd.extension()
.map(|name| name.to_string_lossy().to_string()),
UntaggedValue::string(s),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
Err(_) => Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
}
} else {
Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
))
}
}
_ => Ok((
None,
UntaggedValue::binary(bytes),
Tag {
span,
anchor: Some(AnchorLocation::File(
cwd.to_string_lossy().to_string(),
)),
},
)),
}
}
},
Err(_) => Err(ShellError::labeled_error(
format!("Cannot open {:?} for reading.", &cwd),
"file not found",
span,
)),
}
}
} else {
Err(ShellError::labeled_error(
format!("Cannot open {:?} for reading.", &cwd),
"file not found",
cwd.push(location);
let nice_location = dunce::canonicalize(&cwd).map_err(|e| {
ShellError::labeled_error(
format!("Cannot canonicalize file {:?} because {:?}", &cwd, e),
"Cannot canonicalize",
span,
))
}
}
)
})?;
fn convert_via_utf8(
decoder: &mut Decoder,
encoder: &mut Encoder,
read: &mut dyn Read,
write: &mut dyn Write,
last: bool,
) {
let mut input_buffer = [0u8; 2048];
let mut intermediate_buffer_bytes = [0u8; 4096];
// Is there a safe way to create a stack-allocated &mut str?
let mut intermediate_buffer: &mut str =
//unsafe { std::mem::transmute(&mut intermediate_buffer_bytes[..]) };
std::str::from_utf8_mut(&mut intermediate_buffer_bytes[..]).expect("error with from_utf8_mut");
let mut output_buffer = [0u8; 4096];
let mut current_input_ended = false;
while !current_input_ended {
match read.read(&mut input_buffer) {
Err(_) => {
print!("Error reading input.");
//std::process::exit(-5);
}
Ok(decoder_input_end) => {
current_input_ended = decoder_input_end == 0;
let input_ended = last && current_input_ended;
let mut decoder_input_start = 0usize;
loop {
let (decoder_result, decoder_read, decoder_written, _) = decoder.decode_to_str(
&input_buffer[decoder_input_start..decoder_input_end],
&mut intermediate_buffer,
input_ended,
);
decoder_input_start += decoder_read;
// The extension may be used in AutoConvert later on
let ext = location
.extension()
.map(|name| name.to_string_lossy().to_string());
let last_output = if input_ended {
match decoder_result {
CoderResult::InputEmpty => true,
CoderResult::OutputFull => false,
}
} else {
false
};
// The tag that will be used when returning a Value
let file_tag = Tag {
span,
anchor: Some(AnchorLocation::File(
nice_location.to_string_lossy().to_string(),
)),
};
// Regardless of whether the intermediate buffer got full
// or the input buffer was exhausted, let's process what's
// in the intermediate buffer.
let res = std::fs::read(location)?;
if encoder.encoding() == UTF_8 {
// If the target is UTF-8, optimize out the encoder.
if write
.write_all(&intermediate_buffer.as_bytes()[..decoder_written])
.is_err()
{
print!("Error writing output.");
//std::process::exit(-7);
}
} else {
let mut encoder_input_start = 0usize;
loop {
let (encoder_result, encoder_read, encoder_written, _) = encoder
.encode_from_utf8(
&intermediate_buffer[encoder_input_start..decoder_written],
&mut output_buffer,
last_output,
);
encoder_input_start += encoder_read;
if write.write_all(&output_buffer[..encoder_written]).is_err() {
print!("Error writing output.");
//std::process::exit(-6);
}
match encoder_result {
CoderResult::InputEmpty => {
break;
}
CoderResult::OutputFull => {
continue;
}
}
}
}
// Now let's see if we should read again or process the
// rest of the current input buffer.
match decoder_result {
CoderResult::InputEmpty => {
break;
}
CoderResult::OutputFull => {
continue;
}
}
}
}
}
}
}
fn read_le_u16(input: &[u8]) -> Option<Vec<u16>> {
if input.len() % 2 != 0 || input.len() < 2 {
None
// If no encoding is provided we try to guess the encoding to read the file with
let encoding = if encoding_choice.is_none() {
UTF_8
} else {
let mut result = vec![];
let mut pos = 0;
while pos < input.len() {
result.push(u16::from_le_bytes([input[pos], input[pos + 1]]));
pos += 2;
}
get_encoding(encoding_choice.clone())?
};
Some(result)
}
}
fn read_be_u16(input: &[u8]) -> Option<Vec<u16>> {
if input.len() % 2 != 0 || input.len() < 2 {
None
// If the user specified an encoding, then do not do BOM sniffing
let decoded_res = if encoding_choice.is_some() {
let (cow_res, _replacements) = encoding.decode_with_bom_removal(&res);
cow_res
} else {
let mut result = vec![];
let mut pos = 0;
while pos < input.len() {
result.push(u16::from_be_bytes([input[pos], input[pos + 1]]));
pos += 2;
// Otherwise, use the default UTF-8 decoder with BOM sniffing
let (cow_res, actual_encoding, replacements) = encoding.decode(&res);
// If we had to use replacement characters, then fall back to binary
if replacements {
return Ok((ext, UntaggedValue::binary(res).into_value(file_tag)));
}
Some(result)
}
debug!("Decoded using {:?}", actual_encoding);
cow_res
};
let v = UntaggedValue::string(decoded_res.to_string()).into_value(file_tag);
Ok((ext, v))
}
#[cfg(test)]

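For reference, the two decode paths in fetch above differ only in BOM handling. A standalone sketch of the encoding_rs calls involved (the byte literal and the demo function are made-up examples, not code from this commit):

use encoding_rs::UTF_8;

fn decode_demo() {
    // A UTF-8 BOM followed by "hi".
    let bytes = [0xEF, 0xBB, 0xBF, b'h', b'i'];

    // Default path (no encoding argument given): decode() sniffs the BOM and
    // reports back which encoding was actually used.
    let (text, actual_encoding, had_replacements) = UTF_8.decode(&bytes);
    assert_eq!(text, "hi");
    assert_eq!(actual_encoding, UTF_8);
    assert!(!had_replacements);

    // User-specified path: decode with the chosen encoding, only stripping
    // a BOM that matches it, with no sniffing.
    let (text, had_replacements) = UTF_8.decode_with_bom_removal(&bytes);
    assert_eq!(text, "hi");
    assert!(!had_replacements);
}
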

@@ -80,7 +80,6 @@ fn errors_if_file_not_found() {
"enter i_dont_exist.csv"
);
//assert!(actual.err.contains("File could not be opened"));
assert!(actual.err.contains("file not found"));
assert!(actual.err.contains("Cannot canonicalize"));
})
}


@@ -224,7 +224,11 @@ fn errors_if_file_not_found() {
cwd: "tests/fixtures/formats",
"open i_dont_exist.txt"
);
//assert!(actual.err.contains("File could not be opened"));
assert!(actual.err.contains("Cannot open"));
let expected = "Cannot canonicalize";
assert!(
actual.err.contains(expected),
"Error:\n{}\ndoes not contain{}",
actual.err,
expected
);
}