From 3645a0f0e452998400a9d4c92cb7182d7caecfa4 Mon Sep 17 00:00:00 2001 From: Fernando Herrera Date: Wed, 14 Jul 2021 21:33:21 +0100 Subject: [PATCH] Updated polars version for faster CSV reader (#3781) --- Cargo.lock | 39 +++++++++++++------ crates/nu-command/Cargo.toml | 2 +- .../nu-command/src/commands/dataframe/open.rs | 16 ++++++-- crates/nu-data/Cargo.toml | 2 +- crates/nu-protocol/Cargo.toml | 2 +- 5 files changed, 44 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index caf850ff10..7b3739125a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1284,6 +1284,15 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "dirs" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30baa043103c9d0c2a57cf537cc2f35623889dc0d405e6c3cccfadbc81c71309" +dependencies = [ + "dirs-sys", +] + [[package]] name = "dirs-next" version = "2.0.0" @@ -4342,9 +4351,9 @@ dependencies = [ [[package]] name = "polars" -version = "0.14.5" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f254b79757346a86a8371ea4a087ce6a56e604c82d61093a1b85bfd0df99aeb" +checksum = "5c1bf71de63afed8a9262b61a2a7c3a463bf3b2b8dc28a91873621a86ddeb996" dependencies = [ "polars-core", "polars-io", @@ -4353,9 +4362,9 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.14.5" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec1ef88e60b660c51644a5b098570519948d95f389b67ef690a0f1187395d7bf" +checksum = "d1d436b455a6ac76f09b7b8127d2c4b21e80e8b7579218edee91ce138281d178" dependencies = [ "arrow", "num 0.4.0", @@ -4364,9 +4373,9 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.14.5" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5e6ee23eb50845501c8c31368051af75801185cf4bedf9e7b3ec945a49af9c" +checksum = "f3e1a74ab0ddbb0cca4f9a79691cf419ec7af63fc1f72206fac6ab45508d6d85" dependencies = [ "ahash", "anyhow", @@ -4391,15 +4400,16 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.14.5" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94e8719cdb70555e0492dd24e8f09f637cc112bac438be754bad8dca75f466ab" +checksum = "54f37bf032736512cc938000e937175b3bbfc27ea72abd07fc03d25145d709a0" dependencies = [ "ahash", "anyhow", "arrow", "csv", "csv-core", + "dirs 3.0.2", "fast-float", "lazy_static 1.4.0", "lexical", @@ -4411,13 +4421,14 @@ dependencies = [ "polars-core", "rayon", "regex", + "simdutf8", ] [[package]] name = "polars-lazy" -version = "0.14.5" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ca6b2fb59bbe6725a84c48df12f509b4655d173cd113e5fb51f971cff1f93bc" +checksum = "4c7f767d152af32f2880c02d0e4ead23d5591e066bf92af462ef9d1e4149e7af" dependencies = [ "ahash", "itertools", @@ -5499,6 +5510,12 @@ dependencies = [ "libc", ] +[[package]] +name = "simdutf8" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c970da16e7c682fa90a261cf0724dee241c9f7831635ecc4e988ae8f3b505559" + [[package]] name = "siphasher" version = "0.3.5" @@ -5877,7 +5894,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edd106a334b7657c10b7c540a0106114feadeb4dc314513e97df481d5d966f42" dependencies = [ "byteorder", - "dirs", + "dirs 1.0.5", "winapi 0.3.9", ] diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index 7afe37f2d3..b91f7b4df1 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -98,7 +98,7 @@ which = { version="4.1.0", optional=true } zip = { version="0.5.9", optional=true } [dependencies.polars] -version = "0.14.5" +version = "0.14.7" optional = true features = ["parquet", "json", "random", "pivot", "strings", "is_in"] diff --git a/crates/nu-command/src/commands/dataframe/open.rs b/crates/nu-command/src/commands/dataframe/open.rs index bc644aa31e..82d1a1348c 100644 --- a/crates/nu-command/src/commands/dataframe/open.rs +++ b/crates/nu-command/src/commands/dataframe/open.rs @@ -8,7 +8,7 @@ use nu_protocol::{ }; use nu_source::Tagged; -use polars::prelude::{CsvReader, JsonReader, ParquetReader, SerReader}; +use polars::prelude::{CsvEncoding, CsvReader, JsonReader, ParquetReader, PolarsError, SerReader}; use std::fs::File; pub struct DataFrame; @@ -151,7 +151,8 @@ fn from_csv(args: CommandArgs) -> Result let columns: Option> = args.get_flag("columns")?; let csv_reader = CsvReader::from_path(&file.item) - .map_err(|e| parse_polars_error::<&str>(&e, &file.tag.span, None))?; + .map_err(|e| parse_polars_error::<&str>(&e, &file.tag.span, None))? + .with_encoding(CsvEncoding::LossyUtf8); let csv_reader = match delimiter { None => csv_reader, @@ -205,6 +206,15 @@ fn from_csv(args: CommandArgs) -> Result match csv_reader.finish() { Ok(df) => Ok(df), - Err(e) => Err(parse_polars_error::<&str>(&e, &file.tag.span, None)), + Err(e) => match e { + PolarsError::Other(_) => Err(ShellError::labeled_error_with_secondary( + "Schema error", + "Error with the inferred schema", + &file.tag.span, + "You can use the argument 'infer_schema' with a number of rows large enough to better infer the schema", + &file.tag.span, + )), + _ => Err(parse_polars_error::<&str>(&e, &file.tag.span, None)), + }, } } diff --git a/crates/nu-data/Cargo.toml b/crates/nu-data/Cargo.toml index 5ee0750777..88122a45fd 100644 --- a/crates/nu-data/Cargo.toml +++ b/crates/nu-data/Cargo.toml @@ -38,7 +38,7 @@ nu-value-ext = { version = "0.34.0", path="../nu-value-ext" } nu-ansi-term = { version = "0.34.0", path="../nu-ansi-term" } [dependencies.polars] -version = "0.14.5" +version = "0.14.7" optional = true features = ["strings", "checked_arithmetic"] diff --git a/crates/nu-protocol/Cargo.toml b/crates/nu-protocol/Cargo.toml index 371131ab63..c75dc31525 100644 --- a/crates/nu-protocol/Cargo.toml +++ b/crates/nu-protocol/Cargo.toml @@ -31,7 +31,7 @@ serde_yaml = "0.8.16" toml = "0.5.8" [dependencies.polars] -version = "0.14.5" +version = "0.14.7" optional = true features = ["serde", "rows"]