From b36d21e76f326c70ce7c6455adf1bb53146be694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20N=2E=20Robalino?= Date: Mon, 16 Mar 2020 15:50:45 -0500 Subject: [PATCH] Infer types from regular delimited plain text unstructured files. (#1494) * Infer types from regular delimited plain text unstructured files. * Nothing resolves to an empty string. --- .cargo/config | 2 +- .../src/commands/from_delimited_data.rs | 197 ++++++++----- .../nu-cli/src/commands/to_delimited_data.rs | 1 + crates/nu-cli/tests/format_conversions/csv.rs | 36 ++- crates/nu-parser/src/hir/signature.rs | 2 +- .../syntax_shape/expression/variable_path.rs | 12 - crates/nu-parser/src/hir/tokens_iterator.rs | 7 +- .../src/hir/tokens_iterator/tests.rs | 46 --- crates/nu-parser/src/lib.rs | 8 + crates/nu-parser/src/macros.rs | 46 +++ crates/nu-parser/src/parse/parser.rs | 179 +----------- crates/nu-parser/src/parse/token_tree.rs | 22 +- crates/nu-parser/src/parse/util.rs | 1 - .../src/parse/util/line_delimited_parser.rs | 2 + .../util/line_delimited_parser/parser.rs | 272 ++++++++++++++++++ .../parse/util/line_delimited_parser/shape.rs | 91 ++++++ crates/nu-parser/src/parse/util/mod.rs | 4 + crates/nu-parser/src/test_support/mod.rs | 104 +++++++ crates/nu-source/src/meta.rs | 21 ++ tests/plugins/core_str.rs | 13 +- 20 files changed, 751 insertions(+), 315 deletions(-) delete mode 100644 crates/nu-parser/src/hir/tokens_iterator/tests.rs delete mode 100644 crates/nu-parser/src/parse/util.rs create mode 100644 crates/nu-parser/src/parse/util/line_delimited_parser.rs create mode 100644 crates/nu-parser/src/parse/util/line_delimited_parser/parser.rs create mode 100644 crates/nu-parser/src/parse/util/line_delimited_parser/shape.rs create mode 100644 crates/nu-parser/src/parse/util/mod.rs create mode 100644 crates/nu-parser/src/test_support/mod.rs diff --git a/.cargo/config b/.cargo/config index fd2875ddab..e32687a32d 100644 --- a/.cargo/config +++ b/.cargo/config @@ -1,3 +1,3 @@ [build] -#rustflags = ["--cfg", "coloring_in_tokens"] \ No newline at end of file +#rustflags = ["--cfg", "data_processing_primitives"] diff --git a/crates/nu-cli/src/commands/from_delimited_data.rs b/crates/nu-cli/src/commands/from_delimited_data.rs index 2e931885cb..0ebbcf1e03 100644 --- a/crates/nu-cli/src/commands/from_delimited_data.rs +++ b/crates/nu-cli/src/commands/from_delimited_data.rs @@ -1,42 +1,12 @@ use crate::prelude::*; -use csv::{ErrorKind, ReaderBuilder}; use nu_errors::ShellError; -use nu_protocol::{Primitive, ReturnSuccess, TaggedDictBuilder, UntaggedValue, Value}; +use nu_parser::hir::syntax_shape::{ExpandContext, SignatureRegistry}; +use nu_parser::utils::{parse_line_with_separator as parse, LineSeparatedShape}; +use nu_parser::TokensIterator; +use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value}; +use nu_source::nom_input; -fn from_delimited_string_to_value( - s: String, - headerless: bool, - separator: char, - tag: impl Into, -) -> Result { - let mut reader = ReaderBuilder::new() - .has_headers(!headerless) - .delimiter(separator as u8) - .from_reader(s.as_bytes()); - let tag = tag.into(); - - let headers = if headerless { - (1..=reader.headers()?.len()) - .map(|i| format!("Column{}", i)) - .collect::>() - } else { - reader.headers()?.iter().map(String::from).collect() - }; - - let mut rows = vec![]; - for row in reader.records() { - let mut tagged_row = TaggedDictBuilder::new(&tag); - for (value, header) in row?.iter().zip(headers.iter()) { - tagged_row.insert_value( - header, - UntaggedValue::Primitive(Primitive::String(String::from(value))).into_value(&tag), - ) - } - rows.push(tagged_row.into_value()); - } - - Ok(UntaggedValue::Table(rows).into_value(&tag)) -} +use derive_new::new; pub fn from_delimited_data( headerless: bool, @@ -50,19 +20,20 @@ pub fn from_delimited_data( let concat_string = input.collect_string(name_tag.clone()).await?; match from_delimited_string_to_value(concat_string.item, headerless, sep, name_tag.clone()) { - Ok(x) => match x { - Value { value: UntaggedValue::Table(list), .. } => { - for l in list { - yield ReturnSuccess::value(l); + Ok(rows) => { + for row in rows { + match row { + Value { value: UntaggedValue::Table(list), .. } => { + for l in list { + yield ReturnSuccess::value(l); + } + } + x => yield ReturnSuccess::value(x), } } - x => yield ReturnSuccess::value(x), }, Err(err) => { - let line_one = match pretty_csv_error(err) { - Some(pretty) => format!("Could not parse as {} ({})", format_name,pretty), - None => format!("Could not parse as {}", format_name), - }; + let line_one = format!("Could not parse as {}", format_name); let line_two = format!("input cannot be parsed as {}", format_name); yield Err(ShellError::labeled_error_with_secondary( line_one, @@ -78,25 +49,121 @@ pub fn from_delimited_data( Ok(stream.to_output_stream()) } -fn pretty_csv_error(err: csv::Error) -> Option { - match err.kind() { - ErrorKind::UnequalLengths { - pos, - expected_len, - len, - } => { - if let Some(pos) = pos { - Some(format!( - "Line {}: expected {} fields, found {}", - pos.line(), - expected_len, - len - )) - } else { - Some(format!("Expected {} fields, found {}", expected_len, len)) - } - } - ErrorKind::Seek => Some("Internal error while parsing csv".to_string()), - _ => None, +#[derive(Debug, Clone, new)] +pub struct EmptyRegistry { + #[new(default)] + signatures: indexmap::IndexMap, +} + +impl EmptyRegistry {} + +impl SignatureRegistry for EmptyRegistry { + fn has(&self, _name: &str) -> bool { + false + } + fn get(&self, _name: &str) -> Option { + None + } + fn clone_box(&self) -> Box { + Box::new(self.clone()) } } + +fn from_delimited_string_to_value( + s: String, + headerless: bool, + sep: char, + tag: impl Into, +) -> Result, ShellError> { + let tag = tag.into(); + + let mut entries = s.lines(); + + let mut fields = vec![]; + let mut out = vec![]; + + if let Some(first_entry) = entries.next() { + let tokens = match parse(&sep.to_string(), nom_input(first_entry)) { + Ok((_, tokens)) => tokens, + Err(err) => return Err(ShellError::parse_error(err)), + }; + + let tokens_span = tokens.span; + let source: nu_source::Text = tokens_span.slice(&first_entry).into(); + + if !headerless { + fields = tokens + .item + .iter() + .filter(|token| !token.is_separator()) + .map(|field| field.source(&source).to_string()) + .collect::>(); + } + + let registry = Box::new(EmptyRegistry::new()); + let ctx = ExpandContext::new(registry, &source, None); + + let mut iterator = TokensIterator::new(&tokens.item, ctx, tokens_span); + let (results, tokens_identified) = iterator.expand(LineSeparatedShape); + let results = results?; + + let mut row = TaggedDictBuilder::new(&tag); + + if headerless { + let fallback_columns = (1..=tokens_identified) + .map(|i| format!("Column{}", i)) + .collect::>(); + + for (idx, field) in results.into_iter().enumerate() { + let key = if headerless { + &fallback_columns[idx] + } else { + &fields[idx] + }; + + row.insert_value(key, field.into_value(&tag)); + } + + out.push(row.into_value()) + } + } + + for entry in entries { + let tokens = match parse(&sep.to_string(), nom_input(entry)) { + Ok((_, tokens)) => tokens, + Err(err) => return Err(ShellError::parse_error(err)), + }; + let tokens_span = tokens.span; + + let source: nu_source::Text = tokens_span.slice(&entry).into(); + let registry = Box::new(EmptyRegistry::new()); + let ctx = ExpandContext::new(registry, &source, None); + + let mut iterator = TokensIterator::new(&tokens.item, ctx, tokens_span); + let (results, tokens_identified) = iterator.expand(LineSeparatedShape); + let results = results?; + + let mut row = TaggedDictBuilder::new(&tag); + + let fallback_columns = (1..=tokens_identified) + .map(|i| format!("Column{}", i)) + .collect::>(); + + for (idx, field) in results.into_iter().enumerate() { + let key = if headerless { + &fallback_columns[idx] + } else { + match fields.get(idx) { + Some(key) => key, + None => &fallback_columns[idx], + } + }; + + row.insert_value(key, field.into_value(&tag)); + } + + out.push(row.into_value()) + } + + Ok(out) +} diff --git a/crates/nu-cli/src/commands/to_delimited_data.rs b/crates/nu-cli/src/commands/to_delimited_data.rs index ad3819737a..90603ed262 100644 --- a/crates/nu-cli/src/commands/to_delimited_data.rs +++ b/crates/nu-cli/src/commands/to_delimited_data.rs @@ -140,6 +140,7 @@ fn to_string_tagged_value(v: &Value) -> Result { | UntaggedValue::Primitive(Primitive::Path(_)) | UntaggedValue::Primitive(Primitive::Int(_)) => as_string(v), UntaggedValue::Primitive(Primitive::Date(d)) => Ok(d.to_string()), + UntaggedValue::Primitive(Primitive::Nothing) => Ok(String::new()), UntaggedValue::Table(_) => Ok(String::from("[Table]")), UntaggedValue::Row(_) => Ok(String::from("[Row]")), _ => Err(ShellError::labeled_error( diff --git a/crates/nu-cli/tests/format_conversions/csv.rs b/crates/nu-cli/tests/format_conversions/csv.rs index 248df32871..c6689a652c 100644 --- a/crates/nu-cli/tests/format_conversions/csv.rs +++ b/crates/nu-cli/tests/format_conversions/csv.rs @@ -73,8 +73,36 @@ fn table_to_csv_text_skipping_headers_after_conversion() { } #[test] -fn from_csv_text_to_table() { +fn infers_types() { Playground::setup("filter_from_csv_test_1", |dirs, sandbox| { + sandbox.with_files(vec![FileWithContentToBeTrimmed( + "los_cuatro_mosqueteros.csv", + r#" + first_name,last_name,rusty_luck + Andrés,Robalino,1,d + Jonathan,Turner,1,d + Yehuda,Katz,1,d + Jason,Gedge,1,d + "#, + )]); + + let actual = nu!( + cwd: dirs.test(), pipeline( + r#" + open los_cuatro_mosqueteros.csv + | where rusty_luck > 0 + | count + | echo $it + "# + )); + + assert_eq!(actual, "4"); + }) +} + +#[test] +fn from_csv_text_to_table() { + Playground::setup("filter_from_csv_test_2", |dirs, sandbox| { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_caballeros.txt", r#" @@ -102,7 +130,7 @@ fn from_csv_text_to_table() { #[test] fn from_csv_text_with_separator_to_table() { - Playground::setup("filter_from_csv_test_2", |dirs, sandbox| { + Playground::setup("filter_from_csv_test_3", |dirs, sandbox| { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_caballeros.txt", r#" @@ -130,7 +158,7 @@ fn from_csv_text_with_separator_to_table() { #[test] fn from_csv_text_with_tab_separator_to_table() { - Playground::setup("filter_from_csv_test_3", |dirs, sandbox| { + Playground::setup("filter_from_csv_test_4", |dirs, sandbox| { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_caballeros.txt", r#" @@ -158,7 +186,7 @@ fn from_csv_text_with_tab_separator_to_table() { #[test] fn from_csv_text_skipping_headers_to_table() { - Playground::setup("filter_from_csv_test_4", |dirs, sandbox| { + Playground::setup("filter_from_csv_test_5", |dirs, sandbox| { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_amigos.txt", r#" diff --git a/crates/nu-parser/src/hir/signature.rs b/crates/nu-parser/src/hir/signature.rs index 50e80b51bc..fdf67abeb4 100644 --- a/crates/nu-parser/src/hir/signature.rs +++ b/crates/nu-parser/src/hir/signature.rs @@ -1,6 +1,6 @@ use crate::hir; use crate::hir::syntax_shape::{ - expand_atom, expand_syntax, BareShape, ExpandContext, ExpandSyntax, ExpansionRule, + ExpandSyntax, expand_atom, expand_syntax, BareShape, ExpandContext, ExpandSyntax, ExpansionRule, UnspannedAtomicToken, WhitespaceShape, }; use crate::hir::tokens_iterator::TokensIterator; diff --git a/crates/nu-parser/src/hir/syntax_shape/expression/variable_path.rs b/crates/nu-parser/src/hir/syntax_shape/expression/variable_path.rs index fad73e5fb5..1101b882ba 100644 --- a/crates/nu-parser/src/hir/syntax_shape/expression/variable_path.rs +++ b/crates/nu-parser/src/hir/syntax_shape/expression/variable_path.rs @@ -477,18 +477,6 @@ impl ExpandSyntax for MemberShape { return Ok(Member::Bare(bare.span())); } - /* KATZ */ - /* let number = NumberShape.test(token_nodes, context); - - if let Some(peeked) = number { - let node = peeked.not_eof("column")?.commit(); - let (n, span) = node.as_number().ok_or_else(|| { - ParseError::internal_error("can't convert node to number".spanned(node.span())) - })?; - - return Ok(Member::Number(n, span)) - }*/ - let string = token_nodes.expand_syntax(StringShape); if let Ok(syntax) = string { diff --git a/crates/nu-parser/src/hir/tokens_iterator.rs b/crates/nu-parser/src/hir/tokens_iterator.rs index b2d104d4bc..6ad7552a7d 100644 --- a/crates/nu-parser/src/hir/tokens_iterator.rs +++ b/crates/nu-parser/src/hir/tokens_iterator.rs @@ -3,9 +3,6 @@ pub(crate) mod into_shapes; pub(crate) mod pattern; pub(crate) mod state; -#[cfg(test)] -mod tests; - use self::debug::ExpandTracer; use self::into_shapes::IntoShapes; use self::state::{Peeked, TokensIteratorState}; @@ -510,7 +507,7 @@ impl<'content> TokensIterator<'content> { /// The purpose of `expand_infallible` is to clearly mark the infallible path through /// and entire list of tokens that produces a fully colored version of the source. /// - /// If the `ExpandSyntax` can poroduce a `Result`, make sure to use `expand_syntax`, + /// If the `ExpandSyntax` can produce a `Result`, make sure to use `expand_syntax`, /// which will correctly show the error in the trace. pub fn expand_infallible(&mut self, shape: impl ExpandSyntax) -> U where @@ -536,7 +533,7 @@ impl<'content> TokensIterator<'content> { }) } - fn expand(&mut self, shape: impl ExpandSyntax) -> (U, usize) + pub fn expand(&mut self, shape: impl ExpandSyntax) -> (U, usize) where U: std::fmt::Debug + Clone + 'static, { diff --git a/crates/nu-parser/src/hir/tokens_iterator/tests.rs b/crates/nu-parser/src/hir/tokens_iterator/tests.rs deleted file mode 100644 index 8c22aec77d..0000000000 --- a/crates/nu-parser/src/hir/tokens_iterator/tests.rs +++ /dev/null @@ -1,46 +0,0 @@ -use crate::hir::{syntax_shape::ExpandContext, syntax_shape::SignatureRegistry, TokensIterator}; -use crate::parse::token_tree_builder::TokenTreeBuilder as b; -use nu_protocol::Signature; -use nu_source::{Span, Text}; - -use derive_new::new; - -#[derive(Debug, Clone, new)] -struct TestRegistry { - #[new(default)] - signatures: indexmap::IndexMap, -} - -impl TestRegistry {} - -impl SignatureRegistry for TestRegistry { - fn has(&self, name: &str) -> bool { - self.signatures.contains_key(name) - } - fn get(&self, name: &str) -> Option { - self.signatures.get(name).cloned() - } - fn clone_box(&self) -> Box { - Box::new(self.clone()) - } -} - -#[test] -fn supplies_tokens() { - let token = b::it_var(); - - let (tokens, source) = b::build(token); - - let tokens = vec![tokens]; - let source = Text::from(&source); - - let mut iterator = TokensIterator::new( - &tokens, - ExpandContext::new(Box::new(TestRegistry::new()), &source, None), - Span::unknown(), - ); - - let token = iterator.next().expect("Token expected."); - - token.expect_var(); -} diff --git a/crates/nu-parser/src/lib.rs b/crates/nu-parser/src/lib.rs index 2fb447bb34..e5f5f540c7 100644 --- a/crates/nu-parser/src/lib.rs +++ b/crates/nu-parser/src/lib.rs @@ -6,6 +6,9 @@ pub mod hir; pub mod parse; pub mod parse_command; +#[cfg(test)] +pub mod test_support; + pub use crate::commands::classified::{ external::ExternalCommand, internal::InternalCommand, ClassifiedCommand, ClassifiedPipeline, }; @@ -20,6 +23,11 @@ pub use crate::parse::parser::{module, pipeline}; pub use crate::parse::token_tree::{Delimiter, SpannedToken, Token}; pub use crate::parse::token_tree_builder::TokenTreeBuilder; +pub mod utils { + pub use crate::parse::util::parse_line_with_separator; + pub use crate::parse::util::LineSeparatedShape; +} + use log::log_enabled; use nu_errors::ShellError; use nu_protocol::{errln, outln}; diff --git a/crates/nu-parser/src/macros.rs b/crates/nu-parser/src/macros.rs index 741476f83e..31f82ff975 100644 --- a/crates/nu-parser/src/macros.rs +++ b/crates/nu-parser/src/macros.rs @@ -7,3 +7,49 @@ macro_rules! return_ok { } }; } + +#[cfg(test)] +macro_rules! equal_tokens { + ($source:tt -> $tokens:expr) => { + let result = apply(pipeline, "pipeline", $source); + let (expected_tree, expected_source) = TokenTreeBuilder::build($tokens); + + if result != expected_tree { + let debug_result = format!("{}", result.debug($source)); + let debug_expected = format!("{}", expected_tree.debug(&expected_source)); + + if debug_result == debug_expected { + assert_eq!( + result, expected_tree, + "NOTE: actual and expected had equivalent debug serializations, source={:?}, debug_expected={:?}", + $source, + debug_expected + ) + } else { + assert_eq!(debug_result, debug_expected) + } + } + }; + + (<$parser:tt> $source:tt -> $tokens:expr) => { + let result = apply($parser, stringify!($parser), $source); + + let (expected_tree, expected_source) = TokenTreeBuilder::build($tokens); + + if result != expected_tree { + let debug_result = format!("{}", result.debug($source)); + let debug_expected = format!("{}", expected_tree.debug(&expected_source)); + + if debug_result == debug_expected { + assert_eq!( + result, expected_tree, + "NOTE: actual and expected had equivalent debug serializations, source={:?}, debug_expected={:?}", + $source, + debug_expected + ) + } else { + assert_eq!(debug_result, debug_expected) + } + } + }; +} diff --git a/crates/nu-parser/src/parse/parser.rs b/crates/nu-parser/src/parse/parser.rs index f51781c5fe..2731b901dc 100644 --- a/crates/nu-parser/src/parse/parser.rs +++ b/crates/nu-parser/src/parse/parser.rs @@ -1,5 +1,4 @@ #![allow(unused)] - use crate::parse::{ call_node::*, flag::*, number::*, operator::*, pipeline::*, token_tree::*, token_tree_builder::*, unit::*, @@ -318,6 +317,7 @@ pub fn dq_string(input: NomSpan) -> IResult { let (input, _) = char('"')(input)?; let start1 = input.offset; let (input, _) = many0(none_of("\""))(input)?; + let end1 = input.offset; let (input, _) = char('"')(input)?; let end = input.offset; @@ -939,7 +939,7 @@ pub fn tight_node(input: NomSpan) -> IResult> { ))(input) } -fn to_list( +pub fn to_list( parser: impl Fn(NomSpan) -> IResult, ) -> impl Fn(NomSpan) -> IResult> { move |input| { @@ -1017,7 +1017,7 @@ fn parse_int(frag: &str, neg: Option) -> i64 { } } -fn is_boundary(c: Option) -> bool { +pub fn is_boundary(c: Option) -> bool { match c { None => true, Some(')') | Some(']') | Some('}') | Some('(') => true, @@ -1140,59 +1140,13 @@ fn is_member_start(c: char) -> bool { #[cfg(test)] mod tests { - use super::*; - use crate::parse::token_tree_builder::TokenTreeBuilder as b; - use crate::parse::token_tree_builder::{CurriedToken, TokenTreeBuilder}; + use crate::parse::parser::{module, nodes, pipeline}; + use crate::parse::token_tree_builder::TokenTreeBuilder::{self, self as b}; + use crate::test_support::apply; + use nu_source::PrettyDebugWithSource; + use pretty_assertions::assert_eq; - pub type CurriedNode = Box T + 'static>; - - macro_rules! equal_tokens { - ($source:tt -> $tokens:expr) => { - let result = apply(pipeline, "pipeline", $source); - let (expected_tree, expected_source) = TokenTreeBuilder::build($tokens); - - if result != expected_tree { - let debug_result = format!("{}", result.debug($source)); - let debug_expected = format!("{}", expected_tree.debug(&expected_source)); - - if debug_result == debug_expected { - assert_eq!( - result, expected_tree, - "NOTE: actual and expected had equivalent debug serializations, source={:?}, debug_expected={:?}", - $source, - debug_expected - ) - } else { - assert_eq!(debug_result, debug_expected) - } - } - }; - - (<$parser:tt> $source:tt -> $tokens:expr) => { - let result = apply($parser, stringify!($parser), $source); - - let (expected_tree, expected_source) = TokenTreeBuilder::build($tokens); - - if result != expected_tree { - let debug_result = format!("{}", result.debug($source)); - let debug_expected = format!("{}", expected_tree.debug(&expected_source)); - - if debug_result == debug_expected { - assert_eq!( - result, expected_tree, - "NOTE: actual and expected had equivalent debug serializations, source={:?}, debug_expected={:?}", - $source, - debug_expected - ) - } else { - assert_eq!(debug_result, debug_expected) - } - } - }; - - } - #[test] fn test_integer() { equal_tokens! { @@ -1339,7 +1293,7 @@ mod tests { fn test_flag() { equal_tokens! { - "--amigos" -> b::token_list(vec![b::flag("arepas")]) + "--amigos" -> b::token_list(vec![b::flag("amigos")]) } equal_tokens! { @@ -1721,119 +1675,4 @@ mod tests { ]) ); } - - // #[test] - // fn test_smoke_pipeline() { - // let _ = pretty_env_logger::try_init(); - - // assert_eq!( - // apply( - // pipeline, - // "pipeline", - // r#"git branch --merged | split-row "`n" | where $it != "* master""# - // ), - // build_token(b::pipeline(vec![ - // ( - // None, - // b::call( - // b::bare("git"), - // vec![b::sp(), b::bare("branch"), b::sp(), b::flag("merged")] - // ), - // Some(" ") - // ), - // ( - // Some(" "), - // b::call(b::bare("split-row"), vec![b::sp(), b::string("`n")]), - // Some(" ") - // ), - // ( - // Some(" "), - // b::call( - // b::bare("where"), - // vec![ - // b::sp(), - // b::it_var(), - // b::sp(), - // b::op("!="), - // b::sp(), - // b::string("* master") - // ] - // ), - // None - // ) - // ])) - // ); - - // assert_eq!( - // apply(pipeline, "pipeline", "ls | where { $it.size > 100 }"), - // build_token(b::pipeline(vec![ - // (None, b::call(b::bare("ls"), vec![]), Some(" ")), - // ( - // Some(" "), - // b::call( - // b::bare("where"), - // vec![ - // b::sp(), - // b::braced(vec![ - // b::path(b::it_var(), vec![b::member("size")]), - // b::sp(), - // b::op(">"), - // b::sp(), - // b::int(100) - // ]) - // ] - // ), - // None - // ) - // ])) - // ) - // } - - fn apply( - f: impl Fn( - NomSpan, - ) - -> Result<(NomSpan, SpannedToken), nom::Err<(NomSpan, nom::error::ErrorKind)>>, - desc: &str, - string: &str, - ) -> SpannedToken { - let result = f(nom_input(string)); - - match result { - Ok(value) => value.1, - Err(err) => { - let err = nu_errors::ShellError::parse_error(err); - - println!("{:?}", string); - crate::hir::baseline_parse::tests::print_err(err, &nu_source::Text::from(string)); - panic!("test failed") - } - } - } - - fn span((left, right): (usize, usize)) -> Span { - Span::new(left, right) - } - - fn delimited( - delimiter: Spanned, - children: Vec, - left: usize, - right: usize, - ) -> SpannedToken { - let start = Span::for_char(left); - let end = Span::for_char(right); - - let node = DelimitedNode::new(delimiter.item, (start, end), children); - Token::Delimited(node).into_spanned((left, right)) - } - - fn build(block: CurriedNode) -> T { - let mut builder = TokenTreeBuilder::new(); - block(&mut builder) - } - - fn build_token(block: CurriedToken) -> SpannedToken { - TokenTreeBuilder::build(block).0 - } } diff --git a/crates/nu-parser/src/parse/token_tree.rs b/crates/nu-parser/src/parse/token_tree.rs index 536c616697..796a1415c6 100644 --- a/crates/nu-parser/src/parse/token_tree.rs +++ b/crates/nu-parser/src/parse/token_tree.rs @@ -306,6 +306,13 @@ impl SpannedToken { } } + pub fn is_int(&self) -> bool { + match self.unspanned() { + Token::Number(RawNumber::Int(_)) => true, + _ => false, + } + } + pub fn as_string(&self) -> Option<(Span, Span)> { match self.unspanned() { Token::String(inner_span) => Some((self.span(), *inner_span)), @@ -327,16 +334,16 @@ impl SpannedToken { } } - pub fn is_int(&self) -> bool { + pub fn is_dot(&self) -> bool { match self.unspanned() { - Token::Number(RawNumber::Int(_)) => true, + Token::EvaluationOperator(EvaluationOperator::Dot) => true, _ => false, } } - pub fn is_dot(&self) -> bool { + pub fn is_separator(&self) -> bool { match self.unspanned() { - Token::EvaluationOperator(EvaluationOperator::Dot) => true, + Token::Separator => true, _ => false, } } @@ -479,6 +486,13 @@ impl SpannedToken { } } + pub fn expect_number(&self) -> RawNumber { + match self.unspanned() { + Token::Number(raw_number) => *raw_number, + other => panic!("Expected number, found {:?}", other), + } + } + pub fn expect_string(&self) -> (Span, Span) { match self.unspanned() { Token::String(inner_span) => (self.span(), *inner_span), diff --git a/crates/nu-parser/src/parse/util.rs b/crates/nu-parser/src/parse/util.rs deleted file mode 100644 index 8b13789179..0000000000 --- a/crates/nu-parser/src/parse/util.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/crates/nu-parser/src/parse/util/line_delimited_parser.rs b/crates/nu-parser/src/parse/util/line_delimited_parser.rs new file mode 100644 index 0000000000..730a516bb6 --- /dev/null +++ b/crates/nu-parser/src/parse/util/line_delimited_parser.rs @@ -0,0 +1,2 @@ +pub(crate) mod parser; +pub(crate) mod shape; diff --git a/crates/nu-parser/src/parse/util/line_delimited_parser/parser.rs b/crates/nu-parser/src/parse/util/line_delimited_parser/parser.rs new file mode 100644 index 0000000000..f1e057f368 --- /dev/null +++ b/crates/nu-parser/src/parse/util/line_delimited_parser/parser.rs @@ -0,0 +1,272 @@ +use crate::parse::number::RawNumber; +use crate::parse::parser::{is_boundary, to_list}; +use crate::parse::token_tree::SpannedToken; +use crate::parse::token_tree_builder::TokenTreeBuilder; +use nu_source::{HasSpan, NomSpan, Span, Spanned, SpannedItem}; + +use nom::branch::alt; +use nom::bytes::complete::{escaped, tag}; +use nom::character::complete::*; +use nom::combinator::*; +use nom::multi::*; +use nom::IResult; +use nom_tracable::tracable_parser; + +#[tracable_parser] +pub fn parse_line_with_separator<'a, 'b>( + separator: &'b str, + input: NomSpan<'a>, +) -> IResult, Spanned>> { + let start = input.offset; + let mut nodes = vec![]; + let mut next_input = input; + + loop { + let node_result = to_list(leaf(separator))(next_input); + + let (after_node_input, next_nodes) = match node_result { + Err(_) => break, + Ok((after_node_input, next_node)) => (after_node_input, next_node), + }; + + nodes.extend(next_nodes); + + match separated_by(separator)(after_node_input) { + Err(_) => { + next_input = after_node_input; + break; + } + Ok((input, s)) => { + nodes.push(s); + next_input = input; + } + } + } + + let end = next_input.offset; + + Ok((next_input, nodes.spanned(Span::new(start, end)))) +} + +#[tracable_parser] +pub fn fallback_number_without(c: char) -> impl Fn(NomSpan) -> IResult { + move |input| { + let (input, number) = fallback_raw_number_without(c)(input)?; + + Ok(( + input, + TokenTreeBuilder::spanned_number(number, number.span()), + )) + } +} + +#[tracable_parser] +pub fn fallback_raw_number_without(c: char) -> impl Fn(NomSpan) -> IResult { + move |input| { + let _anchoral = input; + let start = input.offset; + let (input, _neg) = opt(tag("-"))(input)?; + let (input, _head) = digit1(input)?; + let after_int_head = input; + + match input.fragment.chars().next() { + None => return Ok((input, RawNumber::int(Span::new(start, input.offset)))), + Some('.') => (), + other if is_boundary(other) || other == Some(c) => { + return Ok((input, RawNumber::int(Span::new(start, input.offset)))) + } + _ => { + return Err(nom::Err::Error(nom::error::make_error( + input, + nom::error::ErrorKind::Tag, + ))) + } + } + + let dot: IResult = tag(".")(input); + + let input = match dot { + Ok((input, _dot)) => input, + + // it's just an integer + Err(_) => return Ok((input, RawNumber::int(Span::new(start, input.offset)))), + }; + + let tail_digits_result: IResult = digit1(input); + + let (input, _tail) = match tail_digits_result { + Ok((input, tail)) => (input, tail), + Err(_) => { + return Ok(( + after_int_head, + RawNumber::int((start, after_int_head.offset)), + )) + } + }; + + let end = input.offset; + + let next = input.fragment.chars().next(); + + if is_boundary(next) || next == Some(c) { + Ok((input, RawNumber::decimal(Span::new(start, end)))) + } else { + Err(nom::Err::Error(nom::error::make_error( + input, + nom::error::ErrorKind::Tag, + ))) + } + } +} + +#[tracable_parser] +pub fn leaf(c: &str) -> impl Fn(NomSpan) -> IResult + '_ { + move |input| { + let separator = c.chars().next().unwrap_or_else(|| ','); + + let (input, node) = alt(( + fallback_number_without(separator), + string, + fallback_string_without(c), + ))(input)?; + + Ok((input, node)) + } +} + +#[tracable_parser] +pub fn separated_by(c: &str) -> impl Fn(NomSpan) -> IResult + '_ { + move |input| { + let left = input.offset; + let (input, _) = tag(c)(input)?; + let right = input.offset; + + Ok((input, TokenTreeBuilder::spanned_sep(Span::new(left, right)))) + } +} + +#[tracable_parser] +pub fn dq_string(input: NomSpan) -> IResult { + let start = input.offset; + let (input, _) = char('"')(input)?; + let start1 = input.offset; + let (input, _) = escaped( + none_of(r#"\""#), + '\\', + nom::character::complete::one_of(r#"\"rnt"#), + )(input)?; + + let end1 = input.offset; + let (input, _) = char('"')(input)?; + let end = input.offset; + Ok(( + input, + TokenTreeBuilder::spanned_string(Span::new(start1, end1), Span::new(start, end)), + )) +} + +#[tracable_parser] +pub fn sq_string(input: NomSpan) -> IResult { + let start = input.offset; + let (input, _) = char('\'')(input)?; + let start1 = input.offset; + let (input, _) = many0(none_of("\'"))(input)?; + let end1 = input.offset; + let (input, _) = char('\'')(input)?; + let end = input.offset; + + Ok(( + input, + TokenTreeBuilder::spanned_string(Span::new(start1, end1), Span::new(start, end)), + )) +} + +#[tracable_parser] +pub fn string(input: NomSpan) -> IResult { + alt((sq_string, dq_string))(input) +} + +#[tracable_parser] +pub fn fallback_string_without(c: &str) -> impl Fn(NomSpan) -> IResult + '_ { + move |input| { + let start = input.offset; + let (input, _) = many0(none_of(c))(input)?; + let end = input.offset; + + Ok(( + input, + TokenTreeBuilder::spanned_string(Span::new(start, end), Span::new(start, end)), + )) + } +} + +#[cfg(test)] +mod tests { + use crate::parse::token_tree_builder::TokenTreeBuilder::{self, self as b}; + use crate::parse::util::parse_line_with_separator; + use crate::test_support::apply; + use nom::IResult; + + use crate::parse::pipeline::PipelineElement; + use crate::parse::token_tree::SpannedToken; + use nu_source::NomSpan; + use nu_source::PrettyDebugWithSource; + + use pretty_assertions::assert_eq; + + pub fn nodes(input: NomSpan) -> IResult { + let (input, tokens) = parse_line_with_separator(",", input)?; + let span = tokens.span; + + Ok(( + input, + TokenTreeBuilder::spanned_pipeline(vec![PipelineElement::new(None, tokens)], span), + )) + } + + #[test] + fn separators() { + equal_tokens! { + + r#""name","lastname","age""# -> b::token_list(vec![ + b::string("name"), + b::sep(","), + b::string("lastname"), + b::sep(","), + b::string("age") + ]) + } + + equal_tokens! { + + r#""Andrés","Robalino",12"# -> b::token_list(vec![ + b::string("Andrés"), + b::sep(","), + b::string("Robalino"), + b::sep(","), + b::int(12) + ]) + } + } + + #[test] + fn strings() { + equal_tokens! { + + r#""andres""# -> b::token_list(vec![b::string("andres")]) + } + } + + #[test] + fn numbers() { + equal_tokens! { + + "123" -> b::token_list(vec![b::int(123)]) + } + + equal_tokens! { + + "-123" -> b::token_list(vec![b::int(-123)]) + } + } +} diff --git a/crates/nu-parser/src/parse/util/line_delimited_parser/shape.rs b/crates/nu-parser/src/parse/util/line_delimited_parser/shape.rs new file mode 100644 index 0000000000..34cedf2df6 --- /dev/null +++ b/crates/nu-parser/src/parse/util/line_delimited_parser/shape.rs @@ -0,0 +1,91 @@ +use crate::hir::{ + self, syntax_shape::ExpandSyntax, syntax_shape::FlatShape, syntax_shape::NumberExpressionShape, + syntax_shape::StringShape, +}; +use crate::hir::{Expression, TokensIterator}; +use crate::parse::token_tree::SeparatorType; + +use nu_errors::ParseError; +use nu_protocol::UntaggedValue; +use nu_source::Span; + +#[derive(Debug, Copy, Clone)] +pub struct LineSeparatedShape; + +impl ExpandSyntax for LineSeparatedShape { + type Output = Result, ParseError>; + + fn name(&self) -> &'static str { + "any string line separated by" + } + + fn expand<'a, 'b>( + &self, + token_nodes: &mut TokensIterator<'_>, + ) -> Result, ParseError> { + let source = token_nodes.source(); + + if token_nodes.at_end() { + return Ok(vec![]); + } + + let mut entries = vec![]; + + loop { + let field = { + token_nodes + .expand_syntax(NumberExpressionShape) + .or_else(|_| { + token_nodes + .expand_syntax(StringShape) + .map(|syntax| Expression::string(syntax.inner).into_expr(syntax.span)) + }) + }; + + if let Ok(field) = field { + match &field.expr { + Expression::Literal(hir::Literal::Number(crate::Number::Int(i))) => { + entries.push(UntaggedValue::int(i.clone())) + } + Expression::Literal(hir::Literal::Number(crate::Number::Decimal(d))) => { + entries.push(UntaggedValue::decimal(d.clone())) + } + Expression::Literal(hir::Literal::String(span)) => { + if span.is_closed() { + entries.push(UntaggedValue::nothing()) + } else { + entries.push(UntaggedValue::string(span.slice(&source))) + } + } + _ => {} + } + } + + match token_nodes.expand_infallible(SeparatorShape) { + Err(err) if !token_nodes.at_end() => return Err(err), + _ => {} + } + + if token_nodes.at_end() { + break; + } + } + + Ok(entries) + } +} + +#[derive(Debug, Copy, Clone)] +pub struct SeparatorShape; + +impl ExpandSyntax for SeparatorShape { + type Output = Result; + + fn name(&self) -> &'static str { + "separated" + } + + fn expand<'a, 'b>(&self, token_nodes: &'b mut TokensIterator<'a>) -> Result { + token_nodes.expand_token(SeparatorType, |span| Ok((FlatShape::Separator, span))) + } +} diff --git a/crates/nu-parser/src/parse/util/mod.rs b/crates/nu-parser/src/parse/util/mod.rs new file mode 100644 index 0000000000..6dc6a7f3d5 --- /dev/null +++ b/crates/nu-parser/src/parse/util/mod.rs @@ -0,0 +1,4 @@ +mod line_delimited_parser; + +pub use line_delimited_parser::parser::parse_line_with_separator; +pub use line_delimited_parser::shape::LineSeparatedShape; diff --git a/crates/nu-parser/src/test_support/mod.rs b/crates/nu-parser/src/test_support/mod.rs new file mode 100644 index 0000000000..d9507b7662 --- /dev/null +++ b/crates/nu-parser/src/test_support/mod.rs @@ -0,0 +1,104 @@ +use crate::hir::{syntax_shape::ExpandContext, syntax_shape::SignatureRegistry}; + +use crate::parse::files::Files; +use crate::parse::token_tree::{DelimitedNode, Delimiter, SpannedToken, Token}; +use crate::parse::token_tree_builder::{CurriedToken, TokenTreeBuilder}; + +use nu_errors::ShellError; +use nu_protocol::Signature; +use nu_source::{nom_input, NomSpan, Span, Spanned, Text}; + +pub use nu_source::PrettyDebug; + +use derive_new::new; + +pub type CurriedNode = Box T + 'static>; + +#[derive(Debug, Clone, new)] +pub struct TestRegistry { + #[new(default)] + signatures: indexmap::IndexMap, +} + +impl TestRegistry {} + +impl SignatureRegistry for TestRegistry { + fn has(&self, name: &str) -> bool { + self.signatures.contains_key(name) + } + fn get(&self, name: &str) -> Option { + self.signatures.get(name).cloned() + } + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } +} + +pub fn with_empty_context(source: &Text, callback: impl FnOnce(ExpandContext)) { + let registry = TestRegistry::new(); + callback(ExpandContext::new(Box::new(registry), source, None)) +} + +pub fn inner_string_span(span: Span) -> Span { + Span::new(span.start() + 1, span.end() - 1) +} + +pub fn print_err(err: ShellError, source: &Text) { + let diag = err.into_diagnostic(); + + let writer = termcolor::StandardStream::stderr(termcolor::ColorChoice::Auto); + let mut source = source.to_string(); + source.push_str(" "); + let files = Files::new(source); + let _ = language_reporting::emit( + &mut writer.lock(), + &files, + &diag, + &language_reporting::DefaultConfig, + ); +} + +pub fn apply( + f: impl Fn(NomSpan) -> Result<(NomSpan, SpannedToken), nom::Err<(NomSpan, nom::error::ErrorKind)>>, + _desc: &str, + string: &str, +) -> SpannedToken { + let result = f(nom_input(string)); + + match result { + Ok(value) => value.1, + Err(err) => { + let err = nu_errors::ShellError::parse_error(err); + + println!("{:?}", string); + crate::hir::baseline_parse::tests::print_err(err, &nu_source::Text::from(string)); + panic!("test failed") + } + } +} + +pub fn span((left, right): (usize, usize)) -> Span { + Span::new(left, right) +} + +pub fn delimited( + delimiter: Spanned, + children: Vec, + left: usize, + right: usize, +) -> SpannedToken { + let start = Span::for_char(left); + let end = Span::for_char(right); + + let node = DelimitedNode::new(delimiter.item, (start, end), children); + Token::Delimited(node).into_spanned((left, right)) +} + +pub fn build(block: CurriedNode) -> T { + let mut builder = TokenTreeBuilder::new(); + block(&mut builder) +} + +pub fn build_token(block: CurriedToken) -> SpannedToken { + TokenTreeBuilder::build(block).0 +} diff --git a/crates/nu-source/src/meta.rs b/crates/nu-source/src/meta.rs index 161c5cb087..c693217388 100644 --- a/crates/nu-source/src/meta.rs +++ b/crates/nu-source/src/meta.rs @@ -659,6 +659,27 @@ impl Span { self.start == 0 && self.end == 0 } + /// Returns a bool if the current Span does not cover. + /// + /// # Example + /// + /// ``` + /// // make clean + /// // ---- + /// // (0,4) + /// // + /// // ^(5,5) + /// + /// let make_span = Span::new(0,4); + /// let clean_span = Span::new(5,5); + /// + /// assert_eq!(make_span.is_closed(), false); + /// assert_eq!(clean_span.is_closed(), true); + /// ``` + pub fn is_closed(&self) -> bool { + self.start == self.end + } + /// Returns a slice of the input that covers the start and end of the current Span. pub fn slice<'a>(&self, source: &'a str) -> &'a str { &source[self.start..self.end] diff --git a/tests/plugins/core_str.rs b/tests/plugins/core_str.rs index 2908d452a9..800f4d0fc5 100644 --- a/tests/plugins/core_str.rs +++ b/tests/plugins/core_str.rs @@ -78,16 +78,17 @@ fn converts_to_int() { let actual = nu!( cwd: "tests/fixtures/formats", pipeline( r#" - open caco3_plastics.csv - | first 1 - | str tariff_item --to-int - | where tariff_item == 2509000000 - | get tariff_item + echo '{number_as_string: "1"}' + | from-json + | str number_as_string --to-int + | rename number + | where number == 1 + | get number | echo $it "# )); - assert_eq!(actual, "2509000000"); + assert_eq!(actual, "1"); } #[test]