Infer types from regular delimited plain text unstructured files. (#1494)

* Infer types from regular delimited plain text unstructured files.

* Nothing resolves to an empty string.
This commit is contained in:
Andrés N. Robalino 2020-03-16 15:50:45 -05:00 committed by GitHub
parent d8c4565413
commit b36d21e76f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 751 additions and 315 deletions

View file

@ -1,3 +1,3 @@
[build] [build]
#rustflags = ["--cfg", "coloring_in_tokens"] #rustflags = ["--cfg", "data_processing_primitives"]

View file

@ -1,42 +1,12 @@
use crate::prelude::*; use crate::prelude::*;
use csv::{ErrorKind, ReaderBuilder};
use nu_errors::ShellError; use nu_errors::ShellError;
use nu_protocol::{Primitive, ReturnSuccess, TaggedDictBuilder, UntaggedValue, Value}; use nu_parser::hir::syntax_shape::{ExpandContext, SignatureRegistry};
use nu_parser::utils::{parse_line_with_separator as parse, LineSeparatedShape};
use nu_parser::TokensIterator;
use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value};
use nu_source::nom_input;
fn from_delimited_string_to_value( use derive_new::new;
s: String,
headerless: bool,
separator: char,
tag: impl Into<Tag>,
) -> Result<Value, csv::Error> {
let mut reader = ReaderBuilder::new()
.has_headers(!headerless)
.delimiter(separator as u8)
.from_reader(s.as_bytes());
let tag = tag.into();
let headers = if headerless {
(1..=reader.headers()?.len())
.map(|i| format!("Column{}", i))
.collect::<Vec<String>>()
} else {
reader.headers()?.iter().map(String::from).collect()
};
let mut rows = vec![];
for row in reader.records() {
let mut tagged_row = TaggedDictBuilder::new(&tag);
for (value, header) in row?.iter().zip(headers.iter()) {
tagged_row.insert_value(
header,
UntaggedValue::Primitive(Primitive::String(String::from(value))).into_value(&tag),
)
}
rows.push(tagged_row.into_value());
}
Ok(UntaggedValue::Table(rows).into_value(&tag))
}
pub fn from_delimited_data( pub fn from_delimited_data(
headerless: bool, headerless: bool,
@ -50,19 +20,20 @@ pub fn from_delimited_data(
let concat_string = input.collect_string(name_tag.clone()).await?; let concat_string = input.collect_string(name_tag.clone()).await?;
match from_delimited_string_to_value(concat_string.item, headerless, sep, name_tag.clone()) { match from_delimited_string_to_value(concat_string.item, headerless, sep, name_tag.clone()) {
Ok(x) => match x { Ok(rows) => {
for row in rows {
match row {
Value { value: UntaggedValue::Table(list), .. } => { Value { value: UntaggedValue::Table(list), .. } => {
for l in list { for l in list {
yield ReturnSuccess::value(l); yield ReturnSuccess::value(l);
} }
} }
x => yield ReturnSuccess::value(x), x => yield ReturnSuccess::value(x),
}
}
}, },
Err(err) => { Err(err) => {
let line_one = match pretty_csv_error(err) { let line_one = format!("Could not parse as {}", format_name);
Some(pretty) => format!("Could not parse as {} ({})", format_name,pretty),
None => format!("Could not parse as {}", format_name),
};
let line_two = format!("input cannot be parsed as {}", format_name); let line_two = format!("input cannot be parsed as {}", format_name);
yield Err(ShellError::labeled_error_with_secondary( yield Err(ShellError::labeled_error_with_secondary(
line_one, line_one,
@ -78,25 +49,121 @@ pub fn from_delimited_data(
Ok(stream.to_output_stream()) Ok(stream.to_output_stream())
} }
fn pretty_csv_error(err: csv::Error) -> Option<String> { #[derive(Debug, Clone, new)]
match err.kind() { pub struct EmptyRegistry {
ErrorKind::UnequalLengths { #[new(default)]
pos, signatures: indexmap::IndexMap<String, Signature>,
expected_len, }
len,
} => { impl EmptyRegistry {}
if let Some(pos) = pos {
Some(format!( impl SignatureRegistry for EmptyRegistry {
"Line {}: expected {} fields, found {}", fn has(&self, _name: &str) -> bool {
pos.line(), false
expected_len, }
len fn get(&self, _name: &str) -> Option<Signature> {
)) None
}
fn clone_box(&self) -> Box<dyn SignatureRegistry> {
Box::new(self.clone())
}
}
fn from_delimited_string_to_value(
s: String,
headerless: bool,
sep: char,
tag: impl Into<Tag>,
) -> Result<Vec<Value>, ShellError> {
let tag = tag.into();
let mut entries = s.lines();
let mut fields = vec![];
let mut out = vec![];
if let Some(first_entry) = entries.next() {
let tokens = match parse(&sep.to_string(), nom_input(first_entry)) {
Ok((_, tokens)) => tokens,
Err(err) => return Err(ShellError::parse_error(err)),
};
let tokens_span = tokens.span;
let source: nu_source::Text = tokens_span.slice(&first_entry).into();
if !headerless {
fields = tokens
.item
.iter()
.filter(|token| !token.is_separator())
.map(|field| field.source(&source).to_string())
.collect::<Vec<_>>();
}
let registry = Box::new(EmptyRegistry::new());
let ctx = ExpandContext::new(registry, &source, None);
let mut iterator = TokensIterator::new(&tokens.item, ctx, tokens_span);
let (results, tokens_identified) = iterator.expand(LineSeparatedShape);
let results = results?;
let mut row = TaggedDictBuilder::new(&tag);
if headerless {
let fallback_columns = (1..=tokens_identified)
.map(|i| format!("Column{}", i))
.collect::<Vec<String>>();
for (idx, field) in results.into_iter().enumerate() {
let key = if headerless {
&fallback_columns[idx]
} else { } else {
Some(format!("Expected {} fields, found {}", expected_len, len)) &fields[idx]
};
row.insert_value(key, field.into_value(&tag));
}
out.push(row.into_value())
} }
} }
ErrorKind::Seek => Some("Internal error while parsing csv".to_string()),
_ => None, for entry in entries {
let tokens = match parse(&sep.to_string(), nom_input(entry)) {
Ok((_, tokens)) => tokens,
Err(err) => return Err(ShellError::parse_error(err)),
};
let tokens_span = tokens.span;
let source: nu_source::Text = tokens_span.slice(&entry).into();
let registry = Box::new(EmptyRegistry::new());
let ctx = ExpandContext::new(registry, &source, None);
let mut iterator = TokensIterator::new(&tokens.item, ctx, tokens_span);
let (results, tokens_identified) = iterator.expand(LineSeparatedShape);
let results = results?;
let mut row = TaggedDictBuilder::new(&tag);
let fallback_columns = (1..=tokens_identified)
.map(|i| format!("Column{}", i))
.collect::<Vec<String>>();
for (idx, field) in results.into_iter().enumerate() {
let key = if headerless {
&fallback_columns[idx]
} else {
match fields.get(idx) {
Some(key) => key,
None => &fallback_columns[idx],
} }
};
row.insert_value(key, field.into_value(&tag));
}
out.push(row.into_value())
}
Ok(out)
} }

View file

@ -140,6 +140,7 @@ fn to_string_tagged_value(v: &Value) -> Result<String, ShellError> {
| UntaggedValue::Primitive(Primitive::Path(_)) | UntaggedValue::Primitive(Primitive::Path(_))
| UntaggedValue::Primitive(Primitive::Int(_)) => as_string(v), | UntaggedValue::Primitive(Primitive::Int(_)) => as_string(v),
UntaggedValue::Primitive(Primitive::Date(d)) => Ok(d.to_string()), UntaggedValue::Primitive(Primitive::Date(d)) => Ok(d.to_string()),
UntaggedValue::Primitive(Primitive::Nothing) => Ok(String::new()),
UntaggedValue::Table(_) => Ok(String::from("[Table]")), UntaggedValue::Table(_) => Ok(String::from("[Table]")),
UntaggedValue::Row(_) => Ok(String::from("[Row]")), UntaggedValue::Row(_) => Ok(String::from("[Row]")),
_ => Err(ShellError::labeled_error( _ => Err(ShellError::labeled_error(

View file

@ -73,8 +73,36 @@ fn table_to_csv_text_skipping_headers_after_conversion() {
} }
#[test] #[test]
fn from_csv_text_to_table() { fn infers_types() {
Playground::setup("filter_from_csv_test_1", |dirs, sandbox| { Playground::setup("filter_from_csv_test_1", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_cuatro_mosqueteros.csv",
r#"
first_name,last_name,rusty_luck
Andrés,Robalino,1,d
Jonathan,Turner,1,d
Yehuda,Katz,1,d
Jason,Gedge,1,d
"#,
)]);
let actual = nu!(
cwd: dirs.test(), pipeline(
r#"
open los_cuatro_mosqueteros.csv
| where rusty_luck > 0
| count
| echo $it
"#
));
assert_eq!(actual, "4");
})
}
#[test]
fn from_csv_text_to_table() {
Playground::setup("filter_from_csv_test_2", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed( sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_caballeros.txt", "los_tres_caballeros.txt",
r#" r#"
@ -102,7 +130,7 @@ fn from_csv_text_to_table() {
#[test] #[test]
fn from_csv_text_with_separator_to_table() { fn from_csv_text_with_separator_to_table() {
Playground::setup("filter_from_csv_test_2", |dirs, sandbox| { Playground::setup("filter_from_csv_test_3", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed( sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_caballeros.txt", "los_tres_caballeros.txt",
r#" r#"
@ -130,7 +158,7 @@ fn from_csv_text_with_separator_to_table() {
#[test] #[test]
fn from_csv_text_with_tab_separator_to_table() { fn from_csv_text_with_tab_separator_to_table() {
Playground::setup("filter_from_csv_test_3", |dirs, sandbox| { Playground::setup("filter_from_csv_test_4", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed( sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_caballeros.txt", "los_tres_caballeros.txt",
r#" r#"
@ -158,7 +186,7 @@ fn from_csv_text_with_tab_separator_to_table() {
#[test] #[test]
fn from_csv_text_skipping_headers_to_table() { fn from_csv_text_skipping_headers_to_table() {
Playground::setup("filter_from_csv_test_4", |dirs, sandbox| { Playground::setup("filter_from_csv_test_5", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed( sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_amigos.txt", "los_tres_amigos.txt",
r#" r#"

View file

@ -1,6 +1,6 @@
use crate::hir; use crate::hir;
use crate::hir::syntax_shape::{ use crate::hir::syntax_shape::{
expand_atom, expand_syntax, BareShape, ExpandContext, ExpandSyntax, ExpansionRule, ExpandSyntax, expand_atom, expand_syntax, BareShape, ExpandContext, ExpandSyntax, ExpansionRule,
UnspannedAtomicToken, WhitespaceShape, UnspannedAtomicToken, WhitespaceShape,
}; };
use crate::hir::tokens_iterator::TokensIterator; use crate::hir::tokens_iterator::TokensIterator;

View file

@ -477,18 +477,6 @@ impl ExpandSyntax for MemberShape {
return Ok(Member::Bare(bare.span())); return Ok(Member::Bare(bare.span()));
} }
/* KATZ */
/* let number = NumberShape.test(token_nodes, context);
if let Some(peeked) = number {
let node = peeked.not_eof("column")?.commit();
let (n, span) = node.as_number().ok_or_else(|| {
ParseError::internal_error("can't convert node to number".spanned(node.span()))
})?;
return Ok(Member::Number(n, span))
}*/
let string = token_nodes.expand_syntax(StringShape); let string = token_nodes.expand_syntax(StringShape);
if let Ok(syntax) = string { if let Ok(syntax) = string {

View file

@ -3,9 +3,6 @@ pub(crate) mod into_shapes;
pub(crate) mod pattern; pub(crate) mod pattern;
pub(crate) mod state; pub(crate) mod state;
#[cfg(test)]
mod tests;
use self::debug::ExpandTracer; use self::debug::ExpandTracer;
use self::into_shapes::IntoShapes; use self::into_shapes::IntoShapes;
use self::state::{Peeked, TokensIteratorState}; use self::state::{Peeked, TokensIteratorState};
@ -510,7 +507,7 @@ impl<'content> TokensIterator<'content> {
/// The purpose of `expand_infallible` is to clearly mark the infallible path through /// The purpose of `expand_infallible` is to clearly mark the infallible path through
/// and entire list of tokens that produces a fully colored version of the source. /// and entire list of tokens that produces a fully colored version of the source.
/// ///
/// If the `ExpandSyntax` can poroduce a `Result`, make sure to use `expand_syntax`, /// If the `ExpandSyntax` can produce a `Result`, make sure to use `expand_syntax`,
/// which will correctly show the error in the trace. /// which will correctly show the error in the trace.
pub fn expand_infallible<U>(&mut self, shape: impl ExpandSyntax<Output = U>) -> U pub fn expand_infallible<U>(&mut self, shape: impl ExpandSyntax<Output = U>) -> U
where where
@ -536,7 +533,7 @@ impl<'content> TokensIterator<'content> {
}) })
} }
fn expand<U>(&mut self, shape: impl ExpandSyntax<Output = U>) -> (U, usize) pub fn expand<U>(&mut self, shape: impl ExpandSyntax<Output = U>) -> (U, usize)
where where
U: std::fmt::Debug + Clone + 'static, U: std::fmt::Debug + Clone + 'static,
{ {

View file

@ -1,46 +0,0 @@
use crate::hir::{syntax_shape::ExpandContext, syntax_shape::SignatureRegistry, TokensIterator};
use crate::parse::token_tree_builder::TokenTreeBuilder as b;
use nu_protocol::Signature;
use nu_source::{Span, Text};
use derive_new::new;
#[derive(Debug, Clone, new)]
struct TestRegistry {
#[new(default)]
signatures: indexmap::IndexMap<String, Signature>,
}
impl TestRegistry {}
impl SignatureRegistry for TestRegistry {
fn has(&self, name: &str) -> bool {
self.signatures.contains_key(name)
}
fn get(&self, name: &str) -> Option<Signature> {
self.signatures.get(name).cloned()
}
fn clone_box(&self) -> Box<dyn SignatureRegistry> {
Box::new(self.clone())
}
}
#[test]
fn supplies_tokens() {
let token = b::it_var();
let (tokens, source) = b::build(token);
let tokens = vec![tokens];
let source = Text::from(&source);
let mut iterator = TokensIterator::new(
&tokens,
ExpandContext::new(Box::new(TestRegistry::new()), &source, None),
Span::unknown(),
);
let token = iterator.next().expect("Token expected.");
token.expect_var();
}

View file

@ -6,6 +6,9 @@ pub mod hir;
pub mod parse; pub mod parse;
pub mod parse_command; pub mod parse_command;
#[cfg(test)]
pub mod test_support;
pub use crate::commands::classified::{ pub use crate::commands::classified::{
external::ExternalCommand, internal::InternalCommand, ClassifiedCommand, ClassifiedPipeline, external::ExternalCommand, internal::InternalCommand, ClassifiedCommand, ClassifiedPipeline,
}; };
@ -20,6 +23,11 @@ pub use crate::parse::parser::{module, pipeline};
pub use crate::parse::token_tree::{Delimiter, SpannedToken, Token}; pub use crate::parse::token_tree::{Delimiter, SpannedToken, Token};
pub use crate::parse::token_tree_builder::TokenTreeBuilder; pub use crate::parse::token_tree_builder::TokenTreeBuilder;
/// Public re-exports of the line-delimited parsing helpers that live in `crate::parse::util`.
pub mod utils {
pub use crate::parse::util::parse_line_with_separator;
pub use crate::parse::util::LineSeparatedShape;
}
use log::log_enabled; use log::log_enabled;
use nu_errors::ShellError; use nu_errors::ShellError;
use nu_protocol::{errln, outln}; use nu_protocol::{errln, outln};

View file

@ -7,3 +7,49 @@ macro_rules! return_ok {
} }
}; };
} }
/// Asserts that parsing a source string yields the token tree built from `$tokens`.
///
/// * `"source" -> tokens` parses with the `pipeline` parser.
/// * `<parser> "source" -> tokens` parses with the named parser.
///
/// When the trees differ, their debug renderings are compared first so the failure
/// output is readable; if the renderings happen to be identical, the trees themselves
/// are compared and a note about the equivalent renderings is attached.
#[cfg(test)]
macro_rules! equal_tokens {
    // Shorthand form: same check as below, using the `pipeline` parser.
    ($source:tt -> $tokens:expr) => {
        equal_tokens!(<pipeline> $source -> $tokens);
    };

    (<$parser:tt> $source:tt -> $tokens:expr) => {
        let actual_tree = apply($parser, stringify!($parser), $source);
        let (wanted_tree, wanted_source) = TokenTreeBuilder::build($tokens);

        if actual_tree != wanted_tree {
            let actual_debug = format!("{}", actual_tree.debug($source));
            let wanted_debug = format!("{}", wanted_tree.debug(&wanted_source));

            if actual_debug != wanted_debug {
                assert_eq!(actual_debug, wanted_debug)
            } else {
                assert_eq!(
                    actual_tree, wanted_tree,
                    "NOTE: actual and expected had equivalent debug serializations, source={:?}, debug_expected={:?}",
                    $source,
                    wanted_debug
                )
            }
        }
    };
}

View file

@ -1,5 +1,4 @@
#![allow(unused)] #![allow(unused)]
use crate::parse::{ use crate::parse::{
call_node::*, flag::*, number::*, operator::*, pipeline::*, token_tree::*, call_node::*, flag::*, number::*, operator::*, pipeline::*, token_tree::*,
token_tree_builder::*, unit::*, token_tree_builder::*, unit::*,
@ -318,6 +317,7 @@ pub fn dq_string(input: NomSpan) -> IResult<NomSpan, SpannedToken> {
let (input, _) = char('"')(input)?; let (input, _) = char('"')(input)?;
let start1 = input.offset; let start1 = input.offset;
let (input, _) = many0(none_of("\""))(input)?; let (input, _) = many0(none_of("\""))(input)?;
let end1 = input.offset; let end1 = input.offset;
let (input, _) = char('"')(input)?; let (input, _) = char('"')(input)?;
let end = input.offset; let end = input.offset;
@ -939,7 +939,7 @@ pub fn tight_node(input: NomSpan) -> IResult<NomSpan, Vec<SpannedToken>> {
))(input) ))(input)
} }
fn to_list( pub fn to_list(
parser: impl Fn(NomSpan) -> IResult<NomSpan, SpannedToken>, parser: impl Fn(NomSpan) -> IResult<NomSpan, SpannedToken>,
) -> impl Fn(NomSpan) -> IResult<NomSpan, Vec<SpannedToken>> { ) -> impl Fn(NomSpan) -> IResult<NomSpan, Vec<SpannedToken>> {
move |input| { move |input| {
@ -1017,7 +1017,7 @@ fn parse_int<T>(frag: &str, neg: Option<T>) -> i64 {
} }
} }
fn is_boundary(c: Option<char>) -> bool { pub fn is_boundary(c: Option<char>) -> bool {
match c { match c {
None => true, None => true,
Some(')') | Some(']') | Some('}') | Some('(') => true, Some(')') | Some(']') | Some('}') | Some('(') => true,
@ -1140,59 +1140,13 @@ fn is_member_start(c: char) -> bool {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use crate::parse::parser::{module, nodes, pipeline};
use crate::parse::token_tree_builder::TokenTreeBuilder as b; use crate::parse::token_tree_builder::TokenTreeBuilder::{self, self as b};
use crate::parse::token_tree_builder::{CurriedToken, TokenTreeBuilder}; use crate::test_support::apply;
use nu_source::PrettyDebugWithSource;
use pretty_assertions::assert_eq; use pretty_assertions::assert_eq;
pub type CurriedNode<T> = Box<dyn FnOnce(&mut TokenTreeBuilder) -> T + 'static>;
macro_rules! equal_tokens {
($source:tt -> $tokens:expr) => {
let result = apply(pipeline, "pipeline", $source);
let (expected_tree, expected_source) = TokenTreeBuilder::build($tokens);
if result != expected_tree {
let debug_result = format!("{}", result.debug($source));
let debug_expected = format!("{}", expected_tree.debug(&expected_source));
if debug_result == debug_expected {
assert_eq!(
result, expected_tree,
"NOTE: actual and expected had equivalent debug serializations, source={:?}, debug_expected={:?}",
$source,
debug_expected
)
} else {
assert_eq!(debug_result, debug_expected)
}
}
};
(<$parser:tt> $source:tt -> $tokens:expr) => {
let result = apply($parser, stringify!($parser), $source);
let (expected_tree, expected_source) = TokenTreeBuilder::build($tokens);
if result != expected_tree {
let debug_result = format!("{}", result.debug($source));
let debug_expected = format!("{}", expected_tree.debug(&expected_source));
if debug_result == debug_expected {
assert_eq!(
result, expected_tree,
"NOTE: actual and expected had equivalent debug serializations, source={:?}, debug_expected={:?}",
$source,
debug_expected
)
} else {
assert_eq!(debug_result, debug_expected)
}
}
};
}
#[test] #[test]
fn test_integer() { fn test_integer() {
equal_tokens! { equal_tokens! {
@ -1339,7 +1293,7 @@ mod tests {
fn test_flag() { fn test_flag() {
equal_tokens! { equal_tokens! {
<nodes> <nodes>
"--amigos" -> b::token_list(vec![b::flag("arepas")]) "--amigos" -> b::token_list(vec![b::flag("amigos")])
} }
equal_tokens! { equal_tokens! {
@ -1721,119 +1675,4 @@ mod tests {
]) ])
); );
} }
// #[test]
// fn test_smoke_pipeline() {
// let _ = pretty_env_logger::try_init();
// assert_eq!(
// apply(
// pipeline,
// "pipeline",
// r#"git branch --merged | split-row "`n" | where $it != "* master""#
// ),
// build_token(b::pipeline(vec![
// (
// None,
// b::call(
// b::bare("git"),
// vec![b::sp(), b::bare("branch"), b::sp(), b::flag("merged")]
// ),
// Some(" ")
// ),
// (
// Some(" "),
// b::call(b::bare("split-row"), vec![b::sp(), b::string("`n")]),
// Some(" ")
// ),
// (
// Some(" "),
// b::call(
// b::bare("where"),
// vec![
// b::sp(),
// b::it_var(),
// b::sp(),
// b::op("!="),
// b::sp(),
// b::string("* master")
// ]
// ),
// None
// )
// ]))
// );
// assert_eq!(
// apply(pipeline, "pipeline", "ls | where { $it.size > 100 }"),
// build_token(b::pipeline(vec![
// (None, b::call(b::bare("ls"), vec![]), Some(" ")),
// (
// Some(" "),
// b::call(
// b::bare("where"),
// vec![
// b::sp(),
// b::braced(vec![
// b::path(b::it_var(), vec![b::member("size")]),
// b::sp(),
// b::op(">"),
// b::sp(),
// b::int(100)
// ])
// ]
// ),
// None
// )
// ]))
// )
// }
fn apply(
f: impl Fn(
NomSpan,
)
-> Result<(NomSpan, SpannedToken), nom::Err<(NomSpan, nom::error::ErrorKind)>>,
desc: &str,
string: &str,
) -> SpannedToken {
let result = f(nom_input(string));
match result {
Ok(value) => value.1,
Err(err) => {
let err = nu_errors::ShellError::parse_error(err);
println!("{:?}", string);
crate::hir::baseline_parse::tests::print_err(err, &nu_source::Text::from(string));
panic!("test failed")
}
}
}
fn span((left, right): (usize, usize)) -> Span {
Span::new(left, right)
}
fn delimited(
delimiter: Spanned<Delimiter>,
children: Vec<SpannedToken>,
left: usize,
right: usize,
) -> SpannedToken {
let start = Span::for_char(left);
let end = Span::for_char(right);
let node = DelimitedNode::new(delimiter.item, (start, end), children);
Token::Delimited(node).into_spanned((left, right))
}
fn build<T>(block: CurriedNode<T>) -> T {
let mut builder = TokenTreeBuilder::new();
block(&mut builder)
}
fn build_token(block: CurriedToken) -> SpannedToken {
TokenTreeBuilder::build(block).0
}
} }

View file

@ -306,6 +306,13 @@ impl SpannedToken {
} }
} }
/// Returns `true` when this token is an integer number token.
pub fn is_int(&self) -> bool {
    if let Token::Number(RawNumber::Int(_)) = self.unspanned() {
        return true;
    }

    false
}
pub fn as_string(&self) -> Option<(Span, Span)> { pub fn as_string(&self) -> Option<(Span, Span)> {
match self.unspanned() { match self.unspanned() {
Token::String(inner_span) => Some((self.span(), *inner_span)), Token::String(inner_span) => Some((self.span(), *inner_span)),
@ -327,16 +334,16 @@ impl SpannedToken {
} }
} }
pub fn is_int(&self) -> bool { pub fn is_dot(&self) -> bool {
match self.unspanned() { match self.unspanned() {
Token::Number(RawNumber::Int(_)) => true, Token::EvaluationOperator(EvaluationOperator::Dot) => true,
_ => false, _ => false,
} }
} }
pub fn is_dot(&self) -> bool { pub fn is_separator(&self) -> bool {
match self.unspanned() { match self.unspanned() {
Token::EvaluationOperator(EvaluationOperator::Dot) => true, Token::Separator => true,
_ => false, _ => false,
} }
} }
@ -479,6 +486,13 @@ impl SpannedToken {
} }
} }
/// Returns the raw number carried by this token.
///
/// # Panics
///
/// Panics if the token is not a number token.
pub fn expect_number(&self) -> RawNumber {
    let token = self.unspanned();

    if let Token::Number(raw_number) = token {
        *raw_number
    } else {
        panic!("Expected number, found {:?}", token)
    }
}
pub fn expect_string(&self) -> (Span, Span) { pub fn expect_string(&self) -> (Span, Span) {
match self.unspanned() { match self.unspanned() {
Token::String(inner_span) => (self.span(), *inner_span), Token::String(inner_span) => (self.span(), *inner_span),

View file

@ -1 +0,0 @@

View file

@ -0,0 +1,2 @@
pub(crate) mod parser;
pub(crate) mod shape;

View file

@ -0,0 +1,272 @@
use crate::parse::number::RawNumber;
use crate::parse::parser::{is_boundary, to_list};
use crate::parse::token_tree::SpannedToken;
use crate::parse::token_tree_builder::TokenTreeBuilder;
use nu_source::{HasSpan, NomSpan, Span, Spanned, SpannedItem};
use nom::branch::alt;
use nom::bytes::complete::{escaped, tag};
use nom::character::complete::*;
use nom::combinator::*;
use nom::multi::*;
use nom::IResult;
use nom_tracable::tracable_parser;
/// Tokenizes one line of delimited text into field tokens interleaved with separator tokens.
///
/// Fields are parsed with [`leaf`] and separators with [`separated_by`]. Parsing stops at
/// the first position where a field or a separator cannot be parsed; the input left at
/// that point is returned alongside the collected tokens, spanned from the starting
/// offset to the stopping offset.
#[tracable_parser]
pub fn parse_line_with_separator<'a, 'b>(
    separator: &'b str,
    input: NomSpan<'a>,
) -> IResult<NomSpan<'a>, Spanned<Vec<SpannedToken>>> {
    let start = input.offset;
    let mut tokens = vec![];
    let mut remaining = input;

    while let Ok((after_field, field_tokens)) = to_list(leaf(separator))(remaining) {
        tokens.extend(field_tokens);
        remaining = after_field;

        // A field must be followed by a separator for the line to continue.
        match separated_by(separator)(after_field) {
            Ok((after_separator, separator_token)) => {
                tokens.push(separator_token);
                remaining = after_separator;
            }
            Err(_) => break,
        }
    }

    let end = remaining.offset;

    Ok((remaining, tokens.spanned(Span::new(start, end))))
}
/// Builds a parser that reads a number (see [`fallback_raw_number_without`]) and wraps
/// it in a spanned number token covering the number's own span.
#[tracable_parser]
pub fn fallback_number_without(c: char) -> impl Fn(NomSpan) -> IResult<NomSpan, SpannedToken> {
    move |input| {
        fallback_raw_number_without(c)(input)
            .map(|(rest, raw)| (rest, TokenTreeBuilder::spanned_number(raw, raw.span())))
    }
}
/// Builds a parser for an optionally negative integer or decimal number.
///
/// A number is only accepted when it is followed by end of input, a boundary character
/// (per `is_boundary`) or the character `c`. Digits followed by a `.` that is not itself
/// followed by digits are read as an integer, leaving the `.` unconsumed.
#[tracable_parser]
pub fn fallback_raw_number_without(c: char) -> impl Fn(NomSpan) -> IResult<NomSpan, RawNumber> {
    move |input| {
        let start = input.offset;
        let (input, _sign) = opt(tag("-"))(input)?;
        let (input, _integral) = digit1(input)?;
        let after_int_head = input;

        // Anything other than a `.` after the integral digits settles the result here.
        let upcoming = input.fragment.chars().next();
        if upcoming != Some('.') {
            return if upcoming.is_none() || is_boundary(upcoming) || upcoming == Some(c) {
                Ok((input, RawNumber::int(Span::new(start, input.offset))))
            } else {
                Err(nom::Err::Error(nom::error::make_error(
                    input,
                    nom::error::ErrorKind::Tag,
                )))
            };
        }

        let dot: IResult<NomSpan, NomSpan, (NomSpan, nom::error::ErrorKind)> = tag(".")(input);
        let input = if let Ok((after_dot, _dot)) = dot {
            after_dot
        } else {
            // it's just an integer
            return Ok((input, RawNumber::int(Span::new(start, input.offset))));
        };

        let fraction: IResult<NomSpan, _> = digit1(input);
        let input = match fraction {
            Ok((after_fraction, _digits)) => after_fraction,
            // No digits after the dot: rewind to just after the integral part.
            Err(_) => {
                return Ok((
                    after_int_head,
                    RawNumber::int((start, after_int_head.offset)),
                ))
            }
        };

        let end = input.offset;
        let next = input.fragment.chars().next();

        if !(is_boundary(next) || next == Some(c)) {
            return Err(nom::Err::Error(nom::error::make_error(
                input,
                nom::error::ErrorKind::Tag,
            )));
        }

        Ok((input, RawNumber::decimal(Span::new(start, end))))
    }
}
/// Builds a parser for a single field of a delimited line.
///
/// Alternatives are tried in order: a number, a quoted string, then any run of
/// characters not contained in `c`. The number parser is given the first character of
/// `c` as its separator, or `,` when `c` is empty.
#[tracable_parser]
pub fn leaf(c: &str) -> impl Fn(NomSpan) -> IResult<NomSpan, SpannedToken> + '_ {
    move |input| {
        let first_char = c.chars().next().unwrap_or(',');

        alt((
            fallback_number_without(first_char),
            string,
            fallback_string_without(c),
        ))(input)
    }
}
/// Builds a parser that matches the literal separator `c` and yields a separator token
/// spanning the matched text.
#[tracable_parser]
pub fn separated_by(c: &str) -> impl Fn(NomSpan) -> IResult<NomSpan, SpannedToken> + '_ {
    move |input| {
        let begin = input.offset;
        let (rest, _) = tag(c)(input)?;
        let token = TokenTreeBuilder::spanned_sep(Span::new(begin, rest.offset));

        Ok((rest, token))
    }
}
/// Parses a double-quoted string in which `\` may escape `\`, `"`, `r`, `n` or `t`.
///
/// The resulting token records two spans: the text between the quotes and the whole
/// literal including the quotes.
#[tracable_parser]
pub fn dq_string(input: NomSpan) -> IResult<NomSpan, SpannedToken> {
    let outer_start = input.offset;
    let (input, _) = char('"')(input)?;

    let inner_start = input.offset;
    let (input, _) = escaped(
        none_of(r#"\""#),
        '\\',
        nom::character::complete::one_of(r#"\"rnt"#),
    )(input)?;
    let inner_end = input.offset;

    let (input, _) = char('"')(input)?;
    let outer_end = input.offset;

    let inner = Span::new(inner_start, inner_end);
    let outer = Span::new(outer_start, outer_end);

    Ok((input, TokenTreeBuilder::spanned_string(inner, outer)))
}
/// Parses a single-quoted string; every character up to the next `'` is taken verbatim.
///
/// The resulting token records two spans: the text between the quotes and the whole
/// literal including the quotes.
#[tracable_parser]
pub fn sq_string(input: NomSpan) -> IResult<NomSpan, SpannedToken> {
    let outer_start = input.offset;
    let (input, _) = char('\'')(input)?;

    let inner_start = input.offset;
    let (input, _) = many0(none_of("\'"))(input)?;
    let inner_end = input.offset;

    let (input, _) = char('\'')(input)?;
    let outer_end = input.offset;

    let inner = Span::new(inner_start, inner_end);
    let outer = Span::new(outer_start, outer_end);

    Ok((input, TokenTreeBuilder::spanned_string(inner, outer)))
}
/// Parses a quoted string token, trying the single-quoted form before the double-quoted one.
#[tracable_parser]
pub fn string(input: NomSpan) -> IResult<NomSpan, SpannedToken> {
alt((sq_string, dq_string))(input)
}
/// Builds a parser that consumes every leading character not contained in `c` and
/// yields it as a string token (possibly empty), using the same span for both the
/// inner and outer extents since there are no quotes.
#[tracable_parser]
pub fn fallback_string_without(c: &str) -> impl Fn(NomSpan) -> IResult<NomSpan, SpannedToken> + '_ {
    move |input| {
        let begin = input.offset;
        let (rest, _) = many0(none_of(c))(input)?;
        let covered = Span::new(begin, rest.offset);

        Ok((rest, TokenTreeBuilder::spanned_string(covered, covered)))
    }
}
// Tests for the line-delimited parser, driven through the `equal_tokens!` macro.
#[cfg(test)]
mod tests {
use crate::parse::token_tree_builder::TokenTreeBuilder::{self, self as b};
use crate::parse::util::parse_line_with_separator;
use crate::test_support::apply;
use nom::IResult;
use crate::parse::pipeline::PipelineElement;
use crate::parse::token_tree::SpannedToken;
use nu_source::NomSpan;
use nu_source::PrettyDebugWithSource;
use pretty_assertions::assert_eq;
// Adapter used as the `<nodes>` parser in the tests below: parses a comma separated
// line and wraps the tokens in a single-element pipeline token so the result can be
// compared against trees produced by `TokenTreeBuilder`.
pub fn nodes(input: NomSpan) -> IResult<NomSpan, SpannedToken> {
let (input, tokens) = parse_line_with_separator(",", input)?;
let span = tokens.span;
Ok((
input,
TokenTreeBuilder::spanned_pipeline(vec![PipelineElement::new(None, tokens)], span),
))
}
// Quoted fields separated by commas produce string and separator tokens; a trailing
// bare integer produces an int token.
#[test]
fn separators() {
equal_tokens! {
<nodes>
r#""name","lastname","age""# -> b::token_list(vec![
b::string("name"),
b::sep(","),
b::string("lastname"),
b::sep(","),
b::string("age")
])
}
equal_tokens! {
<nodes>
r#""Andrés","Robalino",12"# -> b::token_list(vec![
b::string("Andrés"),
b::sep(","),
b::string("Robalino"),
b::sep(","),
b::int(12)
])
}
}
// A single double-quoted field is a single string token.
#[test]
fn strings() {
equal_tokens! {
<nodes>
r#""andres""# -> b::token_list(vec![b::string("andres")])
}
}
// Bare positive and negative integers are int tokens.
#[test]
fn numbers() {
equal_tokens! {
<nodes>
"123" -> b::token_list(vec![b::int(123)])
}
equal_tokens! {
<nodes>
"-123" -> b::token_list(vec![b::int(-123)])
}
}
}

View file

@ -0,0 +1,91 @@
use crate::hir::{
self, syntax_shape::ExpandSyntax, syntax_shape::FlatShape, syntax_shape::NumberExpressionShape,
syntax_shape::StringShape,
};
use crate::hir::{Expression, TokensIterator};
use crate::parse::token_tree::SeparatorType;
use nu_errors::ParseError;
use nu_protocol::UntaggedValue;
use nu_source::Span;
/// Syntax shape that turns the tokens of one delimited line into untagged values.
#[derive(Debug, Copy, Clone)]
pub struct LineSeparatedShape;

impl ExpandSyntax for LineSeparatedShape {
    type Output = Result<Vec<UntaggedValue>, ParseError>;

    fn name(&self) -> &'static str {
        "any string line separated by"
    }

    /// Expands fields until the token stream is exhausted.
    ///
    /// Each field is tried as a number first and as a string second. Integers and
    /// decimals keep their numeric type, a string whose span is empty becomes
    /// `nothing`, and any other string is sliced out of the source text. A field that
    /// fails to expand, or expands to some other kind of expression, contributes no
    /// value. A failure to expand the separator after a field is only reported when
    /// tokens remain.
    fn expand<'a, 'b>(
        &self,
        token_nodes: &mut TokensIterator<'_>,
    ) -> Result<Vec<UntaggedValue>, ParseError> {
        let source = token_nodes.source();
        let mut values: Vec<UntaggedValue> = vec![];

        while !token_nodes.at_end() {
            let parsed = token_nodes
                .expand_syntax(NumberExpressionShape)
                .or_else(|_| {
                    token_nodes
                        .expand_syntax(StringShape)
                        .map(|syntax| Expression::string(syntax.inner).into_expr(syntax.span))
                });

            if let Ok(field) = parsed {
                let value = match &field.expr {
                    Expression::Literal(hir::Literal::Number(crate::Number::Int(i))) => {
                        Some(UntaggedValue::int(i.clone()))
                    }
                    Expression::Literal(hir::Literal::Number(crate::Number::Decimal(d))) => {
                        Some(UntaggedValue::decimal(d.clone()))
                    }
                    Expression::Literal(hir::Literal::String(span)) => {
                        if span.is_closed() {
                            Some(UntaggedValue::nothing())
                        } else {
                            Some(UntaggedValue::string(span.slice(&source)))
                        }
                    }
                    _ => None,
                };

                if let Some(value) = value {
                    values.push(value);
                }
            }

            if let Err(err) = token_nodes.expand_infallible(SeparatorShape) {
                if !token_nodes.at_end() {
                    return Err(err);
                }
            }
        }

        Ok(values)
    }
}
/// Syntax shape that expands a single separator token into its span.
#[derive(Debug, Copy, Clone)]
pub struct SeparatorShape;
impl ExpandSyntax for SeparatorShape {
type Output = Result<Span, ParseError>;
fn name(&self) -> &'static str {
"separated"
}
// Delegates to `expand_token` with `SeparatorType`, marking the matched span as
// `FlatShape::Separator`.
fn expand<'a, 'b>(&self, token_nodes: &'b mut TokensIterator<'a>) -> Result<Span, ParseError> {
token_nodes.expand_token(SeparatorType, |span| Ok((FlatShape::Separator, span)))
}
}

View file

@ -0,0 +1,4 @@
mod line_delimited_parser;
pub use line_delimited_parser::parser::parse_line_with_separator;
pub use line_delimited_parser::shape::LineSeparatedShape;

View file

@ -0,0 +1,104 @@
use crate::hir::{syntax_shape::ExpandContext, syntax_shape::SignatureRegistry};
use crate::parse::files::Files;
use crate::parse::token_tree::{DelimitedNode, Delimiter, SpannedToken, Token};
use crate::parse::token_tree_builder::{CurriedToken, TokenTreeBuilder};
use nu_errors::ShellError;
use nu_protocol::Signature;
use nu_source::{nom_input, NomSpan, Span, Spanned, Text};
pub use nu_source::PrettyDebug;
use derive_new::new;
pub type CurriedNode<T> = Box<dyn FnOnce(&mut TokenTreeBuilder) -> T + 'static>;
/// Signature registry backed by an in-memory map; `TestRegistry::new()` starts with the
/// map's default (empty) value.
#[derive(Debug, Clone, new)]
pub struct TestRegistry {
    #[new(default)]
    signatures: indexmap::IndexMap<String, Signature>,
}

impl TestRegistry {}

impl SignatureRegistry for TestRegistry {
    /// Reports whether a signature is stored under `name`.
    fn has(&self, name: &str) -> bool {
        self.signatures.get(name).is_some()
    }

    /// Returns a copy of the signature stored under `name`, if any.
    fn get(&self, name: &str) -> Option<Signature> {
        self.signatures.get(name).map(Clone::clone)
    }

    /// Clones this registry into a boxed trait object.
    fn clone_box(&self) -> Box<dyn SignatureRegistry> {
        let registry = self.clone();
        Box::new(registry)
    }
}
/// Runs `callback` with an `ExpandContext` over `source` whose registry is a fresh
/// `TestRegistry`.
pub fn with_empty_context(source: &Text, callback: impl FnOnce(ExpandContext)) {
    let context = ExpandContext::new(Box::new(TestRegistry::new()), source, None);

    callback(context)
}
/// Shrinks `span` by one position at each end.
pub fn inner_string_span(span: Span) -> Span {
    let (start, end) = (span.start() + 1, span.end() - 1);

    Span::new(start, end)
}
/// Renders `err` as a diagnostic against `source` and writes it to stderr.
///
/// A single space is appended to the source text before it is handed to the reporter.
/// Any failure while emitting the diagnostic is ignored.
pub fn print_err(err: ShellError, source: &Text) {
    let diagnostic = err.into_diagnostic();
    let stderr = termcolor::StandardStream::stderr(termcolor::ColorChoice::Auto);

    let mut padded_source = source.to_string();
    padded_source.push_str(" ");
    let files = Files::new(padded_source);

    let _ = language_reporting::emit(
        &mut stderr.lock(),
        &files,
        &diagnostic,
        &language_reporting::DefaultConfig,
    );
}
pub fn apply(
f: impl Fn(NomSpan) -> Result<(NomSpan, SpannedToken), nom::Err<(NomSpan, nom::error::ErrorKind)>>,
_desc: &str,
string: &str,
) -> SpannedToken {
let result = f(nom_input(string));
match result {
Ok(value) => value.1,
Err(err) => {
let err = nu_errors::ShellError::parse_error(err);
println!("{:?}", string);
crate::hir::baseline_parse::tests::print_err(err, &nu_source::Text::from(string));
panic!("test failed")
}
}
}
/// Builds a `Span` from a `(left, right)` pair.
pub fn span(bounds: (usize, usize)) -> Span {
    let (left, right) = bounds;

    Span::new(left, right)
}
/// Builds a delimited token spanning `(left, right)` around `children`, with the
/// delimiter spans created by `Span::for_char` at `left` and at `right`.
pub fn delimited(
    delimiter: Spanned<Delimiter>,
    children: Vec<SpannedToken>,
    left: usize,
    right: usize,
) -> SpannedToken {
    let open = Span::for_char(left);
    let close = Span::for_char(right);

    Token::Delimited(DelimitedNode::new(delimiter.item, (open, close), children))
        .into_spanned((left, right))
}
/// Runs `block` against a fresh `TokenTreeBuilder` and returns whatever it produces.
pub fn build<T>(block: CurriedNode<T>) -> T {
    let mut fresh_builder = TokenTreeBuilder::new();

    block(&mut fresh_builder)
}
/// Builds `block` with `TokenTreeBuilder::build` and returns only the token, discarding
/// the second element of the result.
pub fn build_token(block: CurriedToken) -> SpannedToken {
    let (token, _source) = TokenTreeBuilder::build(block);

    token
}

View file

@ -659,6 +659,27 @@ impl Span {
self.start == 0 && self.end == 0 self.start == 0 && self.end == 0
} }
/// Returns `true` if the current Span covers no text, i.e. its start and end offsets
/// are equal.
///
/// # Example
///
/// ```
/// // make clean
/// // ----
/// // (0,4)
/// //
/// // ^(5,5)
///
/// let make_span = Span::new(0,4);
/// let clean_span = Span::new(5,5);
///
/// assert_eq!(make_span.is_closed(), false);
/// assert_eq!(clean_span.is_closed(), true);
/// ```
pub fn is_closed(&self) -> bool {
self.start == self.end
}
/// Returns a slice of the input that covers the start and end of the current Span. /// Returns a slice of the input that covers the start and end of the current Span.
pub fn slice<'a>(&self, source: &'a str) -> &'a str { pub fn slice<'a>(&self, source: &'a str) -> &'a str {
&source[self.start..self.end] &source[self.start..self.end]

View file

@ -78,16 +78,17 @@ fn converts_to_int() {
let actual = nu!( let actual = nu!(
cwd: "tests/fixtures/formats", pipeline( cwd: "tests/fixtures/formats", pipeline(
r#" r#"
open caco3_plastics.csv echo '{number_as_string: "1"}'
| first 1 | from-json
| str tariff_item --to-int | str number_as_string --to-int
| where tariff_item == 2509000000 | rename number
| get tariff_item | where number == 1
| get number
| echo $it | echo $it
"# "#
)); ));
assert_eq!(actual, "2509000000"); assert_eq!(actual, "1");
} }
#[test] #[test]