diff --git a/features.toml b/features.toml index f7cea6d9e9..e1cf56e33d 100644 --- a/features.toml +++ b/features.toml @@ -10,4 +10,12 @@ reason = """ This is laying the groundwork for merging coloring and parsing. It also makes token_nodes.atomic() naturally work with coloring, which is pretty useful on its own. """ +enabled = false + +[data_processing_primitives] + +description = "Groundwork so tables can be data processed" +reason = """ +These primitives will allow us to take tables and transform, process, and explore them. +""" enabled = false \ No newline at end of file diff --git a/src/cli.rs b/src/cli.rs index b5a58c2ca2..fa68346efc 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -322,6 +322,8 @@ pub async fn cli() -> Result<(), Box> { whole_stream_command(Table), whole_stream_command(Version), whole_stream_command(Which), + #[cfg(data_processing_primitives)] + whole_stream_command(SplitBy), ]); #[cfg(feature = "clipboard")] diff --git a/src/commands.rs b/src/commands.rs index ba69d1e822..7c2c188629 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -57,6 +57,10 @@ pub(crate) mod shells; pub(crate) mod size; pub(crate) mod skip_while; pub(crate) mod sort_by; + +#[cfg(data_processing_primitives)] +pub(crate) mod split_by; + pub(crate) mod split_column; pub(crate) mod split_row; pub(crate) mod table; @@ -133,6 +137,10 @@ pub(crate) use shells::Shells; pub(crate) use size::Size; pub(crate) use skip_while::SkipWhile; pub(crate) use sort_by::SortBy; + +#[cfg(data_processing_primitives)] +pub(crate) use split_by::SplitBy; + pub(crate) use split_column::SplitColumn; pub(crate) use split_row::SplitRow; pub(crate) use table::Table; diff --git a/src/commands/group_by.rs b/src/commands/group_by.rs index f36d3f57dd..66c1360f5d 100644 --- a/src/commands/group_by.rs +++ b/src/commands/group_by.rs @@ -36,59 +36,154 @@ impl WholeStreamCommand for GroupBy { } } -fn group_by( +pub fn group_by( GroupByArgs { column_name }: GroupByArgs, RunnableContext { input, name, ..
}: RunnableContext, ) -> Result { let stream = async_stream! { let values: Vec> = input.values.collect().await; - let mut groups = indexmap::IndexMap::new(); - for value in values { - let group_key = value.get_data_by_key(&column_name.item); - - if group_key.is_none() { - - let possibilities = value.data_descriptors(); - - let mut possible_matches: Vec<_> = possibilities - .iter() - .map(|x| (natural::distance::levenshtein_distance(x, &column_name.item), x)) - .collect(); - - possible_matches.sort(); - - let err = { - if possible_matches.len() > 0 { - ShellError::labeled_error( - "Unknown column", - format!("did you mean '{}'?", possible_matches[0].1), - &column_name.tag,) - } else { - ShellError::labeled_error( - "Unknown column", - "row does not contain this column", - &column_name.tag, - ) - } - }; - - yield Err(err) - } else { - let group_key = group_key.unwrap().as_string()?; - let mut group = groups.entry(group_key).or_insert(vec![]); - group.push(value); + if values.is_empty() { + yield Err(ShellError::labeled_error( + "Expected table from pipeline", + "requires a table input", + column_name.span() + )) + } else { + match group(&column_name, values, name) { + Ok(grouped) => yield ReturnSuccess::value(grouped), + Err(err) => yield Err(err) } } - - let mut out = TaggedDictBuilder::new(name.clone()); - - for (k,v) in groups.iter() { - out.insert(k, Value::table(v)); - } - - yield ReturnSuccess::value(out) }; Ok(stream.to_output_stream()) } + +pub fn group( + column_name: &Tagged, + values: Vec>, + tag: impl Into, +) -> Result, ShellError> { + let tag = tag.into(); + + let mut groups = indexmap::IndexMap::new(); + + for value in values { + let group_key = value.get_data_by_key(column_name); + + if group_key.is_none() { + let possibilities = value.data_descriptors(); + + let mut possible_matches: Vec<_> = possibilities + .iter() + .map(|x| (natural::distance::levenshtein_distance(x, column_name), x)) + .collect(); + + possible_matches.sort(); + + if 
possible_matches.len() > 0 { + return Err(ShellError::labeled_error( + "Unknown column", + format!("did you mean '{}'?", possible_matches[0].1), + column_name.tag(), + )); + } else { + return Err(ShellError::labeled_error( + "Unknown column", + "row does not contain this column", + column_name.tag(), + )); + } + } + + let group_key = group_key.unwrap().as_string()?; + let group = groups.entry(group_key).or_insert(vec![]); + group.push(value); + } + + let mut out = TaggedDictBuilder::new(&tag); + + for (k, v) in groups.iter() { + out.insert(k, Value::table(v)); + } + + Ok(out.into_tagged_value()) +} + +#[cfg(test)] +mod tests { + + use crate::commands::group_by::group; + use crate::data::meta::*; + use crate::Value; + use indexmap::IndexMap; + + fn string(input: impl Into) -> Tagged { + Value::string(input.into()).tagged_unknown() + } + + fn row(entries: IndexMap>) -> Tagged { + Value::row(entries).tagged_unknown() + } + + fn table(list: &Vec>) -> Tagged { + Value::table(list).tagged_unknown() + } + + #[test] + fn groups_table_by_key() { + let for_key = String::from("date").tagged_unknown(); + + let nu_releases = vec![ + row( + indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}, + ), + row( + indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}, + ), + row( + indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}, + ), + row( + indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}, + ), + row( + indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}, + ), + row( + indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}, + ), + row( + indexmap! 
{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}, + ), + row( + indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}, + ), + row( + indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}, + ), + ]; + + assert_eq!( + group(&for_key, nu_releases, Tag::unknown()).unwrap(), + row(indexmap! { + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}) + ]), + }) + ); + } +} diff --git a/src/commands/split_by.rs b/src/commands/split_by.rs new file mode 100644 index 0000000000..b995b041d7 --- /dev/null +++ b/src/commands/split_by.rs @@ -0,0 +1,256 @@ +use crate::commands::WholeStreamCommand; +use 
crate::data::TaggedDictBuilder; +use crate::errors::ShellError; +use crate::prelude::*; + +pub struct SplitBy; + +#[derive(Deserialize)] +pub struct SplitByArgs { + column_name: Tagged, +} + +impl WholeStreamCommand for SplitBy { + fn name(&self) -> &str { + "split-by" + } + + fn signature(&self) -> Signature { + Signature::build("split-by").required( + "column_name", + SyntaxShape::String, + "the name of the column within the nested table to split by", + ) + } + + fn usage(&self) -> &str { + "Creates a new table with the data from the inner tables splitted by the column given." + } + + fn run( + &self, + args: CommandArgs, + registry: &CommandRegistry, + ) -> Result { + args.process(registry, split_by)?.run() + } +} + +pub fn split_by( + SplitByArgs { column_name }: SplitByArgs, + RunnableContext { input, name, .. }: RunnableContext, +) -> Result { + let stream = async_stream! { + let values: Vec> = input.values.collect().await; + + if values.len() > 1 || values.is_empty() { + yield Err(ShellError::labeled_error( + "Expected table from pipeline", + "requires a table input", + column_name.span() + )) + } else { + match split(&column_name, &values[0], name) { + Ok(split) => yield ReturnSuccess::value(split), + Err(err) => yield Err(err), + } + } + }; + + Ok(stream.to_output_stream()) +} + +pub fn split( + column_name: &Tagged, + value: &Tagged, + tag: impl Into, +) -> Result, ShellError> { + let origin_tag = tag.into(); + + let mut splits = indexmap::IndexMap::new(); + + match value { + Tagged { + item: Value::Row(group_sets), + .. + } => { + for (group_key, group_value) in group_sets.entries.iter() { + match *group_value { + Tagged { + item: Value::Table(ref dataset), + .. + } => { + let group = crate::commands::group_by::group( + &column_name, + dataset.to_vec(), + &origin_tag, + )?; + + match group { + Tagged { + item: Value::Row(o), + .. 
+ } => { + for (split_label, subset) in o.entries.into_iter() { + match subset { + Tagged { + item: Value::Table(subset), + tag, + } => { + let s = splits + .entry(split_label.clone()) + .or_insert(indexmap::IndexMap::new()); + s.insert( + group_key.clone(), + Value::table(&subset).tagged(tag), + ); + } + other => { + return Err(ShellError::type_error( + "a table value", + other.tagged_type_name(), + )) + } + } + } + } + _ => { + return Err(ShellError::type_error( + "a table value", + group.tagged_type_name(), + )) + } + } + } + ref other => { + return Err(ShellError::type_error( + "a table value", + other.tagged_type_name(), + )) + } + } + } + } + _ => { + return Err(ShellError::type_error( + "a table value", + value.tagged_type_name(), + )) + } + } + + let mut out = TaggedDictBuilder::new(&origin_tag); + + for (k, v) in splits.into_iter() { + out.insert(k, Value::row(v)); + } + + Ok(out.into_tagged_value()) +} +#[cfg(test)] +mod tests { + + use crate::commands::split_by::split; + use crate::data::meta::*; + use crate::Value; + use indexmap::IndexMap; + + fn string(input: impl Into) -> Tagged { + Value::string(input.into()).tagged_unknown() + } + + fn row(entries: IndexMap>) -> Tagged { + Value::row(entries).tagged_unknown() + } + + fn table(list: &Vec>) -> Tagged { + Value::table(list).tagged_unknown() + } + + #[test] + fn splits_inner_tables_by_key() { + let for_key = String::from("country").tagged_unknown(); + + let nu_releases = row(indexmap! 
{ + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}) + ]) + }); + + assert_eq!( + split(&for_key, &nu_releases, Tag::unknown()).unwrap(), + Value::row(indexmap! { + "EC".into() => row(indexmap! { + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}) + ]) + }), + "NZ".into() => row(indexmap! 
{ + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}) + ]) + }), + "US".into() => row(indexmap! { + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}) + ]) + }) + }).tagged_unknown() + ); + } + + #[test] + fn errors_if_key_within_some_inner_table_is_missing() { + let for_key = String::from("country").tagged_unknown(); + + let nu_releases = row(indexmap! 
{ + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => Value::string("JT").tagged(Tag::from(Span::new(5,10))), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}) + ]) + }); + + assert!(split(&for_key, &nu_releases, Tag::from(Span::new(5, 10))).is_err()); + } +} diff --git a/tests/commands_test.rs b/tests/commands_test.rs index 1c456b52c7..acd5e8374c 100644 --- a/tests/commands_test.rs +++ b/tests/commands_test.rs @@ -9,10 +9,10 @@ fn group_by() { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_caballeros.csv", r#" - first_name,last_name,rusty_luck,type - Andrés,Robalino,1,A - Jonathan,Turner,1,B - Yehuda,Katz,1,A + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A "#, )]); @@ -20,8 +20,8 @@ fn group_by() { cwd: dirs.test(), h::pipeline( r#" open los_tres_caballeros.csv - | group-by type 
- | get A + | group-by rusty_at + | get "10/11/2013" | count | echo $it "# @@ -37,10 +37,10 @@ fn group_by_errors_if_unknown_column_name() { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_caballeros.csv", r#" - first_name,last_name,rusty_luck,type - Andrés,Robalino,1,A - Jonathan,Turner,1,B - Yehuda,Katz,1,A + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A "#, )]); @@ -56,6 +56,60 @@ fn group_by_errors_if_unknown_column_name() { }) } +#[cfg(data_processing_primitives)] +#[test] +fn split_by() { + Playground::setup("split_by_test_1", |dirs, sandbox| { + sandbox.with_files(vec![FileWithContentToBeTrimmed( + "los_tres_caballeros.csv", + r#" + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A + "#, + )]); + + let actual = nu!( + cwd: dirs.test(), h::pipeline( + r#" + open los_tres_caballeros.csv + | group-by rusty_at + | split-by type + | get A."10/11/2013" + | count + | echo $it + "# + )); + + assert_eq!(actual, "2"); + }) +} + +#[cfg(data_processing_primitives)] +#[test] +fn split_by_errors_if_no_table_given_as_input() { + Playground::setup("split_by_test_2", |dirs, sandbox| { + sandbox.with_files(vec![ + EmptyFile("los.txt"), + EmptyFile("tres.txt"), + EmptyFile("amigos.txt"), + EmptyFile("arepas.clu"), + ]); + + let actual = nu_error!( + cwd: dirs.test(), h::pipeline( + r#" + ls + | get name + | split-by type + "# + )); + + assert!(actual.contains("Expected table from pipeline")); + }) +} + #[test] fn first_gets_first_rows_by_amount() { Playground::setup("first_test_1", |dirs, sandbox| {