Change group-by to accept cell paths (#9020)

Closes #9003.

This PR changes `group-by` so that its optional argument is interpreted
as a cell path. In turn, this lets users use `?` to ignore rows that are
missing the column they wish to group on. For example:

```
> [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo
Error: nu:🐚:column_not_found

  × Cannot find column
   ╭─[entry #3:1:1]
 1 │ [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo
   ·                          ─────┬────             ─┬─
   ·                               │                  ╰── cannot find column 'foo'
   ·                               ╰── value originates here
   ╰────

> [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo?
╭─────┬───────────────╮
│ 123 │ [table 1 row] │
│ 234 │ [table 1 row] │
╰─────┴───────────────╯
```

~~This removes the ability to pass `group-by` a closure or block (I
wasn't able to figure out how to make the 2 features coexist), and so it
is a breaking change. I think this is OK; I didn't even know `group-by`
could accept a closure or block because there was no example for that
functionality.~~
This commit is contained in:
Reilly Wood 2023-05-17 16:34:44 -07:00 committed by GitHub
parent b912d4c1ea
commit 9ce61dc677
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 212 additions and 157 deletions

View file

@ -10,8 +10,8 @@ pub fn test_examples(cmd: impl Command + 'static) {
mod test_examples {
use super::super::{
Ansi, Date, Enumerate, Flatten, From, Get, Into, IntoString, LetEnv, Math, MathEuler,
MathPi, MathRound, ParEach, Path, Random, Sort, SortBy, Split, SplitColumn, SplitRow, Str,
StrJoin, StrLength, StrReplace, Update, Url, Values, Wrap,
MathPi, MathRound, ParEach, Path, PathParse, Random, Sort, SortBy, Split, SplitColumn,
SplitRow, Str, StrJoin, StrLength, StrReplace, Update, Url, Values, Wrap,
};
use crate::{Each, To};
use nu_cmd_lang::example_support::{
@ -85,6 +85,7 @@ mod test_examples {
working_set.add_decl(Box::new(MathRound));
working_set.add_decl(Box::new(Mut));
working_set.add_decl(Box::new(Path));
working_set.add_decl(Box::new(PathParse));
working_set.add_decl(Box::new(ParEach));
working_set.add_decl(Box::new(Random));
working_set.add_decl(Box::new(Sort));

View file

@ -1,9 +1,8 @@
use nu_engine::{eval_block, CallExt};
use nu_protocol::ast::Call;
use nu_protocol::ast::{Call, CellPath};
use nu_protocol::engine::{Closure, Command, EngineState, Stack};
use nu_protocol::{
Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape,
Type, Value,
Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value,
};
use indexmap::IndexMap;
@ -26,7 +25,16 @@ impl Command for GroupBy {
Type::List(Box::new(Type::Any)),
Type::Record(vec![]),
)])
.optional("grouper", SyntaxShape::Any, "the grouper value to use")
.optional(
"grouper",
SyntaxShape::OneOf(vec![
SyntaxShape::CellPath,
SyntaxShape::Block,
SyntaxShape::Closure(None),
SyntaxShape::Closure(Some(vec![SyntaxShape::Any])),
]),
"the path to the column to group on",
)
}
fn usage(&self) -> &str {
@ -50,6 +58,33 @@ impl Command for GroupBy {
example: r#"ls | group-by type"#,
result: None,
},
Example {
description: "Group items by the \"foo\" column's values, ignoring records without a \"foo\" column",
example: r#"open cool.json | group-by foo?"#,
result: None,
},
Example {
description: "Group using a block which is evaluated against each input value",
example: "[foo.txt bar.csv baz.txt] | group-by { path parse | get extension }",
result: Some(Value::Record {
cols: vec!["txt".to_string(), "csv".to_string()],
vals: vec![
Value::List {
vals: vec![
Value::test_string("foo.txt"),
Value::test_string("baz.txt"),
],
span: Span::test_data(),
},
Value::List {
vals: vec![Value::test_string("bar.csv")],
span: Span::test_data(),
},
],
span: Span::test_data(),
}),
},
Example {
description: "You can also group by raw values by leaving out the argument",
example: "['1' '3' '1' '3' '2' '1' '1'] | group-by",
@ -81,131 +116,163 @@ impl Command for GroupBy {
}
}
enum Grouper {
ByColumn(Option<Spanned<String>>),
ByBlock,
}
pub fn group_by(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let name = call.head;
let span = call.head;
let grouper: Option<Value> = call.opt(engine_state, stack, 0)?;
let values: Vec<Value> = input.into_iter().collect();
let mut keys: Vec<Result<String, ShellError>> = vec![];
let mut group_strategy = Grouper::ByColumn(None);
if values.is_empty() {
return Err(ShellError::GenericError(
"expected table from pipeline".into(),
"requires a table input".into(),
Some(name),
Some(span),
None,
Vec::new(),
));
}
let first = values[0].clone();
let value_list = Value::List {
vals: values.clone(),
span: name,
};
match grouper {
let group_value = match grouper {
Some(Value::CellPath { val, span }) => group_cell_path(val, values, span)?,
Some(Value::Block { .. }) | Some(Value::Closure { .. }) => {
let block: Option<Closure> = call.opt(engine_state, stack, 0)?;
let error_key = "error";
for value in values {
if let Some(capture_block) = &block {
let mut stack = stack.captures_to_stack(&capture_block.captures);
let block = engine_state.get_block(capture_block.block_id);
let pipeline = eval_block(
engine_state,
&mut stack,
block,
value.into_pipeline_data(),
call.redirect_stdout,
call.redirect_stderr,
);
match pipeline {
Ok(s) => {
let collection: Vec<Value> = s.into_iter().collect();
if collection.len() > 1 {
return Err(ShellError::GenericError(
"expected one value from the block".into(),
"requires a table with one value for grouping".into(),
Some(name),
None,
Vec::new(),
));
}
let value = match collection.get(0) {
Some(Value::Error { .. }) | None => Value::string(error_key, name),
Some(return_value) => return_value.clone(),
};
keys.push(value.as_string());
}
Err(_) => {
keys.push(Ok(error_key.into()));
}
}
}
}
group_strategy = Grouper::ByBlock;
group_closure(&values, span, block, stack, engine_state, call)?
}
Some(other) => {
group_strategy = Grouper::ByColumn(Some(Spanned {
item: other.as_string()?,
span: name,
}));
None => group_no_grouper(values, span)?,
_ => {
return Err(ShellError::TypeMismatch {
err_message: "unsupported grouper type".to_string(),
span,
})
}
_ => {}
}
let name = if let Ok(span) = first.span() {
span
} else {
name
};
let group_value = match group_strategy {
Grouper::ByBlock => {
let map = keys;
let block = Box::new(move |idx: usize, row: &Value| match map.get(idx) {
Some(Ok(key)) => Ok(key.clone()),
Some(Err(reason)) => Err(reason.clone()),
None => row.as_string(),
});
data_group(&value_list, &Some(block), name)
}
Grouper::ByColumn(column_name) => group(&column_name, &value_list, name),
};
Ok(PipelineData::Value(group_value?, None))
Ok(PipelineData::Value(group_value, None))
}
#[allow(clippy::type_complexity)]
pub fn data_group(
values: &Value,
grouper: &Option<Box<dyn Fn(usize, &Value) -> Result<String, ShellError> + Send>>,
pub fn group_cell_path(
column_name: CellPath,
values: Vec<Value>,
span: Span,
) -> Result<Value, ShellError> {
let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
for (idx, value) in values.clone().into_pipeline_data().into_iter().enumerate() {
for value in values.into_iter() {
let group_key = value
.clone()
.follow_cell_path(&column_name.members, false)?;
if matches!(group_key, Value::Nothing { .. }) {
continue; // likely the result of a failed optional access, ignore this value
}
let group_key = group_key.as_string()?;
let group = groups.entry(group_key).or_default();
group.push(value);
}
let mut cols = vec![];
let mut vals = vec![];
for (k, v) in groups {
cols.push(k.to_string());
vals.push(Value::List { vals: v, span });
}
Ok(Value::Record { cols, vals, span })
}
pub fn group_no_grouper(values: Vec<Value>, span: Span) -> Result<Value, ShellError> {
let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
for value in values.into_iter() {
let group_key = value.as_string()?;
let group = groups.entry(group_key).or_default();
group.push(value);
}
let mut cols = vec![];
let mut vals = vec![];
for (k, v) in groups {
cols.push(k.to_string());
vals.push(Value::List { vals: v, span });
}
Ok(Value::Record { cols, vals, span })
}
// TODO: refactor this, it's a bit of a mess
fn group_closure(
values: &Vec<Value>,
span: Span,
block: Option<Closure>,
stack: &mut Stack,
engine_state: &EngineState,
call: &Call,
) -> Result<Value, ShellError> {
let error_key = "error";
let mut keys: Vec<Result<String, ShellError>> = vec![];
let value_list = Value::List {
vals: values.clone(),
span,
};
for value in values {
if let Some(capture_block) = &block {
let mut stack = stack.captures_to_stack(&capture_block.captures);
let block = engine_state.get_block(capture_block.block_id);
let pipeline = eval_block(
engine_state,
&mut stack,
block,
value.clone().into_pipeline_data(),
call.redirect_stdout,
call.redirect_stderr,
);
match pipeline {
Ok(s) => {
let collection: Vec<Value> = s.into_iter().collect();
if collection.len() > 1 {
return Err(ShellError::GenericError(
"expected one value from the block".into(),
"requires a table with one value for grouping".into(),
Some(span),
None,
Vec::new(),
));
}
let value = match collection.get(0) {
Some(Value::Error { .. }) | None => Value::string(error_key, span),
Some(return_value) => return_value.clone(),
};
keys.push(value.as_string());
}
Err(_) => {
keys.push(Ok(error_key.into()));
}
}
}
}
let map = keys;
let block = Box::new(move |idx: usize, row: &Value| match map.get(idx) {
Some(Ok(key)) => Ok(key.clone()),
Some(Err(reason)) => Err(reason.clone()),
None => row.as_string(),
});
let grouper = &Some(block);
let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
for (idx, value) in value_list.into_pipeline_data().into_iter().enumerate() {
let group_key = if let Some(ref grouper) = grouper {
grouper(idx, &value)
} else {
@ -227,48 +294,6 @@ pub fn data_group(
Ok(Value::Record { cols, vals, span })
}
pub fn group(
column_name: &Option<Spanned<String>>,
values: &Value,
span: Span,
) -> Result<Value, ShellError> {
let name = span;
let grouper = if let Some(column_name) = column_name {
Grouper::ByColumn(Some(column_name.clone()))
} else {
Grouper::ByColumn(None)
};
match grouper {
Grouper::ByColumn(Some(column_name)) => {
let block = Box::new(move |_, row: &Value| {
if let Value::Error { error } = row {
return Err(*error.clone());
};
match row.get_data_by_key(&column_name.item) {
Some(group_key) => Ok(group_key.as_string()?),
None => Err(ShellError::CantFindColumn {
col_name: column_name.item.to_string(),
span: column_name.span,
src_span: row.expect_span(),
}),
}
});
data_group(values, &Some(block), name)
}
Grouper::ByColumn(None) => {
let block = Box::new(move |_, row: &Value| row.as_string());
data_group(values, &Some(block), name)
}
Grouper::ByBlock => Err(ShellError::NushellFailed {
msg: "Block not implemented: This should never happen.".into(),
}),
}
}
#[cfg(test)]
mod test {
use super::*;

View file

@ -3,7 +3,8 @@ use nu_engine::CallExt;
use nu_protocol::ast::Call;
use nu_protocol::engine::{Command, EngineState, Stack};
use nu_protocol::{
Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Type, Value,
Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape,
Type, Value,
};
#[derive(Clone)]
@ -182,6 +183,36 @@ pub fn split(
}
}
#[allow(clippy::type_complexity)]
fn data_group(
values: &Value,
grouper: &Option<Box<dyn Fn(usize, &Value) -> Result<String, ShellError> + Send>>,
span: Span,
) -> Result<Value, ShellError> {
let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
for (idx, value) in values.clone().into_pipeline_data().into_iter().enumerate() {
let group_key = if let Some(ref grouper) = grouper {
grouper(idx, &value)
} else {
value.as_string()
};
let group = groups.entry(group_key?).or_default();
group.push(value);
}
let mut cols = vec![];
let mut vals = vec![];
for (k, v) in groups {
cols.push(k.to_string());
vals.push(Value::List { vals: v, span });
}
Ok(Value::Record { cols, vals, span })
}
#[allow(clippy::type_complexity)]
pub fn data_split(
value: PipelineData,
@ -203,7 +234,7 @@ pub fn data_split(
_,
) => {
for (idx, list) in grouped_rows.iter().enumerate() {
match super::group_by::data_group(list, splitter, span) {
match data_group(list, splitter, span) {
Ok(grouped) => {
if let Value::Record {
vals: li,

View file

@ -72,7 +72,7 @@ fn errors_if_given_unknown_column_name() {
}
#[test]
fn errors_if_block_given_evaluates_more_than_one_row() {
fn errors_if_column_not_found() {
Playground::setup("group_by_test_3", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_caballeros.csv",
@ -92,21 +92,19 @@ fn errors_if_block_given_evaluates_more_than_one_row() {
"#
));
assert!(actual.err.contains("value originates here"),);
assert!(actual.err.contains("cannot find column"),);
assert!(actual.err.contains("did you mean 'type'"),);
})
}
#[test]
fn errors_if_input_empty() {
Playground::setup("group_by_empty_test", |dirs, _sandbox| {
let actual = nu!(
cwd: dirs.test(), pipeline(
r#"
group-by date
"#
));
assert!(actual.err.contains("expected table from pipeline"));
});
let actual = nu!("group-by date");
assert!(actual.err.contains("expected table from pipeline"));
}
#[test]
fn optional_cell_path_works() {
let actual = nu!("[{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo? | to nuon");
let expected = r#"{"123": [[foo]; [123]], "234": [[foo]; [234]]}"#;
assert_eq!(actual.out, expected)
}