Add uniq command (#1132)

* start playing with ways to use the uniq command

* WIP

* Got uniq working, but still need to figure out args issue and add tests

* Add some tests for uniq

* fmt

* remove commented out code

* Add documentation and some additional tests showing uniq values and rows. Also removed args TODO

* add changes that didn't get committed

* whoops, I didn't save the docs correctly...

* fmt

* Add a test for uniq with nested json

* Add another test

* Fix unique-ness when json keys are out of order and make the test json more complicated
This commit is contained in:
Ryan Blecher 2019-12-30 23:05:02 -05:00 committed by Jonathan Turner
parent dba82ac530
commit f37f29b441
13 changed files with 298 additions and 6 deletions

View file

@ -23,7 +23,7 @@ use serde::{Deserialize, Serialize};
use std::path::PathBuf; use std::path::PathBuf;
use std::time::SystemTime; use std::time::SystemTime;
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
pub enum UntaggedValue { pub enum UntaggedValue {
Primitive(Primitive), Primitive(Primitive),
Row(Dictionary), Row(Dictionary),
@ -182,7 +182,7 @@ impl UntaggedValue {
} }
} }
#[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, Hash, Serialize, Deserialize)]
pub struct Value { pub struct Value {
pub value: UntaggedValue, pub value: UntaggedValue,
pub tag: Tag, pub tag: Tag,

View file

@ -7,6 +7,7 @@ use indexmap::IndexMap;
use nu_source::{b, DebugDocBuilder, PrettyDebug, Spanned, Tag}; use nu_source::{b, DebugDocBuilder, PrettyDebug, Spanned, Tag};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::cmp::{Ord, Ordering, PartialOrd}; use std::cmp::{Ord, Ordering, PartialOrd};
use std::hash::{Hash, Hasher};
#[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq, Clone, Getters, new)] #[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq, Clone, Getters, new)]
pub struct Dictionary { pub struct Dictionary {
@ -14,6 +15,15 @@ pub struct Dictionary {
pub entries: IndexMap<String, Value>, pub entries: IndexMap<String, Value>,
} }
impl Hash for Dictionary {
    /// Hashes entries in key-sorted order so that two dictionaries holding
    /// the same key/value pairs hash identically regardless of insertion
    /// order, keeping `Hash` consistent with the map-style `PartialEq`/`Eq`
    /// (this order-insensitivity is what lets `uniq` collapse JSON objects
    /// whose keys arrive in different orders).
    fn hash<H: Hasher>(&self, state: &mut H) {
        // Sort borrowed (key, value) pairs instead of cloning the whole
        // IndexMap (and every Value in it) just to compute a hash.
        let mut entries: Vec<(&String, &Value)> = self.entries.iter().collect();
        entries.sort_by(|(a, _), (b, _)| a.cmp(b));
        for (key, value) in entries {
            key.hash(state);
            value.hash(state);
        }
    }
}
impl PartialOrd for Dictionary { impl PartialOrd for Dictionary {
fn partial_cmp(&self, other: &Dictionary) -> Option<Ordering> { fn partial_cmp(&self, other: &Dictionary) -> Option<Ordering> {
let this: Vec<&String> = self.entries.keys().collect(); let this: Vec<&String> = self.entries.keys().collect();

View file

@ -12,7 +12,7 @@ use num_traits::cast::{FromPrimitive, ToPrimitive};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::path::PathBuf; use std::path::PathBuf;
#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Deserialize, Serialize)] #[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Deserialize, Serialize)]
pub enum Primitive { pub enum Primitive {
Nothing, Nothing,
#[serde(with = "serde_bigint")] #[serde(with = "serde_bigint")]

View file

@ -3,7 +3,7 @@ use derive_new::new;
use nu_source::{b, DebugDocBuilder, Spanned}; use nu_source::{b, DebugDocBuilder, Spanned};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize, Hash)] #[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
pub enum RangeInclusion { pub enum RangeInclusion {
Inclusive, Inclusive,
Exclusive, Exclusive,
@ -25,7 +25,7 @@ impl RangeInclusion {
} }
} }
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize, new)] #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize, new)]
pub struct Range { pub struct Range {
pub from: (Spanned<Primitive>, RangeInclusion), pub from: (Spanned<Primitive>, RangeInclusion),
pub to: (Spanned<Primitive>, RangeInclusion), pub to: (Spanned<Primitive>, RangeInclusion),

36
docs/commands/uniq.md Normal file
View file

@ -0,0 +1,36 @@
# uniq
Returns unique rows or values from a dataset.
## Examples
Given a file `test.csv`
```
first_name,last_name,rusty_at,type
Andrés,Robalino,10/11/2013,A
Andrés,Robalino,10/11/2013,A
Jonathan,Turner,10/12/2013,B
Yehuda,Katz,10/11/2013,A
```
```
> `open test.csv | uniq`
# first_name last_name rusty_at type
0 Andrés Robalino 10/11/2013 A
1 Jonathan Turner 10/12/2013 B
2 Yehuda Katz 10/11/2013 A
```
```
> `open test.csv | get type | uniq`
# <value>
0 A
1 B
```

View file

@ -294,6 +294,7 @@ pub async fn cli() -> Result<(), Box<dyn Error>> {
whole_stream_command(Default), whole_stream_command(Default),
whole_stream_command(SkipWhile), whole_stream_command(SkipWhile),
whole_stream_command(Range), whole_stream_command(Range),
whole_stream_command(Uniq),
// Table manipulation // Table manipulation
whole_stream_command(Wrap), whole_stream_command(Wrap),
whole_stream_command(Pivot), whole_stream_command(Pivot),

View file

@ -90,6 +90,7 @@ pub(crate) mod to_tsv;
pub(crate) mod to_url; pub(crate) mod to_url;
pub(crate) mod to_yaml; pub(crate) mod to_yaml;
pub(crate) mod trim; pub(crate) mod trim;
pub(crate) mod uniq;
pub(crate) mod version; pub(crate) mod version;
pub(crate) mod what; pub(crate) mod what;
pub(crate) mod where_; pub(crate) mod where_;
@ -185,6 +186,7 @@ pub(crate) use to_tsv::ToTSV;
pub(crate) use to_url::ToURL; pub(crate) use to_url::ToURL;
pub(crate) use to_yaml::ToYAML; pub(crate) use to_yaml::ToYAML;
pub(crate) use trim::Trim; pub(crate) use trim::Trim;
pub(crate) use uniq::Uniq;
pub(crate) use version::Version; pub(crate) use version::Version;
pub(crate) use what::What; pub(crate) use what::What;
pub(crate) use where_::Where; pub(crate) use where_::Where;

48
src/commands/uniq.rs Normal file
View file

@ -0,0 +1,48 @@
use crate::commands::WholeStreamCommand;
use crate::context::CommandRegistry;
use crate::prelude::*;
use indexmap::set::IndexSet;
use nu_errors::ShellError;
use nu_protocol::{ReturnSuccess, Signature};
/// Deserialized arguments for `uniq`; the command currently takes no flags
/// or positional parameters.
#[derive(Deserialize)]
struct UniqArgs {}

/// The `uniq` command: emits only the first occurrence of each distinct
/// row/value in the input stream.
pub struct Uniq;
impl WholeStreamCommand for Uniq {
    /// Name used to invoke the command from the shell.
    fn name(&self) -> &str {
        "uniq"
    }

    /// Bare signature: no flags or positional arguments.
    fn signature(&self) -> Signature {
        Signature::build("uniq")
    }

    /// One-line help text shown in command listings.
    fn usage(&self) -> &str {
        "Return the unique rows"
    }

    /// Deserializes the (empty) args and delegates to the `uniq` helper below.
    fn run(
        &self,
        args: CommandArgs,
        registry: &CommandRegistry,
    ) -> Result<OutputStream, ShellError> {
        args.process(registry, uniq)?.run()
    }
}
/// Deduplicates the input stream: collects every value, then emits only the
/// first occurrence of each distinct value, preserving first-seen order.
fn uniq(
    UniqArgs {}: UniqArgs,
    RunnableContext { input, .. }: RunnableContext,
) -> Result<OutputStream, ShellError> {
    let stream = async_stream! {
        // IndexSet deduplicates (via the Hash/Eq impls on Value) while
        // remembering the order in which each distinct value first appeared.
        let uniq_values: IndexSet<_> = input.values.collect().await;
        // Consume the set so rows are moved out instead of cloned per item.
        for item in uniq_values.into_iter().map(ReturnSuccess::value) {
            yield item;
        }
    };
    Ok(stream.to_output_stream())
}

View file

@ -325,6 +325,10 @@ mod tests {
loc: fixtures().join("jonathan.xml"), loc: fixtures().join("jonathan.xml"),
at: 0 at: 0
}, },
Res {
loc: fixtures().join("nested_uniq.json"),
at: 0
},
Res { Res {
loc: fixtures().join("sample.bson"), loc: fixtures().join("sample.bson"),
at: 0 at: 0

View file

@ -26,5 +26,6 @@ mod save;
mod sort_by; mod sort_by;
mod split_by; mod split_by;
mod split_column; mod split_column;
mod uniq;
mod where_; mod where_;
mod wrap; mod wrap;

118
tests/commands/uniq.rs Normal file
View file

@ -0,0 +1,118 @@
use nu_test_support::fs::Stub::FileWithContentToBeTrimmed;
use nu_test_support::playground::Playground;
use nu_test_support::{nu, pipeline};
#[test]
fn uniq_rows() {
    // `uniq` over whole rows: the CSV below has 5 data rows but only 3
    // distinct ones (the Turner and Katz rows are each duplicated).
    Playground::setup("uniq_test_1", |dirs, sandbox| {
        sandbox.with_files(vec![FileWithContentToBeTrimmed(
            "los_tres_caballeros.csv",
            r#"
first_name,last_name,rusty_at,type
Andrés,Robalino,10/11/2013,A
Jonathan,Turner,10/12/2013,B
Yehuda,Katz,10/11/2013,A
Jonathan,Turner,10/12/2013,B
Yehuda,Katz,10/11/2013,A
"#,
        )]);
        let actual = nu!(
            cwd: dirs.test(), pipeline(
            r#"
open los_tres_caballeros.csv
| uniq
| count
| echo $it
"#
        ));
        // 5 input rows, 2 duplicates removed.
        assert_eq!(actual, "3");
    })
}
#[test]
fn uniq_columns() {
    // `uniq` after projecting a subset of columns: once rows are reduced to
    // (rusty_at, type), only two distinct combinations remain —
    // (10/11/2013, A) and (10/12/2013, B).
    Playground::setup("uniq_test_2", |dirs, sandbox| {
        sandbox.with_files(vec![FileWithContentToBeTrimmed(
            "los_tres_caballeros.csv",
            r#"
first_name,last_name,rusty_at,type
Andrés,Robalino,10/11/2013,A
Jonathan,Turner,10/12/2013,B
Yehuda,Katz,10/11/2013,A
Jonathan,Turner,10/12/2013,B
Yehuda,Katz,10/11/2013,A
"#,
        )]);
        let actual = nu!(
            cwd: dirs.test(), pipeline(
            r#"
open los_tres_caballeros.csv
| pick rusty_at type
| uniq
| count
| echo $it
"#
        ));
        assert_eq!(actual, "2");
    })
}
#[test]
fn uniq_values() {
    // `uniq` over a single column of scalar values: `get type` yields the
    // value stream A, B, A, B, A, which dedupes to the two values A and B.
    // (Previously the pipeline read `pick get type`, naming a nonexistent
    // `get` column; the documented usage — see docs/commands/uniq — is
    // `get type | uniq`.)
    Playground::setup("uniq_test_3", |dirs, sandbox| {
        sandbox.with_files(vec![FileWithContentToBeTrimmed(
            "los_tres_caballeros.csv",
            r#"
first_name,last_name,rusty_at,type
Andrés,Robalino,10/11/2013,A
Jonathan,Turner,10/12/2013,B
Yehuda,Katz,10/11/2013,A
Jonathan,Turner,10/12/2013,B
Yehuda,Katz,10/11/2013,A
"#,
        )]);
        let actual = nu!(
            cwd: dirs.test(), pipeline(
            r#"
open los_tres_caballeros.csv
| get type
| uniq
| count
| echo $it
"#
        ));
        assert_eq!(actual, "2");
    })
}
#[test]
fn uniq_when_keys_out_of_order() {
    // Two JSON objects carry identical key/value pairs in different key
    // order; they must compare and hash equal so `uniq` collapses them
    // into a single row.
    let actual = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
echo '[{"a": "a", "b": [1,2,3]},{"b": [1,2,3], "a": "a"}]'
| from-json
| uniq
| count
| echo $it
"#
    ));
    assert_eq!(actual, "1");
}
#[test]
fn uniq_nested_json_structures() {
    // The nested_uniq.json fixture holds 4 records; the first two are equal
    // up to key ordering inside nested objects, so `uniq` should leave 3
    // distinct records.
    let actual = nu!(
        cwd: "tests/fixtures/formats",
        "open nested_uniq.json | uniq | count | echo $it"
    );
    assert_eq!(actual, "3");
}

View file

@ -7,7 +7,7 @@ fn filters_by_unit_size_comparison() {
"ls | where size > 1kb | sort-by size | get name | first 1 | trim | echo $it" "ls | where size > 1kb | sort-by size | get name | first 1 | trim | echo $it"
); );
assert_eq!(actual, "cargo_sample.toml"); assert_eq!(actual, "nested_uniq.json");
} }
#[test] #[test]

72
tests/fixtures/formats/nested_uniq.json vendored Normal file
View file

@ -0,0 +1,72 @@
[
{
"name": "this is duplicated",
"nesting": [
{
"a": "a",
"b": "b"
},
{
"c": "c",
"d": "d"
}
],
"can_be_ordered_differently": {
"array": [1, 2, 3, 4, 5],
"something": { "else": "works" }
}
},
{
"can_be_ordered_differently": {
"something": { "else": "works" },
"array": [1, 2, 3, 4, 5]
},
"nesting": [
{
"b": "b",
"a": "a"
},
{
"d": "d",
"c": "c"
}
],
"name": "this is duplicated"
},
{
"name": "this is unique",
"nesting": [
{
"a": "b",
"b": "a"
},
{
"c": "d",
"d": "c"
}
],
"can_be_ordered_differently": {
"array": [],
"something": { "else": "does not work" }
}
},
{
"name": "this is unique",
"nesting": [
{
"a": "a",
"b": "b",
"c": "c"
},
{
"d": "d",
"e": "e",
"f": "f"
}
],
"can_be_ordered_differently": {
"array": [],
"something": { "else": "works" }
}
}
]