testing suite for dataframes (#379)

This commit is contained in:
Fernando Herrera 2021-11-29 06:50:57 +00:00 committed by GitHub
parent e07ce57423
commit ee239a0d37
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 403 additions and 88 deletions

View file

@ -1,10 +1,3 @@
mod describe;
mod dtypes;
mod objects;
mod open;
mod to_df;
mod nu_dataframe;
pub use describe::DescribeDF;
pub use dtypes::DataTypes;
pub use open::OpenDataFrame;
pub use to_df::ToDataFrame;
pub use nu_dataframe::commands::{DataTypes, DescribeDF, OpenDataFrame, ToDataFrame};

View file

@ -18,7 +18,7 @@ pub fn between_dataframes(
let operation_span = span(&[left.span()?, right.span()?]);
match operator.item {
Operator::Plus => match lhs.append_df(rhs, Axis::Row, operation_span) {
Ok(df) => Ok(df.to_value(operation_span)),
Ok(df) => Ok(df.into_value(operation_span)),
Err(e) => Err(e),
},
_ => Err(ShellError::OperatorMismatch {

View file

@ -1,9 +1,11 @@
use super::objects::nu_dataframe::NuDataFrame;
use crate::dataframe::nu_dataframe::Column;
use super::super::NuDataFrame;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature,
Category, Example, PipelineData, ShellError, Signature, Span,
};
use polars::{
chunked_array::ChunkedArray,
@ -31,8 +33,58 @@ impl Command for DescribeDF {
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "dataframe description",
example: "[[a b]; [1 1] [1 1]] | to-df | describe",
result: None,
example: "[[a b]; [1 1] [1 1]] | to df | describe",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"descriptor".to_string(),
vec![
"count".to_string().into(),
"sum".to_string().into(),
"mean".to_string().into(),
"median".to_string().into(),
"std".to_string().into(),
"min".to_string().into(),
"25%".to_string().into(),
"50%".to_string().into(),
"75%".to_string().into(),
"max".to_string().into(),
],
),
Column::new(
"a (i64)".to_string(),
vec![
2.0.into(),
2.0.into(),
1.0.into(),
1.0.into(),
0.0.into(),
1.0.into(),
1.0.into(),
1.0.into(),
1.0.into(),
1.0.into(),
],
),
Column::new(
"b (i64)".to_string(),
vec![
2.0.into(),
2.0.into(),
1.0.into(),
1.0.into(),
0.0.into(),
1.0.into(),
1.0.into(),
1.0.into(),
1.0.into(),
1.0.into(),
],
),
])
.expect("simple df for test should not fail")
.into_value(Span::unknown()),
),
}]
}
@ -181,3 +233,14 @@ fn command(
df, call.head,
)))
}
#[cfg(test)]
mod test {
    use super::super::test_dataframe;
    use super::*;

    /// Evaluates every example declared by `DescribeDF` that carries an
    /// expected result, using the shared dataframe test harness.
    #[test]
    fn test_examples() {
        let command = DescribeDF {};
        test_dataframe::test_dataframe(command)
    }
}

View file

@ -1,8 +1,8 @@
use super::objects::nu_dataframe::{Column, NuDataFrame};
use super::super::{Column, NuDataFrame};
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Value,
Category, Example, PipelineData, ShellError, Signature, Span, Value,
};
#[derive(Clone)]
@ -24,8 +24,21 @@ impl Command for DataTypes {
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "drop column a",
example: "[[a b]; [1 2] [3 4]] | to-df | dtypes",
result: None,
example: "[[a b]; [1 2] [3 4]] | to df | dtypes",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"column".to_string(),
vec!["a".to_string().into(), "b".to_string().into()],
),
Column::new(
"dtype".to_string(),
vec!["i64".to_string().into(), "i64".to_string().into()],
),
])
.expect("simple df for test should not fail")
.into_value(Span::unknown()),
),
}]
}
@ -78,5 +91,16 @@ fn command(
let dtypes_col = Column::new("dtype".to_string(), dtypes);
let df = NuDataFrame::try_from_columns(vec![names_col, dtypes_col])?;
Ok(PipelineData::Value(df.to_value(call.head)))
Ok(PipelineData::Value(df.into_value(call.head)))
}
#[cfg(test)]
mod test {
    use super::super::test_dataframe;
    use super::*;

    /// Evaluates every example declared by `DataTypes` that carries an
    /// expected result, using the shared dataframe test harness.
    #[test]
    fn test_examples() {
        let command = DataTypes {};
        test_dataframe::test_dataframe(command)
    }
}

View file

@ -0,0 +1,12 @@
mod describe;
mod dtypes;
mod open;
mod to_df;
pub use describe::DescribeDF;
pub use dtypes::DataTypes;
pub use open::OpenDataFrame;
pub use to_df::ToDataFrame;
#[cfg(test)]
mod test_dataframe;

View file

@ -1,12 +1,11 @@
use std::{fs::File, path::PathBuf};
use super::objects::nu_dataframe::NuDataFrame;
use super::super::NuDataFrame;
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape,
};
use std::{fs::File, path::PathBuf};
use polars::prelude::{CsvEncoding, CsvReader, JsonReader, ParquetReader, SerReader};

View file

@ -0,0 +1,87 @@
use nu_engine::eval_block;
use nu_parser::parse;
use nu_protocol::{
engine::{Command, EngineState, Stack, StateWorkingSet},
PipelineData, Span, Value, CONFIG_VARIABLE_ID,
};
use super::ToDataFrame;
/// Runs the examples of `cmd` that declare an expected result.
///
/// Each such example is parsed and evaluated against an engine state to which
/// `ToDataFrame` and `cmd` have been added. The function panics when an
/// example fails to parse, fails to evaluate, or evaluates to a value that
/// differs from the expected one. Examples without a result are skipped.
pub fn test_dataframe(cmd: impl Command + 'static) {
    let examples = cmd.examples();
    let mut engine_state = Box::new(EngineState::new());

    // Base functions that are needed for testing.
    // Keep this working set small so the tests run as fast as possible.
    let base_delta = {
        let mut working_set = StateWorkingSet::new(&*engine_state);
        working_set.add_decl(Box::new(ToDataFrame));
        // The command under test goes into the same working set.
        working_set.add_decl(Box::new(cmd));
        working_set.render()
    };
    // The outcome of the merge is deliberately discarded.
    let _ = engine_state.merge_delta(base_delta);

    for example in examples {
        // Examples without a result have nothing to compare against.
        let expected = match example.result {
            Some(value) => value,
            None => continue,
        };
        let source = example.example;
        let timer = std::time::Instant::now();

        let (block, parse_delta) = {
            let mut working_set = StateWorkingSet::new(&*engine_state);
            let (output, err) = parse(&mut working_set, None, source.as_bytes(), false);
            if let Some(err) = err {
                panic!("test parse error in `{}`: {:?}", source, err)
            }
            (output, working_set.render())
        };
        let _ = engine_state.merge_delta(parse_delta);

        // Start every example from an empty config record.
        let mut stack = Stack::new();
        let empty_config = Value::Record {
            cols: vec![],
            vals: vec![],
            span: Span::unknown(),
        };
        stack.vars.insert(CONFIG_VARIABLE_ID, empty_config);

        let outcome = eval_block(
            &engine_state,
            &mut stack,
            &block,
            PipelineData::new(Span::unknown()),
        );
        let actual = match outcome {
            Ok(data) => data.into_value(Span::unknown()),
            Err(err) => panic!("test eval error in `{}`: {:?}", source, err),
        };

        println!("input: {}", source);
        println!("result: {:?}", actual);
        println!("done: {:?}", timer.elapsed());

        // Note: Value implements PartialEq for Bool, Int, Float, String and Block.
        // If the command being tested needs another case compared, its equality
        // has to be defined in the Value struct.
        if actual != expected {
            panic!(
                "the example result is different to expected value: {:?} != {:?}",
                actual, expected
            )
        }
    }
}

View file

@ -0,0 +1,111 @@
use super::super::{Column, NuDataFrame};
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span,
};
/// Command that builds a dataframe value from the pipeline input.
#[derive(Clone)]
pub struct ToDataFrame;

impl Command for ToDataFrame {
    /// Name under which the command is invoked.
    fn name(&self) -> &str {
        "to df"
    }

    /// One-line description of the command.
    fn usage(&self) -> &str {
        "Converts a List, Table or Dictionary into a dataframe"
    }

    /// Signature named after the command and filed under the custom
    /// "dataframe" category.
    fn signature(&self) -> Signature {
        let category = Category::Custom("dataframe".into());
        Signature::build(self.name().to_string()).category(category)
    }

    /// Example pipelines, each paired with the dataframe value it is
    /// expected to produce.
    fn examples(&self) -> Vec<Example> {
        // Expected output for the two-column table example.
        let from_table = NuDataFrame::try_from_columns(vec![
            Column::new(String::from("a"), vec![1.into(), 3.into()]),
            Column::new(String::from("b"), vec![2.into(), 4.into()]),
        ])
        .expect("simple df for test should not fail");

        // Expected output for the list-of-lists example; columns are named
        // by position.
        let from_nested_lists = NuDataFrame::try_from_columns(vec![
            Column::new(String::from("0"), vec![1.into(), 3.into(), 5.into()]),
            Column::new(String::from("1"), vec![2.into(), 4.into(), 6.into()]),
            Column::new(
                String::from("2"),
                vec![
                    String::from("a").into(),
                    String::from("b").into(),
                    String::from("c").into(),
                ],
            ),
        ])
        .expect("simple df for test should not fail");

        // Expected output for the flat list of strings.
        let from_string_list = NuDataFrame::try_from_columns(vec![Column::new(
            String::from("0"),
            vec![
                String::from("a").into(),
                String::from("b").into(),
                String::from("c").into(),
            ],
        )])
        .expect("simple df for test should not fail");

        // Expected output for the flat list of booleans.
        let from_bool_list = NuDataFrame::try_from_columns(vec![Column::new(
            String::from("0"),
            vec![true.into(), true.into(), false.into()],
        )])
        .expect("simple df for test should not fail");

        vec![
            Example {
                description: "Takes a dictionary and creates a dataframe",
                example: "[[a b];[1 2] [3 4]] | to df",
                result: Some(from_table.into_value(Span::unknown())),
            },
            Example {
                description: "Takes a list of tables and creates a dataframe",
                example: "[[1 2 a] [3 4 b] [5 6 c]] | to df",
                result: Some(from_nested_lists.into_value(Span::unknown())),
            },
            Example {
                description: "Takes a list and creates a dataframe",
                example: "[a b c] | to df",
                result: Some(from_string_list.into_value(Span::unknown())),
            },
            Example {
                description: "Takes a list of booleans and creates a dataframe",
                example: "[$true $true $false] | to df",
                result: Some(from_bool_list.into_value(Span::unknown())),
            },
        ]
    }

    /// Builds a `NuDataFrame` from the input stream and returns it as a
    /// pipeline value spanned at the call head. Conversion errors are
    /// propagated to the caller.
    fn run(
        &self,
        _engine_state: &EngineState,
        _stack: &mut Stack,
        call: &Call,
        input: PipelineData,
    ) -> Result<PipelineData, ShellError> {
        let frame = NuDataFrame::try_from_iter(input.into_iter())?;
        let value = frame.into_value(call.head);
        Ok(PipelineData::Value(value))
    }
}
#[cfg(test)]
mod test {
    use super::super::test_dataframe;
    use super::*;

    /// Evaluates every example declared by `ToDataFrame` that carries an
    /// expected result, using the shared dataframe test harness.
    #[test]
    fn test_examples() {
        let command = ToDataFrame {};
        test_dataframe::test_dataframe(command)
    }
}

View file

@ -1,4 +1,5 @@
use super::{DataFrameValue, NuDataFrame};
use chrono::{DateTime, FixedOffset, NaiveDateTime};
use indexmap::map::{Entry, IndexMap};
use nu_protocol::{ShellError, Span, Value};

View file

@ -48,7 +48,17 @@ impl CustomValue for NuDataFrame {
fn follow_path_string(&self, column_name: String, span: Span) -> Result<Value, ShellError> {
let column = self.column(&column_name, span)?;
Ok(column.to_value(span))
Ok(column.into_value(span))
}
/// Compares this dataframe with another `Value`.
///
/// Only a `Value::CustomValue` whose payload downcasts to the same type is
/// comparable; in that case the result of `is_equal` is returned. Every
/// other value yields `None`.
fn partial_cmp(&self, other: &Value) -> Option<std::cmp::Ordering> {
    if let Value::CustomValue { val, .. } = other {
        // A custom value of a different concrete type cannot be compared.
        let other_df = val.as_any().downcast_ref::<Self>()?;
        self.is_equal(other_df)
    } else {
        None
    }
}
fn operation(

View file

@ -1,15 +1,17 @@
pub mod commands;
mod between_values;
mod conversion;
mod custom_value;
mod operations;
use std::{cmp::Ordering, fmt::Display, hash::Hasher};
use conversion::{Column, ColumnMap};
pub use conversion::{Column, ColumnMap};
use indexmap::map::IndexMap;
use nu_protocol::{did_you_mean, PipelineData, ShellError, Span, Value};
use polars::prelude::{DataFrame, PolarsObject, Series};
use polars::prelude::{DataFrame, DataType, PolarsObject, Series};
use serde::{Deserialize, Serialize};
use std::{cmp::Ordering, fmt::Display, hash::Hasher};
// DataFrameValue is an encapsulation of Nushell Value that can be used
// to define the PolarsObject Trait. The polars object trait allows to
@ -98,7 +100,7 @@ impl NuDataFrame {
}
}
pub fn to_value(self, span: Span) -> Value {
pub fn into_value(self, span: Span) -> Value {
Value::CustomValue {
val: Box::new(self),
span,
@ -325,4 +327,64 @@ impl NuDataFrame {
Ok(values)
}
// Dataframes are considered equal if they have the same shape, column name and values
pub fn is_equal(&self, other: &Self) -> Option<Ordering> {
if self.as_ref().width() == 0 {
// checking for empty dataframe
return None;
}
if self.as_ref().get_column_names() != other.as_ref().get_column_names() {
// checking both dataframes share the same names
return None;
}
if self.as_ref().height() != other.as_ref().height() {
// checking both dataframes have the same row size
return None;
}
// sorting dataframe by the first column
let column_names = self.as_ref().get_column_names();
let first_col = column_names
.get(0)
.expect("already checked that dataframe is different than 0");
// if unable to sort, then unable to compare
let lhs = match self.as_ref().sort(*first_col, false) {
Ok(df) => df,
Err(_) => return None,
};
let rhs = match other.as_ref().sort(*first_col, false) {
Ok(df) => df,
Err(_) => return None,
};
for name in self.as_ref().get_column_names() {
let self_series = lhs.column(name).expect("name from dataframe names");
let other_series = rhs
.column(name)
.expect("already checked that name in other");
let self_series = match self_series.dtype() {
// Casting needed to compare other numeric types with nushell numeric type.
// In nushell we only have i64 integer numeric types and any array created
// with nushell untagged primitives will be of type i64
DataType::UInt32 => match self_series.cast(&DataType::Int64) {
Ok(series) => series,
Err(_) => return None,
},
_ => self_series.clone(),
};
if !self_series.series_equal(other_series) {
return None;
}
}
Some(Ordering::Equal)
}
}

View file

@ -1 +0,0 @@
pub(super) mod nu_dataframe;

View file

@ -1,59 +0,0 @@
use super::objects::nu_dataframe::NuDataFrame;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature,
};
/// Command that builds a dataframe value from the pipeline input.
#[derive(Clone)]
pub struct ToDataFrame;
impl Command for ToDataFrame {
/// Name under which the command is invoked.
fn name(&self) -> &str {
"to df"
}
/// One-line description of the command.
fn usage(&self) -> &str {
"Converts a List, Table or Dictionary into a dataframe"
}
/// Signature named after the command and filed under the custom
/// "dataframe" category.
fn signature(&self) -> Signature {
Signature::build(self.name().to_string()).category(Category::Custom("dataframe".into()))
}
/// Example pipelines for the command; none of them declares an expected
/// result.
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Takes a dictionary and creates a dataframe",
example: "[[a b];[1 2] [3 4]] | to df",
result: None,
},
Example {
description: "Takes a list of tables and creates a dataframe",
example: "[[1 2 a] [3 4 b] [5 6 c]] | to df",
result: None,
},
Example {
description: "Takes a list and creates a dataframe",
example: "[a b c] | to df",
result: None,
},
Example {
description: "Takes a list of booleans and creates a dataframe",
example: "[$true $true $false] | to df",
result: None,
},
]
}
/// Builds a `NuDataFrame` from the input stream and returns it as a
/// pipeline value spanned at the call head. Conversion errors are
/// propagated to the caller.
fn run(
&self,
_engine_state: &EngineState,
_stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let df = NuDataFrame::try_from_iter(input.into_iter())?;
Ok(PipelineData::Value(NuDataFrame::to_value(df, call.head)))
}
}

View file

@ -1,4 +1,4 @@
use std::fmt;
use std::{cmp::Ordering, fmt};
use crate::{ast::Operator, Category, ShellError, Span, Value};
@ -29,6 +29,9 @@ pub trait CustomValue: fmt::Debug + Send + Sync {
fn follow_path_int(&self, count: usize, span: Span) -> Result<Value, ShellError>;
fn follow_path_string(&self, column_name: String, span: Span) -> Result<Value, ShellError>;
// ordering with other value
fn partial_cmp(&self, other: &Value) -> Option<Ordering>;
// Definition of an operation between the object that implements the trait
// and another Value.
// The Operator enum is used to indicate the expected operation

View file

@ -1,5 +1,14 @@
use crate::{ShellError, Span, Value};
/// Wraps an owned string in a `Value::String` carrying an unknown span.
impl From<String> for Value {
    fn from(val: String) -> Self {
        let span = Span::unknown();
        Value::String { val, span }
    }
}
impl From<bool> for Value {
fn from(val: bool) -> Self {
Value::Bool {

View file

@ -640,6 +640,7 @@ impl PartialOrd for Value {
(Value::Binary { val: lhs, .. }, Value::Binary { val: rhs, .. }) => {
lhs.partial_cmp(rhs)
}
(Value::CustomValue { val: lhs, .. }, rhs) => lhs.partial_cmp(rhs),
(Value::Nothing { .. }, Value::Nothing { .. }) => Some(Ordering::Equal),
(_, _) => None,
}