append dataframes (#3839)

This commit is contained in:
Fernando Herrera 2021-07-25 21:36:09 +01:00 committed by GitHub
parent 111477aa74
commit d54d7cc431
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 277 additions and 37 deletions

View file

@ -0,0 +1,138 @@
use crate::prelude::*;
use nu_engine::WholeStreamCommand;
use nu_errors::ShellError;
use nu_protocol::{
dataframe::{Axis, Column, NuDataFrame},
Signature, SyntaxShape, UntaggedValue, Value,
};
use nu_source::Tagged;
pub struct DataFrame;
impl WholeStreamCommand for DataFrame {
fn name(&self) -> &str {
"dataframe append"
}
fn usage(&self) -> &str {
"[DataFrame] Appends a new dataframe"
}
fn signature(&self) -> Signature {
Signature::build("dataframe append")
.required_named(
"other",
SyntaxShape::Any,
"dataframe to be appended",
Some('o'),
)
.required_named(
"axis",
SyntaxShape::String,
"row or col axis orientation",
Some('a'),
)
}
fn run(&self, args: CommandArgs) -> Result<OutputStream, ShellError> {
command(args)
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Appends a dataframe as new columns",
example: r#"let a = ([[a b]; [1 2] [3 4]] | dataframe to-df);
$a | dataframe append -o $a -a row"#,
result: Some(vec![NuDataFrame::try_from_columns(
vec![
Column::new(
"a".to_string(),
vec![UntaggedValue::int(1).into(), UntaggedValue::int(3).into()],
),
Column::new(
"b".to_string(),
vec![UntaggedValue::int(2).into(), UntaggedValue::int(4).into()],
),
Column::new(
"a_x".to_string(),
vec![UntaggedValue::int(1).into(), UntaggedValue::int(3).into()],
),
Column::new(
"b_x".to_string(),
vec![UntaggedValue::int(2).into(), UntaggedValue::int(4).into()],
),
],
&Span::default(),
)
.expect("simple df for test should not fail")
.into_value(Tag::default())]),
},
Example {
description: "Appends a dataframe merging at the end of columns",
example: r#"let a = ([[a b]; [1 2] [3 4]] | dataframe to-df);
$a | dataframe append -o $a -a col"#,
result: Some(vec![NuDataFrame::try_from_columns(
vec![
Column::new(
"a".to_string(),
vec![
UntaggedValue::int(1).into(),
UntaggedValue::int(3).into(),
UntaggedValue::int(1).into(),
UntaggedValue::int(3).into(),
],
),
Column::new(
"b".to_string(),
vec![
UntaggedValue::int(2).into(),
UntaggedValue::int(4).into(),
UntaggedValue::int(2).into(),
UntaggedValue::int(4).into(),
],
),
],
&Span::default(),
)
.expect("simple df for test should not fail")
.into_value(Tag::default())]),
},
]
}
}
/// Runs `dataframe append`.
///
/// Reads the required `other` and `axis` named arguments, checks that `other`
/// holds a dataframe, collects the input stream into a dataframe and appends
/// `other` to it along the requested axis. The result is emitted as a single
/// value tagged with the command's name tag.
fn command(mut args: CommandArgs) -> Result<OutputStream, ShellError> {
    let tag = args.call_info.name_tag.clone();

    let other: Value = args.req_named("other")?;
    let axis_arg: Tagged<String> = args.req_named("axis")?;
    // The axis string is validated before the type of `other` is inspected.
    let axis = Axis::try_from_str(&axis_arg.item, &axis_arg.tag.span)?;

    let df_other = if let UntaggedValue::DataFrame(df) = other.value {
        df
    } else {
        return Err(ShellError::labeled_error(
            "Incorrect type",
            "can only append a dataframe to a dataframe",
            other.tag.span,
        ));
    };

    let (df, _) = NuDataFrame::try_from_stream(&mut args.input, &tag.span)?;
    let appended = df.append_df(&df_other, axis, &tag.span)?;

    Ok(OutputStream::one(appended.into_value(tag)))
}
#[cfg(test)]
mod tests {
    use super::{DataFrame, ShellError};
    use crate::examples::test_dataframe;

    /// Runs every example declared by the command through the dataframe example checker.
    #[test]
    fn examples_work_as_expected() -> Result<(), ShellError> {
        test_dataframe(DataFrame {})
    }
}

View file

@ -1,4 +1,5 @@
pub mod aggregate;
pub mod append;
pub mod column;
pub mod command;
pub mod drop;
@ -31,6 +32,7 @@ pub mod where_;
pub mod with_column;
pub use aggregate::DataFrame as DataFrameAggregate;
pub use append::DataFrame as DataFrameAppend;
pub use column::DataFrame as DataFrameColumn;
pub use command::Command as DataFrame;
pub use drop::DataFrame as DataFrameDrop;

View file

@ -25,9 +25,9 @@ pub use conversions::*;
pub use core_commands::*;
#[cfg(feature = "dataframe")]
pub use dataframe::{
DataFrame, DataFrameAggregate, DataFrameAllFalse, DataFrameAllTrue, DataFrameArgMax,
DataFrameArgMin, DataFrameArgSort, DataFrameArgTrue, DataFrameArgUnique, DataFrameColumn,
DataFrameConcatenate, DataFrameContains, DataFrameDTypes, DataFrameDrop,
DataFrame, DataFrameAggregate, DataFrameAllFalse, DataFrameAllTrue, DataFrameAppend,
DataFrameArgMax, DataFrameArgMin, DataFrameArgSort, DataFrameArgTrue, DataFrameArgUnique,
DataFrameColumn, DataFrameConcatenate, DataFrameContains, DataFrameDTypes, DataFrameDrop,
DataFrameDropDuplicates, DataFrameDropNulls, DataFrameDummies, DataFrameFilter, DataFrameFirst,
DataFrameGet, DataFrameGroupBy, DataFrameIsDuplicated, DataFrameIsIn, DataFrameIsNotNull,
DataFrameIsNull, DataFrameIsUnique, DataFrameJoin, DataFrameLast, DataFrameList, DataFrameMelt,

View file

@ -328,6 +328,7 @@ pub fn create_default_context(interactive: bool) -> Result<EvaluationContext, Bo
whole_stream_command(DataFrameToUppercase),
whole_stream_command(DataFrameStringSlice),
whole_stream_command(DataFrameConcatenate),
whole_stream_command(DataFrameAppend),
]);
#[cfg(feature = "clipboard-cli")]

View file

@ -1,14 +1,14 @@
use bigdecimal::BigDecimal;
use nu_errors::ShellError;
use nu_protocol::dataframe::NuDataFrame;
use nu_protocol::dataframe::{Axis, NuDataFrame};
use nu_protocol::hir::Operator;
use nu_protocol::{Primitive, ShellTypeName, UntaggedValue, Value};
use nu_source::Span;
use num_traits::ToPrimitive;
use polars::prelude::{
BooleanType, ChunkCompare, ChunkedArray, DataFrame, DataType, Float64Type, Int64Type,
IntoSeries, NumOpsDispatchChecked, PolarsError, Series,
BooleanType, ChunkCompare, ChunkedArray, DataType, Float64Type, Int64Type, IntoSeries,
NumOpsDispatchChecked, PolarsError, Series,
};
use std::ops::{Add, BitAnd, BitOr, Div, Mul, Sub};
@ -83,37 +83,14 @@ pub fn between_dataframes(
operation_span: &Span,
) -> Result<UntaggedValue, (&'static str, &'static str)> {
match operator {
Operator::Plus => {
let mut columns: Vec<&str> = Vec::new();
let new = lhs
.as_ref()
.get_columns()
.iter()
.chain(rhs.as_ref().get_columns().iter())
.map(|s| {
let name = if columns.contains(&s.name()) {
format!("{}_{}", s.name(), "x")
} else {
columns.push(s.name());
s.name().to_string()
};
let mut series = s.clone();
series.rename(name.as_str());
series
})
.collect::<Vec<Series>>();
match DataFrame::new(new) {
Ok(df) => Ok(NuDataFrame::dataframe_to_untagged(df)),
Err(e) => Ok(UntaggedValue::Error(ShellError::labeled_error(
"Appending error",
format!("{}", e),
operation_span,
))),
}
}
Operator::Plus => match lhs.append_df(rhs, Axis::Row, operation_span) {
Ok(df) => Ok(df.into_untagged()),
Err(e) => Ok(UntaggedValue::Error(ShellError::labeled_error(
"Appending error",
format!("{}", e),
operation_span,
))),
},
_ => Ok(UntaggedValue::Error(ShellError::labeled_error(
"Incorrect datatype",
"unable to use this datatype for this operation",

View file

@ -1,8 +1,10 @@
pub mod nu_dataframe;
pub mod nu_groupby;
pub mod operations;
pub use nu_dataframe::{Column, NuDataFrame};
pub use nu_groupby::NuGroupBy;
pub use operations::Axis;
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]

View file

@ -0,0 +1,120 @@
use nu_errors::ShellError;
use nu_source::Span;
use polars::prelude::{DataFrame, Series};
use super::NuDataFrame;
/// Orientation used when appending one dataframe to another.
///
/// The variant names describe the axis selected by the user (`"row"` /
/// `"col"`), see `Axis::try_from_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Axis {
    /// Selected with `"row"`: the other dataframe's columns are placed next to
    /// the existing ones.
    Row,
    /// Selected with `"col"`: the other dataframe's values are appended at the
    /// end of the matching columns.
    Column,
}
impl Axis {
    /// Parses the user-supplied axis name.
    ///
    /// `"row"` maps to `Axis::Row` and `"col"` to `Axis::Column`. Any other
    /// string yields a `ShellError` whose primary and secondary labels both
    /// point at `span`.
    pub fn try_from_str(axis: &str, span: &Span) -> Result<Axis, ShellError> {
        if axis == "row" {
            return Ok(Axis::Row);
        }
        if axis == "col" {
            return Ok(Axis::Column);
        }

        Err(ShellError::labeled_error_with_secondary(
            "Wrong axis",
            "The selected axis does not exist",
            span,
            "The only axis options are 'row' or 'col'",
            span,
        ))
    }
}
impl NuDataFrame {
pub fn append_df(
&self,
other: &NuDataFrame,
axis: Axis,
span: &Span,
) -> Result<Self, ShellError> {
match axis {
Axis::Row => {
let mut columns: Vec<&str> = Vec::new();
let new_cols = self
.as_ref()
.get_columns()
.iter()
.chain(other.as_ref().get_columns().iter())
.map(|s| {
let name = if columns.contains(&s.name()) {
format!("{}_{}", s.name(), "x")
} else {
columns.push(s.name());
s.name().to_string()
};
let mut series = s.clone();
series.rename(name.as_str());
series
})
.collect::<Vec<Series>>();
let df_new = DataFrame::new(new_cols).map_err(|e| {
ShellError::labeled_error("Appending error", format!("{}", e), span)
})?;
Ok(NuDataFrame::new(df_new))
}
Axis::Column => {
if self.as_ref().width() != other.as_ref().width() {
return Err(ShellError::labeled_error(
"Appending error",
"Dataframes with different number of columns",
span,
));
}
if !self
.as_ref()
.get_column_names()
.iter()
.all(|col| other.as_ref().get_column_names().contains(col))
{
return Err(ShellError::labeled_error(
"Appending error",
"Dataframes with different columns names",
span,
));
}
let new_cols = self
.as_ref()
.get_columns()
.iter()
.map(|s| {
let other_col = other
.as_ref()
.column(s.name())
.expect("Already checked that dataframes have same columns");
let mut tmp = s.clone();
let res = tmp.append(other_col);
match res {
Ok(s) => Ok(s.clone()),
Err(e) => Err({
ShellError::labeled_error(
"Appending error",
format!("Unable to append dataframes: {}", e),
span,
)
}),
}
})
.collect::<Result<Vec<Series>, ShellError>>()?;
let df_new = DataFrame::new(new_cols).map_err(|e| {
ShellError::labeled_error("Appending error", format!("{}", e), span)
})?;
Ok(NuDataFrame::new(df_new))
}
}
}
}