polars into-df struct fix (#13977)

# Description
This fixes an issue where `polars into-df` failed to convert input to a dataframe
when the schema specified a struct column. Commands like the following now work correctly:
```nushell
[[foo bar]; [{a: "a_0", b: "b_0"} 1] [{a: "a_1", b: "b_1"} 2]] | polars into-df -s {foo: {a: str, b: str}, bar: u8}
```
This commit is contained in:
Jack Wright 2024-10-02 03:59:14 -07:00 committed by GitHub
parent 573a7e2c7b
commit 1d6ac16530
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 58 additions and 19 deletions

View file

@ -6,6 +6,7 @@ use crate::{
use crate::values::NuDataFrame; use crate::values::NuDataFrame;
use log::debug;
use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand}; use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand};
use nu_protocol::{ use nu_protocol::{
Category, Example, LabeledError, PipelineData, Signature, Span, SyntaxShape, Type, Value, Category, Example, LabeledError, PipelineData, Signature, Span, SyntaxShape, Type, Value,
@ -159,7 +160,7 @@ impl PluginCommand for ToDataFrame {
}, },
Example { Example {
description: "Convert to a dataframe and provide a schema", description: "Convert to a dataframe and provide a schema",
example: "{a: 1, b: {a: [1 2 3]}, c: [a b c]}| polars into-df -s {a: u8, b: {a: list<u64>}, c: list<str>}", example: "[[a b c]; [1 {d: [1 2 3]} [10 11 12] ]]| polars into-df -s {a: u8, b: {d: list<u64>}, c: list<u8>}",
result: Some( result: Some(
NuDataFrame::try_from_series_vec(vec![ NuDataFrame::try_from_series_vec(vec![
Series::new("a", &[1u8]), Series::new("a", &[1u8]),
@ -172,7 +173,7 @@ impl PluginCommand for ToDataFrame {
}, },
{ {
let dtype = DataType::List(Box::new(DataType::String)); let dtype = DataType::List(Box::new(DataType::String));
let vals = vec![AnyValue::List(Series::new("c", &["a", "b", "c"]))]; let vals = vec![AnyValue::List(Series::new("c", &[10, 11, 12]))];
Series::from_any_values_and_dtype("c", &vals, &dtype, false) Series::from_any_values_and_dtype("c", &vals, &dtype, false)
.expect("List series should not fail") .expect("List series should not fail")
} }
@ -208,6 +209,8 @@ impl PluginCommand for ToDataFrame {
.map(|schema| NuSchema::try_from(&schema)) .map(|schema| NuSchema::try_from(&schema))
.transpose()?; .transpose()?;
debug!("schema: {:?}", maybe_schema);
let maybe_as_columns = call.has_flag("as-columns")?; let maybe_as_columns = call.has_flag("as-columns")?;
let df = if !maybe_as_columns { let df = if !maybe_as_columns {
@ -230,14 +233,22 @@ impl PluginCommand for ToDataFrame {
.collect::<Vec<Column>>(); .collect::<Vec<Column>>();
NuDataFrame::try_from_columns(columns, maybe_schema)? NuDataFrame::try_from_columns(columns, maybe_schema)?
} }
Err(_) => NuDataFrame::try_from_iter( Err(e) => {
debug!(
"Failed to build with multiple columns, attempting as series. failure:{e}"
);
NuDataFrame::try_from_iter(
plugin, plugin,
input.into_iter(), input.into_iter(),
maybe_schema.clone(), maybe_schema.clone(),
)?, )?
} }
} }
_ => NuDataFrame::try_from_iter(plugin, input.into_iter(), maybe_schema.clone())?, }
_ => {
debug!("Other input: {input:?}");
NuDataFrame::try_from_iter(plugin, input.into_iter(), maybe_schema.clone())?
}
} }
}; };

View file

@ -1,3 +1,4 @@
use std::collections::HashMap;
use std::ops::{Deref, DerefMut}; use std::ops::{Deref, DerefMut};
use chrono::{DateTime, Duration, FixedOffset, NaiveTime, TimeZone, Utc}; use chrono::{DateTime, Duration, FixedOffset, NaiveTime, TimeZone, Utc};
@ -487,17 +488,44 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
} }
DataType::Struct(fields) => { DataType::Struct(fields) => {
let schema = Some(NuSchema::new(Schema::from_iter(fields.clone()))); let schema = Some(NuSchema::new(Schema::from_iter(fields.clone())));
let mut structs: Vec<Series> = Vec::new(); // let mut structs: Vec<Series> = Vec::new();
let mut structs: HashMap<String, Series> = HashMap::new();
for v in column.values.iter() { for v in column.values.iter() {
let mut column_values: ColumnMap = IndexMap::new(); let mut column_values: ColumnMap = IndexMap::new();
let record = v.as_record()?; let record = v.as_record()?;
insert_record(&mut column_values, record.clone(), &schema)?; insert_record(&mut column_values, record.clone(), &schema)?;
let df = from_parsed_columns(column_values)?; let df = from_parsed_columns(column_values)?;
structs.push(df.as_series(Span::unknown())?); for name in df.df.get_column_names() {
let series = df.df.column(name).map_err(|e| ShellError::GenericError {
error: format!(
"Error creating struct, could not get column name {name}: {e}"
),
msg: "".into(),
span: None,
help: None,
inner: vec![],
})?;
if let Some(v) = structs.get_mut(name) {
let _ = v.append(series)
.map_err(|e| ShellError::GenericError {
error: format!("Error creating struct, could not append to series for col {name}: {e}"),
msg: "".into(),
span: None,
help: None,
inner: vec![],
})?;
} else {
structs.insert(name.to_string(), series.to_owned());
}
}
} }
let chunked = StructChunked::new(column.name(), structs.as_ref()).map_err(|e| { let structs: Vec<Series> = structs.into_values().collect();
let chunked =
StructChunked::new(column.name(), structs.as_slice()).map_err(|e| {
ShellError::GenericError { ShellError::GenericError {
error: format!("Error creating struct: {e}"), error: format!("Error creating struct: {e}"),
msg: "".into(), msg: "".into(),