From baf86dfb0ed57aefb75962ea6cfc1552cb44c2c4 Mon Sep 17 00:00:00 2001 From: Darren Schroeder <343840+fdncred@users.noreply.github.com> Date: Sat, 14 Dec 2024 21:58:47 -0600 Subject: [PATCH] tweak polars join for better cross joins (#14586) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description closes #14585 This PR tries to make `polars join --cross` work better. Example taken from https://docs.pola.rs/user-guide/transformations/joins/#cartesian-product ### Before ```nushell ❯ let tokens = [[monopoly_token]; [hat] [shoe] [boat]] | polars into-df ❯ let players = [[name, cash]; [Alice, 78] [Bob, 135]] | polars into-df ❯ $players | polars into-lazy | polars select (polars col name) | polars join --cross $tokens | polars collect Error: nu::parser::missing_positional × Missing required positional argument. ╭─[entry #3:1:92] 1 │ $players | polars into-lazy | polars select (polars col name) | polars join --cross $tokens ╰──── help: Usage: polars join {flags} <other> <left_on> <right_on> . Use `--help` for more information. 
``` ### After ```nushell ❯ let players = [[name, cash]; [Alice, 78] [Bob, 135]] | polars into-df ❯ let tokens = [[monopoly_token]; [hat] [shoe] [boat]] | polars into-df ❯ $players | polars into-lazy | polars select (polars col name) | polars join --cross $tokens | polars collect ╭─#─┬─name──┬─monopoly_token─╮ │ 0 │ Alice │ hat │ │ 1 │ Alice │ shoe │ │ 2 │ Alice │ boat │ │ 3 │ Bob │ hat │ │ 4 │ Bob │ shoe │ │ 5 │ Bob │ boat │ ╰─#─┴─name──┴─monopoly_token─╯ ``` Other examples ```nushell ❯ 1..3 | polars into-df | polars join --cross (4..6 | polars into-df) ╭─#─┬─0─┬─0_x─╮ │ 0 │ 1 │ 4 │ │ 1 │ 1 │ 5 │ │ 2 │ 1 │ 6 │ │ 3 │ 2 │ 4 │ │ 4 │ 2 │ 5 │ │ 5 │ 2 │ 6 │ │ 6 │ 3 │ 4 │ │ 7 │ 3 │ 5 │ │ 8 │ 3 │ 6 │ ╰─#─┴─0─┴─0_x─╯ ❯ 1..3 | each {|x| {x: $x}} | polars into-df | polars join --cross (4..6 | each {|y| {y: $y}} | polars into-df) x y ╭─#─┬─x─┬─y─╮ │ 0 │ 1 │ 4 │ │ 1 │ 1 │ 5 │ │ 2 │ 1 │ 6 │ │ 3 │ 2 │ 4 │ │ 4 │ 2 │ 5 │ │ 5 │ 2 │ 6 │ │ 6 │ 3 │ 4 │ │ 7 │ 3 │ 5 │ │ 8 │ 3 │ 6 │ ╰─#─┴─x─┴─y─╯ ``` /cc @ayax79 # User-Facing Changes # Tests + Formatting # After Submitting --- .../src/dataframe/command/data/join.rs | 96 +++++++++++++++---- 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/crates/nu_plugin_polars/src/dataframe/command/data/join.rs b/crates/nu_plugin_polars/src/dataframe/command/data/join.rs index ee18e17567..b1c13fef00 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/data/join.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/data/join.rs @@ -27,8 +27,8 @@ impl PluginCommand for LazyJoin { fn signature(&self) -> Signature { Signature::build(self.name()) .required("other", SyntaxShape::Any, "LazyFrame to join with") - .required("left_on", SyntaxShape::Any, "Left column(s) to join on") - .required("right_on", SyntaxShape::Any, "Right column(s) to join on") + .optional("left_on", SyntaxShape::Any, "Left column(s) to join on") + .optional("right_on", SyntaxShape::Any, "Right column(s) to join on") .switch( "inner", "inner joining between lazyframes 
(default)", @@ -54,8 +54,8 @@ impl PluginCommand for LazyJoin { vec![ Example { description: "Join two lazy dataframes", - example: r#"let df_a = ([[a b c];[1 "a" 0] [2 "b" 1] [1 "c" 2] [1 "c" 3]] | polars into-lazy); - let df_b = ([["foo" "bar" "ham"];[1 "a" "let"] [2 "c" "var"] [3 "c" "const"]] | polars into-lazy); + example: r#"let df_a = ([[a b c];[1 "a" 0] [2 "b" 1] [1 "c" 2] [1 "c" 3]] | polars into-lazy) + let df_b = ([["foo" "bar" "ham"];[1 "a" "let"] [2 "c" "var"] [3 "c" "const"]] | polars into-lazy) $df_a | polars join $df_b a foo | polars collect"#, result: Some( NuDataFrame::try_from_columns( @@ -114,8 +114,8 @@ impl PluginCommand for LazyJoin { }, Example { description: "Join one eager dataframe with a lazy dataframe", - example: r#"let df_a = ([[a b c];[1 "a" 0] [2 "b" 1] [1 "c" 2] [1 "c" 3]] | polars into-df); - let df_b = ([["foo" "bar" "ham"];[1 "a" "let"] [2 "c" "var"] [3 "c" "const"]] | polars into-lazy); + example: r#"let df_a = ([[a b c];[1 "a" 0] [2 "b" 1] [1 "c" 2] [1 "c" 3]] | polars into-df) + let df_b = ([["foo" "bar" "ham"];[1 "a" "let"] [2 "c" "var"] [3 "c" "const"]] | polars into-lazy) $df_a | polars join $df_b a foo"#, result: Some( NuDataFrame::try_from_columns( @@ -172,6 +172,43 @@ impl PluginCommand for LazyJoin { .into_value(Span::test_data()), ), }, + Example { + description: "Join one eager dataframe with another using a cross join", + example: r#"let tokens = [[monopoly_token]; [hat] [shoe] [boat]] | polars into-df + let players = [[name, cash]; [Alice, 78] [Bob, 135]] | polars into-df + $players | polars select (polars col name) | polars join --cross $tokens | polars collect"#, + result: Some( + NuDataFrame::try_from_columns( + vec![ + Column::new( + "name".to_string(), + vec![ + Value::test_string("Alice"), + Value::test_string("Alice"), + Value::test_string("Alice"), + Value::test_string("Bob"), + Value::test_string("Bob"), + Value::test_string("Bob"), + ], + ), + Column::new( + "monopoly_token".to_string(), + vec![ + 
Value::test_string("hat"), + Value::test_string("shoe"), + Value::test_string("boat"), + Value::test_string("hat"), + Value::test_string("shoe"), + Value::test_string("boat"), + ], + ), + ], + None, + ) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, ] } @@ -200,11 +237,21 @@ impl PluginCommand for LazyJoin { let other = NuLazyFrame::try_from_value_coerce(plugin, &other)?; let other = other.to_polars(); - let left_on: Value = call.req(1)?; - let left_on = NuExpression::extract_exprs(plugin, left_on)?; + let left_on_opt: Option<Value> = call.opt(1)?; + let left_on = match left_on_opt { + Some(left_on_value) if left || left_on_opt.is_some() => { + NuExpression::extract_exprs(plugin, left_on_value)? + } + _ => vec![], + }; - let right_on: Value = call.req(2)?; - let right_on = NuExpression::extract_exprs(plugin, right_on)?; + let right_on_opt: Option<Value> = call.opt(2)?; + let right_on = match right_on_opt { + Some(right_on_value) if full || right_on_opt.is_some() => { + NuExpression::extract_exprs(plugin, right_on_value)? + } + _ => vec![], + }; if left_on.len() != right_on.len() { let right_on: Value = call.req(2)?; @@ -232,16 +279,25 @@ impl PluginCommand for LazyJoin { let lazy = NuLazyFrame::try_from_value_coerce(plugin, &value)?; let from_eager = lazy.from_eager; let lazy = lazy.to_polars(); - - let lazy = lazy - .join_builder() - .with(other) - .left_on(left_on) - .right_on(right_on) - .how(how) - .force_parallel(true) - .suffix(suffix) - .finish(); + let lazy = if cross { + lazy.join_builder() + .with(other) + .left_on(vec![]) + .right_on(vec![]) + .how(how) + .force_parallel(true) + .suffix(suffix) + .finish() + } else { + lazy.join_builder() + .with(other) + .left_on(left_on) + .right_on(right_on) + .how(how) + .force_parallel(true) + .suffix(suffix) + .finish() + }; let lazy = NuLazyFrame::new(from_eager, lazy); lazy.to_pipeline_data(plugin, engine, call.head)