tweak polars join for better cross joins (#14586)

# Description

closes #14585

This PR tries to make `polars join --cross` work better. Example taken
from
https://docs.pola.rs/user-guide/transformations/joins/#cartesian-product

### Before
```nushell
❯ let tokens = [[monopoly_token]; [hat] [shoe] [boat]] | polars into-df
❯ let players = [[name, cash]; [Alice, 78] [Bob, 135]] | polars into-df
❯ $players | polars into-lazy | polars select (polars col name) | polars join --cross $tokens | polars collect
Error: nu::parser::missing_positional

  × Missing required positional argument.
   ╭─[entry #3:1:92]
 1 │ $players | polars into-lazy | polars select (polars col name) | polars join --cross $tokens
   ╰────
  help: Usage: polars join {flags} <other> <left_on> <right_on> . Use `--help` for more information.
```
### After
```nushell
❯ let players = [[name, cash]; [Alice, 78] [Bob, 135]] | polars into-df
❯ let tokens = [[monopoly_token]; [hat] [shoe] [boat]] | polars into-df
❯ $players | polars into-lazy | polars select (polars col name) | polars join --cross $tokens | polars collect
╭─#─┬─name──┬─monopoly_token─╮
│ 0 │ Alice │ hat            │
│ 1 │ Alice │ shoe           │
│ 2 │ Alice │ boat           │
│ 3 │ Bob   │ hat            │
│ 4 │ Bob   │ shoe           │
│ 5 │ Bob   │ boat           │
╰─#─┴─name──┴─monopoly_token─╯
```
Other examples
```nushell
❯ 1..3 | polars into-df | polars join --cross (4..6 | polars into-df)
╭─#─┬─0─┬─0_x─╮
│ 0 │ 1 │   4 │
│ 1 │ 1 │   5 │
│ 2 │ 1 │   6 │
│ 3 │ 2 │   4 │
│ 4 │ 2 │   5 │
│ 5 │ 2 │   6 │
│ 6 │ 3 │   4 │
│ 7 │ 3 │   5 │
│ 8 │ 3 │   6 │
╰─#─┴─0─┴─0_x─╯
❯ 1..3 | each {|x| {x: $x}} | polars into-df | polars join --cross (4..6 | each {|y| {y: $y}} | polars into-df) x y
╭─#─┬─x─┬─y─╮
│ 0 │ 1 │ 4 │
│ 1 │ 1 │ 5 │
│ 2 │ 1 │ 6 │
│ 3 │ 2 │ 4 │
│ 4 │ 2 │ 5 │
│ 5 │ 2 │ 6 │
│ 6 │ 3 │ 4 │
│ 7 │ 3 │ 5 │
│ 8 │ 3 │ 6 │
╰─#─┴─x─┴─y─╯
```
/cc @ayax79 
# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the
tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
This commit is contained in:
Darren Schroeder 2024-12-14 21:58:47 -06:00 committed by GitHub
parent 219b44a04f
commit baf86dfb0e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -27,8 +27,8 @@ impl PluginCommand for LazyJoin {
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("other", SyntaxShape::Any, "LazyFrame to join with")
.required("left_on", SyntaxShape::Any, "Left column(s) to join on")
.required("right_on", SyntaxShape::Any, "Right column(s) to join on")
.optional("left_on", SyntaxShape::Any, "Left column(s) to join on")
.optional("right_on", SyntaxShape::Any, "Right column(s) to join on")
.switch(
"inner",
"inner joining between lazyframes (default)",
@ -54,8 +54,8 @@ impl PluginCommand for LazyJoin {
vec![
Example {
description: "Join two lazy dataframes",
example: r#"let df_a = ([[a b c];[1 "a" 0] [2 "b" 1] [1 "c" 2] [1 "c" 3]] | polars into-lazy);
let df_b = ([["foo" "bar" "ham"];[1 "a" "let"] [2 "c" "var"] [3 "c" "const"]] | polars into-lazy);
example: r#"let df_a = ([[a b c];[1 "a" 0] [2 "b" 1] [1 "c" 2] [1 "c" 3]] | polars into-lazy)
let df_b = ([["foo" "bar" "ham"];[1 "a" "let"] [2 "c" "var"] [3 "c" "const"]] | polars into-lazy)
$df_a | polars join $df_b a foo | polars collect"#,
result: Some(
NuDataFrame::try_from_columns(
@ -114,8 +114,8 @@ impl PluginCommand for LazyJoin {
},
Example {
description: "Join one eager dataframe with a lazy dataframe",
example: r#"let df_a = ([[a b c];[1 "a" 0] [2 "b" 1] [1 "c" 2] [1 "c" 3]] | polars into-df);
let df_b = ([["foo" "bar" "ham"];[1 "a" "let"] [2 "c" "var"] [3 "c" "const"]] | polars into-lazy);
example: r#"let df_a = ([[a b c];[1 "a" 0] [2 "b" 1] [1 "c" 2] [1 "c" 3]] | polars into-df)
let df_b = ([["foo" "bar" "ham"];[1 "a" "let"] [2 "c" "var"] [3 "c" "const"]] | polars into-lazy)
$df_a | polars join $df_b a foo"#,
result: Some(
NuDataFrame::try_from_columns(
@ -172,6 +172,43 @@ impl PluginCommand for LazyJoin {
.into_value(Span::test_data()),
),
},
Example {
description: "Join one eager dataframe with another using a cross join",
example: r#"let tokens = [[monopoly_token]; [hat] [shoe] [boat]] | polars into-df
let players = [[name, cash]; [Alice, 78] [Bob, 135]] | polars into-df
$players | polars select (polars col name) | polars join --cross $tokens | polars collect"#,
result: Some(
NuDataFrame::try_from_columns(
vec![
Column::new(
"name".to_string(),
vec![
Value::test_string("Alice"),
Value::test_string("Alice"),
Value::test_string("Alice"),
Value::test_string("Bob"),
Value::test_string("Bob"),
Value::test_string("Bob"),
],
),
Column::new(
"monopoly_token".to_string(),
vec![
Value::test_string("hat"),
Value::test_string("shoe"),
Value::test_string("boat"),
Value::test_string("hat"),
Value::test_string("shoe"),
Value::test_string("boat"),
],
),
],
None,
)
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
@ -200,11 +237,21 @@ impl PluginCommand for LazyJoin {
let other = NuLazyFrame::try_from_value_coerce(plugin, &other)?;
let other = other.to_polars();
let left_on: Value = call.req(1)?;
let left_on = NuExpression::extract_exprs(plugin, left_on)?;
let left_on_opt: Option<Value> = call.opt(1)?;
let left_on = match left_on_opt {
Some(left_on_value) if left || left_on_opt.is_some() => {
NuExpression::extract_exprs(plugin, left_on_value)?
}
_ => vec![],
};
let right_on: Value = call.req(2)?;
let right_on = NuExpression::extract_exprs(plugin, right_on)?;
let right_on_opt: Option<Value> = call.opt(2)?;
let right_on = match right_on_opt {
Some(right_on_value) if full || right_on_opt.is_some() => {
NuExpression::extract_exprs(plugin, right_on_value)?
}
_ => vec![],
};
if left_on.len() != right_on.len() {
let right_on: Value = call.req(2)?;
@ -232,16 +279,25 @@ impl PluginCommand for LazyJoin {
let lazy = NuLazyFrame::try_from_value_coerce(plugin, &value)?;
let from_eager = lazy.from_eager;
let lazy = lazy.to_polars();
let lazy = lazy
.join_builder()
.with(other)
.left_on(left_on)
.right_on(right_on)
.how(how)
.force_parallel(true)
.suffix(suffix)
.finish();
let lazy = if cross {
lazy.join_builder()
.with(other)
.left_on(vec![])
.right_on(vec![])
.how(how)
.force_parallel(true)
.suffix(suffix)
.finish()
} else {
lazy.join_builder()
.with(other)
.left_on(left_on)
.right_on(right_on)
.how(how)
.force_parallel(true)
.suffix(suffix)
.finish()
};
let lazy = NuLazyFrame::new(from_eager, lazy);
lazy.to_pipeline_data(plugin, engine, call.head)