From 5f7082f0534b70884e33d3cab19dc82687487efe Mon Sep 17 00:00:00 2001 From: Bahex Date: Fri, 22 Nov 2024 00:58:31 +0300 Subject: [PATCH] truly flexible csv/tsv parsing (#14399) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fixes #14398 I will properly fill out this PR and fix any tests that might break when I have the time, this was a quick fix. # Description This PR makes `from csv` and `from tsv`, with the `--flexible` flag, stop dropping extra/unexpected columns. # User-Facing Changes `$text`'s contents ```csv value 1,aaa 2,bbb 3 4,ddd 5,eee,extra ``` Old behavior ```nushell > $text | from csv --flexible --noheaders ╭─#─┬─column0─╮ │ 0 │ value │ │ 1 │ 1 │ │ 2 │ 2 │ │ 3 │ 3 │ │ 4 │ 4 │ │ 5 │ 5 │ ╰─#─┴─column0─╯ ``` New behavior ```nushell > $text | from csv --flexible --noheaders ╭─#─┬─column0─┬─column1─┬─column2─╮ │ 0 │ value │ ❎ │ ❎ │ │ 1 │ 1 │ aaa │ ❎ │ │ 2 │ 2 │ bbb │ ❎ │ │ 3 │ 3 │ ❎ │ ❎ │ │ 4 │ 4 │ ddd │ ❎ │ │ 5 │ 5 │ eee │ extra │ ╰─#─┴─column0─┴─column1─┴─column2─╯ ``` - The first line in a csv (or tsv) document no longer limits the number of columns - Missing values in columns are no longer automatically filled with `null` with this change, as a later row can introduce new columns. **BREAKING CHANGE** Because missing columns are different from empty columns, operations on possibly missing columns will have to use optional access syntax e.g. `get foo` => `get foo?` # Tests + Formatting Added examples that run as tests and adjusted existing tests to confirm the new behavior. 
# After Submitting Update the workaround with fish completer mentioned [here](https://www.nushell.sh/cookbook/external_completers.html#fish-completer) --- crates/nu-command/src/formats/from/csv.rs | 25 ++++++++++- .../nu-command/src/formats/from/delimited.rs | 43 ++++++++----------- crates/nu-command/src/formats/from/tsv.rs | 20 ++++++++- .../tests/format_conversions/csv.rs | 4 +- 4 files changed, 62 insertions(+), 30 deletions(-) diff --git a/crates/nu-command/src/formats/from/csv.rs b/crates/nu-command/src/formats/from/csv.rs index 472ccf4d71..bde84c2c73 100644 --- a/crates/nu-command/src/formats/from/csv.rs +++ b/crates/nu-command/src/formats/from/csv.rs @@ -11,7 +11,10 @@ impl Command for FromCsv { fn signature(&self) -> Signature { Signature::build("from csv") - .input_output_types(vec![(Type::String, Type::table())]) + .input_output_types(vec![ + (Type::String, Type::table()), + (Type::String, Type::list(Type::Any)), + ]) .named( "separator", SyntaxShape::String, @@ -82,6 +85,26 @@ impl Command for FromCsv { })], )) }, + Example { + description: "Convert comma-separated data to a table, allowing variable number of columns per row", + example: "\"ColA,ColB\n1,2\n3,4,5\n6\" | from csv --flexible", + result: Some(Value::test_list ( + vec![ + Value::test_record(record! { + "ColA" => Value::test_int(1), + "ColB" => Value::test_int(2), + }), + Value::test_record(record! { + "ColA" => Value::test_int(3), + "ColB" => Value::test_int(4), + "column2" => Value::test_int(5), + }), + Value::test_record(record! 
{ + "ColA" => Value::test_int(6), + }), + ], + )) + }, Example { description: "Convert comma-separated data to a table, ignoring headers", example: "open data.txt | from csv --noheaders", diff --git a/crates/nu-command/src/formats/from/delimited.rs b/crates/nu-command/src/formats/from/delimited.rs index 0fea7e082b..5dfdd4ad82 100644 --- a/crates/nu-command/src/formats/from/delimited.rs +++ b/crates/nu-command/src/formats/from/delimited.rs @@ -39,12 +39,7 @@ fn from_delimited_stream( .from_reader(input_reader); let headers = if noheaders { - (0..reader - .headers() - .map_err(|err| from_csv_error(err, span))? - .len()) - .map(|i| format!("column{i}")) - .collect::>() + vec![] } else { reader .headers() @@ -54,32 +49,28 @@ fn from_delimited_stream( .collect() }; + let n = headers.len(); + let columns = headers + .into_iter() + .chain((n..).map(|i| format!("column{i}"))); let iter = reader.into_records().map(move |row| { let row = match row { Ok(row) => row, Err(err) => return Value::error(from_csv_error(err, span), span), }; - let columns = headers.iter().cloned(); - let values = row - .into_iter() - .map(|s| { - if no_infer { - Value::string(s, span) - } else if let Ok(i) = s.parse() { - Value::int(i, span) - } else if let Ok(f) = s.parse() { - Value::float(f, span) - } else { - Value::string(s, span) - } - }) - .chain(std::iter::repeat(Value::nothing(span))); + let columns = columns.clone(); + let values = row.into_iter().map(|s| { + if no_infer { + Value::string(s, span) + } else if let Ok(i) = s.parse() { + Value::int(i, span) + } else if let Ok(f) = s.parse() { + Value::float(f, span) + } else { + Value::string(s, span) + } + }); - // If there are more values than the number of headers, - // then the remaining values are ignored. - // - // Otherwise, if there are less values than headers, - // then `Value::nothing(span)` is used to fill the remaining columns. 
Value::record(columns.zip(values).collect(), span) }); diff --git a/crates/nu-command/src/formats/from/tsv.rs b/crates/nu-command/src/formats/from/tsv.rs index cd3c9f97bd..09bee4803f 100644 --- a/crates/nu-command/src/formats/from/tsv.rs +++ b/crates/nu-command/src/formats/from/tsv.rs @@ -11,7 +11,10 @@ impl Command for FromTsv { fn signature(&self) -> Signature { Signature::build("from tsv") - .input_output_types(vec![(Type::String, Type::table())]) + .input_output_types(vec![ + (Type::String, Type::table()), + (Type::String, Type::list(Type::Any)), + ]) .named( "comment", SyntaxShape::String, @@ -76,6 +79,21 @@ impl Command for FromTsv { })], )) }, + Example { + description: "Convert comma-separated data to a table, allowing variable number of columns per row and ignoring headers", + example: "\"value 1\nvalue 2\tdescription 2\" | from tsv --flexible --noheaders", + result: Some(Value::test_list ( + vec![ + Value::test_record(record! { + "column0" => Value::test_string("value 1"), + }), + Value::test_record(record! 
{ + "column0" => Value::test_string("value 2"), + "column1" => Value::test_string("description 2"), + }), + ], + )) + }, Example { description: "Create a tsv file with header columns and open it", example: r#"$'c1(char tab)c2(char tab)c3(char nl)1(char tab)2(char tab)3' | save tsv-data | open tsv-data | from tsv"#, diff --git a/crates/nu-command/tests/format_conversions/csv.rs b/crates/nu-command/tests/format_conversions/csv.rs index f10a84b672..4f50ab1492 100644 --- a/crates/nu-command/tests/format_conversions/csv.rs +++ b/crates/nu-command/tests/format_conversions/csv.rs @@ -469,7 +469,7 @@ fn from_csv_test_flexible_extra_vals() { echo "a,b\n1,2,3" | from csv --flexible | first | values | to nuon "# )); - assert_eq!(actual.out, "[1, 2]"); + assert_eq!(actual.out, "[1, 2, 3]"); } #[test] @@ -479,5 +479,5 @@ fn from_csv_test_flexible_missing_vals() { echo "a,b\n1" | from csv --flexible | first | values | to nuon "# )); - assert_eq!(actual.out, "[1, null]"); + assert_eq!(actual.out, "[1]"); }