truly flexible csv/tsv parsing (#14399)

- fixes #14398

I will properly fill out this PR and fix any tests that might break when
I have the time, this was a quick fix.

# Description

This PR makes `from csv` and `from tsv`, with the `--flexible` flag,
stop dropping extra/unexpected columns.

# User-Facing Changes

`$text`'s contents
```csv
value
1,aaa
2,bbb
3
4,ddd
5,eee,extra
```

Old behavior
```nushell
> $text | from csv --flexible --noheaders 
╭─#─┬─column0─╮
│ 0 │ value   │
│ 1 │       1 │
│ 2 │       2 │
│ 3 │       3 │
│ 4 │       4 │
│ 5 │       5 │
╰─#─┴─column0─╯
```

New behavior
```nushell
> $text | from csv --flexible --noheaders 
╭─#─┬─column0─┬─column1─┬─column2─╮
│ 0 │ value   │       │
│ 1 │       1 │ aaa     │       │
│ 2 │       2 │ bbb     │       │
│ 3 │       3 │       │
│ 4 │       4 │ ddd     │       │
│ 5 │       5 │ eee     │ extra   │
╰─#─┴─column0─┴─column1─┴─column2─╯
```

- The first line in a csv (or tsv) document no longer limits the number
of columns
- Missing values in columns are longer automatically filled with `null`
with this change, as a later row can introduce new columns. **BREAKING
CHANGE**

Because missing columns are different from empty columns, operations on
possibly missing columns will have to use optional access syntax e.g.
`get foo` => `get foo?`
  
# Tests + Formatting
Added examples that run as tests and adjusted existing tests to confirm
the new behavior.

# After Submitting

Update the workaround with fish completer mentioned
[here](https://www.nushell.sh/cookbook/external_completers.html#fish-completer)
This commit is contained in:
Bahex 2024-11-22 00:58:31 +03:00 committed by GitHub
parent 2a90cb7355
commit 5f7082f053
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 62 additions and 30 deletions

View file

@ -11,7 +11,10 @@ impl Command for FromCsv {
fn signature(&self) -> Signature {
Signature::build("from csv")
.input_output_types(vec![(Type::String, Type::table())])
.input_output_types(vec![
(Type::String, Type::table()),
(Type::String, Type::list(Type::Any)),
])
.named(
"separator",
SyntaxShape::String,
@ -82,6 +85,26 @@ impl Command for FromCsv {
})],
))
},
Example {
description: "Convert comma-separated data to a table, allowing variable number of columns per row",
example: "\"ColA,ColB\n1,2\n3,4,5\n6\" | from csv --flexible",
result: Some(Value::test_list (
vec![
Value::test_record(record! {
"ColA" => Value::test_int(1),
"ColB" => Value::test_int(2),
}),
Value::test_record(record! {
"ColA" => Value::test_int(3),
"ColB" => Value::test_int(4),
"column2" => Value::test_int(5),
}),
Value::test_record(record! {
"ColA" => Value::test_int(6),
}),
],
))
},
Example {
description: "Convert comma-separated data to a table, ignoring headers",
example: "open data.txt | from csv --noheaders",

View file

@ -39,12 +39,7 @@ fn from_delimited_stream(
.from_reader(input_reader);
let headers = if noheaders {
(0..reader
.headers()
.map_err(|err| from_csv_error(err, span))?
.len())
.map(|i| format!("column{i}"))
.collect::<Vec<String>>()
vec![]
} else {
reader
.headers()
@ -54,32 +49,28 @@ fn from_delimited_stream(
.collect()
};
let n = headers.len();
let columns = headers
.into_iter()
.chain((n..).map(|i| format!("column{i}")));
let iter = reader.into_records().map(move |row| {
let row = match row {
Ok(row) => row,
Err(err) => return Value::error(from_csv_error(err, span), span),
};
let columns = headers.iter().cloned();
let values = row
.into_iter()
.map(|s| {
if no_infer {
Value::string(s, span)
} else if let Ok(i) = s.parse() {
Value::int(i, span)
} else if let Ok(f) = s.parse() {
Value::float(f, span)
} else {
Value::string(s, span)
}
})
.chain(std::iter::repeat(Value::nothing(span)));
let columns = columns.clone();
let values = row.into_iter().map(|s| {
if no_infer {
Value::string(s, span)
} else if let Ok(i) = s.parse() {
Value::int(i, span)
} else if let Ok(f) = s.parse() {
Value::float(f, span)
} else {
Value::string(s, span)
}
});
// If there are more values than the number of headers,
// then the remaining values are ignored.
//
// Otherwise, if there are less values than headers,
// then `Value::nothing(span)` is used to fill the remaining columns.
Value::record(columns.zip(values).collect(), span)
});

View file

@ -11,7 +11,10 @@ impl Command for FromTsv {
fn signature(&self) -> Signature {
Signature::build("from tsv")
.input_output_types(vec![(Type::String, Type::table())])
.input_output_types(vec![
(Type::String, Type::table()),
(Type::String, Type::list(Type::Any)),
])
.named(
"comment",
SyntaxShape::String,
@ -76,6 +79,21 @@ impl Command for FromTsv {
})],
))
},
Example {
description: "Convert comma-separated data to a table, allowing variable number of columns per row and ignoring headers",
example: "\"value 1\nvalue 2\tdescription 2\" | from tsv --flexible --noheaders",
result: Some(Value::test_list (
vec![
Value::test_record(record! {
"column0" => Value::test_string("value 1"),
}),
Value::test_record(record! {
"column0" => Value::test_string("value 2"),
"column1" => Value::test_string("description 2"),
}),
],
))
},
Example {
description: "Create a tsv file with header columns and open it",
example: r#"$'c1(char tab)c2(char tab)c3(char nl)1(char tab)2(char tab)3' | save tsv-data | open tsv-data | from tsv"#,

View file

@ -469,7 +469,7 @@ fn from_csv_test_flexible_extra_vals() {
echo "a,b\n1,2,3" | from csv --flexible | first | values | to nuon
"#
));
assert_eq!(actual.out, "[1, 2]");
assert_eq!(actual.out, "[1, 2, 3]");
}
#[test]
@ -479,5 +479,5 @@ fn from_csv_test_flexible_missing_vals() {
echo "a,b\n1" | from csv --flexible | first | values | to nuon
"#
));
assert_eq!(actual.out, "[1, null]");
assert_eq!(actual.out, "[1]");
}