diff --git a/src/commands.rs b/src/commands.rs index ee70534640..c238b451d8 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,6 +1,8 @@ #[macro_use] pub(crate) mod macros; +mod from_structured_data; + pub(crate) mod append; pub(crate) mod args; pub(crate) mod autoview; diff --git a/src/commands/from_csv.rs b/src/commands/from_csv.rs index 9483fed521..4bada42dfb 100644 --- a/src/commands/from_csv.rs +++ b/src/commands/from_csv.rs @@ -1,7 +1,7 @@ +use crate::commands::from_structured_data::from_structured_data; use crate::commands::WholeStreamCommand; -use crate::data::{Primitive, TaggedDictBuilder, Value}; +use crate::data::{Primitive, Value}; use crate::prelude::*; -use csv::ReaderBuilder; pub struct FromCSV; @@ -27,7 +27,7 @@ impl WholeStreamCommand for FromCSV { } fn usage(&self) -> &str { - "Parse text as .csv and create table" + "Parse text as .csv and create table." } fn run( @@ -39,64 +39,13 @@ impl WholeStreamCommand for FromCSV { } } -pub fn from_csv_string_to_value( - s: String, - headerless: bool, - separator: char, - tag: impl Into, -) -> Result, csv::Error> { - let mut reader = ReaderBuilder::new() - .has_headers(false) - .delimiter(separator as u8) - .from_reader(s.as_bytes()); - let tag = tag.into(); - - let mut fields: VecDeque = VecDeque::new(); - let mut iter = reader.records(); - let mut rows = vec![]; - - if let Some(result) = iter.next() { - let line = result?; - - for (idx, item) in line.iter().enumerate() { - if headerless { - fields.push_back(format!("Column{}", idx + 1)); - } else { - fields.push_back(item.to_string()); - } - } - } - - loop { - if let Some(row_values) = iter.next() { - let row_values = row_values?; - - let mut row = TaggedDictBuilder::new(tag.clone()); - - for (idx, entry) in row_values.iter().enumerate() { - row.insert_tagged( - fields.get(idx).unwrap(), - Value::Primitive(Primitive::String(String::from(entry))).tagged(&tag), - ); - } - - rows.push(row.into_tagged_value()); - } else { - break; - } - } - - 
Ok(Value::Table(rows).tagged(&tag)) -} - fn from_csv( FromCSVArgs { - headerless: skip_headers, + headerless, separator, }: FromCSVArgs, - RunnableContext { input, name, .. }: RunnableContext, + runnable_context: RunnableContext, ) -> Result { - let name_tag = name; let sep = match separator { Some(Tagged { item: Value::Primitive(Primitive::String(s)), @@ -116,51 +65,5 @@ fn from_csv( _ => ',', }; - let stream = async_stream! { - let values: Vec> = input.values.collect().await; - - let mut concat_string = String::new(); - let mut latest_tag: Option = None; - - for value in values { - let value_tag = value.tag(); - latest_tag = Some(value_tag.clone()); - match value.item { - Value::Primitive(Primitive::String(s)) => { - concat_string.push_str(&s); - concat_string.push_str("\n"); - } - _ => yield Err(ShellError::labeled_error_with_secondary( - "Expected a string from pipeline", - "requires string input", - name_tag.clone(), - "value originates from here", - value_tag.clone(), - )), - - } - } - - match from_csv_string_to_value(concat_string, skip_headers, sep, name_tag.clone()) { - Ok(x) => match x { - Tagged { item: Value::Table(list), .. 
} => { - for l in list { - yield ReturnSuccess::value(l); - } - } - x => yield ReturnSuccess::value(x), - }, - Err(_) => if let Some(last_tag) = latest_tag { - yield Err(ShellError::labeled_error_with_secondary( - "Could not parse as CSV", - "input cannot be parsed as CSV", - name_tag.clone(), - "value originates from here", - last_tag.clone(), - )) - } , - } - }; - - Ok(stream.to_output_stream()) + from_structured_data(headerless, sep, "CSV", runnable_context) } diff --git a/src/commands/from_ssv.rs b/src/commands/from_ssv.rs index 090bab508f..37bba215f1 100644 --- a/src/commands/from_ssv.rs +++ b/src/commands/from_ssv.rs @@ -45,6 +45,149 @@ impl WholeStreamCommand for FromSSV { } } +enum HeaderOptions<'a> { + WithHeaders(&'a str), + WithoutHeaders, +} + +fn parse_aligned_columns<'a>( + lines: impl Iterator, + headers: HeaderOptions, + separator: &str, +) -> Vec> { + fn construct<'a>( + lines: impl Iterator, + headers: Vec<(String, usize)>, + ) -> Vec> { + lines + .map(|l| { + headers + .iter() + .enumerate() + .map(|(i, (header_name, start_position))| { + let val = match headers.get(i + 1) { + Some((_, end)) => { + if *end < l.len() { + l.get(*start_position..*end) + } else { + l.get(*start_position..) 
+ } + } + None => l.get(*start_position..), + } + .unwrap_or("") + .trim() + .into(); + (header_name.clone(), val) + }) + .collect() + }) + .collect() + } + + let find_indices = |line: &str| { + let values = line + .split(&separator) + .map(str::trim) + .filter(|s| !s.is_empty()); + values + .fold( + (0, vec![]), + |(current_pos, mut indices), value| match line[current_pos..].find(value) { + None => (current_pos, indices), + Some(index) => { + let absolute_index = current_pos + index; + indices.push(absolute_index); + (absolute_index + value.len(), indices) + } + }, + ) + .1 + }; + + let parse_with_headers = |lines, headers_raw: &str| { + let indices = find_indices(headers_raw); + let headers = headers_raw + .split(&separator) + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(String::from) + .zip(indices); + + let columns = headers.collect::>(); + + construct(lines, columns) + }; + + let parse_without_headers = |ls: Vec<&str>| { + let mut indices = ls + .iter() + .flat_map(|s| find_indices(*s)) + .collect::>(); + + indices.sort(); + indices.dedup(); + + let headers: Vec<(String, usize)> = indices + .iter() + .enumerate() + .map(|(i, position)| (format!("Column{}", i + 1), *position)) + .collect(); + + construct(ls.iter().map(|s| s.to_owned()), headers) + }; + + match headers { + HeaderOptions::WithHeaders(headers_raw) => parse_with_headers(lines, headers_raw), + HeaderOptions::WithoutHeaders => parse_without_headers(lines.collect()), + } +} + +fn parse_separated_columns<'a>( + lines: impl Iterator, + headers: HeaderOptions, + separator: &str, +) -> Vec> { + fn collect<'a>( + headers: Vec, + rows: impl Iterator, + separator: &str, + ) -> Vec> { + rows.map(|r| { + headers + .iter() + .zip(r.split(separator).map(str::trim).filter(|s| !s.is_empty())) + .map(|(a, b)| (a.to_owned(), b.to_owned())) + .collect() + }) + .collect() + } + + let parse_with_headers = |lines, headers_raw: &str| { + let headers = headers_raw + .split(&separator) + .map(str::trim) + .map(|s| 
s.to_owned()) + .filter(|s| !s.is_empty()) + .collect(); + collect(headers, lines, separator) + }; + + let parse_without_headers = |ls: Vec<&str>| { + let num_columns = ls.iter().map(|r| r.len()).max().unwrap_or(0); + + let headers = (1..=num_columns) + .map(|i| format!("Column{}", i)) + .collect::>(); + collect(headers, ls.iter().map(|s| s.as_ref()), separator) + }; + + match headers { + HeaderOptions::WithHeaders(headers_raw) => parse_with_headers(lines, headers_raw), + HeaderOptions::WithoutHeaders => parse_without_headers(lines.collect()), + } +} + fn string_to_table( s: &str, headerless: bool, @@ -54,76 +197,23 @@ fn string_to_table( let mut lines = s.lines().filter(|l| !l.trim().is_empty()); let separator = " ".repeat(std::cmp::max(split_at, 1)); - if aligned_columns { - let headers_raw = lines.next()?; - - let headers = headers_raw - .trim() - .split(&separator) - .map(str::trim) - .filter(|s| !s.is_empty()) - .map(|s| (headers_raw.find(s).unwrap(), s.to_owned())); - - let columns = if headerless { - headers - .enumerate() - .map(|(header_no, (string_index, _))| { - (string_index, format!("Column{}", header_no + 1)) - }) - .collect::>() - } else { - headers.collect::>() - }; - - Some( - lines - .map(|l| { - columns - .iter() - .enumerate() - .filter_map(|(i, (start, col))| { - (match columns.get(i + 1) { - Some((end, _)) => l.get(*start..*end), - None => l.get(*start..), - }) - .and_then(|s| Some((col.clone(), String::from(s.trim())))) - }) - .collect() - }) - .collect(), - ) + let (ls, header_options) = if headerless { + (lines, HeaderOptions::WithoutHeaders) } else { - let headers = lines - .next()? 
- .split(&separator) - .map(|s| s.trim()) - .filter(|s| !s.is_empty()) - .map(|s| s.to_owned()) - .collect::>(); + let headers = lines.next()?; + (lines, HeaderOptions::WithHeaders(headers)) + }; - let header_row = if headerless { - (1..=headers.len()) - .map(|i| format!("Column{}", i)) - .collect::>() - } else { - headers - }; + let f = if aligned_columns { + parse_aligned_columns + } else { + parse_separated_columns + }; - Some( - lines - .map(|l| { - header_row - .iter() - .zip( - l.split(&separator) - .map(|s| s.trim()) - .filter(|s| !s.is_empty()), - ) - .map(|(a, b)| (String::from(a), String::from(b))) - .collect() - }) - .collect(), - ) + let parsed = f(ls, header_options, &separator); + match parsed.len() { + 0 => None, + _ => Some(parsed), } } @@ -250,7 +340,7 @@ mod tests { } #[test] - fn it_ignores_headers_when_headerless() { + fn it_uses_first_row_as_data_when_headerless() { let input = r#" a b 1 2 @@ -260,6 +350,7 @@ mod tests { assert_eq!( result, Some(vec![ + vec![owned("Column1", "a"), owned("Column2", "b")], vec![owned("Column1", "1"), owned("Column2", "2")], vec![owned("Column1", "3"), owned("Column2", "4")] ]) @@ -357,4 +448,57 @@ mod tests { ],] ) } + + #[test] + fn it_handles_empty_values_when_headerless_and_aligned_columns() { + let input = r#" + a multi-word value b d + 1 3-3 4 + last + "#; + + let result = string_to_table(input, true, true, 2).unwrap(); + assert_eq!( + result, + vec![ + vec![ + owned("Column1", "a multi-word value"), + owned("Column2", "b"), + owned("Column3", ""), + owned("Column4", "d"), + owned("Column5", "") + ], + vec![ + owned("Column1", "1"), + owned("Column2", ""), + owned("Column3", "3-3"), + owned("Column4", "4"), + owned("Column5", "") + ], + vec![ + owned("Column1", ""), + owned("Column2", ""), + owned("Column3", ""), + owned("Column4", ""), + owned("Column5", "last") + ], + ] + ) + } + + #[test] + fn input_is_parsed_correctly_if_either_option_works() { + let input = r#" + docker-registry docker-registry=default 
docker-registry=default 172.30.78.158 5000/TCP + kubernetes component=apiserver,provider=kubernetes 172.30.0.2 443/TCP + kubernetes-ro component=apiserver,provider=kubernetes 172.30.0.1 80/TCP + "#; + + let aligned_columns_headerless = string_to_table(input, true, true, 2).unwrap(); + let separator_headerless = string_to_table(input, true, false, 2).unwrap(); + let aligned_columns_with_headers = string_to_table(input, false, true, 2).unwrap(); + let separator_with_headers = string_to_table(input, false, false, 2).unwrap(); + assert_eq!(aligned_columns_headerless, separator_headerless); + assert_eq!(aligned_columns_with_headers, separator_with_headers); + } } diff --git a/src/commands/from_structured_data.rs b/src/commands/from_structured_data.rs new file mode 100644 index 0000000000..4799a40993 --- /dev/null +++ b/src/commands/from_structured_data.rs @@ -0,0 +1,97 @@ +use crate::data::{Primitive, TaggedDictBuilder, Value}; +use crate::prelude::*; +use csv::ReaderBuilder; + +fn from_structured_string_to_value( + s: String, + headerless: bool, + separator: char, + tag: impl Into, +) -> Result, csv::Error> { + let mut reader = ReaderBuilder::new() + .has_headers(!headerless) + .delimiter(separator as u8) + .from_reader(s.as_bytes()); + let tag = tag.into(); + + let headers = if headerless { + (1..=reader.headers()?.len()) + .map(|i| format!("Column{}", i)) + .collect::>() + } else { + reader.headers()?.iter().map(String::from).collect() + }; + + let mut rows = vec![]; + for row in reader.records() { + let mut tagged_row = TaggedDictBuilder::new(&tag); + for (value, header) in row?.iter().zip(headers.iter()) { + tagged_row.insert_tagged( + header, + Value::Primitive(Primitive::String(String::from(value))).tagged(&tag), + ) + } + rows.push(tagged_row.into_tagged_value()); + } + + Ok(Value::Table(rows).tagged(&tag)) +} + +pub fn from_structured_data( + headerless: bool, + sep: char, + format_name: &'static str, + RunnableContext { input, name, ..
}: RunnableContext, +) -> Result { + let name_tag = name; + + let stream = async_stream! { + let values: Vec> = input.values.collect().await; + + let mut concat_string = String::new(); + let mut latest_tag: Option = None; + + for value in values { + let value_tag = value.tag(); + latest_tag = Some(value_tag.clone()); + match value.item { + Value::Primitive(Primitive::String(s)) => { + concat_string.push_str(&s); + concat_string.push_str("\n"); + } + _ => yield Err(ShellError::labeled_error_with_secondary( + "Expected a string from pipeline", + "requires string input", + name_tag.clone(), + "value originates from here", + value_tag.clone(), + )), + + } + } + + match from_structured_string_to_value(concat_string, headerless, sep, name_tag.clone()) { + Ok(x) => match x { + Tagged { item: Value::Table(list), .. } => { + for l in list { + yield ReturnSuccess::value(l); + } + } + x => yield ReturnSuccess::value(x), + }, + Err(_) => if let Some(last_tag) = latest_tag { + let line_one = format!("Could not parse as {}", format_name); + let line_two = format!("input cannot be parsed as {}", format_name); + yield Err(ShellError::labeled_error_with_secondary( + line_one, + line_two, + name_tag.clone(), + "value originates from here", + last_tag.clone(), + )) + } , + } + }; + + Ok(stream.to_output_stream()) +} diff --git a/src/commands/from_tsv.rs b/src/commands/from_tsv.rs index 2284e95573..7931b8ef38 100644 --- a/src/commands/from_tsv.rs +++ b/src/commands/from_tsv.rs @@ -1,7 +1,6 @@ +use crate::commands::from_structured_data::from_structured_data; use crate::commands::WholeStreamCommand; -use crate::data::{Primitive, TaggedDictBuilder, Value}; use crate::prelude::*; -use csv::ReaderBuilder; pub struct FromTSV; @@ -33,108 +32,9 @@ impl WholeStreamCommand for FromTSV { } } -pub fn from_tsv_string_to_value( - s: String, - headerless: bool, - tag: impl Into, -) -> Result, csv::Error> { - let mut reader = ReaderBuilder::new() - .has_headers(false) - .delimiter(b'\t') - 
.from_reader(s.as_bytes()); - let tag = tag.into(); - - let mut fields: VecDeque = VecDeque::new(); - let mut iter = reader.records(); - let mut rows = vec![]; - - if let Some(result) = iter.next() { - let line = result?; - - for (idx, item) in line.iter().enumerate() { - if headerless { - fields.push_back(format!("Column{}", idx + 1)); - } else { - fields.push_back(item.to_string()); - } - } - } - - loop { - if let Some(row_values) = iter.next() { - let row_values = row_values?; - - let mut row = TaggedDictBuilder::new(&tag); - - for (idx, entry) in row_values.iter().enumerate() { - row.insert_tagged( - fields.get(idx).unwrap(), - Value::Primitive(Primitive::String(String::from(entry))).tagged(&tag), - ); - } - - rows.push(row.into_tagged_value()); - } else { - break; - } - } - - Ok(Value::Table(rows).tagged(&tag)) -} - fn from_tsv( - FromTSVArgs { - headerless: skip_headers, - }: FromTSVArgs, - RunnableContext { input, name, .. }: RunnableContext, + FromTSVArgs { headerless }: FromTSVArgs, + runnable_context: RunnableContext, ) -> Result { - let name_tag = name; - - let stream = async_stream! { - let values: Vec> = input.values.collect().await; - - let mut concat_string = String::new(); - let mut latest_tag: Option = None; - - for value in values { - let value_tag = value.tag(); - latest_tag = Some(value_tag.clone()); - match value.item { - Value::Primitive(Primitive::String(s)) => { - concat_string.push_str(&s); - concat_string.push_str("\n"); - } - _ => yield Err(ShellError::labeled_error_with_secondary( - "Expected a string from pipeline", - "requires string input", - &name_tag, - "value originates from here", - &value_tag, - )), - - } - } - - match from_tsv_string_to_value(concat_string, skip_headers, name_tag.clone()) { - Ok(x) => match x { - Tagged { item: Value::Table(list), .. 
} => { - for l in list { - yield ReturnSuccess::value(l); - } - } - x => yield ReturnSuccess::value(x), - }, - Err(_) => if let Some(last_tag) = latest_tag { - yield Err(ShellError::labeled_error_with_secondary( - "Could not parse as TSV", - "input cannot be parsed as TSV", - &name_tag, - "value originates from here", - &last_tag, - )) - } , - } - }; - - Ok(stream.to_output_stream()) + from_structured_data(headerless, '\t', "TSV", runnable_context) } diff --git a/tests/filters_test.rs b/tests/filters_test.rs index e410e99e65..e18f20be67 100644 --- a/tests/filters_test.rs +++ b/tests/filters_test.rs @@ -135,7 +135,6 @@ fn converts_from_csv_text_skipping_headers_to_structured_table() { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_amigos.txt", r#" - first_name,last_name,rusty_luck Andrés,Robalino,1 Jonathan,Turner,1 Yehuda,Katz,1 @@ -361,7 +360,6 @@ fn converts_from_tsv_text_skipping_headers_to_structured_table() { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_amigos.txt", r#" - first Name Last Name rusty_luck Andrés Robalino 1 Jonathan Turner 1 Yehuda Katz 1 @@ -441,30 +439,41 @@ fn converts_from_ssv_text_to_structured_table_with_separator_specified() { } #[test] -fn converts_from_ssv_text_skipping_headers_to_structured_table() { +fn converts_from_ssv_text_treating_first_line_as_data_with_flag() { Playground::setup("filter_from_ssv_test_2", |dirs, sandbox| { sandbox.with_files(vec![FileWithContentToBeTrimmed( "oc_get_svc.txt", r#" - NAME LABELS SELECTOR IP PORT(S) docker-registry docker-registry=default docker-registry=default 172.30.78.158 5000/TCP kubernetes component=apiserver,provider=kubernetes 172.30.0.2 443/TCP kubernetes-ro component=apiserver,provider=kubernetes 172.30.0.1 80/TCP "#, )]); - let actual = nu!( - cwd: dirs.test(), h::pipeline( + let aligned_columns = nu!( + cwd: dirs.test(), h::pipeline( r#" open oc_get_svc.txt - | from-ssv --headerless - | nth 2 - | get Column2 + | from-ssv --headerless --aligned-columns + | 
first + | get Column1 | echo $it "# )); - assert_eq!(actual, "component=apiserver,provider=kubernetes"); + let separator_based = nu!( + cwd: dirs.test(), h::pipeline( + r#" + open oc_get_svc.txt + | from-ssv --headerless + | first + | get Column1 + | echo $it + "# + )); + + assert_eq!(aligned_columns, separator_based); + assert_eq!(separator_based, "docker-registry"); }) }