From bd0baa961c3ad0f7488b0a278fbee26186e94ab8 Mon Sep 17 00:00:00 2001 From: Darren Schroeder <343840+fdncred@users.noreply.github.com> Date: Thu, 16 Sep 2021 09:02:30 -0500 Subject: [PATCH] add table selector for downloading web tables (#4004) * add table selector for downloading web tables * type-o * updated debug mode to inspect mode --- Cargo.lock | 33 + crates/nu_plugin_selector/Cargo.toml | 2 + crates/nu_plugin_selector/src/lib.rs | 2 + crates/nu_plugin_selector/src/nu/mod.rs | 41 +- crates/nu_plugin_selector/src/selector.rs | 108 ++- crates/nu_plugin_selector/src/tables.rs | 975 ++++++++++++++++++++++ 6 files changed, 1146 insertions(+), 15 deletions(-) create mode 100644 crates/nu_plugin_selector/src/tables.rs diff --git a/Cargo.lock b/Cargo.lock index 427deb932c..c437a82733 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1068,6 +1068,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee2626afccd7561a06cf1367e2950c4718ea04565e20fb5029b6c7d8ad09abcf" +[[package]] +name = "ego-tree" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" + [[package]] name = "either" version = "1.6.1" @@ -1517,6 +1523,15 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.1.16" @@ -3105,12 +3120,14 @@ dependencies = [ name = "nu_plugin_selector" version = "0.37.1" dependencies = [ + "indexmap", "nipper", "nu-errors", "nu-plugin", "nu-protocol", "nu-source", "nu-test-support", + "scraper", ] [[package]] @@ -4510,6 +4527,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scraper" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e02aa790c80c2e494130dec6a522033b6a23603ffc06360e9fe6c611ea2c12" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "matches", + "selectors", + "smallvec", + "tendril", +] + [[package]] name = "security-framework" version = "2.3.1" diff --git a/crates/nu_plugin_selector/Cargo.toml b/crates/nu_plugin_selector/Cargo.toml index 13ea2f0573..fb410de9e2 100644 --- a/crates/nu_plugin_selector/Cargo.toml +++ b/crates/nu_plugin_selector/Cargo.toml @@ -11,6 +11,7 @@ doctest = false [dependencies] nipper = "0.1.9" +scraper = "0.12.0" nu-errors = { version = "0.37.1", path="../nu-errors" } nu-plugin = { version = "0.37.1", path="../nu-plugin" } nu-protocol = { version = "0.37.1", path="../nu-protocol" } @@ -18,3 +19,4 @@ nu-source = { version = "0.37.1", path="../nu-source" } [dev-dependencies] nu-test-support = { path="../nu-test-support", version = "0.37.1" } +indexmap = { version="1.7", features=["serde-1"] } diff --git a/crates/nu_plugin_selector/src/lib.rs b/crates/nu_plugin_selector/src/lib.rs index 4712609d61..79a9f72552 100644 --- a/crates/nu_plugin_selector/src/lib.rs +++ b/crates/nu_plugin_selector/src/lib.rs @@ -1,4 +1,6 @@ mod nu; mod selector; +mod tables; pub use selector::Selector; +pub use tables::Table; diff --git a/crates/nu_plugin_selector/src/nu/mod.rs b/crates/nu_plugin_selector/src/nu/mod.rs index a430817323..5e0e444c04 100644 --- a/crates/nu_plugin_selector/src/nu/mod.rs +++ b/crates/nu_plugin_selector/src/nu/mod.rs @@ -10,34 +10,55 @@ impl Plugin for Selector { fn config(&mut self) -> Result { Ok(Signature::build("selector") .desc("execute selector query on html/web") - .required("query", SyntaxShape::String, "selector query") - .switch("as_html", "return the query output as html", Some('a')) + .named("query", SyntaxShape::String, "selector query", Some('q')) + .switch("as_html", "return the query output as html", Some('m')) .named( "attribute", SyntaxShape::String, "downselect based on the given attribute", + Some('a'), + ) + .named( + "as_table", + SyntaxShape::Table, + "find table based on column header list", Some('t'), ) + .switch( + "inspect", + "run in inspect mode to provide more information for determining column headers", + Some('i'), + ) .filter()) } fn begin_filter(&mut self, call_info: CallInfo) -> Result, ShellError> { let tag = call_info.name_tag; - let query = call_info.args.nth(0).ok_or_else(|| { - ShellError::labeled_error( - "selector query not passed", - "selector query not passed", - &tag, - ) - })?; + // let query = call_info.args.nth(0).ok_or_else(|| { + // ShellError::labeled_error( + // "selector query not passed", + // "selector query not passed", + // &tag, + // ) + // })?; - self.query = query.as_string()?; + // self.query = query.as_string()?; + self.query = if let Some(qtext) = call_info.args.get("query") { + qtext.convert_to_string() + } else { + "".to_string() + }; self.tag = tag; self.as_html = call_info.args.has("as_html"); if call_info.args.has("attribute") { self.attribute = call_info.args.expect_get("attribute")?.convert_to_string(); } + if call_info.args.has("as_table") { + self.as_table = call_info.args.expect_get("as_table")?.clone(); + } + self.inspect = call_info.args.has("inspect"); + Ok(vec![]) } diff --git a/crates/nu_plugin_selector/src/selector.rs b/crates/nu_plugin_selector/src/selector.rs index f9bf8f1886..5503c7a000 100644 --- a/crates/nu_plugin_selector/src/selector.rs +++ b/crates/nu_plugin_selector/src/selector.rs @@ -1,5 +1,6 @@ +use crate::Table; use nipper::Document; -use nu_protocol::{value::StringExt, Value}; +use nu_protocol::{value::StringExt, Primitive, TaggedDictBuilder, UntaggedValue, Value}; use nu_source::Tag; pub struct Selector { @@ -7,6 +8,8 @@ pub struct Selector { pub tag: Tag, pub as_html: bool, pub attribute: String, + pub as_table: Value, + pub inspect: bool, } impl Selector { @@ -16,6 +19,11 @@ impl Selector { tag: Tag::unknown(), as_html: false, attribute: String::new(), + as_table: Value::new( + UntaggedValue::Primitive(Primitive::String("".to_string())), + Tag::unknown(), + ), + inspect: false, } } } @@ -27,14 +35,104 @@ impl Default for Selector { } pub fn begin_selector_query(input_html: String, selector: &Selector) -> Vec { - match selector.attribute.is_empty() { - true => execute_selector_query(&input_html, &selector.query, selector.as_html), - false => { - execute_selector_query_with_attribute(&input_html, &selector.query, &selector.attribute) + if selector.as_table.is_some() { + retrieve_tables(input_html.as_str(), &selector.as_table, selector.inspect) + } else { + match selector.attribute.is_empty() { + true => execute_selector_query( + input_html.as_str(), + selector.query.as_str(), + selector.as_html, + ), + false => execute_selector_query_with_attribute( + input_html.as_str(), + selector.query.as_str(), + selector.attribute.as_str(), + ), } } } +pub fn retrieve_tables(input_string: &str, columns: &Value, inspect_mode: bool) -> Vec { + let html = input_string; + let mut cols = Vec::new(); + if let UntaggedValue::Table(t) = &columns.value { + for x in t { + cols.push(x.convert_to_string()); + } + } + + if inspect_mode { + eprintln!("Passed in Column Headers = {:#?}", &cols,); + } + + let mut table = match Table::find_by_headers(html, &cols) { + Some(t) => { + if inspect_mode { + eprintln!("Table Found = {:#?}", &t); + } + t + } + None => Table::empty(), + }; + + let mut table_out = Vec::new(); + + // since cols was empty and headers is not, it means that headers were manually populated + // so let's fake the data in order to build a proper table. this situation happens when + // there are tables where the first column is actually the headers. kind of like a table + // that has been rotated ccw 90 degrees + if cols.is_empty() && !table.headers().is_empty() { + for col in table.headers().keys() { + cols.push(col.to_string()); + } + + let mut data2 = Vec::new(); + for x in &table.data { + data2.push(x.join(", ")); + } + // eprintln!("data2={:?}", data2); + table.data = vec![data2]; + } + + // if columns are still empty, let's just make a single column table with the data + if cols.is_empty() { + let table_with_no_empties: Vec<_> = table.iter().filter(|item| !item.is_empty()).collect(); + + for row in &table_with_no_empties { + let mut dict = TaggedDictBuilder::new(Tag::unknown()); + for (counter, cell) in row.iter().enumerate() { + let col_name = format!("Column{}", counter); + dict.insert_value( + col_name, + UntaggedValue::Primitive(Primitive::String(cell.to_string())) + .into_value(Tag::unknown()), + ); + } + table_out.push(dict.into_value()); + } + } else { + for row in &table { + let mut dict = TaggedDictBuilder::new(Tag::unknown()); + // eprintln!("row={:?}", &row); + for col in &cols { + // eprintln!("col={:?}", &col); + let key = col.to_string(); + let val = row + .get(col) + .unwrap_or(&format!("Missing column: '{}'", &col)) + .to_string(); + dict.insert_value( + key, + UntaggedValue::Primitive(Primitive::String(val)).into_value(Tag::unknown()), + ); + } + table_out.push(dict.into_value()); + } + } + table_out +} + fn execute_selector_query_with_attribute( input_string: &str, query_string: &str, diff --git a/crates/nu_plugin_selector/src/tables.rs b/crates/nu_plugin_selector/src/tables.rs new file mode 100644 index 0000000000..3efd59cb0f --- /dev/null +++ b/crates/nu_plugin_selector/src/tables.rs @@ -0,0 +1,975 @@ +use scraper::{element_ref::ElementRef, Html, Selector as ScraperSelector}; +use std::collections::HashMap; + +// Borrowed from here +// https://github.com/mk12/table-extract/blob/master/src/lib.rs +pub type Headers = HashMap; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Table { + headers: Headers, + pub data: Vec>, +} + +impl Table { + /// Finds the first table in `html`. + pub fn find_first(html: &str) -> Option { + let html = Html::parse_fragment(html); + html.select(&css("table")).next().map(Table::new) + } + + pub fn find_all_tables(html: &str) -> Vec
{ + let html = Html::parse_fragment(html); + html.select(&css("table")).map(Table::new).collect() + } + + /// Finds the table in `html` with an id of `id`. + pub fn find_by_id(html: &str, id: &str) -> Option
{ + let html = Html::parse_fragment(html); + let selector = format!("table#{}", id); + ScraperSelector::parse(&selector) + .ok() + .as_ref() + .map(|s| html.select(s)) + .and_then(|mut s| s.next()) + .map(Table::new) + } + + /// Finds the table in `html` whose first row contains all of the headers + /// specified in `headers`. The order does not matter. + /// + /// If `headers` is empty, this is the same as + /// [`find_first`](#method.find_first). + pub fn find_by_headers(html: &str, headers: &[T]) -> Option
+ where + T: AsRef, + { + if headers.is_empty() { + return Table::find_first(html); + } + + let sel_table = css("table"); + let sel_tr = css("tr"); + let sel_th = css("th"); + + let html = Html::parse_fragment(html); + html.select(&sel_table) + .find(|table| { + table.select(&sel_tr).next().map_or(false, |tr| { + let cells = select_cells(tr, &sel_th, true); + headers.iter().all(|h| contains_str(&cells, h.as_ref())) + }) + }) + .map(Table::new) + } + + /// Returns the headers of the table. + /// + /// This will be empty if the table had no `
` tags in its first row. See + /// [`Headers`](type.Headers.html) for more. + pub fn headers(&self) -> &Headers { + &self.headers + } + + /// Returns an iterator over the [`Row`](struct.Row.html)s of the table. + /// + /// Only `` cells are considered when generating rows. If the first row + /// of the table is a header row, meaning it contains at least one `` + /// cell, the iterator will start on the second row. Use + /// [`headers`](#method.headers) to access the header row in that case. + pub fn iter(&self) -> Iter { + Iter { + headers: &self.headers, + iter: self.data.iter(), + } + } + + pub fn empty() -> Table { + Table { + headers: HashMap::new(), + data: vec![vec!["".to_string()]], + } + } + + // fn new(element: ElementRef) -> Table { + // let sel_tr = css("tr"); + // let sel_th = css("th"); + // let sel_td = css("td"); + + // let mut headers = HashMap::new(); + // let mut rows = element.select(&sel_tr).peekable(); + // if let Some(tr) = rows.peek() { + // for (i, th) in tr.select(&sel_th).enumerate() { + // headers.insert(cell_content(th), i); + // } + // } + // if !headers.is_empty() { + // rows.next(); + // } + // let data = rows.map(|tr| select_cells(tr, &sel_td, true)).collect(); + // Table { headers, data } + // } + + fn new(element: ElementRef) -> Table { + let sel_tr = css("tr"); + let sel_th = css("th"); + let sel_td = css("td"); + + let mut headers = HashMap::new(); + let mut rows = element.select(&sel_tr).peekable(); + if let Some(tr) = rows.clone().peek() { + for (i, th) in tr.select(&sel_th).enumerate() { + headers.insert(cell_content(th), i); + } + } + if !headers.is_empty() { + rows.next(); + } + + if headers.is_empty() { + // try looking for data as headers i.e. they're row headers not column headers + for (i, d) in rows + .clone() + .map(|tr| select_cells(tr, &sel_th, true)) + .enumerate() + { + headers.insert(d.join(", "), i); + } + // check if headers are there but empty + let mut empty_headers = true; + for (h, _i) in headers.clone() { + if !h.is_empty() { + empty_headers = false; + break; + } + } + if empty_headers { + headers = HashMap::new(); + } + let data = rows.map(|tr| select_cells(tr, &sel_td, true)).collect(); + Table { headers, data } + } else { + let data = rows.map(|tr| select_cells(tr, &sel_td, true)).collect(); + Table { headers, data } + } + } +} + +impl<'a> IntoIterator for &'a Table { + type Item = Row<'a>; + type IntoIter = Iter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// An iterator over the rows in a [`Table`](struct.Table.html). +pub struct Iter<'a> { + headers: &'a Headers, + iter: std::slice::Iter<'a, Vec>, +} + +impl<'a> Iterator for Iter<'a> { + type Item = Row<'a>; + + fn next(&mut self) -> Option { + let headers = self.headers; + self.iter.next().map(|cells| Row { headers, cells }) + } +} + +/// A row in a [`Table`](struct.Table.html). +/// +/// A row consists of a number of data cells stored as strings. If the row +/// contains the same number of cells as the table's header row, its cells can +/// be safely accessed by header names using [`get`](#method.get). Otherwise, +/// the data should be accessed via [`as_slice`](#method.as_slice) or by +/// iterating over the row. +/// +/// This struct can be thought of as a lightweight reference into a table. As +/// such, it implements the `Copy` trait. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Row<'a> { + headers: &'a Headers, + cells: &'a [String], +} + +impl<'a> Row<'a> { + /// Returns the number of cells in the row. + pub fn len(&self) -> usize { + self.cells.len() + } + + /// Returns `true` if the row contains no cells. + pub fn is_empty(&self) -> bool { + self.cells.is_empty() + } + + /// Returns the cell underneath `header`. + /// + /// Returns `None` if there is no such header, or if there is no cell at + /// that position in the row. + pub fn get(&self, header: &str) -> Option<&'a str> { + // eprintln!( + // "header={}, headers={:?}, cells={:?}", + // &header, &self.headers, &self.cells + // ); + self.headers.get(header).and_then(|&i| { + // eprintln!("i={}", i); + self.cells.get(i).map(String::as_str) + }) + } + + pub fn get_header_at(&self, index: usize) -> Option<&'a str> { + let mut a_match = ""; + for (key, val) in self.headers { + if *val == index { + a_match = key; + break; + } + } + if a_match.is_empty() { + None + } else { + Some(a_match) + } + } + + /// Returns a slice containing all the cells. + pub fn as_slice(&self) -> &'a [String] { + self.cells + } + + /// Returns an iterator over the cells of the row. + pub fn iter(&self) -> std::slice::Iter { + self.cells.iter() + } +} + +impl<'a> IntoIterator for Row<'a> { + type Item = &'a String; + type IntoIter = std::slice::Iter<'a, String>; + + fn into_iter(self) -> Self::IntoIter { + self.cells.iter() + } +} + +fn css(selector: &'static str) -> ScraperSelector { + ScraperSelector::parse(selector).expect("Unable to parse selector with scraper") +} + +fn select_cells( + element: ElementRef, + selector: &ScraperSelector, + remove_html_tags: bool, +) -> Vec { + if remove_html_tags { + let scraped = element.select(selector).map(cell_content); + let mut dehtmlized: Vec = Vec::new(); + for item in scraped { + let frag = Html::parse_fragment(&item); + for node in frag.tree { + if let scraper::node::Node::Text(text) = node { + dehtmlized.push(text.text.to_string()); + } + } + } + dehtmlized + } else { + element.select(selector).map(cell_content).collect() + } +} + +fn cell_content(element: ElementRef) -> String { + // element.inner_html().trim().to_string() + let mut dehtmlize = String::new(); + let element = element.inner_html().trim().to_string(); + let frag = Html::parse_fragment(&element); + for node in frag.tree { + if let scraper::node::Node::Text(text) = node { + dehtmlize.push_str(&text.text.to_string()) + } + } + + // eprintln!("element={} dehtmlize={}", &element, &dehtmlize); + + if dehtmlize.is_empty() { + dehtmlize = element; + } + + dehtmlize +} + +fn contains_str(slice: &[String], item: &str) -> bool { + // slice.iter().any(|s| s == item) + + let mut dehtmlized = String::new(); + let frag = Html::parse_fragment(item); + for node in frag.tree { + if let scraper::node::Node::Text(text) = node { + dehtmlized.push_str(&text.text.to_string()); + } + } + + if dehtmlized.is_empty() { + dehtmlized = item.to_string(); + } + + slice.iter().any(|s| { + // eprintln!( + // "\ns={} item={} contains={}\n", + // &s, + // &dehtmlized, + // &dehtmlized.contains(s) + // ); + // s.starts_with(item) + dehtmlized.contains(s) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::selector::retrieve_tables; + use indexmap::indexmap; + use nu_protocol::UntaggedValue; + + const TABLE_EMPTY: &'static str = r#" +
+"#; + + const TABLE_TH: &'static str = r#" + + +
NameAge
+"#; + + const TABLE_TD: &'static str = r#" + + +
NameAge
+"#; + + const TABLE_TH_TD: &'static str = r#" + + + +
NameAge
John20
+"#; + + const TABLE_TD_TD: &'static str = r#" + + + +
NameAge
John20
+"#; + + const TABLE_TH_TH: &'static str = r#" + + + +
NameAge
John20
+"#; + + const TABLE_COMPLEX: &'static str = r#" + + + + + + +
NameAgeExtra
John20
May30foo
abcd
+"#; + + const HTML_NO_TABLE: &'static str = r#" + + + foo +

Hi.

+ +"#; + + const HTML_TWO_TABLES: &'static str = r#" + + + foo + + + + +
NameAge
John20
+ + + +
NameWeight
John150
+ + +"#; + + const HTML_TABLE_FRAGMENT: &'static str = r#" + + + +
NameAge
John20
+ + +"#; + + const HTML_TABLE_WIKIPEDIA_WITH_COLUMN_NAMES: &'static str = r#" + + + + + + + + + + + + + + + + + + + + + + + + + +
Excel 2007 formats +
Format + Extension + Description +
Excel Workbook + .xlsx + The default Excel 2007 and later workbook format. In reality, a Zip compressed archive with a directory structure of XML text documents. Functions as the primary replacement for the former binary .xls format, although it does not support Excel macros for security reasons. Saving as .xlsx offers file size reduction over .xls[38] +
Excel Macro-enabled Workbook + .xlsm + As Excel Workbook, but with macro support. +
Excel Binary Workbook + .xlsb + As Excel Macro-enabled Workbook, but storing information in binary form rather than XML documents for opening and saving documents more quickly and efficiently. Intended especially for very large documents with tens of thousands of rows, and/or several hundreds of columns. This format is very useful for shrinking large Excel files as is often the case when doing data analysis. +
Excel Macro-enabled Template + .xltm + A template document that forms a basis for actual workbooks, with macro support. The replacement for the old .xlt format. +
Excel Add-in + .xlam + Excel add-in to add extra functionality and tools. Inherent macro support because of the file purpose. +
+ "#; + + const HTML_TABLE_WIKIPEDIA_COLUMNS_AS_ROWS: &'static str = r#" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Microsoft Excel +
+ Microsoft Office Excel (2019–present).svg +
+ Microsoft Excel.png +
+ A simple + line chart being + created in Excel, running on + Windows 10 +
+
+ Developer(s) + + Microsoft +
+ Initial release + + 1987; 34 years ago (1987) +
+ Stable release + +
+ 2103 (16.0.13901.20400) / April 13, 2021; 4 months ago (2021-04-13)[1] +
+
+ Operating system + + Microsoft Windows +
+ Type + + Spreadsheet +
+ License + + Trialware[2] +
+ Website + + products.office.com/en-us/excel +
+"#; + + #[test] + fn test_find_first_none() { + assert_eq!(None, Table::find_first("")); + assert_eq!(None, Table::find_first("foo")); + assert_eq!(None, Table::find_first(HTML_NO_TABLE)); + } + + #[test] + fn test_find_first_empty() { + let empty = Table { + headers: HashMap::new(), + data: Vec::new(), + }; + assert_eq!(Some(empty), Table::find_first(TABLE_EMPTY)); + } + + #[test] + fn test_find_first_some() { + assert!(Table::find_first(TABLE_TH).is_some()); + assert!(Table::find_first(TABLE_TD).is_some()); + } + + #[test] + fn test_find_by_id_none() { + assert_eq!(None, Table::find_by_id("", "")); + assert_eq!(None, Table::find_by_id("foo", "id")); + assert_eq!(None, Table::find_by_id(HTML_NO_TABLE, "id")); + + assert_eq!(None, Table::find_by_id(TABLE_EMPTY, "id")); + assert_eq!(None, Table::find_by_id(TABLE_TH, "id")); + assert_eq!(None, Table::find_by_id(TABLE_TH, "")); + assert_eq!(None, Table::find_by_id(HTML_TWO_TABLES, "id")); + } + + #[test] + fn test_find_by_id_some() { + assert!(Table::find_by_id(HTML_TWO_TABLES, "first").is_some()); + assert!(Table::find_by_id(HTML_TWO_TABLES, "second").is_some()); + } + + #[test] + fn test_find_by_headers_empty() { + let headers: [&str; 0] = []; + + assert_eq!(None, Table::find_by_headers("", &headers)); + assert_eq!(None, Table::find_by_headers("foo", &headers)); + assert_eq!(None, Table::find_by_headers(HTML_NO_TABLE, &headers)); + + assert!(Table::find_by_headers(TABLE_EMPTY, &headers).is_some()); + assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some()); + } + + #[test] + fn test_find_by_headers_none() { + let headers = ["Name", "Age"]; + let bad_headers = ["Name", "BAD"]; + + assert_eq!(None, Table::find_by_headers("", &headers)); + assert_eq!(None, Table::find_by_headers("foo", &headers)); + assert_eq!(None, Table::find_by_headers(HTML_NO_TABLE, &headers)); + + assert_eq!(None, Table::find_by_headers(TABLE_EMPTY, &bad_headers)); + assert_eq!(None, Table::find_by_headers(TABLE_TH, &bad_headers)); + + assert_eq!(None, Table::find_by_headers(TABLE_TD, &headers)); + assert_eq!(None, Table::find_by_headers(TABLE_TD, &bad_headers)); + } + + #[test] + fn test_find_by_headers_some() { + let headers: [&str; 0] = []; + assert!(Table::find_by_headers(TABLE_TH, &headers).is_some()); + assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some()); + assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some()); + + let headers = ["Name"]; + assert!(Table::find_by_headers(TABLE_TH, &headers).is_some()); + assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some()); + assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some()); + + let headers = ["Age", "Name"]; + assert!(Table::find_by_headers(TABLE_TH, &headers).is_some()); + assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some()); + assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some()); + } + + #[test] + fn test_find_first_incomplete_fragment() { + assert!(Table::find_first(HTML_TABLE_FRAGMENT).is_some()); + } + + #[test] + fn test_headers_empty() { + let empty = HashMap::new(); + assert_eq!(&empty, Table::find_first(TABLE_TD).unwrap().headers()); + assert_eq!(&empty, Table::find_first(TABLE_TD_TD).unwrap().headers()); + } + + #[test] + fn test_headers_nonempty() { + let mut headers = HashMap::new(); + headers.insert("Name".to_string(), 0); + headers.insert("Age".to_string(), 1); + + assert_eq!(&headers, Table::find_first(TABLE_TH).unwrap().headers()); + assert_eq!(&headers, Table::find_first(TABLE_TH_TD).unwrap().headers()); + assert_eq!(&headers, Table::find_first(TABLE_TH_TH).unwrap().headers()); + + headers.insert("Extra".to_string(), 2); + assert_eq!( + &headers, + Table::find_first(TABLE_COMPLEX).unwrap().headers() + ); + } + + #[test] + fn test_iter_empty() { + assert_eq!(0, Table::find_first(TABLE_EMPTY).unwrap().iter().count()); + assert_eq!(0, Table::find_first(TABLE_TH).unwrap().iter().count()); + } + + #[test] + fn test_iter_nonempty() { + assert_eq!(1, Table::find_first(TABLE_TD).unwrap().iter().count()); + assert_eq!(1, Table::find_first(TABLE_TH_TD).unwrap().iter().count()); + assert_eq!(2, Table::find_first(TABLE_TD_TD).unwrap().iter().count()); + assert_eq!(1, Table::find_first(TABLE_TH_TH).unwrap().iter().count()); + assert_eq!(4, Table::find_first(TABLE_COMPLEX).unwrap().iter().count()); + } + + #[test] + fn test_row_is_empty() { + let table = Table::find_first(TABLE_TD).unwrap(); + assert_eq!( + vec![false], + table.iter().map(|r| r.is_empty()).collect::>() + ); + + let table = Table::find_first(TABLE_COMPLEX).unwrap(); + assert_eq!( + vec![false, false, true, false], + table.iter().map(|r| r.is_empty()).collect::>() + ); + } + + #[test] + fn test_row_len() { + let table = Table::find_first(TABLE_TD).unwrap(); + assert_eq!(vec![2], table.iter().map(|r| r.len()).collect::>()); + + let table = Table::find_first(TABLE_COMPLEX).unwrap(); + assert_eq!( + vec![2, 3, 0, 4], + table.iter().map(|r| r.len()).collect::>() + ); + } + + #[test] + fn test_row_get_without_headers() { + let table = Table::find_first(TABLE_TD).unwrap(); + let mut iter = table.iter(); + let row = iter.next().unwrap(); + + assert_eq!(None, row.get("")); + assert_eq!(None, row.get("foo")); + assert_eq!(None, row.get("Name")); + assert_eq!(None, row.get("Age")); + + assert_eq!(None, iter.next()); + } + + #[test] + fn test_row_get_with_headers() { + let table = Table::find_first(TABLE_TH_TD).unwrap(); + let mut iter = table.iter(); + let row = iter.next().unwrap(); + + assert_eq!(None, row.get("")); + assert_eq!(None, row.get("foo")); + assert_eq!(Some("John"), row.get("Name")); + assert_eq!(Some("20"), row.get("Age")); + + assert_eq!(None, iter.next()); + } + + #[test] + fn test_row_get_complex() { + let table = Table::find_first(TABLE_COMPLEX).unwrap(); + let mut iter = table.iter(); + + let row = iter.next().unwrap(); + assert_eq!(Some("John"), row.get("Name")); + assert_eq!(Some("20"), row.get("Age")); + assert_eq!(None, row.get("Extra")); + + let row = iter.next().unwrap(); + assert_eq!(Some("May"), row.get("Name")); + assert_eq!(Some("30"), row.get("Age")); + assert_eq!(Some("foo"), row.get("Extra")); + + let row = iter.next().unwrap(); + assert_eq!(None, row.get("Name")); + assert_eq!(None, row.get("Age")); + assert_eq!(None, row.get("Extra")); + + let row = iter.next().unwrap(); + assert_eq!(Some("a"), row.get("Name")); + assert_eq!(Some("b"), row.get("Age")); + assert_eq!(Some("c"), row.get("Extra")); + + assert_eq!(None, iter.next()); + } + + #[test] + fn test_row_as_slice_without_headers() { + let table = Table::find_first(TABLE_TD).unwrap(); + let mut iter = table.iter(); + + assert_eq!(&["Name", "Age"], iter.next().unwrap().as_slice()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_row_as_slice_with_headers() { + let table = Table::find_first(TABLE_TH_TD).unwrap(); + let mut iter = table.iter(); + + assert_eq!(&["John", "20"], iter.next().unwrap().as_slice()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_row_as_slice_complex() { + let table = Table::find_first(TABLE_COMPLEX).unwrap(); + let mut iter = table.iter(); + let empty: [&str; 0] = []; + + assert_eq!(&["John", "20"], iter.next().unwrap().as_slice()); + assert_eq!(&["May", "30", "foo"], iter.next().unwrap().as_slice()); + assert_eq!(&empty, iter.next().unwrap().as_slice()); + assert_eq!(&["a", "b", "c", "d"], iter.next().unwrap().as_slice()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_row_iter_simple() { + let table = Table::find_first(TABLE_TD).unwrap(); + let row = table.iter().next().unwrap(); + let mut iter = row.iter(); + + assert_eq!(Some("Name"), iter.next().map(String::as_str)); + assert_eq!(Some("Age"), iter.next().map(String::as_str)); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_row_iter_complex() { + let table = Table::find_first(TABLE_COMPLEX).unwrap(); + let mut table_iter = table.iter(); + + let row = table_iter.next().unwrap(); + let mut iter = row.iter(); + assert_eq!(Some("John"), iter.next().map(String::as_str)); + assert_eq!(Some("20"), iter.next().map(String::as_str)); + assert_eq!(None, iter.next()); + + let row = table_iter.next().unwrap(); + let mut iter = row.iter(); + assert_eq!(Some("May"), iter.next().map(String::as_str)); + assert_eq!(Some("30"), iter.next().map(String::as_str)); + assert_eq!(Some("foo"), iter.next().map(String::as_str)); + assert_eq!(None, iter.next()); + + let row = table_iter.next().unwrap(); + let mut iter = row.iter(); + assert_eq!(None, iter.next()); + + let row = table_iter.next().unwrap(); + let mut iter = row.iter(); + assert_eq!(Some("a"), iter.next().map(String::as_str)); + assert_eq!(Some("b"), iter.next().map(String::as_str)); + assert_eq!(Some("c"), iter.next().map(String::as_str)); + assert_eq!(Some("d"), iter.next().map(String::as_str)); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_wikipedia_swapped_rows_columns() { + // empty columns + let cols = nu_protocol::value::Value { + value: nu_protocol::UntaggedValue::Primitive(nu_protocol::Primitive::String( + "".to_string(), + )), + tag: nu_source::Tag::unknown(), + }; + + // this table is taken straight from wikipedia with no changes + let table = retrieve_tables(HTML_TABLE_WIKIPEDIA_COLUMNS_AS_ROWS, &cols, true); + + let expected = vec![UntaggedValue::row(indexmap! { + "Stable release".to_string() => UntaggedValue::string("\n 2103 (16.0.13901.20400) / April\u{a0}13, 2021; 4 months ago\u{a0}(2021-04-13)[1]\n ").into(), + "Developer(s)".to_string() => UntaggedValue::string("Microsoft").into(), + "Operating system".to_string() => UntaggedValue::string("Microsoft Windows").into(), + "Type".to_string() => UntaggedValue::string("Spreadsheet").into(), + "License".to_string() => UntaggedValue::string("Trialware[2]").into(), + "".to_string() => UntaggedValue::string("").into(), + "Website".to_string() => UntaggedValue::string("products.office.com/en-us/excel").into(), + "Initial release".to_string() => UntaggedValue::string("1987; 34\u{a0}years ago\u{a0}(1987)").into(), + }).into()]; + + assert_eq!(table, expected); + } + + #[test] + fn test_wikipedia_table_with_column_headers() { + let cols = UntaggedValue::table(&[ + UntaggedValue::string("Format".to_string()).into(), + UntaggedValue::string("Extension".to_string()).into(), + UntaggedValue::string("Description".to_string()).into(), + ]) + .into(); + + // this table is taken straight from wikipedia with no changes + let table = retrieve_tables(HTML_TABLE_WIKIPEDIA_WITH_COLUMN_NAMES, &cols, true); + let expected = vec![ + UntaggedValue::row(indexmap! { + "Format".to_string() => UntaggedValue::string("Excel Workbook").into(), + "Extension".to_string() => UntaggedValue::string(".xlsx").into(), + "Description".to_string() => UntaggedValue::string("The default Excel 2007 and later workbook format. In reality, a Zip compressed archive with a directory structure of XML text documents. Functions as the primary replacement for the former binary .xls format, although it does not support Excel macros for security reasons. Saving as .xlsx offers file size reduction over .xls[38]").into(), + }).into(), + UntaggedValue::row(indexmap! { + "Format".to_string() => UntaggedValue::string("Excel Macro-enabled Workbook").into(), + "Extension".to_string() => UntaggedValue::string(".xlsm").into(), + "Description".to_string() => UntaggedValue::string("As Excel Workbook, but with macro support.").into(), + }).into(), + UntaggedValue::row(indexmap! { + "Format".to_string() => UntaggedValue::string("Excel Binary Workbook").into(), + "Extension".to_string() => UntaggedValue::string(".xlsb").into(), + "Description".to_string() => UntaggedValue::string("As Excel Macro-enabled Workbook, but storing information in binary form rather than XML documents for opening and saving documents more quickly and efficiently. Intended especially for very large documents with tens of thousands of rows, and/or several hundreds of columns. This format is very useful for shrinking large Excel files as is often the case when doing data analysis.").into(), + }).into(), + UntaggedValue::row(indexmap! { + "Format".to_string() => UntaggedValue::string("Excel Macro-enabled Template").into(), + "Extension".to_string() => UntaggedValue::string(".xltm").into(), + "Description".to_string() => UntaggedValue::string("A template document that forms a basis for actual workbooks, with macro support. The replacement for the old .xlt format.").into(), + }).into(), + UntaggedValue::row(indexmap! { + "Format".to_string() => UntaggedValue::string("Excel Add-in").into(), + "Extension".to_string() => UntaggedValue::string(".xlam").into(), + "Description".to_string() => UntaggedValue::string("Excel add-in to add extra functionality and tools. Inherent macro support because of the file purpose.").into(), + }).into(), + ]; + + assert_eq!(table, expected); + } +}