nushell/crates/nu_plugin_query/src/query_web.rs

317 lines
8.6 KiB
Rust
Raw Normal View History

use crate::web_tables::WebTable;
use nu_plugin::{EvaluatedCall, LabeledError};
use nu_protocol::{Record, Span, Value};
use scraper::{Html, Selector as ScraperSelector};
pub struct Selector {
pub query: String,
pub as_html: bool,
pub attribute: String,
pub as_table: Value,
pub inspect: bool,
}
impl Selector {
pub fn new() -> Selector {
Selector {
query: String::new(),
as_html: false,
attribute: String::new(),
as_table: Value::string("".to_string(), Span::unknown()),
inspect: false,
}
}
}
impl Default for Selector {
fn default() -> Self {
Self::new()
}
}
pub fn parse_selector_params(call: &EvaluatedCall, input: &Value) -> Result<Value, LabeledError> {
let head = call.head;
let query: String = match call.get_flag("query")? {
Some(q2) => q2,
None => "".to_string(),
};
let as_html = call.has_flag("as-html");
let attribute = call.get_flag("attribute")?.unwrap_or_default();
let as_table: Value = call
.get_flag("as-table")?
.unwrap_or_else(|| Value::nothing(head));
let inspect = call.has_flag("inspect");
if !&query.is_empty() && ScraperSelector::parse(&query).is_err() {
return Err(LabeledError {
msg: "Cannot parse this query as a valid css selector".to_string(),
label: "Parse error".to_string(),
span: Some(head),
});
}
let selector = Selector {
query,
as_html,
attribute,
as_table,
inspect,
};
match input {
Value::String { val, span } => Ok(begin_selector_query(val.to_string(), selector, *span)),
_ => Err(LabeledError {
label: "requires text input".to_string(),
msg: "Expected text from pipeline".to_string(),
Spanned Value step 1: span all value cases (#10042) # Description This doesn't really do much that the user could see, but it helps get us ready to do the steps of the refactor to split the span off of Value, so that values can be spanless. This allows us to have top-level values that can hold both a Value and a Span, without requiring that all values have them. We expect to see significant memory reduction by removing so many unnecessary spans from values. For example, a table of 100,000 rows and 5 columns would have a savings of ~8megs in just spans that are almost always duplicated. # User-Facing Changes Nothing yet # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A clippy::needless_collect -A clippy::result_large_err` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. -->
2023-08-24 20:48:05 +00:00
span: Some(input.span()),
}),
}
}
fn begin_selector_query(input_html: String, selector: Selector, span: Span) -> Value {
if let Value::List { .. } = selector.as_table {
return retrieve_tables(
input_html.as_str(),
&selector.as_table,
selector.inspect,
span,
);
} else if selector.attribute.is_empty() {
execute_selector_query(
input_html.as_str(),
selector.query.as_str(),
selector.as_html,
selector.inspect,
span,
)
} else {
execute_selector_query_with_attribute(
input_html.as_str(),
selector.query.as_str(),
selector.attribute.as_str(),
selector.inspect,
span,
)
}
}
pub fn retrieve_tables(
input_string: &str,
columns: &Value,
inspect_mode: bool,
span: Span,
) -> Value {
let html = input_string;
let mut cols: Vec<String> = Vec::new();
if let Value::List { vals, .. } = &columns {
for x in vals {
// TODO Find a way to get the Config object here
if let Value::String { val, .. } = x {
cols.push(val.to_string())
}
}
}
if inspect_mode {
eprintln!("Passed in Column Headers = {:#?}", &cols,);
}
let tables = match WebTable::find_by_headers(html, &cols) {
Some(t) => {
if inspect_mode {
eprintln!("Table Found = {:#?}", &t);
}
t
}
None => vec![WebTable::empty()],
};
if tables.len() == 1 {
return retrieve_table(
tables.into_iter().next().expect("Error retrieving table"),
columns,
span,
);
}
let vals = tables
.into_iter()
.map(move |table| retrieve_table(table, columns, span))
.collect();
Value::List { vals, span }
}
fn retrieve_table(mut table: WebTable, columns: &Value, span: Span) -> Value {
let mut cols: Vec<String> = Vec::new();
if let Value::List { vals, .. } = &columns {
for x in vals {
// TODO Find a way to get the Config object here
if let Value::String { val, .. } = x {
cols.push(val.to_string())
}
}
}
if cols.is_empty() && !table.headers().is_empty() {
for col in table.headers().keys() {
cols.push(col.to_string());
}
}
let mut table_out = Vec::new();
// sometimes there are tables where the first column is the headers, kind of like
// a table has ben rotated ccw 90 degrees, in these cases all columns will be missing
// we keep track of this with this variable so we can deal with it later
let mut at_least_one_row_filled = false;
// if columns are still empty, let's just make a single column table with the data
if cols.is_empty() {
at_least_one_row_filled = true;
let table_with_no_empties: Vec<_> = table.iter().filter(|item| !item.is_empty()).collect();
let mut record = Record::new();
for row in &table_with_no_empties {
for (counter, cell) in row.iter().enumerate() {
record.push(format!("column{counter}"), Value::string(cell, span));
}
}
table_out.push(Value::record(record, span))
} else {
for row in &table {
let mut vals = vec![];
let record_cols = &cols;
for col in &cols {
let val = row
.get(col)
.unwrap_or(&format!("Missing column: '{}'", &col))
.to_string();
if !at_least_one_row_filled && val != format!("Missing column: '{}'", &col) {
at_least_one_row_filled = true;
}
vals.push(Value::string(val, span));
}
table_out.push(Value::record(
Record {
cols: record_cols.to_vec(),
vals,
},
span,
))
}
}
if !at_least_one_row_filled {
let mut data2 = Vec::new();
for x in &table.data {
data2.push(x.join(", "));
}
table.data = vec![data2];
return retrieve_table(table, columns, span);
}
// table_out
Value::List {
vals: table_out,
span,
}
}
fn execute_selector_query_with_attribute(
input_string: &str,
query_string: &str,
attribute: &str,
inspect: bool,
span: Span,
) -> Value {
let doc = Html::parse_fragment(input_string);
let vals: Vec<Value> = doc
.select(&css(query_string, inspect))
.map(|selection| {
Value::string(
selection.value().attr(attribute).unwrap_or("").to_string(),
span,
)
})
.collect();
Value::List { vals, span }
}
fn execute_selector_query(
input_string: &str,
query_string: &str,
as_html: bool,
inspect: bool,
span: Span,
) -> Value {
let doc = Html::parse_fragment(input_string);
let vals: Vec<Value> = match as_html {
true => doc
.select(&css(query_string, inspect))
.map(|selection| Value::string(selection.html(), span))
.collect(),
false => doc
.select(&css(query_string, inspect))
.map(|selection| {
Value::string(
selection
.text()
.fold("".to_string(), |acc, x| format!("{acc}{x}")),
span,
)
})
.collect(),
};
Value::List { vals, span }
}
pub fn css(selector: &str, inspect: bool) -> ScraperSelector {
if inspect {
ScraperSelector::parse("html").expect("Error unwrapping the default scraperselector")
} else {
ScraperSelector::parse(selector).expect("Error unwrapping scraperselector::parse")
}
}
#[cfg(test)]
mod tests {
use super::*;
const SIMPLE_LIST: &str = r#"
<ul>
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ul>
"#;
#[test]
fn test_first_child_is_not_empty() {
assert!(!execute_selector_query(
SIMPLE_LIST,
"li:first-child",
false,
false,
Span::test_data()
)
.is_empty())
}
#[test]
fn test_first_child() {
let item = execute_selector_query(
SIMPLE_LIST,
"li:first-child",
false,
false,
Span::test_data(),
);
let config = nu_protocol::Config::default();
let out = item.into_string("\n", &config);
assert_eq!("[Coffee]".to_string(), out)
}
}