nushell/crates/nu_plugin_query/src/query_web.rs

322 lines
8.9 KiB
Rust
Raw Normal View History

use crate::web_tables::WebTable;
use nu_plugin::{EvaluatedCall, LabeledError};
use nu_protocol::{Record, Span, Value};
use scraper::{Html, Selector as ScraperSelector};
/// Options that control how a selector query is run against an HTML document.
pub struct Selector {
    /// CSS selector text; `parse_selector_params` validates it with the `scraper` parser
    /// unless it is empty.
    pub query: String,
    /// Forwarded to `execute_selector_query` as its `as_html` argument.
    // NOTE(review): presumably switches the output from text to raw HTML — confirm there.
    pub as_html: bool,
    /// Attribute name; when non-empty the query is routed through
    /// `execute_selector_query_with_attribute`.
    pub attribute: String,
    /// When this is a `Value::List`, the query is handled as a table lookup and the list's
    /// string entries are used as the requested column headers.
    pub as_table: Value,
    /// Inspect flag forwarded to the query routines; in table mode it prints diagnostics
    /// to stderr.
    pub inspect: bool,
}

impl Selector {
    /// Creates a selector with every option at its default value.
    pub fn new() -> Selector {
        Self::default()
    }
}

impl Default for Selector {
    /// Empty query and attribute, both flags off, and an empty-string `as_table` value
    /// carrying an unknown span.
    fn default() -> Self {
        Self {
            query: String::new(),
            as_html: false,
            attribute: String::new(),
            as_table: Value::string(String::new(), Span::unknown()),
            inspect: false,
        }
    }
}
pub fn parse_selector_params(call: &EvaluatedCall, input: &Value) -> Result<Value, LabeledError> {
let head = call.head;
let query: String = match call.get_flag("query")? {
Some(q2) => q2,
None => "".to_string(),
};
let as_html = call.has_flag("as-html")?;
let attribute = call.get_flag("attribute")?.unwrap_or_default();
let as_table: Value = call
.get_flag("as-table")?
.unwrap_or_else(|| Value::nothing(head));
let inspect = call.has_flag("inspect")?;
if !&query.is_empty() && ScraperSelector::parse(&query).is_err() {
return Err(LabeledError {
msg: "Cannot parse this query as a valid css selector".to_string(),
label: "Parse error".to_string(),
span: Some(head),
});
}
let selector = Selector {
query,
as_html,
attribute,
as_table,
inspect,
};
Move Value to helpers, separate span call (#10121) # Description As part of the refactor to split spans off of Value, this moves to using helper functions to create values, and using `.span()` instead of matching span out of Value directly. Hoping to get a few more helping hands to finish this, as there are a lot of commands to update :) # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. --> --------- Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com> Co-authored-by: WindSoilder <windsoilder@outlook.com>
2023-09-03 14:27:29 +00:00
let span = input.span();
match input {
Move Value to helpers, separate span call (#10121) # Description As part of the refactor to split spans off of Value, this moves to using helper functions to create values, and using `.span()` instead of matching span out of Value directly. Hoping to get a few more helping hands to finish this, as there are a lot of commands to update :) # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. --> --------- Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com> Co-authored-by: WindSoilder <windsoilder@outlook.com>
2023-09-03 14:27:29 +00:00
Value::String { val, .. } => Ok(begin_selector_query(val.to_string(), selector, span)),
_ => Err(LabeledError {
label: "requires text input".to_string(),
msg: "Expected text from pipeline".to_string(),
Move Value to helpers, separate span call (#10121) # Description As part of the refactor to split spans off of Value, this moves to using helper functions to create values, and using `.span()` instead of matching span out of Value directly. Hoping to get a few more helping hands to finish this, as there are a lot of commands to update :) # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. --> --------- Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com> Co-authored-by: WindSoilder <windsoilder@outlook.com>
2023-09-03 14:27:29 +00:00
span: Some(span),
}),
}
}
/// Picks the extraction routine for `input_html` based on the options in `selector`.
///
/// A list in `selector.as_table` takes priority and triggers a table lookup; otherwise the
/// query runs as a plain selector query, or as an attribute query when `selector.attribute`
/// is non-empty.
fn begin_selector_query(input_html: String, selector: Selector, span: Span) -> Value {
    let html = input_html.as_str();

    match &selector.as_table {
        table_columns @ Value::List { .. } => {
            retrieve_tables(html, table_columns, selector.inspect, span)
        }
        _ if selector.attribute.is_empty() => execute_selector_query(
            html,
            selector.query.as_str(),
            selector.as_html,
            selector.inspect,
            span,
        ),
        _ => execute_selector_query_with_attribute(
            html,
            selector.query.as_str(),
            selector.attribute.as_str(),
            selector.inspect,
            span,
        ),
    }
}
pub fn retrieve_tables(
input_string: &str,
columns: &Value,
inspect_mode: bool,
span: Span,
) -> Value {
let html = input_string;
let mut cols: Vec<String> = Vec::new();
if let Value::List { vals, .. } = &columns {
for x in vals {
if let Value::String { val, .. } = x {
cols.push(val.to_string())
}
}
}
if inspect_mode {
update query web wiki example (#11709) # Description This PR tries to make `query web` more resilient and easier to debug with the `--inspect` parameter when trying to scrape tables. Previously it would just fail, now at least it tries to give you a hint. This is some example output now of when something went wrong. ``` ❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [Rank City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or union territory'] --inspect Passed in Column Headers = ["Rank", "City", "Population(2011)[3]", "Population(2001)[3][a]", "State or union territory"] First 2048 HTML chars = <!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>List of cities in India by population - Wikipedia</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; 
)enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["", "January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9ecdad8f-2dbd-4245-b54d-9c57aea5ca45","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_in_India_by_population","wgTitle":"List of cities in India by population","wgCurRevisionId":1192093210,"wgRev Potential HTML Headers = ["City", "Population(2011)[3]", "Population(2001)[3][a]", "State or unionterritory", "Ref"] Potential HTML Headers = ["City", "Population(2011)[5]", "Population(2001)", "State or unionterritory"] Potential HTML Headers = [".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:\"[ \"}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:\" ]\"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}vtePopulation of cities in India"] Potential HTML Headers = ["vteGeography of India"] 
╭──────────────────────────┬─────────────────────────────────────────────────────╮ │ Rank │ error: no data found (column name may be incorrect) │ │ City │ error: no data found (column name may be incorrect) │ │ Population(2011)[3] │ error: no data found (column name may be incorrect) │ │ Population(2001)[3][a] │ error: no data found (column name may be incorrect) │ │ State or union territory │ error: no data found (column name may be incorrect) │ ╰──────────────────────────┴─────────────────────────────────────────────────────╯ ``` The key here is to look at the `Passed in Column Headers` and compare them to the `Potential HTML Headers` and couple that with the error table at the bottom should give you a hint that, in this situation, wikipedia has changed the column names, yet again. So we need to update our query web statement's tables to get closer to what we want. ``` ❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or unionterritory' 'Ref'] ╭─#──┬───────City───────┬─Population(2011)[3]─┬─Population(2001)[3][a]─┬─State or unionterritory─┬──Ref───╮ │ 0 │ Mumbai │ 12,442,373 │ 11,978,450 │ Maharashtra │ [3] │ │ 1 │ Delhi │ 11,034,555 │ 9,879,172 │ Delhi │ [3] │ │ 2 │ Bangalore │ 8,443,675 │ 5,682,293 │ Karnataka │ [3] │ │ 3 │ Hyderabad │ 6,993,262 │ 5,496,960 │ Telangana │ [3] │ │ 4 │ Ahmedabad │ 5,577,940 │ 4,470,006 │ Gujarat │ [3] │ │ 5 │ Chennai │ 4,646,732 │ 4,343,645 │ Tamil Nadu │ [3] │ │ 6 │ Kolkata │ 4,496,694 │ 4,580,546 │ West Bengal │ [3] │ │ 7 │ Surat │ 4,467,797 │ 2,788,126 │ Gujarat │ [3] │ │ 8 │ Pune │ 3,124,458 │ 2,538,473 │ Maharashtra │ [3] │ │ 9 │ Jaipur │ 3,046,163 │ 2,322,575 │ Rajasthan │ [3] │ │ 10 │ Lucknow │ 2,817,105 │ 2,185,927 │ Uttar Pradesh │ [3] │ │ 11 │ Kanpur │ 2,765,348 │ 2,551,337 │ Uttar Pradesh │ [3] │ │ 12 │ Nagpur │ 2,405,665 │ 2,052,066 │ Maharashtra │ [3] │ ``` # User-Facing Changes <!-- List of all changes that 
impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. -->
2024-02-02 15:03:28 +00:00
eprintln!("Passed in Column Headers = {:?}\n", &cols);
eprintln!("First 2048 HTML chars = {}\n", &html[0..2047]);
}
update query web wiki example (#11709) # Description This PR tries to make `query web` more resilient and easier to debug with the `--inspect` parameter when trying to scrape tables. Previously it would just fail, now at least it tries to give you a hint. This is some example output now of when something went wrong. ``` ❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [Rank City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or union territory'] --inspect Passed in Column Headers = ["Rank", "City", "Population(2011)[3]", "Population(2001)[3][a]", "State or union territory"] First 2048 HTML chars = <!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>List of cities in India by population - Wikipedia</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; 
)enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["", "January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9ecdad8f-2dbd-4245-b54d-9c57aea5ca45","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_in_India_by_population","wgTitle":"List of cities in India by population","wgCurRevisionId":1192093210,"wgRev Potential HTML Headers = ["City", "Population(2011)[3]", "Population(2001)[3][a]", "State or unionterritory", "Ref"] Potential HTML Headers = ["City", "Population(2011)[5]", "Population(2001)", "State or unionterritory"] Potential HTML Headers = [".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:\"[ \"}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:\" ]\"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}vtePopulation of cities in India"] Potential HTML Headers = ["vteGeography of India"] 
╭──────────────────────────┬─────────────────────────────────────────────────────╮ │ Rank │ error: no data found (column name may be incorrect) │ │ City │ error: no data found (column name may be incorrect) │ │ Population(2011)[3] │ error: no data found (column name may be incorrect) │ │ Population(2001)[3][a] │ error: no data found (column name may be incorrect) │ │ State or union territory │ error: no data found (column name may be incorrect) │ ╰──────────────────────────┴─────────────────────────────────────────────────────╯ ``` The key here is to look at the `Passed in Column Headers` and compare them to the `Potential HTML Headers` and couple that with the error table at the bottom should give you a hint that, in this situation, wikipedia has changed the column names, yet again. So we need to update our query web statement's tables to get closer to what we want. ``` ❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or unionterritory' 'Ref'] ╭─#──┬───────City───────┬─Population(2011)[3]─┬─Population(2001)[3][a]─┬─State or unionterritory─┬──Ref───╮ │ 0 │ Mumbai │ 12,442,373 │ 11,978,450 │ Maharashtra │ [3] │ │ 1 │ Delhi │ 11,034,555 │ 9,879,172 │ Delhi │ [3] │ │ 2 │ Bangalore │ 8,443,675 │ 5,682,293 │ Karnataka │ [3] │ │ 3 │ Hyderabad │ 6,993,262 │ 5,496,960 │ Telangana │ [3] │ │ 4 │ Ahmedabad │ 5,577,940 │ 4,470,006 │ Gujarat │ [3] │ │ 5 │ Chennai │ 4,646,732 │ 4,343,645 │ Tamil Nadu │ [3] │ │ 6 │ Kolkata │ 4,496,694 │ 4,580,546 │ West Bengal │ [3] │ │ 7 │ Surat │ 4,467,797 │ 2,788,126 │ Gujarat │ [3] │ │ 8 │ Pune │ 3,124,458 │ 2,538,473 │ Maharashtra │ [3] │ │ 9 │ Jaipur │ 3,046,163 │ 2,322,575 │ Rajasthan │ [3] │ │ 10 │ Lucknow │ 2,817,105 │ 2,185,927 │ Uttar Pradesh │ [3] │ │ 11 │ Kanpur │ 2,765,348 │ 2,551,337 │ Uttar Pradesh │ [3] │ │ 12 │ Nagpur │ 2,405,665 │ 2,052,066 │ Maharashtra │ [3] │ ``` # User-Facing Changes <!-- List of all changes that 
impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. -->
2024-02-02 15:03:28 +00:00
let tables = match WebTable::find_by_headers(html, &cols, inspect_mode) {
Some(t) => {
if inspect_mode {
eprintln!("Table Found = {:#?}", &t);
}
t
}
None => vec![WebTable::empty()],
};
if tables.len() == 1 {
return retrieve_table(
tables.into_iter().next().expect("Error retrieving table"),
columns,
span,
);
}
let vals = tables
.into_iter()
.map(move |table| retrieve_table(table, columns, span))
.collect();
Move Value to helpers, separate span call (#10121) # Description As part of the refactor to split spans off of Value, this moves to using helper functions to create values, and using `.span()` instead of matching span out of Value directly. Hoping to get a few more helping hands to finish this, as there are a lot of commands to update :) # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. --> --------- Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com> Co-authored-by: WindSoilder <windsoilder@outlook.com>
2023-09-03 14:27:29 +00:00
Value::list(vals, span)
}
fn retrieve_table(mut table: WebTable, columns: &Value, span: Span) -> Value {
let mut cols: Vec<String> = Vec::new();
if let Value::List { vals, .. } = &columns {
for x in vals {
// TODO Find a way to get the Config object here
if let Value::String { val, .. } = x {
cols.push(val.to_string())
}
}
}
if cols.is_empty() && !table.headers().is_empty() {
for col in table.headers().keys() {
cols.push(col.to_string());
}
}
update query web wiki example (#11709) # Description This PR tries to make `query web` more resilient and easier to debug with the `--inspect` parameter when trying to scrape tables. Previously it would just fail, now at least it tries to give you a hint. This is some example output now of when something went wrong. ``` ❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [Rank City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or union territory'] --inspect Passed in Column Headers = ["Rank", "City", "Population(2011)[3]", "Population(2001)[3][a]", "State or union territory"] First 2048 HTML chars = <!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>List of cities in India by population - Wikipedia</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; 
)enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["", "January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9ecdad8f-2dbd-4245-b54d-9c57aea5ca45","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_in_India_by_population","wgTitle":"List of cities in India by population","wgCurRevisionId":1192093210,"wgRev Potential HTML Headers = ["City", "Population(2011)[3]", "Population(2001)[3][a]", "State or unionterritory", "Ref"] Potential HTML Headers = ["City", "Population(2011)[5]", "Population(2001)", "State or unionterritory"] Potential HTML Headers = [".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:\"[ \"}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:\" ]\"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}vtePopulation of cities in India"] Potential HTML Headers = ["vteGeography of India"] 
╭──────────────────────────┬─────────────────────────────────────────────────────╮ │ Rank │ error: no data found (column name may be incorrect) │ │ City │ error: no data found (column name may be incorrect) │ │ Population(2011)[3] │ error: no data found (column name may be incorrect) │ │ Population(2001)[3][a] │ error: no data found (column name may be incorrect) │ │ State or union territory │ error: no data found (column name may be incorrect) │ ╰──────────────────────────┴─────────────────────────────────────────────────────╯ ``` The key here is to look at the `Passed in Column Headers` and compare them to the `Potential HTML Headers` and couple that with the error table at the bottom should give you a hint that, in this situation, wikipedia has changed the column names, yet again. So we need to update our query web statement's tables to get closer to what we want. ``` ❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or unionterritory' 'Ref'] ╭─#──┬───────City───────┬─Population(2011)[3]─┬─Population(2001)[3][a]─┬─State or unionterritory─┬──Ref───╮ │ 0 │ Mumbai │ 12,442,373 │ 11,978,450 │ Maharashtra │ [3] │ │ 1 │ Delhi │ 11,034,555 │ 9,879,172 │ Delhi │ [3] │ │ 2 │ Bangalore │ 8,443,675 │ 5,682,293 │ Karnataka │ [3] │ │ 3 │ Hyderabad │ 6,993,262 │ 5,496,960 │ Telangana │ [3] │ │ 4 │ Ahmedabad │ 5,577,940 │ 4,470,006 │ Gujarat │ [3] │ │ 5 │ Chennai │ 4,646,732 │ 4,343,645 │ Tamil Nadu │ [3] │ │ 6 │ Kolkata │ 4,496,694 │ 4,580,546 │ West Bengal │ [3] │ │ 7 │ Surat │ 4,467,797 │ 2,788,126 │ Gujarat │ [3] │ │ 8 │ Pune │ 3,124,458 │ 2,538,473 │ Maharashtra │ [3] │ │ 9 │ Jaipur │ 3,046,163 │ 2,322,575 │ Rajasthan │ [3] │ │ 10 │ Lucknow │ 2,817,105 │ 2,185,927 │ Uttar Pradesh │ [3] │ │ 11 │ Kanpur │ 2,765,348 │ 2,551,337 │ Uttar Pradesh │ [3] │ │ 12 │ Nagpur │ 2,405,665 │ 2,052,066 │ Maharashtra │ [3] │ ``` # User-Facing Changes <!-- List of all changes that 
impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. -->
2024-02-02 15:03:28 +00:00
// We provided columns but the table has no headers, so we'll just make a single column table
if !cols.is_empty() && table.headers().is_empty() {
let mut record = Record::new();
for col in &cols {
record.push(
col.clone(),
Value::string("error: no data found (column name may be incorrect)", span),
);
}
return Value::record(record, span);
}
let mut table_out = Vec::new();
// sometimes there are tables where the first column is the headers, kind of like
// a table has been rotated ccw 90 degrees; in these cases all columns will be missing
// we keep track of this with this variable so we can deal with it later
let mut at_least_one_row_filled = false;
// if columns are still empty, let's just make a single column table with the data
if cols.is_empty() {
at_least_one_row_filled = true;
let table_with_no_empties: Vec<_> = table.iter().filter(|item| !item.is_empty()).collect();
let mut record = Record::new();
for row in &table_with_no_empties {
for (counter, cell) in row.iter().enumerate() {
record.push(format!("column{counter}"), Value::string(cell, span));
}
}
table_out.push(Value::record(record, span))
} else {
for row in &table {
let record = cols
.iter()
.map(|col| {
let val = row
.get(col)
.unwrap_or(&format!("Missing column: '{}'", &col))
.to_string();
if !at_least_one_row_filled && val != format!("Missing column: '{}'", &col) {
at_least_one_row_filled = true;
}
(col.clone(), Value::string(val, span))
})
.collect();
table_out.push(Value::record(record, span))
}
}
if !at_least_one_row_filled {
let mut data2 = Vec::new();
for x in &table.data {
data2.push(x.join(", "));
}
table.data = vec![data2];
return retrieve_table(table, columns, span);
}
// table_out
Move Value to helpers, separate span call (#10121) # Description As part of the refactor to split spans off of Value, this moves to using helper functions to create values, and using `.span()` instead of matching span out of Value directly. Hoping to get a few more helping hands to finish this, as there are a lot of commands to update :) # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. --> --------- Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com> Co-authored-by: WindSoilder <windsoilder@outlook.com>
2023-09-03 14:27:29 +00:00
Value::list(table_out, span)
}
fn execute_selector_query_with_attribute(
input_string: &str,
query_string: &str,
attribute: &str,
inspect: bool,
span: Span,
) -> Value {
let doc = Html::parse_fragment(input_string);
let vals: Vec<Value> = doc
.select(&css(query_string, inspect))
.map(|selection| {
Value::string(
selection.value().attr(attribute).unwrap_or("").to_string(),
span,
)
})
.collect();
Move Value to helpers, separate span call (#10121) # Description As part of the refactor to split spans off of Value, this moves to using helper functions to create values, and using `.span()` instead of matching span out of Value directly. Hoping to get a few more helping hands to finish this, as there are a lot of commands to update :) # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. --> --------- Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com> Co-authored-by: WindSoilder <windsoilder@outlook.com>
2023-09-03 14:27:29 +00:00
Value::list(vals, span)
}
fn execute_selector_query(
input_string: &str,
query_string: &str,
as_html: bool,
inspect: bool,
span: Span,
) -> Value {
let doc = Html::parse_fragment(input_string);
let vals: Vec<Value> = match as_html {
true => doc
.select(&css(query_string, inspect))
.map(|selection| Value::string(selection.html(), span))
.collect(),
false => doc
.select(&css(query_string, inspect))
.map(|selection| {
Value::string(
selection
.text()
.fold("".to_string(), |acc, x| format!("{acc}{x}")),
span,
)
})
.collect(),
};
Move Value to helpers, separate span call (#10121) # Description As part of the refactor to split spans off of Value, this moves to using helper functions to create values, and using `.span()` instead of matching span out of Value directly. Hoping to get a few more helping hands to finish this, as there are a lot of commands to update :) # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. --> --------- Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com> Co-authored-by: WindSoilder <windsoilder@outlook.com>
2023-09-03 14:27:29 +00:00
Value::list(vals, span)
}
/// Builds the `scraper` selector used by the query functions.
///
/// When `inspect` is `true` the fixed selector `html` is parsed and `selector`
/// is ignored; otherwise `selector` itself is parsed.
///
/// # Panics
///
/// Panics if the chosen selector text cannot be parsed.
pub fn css(selector: &str, inspect: bool) -> ScraperSelector {
    // Pick the selector text together with the panic message that belongs to it.
    let (source, failure_message) = if inspect {
        ("html", "Error unwrapping the default scraperselector")
    } else {
        (selector, "Error unwrapping scraperselector::parse")
    };

    ScraperSelector::parse(source).expect(failure_message)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal fixture: an unordered list with three items.
    const SIMPLE_LIST: &str = r#"
<ul>
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ul>
"#;

    /// Runs the `li:first-child` text query (no HTML output, no inspect) on the fixture.
    fn select_first_item() -> Value {
        execute_selector_query(
            SIMPLE_LIST,
            "li:first-child",
            false,
            false,
            Span::test_data(),
        )
    }

    #[test]
    fn test_first_child_is_not_empty() {
        let selected = select_first_item();
        assert!(!selected.is_empty())
    }

    #[test]
    fn test_first_child() {
        let selected = select_first_item();
        let config = nu_protocol::Config::default();
        let rendered = selected.into_string("\n", &config);
        assert_eq!("[Coffee]".to_string(), rendered)
    }
}