2022-02-01 18:45:48 +00:00
|
|
|
use crate::web_tables::WebTable;
|
|
|
|
use nu_plugin::{EvaluatedCall, LabeledError};
|
Create `Record` type (#10103)
# Description
This PR creates a new `Record` type to reduce duplicate code and
possibly bugs as well. (This is an edited version of #9648.)
- `Record` implements `FromIterator` and `IntoIterator` and so can be
iterated over or collected into. For example, this helps with
conversions to and from (hash)maps. (Also, no more
`cols.iter().zip(vals)`!)
- `Record` has a `push(col, val)` function to help ensure that the
number of columns is equal to the number of values. I caught a few
potential bugs thanks to this (e.g. in the `ls` command).
- Finally, this PR also adds a `record!` macro that helps simplify
record creation. It is used like so:
```rust
record! {
"key1" => some_value,
"key2" => Value::string("text", span),
"key3" => Value::int(optional_int.unwrap_or(0), span),
"key4" => Value::bool(config.setting, span),
}
```
Since macros hinder formatting, etc., the right hand side values should
be relatively short and sweet like the examples above.
Where possible, prefer `record!` or `.collect()` on an iterator instead
of multiple `Record::push`s, since the first two automatically set the
record capacity and do less work overall.
# User-Facing Changes
Besides the changes in `nu-protocol` the only other breaking changes are
to `nu-table::{ExpandedTable::build_map, JustTable::kv_table}`.
2023-08-24 19:50:29 +00:00
|
|
|
use nu_protocol::{Record, Span, Value};
|
2022-02-01 18:45:48 +00:00
|
|
|
use scraper::{Html, Selector as ScraperSelector};
|
|
|
|
|
|
|
|
pub struct Selector {
|
|
|
|
pub query: String,
|
|
|
|
pub as_html: bool,
|
|
|
|
pub attribute: String,
|
|
|
|
pub as_table: Value,
|
|
|
|
pub inspect: bool,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Selector {
|
|
|
|
pub fn new() -> Selector {
|
|
|
|
Selector {
|
|
|
|
query: String::new(),
|
|
|
|
as_html: false,
|
|
|
|
attribute: String::new(),
|
2022-12-24 13:41:57 +00:00
|
|
|
as_table: Value::string("".to_string(), Span::unknown()),
|
2022-02-01 18:45:48 +00:00
|
|
|
inspect: false,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Default for Selector {
|
|
|
|
fn default() -> Self {
|
|
|
|
Self::new()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn parse_selector_params(call: &EvaluatedCall, input: &Value) -> Result<Value, LabeledError> {
|
|
|
|
let head = call.head;
|
|
|
|
let query: String = match call.get_flag("query")? {
|
|
|
|
Some(q2) => q2,
|
|
|
|
None => "".to_string(),
|
|
|
|
};
|
2024-01-22 21:00:43 +00:00
|
|
|
let as_html = call.has_flag("as-html")?;
|
2023-01-24 11:23:42 +00:00
|
|
|
let attribute = call.get_flag("attribute")?.unwrap_or_default();
|
|
|
|
let as_table: Value = call
|
|
|
|
.get_flag("as-table")?
|
|
|
|
.unwrap_or_else(|| Value::nothing(head));
|
2022-02-01 18:45:48 +00:00
|
|
|
|
2024-01-22 21:00:43 +00:00
|
|
|
let inspect = call.has_flag("inspect")?;
|
2022-02-01 18:45:48 +00:00
|
|
|
|
|
|
|
if !&query.is_empty() && ScraperSelector::parse(&query).is_err() {
|
|
|
|
return Err(LabeledError {
|
|
|
|
msg: "Cannot parse this query as a valid css selector".to_string(),
|
|
|
|
label: "Parse error".to_string(),
|
|
|
|
span: Some(head),
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
let selector = Selector {
|
|
|
|
query,
|
|
|
|
as_html,
|
|
|
|
attribute,
|
|
|
|
as_table,
|
|
|
|
inspect,
|
|
|
|
};
|
|
|
|
|
2023-09-03 14:27:29 +00:00
|
|
|
let span = input.span();
|
2022-02-01 18:45:48 +00:00
|
|
|
match input {
|
2023-09-03 14:27:29 +00:00
|
|
|
Value::String { val, .. } => Ok(begin_selector_query(val.to_string(), selector, span)),
|
2022-02-01 18:45:48 +00:00
|
|
|
_ => Err(LabeledError {
|
|
|
|
label: "requires text input".to_string(),
|
|
|
|
msg: "Expected text from pipeline".to_string(),
|
2023-09-03 14:27:29 +00:00
|
|
|
span: Some(span),
|
2022-02-01 18:45:48 +00:00
|
|
|
}),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn begin_selector_query(input_html: String, selector: Selector, span: Span) -> Value {
|
|
|
|
if let Value::List { .. } = selector.as_table {
|
|
|
|
return retrieve_tables(
|
|
|
|
input_html.as_str(),
|
|
|
|
&selector.as_table,
|
|
|
|
selector.inspect,
|
|
|
|
span,
|
|
|
|
);
|
2023-01-24 11:23:42 +00:00
|
|
|
} else if selector.attribute.is_empty() {
|
|
|
|
execute_selector_query(
|
|
|
|
input_html.as_str(),
|
|
|
|
selector.query.as_str(),
|
|
|
|
selector.as_html,
|
|
|
|
selector.inspect,
|
|
|
|
span,
|
|
|
|
)
|
2022-02-01 18:45:48 +00:00
|
|
|
} else {
|
2023-01-24 11:23:42 +00:00
|
|
|
execute_selector_query_with_attribute(
|
|
|
|
input_html.as_str(),
|
|
|
|
selector.query.as_str(),
|
|
|
|
selector.attribute.as_str(),
|
|
|
|
selector.inspect,
|
|
|
|
span,
|
|
|
|
)
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn retrieve_tables(
|
|
|
|
input_string: &str,
|
|
|
|
columns: &Value,
|
|
|
|
inspect_mode: bool,
|
|
|
|
span: Span,
|
|
|
|
) -> Value {
|
|
|
|
let html = input_string;
|
|
|
|
let mut cols: Vec<String> = Vec::new();
|
|
|
|
if let Value::List { vals, .. } = &columns {
|
|
|
|
for x in vals {
|
|
|
|
if let Value::String { val, .. } = x {
|
|
|
|
cols.push(val.to_string())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if inspect_mode {
|
update query web wiki example (#11709)
# Description
This PR tries to make `query web` more resilient and easier to debug
with the `--inspect` parameter when trying to scrape tables. Previously
it would just fail, now at least it tries to give you a hint.
This is some example output now of when something went wrong.
```
❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [Rank City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or union territory'] --inspect
Passed in Column Headers = ["Rank", "City", "Population(2011)[3]", "Population(2001)[3][a]", "State or union territory"]
First 2048 HTML chars = <!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>List of cities in India by population - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["",
"January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9ecdad8f-2dbd-4245-b54d-9c57aea5ca45","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_in_India_by_population","wgTitle":"List of cities in India by population","wgCurRevisionId":1192093210,"wgRev
Potential HTML Headers = ["City", "Population(2011)[3]", "Population(2001)[3][a]", "State or unionterritory", "Ref"]
Potential HTML Headers = ["City", "Population(2011)[5]", "Population(2001)", "State or unionterritory"]
Potential HTML Headers = [".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:\"[ \"}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:\" ]\"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}vtePopulation of cities in India"]
Potential HTML Headers = ["vteGeography of India"]
╭──────────────────────────┬─────────────────────────────────────────────────────╮
│ Rank │ error: no data found (column name may be incorrect) │
│ City │ error: no data found (column name may be incorrect) │
│ Population(2011)[3] │ error: no data found (column name may be incorrect) │
│ Population(2001)[3][a] │ error: no data found (column name may be incorrect) │
│ State or union territory │ error: no data found (column name may be incorrect) │
╰──────────────────────────┴─────────────────────────────────────────────────────╯
```
The key here is to look at the `Passed in Column Headers` and compare
them to the `Potential HTML Headers` and couple that with the error
table at the bottom should give you a hint that, in this situation,
wikipedia has changed the column names, yet again. So we need to update
our query web statement's tables to get closer to what we want.
```
❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or unionterritory' 'Ref']
╭─#──┬───────City───────┬─Population(2011)[3]─┬─Population(2001)[3][a]─┬─State or unionterritory─┬──Ref───╮
│ 0 │ Mumbai │ 12,442,373 │ 11,978,450 │ Maharashtra │ [3] │
│ 1 │ Delhi │ 11,034,555 │ 9,879,172 │ Delhi │ [3] │
│ 2 │ Bangalore │ 8,443,675 │ 5,682,293 │ Karnataka │ [3] │
│ 3 │ Hyderabad │ 6,993,262 │ 5,496,960 │ Telangana │ [3] │
│ 4 │ Ahmedabad │ 5,577,940 │ 4,470,006 │ Gujarat │ [3] │
│ 5 │ Chennai │ 4,646,732 │ 4,343,645 │ Tamil Nadu │ [3] │
│ 6 │ Kolkata │ 4,496,694 │ 4,580,546 │ West Bengal │ [3] │
│ 7 │ Surat │ 4,467,797 │ 2,788,126 │ Gujarat │ [3] │
│ 8 │ Pune │ 3,124,458 │ 2,538,473 │ Maharashtra │ [3] │
│ 9 │ Jaipur │ 3,046,163 │ 2,322,575 │ Rajasthan │ [3] │
│ 10 │ Lucknow │ 2,817,105 │ 2,185,927 │ Uttar Pradesh │ [3] │
│ 11 │ Kanpur │ 2,765,348 │ 2,551,337 │ Uttar Pradesh │ [3] │
│ 12 │ Nagpur │ 2,405,665 │ 2,052,066 │ Maharashtra │ [3] │
```
# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->
# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.
Make sure you've run and fixed any issues with these commands:
- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use std testing; testing run-tests --path
crates/nu-std"` to run the tests for the standard library
> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->
# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
2024-02-02 15:03:28 +00:00
|
|
|
eprintln!("Passed in Column Headers = {:?}\n", &cols);
|
|
|
|
eprintln!("First 2048 HTML chars = {}\n", &html[0..2047]);
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
|
update query web wiki example (#11709)
# Description
This PR tries to make `query web` more resilient and easier to debug
with the `--inspect` parameter when trying to scrape tables. Previously
it would just fail, now at least it tries to give you a hint.
This is some example output now of when something went wrong.
```
❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [Rank City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or union territory'] --inspect
Passed in Column Headers = ["Rank", "City", "Population(2011)[3]", "Population(2001)[3][a]", "State or union territory"]
First 2048 HTML chars = <!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>List of cities in India by population - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["",
"January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9ecdad8f-2dbd-4245-b54d-9c57aea5ca45","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_in_India_by_population","wgTitle":"List of cities in India by population","wgCurRevisionId":1192093210,"wgRev
Potential HTML Headers = ["City", "Population(2011)[3]", "Population(2001)[3][a]", "State or unionterritory", "Ref"]
Potential HTML Headers = ["City", "Population(2011)[5]", "Population(2001)", "State or unionterritory"]
Potential HTML Headers = [".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:\"[ \"}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:\" ]\"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}vtePopulation of cities in India"]
Potential HTML Headers = ["vteGeography of India"]
╭──────────────────────────┬─────────────────────────────────────────────────────╮
│ Rank │ error: no data found (column name may be incorrect) │
│ City │ error: no data found (column name may be incorrect) │
│ Population(2011)[3] │ error: no data found (column name may be incorrect) │
│ Population(2001)[3][a] │ error: no data found (column name may be incorrect) │
│ State or union territory │ error: no data found (column name may be incorrect) │
╰──────────────────────────┴─────────────────────────────────────────────────────╯
```
The key here is to look at the `Passed in Column Headers` and compare
them to the `Potential HTML Headers` and couple that with the error
table at the bottom should give you a hint that, in this situation,
wikipedia has changed the column names, yet again. So we need to update
our query web statement's tables to get closer to what we want.
```
❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or unionterritory' 'Ref']
╭─#──┬───────City───────┬─Population(2011)[3]─┬─Population(2001)[3][a]─┬─State or unionterritory─┬──Ref───╮
│ 0 │ Mumbai │ 12,442,373 │ 11,978,450 │ Maharashtra │ [3] │
│ 1 │ Delhi │ 11,034,555 │ 9,879,172 │ Delhi │ [3] │
│ 2 │ Bangalore │ 8,443,675 │ 5,682,293 │ Karnataka │ [3] │
│ 3 │ Hyderabad │ 6,993,262 │ 5,496,960 │ Telangana │ [3] │
│ 4 │ Ahmedabad │ 5,577,940 │ 4,470,006 │ Gujarat │ [3] │
│ 5 │ Chennai │ 4,646,732 │ 4,343,645 │ Tamil Nadu │ [3] │
│ 6 │ Kolkata │ 4,496,694 │ 4,580,546 │ West Bengal │ [3] │
│ 7 │ Surat │ 4,467,797 │ 2,788,126 │ Gujarat │ [3] │
│ 8 │ Pune │ 3,124,458 │ 2,538,473 │ Maharashtra │ [3] │
│ 9 │ Jaipur │ 3,046,163 │ 2,322,575 │ Rajasthan │ [3] │
│ 10 │ Lucknow │ 2,817,105 │ 2,185,927 │ Uttar Pradesh │ [3] │
│ 11 │ Kanpur │ 2,765,348 │ 2,551,337 │ Uttar Pradesh │ [3] │
│ 12 │ Nagpur │ 2,405,665 │ 2,052,066 │ Maharashtra │ [3] │
```
# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->
# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.
Make sure you've run and fixed any issues with these commands:
- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use std testing; testing run-tests --path
crates/nu-std"` to run the tests for the standard library
> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->
# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
2024-02-02 15:03:28 +00:00
|
|
|
let tables = match WebTable::find_by_headers(html, &cols, inspect_mode) {
|
2022-02-01 18:45:48 +00:00
|
|
|
Some(t) => {
|
|
|
|
if inspect_mode {
|
|
|
|
eprintln!("Table Found = {:#?}", &t);
|
|
|
|
}
|
|
|
|
t
|
|
|
|
}
|
|
|
|
None => vec![WebTable::empty()],
|
|
|
|
};
|
|
|
|
|
|
|
|
if tables.len() == 1 {
|
|
|
|
return retrieve_table(
|
2022-04-27 12:38:36 +00:00
|
|
|
tables.into_iter().next().expect("Error retrieving table"),
|
2022-02-01 18:45:48 +00:00
|
|
|
columns,
|
|
|
|
span,
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
let vals = tables
|
|
|
|
.into_iter()
|
|
|
|
.map(move |table| retrieve_table(table, columns, span))
|
|
|
|
.collect();
|
|
|
|
|
2023-09-03 14:27:29 +00:00
|
|
|
Value::list(vals, span)
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
fn retrieve_table(mut table: WebTable, columns: &Value, span: Span) -> Value {
|
|
|
|
let mut cols: Vec<String> = Vec::new();
|
|
|
|
if let Value::List { vals, .. } = &columns {
|
|
|
|
for x in vals {
|
|
|
|
// TODO Find a way to get the Config object here
|
|
|
|
if let Value::String { val, .. } = x {
|
|
|
|
cols.push(val.to_string())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if cols.is_empty() && !table.headers().is_empty() {
|
|
|
|
for col in table.headers().keys() {
|
|
|
|
cols.push(col.to_string());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
update query web wiki example (#11709)
# Description
This PR tries to make `query web` more resilient and easier to debug
with the `--inspect` parameter when trying to scrape tables. Previously
it would just fail, now at least it tries to give you a hint.
This is some example output now of when something went wrong.
```
❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [Rank City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or union territory'] --inspect
Passed in Column Headers = ["Rank", "City", "Population(2011)[3]", "Population(2001)[3][a]", "State or union territory"]
First 2048 HTML chars = <!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>List of cities in India by population - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["",
"January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9ecdad8f-2dbd-4245-b54d-9c57aea5ca45","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_in_India_by_population","wgTitle":"List of cities in India by population","wgCurRevisionId":1192093210,"wgRev
Potential HTML Headers = ["City", "Population(2011)[3]", "Population(2001)[3][a]", "State or unionterritory", "Ref"]
Potential HTML Headers = ["City", "Population(2011)[5]", "Population(2001)", "State or unionterritory"]
Potential HTML Headers = [".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:\"[ \"}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:\" ]\"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}vtePopulation of cities in India"]
Potential HTML Headers = ["vteGeography of India"]
╭──────────────────────────┬─────────────────────────────────────────────────────╮
│ Rank │ error: no data found (column name may be incorrect) │
│ City │ error: no data found (column name may be incorrect) │
│ Population(2011)[3] │ error: no data found (column name may be incorrect) │
│ Population(2001)[3][a] │ error: no data found (column name may be incorrect) │
│ State or union territory │ error: no data found (column name may be incorrect) │
╰──────────────────────────┴─────────────────────────────────────────────────────╯
```
The key here is to look at the `Passed in Column Headers` and compare
them to the `Potential HTML Headers` and couple that with the error
table at the bottom should give you a hint that, in this situation,
wikipedia has changed the column names, yet again. So we need to update
our query web statement's tables to get closer to what we want.
```
❯ http get https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population | query web --as-table [City 'Population(2011)[3]' 'Population(2001)[3][a]' 'State or unionterritory' 'Ref']
╭─#──┬───────City───────┬─Population(2011)[3]─┬─Population(2001)[3][a]─┬─State or unionterritory─┬──Ref───╮
│ 0 │ Mumbai │ 12,442,373 │ 11,978,450 │ Maharashtra │ [3] │
│ 1 │ Delhi │ 11,034,555 │ 9,879,172 │ Delhi │ [3] │
│ 2 │ Bangalore │ 8,443,675 │ 5,682,293 │ Karnataka │ [3] │
│ 3 │ Hyderabad │ 6,993,262 │ 5,496,960 │ Telangana │ [3] │
│ 4 │ Ahmedabad │ 5,577,940 │ 4,470,006 │ Gujarat │ [3] │
│ 5 │ Chennai │ 4,646,732 │ 4,343,645 │ Tamil Nadu │ [3] │
│ 6 │ Kolkata │ 4,496,694 │ 4,580,546 │ West Bengal │ [3] │
│ 7 │ Surat │ 4,467,797 │ 2,788,126 │ Gujarat │ [3] │
│ 8 │ Pune │ 3,124,458 │ 2,538,473 │ Maharashtra │ [3] │
│ 9 │ Jaipur │ 3,046,163 │ 2,322,575 │ Rajasthan │ [3] │
│ 10 │ Lucknow │ 2,817,105 │ 2,185,927 │ Uttar Pradesh │ [3] │
│ 11 │ Kanpur │ 2,765,348 │ 2,551,337 │ Uttar Pradesh │ [3] │
│ 12 │ Nagpur │ 2,405,665 │ 2,052,066 │ Maharashtra │ [3] │
```
# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->
# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.
Make sure you've run and fixed any issues with these commands:
- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use std testing; testing run-tests --path
crates/nu-std"` to run the tests for the standard library
> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->
# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
2024-02-02 15:03:28 +00:00
|
|
|
// We provided columns but the table has no headers, so we'll just make a single column table
|
|
|
|
if !cols.is_empty() && table.headers().is_empty() {
|
|
|
|
let mut record = Record::new();
|
|
|
|
for col in &cols {
|
|
|
|
record.push(
|
|
|
|
col.clone(),
|
|
|
|
Value::string("error: no data found (column name may be incorrect)", span),
|
|
|
|
);
|
|
|
|
}
|
|
|
|
return Value::record(record, span);
|
|
|
|
}
|
|
|
|
|
2022-02-01 18:45:48 +00:00
|
|
|
let mut table_out = Vec::new();
|
|
|
|
// sometimes there are tables where the first column is the headers, kind of like
|
|
|
|
// a table has been rotated ccw 90 degrees, in these cases all columns will be missing
|
|
|
|
// we keep track of this with this variable so we can deal with it later
|
|
|
|
let mut at_least_one_row_filled = false;
|
|
|
|
// if columns are still empty, let's just make a single column table with the data
|
|
|
|
if cols.is_empty() {
|
|
|
|
at_least_one_row_filled = true;
|
|
|
|
let table_with_no_empties: Vec<_> = table.iter().filter(|item| !item.is_empty()).collect();
|
|
|
|
|
Create `Record` type (#10103)
# Description
This PR creates a new `Record` type to reduce duplicate code and
possibly bugs as well. (This is an edited version of #9648.)
- `Record` implements `FromIterator` and `IntoIterator` and so can be
iterated over or collected into. For example, this helps with
conversions to and from (hash)maps. (Also, no more
`cols.iter().zip(vals)`!)
- `Record` has a `push(col, val)` function to help ensure that the
number of columns is equal to the number of values. I caught a few
potential bugs thanks to this (e.g. in the `ls` command).
- Finally, this PR also adds a `record!` macro that helps simplify
record creation. It is used like so:
```rust
record! {
"key1" => some_value,
"key2" => Value::string("text", span),
"key3" => Value::int(optional_int.unwrap_or(0), span),
"key4" => Value::bool(config.setting, span),
}
```
Since macros hinder formatting, etc., the right hand side values should
be relatively short and sweet like the examples above.
Where possible, prefer `record!` or `.collect()` on an iterator instead
of multiple `Record::push`s, since the first two automatically set the
record capacity and do less work overall.
# User-Facing Changes
Besides the changes in `nu-protocol` the only other breaking changes are
to `nu-table::{ExpandedTable::build_map, JustTable::kv_table}`.
2023-08-24 19:50:29 +00:00
|
|
|
let mut record = Record::new();
|
2022-02-01 18:45:48 +00:00
|
|
|
for row in &table_with_no_empties {
|
|
|
|
for (counter, cell) in row.iter().enumerate() {
|
Create `Record` type (#10103)
# Description
This PR creates a new `Record` type to reduce duplicate code and
possibly bugs as well. (This is an edited version of #9648.)
- `Record` implements `FromIterator` and `IntoIterator` and so can be
iterated over or collected into. For example, this helps with
conversions to and from (hash)maps. (Also, no more
`cols.iter().zip(vals)`!)
- `Record` has a `push(col, val)` function to help insure that the
number of columns is equal to the number of values. I caught a few
potential bugs thanks to this (e.g. in the `ls` command).
- Finally, this PR also adds a `record!` macro that helps simplify
record creation. It is used like so:
```rust
record! {
"key1" => some_value,
"key2" => Value::string("text", span),
"key3" => Value::int(optional_int.unwrap_or(0), span),
"key4" => Value::bool(config.setting, span),
}
```
Since macros hinder formatting, etc., the right hand side values should
be relatively short and sweet like the examples above.
Where possible, prefer `record!` or `.collect()` on an iterator instead
of multiple `Record::push`s, since the first two automatically set the
record capacity and do less work overall.
# User-Facing Changes
Besides the changes in `nu-protocol` the only other breaking changes are
to `nu-table::{ExpandedTable::build_map, JustTable::kv_table}`.
2023-08-24 19:50:29 +00:00
|
|
|
record.push(format!("column{counter}"), Value::string(cell, span));
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
}
|
Create `Record` type (#10103)
# Description
This PR creates a new `Record` type to reduce duplicate code and
possibly bugs as well. (This is an edited version of #9648.)
- `Record` implements `FromIterator` and `IntoIterator` and so can be
iterated over or collected into. For example, this helps with
conversions to and from (hash)maps. (Also, no more
`cols.iter().zip(vals)`!)
- `Record` has a `push(col, val)` function to help insure that the
number of columns is equal to the number of values. I caught a few
potential bugs thanks to this (e.g. in the `ls` command).
- Finally, this PR also adds a `record!` macro that helps simplify
record creation. It is used like so:
```rust
record! {
"key1" => some_value,
"key2" => Value::string("text", span),
"key3" => Value::int(optional_int.unwrap_or(0), span),
"key4" => Value::bool(config.setting, span),
}
```
Since macros hinder formatting, etc., the right hand side values should
be relatively short and sweet like the examples above.
Where possible, prefer `record!` or `.collect()` on an iterator instead
of multiple `Record::push`s, since the first two automatically set the
record capacity and do less work overall.
# User-Facing Changes
Besides the changes in `nu-protocol` the only other breaking changes are
to `nu-table::{ExpandedTable::build_map, JustTable::kv_table}`.
2023-08-24 19:50:29 +00:00
|
|
|
table_out.push(Value::record(record, span))
|
2022-02-01 18:45:48 +00:00
|
|
|
} else {
|
|
|
|
for row in &table {
|
2023-11-22 22:48:48 +00:00
|
|
|
let record = cols
|
|
|
|
.iter()
|
|
|
|
.map(|col| {
|
|
|
|
let val = row
|
|
|
|
.get(col)
|
|
|
|
.unwrap_or(&format!("Missing column: '{}'", &col))
|
|
|
|
.to_string();
|
|
|
|
|
|
|
|
if !at_least_one_row_filled && val != format!("Missing column: '{}'", &col) {
|
|
|
|
at_least_one_row_filled = true;
|
|
|
|
}
|
|
|
|
(col.clone(), Value::string(val, span))
|
|
|
|
})
|
|
|
|
.collect();
|
|
|
|
table_out.push(Value::record(record, span))
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if !at_least_one_row_filled {
|
|
|
|
let mut data2 = Vec::new();
|
|
|
|
for x in &table.data {
|
|
|
|
data2.push(x.join(", "));
|
|
|
|
}
|
|
|
|
table.data = vec![data2];
|
|
|
|
return retrieve_table(table, columns, span);
|
|
|
|
}
|
|
|
|
// table_out
|
|
|
|
|
2023-09-03 14:27:29 +00:00
|
|
|
Value::list(table_out, span)
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
fn execute_selector_query_with_attribute(
|
|
|
|
input_string: &str,
|
|
|
|
query_string: &str,
|
|
|
|
attribute: &str,
|
2022-04-27 12:38:36 +00:00
|
|
|
inspect: bool,
|
2022-02-01 18:45:48 +00:00
|
|
|
span: Span,
|
|
|
|
) -> Value {
|
|
|
|
let doc = Html::parse_fragment(input_string);
|
|
|
|
|
|
|
|
let vals: Vec<Value> = doc
|
2022-04-27 12:38:36 +00:00
|
|
|
.select(&css(query_string, inspect))
|
2022-02-01 18:45:48 +00:00
|
|
|
.map(|selection| {
|
|
|
|
Value::string(
|
|
|
|
selection.value().attr(attribute).unwrap_or("").to_string(),
|
|
|
|
span,
|
|
|
|
)
|
|
|
|
})
|
|
|
|
.collect();
|
2023-09-03 14:27:29 +00:00
|
|
|
Value::list(vals, span)
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
fn execute_selector_query(
|
|
|
|
input_string: &str,
|
|
|
|
query_string: &str,
|
|
|
|
as_html: bool,
|
2022-04-27 12:38:36 +00:00
|
|
|
inspect: bool,
|
2022-02-01 18:45:48 +00:00
|
|
|
span: Span,
|
|
|
|
) -> Value {
|
|
|
|
let doc = Html::parse_fragment(input_string);
|
|
|
|
|
|
|
|
let vals: Vec<Value> = match as_html {
|
|
|
|
true => doc
|
2022-04-27 12:38:36 +00:00
|
|
|
.select(&css(query_string, inspect))
|
2022-02-01 18:45:48 +00:00
|
|
|
.map(|selection| Value::string(selection.html(), span))
|
|
|
|
.collect(),
|
|
|
|
false => doc
|
2022-04-27 12:38:36 +00:00
|
|
|
.select(&css(query_string, inspect))
|
2022-02-01 18:45:48 +00:00
|
|
|
.map(|selection| {
|
|
|
|
Value::string(
|
|
|
|
selection
|
|
|
|
.text()
|
2023-01-30 01:37:54 +00:00
|
|
|
.fold("".to_string(), |acc, x| format!("{acc}{x}")),
|
2022-02-01 18:45:48 +00:00
|
|
|
span,
|
|
|
|
)
|
|
|
|
})
|
|
|
|
.collect(),
|
|
|
|
};
|
|
|
|
|
2023-09-03 14:27:29 +00:00
|
|
|
Value::list(vals, span)
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
|
2022-04-27 12:38:36 +00:00
|
|
|
pub fn css(selector: &str, inspect: bool) -> ScraperSelector {
|
|
|
|
if inspect {
|
|
|
|
ScraperSelector::parse("html").expect("Error unwrapping the default scraperselector")
|
|
|
|
} else {
|
|
|
|
ScraperSelector::parse(selector).expect("Error unwrapping scraperselector::parse")
|
|
|
|
}
|
2022-02-01 18:45:48 +00:00
|
|
|
}
|
|
|
|
|
2022-02-23 16:43:36 +00:00
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal HTML fixture: an unordered list with three items.
    const SIMPLE_LIST: &str = r#"
<ul>
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ul>
"#;

    /// Queries the first `<li>` of `SIMPLE_LIST` as plain text, with
    /// inspection disabled.
    fn first_list_item() -> Value {
        execute_selector_query(SIMPLE_LIST, "li:first-child", false, false, Span::test_data())
    }

    #[test]
    fn test_first_child_is_not_empty() {
        let selected = first_list_item();
        assert!(!selected.is_empty())
    }

    #[test]
    fn test_first_child() {
        let config = nu_protocol::Config::default();
        let rendered = first_list_item().into_string("\n", &config);
        assert_eq!("[Coffee]".to_string(), rendered)
    }
}
|