feat: add query webpage-info to plugin_nu_query (#13252)

# Description

This PR adds a new subcommand `query webpage-info` to `plugin_nu_query`.
The subcommand is a basic wrapper for the
[`webpage`](https://crates.io/crates/webpage) crate.

Usage:

```
http get https://phoronix.com | query webpage-info
```

and it returns a `Record` version of
[`webpage::HTML`](https://docs.rs/webpage/latest/webpage/struct.HTML.html).

The PR also takes a shot at bringing @lily-mara's
[nu-serde::to_value](https://github.com/nushell/nushell/pull/3878/files)
back to life, updating it for the latest version of nushell. That's not
the main focus of the PR though - I just didn't want to have to
implement a custom converter for `webpage::HTML` 😅. If it looks
reasonable we could move it to `nu_protocol`(?) either in this PR or a
future one (along with adding tests for it).

# User-Facing Changes

no breaking changes
This commit is contained in:
Andy Gayton 2024-06-29 17:13:31 -04:00 committed by GitHub
parent 33d0537cae
commit 4fe0f860a8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 588 additions and 4 deletions

102
Cargo.lock generated
View file

@ -1117,6 +1117,36 @@ dependencies = [
"windows-sys 0.52.0", "windows-sys 0.52.0",
] ]
[[package]]
name = "curl"
version = "0.4.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e2161dd6eba090ff1594084e95fd67aeccf04382ffea77999ea94ed42ec67b6"
dependencies = [
"curl-sys",
"libc",
"openssl-probe",
"openssl-sys",
"schannel",
"socket2",
"windows-sys 0.52.0",
]
[[package]]
name = "curl-sys"
version = "0.4.73+curl-8.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "450ab250ecf17227c39afb9a2dd9261dc0035cb80f2612472fc0c4aac2dcb84d"
dependencies = [
"cc",
"libc",
"libz-sys",
"openssl-sys",
"pkg-config",
"vcpkg",
"windows-sys 0.52.0",
]
[[package]] [[package]]
name = "deranged" name = "deranged"
version = "0.3.11" version = "0.3.11"
@ -1883,12 +1913,26 @@ checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [ dependencies = [
"log", "log",
"mac", "mac",
"markup5ever", "markup5ever 0.11.0",
"proc-macro2", "proc-macro2",
"quote", "quote",
"syn 1.0.109", "syn 1.0.109",
] ]
[[package]]
name = "html5ever"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
dependencies = [
"log",
"mac",
"markup5ever 0.12.1",
"proc-macro2",
"quote",
"syn 2.0.60",
]
[[package]] [[package]]
name = "http" name = "http"
version = "0.2.12" version = "0.2.12"
@ -2552,6 +2596,32 @@ dependencies = [
"tendril", "tendril",
] ]
[[package]]
name = "markup5ever"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45"
dependencies = [
"log",
"phf 0.11.2",
"phf_codegen 0.11.2",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18"
dependencies = [
"html5ever 0.27.0",
"markup5ever 0.12.1",
"tendril",
"xml5ever",
]
[[package]] [[package]]
name = "md-5" name = "md-5"
version = "0.10.6" version = "0.10.6"
@ -3463,8 +3533,11 @@ dependencies = [
"nu-plugin", "nu-plugin",
"nu-protocol", "nu-protocol",
"scraper", "scraper",
"serde",
"serde_json",
"sxd-document", "sxd-document",
"sxd-xpath", "sxd-xpath",
"webpage",
] ]
[[package]] [[package]]
@ -5283,7 +5356,7 @@ dependencies = [
"ahash 0.8.11", "ahash 0.8.11",
"cssparser", "cssparser",
"ego-tree", "ego-tree",
"html5ever", "html5ever 0.26.0",
"once_cell", "once_cell",
"selectors", "selectors",
"tendril", "tendril",
@ -6797,6 +6870,20 @@ dependencies = [
"pkg-config", "pkg-config",
] ]
[[package]]
name = "webpage"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70862efc041d46e6bbaa82bb9c34ae0596d090e86cbd14bd9e93b36ee6802eac"
dependencies = [
"curl",
"html5ever 0.27.0",
"markup5ever_rcdom",
"serde",
"serde_json",
"url",
]
[[package]] [[package]]
name = "which" name = "which"
version = "6.0.1" version = "6.0.1"
@ -7209,6 +7296,17 @@ dependencies = [
"rustix", "rustix",
] ]
[[package]]
name = "xml5ever"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bbb26405d8e919bc1547a5aa9abc95cbfa438f04844f5fdd9dc7596b748bf69"
dependencies = [
"log",
"mac",
"markup5ever 0.12.1",
]
[[package]] [[package]]
name = "xxhash-rust" name = "xxhash-rust"
version = "0.8.10" version = "0.8.10"

View file

@ -23,3 +23,6 @@ gjson = "0.8"
scraper = { default-features = false, version = "0.19" } scraper = { default-features = false, version = "0.19" }
sxd-document = "0.3" sxd-document = "0.3"
sxd-xpath = "0.4" sxd-xpath = "0.4"
webpage = { version = "2.0.1", features = ["serde"] }
serde_json.workspace = true
serde.workspace = true

View file

@ -1,6 +1,7 @@
mod query; mod query;
mod query_json; mod query_json;
mod query_web; mod query_web;
mod query_webpage_info;
mod query_xml; mod query_xml;
mod web_tables; mod web_tables;

View file

@ -1,4 +1,7 @@
use crate::{query_json::QueryJson, query_web::QueryWeb, query_xml::QueryXml}; use crate::{
query_json::QueryJson, query_web::QueryWeb, query_webpage_info::QueryWebpageInfo,
query_xml::QueryXml,
};
use nu_plugin::{EvaluatedCall, Plugin, PluginCommand, SimplePluginCommand}; use nu_plugin::{EvaluatedCall, Plugin, PluginCommand, SimplePluginCommand};
use nu_protocol::{Category, LabeledError, Signature, Value}; use nu_protocol::{Category, LabeledError, Signature, Value};
@ -26,6 +29,7 @@ impl Plugin for Query {
Box::new(QueryJson), Box::new(QueryJson),
Box::new(QueryXml), Box::new(QueryXml),
Box::new(QueryWeb), Box::new(QueryWeb),
Box::new(QueryWebpageInfo),
] ]
} }
} }

View file

@ -0,0 +1,478 @@
use nu_plugin::{EngineInterface, EvaluatedCall, SimplePluginCommand};
use nu_protocol::{Category, Example, LabeledError, Record, Signature, Span, Type, Value};
use crate::Query;
/// Plugin command type backing the `query webpage-info` subcommand.
pub struct QueryWebpageInfo;
/// Wires `query webpage-info` into the `Query` plugin: string in, record out.
impl SimplePluginCommand for QueryWebpageInfo {
    type Plugin = Query;

    /// Full command name as typed by the user.
    fn name(&self) -> &str {
        "query webpage-info"
    }

    /// One-line help text for the command.
    fn usage(&self) -> &str {
        "uses the webpage crate to extract info from html: title, description, language, links, RSS feeds, Opengraph, Schema.org, and more"
    }

    /// Declares a single string -> record signature in the network category.
    fn signature(&self) -> Signature {
        let base = Signature::build(self.name());
        base.input_output_type(Type::String, Type::record())
            .category(Category::Network)
    }

    /// Usage examples shown in the command help.
    fn examples(&self) -> Vec<Example> {
        web_examples()
    }

    /// Runs the command on the pipeline input.
    ///
    /// String input is handed to `execute_webpage`; any other value is
    /// rejected with a labeled error pointing at the input's span.
    fn run(
        &self,
        _plugin: &Query,
        _engine: &EngineInterface,
        _call: &EvaluatedCall,
        input: &Value,
    ) -> Result<Value, LabeledError> {
        let span = input.span();
        if let Value::String { val, .. } = input {
            return execute_webpage(val, span);
        }
        let rejection = LabeledError::new("Requires text input")
            .with_label("expected text from pipeline", span);
        Err(rejection)
    }
}
/// Returns the examples listed in the help of `query webpage-info`.
///
/// The single example has no expected result attached.
pub fn web_examples() -> Vec<Example<'static>> {
    let phoronix = Example {
        description: "extract detailed info from phoronix.com website",
        example: "http get https://phoronix.com | query webpage-info",
        result: None,
    };
    vec![phoronix]
}
fn execute_webpage(html: &str, span: Span) -> Result<Value, LabeledError> {
let info = webpage::HTML::from_string(html.to_string(), None)
.map_err(|e| LabeledError::new(e.to_string()).with_label("error parsing html", span))?;
let value = to_value(info, span).map_err(|e| {
LabeledError::new(e.to_string()).with_label("error convert Value::Record", span)
})?;
Ok(value)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal document: only the <title> element carries data.
    const HTML: &str = r#"
<html><head><meta><title>My Title</title></head></html>
"#;

    /// The page title must surface as the `title` field of the returned record.
    #[test]
    fn test_basics() {
        let value = execute_webpage(HTML, Span::test_data()).unwrap();
        let fields = value.as_record().unwrap();
        let title = fields.get("title").unwrap().as_str().unwrap();
        assert_eq!(title, "My Title");
    }
}
// revive nu-serde sketch
use serde::Serialize;
/// Converts any `serde::Serialize` value into a `nu_protocol::Value`.
///
/// Every value produced during the conversion is tagged with `span`.
///
/// # Errors
///
/// Returns whatever [`Error`] the serializer reports while walking `value`.
pub fn to_value<T>(value: T, span: Span) -> Result<Value, Error>
where
    T: Serialize,
{
    let serializer = ValueSerializer { span };
    value.serialize(&serializer)
}
/// Serializer that turns `serde::Serialize` values into `Value`s.
struct ValueSerializer {
// Span handed to every `Value` constructor during serialization.
span: Span,
}
/// In-progress record for maps, structs and struct variants.
struct MapSerializer<'a> {
// Entries collected so far; wrapped into a record value by `end`.
record: Record,
// Parent serializer, used for nested values and for its span.
serializer: &'a ValueSerializer,
// Key stored by `serialize_key` until the matching `serialize_value` takes it.
current_key: Option<String>,
}
impl<'a> serde::Serializer for &'a ValueSerializer {
type Ok = Value;
type Error = Error;
type SerializeSeq = SeqSerializer<'a>;
type SerializeTuple = SeqSerializer<'a>;
type SerializeTupleStruct = SeqSerializer<'a>;
type SerializeTupleVariant = SeqSerializer<'a>;
type SerializeMap = MapSerializer<'a>;
type SerializeStruct = MapSerializer<'a>;
type SerializeStructVariant = MapSerializer<'a>;
fn serialize_bool(self, v: bool) -> Result<Self::Ok, Self::Error> {
Ok(Value::bool(v, self.span))
}
fn serialize_i8(self, v: i8) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_i16(self, v: i16) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_i32(self, v: i32) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_i64(self, v: i64) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v, self.span))
}
fn serialize_u8(self, v: u8) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_u16(self, v: u16) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_u32(self, v: u32) -> Result<Self::Ok, Self::Error> {
Ok(Value::int(v.into(), self.span))
}
fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
// TODO: how to represent a u64 value a Value<i64>?
Err(Error::new("the numbers are too big"))
// Ok(Value::int(v.into(), self.span))
}
fn serialize_f32(self, v: f32) -> Result<Self::Ok, Self::Error> {
Ok(Value::float(v.into(), self.span))
}
fn serialize_f64(self, v: f64) -> Result<Self::Ok, Self::Error> {
Ok(Value::float(v, self.span))
}
fn serialize_char(self, v: char) -> Result<Self::Ok, Self::Error> {
Ok(Value::string(v, self.span))
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
Ok(Value::string(v, self.span))
}
fn serialize_bytes(self, v: &[u8]) -> Result<Self::Ok, Self::Error> {
Ok(Value::binary(v, self.span))
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::nothing(self.span))
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
value.serialize(self)
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
// TODO: is this OK?
Ok(Value::nothing(self.span))
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
// TODO: is this OK?
Ok(Value::nothing(self.span))
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
// TODO: is this OK?
Ok(Value::nothing(self.span))
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
value: &T,
) -> Result<Self::Ok, Self::Error>
where
T: Serialize,
{
value.serialize(self)
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Ok(SeqSerializer::new(self))
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Ok(SeqSerializer::new(self))
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleStruct, Self::Error> {
Ok(SeqSerializer::new(self))
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeTupleVariant, Self::Error> {
Ok(SeqSerializer::new(self))
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer::new(self))
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Ok(MapSerializer::new(self))
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize,
) -> Result<Self::SerializeStructVariant, Self::Error> {
Ok(MapSerializer::new(self))
}
}
/// Error reported while converting a `serde::Serialize` value into a `Value`.
pub struct Error {
// Human-readable description; printed as-is by the `Debug` and `Display` impls.
message: String,
}
impl Error {
pub fn new<T: std::fmt::Display>(msg: T) -> Self {
Error {
message: msg.to_string(),
}
}
}
impl serde::ser::Error for Error {
fn custom<T: std::fmt::Display>(msg: T) -> Self {
Error::new(msg.to_string())
}
}
/// `Debug` prints the bare message, exactly like `Display`.
impl std::fmt::Debug for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.message)
    }
}
/// `Display` prints the stored message verbatim.
impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.message)
    }
}
// Marks `Error` as a standard error type; every trait method keeps its default implementation.
impl std::error::Error for Error {}
//
// maps
impl<'a> MapSerializer<'a> {
fn new(serializer: &'a ValueSerializer) -> Self {
Self {
record: Record::new(),
current_key: None,
serializer,
}
}
}
impl<'a> serde::ser::SerializeStruct for MapSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: Serialize,
{
self.record
.insert(key.to_owned(), value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::record(self.record, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeMap for MapSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
let value = serde_json::to_value(key).map_err(Error::new)?;
let key = value
.as_str()
.ok_or(Error::new("key must be a string"))?
.to_string();
self.current_key = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
let key = self.current_key.take().ok_or(Error::new("key expected"))?;
self.record.insert(key, value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::record(self.record, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeStructVariant for MapSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: Serialize,
{
self.record
.insert(key.to_owned(), value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::record(self.record, self.serializer.span))
}
}
//
// sequences
/// In-progress list for sequences, tuples, tuple structs and tuple variants.
struct SeqSerializer<'a> {
// Elements collected so far; wrapped into a list value by `end`.
seq: Vec<Value>,
// Parent serializer, used for nested values and for its span.
serializer: &'a ValueSerializer,
}
impl<'a> SeqSerializer<'a> {
fn new(serializer: &'a ValueSerializer) -> Self {
Self {
seq: Vec::new(),
serializer,
}
}
}
impl<'a> serde::ser::SerializeSeq for SeqSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
self.seq.push(value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::list(self.seq, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeTuple for SeqSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
self.seq.push(value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::list(self.seq, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeTupleStruct for SeqSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_field<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
self.seq.push(value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::list(self.seq, self.serializer.span))
}
}
impl<'a> serde::ser::SerializeTupleVariant for SeqSerializer<'a> {
type Ok = Value;
type Error = Error;
fn serialize_field<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: Serialize,
{
self.seq.push(value.serialize(self.serializer)?);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(Value::list(self.seq, self.serializer.span))
}
}