mirror of
https://github.com/rust-lang-nursery/rust-cookbook
synced 2024-11-21 19:13:07 +00:00
web/scraping: base url should be the whole path, otherwise #label
links are interpreted wrong
This commit is contained in:
parent
752b035c1a
commit
636c288a58
1 changed files with 20 additions and 19 deletions
|
@ -17,7 +17,7 @@ use reqwest::StatusCode;
|
|||
use select::document::Document;
|
||||
use select::predicate::Name;
|
||||
use std::collections::HashSet;
|
||||
use url::{Position, Url};
|
||||
use url::Url;
|
||||
|
||||
error_chain! {
|
||||
foreign_links {
|
||||
|
@ -28,30 +28,31 @@ error_chain! {
|
|||
}
|
||||
}
|
||||
|
||||
async fn get_base_url(url: &Url, doc: &Document) -> Result<Url> {
|
||||
let base_tag_href = doc.find(Name("base")).filter_map(|n| n.attr("href")).nth(0);
|
||||
let base_url =
|
||||
base_tag_href.map_or_else(|| Url::parse(&url[..Position::BeforePath]), Url::parse)?;
|
||||
Ok(base_url)
|
||||
fn get_base_url(doc: &Document) -> Result<Option<Url>> {
|
||||
let base_tag_href = doc.find(Name("base")).filter_map(|n| n.attr("href")).nth(0);
|
||||
match base_tag_href {
|
||||
Some(href) => Ok(Some(Url::parse(href)?)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
async fn check_link(url: &Url) -> Result<bool> {
|
||||
let res = reqwest::get(url.as_ref()).await?;
|
||||
Ok(res.status() != StatusCode::NOT_FOUND)
|
||||
let res = reqwest::get(url.as_ref()).await?;
|
||||
Ok(res.status() != StatusCode::NOT_FOUND)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let url = Url::parse("https://www.rust-lang.org/en-US/")?;
|
||||
let res = reqwest::get(url.as_ref()).await?.text().await?;
|
||||
let document = Document::from(res.as_str());
|
||||
let base_url = get_base_url(&url, &document).await?;
|
||||
let base_parser = Url::options().base_url(Some(&base_url));
|
||||
let links: HashSet<Url> = document
|
||||
.find(Name("a"))
|
||||
.filter_map(|n| n.attr("href"))
|
||||
.filter_map(|link| base_parser.parse(link).ok())
|
||||
.collect();
|
||||
let url = Url::parse("https://www.rust-lang.org/en-US/")?;
|
||||
let res = reqwest::get(url.as_ref()).await?.text().await?;
|
||||
let document = Document::from(res.as_str());
|
||||
let base_url = get_base_url(&document)?.unwrap_or(url);
|
||||
let base_parser = Url::options().base_url(Some(&base_url));
|
||||
let links: HashSet<Url> = document
|
||||
.find(Name("a"))
|
||||
.filter_map(|n| n.attr("href"))
|
||||
.filter_map(|link| base_parser.parse(link).ok())
|
||||
.collect();
|
||||
let mut tasks = vec![];
|
||||
|
||||
for link in links {
|
||||
|
@ -68,7 +69,7 @@ async fn main() -> Result<()> {
|
|||
task.await?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
|
|
Loading…
Reference in a new issue