mirror of
https://github.com/rust-lang-nursery/rust-cookbook
synced 2025-02-16 12:18:27 +00:00
Added "Check webpage for broken links" example
This commit is contained in:
parent
85c6f6b3da
commit
ba679b95d6
2 changed files with 81 additions and 0 deletions
|
@ -84,6 +84,7 @@ community. It needs and welcomes help. For details see
|
|||
| [POST a file to paste-rs][ex-file-post] | [![reqwest-badge]][reqwest] | [![cat-net-badge]][cat-net] |
|
||||
| [Listen on unused port TCP/IP][ex-random-port-tcp] | [![std-badge]][std] | [![cat-net-badge]][cat-net] |
|
||||
| [Extract all links from a webpage HTML][ex-extract-links-webpage] | [![reqwest-badge]][reqwest] [![select-badge]][select] | [![cat-net-badge]][cat-net] |
|
||||
| [Check webpage for broken links][ex-check-broken-links] | [![reqwest-badge]][reqwest] [![select-badge]][select] [![url-badge]][url] | [![cat-net-badge]][cat-net] |
|
||||
| [Extract all unique links from a MediaWiki markup][ex-extract-mediawiki-links] | [![reqwest-badge]][reqwest] [![regex-badge]][regex] | [![cat-net-badge]][cat-net] |
|
||||
|
||||
## [Application development](app.html)
|
||||
|
@ -134,6 +135,7 @@ community. It needs and welcomes help. For details see
|
|||
[ex-bitflags]: basics.html#ex-bitflags
|
||||
[ex-byteorder-le]: basics.html#ex-byteorder-le
|
||||
[ex-cc-static-bundled]: build_tools.html#ex-cc-static-bundled
|
||||
[ex-check-broken-links]: net.html#ex-check-broken-links
|
||||
[ex-clap-basic]: app.html#ex-clap-basic
|
||||
[ex-crossbeam-spawn]: concurrency.html#ex-crossbeam-spawn
|
||||
[ex-csv-serde]: encoding.html#ex-csv-serde
|
||||
|
|
79
src/net.md
79
src/net.md
|
@ -17,6 +17,7 @@
|
|||
| [POST a file to paste-rs][ex-file-post] | [![reqwest-badge]][reqwest] | [![cat-net-badge]][cat-net] |
|
||||
| [Listen on unused port TCP/IP][ex-random-port-tcp] | [![std-badge]][std] | [![cat-net-badge]][cat-net] |
|
||||
| [Extract all links from a webpage HTML][ex-extract-links-webpage] | [![reqwest-badge]][reqwest] [![select-badge]][select] | [![cat-net-badge]][cat-net] |
|
||||
| [Check webpage for broken links][ex-check-broken-links] | [![reqwest-badge]][reqwest] [![select-badge]][select] [![url-badge]][url] | [![cat-net-badge]][cat-net] |
|
||||
| [Extract all unique links from a MediaWiki markup][ex-extract-mediawiki-links] | [![reqwest-badge]][reqwest] [![regex-badge]][regex] | [![cat-net-badge]][cat-net] |
|
||||
|
||||
[ex-url-parse]: #ex-url-parse
|
||||
|
@ -876,6 +877,81 @@ fn run() -> Result<()> {
|
|||
# quick_main!(run);
|
||||
```
|
||||
|
||||
[ex-check-broken-links]: #ex-check-broken-links
|
||||
<a name="ex-check-broken-links"/>
|
||||
## Check a webpage for broken links
|
||||
|
||||
[![reqwest-badge]][reqwest] [![select-badge]][select] [![url-badge]][url] [![cat-net-badge]][cat-net]
|
||||
|
||||
We call `get_base_url` to retrieve the base URL. If the document has a `<base>` tag, we take the `href` [`attr`] from its first occurrence and use that as the base URL. Otherwise, we slice the original URL with [`Position::BeforePath`] to obtain the base of that URL.
|
||||
|
||||
We iterate through all the links in the document and get the absolute URL for each (using [`url::ParseOptions`] and [`Url::parse`]). We then filter these so that we can report which links are broken.
|
||||
|
||||
```rust,no_run
|
||||
# #[macro_use]
|
||||
# extern crate error_chain;
|
||||
extern crate reqwest;
|
||||
extern crate select;
|
||||
extern crate url;
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use url::{Url, Position};
|
||||
use reqwest::StatusCode;
|
||||
use select::document::Document;
|
||||
use select::predicate::Name;
|
||||
#
|
||||
# error_chain! {
|
||||
# foreign_links {
|
||||
# ReqError(reqwest::Error);
|
||||
# IoError(std::io::Error);
|
||||
# UrlParseError(url::ParseError);
|
||||
# }
|
||||
# }
|
||||
|
||||
fn get_base_url(url: &Url, doc: &Document) -> Result<Url> {
|
||||
let base_tag_href = doc.find(Name("base")).filter_map(|n| n.attr("href")).nth(0);
|
||||
|
||||
let base_url = base_tag_href.map_or_else(
|
||||
|| Url::parse(&url[..Position::BeforePath]),
|
||||
Url::parse,
|
||||
)?;
|
||||
|
||||
Ok(base_url)
|
||||
}
|
||||
|
||||
fn check_link(url: &Url) -> Result<bool> {
|
||||
let res = reqwest::get(url.as_ref())?;
|
||||
|
||||
Ok(res.status() != StatusCode::NotFound)
|
||||
}
|
||||
|
||||
fn run() -> Result<()> {
|
||||
let url = Url::parse("https://www.rust-lang.org/en-US/")?;
|
||||
|
||||
let res = reqwest::get(url.as_ref())?;
|
||||
let document = Document::from_read(res)?;
|
||||
|
||||
let base_url = get_base_url(&url, &document)?;
|
||||
|
||||
let base_parser = Url::options().base_url(Some(&base_url));
|
||||
|
||||
let links: HashSet<Url> = document
|
||||
.find(Name("a"))
|
||||
.filter_map(|n| n.attr("href"))
|
||||
.filter_map(|link| base_parser.parse(link).ok())
|
||||
.collect();
|
||||
|
||||
for link in links.iter().filter(|link| check_link(link).ok() == Some(false)) {
|
||||
println!("{} is broken.", link);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#
|
||||
# quick_main!(run);
|
||||
```
|
||||
|
||||
[ex-extract-mediawiki-links]: #ex-extract-mediawiki-links
|
||||
<a name="ex-extract-mediawiki-links"/>
|
||||
## Extract all unique links from a MediaWiki markup
|
||||
|
@ -959,6 +1035,7 @@ fn run() -> Result<()> {
|
|||
[`File`]: https://doc.rust-lang.org/std/fs/struct.File.html
|
||||
[`Ipv4Addr`]: https://doc.rust-lang.org/std/net/struct.Ipv4Addr.html
|
||||
[`Name`]: https://docs.rs/select/*/select/predicate/struct.Name.html
|
||||
[`Position::BeforePath`]: https://docs.rs/url/*/url/enum.Position.html#variant.BeforePath
|
||||
[`Regex::captures_iter`]: https://doc.rust-lang.org/regex/regex/struct.Regex.html#method.captures_iter
|
||||
[`RequestBuilder::basic_auth`]: https://docs.rs/reqwest/*/reqwest/struct.RequestBuilder.html#method.basic_auth
|
||||
[`RequestBuilder::body`]: https://docs.rs/reqwest/0.6.2/reqwest/struct.RequestBuilder.html#method.body
|
||||
|
@ -997,6 +1074,8 @@ fn run() -> Result<()> {
|
|||
[`serde_json::json!`]: https://docs.rs/serde_json/*/serde_json/macro.json.html
|
||||
[`std::iter::Iterator`]: https://doc.rust-lang.org/std/iter/trait.Iterator.html
|
||||
[`url::Position`]: https://docs.rs/url/*/url/enum.Position.html
|
||||
[`Url::parse`]: https://docs.rs/url/*/url/struct.Url.html#method.parse
|
||||
[`url::ParseOptions`]: https://docs.rs/url/*/url/struct.ParseOptions.html
|
||||
|
||||
<!-- Other Reference -->
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue