Look for links/images in HTML content in the Markdown

This commit is contained in:
Tom Parker-Shemilt 2020-01-18 14:01:05 +00:00
parent a37b1cfee3
commit 6cb9433de0
2 changed files with 36 additions and 14 deletions

View file

@ -17,3 +17,4 @@ env_logger = "0.7"
async-std = "1" async-std = "1"
log = "0.4" log = "0.4"
regex = "1" regex = "1"
scraper = "0.11"

View file

@ -11,7 +11,7 @@ use log::{warn, debug};
use std::io::Write; use std::io::Write;
use reqwest::{Client, redirect::Policy, StatusCode, header}; use reqwest::{Client, redirect::Policy, StatusCode, header};
use regex::Regex; use regex::Regex;
use scraper::{Html, Selector};
use failure::{Fail, Error, format_err}; use failure::{Fail, Error, format_err};
#[derive(Debug, Fail)] #[derive(Debug, Fail)]
@ -155,22 +155,43 @@ async fn main() -> Result<(), Error> {
let mut url_checks = vec![]; let mut url_checks = vec![];
// Queue a check for `url` unless it is filtered out.
//
// A URL is skipped when it does not start with "http" (the prefix test also
// accepts "https"), or when `results.working` already contains it
// (presumably URLs already known to be reachable; confirm against the
// definition of `results`). Everything else is handed to `get_url` and the
// boxed future is appended to `url_checks`.
let mut do_check = |url: String| {
    // Short-circuit order matters: the prefix test runs before the lookup,
    // exactly as the two separate guards would.
    let is_http = url.starts_with("http");
    if is_http && !results.working.contains(&url) {
        url_checks.push(get_url(url).boxed());
    }
};
for (event, _range) in parser.into_offset_iter() { for (event, _range) in parser.into_offset_iter() {
if let Event::Start(tag) = event { match event {
match tag { Event::Start(tag) => {
Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => { match tag {
if !url.starts_with("http") { Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => {
continue; do_check(url.to_string());
} }
let url_string = url.to_string(); _ => {}
if results.working.contains(&url_string) {
continue;
}
let check = get_url(url_string).boxed();
url_checks.push(check);
} }
_ => {}
} }
Event::Html(content) => {
    // Raw HTML embedded in the Markdown is parsed as a fragment, then
    // scanned for the attributes that carry URLs.
    let fragment = Html::parse_fragment(&content);
    // (tag, attribute) pairs, visited in this order: every <img src>
    // first, then every <a href>.
    for &(tag, attr) in &[("img", "src"), ("a", "href")] {
        // Both selectors are fixed literals, so `parse` is not expected
        // to fail here.
        let selector = Selector::parse(tag).unwrap();
        for node in fragment.select(&selector) {
            // Elements lacking the attribute are ignored.
            if let Some(target) = node.value().attr(attr) {
                do_check(target.to_string());
            }
        }
    }
}
_ => {}
} }
} }