use pulldown_cmark::{Parser, Event, Tag};
use std::fs;
use futures::future::{select_all, BoxFuture, FutureExt};
use std::collections::{BTreeSet, BTreeMap};
use serde::{Serialize, Deserialize};
use lazy_static::lazy_static;
use std::sync::atomic::{AtomicU32, Ordering};
use async_std::task;
use std::time;
use log::{warn, debug, info};
use std::io::Write;
use reqwest::{Client, redirect::Policy, StatusCode, header, Url};
use regex::Regex;
use scraper::{Html, Selector};
use failure::{Fail, Error, format_err};
use chrono::{Local, DateTime, Duration};

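// Everything that can go wrong while checking a single link. Each variant
// carries a human-readable description via the `failure` derive; the
// CI-badge-specific variants exist so that bad Travis / GitHub Actions
// badges produce actionable messages in the final report.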
#[derive(Debug, Fail)]
enum CheckerError {
    #[fail(display = "failed to try url")]
    NotTried, // Generally shouldn't happen, but useful to have

    #[fail(display = "http error: {}", status)]
    HttpError {
        status: StatusCode,
        location: Option<String>,
    },

    #[fail(display = "reqwest error: {}", error)]
    ReqwestError {
        error: reqwest::Error,
    },

    #[fail(display = "travis build is unknown")]
    TravisBuildUnknown,

    #[fail(display = "travis build image with no branch")]
    TravisBuildNoBranch,

    #[fail(display = "github actions image with no branch")]
    GithubActionNoBranch,
}

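/// A minimal counting semaphore built on an atomic counter, used to cap how
/// many requests run at once. `get()` spins (politely, sleeping 500ms between
/// attempts) until a slot is free, and the returned RAII `Handle` gives the
/// slot back when dropped.
///
/// A sketch of how it is used (the real call site is in `get_url`):
///
/// ```ignore
/// let _handle = HANDLES.get().await; // slot held until end of scope
/// // ... perform one HTTP request ...
/// ```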
struct MaxHandles {
    remaining: AtomicU32
}

struct Handle<'a> {
    parent: &'a MaxHandles
}

impl MaxHandles {
    fn new(max: u32) -> MaxHandles {
        MaxHandles { remaining: AtomicU32::new(max) }
    }

    async fn get<'a>(&'a self) -> Handle<'a> {
        loop {
            let current = self.remaining.load(Ordering::Relaxed);
            if current > 0 {
                let new_current = self.remaining.compare_and_swap(current, current - 1, Ordering::Relaxed);
                if new_current == current { // worked
                    debug!("Got handle with {}", new_current);
                    return Handle { parent: self };
                }
            }
            task::sleep(time::Duration::from_millis(500)).await;
        }
    }
}

impl<'a> Drop for Handle<'a> {
    fn drop(&mut self) {
        debug!("Dropping");
        self.parent.remaining.fetch_add(1, Ordering::Relaxed);
    }
}

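// One shared reqwest Client for every check. Redirects are not followed
// (Policy::none()), which lets the checker observe each 3xx itself and record
// the Location header in CheckerError::HttpError instead of silently chasing it.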
lazy_static! {
    static ref CLIENT: Client = Client::builder()
        .danger_accept_invalid_certs(true) // because some certs are out of date
        .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0") // so some sites (e.g. sciter.com) don't reject us
        .redirect(Policy::none())
        .timeout(time::Duration::from_secs(20))
        .build().unwrap();

    // This is to avoid errors with running out of file handles, so we only do 20 requests at a time
    static ref HANDLES: MaxHandles = MaxHandles::new(20);
}

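/// Checks a single URL, retrying up to 5 times, and resolves to the original
/// URL plus a pass/fail result. It returns a `BoxFuture` rather than being an
/// `async fn` because it recursively re-checks rewritten URLs (GitHub Actions
/// 404s, YouTube 302s, Azure DevOps redirects), and a recursive async fn needs
/// boxing to give its future a known size.
///
/// ```ignore
/// // illustrative call; "https://example.com" is a placeholder
/// let (url, res) = get_url("https://example.com".to_string()).await;
/// ```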
fn get_url(url: String) -> BoxFuture<'static, (String, Result<(), CheckerError>)> {
    async move {
        let _handle = HANDLES.get().await;
        let mut res = Err(CheckerError::NotTried);
        for _ in 0..5u8 {
            debug!("Running {}", url);
            let resp = CLIENT
                .get(&url)
                .header(header::ACCEPT, "image/svg+xml, text/html, */*;q=0.8")
                .send()
                .await;
            match resp {
                Err(err) => {
                    warn!("Error while getting {}, retrying: {}", url, err);
                    res = Err(CheckerError::ReqwestError { error: err });
                    continue;
                }
                Ok(ok) => {
                    let status = ok.status();
                    if status != StatusCode::OK {
                        lazy_static! {
                            static ref ACTIONS_REGEX: Regex = Regex::new(r"https://github.com/(?P<org>[^/]+)/(?P<repo>[^/]+)/actions(?:\?workflow=.+)?").unwrap();
                            static ref YOUTUBE_REGEX: Regex = Regex::new(r"https://www.youtube.com/watch\?v=(?P<video_id>.+)").unwrap();
                            static ref AZURE_BUILD_REGEX: Regex = Regex::new(r"https://dev.azure.com/[^/]+/[^/]+/_build").unwrap();
                        }
                        if status == StatusCode::NOT_FOUND && ACTIONS_REGEX.is_match(&url) {
                            let rewritten = ACTIONS_REGEX.replace_all(&url, "https://github.com/$org/$repo");
                            warn!("Got 404 with Github actions, so replacing {} with {}", url, rewritten);
                            let (_new_url, res) = get_url(rewritten.to_string()).await;
                            return (url, res);
                        }
                        if status == StatusCode::FOUND && YOUTUBE_REGEX.is_match(&url) {
                            // Based off of https://gist.github.com/tonY1883/a3b85925081688de569b779b4657439b
                            // Guesswork is that the img feed will cause less 302's than the main url
                            // See https://github.com/rust-unofficial/awesome-rust/issues/814 for original issue
                            let rewritten = YOUTUBE_REGEX.replace_all(&url, "http://img.youtube.com/vi/$video_id/mqdefault.jpg");
                            warn!("Got 302 with Youtube, so replacing {} with {}", url, rewritten);
                            let (_new_url, res) = get_url(rewritten.to_string()).await;
                            return (url, res);
                        }
                        if status == StatusCode::FOUND && AZURE_BUILD_REGEX.is_match(&url) {
                            // Azure build urls always redirect to a particular build id, so no stable url guarantees
                            let redirect = ok.headers().get(header::LOCATION).unwrap().to_str().unwrap();
                            let merged_url = Url::parse(&url).unwrap().join(redirect).unwrap();
                            info!("Got 302 from Azure devops, so replacing {} with {}", url, merged_url);
                            let (_new_url, res) = get_url(merged_url.into_string()).await;
                            return (url, res);
                        }

                        warn!("Error while getting {}, retrying: {}", url, status);
                        if status.is_redirection() {
                            res = Err(CheckerError::HttpError { status: status, location: ok.headers().get(header::LOCATION).and_then(|h| h.to_str().ok()).map(|x| x.to_string()) });
                            break;
                        } else {
                            res = Err(CheckerError::HttpError { status: status, location: None });
                            continue;
                        }
                    }
                    lazy_static! {
                        static ref TRAVIS_IMG_REGEX: Regex = Regex::new(r"https://api.travis-ci.(?:com|org)/[^/]+/.+\.svg(\?.+)?").unwrap();
                        static ref GITHUB_ACTIONS_REGEX: Regex = Regex::new(r"https://github.com/[^/]+/[^/]+/workflows/[^/]+/badge.svg(\?.+)?").unwrap();
                    }
                    if let Some(matches) = TRAVIS_IMG_REGEX.captures(&url) {
                        // Previously we checked the Content-Disposition headers, but sometimes that is incorrect
                        // We're now looking for the explicit text "unknown" in the middle of the SVG
                        let content = ok.text().await.unwrap();
                        if content.contains("unknown") {
                            res = Err(CheckerError::TravisBuildUnknown);
                            break;
                        }
                        let query = matches.get(1).map(|x| x.as_str()).unwrap_or("");
                        if !query.starts_with("?") || query.find("branch=").is_none() {
                            res = Err(CheckerError::TravisBuildNoBranch);
                            break;
                        }
                    }
                    if let Some(matches) = GITHUB_ACTIONS_REGEX.captures(&url) {
                        debug!("Github actions match {:?}", matches);
                        let query = matches.get(1).map(|x| x.as_str()).unwrap_or("");
                        if !query.starts_with("?") || query.find("branch=").is_none() {
                            res = Err(CheckerError::GithubActionNoBranch);
                            break;
                        }
                    }
                    debug!("Finished {}", url);
                    res = Ok(());
                    break;
                }
            }
        }
        (url, res)
    }.boxed()
}

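// One entry in results/results.yaml, keyed by URL (see `Results` below) and
// serialized with serde_yaml. An entry looks roughly like this (shape inferred
// from the fields; timestamps are chrono's RFC 3339 rendering):
//
//   "https://example.com":
//     last_working: 2020-03-08T16:49:04+00:00
//     updated_at: 2020-03-08T16:49:04+00:00
//     working: true
//     message: ""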
#[derive(Debug, Serialize, Deserialize)]
struct Link {
    last_working: Option<DateTime<Local>>,
    updated_at: DateTime<Local>,
    working: bool,
    message: String
}

type Results = BTreeMap<String, Link>;

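// Overall flow: parse README.md, queue a check for every link and image
// (including ones embedded in raw HTML), run the checks concurrently, and
// persist the outcome of each to results/results.yaml as we go.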
#[tokio::main]
async fn main() -> Result<(), Error> {
    env_logger::init();
    let markdown_input = fs::read_to_string("README.md").expect("Can't read README.md");
    let parser = Parser::new(&markdown_input);

    let mut used: BTreeSet<String> = BTreeSet::new();
    let mut results: Results = fs::read_to_string("results/results.yaml")
        .map_err(|e| format_err!("{}", e))
        .and_then(|x| serde_yaml::from_str(&x).map_err(|e| format_err!("{}", e)))
        .unwrap_or(Results::new());

    let mut url_checks = vec![];

    let min_between_checks: Duration = Duration::days(1);
    let max_allowed_failed: Duration = Duration::days(3);
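    // Queues a check for one URL. Links that were seen working within the last
    // `min_between_checks` are skipped, so repeat runs only re-test stale or
    // broken entries; `used` records every URL seen so dead entries can be
    // pruned from the results file later.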
    let mut do_check = |url: String| {
        if !url.starts_with("http") {
            return;
        }
        used.insert(url.clone());
        if let Some(link) = results.get(&url) {
            if link.working {
                let since = Local::now() - link.updated_at;
                if since < min_between_checks {
                    return;
                }
            }
        }
        let check = get_url(url).boxed();
        url_checks.push(check);
    };

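    // pulldown-cmark hands us a stream of events. Markdown links and images
    // arrive as Tag::Link / Tag::Image; raw HTML blocks are re-parsed with
    // scraper so that <img src> and <a href> attributes get checked too.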
    for (event, _range) in parser.into_offset_iter() {
        match event {
            Event::Start(tag) => {
                match tag {
                    Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => {
                        do_check(url.to_string());
                    }
                    _ => {}
                }
            }
            Event::Html(content) => {
                let fragment = Html::parse_fragment(&content);
                for element in fragment.select(&Selector::parse("img").unwrap()) {
                    let img_src = element.value().attr("src");
                    if let Some(src) = img_src {
                        do_check(src.to_string());
                    }
                }
                for element in fragment.select(&Selector::parse("a").unwrap()) {
                    let a_href = element.value().attr("href");
                    if let Some(href) = a_href {
                        do_check(href.to_string());
                    }
                }
            }
            _ => {}
        }
    }

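    // Drop results for URLs that are no longer in the README, then persist the
    // pruned set before any checks complete.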
    let results_keys = results.keys().cloned().collect::<BTreeSet<String>>();
    let old_links = results_keys.difference(&used);
    for link in old_links {
        results.remove(link).unwrap();
    }
    fs::write("results/results.yaml", serde_yaml::to_string(&results)?)?;

    let mut not_written = 0;
    let mut last_written = Local::now();
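    // select_all polls every pending check and yields whichever finishes
    // first, so results stream in as they complete rather than in queue order.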
    while url_checks.len() > 0 {
        debug!("Waiting...");
        let ((url, res), _index, remaining) = select_all(url_checks).await;
        url_checks = remaining;
        match res {
            Ok(_) => {
                print!("\u{2714} ");
                if let Some(link) = results.get_mut(&url) {
                    link.updated_at = Local::now();
                    link.last_working = Some(Local::now());
                    link.working = true;
                    link.message = String::from("");
                } else {
                    results.insert(url.clone(), Link {
                        updated_at: Local::now(),
                        last_working: Some(Local::now()),
                        working: true,
                        message: String::from("")
                    });
                }
            },
            Err(err) => {
                print!("\u{2718} ");
                let message = match err {
                    CheckerError::HttpError { status, location } => {
                        match location {
                            Some(loc) => {
                                format!("[{}] {} -> {}", status.as_u16(), url, loc)
                            }
                            None => {
                                format!("[{}] {}", status.as_u16(), url)
                            }
                        }
                    }
                    CheckerError::TravisBuildUnknown => {
                        format!("[Unknown travis build] {}", url)
                    }
                    CheckerError::TravisBuildNoBranch => {
                        format!("[Travis build image with no branch specified] {}", url)
                    }
                    CheckerError::GithubActionNoBranch => {
                        format!("[Github action image with no branch specified] {}", url)
                    }
                    _ => {
                        format!("{:?}", err)
                    }
                };
                if let Some(link) = results.get_mut(&url) {
                    link.updated_at = Local::now();
                    link.working = false;
                    link.message = message;
                    link.last_working = None;
                } else {
                    results.insert(url.clone(), Link {
                        updated_at: Local::now(),
                        working: false,
                        message: message,
                        last_working: None
                    });
                }
            }
        }
        std::io::stdout().flush().unwrap();

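        // Checkpoint: flush results to disk at most every ~5 seconds or every
        // 20 completed checks, so a crash mid-run loses little work without
        // rewriting the file after every single URL.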
        not_written += 1;
        let duration = Local::now() - last_written;
        if duration > Duration::seconds(5) || not_written > 20 {
            fs::write("results/results.yaml", serde_yaml::to_string(&results)?)?;
            not_written = 0;
            last_written = Local::now();
        }
    }
    fs::write("results/results.yaml", serde_yaml::to_string(&results)?)?;
    println!();

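    // Final tally: a broken link only fails the run if it has never been seen
    // working, or has now been broken for longer than `max_allowed_failed`
    // (3 days); fresher failures are reported but tolerated.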
    let mut failed: u32 = 0;

    for (_url, link) in results.iter() {
        if !link.working {
            if link.last_working.is_none() {
                println!("{:?}", link);
                failed += 1;
                continue;
            }
            if let Some(last_working) = link.last_working {
                let since = Local::now() - last_working;
                if since > max_allowed_failed {
                    println!("{:?}", link);
                    failed += 1;
                } else {
                    println!("Failure occurred but only {} ago, so we're not worrying yet: {}", since, link.message);
                }
            }
        }
    }

    if failed == 0 {
        println!("No errors!");
        Ok(())
    } else {
        Err(format_err!("{} urls with errors", failed))
    }
}