Merge branch 'master' into patch-1

This commit is contained in:
Chrislearn Young 2021-08-05 06:23:28 +08:00
commit 8653d5cb41
7 changed files with 1316 additions and 1425 deletions

16
.github/workflows/approve.yml vendored Normal file
View file

@ -0,0 +1,16 @@
name: Automatic Approve
on:
schedule:
- cron: "0 0 * * *"
workflow_dispatch:
jobs:
automatic-approve:
name: Automatic Approve
runs-on: ubuntu-latest
steps:
- name: Automatic Approve
uses: mheap/automatic-approve-action@v1.1.0
with:
token: ${{ secrets.GITHUB_TOKEN }}
workflows: "rust.yml"
dangerous_files: "src/main.rs,Cargo.toml,Cargo.lock"

View file

@ -5,16 +5,39 @@ on:
branches: [ master ] branches: [ master ]
pull_request: pull_request:
branches: [ master ] branches: [ master ]
schedule:
- cron: '0 0 * * *'
env: env:
CARGO_TERM_COLOR: always CARGO_TERM_COLOR: always
jobs: jobs:
build: build:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- uses: Swatinem/rust-cache@v1
- name: Get random cache id
run: echo "CACHE_ID=$((RANDOM))" >> $GITHUB_ENV
shell: bash
- uses: pat-s/always-upload-cache@v2.1.5
with:
path: results/results.yaml
key: results-${{ hashFiles('Cargo.lock') }}-${{ hashFiles('README.md') }}-${{ env.CACHE_ID }}
restore-keys: |
results-${{ hashFiles('Cargo.lock') }}-${{ hashFiles('README.md') }}-
results-${{ hashFiles('Cargo.lock') }}-
results-
- name: Build - name: Build
run: cargo build
- name: Run
run: cargo run run: cargo run
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
RUST_LOG: warn

View file

@ -15,7 +15,7 @@ If you want to add an entry to the `README.md` please consider this:
* if you've not published your crate to `crates.io` remove the `[[CRATE](...)]` part. * if you've not published your crate to `crates.io` remove the `[[CRATE](...)]` part.
* if you have a CI build, please add the build badge. Put the image after the description, separated by a space. Please make sure to add the branch information to the image: * if you have a CI build, please add the build badge. Put the image after the description, separated by a space. Please make sure to add the branch information to the image:
* example for Travis: ` [<img src="https://api.travis-ci.org/XXX/CRATE.svg?branch=master">](https://travis-ci.org/XXX/CRATE)` * example for Travis: ` [<img src="https://api.travis-ci.org/XXX/CRATE.svg?branch=master">](https://travis-ci.org/XXX/CRATE)`
* for Github actions please see https://docs.github.com/en/free-pro-team@latest/actions/managing-workflow-runs/adding-a-workflow-status-badge#using-the-branch-parameter * for Github actions please see https://docs.github.com/en/actions/managing-workflow-runs/adding-a-workflow-status-badge
- please pay attention to the alphabetical ordering. - please pay attention to the alphabetical ordering.

1761
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -7,20 +7,20 @@ repository = "https://github.com/rust-unofficial/awesome-rust"
edition = "2018" edition = "2018"
default-run = "awesome-rust" default-run = "awesome-rust"
# FIXME: locked down versions are a workaround for https://github.com/hyperium/hyper/issues/2191
[dependencies] [dependencies]
pulldown-cmark = "0.6" pulldown-cmark = "0.8"
futures = "=0.3.5" futures = "0.3"
reqwest = { version="=0.10.4", default_features=false, features=["rustls-tls", "trust-dns"] } reqwest = { version="0.11", default_features=false, features=["rustls-tls"] }
tokio = {version = "=0.2.21", features = ["macros", "rt-core", "rt-threaded", "time"] } tokio = {version = "1", features = ["macros", "rt", "rt-multi-thread", "time"] }
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_yaml = "0.8" serde_yaml = "0.8"
hyper = "=0.13.5" hyper = "0.14"
failure = "0.1" failure = "0.1"
lazy_static = "1" lazy_static = "1"
env_logger = "0.7" env_logger = "0.8"
log = "0.4" log = "0.4"
regex = "1" regex = "1"
scraper = "0.11" scraper = "0.11"
chrono = { version = "0.4", features = ["serde"] } chrono = { version = "0.4", features = ["serde"] }
chrono-humanize = "0.0.11" chrono-humanize = "0.2"
diffy = "0.2"

768
README.md

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
use pulldown_cmark::{Parser, Event, Tag}; use pulldown_cmark::{Parser, Event, Tag};
use std::fs; use std::{cmp::Ordering, fs};
use futures::future::{select_all, BoxFuture, FutureExt}; use futures::future::{select_all, BoxFuture, FutureExt};
use std::collections::{BTreeSet, BTreeMap}; use std::collections::{BTreeSet, BTreeMap};
use serde::{Serialize, Deserialize}; use serde::{Serialize, Deserialize};
@ -14,6 +14,7 @@ use chrono::{Local, DateTime, Duration};
use std::env; use std::env;
use tokio::sync::Semaphore; use tokio::sync::Semaphore;
use tokio::sync::SemaphorePermit; use tokio::sync::SemaphorePermit;
use diffy::create_patch;
#[derive(Debug, Fail, Serialize, Deserialize)] #[derive(Debug, Fail, Serialize, Deserialize)]
enum CheckerError { enum CheckerError {
@ -26,6 +27,9 @@ enum CheckerError {
location: Option<String>, location: Option<String>,
}, },
#[fail(display = "too many requests")]
TooManyRequests,
#[fail(display = "reqwest error: {}", error)] #[fail(display = "reqwest error: {}", error)]
ReqwestError { ReqwestError {
error: String, error: String,
@ -36,9 +40,6 @@ enum CheckerError {
#[fail(display = "travis build image with no branch")] #[fail(display = "travis build image with no branch")]
TravisBuildNoBranch, TravisBuildNoBranch,
#[fail(display = "github actions image with no branch")]
GithubActionNoBranch,
} }
fn formatter(err: &CheckerError, url: &String) -> String { fn formatter(err: &CheckerError, url: &String) -> String {
@ -59,9 +60,6 @@ fn formatter(err: &CheckerError, url: &String) -> String {
CheckerError::TravisBuildNoBranch => { CheckerError::TravisBuildNoBranch => {
format!("[Travis build image with no branch specified] {}", url) format!("[Travis build image with no branch specified] {}", url)
} }
CheckerError::GithubActionNoBranch => {
format!("[Github action image with no branch specified] {}", url)
}
_ => { _ => {
format!("{:?}", err) format!("{:?}", err)
} }
@ -82,7 +80,7 @@ impl MaxHandles {
} }
async fn get<'a>(&'a self) -> Handle<'a> { async fn get<'a>(&'a self) -> Handle<'a> {
let permit = self.remaining.acquire().await; let permit = self.remaining.acquire().await.unwrap();
return Handle { _permit: permit }; return Handle { _permit: permit };
} }
} }
@ -98,7 +96,7 @@ lazy_static! {
.danger_accept_invalid_certs(true) // because some certs are out of date .danger_accept_invalid_certs(true) // because some certs are out of date
.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0") // so some sites (e.g. sciter.com) don't reject us .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0") // so some sites (e.g. sciter.com) don't reject us
.redirect(Policy::none()) .redirect(Policy::none())
.max_idle_per_host(0) .pool_max_idle_per_host(0)
.timeout(time::Duration::from_secs(20)) .timeout(time::Duration::from_secs(20))
.build().unwrap(); .build().unwrap();
@ -155,7 +153,9 @@ fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerEr
if status != StatusCode::OK { if status != StatusCode::OK {
lazy_static! { lazy_static! {
static ref ACTIONS_REGEX: Regex = Regex::new(r"https://github.com/(?P<org>[^/]+)/(?P<repo>[^/]+)/actions(?:\?workflow=.+)?").unwrap(); static ref ACTIONS_REGEX: Regex = Regex::new(r"https://github.com/(?P<org>[^/]+)/(?P<repo>[^/]+)/actions(?:\?workflow=.+)?").unwrap();
static ref YOUTUBE_REGEX: Regex = Regex::new(r"https://www.youtube.com/watch\?v=(?P<video_id>.+)").unwrap(); static ref YOUTUBE_VIDEO_REGEX: Regex = Regex::new(r"https://www.youtube.com/watch\?v=(?P<video_id>.+)").unwrap();
static ref YOUTUBE_PLAYLIST_REGEX: Regex = Regex::new(r"https://www.youtube.com/playlist\?list=(?P<playlist_id>.+)").unwrap();
static ref YOUTUBE_CONSENT_REGEX: Regex = Regex::new(r"https://consent.youtube.com/m\?continue=.+").unwrap();
static ref AZURE_BUILD_REGEX: Regex = Regex::new(r"https://dev.azure.com/[^/]+/[^/]+/_build").unwrap(); static ref AZURE_BUILD_REGEX: Regex = Regex::new(r"https://dev.azure.com/[^/]+/[^/]+/_build").unwrap();
} }
if status == StatusCode::NOT_FOUND && ACTIONS_REGEX.is_match(&url) { if status == StatusCode::NOT_FOUND && ACTIONS_REGEX.is_match(&url) {
@ -164,15 +164,22 @@ fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerEr
let (_new_url, res) = get_url_core(rewritten.to_string()).await; let (_new_url, res) = get_url_core(rewritten.to_string()).await;
return (url, res); return (url, res);
} }
if status == StatusCode::FOUND && YOUTUBE_REGEX.is_match(&url) { if status == StatusCode::FOUND && YOUTUBE_VIDEO_REGEX.is_match(&url) {
// Based off of https://gist.github.com/tonY1883/a3b85925081688de569b779b4657439b // Based off of https://gist.github.com/tonY1883/a3b85925081688de569b779b4657439b
// Guesswork is that the img feed will cause less 302's than the main url // Guesswork is that the img feed will cause less 302's than the main url
// See https://github.com/rust-unofficial/awesome-rust/issues/814 for original issue // See https://github.com/rust-unofficial/awesome-rust/issues/814 for original issue
let rewritten = YOUTUBE_REGEX.replace_all(&url, "http://img.youtube.com/vi/$video_id/mqdefault.jpg"); let rewritten = YOUTUBE_VIDEO_REGEX.replace_all(&url, "http://img.youtube.com/vi/$video_id/mqdefault.jpg");
warn!("Got 302 with Youtube, so replacing {} with {}", url, rewritten); warn!("Got 302 with Youtube, so replacing {} with {}", url, rewritten);
let (_new_url, res) = get_url_core(rewritten.to_string()).await; let (_new_url, res) = get_url_core(rewritten.to_string()).await;
return (url, res); return (url, res);
}; };
if status == StatusCode::FOUND && YOUTUBE_PLAYLIST_REGEX.is_match(&url) {
let location = ok.headers().get("LOCATION").map(|h| h.to_str().unwrap()).unwrap_or_default();
if YOUTUBE_CONSENT_REGEX.is_match(location) {
warn!("Got Youtube consent link for {}, so assuming playlist is ok", url);
return (url, Ok(()));
}
};
if status == StatusCode::FOUND && AZURE_BUILD_REGEX.is_match(&url) { if status == StatusCode::FOUND && AZURE_BUILD_REGEX.is_match(&url) {
// Azure build urls always redirect to a particular build id, so no stable url guarantees // Azure build urls always redirect to a particular build id, so no stable url guarantees
let redirect = ok.headers().get(header::LOCATION).unwrap().to_str().unwrap(); let redirect = ok.headers().get(header::LOCATION).unwrap().to_str().unwrap();
@ -182,6 +189,12 @@ fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerEr
return (url, res); return (url, res);
} }
if status == StatusCode::TOO_MANY_REQUESTS {
// We get a lot of these, and we should not retry as they'll just fail again
warn!("Error while getting {}: {}", url, status);
return (url, Err(CheckerError::TooManyRequests));
}
warn!("Error while getting {}, retrying: {}", url, status); warn!("Error while getting {}, retrying: {}", url, status);
if status.is_redirection() { if status.is_redirection() {
res = Err(CheckerError::HttpError {status: status.as_u16(), location: ok.headers().get(header::LOCATION).and_then(|h| h.to_str().ok()).map(|x| x.to_string())}); res = Err(CheckerError::HttpError {status: status.as_u16(), location: ok.headers().get(header::LOCATION).and_then(|h| h.to_str().ok()).map(|x| x.to_string())});
@ -209,14 +222,6 @@ fn get_url_core(url: String) -> BoxFuture<'static, (String, Result<(), CheckerEr
break; break;
} }
} }
if let Some(matches) = GITHUB_ACTIONS_REGEX.captures(&url) {
debug!("Github actions match {:?}", matches);
let query = matches.get(1).map(|x| x.as_str()).unwrap_or("");
if !query.starts_with("?") || query.find("branch=").is_none() {
res = Err(CheckerError::GithubActionNoBranch);
break;
}
}
debug!("Finished {}", url); debug!("Finished {}", url);
res = Ok(()); res = Ok(());
break; break;
@ -275,12 +280,77 @@ async fn main() -> Result<(), Error> {
url_checks.push(check); url_checks.push(check);
}; };
for (event, _range) in parser.into_offset_iter() { let mut to_check: Vec<String> = vec![];
#[derive(Debug)]
struct ListInfo {
location: usize,
data: Vec<String>
}
let mut list_items: Vec<ListInfo> = Vec::new();
let mut in_list_item = false;
let mut list_item: String = String::new();
for (event, range) in parser.into_offset_iter() {
match event { match event {
Event::Start(tag) => { Event::Start(tag) => {
match tag { match tag {
Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => { Tag::Link(_link_type, url, _title) | Tag::Image(_link_type, url, _title) => {
do_check(url.to_string()); to_check.push(url.to_string());
}
Tag::List(_) => {
if in_list_item && list_item.len() > 0 {
list_items.last_mut().unwrap().data.push(list_item.clone());
in_list_item = false;
}
list_items.push(ListInfo {location: range.start, data: Vec::new()});
}
Tag::Item => {
if in_list_item && list_item.len() > 0 {
list_items.last_mut().unwrap().data.push(list_item.clone());
}
in_list_item = true;
list_item = String::new();
}
Tag::Heading(_) => {}
Tag::Paragraph => {}
_ => {
if in_list_item {
in_list_item = false;
}
}
}
}
Event::Text(text) => {
if in_list_item {
list_item.push_str(&text);
}
}
Event::End(tag) => {
match tag {
Tag::Item => {
if list_item.len() > 0 {
list_items.last_mut().unwrap().data.push(list_item.clone());
list_item = String::new();
}
in_list_item = false
}
Tag::List(_) => {
let list_info = list_items.pop().unwrap();
if list_info.data.iter().find(|s| *s == "License").is_some() && list_info.data.iter().find(|s| *s == "Resources").is_some() {
// Ignore wrong ordering in top-level list
continue
}
let mut sorted_recent_list = list_info.data.to_vec();
sorted_recent_list.sort_by(|a, b| a.to_lowercase().cmp(&b.to_lowercase()));
let joined_recent = list_info.data.join("\n");
let joined_sorted = sorted_recent_list.join("\n");
let patch = create_patch(&joined_recent, &joined_sorted);
if patch.hunks().len() > 0 {
println!("{}", patch);
return Err(format_err!("Sorting error"));
}
} }
_ => {} _ => {}
} }
@ -292,6 +362,38 @@ async fn main() -> Result<(), Error> {
} }
} }
to_check.sort_by(|a,b| {
let get_time = |k| {
let res = results.get(k);
if let Some(link) = res {
if let Some(last_working) = link.last_working {
Some(last_working)
} else {
None
}
} else {
None
}
};
let res_a = get_time(a);
let res_b = get_time(b);
if res_a.is_none() {
if res_b.is_none() {
return a.cmp(b);
} else {
Ordering::Less
}
} else if res_b.is_none() {
Ordering::Greater
} else {
res_a.unwrap().cmp(&res_b.unwrap())
}
});
for url in to_check {
do_check(url)
}
let results_keys = results.keys().cloned().collect::<BTreeSet<String>>(); let results_keys = results.keys().cloned().collect::<BTreeSet<String>>();
let old_links = results_keys.difference(&used); let old_links = results_keys.difference(&used);
for link in old_links { for link in old_links {
@ -356,6 +458,13 @@ async fn main() -> Result<(), Error> {
failed +=1; failed +=1;
continue; continue;
} }
CheckerError::TooManyRequests => {
// too many tries
if link.last_working.is_some() {
info!("Ignoring 429 failure on {} as we've seen success before", url);
continue;
}
}
_ => {} _ => {}
}; };
if let Some(last_working) = link.last_working { if let Some(last_working) = link.last_working {