2021-10-16 20:56:51 +00:00
use chrono ::{ DateTime , Duration , Local } ;
use diffy ::create_patch ;
use failure ::{ format_err , Error , Fail } ;
2020-01-13 22:13:00 +00:00
use futures ::future ::{ select_all , BoxFuture , FutureExt } ;
2020-01-11 12:05:30 +00:00
use lazy_static ::lazy_static ;
2021-10-16 20:56:51 +00:00
use log ::{ debug , info , warn } ;
use pulldown_cmark ::{ Event , Parser , Tag } ;
2020-01-13 22:13:00 +00:00
use regex ::Regex ;
2021-10-16 20:56:51 +00:00
use reqwest ::{ header , redirect ::Policy , Client , StatusCode , Url } ;
use serde ::{ Deserialize , Serialize } ;
use std ::collections ::{ BTreeMap , BTreeSet } ;
2020-04-03 21:10:09 +00:00
use std ::env ;
2021-10-16 20:56:51 +00:00
use std ::io ::Write ;
use std ::time ;
use std ::u8 ;
use std ::{ cmp ::Ordering , fs } ;
2020-06-03 17:05:12 +00:00
use tokio ::sync ::Semaphore ;
use tokio ::sync ::SemaphorePermit ;
2020-01-17 23:24:03 +00:00
2021-08-10 22:22:48 +00:00
/// Default number of GitHub stars a list entry needs to pass the popularity check
/// (a section may lower this via `override_stars`).
const MINIMUM_GITHUB_STARS : u32 = 50 ;
/// Number of crates.io downloads that lets a list entry pass the popularity check
/// even when its star count is below the required threshold.
const MINIMUM_CARGO_DOWNLOADS : u32 = 2000 ;
2021-08-12 21:25:49 +00:00
/// Returns a section-specific star threshold, or `None` to use `MINIMUM_GITHUB_STARS`.
///
/// `level` is the markdown header level and `text` is the text to inspect.
///
/// In general the defaults should be used. However, for areas where there are not many
/// well-starred projects but a few sit just below the threshold, it is worth lowering the
/// threshold so that a few more projects can be listed.
fn override_stars(level: u32, text: &str) -> Option<u32> {
    match level {
        // This is zero because a lot of the resources are non-github/non-cargo links and
        // overriding for all of them would be annoying.
        // These should be evaluated with more primitive means.
        2 if text.contains("Resources") => Some(0),
        3 if text.contains("Games") || text.contains("Emulators") => Some(40),
        // i.e. use defaults
        _ => None,
    }
}
2021-08-10 22:22:48 +00:00
lazy_static! {
    /// Overrides for the popularity count. Each needs a good reason (i.e. downloads/stars
    /// that cannot be counted automatically).
    /// Each entry is a URL that is "enough" for an item to pass the popularity checks.
    static ref POPULARITY_OVERRIDES: Vec<String> = [
        // Many repos of Rust code, collectively > 50 stars
        "https://github.com/maidsafe",
        // Uses its own VCS at https://nest.pijul.com/pijul/pijul with 190 stars at last check
        "https://pijul.org",
        // No direct gitlab support, but >1000 stars there
        "https://gitlab.com/veloren/veloren",
        // 394 stars
        "https://gitlab.redox-os.org/redox-os/redox",
        // https://github.com/jmacdonald/amp has 2.9k stars
        "https://amp.rs",
        // > 350k downloads
        "https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb",
        // https://github.com/gitpod-io/gitpod has 4.7k stars
        "https://gitpod.io",
        // https://gitlab.gnome.org/GNOME/gnome-builder has 133 stars
        "https://wiki.gnome.org/Apps/Builder",
        // > 260k downloads
        "https://marketplace.visualstudio.com/items?itemName=matklad.rust-analyzer",
        // > 1M downloads
        "https://marketplace.visualstudio.com/items?itemName=rust-lang.rust",
        // https://github.com/rust-lang/docs.rs has >600 stars
        "https://docs.rs",
        // https://github.com/rust-bio/rust-bio on its own has >900 stars
        "https://github.com/rust-bio",
        // Lots of repos with good star counts
        "https://github.com/contain-rs",
        // Lots of repos with good star counts
        "https://github.com/georust",
        // https://github.com/sebcrozet/kiss3d has >900 stars
        "http://kiss3d.org",
        // Various high-stars repositories
        "https://github.com/rust-qt",
        // Can't tell count directly, but various mirrors of it
        // (e.g. https://github.com/dgreid/crosvm) have enough stars that it's got enough interest
        "https://chromium.googlesource.com/chromiumos/platform/crosvm/",
        // https://github.com/seed-rs/seed has 2.1k stars
        "https://seed-rs.org/",
        // This one gets a free pass :)
        "https://crates.io",
        // First private cargo registry
        // (https://cloudsmith.com/blog/worlds-first-private-cargo-registry-w-cloudsmith-rust/)
        // and not much in the way of other options yet.
        // See also https://github.com/rust-unofficial/awesome-rust/pull/1141#discussion_r688711555
        "https://cloudsmith.com/cargo-registry/",
    ]
    .iter()
    .map(|url| url.to_string())
    .collect();
}
2020-04-03 20:12:54 +00:00
#[ derive(Debug, Fail, Serialize, Deserialize) ]
2020-01-17 23:24:03 +00:00
enum CheckerError {
#[ fail(display = " failed to try url " ) ]
NotTried , // Generally shouldn't happen, but useful to have
#[ fail(display = " http error: {} " , status) ]
HttpError {
2020-04-03 20:12:54 +00:00
status : u16 ,
2020-01-17 23:24:03 +00:00
location : Option < String > ,
} ,
2021-06-24 15:28:08 +00:00
#[ fail(display = " too many requests " ) ]
2021-07-25 19:44:10 +00:00
TooManyRequests ,
2021-06-24 15:28:08 +00:00
2020-01-17 23:24:03 +00:00
#[ fail(display = " reqwest error: {} " , error) ]
2021-10-16 20:56:51 +00:00
ReqwestError { error : String } ,
2020-01-20 17:53:24 +00:00
#[ fail(display = " travis build is unknown " ) ]
TravisBuildUnknown ,
2020-02-12 23:20:58 +00:00
2020-02-12 23:38:26 +00:00
#[ fail(display = " travis build image with no branch " ) ]
TravisBuildNoBranch ,
2020-01-17 23:24:03 +00:00
}
2020-04-03 20:12:54 +00:00
fn formatter ( err : & CheckerError , url : & String ) -> String {
match err {
2021-10-16 20:56:51 +00:00
CheckerError ::HttpError { status , location } = > match location {
Some ( loc ) = > {
format! ( " [ {} ] {} -> {} " , status , url , loc )
2020-04-03 20:12:54 +00:00
}
2021-10-16 20:56:51 +00:00
None = > {
format! ( " [ {} ] {} " , status , url )
}
} ,
2020-04-03 20:12:54 +00:00
CheckerError ::TravisBuildUnknown = > {
format! ( " [Unknown travis build] {} " , url )
}
CheckerError ::TravisBuildNoBranch = > {
format! ( " [Travis build image with no branch specified] {} " , url )
}
_ = > {
format! ( " {:?} " , err )
}
}
}
2020-01-11 12:05:30 +00:00
struct MaxHandles {
2021-10-16 20:56:51 +00:00
remaining : Semaphore ,
2020-01-11 12:05:30 +00:00
}
struct Handle < ' a > {
2021-10-16 20:56:51 +00:00
_permit : SemaphorePermit < ' a > ,
2020-01-11 12:05:30 +00:00
}
impl MaxHandles {
2020-06-03 17:05:12 +00:00
fn new ( max : usize ) -> MaxHandles {
2021-10-16 20:56:51 +00:00
MaxHandles {
remaining : Semaphore ::new ( max ) ,
}
2020-01-11 12:05:30 +00:00
}
async fn get < ' a > ( & ' a self ) -> Handle < ' a > {
2021-05-05 21:21:23 +00:00
let permit = self . remaining . acquire ( ) . await . unwrap ( ) ;
2020-06-03 17:05:12 +00:00
return Handle { _permit : permit } ;
2020-01-11 12:05:30 +00:00
}
}
impl < ' a > Drop for Handle < ' a > {
fn drop ( & mut self ) {
2020-06-03 17:05:12 +00:00
debug! ( " Dropping " ) ;
2020-01-11 12:05:30 +00:00
}
}
lazy_static! {
    /// Shared HTTP client used for every request made by the checker.
    static ref CLIENT: Client = Client::builder()
        // because some certs are out of date
        .danger_accept_invalid_certs(true)
        // so some sites (e.g. sciter.com) don't reject us
        .user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0")
        // redirects are never followed automatically
        .redirect(Policy::none())
        .pool_max_idle_per_host(0)
        .timeout(time::Duration::from_secs(20))
        .build()
        .unwrap();

    /// This is to avoid errors with running out of file handles, so we only do 20 requests
    /// at a time.
    static ref HANDLES: MaxHandles = MaxHandles::new(20);
}
2020-01-10 23:07:36 +00:00
2020-02-16 20:44:45 +00:00
fn get_url ( url : String ) -> BoxFuture < 'static , ( String , Result < ( ) , CheckerError > ) > {
2020-06-03 16:57:07 +00:00
debug! ( " Need handle for {} " , url ) ;
2020-01-13 22:13:00 +00:00
async move {
let _handle = HANDLES . get ( ) . await ;
2020-06-03 16:57:07 +00:00
return get_url_core ( url ) . await ;
2021-10-16 20:56:51 +00:00
}
. boxed ( )
2020-06-03 16:57:07 +00:00
}
2021-08-10 22:22:48 +00:00
lazy_static! {
    /// Matches a GitHub repository URL (or anything below it), capturing `org` and `repo`.
    static ref GITHUB_REPO_REGEX: Regex =
        Regex::new(r"^https://github\.com/(?P<org>[^/]+)/(?P<repo>[^/]+)(.*)").unwrap();

    /// Matches URLs on the GitHub API host.
    ///
    /// Anchored, with the dots escaped: this pattern decides whether GitHub credentials are
    /// attached to a request, so it must not match look-alike hosts or URLs that merely
    /// contain the API address somewhere in their path or query.
    static ref GITHUB_API_REGEX: Regex = Regex::new(r"^https://api\.github\.com/").unwrap();

    /// Matches a crates.io crate page URL, capturing the crate name as `crate`.
    static ref CRATE_REGEX: Regex =
        Regex::new(r"^https://crates\.io/crates/(?P<crate>[^/]+)/?$").unwrap();
}
#[ derive(Deserialize, Debug) ]
struct GithubStars {
2021-10-16 20:56:51 +00:00
stargazers_count : u32 ,
2021-08-10 22:22:48 +00:00
}
2021-08-21 13:30:25 +00:00
async fn get_stars ( github_url : & str ) -> Option < u32 > {
2021-08-10 22:56:05 +00:00
warn! ( " Downloading Github stars for {} " , github_url ) ;
2021-10-16 20:56:51 +00:00
let rewritten = GITHUB_REPO_REGEX
. replace_all ( & github_url , " https://api.github.com/repos/$org/$repo " )
. to_string ( ) ;
let mut req = CLIENT . get ( & rewritten ) ;
2021-08-10 22:53:32 +00:00
if let Ok ( username ) = env ::var ( " GITHUB_USERNAME " ) {
if let Ok ( password ) = env ::var ( " GITHUB_TOKEN " ) {
// needs a token with at least public_repo scope
req = req . basic_auth ( username , Some ( password ) ) ;
}
}
2021-08-10 22:22:48 +00:00
let resp = req . send ( ) . await ;
match resp {
Err ( err ) = > {
warn! ( " Error while getting {}: {} " , github_url , err ) ;
2021-08-21 13:30:25 +00:00
return None ;
2021-08-10 22:22:48 +00:00
}
Ok ( ok ) = > {
2021-08-10 23:06:19 +00:00
let raw = ok . text ( ) . await . unwrap ( ) ;
let data = match serde_json ::from_str ::< GithubStars > ( & raw ) {
Ok ( val ) = > val ,
Err ( _ ) = > {
panic! ( " {:?} " , raw ) ;
}
} ;
2021-08-21 13:30:25 +00:00
return Some ( data . stargazers_count ) ;
2021-08-10 22:22:48 +00:00
}
}
}
#[ derive(Deserialize, Debug) ]
struct CrateInfo {
2021-10-16 20:56:51 +00:00
downloads : u64 ,
2021-08-10 22:22:48 +00:00
}
#[ derive(Deserialize, Debug) ]
struct Crate {
#[ serde(rename = " crate " ) ]
2021-10-16 20:56:51 +00:00
info : CrateInfo ,
2021-08-10 22:22:48 +00:00
}
2021-08-21 13:30:25 +00:00
async fn get_downloads ( github_url : & str ) -> Option < u64 > {
2021-08-10 22:56:05 +00:00
warn! ( " Downloading Crates downloads for {} " , github_url ) ;
2021-10-16 20:56:51 +00:00
let rewritten = CRATE_REGEX
. replace_all ( & github_url , " https://crates.io/api/v1/crates/$crate " )
. to_string ( ) ;
let req = CLIENT . get ( & rewritten ) ;
2021-08-10 22:22:48 +00:00
let resp = req . send ( ) . await ;
match resp {
Err ( err ) = > {
warn! ( " Error while getting {}: {} " , github_url , err ) ;
2021-08-21 13:30:25 +00:00
return None ;
2021-08-10 22:22:48 +00:00
}
Ok ( ok ) = > {
let data = ok . json ::< Crate > ( ) . await . unwrap ( ) ;
2021-08-21 13:30:25 +00:00
return Some ( data . info . downloads ) ;
2021-08-10 22:22:48 +00:00
}
}
}
2020-06-03 16:57:07 +00:00
fn get_url_core ( url : String ) -> BoxFuture < 'static , ( String , Result < ( ) , CheckerError > ) > {
async move {
2020-01-17 23:24:03 +00:00
let mut res = Err ( CheckerError ::NotTried ) ;
2020-01-13 22:13:00 +00:00
for _ in 0 .. 5 u8 {
2021-06-29 19:49:32 +00:00
debug! ( " Running {} " , url ) ;
2020-04-30 21:04:01 +00:00
if env ::var ( " GITHUB_USERNAME " ) . is_ok ( ) & & env ::var ( " GITHUB_TOKEN " ) . is_ok ( ) & & GITHUB_REPO_REGEX . is_match ( & url ) {
2020-04-03 21:10:09 +00:00
let rewritten = GITHUB_REPO_REGEX . replace_all ( & url , " https://api.github.com/repos/$org/$repo " ) ;
info! ( " Replacing {} with {} to workaround rate limits on Github " , url , rewritten ) ;
2020-06-03 16:57:07 +00:00
let ( _new_url , res ) = get_url_core ( rewritten . to_string ( ) ) . await ;
2020-04-03 21:10:09 +00:00
return ( url , res ) ;
}
let mut req = CLIENT
2020-01-13 22:13:00 +00:00
. get ( & url )
2020-04-03 21:10:09 +00:00
. header ( header ::ACCEPT , " image/svg+xml, text/html, */*;q=0.8 " ) ;
if GITHUB_API_REGEX . is_match ( & url ) {
if let Ok ( username ) = env ::var ( " GITHUB_USERNAME " ) {
if let Ok ( password ) = env ::var ( " GITHUB_TOKEN " ) {
// needs a token with at least public_repo scope
info! ( " Using basic auth for {} " , url ) ;
req = req . basic_auth ( username , Some ( password ) ) ;
}
}
}
let resp = req . send ( ) . await ;
2020-01-13 22:13:00 +00:00
match resp {
Err ( err ) = > {
warn! ( " Error while getting {}, retrying: {} " , url , err ) ;
2020-04-03 20:12:54 +00:00
res = Err ( CheckerError ::ReqwestError { error : err . to_string ( ) } ) ;
2020-01-13 08:39:38 +00:00
continue ;
}
2020-02-16 20:44:45 +00:00
Ok ( ok ) = > {
2020-01-13 22:13:00 +00:00
let status = ok . status ( ) ;
if status ! = StatusCode ::OK {
lazy_static! {
static ref ACTIONS_REGEX : Regex = Regex ::new ( r "https://github.com/(?P<org>[^/]+)/(?P<repo>[^/]+)/actions(?:\?workflow=.+)?" ) . unwrap ( ) ;
2021-06-13 15:07:34 +00:00
static ref YOUTUBE_VIDEO_REGEX : Regex = Regex ::new ( r "https://www.youtube.com/watch\?v=(?P<video_id>.+)" ) . unwrap ( ) ;
static ref YOUTUBE_PLAYLIST_REGEX : Regex = Regex ::new ( r "https://www.youtube.com/playlist\?list=(?P<playlist_id>.+)" ) . unwrap ( ) ;
static ref YOUTUBE_CONSENT_REGEX : Regex = Regex ::new ( r "https://consent.youtube.com/m\?continue=.+" ) . unwrap ( ) ;
2020-02-10 21:02:28 +00:00
static ref AZURE_BUILD_REGEX : Regex = Regex ::new ( r "https://dev.azure.com/[^/]+/[^/]+/_build" ) . unwrap ( ) ;
2020-01-13 22:13:00 +00:00
}
if status = = StatusCode ::NOT_FOUND & & ACTIONS_REGEX . is_match ( & url ) {
let rewritten = ACTIONS_REGEX . replace_all ( & url , " https://github.com/$org/$repo " ) ;
warn! ( " Got 404 with Github actions, so replacing {} with {} " , url , rewritten ) ;
2020-06-03 16:57:07 +00:00
let ( _new_url , res ) = get_url_core ( rewritten . to_string ( ) ) . await ;
2020-01-13 22:13:00 +00:00
return ( url , res ) ;
}
2021-06-13 15:07:34 +00:00
if status = = StatusCode ::FOUND & & YOUTUBE_VIDEO_REGEX . is_match ( & url ) {
2020-01-30 15:23:42 +00:00
// Based off of https://gist.github.com/tonY1883/a3b85925081688de569b779b4657439b
// Guesswork is that the img feed will cause less 302's than the main url
// See https://github.com/rust-unofficial/awesome-rust/issues/814 for original issue
2021-06-13 15:07:34 +00:00
let rewritten = YOUTUBE_VIDEO_REGEX . replace_all ( & url , " http://img.youtube.com/vi/$video_id/mqdefault.jpg " ) ;
2020-01-30 15:23:42 +00:00
warn! ( " Got 302 with Youtube, so replacing {} with {} " , url , rewritten ) ;
2020-06-03 16:57:07 +00:00
let ( _new_url , res ) = get_url_core ( rewritten . to_string ( ) ) . await ;
2020-01-30 15:23:42 +00:00
return ( url , res ) ;
} ;
2021-06-13 15:07:34 +00:00
if status = = StatusCode ::FOUND & & YOUTUBE_PLAYLIST_REGEX . is_match ( & url ) {
let location = ok . headers ( ) . get ( " LOCATION " ) . map ( | h | h . to_str ( ) . unwrap ( ) ) . unwrap_or_default ( ) ;
if YOUTUBE_CONSENT_REGEX . is_match ( location ) {
warn! ( " Got Youtube consent link for {}, so assuming playlist is ok " , url ) ;
return ( url , Ok ( ( ) ) ) ;
}
} ;
2020-02-10 21:02:28 +00:00
if status = = StatusCode ::FOUND & & AZURE_BUILD_REGEX . is_match ( & url ) {
// Azure build urls always redirect to a particular build id, so no stable url guarantees
let redirect = ok . headers ( ) . get ( header ::LOCATION ) . unwrap ( ) . to_str ( ) . unwrap ( ) ;
let merged_url = Url ::parse ( & url ) . unwrap ( ) . join ( redirect ) . unwrap ( ) ;
info! ( " Got 302 from Azure devops, so replacing {} with {} " , url , merged_url ) ;
2020-06-03 16:57:07 +00:00
let ( _new_url , res ) = get_url_core ( merged_url . into_string ( ) ) . await ;
2020-02-10 21:02:28 +00:00
return ( url , res ) ;
}
2020-01-13 22:13:00 +00:00
2021-06-24 15:28:08 +00:00
if status = = StatusCode ::TOO_MANY_REQUESTS {
// We get a lot of these, and we should not retry as they'll just fail again
warn! ( " Error while getting {}: {} " , url , status ) ;
return ( url , Err ( CheckerError ::TooManyRequests ) ) ;
}
2020-01-13 22:13:00 +00:00
warn! ( " Error while getting {}, retrying: {} " , url , status ) ;
if status . is_redirection ( ) {
2020-04-03 20:12:54 +00:00
res = Err ( CheckerError ::HttpError { status : status . as_u16 ( ) , location : ok . headers ( ) . get ( header ::LOCATION ) . and_then ( | h | h . to_str ( ) . ok ( ) ) . map ( | x | x . to_string ( ) ) } ) ;
2020-01-20 17:53:24 +00:00
break ;
2020-01-13 22:13:00 +00:00
} else {
2020-04-03 20:12:54 +00:00
res = Err ( CheckerError ::HttpError { status : status . as_u16 ( ) , location : None } ) ;
2020-01-20 17:53:24 +00:00
continue ;
}
}
lazy_static! {
2020-02-12 23:38:26 +00:00
static ref TRAVIS_IMG_REGEX : Regex = Regex ::new ( r "https://api.travis-ci.(?:com|org)/[^/]+/.+\.svg(\?.+)?" ) . unwrap ( ) ;
2020-02-12 23:20:58 +00:00
static ref GITHUB_ACTIONS_REGEX : Regex = Regex ::new ( r "https://github.com/[^/]+/[^/]+/workflows/[^/]+/badge.svg(\?.+)?" ) . unwrap ( ) ;
2020-01-20 17:53:24 +00:00
}
2020-02-12 23:38:26 +00:00
if let Some ( matches ) = TRAVIS_IMG_REGEX . captures ( & url ) {
2020-02-16 20:44:45 +00:00
// Previously we checked the Content-Disposition headers, but sometimes that is incorrect
// We're now looking for the explicit text "unknown" in the middle of the SVG
let content = ok . text ( ) . await . unwrap ( ) ;
if content . contains ( " unknown " ) {
2020-01-20 17:53:24 +00:00
res = Err ( CheckerError ::TravisBuildUnknown ) ;
break ;
2020-01-13 22:13:00 +00:00
}
2020-02-12 23:38:26 +00:00
let query = matches . get ( 1 ) . map ( | x | x . as_str ( ) ) . unwrap_or ( " " ) ;
if ! query . starts_with ( " ? " ) | | query . find ( " branch= " ) . is_none ( ) {
res = Err ( CheckerError ::TravisBuildNoBranch ) ;
break ;
}
2020-01-13 22:13:00 +00:00
}
2020-01-17 23:24:03 +00:00
debug! ( " Finished {} " , url ) ;
2020-02-16 20:44:45 +00:00
res = Ok ( ( ) ) ;
2020-01-17 23:24:03 +00:00
break ;
2020-01-13 22:13:00 +00:00
}
2020-01-13 08:39:38 +00:00
}
2020-01-11 14:45:49 +00:00
}
2020-01-13 22:13:00 +00:00
( url , res )
} . boxed ( )
2020-01-10 23:07:36 +00:00
}
2020-03-10 21:17:59 +00:00
#[ derive(Debug, Serialize, Deserialize) ]
enum Working {
Yes ,
2021-10-16 20:56:51 +00:00
No ( CheckerError ) ,
2020-03-10 21:17:59 +00:00
}
2020-01-10 23:07:36 +00:00
#[ derive(Debug, Serialize, Deserialize) ]
2020-02-24 23:41:45 +00:00
struct Link {
last_working : Option < DateTime < Local > > ,
updated_at : DateTime < Local > ,
2020-03-10 21:17:59 +00:00
working : Working ,
2020-01-10 23:07:36 +00:00
}
2020-02-24 23:41:45 +00:00
/// Check state for every known link, keyed by URL.
type Results = BTreeMap < String , Link > ;
2020-01-10 23:07:36 +00:00
2021-08-08 09:02:09 +00:00
/// Cached popularity figures, stored in `results/popularity.yaml`.
#[derive(Debug, Serialize, Deserialize)]
struct PopularityData {
    /// Star counts keyed by GitHub repository URL.
    pub github_stars: BTreeMap<String, u32>,
    /// Download counts keyed by crates.io URL.
    pub cargo_downloads: BTreeMap<String, u32>,
}
2020-01-10 23:07:36 +00:00
#[ tokio::main ]
2020-01-17 23:24:03 +00:00
async fn main ( ) -> Result < ( ) , Error > {
2020-01-11 12:05:30 +00:00
env_logger ::init ( ) ;
2020-01-10 23:07:36 +00:00
let markdown_input = fs ::read_to_string ( " README.md " ) . expect ( " Can't read README.md " ) ;
let parser = Parser ::new ( & markdown_input ) ;
2020-02-24 23:41:45 +00:00
let mut used : BTreeSet < String > = BTreeSet ::new ( ) ;
let mut results : Results = fs ::read_to_string ( " results/results.yaml " )
2020-01-17 23:24:03 +00:00
. map_err ( | e | format_err! ( " {} " , e ) )
. and_then ( | x | serde_yaml ::from_str ( & x ) . map_err ( | e | format_err! ( " {} " , e ) ) )
. unwrap_or ( Results ::new ( ) ) ;
2020-01-10 23:07:36 +00:00
2021-08-08 09:02:09 +00:00
let mut popularity_data : PopularityData = fs ::read_to_string ( " results/popularity.yaml " )
. map_err ( | e | format_err! ( " {} " , e ) )
. and_then ( | x | serde_yaml ::from_str ( & x ) . map_err ( | e | format_err! ( " {} " , e ) ) )
2021-10-16 20:56:51 +00:00
. unwrap_or ( PopularityData {
github_stars : BTreeMap ::new ( ) ,
cargo_downloads : BTreeMap ::new ( ) ,
} ) ;
2021-08-08 09:02:09 +00:00
2021-08-10 22:22:48 +00:00
let mut url_checks = vec! [ ] ;
2021-08-08 09:02:09 +00:00
2020-07-24 07:20:26 +00:00
let min_between_checks : Duration = Duration ::days ( 3 ) ;
let max_allowed_failed : Duration = Duration ::days ( 7 ) ;
2020-01-18 14:01:05 +00:00
let mut do_check = | url : String | {
if ! url . starts_with ( " http " ) {
return ;
}
2021-08-10 22:22:48 +00:00
if used . contains ( & url ) {
return ;
}
2020-02-24 23:41:45 +00:00
used . insert ( url . clone ( ) ) ;
if let Some ( link ) = results . get ( & url ) {
2020-03-10 21:17:59 +00:00
if let Working ::Yes = link . working {
2020-02-24 23:41:45 +00:00
let since = Local ::now ( ) - link . updated_at ;
if since < min_between_checks {
return ;
}
}
2020-01-18 14:01:05 +00:00
}
let check = get_url ( url ) . boxed ( ) ;
url_checks . push ( check ) ;
} ;
2021-06-29 19:48:00 +00:00
let mut to_check : Vec < String > = vec! [ ] ;
2021-07-25 19:44:10 +00:00
#[ derive(Debug) ]
struct ListInfo {
location : usize ,
2021-10-16 20:56:51 +00:00
data : Vec < String > ,
2021-07-25 19:44:10 +00:00
}
let mut list_items : Vec < ListInfo > = Vec ::new ( ) ;
let mut in_list_item = false ;
let mut list_item : String = String ::new ( ) ;
2021-08-08 09:02:09 +00:00
let mut link_count : u8 = 0 ;
2021-08-21 13:30:25 +00:00
let mut github_stars : Option < u32 > = None ;
let mut cargo_downloads : Option < u32 > = None ;
2021-08-08 09:02:09 +00:00
2021-08-12 21:25:49 +00:00
let mut required_stars : u32 = MINIMUM_GITHUB_STARS ;
let mut last_level : u32 = 0 ;
let mut star_override_level : Option < u32 > = None ;
2021-07-25 19:44:10 +00:00
for ( event , range ) in parser . into_offset_iter ( ) {
2020-01-18 14:01:05 +00:00
match event {
Event ::Start ( tag ) = > {
match tag {
Tag ::Link ( _link_type , url , _title ) | Tag ::Image ( _link_type , url , _title ) = > {
2021-08-08 09:02:09 +00:00
if ! url . starts_with ( " # " ) {
2021-08-10 22:22:48 +00:00
let new_url = url . to_string ( ) ;
2021-10-16 21:20:07 +00:00
if POPULARITY_OVERRIDES . contains ( & new_url ) {
github_stars = Some ( MINIMUM_GITHUB_STARS ) ;
} else if GITHUB_REPO_REGEX . is_match ( & url ) {
let github_url = GITHUB_REPO_REGEX
. replace_all ( & url , " https://github.com/$org/$repo " )
. to_string ( ) ;
let existing = popularity_data . github_stars . get ( & github_url ) ;
if let Some ( stars ) = existing {
// Use existing star data, but re-retrieve url to check aliveness
// Some will have overrides, so don't check the regex yet
github_stars = Some ( * stars )
} else {
github_stars = get_stars ( & github_url ) . await ;
if let Some ( raw_stars ) = github_stars {
popularity_data
. github_stars
. insert ( github_url . to_string ( ) , raw_stars ) ;
if raw_stars > = required_stars {
fs ::write (
" results/popularity.yaml " ,
serde_yaml ::to_string ( & popularity_data ) ? ,
) ? ;
}
link_count + = 1 ;
continue ;
2021-08-21 13:30:25 +00:00
}
2021-08-10 22:22:48 +00:00
}
2021-10-16 21:20:07 +00:00
}
if CRATE_REGEX . is_match ( & url ) {
2021-08-10 22:22:48 +00:00
let existing = popularity_data . cargo_downloads . get ( & new_url ) ;
if let Some ( downloads ) = existing {
2021-08-21 13:30:25 +00:00
cargo_downloads = Some ( * downloads ) ;
2021-08-10 22:22:48 +00:00
} else {
let raw_downloads = get_downloads ( & url ) . await ;
2021-08-21 13:30:25 +00:00
if let Some ( positive_downloads ) = raw_downloads {
2021-10-16 20:56:51 +00:00
cargo_downloads = Some (
positive_downloads . clamp ( 0 , u32 ::MAX as u64 ) as u32 ,
) ;
popularity_data
. cargo_downloads
. insert ( new_url , cargo_downloads . unwrap ( ) ) ;
2021-08-21 13:30:25 +00:00
if cargo_downloads . unwrap_or ( 0 ) > = MINIMUM_CARGO_DOWNLOADS {
2021-10-16 20:56:51 +00:00
fs ::write (
" results/popularity.yaml " ,
serde_yaml ::to_string ( & popularity_data ) ? ,
) ? ;
2021-08-21 13:30:25 +00:00
}
2021-08-10 22:22:48 +00:00
}
link_count + = 1 ;
continue ;
}
}
2021-08-10 22:51:34 +00:00
2021-08-08 09:02:09 +00:00
to_check . push ( url . to_string ( ) ) ;
link_count + = 1 ;
}
2020-01-13 17:04:46 +00:00
}
2021-07-25 19:44:10 +00:00
Tag ::List ( _ ) = > {
if in_list_item & & list_item . len ( ) > 0 {
list_items . last_mut ( ) . unwrap ( ) . data . push ( list_item . clone ( ) ) ;
in_list_item = false ;
}
2021-10-16 20:56:51 +00:00
list_items . push ( ListInfo {
location : range . start ,
data : Vec ::new ( ) ,
} ) ;
2021-07-25 19:44:10 +00:00
}
Tag ::Item = > {
if in_list_item & & list_item . len ( ) > 0 {
list_items . last_mut ( ) . unwrap ( ) . data . push ( list_item . clone ( ) ) ;
}
in_list_item = true ;
list_item = String ::new ( ) ;
2021-08-08 09:02:09 +00:00
link_count = 0 ;
2021-08-21 13:30:25 +00:00
github_stars = None ;
cargo_downloads = None ;
2021-07-25 19:44:10 +00:00
}
2021-08-10 22:22:48 +00:00
Tag ::Heading ( level ) = > {
2021-08-12 21:25:49 +00:00
last_level = level ;
if let Some ( override_level ) = star_override_level {
if level = = override_level {
star_override_level = None ;
required_stars = MINIMUM_GITHUB_STARS ;
}
2021-08-10 22:22:48 +00:00
}
}
2021-07-25 19:44:10 +00:00
Tag ::Paragraph = > { }
_ = > {
if in_list_item {
in_list_item = false ;
}
}
}
}
Event ::Text ( text ) = > {
2021-08-12 21:25:49 +00:00
let possible_override = override_stars ( last_level , & text ) ;
if let Some ( override_value ) = possible_override {
star_override_level = Some ( last_level ) ;
required_stars = override_value ;
2021-08-10 22:22:48 +00:00
}
2021-08-12 21:25:49 +00:00
2021-07-25 19:44:10 +00:00
if in_list_item {
list_item . push_str ( & text ) ;
}
}
Event ::End ( tag ) = > {
match tag {
Tag ::Item = > {
2021-08-12 21:25:49 +00:00
if list_item . len ( ) > 0 {
2021-08-08 09:02:09 +00:00
if link_count > 0 {
2021-10-16 20:56:51 +00:00
if github_stars . unwrap_or ( 0 ) < required_stars
& & cargo_downloads . unwrap_or ( 0 ) < MINIMUM_CARGO_DOWNLOADS
{
2021-08-21 13:30:25 +00:00
if github_stars . is_none ( ) {
warn! ( " No valid github link " ) ;
}
if cargo_downloads . is_none ( ) {
warn! ( " No valid crates link " ) ;
}
return Err ( format_err! ( " Not high enough metrics ({:?} stars < {}, and {:?} cargo downloads < {}): {} " , github_stars , required_stars , cargo_downloads , MINIMUM_CARGO_DOWNLOADS , list_item ) ) ;
2021-08-10 22:22:48 +00:00
}
2021-08-08 09:02:09 +00:00
}
2021-07-25 19:44:10 +00:00
list_items . last_mut ( ) . unwrap ( ) . data . push ( list_item . clone ( ) ) ;
list_item = String ::new ( ) ;
}
in_list_item = false
}
Tag ::List ( _ ) = > {
let list_info = list_items . pop ( ) . unwrap ( ) ;
2021-10-16 20:56:51 +00:00
if list_info . data . iter ( ) . find ( | s | * s = = " License " ) . is_some ( )
& & list_info . data . iter ( ) . find ( | s | * s = = " Resources " ) . is_some ( )
{
2021-07-25 19:44:10 +00:00
// Ignore wrong ordering in top-level list
2021-10-16 20:56:51 +00:00
continue ;
2021-07-25 19:44:10 +00:00
}
let mut sorted_recent_list = list_info . data . to_vec ( ) ;
sorted_recent_list . sort_by ( | a , b | a . to_lowercase ( ) . cmp ( & b . to_lowercase ( ) ) ) ;
let joined_recent = list_info . data . join ( " \n " ) ;
let joined_sorted = sorted_recent_list . join ( " \n " ) ;
let patch = create_patch ( & joined_recent , & joined_sorted ) ;
if patch . hunks ( ) . len ( ) > 0 {
println! ( " {} " , patch ) ;
return Err ( format_err! ( " Sorting error " ) ) ;
}
}
2020-01-18 14:01:05 +00:00
_ = > { }
}
}
Event ::Html ( content ) = > {
2021-10-16 20:56:51 +00:00
return Err ( format_err! (
" Contains HTML content, not markdown: {} " ,
content
) ) ;
2020-01-10 23:07:36 +00:00
}
2020-01-18 14:01:05 +00:00
_ = > { }
2020-01-10 23:07:36 +00:00
}
}
2021-10-16 20:56:51 +00:00
fs ::write (
" results/popularity.yaml " ,
serde_yaml ::to_string ( & popularity_data ) ? ,
) ? ;
2020-01-10 23:07:36 +00:00
2021-10-16 20:56:51 +00:00
to_check . sort_by ( | a , b | {
2021-06-29 19:48:00 +00:00
let get_time = | k | {
let res = results . get ( k ) ;
if let Some ( link ) = res {
if let Some ( last_working ) = link . last_working {
Some ( last_working )
} else {
None
}
} else {
None
}
} ;
let res_a = get_time ( a ) ;
let res_b = get_time ( b ) ;
if res_a . is_none ( ) {
if res_b . is_none ( ) {
return a . cmp ( b ) ;
} else {
2021-06-29 20:07:56 +00:00
Ordering ::Less
2021-06-29 19:48:00 +00:00
}
} else if res_b . is_none ( ) {
2021-06-29 20:07:56 +00:00
Ordering ::Greater
2021-06-29 19:48:00 +00:00
} else {
res_a . unwrap ( ) . cmp ( & res_b . unwrap ( ) )
}
} ) ;
for url in to_check {
do_check ( url )
}
2020-02-24 23:41:45 +00:00
let results_keys = results . keys ( ) . cloned ( ) . collect ::< BTreeSet < String > > ( ) ;
let old_links = results_keys . difference ( & used ) ;
for link in old_links {
results . remove ( link ) . unwrap ( ) ;
}
fs ::write ( " results/results.yaml " , serde_yaml ::to_string ( & results ) ? ) ? ;
2020-03-08 16:49:04 +00:00
let mut not_written = 0 ;
let mut last_written = Local ::now ( ) ;
2020-01-10 23:07:36 +00:00
while url_checks . len ( ) > 0 {
2020-06-02 21:24:49 +00:00
debug! ( " Waiting for {} " , url_checks . len ( ) ) ;
2020-01-10 23:07:36 +00:00
let ( ( url , res ) , _index , remaining ) = select_all ( url_checks ) . await ;
url_checks = remaining ;
match res {
Ok ( _ ) = > {
print! ( " \u{2714} " ) ;
2020-02-24 23:41:45 +00:00
if let Some ( link ) = results . get_mut ( & url ) {
link . updated_at = Local ::now ( ) ;
link . last_working = Some ( Local ::now ( ) ) ;
2020-03-10 21:17:59 +00:00
link . working = Working ::Yes ;
2020-02-24 23:41:45 +00:00
} else {
2021-10-16 20:56:51 +00:00
results . insert (
url . clone ( ) ,
Link {
updated_at : Local ::now ( ) ,
last_working : Some ( Local ::now ( ) ) ,
working : Working ::Yes ,
} ,
) ;
2020-02-24 23:41:45 +00:00
}
2021-10-16 20:56:51 +00:00
}
2020-01-10 23:07:36 +00:00
Err ( err ) = > {
print! ( " \u{2718} " ) ;
2020-02-24 23:41:45 +00:00
if let Some ( link ) = results . get_mut ( & url ) {
link . updated_at = Local ::now ( ) ;
2020-04-03 20:12:54 +00:00
link . working = Working ::No ( err ) ;
2020-02-24 23:41:45 +00:00
} else {
2021-10-16 20:56:51 +00:00
results . insert (
url . clone ( ) ,
Link {
updated_at : Local ::now ( ) ,
working : Working ::No ( err ) ,
last_working : None ,
} ,
) ;
2020-02-24 23:41:45 +00:00
}
2020-01-10 23:07:36 +00:00
}
}
2020-01-11 12:05:30 +00:00
std ::io ::stdout ( ) . flush ( ) . unwrap ( ) ;
2020-03-08 16:49:04 +00:00
not_written + = 1 ;
let duration = Local ::now ( ) - last_written ;
if duration > Duration ::seconds ( 5 ) | | not_written > 20 {
fs ::write ( " results/results.yaml " , serde_yaml ::to_string ( & results ) ? ) ? ;
not_written = 0 ;
last_written = Local ::now ( ) ;
}
2020-01-10 23:07:36 +00:00
}
2020-03-08 16:49:04 +00:00
fs ::write ( " results/results.yaml " , serde_yaml ::to_string ( & results ) ? ) ? ;
2020-01-10 23:07:36 +00:00
println! ( " " ) ;
2020-02-24 23:41:45 +00:00
let mut failed : u32 = 0 ;
2020-04-03 20:12:54 +00:00
for ( url , link ) in results . iter ( ) {
if let Working ::No ( ref err ) = link . working {
match err {
2021-10-16 20:56:51 +00:00
CheckerError ::HttpError { status , .. }
if * status = = 301 | | * status = = 302 | | * status = = 404 = >
{
2020-04-21 10:18:53 +00:00
println! ( " {} {:?} " , url , link ) ;
2021-10-16 20:56:51 +00:00
failed + = 1 ;
2020-04-03 20:12:54 +00:00
continue ;
}
2021-06-24 15:28:08 +00:00
CheckerError ::TooManyRequests = > {
// too many tries
if link . last_working . is_some ( ) {
2021-10-16 20:56:51 +00:00
info! (
" Ignoring 429 failure on {} as we've seen success before " ,
url
) ;
2021-06-24 15:28:08 +00:00
continue ;
}
}
2020-04-03 20:12:54 +00:00
_ = > { }
} ;
2020-02-24 23:41:45 +00:00
if let Some ( last_working ) = link . last_working {
let since = Local ::now ( ) - last_working ;
if since > max_allowed_failed {
2020-04-21 10:04:59 +00:00
println! ( " {} {:?} " , url , link ) ;
2021-10-16 20:56:51 +00:00
failed + = 1 ;
2020-02-24 23:41:45 +00:00
} else {
2021-10-16 20:56:51 +00:00
println! (
" Failure occurred but only {}, so we're not worrying yet: {} " ,
chrono_humanize ::HumanTime ::from ( - since ) ,
formatter ( err , url )
) ;
2020-02-24 23:41:45 +00:00
}
2020-04-03 20:12:54 +00:00
} else {
2020-04-21 10:18:53 +00:00
println! ( " {} {:?} " , url , link ) ;
2021-10-16 20:56:51 +00:00
failed + = 1 ;
2020-04-03 20:12:54 +00:00
continue ;
2020-02-24 23:41:45 +00:00
}
}
}
if failed = = 0 {
2020-01-11 12:05:30 +00:00
println! ( " No errors! " ) ;
Ok ( ( ) )
} else {
2020-02-24 23:41:45 +00:00
Err ( format_err! ( " {} urls with errors " , failed ) )
2020-01-10 23:07:36 +00:00
}
2021-08-08 18:52:32 +00:00
}