Skip anchor checking for URL with prefix in config (#812)

* cargo fmt & clippy
* Skip anchor checking for URL with prefix in config
2024-11-10 14:24:27 +00:00 · 2019-10-14 18:31:03 +02:00 · 2019-10-14 18:31:03 +02:00 · 6149fd17e1
commit 6149fd17e1
parent 4aa2ba84fc
16 changed files with 133 additions and 28 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1241,6 +1241,7 @@ dependencies = [
 name = "link_checker"
 version = "0.1.0"
 dependencies = [
+ "config 0.1.0",
 "errors 0.1.0",
 "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "reqwest 0.9.21 (registry+https://github.com/rust-lang/crates.io-index)",
--- a/components/config/src/config.rs
+++ b/components/config/src/config.rs
@ -7,8 +7,8 @@ use syntect::parsing::{SyntaxSet, SyntaxSetBuilder};
 use toml;
 use toml::Value as Toml;

-use errors::Result;
 use errors::Error;
+use errors::Result;
 use highlighting::THEME_SET;
 use theme::Theme;
 use utils::fs::read_file_with_error;
@ -86,7 +86,20 @@ impl Default for Taxonomy {
    }
 }

-type TranslateTerm  = HashMap<String, String>;
+type TranslateTerm = HashMap<String, String>;
+
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct LinkChecker {
+    /// Skip anchor checking for these URL prefixes
+    pub skip_anchor_prefixes: Vec<String>,
+}
+
+impl Default for LinkChecker {
+    fn default() -> LinkChecker {
+        LinkChecker { skip_anchor_prefixes: Vec::new() }
+    }
+}

 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(default)]
@ -152,6 +165,8 @@ pub struct Config {
    #[serde(skip_serializing, skip_deserializing)] // not a typo, 2 are needed
    pub extra_syntax_set: Option<SyntaxSet>,

+    pub link_checker: LinkChecker,
+
    /// All user params set in [extra] in the config
    pub extra: HashMap<String, Toml>,

@ -317,9 +332,16 @@ impl Config {
            Error::msg(format!("Translation for language '{}' is missing", lang.as_ref()))
        })?;

-        terms.get(key.as_ref()).ok_or_else(|| {
-            Error::msg(format!("Translation key '{}' for language '{}' is missing", key.as_ref(), lang.as_ref()))
-        }).map(|term| term.to_string())
+        terms
+            .get(key.as_ref())
+            .ok_or_else(|| {
+                Error::msg(format!(
+                    "Translation key '{}' for language '{}' is missing",
+                    key.as_ref(),
+                    lang.as_ref()
+                ))
+            })
+            .map(|term| term.to_string())
    }
 }

@ -346,6 +368,7 @@ impl Default for Config {
            translations: HashMap::new(),
            extra_syntaxes: Vec::new(),
            extra_syntax_set: None,
+            link_checker: LinkChecker::default(),
            extra: HashMap::new(),
            build_timestamp: Some(1),
        }
@ -551,4 +574,25 @@ ignored_content = ["*.{graphml,iso}", "*.py?"]
        assert!(g.is_match("foo.py3"));
        assert!(!g.is_match("foo.py"));
    }
+
+    #[test]
+    fn link_checker_skip_anchor_prefixes() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+
+[link_checker]
+skip_anchor_prefixes = [
+    "https://caniuse.com/#feat=",
+    "https://github.com/rust-lang/rust/blob/",
+]
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        let v = config.link_checker.skip_anchor_prefixes;
+        assert_eq!(
+            v,
+            vec!["https://caniuse.com/#feat=", "https://github.com/rust-lang/rust/blob/"]
+        );
+    }
 }
--- a/components/config/src/lib.rs
+++ b/components/config/src/lib.rs
@ -14,7 +14,7 @@ extern crate utils;
 mod config;
 pub mod highlighting;
 mod theme;
-pub use config::{Config, Language, Taxonomy};
+pub use config::{Config, Language, LinkChecker, Taxonomy};

 use std::path::Path;

--- a/components/imageproc/src/lib.rs
+++ b/components/imageproc/src/lib.rs
@ -272,7 +272,7 @@ impl ImageOp {
                } else {
                    img
                }
-            },
+            }
            Fill(w, h) => {
                let factor_w = img_w as f32 / w as f32;
                let factor_h = img_h as f32 / h as f32;
--- a/components/library/src/library.rs
+++ b/components/library/src/library.rs
@ -1,7 +1,7 @@
 use std::collections::{HashMap, HashSet};
 use std::path::{Path, PathBuf};

-use slotmap::{DenseSlotMap, DefaultKey};
+use slotmap::{DefaultKey, DenseSlotMap};

 use front_matter::SortBy;

--- a/components/library/src/sorting.rs
+++ b/components/library/src/sorting.rs
@ -21,7 +21,9 @@ pub fn sort_actual_pages_by_date(a: &&Page, b: &&Page) -> Ordering {
 /// Takes a list of (page key, date, permalink) and sorts them by date if possible
 /// Pages without date will be put in the unsortable bucket
 /// The permalink is used to break ties
-pub fn sort_pages_by_date(pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
+pub fn sort_pages_by_date(
+    pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>,
+) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
    let (mut can_be_sorted, cannot_be_sorted): (Vec<_>, Vec<_>) =
        pages.into_par_iter().partition(|page| page.1.is_some());

@ -40,7 +42,9 @@ pub fn sort_pages_by_date(pages: Vec<(&DefaultKey, Option<NaiveDateTime>, &str)>
 /// Takes a list of (page key, weight, permalink) and sorts them by weight if possible
 /// Pages without weight will be put in the unsortable bucket
 /// The permalink is used to break ties
-pub fn sort_pages_by_weight(pages: Vec<(&DefaultKey, Option<usize>, &str)>) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
+pub fn sort_pages_by_weight(
+    pages: Vec<(&DefaultKey, Option<usize>, &str)>,
+) -> (Vec<DefaultKey>, Vec<DefaultKey>) {
    let (mut can_be_sorted, cannot_be_sorted): (Vec<_>, Vec<_>) =
        pages.into_par_iter().partition(|page| page.1.is_some());

@ -57,7 +61,9 @@ pub fn sort_pages_by_weight(pages: Vec<(&DefaultKey, Option<usize>, &str)>) -> (
 }

 /// Find the lighter/heavier and earlier/later pages for all pages having a date/weight
-pub fn find_siblings(sorted: &[DefaultKey]) -> Vec<(DefaultKey, Option<DefaultKey>, Option<DefaultKey>)> {
+pub fn find_siblings(
+    sorted: &[DefaultKey],
+) -> Vec<(DefaultKey, Option<DefaultKey>, Option<DefaultKey>)> {
    let mut res = Vec::with_capacity(sorted.len());
    let length = sorted.len();

--- a/components/link_checker/Cargo.toml
+++ b/components/link_checker/Cargo.toml
@ -7,4 +7,5 @@ authors = ["Vincent Prouillet <prouillet.vincent@gmail.com>"]
 reqwest = "0.9"
 lazy_static = "1"

+config = { path = "../config" }
 errors = { path = "../errors" }
--- a/components/link_checker/src/lib.rs
+++ b/components/link_checker/src/lib.rs
@ -2,11 +2,13 @@ extern crate reqwest;
 #[macro_use]
 extern crate lazy_static;

+extern crate config;
 extern crate errors;

 use reqwest::header::{HeaderMap, ACCEPT};
 use reqwest::StatusCode;

+use config::LinkChecker;
 use errors::Result;

 use std::collections::HashMap;
@ -51,7 +53,7 @@ lazy_static! {
    static ref LINKS: Arc<RwLock<HashMap<String, LinkResult>>> = Arc::new(RwLock::new(HashMap::new()));
 }

-pub fn check_url(url: &str) -> LinkResult {
+pub fn check_url(url: &str, config: &LinkChecker) -> LinkResult {
    {
        let guard = LINKS.read().unwrap();
        if let Some(res) = guard.get(url) {
@ -65,9 +67,11 @@ pub fn check_url(url: &str) -> LinkResult {

    let client = reqwest::Client::new();

+    let check_anchor = !config.skip_anchor_prefixes.iter().any(|prefix| url.starts_with(prefix));
+
    // Need to actually do the link checking
    let res = match client.get(url).headers(headers).send() {
-        Ok(ref mut response) if has_anchor(url) => {
+        Ok(ref mut response) if check_anchor && has_anchor(url) => {
            match check_page_for_anchor(url, response.text()) {
                Ok(_) => LinkResult { code: Some(response.status()), error: None },
                Err(e) => LinkResult { code: None, error: Some(e.to_string()) },
@ -111,21 +115,21 @@ fn check_page_for_anchor(url: &str, body: reqwest::Result<String>) -> Result<()>

 #[cfg(test)]
 mod tests {
-    use super::{check_page_for_anchor, check_url, has_anchor, LINKS};
+    use super::{check_page_for_anchor, check_url, has_anchor, LinkChecker, LINKS};

    #[test]
    fn can_validate_ok_links() {
        let url = "https://google.com";
-        let res = check_url(url);
+        let res = check_url(url, &LinkChecker::default());
        assert!(res.is_valid());
        assert!(LINKS.read().unwrap().get(url).is_some());
-        let res = check_url(url);
+        let res = check_url(url, &LinkChecker::default());
        assert!(res.is_valid());
    }

    #[test]
    fn can_fail_404_links() {
-        let res = check_url("https://google.comys");
+        let res = check_url("https://google.comys", &LinkChecker::default());
        assert_eq!(res.is_valid(), false);
        assert!(res.code.is_none());
        assert!(res.error.is_some());
@ -190,4 +194,23 @@ mod tests {
        let res = has_anchor(url);
        assert_eq!(res, false);
    }
+
+    #[test]
+    fn skip_anchor_prefixes() {
+        let config = LinkChecker {
+            skip_anchor_prefixes: vec!["https://github.com/rust-lang/rust/blob/".to_owned()],
+        };
+
+        // anchor check is ignored because the url matches the prefix
+        let permalink = "https://github.com/rust-lang/rust/blob/c772948b687488a087356cb91432425662e034b9/src/librustc_back/target/mod.rs#L194-L214";
+        assert!(check_url(&permalink, &config).is_valid());
+
+        // other anchors are checked
+        let glossary = "https://help.github.com/en/articles/github-glossary#blame";
+        assert!(check_url(&glossary, &config).is_valid());
+
+        let glossary_invalid =
+            "https://help.github.com/en/articles/github-glossary#anchor-does-not-exist";
+        assert_eq!(check_url(&glossary_invalid, &config).is_valid(), false);
+    }
 }
--- a/components/rebuild/src/lib.rs
+++ b/components/rebuild/src/lib.rs
@ -335,7 +335,7 @@ fn is_section(path: &str, languages_codes: &[&str]) -> bool {
        }
    }

-    return false;
+    false
 }

 /// What happens when a section or a page is created/edited
--- a/components/rendering/src/markdown.rs
+++ b/components/rendering/src/markdown.rs
@ -296,8 +296,9 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render
            let start_idx = heading_ref.start_idx;
            let end_idx = heading_ref.end_idx;
            let title = get_text(&events[start_idx + 1..end_idx]);
-            let id =
-                heading_ref.id.unwrap_or_else(|| find_anchor(&inserted_anchors, slugify(&title), 0));
+            let id = heading_ref
+                .id
+                .unwrap_or_else(|| find_anchor(&inserted_anchors, slugify(&title), 0));
            inserted_anchors.push(id.clone());

            // insert `id` to the tag
@ -326,7 +327,8 @@ pub fn markdown_to_html(content: &str, context: &RenderContext) -> Result<Render

            // record heading to make table of contents
            let permalink = format!("{}#{}", context.current_page_permalink, id);
-            let h = Heading { level: heading_ref.level, id, permalink, title, children: Vec::new() };
+            let h =
+                Heading { level: heading_ref.level, id, permalink, title, children: Vec::new() };
            headings.push(h);
        }

--- a/components/site/src/lib.rs
+++ b/components/site/src/lib.rs
@ -399,7 +399,7 @@ impl Site {
            all_links
                .par_iter()
                .filter_map(|(page_path, link)| {
-                    let res = check_url(&link);
+                    let res = check_url(&link, &self.config.link_checker);
                    if res.is_valid() {
                        None
                    } else {
--- a/components/site/tests/site.rs
+++ b/components/site/tests/site.rs
@ -662,3 +662,14 @@ fn can_ignore_markdown_content() {
    let (_, _tmp_dir, public) = build_site("test_site");
    assert!(!file_exists!(public, "posts/ignored/index.html"));
 }
+
+#[test]
+fn check_site() {
+    let (mut site, _tmp_dir, _public) = build_site("test_site");
+
+    let prefixes = &site.config.link_checker.skip_anchor_prefixes;
+    assert_eq!(prefixes, &vec!["https://github.com/rust-lang/rust/blob/"]);
+
+    site.config.enable_check_mode();
+    site.load().expect("link check test_site");
+}
--- a/components/templates/src/global_fns/mod.rs
+++ b/components/templates/src/global_fns/mod.rs
@ -34,9 +34,10 @@ impl TeraFn for Trans {
        let lang = optional_arg!(String, args.get("lang"), "`trans`: `lang` must be a string.")
            .unwrap_or_else(|| self.config.default_language.clone());

-        let term = self.config.get_translation(lang, key).map_err(|e| {
-            Error::chain("Failed to retreive term translation", e)
-        })?;
+        let term = self
+            .config
+            .get_translation(lang, key)
+            .map_err(|e| Error::chain("Failed to retreive term translation", e))?;

        Ok(to_value(term).unwrap())
    }
@ -509,7 +510,6 @@ mod tests {
        assert!(static_fn.call(&args).is_err());
    }

-
    const TRANS_CONFIG: &str = r#"
 base_url = "https://remplace-par-ton-url.fr"
 default_language = "fr"
--- a/docs/content/documentation/getting-started/configuration.md
+++ b/docs/content/documentation/getting-started/configuration.md
@ -95,8 +95,14 @@ extra_syntaxes = []
 #
 #     [translations.en]
 #     title = "A title"
-#
-[translations]
+
+
+# Configure the link checker
+[link_checker]
+# Skip anchor checking for external URLs that start with these prefixes
+skip_anchor_prefixes = [
+    "https://caniuse.com/",
+]

 # You can put any kind of data in there and it
 # will be accessible in all templates
--- a/test_site/config.toml
+++ b/test_site/config.toml
@ -13,5 +13,10 @@ extra_syntaxes = ["syntaxes"]

 ignored_content = ["*/ignored.md"]

+[link_checker]
+skip_anchor_prefixes = [
+    "https://github.com/rust-lang/rust/blob/",
+]
+
 [extra.author]
 name = "Vincent Prouillet"
--- a/test_site/content/posts/tutorials/programming/rust.md
+++ b/test_site/content/posts/tutorials/programming/rust.md
@ -5,3 +5,9 @@ date = 2017-01-01
 +++

 A simple page
+
+<!-- more -->
+
+Link to some rust-lang [source code][permalink].
+
+[permalink]: https://github.com/rust-lang/rust/blob/c772948b687488a087356cb91432425662e034b9/src/librustc_back/target/mod.rs#L194-L214