Allow ignoring files when link checking (#2264)

* Allow ignoring files when link checking
* cargo fmt
* Fix tests
* Remove mystery duplicate function..?
* Add in some mysterious missing code..?
* Simple tests for link checker file globs in config
* cargo fmt
* Remove comment
* convert expect to error propagation
* Address comments
* cargo fmt
2024-12-05 01:49:12 +00:00 · 2024-02-04 14:59:30 -05:00 · 2024-02-04 14:59:30 -05:00 · 7d18ddfde2
commit 7d18ddfde2
parent bdb18657b6
5 changed files with 123 additions and 48 deletions
--- a/components/config/src/config/link_checker.rs
+++ b/components/config/src/config/link_checker.rs
@@ -1,5 +1,9 @@
+use libs::globset::GlobSet;
 use serde::{Deserialize, Serialize};

+use errors::Result;
+use utils::globs::build_ignore_glob_set;
+
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
 pub enum LinkCheckerLevel {
    #[serde(rename = "error")]
@@ -14,7 +18,7 @@ impl Default for LinkCheckerLevel {
    }
 }

-#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
 #[serde(default)]
 pub struct LinkChecker {
    /// Skip link checking for these URL prefixes
@@ -25,4 +29,16 @@ pub struct LinkChecker {
    pub internal_level: LinkCheckerLevel,
    /// Emit either "error" or "warn" for broken external links (including anchor links).
    pub external_level: LinkCheckerLevel,
+    /// A list of file glob patterns to skip link checking on
+    pub ignored_files: Vec<String>,
+    #[serde(skip_serializing, skip_deserializing)] // not a typo, 2 are needed
+    pub ignored_files_globset: Option<GlobSet>,
+}
+
+impl LinkChecker {
+    pub fn resolve_globset(&mut self) -> Result<()> {
+        let glob_set = build_ignore_glob_set(&self.ignored_files, "files")?;
+        self.ignored_files_globset = Some(glob_set);
+        Ok(())
+    }
 }
--- a/components/config/src/config/mod.rs
+++ b/components/config/src/config/mod.rs
@@ -8,13 +8,14 @@ pub mod taxonomies;
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};

-use libs::globset::{Glob, GlobSet, GlobSetBuilder};
+use libs::globset::GlobSet;
 use libs::toml::Value as Toml;
 use serde::{Deserialize, Serialize};

 use crate::theme::Theme;
 use errors::{anyhow, bail, Result};
 use utils::fs::read_file;
+use utils::globs::build_ignore_glob_set;
 use utils::slugs::slugify_paths;

 // We want a default base url for tests
@@ -28,18 +29,6 @@ pub enum Mode {
    Check,
 }

-fn build_ignore_glob_set(ignore: &Vec<String>, name: &str) -> Result<GlobSet> {
-    let mut glob_set_builder = GlobSetBuilder::new();
-    for pat in ignore {
-        let glob = match Glob::new(pat) {
-            Ok(g) => g,
-            Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e),
-        };
-        glob_set_builder.add(glob);
-    }
-    Ok(glob_set_builder.build().unwrap_or_else(|_| panic!("Bad ignored_{} in config file.", name)))
-}
-
 #[derive(Clone, Debug, Deserialize)]
 #[serde(default)]
 pub struct Config {
@@ -150,21 +139,13 @@ impl Config {

        config.add_default_language()?;
        config.slugify_taxonomies();
+        config.link_checker.resolve_globset()?;

-        if !config.ignored_content.is_empty() {
-            // Convert the file glob strings into a compiled glob set matcher. We want to do this once,
-            // at program initialization, rather than for every page, for example. We arrange for the
-            // globset matcher to always exist (even though it has to be an inside an Option at the
-            // moment because of the TOML serializer); if the glob set is empty the `is_match` function
-            // of the globber always returns false.
-            let glob_set = build_ignore_glob_set(&config.ignored_content, "content")?;
-            config.ignored_content_globset = Some(glob_set);
-        }
+        let content_glob_set = build_ignore_glob_set(&config.ignored_content, "content")?;
+        config.ignored_content_globset = Some(content_glob_set);

-        if !config.ignored_static.is_empty() {
-            let glob_set = build_ignore_glob_set(&config.ignored_static, "static")?;
-            config.ignored_static_globset = Some(glob_set);
-        }
+        let static_glob_set = build_ignore_glob_set(&config.ignored_static, "static")?;
+        config.ignored_static_globset = Some(static_glob_set);

        Ok(config)
    }
@@ -652,32 +633,18 @@ title = "A title"
    }

    #[test]
-    fn missing_ignored_content_results_in_empty_vector_and_empty_globset() {
+    fn missing_ignored_content_results_in_empty_vector() {
        let config_str = r#"
 title = "My site"
 base_url = "example.com"
        "#;

        let config = Config::parse(config_str).unwrap();
-        let v = config.ignored_content;
-        assert_eq!(v.len(), 0);
-        assert!(config.ignored_content_globset.is_none());
+        assert_eq!(config.ignored_content.len(), 0);
    }

    #[test]
-    fn missing_ignored_static_results_in_empty_vector_and_empty_globset() {
-        let config_str = r#"
-title = "My site"
-base_url = "example.com"
-        "#;
-        let config = Config::parse(config_str).unwrap();
-        let v = config.ignored_static;
-        assert_eq!(v.len(), 0);
-        assert!(config.ignored_static_globset.is_none());
-    }
-
-    #[test]
-    fn empty_ignored_content_results_in_empty_vector_and_empty_globset() {
+    fn empty_ignored_content_results_in_empty_vector() {
        let config_str = r#"
 title = "My site"
 base_url = "example.com"
@@ -686,11 +653,21 @@ ignored_content = []

        let config = Config::parse(config_str).unwrap();
        assert_eq!(config.ignored_content.len(), 0);
-        assert!(config.ignored_content_globset.is_none());
    }

    #[test]
-    fn empty_ignored_static_results_in_empty_vector_and_empty_globset() {
+    fn missing_ignored_static_results_in_empty_vector() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        assert_eq!(config.ignored_static.len(), 0);
+    }
+
+    #[test]
+    fn empty_ignored_static_results_in_empty_vector() {
        let config_str = r#"
 title = "My site"
 base_url = "example.com"
@@ -699,7 +676,30 @@ ignored_static = []

        let config = Config::parse(config_str).unwrap();
        assert_eq!(config.ignored_static.len(), 0);
-        assert!(config.ignored_static_globset.is_none());
+    }
+
+    #[test]
+    fn missing_link_checker_ignored_files_results_in_empty_vector() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        assert_eq!(config.link_checker.ignored_files.len(), 0);
+    }
+
+    #[test]
+    fn empty_link_checker_ignored_files_results_in_empty_vector() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+[link_checker]
+ignored_files = []
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        assert_eq!(config.link_checker.ignored_files.len(), 0);
    }

    #[test]
@@ -760,6 +760,36 @@ ignored_static = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
        assert!(g.is_match("content/poetry/zen.py2"));
    }

+    #[test]
+    fn non_empty_link_checker_ignored_pages_results_in_vector_of_patterns_and_configured_globset() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+[link_checker]
+ignored_files = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        let v = config.link_checker.ignored_files;
+        assert_eq!(v, vec!["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]);
+
+        let g = config.link_checker.ignored_files_globset.unwrap();
+        assert_eq!(g.len(), 3);
+        assert!(g.is_match("foo.graphml"));
+        assert!(g.is_match("foo/bar/foo.graphml"));
+        assert!(g.is_match("foo.iso"));
+        assert!(!g.is_match("foo.png"));
+        assert!(g.is_match("foo.py2"));
+        assert!(g.is_match("foo.py3"));
+        assert!(!g.is_match("foo.py"));
+        assert!(g.is_match("foo/bar/target"));
+        assert!(g.is_match("foo/bar/baz/temp_folder"));
+        assert!(g.is_match("foo/bar/baz/temp_folder/target"));
+        assert!(g.is_match("temp_folder"));
+        assert!(g.is_match("my/isos/foo.iso"));
+        assert!(g.is_match("content/poetry/zen.py2"));
+    }
+
    #[test]
    fn link_checker_skip_anchor_prefixes() {
        let config_str = r#"
--- a/components/site/src/link_checking.rs
+++ b/components/site/src/link_checking.rs
@@ -3,6 +3,7 @@ use std::path::{Path, PathBuf};
 use std::{cmp, collections::HashMap, collections::HashSet, iter::FromIterator, thread};

 use config::LinkCheckerLevel;
+use libs::globset::GlobSet;
 use libs::rayon::prelude::*;

 use crate::Site;
@@ -105,6 +106,10 @@ fn should_skip_by_prefix(link: &str, skip_prefixes: &[String]) -> bool {
    skip_prefixes.iter().any(|prefix| link.starts_with(prefix))
 }

+fn should_skip_by_file(file_path: &Path, glob_set: &GlobSet) -> bool {
+    glob_set.is_match(file_path)
+}
+
 fn get_link_domain(link: &str) -> Result<String> {
    return match Url::parse(link) {
        Ok(url) => match url.host_str().map(String::from) {
@@ -150,9 +155,12 @@ pub fn check_external_links(site: &Site) -> Vec<String> {
    let mut invalid_url_links: u32 = 0;
    // First we look at all the external links, skip those the user wants to skip and record
    // the ones that have invalid URLs
+    let ignored_files_globset = site.config.link_checker.ignored_files_globset.as_ref().unwrap();
    for (file_path, links) in external_links {
        for link in links {
-            if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes) {
+            if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes)
+                || should_skip_by_file(file_path, ignored_files_globset)
+            {
                skipped_link_count += 1;
            } else {
                match get_link_domain(link) {
--- /dev/null
+++ b/components/utils/src/globs.rs
@@ -0,0 +1,20 @@
+use libs::globset::{Glob, GlobSet, GlobSetBuilder};
+
+use errors::{bail, Result};
+
+pub fn build_ignore_glob_set(ignore: &Vec<String>, name: &str) -> Result<GlobSet> {
+    // Convert the file glob strings into a compiled glob set matcher. We want to do this once,
+    // at program initialization, rather than for every page, for example. We arrange for the
+    // globset matcher to always exist (even though it has to be inside an Option at the
+    // moment because of the TOML serializer); if the glob set is empty the `is_match` function
+    // of the globber always returns false.
+    let mut glob_set_builder = GlobSetBuilder::new();
+    for pat in ignore {
+        let glob = match Glob::new(pat) {
+            Ok(g) => g,
+            Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e),
+        };
+        glob_set_builder.add(glob);
+    }
+    Ok(glob_set_builder.build()?)
+}
--- a/components/utils/src/lib.rs
+++ b/components/utils/src/lib.rs
@@ -1,6 +1,7 @@
 pub mod anchors;
 pub mod de;
 pub mod fs;
+pub mod globs;
 pub mod net;
 pub mod site;
 pub mod slugs;