Allow ignoring files when link checking (#2264)

* Allow ignoring files when link checking

* cargo fmt

* Fix tests

* Remove mystery duplicate function..?

* Add in some mysterious missing code..?

* Simple tests for link checker file globs in config

* cargo fmt

* Remove comment

* convert expect to error propagation

* Address comments

* cargo fmt
This commit is contained in:
Andrew Langmeier 2024-02-04 14:59:30 -05:00 committed by Vincent Prouillet
parent bdb18657b6
commit 7d18ddfde2
5 changed files with 123 additions and 48 deletions

View file

@ -1,5 +1,9 @@
use libs::globset::GlobSet;
use serde::{Deserialize, Serialize};
use errors::Result;
use utils::globs::build_ignore_glob_set;
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum LinkCheckerLevel {
#[serde(rename = "error")]
@ -14,7 +18,7 @@ impl Default for LinkCheckerLevel {
}
}
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct LinkChecker {
/// Skip link checking for these URL prefixes
@ -25,4 +29,16 @@ pub struct LinkChecker {
pub internal_level: LinkCheckerLevel,
/// Emit either "error" or "warn" for broken external links (including anchor links).
pub external_level: LinkCheckerLevel,
/// A list of file glob patterns to skip link checking on
pub ignored_files: Vec<String>,
#[serde(skip_serializing, skip_deserializing)] // not a typo, 2 are needed
pub ignored_files_globset: Option<GlobSet>,
}
impl LinkChecker {
    /// Compiles the `ignored_files` patterns into a glob set and stores the result in
    /// `ignored_files_globset`.
    ///
    /// # Errors
    /// Propagates any error returned by `build_ignore_glob_set`; in that case
    /// `ignored_files_globset` is left untouched.
    pub fn resolve_globset(&mut self) -> Result<()> {
        self.ignored_files_globset = Some(build_ignore_glob_set(&self.ignored_files, "files")?);
        Ok(())
    }
}

View file

@ -8,13 +8,14 @@ pub mod taxonomies;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use libs::globset::{Glob, GlobSet, GlobSetBuilder};
use libs::globset::GlobSet;
use libs::toml::Value as Toml;
use serde::{Deserialize, Serialize};
use crate::theme::Theme;
use errors::{anyhow, bail, Result};
use utils::fs::read_file;
use utils::globs::build_ignore_glob_set;
use utils::slugs::slugify_paths;
// We want a default base url for tests
@ -28,18 +29,6 @@ pub enum Mode {
Check,
}
fn build_ignore_glob_set(ignore: &Vec<String>, name: &str) -> Result<GlobSet> {
let mut glob_set_builder = GlobSetBuilder::new();
for pat in ignore {
let glob = match Glob::new(pat) {
Ok(g) => g,
Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e),
};
glob_set_builder.add(glob);
}
Ok(glob_set_builder.build().unwrap_or_else(|_| panic!("Bad ignored_{} in config file.", name)))
}
#[derive(Clone, Debug, Deserialize)]
#[serde(default)]
pub struct Config {
@ -150,21 +139,13 @@ impl Config {
config.add_default_language()?;
config.slugify_taxonomies();
config.link_checker.resolve_globset()?;
if !config.ignored_content.is_empty() {
// Convert the file glob strings into a compiled glob set matcher. We want to do this once,
// at program initialization, rather than for every page, for example. We arrange for the
// globset matcher to always exist (even though it has to be inside an Option at the
// moment because of the TOML serializer); if the glob set is empty the `is_match` function
// of the globber always returns false.
let glob_set = build_ignore_glob_set(&config.ignored_content, "content")?;
config.ignored_content_globset = Some(glob_set);
}
let content_glob_set = build_ignore_glob_set(&config.ignored_content, "content")?;
config.ignored_content_globset = Some(content_glob_set);
if !config.ignored_static.is_empty() {
let glob_set = build_ignore_glob_set(&config.ignored_static, "static")?;
config.ignored_static_globset = Some(glob_set);
}
let static_glob_set = build_ignore_glob_set(&config.ignored_static, "static")?;
config.ignored_static_globset = Some(static_glob_set);
Ok(config)
}
@ -652,32 +633,18 @@ title = "A title"
}
#[test]
fn missing_ignored_content_results_in_empty_vector_and_empty_globset() {
fn missing_ignored_content_results_in_empty_vector() {
let config_str = r#"
title = "My site"
base_url = "example.com"
"#;
let config = Config::parse(config_str).unwrap();
let v = config.ignored_content;
assert_eq!(v.len(), 0);
assert!(config.ignored_content_globset.is_none());
assert_eq!(config.ignored_content.len(), 0);
}
#[test]
fn missing_ignored_static_results_in_empty_vector_and_empty_globset() {
let config_str = r#"
title = "My site"
base_url = "example.com"
"#;
let config = Config::parse(config_str).unwrap();
let v = config.ignored_static;
assert_eq!(v.len(), 0);
assert!(config.ignored_static_globset.is_none());
}
#[test]
fn empty_ignored_content_results_in_empty_vector_and_empty_globset() {
fn empty_ignored_content_results_in_empty_vector() {
let config_str = r#"
title = "My site"
base_url = "example.com"
@ -686,11 +653,21 @@ ignored_content = []
let config = Config::parse(config_str).unwrap();
assert_eq!(config.ignored_content.len(), 0);
assert!(config.ignored_content_globset.is_none());
}
#[test]
fn empty_ignored_static_results_in_empty_vector_and_empty_globset() {
fn missing_ignored_static_results_in_empty_vector() {
let config_str = r#"
title = "My site"
base_url = "example.com"
"#;
let config = Config::parse(config_str).unwrap();
assert_eq!(config.ignored_static.len(), 0);
}
#[test]
fn empty_ignored_static_results_in_empty_vector() {
let config_str = r#"
title = "My site"
base_url = "example.com"
@ -699,7 +676,30 @@ ignored_static = []
let config = Config::parse(config_str).unwrap();
assert_eq!(config.ignored_static.len(), 0);
assert!(config.ignored_static_globset.is_none());
}
#[test]
/// A config without a `[link_checker]` section yields an empty `ignored_files` list.
fn missing_link_checker_ignored_files_results_in_empty_vector() {
    let config_str = r#"
title = "My site"
base_url = "example.com"
"#;

    let ignored_files = Config::parse(config_str).unwrap().link_checker.ignored_files;
    assert_eq!(ignored_files.len(), 0);
}
#[test]
/// An explicitly empty `ignored_files` array parses to an empty list.
fn empty_link_checker_ignored_files_results_in_empty_vector() {
    let config_str = r#"
title = "My site"
base_url = "example.com"
[link_checker]
ignored_files = []
"#;

    let ignored_files = Config::parse(config_str).unwrap().link_checker.ignored_files;
    assert_eq!(ignored_files.len(), 0);
}
#[test]
@ -760,6 +760,36 @@ ignored_static = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
assert!(g.is_match("content/poetry/zen.py2"));
}
#[test]
/// A populated `ignored_files` list is kept verbatim and compiled into a glob set that
/// matches the expected paths.
fn non_empty_link_checker_ignored_pages_results_in_vector_of_patterns_and_configured_globset() {
    let config_str = r#"
title = "My site"
base_url = "example.com"
[link_checker]
ignored_files = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
"#;
    let config = Config::parse(config_str).unwrap();

    // The raw patterns are preserved as written in the config.
    let patterns = config.link_checker.ignored_files;
    assert_eq!(patterns, vec!["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]);

    // One compiled glob per configured pattern.
    let globset = config.link_checker.ignored_files_globset.unwrap();
    assert_eq!(globset.len(), 3);

    let should_match = [
        "foo.graphml",
        "foo/bar/foo.graphml",
        "foo.iso",
        "foo.py2",
        "foo.py3",
        "foo/bar/target",
        "foo/bar/baz/temp_folder",
        "foo/bar/baz/temp_folder/target",
        "temp_folder",
        "my/isos/foo.iso",
        "content/poetry/zen.py2",
    ];
    for path in should_match {
        assert!(globset.is_match(path), "expected {} to be ignored", path);
    }

    let should_not_match = ["foo.png", "foo.py"];
    for path in should_not_match {
        assert!(!globset.is_match(path), "expected {} not to be ignored", path);
    }
}
#[test]
fn link_checker_skip_anchor_prefixes() {
let config_str = r#"

View file

@ -3,6 +3,7 @@ use std::path::{Path, PathBuf};
use std::{cmp, collections::HashMap, collections::HashSet, iter::FromIterator, thread};
use config::LinkCheckerLevel;
use libs::globset::GlobSet;
use libs::rayon::prelude::*;
use crate::Site;
@ -105,6 +106,10 @@ fn should_skip_by_prefix(link: &str, skip_prefixes: &[String]) -> bool {
skip_prefixes.iter().any(|prefix| link.starts_with(prefix))
}
/// Returns `true` when `file_path` is matched by `glob_set`, i.e. when the links found in
/// that file should be skipped rather than checked.
fn should_skip_by_file(file_path: &Path, glob_set: &GlobSet) -> bool {
glob_set.is_match(file_path)
}
fn get_link_domain(link: &str) -> Result<String> {
return match Url::parse(link) {
Ok(url) => match url.host_str().map(String::from) {
@ -150,9 +155,12 @@ pub fn check_external_links(site: &Site) -> Vec<String> {
let mut invalid_url_links: u32 = 0;
// First we look at all the external links, skip those the user wants to skip and record
// the ones that have invalid URLs
let ignored_files_globset = site.config.link_checker.ignored_files_globset.as_ref().unwrap();
for (file_path, links) in external_links {
for link in links {
if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes) {
if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes)
|| should_skip_by_file(file_path, ignored_files_globset)
{
skipped_link_count += 1;
} else {
match get_link_domain(link) {

View file

@ -0,0 +1,20 @@
use libs::globset::{Glob, GlobSet, GlobSetBuilder};
use errors::{bail, Result};
/// Converts the file glob strings into a compiled glob set matcher.
///
/// We want to do this once, at program initialization, rather than for every page, for
/// example. We arrange for the globset matcher to always exist (even though it has to be
/// inside an Option at the moment because of the TOML serializer); if the glob set is empty
/// the `is_match` function of the globber always returns false.
///
/// `name` is only used to label error messages (as `ignored_{name}`).
///
/// # Errors
/// Returns an error if any pattern in `ignore` is not a valid glob, or if building the
/// combined set fails.
pub fn build_ignore_glob_set(ignore: &[String], name: &str) -> Result<GlobSet> {
    let mut glob_set_builder = GlobSetBuilder::new();
    for pat in ignore {
        let glob = match Glob::new(pat) {
            Ok(g) => g,
            Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e),
        };
        glob_set_builder.add(glob);
    }
    Ok(glob_set_builder.build()?)
}

View file

@ -1,6 +1,7 @@
pub mod anchors;
pub mod de;
pub mod fs;
pub mod globs;
pub mod net;
pub mod site;
pub mod slugs;