Merge pull request #3754 from ackerleytng/main

Add `parse_glob` module and update `du` to use `parse_glob`
2025-01-19 00:24:13 +00:00 · 2022-08-10 19:28:40 +02:00 · 2022-08-10 19:28:40 +02:00 · 8692301ec7
commit 8692301ec7
parent e304758f61 c2bb9596d9
7 changed files with 224 additions and 104 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3102,6 +3102,7 @@ dependencies = [
 "data-encoding-macro",
 "dns-lookup",
 "dunce",
+ "glob",
 "itertools",
 "libc",
 "nix",
--- a/src/uu/du/src/du.rs
+++ b/src/uu/du/src/du.rs
@ -37,6 +37,7 @@ use uucore::display::{print_verbatim, Quotable};
 use uucore::error::FromIo;
 use uucore::error::{UError, UResult};
 use uucore::format_usage;
+use uucore::parse_glob;
 use uucore::parse_size::{parse_size, ParseSizeError};
 use uucore::InvalidEncodingHandling;
 #[cfg(windows)]
@ -488,55 +489,28 @@ fn file_as_vec(filename: impl AsRef<Path>) -> Vec<String> {

 // Given the --exclude-from and/or --exclude arguments, returns the globset lists
 // to ignore the files
-fn get_glob_ignore(matches: &ArgMatches) -> UResult<Vec<Pattern>> {
-    let mut excludes_from = if matches.contains_id(options::EXCLUDE_FROM) {
-        match matches.values_of(options::EXCLUDE_FROM) {
-            Some(all_files) => {
-                let mut exclusion = Vec::<String>::new();
-                // Read the exclude lists from all the files
-                // and add them into a vector of string
-                let files: Vec<String> = all_files.clone().map(|v| v.to_owned()).collect();
-                for f in files {
-                    exclusion.extend(file_as_vec(&f));
-                }
-                exclusion
-            }
-            None => Vec::<String>::new(),
-        }
-    } else {
-        Vec::<String>::new()
-    };
+fn build_exclude_patterns(matches: &ArgMatches) -> UResult<Vec<Pattern>> {
+    let exclude_from_iterator = matches
+        .values_of(options::EXCLUDE_FROM)
+        .unwrap_or_default()
+        .flat_map(|f| file_as_vec(&f));

-    let mut excludes = if matches.contains_id(options::EXCLUDE) {
-        match matches.values_of(options::EXCLUDE) {
-            Some(v) => {
-                // Read the various arguments
-                v.clone().map(|v| v.to_owned()).collect()
-            }
-            None => Vec::<String>::new(),
-        }
-    } else {
-        Vec::<String>::new()
-    };
+    let excludes_iterator = matches
+        .values_of(options::EXCLUDE)
+        .unwrap_or_default()
+        .map(|v| v.to_owned());

-    // Merge the two lines
-    excludes.append(&mut excludes_from);
-    if !&excludes.is_empty() {
-        let mut builder = Vec::new();
-        // Create the `Vec` of excludes
-        for f in excludes {
-            if matches.contains_id(options::VERBOSE) {
-                println!("adding {:?} to the exclude list ", &f);
-            }
-            match Pattern::new(&f) {
-                Ok(glob) => builder.push(glob),
-                Err(err) => return Err(DuError::InvalidGlob(err.to_string()).into()),
-            };
+    let mut exclude_patterns = Vec::new();
+    for f in excludes_iterator.chain(exclude_from_iterator) {
+        if matches.is_present(options::VERBOSE) {
+            println!("adding {:?} to the exclude list ", &f);
+        }
+        match parse_glob::from_str(&f) {
+            Ok(glob) => exclude_patterns.push(glob),
+            Err(err) => return Err(DuError::InvalidGlob(err.to_string()).into()),
        }
-        Ok(builder)
-    } else {
-        Ok(Vec::new())
    }
+    Ok(exclude_patterns)
 }

 #[uucore::main]
@ -615,85 +589,84 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
        "\n"
    };

-    let excludes = get_glob_ignore(&matches)?;
+    let excludes = build_exclude_patterns(&matches)?;

    let mut grand_total = 0;
    'loop_file: for path_string in files {
        // Skip if we don't want to ignore anything
        if !&excludes.is_empty() {
            for pattern in &excludes {
-                {
-                    if pattern.matches(path_string) {
-                        // if the directory is ignored, leave early
-                        if options.verbose {
-                            println!("{} ignored", path_string.quote());
-                        }
-                        continue 'loop_file;
+                if pattern.matches(path_string) {
+                    // if the directory is ignored, leave early
+                    if options.verbose {
+                        println!("{} ignored", path_string.quote());
                    }
+                    continue 'loop_file;
                }
            }
        }

        let path = PathBuf::from(&path_string);
-        match Stat::new(path, &options) {
-            Ok(stat) => {
-                let mut inodes: HashSet<FileInfo> = HashSet::new();
-                if let Some(inode) = stat.inode {
-                    inodes.insert(inode);
+        // Check existence of path provided in argument
+        if let Ok(stat) = Stat::new(path, &options) {
+            // Kick off the computation of disk usage from the initial path
+            let mut inodes: HashSet<FileInfo> = HashSet::new();
+            if let Some(inode) = stat.inode {
+                inodes.insert(inode);
+            }
+            let iter = du(stat, &options, 0, &mut inodes, &excludes);
+
+            // Sum up all the returned `Stat`s and display results
+            let (_, len) = iter.size_hint();
+            let len = len.unwrap();
+            for (index, stat) in iter.enumerate() {
+                let size = choose_size(&matches, &stat);
+
+                if threshold.map_or(false, |threshold| threshold.should_exclude(size)) {
+                    continue;
                }
-                let iter = du(stat, &options, 0, &mut inodes, &excludes);
-                let (_, len) = iter.size_hint();
-                let len = len.unwrap();
-                for (index, stat) in iter.enumerate() {
-                    let size = choose_size(&matches, &stat);

-                    if threshold.map_or(false, |threshold| threshold.should_exclude(size)) {
-                        continue;
-                    }
-
-                    if matches.contains_id(options::TIME) {
-                        let tm = {
-                            let secs = {
-                                match matches.value_of(options::TIME) {
-                                    Some(s) => match s {
-                                        "ctime" | "status" => stat.modified,
-                                        "access" | "atime" | "use" => stat.accessed,
-                                        "birth" | "creation" => stat
-                                            .created
-                                            .ok_or_else(|| DuError::InvalidTimeArg(s.into()))?,
-                                        // below should never happen as clap already restricts the values.
-                                        _ => unreachable!("Invalid field for --time"),
-                                    },
-                                    None => stat.modified,
-                                }
-                            };
-                            DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs(secs))
+                if matches.is_present(options::TIME) {
+                    let tm = {
+                        let secs = {
+                            match matches.value_of(options::TIME) {
+                                Some(s) => match s {
+                                    "ctime" | "status" => stat.modified,
+                                    "access" | "atime" | "use" => stat.accessed,
+                                    "birth" | "creation" => stat
+                                        .created
+                                        .ok_or_else(|| DuError::InvalidTimeArg(s.into()))?,
+                                    // below should never happen as clap already restricts the values.
+                                    _ => unreachable!("Invalid field for --time"),
+                                },
+                                None => stat.modified,
+                            }
                        };
-                        if !summarize || index == len - 1 {
-                            let time_str = tm.format(time_format_str).to_string();
-                            print!("{}\t{}\t", convert_size(size), time_str);
-                            print_verbatim(stat.path).unwrap();
-                            print!("{}", line_separator);
-                        }
-                    } else if !summarize || index == len - 1 {
-                        print!("{}\t", convert_size(size));
+                        DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs(secs))
+                    };
+                    if !summarize || index == len - 1 {
+                        let time_str = tm.format(time_format_str).to_string();
+                        print!("{}\t{}\t", convert_size(size), time_str);
                        print_verbatim(stat.path).unwrap();
                        print!("{}", line_separator);
                    }
-                    if options.total && index == (len - 1) {
-                        // The last element will be the total size of the the path under
-                        // path_string.  We add it to the grand total.
-                        grand_total += size;
-                    }
+                } else if !summarize || index == len - 1 {
+                    print!("{}\t", convert_size(size));
+                    print_verbatim(stat.path).unwrap();
+                    print!("{}", line_separator);
+                }
+                if options.total && index == (len - 1) {
+                    // The last element will be the total size of the the path under
+                    // path_string.  We add it to the grand total.
+                    grand_total += size;
                }
            }
-            Err(_) => {
-                show_error!(
-                    "{}: {}",
-                    path_string.maybe_quote(),
-                    "No such file or directory"
-                );
-            }
+        } else {
+            show_error!(
+                "{}: {}",
+                path_string.maybe_quote(),
+                "No such file or directory"
+            );
        }
    }

--- a/src/uucore/Cargo.toml
+++ b/src/uucore/Cargo.toml
@ -23,6 +23,7 @@ clap = "3.2"
 dns-lookup = { version="1.0.5", optional=true }
 dunce = "1.0.0"
 wild = "2.0"
+glob = "0.3.0"
 # * optional
 itertools = { version="0.10.0", optional=true }
 thiserror = { version="1.0", optional=true }
--- a/src/uucore/src/lib/lib.rs
+++ b/src/uucore/src/lib/lib.rs
@ -29,6 +29,7 @@ pub use crate::mods::ranges;
 pub use crate::mods::version_cmp;

 // * string parsing modules
+pub use crate::parser::parse_glob;
 pub use crate::parser::parse_size;
 pub use crate::parser::parse_time;

--- a/src/uucore/src/lib/parser.rs
+++ b/src/uucore/src/lib/parser.rs
@ -1,2 +1,3 @@
+pub mod parse_glob;
 pub mod parse_size;
 pub mod parse_time;
--- a/src/uucore/src/lib/parser/parse_glob.rs
+++ b/src/uucore/src/lib/parser/parse_glob.rs
@ -0,0 +1,109 @@
+//! Parsing a glob Pattern from a string.
+//!
+//! Use the [`from_str`] function to parse a [`Pattern`] from a string.
+
+// cSpell:words fnmatch
+
+use glob::{Pattern, PatternError};
+
+/// Replace the `^` that negates a `[^...]` character set with `!`; all else is kept.
+fn fix_negation(glob: &str) -> String {
+    let mut chars = glob.chars().collect::<Vec<_>>();
+    let mut i = 0;
+    // Bound by the char count: `glob.len()` is bytes and overshoots on non-ASCII input.
+    while i + 3 < chars.len() {
+        if chars[i] == '[' && chars[i + 1] == '^' {
+            match chars[i + 3..].iter().position(|x| *x == ']') {
+                None => break, // no `]` left, so no later set can be closed either
+                Some(j) => {
+                    chars[i + 1] = '!';
+                    i += j + 4;
+                    continue;
+                }
+            }
+        }
+        i += 1;
+    }
+
+    chars.into_iter().collect::<String>()
+}
+
+/// Parse a glob Pattern from a string.
+///
+/// A caret or circumflex (^) that negates a set of characters is replaced with
+/// an exclamation mark (!) before parsing, so that rust's glob matching behaves
+/// the way the GNU utils' fnmatch does.
+///
+/// Returns a [`PatternError`] if the amended string is not a valid glob.
+///
+/// # Examples
+///
+/// ```rust
+/// use uucore::parse_glob::from_str;
+/// assert!(!from_str("[^abc]").unwrap().matches("a"));
+/// assert!(from_str("[^abc]").unwrap().matches("x"));
+/// ```
+pub fn from_str(glob: &str) -> Result<Pattern, PatternError> {
+    Pattern::new(&fix_negation(glob))
+}
+
+#[cfg(test)]
+mod tests {
+    // Checks that only a `^` directly after an opening `[` is rewritten to `!`.
+    use super::*;
+
+    #[test]
+    fn test_from_str() {
+        assert_eq!(from_str("[^abc]").unwrap(), Pattern::new("[!abc]").unwrap());
+    }
+
+    #[test]
+    fn test_fix_negation() {
+        // Happy/Simple case
+        assert_eq!(fix_negation("[^abc]"), "[!abc]");
+
+        // Should fix negations in a long glob
+        assert_eq!(fix_negation("foo[abc]  bar[^def]"), "foo[abc]  bar[!def]");
+
+        // Should fix multiple negations in a glob
+        assert_eq!(fix_negation("foo[^abc]bar[^def]"), "foo[!abc]bar[!def]");
+
+        // Should fix negation of the single character ]
+        assert_eq!(fix_negation("[^]]"), "[!]]");
+
+        // Should fix negation of the single character ^
+        assert_eq!(fix_negation("[^^]"), "[!^]");
+
+        // Should fix negation of the space character
+        assert_eq!(fix_negation("[^ ]"), "[! ]");
+
+        // Complicated patterns
+        assert_eq!(fix_negation("[^][]"), "[!][]");
+        assert_eq!(fix_negation("[^[]]"), "[![]]");
+
+        // More complex patterns that should be replaced
+        assert_eq!(fix_negation("[[]] [^a]"), "[[]] [!a]");
+        assert_eq!(fix_negation("[[] [^a]"), "[[] [!a]");
+        assert_eq!(fix_negation("[]] [^a]"), "[]] [!a]");
+    }
+
+    #[test]
+    fn test_fix_negation_should_not_amend() {
+        assert_eq!(fix_negation("abc"), "abc");
+
+        // Glob that specifically matches either [ or ^
+        assert_eq!(fix_negation("[[^]"), "[[^]");
+
+        // Glob that specifically matches either space or ^
+        assert_eq!(fix_negation("[ ^]"), "[ ^]");
+
+        // Glob that specifically matches either [, space or ^
+        assert_eq!(fix_negation("[[ ^]"), "[[ ^]");
+        assert_eq!(fix_negation("[ [^]"), "[ [^]");
+
+        // Invalid globs (according to rust's glob implementation) will remain unamended
+        assert_eq!(fix_negation("[^]"), "[^]");
+        assert_eq!(fix_negation("[^"), "[^");
+        assert_eq!(fix_negation("[][^]"), "[][^]");
+    }
+}
--- a/tests/by-util/test_du.rs
+++ b/tests/by-util/test_du.rs
@ -747,6 +747,40 @@ fn test_du_exclude_mix() {
    assert!(result.stdout_str().contains("xcwww"));
 }

+#[test]
+// Disabled on Windows: the exclude patterns below are written with `/`,
+// and the test would be more complex if it had to support `\` too.
+#[cfg(not(target_os = "windows"))]
+fn test_du_complex_exclude_patterns() {
+    let ts = TestScenario::new(util_name!());
+    let at = &ts.fixtures;
+
+    at.mkdir_all("azerty/xcwww/azeaze");
+    at.mkdir_all("azerty/xcwww/qzerty");
+    at.mkdir_all("azerty/xcwww/amazing");
+
+    // Negation in glob should work with both ^ and !
+    let result = ts
+        .ucmd()
+        .arg("--exclude=azerty/*/[^q]*")
+        .arg("azerty")
+        .succeeds();
+    assert!(!result.stdout_str().contains("amazing"));
+    assert!(result.stdout_str().contains("qzerty"));
+    assert!(!result.stdout_str().contains("azeaze"));
+    assert!(result.stdout_str().contains("xcwww"));
+
+    let result = ts
+        .ucmd()
+        .arg("--exclude=azerty/*/[!q]*")
+        .arg("azerty")
+        .succeeds();
+    assert!(!result.stdout_str().contains("amazing"));
+    assert!(result.stdout_str().contains("qzerty"));
+    assert!(!result.stdout_str().contains("azeaze"));
+    assert!(result.stdout_str().contains("xcwww"));
+}
+
 #[test]
 fn test_du_exclude_several_components() {
    let ts = TestScenario::new(util_name!());