update nu-glob based on latest glob 0.3.1 changes (#9099)

# Description
This PR updates `nu-glob` to add the latest changes and updates from
`rust-lang/glob` [v0.3.1](https://github.com/rust-lang/glob).

With these changes you can do this type of globbing
```rust
/// - `?` matches any single character.
///
/// - `*` matches any (possibly empty) sequence of characters.
///
/// - `**` matches the current directory and arbitrary subdirectories. This
///   sequence **must** form a single path component, so both `**a` and `b**`
///   are invalid and will result in an error.  A sequence of more than two
///   consecutive `*` characters is also invalid.
///
/// - `[...]` matches any character inside the brackets.  Character sequences
///   can also specify ranges of characters, as ordered by Unicode, so e.g.
///   `[0-9]` specifies any character between 0 and 9 inclusive. An unclosed
///   bracket is invalid.
///
/// - `[!...]` is the negation of `[...]`, i.e. it matches any characters
///   **not** in the brackets.
///
/// - The metacharacters `?`, `*`, `[`, `]` can be matched by using brackets
///   (e.g. `[?]`).  When a `]` occurs immediately following `[` or `[!` then it
///   is interpreted as being part of, rather than ending, the character set, so
///   `]` and NOT `]` can be matched by `[]]` and `[!]]` respectively.  The `-`
///   character can be specified inside a character sequence pattern by placing
///   it at the start or the end, e.g. `[abc-]`.
```
Example - with character sequences

![image](https://user-images.githubusercontent.com/343840/236266670-03bf9384-4917-4074-9687-2c1c0d8ef34a.png)

Example - with character sequence negation

![image](https://user-images.githubusercontent.com/343840/236266421-73c3ee2c-1d10-4da0-86be-0afb51b50604.png)

Example - normal globbing

![image](https://user-images.githubusercontent.com/343840/236267138-60f22228-b8d3-4bf2-911b-a80560fdfa4f.png)

Example - with character sequences

![image](https://user-images.githubusercontent.com/343840/236267475-8c38fce9-87fe-4544-9757-34d319ce55b8.png)

Note that if you're using a character sequence by itself, you need to
enclose it in quotes; otherwise nushell will think it's a range. But if
the character sequence is already part of a bare word, no quotes are
necessary, as in the last example.

# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A
clippy::needless_collect -A clippy::result_large_err` to check that
you're using the standard code style
- `cargo test --workspace` to check that all tests pass
- `cargo run -- crates/nu-std/tests/run.nu` to run the tests for the
standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
This commit is contained in:
Darren Schroeder 2023-05-08 09:07:01 -05:00 committed by GitHub
parent a5af77dd72
commit 388e84e7ef
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 288 additions and 14 deletions

View file

@ -143,7 +143,7 @@ impl Command for Ls {
} else if is_empty_dir(current_dir(engine_state, stack)?) {
return Ok(Value::list(vec![], call_span).into_pipeline_data());
} else {
(PathBuf::from("./*"), call_span, false)
(PathBuf::from("*"), call_span, false)
}
}
};

View file

@ -61,7 +61,7 @@
#![doc(
html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
html_favicon_url = "https://www.rust-lang.org/favicon.ico",
html_root_url = "https://docs.rs/glob/0.3.0"
html_root_url = "https://docs.rs/glob/0.3.1"
)]
#![deny(missing_docs)]
@ -80,8 +80,10 @@ use std::io;
use std::path::{self, Component, Path, PathBuf};
use std::str::FromStr;
use CharSpecifier::{CharRange, SingleChar};
use MatchResult::{EntirePatternDoesntMatch, Match, SubPatternDoesntMatch};
use PatternToken::{AnyChar, AnyRecursiveSequence, AnySequence, Char};
use PatternToken::AnyExcept;
use PatternToken::{AnyChar, AnyRecursiveSequence, AnySequence, AnyWithin, Char};
/// An iterator that yields `Path`s from the filesystem that match a particular
/// pattern.
@ -179,7 +181,10 @@ pub fn glob_with(pattern: &str, options: MatchOptions) -> Result<Paths, PatternE
#[cfg(windows)]
fn check_windows_verbatim(p: &Path) -> bool {
match p.components().next() {
Some(Component::Prefix(ref p)) => p.kind().is_verbatim(),
Some(Component::Prefix(ref p)) => {
// Allow VerbatimDisk paths. std canonicalize() generates them, and they work fine
p.kind().is_verbatim() && !matches!(p.kind(), std::path::Prefix::VerbatimDisk(_))
}
_ => false,
}
}
@ -297,6 +302,11 @@ impl GlobError {
}
impl Error for GlobError {
#[allow(deprecated)]
fn description(&self) -> &str {
self.error.description()
}
#[allow(unknown_lints, bare_trait_objects)]
fn cause(&self) -> Option<&dyn Error> {
Some(&self.error)
@ -488,6 +498,21 @@ impl fmt::Display for PatternError {
/// sequence **must** form a single path component, so both `**a` and `b**`
/// are invalid and will result in an error. A sequence of more than two
/// consecutive `*` characters is also invalid.
///
/// - `[...]` matches any character inside the brackets. Character sequences
/// can also specify ranges of characters, as ordered by Unicode, so e.g.
/// `[0-9]` specifies any character between 0 and 9 inclusive. An unclosed
/// bracket is invalid.
///
/// - `[!...]` is the negation of `[...]`, i.e. it matches any characters
/// **not** in the brackets.
///
/// - The metacharacters `?`, `*`, `[`, `]` can be matched by using brackets
/// (e.g. `[?]`). When a `]` occurs immediately following `[` or `[!` then it
/// is interpreted as being part of, rather than ending, the character set, so
/// `]` and NOT `]` can be matched by `[]]` and `[!]]` respectively. The `-`
/// character can be specified inside a character sequence pattern by placing
/// it at the start or the end, e.g. `[abc-]`.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)]
pub struct Pattern {
original: String,
@ -516,10 +541,17 @@ enum PatternToken {
AnyChar,
AnySequence,
AnyRecursiveSequence,
AnyWithin(Vec<CharSpecifier>),
AnyExcept(Vec<CharSpecifier>),
}
#[allow(clippy::enum_variant_names)]
#[derive(Copy, Clone, PartialEq, Eq)]
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
enum CharSpecifier {
SingleChar(char),
CharRange(char, char),
}
#[derive(Copy, Clone, PartialEq)]
enum MatchResult {
Match,
SubPatternDoesntMatch,
@ -529,6 +561,7 @@ enum MatchResult {
const ERROR_WILDCARDS: &str = "wildcards are either regular `*` or recursive `**`";
const ERROR_RECURSIVE_WILDCARDS: &str = "recursive wildcards must form a single path \
component";
const ERROR_INVALID_RANGE: &str = "invalid range pattern";
impl Pattern {
/// This function compiles Unix shell style patterns.
@ -604,6 +637,36 @@ impl Pattern {
tokens.push(AnySequence);
}
}
'[' => {
if i + 4 <= chars.len() && chars[i + 1] == '!' {
match chars[i + 3..].iter().position(|x| *x == ']') {
None => (),
Some(j) => {
let chars = &chars[i + 2..i + 3 + j];
let cs = parse_char_specifiers(chars);
tokens.push(AnyExcept(cs));
i += j + 4;
continue;
}
}
} else if i + 3 <= chars.len() && chars[i + 1] != '!' {
match chars[i + 2..].iter().position(|x| *x == ']') {
None => (),
Some(j) => {
let cs = parse_char_specifiers(&chars[i + 1..i + 2 + j]);
tokens.push(AnyWithin(cs));
i += j + 3;
continue;
}
}
}
// if we get here then this is not a valid range pattern
return Err(PatternError {
pos: i,
msg: ERROR_INVALID_RANGE,
});
}
c => {
tokens.push(Char(c));
i += 1;
@ -618,6 +681,28 @@ impl Pattern {
})
}
/// Escape the glob metacharacters in `s` so that they are matched literally.
///
/// Each of `?`, `*`, `[` and `]` is wrapped in its own pair of brackets
/// (for example `*` becomes `[*]`); every other character is copied through
/// unchanged. Compiling the returned string into a `Pattern` yields a
/// pattern that matches the input string and nothing else.
pub fn escape(s: &str) -> String {
    s.chars().fold(String::new(), |mut out, ch| {
        // `!` is deliberately absent: it is only special inside brackets.
        let is_meta = matches!(ch, '?' | '*' | '[' | ']');
        if is_meta {
            out.push('[');
        }
        out.push(ch);
        if is_meta {
            out.push(']');
        }
        out
    })
}
/// Return if the given `str` matches this `Pattern` using the default
/// match options (i.e. `MatchOptions::new()`).
///
@ -627,6 +712,7 @@ impl Pattern {
/// use nu_glob::Pattern;
///
/// assert!(Pattern::new("c?t").unwrap().matches("cat"));
/// assert!(Pattern::new("k[!e]tteh").unwrap().matches("kitteh"));
/// assert!(Pattern::new("d*g").unwrap().matches("doog"));
/// ```
pub fn matches(&self, str: &str) -> bool {
@ -715,7 +801,7 @@ impl Pattern {
let is_sep = path::is_separator(c);
if !match *token {
AnyChar
AnyChar | AnyWithin(..) | AnyExcept(..)
if (options.require_literal_separator && is_sep)
|| (follows_separator
&& options.require_literal_leading_dot
@ -724,6 +810,8 @@ impl Pattern {
false
}
AnyChar => true,
AnyWithin(ref specifiers) => in_char_specifiers(specifiers, c, options),
AnyExcept(ref specifiers) => !in_char_specifiers(specifiers, c, options),
Char(c2) => chars_eq(c, c2, options.case_sensitive),
AnySequence | AnyRecursiveSequence => unreachable!(),
} {
@ -820,6 +908,16 @@ fn fill_todo(
});
match dirs {
Ok(mut children) => {
// FIXME: This check messes up a lot of tests for some reason
// if options.require_literal_leading_dot {
// children.retain(|x| {
// !x.file_name()
// .expect("internal error: getting filename")
// .to_str()
// .expect("internal error: filename to_str")
// .starts_with('.')
// });
// }
children.sort_by(|p1, p2| p2.file_name().cmp(&p1.file_name()));
todo.extend(children.into_iter().map(|x| Ok((x, idx))));
@ -850,6 +948,64 @@ fn fill_todo(
}
}
/// Turn the characters found between `[` and `]` into a list of specifiers.
///
/// Scanning left to right, any three characters of the form `x-y` become a
/// `CharRange(x, y)`; any other character becomes a `SingleChar`. A `-` that
/// is not the middle character of such a triple is therefore taken literally.
fn parse_char_specifiers(s: &[char]) -> Vec<CharSpecifier> {
    let mut specifiers = Vec::new();
    let mut rest = s;
    loop {
        match *rest {
            // Nothing left to consume.
            [] => break,
            // `x-y`: consume all three characters as one inclusive range.
            [lo, '-', hi, ..] => {
                specifiers.push(CharRange(lo, hi));
                rest = &rest[3..];
            }
            // Anything else is a single literal character.
            [single, ..] => {
                specifiers.push(SingleChar(single));
                rest = &rest[1..];
            }
        }
    }
    specifiers
}
/// Report whether `c` is accepted by at least one of `specifiers`.
///
/// A `SingleChar` is compared through `chars_eq`, honouring
/// `options.case_sensitive`. A `CharRange` accepts `c` when it lies within
/// the inclusive range as written; additionally, when matching is
/// case-insensitive and `c` and both endpoints are ASCII, a range whose
/// endpoints are both letters also accepts `c` if its lowercase form lies
/// within the lowercased range.
fn in_char_specifiers(specifiers: &[CharSpecifier], c: char, options: MatchOptions) -> bool {
    specifiers.iter().any(|&specifier| match specifier {
        SingleChar(sc) => chars_eq(c, sc, options.case_sensitive),
        CharRange(start, end) => {
            // FIXME: work with non-ascii chars properly (issue #1347)
            let all_ascii = c.is_ascii() && start.is_ascii() && end.is_ascii();
            let folded_hit = !options.case_sensitive && all_ascii && {
                let lo = start.to_ascii_lowercase();
                let hi = end.to_ascii_lowercase();
                let lo_upper = lo
                    .to_uppercase()
                    .next()
                    .expect("internal error: getting start uppercase");
                let hi_upper = hi
                    .to_uppercase()
                    .next()
                    .expect("internal error: getting end uppercase");
                // only allow case insensitive matching when
                // both start and end are within a-z or A-Z
                lo != lo_upper
                    && hi != hi_upper
                    && (lo..=hi).contains(&c.to_ascii_lowercase())
            };
            // The range as written is always checked, whatever the case option.
            folded_hit || (start..=end).contains(&c)
        }
    })
}
/// A helper function to determine if two chars are (possibly case-insensitively) equal.
fn chars_eq(a: char, b: char, case_sensitive: bool) -> bool {
if cfg!(windows) && path::is_separator(a) && path::is_separator(b) {
@ -863,6 +1019,7 @@ fn chars_eq(a: char, b: char, case_sensitive: bool) -> bool {
}
/// Configuration options to modify the behaviour of `Pattern::matches_with(..)`.
#[allow(missing_copy_implementations)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
pub struct MatchOptions {
/// Whether or not patterns should be matched in a case-sensitive manner.
@ -903,6 +1060,11 @@ impl MatchOptions {
/// recursive_match_hidden_dir: true,
/// }
/// ```
///
/// # Note
/// The behavior of this method doesn't match `default()`'s: this returns
/// `case_sensitive` as `true`, whereas `default()` returns it as `false`.
// FIXME: Consider unifying the behavior with `default()` in the next major release.
pub fn new() -> Self {
Self {
case_sensitive: true,
@ -926,16 +1088,29 @@ mod test {
#[test]
fn test_wildcard_errors() {
assert_eq!(Pattern::new("a/**b").unwrap_err().pos, 4);
assert_eq!(Pattern::new("a/bc**").unwrap_err().pos, 3);
assert_eq!(Pattern::new("a/*****").unwrap_err().pos, 4);
assert_eq!(Pattern::new("a/b**c**d").unwrap_err().pos, 2);
assert_eq!(Pattern::new("a**b").unwrap_err().pos, 0);
assert!(Pattern::new("a/**b").unwrap_err().pos == 4);
assert!(Pattern::new("a/bc**").unwrap_err().pos == 3);
assert!(Pattern::new("a/*****").unwrap_err().pos == 4);
assert!(Pattern::new("a/b**c**d").unwrap_err().pos == 2);
assert!(Pattern::new("a**b").unwrap_err().pos == 0);
}
/// Every pattern here opens a bracket at index 3 that is never validly
/// closed, so compilation must fail and report that position.
#[test]
fn test_unclosed_bracket_errors() {
    let malformed = [
        "abc[def", "abc[!def", "abc[", "abc[!", "abc[d", "abc[!d",
        // A `]` directly after `[` or `[!` belongs to the set, so these are unclosed too.
        "abc[]", "abc[!]",
    ];
    for pattern in malformed.iter() {
        assert!(Pattern::new(pattern).unwrap_err().pos == 3);
    }
}
#[test]
fn test_glob_errors() {
assert_eq!(glob("a/**b").err().unwrap().pos, 4);
assert!(glob("a/**b").err().unwrap().pos == 4);
assert!(glob("abc[def").err().unwrap().pos == 3);
}
// this test assumes that there is a /root directory and that
@ -1019,6 +1194,7 @@ mod test {
assert!(Pattern::new("a*a*a*a*a*a*a*a*a")
.unwrap()
.matches("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
assert!(Pattern::new("a*b[xyz]c*d").unwrap().matches("abxcdbxcddd"));
}
#[test]
@ -1072,7 +1248,57 @@ mod test {
#[test]
fn test_lots_of_files() {
// this is a good test because it touches lots of differently named files
glob("/*/*/*/*").unwrap().nth(10000);
glob("/*/*/*/*").unwrap().skip(10000).next();
}
/// Exercises bracket expressions: ranges, negated ranges, ranges mixed with
/// single characters, literal `-`, and an inverted (empty) range.
#[test]
fn test_range_pattern() {
    // A digit range accepts every digit and rejects a non-digit.
    let digit = Pattern::new("a[0-9]b").unwrap();
    for n in 0..10 {
        assert!(digit.matches(&format!("a{}b", n)));
    }
    assert!(!digit.matches("a_b"));

    // The negated range behaves the opposite way.
    let non_digit = Pattern::new("a[!0-9]b").unwrap();
    for n in 0..10 {
        assert!(!non_digit.matches(&format!("a{}b", n)));
    }
    assert!(non_digit.matches("a_b"));

    // A range may appear before, between, or after single characters.
    let ignore_case = MatchOptions {
        case_sensitive: false,
        ..MatchOptions::new()
    };
    for source in ["[a-z123]", "[1a-z23]", "[123a-z]"].iter() {
        let mixed = Pattern::new(source).unwrap();
        for lower in "abcdefghijklmnopqrstuvwxyz".chars() {
            assert!(mixed.matches(&lower.to_string()));
        }
        for upper in "ABCDEFGHIJKLMNOPQRSTUVWXYZ".chars() {
            assert!(mixed.matches_with(&upper.to_string(), ignore_case));
        }
        for single in ["1", "2", "3"].iter() {
            assert!(mixed.matches(single));
        }
    }

    // A `-` at the start or end of the set is a literal dash.
    for source in ["[abc-]", "[-abc]", "[a-c-]"].iter() {
        let with_dash = Pattern::new(source).unwrap();
        for accepted in ["a", "b", "c", "-"].iter() {
            assert!(with_dash.matches(accepted));
        }
        assert!(!with_dash.matches("d"));
    }

    // An inverted range compiles but matches neither endpoint.
    let inverted = Pattern::new("[2-1]").unwrap();
    assert!(!inverted.matches("1"));
    assert!(!inverted.matches("2"));

    // A lone dash inside brackets is literal, also when negated.
    assert!(Pattern::new("[-]").unwrap().matches("-"));
    assert!(!Pattern::new("[!-]").unwrap().matches("-"));
}
#[test]
@ -1093,6 +1319,13 @@ mod test {
assert!(!dir_pat.matches("some/other/path/to/hello.txt"));
}
/// `Pattern::escape` must bracket each metacharacter, leave `!` alone, and
/// produce a pattern that matches the original string.
#[test]
fn test_pattern_escape() {
    let raw = "_[_]_?_*_!_";
    let escaped = Pattern::escape(raw);
    assert_eq!(escaped, "_[[]_[]]_[?]_[*]_!_".to_string());
    assert!(Pattern::new(&escaped).unwrap().matches(raw));
}
#[test]
fn test_pattern_matches_case_insensitive() {
let pat = Pattern::new("aBcDeFg").unwrap();
@ -1109,6 +1342,33 @@ mod test {
assert!(pat.matches_with("AbCdEfG", options));
}
/// Bracket expressions (`[a]` and `[!a]`) must honour the `case_sensitive`
/// option in the same way literal characters do.
#[test]
fn test_pattern_matches_case_insensitive_range() {
    // Options that differ only in case sensitivity; every other flag is off.
    let options_with = |case_sensitive| MatchOptions {
        case_sensitive,
        require_literal_separator: false,
        require_literal_leading_dot: false,
        recursive_match_hidden_dir: false,
    };
    let insensitive = options_with(false);
    let sensitive = options_with(true);

    let within = Pattern::new("[a]").unwrap();
    assert!(within.matches_with("a", insensitive));
    assert!(within.matches_with("A", insensitive));
    assert!(!within.matches_with("A", sensitive));

    let except = Pattern::new("[!a]").unwrap();
    assert!(!except.matches_with("a", insensitive));
    assert!(!except.matches_with("A", insensitive));
    assert!(except.matches_with("A", sensitive));
}
#[test]
fn test_pattern_matches_require_literal_separator() {
let options_require_literal = MatchOptions {
@ -1133,6 +1393,9 @@ mod test {
assert!(!Pattern::new("abc*def")
.unwrap()
.matches_with("abc/def", options_require_literal));
assert!(!Pattern::new("abc[/]def")
.unwrap()
.matches_with("abc/def", options_require_literal));
assert!(Pattern::new("abc/def")
.unwrap()
@ -1143,6 +1406,9 @@ mod test {
assert!(Pattern::new("abc*def")
.unwrap()
.matches_with("abc/def", options_not_require_literal));
assert!(Pattern::new("abc[/]def")
.unwrap()
.matches_with("abc/def", options_not_require_literal));
}
#[test]
@ -1208,6 +1474,14 @@ mod test {
assert!(f(options_not_require_literal_leading_dot));
assert!(!f(options_require_literal_leading_dot));
let f = |options| {
Pattern::new("aaa/[.]bbb")
.unwrap()
.matches_with("aaa/.bbb", options)
};
assert!(f(options_not_require_literal_leading_dot));
assert!(!f(options_require_literal_leading_dot));
let f = |options| Pattern::new("**/*").unwrap().matches_with(".bbb", options);
assert!(f(options_not_require_literal_leading_dot));
assert!(!f(options_require_literal_leading_dot));