Merge pull request #4232 from TechHara/whitespace

cut: add whitespace option for separating fields
2025-01-19 00:24:13 +00:00 · 2023-01-01 12:58:28 +01:00 · 2023-01-01 12:58:28 +01:00 · 36f3507bed
commit 36f3507bed
parent a3c76678fe 276b115c02
5 changed files with 310 additions and 45 deletions
--- a/docs/src/extensions.md
+++ b/docs/src/extensions.md
@ -33,3 +33,7 @@ We provide a simple implementation of `more`, which is not part of GNU
 coreutils. We do not aim for full compatibility with the `more` utility from
 `util-linux`. Features from more modern pagers (like `less` and `bat`) are
 therefore welcomed.
+
+## `cut`
+
+`cut` can separate fields by whitespace (Space and Tab) with the `-w` flag. This feature is adopted from [FreeBSD](https://www.freebsd.org/cgi/man.cgi?cut).
--- a/src/uu/cut/src/cut.rs
+++ b/src/uu/cut/src/cut.rs
@ -16,14 +16,16 @@ use uucore::display::Quotable;
 use uucore::error::{FromIo, UResult, USimpleError};

 use self::searcher::Searcher;
+use self::whitespace_searcher::WhitespaceSearcher;
 use uucore::ranges::Range;
 use uucore::{format_usage, show, show_error, show_if_err};

 mod searcher;
+mod whitespace_searcher;

 static NAME: &str = "cut";
 static USAGE: &str =
-    "{} [-d] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
+    "{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
 static ABOUT: &str =
    "Prints specified byte or field columns from each line of stdin or the input files";
 static LONG_HELP: &str = "
@ -85,6 +87,11 @@ static LONG_HELP: &str = "
        --delimiter (-d) option. Setting the delimiter is optional.
        If not set, a default delimiter of Tab will be used.

+        If the -w option is provided, fields will be separated by any number
+        of whitespace characters (Space and Tab). The output delimiter will
+        be a Tab unless explicitly specified. Only one of -d or -w option can be specified.
+        This is an extension adopted from FreeBSD.
+
    Optionally Filter based on delimiter
        If the --only-delimited (-s) flag is provided, only lines which
        contain the delimiter will be printed
@ -111,8 +118,13 @@ struct Options {
    zero_terminated: bool,
 }

+/// How input fields are separated when cutting by field.
+enum Delimiter {
+    /// Fields are separated by runs of whitespace; chosen when the `-w` flag is set.
+    Whitespace,
+    /// Fields are separated by this literal delimiter (given with `-d`, or a Tab by default).
+    String(String), // FIXME: use char?
+}
+
 struct FieldOptions {
-    delimiter: String, // one char long, String because of UTF8 representation
+    delimiter: Delimiter,
    out_delimiter: Option<String>,
    only_delimited: bool,
    zero_terminated: bool,
@ -256,32 +268,24 @@ fn cut_fields_delimiter<R: Read>(
    Ok(())
 }

-#[allow(clippy::cognitive_complexity)]
-fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
-    let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
-    if let Some(ref o_delim) = opts.out_delimiter {
-        return cut_fields_delimiter(
-            reader,
-            ranges,
-            &opts.delimiter,
-            opts.only_delimited,
-            newline_char,
-            o_delim,
-        );
-    }
-
+fn cut_fields_whitespace<R: Read>(
+    reader: R,
+    ranges: &[Range],
+    only_delimited: bool,
+    newline_char: u8,
+    out_delim: &str,
+) -> UResult<()> {
    let mut buf_in = BufReader::new(reader);
    let mut out = stdout_writer();
-    let delim_len = opts.delimiter.len();

    let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
        let mut fields_pos = 1;
        let mut low_idx = 0;
-        let mut delim_search = Searcher::new(line, opts.delimiter.as_bytes()).peekable();
+        let mut delim_search = WhitespaceSearcher::new(line).peekable();
        let mut print_delim = false;

        if delim_search.peek().is_none() {
-            if !opts.only_delimited {
+            if !only_delimited {
                out.write_all(line)?;
                if line[line.len() - 1] != newline_char {
                    out.write_all(&[newline_char])?;
@ -290,42 +294,54 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes

            return Ok(true);
        }
-
+        // The logic is identical to `cut_fields_delimiter` function above, which uses
+        // `Searcher` that iterates over and returns the first position of the delimiter character.
+        // The main difference is that `WhitespaceSearcher` returns a pair of the first and last
+        // delimiter character positions, since each delimiter sequence length can vary.
        for &Range { low, high } in ranges {
            if low - fields_pos > 0 {
-                if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
-                    low_idx = if print_delim {
-                        delim_pos
-                    } else {
-                        delim_pos + delim_len
-                    }
-                } else {
-                    break;
-                }
+                // current field is not in the range, so jump to the field corresponding to the
+                // beginning of the range if any
+                low_idx = match delim_search.nth(low - fields_pos - 1) {
+                    Some((_, last)) => last,
+                    None => break,
+                };
            }

-            match delim_search.nth(high - low) {
-                Some(high_idx) => {
-                    let segment = &line[low_idx..high_idx];
-
-                    out.write_all(segment)?;
-
+            // at this point, current field is the first in the range
+            for _ in 0..=high - low {
+                // skip printing delimiter if this is the first matching field for this line
+                if print_delim {
+                    out.write_all(out_delim.as_bytes())?;
+                } else {
                    print_delim = true;
-                    low_idx = high_idx;
-                    fields_pos = high + 1;
                }
-                None => {
-                    let segment = &line[low_idx..line.len()];

-                    out.write_all(segment)?;
+                match delim_search.next() {
+                    // print the current field up to the next whitespace
+                    Some((first, last)) => {
+                        let segment = &line[low_idx..first];

-                    if line[line.len() - 1] == newline_char {
-                        return Ok(true);
+                        out.write_all(segment)?;
+
+                        low_idx = last;
+                        fields_pos = high + 1;
+                    }
+                    None => {
+                        // this is the last field in the line, so print the rest
+                        let segment = &line[low_idx..];
+
+                        out.write_all(segment)?;
+
+                        if line[line.len() - 1] == newline_char {
+                            return Ok(true);
+                        }
+                        break;
                    }
-                    break;
                }
            }
        }
+
        out.write_all(&[newline_char])?;
        Ok(true)
    });
@ -337,6 +353,97 @@ fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> URes
    Ok(())
 }

+/// Cuts the selected field `ranges` out of every record read from `reader` and writes
+/// them to the writer returned by `stdout_writer()`.
+///
+/// Records are terminated by NUL when `opts.zero_terminated` is set, otherwise by a
+/// newline. The work is dispatched on `opts.delimiter`:
+/// - `Delimiter::Whitespace` is forwarded to `cut_fields_whitespace`, using a Tab as the
+///   output delimiter unless `opts.out_delimiter` is set.
+/// - `Delimiter::String` with an explicit output delimiter is forwarded to
+///   `cut_fields_delimiter`.
+/// - `Delimiter::String` without an output delimiter is handled inline below; selected
+///   fields are copied together with the input delimiters that join them.
+///
+/// # Errors
+///
+/// An error reported while iterating over the records is converted into a
+/// `USimpleError` with exit code 1.
+fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
+    let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
+    match opts.delimiter {
+        Delimiter::Whitespace => cut_fields_whitespace(
+            reader,
+            ranges,
+            opts.only_delimited,
+            newline_char,
+            opts.out_delimiter.as_deref().unwrap_or("\t"),
+        ),
+        Delimiter::String(ref delimiter) => {
+            if let Some(ref o_delim) = opts.out_delimiter {
+                return cut_fields_delimiter(
+                    reader,
+                    ranges,
+                    delimiter,
+                    opts.only_delimited,
+                    newline_char,
+                    o_delim,
+                );
+            }
+
+            let mut buf_in = BufReader::new(reader);
+            let mut out = stdout_writer();
+            let delim_len = delimiter.len();
+
+            let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
+                let mut fields_pos = 1;
+                let mut low_idx = 0;
+                let mut delim_search = Searcher::new(line, delimiter.as_bytes()).peekable();
+                let mut print_delim = false;
+
+                // No delimiter in this record: echo it unchanged, unless only delimited
+                // records were requested.
+                if delim_search.peek().is_none() {
+                    if !opts.only_delimited {
+                        out.write_all(line)?;
+                        // The record may lack a terminator; add one so the output record
+                        // is always terminated.
+                        if line[line.len() - 1] != newline_char {
+                            out.write_all(&[newline_char])?;
+                        }
+                    }
+
+                    return Ok(true);
+                }
+
+                for &Range { low, high } in ranges {
+                    if low - fields_pos > 0 {
+                        // Skip the delimiters that precede the first field of this range.
+                        if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
+                            // Once something has been printed, start the segment at the
+                            // delimiter itself so it separates this range from the
+                            // previous output; otherwise start right after it.
+                            low_idx = if print_delim {
+                                delim_pos
+                            } else {
+                                delim_pos + delim_len
+                            }
+                        } else {
+                            break;
+                        }
+                    }
+
+                    // Advance to the delimiter that ends the last field of the range.
+                    match delim_search.nth(high - low) {
+                        Some(high_idx) => {
+                            let segment = &line[low_idx..high_idx];
+
+                            out.write_all(segment)?;
+
+                            print_delim = true;
+                            low_idx = high_idx;
+                            fields_pos = high + 1;
+                        }
+                        None => {
+                            // Fewer delimiters than requested: the range runs to the end
+                            // of the record.
+                            let segment = &line[low_idx..line.len()];
+
+                            out.write_all(segment)?;
+
+                            // The copied tail already ended with the terminator.
+                            if line[line.len() - 1] == newline_char {
+                                return Ok(true);
+                            }
+                            break;
+                        }
+                    }
+                }
+                out.write_all(&[newline_char])?;
+                Ok(true)
+            });
+
+            if let Err(e) = result {
+                return Err(USimpleError::new(1, e.to_string()));
+            }
+
+            Ok(())
+        }
+    }
+}
+
 fn cut_files(mut filenames: Vec<String>, mode: &Mode) {
    let mut stdin_read = false;

@ -387,6 +494,7 @@ mod options {
    pub const ZERO_TERMINATED: &str = "zero-terminated";
    pub const ONLY_DELIMITED: &str = "only-delimited";
    pub const OUTPUT_DELIMITER: &str = "output-delimiter";
+    pub const WHITESPACE_DELIMITED: &str = "whitespace-delimited";
    pub const COMPLEMENT: &str = "complement";
    pub const FILE: &str = "file";
 }
@ -449,9 +557,13 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                };

                let only_delimited = matches.get_flag(options::ONLY_DELIMITED);
+                let whitespace_delimited = matches.get_flag(options::WHITESPACE_DELIMITED);
                let zero_terminated = matches.get_flag(options::ZERO_TERMINATED);

                match matches.get_one::<String>(options::DELIMITER).map(|s| s.as_str()) {
+                    Some(_) if whitespace_delimited => {
+                            Err("invalid input: Only one of --delimiter (-d) or -w option can be specified".into())
+                        }
                    Some(mut delim) => {
                        // GNU's `cut` supports `-d=` to set the delimiter to `=`.
                        // Clap parsing is limited in this situation, see:
@ -474,7 +586,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                            Ok(Mode::Fields(
                                ranges,
                                FieldOptions {
-                                    delimiter: delim,
+                                    delimiter: Delimiter::String(delim),
                                    out_delimiter: out_delim,
                                    only_delimited,
                                    zero_terminated,
@ -485,7 +597,10 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                    None => Ok(Mode::Fields(
                        ranges,
                        FieldOptions {
-                            delimiter: "\t".to_owned(),
+                            delimiter: match whitespace_delimited {
+                                true => Delimiter::Whitespace,
+                                false => Delimiter::String("\t".to_owned()),
+                            },
                            out_delimiter: out_delim,
                            only_delimited,
                            zero_terminated,
@ -508,6 +623,11 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
            {
                Err("invalid input: The '--delimiter' ('-d') option only usable if printing a sequence of fields".into())
            }
+            Mode::Bytes(_, _) | Mode::Characters(_, _)
+                if matches.get_flag(options::WHITESPACE_DELIMITED) =>
+            {
+                Err("invalid input: The '-w' option only usable if printing a sequence of fields".into())
+            }
            Mode::Bytes(_, _) | Mode::Characters(_, _)
                if matches.get_flag(options::ONLY_DELIMITED) =>
            {
@ -563,6 +683,13 @@ pub fn uu_app() -> Command {
                .help("specify the delimiter character that separates fields in the input source. Defaults to Tab.")
                .value_name("DELIM"),
        )
+        .arg(
+            Arg::new(options::WHITESPACE_DELIMITED)
+                .short('w')
+                .help("Use any number of whitespace (Space, Tab) to separate fields in the input source (FreeBSD extension).")
+                .value_name("WHITESPACE")
+                .action(ArgAction::SetTrue),
+        )
        .arg(
            Arg::new(options::FIELDS)
                .short('f')
--- a/src/uu/cut/src/whitespace_searcher.rs
+++ b/src/uu/cut/src/whitespace_searcher.rs
@ -0,0 +1,97 @@
+// This file is part of the uutils coreutils package.
+//
+// For the full copyright and license information, please view the LICENSE
+// file that was distributed with this source code.
+
+// spell-checker:ignore multispace
+
+use memchr::memchr2;
+
+/// Iterator over the runs of consecutive whitespace (space and/or tab) bytes in a byte slice.
+pub struct WhitespaceSearcher<'a> {
+    /// Part of the input that has not been scanned yet.
+    haystack: &'a [u8],
+    /// Offset of `haystack` from the start of the original input.
+    position: usize,
+}
+
+impl<'a> WhitespaceSearcher<'a> {
+    /// Creates a searcher that starts scanning at the beginning of `haystack`.
+    pub fn new(haystack: &'a [u8]) -> WhitespaceSearcher<'a> {
+        let position = 0;
+        Self { haystack, position }
+    }
+}
+
+impl<'a> Iterator for WhitespaceSearcher<'a> {
+    type Item = (usize, usize);
+
+    // Yields one `(first, last)` pair per run of consecutive space/tab bytes, with both
+    // offsets relative to the original input: `first` is the index of the run's first
+    // byte and `last` is the index just past its final byte, so `input[first..last]` is
+    // the delimiter.
+    fn next(&mut self) -> Option<Self::Item> {
+        // Offset of the next whitespace byte within the unscanned remainder, if any.
+        let run_start = memchr2(b' ', b'\t', self.haystack)?;
+
+        // Length of the run; at least 1 because `run_start` points at a whitespace byte.
+        let run_len = self.haystack[run_start..]
+            .iter()
+            .take_while(|&&byte| byte == b' ' || byte == b'\t')
+            .count();
+        let run_end = run_start + run_len;
+
+        let first = self.position + run_start;
+        // Drop everything up to and including the run from the remainder.
+        self.haystack = &self.haystack[run_end..];
+        self.position += run_end;
+        Some((first, self.position))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    // Single spaces at the start, middle and end are each reported as one-byte runs.
+    #[test]
+    fn test_space() {
+        let items = WhitespaceSearcher::new(b" . . ").collect::<Vec<(usize, usize)>>();
+        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
+    }
+
+    // Tabs are treated exactly like spaces.
+    #[test]
+    fn test_tab() {
+        let items = WhitespaceSearcher::new(b"\t.\t.\t").collect::<Vec<(usize, usize)>>();
+        assert_eq!(vec![(0, 1), (2, 3), (4, 5)], items);
+    }
+
+    // An empty input yields no runs.
+    #[test]
+    fn test_empty() {
+        let items = WhitespaceSearcher::new(b"").collect::<Vec<(usize, usize)>>();
+        assert!(items.is_empty());
+    }
+
+    // Shared check for the multi-byte run cases below.
+    fn test_multispace(line: &[u8], expected: &[(usize, usize)]) {
+        let items = WhitespaceSearcher::new(line).collect::<Vec<(usize, usize)>>();
+        assert_eq!(expected, items.as_slice());
+    }
+
+    #[test]
+    fn test_multispace_normal() {
+        let line = b"...  ... \t...\t ... \t ...";
+        let expected = [(3, 5), (8, 10), (13, 15), (18, 21)];
+        test_multispace(line, &expected);
+    }
+
+    #[test]
+    fn test_multispace_begin() {
+        let expected = [(0, 3)];
+        test_multispace(b" \t\t...", &expected);
+    }
+
+    #[test]
+    fn test_multispace_end() {
+        let expected = [(3, 6)];
+        test_multispace(b"...\t  ", &expected);
+    }
+}
--- a/tests/by-util/test_cut.rs
+++ b/tests/by-util/test_cut.rs
@ -81,6 +81,38 @@ fn test_field_sequence() {
    }
 }

+/// With `-w`, the selected fields must match the stored fixture.
+#[test]
+fn test_whitespace_delimited() {
+    let args = ["-w", "-f", COMPLEX_SEQUENCE.sequence, INPUT];
+    new_ucmd!()
+        .args(&args)
+        .succeeds()
+        .stdout_only_fixture("whitespace_delimited.expected");
+}
+
+/// Combining `-w` with an explicit `-d` delimiter must fail with exit code 1.
+#[test]
+fn test_whitespace_with_explicit_delimiter() {
+    let args = ["-w", "-f", COMPLEX_SEQUENCE.sequence, "-d:"];
+    new_ucmd!().args(&args).fails().code_is(1);
+}
+
+/// `-w` is rejected in byte mode (`-b`) with exit code 1.
+#[test]
+fn test_whitespace_with_byte() {
+    let args = ["-w", "-b", COMPLEX_SEQUENCE.sequence];
+    new_ucmd!().args(&args).fails().code_is(1);
+}
+
+/// `-w` is rejected in character mode (`-c`) with exit code 1.
+#[test]
+fn test_whitespace_with_char() {
+    let args = ["-c", COMPLEX_SEQUENCE.sequence, "-w"];
+    new_ucmd!().args(&args).fails().code_is(1);
+}
+
 #[test]
 fn test_specify_delimiter() {
    for param in ["-d", "--delimiter", "--del"] {
--- a/tests/fixtures/cut/whitespace_delimited.expected
+++ b/tests/fixtures/cut/whitespace_delimited.expected
@ -0,0 +1,5 @@
+foo:bar:baz:qux:quux
+one:two:three:four:five:six:seven
+alpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu
+the	quick	fox	over	the	dog
+sally	sells	down	the	seashore	are	the	seashells	sally	sells