From 664c7a6ec5a12a999af949f4ed1f93227ead5b6f Mon Sep 17 00:00:00 2001
From: Jeffrey Finkelstein
Date: Sun, 1 Aug 2021 11:19:44 -0400
Subject: [PATCH] tac: add support for --regex option to tac

Add support for `tac --regex`, where the line separator is interpreted
as a regular expression.

The separator is compiled once, before any file is read, so that an
invalid pattern is reported up front and the compiled regex is shared
by every input file.
---
Review notes: `buffer_tac_regex` borrows the pattern and locks stdout
once; `tac` compiles the pattern before the file loop. The `index` blob
ids below are carried over unchanged from the original submission.

 Cargo.lock                |   1 +
 src/uu/tac/Cargo.toml     |   1 +
 src/uu/tac/src/tac.rs     | 107 ++++++++++++++++++++++++++++++++++++--
 tests/by-util/test_tac.rs |  67 +++++++++++++++++++++++-
 4 files changed, 170 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 808f62e15..f8de8d4a0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3051,6 +3051,7 @@ version = "0.0.7"
 dependencies = [
  "clap",
  "memchr 2.4.0",
+ "regex",
  "uucore",
  "uucore_procs",
 ]
diff --git a/src/uu/tac/Cargo.toml b/src/uu/tac/Cargo.toml
index 3ba1497a0..4a91786aa 100644
--- a/src/uu/tac/Cargo.toml
+++ b/src/uu/tac/Cargo.toml
@@ -16,6 +16,7 @@ path = "src/tac.rs"
 
 [dependencies]
 memchr = "2"
+regex = "1"
 clap = { version = "2.33", features = ["wrap_help"] }
 uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
 uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
diff --git a/src/uu/tac/src/tac.rs b/src/uu/tac/src/tac.rs
index e54697f2b..4a93a7c65 100644
--- a/src/uu/tac/src/tac.rs
+++ b/src/uu/tac/src/tac.rs
@@ -69,7 +69,7 @@ pub fn uu_app() -> App<'static, 'static> {
             Arg::with_name(options::REGEX)
                 .short("r")
                 .long(options::REGEX)
-                .help("interpret the sequence as a regular expression (NOT IMPLEMENTED)")
+                .help("interpret the sequence as a regular expression")
                 .takes_value(false),
         )
         .arg(
@@ -82,6 +82,91 @@ pub fn uu_app() -> App<'static, 'static> {
         .arg(Arg::with_name(options::FILE).hidden(true).multiple(true))
 }
 
+/// Print lines of a buffer in reverse, with line separator given as a regex.
+///
+/// `data` contains the bytes of the file.
+///
+/// `pattern` is the regular expression given as a
+/// [`regex::bytes::Regex`] (not a [`regex::Regex`], since the input is
+/// given as a slice of bytes). It is borrowed rather than consumed so
+/// that the caller can compile it once and reuse it for every file.
+///
+/// If `before` is `true`, then each match of this pattern in `data` is
+/// interpreted as the start of a line. If `before` is `false`, then
+/// each match of this pattern is interpreted as the end of a line.
+///
+/// This function writes each line in `data` to [`std::io::Stdout`] in
+/// reverse.
+///
+/// # Errors
+///
+/// If there is a problem writing to `stdout`, then this function
+/// returns [`std::io::Error`].
+fn buffer_tac_regex(
+    data: &[u8],
+    pattern: &regex::bytes::Regex,
+    before: bool,
+) -> std::io::Result<()> {
+    // Lock stdout once up front rather than implicitly re-locking it
+    // on every `write_all` call in the loop below.
+    let out = stdout();
+    let mut out = out.lock();
+
+    // The index of the line separator for the current line.
+    //
+    // As we scan through the `data` from right to left, we update this
+    // variable each time we find a new line separator. We restrict our
+    // regular expression search to only those bytes up to the line
+    // separator.
+    let mut this_line_end = data.len();
+
+    // The index of the start of the next line in the `data`.
+    //
+    // As we scan through the `data` from right to left, we update this
+    // variable each time we find a new line.
+    //
+    // If `before` is `true`, then each line starts immediately before
+    // the line separator. Otherwise, each line starts immediately after
+    // the line separator.
+    let mut following_line_start = data.len();
+
+    // Iterate over each byte in the buffer in reverse. When we find a
+    // line separator, write the line to stdout.
+    //
+    // The `before` flag controls whether the line separator appears at
+    // the end of the line (as in "abc\ndef\n") or at the beginning of
+    // the line (as in "/abc/def").
+    for i in (0..data.len()).rev() {
+        // Determine if there is a match for `pattern` starting at index
+        // `i` in `data`. Only search up to the line ending that was
+        // found previously.
+        //
+        // `find_at` reports the leftmost match at or after `i`; offsets
+        // to the right of `i` were already examined by earlier
+        // iterations, so a non-empty match found here begins at `i`.
+        if let Some(match_) = pattern.find_at(&data[..this_line_end], i) {
+            // Record this index as the ending of the current line.
+            this_line_end = i;
+
+            // The length of the match (that is, the line separator), in bytes.
+            let slen = match_.end() - match_.start();
+
+            if before {
+                out.write_all(&data[i..following_line_start])?;
+                following_line_start = i;
+            } else {
+                out.write_all(&data[i + slen..following_line_start])?;
+                following_line_start = i + slen;
+            }
+        }
+    }
+
+    // After the loop terminates, write whatever bytes are remaining at
+    // the beginning of the buffer.
+    out.write_all(&data[0..following_line_start])?;
+    Ok(())
+}
+
 /// Write lines from `data` to stdout in reverse.
 ///
 /// This function writes to [`stdout`] each line appearing in `data`,
@@ -132,7 +217,16 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()>
     Ok(())
 }
 
-fn tac(filenames: Vec<String>, before: bool, _: bool, separator: &str) -> i32 {
+fn tac(filenames: Vec<String>, before: bool, regex: bool, separator: &str) -> i32 {
     let mut exit_code = 0;
 
+    // Compile the separator once, before touching any file, so that an
+    // invalid pattern aborts before any input is processed and the
+    // regex is not rebuilt for every file.
+    let pattern = if regex {
+        Some(crash_if_err!(1, regex::bytes::Regex::new(separator)))
+    } else {
+        None
+    };
+
     for filename in &filenames {
@@ -168,9 +262,12 @@ fn tac(filenames: Vec<String>, before: bool, _: bool, separator: &str) -> i32 {
             exit_code = 1;
             continue;
         };
-
-        buffer_tac(&data, before, separator)
-            .unwrap_or_else(|e| crash!(1, "failed to write to stdout: {}", e));
+        if let Some(pattern) = &pattern {
+            buffer_tac_regex(&data, pattern, before)
+        } else {
+            buffer_tac(&data, before, separator)
+        }
+        .unwrap_or_else(|e| crash!(1, "failed to write to stdout: {}", e));
     }
     exit_code
 }
diff --git a/tests/by-util/test_tac.rs b/tests/by-util/test_tac.rs
index 202f76d66..323aa5149 100644
--- a/tests/by-util/test_tac.rs
+++ b/tests/by-util/test_tac.rs
@@ -1,4 +1,4 @@
-// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa
+// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa axyz zyax zyxa
 use crate::common::util::*;
 
 #[test]
@@ -205,3 +205,68 @@ fn test_null_separator() {
         .succeeds()
         .stdout_is("b\0a\0");
 }
+
+#[test]
+fn test_regex() {
+    new_ucmd!()
+        .args(&["-r", "-s", "[xyz]+"])
+        .pipe_in("axyz")
+        .succeeds()
+        .no_stderr()
+        .stdout_is("zyax");
+
+    new_ucmd!()
+        .args(&["-r", "-s", ":+"])
+        .pipe_in("a:b::c:::d::::")
+        .succeeds()
+        .no_stderr()
+        .stdout_is(":::d:::c::b:a:");
+
+    new_ucmd!()
+        .args(&["-r", "-s", r"[\+]+[-]+[\+]+"])
+        // line     0    1        2
+        //        |--||-----||--------|
+        .pipe_in("a+-+b++--++c+d-e+---+")
+        .succeeds()
+        .no_stderr()
+        // line          2       1     0
+        //          |--------||-----||--|
+        .stdout_is("c+d-e+---+b++--++a+-+");
+}
+
+#[test]
+fn test_regex_before() {
+    new_ucmd!()
+        .args(&["-b", "-r", "-s", "[xyz]+"])
+        .pipe_in("axyz")
+        .succeeds()
+        .no_stderr()
+        .stdout_is("zyxa");
+
+    new_ucmd!()
+        .args(&["-b", "-r", "-s", ":+"])
+        .pipe_in(":a::b:::c::::d")
+        .succeeds()
+        .no_stderr()
+        .stdout_is(":d::::c:::b::a");
+
+    // Because `tac` searches for matches of the regular expression from
+    // right to left, the second to last line is
+    //
+    //     +--++b
+    //
+    // not
+    //
+    //     ++--++b
+    //
+    new_ucmd!()
+        .args(&["-b", "-r", "-s", r"[\+]+[-]+[\+]+"])
+        // line     0     1       2
+        //        |---||----||--------|
+        .pipe_in("+-+a++--++b+---+c+d-e")
+        .succeeds()
+        .no_stderr()
+        // line          2       1    0
+        //          |--------||----||---|
+        .stdout_is("+---+c+d-e+--++b+-+a+");
+}