From 664c7a6ec5a12a999af949f4ed1f93227ead5b6f Mon Sep 17 00:00:00 2001
From: Jeffrey Finkelstein <jeffrey.finkelstein@protonmail.com>
Date: Sun, 1 Aug 2021 11:19:44 -0400
Subject: [PATCH] tac: add support for the --regex option

Add support for `tac --regex`, where the line separator is interpreted
as a regular expression.
---
 Cargo.lock                |  1 +
 src/uu/tac/Cargo.toml     |  1 +
 src/uu/tac/src/tac.rs     | 90 ++++++++++++++++++++++++++++++++++++---
 tests/by-util/test_tac.rs | 66 +++++++++++++++++++++++++++-
 4 files changed, 152 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 808f62e15..f8de8d4a0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3051,6 +3051,7 @@ version = "0.0.7"
 dependencies = [
  "clap",
  "memchr 2.4.0",
+ "regex",
  "uucore",
  "uucore_procs",
 ]
diff --git a/src/uu/tac/Cargo.toml b/src/uu/tac/Cargo.toml
index 3ba1497a0..4a91786aa 100644
--- a/src/uu/tac/Cargo.toml
+++ b/src/uu/tac/Cargo.toml
@@ -16,6 +16,7 @@ path = "src/tac.rs"
 
 [dependencies]
 memchr = "2"
+regex = "1"
 clap = { version = "2.33", features = ["wrap_help"] }
 uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
 uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }
diff --git a/src/uu/tac/src/tac.rs b/src/uu/tac/src/tac.rs
index e54697f2b..4a93a7c65 100644
--- a/src/uu/tac/src/tac.rs
+++ b/src/uu/tac/src/tac.rs
@@ -69,7 +69,7 @@ pub fn uu_app() -> App<'static, 'static> {
             Arg::with_name(options::REGEX)
                 .short("r")
                 .long(options::REGEX)
-                .help("interpret the sequence as a regular expression (NOT IMPLEMENTED)")
+                .help("interpret the sequence as a regular expression")
                 .takes_value(false),
         )
         .arg(
@@ -82,6 +82,82 @@ pub fn uu_app() -> App<'static, 'static> {
         .arg(Arg::with_name(options::FILE).hidden(true).multiple(true))
 }
 
+/// Print the lines of a buffer in reverse order, using a regex as the line separator.
+///
+/// `data` contains the bytes of the file.
+///
+/// `pattern` is the regular expression given as a
+/// [`regex::bytes::Regex`] (not a [`regex::Regex`], since the input is
+/// given as a slice of bytes). If `before` is `true`, then each match
+/// of this pattern in `data` is interpreted as the start of a line. If
+/// `before` is `false`, then each match of this pattern is interpreted
+/// as the end of a line.
+///
+/// This function writes the lines of `data` to [`std::io::Stdout`], last
+/// line first; the bytes within each line keep their original order.
+///
+/// # Errors
+///
+/// If there is a problem writing to `stdout`, then this function
+/// returns [`std::io::Error`].
+fn buffer_tac_regex(
+    data: &[u8],
+    pattern: regex::bytes::Regex,
+    before: bool,
+) -> std::io::Result<()> {
+    let mut out = stdout();
+
+    // The index at which the most recently found line separator starts
+    // (initially the end of `data`, before any separator is found).
+    //
+    // As we scan through the `data` from right to left, we update this
+    // variable each time we find a new line separator, so that each
+    // regular expression search only sees the bytes before it.
+    let mut this_line_end = data.len();
+
+    // The index of the start of the next line in the `data`.
+    //
+    // As we scan through the `data` from right to left, we update this
+    // variable each time we find a new line.
+    //
+    // If `before` is `true`, then each line starts immediately before
+    // the line separator. Otherwise, each line starts immediately after
+    // the line separator.
+    let mut following_line_start = data.len();
+
+    // Iterate over each byte in the buffer in reverse. When we find a
+    // line separator, write the line to stdout.
+    //
+    // The `before` flag controls whether the line separator appears at
+    // the end of the line (as in "abc\ndef\n") or at the beginning of
+    // the line (as in "/abc/def").
+    for i in (0..data.len()).rev() {
+        // Search for `pattern` from index `i` up to the line ending found
+        // previously. NOTE(review): `find_at` can report a match that
+        // begins after `i`, yet `i` is used as the match start below.
+        if let Some(match_) = pattern.find_at(&data[..this_line_end], i) {
+            // Record this index as the ending of the current line.
+            this_line_end = i;
+
+            // The length of the match (that is, the line separator), in bytes.
+            let slen = match_.end() - match_.start();
+
+            if before {
+                out.write_all(&data[i..following_line_start])?;
+                following_line_start = i;
+            } else {
+                out.write_all(&data[i + slen..following_line_start])?;
+                following_line_start = i + slen;
+            }
+        }
+    }
+
+    // After the loop terminates, write whatever bytes are remaining at
+    // the beginning of the buffer.
+    out.write_all(&data[0..following_line_start])?;
+    Ok(())
+}
+
 /// Write lines from `data` to stdout in reverse.
 ///
 /// This function writes to [`stdout`] each line appearing in `data`,
@@ -132,7 +208,7 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()>
     Ok(())
 }
 
-fn tac(filenames: Vec<String>, before: bool, _: bool, separator: &str) -> i32 {
+fn tac(filenames: Vec<String>, before: bool, regex: bool, separator: &str) -> i32 {
     let mut exit_code = 0;
 
     for filename in &filenames {
@@ -168,9 +244,13 @@ fn tac(filenames: Vec<String>, before: bool, _: bool, separator: &str) -> i32 {
             exit_code = 1;
             continue;
         };
-
-        buffer_tac(&data, before, separator)
-            .unwrap_or_else(|e| crash!(1, "failed to write to stdout: {}", e));
+        if regex {
+            let pattern = crash_if_err!(1, regex::bytes::Regex::new(separator));
+            buffer_tac_regex(&data, pattern, before)
+        } else {
+            buffer_tac(&data, before, separator)
+        }
+        .unwrap_or_else(|e| crash!(1, "failed to write to stdout: {}", e));
     }
     exit_code
 }
diff --git a/tests/by-util/test_tac.rs b/tests/by-util/test_tac.rs
index 202f76d66..323aa5149 100644
--- a/tests/by-util/test_tac.rs
+++ b/tests/by-util/test_tac.rs
@@ -1,4 +1,4 @@
-// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa
+// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa axyz zyax zyxa
 use crate::common::util::*;
 
 #[test]
@@ -205,3 +205,67 @@ fn test_null_separator() {
         .succeeds()
         .stdout_is("b\0a\0");
 }
+
+#[test]
+fn test_regex() {
+    new_ucmd!()
+        .args(&["-r", "-s", "[xyz]+"])
+        .pipe_in("axyz")
+        .succeeds()
+        .no_stderr()
+        .stdout_is("zyax"); // right-to-left scan: "x", "y", "z" each end a line
+
+    new_ucmd!()
+        .args(&["-r", "-s", ":+"])
+        .pipe_in("a:b::c:::d::::")
+        .succeeds()
+        .no_stderr()
+        .stdout_is(":::d:::c::b:a:");
+
+    new_ucmd!()
+        .args(&["-r", "-s", r"[\+]+[-]+[\+]+"])
+        //   line  0     1        2
+        //        |--||-----||--------|
+        .pipe_in("a+-+b++--++c+d-e+---+")
+        .succeeds()
+        .no_stderr()
+        //   line       2        1    0
+        //          |--------||-----||--|
+        .stdout_is("c+d-e+---+b++--++a+-+");
+}
+
+#[test]
+fn test_regex_before() {
+    new_ucmd!()
+        .args(&["-b", "-r", "-s", "[xyz]+"])
+        .pipe_in("axyz")
+        .succeeds()
+        .no_stderr()
+        .stdout_is("zyxa");
+
+    new_ucmd!()
+        .args(&["-b", "-r", "-s", ":+"])
+        .pipe_in(":a::b:::c::::d")
+        .succeeds()
+        .stdout_is(":d::::c:::b::a");
+
+    // Because `tac` searches for matches of the regular expression from
+    // right to left, the middle line of the input (line 1 below) is
+    //
+    //     +--++b
+    //
+    // not
+    //
+    //     ++--++b
+    //
+    new_ucmd!()
+        .args(&["-b", "-r", "-s", r"[\+]+[-]+[\+]+"])
+        //   line   0     1       2
+        //        |---||----||--------|
+        .pipe_in("+-+a++--++b+---+c+d-e")
+        .succeeds()
+        .no_stderr()
+        //   line       2        1    0
+        //          |--------||----||---|
+        .stdout_is("+---+c+d-e+--++b+-+a+");
+}