tac: add support for --regex option to tac

Add support for `tac --regex`, where the line separator is interpreted
as a regular expression.
This commit is contained in:
Jeffrey Finkelstein 2021-08-01 11:19:44 -04:00
parent 92a1f1422e
commit 664c7a6ec5
4 changed files with 152 additions and 6 deletions

1
Cargo.lock generated
View file

@ -3051,6 +3051,7 @@ version = "0.0.7"
dependencies = [
"clap",
"memchr 2.4.0",
"regex",
"uucore",
"uucore_procs",
]

View file

@ -16,6 +16,7 @@ path = "src/tac.rs"
[dependencies]
memchr = "2"
regex = "1"
clap = { version = "2.33", features = ["wrap_help"] }
uucore = { version=">=0.0.9", package="uucore", path="../../uucore" }
uucore_procs = { version=">=0.0.6", package="uucore_procs", path="../../uucore_procs" }

View file

@ -69,7 +69,7 @@ pub fn uu_app() -> App<'static, 'static> {
Arg::with_name(options::REGEX)
.short("r")
.long(options::REGEX)
.help("interpret the sequence as a regular expression (NOT IMPLEMENTED)")
.help("interpret the sequence as a regular expression")
.takes_value(false),
)
.arg(
@ -82,6 +82,82 @@ pub fn uu_app() -> App<'static, 'static> {
.arg(Arg::with_name(options::FILE).hidden(true).multiple(true))
}
/// Print lines of a buffer in reverse, with line separator given as a regex.
///
/// `data` contains the bytes of the file.
///
/// `pattern` is the regular expression given as a
/// [`regex::bytes::Regex`] (not a [`regex::Regex`], since the input is
/// given as a slice of bytes). If `before` is `true`, then each match
/// of this pattern in `data` is interpreted as the start of a line. If
/// `before` is `false`, then each match of this pattern is interpreted
/// as the end of a line.
///
/// The buffer is scanned from right to left, and a separator is only
/// recognized at an index where a match of `pattern` actually starts.
/// Each search is restricted to the bytes before the most recently
/// found separator.
///
/// This function writes each line in `data` to [`std::io::Stdout`] in
/// reverse. The output is written through a buffered, locked handle
/// that is flushed before the function returns.
///
/// # Errors
///
/// If there is a problem writing to or flushing `stdout`, then this
/// function returns [`std::io::Error`].
fn buffer_tac_regex(
    data: &[u8],
    pattern: regex::bytes::Regex,
    before: bool,
) -> std::io::Result<()> {
    // Lock stdout once and buffer the output, instead of locking the
    // handle again for every line that is written.
    let out = stdout();
    let mut out = std::io::BufWriter::new(out.lock());

    // The index of the line separator for the current line.
    //
    // As we scan through the `data` from right to left, we update this
    // variable each time we find a new line separator. We restrict our
    // regular expression search to only those bytes up to the line
    // separator.
    let mut this_line_end = data.len();

    // The index of the start of the next line in the `data`.
    //
    // If `before` is `true`, then each line starts at the first byte of
    // its line separator. Otherwise, each line starts immediately after
    // the line separator.
    let mut following_line_start = data.len();

    // Visit every index from right to left. When a separator starts at
    // the index, write the line that follows it to stdout.
    //
    // The `before` flag controls whether the line separator appears at
    // the end of the line (as in "abc\ndef\n") or at the beginning of
    // the line (as in "/abc/def").
    for i in (0..data.len()).rev() {
        // `find_at` reports the leftmost match at or after offset `i`,
        // so a match that begins later than `i` (for example an anchor
        // such as `$` matching at the end of the searched slice) does
        // not make `i` a separator position and must be skipped.
        let match_ = match pattern.find_at(&data[..this_line_end], i) {
            Some(found) if found.start() == i => found,
            _ => continue,
        };

        // Record this index as the ending of the current line.
        this_line_end = i;

        // With `before`, the separator belongs to the line that follows
        // it; otherwise that line begins right after the separator.
        let line_start = if before { i } else { match_.end() };
        out.write_all(&data[line_start..following_line_start])?;
        following_line_start = line_start;
    }

    // After the loop terminates, write whatever bytes are remaining at
    // the beginning of the buffer.
    out.write_all(&data[..following_line_start])?;

    // Flush explicitly so that a failed write is returned to the caller
    // rather than being discarded when the buffered writer is dropped.
    out.flush()
}
/// Write lines from `data` to stdout in reverse.
///
/// This function writes to [`stdout`] each line appearing in `data`,
@ -132,7 +208,7 @@ fn buffer_tac(data: &[u8], before: bool, separator: &str) -> std::io::Result<()>
Ok(())
}
fn tac(filenames: Vec<String>, before: bool, _: bool, separator: &str) -> i32 {
fn tac(filenames: Vec<String>, before: bool, regex: bool, separator: &str) -> i32 {
let mut exit_code = 0;
for filename in &filenames {
@ -168,8 +244,12 @@ fn tac(filenames: Vec<String>, before: bool, _: bool, separator: &str) -> i32 {
exit_code = 1;
continue;
};
if regex {
let pattern = crash_if_err!(1, regex::bytes::Regex::new(separator));
buffer_tac_regex(&data, pattern, before)
} else {
buffer_tac(&data, before, separator)
}
.unwrap_or_else(|e| crash!(1, "failed to write to stdout: {}", e));
}
exit_code

View file

@ -1,4 +1,4 @@
// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa
// spell-checker:ignore axxbxx bxxaxx axxx axxxx xxaxx xxax xxxxa axyz zyax zyxa
use crate::common::util::*;
#[test]
@ -205,3 +205,67 @@ fn test_null_separator() {
.succeeds()
.stdout_is("b\0a\0");
}
#[test]
fn test_regex() {
    // Each case is (separator regex, piped input, expected stdout).
    let cases = [
        ("[xyz]+", "axyz", "zyax"),
        (":+", "a:b::c:::d::::", ":::d:::c::b:a:"),
        // Input lines, in order:    "a+-+", "b++--++", "c+d-e+---+"
        // Output lines, in order:   "c+d-e+---+", "b++--++", "a+-+"
        (
            r"[\+]+[-]+[\+]+",
            "a+-+b++--++c+d-e+---+",
            "c+d-e+---+b++--++a+-+",
        ),
    ];

    for &(separator, input, expected) in cases.iter() {
        new_ucmd!()
            .args(&["-r", "-s", separator])
            .pipe_in(input)
            .succeeds()
            .no_stderr()
            .stdout_is(expected);
    }
}
#[test]
fn test_regex_before() {
    // Each case is (separator regex, piped input, expected stdout).
    // With `-b`, every separator match is attached to the start of the
    // line that follows it.
    let cases = [
        ("[xyz]+", "axyz", "zyxa"),
        (":+", ":a::b:::c::::d", ":d::::c:::b::a"),
        // Because `tac` searches for matches of the regular expression
        // from right to left, the second to last line is
        //
        //     +--++b
        //
        // not
        //
        //     ++--++b
        //
        // Input lines, in order:    "+-+a+", "+--++b", "+---+c+d-e"
        // Output lines, in order:   "+---+c+d-e", "+--++b", "+-+a+"
        (
            r"[\+]+[-]+[\+]+",
            "+-+a++--++b+---+c+d-e",
            "+---+c+d-e+--++b+-+a+",
        ),
    ];

    for &(separator, input, expected) in cases.iter() {
        // Every case asserts an empty stderr, matching the checks made
        // by the other regex tests.
        new_ucmd!()
            .args(&["-b", "-r", "-s", separator])
            .pipe_in(input)
            .succeeds()
            .no_stderr()
            .stdout_is(expected);
    }
}