split: implementing separator option (#5331)

* split: implementing separator option

* split: separator option - handle multiple update

* split: style

* split: separator tests

* split: separator tests - stdin in ci/cd

* split: tests - ci/cd stdin errors

* split: refactor based on feedback

* split: improve test coverage

* split: fix broken pipe error in tests with stdin

* split: fix for handle_multiple_separator_options

* split: comments

* split: refactor separator code

* split: changes based on feedback

* split: changes based on feedback
This commit is contained in:
Yury Zhytkou 2023-10-02 18:42:46 -04:00 committed by GitHub
parent 9f6a720582
commit c5a0aa92f8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 348 additions and 38 deletions

View file

@ -11,3 +11,16 @@ Create output files containing consecutive or interleaved sections of input
## After Help
Output fixed-size pieces of INPUT to PREFIXaa, PREFIXab, ...; default size is 1000, and default PREFIX is 'x'. With no INPUT, or when INPUT is -, read standard input.
The SIZE argument is an integer and optional unit (example: 10K is 10*1024).
Units are K,M,G,T,P,E,Z,Y,R,Q (powers of 1024) or KB,MB,... (powers of 1000).
Binary prefixes can be used, too: KiB=K, MiB=M, and so on.
CHUNKS may be:
- N split into N files based on size of input
- K/N output Kth of N to stdout
- l/N split into N files without splitting lines/records
- l/K/N output Kth of N to stdout without splitting lines/records
- r/N like 'l' but use round robin distribution
- r/K/N likewise but only output Kth of N to stdout

View file

@ -11,7 +11,7 @@ mod platform;
use crate::filenames::FilenameIterator;
use crate::filenames::SuffixType;
use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command};
use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command, ValueHint};
use std::env;
use std::ffi::OsString;
use std::fmt;
@ -39,6 +39,7 @@ static OPT_HEX_SUFFIXES_SHORT: &str = "-x";
static OPT_SUFFIX_LENGTH: &str = "suffix-length";
static OPT_DEFAULT_SUFFIX_LENGTH: &str = "0";
static OPT_VERBOSE: &str = "verbose";
static OPT_SEPARATOR: &str = "separator";
//The ---io and ---io-blksize parameters are consumed and ignored.
//The parameter is included to make GNU coreutils tests pass.
static OPT_IO: &str = "-io";
@ -55,7 +56,6 @@ const AFTER_HELP: &str = help_section!("after help", "split.md");
#[uucore::main]
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let (args, obs_lines) = handle_obsolete(args);
let matches = uu_app().try_get_matches_from(args)?;
match Settings::from(&matches, &obs_lines) {
@ -145,6 +145,7 @@ fn should_extract_obs_lines(
&& !slice.starts_with("-C")
&& !slice.starts_with("-l")
&& !slice.starts_with("-n")
&& !slice.starts_with("-t")
}
/// Helper function to [`filter_args`]
@ -208,13 +209,18 @@ fn handle_preceding_options(
|| &slice[2..] == OPT_ADDITIONAL_SUFFIX
|| &slice[2..] == OPT_FILTER
|| &slice[2..] == OPT_NUMBER
|| &slice[2..] == OPT_SUFFIX_LENGTH;
|| &slice[2..] == OPT_SUFFIX_LENGTH
|| &slice[2..] == OPT_SEPARATOR;
}
// capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace)
// following slice should be treaded as value for this option
// even if it starts with '-' (which would be treated as hyphen prefixed value)
*preceding_short_opt_req_value =
slice == "-b" || slice == "-C" || slice == "-l" || slice == "-n" || slice == "-a";
*preceding_short_opt_req_value = slice == "-b"
|| slice == "-C"
|| slice == "-l"
|| slice == "-n"
|| slice == "-a"
|| slice == "-t";
// slice is a value
// reset preceding option flags
if !slice.starts_with('-') {
@ -278,7 +284,7 @@ pub fn uu_app() -> Command {
.long(OPT_FILTER)
.allow_hyphen_values(true)
.value_name("COMMAND")
.value_hint(clap::ValueHint::CommandName)
.value_hint(ValueHint::CommandName)
.help(
"write to shell COMMAND; file name is $FILE (Currently not implemented for Windows)",
),
@ -293,7 +299,7 @@ pub fn uu_app() -> Command {
.arg(
Arg::new(OPT_NUMERIC_SUFFIXES_SHORT)
.short('d')
.action(clap::ArgAction::SetTrue)
.action(ArgAction::SetTrue)
.overrides_with_all([
OPT_NUMERIC_SUFFIXES,
OPT_NUMERIC_SUFFIXES_SHORT,
@ -314,12 +320,13 @@ pub fn uu_app() -> Command {
OPT_HEX_SUFFIXES,
OPT_HEX_SUFFIXES_SHORT
])
.value_name("FROM")
.help("same as -d, but allow setting the start value"),
)
.arg(
Arg::new(OPT_HEX_SUFFIXES_SHORT)
.short('x')
.action(clap::ArgAction::SetTrue)
.action(ArgAction::SetTrue)
.overrides_with_all([
OPT_NUMERIC_SUFFIXES,
OPT_NUMERIC_SUFFIXES_SHORT,
@ -340,6 +347,7 @@ pub fn uu_app() -> Command {
OPT_HEX_SUFFIXES,
OPT_HEX_SUFFIXES_SHORT
])
.value_name("FROM")
.help("same as -x, but allow setting the start value"),
)
.arg(
@ -357,6 +365,15 @@ pub fn uu_app() -> Command {
.help("print a diagnostic just before each output file is opened")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new(OPT_SEPARATOR)
.short('t')
.long(OPT_SEPARATOR)
.allow_hyphen_values(true)
.value_name("SEP")
.action(ArgAction::Append)
.help("use SEP instead of newline as the record separator; '\0' (zero) specifies the NUL character"),
)
.arg(
Arg::new(OPT_IO)
.long("io")
@ -372,7 +389,7 @@ pub fn uu_app() -> Command {
.arg(
Arg::new(ARG_INPUT)
.default_value("-")
.value_hint(clap::ValueHint::FilePath),
.value_hint(ValueHint::FilePath),
)
.arg(
Arg::new(ARG_PREFIX)
@ -696,6 +713,7 @@ struct Settings {
filter: Option<String>,
strategy: Strategy,
verbose: bool,
separator: u8,
/// Whether to *not* produce empty files when using `-n`.
///
@ -722,6 +740,12 @@ enum SettingsError {
/// Suffix is not large enough to split into specified chunks
SuffixTooSmall(usize),
/// Multi-character (Invalid) separator
MultiCharacterSeparator(String),
/// Multiple different separator characters
MultipleSeparatorCharacters,
/// The `--filter` option is not supported on Windows.
#[cfg(windows)]
NotSupported,
@ -743,6 +767,12 @@ impl fmt::Display for SettingsError {
Self::Strategy(e) => e.fmt(f),
Self::SuffixNotParsable(s) => write!(f, "invalid suffix length: {}", s.quote()),
Self::SuffixTooSmall(i) => write!(f, "the suffix length needs to be at least {i}"),
Self::MultiCharacterSeparator(s) => {
write!(f, "multi-character separator {}", s.quote())
}
Self::MultipleSeparatorCharacters => {
write!(f, "multiple separator characters specified")
}
Self::SuffixContainsSeparator(s) => write!(
f,
"invalid suffix {}, contains directory separator",
@ -783,6 +813,26 @@ impl Settings {
}
}
}
// Make sure that separator is only one UTF8 character (if specified)
// defaults to '\n' - newline character
// If the same separator (the same value) was used multiple times - `split` should NOT fail
// If the separator was used multiple times but with different values (not all values are the same) - `split` should fail
let separator = match matches.get_many::<String>(OPT_SEPARATOR) {
Some(mut sep_values) => {
let first = sep_values.next().unwrap(); // it is safe to just unwrap here since Clap should not return empty ValuesRef<'_,String> in the option from get_many() call
if !sep_values.all(|s| s == first) {
return Err(SettingsError::MultipleSeparatorCharacters);
}
match first.as_str() {
"\\0" => b'\0',
s if s.as_bytes().len() == 1 => s.as_bytes()[0],
s => return Err(SettingsError::MultiCharacterSeparator(s.to_owned())),
}
}
None => b'\n',
};
let result = Self {
suffix_length: suffix_length_str
.parse()
@ -791,6 +841,7 @@ impl Settings {
suffix_start,
additional_suffix,
verbose: matches.value_source("verbose") == Some(ValueSource::CommandLine),
separator,
strategy,
input: matches.get_one::<String>(ARG_INPUT).unwrap().to_owned(),
prefix: matches.get_one::<String>(ARG_PREFIX).unwrap().to_owned(),
@ -1019,7 +1070,8 @@ impl<'a> Write for LineChunkWriter<'a> {
// corresponds to the current chunk number.
let mut prev = 0;
let mut total_bytes_written = 0;
for i in memchr::memchr_iter(b'\n', buf) {
let sep = self.settings.separator;
for i in memchr::memchr_iter(sep, buf) {
// If we have exceeded the number of lines to write in the
// current chunk, then start a new chunk and its
// corresponding writer.
@ -1036,8 +1088,8 @@ impl<'a> Write for LineChunkWriter<'a> {
}
// Write the line, starting from *after* the previous
// newline character and ending *after* the current
// newline character.
// separator character and ending *after* the current
// separator character.
let n = self.inner.write(&buf[prev..i + 1])?;
total_bytes_written += n;
prev = i + 1;
@ -1175,21 +1227,22 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
}
// Find the first newline character in the buffer.
match memchr::memchr(b'\n', buf) {
// If there is no newline character and the buffer is
// Find the first separator (default - newline character) in the buffer.
let sep = self.settings.separator;
match memchr::memchr(sep, buf) {
// If there is no separator character and the buffer is
// not empty, then write as many bytes as we can and
// then move on to the next chunk if necessary.
None => {
let end = self.num_bytes_remaining_in_current_chunk;
// This is ugly but here to match GNU behavior. If the input
// doesn't end with a \n, pretend that it does for handling
// doesn't end with a separator, pretend that it does for handling
// the second to last segment chunk. See `line-bytes.sh`.
if end == buf.len()
&& self.num_bytes_remaining_in_current_chunk
< self.chunk_size.try_into().unwrap()
&& buf[buf.len() - 1] != b'\n'
&& buf[buf.len() - 1] != sep
{
self.num_bytes_remaining_in_current_chunk = 0;
} else {
@ -1200,8 +1253,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
}
}
// If there is a newline character and the line
// (including the newline character) will fit in the
// If there is a separator character and the line
// (including the separator character) will fit in the
// current chunk, then write the entire line and
// continue to the next iteration. (See chunk 1 in the
// example comment above.)
@ -1212,8 +1265,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
buf = &buf[num_bytes_written..];
}
// If there is a newline character, the line
// (including the newline character) will not fit in
// If there is a separator character, the line
// (including the separator character) will not fit in
// the current chunk, *and* no other lines have been
// written to the current chunk, then write as many
// bytes as we can and continue to the next
@ -1230,8 +1283,8 @@ impl<'a> Write for LineBytesChunkWriter<'a> {
buf = &buf[num_bytes_written..];
}
// If there is a newline character, the line
// (including the newline character) will not fit in
// If there is a separator character, the line
// (including the separator character) will not fit in
// the current chunk, and at least one other line has
// been written to the current chunk, then signal to
// the next iteration that a new chunk needs to be
@ -1489,15 +1542,16 @@ where
let mut num_bytes_remaining_in_current_chunk = chunk_size;
let mut i = 0;
for line_result in reader.lines() {
let sep = settings.separator;
for line_result in reader.split(sep) {
let line = line_result.unwrap();
let maybe_writer = writers.get_mut(i);
let writer = maybe_writer.unwrap();
let bytes = line.as_bytes();
let bytes = line.as_slice();
writer.write_all(bytes)?;
writer.write_all(b"\n")?;
writer.write_all(&[sep])?;
// Add one byte for the newline character.
// Add one byte for the separator character.
let num_bytes = bytes.len() + 1;
if num_bytes > num_bytes_remaining_in_current_chunk {
num_bytes_remaining_in_current_chunk = chunk_size;
@ -1546,15 +1600,16 @@ where
let mut num_bytes_remaining_in_current_chunk = chunk_size;
let mut i = 0;
for line_result in reader.lines() {
let sep = settings.separator;
for line_result in reader.split(sep) {
let line = line_result?;
let bytes = line.as_bytes();
let bytes = line.as_slice();
if i == chunk_number {
writer.write_all(bytes)?;
writer.write_all(b"\n")?;
writer.write_all(&[sep])?;
}
// Add one byte for the newline character.
// Add one byte for the separator character.
let num_bytes = bytes.len() + 1;
if num_bytes >= num_bytes_remaining_in_current_chunk {
num_bytes_remaining_in_current_chunk = chunk_size;
@ -1601,13 +1656,14 @@ where
}
let num_chunks: usize = num_chunks.try_into().unwrap();
for (i, line_result) in reader.lines().enumerate() {
let sep = settings.separator;
for (i, line_result) in reader.split(sep).enumerate() {
let line = line_result.unwrap();
let maybe_writer = writers.get_mut(i % num_chunks);
let writer = maybe_writer.unwrap();
let bytes = line.as_bytes();
let bytes = line.as_slice();
writer.write_all(bytes)?;
writer.write_all(b"\n")?;
writer.write_all(&[sep])?;
}
Ok(())
@ -1632,7 +1688,7 @@ where
/// * [`split_into_n_chunks_by_line_round_robin`], which splits its input in the
/// same way, but writes each chunk to its own file.
fn kth_chunk_by_line_round_robin<R>(
_settings: &Settings,
settings: &Settings,
reader: &mut R,
chunk_number: u64,
num_chunks: u64,
@ -1646,12 +1702,13 @@ where
let num_chunks: usize = num_chunks.try_into().unwrap();
let chunk_number: usize = chunk_number.try_into().unwrap();
for (i, line_result) in reader.lines().enumerate() {
let sep = settings.separator;
for (i, line_result) in reader.split(sep).enumerate() {
let line = line_result?;
let bytes = line.as_bytes();
let bytes = line.as_slice();
if (i % num_chunks) == chunk_number {
writer.write_all(bytes)?;
writer.write_all(b"\n")?;
writer.write_all(&[sep])?;
}
}
Ok(())

View file

@ -1483,3 +1483,242 @@ fn test_split_non_utf8_argument_windows() {
.fails()
.stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
}
// Test '--separator' / '-t' option following GNU tests example
// test separators: '\n' , '\0' , ';'
// test with '--lines=2' , '--line-bytes=4' , '--number=l/3' , '--number=r/3' , '--number=l/1/3' , '--number=r/1/3'
#[test]
fn test_split_separator_nl_lines() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--lines=2", "-t", "\n"])
.pipe_in("1\n2\n3\n4\n5\n")
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1\n2\n");
assert_eq!(file_read(&at, "xab"), "3\n4\n");
assert_eq!(file_read(&at, "xac"), "5\n");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_nl_line_bytes() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--line-bytes=4", "-t", "\n"])
.pipe_in("1\n2\n3\n4\n5\n")
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1\n2\n");
assert_eq!(file_read(&at, "xab"), "3\n4\n");
assert_eq!(file_read(&at, "xac"), "5\n");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_nl_number_l() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--number=l/3", "--separator=\n", "fivelines.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1\n2\n");
assert_eq!(file_read(&at, "xab"), "3\n4\n");
assert_eq!(file_read(&at, "xac"), "5\n");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_nl_number_r() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--number=r/3", "--separator", "\n", "fivelines.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1\n4\n");
assert_eq!(file_read(&at, "xab"), "2\n5\n");
assert_eq!(file_read(&at, "xac"), "3\n");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_nul_lines() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--lines=2", "-t", "\\0", "separator_nul.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1\02\0");
assert_eq!(file_read(&at, "xab"), "3\04\0");
assert_eq!(file_read(&at, "xac"), "5\0");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_nul_line_bytes() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--line-bytes=4", "-t", "\\0", "separator_nul.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1\02\0");
assert_eq!(file_read(&at, "xab"), "3\04\0");
assert_eq!(file_read(&at, "xac"), "5\0");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_nul_number_l() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--number=l/3", "--separator=\\0", "separator_nul.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1\02\0");
assert_eq!(file_read(&at, "xab"), "3\04\0");
assert_eq!(file_read(&at, "xac"), "5\0");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_nul_number_r() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--number=r/3", "--separator=\\0", "separator_nul.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1\04\0");
assert_eq!(file_read(&at, "xab"), "2\05\0");
assert_eq!(file_read(&at, "xac"), "3\0");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_semicolon_lines() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--lines=2", "-t", ";", "separator_semicolon.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1;2;");
assert_eq!(file_read(&at, "xab"), "3;4;");
assert_eq!(file_read(&at, "xac"), "5;");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_semicolon_line_bytes() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--line-bytes=4", "-t", ";", "separator_semicolon.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1;2;");
assert_eq!(file_read(&at, "xab"), "3;4;");
assert_eq!(file_read(&at, "xac"), "5;");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_semicolon_number_l() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--number=l/3", "--separator=;", "separator_semicolon.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1;2;");
assert_eq!(file_read(&at, "xab"), "3;4;");
assert_eq!(file_read(&at, "xac"), "5;");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_semicolon_number_r() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["--number=r/3", "--separator=;", "separator_semicolon.txt"])
.succeeds();
assert_eq!(file_read(&at, "xaa"), "1;4;");
assert_eq!(file_read(&at, "xab"), "2;5;");
assert_eq!(file_read(&at, "xac"), "3;");
assert!(!at.plus("xad").exists());
}
#[test]
fn test_split_separator_semicolon_number_kth_l() {
new_ucmd!()
.args(&[
"--number=l/1/3",
"--separator",
";",
"separator_semicolon.txt",
])
.succeeds()
.stdout_only("1;2;");
}
#[test]
fn test_split_separator_semicolon_number_kth_r() {
new_ucmd!()
.args(&[
"--number=r/1/3",
"--separator",
";",
"separator_semicolon.txt",
])
.succeeds()
.stdout_only("1;4;");
}
// Test error edge cases for separator option
#[test]
fn test_split_separator_no_value() {
new_ucmd!()
.args(&["-t"])
.ignore_stdin_write_error()
.pipe_in("a\n")
.fails()
.stderr_contains(
"error: a value is required for '--separator <SEP>' but none was supplied",
);
}
#[test]
fn test_split_separator_invalid_usage() {
let scene = TestScenario::new(util_name!());
scene
.ucmd()
.args(&["--separator=xx"])
.ignore_stdin_write_error()
.pipe_in("a\n")
.fails()
.no_stdout()
.stderr_contains("split: multi-character separator 'xx'");
scene
.ucmd()
.args(&["-ta", "-tb"])
.ignore_stdin_write_error()
.pipe_in("a\n")
.fails()
.no_stdout()
.stderr_contains("split: multiple separator characters specified");
scene
.ucmd()
.args(&["-t'\n'", "-tb"])
.ignore_stdin_write_error()
.pipe_in("a\n")
.fails()
.no_stdout()
.stderr_contains("split: multiple separator characters specified");
}
// Test using same separator multiple times
#[test]
fn test_split_separator_same_multiple() {
let scene = TestScenario::new(util_name!());
scene
.ucmd()
.args(&["--separator=:", "--separator=:", "fivelines.txt"])
.succeeds();
scene
.ucmd()
.args(&["-t:", "--separator=:", "fivelines.txt"])
.succeeds();
scene
.ucmd()
.args(&["-t", ":", "-t", ":", "fivelines.txt"])
.succeeds();
scene
.ucmd()
.args(&["-t:", "-t:", "-t,", "fivelines.txt"])
.fails();
}

BIN
tests/fixtures/split/separator_nul.txt vendored Normal file

Binary file not shown.

View file

@ -0,0 +1 @@
1;2;3;4;5;