coreutils/tests/by-util/test_split.rs
Owen Anderson 9fad6fde35
Fix a bug in split where chunking would be skipped when the chunk size (#3800)
* Fix a bug in split where chunking would be skipped when the chunk size
happened to be an exact divisor of the buffer size used to read the
input stream.

The issue here was that file was being split byte-wise in chunks of 1G.
The input stream was being read in chunks of 8KB, which evenly divides
the chunk size. Because the check to allocate the next output chunk was
done at the bottom of the loop previously, it would never occur because
the current input chunk was fully consumed at that point. By moving the
check to the top of the loop (but still late enough that we know we have
bytes to write) we resolve this issue.

This scenario is unfortunately hard to write a test for, since we don't
explicitly control the input chunk size.

Fixes https://github.com/uutils/coreutils/issues/3790
2022-08-16 11:02:52 +02:00

700 lines
20 KiB
Rust

// * This file is part of the uutils coreutils package.
// *
// * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code.
// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb
extern crate rand;
extern crate regex;
use self::rand::{thread_rng, Rng};
use self::regex::Regex;
use crate::common::util::*;
use rand::SeedableRng;
#[cfg(not(windows))]
use std::env;
use std::path::Path;
use std::{
fs::{read_dir, File},
io::{BufWriter, Read, Write},
};
fn random_chars(n: usize) -> String {
thread_rng()
.sample_iter(&rand::distributions::Alphanumeric)
.map(char::from)
.take(n)
.collect::<String>()
}
struct Glob {
directory: AtPath,
regex: Regex,
}
impl Glob {
fn new(at: &AtPath, directory: &str, regex: &str) -> Self {
Self {
directory: AtPath::new(Path::new(&at.plus_as_string(directory))),
regex: Regex::new(regex).unwrap(),
}
}
fn count(&self) -> usize {
self.collect().len()
}
/// Get all files in `self.directory` that match `self.regex`
fn collect(&self) -> Vec<String> {
read_dir(Path::new(&self.directory.subdir))
.unwrap()
.filter_map(|entry| {
let path = entry.unwrap().path();
let name = self
.directory
.minus_as_string(path.as_path().to_str().unwrap_or(""));
if self.regex.is_match(&name) {
Some(name)
} else {
None
}
})
.collect()
}
/// Accumulate bytes of all files in `self.collect()`
fn collate(&self) -> Vec<u8> {
let mut files = self.collect();
files.sort();
let mut data: Vec<u8> = vec![];
for name in &files {
data.extend(self.directory.read_bytes(name));
}
data
}
}
/// File handle that user can add random bytes (line-formatted or not) to
struct RandomFile {
inner: File,
}
impl RandomFile {
/// Size of each line that's being generated
const LINESIZE: usize = 32;
/// `create()` file handle located at `at` / `name`
fn new(at: &AtPath, name: &str) -> Self {
Self {
inner: File::create(&at.plus(name)).unwrap(),
}
}
fn add_bytes(&mut self, bytes: usize) {
// Note that just writing random characters isn't enough to cover all
// cases. We need truly random bytes.
let mut writer = BufWriter::new(&self.inner);
// Seed the rng so as to avoid spurious test failures.
let mut rng = rand::rngs::StdRng::seed_from_u64(123);
let mut buffer = [0; 1024];
let mut remaining_size = bytes;
while remaining_size > 0 {
let to_write = std::cmp::min(remaining_size, buffer.len());
let buf = &mut buffer[..to_write];
rng.fill(buf);
writer.write_all(buf).unwrap();
remaining_size -= to_write;
}
}
/// Add n lines each of size `RandomFile::LINESIZE`
fn add_lines(&mut self, lines: usize) {
let mut n = lines;
while n > 0 {
writeln!(self.inner, "{}", random_chars(Self::LINESIZE)).unwrap();
n -= 1;
}
}
}
#[test]
fn test_split_default() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "split_default";
RandomFile::new(&at, name).add_lines(2000);
ucmd.args(&[name]).succeeds();
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
assert_eq!(glob.count(), 2);
assert_eq!(glob.collate(), at.read_bytes(name));
}
#[test]
fn test_split_numeric_prefixed_chunks_by_bytes() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "split_num_prefixed_chunks_by_bytes";
RandomFile::new(&at, name).add_bytes(10000);
ucmd.args(&[
"-d", // --numeric-suffixes
"-b", // --bytes
"1000", name, "a",
])
.succeeds();
let glob = Glob::new(&at, ".", r"a\d\d$");
assert_eq!(glob.count(), 10);
for filename in glob.collect() {
assert_eq!(glob.directory.metadata(&filename).len(), 1000);
}
assert_eq!(glob.collate(), at.read_bytes(name));
}
#[test]
fn test_split_str_prefixed_chunks_by_bytes() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "split_str_prefixed_chunks_by_bytes";
RandomFile::new(&at, name).add_bytes(10000);
// Important that this is less than 1024 since that's our internal buffer
// size. Good to test that we don't overshoot.
ucmd.args(&["-b", "1000", name, "b"]).succeeds();
let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$");
assert_eq!(glob.count(), 10);
for filename in glob.collect() {
assert_eq!(glob.directory.metadata(&filename).len(), 1000);
}
assert_eq!(glob.collate(), at.read_bytes(name));
}
// This is designed to test what happens when the desired part size is not a
// multiple of the buffer size and we hopefully don't overshoot the desired part
// size.
#[test]
fn test_split_bytes_prime_part_size() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "test_split_bytes_prime_part_size";
RandomFile::new(&at, name).add_bytes(10000);
// 1753 is prime and greater than the buffer size, 1024.
ucmd.args(&["-b", "1753", name, "b"]).succeeds();
let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$");
assert_eq!(glob.count(), 6);
let mut fns = glob.collect();
// glob.collect() is not guaranteed to return in sorted order, so we sort.
fns.sort();
#[allow(clippy::needless_range_loop)]
for i in 0..5 {
assert_eq!(glob.directory.metadata(&fns[i]).len(), 1753);
}
assert_eq!(glob.directory.metadata(&fns[5]).len(), 1235);
assert_eq!(glob.collate(), at.read_bytes(name));
}
#[test]
fn test_split_num_prefixed_chunks_by_lines() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "split_num_prefixed_chunks_by_lines";
RandomFile::new(&at, name).add_lines(10000);
ucmd.args(&["-d", "-l", "1000", name, "c"]).succeeds();
let glob = Glob::new(&at, ".", r"c\d\d$");
assert_eq!(glob.count(), 10);
assert_eq!(glob.collate(), at.read_bytes(name));
}
#[test]
fn test_split_str_prefixed_chunks_by_lines() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "split_str_prefixed_chunks_by_lines";
RandomFile::new(&at, name).add_lines(10000);
ucmd.args(&["-l", "1000", name, "d"]).succeeds();
let glob = Glob::new(&at, ".", r"d[[:alpha:]][[:alpha:]]$");
assert_eq!(glob.count(), 10);
assert_eq!(glob.collate(), at.read_bytes(name));
}
#[test]
fn test_split_additional_suffix() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "split_additional_suffix";
RandomFile::new(&at, name).add_lines(2000);
ucmd.args(&["--additional-suffix", ".txt", name]).succeeds();
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]].txt$");
assert_eq!(glob.count(), 2);
assert_eq!(glob.collate(), at.read_bytes(name));
}
#[test]
fn test_additional_suffix_no_slash() {
new_ucmd!()
.args(&["--additional-suffix", "a/b"])
.fails()
.usage_error("invalid suffix 'a/b', contains directory separator");
}
// note: the test_filter* tests below are unix-only
// windows support has been waived for now because of the difficulty of getting
// the `cmd` call right
// see https://github.com/rust-lang/rust/issues/29494
#[test]
#[cfg(unix)]
fn test_filter() {
// like `test_split_default()` but run a command before writing
let (at, mut ucmd) = at_and_ucmd!();
let name = "filtered";
let n_lines = 3;
RandomFile::new(&at, name).add_lines(n_lines);
// change all characters to 'i'
ucmd.args(&["--filter=sed s/./i/g > $FILE", name])
.succeeds();
// assert all characters are 'i' / no character is not 'i'
// (assert that command succeeded)
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
assert!(
glob.collate().iter().find(|&&c| {
// is not i
c != (b'i')
// is not newline
&& c != (b'\n')
}) == None
);
}
#[test]
#[cfg(unix)]
fn test_filter_with_env_var_set() {
// This test will ensure that if $FILE env var was set before running --filter, it'll stay that
// way
// implemented like `test_split_default()` but run a command before writing
let (at, mut ucmd) = at_and_ucmd!();
let name = "filtered";
let n_lines = 3;
RandomFile::new(&at, name).add_lines(n_lines);
let env_var_value = "some-value";
env::set_var("FILE", &env_var_value);
ucmd.args(&[format!("--filter={}", "cat > $FILE").as_str(), name])
.succeeds();
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
assert_eq!(glob.collate(), at.read_bytes(name));
assert!(env::var("FILE").unwrap_or_else(|_| "var was unset".to_owned()) == env_var_value);
}
#[test]
#[cfg(unix)]
fn test_filter_command_fails() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "filter-will-fail";
RandomFile::new(&at, name).add_lines(4);
ucmd.args(&["--filter=/a/path/that/totally/does/not/exist", name])
.fails();
}
#[test]
fn test_split_lines_number() {
// Test if stdout/stderr for '--lines' option is correct
let scene = TestScenario::new(util_name!());
let at = &scene.fixtures;
at.touch("file");
scene
.ucmd()
.args(&["--lines", "2", "file"])
.succeeds()
.no_stderr()
.no_stdout();
scene
.ucmd()
.args(&["--lines", "2fb", "file"])
.fails()
.code_is(1)
.stderr_only("split: invalid number of lines: '2fb'");
}
#[test]
fn test_split_invalid_bytes_size() {
new_ucmd!()
.args(&["-b", "1024R"])
.fails()
.code_is(1)
.stderr_only("split: invalid number of bytes: '1024R'");
#[cfg(not(target_pointer_width = "128"))]
new_ucmd!()
.args(&["-b", "1Y"])
.fails()
.code_is(1)
.stderr_only("split: invalid number of bytes: '1Y': Value too large for defined data type");
#[cfg(target_pointer_width = "32")]
{
let sizes = ["1000G", "10T"];
for size in &sizes {
new_ucmd!().args(&["-b", size]).succeeds();
}
}
}
#[test]
fn test_split_chunks_num_chunks_oversized_32() {
#[cfg(target_pointer_width = "32")]
{
let scene = TestScenario::new(util_name!());
let at = &scene.fixtures;
at.touch("file");
scene
.ucmd()
.args(&["--number", "5000000000", "file"])
.fails()
.code_is(1)
.stderr_only("split: Number of chunks too big");
}
}
#[test]
fn test_split_stdin_num_chunks() {
new_ucmd!()
.args(&["--number=1"])
.fails()
.code_is(1)
.stderr_only("split: -: cannot determine file size");
}
fn file_read(at: &AtPath, filename: &str) -> String {
let mut s = String::new();
at.open(filename).read_to_string(&mut s).unwrap();
s
}
// TODO Use char::from_digit() in Rust v1.51.0 or later.
fn char_from_digit(n: usize) -> char {
(b'a' + n as u8) as char
}
/// Test for the default suffix length behavior: dynamically increasing size.
#[test]
fn test_alphabetic_dynamic_suffix_length() {
let (at, mut ucmd) = at_and_ucmd!();
// Split into chunks of one byte each.
//
// The input file has (26^2) - 26 + 1 = 651 bytes. This is just
// enough to force `split` to dynamically increase the length of
// the filename for the very last chunk.
//
// We expect the output files to be named
//
// xaa, xab, xac, ..., xyx, xyy, xyz, xzaaa
//
ucmd.args(&["-b", "1", "sixhundredfiftyonebytes.txt"])
.succeeds();
for i in 0..25 {
for j in 0..26 {
let filename = format!("x{}{}", char_from_digit(i), char_from_digit(j),);
let contents = file_read(&at, &filename);
assert_eq!(contents, "a");
}
}
assert_eq!(file_read(&at, "xzaaa"), "a");
}
/// Test for the default suffix length behavior: dynamically increasing size.
#[test]
fn test_numeric_dynamic_suffix_length() {
let (at, mut ucmd) = at_and_ucmd!();
// Split into chunks of one byte each, use numbers instead of
// letters as file suffixes.
//
// The input file has (10^2) - 10 + 1 = 91 bytes. This is just
// enough to force `split` to dynamically increase the length of
// the filename for the very last chunk.
//
// x00, x01, x02, ..., x87, x88, x89, x9000
//
ucmd.args(&["-d", "-b", "1", "ninetyonebytes.txt"])
.succeeds();
for i in 0..90 {
let filename = format!("x{:02}", i);
let contents = file_read(&at, &filename);
assert_eq!(contents, "a");
}
assert_eq!(file_read(&at, "x9000"), "a");
}
#[test]
fn test_hex_dynamic_suffix_length() {
let (at, mut ucmd) = at_and_ucmd!();
// Split into chunks of one byte each, use hexadecimal digits
// instead of letters as file suffixes.
//
// The input file has (16^2) - 16 + 1 = 241 bytes. This is just
// enough to force `split` to dynamically increase the length of
// the filename for the very last chunk.
//
// x00, x01, x02, ..., xed, xee, xef, xf000
//
ucmd.args(&["-x", "-b", "1", "twohundredfortyonebytes.txt"])
.succeeds();
for i in 0..240 {
let filename = format!("x{:02x}", i);
let contents = file_read(&at, &filename);
assert_eq!(contents, "a");
}
assert_eq!(file_read(&at, "xf000"), "a");
}
#[test]
fn test_suffixes_exhausted() {
new_ucmd!()
.args(&["-b", "1", "-a", "1", "asciilowercase.txt"])
.fails()
.stderr_only("split: output file suffixes exhausted");
}
#[test]
fn test_verbose() {
new_ucmd!()
.args(&["-b", "5", "--verbose", "asciilowercase.txt"])
.succeeds()
.stdout_only(
"creating file 'xaa'
creating file 'xab'
creating file 'xac'
creating file 'xad'
creating file 'xae'
creating file 'xaf'
",
);
}
#[test]
fn test_number() {
let (at, mut ucmd) = at_and_ucmd!();
let file_read = |f| {
let mut s = String::new();
at.open(f).read_to_string(&mut s).unwrap();
s
};
ucmd.args(&["-n", "5", "asciilowercase.txt"]).succeeds();
assert_eq!(file_read("xaa"), "abcde");
assert_eq!(file_read("xab"), "fghij");
assert_eq!(file_read("xac"), "klmno");
assert_eq!(file_read("xad"), "pqrst");
assert_eq!(file_read("xae"), "uvwxyz\n");
}
#[test]
fn test_split_number_with_io_blksize() {
let (at, mut ucmd) = at_and_ucmd!();
let file_read = |f| {
let mut s = String::new();
at.open(f).read_to_string(&mut s).unwrap();
s
};
ucmd.args(&["-n", "5", "asciilowercase.txt", "---io-blksize", "1024"])
.succeeds();
assert_eq!(file_read("xaa"), "abcde");
assert_eq!(file_read("xab"), "fghij");
assert_eq!(file_read("xac"), "klmno");
assert_eq!(file_read("xad"), "pqrst");
assert_eq!(file_read("xae"), "uvwxyz\n");
}
#[test]
fn test_split_default_with_io_blksize() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "split_default_with_io_blksize";
RandomFile::new(&at, name).add_lines(2000);
ucmd.args(&[name, "---io-blksize", "2M"]).succeeds();
let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$");
assert_eq!(glob.count(), 2);
assert_eq!(glob.collate(), at.read_bytes(name));
}
#[test]
fn test_invalid_suffix_length() {
new_ucmd!()
.args(&["-a", "xyz"])
.fails()
.no_stdout()
.stderr_contains("invalid suffix length: 'xyz'");
}
#[test]
fn test_include_newlines() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-l", "2", "fivelines.txt"]).succeeds();
let mut s = String::new();
at.open("xaa").read_to_string(&mut s).unwrap();
assert_eq!(s, "1\n2\n");
let mut s = String::new();
at.open("xab").read_to_string(&mut s).unwrap();
assert_eq!(s, "3\n4\n");
let mut s = String::new();
at.open("xac").read_to_string(&mut s).unwrap();
assert_eq!(s, "5\n");
}
#[test]
fn test_allow_empty_files() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-n", "4", "threebytes.txt"])
.succeeds()
.no_stdout()
.no_stderr();
assert_eq!(at.read("xaa"), "a");
assert_eq!(at.read("xab"), "b");
assert_eq!(at.read("xac"), "c");
assert_eq!(at.read("xad"), "");
}
#[test]
fn test_elide_empty_files() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-e", "-n", "4", "threebytes.txt"])
.succeeds()
.no_stdout()
.no_stderr();
assert_eq!(at.read("xaa"), "a");
assert_eq!(at.read("xab"), "b");
assert_eq!(at.read("xac"), "c");
assert!(!at.plus("xad").exists());
}
#[test]
#[cfg(unix)]
fn test_elide_dev_null() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-e", "-n", "3", "/dev/null"])
.succeeds()
.no_stdout()
.no_stderr();
assert!(!at.plus("xaa").exists());
assert!(!at.plus("xab").exists());
assert!(!at.plus("xac").exists());
}
#[test]
fn test_lines() {
let (at, mut ucmd) = at_and_ucmd!();
let file_read = |f| {
let mut s = String::new();
at.open(f).read_to_string(&mut s).unwrap();
s
};
// Split into two files without splitting up lines.
ucmd.args(&["-n", "l/2", "fivelines.txt"]).succeeds();
assert_eq!(file_read("xaa"), "1\n2\n3\n");
assert_eq!(file_read("xab"), "4\n5\n");
}
#[test]
fn test_lines_kth() {
new_ucmd!()
.args(&["-n", "l/3/10", "onehundredlines.txt"])
.succeeds()
.stdout_only("20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n");
}
#[test]
fn test_line_bytes() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-C", "8", "letters.txt"]).succeeds();
assert_eq!(at.read("xaa"), "aaaaaaaa");
assert_eq!(at.read("xab"), "a\nbbbb\n");
assert_eq!(at.read("xac"), "cccc\ndd\n");
assert_eq!(at.read("xad"), "ee\n");
}
#[test]
fn test_line_bytes_no_final_newline() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-C", "2"])
.pipe_in("1\n2222\n3\n4")
.succeeds()
.no_stdout()
.no_stderr();
assert_eq!(at.read("xaa"), "1\n");
assert_eq!(at.read("xab"), "22");
assert_eq!(at.read("xac"), "22");
assert_eq!(at.read("xad"), "\n");
assert_eq!(at.read("xae"), "3\n");
assert_eq!(at.read("xaf"), "4");
}
#[test]
fn test_line_bytes_no_empty_file() {
let (at, mut ucmd) = at_and_ucmd!();
ucmd.args(&["-C", "1"])
.pipe_in("1\n2222\n3\n4")
.succeeds()
.no_stdout()
.no_stderr();
assert_eq!(at.read("xaa"), "1");
assert_eq!(at.read("xab"), "\n");
assert_eq!(at.read("xac"), "2");
assert_eq!(at.read("xad"), "2");
assert_eq!(at.read("xae"), "2");
assert_eq!(at.read("xaf"), "2");
assert_eq!(at.read("xag"), "\n");
assert_eq!(at.read("xah"), "3");
assert_eq!(at.read("xai"), "\n");
assert_eq!(at.read("xaj"), "4");
assert!(!at.plus("xak").exists());
}
#[test]
fn test_guard_input() {
let ts = TestScenario::new(util_name!());
let at = &ts.fixtures;
ts.ucmd()
.args(&["-C", "6"])
.pipe_in("1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n")
.succeeds()
.no_stdout()
.no_stderr();
assert_eq!(at.read("xaa"), "1\n2\n3\n");
ts.ucmd()
.args(&["-C", "6"])
.pipe_in("1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n")
.succeeds()
.no_stdout()
.no_stderr();
assert_eq!(at.read("xaa"), "1\n2\n3\n");
ts.ucmd()
.args(&["-C", "6", "xaa"])
.fails()
.stderr_only("split: 'xaa' would overwrite input; aborting");
assert_eq!(at.read("xaa"), "1\n2\n3\n");
}
#[test]
fn test_multiple_of_input_chunk() {
let (at, mut ucmd) = at_and_ucmd!();
let name = "multiple_of_input_chunk";
RandomFile::new(&at, name).add_bytes(16 * 1024);
ucmd.args(&["-b", "8K", name, "b"]).succeeds();
let glob = Glob::new(&at, ".", r"b[[:alpha:]][[:alpha:]]$");
assert_eq!(glob.count(), 2);
for filename in glob.collect() {
assert_eq!(glob.directory.metadata(&filename).len(), 8 * 1024);
}
assert_eq!(glob.collate(), at.read_bytes(name));
}