diff --git a/Cargo.lock b/Cargo.lock
index 8d740bb9a..6ff3cd5c1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1372,9 +1372,6 @@ name = "serde"
 version = "1.0.125"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171"
-dependencies = [
- "serde_derive",
-]
 
 [[package]]
 name = "serde_cbor"
@@ -1453,9 +1450,6 @@ name = "smallvec"
 version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
-dependencies = [
- "serde",
-]
 
 [[package]]
 name = "strsim"
@@ -2396,8 +2390,6 @@ dependencies = [
  "rand 0.7.3",
  "rayon",
  "semver",
- "serde",
- "serde_json",
  "smallvec 1.6.1",
  "tempdir",
  "unicode-width",
diff --git a/src/uu/sort/BENCHMARKING.md b/src/uu/sort/BENCHMARKING.md
index 1caea0326..71c331105 100644
--- a/src/uu/sort/BENCHMARKING.md
+++ b/src/uu/sort/BENCHMARKING.md
@@ -69,6 +69,13 @@ Run `cargo build --release` before benchmarking after you make a change!
 - Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers_si.txt -h -o output.txt"`.
 
+## External sorting
+
+Try running commands with the `-S` option set to an amount of memory to be used, such as `1M`. Additionally, try sorting
+huge files (ideally multiple gigabytes) with `-S`. Such a file can be created by running `cat shuffled_wordlist.txt | sort -R >> shuffled_wordlist.txt`
+multiple times (each run appends a shuffled copy of `shuffled_wordlist.txt` to itself, doubling its size).
+Example: `hyperfine './target/release/coreutils sort shuffled_wordlist.txt -S 1M' 'sort shuffled_wordlist.txt -S 1M'`
+
 ## Stdout and stdin performance
 
 Try to run the above benchmarks by piping the input through stdin (standard input) and redirect the
diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml
index 80ffc92c9..3784ccbb0 100644
--- a/src/uu/sort/Cargo.toml
+++ b/src/uu/sort/Cargo.toml
@@ -15,15 +15,13 @@ edition = "2018"
 path = "src/sort.rs"
 
 [dependencies]
-serde_json = { version = "1.0.64", default-features = false, features = ["alloc"] }
-serde = { version = "1.0", features = ["derive"] }
 rayon = "1.5"
 rand = "0.7"
 clap = "2.33"
 fnv = "1.0.7"
 itertools = "0.10.0"
 semver = "0.9.0"
-smallvec = { version="1.6.1", features=["serde"] }
+smallvec = "1.6.1"
 unicode-width = "0.1.8"
 uucore = { version=">=0.0.8", package="uucore", path="../../uucore", features=["fs"] }
 uucore_procs = { version=">=0.0.5", package="uucore_procs", path="../../uucore_procs" }
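The dependency cleanup above works because the external sorter no longer serializes `Line`s to JSON: chunks are now written to the temporary directory as plain text and re-parsed on read-back. For orientation, here is a minimal, self-contained sketch of the external-sort pattern the new code follows (sort a bounded chunk in memory, spill it as a sorted run file, then k-way merge the runs). The helper names `sort_chunks` and `merge_chunks` are illustrative only and do not appear in this patch:

```rust
// Standalone sketch of external sorting: bounded in-memory runs are
// spilled to disk, then merged by repeatedly taking the smallest head.
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};

// Spill `lines` to disk in sorted runs of at most `chunk_size` lines;
// returns the paths of the run files.
fn sort_chunks(
    lines: impl Iterator<Item = String>,
    chunk_size: usize,
    dir: &Path,
) -> std::io::Result<Vec<PathBuf>> {
    let mut paths = Vec::new();
    let mut chunk: Vec<String> = Vec::new();
    let flush = |chunk: &mut Vec<String>, paths: &mut Vec<PathBuf>| -> std::io::Result<()> {
        chunk.sort();
        let path = dir.join(paths.len().to_string());
        let mut out = BufWriter::new(File::create(&path)?);
        for line in chunk.drain(..) {
            writeln!(out, "{}", line)?;
        }
        out.flush()?;
        paths.push(path);
        Ok(())
    };
    for line in lines {
        chunk.push(line);
        if chunk.len() >= chunk_size {
            flush(&mut chunk, &mut paths)?;
        }
    }
    if !chunk.is_empty() {
        flush(&mut chunk, &mut paths)?; // write the last, partial run
    }
    Ok(paths)
}

// K-way merge: keep one "head" line per run, always emit the smallest.
fn merge_chunks(paths: &[PathBuf]) -> std::io::Result<Vec<String>> {
    let mut readers = Vec::new();
    for p in paths {
        readers.push(BufReader::new(File::open(p)?).lines());
    }
    let mut heads: Vec<Option<String>> = Vec::new();
    for r in readers.iter_mut() {
        heads.push(r.next().transpose()?);
    }
    let mut merged = Vec::new();
    loop {
        // Index of the run whose head line is smallest, if any remain.
        let idx = match heads
            .iter()
            .enumerate()
            .filter_map(|(i, h)| h.as_ref().map(|s| (i, s)))
            .min_by(|a, b| a.1.cmp(b.1))
        {
            Some((i, _)) => i,
            None => return Ok(merged), // all runs exhausted
        };
        merged.push(heads[idx].take().unwrap());
        heads[idx] = readers[idx].next().transpose()?;
    }
}

fn main() -> std::io::Result<()> {
    let dir = std::env::temp_dir().join("extsort_sketch");
    std::fs::create_dir_all(&dir)?;
    let input = ["banana", "apple", "cherry", "beet", "almond"]
        .iter()
        .map(|s| s.to_string());
    let runs = sort_chunks(input, 2, &dir)?; // tiny chunk size forces several runs
    let sorted = merge_chunks(&runs)?;
    assert_eq!(sorted, ["almond", "apple", "banana", "beet", "cherry"]);
    println!("{:?}", sorted);
    Ok(())
}
```

The real implementation below keeps a bounded `VecDeque` buffer per run and refills it on demand rather than holding one head line per run, but the control flow is the same.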
diff --git a/src/uu/sort/src/external_sort/mod.rs b/src/uu/sort/src/external_sort/mod.rs
index fd942d4a7..725b17bbd 100644
--- a/src/uu/sort/src/external_sort/mod.rs
+++ b/src/uu/sort/src/external_sort/mod.rs
@@ -1,50 +1,32 @@
-use std::clone::Clone;
-use std::cmp::Ordering::Less;
+use std::cmp::Ordering;
 use std::collections::VecDeque;
-use std::error::Error;
 use std::fs::{File, OpenOptions};
-use std::io::SeekFrom::Start;
+use std::io::SeekFrom;
 use std::io::{BufRead, BufReader, BufWriter, Seek, Write};
-use std::marker::PhantomData;
-use std::path::PathBuf;
+use std::path::Path;
 
-use serde::de::DeserializeOwned;
-use serde::Serialize;
-use serde_json;
 use tempdir::TempDir;
 
 use super::{GlobalSettings, Line};
 
-/// Trait for types that can be used by
-/// [ExternalSorter](struct.ExternalSorter.html). Must be sortable, cloneable,
-/// serializeable, and able to report on it's size
-pub trait ExternallySortable: Clone + Serialize + DeserializeOwned {
-    /// Get the size, in bytes, of this object (used to constrain the buffer
-    /// used in the external sort).
-    fn get_size(&self) -> u64;
-}
-
 /// Iterator that provides sorted `T`s
-pub struct ExtSortedIterator<Line> {
+pub struct ExtSortedIterator {
     buffers: Vec<VecDeque<Line>>,
     chunk_offsets: Vec<u64>,
-    max_per_chunk: u64,
-    chunks: u64,
+    max_per_chunk: usize,
+    chunks: usize,
     tmp_dir: TempDir,
     settings: GlobalSettings,
     failed: bool,
 }
 
-impl<Line> Iterator for ExtSortedIterator<Line>
-where
-    Line: ExternallySortable,
-{
-    type Item = Result<Line, Box<dyn Error>>;
+impl Iterator for ExtSortedIterator {
+    type Item = Line;
 
     /// # Errors
     ///
     /// This method can fail due to issues reading intermediate sorted chunks
-    /// from disk, or due to serde deserialization issues
+    /// from disk
    fn next(&mut self) -> Option<Self::Item> {
         if self.failed {
             return None;
         }
@@ -53,29 +35,18 @@ where
         let mut empty = true;
         for chunk_num in 0..self.chunks {
             if self.buffers[chunk_num as usize].is_empty() {
-                let mut f = match File::open(self.tmp_dir.path().join(chunk_num.to_string())) {
-                    Ok(f) => f,
-                    Err(e) => {
-                        self.failed = true;
-                        return Some(Err(Box::new(e)));
-                    }
-                };
-                match f.seek(Start(self.chunk_offsets[chunk_num as usize])) {
-                    Ok(_) => (),
-                    Err(e) => {
-                        self.failed = true;
-                        return Some(Err(Box::new(e)));
-                    }
-                }
-                let bytes_read =
-                    match fill_buff(&mut self.buffers[chunk_num as usize], f, self.max_per_chunk) {
-                        Ok(bytes_read) => bytes_read,
-                        Err(e) => {
-                            self.failed = true;
-                            return Some(Err(e));
-                        }
-                    };
-                self.chunk_offsets[chunk_num as usize] += bytes_read;
+                let mut f = crash_if_err!(
+                    1,
+                    File::open(self.tmp_dir.path().join(chunk_num.to_string()))
+                );
+                crash_if_err!(1, f.seek(SeekFrom::Start(self.chunk_offsets[chunk_num])));
+                let bytes_read = fill_buff(
+                    &mut self.buffers[chunk_num as usize],
+                    f,
+                    self.max_per_chunk,
+                    &self.settings,
+                );
+                self.chunk_offsets[chunk_num as usize] += bytes_read as u64;
                 if !self.buffers[chunk_num as usize].is_empty() {
                     empty = false;
                 }
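The refill logic above replaces the `match`/`return Some(Err(...))` plumbing with uucore's `crash_if_err!` macro, which is why the iterator's `Item` type could shrink from `Result<Line, Box<dyn Error>>` to plain `Line`: I/O failures now abort the process instead of propagating. As a rough sketch (not the actual uucore definition, which may differ in detail), the macro behaves like this:

```rust
// Sketch of a crash_if_err!-style macro: unwrap a Result, or print the
// error and exit with the given code. The real macro lives in uucore.
macro_rules! crash_if_err {
    ($exit_code:expr, $result:expr) => {
        match $result {
            Ok(value) => value,
            Err(err) => {
                eprintln!("sort: {}", err);
                std::process::exit($exit_code);
            }
        }
    };
}

fn main() {
    // Usage: unwrap a Result or terminate with exit code 1.
    let contents = crash_if_err!(1, std::fs::read_to_string("input.txt"));
    println!("{} bytes", contents.len());
}
```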
@@ -91,205 +62,150 @@ where
         // check is_empty() before unwrap()ing
         let mut idx = 0;
         for chunk_num in 0..self.chunks as usize {
-            if !self.buffers[chunk_num].is_empty() {
-                if self.buffers[idx].is_empty()
-                    || (super::compare_by)(
+            if !self.buffers[chunk_num].is_empty()
+                && (self.buffers[idx].is_empty()
+                    || super::compare_by(
                         self.buffers[chunk_num].front().unwrap(),
                         self.buffers[idx].front().unwrap(),
                         &self.settings,
-                    ) == Less
-                {
-                    idx = chunk_num;
-                }
+                    ) == Ordering::Less)
+            {
+                idx = chunk_num;
             }
         }
 
         // unwrap due to checks above
         let r = self.buffers[idx].pop_front().unwrap();
-        Some(Ok(r))
+        Some(r)
     }
 }
 
-/// Perform an external sort on an unsorted stream of incoming data
-pub struct ExternalSorter<Line>
-where
-    Line: ExternallySortable,
-{
-    tmp_dir: Option<PathBuf>,
-    buffer_bytes: u64,
-    phantom: PhantomData<Line>,
-    settings: GlobalSettings,
-}
+/// Sort the `Line`s provided by `unsorted` (based on `settings`) and return an
+/// iterator.
+///
+/// # Errors
+///
+/// This method can fail due to issues writing intermediate sorted chunks
+/// to disk.
+pub fn ext_sort(
+    unsorted: impl Iterator<Item = Line>,
+    settings: &GlobalSettings,
+) -> ExtSortedIterator {
+    let tmp_dir = crash_if_err!(1, TempDir::new_in(&settings.tmp_dir, "uutils_sort"));
 
-impl<Line> ExternalSorter<Line>
-where
-    Line: ExternallySortable,
-{
-    /// Create a new `ExternalSorter` with a specified memory buffer and
-    /// temporary directory
-    pub fn new(
-        buffer_bytes: u64,
-        tmp_dir: Option<PathBuf>,
-        settings: GlobalSettings,
-    ) -> ExternalSorter<Line> {
-        ExternalSorter {
-            buffer_bytes,
-            tmp_dir,
-            phantom: PhantomData,
+    let mut iter = ExtSortedIterator {
+        buffers: Vec::new(),
+        chunk_offsets: Vec::new(),
+        max_per_chunk: 0,
+        chunks: 0,
+        tmp_dir,
+        settings: settings.clone(),
+        failed: false,
+    };
+
+    let mut total_read = 0;
+    let mut chunk = Vec::new();
+
+    // make the initial chunks on disk
+    for seq in unsorted {
+        let seq_size = seq.estimate_size();
+        total_read += seq_size;
+
+        chunk.push(seq);
+
+        if total_read >= settings.buffer_size {
+            super::sort_by(&mut chunk, &settings);
+            write_chunk(
+                settings,
+                &iter.tmp_dir.path().join(iter.chunks.to_string()),
+                &mut chunk,
+            );
+            chunk.clear();
+            total_read = 0;
+            iter.chunks += 1;
+        }
+    }
+    // write the last chunk
+    if !chunk.is_empty() {
+        super::sort_by(&mut chunk, &settings);
+        write_chunk(
             settings,
-        }
+            &iter.tmp_dir.path().join(iter.chunks.to_string()),
+            &mut chunk,
+        );
+        iter.chunks += 1;
     }
 
-    /// Sort (based on `compare`) the `T`s provided by `unsorted` and return an
-    /// iterator
-    ///
-    /// # Errors
-    ///
-    /// This method can fail due to issues writing intermediate sorted chunks
-    /// to disk, or due to serde serialization issues
-    pub fn sort_by<I>(
-        &self,
-        unsorted: I,
-        settings: GlobalSettings,
-    ) -> Result<ExtSortedIterator<Line>, Box<dyn Error>>
-    where
-        I: Iterator<Item = Line>,
-    {
-        let tmp_dir = match self.tmp_dir {
-            Some(ref p) => TempDir::new_in(p, "uutils_sort")?,
-            None => TempDir::new("uutils_sort")?,
-        };
-        // creating the thing we need to return first due to the face that we need to
-        // borrow tmp_dir and move it out
-        let mut iter = ExtSortedIterator {
-            buffers: Vec::new(),
-            chunk_offsets: Vec::new(),
-            max_per_chunk: 0,
-            chunks: 0,
-            tmp_dir,
-            settings,
-            failed: false,
-        };
+    // initialize buffers for each chunk
+    //
+    // Having a right sized buffer for each chunk for smallish values seems silly to me?
+    //
+    // We will have to have the entire iter in memory sometime right?
+    // Set minimum to the size of the writer buffer, ~8K
 
-        {
-            let mut total_read = 0;
-            let mut chunk = Vec::new();
-            // Initial buffer is specified by user
-            let mut adjusted_buffer_size = self.buffer_bytes;
-            let (iter_size, _) = unsorted.size_hint();
-
-            // make the initial chunks on disk
-            for seq in unsorted {
-                let seq_size = seq.get_size();
-                total_read += seq_size;
-
-                // GNU minimum is 16 * (sizeof struct + 2), but GNU uses about
-                // 1/10 the memory that we do. And GNU even says in the code it may
-                // not work on small buffer sizes.
-                //
-                // The following seems to work pretty well, and has about the same max
-                // RSS as lower minimum values.
-                //
-                let minimum_buffer_size: u64 = iter_size as u64 * seq_size / 8;
-
-                adjusted_buffer_size =
-                    // Grow buffer size for a struct/Line larger than buffer
-                    if adjusted_buffer_size < seq_size {
-                        seq_size
-                    } else if adjusted_buffer_size < minimum_buffer_size {
-                        minimum_buffer_size
-                    } else {
-                        adjusted_buffer_size
-                    };
-                chunk.push(seq);
-
-                if total_read >= adjusted_buffer_size {
-                    super::sort_by(&mut chunk, &self.settings);
-                    self.write_chunk(
-                        &iter.tmp_dir.path().join(iter.chunks.to_string()),
-                        &mut chunk,
-                    )?;
-                    chunk.clear();
-                    total_read = 0;
-                    iter.chunks += 1;
-                }
-            }
-            // write the last chunk
-            if chunk.len() > 0 {
-                super::sort_by(&mut chunk, &self.settings);
-                self.write_chunk(
-                    &iter.tmp_dir.path().join(iter.chunks.to_string()),
-                    &mut chunk,
-                )?;
-                iter.chunks += 1;
-            }
-
-            // initialize buffers for each chunk
-            //
-            // Having a right sized buffer for each chunk for smallish values seems silly to me?
-            //
-            // We will have to have the entire iter in memory sometime right?
-            // Set minimum to the size of the writer buffer, ~8K
-            //
-            const MINIMUM_READBACK_BUFFER: u64 = 8200;
-            let right_sized_buffer = adjusted_buffer_size
-                .checked_div(iter.chunks)
-                .unwrap_or(adjusted_buffer_size);
-            iter.max_per_chunk = if right_sized_buffer > MINIMUM_READBACK_BUFFER {
-                right_sized_buffer
-            } else {
-                MINIMUM_READBACK_BUFFER
-            };
-            iter.buffers = vec![VecDeque::new(); iter.chunks as usize];
-            iter.chunk_offsets = vec![0 as u64; iter.chunks as usize];
-            for chunk_num in 0..iter.chunks {
-                let offset = fill_buff(
-                    &mut iter.buffers[chunk_num as usize],
-                    File::open(iter.tmp_dir.path().join(chunk_num.to_string()))?,
-                    iter.max_per_chunk,
-                )?;
-                iter.chunk_offsets[chunk_num as usize] = offset;
-            }
-        }
-
-        Ok(iter)
+    const MINIMUM_READBACK_BUFFER: usize = 8200;
+    let right_sized_buffer = settings
+        .buffer_size
+        .checked_div(iter.chunks)
+        .unwrap_or(settings.buffer_size);
+    iter.max_per_chunk = if right_sized_buffer > MINIMUM_READBACK_BUFFER {
+        right_sized_buffer
+    } else {
+        MINIMUM_READBACK_BUFFER
+    };
+    iter.buffers = vec![VecDeque::new(); iter.chunks];
+    iter.chunk_offsets = vec![0; iter.chunks];
+    for chunk_num in 0..iter.chunks {
+        let offset = fill_buff(
+            &mut iter.buffers[chunk_num],
+            crash_if_err!(
+                1,
+                File::open(iter.tmp_dir.path().join(chunk_num.to_string()))
+            ),
+            iter.max_per_chunk,
+            &settings,
+        );
+        iter.chunk_offsets[chunk_num] = offset as u64;
     }
 
-    fn write_chunk(&self, file: &PathBuf, chunk: &mut Vec<Line>) -> Result<(), Box<dyn Error>> {
-        let new_file = OpenOptions::new().create(true).append(true).open(file)?;
-        let mut buf_write = Box::new(BufWriter::new(new_file)) as Box<dyn Write>;
-        for s in chunk {
-            let mut serialized = serde_json::to_string(&s).expect("JSON write error: ");
-            serialized.push_str("\n");
-            buf_write.write(serialized.as_bytes())?;
-        }
-        buf_write.flush()?;
-
-        Ok(())
-    }
+    iter
 }
 
-fn fill_buff<Line>(
+fn write_chunk(settings: &GlobalSettings, file: &Path, chunk: &mut Vec<Line>) {
+    let new_file = crash_if_err!(1, OpenOptions::new().create(true).append(true).open(file));
+    let mut buf_write = BufWriter::new(new_file);
+    for s in chunk {
+        crash_if_err!(1, buf_write.write_all(s.line.as_bytes()));
+        crash_if_err!(
+            1,
+            buf_write.write_all(if settings.zero_terminated { "\0" } else { "\n" }.as_bytes(),)
+        );
+    }
+    crash_if_err!(1, buf_write.flush());
+}
+
+fn fill_buff(
     vec: &mut VecDeque<Line>,
     file: File,
-    max_bytes: u64,
-) -> Result<u64, Box<dyn Error>>
-where
-    Line: ExternallySortable,
-{
+    max_bytes: usize,
+    settings: &GlobalSettings,
+) -> usize {
     let mut total_read = 0;
     let mut bytes_read = 0;
-    for line in BufReader::new(file).lines() {
-        let line_s = line?;
+    for line in BufReader::new(file).split(if settings.zero_terminated {
+        b'\0'
+    } else {
+        b'\n'
+    }) {
+        let line_s = String::from_utf8(crash_if_err!(1, line)).unwrap();
         bytes_read += line_s.len() + 1;
-        // This is where the bad stuff happens usually
-        let deserialized: Line = serde_json::from_str(&line_s).expect("JSON read error: ");
-        total_read += deserialized.get_size();
+        let deserialized = Line::new(line_s, settings);
+        total_read += deserialized.estimate_size();
         vec.push_back(deserialized);
         if total_read > max_bytes {
             break;
         }
     }
-    Ok(bytes_read as u64)
+    bytes_read
 }
diff --git a/src/uu/sort/src/numeric_str_cmp.rs b/src/uu/sort/src/numeric_str_cmp.rs
index b74d97867..f8666b701 100644
--- a/src/uu/sort/src/numeric_str_cmp.rs
+++ b/src/uu/sort/src/numeric_str_cmp.rs
@@ -14,21 +14,20 @@
 //! More specifically, exponent can be understood so that the original number is in (1..10)*10^exponent.
 //! From that follows the constraints of this algorithm: It is able to compare numbers in ±(1*10^[i64::MIN]..10*10^[i64::MAX]).
-use serde::{Deserialize, Serialize};
 use std::{cmp::Ordering, ops::Range};
 
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Clone)]
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
 enum Sign {
     Negative,
     Positive,
 }
 
-#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
+#[derive(Debug, PartialEq, Clone)]
 pub struct NumInfo {
     exponent: i64,
     sign: Sign,
 }
 
-#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
+#[derive(Debug, PartialEq, Clone)]
 pub struct NumInfoParseSettings {
     pub accept_si_units: bool,
     pub thousands_separator: Option<char>,
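With serde gone, these types only need `Clone`. The sort.rs diff below also replaces the old `get_size` (which returned `size_of::<Line>()`, a fixed value of about 96 bytes, regardless of line length) with `estimate_size`, which counts heap allocations so the `-S` buffer budget tracks real memory use. A standalone sketch of that accounting idea, with `OwnedLine` and `Sel` as stand-ins for the real `Line` and `Selection` types:

```rust
// Memory accounting for a heap-owning struct: count the string's heap
// capacity, the vec's heap storage, and the struct itself. A fixed
// size_of::<Self>() alone would report the same value for every line.
struct Sel {
    range: std::ops::Range<usize>,
}

struct OwnedLine {
    line: String,
    selections: Vec<Sel>,
}

impl OwnedLine {
    fn estimate_size(&self) -> usize {
        self.line.capacity()                                          // heap bytes of the string
            + self.selections.capacity() * std::mem::size_of::<Sel>() // heap bytes of the vec
            + std::mem::size_of::<Self>()                             // the struct itself
    }
}

fn main() {
    let l = OwnedLine {
        line: String::from("hello world"),
        selections: vec![Sel { range: 0..5 }],
    };
    println!("estimated: {} bytes", l.estimate_size());
}
```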
diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs
index c82524796..d8978cb2b 100644
--- a/src/uu/sort/src/sort.rs
+++ b/src/uu/sort/src/sort.rs
@@ -20,8 +20,8 @@ mod external_sort;
 mod numeric_str_cmp;
 
 use clap::{App, Arg};
+use external_sort::ext_sort;
 use custom_str_cmp::custom_str_cmp;
-use external_sort::{ExternalSorter, ExternallySortable};
 use fnv::FnvHasher;
 use itertools::Itertools;
 use numeric_str_cmp::{numeric_str_cmp, NumInfo, NumInfoParseSettings};
@@ -29,14 +29,13 @@ use rand::distributions::Alphanumeric;
 use rand::{thread_rng, Rng};
 use rayon::prelude::*;
 use semver::Version;
-use serde::{Deserialize, Serialize};
 use smallvec::SmallVec;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::env;
 use std::fs::File;
 use std::hash::{Hash, Hasher};
-use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
+use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write};
 use std::mem::replace;
 use std::ops::Range;
 use std::path::Path;
@@ -106,7 +105,7 @@ enum SortMode {
     Default,
 }
 #[derive(Clone)]
-struct GlobalSettings {
+pub struct GlobalSettings {
     mode: SortMode,
     debug: bool,
     ignore_blanks: bool,
@@ -206,7 +205,7 @@ impl From<&GlobalSettings> for KeySettings {
     }
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Clone)]
 /// Represents the string selected by a FieldSelector.
 struct SelectionRange {
     range: Range<usize>,
@@ -228,7 +227,7 @@ impl SelectionRange {
     }
 }
 
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Clone)]
 enum NumCache {
     AsF64(GeneralF64ParseResult),
     WithInfo(NumInfo),
@@ -249,7 +248,8 @@ impl NumCache {
         }
     }
 }
-#[derive(Serialize, Deserialize, Clone)]
+
+#[derive(Clone)]
 struct Selection {
     range: SelectionRange,
     num_cache: NumCache,
@@ -264,22 +264,21 @@ impl Selection {
 
 type Field = Range<usize>;
 
-#[derive(Serialize, Deserialize, Clone)]
-struct Line {
+#[derive(Clone)]
+pub struct Line {
     line: String,
     // The common case is not to specify fields. Let's make this fast.
     selections: SmallVec<[Selection; 1]>,
 }
 
-impl ExternallySortable for Line {
-    fn get_size(&self) -> u64 {
-        // Currently 96 bytes, but that could change, so we get that size here
-        std::mem::size_of::<Line>() as u64
-    }
-}
-
 impl Line {
-    fn new(line: String, settings: &GlobalSettings) -> Self {
+    pub fn estimate_size(&self) -> usize {
+        self.line.capacity()
+            + self.selections.capacity() * std::mem::size_of::<Selection>()
+            + std::mem::size_of::<Self>()
+    }
+
+    pub fn new(line: String, settings: &GlobalSettings) -> Self {
         let fields = if settings
             .selectors
             .iter()
@@ -291,7 +290,7 @@ impl Line {
             None
         };
 
-        let selections = settings
+        let selections: SmallVec<[Selection; 1]> = settings
             .selectors
             .iter()
             .map(|selector| {
@@ -683,7 +682,7 @@ impl FieldSelector {
 }
 
 struct MergeableFile<'a> {
-    lines: Lines<BufReader<Box<dyn Read>>>,
+    lines: Box<dyn Iterator<Item = Line> + 'a>,
     current_line: Line,
     settings: &'a GlobalSettings,
 }
@@ -723,11 +722,11 @@ impl<'a> FileMerger<'a> {
             settings,
         }
     }
-    fn push_file(&mut self, mut lines: Lines<BufReader<Box<dyn Read>>>) {
-        if let Some(Ok(next_line)) = lines.next() {
+    fn push_file(&mut self, mut lines: Box<dyn Iterator<Item = Line> + 'a>) {
+        if let Some(next_line) = lines.next() {
             let mergeable_file = MergeableFile {
                 lines,
-                current_line: Line::new(next_line, &self.settings),
+                current_line: next_line,
                 settings: &self.settings,
             };
             self.heap.push(mergeable_file);
@@ -741,11 +740,8 @@ impl<'a> Iterator for FileMerger<'a> {
         match self.heap.pop() {
             Some(mut current) => {
                 match current.lines.next() {
-                    Some(Ok(next_line)) => {
-                        let ret = replace(
-                            &mut current.current_line,
-                            Line::new(next_line, &self.settings),
-                        );
+                    Some(next_line) => {
+                        let ret = replace(&mut current.current_line, next_line);
                         self.heap.push(current);
                         Some(ret)
                     }
@@ -1113,90 +1109,106 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
     exec(files, settings)
 }
 
+fn file_to_lines_iter<'a>(
+    file: &str,
+    settings: &'a GlobalSettings,
+) -> Option<impl Iterator<Item = Line> + 'a> {
+    let (reader, _) = match open(file) {
+        Some(x) => x,
+        None => return None,
+    };
+
+    let buf_reader = BufReader::new(reader);
+
+    Some(
+        buf_reader
+            .split(if settings.zero_terminated {
+                b'\0'
+            } else {
+                b'\n'
+            })
+            .map(move |line| {
+                Line::new(
+                    crash_if_err!(1, String::from_utf8(crash_if_err!(1, line))),
+                    settings,
+                )
+            }),
+    )
+}
+
+fn output_sorted_lines(iter: impl Iterator<Item = Line>, settings: &GlobalSettings) {
+    if settings.unique {
+        print_sorted(
+            iter.dedup_by(|a, b| compare_by(a, b, &settings) == Ordering::Equal),
+            &settings,
+        );
+    } else {
+        print_sorted(iter, &settings);
+    }
+}
+
 fn exec(files: Vec<String>, settings: GlobalSettings) -> i32 {
-    let mut lines = Vec::new();
-    let mut file_merger = FileMerger::new(&settings);
+    if settings.merge {
+        let mut file_merger = FileMerger::new(&settings);
+        for lines in files
+            .iter()
+            .filter_map(|file| file_to_lines_iter(file, &settings))
+        {
+            file_merger.push_file(Box::new(lines));
+        }
+        output_sorted_lines(file_merger, &settings);
+    } else {
+        let lines = files
+            .iter()
+            .filter_map(|file| file_to_lines_iter(file, &settings))
+            .flatten();
 
-    for path in &files {
-        let (reader, _) = match open(path) {
-            Some(x) => x,
-            None => continue,
-        };
+        if settings.check {
+            return exec_check_file(lines, &settings);
+        }
 
-        let buf_reader = BufReader::new(reader);
-
-        if settings.merge {
-            file_merger.push_file(buf_reader.lines());
-        } else if settings.zero_terminated {
-            for line in buf_reader.split(b'\0').flatten() {
-                lines.push(Line::new(
-                    std::str::from_utf8(&line)
-                        .expect("Could not parse string from zero terminated input.")
-                        .to_string(),
-                    &settings,
-                ));
-            }
+        // Only use ext_sorter when we need to.
+        // It is probably faster not to create
+        // an owned value on each run.
+        if settings.ext_sort {
+            let sorted_lines = ext_sort(lines, &settings);
+            output_sorted_lines(sorted_lines, &settings);
         } else {
-            for line in buf_reader.lines() {
-                if let Ok(n) = line {
-                    lines.push(Line::new(n, &settings));
+            let mut lines = vec![];
+
+            // This is duplicated from fn file_to_lines_iter, but using that function directly results in a performance regression.
+            for (file, _) in files.iter().map(|file| open(file)).flatten() {
+                let buf_reader = BufReader::new(file);
+                for line in buf_reader.split(if settings.zero_terminated {
+                    b'\0'
                 } else {
-                    break;
+                    b'\n'
+                }) {
+                    let string = crash_if_err!(1, String::from_utf8(crash_if_err!(1, line)));
+                    lines.push(Line::new(string, &settings));
                 }
             }
+
+            sort_by(&mut lines, &settings);
+            output_sorted_lines(lines.into_iter(), &settings);
         }
     }
 
-    if settings.check {
-        return exec_check_file(&lines, &settings);
-    }
-
-    // Only use ext_sorter when we need to.
-    // Probably faster that we don't create
-    // an owned value each run
-    if settings.ext_sort {
-        lines = ext_sort_by(lines, settings.clone());
-    } else {
-        sort_by(&mut lines, &settings);
-    }
-
-    if settings.merge {
-        if settings.unique {
-            print_sorted(
-                file_merger.dedup_by(|a, b| compare_by(a, b, &settings) == Ordering::Equal),
-                &settings,
-            )
-        } else {
-            print_sorted(file_merger, &settings)
-        }
-    } else if settings.unique {
-        print_sorted(
-            lines
-                .into_iter()
-                .dedup_by(|a, b| compare_by(a, b, &settings) == Ordering::Equal),
-            &settings,
-        )
-    } else {
-        print_sorted(lines.into_iter(), &settings)
-    }
-
     0
 }
 
-fn exec_check_file(unwrapped_lines: &[Line], settings: &GlobalSettings) -> i32 {
+fn exec_check_file(unwrapped_lines: impl Iterator<Item = Line>, settings: &GlobalSettings) -> i32 {
     // errors yields the line before each disorder,
     // plus the last line (quirk of .coalesce())
-    let mut errors =
-        unwrapped_lines
-            .iter()
-            .enumerate()
-            .coalesce(|(last_i, last_line), (i, line)| {
-                if compare_by(&last_line, &line, &settings) == Ordering::Greater {
-                    Err(((last_i, last_line), (i, line)))
-                } else {
-                    Ok((i, line))
-                }
-            });
+    let mut errors = unwrapped_lines
+        .enumerate()
+        .coalesce(|(last_i, last_line), (i, line)| {
+            if compare_by(&last_line, &line, &settings) == Ordering::Greater {
+                Err(((last_i, last_line), (i, line)))
+            } else {
+                Ok((i, line))
+            }
+        });
     if let Some((first_error_index, _line)) = errors.next() {
         // Check for a second "error", as .coalesce() always returns the last
         // line, no matter what our merging function does.
@@ -1215,20 +1227,6 @@ fn exec_check_file(unwrapped_lines: &[Line], settings: &GlobalSettings) -> i32 {
     }
 }
 
-fn ext_sort_by(unsorted: Vec<Line>, settings: GlobalSettings) -> Vec<Line> {
-    let external_sorter = ExternalSorter::new(
-        settings.buffer_size as u64,
-        Some(settings.tmp_dir.clone()),
-        settings.clone(),
-    );
-    let iter = external_sorter
-        .sort_by(unsorted.into_iter(), settings)
-        .unwrap()
-        .map(|x| x.unwrap())
-        .collect::<Vec<Line>>();
-    iter
-}
-
 fn sort_by(unsorted: &mut Vec<Line>, settings: &GlobalSettings) {
     if settings.stable || settings.unique {
         unsorted.par_sort_by(|a, b| compare_by(a, b, &settings))
@@ -1332,7 +1330,7 @@ fn get_leading_gen(input: &str) -> Range<usize> {
     leading_whitespace_len..input.len()
 }
 
-#[derive(Serialize, Deserialize, Copy, Clone, PartialEq, PartialOrd)]
+#[derive(Copy, Clone, PartialEq, PartialOrd)]
 enum GeneralF64ParseResult {
     Invalid,
     NaN,
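Every output path now funnels through `output_sorted_lines`, which implements `-u` by collapsing adjacent equal lines with itertools' `dedup_by`. That is correct only because the incoming iterator is already sorted, and it is cheap because the stream is never materialized. A minimal standalone illustration (using the same `itertools` crate this package already depends on):

```rust
// Requires the itertools crate for the dedup_by iterator adaptor.
use itertools::Itertools;

fn main() {
    let sorted = vec!["a", "a", "b", "b", "b", "c"].into_iter();
    // dedup_by collapses runs of items the closure reports as equal;
    // on sorted input this yields each distinct line exactly once.
    let unique: Vec<_> = sorted.dedup_by(|a, b| a == b).collect();
    assert_eq!(unique, ["a", "b", "c"]);
    println!("{:?}", unique);
}
```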
diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs
index eac9490a5..4465e861f 100644
--- a/tests/by-util/test_sort.rs
+++ b/tests/by-util/test_sort.rs
@@ -37,7 +37,29 @@ fn test_larger_than_specified_segment() {
         .arg("50K")
         .arg("ext_sort.txt")
         .succeeds()
-        .stdout_is_fixture(format!("{}", "ext_sort.expected"));
+        .stdout_is_fixture("ext_sort.expected");
+}
+
+#[test]
+fn test_smaller_than_specified_segment() {
+    new_ucmd!()
+        .arg("-n")
+        .arg("-S")
+        .arg("100M")
+        .arg("ext_sort.txt")
+        .succeeds()
+        .stdout_is_fixture("ext_sort.expected");
+}
+
+#[test]
+fn test_extsort_zero_terminated() {
+    new_ucmd!()
+        .arg("-z")
+        .arg("-S")
+        .arg("10K")
+        .arg("zero-terminated.txt")
+        .succeeds()
+        .stdout_is_fixture("zero-terminated.expected");
 }
 
 #[test]
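The new tests cover the chunked path (`-S` smaller than the input), the single-chunk path (`-S` larger than the input), and zero-terminated records. A natural follow-up, hypothetical and not part of this patch, would combine a small `-S` budget with `-u` to exercise the dedup path through `output_sorted_lines`, reusing the existing `ext_sort.txt` fixture and the same test helpers shown above:

```rust
// Hypothetical additional test (not in this patch): external sorting
// combined with -u, reusing the ext_sort.txt fixture from above.
#[test]
fn test_extsort_unique() {
    new_ucmd!()
        .arg("-u")
        .arg("-S")
        .arg("10K")
        .arg("ext_sort.txt")
        .succeeds();
}
```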