sort: Add a GNU-style Random Sorter (#1922)

This commit is contained in:
electricboogie 2021-03-29 06:05:52 -05:00 committed by GitHub
parent 8cc7a90d7c
commit da5f2f3a6c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 261 additions and 44 deletions

19
Cargo.lock generated
View file

@ -1317,6 +1317,12 @@ dependencies = [
"maybe-uninit", "maybe-uninit",
] ]
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]] [[package]]
name = "strsim" name = "strsim"
version = "0.8.0" version = "0.8.0"
@ -1443,6 +1449,17 @@ dependencies = [
"serde_json", "serde_json",
] ]
[[package]]
name = "twox-hash"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04f8ab788026715fa63b31960869617cba39117e520eb415b0139543e325ab59"
dependencies = [
"cfg-if 0.1.10",
"rand 0.7.3",
"static_assertions",
]
[[package]] [[package]]
name = "typenum" name = "typenum"
version = "1.13.0" version = "1.13.0"
@ -2200,7 +2217,9 @@ version = "0.0.4"
dependencies = [ dependencies = [
"clap", "clap",
"itertools 0.8.2", "itertools 0.8.2",
"rand 0.7.3",
"semver", "semver",
"twox-hash",
"uucore", "uucore",
"uucore_procs", "uucore_procs",
] ]

View file

@ -15,7 +15,9 @@ edition = "2018"
path = "src/sort.rs" path = "src/sort.rs"
[dependencies] [dependencies]
rand = "0.7"
clap = "2.33" clap = "2.33"
twox-hash = "1.6.0"
itertools = "0.8.0" itertools = "0.8.0"
semver = "0.9.0" semver = "0.9.0"
uucore = { version=">=0.0.7", package="uucore", path="../../uucore", features=["fs"] } uucore = { version=">=0.0.7", package="uucore", path="../../uucore", features=["fs"] }

View file

@ -1,6 +1,7 @@
// * This file is part of the uutils coreutils package. // * This file is part of the uutils coreutils package.
// * // *
// * (c) Michael Yin <mikeyin@mikeyin.org> // * (c) Michael Yin <mikeyin@mikeyin.org>
// * (c) Robert Swinford <robert.swinford..AT..gmail.com>
// * // *
// * For the full copyright and license information, please view the LICENSE // * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code. // * file that was distributed with this source code.
@ -12,13 +13,17 @@ extern crate uucore;
use clap::{App, Arg}; use clap::{App, Arg};
use itertools::Itertools; use itertools::Itertools;
use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng};
use semver::Version; use semver::Version;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use std::fs::File; use std::fs::File;
use std::hash::{Hash, Hasher};
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write}; use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
use std::mem::replace; use std::mem::replace;
use std::path::Path; use std::path::Path;
use twox_hash::XxHash64;
use uucore::fs::is_stdin_interactive; // for Iterator::dedup() use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
static NAME: &str = "sort"; static NAME: &str = "sort";
@ -34,16 +39,18 @@ static OPT_DICTIONARY_ORDER: &str = "dictionary-order";
static OPT_MERGE: &str = "merge"; static OPT_MERGE: &str = "merge";
static OPT_CHECK: &str = "check"; static OPT_CHECK: &str = "check";
static OPT_IGNORE_CASE: &str = "ignore-case"; static OPT_IGNORE_CASE: &str = "ignore-case";
static OPT_IGNORE_BLANKS: &str = "ignore-blanks";
static OPT_OUTPUT: &str = "output"; static OPT_OUTPUT: &str = "output";
static OPT_REVERSE: &str = "reverse"; static OPT_REVERSE: &str = "reverse";
static OPT_STABLE: &str = "stable"; static OPT_STABLE: &str = "stable";
static OPT_UNIQUE: &str = "unique"; static OPT_UNIQUE: &str = "unique";
static OPT_RANDOM: &str = "random-sort";
static ARG_FILES: &str = "files"; static ARG_FILES: &str = "files";
static DECIMAL_PT: char = '.'; static DECIMAL_PT: char = '.';
static THOUSANDS_SEP: char = ','; static THOUSANDS_SEP: char = ',';
#[derive(Eq, Ord, PartialEq, PartialOrd)]
enum SortMode { enum SortMode {
Numeric, Numeric,
HumanNumeric, HumanNumeric,
@ -60,8 +67,10 @@ struct Settings {
stable: bool, stable: bool,
unique: bool, unique: bool,
check: bool, check: bool,
random: bool,
compare_fns: Vec<fn(&str, &str) -> Ordering>, compare_fns: Vec<fn(&str, &str) -> Ordering>,
transform_fns: Vec<fn(&str) -> String>, transform_fns: Vec<fn(&str) -> String>,
salt: String,
} }
impl Default for Settings { impl Default for Settings {
@ -74,8 +83,10 @@ impl Default for Settings {
stable: false, stable: false,
unique: false, unique: false,
check: false, check: false,
random: false,
compare_fns: Vec::new(), compare_fns: Vec::new(),
transform_fns: Vec::new(), transform_fns: Vec::new(),
salt: String::new(),
} }
} }
} }
@ -155,17 +166,14 @@ impl<'a> Iterator for FileMerger<'a> {
} }
} }
} }
fn get_usage() -> String { fn get_usage() -> String {
format!( format!(
"{0} {1} "{0} {1}
Usage: Usage:
{0} [OPTION]... [FILE]... {0} [OPTION]... [FILE]...
Write the sorted concatenation of all FILE(s) to standard output. Write the sorted concatenation of all FILE(s) to standard output.
Mandatory arguments for long options are mandatory for short options too. Mandatory arguments for long options are mandatory for short options too.
With no FILE, or when FILE is -, read standard input.", With no FILE, or when FILE is -, read standard input.",
NAME, VERSION NAME, VERSION
) )
@ -228,6 +236,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.long(OPT_IGNORE_CASE) .long(OPT_IGNORE_CASE)
.help("fold lower case to upper case characters"), .help("fold lower case to upper case characters"),
) )
.arg(
Arg::with_name(OPT_IGNORE_BLANKS)
.short("b")
.long(OPT_IGNORE_BLANKS)
.help("ignore leading blanks when finding sort keys in each line"),
)
.arg( .arg(
Arg::with_name(OPT_OUTPUT) Arg::with_name(OPT_OUTPUT)
.short("o") .short("o")
@ -236,6 +250,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.takes_value(true) .takes_value(true)
.value_name("FILENAME"), .value_name("FILENAME"),
) )
.arg(
Arg::with_name(OPT_RANDOM)
.short("R")
.long(OPT_RANDOM)
.help("shuffle in random order"),
)
.arg( .arg(
Arg::with_name(OPT_REVERSE) Arg::with_name(OPT_REVERSE)
.short("r") .short("r")
@ -285,11 +305,20 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
settings.transform_fns.push(|s| s.to_uppercase()); settings.transform_fns.push(|s| s.to_uppercase());
} }
if matches.is_present(OPT_IGNORE_BLANKS) {
settings.transform_fns.push(|s| s.trim_start().to_string());
}
settings.outfile = matches.value_of(OPT_OUTPUT).map(String::from); settings.outfile = matches.value_of(OPT_OUTPUT).map(String::from);
settings.reverse = matches.is_present(OPT_REVERSE); settings.reverse = matches.is_present(OPT_REVERSE);
settings.stable = matches.is_present(OPT_STABLE); settings.stable = matches.is_present(OPT_STABLE);
settings.unique = matches.is_present(OPT_UNIQUE); settings.unique = matches.is_present(OPT_UNIQUE);
if matches.is_present(OPT_RANDOM) {
settings.random = matches.is_present(OPT_RANDOM);
settings.salt = get_rand_string();
}
//let mut files = matches.free; //let mut files = matches.free;
if files.is_empty() { if files.is_empty() {
/* if no file, default to stdin */ /* if no file, default to stdin */
@ -313,10 +342,10 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
} }
} }
exec(files, &settings) exec(files, &mut settings)
} }
fn exec(files: Vec<String>, settings: &Settings) -> i32 { fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
let mut lines = Vec::new(); let mut lines = Vec::new();
let mut file_merger = FileMerger::new(&settings); let mut file_merger = FileMerger::new(&settings);
@ -351,6 +380,13 @@ fn exec(files: Vec<String>, settings: &Settings) -> i32 {
} else { } else {
print_sorted(file_merger, &settings.outfile) print_sorted(file_merger, &settings.outfile)
} }
} else if settings.unique && settings.mode == SortMode::Numeric {
print_sorted(
lines
.iter()
.dedup_by(|a, b| num_sort_dedup(a) == num_sort_dedup(b)),
&settings.outfile,
)
} else if settings.unique { } else if settings.unique {
print_sorted(lines.iter().dedup(), &settings.outfile) print_sorted(lines.iter().dedup(), &settings.outfile)
} else { } else {
@ -419,7 +455,11 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
}; };
for compare_fn in &settings.compare_fns { for compare_fn in &settings.compare_fns {
let cmp = compare_fn(a, b); let cmp: Ordering = if settings.random {
random_shuffle(a, b, settings.salt.clone())
} else {
compare_fn(a, b)
};
if cmp != Ordering::Equal { if cmp != Ordering::Equal {
if settings.reverse { if settings.reverse {
return cmp.reverse(); return cmp.reverse();
@ -431,36 +471,60 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
Ordering::Equal Ordering::Equal
} }
/// Parses the first whitespace-delimited token of `a` as an `f64`.
///
/// Returns negative infinity when `a` contains no token, when the token is
/// not a valid float, or when it parses to NaN.
fn permissive_f64_parse(a: &str) -> f64 {
    // Only the first token is examined. Splitting on whitespace (rather than
    // on the first non-digit) keeps exponent forms such as 10e100 parseable.
    // The trade-off is that "1,234" fails to parse and becomes -inf; there is
    // no way to handle both CSV and thousands separators without a new flag.
    // GNU sort treats "NaN" as a non-number in numeric mode, so NaN is
    // filtered out and gets the same fallback as a parse failure.
    a.split_whitespace()
        .next()
        .and_then(|token| token.parse::<f64>().ok())
        .filter(|value| !value.is_nan())
        .unwrap_or(std::f64::NEG_INFINITY)
}
fn default_compare(a: &str, b: &str) -> Ordering { fn default_compare(a: &str, b: &str) -> Ordering {
a.cmp(b) a.cmp(b)
} }
/// Compares two floating point numbers, with errors being assumed to be -inf. fn get_leading_number(a: &str) -> &str {
/// Stops coercing at the first whitespace char, so 1e2 will parse as 100 but let mut s = "";
/// 1,000 will parse as -inf. for c in a.chars() {
if !c.is_numeric() && !c.eq(&'-') && !c.eq(&' ') && !c.eq(&'.') && !c.eq(&',') {
s = a.trim().split(c).next().unwrap();
break;
}
s = a.trim();
}
return s;
}
// Matches GNU behavior, see:
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
// Specifically *not* the same as sort -n | uniq
/// Returns the key used to decide whether two lines are duplicates when
/// numeric sorting is combined with unique output.
///
/// Lines that begin (after trimming) with a number yield their leading
/// numeric text as produced by `get_leading_number`. Empty lines and lines
/// that do not begin with a number all share the key `"0"`, so they compare
/// as duplicates of one another.
///
/// Keys are compared as text by the caller, so differently spelled equal
/// values (e.g. "1" and "1.0") are NOT merged by this key.
fn num_sort_dedup(a: &str) -> &str {
    // A number may open with one '-' and/or one '.', but a digit has to
    // follow. Looking only at the very first character would give every
    // negative or leading-dot value ("-5", "-.05", ".5") the key "0" and make
    // distinct numbers look like duplicates.
    let mut chars = a.trim().chars().peekable();
    if chars.peek() == Some(&'-') {
        chars.next();
    }
    if chars.peek() == Some(&'.') {
        chars.next();
    }

    if chars.next().map_or(false, char::is_numeric) {
        // Compare only the leading numeric portion of the line.
        get_leading_number(a)
    } else {
        // Empty and non-numeric lines are all treated alike.
        "0"
    }
}
/// Parses the whole of `a` as an `f64`.
///
/// Returns negative infinity when `a` is not a valid float or parses to NaN.
/// No trimming is done here, so surrounding whitespace counts as invalid.
fn permissive_f64_parse(a: &str) -> f64 {
    // GNU sort treats "NaN" as a non-number in numeric mode, so NaN gets the
    // same fallback as a parse failure.
    a.parse::<f64>()
        .ok()
        .filter(|value| !value.is_nan())
        .unwrap_or(std::f64::NEG_INFINITY)
}
/// Compares two floats, with errors and non-numerics assumed to be -inf.
/// Stops coercing at the first non-numeric char.
fn numeric_compare(a: &str, b: &str) -> Ordering { fn numeric_compare(a: &str, b: &str) -> Ordering {
#![allow(clippy::comparison_chain)] #![allow(clippy::comparison_chain)]
let fa = permissive_f64_parse(a);
let fb = permissive_f64_parse(b); let sa = get_leading_number(a);
// f64::cmp isn't implemented because NaN messes with it let sb = get_leading_number(b);
// but we sidestep that with permissive_f64_parse so just fake it
let fa = permissive_f64_parse(sa);
let fb = permissive_f64_parse(sb);
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
if fa > fb { if fa > fb {
Ordering::Greater Ordering::Greater
} else if fa < fb { } else if fa < fb {
@ -471,10 +535,10 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
} }
fn human_numeric_convert(a: &str) -> f64 { fn human_numeric_convert(a: &str) -> f64 {
let int_str: String = a.chars().take_while(|c| c.is_numeric()).collect(); let int_str = get_leading_number(a);
let suffix = a.chars().find(|c| !c.is_numeric()); let (_, s) = a.split_at(int_str.len());
let int_part = int_str.parse::<f64>().unwrap_or(-1f64) as f64; let int_part = permissive_f64_parse(int_str);
let suffix: f64 = match suffix.unwrap_or('\0') { let suffix: f64 = match s.parse().unwrap_or('\0') {
'K' => 1000f64, 'K' => 1000f64,
'M' => 1E6, 'M' => 1E6,
'G' => 1E9, 'G' => 1E9,
@ -501,6 +565,30 @@ fn human_numeric_size_compare(a: &str, b: &str) -> Ordering {
} }
} }
fn random_shuffle(a: &str, b: &str, salt: String) -> Ordering {
#![allow(clippy::comparison_chain)]
let salt_slice = salt.as_str();
let da = hash(&[a, salt_slice].concat());
let db = hash(&[b, salt_slice].concat());
da.cmp(&db)
}
/// Builds a random 16-character alphanumeric string from the thread-local
/// random number generator.
fn get_rand_string() -> String {
    const SALT_LEN: usize = 16;

    let alphanumerics = thread_rng().sample_iter(&Alphanumeric);
    alphanumerics.take(SALT_LEN).map(char::from).collect()
}
fn hash<T: Hash>(t: &T) -> u64 {
let mut s: XxHash64 = Default::default();
t.hash(&mut s);
s.finish()
}
#[derive(Eq, Ord, PartialEq, PartialOrd)] #[derive(Eq, Ord, PartialEq, PartialOrd)]
enum Month { enum Month {
Unknown, Unknown,
@ -606,3 +694,65 @@ fn open(path: &str) -> Option<(Box<dyn Read>, bool)> {
} }
} }
} }
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain string comparison: "your own" sorts before "your place".
    #[test]
    fn test_default_compare() {
        assert_eq!(Ordering::Less, default_compare("your own", "your place"));
    }

    /// Only the leading number is compared; the ":7" / ":5" tails are ignored.
    #[test]
    fn test_numeric_compare1() {
        assert_eq!(Ordering::Less, numeric_compare("149:7", "150:5"));
    }

    /// A negative fraction sorts before a positive integer.
    #[test]
    fn test_numeric_compare2() {
        assert_eq!(Ordering::Less, numeric_compare("-1.02", "1"));
    }

    /// Suffixes scale the value: 300K is smaller than 1M.
    #[test]
    fn test_human_numeric_compare() {
        assert_eq!(Ordering::Less, human_numeric_size_compare("300K", "1M"));
    }

    /// Mixed-case month names: "JaN" sorts before "OCt".
    #[test]
    fn test_month_compare() {
        assert_eq!(Ordering::Less, month_compare("JaN", "OCt"));
    }

    /// A pre-release of 1.2.3 sorts before 1.4.0.
    #[test]
    fn test_version_compare() {
        assert_eq!(Ordering::Less, version_compare("1.2.3-alpha2", "1.4.0"));
    }

    /// Identical lines compare equal under a freshly generated salt.
    #[test]
    fn test_random_compare() {
        let salt = get_rand_string();
        assert_eq!(Ordering::Equal, random_shuffle("9", "9", salt));
    }
}

View file

@ -2,22 +2,43 @@ use crate::common::util::*;
#[test] #[test]
fn test_numeric_floats_and_ints() { fn test_numeric_floats_and_ints() {
test_helper("numeric_floats_and_ints", "-n"); for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n8.013\n1\n-8\n1.04\n-1";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8\n-1\n1\n1.04\n1.444\n8.013\n");
}
} }
#[test] #[test]
fn test_numeric_floats() { fn test_numeric_floats() {
test_helper("numeric_floats", "-n"); for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n8.013\n1.58590\n-8.90880\n1.040000000\n-.05";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8.90880\n-.05\n1.040000000\n1.444\n1.58590\n8.013\n");
}
} }
#[test] #[test]
fn test_numeric_floats_with_nan() { fn test_numeric_floats_with_nan() {
test_helper("numeric_floats_with_nan", "-n"); for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n1.0/0.0\n1.58590\n-8.90880\n1.040000000\n-.05";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8.90880\n-.05\n1.0/0.0\n1.040000000\n1.444\n1.58590\n");
}
} }
#[test] #[test]
fn test_numeric_unfixed_floats() { fn test_numeric_unfixed_floats() {
test_helper("numeric_unfixed_floats", "-n"); test_helper("numeric_fixed_floats", "-n");
} }
#[test] #[test]
@ -32,12 +53,26 @@ fn test_numeric_unsorted_ints() {
#[test] #[test]
fn test_human_block_sizes() { fn test_human_block_sizes() {
test_helper("human_block_sizes", "-h"); for human_numeric_sort_param in vec!["-h", "--human-numeric-sort"] {
let input = "8981K\n909991M\n-8T\n21G\n0.8M";
new_ucmd!()
.arg(human_numeric_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("-8T\n0.8M\n8981K\n21G\n909991M\n");
}
} }
#[test] #[test]
fn test_month_default() { fn test_month_default() {
test_helper("month_default", "-M"); for month_sort_param in vec!["-M", "--month-sort"] {
let input = "JAn\nMAY\n000may\nJun\nFeb";
new_ucmd!()
.arg(month_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("000may\nJAn\nFeb\nMAY\nJun\n");
}
} }
#[test] #[test]
@ -47,12 +82,23 @@ fn test_month_stable() {
#[test] #[test]
fn test_default_unsorted_ints() { fn test_default_unsorted_ints() {
test_helper("default_unsorted_ints", ""); let input = "9\n1909888\n000\n1\n2";
new_ucmd!()
.pipe_in(input)
.succeeds()
.stdout_only("000\n1\n1909888\n2\n9\n");
} }
#[test] #[test]
fn test_numeric_unique_ints() { fn test_numeric_unique_ints() {
test_helper("numeric_unsorted_ints_unique", "-nu"); for numeric_unique_sort_param in vec!["-nu"] {
let input = "9\n9\n8\n1\n";
new_ucmd!()
.arg(numeric_unique_sort_param)
.pipe_in(input)
.succeeds()
.stdout_only("1\n8\n9\n");
}
} }
#[test] #[test]