sort: Add a GNU-style Random Sorter (#1922)

This commit is contained in:
electricboogie 2021-03-29 06:05:52 -05:00 committed by GitHub
parent 8cc7a90d7c
commit da5f2f3a6c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 261 additions and 44 deletions

19
Cargo.lock generated
View file

@ -1317,6 +1317,12 @@ dependencies = [
"maybe-uninit",
]
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strsim"
version = "0.8.0"
@ -1443,6 +1449,17 @@ dependencies = [
"serde_json",
]
[[package]]
name = "twox-hash"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04f8ab788026715fa63b31960869617cba39117e520eb415b0139543e325ab59"
dependencies = [
"cfg-if 0.1.10",
"rand 0.7.3",
"static_assertions",
]
[[package]]
name = "typenum"
version = "1.13.0"
@ -2200,7 +2217,9 @@ version = "0.0.4"
dependencies = [
"clap",
"itertools 0.8.2",
"rand 0.7.3",
"semver",
"twox-hash",
"uucore",
"uucore_procs",
]

View file

@ -15,7 +15,9 @@ edition = "2018"
path = "src/sort.rs"
[dependencies]
rand = "0.7"
clap = "2.33"
twox-hash = "1.6.0"
itertools = "0.8.0"
semver = "0.9.0"
uucore = { version=">=0.0.7", package="uucore", path="../../uucore", features=["fs"] }

View file

@ -1,6 +1,7 @@
// * This file is part of the uutils coreutils package.
// *
// * (c) Michael Yin <mikeyin@mikeyin.org>
// * (c) Robert Swinford <robert.swinford..AT..gmail.com>
// *
// * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code.
@ -12,13 +13,17 @@ extern crate uucore;
use clap::{App, Arg};
use itertools::Itertools;
use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng};
use semver::Version;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::fs::File;
use std::hash::{Hash, Hasher};
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
use std::mem::replace;
use std::path::Path;
use twox_hash::XxHash64;
use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
static NAME: &str = "sort";
@ -34,16 +39,18 @@ static OPT_DICTIONARY_ORDER: &str = "dictionary-order";
static OPT_MERGE: &str = "merge";
static OPT_CHECK: &str = "check";
static OPT_IGNORE_CASE: &str = "ignore-case";
static OPT_IGNORE_BLANKS: &str = "ignore-blanks";
static OPT_OUTPUT: &str = "output";
static OPT_REVERSE: &str = "reverse";
static OPT_STABLE: &str = "stable";
static OPT_UNIQUE: &str = "unique";
static OPT_RANDOM: &str = "random-sort";
static ARG_FILES: &str = "files";
static DECIMAL_PT: char = '.';
static THOUSANDS_SEP: char = ',';
#[derive(Eq, Ord, PartialEq, PartialOrd)]
enum SortMode {
Numeric,
HumanNumeric,
@ -60,8 +67,10 @@ struct Settings {
stable: bool,
unique: bool,
check: bool,
random: bool,
compare_fns: Vec<fn(&str, &str) -> Ordering>,
transform_fns: Vec<fn(&str) -> String>,
salt: String,
}
impl Default for Settings {
@ -74,8 +83,10 @@ impl Default for Settings {
stable: false,
unique: false,
check: false,
random: false,
compare_fns: Vec::new(),
transform_fns: Vec::new(),
salt: String::new(),
}
}
}
@ -155,17 +166,14 @@ impl<'a> Iterator for FileMerger<'a> {
}
}
}
fn get_usage() -> String {
format!(
"{0} {1}
Usage:
{0} [OPTION]... [FILE]...
Write the sorted concatenation of all FILE(s) to standard output.
Mandatory arguments for long options are mandatory for short options too.
With no FILE, or when FILE is -, read standard input.",
NAME, VERSION
)
@ -228,6 +236,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.long(OPT_IGNORE_CASE)
.help("fold lower case to upper case characters"),
)
.arg(
Arg::with_name(OPT_IGNORE_BLANKS)
.short("b")
.long(OPT_IGNORE_BLANKS)
.help("ignore leading blanks when finding sort keys in each line"),
)
.arg(
Arg::with_name(OPT_OUTPUT)
.short("o")
@ -236,6 +250,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
.takes_value(true)
.value_name("FILENAME"),
)
.arg(
Arg::with_name(OPT_RANDOM)
.short("R")
.long(OPT_RANDOM)
.help("shuffle in random order"),
)
.arg(
Arg::with_name(OPT_REVERSE)
.short("r")
@ -285,11 +305,20 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
settings.transform_fns.push(|s| s.to_uppercase());
}
if matches.is_present(OPT_IGNORE_BLANKS) {
settings.transform_fns.push(|s| s.trim_start().to_string());
}
settings.outfile = matches.value_of(OPT_OUTPUT).map(String::from);
settings.reverse = matches.is_present(OPT_REVERSE);
settings.stable = matches.is_present(OPT_STABLE);
settings.unique = matches.is_present(OPT_UNIQUE);
if matches.is_present(OPT_RANDOM) {
settings.random = matches.is_present(OPT_RANDOM);
settings.salt = get_rand_string();
}
//let mut files = matches.free;
if files.is_empty() {
/* if no file, default to stdin */
@ -313,10 +342,10 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
}
}
exec(files, &settings)
exec(files, &mut settings)
}
fn exec(files: Vec<String>, settings: &Settings) -> i32 {
fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
let mut lines = Vec::new();
let mut file_merger = FileMerger::new(&settings);
@ -351,6 +380,13 @@ fn exec(files: Vec<String>, settings: &Settings) -> i32 {
} else {
print_sorted(file_merger, &settings.outfile)
}
} else if settings.unique && settings.mode == SortMode::Numeric {
print_sorted(
lines
.iter()
.dedup_by(|a, b| num_sort_dedup(a) == num_sort_dedup(b)),
&settings.outfile,
)
} else if settings.unique {
print_sorted(lines.iter().dedup(), &settings.outfile)
} else {
@ -419,7 +455,11 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
};
for compare_fn in &settings.compare_fns {
let cmp = compare_fn(a, b);
let cmp: Ordering = if settings.random {
random_shuffle(a, b, settings.salt.clone())
} else {
compare_fn(a, b)
};
if cmp != Ordering::Equal {
if settings.reverse {
return cmp.reverse();
@ -431,36 +471,60 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
Ordering::Equal
}
/// Parses the first whitespace-delimited token of `a` as an `f64`.
///
/// Returns `-inf` when `a` has no token, when the token does not parse, or
/// when it parses to NaN, so every input maps to a value that can be ordered.
fn permissive_f64_parse(a: &str) -> f64 {
    // Only the first token is considered, so "10e100 rest" still parses while
    // "1,234" (not a valid float literal) falls back to -inf.
    // "NaN" parses successfully as a float, so it is filtered out explicitly
    // to keep it from poisoning later comparisons.
    a.split_whitespace()
        .next()
        .and_then(|token| token.parse::<f64>().ok())
        .filter(|value| !value.is_nan())
        .unwrap_or(std::f64::NEG_INFINITY)
}
/// Default comparator: orders two lines using `str`'s own `Ord`
/// implementation, with no transformation of either side.
fn default_compare(a: &str, b: &str) -> Ordering {
    Ord::cmp(a, b)
}
/// Returns the leading number-like portion of `a` as a sub-slice.
///
/// `a` is scanned for the first character that is neither numeric nor one of
/// `-`, space, `.` or `,`. If such a character exists, the result is the part
/// of the trimmed line that precedes its first occurrence; otherwise the whole
/// trimmed line is returned. An empty input yields an empty slice.
fn get_leading_number(a: &str) -> &str {
    let trimmed = a.trim();
    let first_stop = a
        .chars()
        .find(|c| !(c.is_numeric() || ['-', ' ', '.', ','].contains(c)));

    match first_stop {
        // `split` always yields at least one piece, so the fallback is never taken.
        Some(stop) => trimmed.split(stop).next().unwrap_or(trimmed),
        None => trimmed,
    }
}
// Matches GNU behavior, see:
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
// Specifically *not* the same as sort -n | uniq
/// Maps a line to the key used to detect duplicates when numeric sort is
/// combined with unique output.
///
/// Lines that are empty, or whose trimmed text does not begin numerically,
/// all share the key `"0"` and therefore collapse into one another. A line
/// begins numerically when its first digit is preceded by at most one `-`
/// and at most one `.`, so negative numbers and leading-dot fractions keep
/// their own key instead of being merged with the non-numeric lines.
///
/// For numeric lines the key is the slice returned by `get_leading_number`.
fn num_sort_dedup(a: &str) -> &str {
    let mut chars = a.trim().chars().peekable();

    // Skip an optional sign and an optional decimal point; what follows must
    // be a digit for the line to count as numeric ("-5", ".5", "-.5").
    if chars.peek() == Some(&'-') {
        chars.next();
    }
    if chars.peek() == Some(&'.') {
        chars.next();
    }
    let begins_numerically = chars.next().map_or(false, |c| c.is_numeric());

    if begins_numerically {
        // Compare only the leading numeric text of the line.
        get_leading_number(a)
    } else {
        // Empty and non-numeric lines are all treated as the same key.
        "0"
    }
}
/// Parses `a` as an `f64`, mapping every failure to `-inf`.
///
/// The whole of `a` must be a valid float literal; no trimming or prefix
/// extraction happens here. Unparsable input and NaN both yield `-inf`, so
/// the result can always be ordered against other parsed values.
fn permissive_f64_parse(a: &str) -> f64 {
    a.parse::<f64>()
        .ok()
        // GNU sort treats "NaN" as a non-number in numeric mode, so it gets
        // the same fallback as a parse error.
        .filter(|value| !value.is_nan())
        .unwrap_or(std::f64::NEG_INFINITY)
}
/// Compares two floats, with errors and non-numerics assumed to be -inf.
/// Stops coercing at the first non-numeric char.
fn numeric_compare(a: &str, b: &str) -> Ordering {
#![allow(clippy::comparison_chain)]
let fa = permissive_f64_parse(a);
let fb = permissive_f64_parse(b);
// f64::cmp isn't implemented because NaN messes with it
// but we sidestep that with permissive_f64_parse so just fake it
let sa = get_leading_number(a);
let sb = get_leading_number(b);
let fa = permissive_f64_parse(sa);
let fb = permissive_f64_parse(sb);
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
if fa > fb {
Ordering::Greater
} else if fa < fb {
@ -471,10 +535,10 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
}
fn human_numeric_convert(a: &str) -> f64 {
let int_str: String = a.chars().take_while(|c| c.is_numeric()).collect();
let suffix = a.chars().find(|c| !c.is_numeric());
let int_part = int_str.parse::<f64>().unwrap_or(-1f64) as f64;
let suffix: f64 = match suffix.unwrap_or('\0') {
let int_str = get_leading_number(a);
let (_, s) = a.split_at(int_str.len());
let int_part = permissive_f64_parse(int_str);
let suffix: f64 = match s.parse().unwrap_or('\0') {
'K' => 1000f64,
'M' => 1E6,
'G' => 1E9,
@ -501,6 +565,30 @@ fn human_numeric_size_compare(a: &str, b: &str) -> Ordering {
}
}
fn random_shuffle(a: &str, b: &str, salt: String) -> Ordering {
#![allow(clippy::comparison_chain)]
let salt_slice = salt.as_str();
let da = hash(&[a, salt_slice].concat());
let db = hash(&[b, salt_slice].concat());
da.cmp(&db)
}
/// Builds a fresh 16-character string sampled from the `Alphanumeric`
/// distribution using the thread-local random number generator.
fn get_rand_string() -> String {
    const SALT_LEN: usize = 16;

    let rng = thread_rng();
    let salt: String = rng
        .sample_iter(&Alphanumeric)
        .take(SALT_LEN)
        .map(char::from)
        .collect();
    salt
}
fn hash<T: Hash>(t: &T) -> u64 {
let mut s: XxHash64 = Default::default();
t.hash(&mut s);
s.finish()
}
#[derive(Eq, Ord, PartialEq, PartialOrd)]
enum Month {
Unknown,
@ -606,3 +694,65 @@ fn open(path: &str) -> Option<(Box<dyn Read>, bool)> {
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain string comparison: "your own" sorts before "your place".
    #[test]
    fn test_default_compare() {
        assert_eq!(Ordering::Less, default_compare("your own", "your place"));
    }

    /// Only the leading number of each line is compared (149 < 150).
    #[test]
    fn test_numeric_compare1() {
        assert_eq!(Ordering::Less, numeric_compare("149:7", "150:5"));
    }

    /// A negative fraction sorts before a positive integer.
    #[test]
    fn test_numeric_compare2() {
        assert_eq!(Ordering::Less, numeric_compare("-1.02", "1"));
    }

    /// Size suffixes are taken into account: 300K is smaller than 1M.
    #[test]
    fn test_human_numeric_compare() {
        assert_eq!(Ordering::Less, human_numeric_size_compare("300K", "1M"));
    }

    /// Month names are matched regardless of letter case.
    #[test]
    fn test_month_compare() {
        assert_eq!(Ordering::Less, month_compare("JaN", "OCt"));
    }

    /// Version ordering: 1.2.3-alpha2 precedes 1.4.0.
    #[test]
    fn test_version_compare() {
        assert_eq!(Ordering::Less, version_compare("1.2.3-alpha2", "1.4.0"));
    }

    /// Identical lines hash identically under any salt, so they compare equal.
    #[test]
    fn test_random_compare() {
        let salt = get_rand_string();
        assert_eq!(Ordering::Equal, random_shuffle("9", "9", salt));
    }
}

View file

@ -2,22 +2,43 @@ use crate::common::util::*;
// Numeric sort over a mix of integers and decimals, including negatives.
// Calls `test_helper` for the `numeric_floats_and_ints` case (presumably a
// fixture-based comparison; verify against the helper), then checks both the
// short and the long flag with inline input.
#[test]
fn test_numeric_floats_and_ints() {
test_helper("numeric_floats_and_ints", "-n");
for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n8.013\n1\n-8\n1.04\n-1";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
// Expected: ascending numeric order, each line newline-terminated.
.stdout_only("-8\n-1\n1\n1.04\n1.444\n8.013\n");
}
}
// Numeric sort over decimal values only, including a negative value written
// without a leading zero (`-.05`) and values with trailing zeros.
// Calls `test_helper` for the `numeric_floats` case, then checks both the
// short and the long flag with inline input.
#[test]
fn test_numeric_floats() {
test_helper("numeric_floats", "-n");
for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n8.013\n1.58590\n-8.90880\n1.040000000\n-.05";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
// Lines are emitted verbatim (trailing zeros kept), ordered by value.
.stdout_only("-8.90880\n-.05\n1.040000000\n1.444\n1.58590\n8.013\n");
}
}
// Numeric sort where one line (`1.0/0.0`) is not a plain number.
// Calls `test_helper` for the `numeric_floats_with_nan` case, then checks
// both the short and the long flag with inline input.
#[test]
fn test_numeric_floats_with_nan() {
test_helper("numeric_floats_with_nan", "-n");
for numeric_sort_param in vec!["-n", "--numeric-sort"] {
let input = "1.444\n1.0/0.0\n1.58590\n-8.90880\n1.040000000\n-.05";
new_ucmd!()
.arg(numeric_sort_param)
.pipe_in(input)
.succeeds()
// The expected output places `1.0/0.0` between `-.05` and `1.040000000`.
.stdout_only("-8.90880\n-.05\n1.0/0.0\n1.040000000\n1.444\n1.58590\n");
}
}
// Runs `test_helper` with `-n` for the `numeric_unfixed_floats` case and for
// the `numeric_fixed_floats` case.
// NOTE(review): the two calls differ only in the case name, which looks like
// a rename shown side by side — confirm whether both are meant to run.
#[test]
fn test_numeric_unfixed_floats() {
test_helper("numeric_unfixed_floats", "-n");
test_helper("numeric_fixed_floats", "-n");
}
#[test]
@ -32,12 +53,26 @@ fn test_numeric_unsorted_ints() {
// Human-numeric sort over sizes with K/M/G/T suffixes, including a negative
// and a fractional value.
// Calls `test_helper` for the `human_block_sizes` case, then checks both the
// short and the long flag with inline input.
#[test]
fn test_human_block_sizes() {
test_helper("human_block_sizes", "-h");
for human_numeric_sort_param in vec!["-h", "--human-numeric-sort"] {
let input = "8981K\n909991M\n-8T\n21G\n0.8M";
new_ucmd!()
.arg(human_numeric_sort_param)
.pipe_in(input)
.succeeds()
// Expected order: -8T, 0.8M, 8981K, 21G, 909991M.
.stdout_only("-8T\n0.8M\n8981K\n21G\n909991M\n");
}
}
// Month sort over mixed-case month names plus one line (`000may`) that does
// not start with a month name.
// Calls `test_helper` for the `month_default` case, then checks both the
// short and the long flag with inline input.
#[test]
fn test_month_default() {
test_helper("month_default", "-M");
for month_sort_param in vec!["-M", "--month-sort"] {
let input = "JAn\nMAY\n000may\nJun\nFeb";
new_ucmd!()
.arg(month_sort_param)
.pipe_in(input)
.succeeds()
// `000may` is expected first, followed by the months in calendar order.
.stdout_only("000may\nJAn\nFeb\nMAY\nJun\n");
}
}
#[test]
@ -47,12 +82,23 @@ fn test_month_stable() {
// Default (no flag) sort applied to integer-looking lines.
// Calls `test_helper` for the `default_unsorted_ints` case with an empty
// argument string, then checks inline input with no arguments at all.
#[test]
fn test_default_unsorted_ints() {
test_helper("default_unsorted_ints", "");
let input = "9\n1909888\n000\n1\n2";
new_ucmd!()
.pipe_in(input)
.succeeds()
// Lines are expected in string order, so `1909888` precedes `2`.
.stdout_only("000\n1\n1909888\n2\n9\n");
}
// Numeric sort combined with unique output (`-nu`).
// Calls `test_helper` for the `numeric_unsorted_ints_unique` case, then
// checks inline input containing a repeated line.
#[test]
fn test_numeric_unique_ints() {
test_helper("numeric_unsorted_ints_unique", "-nu");
// Single-element list: only the combined short form is exercised here.
for numeric_unique_sort_param in vec!["-nu"] {
let input = "9\n9\n8\n1\n";
new_ucmd!()
.arg(numeric_unique_sort_param)
.pipe_in(input)
.succeeds()
// The duplicated `9` is expected to appear once in the output.
.stdout_only("1\n8\n9\n");
}
}
#[test]