mirror of
https://github.com/uutils/coreutils
synced 2024-12-14 07:12:44 +00:00
sort: Add a GNU-style Random Sorter (#1922)
This commit is contained in:
parent
8cc7a90d7c
commit
da5f2f3a6c
4 changed files with 261 additions and 44 deletions
19
Cargo.lock
generated
19
Cargo.lock
generated
|
@ -1317,6 +1317,12 @@ dependencies = [
|
|||
"maybe-uninit",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "static_assertions"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.8.0"
|
||||
|
@ -1443,6 +1449,17 @@ dependencies = [
|
|||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04f8ab788026715fa63b31960869617cba39117e520eb415b0139543e325ab59"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.10",
|
||||
"rand 0.7.3",
|
||||
"static_assertions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.13.0"
|
||||
|
@ -2200,7 +2217,9 @@ version = "0.0.4"
|
|||
dependencies = [
|
||||
"clap",
|
||||
"itertools 0.8.2",
|
||||
"rand 0.7.3",
|
||||
"semver",
|
||||
"twox-hash",
|
||||
"uucore",
|
||||
"uucore_procs",
|
||||
]
|
||||
|
|
|
@ -15,7 +15,9 @@ edition = "2018"
|
|||
path = "src/sort.rs"
|
||||
|
||||
[dependencies]
|
||||
rand = "0.7"
|
||||
clap = "2.33"
|
||||
twox-hash = "1.6.0"
|
||||
itertools = "0.8.0"
|
||||
semver = "0.9.0"
|
||||
uucore = { version=">=0.0.7", package="uucore", path="../../uucore", features=["fs"] }
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
// * This file is part of the uutils coreutils package.
|
||||
// *
|
||||
// * (c) Michael Yin <mikeyin@mikeyin.org>
|
||||
// * (c) Robert Swinford <robert.swinford..AT..gmail.com>
|
||||
// *
|
||||
// * For the full copyright and license information, please view the LICENSE
|
||||
// * file that was distributed with this source code.
|
||||
|
@ -12,13 +13,17 @@ extern crate uucore;
|
|||
|
||||
use clap::{App, Arg};
|
||||
use itertools::Itertools;
|
||||
use rand::distributions::Alphanumeric;
|
||||
use rand::{thread_rng, Rng};
|
||||
use semver::Version;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fs::File;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Lines, Read, Write};
|
||||
use std::mem::replace;
|
||||
use std::path::Path;
|
||||
use twox_hash::XxHash64;
|
||||
use uucore::fs::is_stdin_interactive; // for Iterator::dedup()
|
||||
|
||||
static NAME: &str = "sort";
|
||||
|
@ -34,16 +39,18 @@ static OPT_DICTIONARY_ORDER: &str = "dictionary-order";
|
|||
static OPT_MERGE: &str = "merge";
|
||||
static OPT_CHECK: &str = "check";
|
||||
static OPT_IGNORE_CASE: &str = "ignore-case";
|
||||
static OPT_IGNORE_BLANKS: &str = "ignore-blanks";
|
||||
static OPT_OUTPUT: &str = "output";
|
||||
static OPT_REVERSE: &str = "reverse";
|
||||
static OPT_STABLE: &str = "stable";
|
||||
static OPT_UNIQUE: &str = "unique";
|
||||
static OPT_RANDOM: &str = "random-sort";
|
||||
|
||||
static ARG_FILES: &str = "files";
|
||||
|
||||
static DECIMAL_PT: char = '.';
|
||||
static THOUSANDS_SEP: char = ',';
|
||||
|
||||
#[derive(Eq, Ord, PartialEq, PartialOrd)]
|
||||
enum SortMode {
|
||||
Numeric,
|
||||
HumanNumeric,
|
||||
|
@ -60,8 +67,10 @@ struct Settings {
|
|||
stable: bool,
|
||||
unique: bool,
|
||||
check: bool,
|
||||
random: bool,
|
||||
compare_fns: Vec<fn(&str, &str) -> Ordering>,
|
||||
transform_fns: Vec<fn(&str) -> String>,
|
||||
salt: String,
|
||||
}
|
||||
|
||||
impl Default for Settings {
|
||||
|
@ -74,8 +83,10 @@ impl Default for Settings {
|
|||
stable: false,
|
||||
unique: false,
|
||||
check: false,
|
||||
random: false,
|
||||
compare_fns: Vec::new(),
|
||||
transform_fns: Vec::new(),
|
||||
salt: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -155,17 +166,14 @@ impl<'a> Iterator for FileMerger<'a> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_usage() -> String {
|
||||
format!(
|
||||
"{0} {1}
|
||||
|
||||
Usage:
|
||||
{0} [OPTION]... [FILE]...
|
||||
|
||||
Write the sorted concatenation of all FILE(s) to standard output.
|
||||
|
||||
Mandatory arguments for long options are mandatory for short options too.
|
||||
|
||||
With no FILE, or when FILE is -, read standard input.",
|
||||
NAME, VERSION
|
||||
)
|
||||
|
@ -228,6 +236,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
.long(OPT_IGNORE_CASE)
|
||||
.help("fold lower case to upper case characters"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name(OPT_IGNORE_BLANKS)
|
||||
.short("b")
|
||||
.long(OPT_IGNORE_BLANKS)
|
||||
.help("ignore leading blanks when finding sort keys in each line"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name(OPT_OUTPUT)
|
||||
.short("o")
|
||||
|
@ -236,6 +250,12 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
.takes_value(true)
|
||||
.value_name("FILENAME"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name(OPT_RANDOM)
|
||||
.short("R")
|
||||
.long(OPT_RANDOM)
|
||||
.help("shuffle in random order"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name(OPT_REVERSE)
|
||||
.short("r")
|
||||
|
@ -285,11 +305,20 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
settings.transform_fns.push(|s| s.to_uppercase());
|
||||
}
|
||||
|
||||
if matches.is_present(OPT_IGNORE_BLANKS) {
|
||||
settings.transform_fns.push(|s| s.trim_start().to_string());
|
||||
}
|
||||
|
||||
settings.outfile = matches.value_of(OPT_OUTPUT).map(String::from);
|
||||
settings.reverse = matches.is_present(OPT_REVERSE);
|
||||
settings.stable = matches.is_present(OPT_STABLE);
|
||||
settings.unique = matches.is_present(OPT_UNIQUE);
|
||||
|
||||
if matches.is_present(OPT_RANDOM) {
|
||||
settings.random = matches.is_present(OPT_RANDOM);
|
||||
settings.salt = get_rand_string();
|
||||
}
|
||||
|
||||
//let mut files = matches.free;
|
||||
if files.is_empty() {
|
||||
/* if no file, default to stdin */
|
||||
|
@ -313,10 +342,10 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
|
|||
}
|
||||
}
|
||||
|
||||
exec(files, &settings)
|
||||
exec(files, &mut settings)
|
||||
}
|
||||
|
||||
fn exec(files: Vec<String>, settings: &Settings) -> i32 {
|
||||
fn exec(files: Vec<String>, settings: &mut Settings) -> i32 {
|
||||
let mut lines = Vec::new();
|
||||
let mut file_merger = FileMerger::new(&settings);
|
||||
|
||||
|
@ -351,6 +380,13 @@ fn exec(files: Vec<String>, settings: &Settings) -> i32 {
|
|||
} else {
|
||||
print_sorted(file_merger, &settings.outfile)
|
||||
}
|
||||
} else if settings.unique && settings.mode == SortMode::Numeric {
|
||||
print_sorted(
|
||||
lines
|
||||
.iter()
|
||||
.dedup_by(|a, b| num_sort_dedup(a) == num_sort_dedup(b)),
|
||||
&settings.outfile,
|
||||
)
|
||||
} else if settings.unique {
|
||||
print_sorted(lines.iter().dedup(), &settings.outfile)
|
||||
} else {
|
||||
|
@ -419,7 +455,11 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
|
|||
};
|
||||
|
||||
for compare_fn in &settings.compare_fns {
|
||||
let cmp = compare_fn(a, b);
|
||||
let cmp: Ordering = if settings.random {
|
||||
random_shuffle(a, b, settings.salt.clone())
|
||||
} else {
|
||||
compare_fn(a, b)
|
||||
};
|
||||
if cmp != Ordering::Equal {
|
||||
if settings.reverse {
|
||||
return cmp.reverse();
|
||||
|
@ -431,36 +471,60 @@ fn compare_by(a: &str, b: &str, settings: &Settings) -> Ordering {
|
|||
Ordering::Equal
|
||||
}
|
||||
|
||||
/// Reads the first whitespace-delimited token of `a` as an `f64`.
///
/// Returns negative infinity when `a` has no token, when the token does not
/// parse, or when it parses to NaN.
fn permissive_f64_parse(a: &str) -> f64 {
    // Splitting on non-digits instead would break inputs such as 10e100.
    // The flip side is that "1,234" yields NEG_INFINITY, which might be OK:
    // CSV and thousands separators cannot both be handled without a new flag,
    // and GNU sort treats "1,234" as "1" in numeric mode anyway.
    // GNU sort treats "NaN" as a non-number in numeric mode, hence the filter.
    a.split_whitespace()
        .next()
        .and_then(|token| token.parse::<f64>().ok())
        .filter(|value| !value.is_nan())
        .unwrap_or(std::f64::NEG_INFINITY)
}
|
||||
|
||||
/// Default comparison: orders two lines using `str`'s own `Ord` implementation.
fn default_compare(a: &str, b: &str) -> Ordering {
    Ord::cmp(a, b)
}
|
||||
|
||||
/// Returns the leading number-like prefix of `a`, with surrounding whitespace
/// trimmed first.
///
/// The prefix ends just before the first character that is not a numeric
/// character, `-`, a space, `.` or `,`. When no such character exists the whole
/// trimmed string is returned. The result is only a candidate: it is not
/// guaranteed to parse as a number (for example `"1,234"` is returned as is).
fn get_leading_number(a: &str) -> &str {
    // Search the trimmed text itself so the cut position always refers to the
    // slice that is returned, even when `a` starts with tabs or other
    // whitespace that `trim` removes.
    let trimmed = a.trim();
    let end = trimmed
        .find(|c: char| !(c.is_numeric() || c == '-' || c == ' ' || c == '.' || c == ','))
        .unwrap_or(trimmed.len());
    &trimmed[..end]
}
|
||||
|
||||
// Matches GNU behavior, see:
|
||||
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
|
||||
// Specifically *not* the same as sort -n | uniq
|
||||
fn num_sort_dedup(a: &str) -> &str {
|
||||
// Empty lines are dumped
|
||||
if a.is_empty() {
|
||||
return "0"
|
||||
// And lines that don't begin numerically are dumped
|
||||
} else if !a.trim().chars().nth(0).unwrap_or('\0').is_numeric() {
|
||||
return "0"
|
||||
} else {
|
||||
// Prepare lines for comparison of only the numerical leading numbers
|
||||
return get_leading_number(a)
|
||||
};
|
||||
}
|
||||
|
||||
/// Parses `a` as an `f64`, ignoring surrounding whitespace.
///
/// Returns negative infinity instead of an error or NaN, so the result can
/// always be ordered against other parsed values.
fn permissive_f64_parse(a: &str) -> f64 {
    // `str::parse::<f64>` rejects leading/trailing whitespace, and the numeric
    // prefix handed to this function may still end in a blank (e.g. "5 " taken
    // from "5 apples"), so trim before parsing.
    // GNU sort treats "NaN" as non-number in numeric, so it needs special care.
    a.trim()
        .parse::<f64>()
        .ok()
        .filter(|value| !value.is_nan())
        .unwrap_or(std::f64::NEG_INFINITY)
}
|
||||
|
||||
/// Compares two floats, with errors and non-numerics assumed to be -inf.
|
||||
/// Stops coercing at the first non-numeric char.
|
||||
fn numeric_compare(a: &str, b: &str) -> Ordering {
|
||||
#![allow(clippy::comparison_chain)]
|
||||
let fa = permissive_f64_parse(a);
|
||||
let fb = permissive_f64_parse(b);
|
||||
// f64::cmp isn't implemented because NaN messes with it
|
||||
// but we sidestep that with permissive_f64_parse so just fake it
|
||||
|
||||
let sa = get_leading_number(a);
|
||||
let sb = get_leading_number(b);
|
||||
|
||||
let fa = permissive_f64_parse(sa);
|
||||
let fb = permissive_f64_parse(sb);
|
||||
|
||||
// f64::cmp isn't implemented (due to NaN issues); implement directly instead
|
||||
if fa > fb {
|
||||
Ordering::Greater
|
||||
} else if fa < fb {
|
||||
|
@ -471,10 +535,10 @@ fn numeric_compare(a: &str, b: &str) -> Ordering {
|
|||
}
|
||||
|
||||
fn human_numeric_convert(a: &str) -> f64 {
|
||||
let int_str: String = a.chars().take_while(|c| c.is_numeric()).collect();
|
||||
let suffix = a.chars().find(|c| !c.is_numeric());
|
||||
let int_part = int_str.parse::<f64>().unwrap_or(-1f64) as f64;
|
||||
let suffix: f64 = match suffix.unwrap_or('\0') {
|
||||
let int_str = get_leading_number(a);
|
||||
let (_, s) = a.split_at(int_str.len());
|
||||
let int_part = permissive_f64_parse(int_str);
|
||||
let suffix: f64 = match s.parse().unwrap_or('\0') {
|
||||
'K' => 1000f64,
|
||||
'M' => 1E6,
|
||||
'G' => 1E9,
|
||||
|
@ -501,6 +565,30 @@ fn human_numeric_size_compare(a: &str, b: &str) -> Ordering {
|
|||
}
|
||||
}
|
||||
|
||||
fn random_shuffle(a: &str, b: &str, salt: String) -> Ordering {
|
||||
#![allow(clippy::comparison_chain)]
|
||||
let salt_slice = salt.as_str();
|
||||
|
||||
let da = hash(&[a, salt_slice].concat());
|
||||
let db = hash(&[b, salt_slice].concat());
|
||||
|
||||
da.cmp(&db)
|
||||
}
|
||||
|
||||
fn get_rand_string() -> String {
|
||||
thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
.take(16)
|
||||
.map(char::from)
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
fn hash<T: Hash>(t: &T) -> u64 {
|
||||
let mut s: XxHash64 = Default::default();
|
||||
t.hash(&mut s);
|
||||
s.finish()
|
||||
}
|
||||
|
||||
#[derive(Eq, Ord, PartialEq, PartialOrd)]
|
||||
enum Month {
|
||||
Unknown,
|
||||
|
@ -606,3 +694,65 @@ fn open(path: &str) -> Option<(Box<dyn Read>, bool)> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for the individual comparison functions.

    use super::*;

    /// The default comparison orders "your own" before "your place".
    #[test]
    fn test_default_compare() {
        let ordering = default_compare("your own", "your place");

        assert_eq!(Ordering::Less, ordering);
    }

    /// Only the leading number of each line takes part in a numeric compare.
    #[test]
    fn test_numeric_compare1() {
        let ordering = numeric_compare("149:7", "150:5");

        assert_eq!(Ordering::Less, ordering);
    }

    /// Negative decimals order before positive integers.
    #[test]
    fn test_numeric_compare2() {
        let ordering = numeric_compare("-1.02", "1");

        assert_eq!(Ordering::Less, ordering);
    }

    /// Unit suffixes are taken into account: 300K orders before 1M.
    #[test]
    fn test_human_numeric_compare() {
        let ordering = human_numeric_size_compare("300K", "1M");

        assert_eq!(Ordering::Less, ordering);
    }

    /// Month names compare regardless of letter case.
    #[test]
    fn test_month_compare() {
        let ordering = month_compare("JaN", "OCt");

        assert_eq!(Ordering::Less, ordering);
    }

    /// A pre-release of 1.2.3 orders before 1.4.0.
    #[test]
    fn test_version_compare() {
        let ordering = version_compare("1.2.3-alpha2", "1.4.0");

        assert_eq!(Ordering::Less, ordering);
    }

    /// Identical lines compare as equal under a freshly generated salt.
    #[test]
    fn test_random_compare() {
        let salt = get_rand_string();

        assert_eq!(Ordering::Equal, random_shuffle("9", "9", salt));
    }
}
|
||||
|
|
|
@ -2,22 +2,43 @@ use crate::common::util::*;
|
|||
|
||||
/// Numeric sort of mixed integers and floats, via the fixture helper and via
/// piped input for both spellings of the option.
#[test]
fn test_numeric_floats_and_ints() {
    // NOTE(review): `test_helper` is defined outside this view; presumably it
    // checks a fixture named by its first argument — confirm.
    test_helper("numeric_floats_and_ints", "-n");
    // A borrowed array is enough here; no heap-allocated `vec!` is needed.
    for &numeric_sort_param in &["-n", "--numeric-sort"] {
        let input = "1.444\n8.013\n1\n-8\n1.04\n-1";
        new_ucmd!()
            .arg(numeric_sort_param)
            .pipe_in(input)
            .succeeds()
            .stdout_only("-8\n-1\n1\n1.04\n1.444\n8.013\n");
    }
}
|
||||
|
||||
/// Numeric sort of floats only, via the fixture helper and via piped input for
/// both spellings of the option.
#[test]
fn test_numeric_floats() {
    // NOTE(review): `test_helper` is defined outside this view; presumably it
    // checks a fixture named by its first argument — confirm.
    test_helper("numeric_floats", "-n");
    // A borrowed array is enough here; no heap-allocated `vec!` is needed.
    for &numeric_sort_param in &["-n", "--numeric-sort"] {
        let input = "1.444\n8.013\n1.58590\n-8.90880\n1.040000000\n-.05";
        new_ucmd!()
            .arg(numeric_sort_param)
            .pipe_in(input)
            .succeeds()
            .stdout_only("-8.90880\n-.05\n1.040000000\n1.444\n1.58590\n8.013\n");
    }
}
|
||||
|
||||
/// Numeric sort where one line (`1.0/0.0`) is not a plain number; it is
/// expected between `-.05` and `1.040000000` in the output.
#[test]
fn test_numeric_floats_with_nan() {
    // NOTE(review): `test_helper` is defined outside this view; presumably it
    // checks a fixture named by its first argument — confirm.
    test_helper("numeric_floats_with_nan", "-n");
    // A borrowed array is enough here; no heap-allocated `vec!` is needed.
    for &numeric_sort_param in &["-n", "--numeric-sort"] {
        let input = "1.444\n1.0/0.0\n1.58590\n-8.90880\n1.040000000\n-.05";
        new_ucmd!()
            .arg(numeric_sort_param)
            .pipe_in(input)
            .succeeds()
            .stdout_only("-8.90880\n-.05\n1.0/0.0\n1.040000000\n1.444\n1.58590\n");
    }
}
|
||||
|
||||
/// Runs the `-n` fixture checks for the unfixed- and fixed-width float inputs,
/// in that order.
#[test]
fn test_numeric_unfixed_floats() {
    for &fixture in &["numeric_unfixed_floats", "numeric_fixed_floats"] {
        test_helper(fixture, "-n");
    }
}
|
||||
|
||||
#[test]
|
||||
|
@ -32,12 +53,26 @@ fn test_numeric_unsorted_ints() {
|
|||
|
||||
/// Human-numeric sort of sizes with unit suffixes, via the fixture helper and
/// via piped input for both spellings of the option.
#[test]
fn test_human_block_sizes() {
    // NOTE(review): `test_helper` is defined outside this view; presumably it
    // checks a fixture named by its first argument — confirm.
    test_helper("human_block_sizes", "-h");
    // A borrowed array is enough here; no heap-allocated `vec!` is needed.
    for &human_numeric_sort_param in &["-h", "--human-numeric-sort"] {
        let input = "8981K\n909991M\n-8T\n21G\n0.8M";
        new_ucmd!()
            .arg(human_numeric_sort_param)
            .pipe_in(input)
            .succeeds()
            .stdout_only("-8T\n0.8M\n8981K\n21G\n909991M\n");
    }
}
|
||||
|
||||
/// Month sort of mixed-case month names, via the fixture helper and via piped
/// input for both spellings of the option.
#[test]
fn test_month_default() {
    // NOTE(review): `test_helper` is defined outside this view; presumably it
    // checks a fixture named by its first argument — confirm.
    test_helper("month_default", "-M");
    // A borrowed array is enough here; no heap-allocated `vec!` is needed.
    for &month_sort_param in &["-M", "--month-sort"] {
        let input = "JAn\nMAY\n000may\nJun\nFeb";
        new_ucmd!()
            .arg(month_sort_param)
            .pipe_in(input)
            .succeeds()
            .stdout_only("000may\nJAn\nFeb\nMAY\nJun\n");
    }
}
|
||||
|
||||
#[test]
|
||||
|
@ -47,12 +82,23 @@ fn test_month_stable() {
|
|||
|
||||
/// Default (non-numeric) sort of integer-looking lines: the fixture check runs
/// first, then the same mode is exercised on piped input.
#[test]
fn test_default_unsorted_ints() {
    test_helper("default_unsorted_ints", "");

    let mut command = new_ucmd!();
    command
        .pipe_in("9\n1909888\n000\n1\n2")
        .succeeds()
        .stdout_only("000\n1\n1909888\n2\n9\n");
}
|
||||
|
||||
/// Numeric unique sort (`-nu`): the duplicated `9` must appear only once.
#[test]
fn test_numeric_unique_ints() {
    // NOTE(review): `test_helper` is defined outside this view; presumably it
    // checks a fixture named by its first argument — confirm.
    test_helper("numeric_unsorted_ints_unique", "-nu");
    // Only one spelling is exercised, so no loop (or `vec!`) is needed.
    let numeric_unique_sort_param = "-nu";
    let input = "9\n9\n8\n1\n";
    new_ucmd!()
        .arg(numeric_unique_sort_param)
        .pipe_in(input)
        .succeeds()
        .stdout_only("1\n8\n9\n");
}
|
||||
|
||||
#[test]
|
||||
|
|
Loading…
Reference in a new issue