From 30ae952b83e1bd5f9d3e4475fcddffa4fba4afc5 Mon Sep 17 00:00:00 2001 From: Terts Diepraam Date: Tue, 8 Feb 2022 14:12:31 +0100 Subject: [PATCH 1/3] shuf: remove custom randomization logic --- src/uu/shuf/src/shuf.rs | 87 ++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/src/uu/shuf/src/shuf.rs b/src/uu/shuf/src/shuf.rs index da2dfff1b..058ac8637 100644 --- a/src/uu/shuf/src/shuf.rs +++ b/src/uu/shuf/src/shuf.rs @@ -8,7 +8,8 @@ // spell-checker:ignore (ToDO) cmdline evec seps rvec fdata use clap::{crate_version, App, AppSettings, Arg}; -use rand::Rng; +use rand::prelude::SliceRandom; +use rand::RngCore; use std::fs::File; use std::io::{stdin, stdout, BufReader, BufWriter, Read, Write}; use uucore::display::Quotable; @@ -254,40 +255,35 @@ fn shuf_bytes(input: &mut Vec<&[u8]>, opts: Options) -> UResult<()> { None => WrappedRng::RngDefault(rand::thread_rng()), }; - // we're generating a random usize. To keep things fair, we take this number mod ceil(log2(length+1)) - let mut len_mod = 1; - let mut len = input.len(); - while len > 0 { - len >>= 1; - len_mod <<= 1; + if input.is_empty() { + return Ok(()); } - let mut count = opts.head_count; - while count > 0 && !input.is_empty() { - let mut r = input.len(); - while r >= input.len() { - r = rng.next_usize() % len_mod; + if opts.repeat { + for _ in 0..opts.head_count { + // Returns None is the slice is empty. We checked this before, so + // this is safe. + let r = input.choose(&mut rng).unwrap(); + + output + .write_all(r) + .map_err_context(|| "write failed".to_string())?; + output + .write_all(&[opts.sep]) + .map_err_context(|| "write failed".to_string())?; } - - // write the randomly chosen value and the separator - output - .write_all(input[r]) - .map_err_context(|| "write failed".to_string())?; - output - .write_all(&[opts.sep]) - .map_err_context(|| "write failed".to_string())?; - - // if we do not allow repeats, remove the chosen value from the input vector - if !opts.repeat { - // shrink the mask if we will drop below a power of 2 - if input.len() % 2 == 0 && len_mod > 2 { - len_mod >>= 1; - } - input.swap_remove(r); + } else { + let (shuffled, _) = input.partial_shuffle(&mut rng, opts.head_count); + for r in shuffled { + output + .write_all(r) + .map_err_context(|| "write failed".to_string())?; + output + .write_all(&[opts.sep]) + .map_err_context(|| "write failed".to_string())?; } - - count -= 1; } + Ok(()) } @@ -311,11 +307,32 @@ enum WrappedRng { RngDefault(rand::rngs::ThreadRng), } -impl WrappedRng { - fn next_usize(&mut self) -> usize { - match *self { - WrappedRng::RngFile(ref mut r) => r.gen(), - WrappedRng::RngDefault(ref mut r) => r.gen(), +impl RngCore for WrappedRng { + fn next_u32(&mut self) -> u32 { + match self { + Self::RngFile(r) => r.next_u32(), + Self::RngDefault(r) => r.next_u32(), + } + } + + fn next_u64(&mut self) -> u64 { + match self { + Self::RngFile(r) => r.next_u64(), + Self::RngDefault(r) => r.next_u64(), + } + } + + fn fill_bytes(&mut self, dest: &mut [u8]) { + match self { + Self::RngFile(r) => r.fill_bytes(dest), + Self::RngDefault(r) => r.fill_bytes(dest), + } + } + + fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), rand::Error> { + match self { + Self::RngFile(r) => r.try_fill_bytes(dest), + Self::RngDefault(r) => r.try_fill_bytes(dest), } } } From 95388147020b2687e04959327a6962fe37654844 Mon Sep 17 00:00:00 2001 From: Terts Diepraam Date: Tue, 8 Feb 2022 14:20:24 +0100 Subject: [PATCH 2/3] shuf: use split_once for parsing the range --- src/uu/shuf/src/shuf.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/uu/shuf/src/shuf.rs b/src/uu/shuf/src/shuf.rs index 058ac8637..3dcd7b0e2 100644 --- a/src/uu/shuf/src/shuf.rs +++ b/src/uu/shuf/src/shuf.rs @@ -288,17 +288,16 @@ fn shuf_bytes(input: &mut Vec<&[u8]>, opts: Options) -> UResult<()> { } fn parse_range(input_range: &str) -> Result<(usize, usize), String> { - let split: Vec<&str> = input_range.split('-').collect(); - if split.len() != 2 { - Err(format!("invalid input range: {}", input_range.quote())) - } else { - let begin = split[0] + if let Some((from, to)) = input_range.split_once('-') { + let begin = from .parse::() - .map_err(|_| format!("invalid input range: {}", split[0].quote()))?; - let end = split[1] + .map_err(|_| format!("invalid input range: {}", from.quote()))?; + let end = to .parse::() - .map_err(|_| format!("invalid input range: {}", split[1].quote()))?; + .map_err(|_| format!("invalid input range: {}", to.quote()))?; Ok((begin, end + 1)) + } else { + Err(format!("invalid input range: {}", input_range.quote())) } } From dc24c9563e009f82979f30050940544527d56436 Mon Sep 17 00:00:00 2001 From: Terts Diepraam Date: Tue, 8 Feb 2022 21:05:39 +0100 Subject: [PATCH 3/3] shuf: BENCHMARKING.md --- src/uu/shuf/BENCHMARKING.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/uu/shuf/BENCHMARKING.md diff --git a/src/uu/shuf/BENCHMARKING.md b/src/uu/shuf/BENCHMARKING.md new file mode 100644 index 000000000..7607f04b4 --- /dev/null +++ b/src/uu/shuf/BENCHMARKING.md @@ -0,0 +1,28 @@ +# Benchmarking shuf + +`shuf` is a simple utility, but there are at least two important cases +benchmark: with and without repetition. + +When benchmarking changes, make sure to always build with the `--release` flag. +You can compare with another branch by compiling on that branch and than +renaming the executable from `shuf` to `shuf.old`. + +## Without repetition + +By default, `shuf` samples without repetition. To benchmark only the +randomization and not IO, we can pass the `-i` flag with a range of numbers to +randomly sample from. An example of a command that works well for testing: + +```shell +hyperfine --warmup 10 "target/release/shuf -i 0-10000000" +``` + +## With repetition + +When repetition is allowed, `shuf` works very differently under the hood, so it +should be benchmarked separately. In this case we have to pass the `-n` flag or +the command will run forever. An example of a hyperfine command is + +```shell +hyperfine --warmup 10 "target/release/shuf -r -n 10000000 -i 0-1000" +```