mirror of
https://github.com/uutils/coreutils
synced 2024-11-17 02:08:09 +00:00
Merge pull request #3099 from tertsdiepraam/shuf-refactor
`shuf`: remove custom logic for bounded randomization
This commit is contained in:
commit
e5be9c1aaf
2 changed files with 87 additions and 43 deletions
28
src/uu/shuf/BENCHMARKING.md
Normal file
28
src/uu/shuf/BENCHMARKING.md
Normal file
|
@ -0,0 +1,28 @@
|
|||
# Benchmarking shuf
|
||||
|
||||
`shuf` is a simple utility, but there are at least two important cases
|
||||
benchmark: with and without repetition.
|
||||
|
||||
When benchmarking changes, make sure to always build with the `--release` flag.
|
||||
You can compare with another branch by compiling on that branch and than
|
||||
renaming the executable from `shuf` to `shuf.old`.
|
||||
|
||||
## Without repetition
|
||||
|
||||
By default, `shuf` samples without repetition. To benchmark only the
|
||||
randomization and not IO, we can pass the `-i` flag with a range of numbers to
|
||||
randomly sample from. An example of a command that works well for testing:
|
||||
|
||||
```shell
|
||||
hyperfine --warmup 10 "target/release/shuf -i 0-10000000"
|
||||
```
|
||||
|
||||
## With repetition
|
||||
|
||||
When repetition is allowed, `shuf` works very differently under the hood, so it
|
||||
should be benchmarked separately. In this case we have to pass the `-n` flag or
|
||||
the command will run forever. An example of a hyperfine command is
|
||||
|
||||
```shell
|
||||
hyperfine --warmup 10 "target/release/shuf -r -n 10000000 -i 0-1000"
|
||||
```
|
|
@ -8,7 +8,8 @@
|
|||
// spell-checker:ignore (ToDO) cmdline evec seps rvec fdata
|
||||
|
||||
use clap::{crate_version, App, AppSettings, Arg};
|
||||
use rand::Rng;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::RngCore;
|
||||
use std::fs::File;
|
||||
use std::io::{stdin, stdout, BufReader, BufWriter, Read, Write};
|
||||
use uucore::display::Quotable;
|
||||
|
@ -254,55 +255,49 @@ fn shuf_bytes(input: &mut Vec<&[u8]>, opts: Options) -> UResult<()> {
|
|||
None => WrappedRng::RngDefault(rand::thread_rng()),
|
||||
};
|
||||
|
||||
// we're generating a random usize. To keep things fair, we take this number mod ceil(log2(length+1))
|
||||
let mut len_mod = 1;
|
||||
let mut len = input.len();
|
||||
while len > 0 {
|
||||
len >>= 1;
|
||||
len_mod <<= 1;
|
||||
if input.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut count = opts.head_count;
|
||||
while count > 0 && !input.is_empty() {
|
||||
let mut r = input.len();
|
||||
while r >= input.len() {
|
||||
r = rng.next_usize() % len_mod;
|
||||
if opts.repeat {
|
||||
for _ in 0..opts.head_count {
|
||||
// Returns None is the slice is empty. We checked this before, so
|
||||
// this is safe.
|
||||
let r = input.choose(&mut rng).unwrap();
|
||||
|
||||
output
|
||||
.write_all(r)
|
||||
.map_err_context(|| "write failed".to_string())?;
|
||||
output
|
||||
.write_all(&[opts.sep])
|
||||
.map_err_context(|| "write failed".to_string())?;
|
||||
}
|
||||
|
||||
// write the randomly chosen value and the separator
|
||||
output
|
||||
.write_all(input[r])
|
||||
.map_err_context(|| "write failed".to_string())?;
|
||||
output
|
||||
.write_all(&[opts.sep])
|
||||
.map_err_context(|| "write failed".to_string())?;
|
||||
|
||||
// if we do not allow repeats, remove the chosen value from the input vector
|
||||
if !opts.repeat {
|
||||
// shrink the mask if we will drop below a power of 2
|
||||
if input.len() % 2 == 0 && len_mod > 2 {
|
||||
len_mod >>= 1;
|
||||
}
|
||||
input.swap_remove(r);
|
||||
} else {
|
||||
let (shuffled, _) = input.partial_shuffle(&mut rng, opts.head_count);
|
||||
for r in shuffled {
|
||||
output
|
||||
.write_all(r)
|
||||
.map_err_context(|| "write failed".to_string())?;
|
||||
output
|
||||
.write_all(&[opts.sep])
|
||||
.map_err_context(|| "write failed".to_string())?;
|
||||
}
|
||||
|
||||
count -= 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_range(input_range: &str) -> Result<(usize, usize), String> {
|
||||
let split: Vec<&str> = input_range.split('-').collect();
|
||||
if split.len() != 2 {
|
||||
Err(format!("invalid input range: {}", input_range.quote()))
|
||||
} else {
|
||||
let begin = split[0]
|
||||
if let Some((from, to)) = input_range.split_once('-') {
|
||||
let begin = from
|
||||
.parse::<usize>()
|
||||
.map_err(|_| format!("invalid input range: {}", split[0].quote()))?;
|
||||
let end = split[1]
|
||||
.map_err(|_| format!("invalid input range: {}", from.quote()))?;
|
||||
let end = to
|
||||
.parse::<usize>()
|
||||
.map_err(|_| format!("invalid input range: {}", split[1].quote()))?;
|
||||
.map_err(|_| format!("invalid input range: {}", to.quote()))?;
|
||||
Ok((begin, end + 1))
|
||||
} else {
|
||||
Err(format!("invalid input range: {}", input_range.quote()))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -311,11 +306,32 @@ enum WrappedRng {
|
|||
RngDefault(rand::rngs::ThreadRng),
|
||||
}
|
||||
|
||||
impl WrappedRng {
|
||||
fn next_usize(&mut self) -> usize {
|
||||
match *self {
|
||||
WrappedRng::RngFile(ref mut r) => r.gen(),
|
||||
WrappedRng::RngDefault(ref mut r) => r.gen(),
|
||||
impl RngCore for WrappedRng {
|
||||
fn next_u32(&mut self) -> u32 {
|
||||
match self {
|
||||
Self::RngFile(r) => r.next_u32(),
|
||||
Self::RngDefault(r) => r.next_u32(),
|
||||
}
|
||||
}
|
||||
|
||||
fn next_u64(&mut self) -> u64 {
|
||||
match self {
|
||||
Self::RngFile(r) => r.next_u64(),
|
||||
Self::RngDefault(r) => r.next_u64(),
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_bytes(&mut self, dest: &mut [u8]) {
|
||||
match self {
|
||||
Self::RngFile(r) => r.fill_bytes(dest),
|
||||
Self::RngDefault(r) => r.fill_bytes(dest),
|
||||
}
|
||||
}
|
||||
|
||||
fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), rand::Error> {
|
||||
match self {
|
||||
Self::RngFile(r) => r.try_fill_bytes(dest),
|
||||
Self::RngDefault(r) => r.try_fill_bytes(dest),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue