coreutils/src/shuf/shuf.rs

272 lines
9.1 KiB
Rust
Raw Normal View History

2014-07-10 02:11:19 +00:00
#![crate_name = "shuf"]
2015-05-01 00:43:26 +00:00
#![feature(rustc_private)]
2014-07-10 01:19:59 +00:00
/*
* This file is part of the uutils coreutils package.
*
* (c) Arcterus <arcterus@mail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
extern crate getopts;
extern crate libc;
2015-05-01 00:43:26 +00:00
extern crate rand;
2014-07-10 01:19:59 +00:00
2015-05-01 00:43:26 +00:00
use rand::read::ReadRng;
use rand::{Rng, ThreadRng};
use std::fs::File;
use std::io::{stdin, stdout, BufReader, BufWriter, Read, Write};
use std::usize::MAX as MAX_USIZE;
2014-07-10 01:19:59 +00:00
#[path = "../common/util.rs"]
2015-01-08 12:54:22 +00:00
#[macro_use]
2014-07-10 01:19:59 +00:00
mod util;
/// Where the lines to be shuffled come from.
enum Mode {
    /// Read the lines from the single FILE operand (`-` meaning standard input).
    Default,
    /// Treat every free command-line operand as one input line (`-e`).
    Echo,
    /// Generate the numbers of the half-open range `(start, end_exclusive)` (`-i LO-HI`).
    InputRange((usize, usize)),
}
/// Name of the utility, printed in the `--help` and `--version` output.
static NAME: &'static str = "shuf";
/// Version string printed in the `--help` and `--version` output.
static VERSION: &'static str = "0.0.1";
pub fn uumain(args: Vec<String>) -> i32 {
2014-07-10 01:19:59 +00:00
let opts = [
getopts::optflag("e", "echo", "treat each ARG as an input line"),
getopts::optopt("i", "input-range", "treat each number LO through HI as an input line", "LO-HI"),
getopts::optopt("n", "head-count", "output at most COUNT lines", "COUNT"),
getopts::optopt("o", "output", "write result to FILE instead of standard output", "FILE"),
getopts::optopt("", "random-source", "get random bytes from FILE", "FILE"),
getopts::optflag("r", "repeat", "output lines can be repeated"),
getopts::optflag("z", "zero-terminated", "end lines with 0 byte, not newline"),
getopts::optflag("h", "help", "display this help and exit"),
getopts::optflag("V", "version", "output version information and exit")
];
2015-05-01 00:43:26 +00:00
let mut matches = match getopts::getopts(&args[1..], &opts) {
2014-07-10 01:19:59 +00:00
Ok(m) => m,
Err(f) => {
crash!(1, "{}", f)
}
};
if matches.opt_present("help") {
println!("{name} v{version}
Usage:
{prog} [OPTION]... [FILE]
{prog} -e [OPTION]... [ARG]...
{prog} -i LO-HI [OPTION]...\n
{usage}
With no FILE, or when FILE is -, read standard input.",
2015-05-01 00:43:26 +00:00
name = NAME, version = VERSION, prog = &args[0][..],
2014-11-19 20:55:25 +00:00
usage = getopts::usage("Write a random permutation of the input lines to standard output.", &opts));
2014-07-10 01:19:59 +00:00
} else if matches.opt_present("version") {
println!("{} v{}", NAME, VERSION);
} else {
let echo = matches.opt_present("echo");
let mode = match matches.opt_str("input-range") {
Some(range) => {
if echo {
show_error!("cannot specify more than one mode");
return 1;
}
match parse_range(range) {
2014-11-19 20:50:37 +00:00
Ok(m) => Mode::InputRange(m),
2015-05-01 00:43:26 +00:00
Err(msg) => {
crash!(1, "{}", msg);
},
2014-07-10 01:19:59 +00:00
}
}
2014-07-10 02:30:38 +00:00
None => {
if echo {
2014-11-19 20:50:37 +00:00
Mode::Echo
2014-07-10 02:30:38 +00:00
} else {
if matches.free.len() == 0 {
matches.free.push("-".to_string());
2015-05-01 00:43:26 +00:00
} else if matches.free.len() > 1 {
show_error!("extra operand '{}'", &matches.free[1][..]);
2014-07-10 02:30:38 +00:00
}
2014-11-19 20:50:37 +00:00
Mode::Default
2014-07-10 02:30:38 +00:00
}
}
2014-07-10 01:19:59 +00:00
};
let repeat = matches.opt_present("repeat");
2015-05-01 00:43:26 +00:00
let sep = if matches.opt_present("zero-terminated") {
0x00 as u8
} else {
0x0a as u8
};
2014-07-10 01:19:59 +00:00
let count = match matches.opt_str("head-count") {
2015-01-10 18:07:08 +00:00
Some(cnt) => match cnt.parse::<usize>() {
Ok(val) => val,
Err(e) => {
show_error!("'{}' is not a valid count: {}", cnt, e);
2014-07-10 01:19:59 +00:00
return 1;
}
},
2015-05-01 00:43:26 +00:00
None => MAX_USIZE,
2014-07-10 01:19:59 +00:00
};
let output = matches.opt_str("output");
let random = matches.opt_str("random-source");
2015-05-01 00:43:26 +00:00
match mode {
Mode::Echo => {
// XXX: this doesn't correctly handle non-UTF-8 cmdline args
let mut evec = matches.free.iter().map(|a| a.as_bytes()).collect::<Vec<&[u8]>>();
find_seps(&mut evec, sep);
shuf_bytes(&mut evec, repeat, count, sep, output, random);
},
Mode::InputRange((b, e)) => {
let rvec = (b..e).map(|x| format!("{}", x)).collect::<Vec<String>>();
let mut rvec = rvec.iter().map(|a| a.as_bytes()).collect::<Vec<&[u8]>>();
shuf_bytes(&mut rvec, repeat, count, sep, output, random);
2014-07-10 01:19:59 +00:00
},
2015-05-01 00:43:26 +00:00
Mode::Default => {
let fdata = read_input_file(&matches.free[0][..]);
let mut fdata = vec!(&fdata[..]);
find_seps(&mut fdata, sep);
shuf_bytes(&mut fdata, repeat, count, sep, output, random);
}
2014-07-10 01:19:59 +00:00
}
}
0
}
2015-05-01 00:43:26 +00:00
/// Reads the complete contents of `filename` into memory.
///
/// The name `-` selects standard input instead of a file on disk. A failure
/// to open or read the input is reported through `crash!` with status 1.
fn read_input_file(filename: &str) -> Vec<u8> {
    // Pick the byte source first, then wrap it in a buffered reader.
    let source = if filename == "-" {
        Box::new(stdin()) as Box<Read>
    } else {
        match File::open(filename) {
            Ok(handle) => Box::new(handle) as Box<Read>,
            Err(e) => crash!(1, "failed to open '{}': {}", filename, e),
        }
    };
    let mut reader = BufReader::new(source);

    let mut contents = Vec::new();
    if let Err(e) = reader.read_to_end(&mut contents) {
        crash!(1, "failed reading '{}': {}", filename, e);
    }
    contents
}
2015-05-01 00:43:26 +00:00
/// Splits every element of `data` that contains `sep` into separator-free lines, in place.
///
/// * `data` - byte slices to split; on return no element contains `sep`.
///   The order of the elements is NOT preserved, which is fine because the
///   caller shuffles them anyway.
/// * `sep`  - the line terminator byte.
///
/// Empty lines are kept: a separator at the very start of an element or two
/// adjacent separators each yield an empty slice. A separator at the very end
/// of an element terminates the last line and does not add a trailing empty one.
fn find_seps(data: &mut Vec<&[u8]>, sep: u8) {
    // Walk the indices from the back. `swap_remove` moves the last element
    // into the hole and the new pieces are appended at the end, so everything
    // past the current index is always already separator-free.
    for idx in (0..data.len()).rev() {
        if !data[idx].contains(&sep) {
            continue;
        }
        let chunk = data.swap_remove(idx);
        let mut pieces = chunk.split(|&byte| byte == sep);
        if chunk.last() == Some(&sep) {
            // `split` reports an empty slice after a terminating separator;
            // that is not an extra line, so discard it.
            pieces.next_back();
        }
        data.extend(pieces);
    }
}
2015-05-01 00:43:26 +00:00
fn shuf_bytes(input: &mut Vec<&[u8]>, repeat: bool, count: usize, sep: u8, output: Option<String>, random: Option<String>) {
let mut output = BufWriter::new(
match output {
None => Box::new(stdout()) as Box<Write>,
Some(s) => match File::create(&s[..]) {
Ok(f) => Box::new(f) as Box<Write>,
Err(e) => crash!(1, "failed to open '{}' for writing: {}", &s[..], e),
},
});
2014-07-10 01:19:59 +00:00
let mut rng = match random {
2015-05-01 00:43:26 +00:00
Some(r) => WrappedRng::RngFile(rand::read::ReadRng::new(match File::open(&r[..]) {
Ok(f) => f,
Err(e) => crash!(1, "failed to open random source '{}': {}", &r[..], e),
})),
None => WrappedRng::RngDefault(rand::thread_rng()),
2014-07-10 01:19:59 +00:00
};
2015-05-01 00:43:26 +00:00
// we're generating a random usize. To keep things fair, we take this number mod ceil(log2(length+1))
let mut len_mod = 1;
let mut len = input.len();
while len > 0 {
len >>= 1;
len_mod <<= 1;
}
drop(len);
let mut count = count;
while count > 0 && input.len() > 0 {
let mut r = input.len();
while r >= input.len() {
r = rng.next_usize() % len_mod;
}
// write the randomly chosen value and the separator
output.write_all(input[r]).unwrap_or_else(|e| crash!(1, "write failed: {}", e));
output.write_all(&[sep]).unwrap_or_else(|e| crash!(1, "write failed: {}", e));
// if we do not allow repeats, remove the chosen value from the input vector
2014-07-10 01:19:59 +00:00
if !repeat {
2015-05-01 00:43:26 +00:00
// shrink the mask if we will drop below a power of 2
if input.len() % 2 == 0 && len_mod > 2 {
len_mod >>= 1;
}
input.swap_remove(r);
2014-07-10 01:19:59 +00:00
}
2015-05-01 00:43:26 +00:00
count -= 1;
2014-07-10 01:19:59 +00:00
}
}
2015-05-01 00:43:26 +00:00
/// Parses a `LO-HI` range specification.
///
/// On success returns `(LO, HI + 1)`, i.e. the inclusive range given by the
/// user converted to a half-open one. `LO` may be at most `HI + 1`, in which
/// case the range is empty.
///
/// Returns a human-readable message as `Err` when
/// * the input does not consist of exactly two `-`-separated fields,
/// * either field is not a valid `usize`,
/// * `HI + 1` does not fit in a `usize`, or
/// * `LO` is greater than `HI + 1` (a reversed range).
fn parse_range(input_range: String) -> Result<(usize, usize), String> {
    let split: Vec<&str> = input_range.split('-').collect();
    if split.len() != 2 {
        return Err("invalid range format".to_string());
    }

    let begin = match split[0].parse::<usize>() {
        Ok(m) => m,
        Err(e) => return Err(format!("{} is not a valid number: {}", split[0], e)),
    };
    let end = match split[1].parse::<usize>() {
        Ok(m) => m,
        Err(e) => return Err(format!("{} is not a valid number: {}", split[1], e)),
    };

    // The caller iterates `begin..end_exclusive`; a plain `end + 1` would
    // overflow when HI is the largest representable value.
    let end_exclusive = match end.checked_add(1) {
        Some(bound) => bound,
        None => return Err(format!("{} is too large to be the end of a range", split[1])),
    };
    if begin > end_exclusive {
        return Err(format!("invalid input range '{}'", input_range));
    }

    Ok((begin, end_exclusive))
}
/// The random number source used while shuffling.
enum WrappedRng {
    /// Random bytes read from a file.
    RngFile(rand::read::ReadRng<File>),
    /// The `rand` crate's thread RNG.
    RngDefault(rand::ThreadRng),
}

impl WrappedRng {
    /// Draws the next random value from whichever generator is wrapped.
    ///
    /// The value is produced by `next_u32` and then widened, so it carries at
    /// most 32 bits of randomness even where `usize` is larger.
    fn next_usize(&mut self) -> usize {
        let drawn = match *self {
            WrappedRng::RngFile(ref mut file_rng) => file_rng.next_u32(),
            WrappedRng::RngDefault(ref mut thread_rng) => thread_rng.next_u32(),
        };
        drawn as usize
    }
}