#![crate_name = "shuf"] #![feature(rustc_private)] /* * This file is part of the uutils coreutils package. * * (c) Arcterus * * For the full copyright and license information, please view the LICENSE * file that was distributed with this source code. */ extern crate getopts; extern crate libc; extern crate rand; use rand::read::ReadRng; use rand::{Rng, ThreadRng}; use std::fs::File; use std::io::{stdin, stdout, BufReader, BufWriter, Read, Write}; use std::usize::MAX as MAX_USIZE; #[path = "../common/util.rs"] #[macro_use] mod util; enum Mode { Default, Echo, InputRange((usize, usize)) } static NAME: &'static str = "shuf"; static VERSION: &'static str = "0.0.1"; pub fn uumain(args: Vec) -> i32 { let opts = [ getopts::optflag("e", "echo", "treat each ARG as an input line"), getopts::optopt("i", "input-range", "treat each number LO through HI as an input line", "LO-HI"), getopts::optopt("n", "head-count", "output at most COUNT lines", "COUNT"), getopts::optopt("o", "output", "write result to FILE instead of standard output", "FILE"), getopts::optopt("", "random-source", "get random bytes from FILE", "FILE"), getopts::optflag("r", "repeat", "output lines can be repeated"), getopts::optflag("z", "zero-terminated", "end lines with 0 byte, not newline"), getopts::optflag("h", "help", "display this help and exit"), getopts::optflag("V", "version", "output version information and exit") ]; let mut matches = match getopts::getopts(&args[1..], &opts) { Ok(m) => m, Err(f) => { crash!(1, "{}", f) } }; if matches.opt_present("help") { println!("{name} v{version} Usage: {prog} [OPTION]... [FILE] {prog} -e [OPTION]... [ARG]... {prog} -i LO-HI [OPTION]...\n {usage} With no FILE, or when FILE is -, read standard input.", name = NAME, version = VERSION, prog = &args[0][..], usage = getopts::usage("Write a random permutation of the input lines to standard output.", &opts)); } else if matches.opt_present("version") { println!("{} v{}", NAME, VERSION); } else { let echo = matches.opt_present("echo"); let mode = match matches.opt_str("input-range") { Some(range) => { if echo { show_error!("cannot specify more than one mode"); return 1; } match parse_range(range) { Ok(m) => Mode::InputRange(m), Err(msg) => { crash!(1, "{}", msg); }, } } None => { if echo { Mode::Echo } else { if matches.free.len() == 0 { matches.free.push("-".to_string()); } else if matches.free.len() > 1 { show_error!("extra operand '{}'", &matches.free[1][..]); } Mode::Default } } }; let repeat = matches.opt_present("repeat"); let sep = if matches.opt_present("zero-terminated") { 0x00 as u8 } else { 0x0a as u8 }; let count = match matches.opt_str("head-count") { Some(cnt) => match cnt.parse::() { Ok(val) => val, Err(e) => { show_error!("'{}' is not a valid count: {}", cnt, e); return 1; } }, None => MAX_USIZE, }; let output = matches.opt_str("output"); let random = matches.opt_str("random-source"); match mode { Mode::Echo => { // XXX: this doesn't correctly handle non-UTF-8 cmdline args let mut evec = matches.free.iter().map(|a| a.as_bytes()).collect::>(); find_seps(&mut evec, sep); shuf_bytes(&mut evec, repeat, count, sep, output, random); }, Mode::InputRange((b, e)) => { let rvec = (b..e).map(|x| format!("{}", x)).collect::>(); let mut rvec = rvec.iter().map(|a| a.as_bytes()).collect::>(); shuf_bytes(&mut rvec, repeat, count, sep, output, random); }, Mode::Default => { let fdata = read_input_file(&matches.free[0][..]); let mut fdata = vec!(&fdata[..]); find_seps(&mut fdata, sep); shuf_bytes(&mut fdata, repeat, count, sep, output, random); } } } 0 } fn read_input_file(filename: &str) -> Vec { let mut file = BufReader::new( if filename == "-" { Box::new(stdin()) as Box } else { match File::open(filename) { Ok(f) => Box::new(f) as Box, Err(e) => crash!(1, "failed to open '{}': {}", filename, e), } }); let mut data = Vec::new(); match file.read_to_end(&mut data) { Err(e) => crash!(1, "failed reading '{}': {}", filename, e), Ok(_) => (), }; data } fn find_seps(data: &mut Vec<&[u8]>, sep: u8) { // need to use for loop so we don't borrow the vector as we modify it in place // basic idea: // * We don't care about the order of the result. This lets us slice the slices // without making a new vector. // * Starting from the end of the vector, we examine each element. // * If that element contains the separator, we remove it from the vector, // and then sub-slice it into slices that do not contain the separator. // * We maintain the invariant throughout that each element in the vector past // the ith element does not have any separators remaining. for i in (0..data.len()).rev() { if data[i].contains(&sep) { let this = data.swap_remove(i); let mut p = 0; let mut i = 1; loop { if i == this.len() { break; } if this[i] == sep { data.push(&this[p..i]); p = i + 1; } i += 1; } if p < this.len() { data.push(&this[p..i]); } } } } fn shuf_bytes(input: &mut Vec<&[u8]>, repeat: bool, count: usize, sep: u8, output: Option, random: Option) { let mut output = BufWriter::new( match output { None => Box::new(stdout()) as Box, Some(s) => match File::create(&s[..]) { Ok(f) => Box::new(f) as Box, Err(e) => crash!(1, "failed to open '{}' for writing: {}", &s[..], e), }, }); let mut rng = match random { Some(r) => WrappedRng::RngFile(rand::read::ReadRng::new(match File::open(&r[..]) { Ok(f) => f, Err(e) => crash!(1, "failed to open random source '{}': {}", &r[..], e), })), None => WrappedRng::RngDefault(rand::thread_rng()), }; // we're generating a random usize. To keep things fair, we take this number mod ceil(log2(length+1)) let mut len_mod = 1; let mut len = input.len(); while len > 0 { len >>= 1; len_mod <<= 1; } drop(len); let mut count = count; while count > 0 && input.len() > 0 { let mut r = input.len(); while r >= input.len() { r = rng.next_usize() % len_mod; } // write the randomly chosen value and the separator output.write_all(input[r]).unwrap_or_else(|e| crash!(1, "write failed: {}", e)); output.write_all(&[sep]).unwrap_or_else(|e| crash!(1, "write failed: {}", e)); // if we do not allow repeats, remove the chosen value from the input vector if !repeat { // shrink the mask if we will drop below a power of 2 if input.len() % 2 == 0 && len_mod > 2 { len_mod >>= 1; } input.swap_remove(r); } count -= 1; } } fn parse_range(input_range: String) -> Result<(usize, usize), String> { let split: Vec<&str> = input_range.split('-').collect(); if split.len() != 2 { Err("invalid range format".to_string()) } else { let begin = match split[0].parse::() { Ok(m) => m, Err(e)=> return Err(format!("{} is not a valid number: {}", split[0], e)), }; let end = match split[1].parse::() { Ok(m) => m, Err(e)=> return Err(format!("{} is not a valid number: {}", split[1], e)), }; Ok((begin, end + 1)) } } enum WrappedRng { RngFile(rand::read::ReadRng), RngDefault(rand::ThreadRng), } impl WrappedRng { fn next_usize(&mut self) -> usize { match self { &mut WrappedRng::RngFile(ref mut r) => r.next_u32() as usize, &mut WrappedRng::RngDefault(ref mut r) => r.next_u32() as usize, } } }