mirror of
https://github.com/uutils/coreutils
synced 2024-12-16 16:22:40 +00:00
f06d9fe130
Creates BufReader in buffer.rs. BufReader reads into a stack-allocated buffer and returns selected slices of that buffer, which does away with all dynamic allocations in the 'newline loop. 1.5 to 2.5 times more performant than the previous version. 1.5 to 2.0 times more performant than GNU.
555 lines
18 KiB
Rust
555 lines
18 KiB
Rust
#![crate_name = "cut"]
|
|
|
|
/*
|
|
* This file is part of the uutils coreutils package.
|
|
*
|
|
* (c) Rolf Morel <rolfmorel@gmail.com>
|
|
*
|
|
* For the full copyright and license information, please view the LICENSE
|
|
* file that was distributed with this source code.
|
|
*/
|
|
|
|
#![feature(macro_rules)]
|
|
|
|
extern crate getopts;
|
|
extern crate libc;
|
|
|
|
use std::io::{stdio, File, BufferedWriter, BufferedReader, print};
|
|
use getopts::{optopt, optflag, getopts, usage};
|
|
|
|
use ranges::Range;
|
|
|
|
#[path = "../common/util.rs"]
|
|
mod util;
|
|
mod ranges;
|
|
mod buffer;
|
|
|
|
/// Program name; printed together with `VERSION` for `--version`.
static NAME: &'static str = "cut";

/// Program version; printed together with `NAME` for `--version`.
static VERSION: &'static str = "1.0.0";
|
|
|
|
/// Settings used by the byte (`-b`) and character (`-c`) modes.
struct Options {
    /// Value of `--output-delimiter`, if one was given. When present it is
    /// written between consecutive selected ranges of a line.
    out_delim: Option<String>,
}
|
|
|
|
/// Settings used by the field (`-f`) mode.
struct FieldOptions {
    /// Input field delimiter. Exactly one character long; kept as a `String`
    /// because a single character may take several bytes in UTF-8.
    delimiter: String,
    /// Value of `--output-delimiter`, if one was given.
    out_delimeter: Option<String>,
    /// `-s`: when true, lines containing no delimiter are not printed.
    only_delimited: bool,
}
|
|
|
|
enum Mode {
|
|
Bytes(Vec<Range>, Options),
|
|
Characters(Vec<Range>, Options),
|
|
Fields(Vec<Range>, FieldOptions),
|
|
}
|
|
|
|
fn list_to_ranges(list: &str, complement: bool) -> Result<Vec<Range>, String> {
|
|
if complement {
|
|
Range::from_list(list).map(|r| ranges::complement(&r))
|
|
} else {
|
|
Range::from_list(list)
|
|
}
|
|
}
|
|
|
|
fn cut_bytes<R: Reader>(reader: R,
|
|
ranges: &Vec<Range>,
|
|
opts: &Options) -> int {
|
|
use buffer::Bytes::{Select, NewlineFound, Complete, Partial, EndOfFile};
|
|
|
|
let mut buf_read = buffer::BufReader::new(reader);
|
|
let mut out = BufferedWriter::new(stdio::stdout_raw());
|
|
|
|
'newline: loop {
|
|
let mut cur_pos = 1;
|
|
let mut print_delim = false;
|
|
|
|
for &Range { low: low, high: high } in ranges.iter() {
|
|
// skip upto low
|
|
let orig_pos = cur_pos;
|
|
loop {
|
|
match buf_read.select(low - cur_pos) {
|
|
NewlineFound(_) => {
|
|
out.write(&[b'\n']).unwrap();
|
|
continue 'newline
|
|
}
|
|
Complete(bytes) => {
|
|
cur_pos += bytes.len();
|
|
break
|
|
}
|
|
Partial(bytes) => cur_pos += bytes.len(),
|
|
EndOfFile => {
|
|
if orig_pos != cur_pos {
|
|
out.write(&[b'\n']).unwrap();
|
|
}
|
|
|
|
break 'newline
|
|
}
|
|
}
|
|
}
|
|
|
|
match opts.out_delim {
|
|
Some(ref delim) => {
|
|
if print_delim {
|
|
out.write(delim.as_bytes()).unwrap();
|
|
}
|
|
print_delim = true;
|
|
}
|
|
None => ()
|
|
}
|
|
|
|
// write out from low to high
|
|
loop {
|
|
match buf_read.select(high - cur_pos + 1) {
|
|
NewlineFound(bytes) => {
|
|
out.write(bytes).unwrap();
|
|
continue 'newline
|
|
}
|
|
Complete(bytes) => {
|
|
out.write(bytes).unwrap();
|
|
cur_pos = high + 1;
|
|
break
|
|
}
|
|
Partial(bytes) => {
|
|
cur_pos += bytes.len();
|
|
out.write(bytes).unwrap();
|
|
}
|
|
EndOfFile => {
|
|
if cur_pos != low || low == high {
|
|
out.write(&[b'\n']).unwrap();
|
|
}
|
|
|
|
break 'newline
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
buf_read.consume_line();
|
|
out.write([b'\n']).unwrap();
|
|
}
|
|
|
|
0
|
|
}
|
|
|
|
fn cut_characters<R: Reader>(reader: R,
|
|
ranges: &Vec<Range>,
|
|
opts: &Options) -> int {
|
|
let mut buf_in = BufferedReader::new(reader);
|
|
let mut out = BufferedWriter::new(stdio::stdout_raw());
|
|
|
|
'newline: loop {
|
|
let line = match buf_in.read_line() {
|
|
Ok(line) => line,
|
|
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
|
_ => fail!(),
|
|
};
|
|
|
|
let mut char_pos = 0;
|
|
let mut char_indices = line.as_slice().char_indices();
|
|
let mut print_delim = false;
|
|
|
|
for &Range { low: low, high: high } in ranges.iter() {
|
|
let low_idx = match char_indices.nth(low - char_pos - 1) {
|
|
Some((low_idx, _)) => low_idx,
|
|
None => break
|
|
};
|
|
|
|
match opts.out_delim {
|
|
Some(ref delim) => {
|
|
if print_delim {
|
|
out.write(delim.as_bytes()).unwrap();
|
|
}
|
|
print_delim = true;
|
|
}
|
|
None => ()
|
|
}
|
|
|
|
match char_indices.nth(high - low) {
|
|
Some((high_idx, _)) => {
|
|
let segment = line.as_bytes().slice(low_idx, high_idx);
|
|
|
|
out.write(segment).unwrap();
|
|
}
|
|
None => {
|
|
let bytes = line.as_bytes();
|
|
let segment = bytes.slice(low_idx, bytes.len());
|
|
|
|
out.write(segment).unwrap();
|
|
|
|
if line.as_bytes()[bytes.len() - 1] == b'\n' {
|
|
continue 'newline
|
|
}
|
|
}
|
|
}
|
|
|
|
char_pos = high + 1;
|
|
}
|
|
out.write(&[b'\n']).unwrap();
|
|
}
|
|
|
|
0
|
|
}
|
|
|
|
#[deriving(Clone)]
|
|
struct Searcher<'a> {
|
|
haystack: &'a [u8],
|
|
needle: &'a [u8],
|
|
position: uint
|
|
}
|
|
|
|
impl<'a> Searcher<'a> {
|
|
fn new(haystack: &'a [u8], needle: &'a [u8]) -> Searcher<'a> {
|
|
Searcher {
|
|
haystack: haystack,
|
|
needle: needle,
|
|
position: 0
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator<(uint, uint)> for Searcher<'a> {
|
|
fn next(&mut self) -> Option<(uint, uint)> {
|
|
if self.needle.len() == 1 {
|
|
for offset in range(self.position, self.haystack.len()) {
|
|
if self.haystack[offset] == self.needle[0] {
|
|
self.position = offset + 1;
|
|
return Some((offset, offset + 1));
|
|
}
|
|
}
|
|
|
|
self.position = self.haystack.len();
|
|
return None;
|
|
}
|
|
|
|
while self.position + self.needle.len() <= self.haystack.len() {
|
|
if self.haystack.slice(self.position,
|
|
self.position + self.needle.len()) == self.needle {
|
|
let match_pos = self.position;
|
|
self.position += self.needle.len();
|
|
return Some((match_pos, match_pos + self.needle.len()));
|
|
} else {
|
|
self.position += 1;
|
|
}
|
|
}
|
|
None
|
|
}
|
|
}
|
|
|
|
fn cut_fields_delimiter<R: Reader>(reader: R,
|
|
ranges: &Vec<Range>,
|
|
delim: &String,
|
|
only_delimited: bool,
|
|
out_delim: &String) -> int {
|
|
let mut buf_in = BufferedReader::new(reader);
|
|
let mut out = BufferedWriter::new(stdio::stdout_raw());
|
|
|
|
'newline: loop {
|
|
let line = match buf_in.read_until(b'\n') {
|
|
Ok(line) => line,
|
|
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
|
_ => fail!(),
|
|
};
|
|
|
|
let mut fields_pos = 1;
|
|
let mut low_idx = 0;
|
|
let mut delim_search = Searcher::new(line.as_slice(),
|
|
delim.as_bytes()).peekable();
|
|
let mut print_delim = false;
|
|
|
|
if delim_search.peek().is_none() {
|
|
if ! only_delimited {
|
|
out.write(line.as_slice()).unwrap();
|
|
if *line.get(line.len() - 1) != b'\n' {
|
|
out.write([b'\n']).unwrap();
|
|
}
|
|
}
|
|
|
|
continue
|
|
}
|
|
|
|
for &Range { low: low, high: high } in ranges.iter() {
|
|
if low - fields_pos > 0 {
|
|
low_idx = match delim_search.nth(low - fields_pos - 1) {
|
|
Some((_, beyond_delim)) => beyond_delim,
|
|
None => break
|
|
};
|
|
}
|
|
|
|
for _ in range(0, high - low + 1) {
|
|
if print_delim {
|
|
out.write_str(out_delim.as_slice()).unwrap();
|
|
}
|
|
|
|
match delim_search.next() {
|
|
Some((high_idx, next_low_idx)) => {
|
|
let segment = line.slice(low_idx, high_idx);
|
|
|
|
out.write(segment).unwrap();
|
|
|
|
print_delim = true;
|
|
|
|
low_idx = next_low_idx;
|
|
fields_pos = high + 1;
|
|
}
|
|
None => {
|
|
let segment = line.slice(low_idx, line.len());
|
|
|
|
out.write(segment).unwrap();
|
|
|
|
if *line.get(line.len() - 1) == b'\n' {
|
|
continue 'newline
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
out.write(&[b'\n']).unwrap();
|
|
}
|
|
|
|
0
|
|
}
|
|
|
|
fn cut_fields<R: Reader>(reader: R,
|
|
ranges: &Vec<Range>,
|
|
opts: &FieldOptions) -> int {
|
|
match opts.out_delimeter {
|
|
Some(ref delim) => {
|
|
return cut_fields_delimiter(reader, ranges, &opts.delimiter,
|
|
opts.only_delimited, delim);
|
|
}
|
|
None => ()
|
|
}
|
|
|
|
let mut buf_in = BufferedReader::new(reader);
|
|
let mut out = BufferedWriter::new(stdio::stdout_raw());
|
|
|
|
'newline: loop {
|
|
let line = match buf_in.read_until(b'\n') {
|
|
Ok(line) => line,
|
|
Err(std::io::IoError { kind: std::io::EndOfFile, .. }) => break,
|
|
_ => fail!(),
|
|
};
|
|
|
|
let mut fields_pos = 1;
|
|
let mut low_idx = 0;
|
|
let mut delim_search = Searcher::new(line.as_slice(),
|
|
opts.delimiter.as_bytes()).peekable();
|
|
let mut print_delim = false;
|
|
|
|
if delim_search.peek().is_none() {
|
|
if ! opts.only_delimited {
|
|
out.write(line.as_slice()).unwrap();
|
|
if *line.get(line.len() - 1) != b'\n' {
|
|
out.write([b'\n']).unwrap();
|
|
}
|
|
}
|
|
|
|
continue
|
|
}
|
|
|
|
for &Range { low: low, high: high } in ranges.iter() {
|
|
if low - fields_pos > 0 {
|
|
low_idx = match delim_search.nth(low - fields_pos - 1) {
|
|
Some((_, beyond_delim)) => beyond_delim,
|
|
None => break
|
|
};
|
|
}
|
|
|
|
if print_delim {
|
|
if low_idx >= opts.delimiter.as_bytes().len() {
|
|
low_idx -= opts.delimiter.as_bytes().len();
|
|
}
|
|
}
|
|
|
|
match delim_search.nth(high - low) {
|
|
Some((high_idx, next_low_idx)) => {
|
|
let segment = line.slice(low_idx, high_idx);
|
|
|
|
out.write(segment).unwrap();
|
|
|
|
print_delim = true;
|
|
low_idx = next_low_idx;
|
|
fields_pos = high + 1;
|
|
}
|
|
None => {
|
|
let segment = line.slice(low_idx, line.len());
|
|
|
|
out.write(segment).unwrap();
|
|
|
|
if *line.get(line.len() - 1) == b'\n' {
|
|
continue 'newline
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
out.write(&[b'\n']).unwrap();
|
|
}
|
|
|
|
0
|
|
}
|
|
|
|
fn cut_files(mut filenames: Vec<String>, mode: Mode) -> int {
|
|
let mut stdin_read = false;
|
|
let mut exit_code = 0;
|
|
|
|
if filenames.len() == 0 { filenames.push("-".to_string()); }
|
|
|
|
for filename in filenames.iter() {
|
|
if filename.as_slice() == "-" {
|
|
if stdin_read { continue }
|
|
|
|
exit_code |= match mode {
|
|
Bytes(ref ranges, ref opts) => {
|
|
cut_bytes(stdio::stdin_raw(), ranges, opts)
|
|
}
|
|
Characters(ref ranges, ref opts) => {
|
|
cut_characters(stdio::stdin_raw(), ranges, opts)
|
|
}
|
|
Fields(ref ranges, ref opts) => {
|
|
cut_fields(stdio::stdin_raw(), ranges, opts)
|
|
}
|
|
};
|
|
|
|
stdin_read = true;
|
|
} else {
|
|
let path = Path::new(filename.as_slice());
|
|
|
|
if ! path.exists() {
|
|
show_error!("{}: No such file or directory", filename);
|
|
continue
|
|
}
|
|
|
|
let file = match File::open(&path) {
|
|
Ok(f) => f,
|
|
Err(e) => {
|
|
show_error!("{}: {}", filename, e.desc);
|
|
continue
|
|
}
|
|
};
|
|
|
|
exit_code |= match mode {
|
|
Bytes(ref ranges, ref opts) => cut_bytes(file, ranges, opts),
|
|
Characters(ref ranges, ref opts) => {
|
|
cut_characters(file, ranges, opts)
|
|
}
|
|
Fields(ref ranges, ref opts) => cut_fields(file, ranges, opts)
|
|
};
|
|
}
|
|
}
|
|
|
|
exit_code
|
|
}
|
|
|
|
pub fn uumain(args: Vec<String>) -> int {
|
|
let opts = [
|
|
optopt("b", "bytes", "select only these bytes", "LIST"),
|
|
optopt("c", "characters", "select only these characters", "LIST"),
|
|
optopt("d", "delimiter", "use DELIM instead of TAB for field delimiter", "DELIM"),
|
|
optopt("f", "fields", "select only these fields; also print any line that contains no delimiter character, unless the -s option is specified", "LIST"),
|
|
optflag("n", "", "(ignored)"),
|
|
optflag("", "complement", "complement the set of selected bytes, characters or fields"),
|
|
optflag("s", "only-delimited", "do not print lines not containing delimiters"),
|
|
optopt("", "output-delimiter", "use STRING as the output delimiter the default is to use the input delimiter", "STRING"),
|
|
optflag("", "help", "display this help and exit"),
|
|
optflag("", "version", "output version information and exit"),
|
|
];
|
|
|
|
let matches = match getopts(args.tail(), opts) {
|
|
Ok(m) => m,
|
|
Err(f) => {
|
|
show_error!("Invalid options\n{}", f)
|
|
return 1;
|
|
}
|
|
};
|
|
|
|
if matches.opt_present("help") {
|
|
println!("Usage:");
|
|
println!(" {0} OPTION... [FILE]...", args.get(0));
|
|
println!("");
|
|
print(usage("Print selected parts of lines from each FILE to standard output.", opts).as_slice());
|
|
println!("");
|
|
println!("Use one, and only one of -b, -c or -f. Each LIST is made up of one");
|
|
println!("range, or many ranges separated by commas. Selected input is written");
|
|
println!("in the same order that it is read, and is written exactly once.");
|
|
println!("Each range is one of:");
|
|
println!("");
|
|
println!(" N N'th byte, character or field, counted from 1");
|
|
println!(" N- from N'th byte, character or field, to end of line");
|
|
println!(" N-M from N'th to M'th (included) byte, character or field");
|
|
println!(" -M from first to M'th (included) byte, character or field");
|
|
println!("");
|
|
println!("With no FILE, or when FILE is -, read standard input.");
|
|
return 0;
|
|
}
|
|
|
|
if matches.opt_present("version") {
|
|
println!("{} {}", NAME, VERSION);
|
|
return 0;
|
|
}
|
|
|
|
let complement = matches.opt_present("complement");
|
|
|
|
let mode_parse = match (matches.opt_str("bytes"),
|
|
matches.opt_str("characters"),
|
|
matches.opt_str("fields")) {
|
|
(Some(byte_ranges), None, None) => {
|
|
list_to_ranges(byte_ranges.as_slice(), complement).map(|ranges|
|
|
Bytes(ranges,
|
|
Options { out_delim: matches.opt_str("output-delimiter") })
|
|
)
|
|
}
|
|
(None, Some(char_ranges), None) => {
|
|
list_to_ranges(char_ranges.as_slice(), complement).map(|ranges|
|
|
Characters(ranges,
|
|
Options { out_delim: matches.opt_str("output-delimiter") })
|
|
)
|
|
}
|
|
(None, None, Some(field_ranges)) => {
|
|
list_to_ranges(field_ranges.as_slice(), complement).and_then(|ranges|
|
|
{
|
|
let out_delim = matches.opt_str("output-delimiter");
|
|
let only_delimited = matches.opt_present("only-delimited");
|
|
|
|
match matches.opt_str("delimiter") {
|
|
Some(delim) => {
|
|
if delim.as_slice().char_len() != 1 {
|
|
Err("the delimiter must be a single character".to_string())
|
|
} else {
|
|
Ok(Fields(ranges,
|
|
FieldOptions {
|
|
delimiter: delim,
|
|
out_delimeter: out_delim,
|
|
only_delimited: only_delimited
|
|
}))
|
|
}
|
|
}
|
|
None => Ok(Fields(ranges,
|
|
FieldOptions {
|
|
delimiter: "\t".to_string(),
|
|
out_delimeter: out_delim,
|
|
only_delimited: only_delimited
|
|
}))
|
|
}
|
|
}
|
|
)
|
|
}
|
|
(ref b, ref c, ref f) if b.is_some() || c.is_some() || f.is_some() => {
|
|
Err("only one type of list may be specified".to_string())
|
|
}
|
|
_ => Err("you must specify a list of bytes, characters, or fields".to_string())
|
|
};
|
|
|
|
match mode_parse {
|
|
Ok(mode) => cut_files(matches.free, mode),
|
|
Err(err_msg) => {
|
|
show_error!("{}\n\
|
|
Try '{} --help' for more information",
|
|
err_msg, args.get(0));
|
|
1
|
|
}
|
|
}
|
|
}
|