coreutils/src/ptx/ptx.rs
Joseph Crail e9b008cf70 Remove unstable features from ptx.
I cleaned up string references, whitespace, and use of unstable
features. I also added a comment about reverting to connect, making
others aware that the method should be replaced by join after 1.3.
2015-08-12 00:01:10 -04:00

566 lines
19 KiB
Rust

#![crate_name = "ptx"]
/*
* This file is part of the uutils coreutils package.
*
* (c) Dorota Kapturkiewicz <dokaptur@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
extern crate aho_corasick;
extern crate getopts;
extern crate memchr;
extern crate regex_syntax;
extern crate regex;
use std::collections::{HashMap, HashSet, BTreeSet};
use std::default::Default;
use std::fs::File;
use getopts::{Options, Matches};
use std::io::{stdin, stdout, BufReader, BufWriter, BufRead, Read, Write};
use regex::Regex;
use std::cmp;
#[path = "../common/util.rs"]
#[macro_use]
mod util;
// Program name printed by `print_version`.
static NAME: &'static str = "ptx";
// Version string printed by `print_version`.
static VERSION: &'static str = "1.0.0";
/// Output format of the generated index lines.
#[derive(Debug)]
enum OutFormat {
// Default value of `Config::format`; `write_traditional_output` aborts on it.
Dumb,
// roff directives, produced by `format_roff_line` (selected by `-G` or `-O`).
Roff,
// TeX directives, produced by `format_tex_line` (selected by `-T`).
Tex,
}
/// Run-time settings derived from the command line (see `get_config`).
#[derive(Debug)]
struct Config {
// Which formatter `write_traditional_output` dispatches to.
format : OutFormat,
// True unless `-G` (traditional mode) was given.
gnu_ext : bool,
// `-A`: references are generated as "filename:line".
auto_ref : bool,
// `-r`: the first match of `context_regex` on each line is its reference.
input_ref : bool,
// Associated with the `-R` (right-side-refs) option.
right_ref : bool,
// `-f`: keywords are lower-cased before being inserted into the word set.
ignore_case : bool,
// `-M`: macro name emitted at the start of every output line (default "xx").
macro_name : String,
// `-F`: string used to flag truncated context (default "/").
trunc_str : String,
// Regex whose first match on a line is treated as the input reference.
context_regex : String,
// `-w`: output width; half of it bounds each side of the keyword.
line_width : usize,
// `-g`: gap size in columns between output fields.
gap_size : usize,
}
impl Default for Config {
    /// Returns the settings in effect before any command-line option is
    /// applied: dumb format, GNU extensions on, no references, macro `xx`,
    /// truncation flag `/`, width 72 and gap 3.
    fn default() -> Config {
        Config {
            // Formatting.
            format: OutFormat::Dumb,
            macro_name: String::from("xx"),
            trunc_str: String::from("/"),
            line_width: 72,
            gap_size: 3,
            // Word and reference detection.
            gnu_ext: true,
            context_regex: String::from("\\w+"),
            ignore_case: false,
            // Reference handling.
            auto_ref: false,
            input_ref: false,
            right_ref: false,
        }
    }
}
/// Reads the word list named by the argument of `option`, one word per line.
///
/// Terminates the program with exit code 1 if the file cannot be opened or a
/// line cannot be read.
fn read_word_filter_file(matches: &Matches, option: &str) -> HashSet<String> {
    let path = matches.opt_str(option).expect("parsing options failed!");
    let file = crash_if_err!(1, File::open(path));
    // Every line becomes one entry; duplicates collapse in the set.
    BufReader::new(file)
        .lines()
        .map(|entry| crash_if_err!(1, entry))
        .collect()
}
/// Keyword selection rules built by `WordFilter::new`.
#[derive(Debug)]
struct WordFilter {
// True when `-o` was given: only words in `only_set` are indexed.
only_specified: bool,
// True when `-i` was given: words in `ignore_set` are skipped.
ignore_specified: bool,
// Words read from the `-o` file (empty when the option is absent).
only_set: HashSet<String>,
// Words read from the `-i` file (empty when the option is absent).
ignore_set: HashSet<String>,
// Regex used to find keywords in each input line.
word_regex: String,
}
impl WordFilter {
    /// Builds the keyword filter from the parsed options.
    ///
    /// The `-o` list is loaded before the `-i` list, and `-b` aborts the
    /// program because it is not implemented. The keyword regex comes from
    /// `-W` when given, otherwise it depends on `config.gnu_ext`.
    fn new(matches: &Matches, config: &Config) -> WordFilter {
        // Returns whether `flag` was given together with the words of its file.
        let load = |flag: &str| -> (bool, HashSet<String>) {
            if matches.opt_present(flag) {
                (true, read_word_filter_file(matches, flag))
            } else {
                (false, HashSet::new())
            }
        };
        let (only_specified, only_set) = load("o");
        let (ignore_specified, ignore_set) = load("i");

        if matches.opt_present("b") {
            crash!(1, "-b not implemented yet");
        }

        let word_regex = if matches.opt_present("W") {
            matches.opt_str("W").expect("parsing options failed!")
        } else if config.gnu_ext {
            "\\w+".to_string()
        } else {
            "[^ \t\n]+".to_string()
        };

        WordFilter {
            only_specified: only_specified,
            ignore_specified: ignore_specified,
            only_set: only_set,
            ignore_set: ignore_set,
            word_regex: word_regex,
        }
    }
}
/// One keyword occurrence in the input.
///
/// The derived ordering compares fields in declaration order, so a sorted
/// collection of `WordRef`s is ordered by `word` first, then by
/// `global_line_nr`, and so on.
#[derive(Debug, PartialOrd, PartialEq, Eq, Ord)]
struct WordRef {
// The keyword text (lower-cased when case is ignored).
word: String,
// Line index counted across all input files read so far.
global_line_nr: usize,
// Zero-based line index inside `filename`.
local_line_nr: usize,
// Start offset of the keyword within its line, as reported by the word regex.
position: usize,
// End offset (exclusive) of the keyword within its line.
position_end: usize,
// Key of the containing file in the file map.
filename: String,
}
/// Prints the program name and version on one line to standard output.
fn print_version() {
    println!("{name} {version}", name = NAME, version = VERSION);
}
/// Prints the usage summary generated from `opts`, followed by a note about
/// standard input and the default truncation flag.
fn print_usage(opts: &Options) {
    let explanation = "With no FILE, or when FILE is -, read standard input. \
        Default is '-F /'.";
    let brief = "Usage: ptx [OPTION]... [INPUT]... (without -G) or: \
        ptx -G [OPTION]... [INPUT [OUTPUT]] \n Output a permuted index, \
        including context, of the words in the input files. \n\n Mandatory \
        arguments to long options are mandatory for short options too.";
    let usage = opts.usage(brief);
    println!("{}\n{}", usage, explanation);
}
/// Builds the run configuration from the parsed command line.
///
/// Only traditional mode is implemented: without `-G` the program aborts.
/// `-S` aborts as well. `-w` and `-g` must be decimal unsigned integers;
/// a value that fails to parse terminates the program with exit code 1.
/// `-O` and `-T` override the output format, with `-T` taking precedence
/// because it is applied last.
fn get_config(matches: &Matches) -> Config {
    let mut config: Config = Default::default();
    let err_msg = "parsing options failed";
    if matches.opt_present("G") {
        config.gnu_ext = false;
        config.format = OutFormat::Roff;
        config.context_regex = "[^ \t\n]+".to_string();
    } else {
        crash!(1, "GNU extensions not implemented yet");
    }
    if matches.opt_present("S") {
        crash!(1, "-S not implemented yet");
    }
    config.auto_ref = matches.opt_present("A");
    config.input_ref = matches.opt_present("r");
    // Plain assignment: the default is `false`, so and-assigning the flag
    // would make it impossible for `-R` to ever enable right-side references.
    config.right_ref = matches.opt_present("R");
    config.ignore_case = matches.opt_present("f");
    if matches.opt_present("M") {
        config.macro_name = matches.opt_str("M").expect(err_msg);
    }
    if matches.opt_present("F") {
        config.trunc_str = matches.opt_str("F").expect(err_msg);
    }
    if matches.opt_present("w") {
        let width_str = matches.opt_str("w").expect(err_msg);
        config.line_width = crash_if_err!(
            1, usize::from_str_radix(&width_str, 10));
    }
    if matches.opt_present("g") {
        let gap_str = matches.opt_str("g").expect(err_msg);
        config.gap_size = crash_if_err!(
            1, usize::from_str_radix(&gap_str, 10));
    }
    if matches.opt_present("O") {
        config.format = OutFormat::Roff;
    }
    if matches.opt_present("T") {
        config.format = OutFormat::Tex;
    }
    config
}
/// Reads every selected input into memory.
///
/// Returns a map from file name to `(lines, offset)`, where `offset` is the
/// number of lines read from the files processed before it. With no input
/// files, standard input (`-`) is read. When GNU extensions are disabled only
/// the first name is treated as input. Any I/O error terminates the program
/// with exit code 1.
fn read_input(input_files: &Vec<String>, config: &Config) ->
    HashMap<String, (Vec<String>, usize)> {
    let names: Vec<&str> = if input_files.is_empty() {
        vec!["-"]
    } else if config.gnu_ext {
        input_files.iter().map(|name| &name[..]).collect()
    } else {
        vec![&input_files[0][..]]
    };

    let mut file_map = HashMap::new();
    let mut line_offset: usize = 0;
    for name in names {
        let source: Box<Read> = if name == "-" {
            Box::new(stdin())
        } else {
            Box::new(crash_if_err!(1, File::open(name)))
        };
        let lines: Vec<String> = BufReader::new(source)
            .lines()
            .map(|text| crash_if_err!(1, text))
            .collect();
        let line_count = lines.len();
        file_map.insert(name.to_string(), (lines, line_offset));
        line_offset += line_count;
    }
    file_map
}
/// Collects every keyword occurrence from the loaded files into an ordered set.
///
/// A match of the word regex is skipped when it coincides with the line's
/// input reference (only with `-r`), when an "only" list is active and does
/// not contain it, or when an "ignore" list is active and contains it. The
/// lists are consulted with the word as written; lower-casing for `-f`
/// happens afterwards.
///
/// Panics if either regex fails to compile.
fn create_word_set(config: &Config, filter: &WordFilter,
                   file_map: &HashMap<String, (Vec<String>, usize)>)->
    BTreeSet<WordRef> {
    let word_re = Regex::new(&filter.word_regex).unwrap();
    let reference_re = Regex::new(&config.context_regex).unwrap();
    let mut word_set: BTreeSet<WordRef> = BTreeSet::new();
    for (file, &(ref lines, offset)) in file_map.iter() {
        for (line_nr, line) in lines.iter().enumerate() {
            // Span of the reference on this line; (0, 0) when there is none.
            let reference_span = reference_re.find(line).unwrap_or((0, 0));
            for span in word_re.find_iter(line) {
                if config.input_ref && span == reference_span {
                    continue;
                }
                let (beg, end) = span;
                let raw_word = &line[beg .. end];
                let not_in_only_list =
                    filter.only_specified && !filter.only_set.contains(raw_word);
                let in_ignore_list =
                    filter.ignore_specified && filter.ignore_set.contains(raw_word);
                if not_in_only_list || in_ignore_list {
                    continue;
                }
                let word = if config.ignore_case {
                    raw_word.to_lowercase()
                } else {
                    raw_word.to_string()
                };
                word_set.insert(WordRef {
                    word: word,
                    global_line_nr: offset + line_nr,
                    local_line_nr: line_nr,
                    position: beg,
                    position_end: end,
                    filename: file.clone(),
                });
            }
        }
    }
    word_set
}
/// Returns the reference text for one keyword occurrence.
///
/// With `auto_ref` the reference is `filename:line` (line numbers start at 1).
/// Otherwise, with `input_ref`, it is the first match of the context regex on
/// `line`, or an empty string when nothing matches. With neither option the
/// reference is empty.
///
/// Panics if the context regex fails to compile.
fn get_reference(config: &Config, word_ref: &WordRef, line: &String) ->
    String {
    if config.auto_ref {
        return format!("{}:{}", word_ref.filename, word_ref.local_line_nr + 1);
    }
    if !config.input_ref {
        return String::new();
    }
    let reference_re = Regex::new(&config.context_regex).unwrap();
    match reference_re.find(line) {
        Some((beg, end)) => line[beg .. end].to_string(),
        None => String::new(),
    }
}
/// Panics unless `beg <= end <= s.len()`, i.e. unless `beg .. end` is a valid
/// index range into `s`.
fn assert_str_integrity(s: &Vec<char>, beg: usize, end: usize) {
assert!(beg <= end);
assert!(end <= s.len());
}
/// Moves the left edge of the window `beg .. end` past a word that the edge
/// cuts in half.
///
/// If `beg` already sits on a word boundary (empty window, start of `s`, or
/// whitespace on either side of the edge) it is returned unchanged. Otherwise
/// the result is the index of the first whitespace character inside the
/// window, or `end` when the window contains none.
///
/// Panics if `beg .. end` is not a valid range into `s`.
fn trim_broken_word_left(s: &Vec<char>, beg: usize, end: usize) -> usize {
    assert_str_integrity(s, beg, end);
    let on_boundary = beg == end
        || beg == 0
        || s[beg].is_whitespace()
        || s[beg - 1].is_whitespace();
    if on_boundary {
        return beg;
    }
    match s[beg .. end].iter().position(|c| c.is_whitespace()) {
        Some(offset) => beg + offset,
        None => end,
    }
}
/// Moves the right edge of the window `beg .. end` back before a word that
/// the edge cuts in half.
///
/// If `end` already sits on a word boundary (empty window, end of `s`, or
/// whitespace on either side of the edge) it is returned unchanged. Otherwise
/// the result is the index just after the last whitespace character inside
/// the window, or `beg` when the window contains none.
///
/// Panics if `beg .. end` is not a valid range into `s`.
fn trim_broken_word_right(s: &Vec<char>, beg: usize, end: usize) -> usize {
    assert_str_integrity(s, beg, end);
    let on_boundary = beg == end
        || end == s.len()
        || s[end - 1].is_whitespace()
        || s[end].is_whitespace();
    if on_boundary {
        return end;
    }
    match s[beg .. end].iter().rposition(|c| c.is_whitespace()) {
        Some(offset) => beg + offset + 1,
        None => beg,
    }
}
/// Shrinks the window `beg .. end` so that it neither starts nor ends with
/// whitespace, and returns the new `(begin, end)` pair.
///
/// A window made only of whitespace collapses to `(end, end)`.
///
/// Panics if `beg .. end` is not a valid range into `s`.
fn trim_idx(s: &Vec<char>, beg: usize, end: usize) -> (usize, usize) {
    assert_str_integrity(s, beg, end);
    let window = &s[beg .. end];
    let leading = window.iter()
        .take_while(|c| c.is_whitespace())
        .count();
    // Count trailing whitespace only in what is left after the leading run,
    // so an all-whitespace window is not trimmed twice.
    let trailing = window[leading ..].iter()
        .rev()
        .take_while(|c| c.is_whitespace())
        .count();
    (beg + leading, end - trailing)
}
/// Splits the context around a keyword into the four output fields and
/// returns them as `(tail, before, after, head)`.
///
/// `before` and `after` are the context immediately left and right of the
/// keyword. `tail` is further right context that fills the space `before`
/// left unused, and `head` is further left context that fills the space
/// `after` left unused. Each side is limited to half of `config.line_width`
/// minus room for two truncation strings; the right side also reserves room
/// for the keyword and one space. Words are never split, and the truncation
/// string is attached wherever context had to be dropped. A non-empty
/// `after` is returned with one leading space.
///
/// All sizes and indices are counted in characters, so non-ASCII context is
/// handled without slicing at the wrong offsets.
///
/// Panics if any of the three input strings has leading or trailing
/// whitespace.
fn get_output_chunks(all_before: &String, keyword: &String, all_after: &String,
                     config: &Config) -> (String, String, String, String) {
    assert_eq!(all_before.trim().to_string(), *all_before);
    assert_eq!(keyword.trim().to_string(), *keyword);
    assert_eq!(all_after.trim().to_string(), *all_after);

    // Work on character vectors throughout: the trimming helpers index by
    // character, so byte lengths must not be mixed in.
    let all_before_vec: Vec<char> = all_before.chars().collect();
    let all_after_vec: Vec<char> = all_after.chars().collect();
    let before_len = all_before_vec.len();
    let after_len = all_after_vec.len();
    let keyword_len = keyword.chars().count();
    let trunc_len = config.trunc_str.chars().count();
    let to_string = |chars: &[char]| -> String {
        chars.iter().cloned().collect()
    };

    let half_line_size = (config.line_width / 2).saturating_sub(2 * trunc_len);
    let max_before_size = half_line_size;
    let max_after_size = half_line_size
        .saturating_sub(keyword_len)
        .saturating_sub(1);

    // get before
    let bb_tmp = before_len.saturating_sub(max_before_size);
    let bb_tmp = trim_broken_word_left(&all_before_vec, bb_tmp, before_len);
    let (before_beg, before_end) =
        trim_idx(&all_before_vec, bb_tmp, before_len);
    let mut before = to_string(&all_before_vec[before_beg .. before_end]);
    let before_size = before_end - before_beg;
    assert!(max_before_size >= before_size);

    // get after
    let ae_tmp = cmp::min(max_after_size, after_len);
    let ae_tmp = trim_broken_word_right(&all_after_vec, 0, ae_tmp);
    let (after_beg, after_end) = trim_idx(&all_after_vec, 0, ae_tmp);
    let mut after = to_string(&all_after_vec[after_beg .. after_end]);
    let after_size = after_end - after_beg;
    assert!(max_after_size >= after_size);

    // get tail: right context that did not fit in `after`, limited to the
    // space `before` left unused
    let max_tail_size = max_before_size - before_size;
    let (tb, _) = trim_idx(&all_after_vec, after_end, after_len);
    let te_tmp = cmp::min(tb + max_tail_size, after_len);
    let te_tmp = trim_broken_word_right(&all_after_vec, tb, te_tmp);
    let (tail_beg, tail_end) = trim_idx(&all_after_vec, tb, te_tmp);
    let mut tail = to_string(&all_after_vec[tail_beg .. tail_end]);

    // get head: left context that did not fit in `before`, limited to the
    // space `after` left unused
    let max_head_size = max_after_size - after_size;
    let (_, he) = trim_idx(&all_before_vec, 0, before_beg);
    let hb_tmp = he.saturating_sub(max_head_size);
    let hb_tmp = trim_broken_word_left(&all_before_vec, hb_tmp, he);
    let (head_beg, head_end) = trim_idx(&all_before_vec, hb_tmp, he);
    let mut head = to_string(&all_before_vec[head_beg .. head_end]);

    // put right context truncation string if needed
    if after_end != after_len && tail_beg == tail_end {
        after.push_str(&config.trunc_str);
    } else if after_end != after_len && tail_end != after_len {
        tail.push_str(&config.trunc_str);
    }

    // put left context truncation string if needed
    if before_beg != 0 && head_beg == head_end {
        before = format!("{}{}", config.trunc_str, before);
    } else if before_beg != 0 && head_beg != 0 {
        head = format!("{}{}", config.trunc_str, head);
    }

    // add space before "after" if needed
    if !after.is_empty() {
        after = format!(" {}", after);
    }

    (tail, before, after, head)
}
/// Escapes a single character for TeX output.
///
/// A backslash becomes `\backslash{}`, the characters `$ % # & _` get a
/// leading backslash, braces are wrapped as `$\{$` / `$\}$`, and every other
/// character is returned unchanged.
fn tex_mapper(x: char) -> String {
    let mut escaped = String::new();
    match x {
        '\\' => escaped.push_str("\\backslash{}"),
        '{' | '}' => {
            escaped.push_str("$\\");
            escaped.push(x);
            escaped.push('$');
        }
        '$' | '%' | '#' | '&' | '_' => {
            escaped.push('\\');
            escaped.push(x);
        }
        _ => escaped.push(x),
    }
    escaped
}
fn adjust_tex_str(context: &str) -> String {
let ws_reg = Regex::new(r"[\t\n\v\f\r ]").unwrap();
let mut fix: String = ws_reg.replace_all(context, " ").trim().to_string();
let mapped_chunks: Vec<String> = fix.chars().map(tex_mapper).collect();
// NB: Using deprecated connect() until Rust 1.3 becomes stable.
// When 1.3 is released, replace connect() with join().
fix = mapped_chunks.connect("");
fix
}
/// Formats one keyword occurrence as a TeX directive.
///
/// The line has the shape `\MACRO {tail}{before}{keyword}{after}{head}`,
/// followed by `{reference}` when automatic or input references are enabled.
/// With input references, the reference text is stripped from the start of
/// the left context so it is not repeated there.
fn format_tex_line(config: &Config, word_ref: &WordRef, line: &String,
                   reference: &String) -> String {
    let left_context = &line[0 .. word_ref.position];
    let all_before = if config.input_ref {
        adjust_tex_str(left_context.trim().trim_left_matches(reference))
    } else {
        adjust_tex_str(left_context)
    };
    let keyword = adjust_tex_str(
        &line[word_ref.position .. word_ref.position_end]);
    let all_after = adjust_tex_str(
        &line[word_ref.position_end .. line.len()]);
    let (tail, before, after, head) =
        get_output_chunks(&all_before, &keyword, &all_after, config);

    let mut output = format!("\\{} ", config.macro_name);
    let fields = format!("{5}{0}{6}{5}{1}{6}{5}{2}{6}{5}{3}{6}{5}{4}{6}",
                         tail, before, keyword, after, head, "{", "}");
    output.push_str(&fields);
    if config.auto_ref || config.input_ref {
        let reference_field =
            format!("{}{}{}", "{", adjust_tex_str(reference), "}");
        output.push_str(&reference_field);
    }
    output
}
/// Prepares a piece of context for roff output.
///
/// Tab, newline, vertical tab, form feed and carriage return are each
/// replaced by a space, every double quote is doubled, and surrounding
/// whitespace is trimmed.
fn adjust_roff_str(context: &str) -> String {
    // The set of characters is fixed, so a plain character mapping replaces
    // the regex that used to be compiled on every call.
    let spaced: String = context.chars()
        .map(|c| match c {
            '\t' | '\n' | '\x0B' | '\x0C' | '\r' => ' ',
            other => other,
        })
        .collect();
    spaced.replace("\"", "\"\"").trim().to_string()
}
/// Formats one keyword occurrence as a roff directive.
///
/// The line has the shape `.MACRO "tail" "before" "keyword after" "head"`,
/// followed by ` "reference"` when automatic or input references are enabled.
/// With input references, the reference text is stripped from the start of
/// the left context so it is not repeated there.
fn format_roff_line(config: &Config, word_ref: &WordRef, line: &str,
                    reference: &str) -> String {
    let left_context = &line[0 .. word_ref.position];
    let all_before = if config.input_ref {
        adjust_roff_str(left_context.trim().trim_left_matches(reference))
    } else {
        adjust_roff_str(left_context)
    };
    let keyword = adjust_roff_str(
        &line[word_ref.position .. word_ref.position_end]);
    let all_after = adjust_roff_str(
        &line[word_ref.position_end .. line.len()]);
    let (tail, before, after, head) =
        get_output_chunks(&all_before, &keyword, &all_after, config);

    let mut output = format!(".{}", config.macro_name);
    let fields = format!(" \"{}\" \"{}\" \"{}{}\" \"{}\"",
                         tail, before, keyword, after, head);
    output.push_str(&fields);
    if config.auto_ref || config.input_ref {
        let reference_field = format!(" \"{}\"", adjust_roff_str(reference));
        output.push_str(&reference_field);
    }
    output
}
/// Writes one formatted index line per keyword occurrence, in the order of
/// `words`, to `output_filename` (`-` means standard output).
///
/// The program terminates with exit code 1 when the output file cannot be
/// created, when a write or the final flush fails, or when the configured
/// format is `OutFormat::Dumb`.
///
/// Panics if a `WordRef` names a file that is missing from `file_map`.
fn write_traditional_output(config: &Config,
                            file_map: &HashMap<String, (Vec<String>,usize)>,
                            words: &BTreeSet<WordRef>, output_filename: &String) {
    let mut writer: BufWriter<Box<Write>> = BufWriter::new(
        if output_filename == "-" {
            Box::new(stdout())
        } else {
            let file = crash_if_err!(1, File::create(output_filename));
            Box::new(file)
        });
    for word_ref in words.iter() {
        let file_map_value : &(Vec<String>, usize) =
            file_map.get(&(word_ref.filename))
                    .expect("Missing file in file map");
        let (ref lines, _) = *(file_map_value);
        // Look the source line up once; it feeds both the reference and the
        // formatter.
        let line = &lines[word_ref.local_line_nr];
        let reference = get_reference(config, word_ref, line);
        let output_line: String = match config.format {
            OutFormat::Tex => format_tex_line(
                config, word_ref, line, &reference),
            OutFormat::Roff => format_roff_line(
                config, word_ref, line, &reference),
            OutFormat::Dumb => crash!(
                1, "There is no dumb format with GNU extensions disabled")
        };
        crash_if_err!(1, writeln!(writer, "{}", output_line));
    }
    // Flush explicitly: dropping a BufWriter also flushes, but it discards
    // any error, which would hide a failed final write.
    crash_if_err!(1, writer.flush());
}
/// Entry point of the `ptx` utility.
///
/// Parses `args` (the first element is skipped as the program name), handles
/// `--help` and `--version`, then reads the input, builds the keyword set and
/// writes the index. In traditional mode a second free argument names the
/// output file; otherwise output goes to standard output. Returns the
/// process exit code.
pub fn uumain(args: Vec<String>) -> i32 {
    // (short name, long name, description, argument hint); entries with a
    // hint take a value, the others are plain flags. The order is the order
    // of registration.
    let option_specs: [(&str, &str, &str, Option<&str>); 18] = [
        ("A", "auto-reference",
         "output automatically generated references", None),
        ("G", "traditional", "behave more like System V 'ptx'", None),
        ("F", "flag-truncation",
         "use STRING for flagging line truncations", Some("STRING")),
        ("M", "macro-name", "macro name to use instead of 'xx'",
         Some("STRING")),
        ("O", "format=roff", "generate output as roff directives", None),
        ("R", "right-side-refs",
         "put references at right, not counted in -w", None),
        ("S", "sentence-regexp", "for end of lines or end of sentences",
         Some("REGEXP")),
        ("T", "format=tex", "generate output as TeX directives", None),
        ("W", "word-regexp", "use REGEXP to match each keyword",
         Some("REGEXP")),
        ("b", "break-file", "word break characters in this FILE",
         Some("FILE")),
        ("f", "ignore-case",
         "fold lower case to upper case for sorting", None),
        ("g", "gap-size", "gap size in columns between output fields",
         Some("NUMBER")),
        ("i", "ignore-file", "read ignore word list from FILE", Some("FILE")),
        ("o", "only-file", "read only word list from this FILE",
         Some("FILE")),
        ("r", "references", "first field of each line is a reference", None),
        ("w", "width", "output width in columns, reference excluded",
         Some("NUMBER")),
        ("", "help", "display this help and exit", None),
        ("", "version", "output version information and exit", None),
    ];

    let mut opts = Options::new();
    for &(short, long, desc, hint) in option_specs.iter() {
        match hint {
            Some(hint) => { opts.optopt(short, long, desc, hint); }
            None => { opts.optflag(short, long, desc); }
        }
    }

    let matches = return_if_err!(1, opts.parse(&args[1..]));

    if matches.opt_present("help") {
        print_usage(&opts);
        return 0;
    }
    if matches.opt_present("version") {
        print_version();
        return 0;
    }

    let config = get_config(&matches);
    let word_filter = WordFilter::new(&matches, &config);
    let file_map = read_input(&matches.free, &config);
    let word_set = create_word_set(&config, &word_filter, &file_map);

    let writes_to_named_file = !config.gnu_ext && matches.free.len() == 2;
    let output_file = if writes_to_named_file {
        matches.free[1].clone()
    } else {
        "-".to_string()
    };
    write_traditional_output(&config, &file_map, &word_set, &output_file);
    0
}