mirror of
synced 2024-12-18 09:03:14 +00:00
597 lines
19 KiB
597 lines
19 KiB
#![crate_name = "uu_ptx"]
/*
 * This file is part of the uutils coreutils package.
 *
 * (c) Dorota Kapturkiewicz <dokaptur@gmail.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
extern crate aho_corasick;
extern crate getopts;
extern crate memchr;
extern crate regex;
extern crate regex_syntax;
extern crate uucore;
use getopts::{Matches, Options};
use regex::Regex;
use std::cmp;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::default::Default;
use std::fs::File;
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write};
// Program name; printed by `print_version`.
static NAME: &str = "ptx";
// Package version, taken from Cargo's `CARGO_PKG_VERSION` at compile time.
static VERSION: &str = env!("CARGO_PKG_VERSION");
/// Output formats the program can be asked to produce.
enum OutFormat {
    /// Initial value of a fresh `Config`; the traditional writer refuses it.
    Dumb,
    /// roff directives (selected by `-G` and `-O`).
    Roff,
    /// TeX directives (selected by `-T`).
    Tex,
}
struct Config {
format: OutFormat,
gnu_ext: bool,
auto_ref: bool,
input_ref: bool,
right_ref: bool,
ignore_case: bool,
macro_name: String,
trunc_str: String,
context_regex: String,
line_width: usize,
gap_size: usize,
impl Default for Config {
fn default() -> Config {
Config {
format: OutFormat::Dumb,
gnu_ext: true,
auto_ref: false,
input_ref: false,
right_ref: false,
ignore_case: false,
macro_name: "xx".to_owned(),
trunc_str: "/".to_owned(),
context_regex: "\\w+".to_owned(),
line_width: 72,
gap_size: 3,
fn read_word_filter_file(matches: &Matches, option: &str) -> HashSet<String> {
let filename = matches.opt_str(option).expect("parsing options failed!");
let reader = BufReader::new(crash_if_err!(1, File::open(filename)));
let mut words: HashSet<String> = HashSet::new();
for word in reader.lines() {
words.insert(crash_if_err!(1, word));
/// Rules deciding which matched words become index keywords.
struct WordFilter {
    /// `true` when an "only" word list (`-o`) was supplied.
    only_specified: bool,
    /// `true` when an "ignore" word list (`-i`) was supplied.
    ignore_specified: bool,
    /// Words read from the `-o` file; empty when `-o` is absent.
    only_set: HashSet<String>,
    /// Words read from the `-i` file; empty when `-i` is absent.
    ignore_set: HashSet<String>,
    /// Regular expression whose matches are the candidate keywords.
    word_regex: String,
}
impl WordFilter {
fn new(matches: &Matches, config: &Config) -> WordFilter {
let (o, oset): (bool, HashSet<String>) = if matches.opt_present("o") {
(true, read_word_filter_file(matches, "o"))
} else {
(false, HashSet::new())
let (i, iset): (bool, HashSet<String>) = if matches.opt_present("i") {
(true, read_word_filter_file(matches, "i"))
} else {
(false, HashSet::new())
if matches.opt_present("b") {
crash!(1, "-b not implemented yet");
let reg = if matches.opt_present("W") {
matches.opt_str("W").expect("parsing options failed!")
} else if config.gnu_ext {
} else {
"[^ \t\n]+".to_owned()
WordFilter {
only_specified: o,
ignore_specified: i,
only_set: oset,
ignore_set: iset,
word_regex: reg,
/// One keyword occurrence found in the input.
///
/// The derived ordering compares fields in declaration order, so a sorted
/// collection of `WordRef`s is ordered by `word` first, then by
/// `global_line_nr`, and so on.
#[derive(Debug, PartialOrd, PartialEq, Eq, Ord)]
struct WordRef {
    /// The keyword text.
    word: String,
    /// Zero-based line index counted across all input files.
    global_line_nr: usize,
    /// Zero-based line index within `filename`.
    local_line_nr: usize,
    /// Byte offset in the line where the keyword starts.
    position: usize,
    /// Byte offset in the line just past the keyword.
    position_end: usize,
    /// Name of the file the line came from.
    filename: String,
}
fn print_version() {
println!("{} {}", NAME, VERSION);
fn print_usage(opts: &Options) {
let brief = "Usage: ptx [OPTION]... [INPUT]... (without -G) or: \
ptx -G [OPTION]... [INPUT [OUTPUT]] \n Output a permuted index, \
including context, of the words in the input files. \n\n Mandatory \
arguments to long options are mandatory for short options too.";
let explaination = "With no FILE, or when FILE is -, read standard input. \
Default is '-F /'.";
println!("{}\n{}", opts.usage(&brief), explaination);
fn get_config(matches: &Matches) -> Config {
let mut config: Config = Default::default();
let err_msg = "parsing options failed";
if matches.opt_present("G") {
config.gnu_ext = false;
config.format = OutFormat::Roff;
config.context_regex = "[^ \t\n]+".to_owned();
} else {
crash!(1, "GNU extensions not implemented yet");
if matches.opt_present("S") {
crash!(1, "-S not implemented yet");
config.auto_ref = matches.opt_present("A");
config.input_ref = matches.opt_present("r");
config.right_ref &= matches.opt_present("R");
config.ignore_case = matches.opt_present("f");
if matches.opt_present("M") {
config.macro_name = matches.opt_str("M").expect(err_msg);
if matches.opt_present("F") {
config.trunc_str = matches.opt_str("F").expect(err_msg);
if matches.opt_present("w") {
let width_str = matches.opt_str("w").expect(err_msg);
config.line_width = crash_if_err!(1, usize::from_str_radix(&width_str, 10));
if matches.opt_present("g") {
let gap_str = matches.opt_str("g").expect(err_msg);
config.gap_size = crash_if_err!(1, usize::from_str_radix(&gap_str, 10));
if matches.opt_present("O") {
config.format = OutFormat::Roff;
if matches.opt_present("T") {
config.format = OutFormat::Tex;
/// Reads the input files into a map keyed by file name.
///
/// Each value pairs the file's lines with the number of lines that had been
/// read from earlier files when the entry was inserted (`lines_so_far`).
/// A file named "-" is special-cased when the reader is built. Open and read
/// failures are handed to `crash_if_err!` with code 1.
// NOTE(review): this listing has empty branch bodies (nothing is ever pushed
// onto `files`, no reader is produced for either arm) and no closing
// delimiters or return value — statements appear to have been lost; restore
// them from the upstream file before building.
fn read_input(input_files: &[String], config: &Config) -> HashMap<String, (Vec<String>, usize)> {
let mut file_map: HashMap<String, (Vec<String>, usize)> = HashMap::new();
let mut files = Vec::new();
// Decide which names to read: depends on whether any were given and on
// whether GNU extensions are enabled.
if input_files.is_empty() {
} else {
if config.gnu_ext {
for file in input_files {
} else {
// Running total of lines read so far, stored alongside each file's lines.
let mut lines_so_far: usize = 0;
for filename in files {
let reader: BufReader<Box<dyn Read>> = BufReader::new(if filename == "-" {
} else {
let file = crash_if_err!(1, File::open(filename));
let lines: Vec<String> = reader.lines().map(|x| crash_if_err!(1, x)).collect();
let size = lines.len();
file_map.insert(filename.to_owned(), (lines, lines_so_far));
lines_so_far += size
/// Collects keyword occurrences from every line of every file in `file_map`
/// into an ordered set.
///
/// Candidate keywords are the matches of `filter.word_regex`; the first match
/// of `config.context_regex` on a line marks that line's reference.
///
/// # Panics
///
/// Panics if either pattern is not a valid regular expression.
// NOTE(review): the bodies of the three filtering `if`s are empty in this
// listing, the `WordRef` literal has no `word` field, and closing delimiters
// and the return value are absent — statements appear to have been lost;
// confirm against the upstream file.
fn create_word_set(
config: &Config,
filter: &WordFilter,
file_map: &HashMap<String, (Vec<String>, usize)>,
) -> BTreeSet<WordRef> {
let reg = Regex::new(&filter.word_regex).unwrap();
let ref_reg = Regex::new(&config.context_regex).unwrap();
let mut word_set: BTreeSet<WordRef> = BTreeSet::new();
for (file, lines) in file_map.iter() {
// `count` is the zero-based line index within this file; `offs` is the
// line offset stored with the file's entry in `file_map`.
let mut count: usize = 0;
let offs = lines.1;
for line in &lines.0 {
// if -r, exclude reference from word set
let (ref_beg, ref_end) = match ref_reg.find(line) {
Some(x) => (x.start(), x.end()),
None => (0, 0),
// match words with given regex
for mat in reg.find_iter(line) {
let (beg, end) = (mat.start(), mat.end());
// With -r, a match that coincides exactly with the reference span is
// singled out here.
if config.input_ref && ((beg, end) == (ref_beg, ref_end)) {
let mut word = line[beg..end].to_owned();
// The only/ignore lists are consulted with the word as written, before
// any case folding below.
if filter.only_specified && !(filter.only_set.contains(&word)) {
if filter.ignore_specified && filter.ignore_set.contains(&word) {
if config.ignore_case {
word = word.to_lowercase();
word_set.insert(WordRef {
filename: file.clone(),
global_line_nr: offs + count,
local_line_nr: count,
position: beg,
position_end: end,
count += 1;
fn get_reference(config: &Config, word_ref: &WordRef, line: &str) -> String {
if config.auto_ref {
format!("{}:{}", word_ref.filename, word_ref.local_line_nr + 1)
} else if config.input_ref {
let reg = Regex::new(&config.context_regex).unwrap();
let (beg, end) = match reg.find(line) {
Some(x) => (x.start(), x.end()),
None => (0, 0),
} else {
/// Checks that `beg..end` is a well-formed range inside `s`.
///
/// # Panics
///
/// Panics when `beg > end` or when `end` is past the end of `s`.
fn assert_str_integrity(s: &[char], beg: usize, end: usize) {
    assert!(beg <= end, "range start {} is past range end {}", beg, end);
    assert!(
        end <= s.len(),
        "range end {} is past slice length {}",
        end,
        s.len()
    );
}
fn trim_broken_word_left(s: &[char], beg: usize, end: usize) -> usize {
assert_str_integrity(s, beg, end);
if beg == end || beg == 0 || s[beg].is_whitespace() || s[beg - 1].is_whitespace() {
return beg;
let mut b = beg;
while b < end && !s[b].is_whitespace() {
b += 1;
fn trim_broken_word_right(s: &[char], beg: usize, end: usize) -> usize {
assert_str_integrity(s, beg, end);
if beg == end || end == s.len() || s[end - 1].is_whitespace() || s[end].is_whitespace() {
return end;
let mut e = end;
while beg < e && !s[e - 1].is_whitespace() {
e -= 1;
fn trim_idx(s: &[char], beg: usize, end: usize) -> (usize, usize) {
assert_str_integrity(s, beg, end);
let mut b = beg;
let mut e = end;
while b < e && s[b].is_whitespace() {
b += 1;
while b < e && s[e - 1].is_whitespace() {
e -= 1;
(b, e)
fn get_output_chunks(
all_before: &str,
keyword: &str,
all_after: &str,
config: &Config,
) -> (String, String, String, String) {
assert_eq!(all_before.trim(), all_before);
assert_eq!(keyword.trim(), keyword);
assert_eq!(all_after.trim(), all_after);
let mut head = String::new();
let mut before = String::new();
let mut after = String::new();
let mut tail = String::new();
let half_line_size = cmp::max(
(config.line_width / 2) as isize - (2 * config.trunc_str.len()) as isize,
) as usize;
let max_after_size = cmp::max(half_line_size as isize - keyword.len() as isize - 1, 0) as usize;
let max_before_size = half_line_size;
let all_before_vec: Vec<char> = all_before.chars().collect();
let all_after_vec: Vec<char> = all_after.chars().collect();
// get before
let mut bb_tmp = cmp::max(all_before.len() as isize - max_before_size as isize, 0) as usize;
bb_tmp = trim_broken_word_left(&all_before_vec, bb_tmp, all_before.len());
let (before_beg, before_end) = trim_idx(&all_before_vec, bb_tmp, all_before.len());
assert!(max_before_size >= before.len());
// get after
let mut ae_tmp = cmp::min(max_after_size, all_after.len());
ae_tmp = trim_broken_word_right(&all_after_vec, 0, ae_tmp);
let (after_beg, after_end) = trim_idx(&all_after_vec, 0, ae_tmp);
assert!(max_after_size >= after.len());
// get tail
let max_tail_size = max_before_size - before.len();
let (tb, _) = trim_idx(&all_after_vec, after_end, all_after.len());
let mut te_tmp = cmp::min(tb + max_tail_size, all_after.len());
te_tmp = trim_broken_word_right(&all_after_vec, tb, te_tmp);
let (tail_beg, tail_end) = trim_idx(&all_after_vec, tb, te_tmp);
// get head
let max_head_size = max_after_size - after.len();
let (_, he) = trim_idx(&all_before_vec, 0, before_beg);
let mut hb_tmp = cmp::max(he as isize - max_head_size as isize, 0) as usize;
hb_tmp = trim_broken_word_left(&all_before_vec, hb_tmp, he);
let (head_beg, head_end) = trim_idx(&all_before_vec, hb_tmp, he);
// put right context truncation string if needed
if after_end != all_after.len() && tail_beg == tail_end {
} else if after_end != all_after.len() && tail_end != all_after.len() {
// put left context truncation string if needed
if before_beg != 0 && head_beg == head_end {
before = format!("{}{}", config.trunc_str, before);
} else if before_beg != 0 && head_beg != 0 {
head = format!("{}{}", config.trunc_str, head);
// add space before "after" if needed
if !after.is_empty() {
after = format!(" {}", after);
(tail, before, after, head)
/// Escapes one character for use in TeX output.
///
/// * `\` becomes `\backslash{}`,
/// * `$ % # & _` are prefixed with a backslash,
/// * `{` and `}` are backslash-escaped and wrapped in `$…$`,
/// * every other character is returned as is.
fn tex_mapper(x: char) -> String {
    match x {
        '\\' => "\\backslash{}".to_owned(),
        '$' | '%' | '#' | '&' | '_' => format!("\\{}", x),
        '}' | '{' => format!("$\\{}$", x),
        _ => x.to_string(),
    }
}
fn adjust_tex_str(context: &str) -> String {
let ws_reg = Regex::new(r"[\t\n\v\f\r ]").unwrap();
let mut fix: String = ws_reg.replace_all(context, " ").trim().to_owned();
let mapped_chunks: Vec<String> = fix.chars().map(tex_mapper).collect();
fix = mapped_chunks.join("");
/// Formats one index entry as a TeX macro invocation.
///
/// The line starts with `\<macro_name> `; the keyword and the text after it
/// are passed through `adjust_tex_str`, and the context is split into fields
/// with `get_output_chunks`. When `auto_ref` or `input_ref` is set, the
/// escaped reference is appended wrapped in braces.
// NOTE(review): in this listing both arms that compute `all_before` are
// incomplete, the format string for the five fields is absent, and the
// function has no closing delimiters or return value — statements appear to
// have been lost; restore them from the upstream file.
fn format_tex_line(config: &Config, word_ref: &WordRef, line: &str, reference: &str) -> String {
let mut output = String::new();
output.push_str(&format!("\\{} ", config.macro_name));
// Text preceding the keyword; with -r it is derived from `before`.
let all_before = if config.input_ref {
let before = &line[0..word_ref.position];
} else {
// `position` / `position_end` are byte offsets into `line`.
let keyword = adjust_tex_str(&line[word_ref.position..word_ref.position_end]);
let all_after = adjust_tex_str(&line[word_ref.position_end..line.len()]);
let (tail, before, after, head) = get_output_chunks(&all_before, &keyword, &all_after, &config);
// Arguments of the (missing) field format call; "{" and "}" are passed as
// values so the braces need no escaping in the format string.
tail, before, keyword, after, head, "{", "}"
if config.auto_ref || config.input_ref {
output.push_str(&format!("{}{}{}", "{", adjust_tex_str(&reference), "}"));
/// Prepares a piece of context for roff output.
///
/// Tab, newline, vertical tab, form feed and carriage return are each turned
/// into a space, every double quote is doubled (the fields are emitted inside
/// double quotes), and surrounding whitespace is trimmed.
fn adjust_roff_str(context: &str) -> String {
    // A fixed character class needs no regular expression; mapping characters
    // directly avoids compiling one on every call.
    let flattened: String = context
        .chars()
        .map(|c| match c {
            '\t' | '\n' | '\u{0B}' | '\u{0C}' | '\r' => ' ',
            other => other,
        })
        .collect();
    flattened.replace("\"", "\"\"").trim().to_owned()
}
/// Formats one index entry as a roff macro invocation.
///
/// The line starts with `.<macro_name>`, followed by four double-quoted
/// fields: tail, before, keyword immediately followed by after, and head.
/// When `auto_ref` or `input_ref` is set, the reference is appended as a
/// fifth quoted field. All text goes through `adjust_roff_str`.
// NOTE(review): in this listing both arms that compute `all_before` are
// incomplete, the call that consumes the field format string is absent, and
// the function has no closing delimiters or return value — statements appear
// to have been lost; restore them from the upstream file.
fn format_roff_line(config: &Config, word_ref: &WordRef, line: &str, reference: &str) -> String {
let mut output = String::new();
output.push_str(&format!(".{}", config.macro_name));
// Text preceding the keyword; with -r it is derived from `before`.
let all_before = if config.input_ref {
let before = &line[0..word_ref.position];
} else {
// `position` / `position_end` are byte offsets into `line`.
let keyword = adjust_roff_str(&line[word_ref.position..word_ref.position_end]);
let all_after = adjust_roff_str(&line[word_ref.position_end..line.len()]);
let (tail, before, after, head) = get_output_chunks(&all_before, &keyword, &all_after, &config);
// Keyword and `after` share the third quoted field.
" \"{}\" \"{}\" \"{}{}\" \"{}\"",
tail, before, keyword, after, head
if config.auto_ref || config.input_ref {
output.push_str(&format!(" \"{}\"", adjust_roff_str(&reference)));
/// Writes one formatted line per entry of `words` to `output_filename`.
///
/// The name "-" is special-cased when the writer is built; any other name is
/// created with `File::create`. Each entry is formatted according to
/// `config.format` (TeX or roff); the `Dumb` format is rejected through
/// `crash!`. File-creation and write failures are handed to `crash_if_err!`
/// with code 1.
///
/// # Panics
///
/// Panics with "Missing file in file map" when the lookup in `file_map`
/// yields nothing.
// NOTE(review): the writer's branch values, the lookup call on `file_map`
// and all closing delimiters are absent from this listing — statements appear
// to have been lost; restore them from the upstream file.
fn write_traditional_output(
config: &Config,
file_map: &HashMap<String, (Vec<String>, usize)>,
words: &BTreeSet<WordRef>,
output_filename: &str,
) {
let mut writer: BufWriter<Box<dyn Write>> = BufWriter::new(if output_filename == "-" {
} else {
let file = crash_if_err!(1, File::create(output_filename));
// `words` is a BTreeSet, so entries are visited in `WordRef` order.
for word_ref in words.iter() {
let file_map_value: &(Vec<String>, usize) = file_map
.expect("Missing file in file map");
let (ref lines, _) = *(file_map_value);
// `local_line_nr` indexes the line within its own file.
let reference = get_reference(config, word_ref, &lines[word_ref.local_line_nr]);
let output_line: String = match config.format {
OutFormat::Tex => {
format_tex_line(config, word_ref, &lines[word_ref.local_line_nr], &reference)
OutFormat::Roff => {
format_roff_line(config, word_ref, &lines[word_ref.local_line_nr], &reference)
OutFormat::Dumb => crash!(1, "There is no dumb format with GNU extensions disabled"),
crash_if_err!(1, writeln!(writer, "{}", output_line));
/// Program entry point.
///
/// Declares the command-line options, parses `args[1..]`, and returns 0 right
/// away for `--help` and `--version`. Otherwise it builds the configuration
/// and the word filter, reads the input, collects the keyword set and writes
/// the traditional-format output.
// NOTE(review): most option declarations survive only as their help strings,
// the help/version branches contain no print call, the `output_file`
// expression has empty arms, and closing delimiters and the final return
// value are absent — statements appear to have been lost; restore them from
// the upstream file.
pub fn uumain(args: Vec<String>) -> i32 {
let mut opts = Options::new();
"output automatically generated references",
opts.optflag("G", "traditional", "behave more like System V 'ptx'");
"use STRING for flagging line truncations",
"macro name to use instead of 'xx'",
opts.optflag("O", "format=roff", "generate output as roff directives");
"put references at right, not counted in -w",
"for end of lines or end of sentences",
opts.optflag("T", "format=tex", "generate output as TeX directives");
"use REGEXP to match each keyword",
"word break characters in this FILE",
"fold lower case to upper case for sorting",
"gap size in columns between output fields",
"read ignore word list from FILE",
"read only word list from this FILE",
opts.optflag("r", "references", "first field of each line is a reference");
"output width in columns, reference excluded",
opts.optflag("", "help", "display this help and exit");
opts.optflag("", "version", "output version information and exit");
// `args[0]` is skipped; a parse failure goes through `return_if_err!` with code 1.
let matches = return_if_err!(1, opts.parse(&args[1..]));
if matches.opt_present("help") {
return 0;
if matches.opt_present("version") {
return 0;
let config = get_config(&matches);
let word_filter = WordFilter::new(&matches, &config);
let file_map = read_input(&matches.free, &config);
let word_set = create_word_set(&config, &word_filter, &file_map);
// In traditional mode with exactly two free arguments the output file is
// chosen differently from the default case.
let output_file = if !config.gnu_ext && matches.free.len() == 2 {
} else {
write_traditional_output(&config, &file_map, &word_set, &output_file);