coreutils/fmt/linebreak.rs
2014-06-30 19:27:54 -04:00

450 lines
17 KiB
Rust

/*
* This file is part of `fmt` from the uutils coreutils package.
*
* (c) kwantam <kwantam@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
use FmtOptions;
use parasplit::{Paragraph, ParaWords, WordInfo};
use std::i64;
use std::cmp;
use std::mem;
use std::num;
// Per-paragraph state shared by the line-breaking routines in this file.
struct BreakArgs<'a> {
// formatting options (this file reads width, goal and tabwidth from it)
opts : &'a FmtOptions,
// length already used on the first output line: the init/indent prefix
// plus the first word, both of which break_lines writes before breaking starts
init_len : uint,
// indent written after every newline emitted by write_newline
indent_str : &'a str,
// length attributed to indent_str; used as the starting length of each new line
indent_len : uint,
// forwarded to compute_slen as its `uniform` flag
uniform : bool,
// destination for all formatted output
ostream : &'a mut Box<Writer>
}
impl<'a> BreakArgs<'a> {
    // Returns the width that callers add to `winfo.word_nchars` when the word is
    // placed at column `posn`.
    //
    // * `fresh` - when true the result is always 0.
    // * no `before_tab` - the result is just `winfo.after_tab`.
    // * `before_tab = Some(pre)` - the result is `after_tab` plus the distance from
    //   `posn` to the first multiple of `opts.tabwidth` strictly greater than `pre + posn`.
    //
    // NOTE(review): before_tab/after_tab are presumably character counts on either
    // side of a tab inside the word's leading whitespace - confirm against parasplit.
    #[inline(always)]
    fn compute_width<'b>(&self, winfo: &WordInfo<'b>, posn: uint, fresh: bool) -> uint {
        if fresh {
            return 0;
        }
        let tail = winfo.after_tab;
        match winfo.before_tab {
            Some(head) => {
                let tab = self.opts.tabwidth;
                // column of the tab stop that the tab advances to
                let next_stop = ((head + posn) / tab + 1) * tab;
                tail + next_stop - posn
            }
            None => tail
        }
    }
}
// Formats one paragraph and writes it to `ostream`.
//
// The first word is always emitted on the first line, preceded by the paragraph's
// init string (crown/tagged modes), by its normal indent, or by nothing at all for
// mail headers. The remaining words are handed to the greedy breaker when
// `opts.quick` is set or the paragraph is a mail header, and to the Knuth-Plass
// breaker otherwise. A paragraph without any words produces a single newline.
pub fn break_lines(para: &Paragraph, opts: &FmtOptions, ostream: &mut Box<Writer>) {
    let indent = para.indent_str.as_slice();
    let indent_len = para.indent_len;

    let para_words = ParaWords::new(opts, para);
    let mut words = para_words.words();

    // pull off the first word; it never participates in the break search
    let (first_word, first_len) = match words.next() {
        None => {
            silent_unwrap!(ostream.write_char('\n'));
            return;
        }
        Some(winfo) => (winfo.word, winfo.word_nchars)
    };

    // emit whatever precedes the first word and remember how long it was
    let prefix_len =
        if opts.crown || opts.tagged {
            // crown and tagged paragraphs have a distinct first-line "init"
            silent_unwrap!(ostream.write(para.init_str.as_bytes()));
            para.init_len
        } else if para.mail_header {
            // mail headers get no indent at all
            0
        } else {
            // everything else starts with the ordinary indent
            silent_unwrap!(ostream.write(indent.as_bytes()));
            indent_len
        };

    // the first word goes right after the prefix
    silent_unwrap!(ostream.write(first_word.as_bytes()));

    let mut break_args = BreakArgs {
        opts : opts,
        init_len : first_len + prefix_len,
        indent_str : indent,
        indent_len : indent_len,
        // mail headers are always uniformly spaced
        uniform : para.mail_header || opts.uniform,
        ostream : ostream
    };

    let greedy = opts.quick || para.mail_header;
    if !greedy {
        break_knuth_plass(words, &mut break_args);
    } else {
        break_simple(words, &mut break_args);
    }
}
// break_simple is the "greedy" breaker: each word is appended to the current line
// unless that would exceed the maximum width, in which case a newline and the
// indent are written first. The running state threaded through the words is
// (current line length, whether the previous word ended in punctuation), starting
// from the length already consumed on the first line. A final newline ends the paragraph.
fn break_simple<'a, T: Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) {
    let mut state = (args.init_len, false);
    for winfo in iter {
        state = accum_words_simple(args, state, winfo);
    }
    silent_unwrap!(args.ostream.write_char('\n'));
}
// One step of the greedy breaker: writes `winfo` either on the current line or at
// the start of a new one, and returns the updated (line length, ends-in-punctuation)
// state for the next word.
#[inline(always)]
fn accum_words_simple<'a>(args: &mut BreakArgs<'a>, (l, prev_punct): (uint, bool), winfo: &'a WordInfo<'a>) -> (uint, bool) {
    // spaces that would separate this word from the previous one
    let gap = compute_slen(args.uniform, winfo.new_line, winfo.sentence_start, prev_punct);
    // width of the word itself, accounting for tab expansion at column `l`
    let word_width = winfo.word_nchars + args.compute_width(winfo, l, false);
    // line length if the word stays on the current line
    let extended = l + word_width + gap;

    let line_len =
        if extended <= args.opts.width {
            write_with_spaces(winfo.word, gap, args.ostream);
            extended
        } else {
            // does not fit: start a new line and drop the word's leading part
            write_newline(args.indent_str, args.ostream);
            write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream);
            args.indent_len + winfo.word_nchars
        };
    (line_len, winfo.ends_punct)
}
// break_knuth_plass implements an "optimal" breaking algorithm in the style of
// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software,
// Practice and Experience. Vol. 11, No. 11, November 1981.
// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf
//
// The work is done in two passes over the same words: a clone of `iter` is given
// to find_kp_breakpoints to choose the breaks, then `iter` itself is walked to
// write the words, inserting a newline plus indent at each chosen break.
// A final newline terminates the paragraph.
fn break_knuth_plass<'a, T: Clone + Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) {
// run the algorithm to get the breakpoints
let breakpoints = find_kp_breakpoints(iter.clone(), args);
// iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it)
// the fold state is (did the previous word end in punctuation, are we at the start of a fresh line)
let (mut prev_punct, mut fresh) =
breakpoints.iter().rev().fold((false, false), |(mut prev_punct, mut fresh), &(next_break, break_before)| {
// the previous breakpoint was a break *after* its word, so the newline is still owed
if fresh {
write_newline(args.indent_str, args.ostream);
}
// at each breakpoint, keep emitting words until we find the word matching this breakpoint
for winfo in iter {
// on a fresh line the leading part of the word is sliced off and no spaces are added
let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform,
winfo.new_line, winfo.sentence_start, prev_punct);
fresh = false;
prev_punct = winfo.ends_punct;
// We find identical breakpoints here by comparing addresses of the references.
// This is OK because the backing vector is not mutating once we are linebreaking.
if winfo as *const _ == next_break as *const _ {
// OK, we found the matching word
if break_before {
// the break goes in front of this word: newline first, then the word without its leading part
write_newline(args.indent_str, args.ostream);
write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream);
} else {
// breaking after this word, so that means "fresh" is true for the next iteration
write_with_spaces(word, slen, args.ostream);
fresh = true;
}
break;
} else {
write_with_spaces(word, slen, args.ostream);
}
}
(prev_punct, fresh)
});
// after the last linebreak, write out the rest of the final line.
for winfo in iter {
// only the first remaining word can see fresh == true, so at most one newline is written here
if fresh {
write_newline(args.indent_str, args.ostream);
}
let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform,
winfo.new_line, winfo.sentence_start, prev_punct);
prev_punct = winfo.ends_punct;
fresh = false;
write_with_spaces(word, slen, args.ostream);
}
silent_unwrap!(args.ostream.write_char('\n'));
}
// One node in the chain of candidate linebreaks built by find_kp_breakpoints.
// Nodes live in a single vector and refer to their predecessor by index.
struct LineBreak<'a> {
// index (into the vector of linebreaks) of the break that precedes this one
prev : uint,
// the word at which this break occurs; None only for the initial null break
linebreak : Option<&'a WordInfo<'a>>,
// true if the newline goes before `linebreak`, false if it goes after it
break_before : bool,
// accumulated demerits of the path ending at this break
demerits : i64,
// stretch ratio recorded for the line that this break ends
prev_rat : f32,
// length of the line currently being built after this break
length : uint,
// true until the first word has been placed on the line after this break
fresh : bool
}
fn find_kp_breakpoints<'a, T: Iterator<&'a WordInfo<'a>>>(iter: T, args: &BreakArgs<'a>) -> Vec<(&'a WordInfo<'a>, bool)> {
let mut iter = iter.peekable();
// set up the initial null linebreak
let mut linebreaks = vec!(LineBreak {
prev : 0,
linebreak : None,
break_before : false,
demerits : 0,
prev_rat : 0.0f32,
length : args.init_len,
fresh : false
});
// this vec holds the current active linebreaks; next_ holds the breaks that will be active for the next word
let active_breaks = &mut vec!(0);
let next_active_breaks = &mut vec!();
let stretch = (args.opts.width - args.opts.goal) as int;
let minlength = args.opts.goal - stretch as uint;
let mut new_linebreaks = vec!();
let mut is_sentence_start = false;
let mut least_demerits = 0;
loop {
let w =
match iter.next() {
None => break,
Some(w) => w
};
// if this is the last word, we don't add additional demerits for this break
let (is_last_word, is_sentence_end) =
match iter.peek() {
None => (true, true),
Some(&&WordInfo { sentence_start: st, new_line: nl, .. }) => (false, st || (nl && w.ends_punct))
};
// should we be adding extra space at the beginning of the next sentence?
let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false);
let mut ld_new = i64::MAX;
let mut ld_next = i64::MAX;
let mut ld_idx = 0;
new_linebreaks.clear();
next_active_breaks.clear();
// go through each active break, extending it and possibly adding a new active
// break if we are above the minimum required length
for &i in active_breaks.iter() {
let active = linebreaks.get_mut(i);
// normalize demerits to avoid overflow, and record if this is the least
active.demerits -= least_demerits;
if active.demerits < ld_next {
ld_next = active.demerits;
ld_idx = i;
}
// get the new length
let tlen = w.word_nchars + args.compute_width(w, active.length, active.fresh) + slen + active.length;
// if tlen is longer than args.opts.width, we drop this break from the active list
// otherwise, we extend the break, and possibly add a new break at this point
if tlen <= args.opts.width {
// this break will still be active next time
next_active_breaks.push(i);
// we can put this word on this line
active.fresh = false;
active.length = tlen;
// if we're above the minlength, we can also consider breaking here
if tlen >= minlength {
let (new_demerits, new_ratio) =
if is_last_word {
// there is no penalty for the final line's length
(0, 0.0)
} else {
compute_demerits((args.opts.goal - tlen) as int, stretch, w.word_nchars as int, active.prev_rat)
};
// do not even consider adding a line that has too many demerits
// also, try to detect overflow by checking signum
let total_demerits = new_demerits + active.demerits;
if new_demerits < BAD_INFTY_SQ && total_demerits < ld_new && num::signum(active.demerits) <= num::signum(new_demerits) {
ld_new = total_demerits;
new_linebreaks.push(LineBreak {
prev : i,
linebreak : Some(w),
break_before : false,
demerits : total_demerits,
prev_rat : new_ratio,
length : args.indent_len,
fresh : true
});
}
}
}
}
// if we generated any new linebreaks, add the last one to the list
// the last one is always the best because we don't add to new_linebreaks unless
// it's better than the best one so far
match new_linebreaks.pop() {
None => (),
Some(lb) => {
next_active_breaks.push(linebreaks.len());
linebreaks.push(lb);
}
}
if next_active_breaks.is_empty() {
// every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx
let new_break = restart_active_breaks(args, linebreaks.get(ld_idx), ld_idx, w, slen, minlength);
next_active_breaks.push(linebreaks.len());
linebreaks.push(new_break);
least_demerits = 0;
} else {
// next time around, normalize out the demerits fields
// on active linebreaks to make overflow less likely
least_demerits = cmp::max(ld_next, 0);
}
// swap in new list of active breaks
mem::swap(active_breaks, next_active_breaks);
// If this was the last word in a sentence, the next one must be the first in the next.
is_sentence_start = is_sentence_end;
}
// return the best path
build_best_path(&linebreaks, active_breaks)
}
// Picks the active path with the fewest demerits and follows its `prev` links back
// to the initial null break (the one whose `linebreak` is None), collecting the
// (word, break_before) pair of every break on the way. The result is therefore in
// reverse order: the last break of the paragraph comes first.
// Crashes with exit status 1 if `active` is empty.
#[inline(always)]
fn build_best_path<'a>(paths: &Vec<LineBreak<'a>>, active: &Vec<uint>) -> Vec<(&'a WordInfo<'a>, bool)> {
    // cheapest of the still-active paths
    let start = match active.iter().min_by(|&&a| paths.get(a).demerits) {
        Some(&idx) => idx,
        None => crash!(1, "Failed to find a k-p linebreak solution. This should never happen.")
    };

    let mut chain = vec!();
    let mut cursor = start;
    loop {
        let node = paths.get(cursor);
        match node.linebreak {
            Some(word) => {
                chain.push((word, node.break_before));
                cursor = node.prev
            }
            // reached the null break at the head of the paragraph
            None => return chain
        }
    }
}
// Tuning constants for compute_demerits.
//
// BAD_INFTY is the badness assigned to a line whose stretch ratio exceeds 1 in magnitude.
// "infinite" badness is more like (1+BAD_INFTY)^2 because of how demerits are computed
static BAD_INFTY: i64 = 10000000;
// find_kp_breakpoints rejects any candidate break whose demerits reach this value
static BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
// badness = BAD_MULT * abs(r) ^ 3
static BAD_MULT: f32 = 100.0;
// DR_MULT is multiplier for delta-R between lines
static DR_MULT: f32 = 600.0;
// DL_MULT is penalty multiplier for short words at end of line
static DL_MULT: f32 = 300.0;
// Scores a candidate line and returns (demerits, stretch ratio).
//
// * `delta_len` - goal length minus actual line length (may be negative)
// * `stretch`   - allowed deviation from the goal (width minus goal)
// * `wlen`      - length of the word that would end the line
// * `prev_rat`  - stretch ratio of the previous line
//
// Three badness terms are summed: one for how far the line is from the goal, one
// for ending the line in a word shorter than `stretch`, and one for how much the
// ratio differs from the previous line's. The demerits are (1 + sum)^2.
#[inline(always)]
fn compute_demerits(delta_len: int, stretch: int, wlen: int, prev_rat: f32) -> (i64, f32) {
    // fraction of the available stretch this line uses; an exact fit is 0
    let ratio = match delta_len {
        0 => 0.0f32,
        _ => delta_len as f32 / stretch as f32
    };

    // badness of the line length itself
    let over_stretched = num::abs(ratio) > 1.0f32;
    let bad_linelen =
        if over_stretched {
            BAD_INFTY
        } else {
            let cubed = num::pow(ratio, 3);
            (BAD_MULT * num::abs(cubed)) as i64
        };

    // badness of ending the line in a really short word
    let bad_wordlen =
        if wlen < stretch {
            let shortness = (stretch - wlen) as f32 / (stretch - 1) as f32;
            (DL_MULT * num::abs(num::pow(shortness, 3))) as i64
        } else {
            0
        };

    // badness of a ratio that differs a lot from the previous line's
    let half_delta = (ratio - prev_rat) / 2.0;
    let bad_delta_r = (DR_MULT * num::abs(num::pow(half_delta, 3))) as i64;

    let penalty = 1 + bad_linelen + bad_wordlen + bad_delta_r;
    (num::pow(penalty, 2), ratio)
}
// Builds the single break from which the search restarts when word `w` fits on
// none of the active lines. `active` (at index `act_idx`) is the break chosen as
// predecessor, `slen` the spacing in front of `w`, and `min` the minimum line length.
//
// If the predecessor's line is still empty the break always goes after `w`.
// Otherwise the break goes before `w` when keeping `w` would overshoot the width
// by more than the line currently falls short of `min`, and after `w` if not.
#[inline(always)]
fn restart_active_breaks<'a>(args: &BreakArgs<'a>, active: &LineBreak<'a>, act_idx: uint, w: &'a WordInfo<'a>, slen: uint, min: uint) -> LineBreak<'a> {
    let break_before =
        if active.fresh {
            // never break before a word if that word would be the first on a line
            false
        } else {
            // lesser of two evils: ending the line too early, or too late
            let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh);
            let too_short: int = (min - active.length) as int;
            let too_long: int = ((wlen + slen + active.length) - args.opts.width) as int;
            too_long > too_short
        };

    // when breaking before `w`, the word already occupies the start of the next line
    let (line_length, ratio) =
        if break_before {
            (args.indent_len + w.word_nchars, 1.0f32)
        } else {
            (args.indent_len, -1.0f32)
        };

    // this becomes the only active path, so its demerit count starts over at zero
    LineBreak {
        prev : act_idx,
        linebreak : Some(w),
        break_before : break_before,
        demerits : 0,
        prev_rat : ratio,
        length : line_length,
        fresh : !break_before
    }
}
// How many spaces to write in front of a word.
//
// Nothing is added unless spacing is uniform or the word began a new input line.
// In those cases the word gets two spaces if it starts a sentence, or if it began
// a new input line and the previous word ended in punctuation; otherwise one.
#[inline(always)]
fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> uint {
    let add_spaces = uniform || newline;
    let wide_gap = start || (newline && punct);
    match (add_spaces, wide_gap) {
        (false, _) => 0,
        (true, true) => 2,
        (true, false) => 1
    }
}
// Returns (number of spaces to write, text to write) for a word.
// At the start of a fresh line no spaces are added and the word is sliced from
// `start`, dropping whatever precedes it. Anywhere else the word is returned
// untouched together with the spacing chosen by compute_slen.
#[inline(always)]
fn slice_if_fresh<'a>(fresh: bool, word: &'a str, start: uint, uniform: bool, newline: bool, sstart: bool, punct: bool) -> (uint, &'a str) {
    match fresh {
        true => (0, word.slice_from(start)),
        false => (compute_slen(uniform, newline, sstart, punct), word)
    }
}
// Ends the current output line and starts the next one with `indent`.
// Write errors are ignored.
#[inline(always)]
fn write_newline(indent: &str, ostream: &mut Box<Writer>) {
    silent_unwrap!(ostream.write("\n".as_bytes()));
    silent_unwrap!(ostream.write(indent.as_bytes()));
}
// Writes `slen` spaces followed by `word`. Write errors are ignored.
//
// `slen` is expected to be 0, 1 or 2 (the values compute_slen produces); any
// other value writes no spaces. For slen == 2 exactly two spaces must be written,
// because the callers have already added 2 to the running line length.
#[inline(always)]
fn write_with_spaces(word: &str, slen: uint, ostream: &mut Box<Writer>) {
    if slen == 2 {
        silent_unwrap!(ostream.write("  ".as_bytes()));
    } else if slen == 1 {
        silent_unwrap!(ostream.write_char(' '));
    }
    silent_unwrap!(ostream.write(word.as_bytes()));
}