mirror of
https://github.com/uutils/coreutils
synced 2024-12-22 19:13:25 +00:00
450 lines
17 KiB
Rust
450 lines
17 KiB
Rust
/*
|
|
* This file is part of `fmt` from the uutils coreutils package.
|
|
*
|
|
* (c) kwantam <kwantam@gmail.com>
|
|
*
|
|
* For the full copyright and license information, please view the LICENSE
|
|
* file that was distributed with this source code.
|
|
*/
|
|
|
|
use FmtOptions;
|
|
use parasplit::{Paragraph, ParaWords, WordInfo};
|
|
use std::i64;
|
|
use std::cmp;
|
|
use std::mem;
|
|
use std::num;
|
|
|
|
struct BreakArgs<'a> {
|
|
opts : &'a FmtOptions,
|
|
init_len : uint,
|
|
indent_str : &'a str,
|
|
indent_len : uint,
|
|
uniform : bool,
|
|
ostream : &'a mut Box<Writer>
|
|
}
|
|
|
|
impl<'a> BreakArgs<'a> {
|
|
#[inline(always)]
|
|
fn compute_width<'b>(&self, winfo: &WordInfo<'b>, posn: uint, fresh: bool) -> uint {
|
|
if fresh {
|
|
0
|
|
} else {
|
|
let post = winfo.after_tab;
|
|
match winfo.before_tab {
|
|
None => post,
|
|
Some(pre) => post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn break_lines(para: &Paragraph, opts: &FmtOptions, ostream: &mut Box<Writer>) {
|
|
// indent
|
|
let pIndent = para.indent_str.as_slice();
|
|
let pIndentLen = para.indent_len;
|
|
|
|
// words
|
|
let pWords = ParaWords::new(opts, para);
|
|
let mut pWords_words = pWords.words();
|
|
|
|
// the first word will *always* appear on the first line
|
|
// make sure of this here
|
|
let (w, w_len) = match pWords_words.next() {
|
|
Some(winfo) => (winfo.word, winfo.word_nchars),
|
|
None => {
|
|
silent_unwrap!(ostream.write_char('\n'));
|
|
return;
|
|
}
|
|
};
|
|
// print the init, if it exists, and get its length
|
|
let pInitLen = w_len +
|
|
if opts.crown || opts.tagged {
|
|
// handle "init" portion
|
|
silent_unwrap!(ostream.write(para.init_str.as_bytes()));
|
|
para.init_len
|
|
} else if !para.mail_header {
|
|
// for non-(crown, tagged) that's the same as a normal indent
|
|
silent_unwrap!(ostream.write(pIndent.as_bytes()));
|
|
pIndentLen
|
|
} else {
|
|
// except that mail headers get no indent at all
|
|
0
|
|
};
|
|
// write first word after writing init
|
|
silent_unwrap!(ostream.write(w.as_bytes()));
|
|
|
|
// does this paragraph require uniform spacing?
|
|
let uniform = para.mail_header || opts.uniform;
|
|
|
|
let mut break_args = BreakArgs {
|
|
opts : opts,
|
|
init_len : pInitLen,
|
|
indent_str : pIndent,
|
|
indent_len : pIndentLen,
|
|
uniform : uniform,
|
|
ostream : ostream
|
|
};
|
|
|
|
if opts.quick || para.mail_header {
|
|
break_simple(pWords_words, &mut break_args);
|
|
} else {
|
|
break_knuth_plass(pWords_words, &mut break_args);
|
|
}
|
|
}
|
|
|
|
// break_simple implements a "greedy" breaking algorithm: print words until
|
|
// maxlength would be exceeded, then print a linebreak and indent and continue.
|
|
fn break_simple<'a, T: Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) {
|
|
iter.fold((args.init_len, false), |l, winfo| accum_words_simple(args, l, winfo));
|
|
silent_unwrap!(args.ostream.write_char('\n'));
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn accum_words_simple<'a>(args: &mut BreakArgs<'a>, (l, prev_punct): (uint, bool), winfo: &'a WordInfo<'a>) -> (uint, bool) {
|
|
// compute the length of this word, considering how tabs will expand at this position on the line
|
|
let wlen = winfo.word_nchars + args.compute_width(winfo, l, false);
|
|
|
|
let slen = compute_slen(args.uniform, winfo.new_line, winfo.sentence_start, prev_punct);
|
|
|
|
if l + wlen + slen > args.opts.width {
|
|
write_newline(args.indent_str, args.ostream);
|
|
write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream);
|
|
(args.indent_len + winfo.word_nchars, winfo.ends_punct)
|
|
} else {
|
|
write_with_spaces(winfo.word, slen, args.ostream);
|
|
(l + wlen + slen, winfo.ends_punct)
|
|
}
|
|
}
|
|
|
|
// break_knuth_plass implements an "optimal" breaking algorithm in the style of
|
|
// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software,
|
|
// Practice and Experience. Vol. 11, No. 11, November 1981.
|
|
// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf
|
|
fn break_knuth_plass<'a, T: Clone + Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) {
|
|
// run the algorithm to get the breakpoints
|
|
let breakpoints = find_kp_breakpoints(iter.clone(), args);
|
|
|
|
// iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it
|
|
let (mut prev_punct, mut fresh) =
|
|
breakpoints.iter().rev().fold((false, false), |(mut prev_punct, mut fresh), &(next_break, break_before)| {
|
|
if fresh {
|
|
write_newline(args.indent_str, args.ostream);
|
|
}
|
|
// at each breakpoint, keep emitting words until we find the word matching this breakpoint
|
|
for winfo in iter {
|
|
let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform,
|
|
winfo.new_line, winfo.sentence_start, prev_punct);
|
|
fresh = false;
|
|
prev_punct = winfo.ends_punct;
|
|
|
|
// We find identical breakpoints here by comparing addresses of the references.
|
|
// This is OK because the backing vector is not mutating once we are linebreaking.
|
|
if winfo as *const _ == next_break as *const _ {
|
|
// OK, we found the matching word
|
|
if break_before {
|
|
write_newline(args.indent_str, args.ostream);
|
|
write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream);
|
|
} else {
|
|
// breaking after this word, so that means "fresh" is true for the next iteration
|
|
write_with_spaces(word, slen, args.ostream);
|
|
fresh = true;
|
|
}
|
|
break;
|
|
} else {
|
|
write_with_spaces(word, slen, args.ostream);
|
|
}
|
|
}
|
|
(prev_punct, fresh)
|
|
});
|
|
|
|
// after the last linebreak, write out the rest of the final line.
|
|
for winfo in iter {
|
|
if fresh {
|
|
write_newline(args.indent_str, args.ostream);
|
|
}
|
|
let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform,
|
|
winfo.new_line, winfo.sentence_start, prev_punct);
|
|
prev_punct = winfo.ends_punct;
|
|
fresh = false;
|
|
write_with_spaces(word, slen, args.ostream);
|
|
}
|
|
silent_unwrap!(args.ostream.write_char('\n'));
|
|
}
|
|
|
|
struct LineBreak<'a> {
|
|
prev : uint,
|
|
linebreak : Option<&'a WordInfo<'a>>,
|
|
break_before : bool,
|
|
demerits : i64,
|
|
prev_rat : f32,
|
|
length : uint,
|
|
fresh : bool
|
|
}
|
|
|
|
fn find_kp_breakpoints<'a, T: Iterator<&'a WordInfo<'a>>>(iter: T, args: &BreakArgs<'a>) -> Vec<(&'a WordInfo<'a>, bool)> {
|
|
let mut iter = iter.peekable();
|
|
// set up the initial null linebreak
|
|
let mut linebreaks = vec!(LineBreak {
|
|
prev : 0,
|
|
linebreak : None,
|
|
break_before : false,
|
|
demerits : 0,
|
|
prev_rat : 0.0f32,
|
|
length : args.init_len,
|
|
fresh : false
|
|
});
|
|
// this vec holds the current active linebreaks; next_ holds the breaks that will be active for the next word
|
|
let active_breaks = &mut vec!(0);
|
|
let next_active_breaks = &mut vec!();
|
|
|
|
let stretch = (args.opts.width - args.opts.goal) as int;
|
|
let minlength = args.opts.goal - stretch as uint;
|
|
let mut new_linebreaks = vec!();
|
|
let mut is_sentence_start = false;
|
|
let mut least_demerits = 0;
|
|
loop {
|
|
let w =
|
|
match iter.next() {
|
|
None => break,
|
|
Some(w) => w
|
|
};
|
|
|
|
// if this is the last word, we don't add additional demerits for this break
|
|
let (is_last_word, is_sentence_end) =
|
|
match iter.peek() {
|
|
None => (true, true),
|
|
Some(&&WordInfo { sentence_start: st, new_line: nl, .. }) => (false, st || (nl && w.ends_punct))
|
|
};
|
|
|
|
// should we be adding extra space at the beginning of the next sentence?
|
|
let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false);
|
|
|
|
let mut ld_new = i64::MAX;
|
|
let mut ld_next = i64::MAX;
|
|
let mut ld_idx = 0;
|
|
new_linebreaks.clear();
|
|
next_active_breaks.clear();
|
|
// go through each active break, extending it and possibly adding a new active
|
|
// break if we are above the minimum required length
|
|
for &i in active_breaks.iter() {
|
|
let active = linebreaks.get_mut(i);
|
|
// normalize demerits to avoid overflow, and record if this is the least
|
|
active.demerits -= least_demerits;
|
|
if active.demerits < ld_next {
|
|
ld_next = active.demerits;
|
|
ld_idx = i;
|
|
}
|
|
|
|
// get the new length
|
|
let tlen = w.word_nchars + args.compute_width(w, active.length, active.fresh) + slen + active.length;
|
|
|
|
// if tlen is longer than args.opts.width, we drop this break from the active list
|
|
// otherwise, we extend the break, and possibly add a new break at this point
|
|
if tlen <= args.opts.width {
|
|
// this break will still be active next time
|
|
next_active_breaks.push(i);
|
|
// we can put this word on this line
|
|
active.fresh = false;
|
|
active.length = tlen;
|
|
|
|
// if we're above the minlength, we can also consider breaking here
|
|
if tlen >= minlength {
|
|
let (new_demerits, new_ratio) =
|
|
if is_last_word {
|
|
// there is no penalty for the final line's length
|
|
(0, 0.0)
|
|
} else {
|
|
compute_demerits((args.opts.goal - tlen) as int, stretch, w.word_nchars as int, active.prev_rat)
|
|
};
|
|
|
|
// do not even consider adding a line that has too many demerits
|
|
// also, try to detect overflow by checking signum
|
|
let total_demerits = new_demerits + active.demerits;
|
|
if new_demerits < BAD_INFTY_SQ && total_demerits < ld_new && num::signum(active.demerits) <= num::signum(new_demerits) {
|
|
ld_new = total_demerits;
|
|
new_linebreaks.push(LineBreak {
|
|
prev : i,
|
|
linebreak : Some(w),
|
|
break_before : false,
|
|
demerits : total_demerits,
|
|
prev_rat : new_ratio,
|
|
length : args.indent_len,
|
|
fresh : true
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// if we generated any new linebreaks, add the last one to the list
|
|
// the last one is always the best because we don't add to new_linebreaks unless
|
|
// it's better than the best one so far
|
|
match new_linebreaks.pop() {
|
|
None => (),
|
|
Some(lb) => {
|
|
next_active_breaks.push(linebreaks.len());
|
|
linebreaks.push(lb);
|
|
}
|
|
}
|
|
|
|
if next_active_breaks.is_empty() {
|
|
// every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx
|
|
let new_break = restart_active_breaks(args, linebreaks.get(ld_idx), ld_idx, w, slen, minlength);
|
|
next_active_breaks.push(linebreaks.len());
|
|
linebreaks.push(new_break);
|
|
least_demerits = 0;
|
|
} else {
|
|
// next time around, normalize out the demerits fields
|
|
// on active linebreaks to make overflow less likely
|
|
least_demerits = cmp::max(ld_next, 0);
|
|
}
|
|
// swap in new list of active breaks
|
|
mem::swap(active_breaks, next_active_breaks);
|
|
// If this was the last word in a sentence, the next one must be the first in the next.
|
|
is_sentence_start = is_sentence_end;
|
|
}
|
|
|
|
// return the best path
|
|
build_best_path(&linebreaks, active_breaks)
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn build_best_path<'a>(paths: &Vec<LineBreak<'a>>, active: &Vec<uint>) -> Vec<(&'a WordInfo<'a>, bool)> {
|
|
let mut breakwords = vec!();
|
|
// of the active paths, we select the one with the fewest demerits
|
|
let mut best_idx = match active.iter().min_by(|&&a| paths.get(a).demerits) {
|
|
None => crash!(1, "Failed to find a k-p linebreak solution. This should never happen."),
|
|
Some(&s) => s
|
|
};
|
|
|
|
// now, chase the pointers back through the break list, recording
|
|
// the words at which we should break
|
|
loop {
|
|
let next_best = paths.get(best_idx);
|
|
match next_best.linebreak {
|
|
None => return breakwords,
|
|
Some(prev) => {
|
|
breakwords.push((prev, next_best.break_before));
|
|
best_idx = next_best.prev
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Badness assigned to a line whose stretch ratio is out of range. Demerits
// are the square of (1 + badness terms), so the effective "infinity" is more
// like (1+BAD_INFTY)^2.
static BAD_INFTY: i64 = 10_000_000;
// Candidate breaks whose demerits reach this value are rejected outright.
static BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
// line-length badness = BAD_MULT * abs(r) ^ 3, where r is the stretch ratio
static BAD_MULT: f32 = 100.0;
// DR_MULT is the multiplier for the change in ratio (delta-R) between lines
static DR_MULT: f32 = 600.0;
// DL_MULT is the penalty multiplier for short words at the end of a line
static DL_MULT: f32 = 300.0;
|
|
|
|
#[inline(always)]
|
|
fn compute_demerits(delta_len: int, stretch: int, wlen: int, prev_rat: f32) -> (i64, f32) {
|
|
// how much stretch are we using?
|
|
let ratio =
|
|
if delta_len == 0 {
|
|
0.0f32
|
|
} else {
|
|
delta_len as f32 / stretch as f32
|
|
};
|
|
|
|
// compute badness given the stretch ratio
|
|
let bad_linelen =
|
|
if num::abs(ratio) > 1.0f32 {
|
|
BAD_INFTY
|
|
} else {
|
|
(BAD_MULT * num::abs(num::pow(ratio, 3))) as i64
|
|
};
|
|
|
|
// we penalize lines ending in really short words
|
|
let bad_wordlen =
|
|
if wlen >= stretch {
|
|
0
|
|
} else {
|
|
(DL_MULT * num::abs(num::pow((stretch - wlen) as f32 / (stretch - 1) as f32, 3))) as i64
|
|
};
|
|
|
|
// we penalize lines that have very different ratios from previous lines
|
|
let bad_deltaR = (DR_MULT * num::abs(num::pow((ratio - prev_rat) / 2.0, 3))) as i64;
|
|
|
|
let demerits = num::pow(1 + bad_linelen + bad_wordlen + bad_deltaR, 2);
|
|
|
|
(demerits, ratio)
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn restart_active_breaks<'a>(args: &BreakArgs<'a>, active: &LineBreak<'a>, act_idx: uint, w: &'a WordInfo<'a>, slen: uint, min: uint) -> LineBreak<'a> {
|
|
let (break_before, line_length) =
|
|
if active.fresh {
|
|
// never break before a word if that word would be the first on a line
|
|
(false, args.indent_len)
|
|
} else {
|
|
// choose the lesser evil: breaking too early, or breaking too late
|
|
let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh);
|
|
let underlen: int = (min - active.length) as int;
|
|
let overlen: int = ((wlen + slen + active.length) - args.opts.width) as int;
|
|
if overlen > underlen {
|
|
// break early, put this word on the next line
|
|
(true, args.indent_len + w.word_nchars)
|
|
} else {
|
|
(false, args.indent_len)
|
|
}
|
|
};
|
|
|
|
// restart the linebreak. This will be our only active path.
|
|
LineBreak {
|
|
prev : act_idx,
|
|
linebreak : Some(w),
|
|
break_before : break_before,
|
|
demerits : 0, // this is the only active break, so we can reset the demerit count
|
|
prev_rat : if break_before { 1.0 } else { -1.0 },
|
|
length : line_length,
|
|
fresh : !break_before
|
|
}
|
|
}
|
|
|
|
// Number of spaces to add before a word, based on mode, newline, sentence start.
//
// Nothing is added unless spacing is uniform or the word began a new input
// line. In those cases the word gets two spaces if it starts a sentence, or
// if it began a new line and the previous word ended in punctuation, and one
// space otherwise.
#[inline(always)]
fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> uint {
    if !(uniform || newline) {
        return 0;
    }

    let after_sentence = start || (newline && punct);
    if after_sentence { 2 } else { 1 }
}
|
|
|
|
// If we're on a fresh line, slen=0 and we slice off leading whitespace.
|
|
// Otherwise, compute slen and leave whitespace alone.
|
|
#[inline(always)]
|
|
fn slice_if_fresh<'a>(fresh: bool, word: &'a str, start: uint, uniform: bool, newline: bool, sstart: bool, punct: bool) -> (uint, &'a str) {
|
|
if fresh {
|
|
(0, word.slice_from(start))
|
|
} else {
|
|
(compute_slen(uniform, newline, sstart, punct), word)
|
|
}
|
|
}
|
|
|
|
// Write a newline and add the indent.
|
|
#[inline(always)]
|
|
fn write_newline(indent: &str, ostream: &mut Box<Writer>) {
|
|
silent_unwrap!(ostream.write_char('\n'));
|
|
silent_unwrap!(ostream.write(indent.as_bytes()));
|
|
}
|
|
|
|
// Write the word, along with slen spaces.
|
|
#[inline(always)]
|
|
fn write_with_spaces(word: &str, slen: uint, ostream: &mut Box<Writer>) {
|
|
if slen == 2 {
|
|
silent_unwrap!(ostream.write(" ".as_bytes()));
|
|
} else if slen == 1 {
|
|
silent_unwrap!(ostream.write_char(' '));
|
|
}
|
|
silent_unwrap!(ostream.write(word.as_bytes()));
|
|
}
|