mirror of
https://github.com/uutils/coreutils
synced 2025-01-10 12:19:18 +00:00
8be67f7d4d
fmt: - Implemented Knuth-Plass optimal linebreaking strategy. - Added commandline switch -q for "quick" (greedy) split mode that does not use Knuth-Plass. - Right now, Knuth-Plass runs about half as fast. It also uses more memory. - Updated fmt to use char_width (see below) instead of assuming each character width is 1. - Use i64 for demerits instead of int in K-P, since int is pointer sized and will only be 32 bits on some architectures. - incremented version number - Incorporated improvements suggested by huonw and Arcterus. - K-P uses indices of linebreaks vector instead of raw pointers. This gets rid of a lot of allocation of boxes and improves safety to boot. - Added a support module for computing displayed widths of unicode strings based on Markus Kuhn's free implementation at http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - This is in `charwidth.rs`, but this is a temporary measure until the Char trait implements .width(). I am submitting a PR for this soon, and the code in charwidth() is what's generated libcore. closes #223
564 lines
22 KiB
Rust
564 lines
22 KiB
Rust
/*
|
|
* This file is part of `fmt` from the uutils coreutils package.
|
|
*
|
|
* (c) kwantam <kwantam@gmail.com>
|
|
*
|
|
* For the full copyright and license information, please view the LICENSE
|
|
* file that was distributed with this source code.
|
|
*/
|
|
|
|
use core::iter::Peekable;
|
|
use std::io::Lines;
|
|
use std::slice::Items;
|
|
use std::str::CharRange;
|
|
use FileOrStdReader;
|
|
use FmtOptions;
|
|
use charwidth;
|
|
|
|
#[inline(always)]
|
|
fn char_width(c: char) -> uint {
|
|
if (c as uint) < 0xA0 {
|
|
// if it is ASCII, call it exactly 1 wide (including control chars)
|
|
// calling control chars' widths 1 is consistent with OpenBSD fmt
|
|
1
|
|
} else {
|
|
// otherwise, get the unicode width
|
|
// note that we shouldn't actually get None here because only c < 0xA0
|
|
// can return None, but for safety and future-proofing we do it this way
|
|
charwidth::width(c).unwrap_or(1)
|
|
}
|
|
}
|
|
|
|
// lines with PSKIP, lacking PREFIX, or which are entirely blank are
|
|
// NoFormatLines; otherwise, they are FormatLines
|
|
#[deriving(Show)]
|
|
enum Line {
|
|
FormatLine(FileLine),
|
|
NoFormatLine(String, bool)
|
|
}
|
|
|
|
impl Line {
|
|
// when we know that it's a FormatLine, as in the ParagraphStream iterator
|
|
fn get_fileline(self) -> FileLine {
|
|
match self {
|
|
FormatLine(fl) => fl,
|
|
NoFormatLine(..) => fail!("Found NoFormatLine when expecting FormatLine")
|
|
}
|
|
}
|
|
|
|
// when we know that it's a NoFormatLine, as in the ParagraphStream iterator
|
|
fn get_noformatline(self) -> (String, bool) {
|
|
match self {
|
|
NoFormatLine(s, b) => (s, b),
|
|
FormatLine(..) => fail!("Found FormatLine when expecting NoFormatLine")
|
|
}
|
|
}
|
|
}
|
|
|
|
// each line's prefix has to be considered to know whether to merge it with
|
|
// the next line or not
|
|
#[deriving(Show)]
|
|
struct FileLine {
|
|
line : String,
|
|
indent_end : uint, // the end of the indent, always the start of the text
|
|
pfxind_end : uint, // the end of the PREFIX's indent, that is, the spaces before the prefix
|
|
indent_len : uint, // display length of indent taking into account tabs
|
|
prefix_len : uint, // PREFIX indent length taking into account tabs
|
|
}
|
|
|
|
// iterator that produces a stream of Lines from a file
|
|
pub struct FileLines<'a> {
|
|
opts : &'a FmtOptions,
|
|
lines : Lines<'a, FileOrStdReader>,
|
|
}
|
|
|
|
impl<'a> FileLines<'a> {
|
|
fn new<'a>(opts: &'a FmtOptions, lines: Lines<'a, FileOrStdReader>) -> FileLines<'a> {
|
|
FileLines { opts: opts, lines: lines }
|
|
}
|
|
|
|
// returns true if this line should be formatted
|
|
fn match_prefix(&self, line: &str) -> (bool, uint) {
|
|
if !self.opts.use_prefix { return (true, 0u); }
|
|
|
|
FileLines::match_prefix_generic(self.opts.prefix.as_slice(), line, self.opts.xprefix)
|
|
}
|
|
|
|
// returns true if this line should be formatted
|
|
fn match_anti_prefix(&self, line: &str) -> bool {
|
|
if !self.opts.use_anti_prefix { return true; }
|
|
|
|
match FileLines::match_prefix_generic(self.opts.anti_prefix.as_slice(), line, self.opts.xanti_prefix) {
|
|
(true, _) => false,
|
|
(_ , _) => true
|
|
}
|
|
}
|
|
|
|
fn match_prefix_generic(pfx: &str, line: &str, exact: bool) -> (bool, uint) {
|
|
if line.starts_with(pfx) {
|
|
return (true, 0);
|
|
}
|
|
|
|
if !exact {
|
|
// we do it this way rather than byte indexing to support unicode whitespace chars
|
|
let mut i = 0u;
|
|
while (i < line.len()) && line.char_at(i).is_whitespace() {
|
|
i = match line.char_range_at(i) { CharRange { ch: _ , next: nxi } => nxi };
|
|
if line.slice_from(i).starts_with(pfx) {
|
|
return (true, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
(false, 0)
|
|
}
|
|
|
|
fn compute_indent(&self, string: &str, prefix_end: uint) -> (uint, uint, uint) {
|
|
let mut prefix_len = 0;
|
|
let mut indent_len = 0;
|
|
let mut indent_end = 0;
|
|
for (os, c) in string.char_indices() {
|
|
if os == prefix_end {
|
|
// we found the end of the prefix, so this is the printed length of the prefix here
|
|
prefix_len = indent_len;
|
|
}
|
|
|
|
if (os >= prefix_end) && !c.is_whitespace() {
|
|
// found first non-whitespace after prefix, this is indent_end
|
|
indent_end = os;
|
|
break;
|
|
} else if c == '\t' {
|
|
// compute tab length
|
|
indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth;
|
|
} else {
|
|
// non-tab character
|
|
indent_len += char_width(c);
|
|
}
|
|
}
|
|
(indent_end, prefix_len, indent_len)
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator<Line> for FileLines<'a> {
|
|
fn next(&mut self) -> Option<Line> {
|
|
let n =
|
|
match self.lines.next() {
|
|
Some(t) => match t {
|
|
Ok(tt) => tt,
|
|
Err(_) => return None
|
|
},
|
|
None => return None
|
|
};
|
|
|
|
// if this line is entirely whitespace,
|
|
// emit a blank line
|
|
// Err(true) indicates that this was a linebreak,
|
|
// which is important to know when detecting mail headers
|
|
if n.as_slice().is_whitespace() {
|
|
return Some(NoFormatLine("\n".to_string(), true));
|
|
}
|
|
|
|
// if this line does not match the prefix,
|
|
// emit the line unprocessed and iterate again
|
|
let (pmatch, poffset) = self.match_prefix(n.as_slice());
|
|
if !pmatch {
|
|
return Some(NoFormatLine(n, false));
|
|
} else if n.as_slice().slice_from(poffset + self.opts.prefix.len()).is_whitespace() {
|
|
// if the line matches the prefix, but is blank after,
|
|
// don't allow lines to be combined through it (that is,
|
|
// treat it like a blank line, except that since it's
|
|
// not truly blank we will not allow mail headers on the
|
|
// following line)
|
|
return Some(NoFormatLine(n, false));
|
|
}
|
|
|
|
// skip if this line matches the anti_prefix
|
|
// (NOTE definition of match_anti_prefix is TRUE if we should process)
|
|
if !self.match_anti_prefix(n.as_slice()) {
|
|
return Some(NoFormatLine(n, false));
|
|
}
|
|
|
|
// figure out the indent, prefix, and prefixindent ending points
|
|
let prefix_end = poffset + self.opts.prefix.len();
|
|
let (indent_end, prefix_len, indent_len) = self.compute_indent(n.as_slice(), prefix_end);
|
|
|
|
Some(FormatLine(FileLine {
|
|
line : n,
|
|
indent_end : indent_end,
|
|
pfxind_end : poffset,
|
|
indent_len : indent_len,
|
|
prefix_len : prefix_len
|
|
}))
|
|
}
|
|
}
|
|
|
|
// a paragraph : a collection of FileLines that are to be formatted
|
|
// plus info about the paragraph's indentation
|
|
// (but we only retain the String from the FileLine; the other info
|
|
// is only there to help us in deciding how to merge lines into Paragraphs
|
|
#[deriving(Show)]
|
|
pub struct Paragraph {
|
|
lines : Vec<String>, // the lines of the file
|
|
pub init_str : String, // string representing the init, that is, the first line's indent
|
|
pub init_len : uint, // printable length of the init string considering TABWIDTH
|
|
init_end : uint, // byte location of end of init in first line String
|
|
pub indent_str : String, // string representing indent
|
|
pub indent_len : uint, // length of above
|
|
indent_end : uint, // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
|
|
pub mail_header : bool // we need to know if this is a mail header because we do word splitting differently in that case
|
|
}
|
|
|
|
// an iterator producing a stream of paragraphs from a stream of lines
|
|
// given a set of options.
|
|
pub struct ParagraphStream<'a> {
|
|
lines : Peekable<Line, FileLines<'a>>,
|
|
next_mail : bool,
|
|
opts : &'a FmtOptions,
|
|
}
|
|
|
|
impl<'a> ParagraphStream<'a> {
|
|
pub fn new<'a>(opts: &'a FmtOptions, reader: &'a mut FileOrStdReader) -> ParagraphStream<'a> {
|
|
let lines = FileLines::new(opts, reader.lines()).peekable();
|
|
// at the beginning of the file, we might find mail headers
|
|
ParagraphStream { lines: lines, next_mail: true, opts: opts }
|
|
}
|
|
|
|
// detect RFC822 mail header
|
|
fn is_mail_header(line: &FileLine) -> bool {
|
|
// a mail header begins with either "From " (envelope sender line)
|
|
// or with a sequence of printable ASCII chars (33 to 126, inclusive,
|
|
// except colon) followed by a colon.
|
|
if line.indent_end > 0 {
|
|
false
|
|
} else {
|
|
let lSlice = line.line.as_slice();
|
|
if lSlice.starts_with("From ") {
|
|
true
|
|
} else {
|
|
let colonPosn =
|
|
match lSlice.find(':') {
|
|
Some(n) => n,
|
|
None => return false
|
|
};
|
|
|
|
// header field must be nonzero length
|
|
if colonPosn == 0 { return false; }
|
|
|
|
return lSlice.slice_to(colonPosn).chars().all(|x| match x as uint {
|
|
y if y < 33 || y > 126 => false,
|
|
_ => true
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator<Result<Paragraph, String>> for ParagraphStream<'a> {
|
|
fn next(&mut self) -> Option<Result<Paragraph, String>> {
|
|
// return a NoFormatLine in an Err; it should immediately be output
|
|
let noformat =
|
|
match self.lines.peek() {
|
|
None => return None,
|
|
Some(l) => match l {
|
|
&FormatLine(_) => false,
|
|
&NoFormatLine(_, _) => true
|
|
}
|
|
};
|
|
|
|
// found a NoFormatLine, immediately dump it out
|
|
if noformat {
|
|
let (s, nm) = self.lines.next().unwrap().get_noformatline();
|
|
self.next_mail = nm;
|
|
return Some(Err(s));
|
|
}
|
|
|
|
// found a FormatLine, now build a paragraph
|
|
let mut init_str = String::new();
|
|
let mut init_end = 0;
|
|
let mut init_len = 0;
|
|
let mut indent_str = String::new();
|
|
let mut indent_end = 0;
|
|
let mut indent_len = 0;
|
|
let mut prefix_len = 0;
|
|
let mut pfxind_end = 0;
|
|
let mut pLines = Vec::new();
|
|
|
|
let mut in_mail = false;
|
|
let mut second_done = false; // for when we use crown or tagged mode
|
|
loop {
|
|
{ // peek ahead
|
|
// need to explicitly force fl out of scope before we can call self.lines.next()
|
|
let fl =
|
|
match self.lines.peek() {
|
|
None => break,
|
|
Some(l) => {
|
|
match l {
|
|
&FormatLine(ref x) => x,
|
|
&NoFormatLine(..) => break
|
|
}
|
|
}
|
|
};
|
|
|
|
if pLines.len() == 0 {
|
|
// first time through the loop, get things set up
|
|
// detect mail header
|
|
if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) {
|
|
in_mail = true;
|
|
// there can't be any indent or pfxind because otherwise is_mail_header would fail
|
|
// since there cannot be any whitespace before the colon in a valid header field
|
|
indent_str.push_str(" ");
|
|
indent_len = 2;
|
|
} else {
|
|
if self.opts.crown || self.opts.tagged {
|
|
init_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
|
|
init_len = fl.indent_len;
|
|
init_end = fl.indent_end;
|
|
} else {
|
|
second_done = true;
|
|
}
|
|
|
|
// these will be overwritten in the 2nd line of crown or tagged mode, but
|
|
// we are not guaranteed to get to the 2nd line, e.g., if the next line
|
|
// is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
|
|
indent_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
|
|
indent_len = fl.indent_len;
|
|
indent_end = fl.indent_end;
|
|
|
|
// save these to check for matching lines
|
|
prefix_len = fl.prefix_len;
|
|
pfxind_end = fl.pfxind_end;
|
|
|
|
// in tagged mode, add 4 spaces of additional indenting by default
|
|
// (gnu fmt's behavior is different: it seems to find the closest column to
|
|
// indent_end that is divisible by 3. But honesly that behavior seems
|
|
// pretty arbitrary.
|
|
// Perhaps a better default would be 1 TABWIDTH? But ugh that's so big.
|
|
if self.opts.tagged {
|
|
indent_str.push_str(" ");
|
|
indent_len += 4;
|
|
}
|
|
}
|
|
} else if in_mail {
|
|
// lines following mail headers must begin with spaces
|
|
if fl.indent_end == 0 || (self.opts.use_prefix && fl.pfxind_end == 0) {
|
|
break; // this line does not begin with spaces
|
|
}
|
|
} else if !second_done {
|
|
// now we have enough info to handle crown margin and tagged mode
|
|
if prefix_len != fl.prefix_len || pfxind_end != fl.pfxind_end {
|
|
// in both crown and tagged modes we require that prefix_len is the same
|
|
break;
|
|
} else if self.opts.tagged && indent_len - 4 == fl.indent_len && indent_end == fl.indent_end {
|
|
// in tagged mode, indent has to be *different* on following lines
|
|
break;
|
|
} else {
|
|
// this is part of the same paragraph, get the indent info from this line
|
|
indent_str.clear();
|
|
indent_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
|
|
indent_len = fl.indent_len;
|
|
indent_end = fl.indent_end;
|
|
}
|
|
second_done = true;
|
|
} else {
|
|
// detect mismatch
|
|
if indent_end != fl.indent_end || pfxind_end != fl.pfxind_end || indent_len != fl.indent_len || prefix_len != fl.prefix_len {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
pLines.push(self.lines.next().unwrap().get_fileline().line);
|
|
|
|
// when we're in split-only mode, we never join lines, so stop here
|
|
if self.opts.split_only {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// if this was a mail header, then the next line can be detected as one. Otherwise, it cannot.
|
|
// NOTE next_mail is true at ParagraphStream instantiation, and is set to true after a blank
|
|
// NoFormatLine.
|
|
self.next_mail = in_mail;
|
|
|
|
Some(Ok(Paragraph {
|
|
lines : pLines,
|
|
init_str : init_str,
|
|
init_len : init_len,
|
|
init_end : init_end,
|
|
indent_str : indent_str,
|
|
indent_len : indent_len,
|
|
indent_end : indent_end,
|
|
mail_header : in_mail
|
|
}))
|
|
}
|
|
}
|
|
|
|
pub struct ParaWords<'a> {
|
|
opts : &'a FmtOptions,
|
|
para : &'a Paragraph,
|
|
words : Vec<WordInfo<'a>>
|
|
}
|
|
|
|
impl<'a> ParaWords<'a> {
|
|
pub fn new<'a>(opts: &'a FmtOptions, para: &'a Paragraph) -> ParaWords<'a> {
|
|
let mut pw = ParaWords { opts: opts, para: para, words: Vec::new() };
|
|
pw.create_words();
|
|
pw
|
|
}
|
|
|
|
fn create_words<'r>(&'r mut self) {
|
|
if self.para.mail_header {
|
|
// no extra spacing for mail headers; always exactly 1 space
|
|
// safe to trim_left on every line of a mail header, since the
|
|
// first line is guaranteed not to have any spaces
|
|
self.words.extend(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo {
|
|
word : x,
|
|
word_start : 0,
|
|
word_nchars : x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped)
|
|
before_tab : None,
|
|
after_tab : 0,
|
|
sentence_start : false,
|
|
ends_punct : false,
|
|
new_line : false
|
|
}));
|
|
} else {
|
|
// first line
|
|
self.words.extend(
|
|
if self.opts.crown || self.opts.tagged {
|
|
// crown and tagged mode has the "init" in the first line, so slice from there
|
|
WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
|
|
} else {
|
|
// otherwise we slice from the indent
|
|
WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
|
|
});
|
|
|
|
if self.para.lines.len() > 1 {
|
|
let indent_end = self.para.indent_end;
|
|
let opts = self.opts;
|
|
self.words.extend(
|
|
self.para.lines.iter().skip(1).flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end))));
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn words(&'a self) -> Items<'a, WordInfo<'a>> { return self.words.iter() }
|
|
}
|
|
|
|
struct WordSplit<'a> {
|
|
opts : &'a FmtOptions,
|
|
string : &'a str,
|
|
length : uint,
|
|
position : uint,
|
|
prev_punct : bool
|
|
}
|
|
|
|
impl<'a> WordSplit<'a> {
|
|
fn analyze_tabs(&self, string: &str) -> (Option<uint>, uint, Option<uint>) {
|
|
// given a string, determine (length before tab) and (printed length after first tab)
|
|
// if there are no tabs, beforetab = -1 and aftertab is the printed length
|
|
let mut beforetab = None;
|
|
let mut aftertab = 0;
|
|
let mut word_start = None;
|
|
for (os, c) in string.char_indices() {
|
|
if !c.is_whitespace() {
|
|
word_start = Some(os);
|
|
break;
|
|
} else if c == '\t' {
|
|
if beforetab == None {
|
|
beforetab = Some(aftertab);
|
|
aftertab = 0;
|
|
} else {
|
|
aftertab = (aftertab / self.opts.tabwidth + 1) * self.opts.tabwidth;
|
|
}
|
|
} else {
|
|
aftertab += 1;
|
|
}
|
|
}
|
|
(beforetab, aftertab, word_start)
|
|
}
|
|
}
|
|
|
|
impl<'a> WordSplit<'a> {
|
|
fn new<'a>(opts: &'a FmtOptions, string: &'a str) -> WordSplit<'a> {
|
|
// wordsplits *must* start at a non-whitespace character
|
|
let trim_string = string.trim_left();
|
|
WordSplit { opts: opts, string: trim_string, length: string.len(), position: 0, prev_punct: false }
|
|
}
|
|
|
|
fn is_punctuation(c: char) -> bool {
|
|
match c {
|
|
'!' | '.' | '?' => true,
|
|
_ => false
|
|
}
|
|
}
|
|
}
|
|
|
|
pub struct WordInfo<'a> {
|
|
pub word : &'a str,
|
|
pub word_start : uint,
|
|
pub word_nchars : uint,
|
|
pub before_tab : Option<uint>,
|
|
pub after_tab : uint,
|
|
pub sentence_start : bool,
|
|
pub ends_punct : bool,
|
|
pub new_line : bool
|
|
}
|
|
|
|
// returns (&str, is_start_of_sentence)
|
|
impl<'a> Iterator<WordInfo<'a>> for WordSplit<'a> {
|
|
fn next(&mut self) -> Option<WordInfo<'a>> {
|
|
if self.position >= self.length {
|
|
return None
|
|
}
|
|
|
|
let old_position = self.position;
|
|
let new_line = old_position == 0;
|
|
|
|
// find the start of the next word, and record if we find a tab character
|
|
let (before_tab, after_tab, word_start) = match self.analyze_tabs(self.string.slice_from(old_position)) {
|
|
(b, a, Some(s)) => (b, a, s + old_position),
|
|
(_, _, None) => {
|
|
self.position = self.length;
|
|
return None;
|
|
}
|
|
};
|
|
|
|
// find the beginning of the next whitespace
|
|
// note that this preserves the invariant that self.position
|
|
// points to whitespace character OR end of string
|
|
let mut word_nchars = 0;
|
|
self.position =
|
|
match self.string.slice_from(word_start)
|
|
.find(|x: char| if !x.is_whitespace() { word_nchars += char_width(x); false } else { true }) {
|
|
None => self.length,
|
|
Some(s) => s + word_start
|
|
};
|
|
|
|
let word_start_relative = word_start - old_position;
|
|
// if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence.
|
|
let is_start_of_sentence = self.prev_punct && (before_tab.is_some() || word_start_relative > 1);
|
|
|
|
// now record whether this word ends in punctuation
|
|
self.prev_punct = match self.string.char_range_at_reverse(self.position) {
|
|
CharRange { ch, next: _ } => WordSplit::is_punctuation(ch)
|
|
};
|
|
|
|
let (word, word_start_relative, before_tab, after_tab) =
|
|
if self.opts.uniform {
|
|
(self.string.slice(word_start, self.position), 0, None, 0)
|
|
} else {
|
|
(self.string.slice(old_position, self.position), word_start_relative, before_tab, after_tab)
|
|
};
|
|
|
|
Some(WordInfo {
|
|
word : word,
|
|
word_start : word_start_relative,
|
|
word_nchars : word_nchars,
|
|
before_tab : before_tab,
|
|
after_tab : after_tab,
|
|
sentence_start : is_start_of_sentence,
|
|
ends_punct : self.prev_punct,
|
|
new_line : new_line
|
|
})
|
|
}
|
|
}
|