mirror of
https://github.com/uutils/coreutils
synced 2024-12-15 07:42:48 +00:00
Merge pull request #290 from kwantam/master
fmt: correct tab support, better formatting
This commit is contained in:
commit
32d843f500
3 changed files with 328 additions and 210 deletions
51
fmt/fmt.rs
51
fmt/fmt.rs
|
@ -1,4 +1,4 @@
|
|||
#![crate_id(name="fmt", vers="0.0.1", author="kwantam")]
|
||||
#![crate_id(name="fmt", vers="0.0.2", author="kwantam")]
|
||||
/*
|
||||
* This file is part of `fmt` from the uutils coreutils package.
|
||||
*
|
||||
|
@ -12,20 +12,19 @@
|
|||
|
||||
extern crate core;
|
||||
extern crate getopts;
|
||||
extern crate libc;
|
||||
|
||||
use std::io::{BufferedReader, BufferedWriter, File, IoResult};
|
||||
use std::io::stdio::{stdin_raw, stdout_raw, stdout};
|
||||
use std::io::stdio::{stdin_raw, stdout_raw};
|
||||
use std::os;
|
||||
use linebreak::break_simple;
|
||||
use parasplit::{ParagraphStream, ParaWords};
|
||||
use linebreak::break_lines;
|
||||
use parasplit::ParagraphStream;
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! silent_unwrap(
|
||||
($exp:expr) => (
|
||||
match $exp {
|
||||
Ok(_) => (),
|
||||
Err(_) => unsafe { ::libc::exit(1) }
|
||||
Err(_) => unsafe { ::util::libc::exit(1) }
|
||||
}
|
||||
)
|
||||
)
|
||||
|
@ -36,7 +35,7 @@ mod parasplit;
|
|||
|
||||
// program's NAME and VERSION are used for -V and -h
|
||||
static NAME: &'static str = "fmt";
|
||||
static VERSION: &'static str = "0.0.1";
|
||||
static VERSION: &'static str = "0.0.2";
|
||||
|
||||
struct FmtOptions {
|
||||
crown : bool,
|
||||
|
@ -46,7 +45,6 @@ struct FmtOptions {
|
|||
use_prefix : bool,
|
||||
prefix : String,
|
||||
xprefix : bool,
|
||||
prefix_len : uint,
|
||||
use_anti_prefix : bool,
|
||||
anti_prefix : String,
|
||||
xanti_prefix : bool,
|
||||
|
@ -106,7 +104,6 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
use_prefix : false,
|
||||
prefix : String::new(),
|
||||
xprefix : false,
|
||||
prefix_len : 0,
|
||||
use_anti_prefix : false,
|
||||
anti_prefix : String::new(),
|
||||
xanti_prefix : false,
|
||||
|
@ -127,7 +124,6 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
Some(s) => {
|
||||
fmt_opts.prefix = s;
|
||||
fmt_opts.use_prefix = true;
|
||||
fmt_opts.prefix_len = fmt_opts.prefix.as_slice().char_len()
|
||||
}
|
||||
None => ()
|
||||
};
|
||||
|
@ -206,36 +202,7 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
for paraResult in pStream {
|
||||
match paraResult {
|
||||
Err(s) => silent_unwrap!(ostream.write(s.as_bytes())),
|
||||
Ok(para) => {
|
||||
// indent
|
||||
let pIndent = para.pfxind_str.clone().append(fmt_opts.prefix.as_slice()).append(para.indent_str.as_slice());
|
||||
let pIndentLen = para.pfxind_len + fmt_opts.prefix_len + para.indent_len;
|
||||
|
||||
// words
|
||||
let pWords = ParaWords::new(&fmt_opts, ¶);
|
||||
let mut pWords_words = pWords.words().map(|&x| x);
|
||||
|
||||
// print the init, if it exists, and get its length
|
||||
let pInitLen =
|
||||
if fmt_opts.crown || fmt_opts.tagged {
|
||||
// handle "init" portion
|
||||
silent_unwrap!(ostream.write(para.init_str.as_bytes()));
|
||||
para.init_len
|
||||
} else if !para.mail_header {
|
||||
// for non-(crown, tagged) that's the same as a normal indent
|
||||
silent_unwrap!(ostream.write(pIndent.as_bytes()));
|
||||
pIndentLen
|
||||
} else {
|
||||
// except that mail headers get no indent at all
|
||||
0
|
||||
};
|
||||
|
||||
// does ths paragraph require uniform spacing?
|
||||
let uniform = para.mail_header || fmt_opts.uniform;
|
||||
|
||||
break_simple(&mut pWords_words, fmt_opts.width, pIndent.as_slice(), pIndentLen, pInitLen, uniform, &mut ostream);
|
||||
silent_unwrap!(ostream.write("\n".as_bytes()));
|
||||
}
|
||||
Ok(para) => break_lines(¶, &fmt_opts, &mut ostream)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -247,7 +214,9 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
}
|
||||
|
||||
fn print_usage(arg0: &str, opts: &[getopts::OptGroup], errmsg: &str) {
|
||||
break_simple(&mut getopts::short_usage(arg0, opts).as_slice().words(), 64, " ", 7, 0, true, &mut(box stdout() as Box<Writer>));
|
||||
let short_usage = getopts::short_usage(arg0, opts);
|
||||
println!("{}", short_usage.as_slice().slice_to(60));
|
||||
print!(" {}", short_usage.as_slice().slice_from(60));
|
||||
println!("\n\n{}{}", getopts::usage("Reformat paragraphs from input files (or stdin) to stdout.", opts), errmsg);
|
||||
}
|
||||
|
||||
|
|
166
fmt/linebreak.rs
166
fmt/linebreak.rs
|
@ -7,27 +7,157 @@
|
|||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
// break_simple implements the "tight" breaking algorithm: print words until
|
||||
// maxlength would be exceeded, then print a linebreak and indent and continue.
|
||||
// Note that any first line indent should already have been printed before
|
||||
// calling this function, and the length of said indent should be passed as
|
||||
// init_len
|
||||
pub fn break_simple<'a, T: Iterator<&'a str>>(s: &'a mut T, maxlen: uint, indent_str: &'a str, indent_len: uint, init_len: uint, uniform: bool, ostream: &mut Box<Writer>) -> uint {
|
||||
s.fold(init_len, |l, w| accum_words_simple(maxlen, indent_len, indent_str, ostream, uniform, l, w))
|
||||
use FmtOptions;
|
||||
use parasplit::{Paragraph, ParaWords, WordInfo};
|
||||
|
||||
struct BreakArgs<'a> {
|
||||
opts : &'a FmtOptions,
|
||||
init_len : uint,
|
||||
indent_str : &'a str,
|
||||
indent_len : uint,
|
||||
uniform : bool,
|
||||
ostream : &'a mut Box<Writer>
|
||||
}
|
||||
|
||||
fn accum_words_simple(maxlen: uint, indent_len: uint, indent_str: &str, ostream: &mut Box<Writer>, uniform: bool, l: uint, w: &str) -> uint {
|
||||
let wlen = w.len();
|
||||
let lnew =
|
||||
if l + wlen > maxlen {
|
||||
silent_unwrap!(ostream.write("\n".as_bytes()));
|
||||
silent_unwrap!(ostream.write(indent_str.as_bytes()));
|
||||
indent_len
|
||||
impl<'a> BreakArgs<'a> {
|
||||
#[inline(always)]
|
||||
fn compute_width(&self, pre: uint, post: uint, posn: uint) -> uint {
|
||||
post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn
|
||||
}
|
||||
}
|
||||
|
||||
pub fn break_lines(para: &Paragraph, opts: &FmtOptions, ostream: &mut Box<Writer>) {
|
||||
// indent
|
||||
let pIndent = para.indent_str.as_slice();
|
||||
let pIndentLen = para.indent_len;
|
||||
|
||||
// words
|
||||
let pWords = ParaWords::new(opts, para);
|
||||
let mut pWords_words = pWords.words();
|
||||
|
||||
// the first word will *always* appear on the first line
|
||||
// make sure of this here
|
||||
let (w, w_len) = match pWords_words.next() {
|
||||
Some(winfo) => (winfo.word, winfo.word_nchars),
|
||||
None => {
|
||||
silent_unwrap!(ostream.write_char('\n'));
|
||||
return;
|
||||
}
|
||||
};
|
||||
// print the init, if it exists, and get its length
|
||||
let pInitLen = w_len +
|
||||
if opts.crown || opts.tagged {
|
||||
// handle "init" portion
|
||||
silent_unwrap!(ostream.write(para.init_str.as_bytes()));
|
||||
para.init_len
|
||||
} else if !para.mail_header {
|
||||
// for non-(crown, tagged) that's the same as a normal indent
|
||||
silent_unwrap!(ostream.write(pIndent.as_bytes()));
|
||||
pIndentLen
|
||||
} else {
|
||||
l
|
||||
// except that mail headers get no indent at all
|
||||
0
|
||||
};
|
||||
// write first word after writing init
|
||||
silent_unwrap!(ostream.write(w.as_bytes()));
|
||||
|
||||
// does this paragraph require uniform spacing?
|
||||
let uniform = para.mail_header || opts.uniform;
|
||||
|
||||
let mut break_args = BreakArgs {
|
||||
opts : opts,
|
||||
init_len : pInitLen,
|
||||
indent_str : pIndent,
|
||||
indent_len : pIndentLen,
|
||||
uniform : uniform,
|
||||
ostream : ostream
|
||||
};
|
||||
|
||||
break_simple(&mut pWords_words, &mut break_args);
|
||||
}
|
||||
|
||||
/*
|
||||
* break_simple implements the "tight" breaking algorithm: print words until
|
||||
* maxlength would be exceeded, then print a linebreak and indent and continue.
|
||||
* Note that any first line indent should already have been printed before
|
||||
* calling this function, and the displayed length of said indent passed as
|
||||
* args.init_len
|
||||
*/
|
||||
fn break_simple<'a,T: Iterator<&'a WordInfo<'a>>>(iter: &'a mut T, args: &mut BreakArgs<'a>) {
|
||||
iter.fold((args.init_len, false), |l, winfo| accum_words_simple(args, l, winfo));
|
||||
silent_unwrap!(args.ostream.write_char('\n'));
|
||||
}
|
||||
|
||||
fn accum_words_simple<'a>(args: &mut BreakArgs<'a>, (l, prev_punct): (uint, bool), winfo: &'a WordInfo<'a>) -> (uint, bool) {
|
||||
// compute the length of this word, considering how tabs will expand at this position on the line
|
||||
let wlen = winfo.word_nchars +
|
||||
if winfo.before_tab.is_some() {
|
||||
args.compute_width(winfo.before_tab.unwrap(), winfo.after_tab, l)
|
||||
} else {
|
||||
winfo.after_tab
|
||||
};
|
||||
|
||||
silent_unwrap!(ostream.write(w.as_bytes()));
|
||||
if uniform { silent_unwrap!(ostream.write(" ".as_bytes())); }
|
||||
lnew + wlen + 1
|
||||
let splen =
|
||||
if args.uniform || winfo.new_line {
|
||||
if winfo.sentence_start || (winfo.new_line && prev_punct) { 2 }
|
||||
else { 1 }
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
if l + wlen + splen > args.opts.width {
|
||||
let wtrim = winfo.word.slice_from(winfo.word_start);
|
||||
silent_unwrap!(args.ostream.write_char('\n'));
|
||||
silent_unwrap!(args.ostream.write(args.indent_str.as_bytes()));
|
||||
silent_unwrap!(args.ostream.write(wtrim.as_bytes()));
|
||||
(args.indent_len + wtrim.len(), winfo.ends_punct)
|
||||
} else {
|
||||
if splen == 2 { silent_unwrap!(args.ostream.write(" ".as_bytes())); }
|
||||
else if splen == 1 { silent_unwrap!(args.ostream.write_char(' ')) }
|
||||
silent_unwrap!(args.ostream.write(winfo.word.as_bytes()));
|
||||
(l + wlen + splen, winfo.ends_punct)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
enum PreviousBreak<'a> {
|
||||
ParaStart,
|
||||
PrevBreak(&'a LineBreak<'a>)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct LineBreak<'a> {
|
||||
prev : PreviousBreak<'a>,
|
||||
breakafter : &'a str,
|
||||
demerits : uint
|
||||
}
|
||||
|
||||
// when comparing two LineBreaks, compare their demerits
|
||||
#[allow(dead_code)]
|
||||
impl<'a> PartialEq for LineBreak<'a> {
|
||||
fn eq(&self, other: &LineBreak) -> bool {
|
||||
self.demerits == other.demerits
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE "less than" in this case means "worse", i.e., more demerits
|
||||
#[allow(dead_code)]
|
||||
impl<'a> PartialOrd for LineBreak<'a> {
|
||||
fn lt(&self, other: &LineBreak) -> bool {
|
||||
self.demerits > other.demerits
|
||||
}
|
||||
}
|
||||
|
||||
// we have to satisfy Eq to implement Ord
|
||||
#[allow(dead_code)]
|
||||
impl<'a> Eq for LineBreak<'a> {}
|
||||
|
||||
// NOTE again here we reverse the ordering:
|
||||
// if other has more demerits, self is Greater
|
||||
#[allow(dead_code)]
|
||||
impl<'a> Ord for LineBreak<'a> {
|
||||
fn cmp(&self, other: &LineBreak) -> Ordering {
|
||||
other.demerits.cmp(&self.demerits)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
321
fmt/parasplit.rs
321
fmt/parasplit.rs
|
@ -46,14 +46,13 @@ impl Line {
|
|||
struct FileLine {
|
||||
line : String,
|
||||
indent_end : uint, // the end of the indent, always the start of the text
|
||||
prefix_end : uint, // the end of the PREFIX
|
||||
pfxind_end : uint, // the end of the PREFIX's indent, that is, the spaces before the prefix
|
||||
indent_len : uint, // display length of indent taking into account TABWIDTH
|
||||
pfxind_len : uint, // PREFIX indent length taking into account TABWIDTH
|
||||
indent_len : uint, // display length of indent taking into account tabs
|
||||
prefix_len : uint, // PREFIX indent length taking into account tabs
|
||||
}
|
||||
|
||||
// iterator that produces a stream of Lines from a file
|
||||
struct FileLines<'a> {
|
||||
pub struct FileLines<'a> {
|
||||
opts : &'a FmtOptions,
|
||||
lines : Lines<'a, FileOrStdReader>,
|
||||
}
|
||||
|
@ -99,14 +98,35 @@ impl<'a> FileLines<'a> {
|
|||
(false, 0)
|
||||
}
|
||||
|
||||
fn displayed_length(&self, s: &str) -> uint {
|
||||
s.char_len() + (self.opts.tabwidth - 1) * s.chars().filter(|x| x == &'\t').count()
|
||||
fn compute_indent(&self, string: &str, prefix_end: uint) -> (uint, uint, uint) {
|
||||
let mut prefix_len = 0;
|
||||
let mut indent_len = 0;
|
||||
let mut indent_end = 0;
|
||||
for (os, c) in string.char_indices() {
|
||||
if os == prefix_end {
|
||||
// we found the end of the prefix, so this is the printed length of the prefix here
|
||||
prefix_len = indent_len;
|
||||
}
|
||||
|
||||
if (os >= prefix_end) && !c.is_whitespace() {
|
||||
// found first non-whitespace after prefix, this is indent_end
|
||||
indent_end = os;
|
||||
break;
|
||||
} else if c == '\t' {
|
||||
// compute tab length
|
||||
indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth;
|
||||
} else {
|
||||
// non-tab character
|
||||
indent_len += 1;
|
||||
}
|
||||
}
|
||||
(indent_end, prefix_len, indent_len)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator<Line> for FileLines<'a> {
|
||||
fn next(&mut self) -> Option<Line> {
|
||||
let mut n =
|
||||
let n =
|
||||
match self.lines.next() {
|
||||
Some(t) => match t {
|
||||
Ok(tt) => tt,
|
||||
|
@ -128,79 +148,31 @@ impl<'a> Iterator<Line> for FileLines<'a> {
|
|||
let (pmatch, poffset) = self.match_prefix(n.as_slice());
|
||||
if !pmatch {
|
||||
return Some(NoFormatLine(n, false));
|
||||
} else if n.as_slice().slice_from(poffset + self.opts.prefix.len()).is_whitespace() {
|
||||
// if the line matches the prefix, but is blank after,
|
||||
// don't allow lines to be combined through it (that is,
|
||||
// treat it like a blank line, except that since it's
|
||||
// not truly blank we will not allow mail headers on the
|
||||
// following line)
|
||||
return Some(NoFormatLine(n, false));
|
||||
}
|
||||
|
||||
// if this line matches the anti_prefix
|
||||
// skip if this line matches the anti_prefix
|
||||
// (NOTE definition of match_anti_prefix is TRUE if we should process)
|
||||
if !self.match_anti_prefix(n.as_slice()) {
|
||||
return Some(NoFormatLine(n, false));
|
||||
}
|
||||
|
||||
// replace trailing newline, if any, with space
|
||||
let CharRange {ch, next: i} = n.as_slice().char_range_at_reverse(n.len());
|
||||
if ch == '\n' {
|
||||
unsafe {
|
||||
let nmut = n.as_mut_bytes();
|
||||
nmut[i] = ' ' as u8;
|
||||
}
|
||||
if i > 0 {
|
||||
let CharRange {ch, next: _} = n.as_slice().char_range_at_reverse(i);
|
||||
if ch == '.' {
|
||||
n.push_char(' ');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let nLen = n.len();
|
||||
// figure out the indent, prefix, and prefixindent ending points
|
||||
let (indEnd, pfxEnd, pfxIndEnd) =
|
||||
if self.opts.use_prefix {
|
||||
let pfxEnd = poffset + self.opts.prefix.len();
|
||||
let nSlice = n.as_slice().slice_from(pfxEnd);
|
||||
let nSlice2 = nSlice.trim_left();
|
||||
(pfxEnd + nSlice.len() - nSlice2.len(), pfxEnd, poffset)
|
||||
} else {
|
||||
let nSlice = n.as_slice().trim_left();
|
||||
(nLen - nSlice.len(), 0, 0)
|
||||
};
|
||||
|
||||
// indent length
|
||||
let indLen =
|
||||
if indEnd > 0 {
|
||||
self.displayed_length(n.as_slice().slice(pfxEnd, indEnd))
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
// prefix indent length
|
||||
let pfxIndLen =
|
||||
if pfxIndEnd > 0 {
|
||||
self.displayed_length(n.as_slice().slice_to(pfxIndEnd))
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
// if we are in uniform mode, all tabs after the indent should be replaced by spaces.
|
||||
// NOTE that in this implementation, [?!.]\t is NOT detected as a sentence break, but
|
||||
// [?!.]\t\t is. We could expand tabs to two spaces to force detection of tab as
|
||||
// sentence ending
|
||||
if self.opts.uniform {
|
||||
let tabinds: Vec<uint> = n.as_slice().slice_from(indEnd).char_indices().filter_map(|(i, c)| if c == '\t' { Some(i) } else { None }).collect();
|
||||
unsafe {
|
||||
let nmut = n.as_mut_bytes();
|
||||
for i in tabinds.iter() {
|
||||
nmut[*i] = ' ' as u8;
|
||||
}
|
||||
}
|
||||
}
|
||||
let prefix_end = poffset + self.opts.prefix.len();
|
||||
let (indent_end, prefix_len, indent_len) = self.compute_indent(n.as_slice(), prefix_end);
|
||||
|
||||
Some(FormatLine(FileLine {
|
||||
line : n,
|
||||
indent_end : indEnd,
|
||||
prefix_end : pfxEnd,
|
||||
pfxind_end : pfxIndEnd,
|
||||
indent_len : indLen,
|
||||
pfxind_len : pfxIndLen,
|
||||
indent_end : indent_end,
|
||||
pfxind_end : poffset,
|
||||
indent_len : indent_len,
|
||||
prefix_len : prefix_len
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
@ -211,22 +183,18 @@ impl<'a> Iterator<Line> for FileLines<'a> {
|
|||
// is only there to help us in deciding how to merge lines into Paragraphs
|
||||
#[deriving(Show)]
|
||||
pub struct Paragraph {
|
||||
lines : Vec<String>, // the lines of the file
|
||||
lines : Vec<String>, // the lines of the file
|
||||
pub init_str : String, // string representing the init, that is, the first line's indent
|
||||
pub init_len : uint, // printable length of the init string considering TABWIDTH
|
||||
init_end : uint, // byte location of end of init in first line String
|
||||
init_end : uint, // byte location of end of init in first line String
|
||||
pub indent_str : String, // string representing indent
|
||||
pub indent_len : uint, // length of above
|
||||
indent_end : uint, // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
|
||||
pub pfxind_str : String, // string representing the prefix indent
|
||||
pub pfxind_len : uint, // length of above
|
||||
indent_end : uint, // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
|
||||
pub mail_header : bool // we need to know if this is a mail header because we do word splitting differently in that case
|
||||
}
|
||||
|
||||
// an iterator producing a stream of paragraphs from a stream of lines
|
||||
// given a set of options.
|
||||
// NOTE as you iterate through the paragraphs, any NoFormatLines are
|
||||
// immediately dumped to stdout!
|
||||
pub struct ParagraphStream<'a> {
|
||||
lines : Peekable<Line,FileLines<'a>>,
|
||||
next_mail : bool,
|
||||
|
@ -296,8 +264,8 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
|
|||
let mut indent_str = String::new();
|
||||
let mut indent_end = 0;
|
||||
let mut indent_len = 0;
|
||||
let mut pfxind_str = String::new();
|
||||
let mut pfxind_len = 0;
|
||||
let mut prefix_len = 0;
|
||||
let mut pfxind_end = 0;
|
||||
let mut pLines = Vec::new();
|
||||
|
||||
let mut in_mail = false;
|
||||
|
@ -328,17 +296,23 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
|
|||
} else {
|
||||
if self.opts.crown || self.opts.tagged {
|
||||
init_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
|
||||
init_len = fl.indent_len + fl.pfxind_len + self.opts.prefix_len;
|
||||
init_len = fl.indent_len;
|
||||
init_end = fl.indent_end;
|
||||
}
|
||||
} else {
|
||||
second_done = true;
|
||||
}
|
||||
|
||||
// these will be overwritten in the 2nd line of crown or tagged mode, but
|
||||
// we are not guaranteed to get to the 2nd line, e.g., if the next line
|
||||
// is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
|
||||
indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end, fl.indent_end));
|
||||
indent_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
|
||||
indent_len = fl.indent_len;
|
||||
indent_end = fl.indent_end;
|
||||
|
||||
// save these to check for matching lines
|
||||
prefix_len = fl.prefix_len;
|
||||
pfxind_end = fl.pfxind_end;
|
||||
|
||||
// in tagged mode, add 4 spaces of additional indenting by default
|
||||
// (gnu fmt's behavior is different: it seems to find the closest column to
|
||||
// indent_end that is divisible by 3. But honesly that behavior seems
|
||||
|
@ -348,36 +322,31 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
|
|||
indent_str.push_str(" ");
|
||||
indent_len += 4;
|
||||
}
|
||||
|
||||
if self.opts.use_prefix {
|
||||
pfxind_str.push_str(fl.line.as_slice().slice_to(fl.pfxind_end));
|
||||
pfxind_len = fl.pfxind_len;
|
||||
}
|
||||
}
|
||||
} else if in_mail {
|
||||
// lines following mail headers must begin with spaces
|
||||
if (self.opts.use_prefix && fl.pfxind_end == 0) || (!self.opts.use_prefix && fl.indent_end == 0) {
|
||||
if fl.indent_end == 0 || (self.opts.use_prefix && fl.pfxind_end == 0) {
|
||||
break; // this line does not begin with spaces
|
||||
}
|
||||
} else if !second_done && (self.opts.crown || self.opts.tagged) {
|
||||
} else if !second_done {
|
||||
// now we have enough info to handle crown margin and tagged mode
|
||||
if pfxind_len != fl.pfxind_len {
|
||||
// in both crown and tagged modes we require that pfxind is the same
|
||||
if prefix_len != fl.prefix_len || pfxind_end != fl.pfxind_end {
|
||||
// in both crown and tagged modes we require that prefix_len is the same
|
||||
break;
|
||||
} else if self.opts.tagged && (indent_end == fl.indent_end) {
|
||||
// in tagged mode, indent also has to be different
|
||||
} else if self.opts.tagged && indent_len - 4 == fl.indent_len && indent_end == fl.indent_end {
|
||||
// in tagged mode, indent has to be *different* on following lines
|
||||
break;
|
||||
} else {
|
||||
// this is part of the same paragraph, get the indent info from this line
|
||||
indent_str.clear();
|
||||
indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end, fl.indent_end));
|
||||
indent_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
|
||||
indent_len = fl.indent_len;
|
||||
indent_end = fl.indent_end;
|
||||
}
|
||||
second_done = true;
|
||||
} else {
|
||||
// detect mismatch
|
||||
if (indent_end != fl.indent_end) || (indent_len != fl.indent_len) || (pfxind_len != fl.pfxind_len) {
|
||||
if indent_end != fl.indent_end || pfxind_end != fl.pfxind_end || indent_len != fl.indent_len || prefix_len != fl.prefix_len {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -404,8 +373,6 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
|
|||
indent_str : indent_str,
|
||||
indent_len : indent_len,
|
||||
indent_end : indent_end,
|
||||
pfxind_str : pfxind_str,
|
||||
pfxind_len : pfxind_len,
|
||||
mail_header : in_mail
|
||||
}))
|
||||
}
|
||||
|
@ -414,7 +381,7 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
|
|||
pub struct ParaWords<'a> {
|
||||
opts : &'a FmtOptions,
|
||||
para : &'a Paragraph,
|
||||
words : Vec<&'a str>
|
||||
words : Vec<WordInfo<'a>>
|
||||
}
|
||||
|
||||
impl<'a> ParaWords<'a> {
|
||||
|
@ -429,44 +396,80 @@ impl<'a> ParaWords<'a> {
|
|||
// no extra spacing for mail headers; always exactly 1 space
|
||||
// safe to trim_left on every line of a mail header, since the
|
||||
// first line is guaranteed not to have any spaces
|
||||
self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().trim_left().words()).collect());
|
||||
self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo {
|
||||
word : x,
|
||||
word_start : 0,
|
||||
word_nchars : x.char_len(),
|
||||
before_tab : None,
|
||||
after_tab : 0,
|
||||
sentence_start : false,
|
||||
ends_punct : false,
|
||||
new_line : false
|
||||
}).collect());
|
||||
} else {
|
||||
// first line
|
||||
self.words.push_all_move(
|
||||
if self.opts.crown || self.opts.tagged {
|
||||
// crown and tagged mode has the "init" in the first line, so slice from there
|
||||
WordSplit::new(self.opts.uniform, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
|
||||
WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
|
||||
} else {
|
||||
// otherwise we slice from the indent
|
||||
WordSplit::new(self.opts.uniform, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
|
||||
WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
|
||||
}.collect());
|
||||
|
||||
if self.para.lines.len() > 1 {
|
||||
let indent_end = self.para.indent_end;
|
||||
let uniform = self.opts.uniform;
|
||||
let opts = self.opts;
|
||||
self.words.push_all_move(
|
||||
self.para.lines.iter().skip(1)
|
||||
.flat_map(|x| WordSplit::new(uniform, x.as_slice().slice_from(indent_end)))
|
||||
.flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end)))
|
||||
.collect());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn words(&'a self) -> Items<'a,&'a str> { return self.words.iter() }
|
||||
pub fn words(&'a self) -> Items<'a,WordInfo<'a>> { return self.words.iter() }
|
||||
}
|
||||
|
||||
struct WordSplit<'a> {
|
||||
uniform : bool,
|
||||
string : &'a str,
|
||||
length : uint,
|
||||
position : uint
|
||||
opts : &'a FmtOptions,
|
||||
string : &'a str,
|
||||
length : uint,
|
||||
position : uint,
|
||||
prev_punct : bool
|
||||
}
|
||||
|
||||
impl<'a> WordSplit<'a> {
|
||||
fn new<'a>(uniform: bool, string: &'a str) -> WordSplit<'a> {
|
||||
fn analyze_tabs(&self, string: &str) -> (Option<uint>, uint, Option<uint>) {
|
||||
// given a string, determine (length before tab) and (printed length after first tab)
|
||||
// if there are no tabs, beforetab = -1 and aftertab is the printed length
|
||||
let mut beforetab = None;
|
||||
let mut aftertab = 0;
|
||||
let mut word_start = None;
|
||||
for (os, c) in string.char_indices() {
|
||||
if !c.is_whitespace() {
|
||||
word_start = Some(os);
|
||||
break;
|
||||
} else if c == '\t' {
|
||||
if beforetab == None {
|
||||
beforetab = Some(aftertab);
|
||||
aftertab = 0;
|
||||
} else {
|
||||
aftertab = (aftertab / self.opts.tabwidth + 1) * self.opts.tabwidth;
|
||||
}
|
||||
} else {
|
||||
aftertab += 1;
|
||||
}
|
||||
}
|
||||
(beforetab, aftertab, word_start)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> WordSplit<'a> {
|
||||
fn new<'a>(opts: &'a FmtOptions, string: &'a str) -> WordSplit<'a> {
|
||||
// wordsplits *must* start at a non-whitespace character
|
||||
let trim_string = string.trim_left();
|
||||
WordSplit { uniform: uniform, string: trim_string, length: string.len(), position: 0 }
|
||||
WordSplit { opts: opts, string: trim_string, length: string.len(), position: 0, prev_punct: false }
|
||||
}
|
||||
|
||||
fn is_punctuation(c: char) -> bool {
|
||||
|
@ -477,56 +480,72 @@ impl<'a> WordSplit<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator<&'a str> for WordSplit<'a> {
|
||||
fn next(&mut self) -> Option<&'a str> {
|
||||
pub struct WordInfo<'a> {
|
||||
pub word : &'a str,
|
||||
pub word_start : uint,
|
||||
pub word_nchars : uint,
|
||||
pub before_tab : Option<uint>,
|
||||
pub after_tab : uint,
|
||||
pub sentence_start : bool,
|
||||
pub ends_punct : bool,
|
||||
pub new_line : bool
|
||||
}
|
||||
|
||||
// returns (&str, is_start_of_sentence)
|
||||
impl<'a> Iterator<WordInfo<'a>> for WordSplit<'a> {
|
||||
fn next(&mut self) -> Option<WordInfo<'a>> {
|
||||
if self.position >= self.length {
|
||||
return None
|
||||
}
|
||||
|
||||
let old_position = self.position;
|
||||
let new_line = old_position == 0;
|
||||
|
||||
// find the start of the next whitespace segment
|
||||
let ws_start =
|
||||
match self.string.slice_from(old_position).find(|x: char| x.is_whitespace()) {
|
||||
None => self.length,
|
||||
Some(s) => s + old_position
|
||||
};
|
||||
|
||||
if ws_start == self.length {
|
||||
self.position = self.length;
|
||||
return Some(self.string.slice_from(old_position));
|
||||
}
|
||||
|
||||
// find the end of the next whitespace segment
|
||||
// note that this preserves the invariant that self.position points to
|
||||
// non-whitespace character OR end of string
|
||||
self.position =
|
||||
match self.string.slice_from(ws_start).find(|x: char| !x.is_whitespace()) {
|
||||
None => self.length,
|
||||
Some(s) => s + ws_start
|
||||
};
|
||||
|
||||
let is_sentence_end = match self.string.char_range_at_reverse(ws_start) {
|
||||
CharRange { ch, next: _ } if WordSplit::is_punctuation(ch) => self.position - ws_start > 2,
|
||||
_ => false
|
||||
// find the start of the next word, and record if we find a tab character
|
||||
let (before_tab, after_tab, word_start) = match self.analyze_tabs(self.string.slice_from(old_position)) {
|
||||
(b, a, Some(s)) => (b, a, s + old_position),
|
||||
(_, _, None) => {
|
||||
self.position = self.length;
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
Some(
|
||||
if self.uniform {
|
||||
// if the last non-whitespace character is a [?!.] and
|
||||
// there are two or more spaces, this is the end of a
|
||||
// sentence, so keep one extra space.
|
||||
if is_sentence_end {
|
||||
self.string.slice(old_position, ws_start + 1)
|
||||
} else {
|
||||
self.string.slice(old_position, ws_start)
|
||||
}
|
||||
// find the beginning of the next whitespace
|
||||
// note that this preserves the invariant that self.position
|
||||
// points to whitespace character OR end of string
|
||||
let mut word_nchars = 0;
|
||||
self.position =
|
||||
match self.string.slice_from(word_start)
|
||||
.find(|x: char| if !x.is_whitespace() { word_nchars += 1; false } else { true }) {
|
||||
None => self.length,
|
||||
Some(s) => s + word_start
|
||||
};
|
||||
|
||||
let word_start_relative = word_start - old_position;
|
||||
// if the previous sentence was punctuation and this sentence has >2 whitespace or one tab, is a new sentence.
|
||||
let is_start_of_sentence = self.prev_punct && (before_tab.is_some() || word_start_relative > 1);
|
||||
|
||||
// now record whether this word ends in punctuation
|
||||
self.prev_punct = match self.string.char_range_at_reverse(self.position) {
|
||||
CharRange { ch, next: _ } => WordSplit::is_punctuation(ch)
|
||||
};
|
||||
|
||||
let (word, word_start_relative, before_tab, after_tab) =
|
||||
if self.opts.uniform {
|
||||
(self.string.slice(word_start, self.position), 0, None, 0)
|
||||
} else {
|
||||
// in non-uniform mode, we just keep the whole thing
|
||||
// eventually we will want to annotate where the sentence boundaries are
|
||||
// so that we can give preference to splitting lines appropriately
|
||||
self.string.slice(old_position, self.position)
|
||||
}
|
||||
)
|
||||
(self.string.slice(old_position, self.position), word_start_relative, before_tab, after_tab)
|
||||
};
|
||||
|
||||
Some(WordInfo {
|
||||
word : word,
|
||||
word_start : word_start_relative,
|
||||
word_nchars : word_nchars,
|
||||
before_tab : before_tab,
|
||||
after_tab : after_tab,
|
||||
sentence_start : is_start_of_sentence,
|
||||
ends_punct : self.prev_punct,
|
||||
new_line : new_line
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue