Merge pull request #290 from kwantam/master

fmt: correct tab support, better formatting
2024-12-15 07:42:48 +00:00 · 2014-06-20 16:27:27 -07:00 · 2014-06-20 16:27:27 -07:00 · 32d843f500
commit 32d843f500
parent eb5f199c8f c9ee0a3e4d
3 changed files with 328 additions and 210 deletions
--- a/fmt/fmt.rs
+++ b/fmt/fmt.rs
@ -1,4 +1,4 @@
-#![crate_id(name="fmt", vers="0.0.1", author="kwantam")]
+#![crate_id(name="fmt", vers="0.0.2", author="kwantam")]
 /*
 * This file is part of `fmt` from the uutils coreutils package.
 *
@ -12,20 +12,19 @@

 extern crate core;
 extern crate getopts;
-extern crate libc;

 use std::io::{BufferedReader, BufferedWriter, File, IoResult};
-use std::io::stdio::{stdin_raw, stdout_raw, stdout};
+use std::io::stdio::{stdin_raw, stdout_raw};
 use std::os;
-use linebreak::break_simple;
-use parasplit::{ParagraphStream, ParaWords};
+use linebreak::break_lines;
+use parasplit::ParagraphStream;

 #[macro_export]
 macro_rules! silent_unwrap(
    ($exp:expr) => (
        match $exp {
            Ok(_) => (),
-            Err(_) => unsafe { ::libc::exit(1) }
+            Err(_) => unsafe { ::util::libc::exit(1) }
        }
    )
 )
@ -36,7 +35,7 @@ mod parasplit;

 // program's NAME and VERSION are used for -V and -h
 static NAME: &'static str = "fmt";
-static VERSION: &'static str = "0.0.1";
+static VERSION: &'static str = "0.0.2";

 struct FmtOptions {
    crown           : bool,
@ -46,7 +45,6 @@ struct FmtOptions {
    use_prefix      : bool,
    prefix          : String,
    xprefix         : bool,
-    prefix_len      : uint,
    use_anti_prefix : bool,
    anti_prefix     : String,
    xanti_prefix    : bool,
@ -106,7 +104,6 @@ pub fn uumain(args: Vec<String>) -> int {
        use_prefix      : false,
        prefix          : String::new(),
        xprefix         : false,
-        prefix_len      : 0,
        use_anti_prefix : false,
        anti_prefix     : String::new(),
        xanti_prefix    : false,
@ -127,7 +124,6 @@ pub fn uumain(args: Vec<String>) -> int {
        Some(s) => {
            fmt_opts.prefix = s;
            fmt_opts.use_prefix = true;
-            fmt_opts.prefix_len = fmt_opts.prefix.as_slice().char_len()
        }
        None => ()
    };
@ -206,36 +202,7 @@ pub fn uumain(args: Vec<String>) -> int {
        for paraResult in pStream {
            match paraResult {
                Err(s) => silent_unwrap!(ostream.write(s.as_bytes())),
-                Ok(para) => {
-                    // indent
-                    let pIndent = para.pfxind_str.clone().append(fmt_opts.prefix.as_slice()).append(para.indent_str.as_slice());
-                    let pIndentLen = para.pfxind_len + fmt_opts.prefix_len + para.indent_len;
-
-                    // words
-                    let pWords = ParaWords::new(&fmt_opts, &para);
-                    let mut pWords_words = pWords.words().map(|&x| x);
-
-                    // print the init, if it exists, and get its length
-                    let pInitLen =
-                        if fmt_opts.crown || fmt_opts.tagged {
-                            // handle "init" portion
-                            silent_unwrap!(ostream.write(para.init_str.as_bytes()));
-                            para.init_len
-                        } else if !para.mail_header {
-                            // for non-(crown, tagged) that's the same as a normal indent
-                            silent_unwrap!(ostream.write(pIndent.as_bytes()));
-                            pIndentLen
-                        } else {
-                            // except that mail headers get no indent at all
-                            0
-                        };
-
-                    // does ths paragraph require uniform spacing?
-                    let uniform = para.mail_header || fmt_opts.uniform;
-
-                    break_simple(&mut pWords_words, fmt_opts.width, pIndent.as_slice(), pIndentLen, pInitLen, uniform, &mut ostream);
-                    silent_unwrap!(ostream.write("\n".as_bytes()));
-                }
+                Ok(para) => break_lines(&para, &fmt_opts, &mut ostream)
            }
        }

@ -247,7 +214,9 @@ pub fn uumain(args: Vec<String>) -> int {
 }

 fn print_usage(arg0: &str, opts: &[getopts::OptGroup], errmsg: &str) {
-    break_simple(&mut getopts::short_usage(arg0, opts).as_slice().words(), 64, "       ", 7, 0, true, &mut(box stdout() as Box<Writer>));
+    let short_usage = getopts::short_usage(arg0, opts);
+    println!("{}", short_usage.as_slice().slice_to(60));
+    print!("      {}", short_usage.as_slice().slice_from(60));
    println!("\n\n{}{}", getopts::usage("Reformat paragraphs from input files (or stdin) to stdout.", opts), errmsg);
 }

--- a/fmt/linebreak.rs
+++ b/fmt/linebreak.rs
@ -7,27 +7,157 @@
 * file that was distributed with this source code.
 */

-// break_simple implements the "tight" breaking algorithm: print words until
-// maxlength would be exceeded, then print a linebreak and indent and continue.
-// Note that any first line indent should already have been printed before
-// calling this function, and the length of said indent should be passed as
-// init_len
-pub fn break_simple<'a, T: Iterator<&'a str>>(s: &'a mut T, maxlen: uint, indent_str: &'a str, indent_len: uint, init_len: uint, uniform: bool, ostream: &mut Box<Writer>) -> uint {
-    s.fold(init_len, |l, w| accum_words_simple(maxlen, indent_len, indent_str, ostream, uniform, l, w))
+use FmtOptions;
+use parasplit::{Paragraph, ParaWords, WordInfo};
+
+struct BreakArgs<'a> {
+    opts       : &'a FmtOptions,
+    init_len   : uint,
+    indent_str : &'a str,
+    indent_len : uint,
+    uniform    : bool,
+    ostream    : &'a mut Box<Writer>
 }

-fn accum_words_simple(maxlen: uint, indent_len: uint, indent_str: &str, ostream: &mut Box<Writer>, uniform: bool, l: uint, w: &str) -> uint {
-    let wlen = w.len();
-    let lnew =
-        if l + wlen > maxlen {
-            silent_unwrap!(ostream.write("\n".as_bytes()));
-            silent_unwrap!(ostream.write(indent_str.as_bytes()));
-            indent_len
+impl<'a> BreakArgs<'a> {
+    #[inline(always)]
+    fn compute_width(&self, pre: uint, post: uint, posn: uint) -> uint {
+        post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn
+    }
+}
+
+pub fn break_lines(para: &Paragraph, opts: &FmtOptions, ostream: &mut Box<Writer>) {
+    // indent
+    let pIndent = para.indent_str.as_slice();
+    let pIndentLen = para.indent_len;
+
+    // words
+    let pWords = ParaWords::new(opts, para);
+    let mut pWords_words = pWords.words();
+
+    // the first word will *always* appear on the first line
+    // make sure of this here
+    let (w, w_len) = match pWords_words.next() {
+        Some(winfo) => (winfo.word, winfo.word_nchars),
+        None => {
+            silent_unwrap!(ostream.write_char('\n'));
+            return;
+        }
+    };
+    // print the init, if it exists, and get its length
+    let pInitLen = w_len +
+        if opts.crown || opts.tagged {
+            // handle "init" portion
+            silent_unwrap!(ostream.write(para.init_str.as_bytes()));
+            para.init_len
+        } else if !para.mail_header {
+            // for non-(crown, tagged) that's the same as a normal indent
+            silent_unwrap!(ostream.write(pIndent.as_bytes()));
+            pIndentLen
        } else {
-            l
+            // except that mail headers get no indent at all
+            0
+        };
+    // write first word after writing init
+    silent_unwrap!(ostream.write(w.as_bytes()));
+
+    // does this paragraph require uniform spacing?
+    let uniform = para.mail_header || opts.uniform;
+
+    let mut break_args = BreakArgs {
+        opts       : opts,
+        init_len   : pInitLen,
+        indent_str : pIndent,
+        indent_len : pIndentLen,
+        uniform    : uniform,
+        ostream    : ostream
+    };
+
+    break_simple(&mut pWords_words, &mut break_args);
+}
+
+/*
+ * break_simple implements the "tight" breaking algorithm: print words until
+ * maxlength would be exceeded, then print a linebreak and indent and continue.
+ * Note that any first line indent should already have been printed before
+ * calling this function, and the displayed length of said indent passed as
+ * args.init_len
+ */
+fn break_simple<'a,T: Iterator<&'a WordInfo<'a>>>(iter: &'a mut T, args: &mut BreakArgs<'a>) {
+    iter.fold((args.init_len, false), |l, winfo| accum_words_simple(args, l, winfo));
+    silent_unwrap!(args.ostream.write_char('\n'));
+}
+
+fn accum_words_simple<'a>(args: &mut BreakArgs<'a>, (l, prev_punct): (uint, bool), winfo: &'a WordInfo<'a>) -> (uint, bool) {
+    // compute the length of this word, considering how tabs will expand at this position on the line
+    let wlen = winfo.word_nchars +
+        if winfo.before_tab.is_some() {
+            args.compute_width(winfo.before_tab.unwrap(), winfo.after_tab, l)
+        } else {
+            winfo.after_tab
        };

-    silent_unwrap!(ostream.write(w.as_bytes()));
-    if uniform { silent_unwrap!(ostream.write(" ".as_bytes())); }
-    lnew + wlen + 1
+    let splen =
+        if args.uniform || winfo.new_line {
+            if winfo.sentence_start || (winfo.new_line && prev_punct) { 2 }
+            else { 1 }
+        } else {
+            0
+        };
+
+    if l + wlen + splen > args.opts.width {
+        let wtrim = winfo.word.slice_from(winfo.word_start);
+        silent_unwrap!(args.ostream.write_char('\n'));
+        silent_unwrap!(args.ostream.write(args.indent_str.as_bytes()));
+        silent_unwrap!(args.ostream.write(wtrim.as_bytes()));
+        (args.indent_len + wtrim.len(), winfo.ends_punct)
+    } else {
+        if splen == 2 { silent_unwrap!(args.ostream.write("  ".as_bytes())); }
+        else if splen == 1 { silent_unwrap!(args.ostream.write_char(' ')) }
+        silent_unwrap!(args.ostream.write(winfo.word.as_bytes()));
+        (l + wlen + splen, winfo.ends_punct)
+    }
 }
+
+#[allow(dead_code)]
+enum PreviousBreak<'a> {
+    ParaStart,
+    PrevBreak(&'a LineBreak<'a>)
+}
+
+#[allow(dead_code)]
+struct LineBreak<'a> {
+    prev       : PreviousBreak<'a>,
+    breakafter : &'a str,
+    demerits   : uint
+}
+
+// when comparing two LineBreaks, compare their demerits
+#[allow(dead_code)]
+impl<'a> PartialEq for LineBreak<'a> {
+    fn eq(&self, other: &LineBreak) -> bool {
+        self.demerits == other.demerits
+    }
+}
+
+// NOTE "less than" in this case means "worse", i.e., more demerits
+#[allow(dead_code)]
+impl<'a> PartialOrd for LineBreak<'a> {
+    fn lt(&self, other: &LineBreak) -> bool {
+        self.demerits > other.demerits
+    }
+}
+
+// we have to satisfy Eq to implement Ord
+#[allow(dead_code)]
+impl<'a> Eq for LineBreak<'a> {}
+
+// NOTE again here we reverse the ordering:
+// if other has more demerits, self is Greater
+#[allow(dead_code)]
+impl<'a> Ord for LineBreak<'a> {
+    fn cmp(&self, other: &LineBreak) -> Ordering {
+        other.demerits.cmp(&self.demerits)
+    }
+}
+
--- a/fmt/parasplit.rs
+++ b/fmt/parasplit.rs
@ -46,14 +46,13 @@ impl Line {
 struct FileLine {
    line       : String,
    indent_end : uint,     // the end of the indent, always the start of the text
-    prefix_end : uint,     // the end of the PREFIX
    pfxind_end : uint,     // the end of the PREFIX's indent, that is, the spaces before the prefix
-    indent_len : uint,     // display length of indent taking into account TABWIDTH
-    pfxind_len : uint,     // PREFIX indent length taking into account TABWIDTH
+    indent_len : uint,     // display length of indent taking into account tabs
+    prefix_len : uint,     // display length of the line up to the end of PREFIX, taking into account tabs
 }

 // iterator that produces a stream of Lines from a file
-struct FileLines<'a> {
+pub struct FileLines<'a> {
    opts  : &'a FmtOptions,
    lines : Lines<'a, FileOrStdReader>,
 }
@ -99,14 +98,35 @@ impl<'a> FileLines<'a> {
        (false, 0)
    }

-    fn displayed_length(&self, s: &str) -> uint {
-        s.char_len() + (self.opts.tabwidth - 1) * s.chars().filter(|x| x == &'\t').count()
+    fn compute_indent(&self, string: &str, prefix_end: uint) -> (uint, uint, uint) {
+        let mut prefix_len = 0;
+        let mut indent_len = 0;
+        let mut indent_end = 0;
+        for (os, c) in string.char_indices() {
+            if os == prefix_end {
+                // we found the end of the prefix, so this is the printed length of the prefix here
+                prefix_len = indent_len;
+            }
+
+            if (os >= prefix_end) && !c.is_whitespace() {
+                // found first non-whitespace after prefix, this is indent_end
+                indent_end = os;
+                break;
+            } else if c == '\t' {
+                // compute tab length
+                indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth;
+            } else {
+                // non-tab character
+                indent_len += 1;
+            }
+        }
+        (indent_end, prefix_len, indent_len)
    }
 }

 impl<'a> Iterator<Line> for FileLines<'a> {
    fn next(&mut self) -> Option<Line> {
-        let mut n =
+        let n =
            match self.lines.next() {
                Some(t) => match t {
                    Ok(tt) => tt,
@ -128,79 +148,31 @@ impl<'a> Iterator<Line> for FileLines<'a> {
        let (pmatch, poffset) = self.match_prefix(n.as_slice());
        if !pmatch {
            return Some(NoFormatLine(n, false));
+        } else if n.as_slice().slice_from(poffset + self.opts.prefix.len()).is_whitespace() {
+            // if the line matches the prefix, but is blank after,
+            // don't allow lines to be combined through it (that is,
+            // treat it like a blank line, except that since it's
+            // not truly blank we will not allow mail headers on the
+            // following line)
+            return Some(NoFormatLine(n, false));
        }

-        // if this line matches the anti_prefix
+        // skip if this line matches the anti_prefix
        // (NOTE definition of match_anti_prefix is TRUE if we should process)
        if !self.match_anti_prefix(n.as_slice()) {
            return Some(NoFormatLine(n, false));
        }

-        // replace trailing newline, if any, with space
-        let CharRange {ch, next: i} = n.as_slice().char_range_at_reverse(n.len());
-        if ch == '\n' {
-            unsafe {
-                let nmut = n.as_mut_bytes();
-                nmut[i] = ' ' as u8;
-            }
-            if i > 0 {
-                let CharRange {ch, next: _} = n.as_slice().char_range_at_reverse(i);
-                if ch == '.' {
-                    n.push_char(' ');
-                }
-            }
-        }
-
-        let nLen = n.len();
        // figure out the indent, prefix, and prefixindent ending points
-        let (indEnd, pfxEnd, pfxIndEnd) = 
-            if self.opts.use_prefix {
-                let pfxEnd = poffset + self.opts.prefix.len();
-                let nSlice = n.as_slice().slice_from(pfxEnd);
-                let nSlice2 = nSlice.trim_left();
-                (pfxEnd + nSlice.len() - nSlice2.len(), pfxEnd, poffset)
-            } else {
-                let nSlice = n.as_slice().trim_left();
-                (nLen - nSlice.len(), 0, 0)
-            };
-
-        // indent length
-        let indLen =
-            if indEnd > 0 {
-                self.displayed_length(n.as_slice().slice(pfxEnd, indEnd))
-            } else {
-                0
-            };
-
-        // prefix indent length
-        let pfxIndLen =
-            if pfxIndEnd > 0 {
-                self.displayed_length(n.as_slice().slice_to(pfxIndEnd))
-            } else {
-                0
-            };
-
-        // if we are in uniform mode, all tabs after the indent should be replaced by spaces.
-        // NOTE that in this implementation, [?!.]\t is NOT detected as a sentence break, but
-        // [?!.]\t\t is. We could expand tabs to two spaces to force detection of tab as
-        // sentence ending
-        if self.opts.uniform {
-            let tabinds: Vec<uint> = n.as_slice().slice_from(indEnd).char_indices().filter_map(|(i, c)| if c == '\t' { Some(i) } else { None }).collect();
-            unsafe {
-                let nmut = n.as_mut_bytes();
-                for i in tabinds.iter() {
-                    nmut[*i] = ' ' as u8;
-                }
-            }
-        }
+        let prefix_end = poffset + self.opts.prefix.len();
+        let (indent_end, prefix_len, indent_len) = self.compute_indent(n.as_slice(), prefix_end);

        Some(FormatLine(FileLine {
            line       : n,
-            indent_end : indEnd,
-            prefix_end : pfxEnd,
-            pfxind_end : pfxIndEnd,
-            indent_len : indLen,
-            pfxind_len : pfxIndLen,
+            indent_end : indent_end,
+            pfxind_end : poffset,
+            indent_len : indent_len,
+            prefix_len : prefix_len
        }))
    }
 }
@ -211,22 +183,18 @@ impl<'a> Iterator<Line> for FileLines<'a> {
 // is only there to help us in deciding how to merge lines into Paragraphs
 #[deriving(Show)]
 pub struct Paragraph {
-    lines           : Vec<String>,  // the lines of the file
+        lines       : Vec<String>,  // the lines of the file
    pub init_str    : String,       // string representing the init, that is, the first line's indent
    pub init_len    : uint,         // printable length of the init string considering TABWIDTH
-    init_end        : uint,         // byte location of end of init in first line String
+        init_end    : uint,         // byte location of end of init in first line String
    pub indent_str  : String,       // string representing indent
    pub indent_len  : uint,         // length of above
-    indent_end      : uint,         // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
-    pub pfxind_str  : String,       // string representing the prefix indent
-    pub pfxind_len  : uint,         // length of above
+        indent_end  : uint,         // byte location of end of indent (in crown and tagged mode, only applies to 2nd line and onward)
    pub mail_header : bool          // we need to know if this is a mail header because we do word splitting differently in that case
 }

 // an iterator producing a stream of paragraphs from a stream of lines
 // given a set of options.
-// NOTE as you iterate through the paragraphs, any NoFormatLines are
-// immediately dumped to stdout!
 pub struct ParagraphStream<'a> {
    lines     : Peekable<Line,FileLines<'a>>,
    next_mail : bool,
@ -296,8 +264,8 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
        let mut indent_str = String::new();
        let mut indent_end = 0;
        let mut indent_len = 0;
-        let mut pfxind_str = String::new();
-        let mut pfxind_len = 0;
+        let mut prefix_len = 0;
+        let mut pfxind_end = 0;
        let mut pLines = Vec::new();

        let mut in_mail = false;
@ -328,17 +296,23 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
                    } else {
                        if self.opts.crown || self.opts.tagged {
                            init_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
-                            init_len = fl.indent_len + fl.pfxind_len + self.opts.prefix_len;
+                            init_len = fl.indent_len;
                            init_end = fl.indent_end;
-                        } 
+                        } else {
+                            second_done = true;
+                        }

                        // these will be overwritten in the 2nd line of crown or tagged mode, but
                        // we are not guaranteed to get to the 2nd line, e.g., if the next line
                        // is a NoFormatLine or None. Thus, we set sane defaults the 1st time around
-                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end, fl.indent_end));
+                        indent_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
                        indent_len = fl.indent_len;
                        indent_end = fl.indent_end;

+                        // save these to check for matching lines
+                        prefix_len = fl.prefix_len;
+                        pfxind_end = fl.pfxind_end;
+
                        // in tagged mode, add 4 spaces of additional indenting by default
                        // (gnu fmt's behavior is different: it seems to find the closest column to
                        // indent_end that is divisible by 3. But honestly that behavior seems
@ -348,36 +322,31 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
                            indent_str.push_str("    ");
                            indent_len += 4;
                        }
-
-                        if self.opts.use_prefix {
-                            pfxind_str.push_str(fl.line.as_slice().slice_to(fl.pfxind_end));
-                            pfxind_len = fl.pfxind_len;
-                        }
                    }
                } else if in_mail {
                    // lines following mail headers must begin with spaces
-                    if (self.opts.use_prefix && fl.pfxind_end == 0) || (!self.opts.use_prefix && fl.indent_end == 0) {
+                    if fl.indent_end == 0 || (self.opts.use_prefix && fl.pfxind_end == 0) {
                        break;  // this line does not begin with spaces
                    }
-                } else if !second_done && (self.opts.crown || self.opts.tagged) {
+                } else if !second_done {
                    // now we have enough info to handle crown margin and tagged mode
-                    if pfxind_len != fl.pfxind_len {
-                        // in both crown and tagged modes we require that pfxind is the same
+                    if prefix_len != fl.prefix_len || pfxind_end != fl.pfxind_end {
+                        // in both crown and tagged modes we require that prefix_len is the same
                        break;
-                    } else if self.opts.tagged && (indent_end == fl.indent_end) {
-                        // in tagged mode, indent also has to be different
+                    } else if self.opts.tagged && indent_len - 4 == fl.indent_len && indent_end == fl.indent_end {
+                        // in tagged mode, indent has to be *different* on following lines
                        break;
                    } else {
                        // this is part of the same paragraph, get the indent info from this line
                        indent_str.clear();
-                        indent_str.push_str(fl.line.as_slice().slice(fl.prefix_end, fl.indent_end));
+                        indent_str.push_str(fl.line.as_slice().slice_to(fl.indent_end));
                        indent_len = fl.indent_len;
                        indent_end = fl.indent_end;
                    }
                    second_done = true;
                } else {
                    // detect mismatch
-                    if (indent_end != fl.indent_end) || (indent_len != fl.indent_len) || (pfxind_len != fl.pfxind_len) {
+                    if indent_end != fl.indent_end || pfxind_end != fl.pfxind_end || indent_len != fl.indent_len || prefix_len != fl.prefix_len {
                        break;
                    }
                }
@ -404,8 +373,6 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
            indent_str  : indent_str,
            indent_len  : indent_len,
            indent_end  : indent_end,
-            pfxind_str  : pfxind_str,
-            pfxind_len  : pfxind_len,
            mail_header : in_mail
        }))
    }
@ -414,7 +381,7 @@ impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
 pub struct ParaWords<'a> {
    opts  : &'a FmtOptions,
    para  : &'a Paragraph,
-    words : Vec<&'a str>
+    words : Vec<WordInfo<'a>>
 }

 impl<'a> ParaWords<'a> {
@ -429,44 +396,80 @@ impl<'a> ParaWords<'a> {
            // no extra spacing for mail headers; always exactly 1 space
            // safe to trim_left on every line of a mail header, since the
            // first line is guaranteed not to have any spaces
-            self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().trim_left().words()).collect());
+            self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo {
+                word           : x,
+                word_start     : 0,
+                word_nchars    : x.char_len(),
+                before_tab     : None,
+                after_tab      : 0,
+                sentence_start : false,
+                ends_punct     : false,
+                new_line       : false
+            }).collect());
        } else {
            // first line
            self.words.push_all_move(
                if self.opts.crown || self.opts.tagged {
                    // crown and tagged mode has the "init" in the first line, so slice from there
-                    WordSplit::new(self.opts.uniform, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
+                    WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
                } else {
                    // otherwise we slice from the indent
-                    WordSplit::new(self.opts.uniform, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
+                    WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
                }.collect());

            if self.para.lines.len() > 1 {
                let indent_end = self.para.indent_end;
-                let uniform = self.opts.uniform;
+                let opts = self.opts;
                self.words.push_all_move(
                    self.para.lines.iter().skip(1)
-                    .flat_map(|x| WordSplit::new(uniform, x.as_slice().slice_from(indent_end)))
+                    .flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end)))
                    .collect());
            }
        }
    }

-    pub fn words(&'a self) -> Items<'a,&'a str> { return self.words.iter() }
+    pub fn words(&'a self) -> Items<'a,WordInfo<'a>> { return self.words.iter() }
 }

 struct WordSplit<'a> {
-    uniform  : bool,
-    string   : &'a str,
-    length   : uint,
-    position : uint
+    opts       : &'a FmtOptions,
+    string     : &'a str,
+    length     : uint,
+    position   : uint,
+    prev_punct : bool
 }

 impl<'a> WordSplit<'a> {
-    fn new<'a>(uniform: bool, string: &'a str) -> WordSplit<'a> {
+    fn analyze_tabs(&self, string: &str) -> (Option<uint>, uint, Option<uint>) {
+        // given a string, determine (length before tab) and (printed length after first tab)
+        // if there are no tabs, beforetab is None and aftertab is the printed length; also returns the byte offset of the first non-whitespace char, if any
+        let mut beforetab = None;
+        let mut aftertab = 0;
+        let mut word_start = None;
+        for (os, c) in string.char_indices() {
+            if !c.is_whitespace() {
+                word_start = Some(os);
+                break;
+            } else if c == '\t' {
+                if beforetab == None {
+                    beforetab = Some(aftertab);
+                    aftertab = 0;
+                } else {
+                    aftertab = (aftertab / self.opts.tabwidth + 1) * self.opts.tabwidth;
+                }
+            } else {
+                aftertab += 1;
+            }
+        }
+        (beforetab, aftertab, word_start)
+    }
+}
+
+impl<'a> WordSplit<'a> {
+    fn new<'a>(opts: &'a FmtOptions, string: &'a str) -> WordSplit<'a> {
        // wordsplits *must* start at a non-whitespace character
        let trim_string = string.trim_left();
-        WordSplit { uniform: uniform, string: trim_string, length: string.len(), position: 0 }
+        WordSplit { opts: opts, string: trim_string, length: string.len(), position: 0, prev_punct: false }
    }

    fn is_punctuation(c: char) -> bool {
@ -477,56 +480,72 @@ impl<'a> WordSplit<'a> {
    }
 }

-impl<'a> Iterator<&'a str> for WordSplit<'a> {
-    fn next(&mut self) -> Option<&'a str> {
+pub struct WordInfo<'a> {
+    pub word           : &'a str,
+    pub word_start     : uint,
+    pub word_nchars    : uint,
+    pub before_tab     : Option<uint>,
+    pub after_tab      : uint,
+    pub sentence_start : bool,
+    pub ends_punct     : bool,
+    pub new_line       : bool
+}
+
+// yields one WordInfo per word: the word slice plus its spacing, tab, and sentence-boundary metadata
+impl<'a> Iterator<WordInfo<'a>> for WordSplit<'a> {
+    fn next(&mut self) -> Option<WordInfo<'a>> {
        if self.position >= self.length {
            return None
        }

        let old_position = self.position;
+        let new_line = old_position == 0;

-        // find the start of the next whitespace segment
-        let ws_start =
-            match self.string.slice_from(old_position).find(|x: char| x.is_whitespace()) {
-                None => self.length,
-                Some(s) => s + old_position
-            };
-
-        if ws_start == self.length {
-            self.position = self.length;
-            return Some(self.string.slice_from(old_position));
-        }
-
-        // find the end of the next whitespace segment
-        // note that this preserves the invariant that self.position points to
-        // non-whitespace character OR end of string
-        self.position =
-            match self.string.slice_from(ws_start).find(|x: char| !x.is_whitespace()) {
-                None => self.length,
-                Some(s) => s + ws_start
-            };
-
-        let is_sentence_end = match self.string.char_range_at_reverse(ws_start) {
-            CharRange { ch, next: _ } if WordSplit::is_punctuation(ch) => self.position - ws_start > 2,
-            _ => false
+        // find the start of the next word, and record if we find a tab character
+        let (before_tab, after_tab, word_start) = match self.analyze_tabs(self.string.slice_from(old_position)) {
+            (b, a, Some(s)) => (b, a, s + old_position),
+            (_, _, None) => {
+                self.position = self.length;
+                return None;
+            }
        };

-        Some(
-            if self.uniform {
-                // if the last non-whitespace character is a [?!.] and
-                // there are two or more spaces, this is the end of a
-                // sentence, so keep one extra space.
-                if is_sentence_end {
-                    self.string.slice(old_position, ws_start + 1)
-                } else {
-                    self.string.slice(old_position, ws_start)
-                }
+        // find the beginning of the next whitespace
+        // note that this preserves the invariant that self.position
+        // points to whitespace character OR end of string
+        let mut word_nchars = 0;
+        self.position =
+            match self.string.slice_from(word_start)
+            .find(|x: char| if !x.is_whitespace() { word_nchars += 1; false } else { true }) {
+                None => self.length,
+                Some(s) => s + word_start
+            };
+
+        let word_start_relative = word_start - old_position;
+        // if the previous word ended in punctuation and this word is preceded by a tab or by more than one byte of whitespace, it starts a new sentence.
+        let is_start_of_sentence = self.prev_punct && (before_tab.is_some() || word_start_relative > 1);
+
+        // now record whether this word ends in punctuation
+        self.prev_punct = match self.string.char_range_at_reverse(self.position) {
+            CharRange { ch, next: _ } => WordSplit::is_punctuation(ch)
+        };
+
+        let (word, word_start_relative, before_tab, after_tab) =
+            if self.opts.uniform {
+                (self.string.slice(word_start, self.position), 0, None, 0)
            } else {
-                // in non-uniform mode, we just keep the whole thing
-                // eventually we will want to annotate where the sentence boundaries are
-                // so that we can give preference to splitting lines appropriately
-                self.string.slice(old_position, self.position)
-            }
-        )
+                (self.string.slice(old_position, self.position), word_start_relative, before_tab, after_tab)
+            };
+
+        Some(WordInfo {
+            word           : word,
+            word_start     : word_start_relative,
+            word_nchars    : word_nchars,
+            before_tab     : before_tab,
+            after_tab      : after_tab,
+            sentence_start : is_start_of_sentence,
+            ends_punct     : self.prev_punct,
+            new_line       : new_line
+        })
    }
 }