From 8be67f7d4d3cb4a67cfa53cce8c446b78efeb8f8 Mon Sep 17 00:00:00 2001
From: kwantam
Date: Wed, 25 Jun 2014 23:52:28 -0400
Subject: [PATCH] fmt Knuth-Plass implementation; unicode char_width

fmt:
- Implemented the Knuth-Plass optimal linebreaking strategy.
- Added command-line switch -q for "quick" (greedy) split mode that does not
  use Knuth-Plass.
- Right now, Knuth-Plass runs about half as fast as the greedy mode and uses
  more memory.
- Updated fmt to use char_width (see below) instead of assuming that every
  character is one column wide.
- Use i64 for demerits instead of int in K-P, since int is pointer sized and
  will only be 32 bits on some architectures.
- Incremented the version number.
- Incorporated improvements suggested by huonw and Arcterus.
- K-P uses indices into the linebreaks vector instead of raw pointers. This
  gets rid of a lot of box allocation and improves safety to boot.
- Added a support module for computing the displayed widths of Unicode
  strings, based on Markus Kuhn's free implementation at
  http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
- This lives in `charwidth.rs`, but only as a temporary measure until the
  Char trait implements .width(). I am submitting a PR for this soon, and the
  code in charwidth.rs is what is generated for libcore.

closes #223
---
 fmt/charwidth.rs | 170 ++++++++++++++++++++
 fmt/fmt.rs       |  25 +--
 fmt/linebreak.rs | 411 ++++++++++++++++++++++++++++++++++++++++-------
 fmt/parasplit.rs |  43 +++--
 4 files changed, 563 insertions(+), 86 deletions(-)
 create mode 100644 fmt/charwidth.rs

diff --git a/fmt/charwidth.rs b/fmt/charwidth.rs
new file mode 100644
index 000000000..d135fd841
--- /dev/null
+++ b/fmt/charwidth.rs
@@ -0,0 +1,170 @@
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+fn bsearch_range_value_table(c: char, r: &'static [(char, char, uint)]) -> uint {
+    match r.bsearch(|&(lo, hi, _)| {
+        if lo <= c && c <= hi { Equal }
+        else if hi < c { Less }
+        else { Greater }
+    }) {
+        Some(idx) => {
+            let (_, _, result) = r[idx];
+            result
+        }
+        None => 1
+    }
+}
+
+pub fn width(c: char) -> Option<uint> {
+    match c as uint {
+        _c @ 0          => Some(0),  // null is zero width
+        cu if cu < 0x20 => None,     // control sequences have no width
+        cu if cu < 0x7F => Some(1),  // ASCII
+        cu if cu < 0xA0 => None,     // more control sequences
+        _               => Some(bsearch_range_value_table(c, charwidth_table))
+    }
+}
+
+// character width table.
Based on Markus Kuhn's free wcwidth() implementation, +// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c +static charwidth_table : &'static [(char, char, uint)] = &[ + ('\u0300', '\u036f', 0), ('\u0483', '\u0487', 0), ('\u0488', '\u0489', 0), ('\u0591', + '\u05bd', 0), ('\u05bf', '\u05bf', 0), ('\u05c1', '\u05c2', 0), ('\u05c4', '\u05c5', 0), + ('\u05c7', '\u05c7', 0), ('\u0600', '\u0605', 0), ('\u0610', '\u061a', 0), ('\u061c', + '\u061c', 0), ('\u064b', '\u065f', 0), ('\u0670', '\u0670', 0), ('\u06d6', '\u06dc', 0), + ('\u06dd', '\u06dd', 0), ('\u06df', '\u06e4', 0), ('\u06e7', '\u06e8', 0), ('\u06ea', + '\u06ed', 0), ('\u070f', '\u070f', 0), ('\u0711', '\u0711', 0), ('\u0730', '\u074a', 0), + ('\u07a6', '\u07b0', 0), ('\u07eb', '\u07f3', 0), ('\u0816', '\u0819', 0), ('\u081b', + '\u0823', 0), ('\u0825', '\u0827', 0), ('\u0829', '\u082d', 0), ('\u0859', '\u085b', 0), + ('\u08e4', '\u0902', 0), ('\u093a', '\u093a', 0), ('\u093c', '\u093c', 0), ('\u0941', + '\u0948', 0), ('\u094d', '\u094d', 0), ('\u0951', '\u0957', 0), ('\u0962', '\u0963', 0), + ('\u0981', '\u0981', 0), ('\u09bc', '\u09bc', 0), ('\u09c1', '\u09c4', 0), ('\u09cd', + '\u09cd', 0), ('\u09e2', '\u09e3', 0), ('\u0a01', '\u0a02', 0), ('\u0a3c', '\u0a3c', 0), + ('\u0a41', '\u0a51', 0), ('\u0a70', '\u0a71', 0), ('\u0a75', '\u0a82', 0), ('\u0abc', + '\u0abc', 0), ('\u0ac1', '\u0ac8', 0), ('\u0acd', '\u0acd', 0), ('\u0ae2', '\u0ae3', 0), + ('\u0b01', '\u0b01', 0), ('\u0b3c', '\u0b3c', 0), ('\u0b3f', '\u0b3f', 0), ('\u0b41', + '\u0b44', 0), ('\u0b4d', '\u0b56', 0), ('\u0b62', '\u0b63', 0), ('\u0b82', '\u0b82', 0), + ('\u0bc0', '\u0bc0', 0), ('\u0bcd', '\u0bcd', 0), ('\u0c00', '\u0c00', 0), ('\u0c3e', + '\u0c40', 0), ('\u0c46', '\u0c56', 0), ('\u0c62', '\u0c63', 0), ('\u0c81', '\u0c81', 0), + ('\u0cbc', '\u0cbc', 0), ('\u0cbf', '\u0cbf', 0), ('\u0cc6', '\u0cc6', 0), ('\u0ccc', + '\u0ccd', 0), ('\u0ce2', '\u0ce3', 0), ('\u0d01', '\u0d01', 0), ('\u0d41', '\u0d44', 0), + ('\u0d4d', '\u0d4d', 0), ('\u0d62', '\u0d63', 0), ('\u0dca', '\u0dca', 0), ('\u0dd2', + '\u0dd6', 0), ('\u0e31', '\u0e31', 0), ('\u0e34', '\u0e3a', 0), ('\u0e47', '\u0e4e', 0), + ('\u0eb1', '\u0eb1', 0), ('\u0eb4', '\u0ebc', 0), ('\u0ec8', '\u0ecd', 0), ('\u0f18', + '\u0f19', 0), ('\u0f35', '\u0f35', 0), ('\u0f37', '\u0f37', 0), ('\u0f39', '\u0f39', 0), + ('\u0f71', '\u0f7e', 0), ('\u0f80', '\u0f84', 0), ('\u0f86', '\u0f87', 0), ('\u0f8d', + '\u0fbc', 0), ('\u0fc6', '\u0fc6', 0), ('\u102d', '\u1030', 0), ('\u1032', '\u1037', 0), + ('\u1039', '\u103a', 0), ('\u103d', '\u103e', 0), ('\u1058', '\u1059', 0), ('\u105e', + '\u1060', 0), ('\u1071', '\u1074', 0), ('\u1082', '\u1082', 0), ('\u1085', '\u1086', 0), + ('\u108d', '\u108d', 0), ('\u109d', '\u109d', 0), ('\u1100', '\u115f', 2), ('\u1160', + '\u11ff', 0), ('\u135d', '\u135f', 0), ('\u1712', '\u1714', 0), ('\u1732', '\u1734', 0), + ('\u1752', '\u1753', 0), ('\u1772', '\u1773', 0), ('\u17b4', '\u17b5', 0), ('\u17b7', + '\u17bd', 0), ('\u17c6', '\u17c6', 0), ('\u17c9', '\u17d3', 0), ('\u17dd', '\u17dd', 0), + ('\u180b', '\u180d', 0), ('\u180e', '\u180e', 0), ('\u18a9', '\u18a9', 0), ('\u1920', + '\u1922', 0), ('\u1927', '\u1928', 0), ('\u1932', '\u1932', 0), ('\u1939', '\u193b', 0), + ('\u1a17', '\u1a18', 0), ('\u1a1b', '\u1a1b', 0), ('\u1a56', '\u1a56', 0), ('\u1a58', + '\u1a60', 0), ('\u1a62', '\u1a62', 0), ('\u1a65', '\u1a6c', 0), ('\u1a73', '\u1a7f', 0), + ('\u1ab0', '\u1abd', 0), ('\u1abe', '\u1abe', 0), ('\u1b00', '\u1b03', 0), ('\u1b34', + '\u1b34', 0), ('\u1b36', '\u1b3a', 0), ('\u1b3c', '\u1b3c', 0), ('\u1b42', 
'\u1b42', 0), + ('\u1b6b', '\u1b73', 0), ('\u1b80', '\u1b81', 0), ('\u1ba2', '\u1ba5', 0), ('\u1ba8', + '\u1ba9', 0), ('\u1bab', '\u1bad', 0), ('\u1be6', '\u1be6', 0), ('\u1be8', '\u1be9', 0), + ('\u1bed', '\u1bed', 0), ('\u1bef', '\u1bf1', 0), ('\u1c2c', '\u1c33', 0), ('\u1c36', + '\u1c37', 0), ('\u1cd0', '\u1cd2', 0), ('\u1cd4', '\u1ce0', 0), ('\u1ce2', '\u1ce8', 0), + ('\u1ced', '\u1ced', 0), ('\u1cf4', '\u1cf4', 0), ('\u1cf8', '\u1cf9', 0), ('\u1dc0', + '\u1dff', 0), ('\u200b', '\u200f', 0), ('\u202a', '\u202e', 0), ('\u2060', '\u206f', 0), + ('\u20d0', '\u20dc', 0), ('\u20dd', '\u20e0', 0), ('\u20e1', '\u20e1', 0), ('\u20e2', + '\u20e4', 0), ('\u20e5', '\u20f0', 0), ('\u2329', '\u2329', 2), ('\u232a', '\u232a', 2), + ('\u2cef', '\u2cf1', 0), ('\u2d7f', '\u2d7f', 0), ('\u2de0', '\u2dff', 0), ('\u2e80', + '\u2e99', 2), ('\u2e9b', '\u2ef3', 2), ('\u2f00', '\u2fd5', 2), ('\u2ff0', '\u2ffb', 2), + ('\u3000', '\u3000', 2), ('\u3001', '\u3003', 2), ('\u3004', '\u3004', 2), ('\u3005', + '\u3005', 2), ('\u3006', '\u3006', 2), ('\u3007', '\u3007', 2), ('\u3008', '\u3008', 2), + ('\u3009', '\u3009', 2), ('\u300a', '\u300a', 2), ('\u300b', '\u300b', 2), ('\u300c', + '\u300c', 2), ('\u300d', '\u300d', 2), ('\u300e', '\u300e', 2), ('\u300f', '\u300f', 2), + ('\u3010', '\u3010', 2), ('\u3011', '\u3011', 2), ('\u3012', '\u3013', 2), ('\u3014', + '\u3014', 2), ('\u3015', '\u3015', 2), ('\u3016', '\u3016', 2), ('\u3017', '\u3017', 2), + ('\u3018', '\u3018', 2), ('\u3019', '\u3019', 2), ('\u301a', '\u301a', 2), ('\u301b', + '\u301b', 2), ('\u301c', '\u301c', 2), ('\u301d', '\u301d', 2), ('\u301e', '\u301f', 2), + ('\u3020', '\u3020', 2), ('\u3021', '\u3029', 2), ('\u302a', '\u302d', 0), ('\u302a', + '\u302d', 2), ('\u302e', '\u302f', 2), ('\u3030', '\u3030', 2), ('\u3031', '\u3035', 2), + ('\u3036', '\u3037', 2), ('\u3038', '\u303a', 2), ('\u303b', '\u303b', 2), ('\u303c', + '\u303c', 2), ('\u303d', '\u303d', 2), ('\u303e', '\u303e', 2), ('\u3041', '\u3096', 2), + ('\u3099', '\u309a', 0), ('\u3099', '\u309a', 2), ('\u309b', '\u309c', 2), ('\u309d', + '\u309e', 2), ('\u309f', '\u309f', 2), ('\u30a0', '\u30a0', 2), ('\u30a1', '\u30fa', 2), + ('\u30fb', '\u30fb', 2), ('\u30fc', '\u30fe', 2), ('\u30ff', '\u30ff', 2), ('\u3105', + '\u312d', 2), ('\u3131', '\u318e', 2), ('\u3190', '\u3191', 2), ('\u3192', '\u3195', 2), + ('\u3196', '\u319f', 2), ('\u31a0', '\u31ba', 2), ('\u31c0', '\u31e3', 2), ('\u31f0', + '\u31ff', 2), ('\u3200', '\u321e', 2), ('\u3220', '\u3229', 2), ('\u322a', '\u3247', 2), + ('\u3250', '\u3250', 2), ('\u3251', '\u325f', 2), ('\u3260', '\u327f', 2), ('\u3280', + '\u3289', 2), ('\u328a', '\u32b0', 2), ('\u32b1', '\u32bf', 2), ('\u32c0', '\u32fe', 2), + ('\u3300', '\u33ff', 2), ('\u3400', '\u4db5', 2), ('\u4db6', '\u4dbf', 2), ('\u4e00', + '\u9fcc', 2), ('\u9fcd', '\u9fff', 2), ('\ua000', '\ua014', 2), ('\ua015', '\ua015', 2), + ('\ua016', '\ua48c', 2), ('\ua490', '\ua4c6', 2), ('\ua66f', '\ua66f', 0), ('\ua670', + '\ua672', 0), ('\ua674', '\ua67d', 0), ('\ua69f', '\ua69f', 0), ('\ua6f0', '\ua6f1', 0), + ('\ua802', '\ua802', 0), ('\ua806', '\ua806', 0), ('\ua80b', '\ua80b', 0), ('\ua825', + '\ua826', 0), ('\ua8c4', '\ua8c4', 0), ('\ua8e0', '\ua8f1', 0), ('\ua926', '\ua92d', 0), + ('\ua947', '\ua951', 0), ('\ua960', '\ua97c', 2), ('\ua980', '\ua982', 0), ('\ua9b3', + '\ua9b3', 0), ('\ua9b6', '\ua9b9', 0), ('\ua9bc', '\ua9bc', 0), ('\ua9e5', '\ua9e5', 0), + ('\uaa29', '\uaa2e', 0), ('\uaa31', '\uaa32', 0), ('\uaa35', '\uaa36', 0), ('\uaa43', + '\uaa43', 0), ('\uaa4c', '\uaa4c', 0), ('\uaa7c', 
'\uaa7c', 0), ('\uaab0', '\uaab0', 0), + ('\uaab2', '\uaab4', 0), ('\uaab7', '\uaab8', 0), ('\uaabe', '\uaabf', 0), ('\uaac1', + '\uaac1', 0), ('\uaaec', '\uaaed', 0), ('\uaaf6', '\uaaf6', 0), ('\uabe5', '\uabe5', 0), + ('\uabe8', '\uabe8', 0), ('\uabed', '\uabed', 0), ('\uac00', '\ud7a3', 2), ('\uf900', + '\ufa6d', 2), ('\ufa6e', '\ufa6f', 2), ('\ufa70', '\ufad9', 2), ('\ufada', '\ufaff', 2), + ('\ufb1e', '\ufb1e', 0), ('\ufe00', '\ufe0f', 0), ('\ufe10', '\ufe16', 2), ('\ufe17', + '\ufe17', 2), ('\ufe18', '\ufe18', 2), ('\ufe19', '\ufe19', 2), ('\ufe20', '\ufe2d', 0), + ('\ufe30', '\ufe30', 2), ('\ufe31', '\ufe32', 2), ('\ufe33', '\ufe34', 2), ('\ufe35', + '\ufe35', 2), ('\ufe36', '\ufe36', 2), ('\ufe37', '\ufe37', 2), ('\ufe38', '\ufe38', 2), + ('\ufe39', '\ufe39', 2), ('\ufe3a', '\ufe3a', 2), ('\ufe3b', '\ufe3b', 2), ('\ufe3c', + '\ufe3c', 2), ('\ufe3d', '\ufe3d', 2), ('\ufe3e', '\ufe3e', 2), ('\ufe3f', '\ufe3f', 2), + ('\ufe40', '\ufe40', 2), ('\ufe41', '\ufe41', 2), ('\ufe42', '\ufe42', 2), ('\ufe43', + '\ufe43', 2), ('\ufe44', '\ufe44', 2), ('\ufe45', '\ufe46', 2), ('\ufe47', '\ufe47', 2), + ('\ufe48', '\ufe48', 2), ('\ufe49', '\ufe4c', 2), ('\ufe4d', '\ufe4f', 2), ('\ufe50', + '\ufe52', 2), ('\ufe54', '\ufe57', 2), ('\ufe58', '\ufe58', 2), ('\ufe59', '\ufe59', 2), + ('\ufe5a', '\ufe5a', 2), ('\ufe5b', '\ufe5b', 2), ('\ufe5c', '\ufe5c', 2), ('\ufe5d', + '\ufe5d', 2), ('\ufe5e', '\ufe5e', 2), ('\ufe5f', '\ufe61', 2), ('\ufe62', '\ufe62', 2), + ('\ufe63', '\ufe63', 2), ('\ufe64', '\ufe66', 2), ('\ufe68', '\ufe68', 2), ('\ufe69', + '\ufe69', 2), ('\ufe6a', '\ufe6b', 2), ('\ufeff', '\ufeff', 0), ('\uff01', '\uff03', 2), + ('\uff04', '\uff04', 2), ('\uff05', '\uff07', 2), ('\uff08', '\uff08', 2), ('\uff09', + '\uff09', 2), ('\uff0a', '\uff0a', 2), ('\uff0b', '\uff0b', 2), ('\uff0c', '\uff0c', 2), + ('\uff0d', '\uff0d', 2), ('\uff0e', '\uff0f', 2), ('\uff10', '\uff19', 2), ('\uff1a', + '\uff1b', 2), ('\uff1c', '\uff1e', 2), ('\uff1f', '\uff20', 2), ('\uff21', '\uff3a', 2), + ('\uff3b', '\uff3b', 2), ('\uff3c', '\uff3c', 2), ('\uff3d', '\uff3d', 2), ('\uff3e', + '\uff3e', 2), ('\uff3f', '\uff3f', 2), ('\uff40', '\uff40', 2), ('\uff41', '\uff5a', 2), + ('\uff5b', '\uff5b', 2), ('\uff5c', '\uff5c', 2), ('\uff5d', '\uff5d', 2), ('\uff5e', + '\uff5e', 2), ('\uff5f', '\uff5f', 2), ('\uff60', '\uff60', 2), ('\uffe0', '\uffe1', 2), + ('\uffe2', '\uffe2', 2), ('\uffe3', '\uffe3', 2), ('\uffe4', '\uffe4', 2), ('\uffe5', + '\uffe6', 2), ('\ufff9', '\ufffb', 0), ('\U000101fd', '\U000101fd', 0), ('\U000102e0', + '\U000102e0', 0), ('\U00010376', '\U0001037a', 0), ('\U00010a01', '\U00010a0f', 0), + ('\U00010a38', '\U00010a3f', 0), ('\U00010ae5', '\U00010ae6', 0), ('\U00011001', + '\U00011001', 0), ('\U00011038', '\U00011046', 0), ('\U0001107f', '\U00011081', 0), + ('\U000110b3', '\U000110b6', 0), ('\U000110b9', '\U000110ba', 0), ('\U000110bd', + '\U000110bd', 0), ('\U00011100', '\U00011102', 0), ('\U00011127', '\U0001112b', 0), + ('\U0001112d', '\U00011134', 0), ('\U00011173', '\U00011173', 0), ('\U00011180', + '\U00011181', 0), ('\U000111b6', '\U000111be', 0), ('\U0001122f', '\U00011231', 0), + ('\U00011234', '\U00011234', 0), ('\U00011236', '\U00011237', 0), ('\U000112df', + '\U000112df', 0), ('\U000112e3', '\U000112ea', 0), ('\U00011301', '\U00011301', 0), + ('\U0001133c', '\U0001133c', 0), ('\U00011340', '\U00011340', 0), ('\U00011366', + '\U00011374', 0), ('\U000114b3', '\U000114b8', 0), ('\U000114ba', '\U000114ba', 0), + ('\U000114bf', '\U000114c0', 0), ('\U000114c2', '\U000114c3', 0), 
('\U000115b2', + '\U000115b5', 0), ('\U000115bc', '\U000115bd', 0), ('\U000115bf', '\U000115c0', 0), + ('\U00011633', '\U0001163a', 0), ('\U0001163d', '\U0001163d', 0), ('\U0001163f', + '\U00011640', 0), ('\U000116ab', '\U000116ab', 0), ('\U000116ad', '\U000116ad', 0), + ('\U000116b0', '\U000116b5', 0), ('\U000116b7', '\U000116b7', 0), ('\U00016af0', + '\U00016af4', 0), ('\U00016b30', '\U00016b36', 0), ('\U00016f8f', '\U00016f92', 0), + ('\U0001b000', '\U0001b001', 2), ('\U0001bc9d', '\U0001bc9e', 0), ('\U0001bca0', + '\U0001bca3', 0), ('\U0001d167', '\U0001d169', 0), ('\U0001d173', '\U0001d17a', 0), + ('\U0001d17b', '\U0001d182', 0), ('\U0001d185', '\U0001d18b', 0), ('\U0001d1aa', + '\U0001d1ad', 0), ('\U0001d242', '\U0001d244', 0), ('\U0001e8d0', '\U0001e8d6', 0), + ('\U0001f200', '\U0001f202', 2), ('\U0001f210', '\U0001f23a', 2), ('\U0001f240', + '\U0001f248', 2), ('\U0001f250', '\U0001f251', 2), ('\U00020000', '\U0002a6d6', 2), + ('\U0002a6d7', '\U0002a6ff', 2), ('\U0002a700', '\U0002b734', 2), ('\U0002b735', + '\U0002b73f', 2), ('\U0002b740', '\U0002b81d', 2), ('\U0002b81e', '\U0002f7ff', 2), + ('\U0002f800', '\U0002fa1d', 2), ('\U0002fa1e', '\U0002fffd', 2), ('\U00030000', + '\U0003fffd', 2), ('\U000e0001', '\U000e007f', 0), ('\U000e0100', '\U000e01ef', 0) +]; diff --git a/fmt/fmt.rs b/fmt/fmt.rs index f4e6f64fb..2c76f219e 100644 --- a/fmt/fmt.rs +++ b/fmt/fmt.rs @@ -1,4 +1,4 @@ -#![crate_id(name="fmt", vers="0.0.2", author="kwantam")] +#![crate_id(name="fmt", vers="0.0.3", author="kwantam")] /* * This file is part of `fmt` from the uutils coreutils package. * @@ -13,6 +13,7 @@ extern crate core; extern crate getopts; +use std::cmp; use std::io::{BufferedReader, BufferedWriter, File, IoResult}; use std::io::stdio::{stdin_raw, stdout_raw}; use linebreak::break_lines; @@ -31,10 +32,11 @@ macro_rules! silent_unwrap( mod util; mod linebreak; mod parasplit; +mod charwidth; // program's NAME and VERSION are used for -V and -h static NAME: &'static str = "fmt"; -static VERSION: &'static str = "0.0.2"; +static VERSION: &'static str = "0.0.3"; struct FmtOptions { crown : bool, @@ -48,6 +50,7 @@ struct FmtOptions { anti_prefix : String, xanti_prefix : bool, uniform : bool, + quick : bool, width : uint, goal : uint, tabwidth : uint, @@ -68,8 +71,10 @@ pub fn uumain(args: Vec) -> int { getopts::optflag("x", "exact-prefix", "PREFIX must match at the beginning of the line with no preceding whitespace."), getopts::optflag("X", "exact-skip-prefix", "PSKIP must match at the beginning of the line with no preceding whitespace."), - getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 78.", "WIDTH"), - getopts::optopt("g", "goal", "Goal width, default ~0.92*WIDTH. Must be less than WIDTH.", "GOAL"), + getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 79.", "WIDTH"), + getopts::optopt("g", "goal", "Goal width, default ~0.94*WIDTH. Must be less than WIDTH.", "GOAL"), + + getopts::optflag("q", "quick", "Break lines more quickly at the expense of a potentially more ragged appearance."), getopts::optopt("T", "tab-width", "Treat tabs as TABWIDTH spaces for determining line length, default 8. 
Note that this is used only for calculating line lengths; tabs are preserved in the output.", "TABWIDTH"), @@ -96,6 +101,7 @@ pub fn uumain(args: Vec) -> int { tagged : false, mail : false, uniform : false, + quick : false, split_only : false, use_prefix : false, prefix : String::new(), @@ -103,8 +109,8 @@ pub fn uumain(args: Vec) -> int { use_anti_prefix : false, anti_prefix : String::new(), xanti_prefix : false, - width : 78, - goal : 72, + width : 79, + goal : 74, tabwidth : 8, }; @@ -112,6 +118,7 @@ pub fn uumain(args: Vec) -> int { if matches.opt_present("c") { fmt_opts.crown = true; fmt_opts.tagged = false; } if matches.opt_present("m") { fmt_opts.mail = true; } if matches.opt_present("u") { fmt_opts.uniform = true; } + if matches.opt_present("q") { fmt_opts.quick = true; } if matches.opt_present("s") { fmt_opts.split_only = true; fmt_opts.crown = false; fmt_opts.tagged = false; } if matches.opt_present("x") { fmt_opts.xprefix = true; } if matches.opt_present("X") { fmt_opts.xanti_prefix = true; } @@ -139,7 +146,7 @@ pub fn uumain(args: Vec) -> int { Some(t) => t, None => { crash!(1, "Invalid WIDTH specification: `{}'", s); } }; - fmt_opts.goal = std::cmp::min(fmt_opts.width * 92 / 100, fmt_opts.width - 4); + fmt_opts.goal = cmp::min(fmt_opts.width * 94 / 100, fmt_opts.width - 3); } None => () }; @@ -152,7 +159,7 @@ pub fn uumain(args: Vec) -> int { None => { crash!(1, "Invalid GOAL specification: `{}'", s); } }; if !matches.opt_present("w") { - fmt_opts.width = std::cmp::max(fmt_opts.goal * 100 / 92, fmt_opts.goal + 4); + fmt_opts.width = cmp::max(fmt_opts.goal * 100 / 94, fmt_opts.goal + 3); } else if fmt_opts.goal > fmt_opts.width { crash!(1, "GOAL cannot be greater than WIDTH."); } @@ -189,7 +196,7 @@ pub fn uumain(args: Vec) -> int { let mut fp = match open_file(i) { Err(e) => { - show_warning!("{}: {}",i,e); + show_warning!("{}: {}", i, e); continue; } Ok(f) => f diff --git a/fmt/linebreak.rs b/fmt/linebreak.rs index 727f014d4..89f85b164 100644 --- a/fmt/linebreak.rs +++ b/fmt/linebreak.rs @@ -9,6 +9,10 @@ use FmtOptions; use parasplit::{Paragraph, ParaWords, WordInfo}; +use std::i64; +use std::cmp; +use std::mem; +use std::num; struct BreakArgs<'a> { opts : &'a FmtOptions, @@ -21,8 +25,16 @@ struct BreakArgs<'a> { impl<'a> BreakArgs<'a> { #[inline(always)] - fn compute_width(&self, pre: uint, post: uint, posn: uint) -> uint { - post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn + fn compute_width<'b>(&self, winfo: &WordInfo<'b>, posn: uint, fresh: bool) -> uint { + if fresh { + 0 + } else { + let post = winfo.after_tab; + match winfo.before_tab { + None => post, + Some(pre) => post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn + } + } } } @@ -73,91 +85,366 @@ pub fn break_lines(para: &Paragraph, opts: &FmtOptions, ostream: &mut Box>>(iter: &'a mut T, args: &mut BreakArgs<'a>) { +// break_simple implements a "greedy" breaking algorithm: print words until +// maxlength would be exceeded, then print a linebreak and indent and continue. 
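+// For example (an editorial illustration, not part of the original comment): with
+// no indent and a maximum width of 10 columns, the input words "aaa bb cccc dd"
+// are emitted as
+//     aaa bb
+//     cccc dd
+// because appending " cccc" to the first line would make it 11 columns wide.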
+fn break_simple<'a, T: Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) { iter.fold((args.init_len, false), |l, winfo| accum_words_simple(args, l, winfo)); silent_unwrap!(args.ostream.write_char('\n')); } +#[inline(always)] fn accum_words_simple<'a>(args: &mut BreakArgs<'a>, (l, prev_punct): (uint, bool), winfo: &'a WordInfo<'a>) -> (uint, bool) { // compute the length of this word, considering how tabs will expand at this position on the line - let wlen = winfo.word_nchars + - if winfo.before_tab.is_some() { - args.compute_width(winfo.before_tab.unwrap(), winfo.after_tab, l) - } else { - winfo.after_tab - }; + let wlen = winfo.word_nchars + args.compute_width(winfo, l, false); - let splen = - if args.uniform || winfo.new_line { - if winfo.sentence_start || (winfo.new_line && prev_punct) { 2 } - else { 1 } - } else { - 0 - }; + let slen = compute_slen(args.uniform, winfo.new_line, winfo.sentence_start, prev_punct); - if l + wlen + splen > args.opts.width { - let wtrim = winfo.word.slice_from(winfo.word_start); - silent_unwrap!(args.ostream.write_char('\n')); - silent_unwrap!(args.ostream.write(args.indent_str.as_bytes())); - silent_unwrap!(args.ostream.write(wtrim.as_bytes())); - (args.indent_len + wtrim.len(), winfo.ends_punct) + if l + wlen + slen > args.opts.width { + write_newline(args.indent_str, args.ostream); + write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream); + (args.indent_len + winfo.word_nchars, winfo.ends_punct) } else { - if splen == 2 { silent_unwrap!(args.ostream.write(" ".as_bytes())); } - else if splen == 1 { silent_unwrap!(args.ostream.write_char(' ')) } - silent_unwrap!(args.ostream.write(winfo.word.as_bytes())); - (l + wlen + splen, winfo.ends_punct) + write_with_spaces(winfo.word, slen, args.ostream); + (l + wlen + slen, winfo.ends_punct) } } -#[allow(dead_code)] -enum PreviousBreak<'a> { - ParaStart, - PrevBreak(&'a LineBreak<'a>) +// break_knuth_plass implements an "optimal" breaking algorithm in the style of +// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software, +// Practice and Experience. Vol. 11, No. 11, November 1981. +// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf +fn break_knuth_plass<'a, T: Clone + Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) { + // run the algorithm to get the breakpoints + let breakpoints = find_kp_breakpoints(iter.clone(), args); + + // iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it + let (mut prev_punct, mut fresh) = + breakpoints.iter().rev().fold((false, false), |(mut prev_punct, mut fresh), &(next_break, break_before)| { + if fresh { + write_newline(args.indent_str, args.ostream); + } + // at each breakpoint, keep emitting words until we find the word matching this breakpoint + for winfo in iter { + let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform, + winfo.new_line, winfo.sentence_start, prev_punct); + fresh = false; + prev_punct = winfo.ends_punct; + + // We find identical breakpoints here by comparing addresses of the references. + // This is OK because the backing vector is not mutating once we are linebreaking. 
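+ // (Editorial note: casting the two &WordInfo references to raw pointers and
+ // comparing them is an identity check -- "is this the very same WordInfo?" --
+ // not a structural equality check, so two different words that happen to have
+ // identical fields can never be confused with one another.)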
+ if winfo as *_ == next_break as *_ { + // OK, we found the matching word + if break_before { + write_newline(args.indent_str, args.ostream); + write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream); + } else { + // breaking after this word, so that means "fresh" is true for the next iteration + write_with_spaces(word, slen, args.ostream); + fresh = true; + } + break; + } else { + write_with_spaces(word, slen, args.ostream); + } + } + (prev_punct, fresh) + }); + + // after the last linebreak, write out the rest of the final line. + for winfo in iter { + if fresh { + write_newline(args.indent_str, args.ostream); + } + let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform, + winfo.new_line, winfo.sentence_start, prev_punct); + prev_punct = winfo.ends_punct; + fresh = false; + write_with_spaces(word, slen, args.ostream); + } + silent_unwrap!(args.ostream.write_char('\n')); } -#[allow(dead_code)] struct LineBreak<'a> { - prev : PreviousBreak<'a>, - breakafter : &'a str, - demerits : uint + prev : uint, + linebreak : Option<&'a WordInfo<'a>>, + break_before : bool, + demerits : i64, + prev_rat : f32, + length : uint, + fresh : bool } -// when comparing two LineBreaks, compare their demerits -#[allow(dead_code)] -impl<'a> PartialEq for LineBreak<'a> { - fn eq(&self, other: &LineBreak) -> bool { - self.demerits == other.demerits +fn find_kp_breakpoints<'a, T: Iterator<&'a WordInfo<'a>>>(iter: T, args: &BreakArgs<'a>) -> Vec<(&'a WordInfo<'a>, bool)> { + let mut iter = iter.peekable(); + // set up the initial null linebreak + let mut linebreaks = vec!(LineBreak { + prev : 0, + linebreak : None, + break_before : false, + demerits : 0, + prev_rat : 0.0f32, + length : args.init_len, + fresh : false + }); + // this vec holds the current active linebreaks; next_ holds the breaks that will be active for the next word + let active_breaks = &mut vec!(0); + let next_active_breaks = &mut vec!(); + + let stretch = (args.opts.width - args.opts.goal) as int; + let minlength = args.opts.goal - stretch as uint; + let mut new_linebreaks = vec!(); + let mut is_sentence_start = false; + let mut least_demerits = 0; + loop { + let w = + match iter.next() { + None => break, + Some(w) => w + }; + + // if this is the last word, we don't add additional demerits for this break + let (is_last_word, is_sentence_end) = + match iter.peek() { + None => (true, true), + Some(&&WordInfo { sentence_start: st, new_line: nl, .. }) => (false, st || (nl && w.ends_punct)) + }; + + // should we be adding extra space at the beginning of the next sentence? 
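+ // (Editorial note: slen here is the separator width that compute_slen assigns
+ // before this word; it is added into tlen below, so each candidate line length
+ // is measured including that space.)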
+ let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false); + + let mut ld_new = i64::MAX; + let mut ld_next = i64::MAX; + let mut ld_idx = 0; + new_linebreaks.clear(); + next_active_breaks.clear(); + // go through each active break, extending it and possibly adding a new active + // break if we are above the minimum required length + for &i in active_breaks.iter() { + let active = linebreaks.get_mut(i); + // normalize demerits to avoid overflow, and record if this is the least + active.demerits -= least_demerits; + if active.demerits < ld_next { + ld_next = active.demerits; + ld_idx = i; + } + + // get the new length + let tlen = w.word_nchars + args.compute_width(w, active.length, active.fresh) + slen + active.length; + + // if tlen is longer than args.opts.width, we drop this break from the active list + // otherwise, we extend the break, and possibly add a new break at this point + if tlen <= args.opts.width { + // this break will still be active next time + next_active_breaks.push(i); + // we can put this word on this line + active.fresh = false; + active.length = tlen; + + // if we're above the minlength, we can also consider breaking here + if tlen >= minlength { + let (new_demerits, new_ratio) = + if is_last_word { + // there is no penalty for the final line's length + (0, 0.0) + } else { + compute_demerits((args.opts.goal - tlen) as int, stretch, w.word_nchars as int, active.prev_rat) + }; + + // do not even consider adding a line that has too many demerits + // also, try to detect overflow by checking signum + let total_demerits = new_demerits + active.demerits; + if new_demerits < BAD_INFTY_SQ && total_demerits < ld_new && num::signum(active.demerits) <= num::signum(new_demerits) { + ld_new = total_demerits; + new_linebreaks.push(LineBreak { + prev : i, + linebreak : Some(w), + break_before : false, + demerits : total_demerits, + prev_rat : new_ratio, + length : args.indent_len, + fresh : true + }); + } + } + } + } + + // if we generated any new linebreaks, add the last one to the list + // the last one is always the best because we don't add to new_linebreaks unless + // it's better than the best one so far + match new_linebreaks.pop() { + None => (), + Some(lb) => { + next_active_breaks.push(linebreaks.len()); + linebreaks.push(lb); + } + } + + if next_active_breaks.is_empty() { + // every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx + let new_break = restart_active_breaks(args, linebreaks.get(ld_idx), ld_idx, w, slen, minlength); + next_active_breaks.push(linebreaks.len()); + linebreaks.push(new_break); + least_demerits = 0; + } else { + // next time around, normalize out the demerits fields + // on active linebreaks to make overflow less likely + least_demerits = cmp::max(ld_next, 0); + } + // swap in new list of active breaks + mem::swap(active_breaks, next_active_breaks); + // If this was the last word in a sentence, the next one must be the first in the next. + is_sentence_start = is_sentence_end; + } + + // return the best path + build_best_path(&linebreaks, active_breaks) +} + +#[inline(always)] +fn build_best_path<'a>(paths: &Vec>, active: &Vec) -> Vec<(&'a WordInfo<'a>, bool)> { + let mut breakwords = vec!(); + // of the active paths, we select the one with the fewest demerits + let mut best_idx = match active.iter().min_by(|&&a| paths.get(a).demerits) { + None => crash!(1, "Failed to find a k-p linebreak solution. 
This should never happen."), + Some(&s) => s + }; + + // now, chase the pointers back through the break list, recording + // the words at which we should break + loop { + let next_best = paths.get(best_idx); + match next_best.linebreak { + None => return breakwords, + Some(prev) => { + breakwords.push((prev, next_best.break_before)); + best_idx = next_best.prev + } + } } } -// NOTE "less than" in this case means "worse", i.e., more demerits -#[allow(dead_code)] -impl<'a> PartialOrd for LineBreak<'a> { - fn lt(&self, other: &LineBreak) -> bool { - self.demerits > other.demerits +// "infinite" badness is more like (1+BAD_INFTY)^2 because of how demerits are computed +static BAD_INFTY: i64 = 10000000; +static BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY; +// badness = BAD_MULT * abs(r) ^ 3 +static BAD_MULT: f32 = 100.0; +// DR_MULT is multiplier for delta-R between lines +static DR_MULT: f32 = 600.0; +// DL_MULT is penalty multiplier for short words at end of line +static DL_MULT: f32 = 300.0; + +#[inline(always)] +fn compute_demerits(delta_len: int, stretch: int, wlen: int, prev_rat: f32) -> (i64, f32) { + // how much stretch are we using? + let ratio = + if delta_len == 0 { + 0.0f32 + } else { + delta_len as f32 / stretch as f32 + }; + + // compute badness given the stretch ratio + let bad_linelen = + if num::abs(ratio) > 1.0f32 { + BAD_INFTY + } else { + (BAD_MULT * num::abs(num::pow(ratio, 3))) as i64 + }; + + // we penalize lines ending in really short words + let bad_wordlen = + if wlen >= stretch { + 0 + } else { + (DL_MULT * num::abs(num::pow((stretch - wlen) as f32 / (stretch - 1) as f32, 3))) as i64 + }; + + // we penalize lines that have very different ratios from previous lines + let bad_deltaR = (DR_MULT * num::abs(num::pow((ratio - prev_rat) / 2.0, 3))) as i64; + + let demerits = num::pow(1 + bad_linelen + bad_wordlen + bad_deltaR, 2); + + (demerits, ratio) +} + +#[inline(always)] +fn restart_active_breaks<'a>(args: &BreakArgs<'a>, active: &LineBreak<'a>, act_idx: uint, w: &'a WordInfo<'a>, slen: uint, min: uint) -> LineBreak<'a> { + let (break_before, line_length) = + if active.fresh { + // never break before a word if that word would be the first on a line + (false, args.indent_len) + } else { + // choose the lesser evil: breaking too early, or breaking too late + let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh); + let underlen: int = (min - active.length) as int; + let overlen: int = ((wlen + slen + active.length) - args.opts.width) as int; + if overlen > underlen { + // break early, put this word on the next line + (true, args.indent_len + w.word_nchars) + } else { + (false, args.indent_len) + } + }; + + // restart the linebreak. This will be our only active path. + LineBreak { + prev : act_idx, + linebreak : Some(w), + break_before : break_before, + demerits : 0, // this is the only active break, so we can reset the demerit count + prev_rat : if break_before { 1.0 } else { -1.0 }, + length : line_length, + fresh : !break_before } } -// we have to satisfy Eq to implement Ord -#[allow(dead_code)] -impl<'a> Eq for LineBreak<'a> {} - -// NOTE again here we reverse the ordering: -// if other has more demerits, self is Greater -#[allow(dead_code)] -impl<'a> Ord for LineBreak<'a> { - fn cmp(&self, other: &LineBreak) -> Ordering { - other.demerits.cmp(&self.demerits) +// Number of spaces to add before a word, based on mode, newline, sentence start. 
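+// (Editorial note: concretely, in uniform mode or at an original line break this is
+// two spaces when the word starts a sentence, or follows end-of-sentence punctuation
+// at a line break, and one space otherwise; in all other cases it is zero.)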
+#[inline(always)] +fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> uint { + if uniform || newline { + if start || (newline && punct) { + 2 + } else { + 1 + } + } else { + 0 } } +// If we're on a fresh line, slen=0 and we slice off leading whitespace. +// Otherwise, compute slen and leave whitespace alone. +#[inline(always)] +fn slice_if_fresh<'a>(fresh: bool, word: &'a str, start: uint, uniform: bool, newline: bool, sstart: bool, punct: bool) -> (uint, &'a str) { + if fresh { + (0, word.slice_from(start)) + } else { + (compute_slen(uniform, newline, sstart, punct), word) + } +} + +// Write a newline and add the indent. +#[inline(always)] +fn write_newline(indent: &str, ostream: &mut Box) { + silent_unwrap!(ostream.write_char('\n')); + silent_unwrap!(ostream.write(indent.as_bytes())); +} + +// Write the word, along with slen spaces. +#[inline(always)] +fn write_with_spaces(word: &str, slen: uint, ostream: &mut Box) { + if slen == 2 { + silent_unwrap!(ostream.write(" ".as_bytes())); + } else if slen == 1 { + silent_unwrap!(ostream.write_char(' ')); + } + silent_unwrap!(ostream.write(word.as_bytes())); +} diff --git a/fmt/parasplit.rs b/fmt/parasplit.rs index c4833d02f..50911a93b 100644 --- a/fmt/parasplit.rs +++ b/fmt/parasplit.rs @@ -13,6 +13,21 @@ use std::slice::Items; use std::str::CharRange; use FileOrStdReader; use FmtOptions; +use charwidth; + +#[inline(always)] +fn char_width(c: char) -> uint { + if (c as uint) < 0xA0 { + // if it is ASCII, call it exactly 1 wide (including control chars) + // calling control chars' widths 1 is consistent with OpenBSD fmt + 1 + } else { + // otherwise, get the unicode width + // note that we shouldn't actually get None here because only c < 0xA0 + // can return None, but for safety and future-proofing we do it this way + charwidth::width(c).unwrap_or(1) + } +} // lines with PSKIP, lacking PREFIX, or which are entirely blank are // NoFormatLines; otherwise, they are FormatLines @@ -117,7 +132,7 @@ impl<'a> FileLines<'a> { indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth; } else { // non-tab character - indent_len += 1; + indent_len += char_width(c); } } (indent_end, prefix_len, indent_len) @@ -196,7 +211,7 @@ pub struct Paragraph { // an iterator producing a stream of paragraphs from a stream of lines // given a set of options. 
pub struct ParagraphStream<'a> { - lines : Peekable>, + lines : Peekable>, next_mail : bool, opts : &'a FmtOptions, } @@ -238,8 +253,8 @@ impl<'a> ParagraphStream<'a> { } } -impl<'a> Iterator> for ParagraphStream<'a> { - fn next(&mut self) -> Option> { +impl<'a> Iterator> for ParagraphStream<'a> { + fn next(&mut self) -> Option> { // return a NoFormatLine in an Err; it should immediately be output let noformat = match self.lines.peek() { @@ -396,39 +411,37 @@ impl<'a> ParaWords<'a> { // no extra spacing for mail headers; always exactly 1 space // safe to trim_left on every line of a mail header, since the // first line is guaranteed not to have any spaces - self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo { + self.words.extend(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo { word : x, word_start : 0, - word_nchars : x.char_len(), + word_nchars : x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped) before_tab : None, after_tab : 0, sentence_start : false, ends_punct : false, new_line : false - }).collect()); + })); } else { // first line - self.words.push_all_move( + self.words.extend( if self.opts.crown || self.opts.tagged { // crown and tagged mode has the "init" in the first line, so slice from there WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.init_end)) } else { // otherwise we slice from the indent WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end)) - }.collect()); + }); if self.para.lines.len() > 1 { let indent_end = self.para.indent_end; let opts = self.opts; - self.words.push_all_move( - self.para.lines.iter().skip(1) - .flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end))) - .collect()); + self.words.extend( + self.para.lines.iter().skip(1).flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end)))); } } } - pub fn words(&'a self) -> Items<'a,WordInfo<'a>> { return self.words.iter() } + pub fn words(&'a self) -> Items<'a, WordInfo<'a>> { return self.words.iter() } } struct WordSplit<'a> { @@ -516,7 +529,7 @@ impl<'a> Iterator> for WordSplit<'a> { let mut word_nchars = 0; self.position = match self.string.slice_from(word_start) - .find(|x: char| if !x.is_whitespace() { word_nchars += 1; false } else { true }) { + .find(|x: char| if !x.is_whitespace() { word_nchars += char_width(x); false } else { true }) { None => self.length, Some(s) => s + word_start };
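Editorial addendum (not part of the patch): the sketch below shows how the pieces introduced here fit together to measure the display width of a whole string, written in the same pre-1.0 Rust dialect the patch uses. It assumes the char_width() helper from fmt/parasplit.rs is in scope; the function name str_display_width is purely illustrative.

    fn str_display_width(s: &str) -> uint {
        // sum the column width of each character, the same way WordSplit
        // accumulates word_nchars above
        s.chars().fold(0, |total, c| total + char_width(c))
    }

For example, str_display_width("abc") is 3, while a string of three CJK ideographs, which the charwidth table maps to width 2 each, would measure 6.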