From 8be67f7d4d3cb4a67cfa53cce8c446b78efeb8f8 Mon Sep 17 00:00:00 2001
From: kwantam
Date: Wed, 25 Jun 2014 23:52:28 -0400
Subject: [PATCH] fmt Knuth-Plass implementation; unicode char_width

fmt:
- Implemented the Knuth-Plass optimal linebreaking strategy.
- Added command-line switch -q for "quick" (greedy) split mode that does not
  use Knuth-Plass.
- Right now, Knuth-Plass runs about half as fast as the greedy mode and uses
  more memory.
- Updated fmt to use char_width (see below) instead of assuming that every
  character is one column wide.
- Use i64 for demerits instead of int in K-P, since int is pointer sized and
  will only be 32 bits on some architectures.
- Incremented the version number.
- Incorporated improvements suggested by huonw and Arcterus.
- K-P uses indices into the linebreaks vector instead of raw pointers. This
  gets rid of a lot of box allocation and improves safety to boot.
- Added a support module for computing the displayed widths of Unicode
  strings, based on Markus Kuhn's free implementation at
  http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
- This lives in `charwidth.rs`, but only as a temporary measure until the
  Char trait implements .width(). I am submitting a PR for this soon, and the
  code in charwidth.rs is what is generated for libcore.

closes #223
---
 fmt/charwidth.rs | 170 ++++++++++++++++++++
 fmt/fmt.rs       |  25 +--
 fmt/linebreak.rs | 411 ++++++++++++++++++++++++++++++++++++++++-------
 fmt/parasplit.rs |  43 +++--
 4 files changed, 563 insertions(+), 86 deletions(-)
 create mode 100644 fmt/charwidth.rs

diff --git a/fmt/charwidth.rs b/fmt/charwidth.rs
new file mode 100644
index 000000000..d135fd841
--- /dev/null
+++ b/fmt/charwidth.rs
@@ -0,0 +1,170 @@
+/*
+ * This file is part of `fmt` from the uutils coreutils package.
+ *
+ * (c) kwantam
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+fn bsearch_range_value_table(c: char, r: &'static [(char, char, uint)]) -> uint {
+    match r.bsearch(|&(lo, hi, _)| {
+        if lo <= c && c <= hi { Equal }
+        else if hi < c { Less }
+        else { Greater }
+    }) {
+        Some(idx) => {
+            let (_, _, result) = r[idx];
+            result
+        }
+        None => 1
+    }
+}
+
+pub fn width(c: char) -> Option<uint> {
+    match c as uint {
+        _c @ 0          => Some(0),  // null is zero width
+        cu if cu < 0x20 => None,     // control sequences have no width
+        cu if cu < 0x7F => Some(1),  // ASCII
+        cu if cu < 0xA0 => None,     // more control sequences
+        _               => Some(bsearch_range_value_table(c, charwidth_table))
+    }
+}
+
+// character width table.
Based on Markus Kuhn's free wcwidth() implementation, +// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c +static charwidth_table : &'static [(char, char, uint)] = &[ + ('\u0300', '\u036f', 0), ('\u0483', '\u0487', 0), ('\u0488', '\u0489', 0), ('\u0591', + '\u05bd', 0), ('\u05bf', '\u05bf', 0), ('\u05c1', '\u05c2', 0), ('\u05c4', '\u05c5', 0), + ('\u05c7', '\u05c7', 0), ('\u0600', '\u0605', 0), ('\u0610', '\u061a', 0), ('\u061c', + '\u061c', 0), ('\u064b', '\u065f', 0), ('\u0670', '\u0670', 0), ('\u06d6', '\u06dc', 0), + ('\u06dd', '\u06dd', 0), ('\u06df', '\u06e4', 0), ('\u06e7', '\u06e8', 0), ('\u06ea', + '\u06ed', 0), ('\u070f', '\u070f', 0), ('\u0711', '\u0711', 0), ('\u0730', '\u074a', 0), + ('\u07a6', '\u07b0', 0), ('\u07eb', '\u07f3', 0), ('\u0816', '\u0819', 0), ('\u081b', + '\u0823', 0), ('\u0825', '\u0827', 0), ('\u0829', '\u082d', 0), ('\u0859', '\u085b', 0), + ('\u08e4', '\u0902', 0), ('\u093a', '\u093a', 0), ('\u093c', '\u093c', 0), ('\u0941', + '\u0948', 0), ('\u094d', '\u094d', 0), ('\u0951', '\u0957', 0), ('\u0962', '\u0963', 0), + ('\u0981', '\u0981', 0), ('\u09bc', '\u09bc', 0), ('\u09c1', '\u09c4', 0), ('\u09cd', + '\u09cd', 0), ('\u09e2', '\u09e3', 0), ('\u0a01', '\u0a02', 0), ('\u0a3c', '\u0a3c', 0), + ('\u0a41', '\u0a51', 0), ('\u0a70', '\u0a71', 0), ('\u0a75', '\u0a82', 0), ('\u0abc', + '\u0abc', 0), ('\u0ac1', '\u0ac8', 0), ('\u0acd', '\u0acd', 0), ('\u0ae2', '\u0ae3', 0), + ('\u0b01', '\u0b01', 0), ('\u0b3c', '\u0b3c', 0), ('\u0b3f', '\u0b3f', 0), ('\u0b41', + '\u0b44', 0), ('\u0b4d', '\u0b56', 0), ('\u0b62', '\u0b63', 0), ('\u0b82', '\u0b82', 0), + ('\u0bc0', '\u0bc0', 0), ('\u0bcd', '\u0bcd', 0), ('\u0c00', '\u0c00', 0), ('\u0c3e', + '\u0c40', 0), ('\u0c46', '\u0c56', 0), ('\u0c62', '\u0c63', 0), ('\u0c81', '\u0c81', 0), + ('\u0cbc', '\u0cbc', 0), ('\u0cbf', '\u0cbf', 0), ('\u0cc6', '\u0cc6', 0), ('\u0ccc', + '\u0ccd', 0), ('\u0ce2', '\u0ce3', 0), ('\u0d01', '\u0d01', 0), ('\u0d41', '\u0d44', 0), + ('\u0d4d', '\u0d4d', 0), ('\u0d62', '\u0d63', 0), ('\u0dca', '\u0dca', 0), ('\u0dd2', + '\u0dd6', 0), ('\u0e31', '\u0e31', 0), ('\u0e34', '\u0e3a', 0), ('\u0e47', '\u0e4e', 0), + ('\u0eb1', '\u0eb1', 0), ('\u0eb4', '\u0ebc', 0), ('\u0ec8', '\u0ecd', 0), ('\u0f18', + '\u0f19', 0), ('\u0f35', '\u0f35', 0), ('\u0f37', '\u0f37', 0), ('\u0f39', '\u0f39', 0), + ('\u0f71', '\u0f7e', 0), ('\u0f80', '\u0f84', 0), ('\u0f86', '\u0f87', 0), ('\u0f8d', + '\u0fbc', 0), ('\u0fc6', '\u0fc6', 0), ('\u102d', '\u1030', 0), ('\u1032', '\u1037', 0), + ('\u1039', '\u103a', 0), ('\u103d', '\u103e', 0), ('\u1058', '\u1059', 0), ('\u105e', + '\u1060', 0), ('\u1071', '\u1074', 0), ('\u1082', '\u1082', 0), ('\u1085', '\u1086', 0), + ('\u108d', '\u108d', 0), ('\u109d', '\u109d', 0), ('\u1100', '\u115f', 2), ('\u1160', + '\u11ff', 0), ('\u135d', '\u135f', 0), ('\u1712', '\u1714', 0), ('\u1732', '\u1734', 0), + ('\u1752', '\u1753', 0), ('\u1772', '\u1773', 0), ('\u17b4', '\u17b5', 0), ('\u17b7', + '\u17bd', 0), ('\u17c6', '\u17c6', 0), ('\u17c9', '\u17d3', 0), ('\u17dd', '\u17dd', 0), + ('\u180b', '\u180d', 0), ('\u180e', '\u180e', 0), ('\u18a9', '\u18a9', 0), ('\u1920', + '\u1922', 0), ('\u1927', '\u1928', 0), ('\u1932', '\u1932', 0), ('\u1939', '\u193b', 0), + ('\u1a17', '\u1a18', 0), ('\u1a1b', '\u1a1b', 0), ('\u1a56', '\u1a56', 0), ('\u1a58', + '\u1a60', 0), ('\u1a62', '\u1a62', 0), ('\u1a65', '\u1a6c', 0), ('\u1a73', '\u1a7f', 0), + ('\u1ab0', '\u1abd', 0), ('\u1abe', '\u1abe', 0), ('\u1b00', '\u1b03', 0), ('\u1b34', + '\u1b34', 0), ('\u1b36', '\u1b3a', 0), ('\u1b3c', '\u1b3c', 0), ('\u1b42', 
'\u1b42', 0), + ('\u1b6b', '\u1b73', 0), ('\u1b80', '\u1b81', 0), ('\u1ba2', '\u1ba5', 0), ('\u1ba8', + '\u1ba9', 0), ('\u1bab', '\u1bad', 0), ('\u1be6', '\u1be6', 0), ('\u1be8', '\u1be9', 0), + ('\u1bed', '\u1bed', 0), ('\u1bef', '\u1bf1', 0), ('\u1c2c', '\u1c33', 0), ('\u1c36', + '\u1c37', 0), ('\u1cd0', '\u1cd2', 0), ('\u1cd4', '\u1ce0', 0), ('\u1ce2', '\u1ce8', 0), + ('\u1ced', '\u1ced', 0), ('\u1cf4', '\u1cf4', 0), ('\u1cf8', '\u1cf9', 0), ('\u1dc0', + '\u1dff', 0), ('\u200b', '\u200f', 0), ('\u202a', '\u202e', 0), ('\u2060', '\u206f', 0), + ('\u20d0', '\u20dc', 0), ('\u20dd', '\u20e0', 0), ('\u20e1', '\u20e1', 0), ('\u20e2', + '\u20e4', 0), ('\u20e5', '\u20f0', 0), ('\u2329', '\u2329', 2), ('\u232a', '\u232a', 2), + ('\u2cef', '\u2cf1', 0), ('\u2d7f', '\u2d7f', 0), ('\u2de0', '\u2dff', 0), ('\u2e80', + '\u2e99', 2), ('\u2e9b', '\u2ef3', 2), ('\u2f00', '\u2fd5', 2), ('\u2ff0', '\u2ffb', 2), + ('\u3000', '\u3000', 2), ('\u3001', '\u3003', 2), ('\u3004', '\u3004', 2), ('\u3005', + '\u3005', 2), ('\u3006', '\u3006', 2), ('\u3007', '\u3007', 2), ('\u3008', '\u3008', 2), + ('\u3009', '\u3009', 2), ('\u300a', '\u300a', 2), ('\u300b', '\u300b', 2), ('\u300c', + '\u300c', 2), ('\u300d', '\u300d', 2), ('\u300e', '\u300e', 2), ('\u300f', '\u300f', 2), + ('\u3010', '\u3010', 2), ('\u3011', '\u3011', 2), ('\u3012', '\u3013', 2), ('\u3014', + '\u3014', 2), ('\u3015', '\u3015', 2), ('\u3016', '\u3016', 2), ('\u3017', '\u3017', 2), + ('\u3018', '\u3018', 2), ('\u3019', '\u3019', 2), ('\u301a', '\u301a', 2), ('\u301b', + '\u301b', 2), ('\u301c', '\u301c', 2), ('\u301d', '\u301d', 2), ('\u301e', '\u301f', 2), + ('\u3020', '\u3020', 2), ('\u3021', '\u3029', 2), ('\u302a', '\u302d', 0), ('\u302a', + '\u302d', 2), ('\u302e', '\u302f', 2), ('\u3030', '\u3030', 2), ('\u3031', '\u3035', 2), + ('\u3036', '\u3037', 2), ('\u3038', '\u303a', 2), ('\u303b', '\u303b', 2), ('\u303c', + '\u303c', 2), ('\u303d', '\u303d', 2), ('\u303e', '\u303e', 2), ('\u3041', '\u3096', 2), + ('\u3099', '\u309a', 0), ('\u3099', '\u309a', 2), ('\u309b', '\u309c', 2), ('\u309d', + '\u309e', 2), ('\u309f', '\u309f', 2), ('\u30a0', '\u30a0', 2), ('\u30a1', '\u30fa', 2), + ('\u30fb', '\u30fb', 2), ('\u30fc', '\u30fe', 2), ('\u30ff', '\u30ff', 2), ('\u3105', + '\u312d', 2), ('\u3131', '\u318e', 2), ('\u3190', '\u3191', 2), ('\u3192', '\u3195', 2), + ('\u3196', '\u319f', 2), ('\u31a0', '\u31ba', 2), ('\u31c0', '\u31e3', 2), ('\u31f0', + '\u31ff', 2), ('\u3200', '\u321e', 2), ('\u3220', '\u3229', 2), ('\u322a', '\u3247', 2), + ('\u3250', '\u3250', 2), ('\u3251', '\u325f', 2), ('\u3260', '\u327f', 2), ('\u3280', + '\u3289', 2), ('\u328a', '\u32b0', 2), ('\u32b1', '\u32bf', 2), ('\u32c0', '\u32fe', 2), + ('\u3300', '\u33ff', 2), ('\u3400', '\u4db5', 2), ('\u4db6', '\u4dbf', 2), ('\u4e00', + '\u9fcc', 2), ('\u9fcd', '\u9fff', 2), ('\ua000', '\ua014', 2), ('\ua015', '\ua015', 2), + ('\ua016', '\ua48c', 2), ('\ua490', '\ua4c6', 2), ('\ua66f', '\ua66f', 0), ('\ua670', + '\ua672', 0), ('\ua674', '\ua67d', 0), ('\ua69f', '\ua69f', 0), ('\ua6f0', '\ua6f1', 0), + ('\ua802', '\ua802', 0), ('\ua806', '\ua806', 0), ('\ua80b', '\ua80b', 0), ('\ua825', + '\ua826', 0), ('\ua8c4', '\ua8c4', 0), ('\ua8e0', '\ua8f1', 0), ('\ua926', '\ua92d', 0), + ('\ua947', '\ua951', 0), ('\ua960', '\ua97c', 2), ('\ua980', '\ua982', 0), ('\ua9b3', + '\ua9b3', 0), ('\ua9b6', '\ua9b9', 0), ('\ua9bc', '\ua9bc', 0), ('\ua9e5', '\ua9e5', 0), + ('\uaa29', '\uaa2e', 0), ('\uaa31', '\uaa32', 0), ('\uaa35', '\uaa36', 0), ('\uaa43', + '\uaa43', 0), ('\uaa4c', '\uaa4c', 0), ('\uaa7c', 
'\uaa7c', 0), ('\uaab0', '\uaab0', 0), + ('\uaab2', '\uaab4', 0), ('\uaab7', '\uaab8', 0), ('\uaabe', '\uaabf', 0), ('\uaac1', + '\uaac1', 0), ('\uaaec', '\uaaed', 0), ('\uaaf6', '\uaaf6', 0), ('\uabe5', '\uabe5', 0), + ('\uabe8', '\uabe8', 0), ('\uabed', '\uabed', 0), ('\uac00', '\ud7a3', 2), ('\uf900', + '\ufa6d', 2), ('\ufa6e', '\ufa6f', 2), ('\ufa70', '\ufad9', 2), ('\ufada', '\ufaff', 2), + ('\ufb1e', '\ufb1e', 0), ('\ufe00', '\ufe0f', 0), ('\ufe10', '\ufe16', 2), ('\ufe17', + '\ufe17', 2), ('\ufe18', '\ufe18', 2), ('\ufe19', '\ufe19', 2), ('\ufe20', '\ufe2d', 0), + ('\ufe30', '\ufe30', 2), ('\ufe31', '\ufe32', 2), ('\ufe33', '\ufe34', 2), ('\ufe35', + '\ufe35', 2), ('\ufe36', '\ufe36', 2), ('\ufe37', '\ufe37', 2), ('\ufe38', '\ufe38', 2), + ('\ufe39', '\ufe39', 2), ('\ufe3a', '\ufe3a', 2), ('\ufe3b', '\ufe3b', 2), ('\ufe3c', + '\ufe3c', 2), ('\ufe3d', '\ufe3d', 2), ('\ufe3e', '\ufe3e', 2), ('\ufe3f', '\ufe3f', 2), + ('\ufe40', '\ufe40', 2), ('\ufe41', '\ufe41', 2), ('\ufe42', '\ufe42', 2), ('\ufe43', + '\ufe43', 2), ('\ufe44', '\ufe44', 2), ('\ufe45', '\ufe46', 2), ('\ufe47', '\ufe47', 2), + ('\ufe48', '\ufe48', 2), ('\ufe49', '\ufe4c', 2), ('\ufe4d', '\ufe4f', 2), ('\ufe50', + '\ufe52', 2), ('\ufe54', '\ufe57', 2), ('\ufe58', '\ufe58', 2), ('\ufe59', '\ufe59', 2), + ('\ufe5a', '\ufe5a', 2), ('\ufe5b', '\ufe5b', 2), ('\ufe5c', '\ufe5c', 2), ('\ufe5d', + '\ufe5d', 2), ('\ufe5e', '\ufe5e', 2), ('\ufe5f', '\ufe61', 2), ('\ufe62', '\ufe62', 2), + ('\ufe63', '\ufe63', 2), ('\ufe64', '\ufe66', 2), ('\ufe68', '\ufe68', 2), ('\ufe69', + '\ufe69', 2), ('\ufe6a', '\ufe6b', 2), ('\ufeff', '\ufeff', 0), ('\uff01', '\uff03', 2), + ('\uff04', '\uff04', 2), ('\uff05', '\uff07', 2), ('\uff08', '\uff08', 2), ('\uff09', + '\uff09', 2), ('\uff0a', '\uff0a', 2), ('\uff0b', '\uff0b', 2), ('\uff0c', '\uff0c', 2), + ('\uff0d', '\uff0d', 2), ('\uff0e', '\uff0f', 2), ('\uff10', '\uff19', 2), ('\uff1a', + '\uff1b', 2), ('\uff1c', '\uff1e', 2), ('\uff1f', '\uff20', 2), ('\uff21', '\uff3a', 2), + ('\uff3b', '\uff3b', 2), ('\uff3c', '\uff3c', 2), ('\uff3d', '\uff3d', 2), ('\uff3e', + '\uff3e', 2), ('\uff3f', '\uff3f', 2), ('\uff40', '\uff40', 2), ('\uff41', '\uff5a', 2), + ('\uff5b', '\uff5b', 2), ('\uff5c', '\uff5c', 2), ('\uff5d', '\uff5d', 2), ('\uff5e', + '\uff5e', 2), ('\uff5f', '\uff5f', 2), ('\uff60', '\uff60', 2), ('\uffe0', '\uffe1', 2), + ('\uffe2', '\uffe2', 2), ('\uffe3', '\uffe3', 2), ('\uffe4', '\uffe4', 2), ('\uffe5', + '\uffe6', 2), ('\ufff9', '\ufffb', 0), ('\U000101fd', '\U000101fd', 0), ('\U000102e0', + '\U000102e0', 0), ('\U00010376', '\U0001037a', 0), ('\U00010a01', '\U00010a0f', 0), + ('\U00010a38', '\U00010a3f', 0), ('\U00010ae5', '\U00010ae6', 0), ('\U00011001', + '\U00011001', 0), ('\U00011038', '\U00011046', 0), ('\U0001107f', '\U00011081', 0), + ('\U000110b3', '\U000110b6', 0), ('\U000110b9', '\U000110ba', 0), ('\U000110bd', + '\U000110bd', 0), ('\U00011100', '\U00011102', 0), ('\U00011127', '\U0001112b', 0), + ('\U0001112d', '\U00011134', 0), ('\U00011173', '\U00011173', 0), ('\U00011180', + '\U00011181', 0), ('\U000111b6', '\U000111be', 0), ('\U0001122f', '\U00011231', 0), + ('\U00011234', '\U00011234', 0), ('\U00011236', '\U00011237', 0), ('\U000112df', + '\U000112df', 0), ('\U000112e3', '\U000112ea', 0), ('\U00011301', '\U00011301', 0), + ('\U0001133c', '\U0001133c', 0), ('\U00011340', '\U00011340', 0), ('\U00011366', + '\U00011374', 0), ('\U000114b3', '\U000114b8', 0), ('\U000114ba', '\U000114ba', 0), + ('\U000114bf', '\U000114c0', 0), ('\U000114c2', '\U000114c3', 0), 
('\U000115b2', + '\U000115b5', 0), ('\U000115bc', '\U000115bd', 0), ('\U000115bf', '\U000115c0', 0), + ('\U00011633', '\U0001163a', 0), ('\U0001163d', '\U0001163d', 0), ('\U0001163f', + '\U00011640', 0), ('\U000116ab', '\U000116ab', 0), ('\U000116ad', '\U000116ad', 0), + ('\U000116b0', '\U000116b5', 0), ('\U000116b7', '\U000116b7', 0), ('\U00016af0', + '\U00016af4', 0), ('\U00016b30', '\U00016b36', 0), ('\U00016f8f', '\U00016f92', 0), + ('\U0001b000', '\U0001b001', 2), ('\U0001bc9d', '\U0001bc9e', 0), ('\U0001bca0', + '\U0001bca3', 0), ('\U0001d167', '\U0001d169', 0), ('\U0001d173', '\U0001d17a', 0), + ('\U0001d17b', '\U0001d182', 0), ('\U0001d185', '\U0001d18b', 0), ('\U0001d1aa', + '\U0001d1ad', 0), ('\U0001d242', '\U0001d244', 0), ('\U0001e8d0', '\U0001e8d6', 0), + ('\U0001f200', '\U0001f202', 2), ('\U0001f210', '\U0001f23a', 2), ('\U0001f240', + '\U0001f248', 2), ('\U0001f250', '\U0001f251', 2), ('\U00020000', '\U0002a6d6', 2), + ('\U0002a6d7', '\U0002a6ff', 2), ('\U0002a700', '\U0002b734', 2), ('\U0002b735', + '\U0002b73f', 2), ('\U0002b740', '\U0002b81d', 2), ('\U0002b81e', '\U0002f7ff', 2), + ('\U0002f800', '\U0002fa1d', 2), ('\U0002fa1e', '\U0002fffd', 2), ('\U00030000', + '\U0003fffd', 2), ('\U000e0001', '\U000e007f', 0), ('\U000e0100', '\U000e01ef', 0) +]; diff --git a/fmt/fmt.rs b/fmt/fmt.rs index f4e6f64fb..2c76f219e 100644 --- a/fmt/fmt.rs +++ b/fmt/fmt.rs @@ -1,4 +1,4 @@ -#![crate_id(name="fmt", vers="0.0.2", author="kwantam")] +#![crate_id(name="fmt", vers="0.0.3", author="kwantam")] /* * This file is part of `fmt` from the uutils coreutils package. * @@ -13,6 +13,7 @@ extern crate core; extern crate getopts; +use std::cmp; use std::io::{BufferedReader, BufferedWriter, File, IoResult}; use std::io::stdio::{stdin_raw, stdout_raw}; use linebreak::break_lines; @@ -31,10 +32,11 @@ macro_rules! silent_unwrap( mod util; mod linebreak; mod parasplit; +mod charwidth; // program's NAME and VERSION are used for -V and -h static NAME: &'static str = "fmt"; -static VERSION: &'static str = "0.0.2"; +static VERSION: &'static str = "0.0.3"; struct FmtOptions { crown : bool, @@ -48,6 +50,7 @@ struct FmtOptions { anti_prefix : String, xanti_prefix : bool, uniform : bool, + quick : bool, width : uint, goal : uint, tabwidth : uint, @@ -68,8 +71,10 @@ pub fn uumain(args: Vec) -> int { getopts::optflag("x", "exact-prefix", "PREFIX must match at the beginning of the line with no preceding whitespace."), getopts::optflag("X", "exact-skip-prefix", "PSKIP must match at the beginning of the line with no preceding whitespace."), - getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 78.", "WIDTH"), - getopts::optopt("g", "goal", "Goal width, default ~0.92*WIDTH. Must be less than WIDTH.", "GOAL"), + getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 79.", "WIDTH"), + getopts::optopt("g", "goal", "Goal width, default ~0.94*WIDTH. Must be less than WIDTH.", "GOAL"), + + getopts::optflag("q", "quick", "Break lines more quickly at the expense of a potentially more ragged appearance."), getopts::optopt("T", "tab-width", "Treat tabs as TABWIDTH spaces for determining line length, default 8. 
Note that this is used only for calculating line lengths; tabs are preserved in the output.", "TABWIDTH"), @@ -96,6 +101,7 @@ pub fn uumain(args: Vec) -> int { tagged : false, mail : false, uniform : false, + quick : false, split_only : false, use_prefix : false, prefix : String::new(), @@ -103,8 +109,8 @@ pub fn uumain(args: Vec) -> int { use_anti_prefix : false, anti_prefix : String::new(), xanti_prefix : false, - width : 78, - goal : 72, + width : 79, + goal : 74, tabwidth : 8, }; @@ -112,6 +118,7 @@ pub fn uumain(args: Vec) -> int { if matches.opt_present("c") { fmt_opts.crown = true; fmt_opts.tagged = false; } if matches.opt_present("m") { fmt_opts.mail = true; } if matches.opt_present("u") { fmt_opts.uniform = true; } + if matches.opt_present("q") { fmt_opts.quick = true; } if matches.opt_present("s") { fmt_opts.split_only = true; fmt_opts.crown = false; fmt_opts.tagged = false; } if matches.opt_present("x") { fmt_opts.xprefix = true; } if matches.opt_present("X") { fmt_opts.xanti_prefix = true; } @@ -139,7 +146,7 @@ pub fn uumain(args: Vec) -> int { Some(t) => t, None => { crash!(1, "Invalid WIDTH specification: `{}'", s); } }; - fmt_opts.goal = std::cmp::min(fmt_opts.width * 92 / 100, fmt_opts.width - 4); + fmt_opts.goal = cmp::min(fmt_opts.width * 94 / 100, fmt_opts.width - 3); } None => () }; @@ -152,7 +159,7 @@ pub fn uumain(args: Vec) -> int { None => { crash!(1, "Invalid GOAL specification: `{}'", s); } }; if !matches.opt_present("w") { - fmt_opts.width = std::cmp::max(fmt_opts.goal * 100 / 92, fmt_opts.goal + 4); + fmt_opts.width = cmp::max(fmt_opts.goal * 100 / 94, fmt_opts.goal + 3); } else if fmt_opts.goal > fmt_opts.width { crash!(1, "GOAL cannot be greater than WIDTH."); } @@ -189,7 +196,7 @@ pub fn uumain(args: Vec) -> int { let mut fp = match open_file(i) { Err(e) => { - show_warning!("{}: {}",i,e); + show_warning!("{}: {}", i, e); continue; } Ok(f) => f diff --git a/fmt/linebreak.rs b/fmt/linebreak.rs index 727f014d4..89f85b164 100644 --- a/fmt/linebreak.rs +++ b/fmt/linebreak.rs @@ -9,6 +9,10 @@ use FmtOptions; use parasplit::{Paragraph, ParaWords, WordInfo}; +use std::i64; +use std::cmp; +use std::mem; +use std::num; struct BreakArgs<'a> { opts : &'a FmtOptions, @@ -21,8 +25,16 @@ struct BreakArgs<'a> { impl<'a> BreakArgs<'a> { #[inline(always)] - fn compute_width(&self, pre: uint, post: uint, posn: uint) -> uint { - post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn + fn compute_width<'b>(&self, winfo: &WordInfo<'b>, posn: uint, fresh: bool) -> uint { + if fresh { + 0 + } else { + let post = winfo.after_tab; + match winfo.before_tab { + None => post, + Some(pre) => post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn + } + } } } @@ -73,91 +85,366 @@ pub fn break_lines(para: &Paragraph, opts: &FmtOptions, ostream: &mut Box>>(iter: &'a mut T, args: &mut BreakArgs<'a>) { +// break_simple implements a "greedy" breaking algorithm: print words until +// maxlength would be exceeded, then print a linebreak and indent and continue. 
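+// For example (an editorial illustration, not part of the original comment): with
+// no indent and a maximum width of 10 columns, the input words "aaa bb cccc dd"
+// are emitted as
+//     aaa bb
+//     cccc dd
+// because appending " cccc" to the first line would make it 11 columns wide.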
+fn break_simple<'a, T: Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) { iter.fold((args.init_len, false), |l, winfo| accum_words_simple(args, l, winfo)); silent_unwrap!(args.ostream.write_char('\n')); } +#[inline(always)] fn accum_words_simple<'a>(args: &mut BreakArgs<'a>, (l, prev_punct): (uint, bool), winfo: &'a WordInfo<'a>) -> (uint, bool) { // compute the length of this word, considering how tabs will expand at this position on the line - let wlen = winfo.word_nchars + - if winfo.before_tab.is_some() { - args.compute_width(winfo.before_tab.unwrap(), winfo.after_tab, l) - } else { - winfo.after_tab - }; + let wlen = winfo.word_nchars + args.compute_width(winfo, l, false); - let splen = - if args.uniform || winfo.new_line { - if winfo.sentence_start || (winfo.new_line && prev_punct) { 2 } - else { 1 } - } else { - 0 - }; + let slen = compute_slen(args.uniform, winfo.new_line, winfo.sentence_start, prev_punct); - if l + wlen + splen > args.opts.width { - let wtrim = winfo.word.slice_from(winfo.word_start); - silent_unwrap!(args.ostream.write_char('\n')); - silent_unwrap!(args.ostream.write(args.indent_str.as_bytes())); - silent_unwrap!(args.ostream.write(wtrim.as_bytes())); - (args.indent_len + wtrim.len(), winfo.ends_punct) + if l + wlen + slen > args.opts.width { + write_newline(args.indent_str, args.ostream); + write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream); + (args.indent_len + winfo.word_nchars, winfo.ends_punct) } else { - if splen == 2 { silent_unwrap!(args.ostream.write(" ".as_bytes())); } - else if splen == 1 { silent_unwrap!(args.ostream.write_char(' ')) } - silent_unwrap!(args.ostream.write(winfo.word.as_bytes())); - (l + wlen + splen, winfo.ends_punct) + write_with_spaces(winfo.word, slen, args.ostream); + (l + wlen + slen, winfo.ends_punct) } } -#[allow(dead_code)] -enum PreviousBreak<'a> { - ParaStart, - PrevBreak(&'a LineBreak<'a>) +// break_knuth_plass implements an "optimal" breaking algorithm in the style of +// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software, +// Practice and Experience. Vol. 11, No. 11, November 1981. +// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf +fn break_knuth_plass<'a, T: Clone + Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) { + // run the algorithm to get the breakpoints + let breakpoints = find_kp_breakpoints(iter.clone(), args); + + // iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it + let (mut prev_punct, mut fresh) = + breakpoints.iter().rev().fold((false, false), |(mut prev_punct, mut fresh), &(next_break, break_before)| { + if fresh { + write_newline(args.indent_str, args.ostream); + } + // at each breakpoint, keep emitting words until we find the word matching this breakpoint + for winfo in iter { + let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform, + winfo.new_line, winfo.sentence_start, prev_punct); + fresh = false; + prev_punct = winfo.ends_punct; + + // We find identical breakpoints here by comparing addresses of the references. + // This is OK because the backing vector is not mutating once we are linebreaking. 
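+ // (Editorial note: casting the two &WordInfo references to raw pointers and
+ // comparing them is an identity check -- "is this the very same WordInfo?" --
+ // not a structural equality check, so two different words that happen to have
+ // identical fields can never be confused with one another.)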
+ if winfo as *_ == next_break as *_ { + // OK, we found the matching word + if break_before { + write_newline(args.indent_str, args.ostream); + write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream); + } else { + // breaking after this word, so that means "fresh" is true for the next iteration + write_with_spaces(word, slen, args.ostream); + fresh = true; + } + break; + } else { + write_with_spaces(word, slen, args.ostream); + } + } + (prev_punct, fresh) + }); + + // after the last linebreak, write out the rest of the final line. + for winfo in iter { + if fresh { + write_newline(args.indent_str, args.ostream); + } + let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform, + winfo.new_line, winfo.sentence_start, prev_punct); + prev_punct = winfo.ends_punct; + fresh = false; + write_with_spaces(word, slen, args.ostream); + } + silent_unwrap!(args.ostream.write_char('\n')); } -#[allow(dead_code)] struct LineBreak<'a> { - prev : PreviousBreak<'a>, - breakafter : &'a str, - demerits : uint + prev : uint, + linebreak : Option<&'a WordInfo<'a>>, + break_before : bool, + demerits : i64, + prev_rat : f32, + length : uint, + fresh : bool } -// when comparing two LineBreaks, compare their demerits -#[allow(dead_code)] -impl<'a> PartialEq for LineBreak<'a> { - fn eq(&self, other: &LineBreak) -> bool { - self.demerits == other.demerits +fn find_kp_breakpoints<'a, T: Iterator<&'a WordInfo<'a>>>(iter: T, args: &BreakArgs<'a>) -> Vec<(&'a WordInfo<'a>, bool)> { + let mut iter = iter.peekable(); + // set up the initial null linebreak + let mut linebreaks = vec!(LineBreak { + prev : 0, + linebreak : None, + break_before : false, + demerits : 0, + prev_rat : 0.0f32, + length : args.init_len, + fresh : false + }); + // this vec holds the current active linebreaks; next_ holds the breaks that will be active for the next word + let active_breaks = &mut vec!(0); + let next_active_breaks = &mut vec!(); + + let stretch = (args.opts.width - args.opts.goal) as int; + let minlength = args.opts.goal - stretch as uint; + let mut new_linebreaks = vec!(); + let mut is_sentence_start = false; + let mut least_demerits = 0; + loop { + let w = + match iter.next() { + None => break, + Some(w) => w + }; + + // if this is the last word, we don't add additional demerits for this break + let (is_last_word, is_sentence_end) = + match iter.peek() { + None => (true, true), + Some(&&WordInfo { sentence_start: st, new_line: nl, .. }) => (false, st || (nl && w.ends_punct)) + }; + + // should we be adding extra space at the beginning of the next sentence? 
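+ // (Editorial note: slen here is the separator width that compute_slen assigns
+ // before this word; it is added into tlen below, so each candidate line length
+ // is measured including that space.)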
+ let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false); + + let mut ld_new = i64::MAX; + let mut ld_next = i64::MAX; + let mut ld_idx = 0; + new_linebreaks.clear(); + next_active_breaks.clear(); + // go through each active break, extending it and possibly adding a new active + // break if we are above the minimum required length + for &i in active_breaks.iter() { + let active = linebreaks.get_mut(i); + // normalize demerits to avoid overflow, and record if this is the least + active.demerits -= least_demerits; + if active.demerits < ld_next { + ld_next = active.demerits; + ld_idx = i; + } + + // get the new length + let tlen = w.word_nchars + args.compute_width(w, active.length, active.fresh) + slen + active.length; + + // if tlen is longer than args.opts.width, we drop this break from the active list + // otherwise, we extend the break, and possibly add a new break at this point + if tlen <= args.opts.width { + // this break will still be active next time + next_active_breaks.push(i); + // we can put this word on this line + active.fresh = false; + active.length = tlen; + + // if we're above the minlength, we can also consider breaking here + if tlen >= minlength { + let (new_demerits, new_ratio) = + if is_last_word { + // there is no penalty for the final line's length + (0, 0.0) + } else { + compute_demerits((args.opts.goal - tlen) as int, stretch, w.word_nchars as int, active.prev_rat) + }; + + // do not even consider adding a line that has too many demerits + // also, try to detect overflow by checking signum + let total_demerits = new_demerits + active.demerits; + if new_demerits < BAD_INFTY_SQ && total_demerits < ld_new && num::signum(active.demerits) <= num::signum(new_demerits) { + ld_new = total_demerits; + new_linebreaks.push(LineBreak { + prev : i, + linebreak : Some(w), + break_before : false, + demerits : total_demerits, + prev_rat : new_ratio, + length : args.indent_len, + fresh : true + }); + } + } + } + } + + // if we generated any new linebreaks, add the last one to the list + // the last one is always the best because we don't add to new_linebreaks unless + // it's better than the best one so far + match new_linebreaks.pop() { + None => (), + Some(lb) => { + next_active_breaks.push(linebreaks.len()); + linebreaks.push(lb); + } + } + + if next_active_breaks.is_empty() { + // every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx + let new_break = restart_active_breaks(args, linebreaks.get(ld_idx), ld_idx, w, slen, minlength); + next_active_breaks.push(linebreaks.len()); + linebreaks.push(new_break); + least_demerits = 0; + } else { + // next time around, normalize out the demerits fields + // on active linebreaks to make overflow less likely + least_demerits = cmp::max(ld_next, 0); + } + // swap in new list of active breaks + mem::swap(active_breaks, next_active_breaks); + // If this was the last word in a sentence, the next one must be the first in the next. + is_sentence_start = is_sentence_end; + } + + // return the best path + build_best_path(&linebreaks, active_breaks) +} + +#[inline(always)] +fn build_best_path<'a>(paths: &Vec>, active: &Vec) -> Vec<(&'a WordInfo<'a>, bool)> { + let mut breakwords = vec!(); + // of the active paths, we select the one with the fewest demerits + let mut best_idx = match active.iter().min_by(|&&a| paths.get(a).demerits) { + None => crash!(1, "Failed to find a k-p linebreak solution. 
This should never happen."), + Some(&s) => s + }; + + // now, chase the pointers back through the break list, recording + // the words at which we should break + loop { + let next_best = paths.get(best_idx); + match next_best.linebreak { + None => return breakwords, + Some(prev) => { + breakwords.push((prev, next_best.break_before)); + best_idx = next_best.prev + } + } } } -// NOTE "less than" in this case means "worse", i.e., more demerits -#[allow(dead_code)] -impl<'a> PartialOrd for LineBreak<'a> { - fn lt(&self, other: &LineBreak) -> bool { - self.demerits > other.demerits +// "infinite" badness is more like (1+BAD_INFTY)^2 because of how demerits are computed +static BAD_INFTY: i64 = 10000000; +static BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY; +// badness = BAD_MULT * abs(r) ^ 3 +static BAD_MULT: f32 = 100.0; +// DR_MULT is multiplier for delta-R between lines +static DR_MULT: f32 = 600.0; +// DL_MULT is penalty multiplier for short words at end of line +static DL_MULT: f32 = 300.0; + +#[inline(always)] +fn compute_demerits(delta_len: int, stretch: int, wlen: int, prev_rat: f32) -> (i64, f32) { + // how much stretch are we using? + let ratio = + if delta_len == 0 { + 0.0f32 + } else { + delta_len as f32 / stretch as f32 + }; + + // compute badness given the stretch ratio + let bad_linelen = + if num::abs(ratio) > 1.0f32 { + BAD_INFTY + } else { + (BAD_MULT * num::abs(num::pow(ratio, 3))) as i64 + }; + + // we penalize lines ending in really short words + let bad_wordlen = + if wlen >= stretch { + 0 + } else { + (DL_MULT * num::abs(num::pow((stretch - wlen) as f32 / (stretch - 1) as f32, 3))) as i64 + }; + + // we penalize lines that have very different ratios from previous lines + let bad_deltaR = (DR_MULT * num::abs(num::pow((ratio - prev_rat) / 2.0, 3))) as i64; + + let demerits = num::pow(1 + bad_linelen + bad_wordlen + bad_deltaR, 2); + + (demerits, ratio) +} + +#[inline(always)] +fn restart_active_breaks<'a>(args: &BreakArgs<'a>, active: &LineBreak<'a>, act_idx: uint, w: &'a WordInfo<'a>, slen: uint, min: uint) -> LineBreak<'a> { + let (break_before, line_length) = + if active.fresh { + // never break before a word if that word would be the first on a line + (false, args.indent_len) + } else { + // choose the lesser evil: breaking too early, or breaking too late + let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh); + let underlen: int = (min - active.length) as int; + let overlen: int = ((wlen + slen + active.length) - args.opts.width) as int; + if overlen > underlen { + // break early, put this word on the next line + (true, args.indent_len + w.word_nchars) + } else { + (false, args.indent_len) + } + }; + + // restart the linebreak. This will be our only active path. + LineBreak { + prev : act_idx, + linebreak : Some(w), + break_before : break_before, + demerits : 0, // this is the only active break, so we can reset the demerit count + prev_rat : if break_before { 1.0 } else { -1.0 }, + length : line_length, + fresh : !break_before } } -// we have to satisfy Eq to implement Ord -#[allow(dead_code)] -impl<'a> Eq for LineBreak<'a> {} - -// NOTE again here we reverse the ordering: -// if other has more demerits, self is Greater -#[allow(dead_code)] -impl<'a> Ord for LineBreak<'a> { - fn cmp(&self, other: &LineBreak) -> Ordering { - other.demerits.cmp(&self.demerits) +// Number of spaces to add before a word, based on mode, newline, sentence start. 
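+// (Editorial note: concretely, in uniform mode or at an original line break this is
+// two spaces when the word starts a sentence, or follows end-of-sentence punctuation
+// at a line break, and one space otherwise; in all other cases it is zero.)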
+#[inline(always)] +fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> uint { + if uniform || newline { + if start || (newline && punct) { + 2 + } else { + 1 + } + } else { + 0 } } +// If we're on a fresh line, slen=0 and we slice off leading whitespace. +// Otherwise, compute slen and leave whitespace alone. +#[inline(always)] +fn slice_if_fresh<'a>(fresh: bool, word: &'a str, start: uint, uniform: bool, newline: bool, sstart: bool, punct: bool) -> (uint, &'a str) { + if fresh { + (0, word.slice_from(start)) + } else { + (compute_slen(uniform, newline, sstart, punct), word) + } +} + +// Write a newline and add the indent. +#[inline(always)] +fn write_newline(indent: &str, ostream: &mut Box) { + silent_unwrap!(ostream.write_char('\n')); + silent_unwrap!(ostream.write(indent.as_bytes())); +} + +// Write the word, along with slen spaces. +#[inline(always)] +fn write_with_spaces(word: &str, slen: uint, ostream: &mut Box) { + if slen == 2 { + silent_unwrap!(ostream.write(" ".as_bytes())); + } else if slen == 1 { + silent_unwrap!(ostream.write_char(' ')); + } + silent_unwrap!(ostream.write(word.as_bytes())); +} diff --git a/fmt/parasplit.rs b/fmt/parasplit.rs index c4833d02f..50911a93b 100644 --- a/fmt/parasplit.rs +++ b/fmt/parasplit.rs @@ -13,6 +13,21 @@ use std::slice::Items; use std::str::CharRange; use FileOrStdReader; use FmtOptions; +use charwidth; + +#[inline(always)] +fn char_width(c: char) -> uint { + if (c as uint) < 0xA0 { + // if it is ASCII, call it exactly 1 wide (including control chars) + // calling control chars' widths 1 is consistent with OpenBSD fmt + 1 + } else { + // otherwise, get the unicode width + // note that we shouldn't actually get None here because only c < 0xA0 + // can return None, but for safety and future-proofing we do it this way + charwidth::width(c).unwrap_or(1) + } +} // lines with PSKIP, lacking PREFIX, or which are entirely blank are // NoFormatLines; otherwise, they are FormatLines @@ -117,7 +132,7 @@ impl<'a> FileLines<'a> { indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth; } else { // non-tab character - indent_len += 1; + indent_len += char_width(c); } } (indent_end, prefix_len, indent_len) @@ -196,7 +211,7 @@ pub struct Paragraph { // an iterator producing a stream of paragraphs from a stream of lines // given a set of options. 
pub struct ParagraphStream<'a> { - lines : Peekable>, + lines : Peekable>, next_mail : bool, opts : &'a FmtOptions, } @@ -238,8 +253,8 @@ impl<'a> ParagraphStream<'a> { } } -impl<'a> Iterator> for ParagraphStream<'a> { - fn next(&mut self) -> Option> { +impl<'a> Iterator> for ParagraphStream<'a> { + fn next(&mut self) -> Option> { // return a NoFormatLine in an Err; it should immediately be output let noformat = match self.lines.peek() { @@ -396,39 +411,37 @@ impl<'a> ParaWords<'a> { // no extra spacing for mail headers; always exactly 1 space // safe to trim_left on every line of a mail header, since the // first line is guaranteed not to have any spaces - self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo { + self.words.extend(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo { word : x, word_start : 0, - word_nchars : x.char_len(), + word_nchars : x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped) before_tab : None, after_tab : 0, sentence_start : false, ends_punct : false, new_line : false - }).collect()); + })); } else { // first line - self.words.push_all_move( + self.words.extend( if self.opts.crown || self.opts.tagged { // crown and tagged mode has the "init" in the first line, so slice from there WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.init_end)) } else { // otherwise we slice from the indent WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end)) - }.collect()); + }); if self.para.lines.len() > 1 { let indent_end = self.para.indent_end; let opts = self.opts; - self.words.push_all_move( - self.para.lines.iter().skip(1) - .flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end))) - .collect()); + self.words.extend( + self.para.lines.iter().skip(1).flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end)))); } } } - pub fn words(&'a self) -> Items<'a,WordInfo<'a>> { return self.words.iter() } + pub fn words(&'a self) -> Items<'a, WordInfo<'a>> { return self.words.iter() } } struct WordSplit<'a> { @@ -516,7 +529,7 @@ impl<'a> Iterator> for WordSplit<'a> { let mut word_nchars = 0; self.position = match self.string.slice_from(word_start) - .find(|x: char| if !x.is_whitespace() { word_nchars += 1; false } else { true }) { + .find(|x: char| if !x.is_whitespace() { word_nchars += char_width(x); false } else { true }) { None => self.length, Some(s) => s + word_start };
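Editorial addendum (not part of the patch): the sketch below shows how the pieces introduced here fit together to measure the display width of a whole string, written in the same pre-1.0 Rust dialect the patch uses. It assumes the char_width() helper from fmt/parasplit.rs is in scope; the function name str_display_width is purely illustrative.

    fn str_display_width(s: &str) -> uint {
        // sum the column width of each character, the same way WordSplit
        // accumulates word_nchars above
        s.chars().fold(0, |total, c| total + char_width(c))
    }

For example, str_display_width("abc") is 3, while a string of three CJK ideographs, which the charwidth table maps to width 2 each, would measure 6.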