Merge pull request #318 from kwantam/master

fmt: Knuth-Plass implementation; common: add unicode char_width function
Arcterus 2014-06-30 16:11:04 -07:00
commit 760be3f9e1
4 changed files with 563 additions and 86 deletions

fmt/charwidth.rs (new file)

@@ -0,0 +1,170 @@
/*
* This file is part of `fmt` from the uutils coreutils package.
*
* (c) kwantam <kwantam@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
fn bsearch_range_value_table(c: char, r: &'static [(char, char, uint)]) -> uint {
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, result) = r[idx];
result
}
None => 1
}
}
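For readers more familiar with current Rust, the same range lookup can be written against binary_search_by (the slice::bsearch API used above was removed before Rust 1.0). This is an illustrative sketch only, with lookup_width as a hypothetical name; it is not part of the commit:

// Hypothetical modern-Rust sketch of the range lookup: binary-search the
// sorted (lo, hi, width) ranges, defaulting to width 1 when no range matches.
fn lookup_width(c: char, table: &[(char, char, usize)]) -> usize {
    use std::cmp::Ordering::{Equal, Greater, Less};
    table
        .binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        })
        .map(|idx| table[idx].2)
        .unwrap_or(1)
}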
pub fn width(c: char) -> Option<uint> {
match c as uint {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, charwidth_table))
}
}
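Note that width() is per character and returns None for control characters, so a caller that wants the display width of a whole string has to decide how to count those. A minimal sketch in the same pre-1.0 dialect as this file (str_width is hypothetical and not part of the commit; it counts the None cases as one column, much as parasplit's char_width below does for the sub-0xA0 range):

// Hypothetical helper: display width of a whole string, using width() above.
fn str_width(s: &str) -> uint {
    s.chars().fold(0, |cols, c| cols + width(c).unwrap_or(1))
}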
// character width table. Based on Markus Kuhn's free wcwidth() implementation,
// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
static charwidth_table : &'static [(char, char, uint)] = &[
('\u0300', '\u036f', 0), ('\u0483', '\u0487', 0), ('\u0488', '\u0489', 0), ('\u0591',
'\u05bd', 0), ('\u05bf', '\u05bf', 0), ('\u05c1', '\u05c2', 0), ('\u05c4', '\u05c5', 0),
('\u05c7', '\u05c7', 0), ('\u0600', '\u0605', 0), ('\u0610', '\u061a', 0), ('\u061c',
'\u061c', 0), ('\u064b', '\u065f', 0), ('\u0670', '\u0670', 0), ('\u06d6', '\u06dc', 0),
('\u06dd', '\u06dd', 0), ('\u06df', '\u06e4', 0), ('\u06e7', '\u06e8', 0), ('\u06ea',
'\u06ed', 0), ('\u070f', '\u070f', 0), ('\u0711', '\u0711', 0), ('\u0730', '\u074a', 0),
('\u07a6', '\u07b0', 0), ('\u07eb', '\u07f3', 0), ('\u0816', '\u0819', 0), ('\u081b',
'\u0823', 0), ('\u0825', '\u0827', 0), ('\u0829', '\u082d', 0), ('\u0859', '\u085b', 0),
('\u08e4', '\u0902', 0), ('\u093a', '\u093a', 0), ('\u093c', '\u093c', 0), ('\u0941',
'\u0948', 0), ('\u094d', '\u094d', 0), ('\u0951', '\u0957', 0), ('\u0962', '\u0963', 0),
('\u0981', '\u0981', 0), ('\u09bc', '\u09bc', 0), ('\u09c1', '\u09c4', 0), ('\u09cd',
'\u09cd', 0), ('\u09e2', '\u09e3', 0), ('\u0a01', '\u0a02', 0), ('\u0a3c', '\u0a3c', 0),
('\u0a41', '\u0a51', 0), ('\u0a70', '\u0a71', 0), ('\u0a75', '\u0a82', 0), ('\u0abc',
'\u0abc', 0), ('\u0ac1', '\u0ac8', 0), ('\u0acd', '\u0acd', 0), ('\u0ae2', '\u0ae3', 0),
('\u0b01', '\u0b01', 0), ('\u0b3c', '\u0b3c', 0), ('\u0b3f', '\u0b3f', 0), ('\u0b41',
'\u0b44', 0), ('\u0b4d', '\u0b56', 0), ('\u0b62', '\u0b63', 0), ('\u0b82', '\u0b82', 0),
('\u0bc0', '\u0bc0', 0), ('\u0bcd', '\u0bcd', 0), ('\u0c00', '\u0c00', 0), ('\u0c3e',
'\u0c40', 0), ('\u0c46', '\u0c56', 0), ('\u0c62', '\u0c63', 0), ('\u0c81', '\u0c81', 0),
('\u0cbc', '\u0cbc', 0), ('\u0cbf', '\u0cbf', 0), ('\u0cc6', '\u0cc6', 0), ('\u0ccc',
'\u0ccd', 0), ('\u0ce2', '\u0ce3', 0), ('\u0d01', '\u0d01', 0), ('\u0d41', '\u0d44', 0),
('\u0d4d', '\u0d4d', 0), ('\u0d62', '\u0d63', 0), ('\u0dca', '\u0dca', 0), ('\u0dd2',
'\u0dd6', 0), ('\u0e31', '\u0e31', 0), ('\u0e34', '\u0e3a', 0), ('\u0e47', '\u0e4e', 0),
('\u0eb1', '\u0eb1', 0), ('\u0eb4', '\u0ebc', 0), ('\u0ec8', '\u0ecd', 0), ('\u0f18',
'\u0f19', 0), ('\u0f35', '\u0f35', 0), ('\u0f37', '\u0f37', 0), ('\u0f39', '\u0f39', 0),
('\u0f71', '\u0f7e', 0), ('\u0f80', '\u0f84', 0), ('\u0f86', '\u0f87', 0), ('\u0f8d',
'\u0fbc', 0), ('\u0fc6', '\u0fc6', 0), ('\u102d', '\u1030', 0), ('\u1032', '\u1037', 0),
('\u1039', '\u103a', 0), ('\u103d', '\u103e', 0), ('\u1058', '\u1059', 0), ('\u105e',
'\u1060', 0), ('\u1071', '\u1074', 0), ('\u1082', '\u1082', 0), ('\u1085', '\u1086', 0),
('\u108d', '\u108d', 0), ('\u109d', '\u109d', 0), ('\u1100', '\u115f', 2), ('\u1160',
'\u11ff', 0), ('\u135d', '\u135f', 0), ('\u1712', '\u1714', 0), ('\u1732', '\u1734', 0),
('\u1752', '\u1753', 0), ('\u1772', '\u1773', 0), ('\u17b4', '\u17b5', 0), ('\u17b7',
'\u17bd', 0), ('\u17c6', '\u17c6', 0), ('\u17c9', '\u17d3', 0), ('\u17dd', '\u17dd', 0),
('\u180b', '\u180d', 0), ('\u180e', '\u180e', 0), ('\u18a9', '\u18a9', 0), ('\u1920',
'\u1922', 0), ('\u1927', '\u1928', 0), ('\u1932', '\u1932', 0), ('\u1939', '\u193b', 0),
('\u1a17', '\u1a18', 0), ('\u1a1b', '\u1a1b', 0), ('\u1a56', '\u1a56', 0), ('\u1a58',
'\u1a60', 0), ('\u1a62', '\u1a62', 0), ('\u1a65', '\u1a6c', 0), ('\u1a73', '\u1a7f', 0),
('\u1ab0', '\u1abd', 0), ('\u1abe', '\u1abe', 0), ('\u1b00', '\u1b03', 0), ('\u1b34',
'\u1b34', 0), ('\u1b36', '\u1b3a', 0), ('\u1b3c', '\u1b3c', 0), ('\u1b42', '\u1b42', 0),
('\u1b6b', '\u1b73', 0), ('\u1b80', '\u1b81', 0), ('\u1ba2', '\u1ba5', 0), ('\u1ba8',
'\u1ba9', 0), ('\u1bab', '\u1bad', 0), ('\u1be6', '\u1be6', 0), ('\u1be8', '\u1be9', 0),
('\u1bed', '\u1bed', 0), ('\u1bef', '\u1bf1', 0), ('\u1c2c', '\u1c33', 0), ('\u1c36',
'\u1c37', 0), ('\u1cd0', '\u1cd2', 0), ('\u1cd4', '\u1ce0', 0), ('\u1ce2', '\u1ce8', 0),
('\u1ced', '\u1ced', 0), ('\u1cf4', '\u1cf4', 0), ('\u1cf8', '\u1cf9', 0), ('\u1dc0',
'\u1dff', 0), ('\u200b', '\u200f', 0), ('\u202a', '\u202e', 0), ('\u2060', '\u206f', 0),
('\u20d0', '\u20dc', 0), ('\u20dd', '\u20e0', 0), ('\u20e1', '\u20e1', 0), ('\u20e2',
'\u20e4', 0), ('\u20e5', '\u20f0', 0), ('\u2329', '\u2329', 2), ('\u232a', '\u232a', 2),
('\u2cef', '\u2cf1', 0), ('\u2d7f', '\u2d7f', 0), ('\u2de0', '\u2dff', 0), ('\u2e80',
'\u2e99', 2), ('\u2e9b', '\u2ef3', 2), ('\u2f00', '\u2fd5', 2), ('\u2ff0', '\u2ffb', 2),
('\u3000', '\u3000', 2), ('\u3001', '\u3003', 2), ('\u3004', '\u3004', 2), ('\u3005',
'\u3005', 2), ('\u3006', '\u3006', 2), ('\u3007', '\u3007', 2), ('\u3008', '\u3008', 2),
('\u3009', '\u3009', 2), ('\u300a', '\u300a', 2), ('\u300b', '\u300b', 2), ('\u300c',
'\u300c', 2), ('\u300d', '\u300d', 2), ('\u300e', '\u300e', 2), ('\u300f', '\u300f', 2),
('\u3010', '\u3010', 2), ('\u3011', '\u3011', 2), ('\u3012', '\u3013', 2), ('\u3014',
'\u3014', 2), ('\u3015', '\u3015', 2), ('\u3016', '\u3016', 2), ('\u3017', '\u3017', 2),
('\u3018', '\u3018', 2), ('\u3019', '\u3019', 2), ('\u301a', '\u301a', 2), ('\u301b',
'\u301b', 2), ('\u301c', '\u301c', 2), ('\u301d', '\u301d', 2), ('\u301e', '\u301f', 2),
('\u3020', '\u3020', 2), ('\u3021', '\u3029', 2), ('\u302a', '\u302d', 0), ('\u302a',
'\u302d', 2), ('\u302e', '\u302f', 2), ('\u3030', '\u3030', 2), ('\u3031', '\u3035', 2),
('\u3036', '\u3037', 2), ('\u3038', '\u303a', 2), ('\u303b', '\u303b', 2), ('\u303c',
'\u303c', 2), ('\u303d', '\u303d', 2), ('\u303e', '\u303e', 2), ('\u3041', '\u3096', 2),
('\u3099', '\u309a', 0), ('\u3099', '\u309a', 2), ('\u309b', '\u309c', 2), ('\u309d',
'\u309e', 2), ('\u309f', '\u309f', 2), ('\u30a0', '\u30a0', 2), ('\u30a1', '\u30fa', 2),
('\u30fb', '\u30fb', 2), ('\u30fc', '\u30fe', 2), ('\u30ff', '\u30ff', 2), ('\u3105',
'\u312d', 2), ('\u3131', '\u318e', 2), ('\u3190', '\u3191', 2), ('\u3192', '\u3195', 2),
('\u3196', '\u319f', 2), ('\u31a0', '\u31ba', 2), ('\u31c0', '\u31e3', 2), ('\u31f0',
'\u31ff', 2), ('\u3200', '\u321e', 2), ('\u3220', '\u3229', 2), ('\u322a', '\u3247', 2),
('\u3250', '\u3250', 2), ('\u3251', '\u325f', 2), ('\u3260', '\u327f', 2), ('\u3280',
'\u3289', 2), ('\u328a', '\u32b0', 2), ('\u32b1', '\u32bf', 2), ('\u32c0', '\u32fe', 2),
('\u3300', '\u33ff', 2), ('\u3400', '\u4db5', 2), ('\u4db6', '\u4dbf', 2), ('\u4e00',
'\u9fcc', 2), ('\u9fcd', '\u9fff', 2), ('\ua000', '\ua014', 2), ('\ua015', '\ua015', 2),
('\ua016', '\ua48c', 2), ('\ua490', '\ua4c6', 2), ('\ua66f', '\ua66f', 0), ('\ua670',
'\ua672', 0), ('\ua674', '\ua67d', 0), ('\ua69f', '\ua69f', 0), ('\ua6f0', '\ua6f1', 0),
('\ua802', '\ua802', 0), ('\ua806', '\ua806', 0), ('\ua80b', '\ua80b', 0), ('\ua825',
'\ua826', 0), ('\ua8c4', '\ua8c4', 0), ('\ua8e0', '\ua8f1', 0), ('\ua926', '\ua92d', 0),
('\ua947', '\ua951', 0), ('\ua960', '\ua97c', 2), ('\ua980', '\ua982', 0), ('\ua9b3',
'\ua9b3', 0), ('\ua9b6', '\ua9b9', 0), ('\ua9bc', '\ua9bc', 0), ('\ua9e5', '\ua9e5', 0),
('\uaa29', '\uaa2e', 0), ('\uaa31', '\uaa32', 0), ('\uaa35', '\uaa36', 0), ('\uaa43',
'\uaa43', 0), ('\uaa4c', '\uaa4c', 0), ('\uaa7c', '\uaa7c', 0), ('\uaab0', '\uaab0', 0),
('\uaab2', '\uaab4', 0), ('\uaab7', '\uaab8', 0), ('\uaabe', '\uaabf', 0), ('\uaac1',
'\uaac1', 0), ('\uaaec', '\uaaed', 0), ('\uaaf6', '\uaaf6', 0), ('\uabe5', '\uabe5', 0),
('\uabe8', '\uabe8', 0), ('\uabed', '\uabed', 0), ('\uac00', '\ud7a3', 2), ('\uf900',
'\ufa6d', 2), ('\ufa6e', '\ufa6f', 2), ('\ufa70', '\ufad9', 2), ('\ufada', '\ufaff', 2),
('\ufb1e', '\ufb1e', 0), ('\ufe00', '\ufe0f', 0), ('\ufe10', '\ufe16', 2), ('\ufe17',
'\ufe17', 2), ('\ufe18', '\ufe18', 2), ('\ufe19', '\ufe19', 2), ('\ufe20', '\ufe2d', 0),
('\ufe30', '\ufe30', 2), ('\ufe31', '\ufe32', 2), ('\ufe33', '\ufe34', 2), ('\ufe35',
'\ufe35', 2), ('\ufe36', '\ufe36', 2), ('\ufe37', '\ufe37', 2), ('\ufe38', '\ufe38', 2),
('\ufe39', '\ufe39', 2), ('\ufe3a', '\ufe3a', 2), ('\ufe3b', '\ufe3b', 2), ('\ufe3c',
'\ufe3c', 2), ('\ufe3d', '\ufe3d', 2), ('\ufe3e', '\ufe3e', 2), ('\ufe3f', '\ufe3f', 2),
('\ufe40', '\ufe40', 2), ('\ufe41', '\ufe41', 2), ('\ufe42', '\ufe42', 2), ('\ufe43',
'\ufe43', 2), ('\ufe44', '\ufe44', 2), ('\ufe45', '\ufe46', 2), ('\ufe47', '\ufe47', 2),
('\ufe48', '\ufe48', 2), ('\ufe49', '\ufe4c', 2), ('\ufe4d', '\ufe4f', 2), ('\ufe50',
'\ufe52', 2), ('\ufe54', '\ufe57', 2), ('\ufe58', '\ufe58', 2), ('\ufe59', '\ufe59', 2),
('\ufe5a', '\ufe5a', 2), ('\ufe5b', '\ufe5b', 2), ('\ufe5c', '\ufe5c', 2), ('\ufe5d',
'\ufe5d', 2), ('\ufe5e', '\ufe5e', 2), ('\ufe5f', '\ufe61', 2), ('\ufe62', '\ufe62', 2),
('\ufe63', '\ufe63', 2), ('\ufe64', '\ufe66', 2), ('\ufe68', '\ufe68', 2), ('\ufe69',
'\ufe69', 2), ('\ufe6a', '\ufe6b', 2), ('\ufeff', '\ufeff', 0), ('\uff01', '\uff03', 2),
('\uff04', '\uff04', 2), ('\uff05', '\uff07', 2), ('\uff08', '\uff08', 2), ('\uff09',
'\uff09', 2), ('\uff0a', '\uff0a', 2), ('\uff0b', '\uff0b', 2), ('\uff0c', '\uff0c', 2),
('\uff0d', '\uff0d', 2), ('\uff0e', '\uff0f', 2), ('\uff10', '\uff19', 2), ('\uff1a',
'\uff1b', 2), ('\uff1c', '\uff1e', 2), ('\uff1f', '\uff20', 2), ('\uff21', '\uff3a', 2),
('\uff3b', '\uff3b', 2), ('\uff3c', '\uff3c', 2), ('\uff3d', '\uff3d', 2), ('\uff3e',
'\uff3e', 2), ('\uff3f', '\uff3f', 2), ('\uff40', '\uff40', 2), ('\uff41', '\uff5a', 2),
('\uff5b', '\uff5b', 2), ('\uff5c', '\uff5c', 2), ('\uff5d', '\uff5d', 2), ('\uff5e',
'\uff5e', 2), ('\uff5f', '\uff5f', 2), ('\uff60', '\uff60', 2), ('\uffe0', '\uffe1', 2),
('\uffe2', '\uffe2', 2), ('\uffe3', '\uffe3', 2), ('\uffe4', '\uffe4', 2), ('\uffe5',
'\uffe6', 2), ('\ufff9', '\ufffb', 0), ('\U000101fd', '\U000101fd', 0), ('\U000102e0',
'\U000102e0', 0), ('\U00010376', '\U0001037a', 0), ('\U00010a01', '\U00010a0f', 0),
('\U00010a38', '\U00010a3f', 0), ('\U00010ae5', '\U00010ae6', 0), ('\U00011001',
'\U00011001', 0), ('\U00011038', '\U00011046', 0), ('\U0001107f', '\U00011081', 0),
('\U000110b3', '\U000110b6', 0), ('\U000110b9', '\U000110ba', 0), ('\U000110bd',
'\U000110bd', 0), ('\U00011100', '\U00011102', 0), ('\U00011127', '\U0001112b', 0),
('\U0001112d', '\U00011134', 0), ('\U00011173', '\U00011173', 0), ('\U00011180',
'\U00011181', 0), ('\U000111b6', '\U000111be', 0), ('\U0001122f', '\U00011231', 0),
('\U00011234', '\U00011234', 0), ('\U00011236', '\U00011237', 0), ('\U000112df',
'\U000112df', 0), ('\U000112e3', '\U000112ea', 0), ('\U00011301', '\U00011301', 0),
('\U0001133c', '\U0001133c', 0), ('\U00011340', '\U00011340', 0), ('\U00011366',
'\U00011374', 0), ('\U000114b3', '\U000114b8', 0), ('\U000114ba', '\U000114ba', 0),
('\U000114bf', '\U000114c0', 0), ('\U000114c2', '\U000114c3', 0), ('\U000115b2',
'\U000115b5', 0), ('\U000115bc', '\U000115bd', 0), ('\U000115bf', '\U000115c0', 0),
('\U00011633', '\U0001163a', 0), ('\U0001163d', '\U0001163d', 0), ('\U0001163f',
'\U00011640', 0), ('\U000116ab', '\U000116ab', 0), ('\U000116ad', '\U000116ad', 0),
('\U000116b0', '\U000116b5', 0), ('\U000116b7', '\U000116b7', 0), ('\U00016af0',
'\U00016af4', 0), ('\U00016b30', '\U00016b36', 0), ('\U00016f8f', '\U00016f92', 0),
('\U0001b000', '\U0001b001', 2), ('\U0001bc9d', '\U0001bc9e', 0), ('\U0001bca0',
'\U0001bca3', 0), ('\U0001d167', '\U0001d169', 0), ('\U0001d173', '\U0001d17a', 0),
('\U0001d17b', '\U0001d182', 0), ('\U0001d185', '\U0001d18b', 0), ('\U0001d1aa',
'\U0001d1ad', 0), ('\U0001d242', '\U0001d244', 0), ('\U0001e8d0', '\U0001e8d6', 0),
('\U0001f200', '\U0001f202', 2), ('\U0001f210', '\U0001f23a', 2), ('\U0001f240',
'\U0001f248', 2), ('\U0001f250', '\U0001f251', 2), ('\U00020000', '\U0002a6d6', 2),
('\U0002a6d7', '\U0002a6ff', 2), ('\U0002a700', '\U0002b734', 2), ('\U0002b735',
'\U0002b73f', 2), ('\U0002b740', '\U0002b81d', 2), ('\U0002b81e', '\U0002f7ff', 2),
('\U0002f800', '\U0002fa1d', 2), ('\U0002fa1e', '\U0002fffd', 2), ('\U00030000',
'\U0003fffd', 2), ('\U000e0001', '\U000e007f', 0), ('\U000e0100', '\U000e01ef', 0)
];

fmt/fmt.rs

@@ -1,4 +1,4 @@
#![crate_id(name="fmt", vers="0.0.2", author="kwantam")]
#![crate_id(name="fmt", vers="0.0.3", author="kwantam")]
/*
* This file is part of `fmt` from the uutils coreutils package.
*
@@ -13,6 +13,7 @@
extern crate core;
extern crate getopts;
use std::cmp;
use std::io::{BufferedReader, BufferedWriter, File, IoResult};
use std::io::stdio::{stdin_raw, stdout_raw};
use linebreak::break_lines;
@@ -31,10 +32,11 @@ macro_rules! silent_unwrap(
mod util;
mod linebreak;
mod parasplit;
mod charwidth;
// program's NAME and VERSION are used for -V and -h
static NAME: &'static str = "fmt";
static VERSION: &'static str = "0.0.2";
static VERSION: &'static str = "0.0.3";
struct FmtOptions {
crown : bool,
@@ -48,6 +50,7 @@ struct FmtOptions {
anti_prefix : String,
xanti_prefix : bool,
uniform : bool,
quick : bool,
width : uint,
goal : uint,
tabwidth : uint,
@@ -68,8 +71,10 @@ pub fn uumain(args: Vec<String>) -> int {
getopts::optflag("x", "exact-prefix", "PREFIX must match at the beginning of the line with no preceding whitespace."),
getopts::optflag("X", "exact-skip-prefix", "PSKIP must match at the beginning of the line with no preceding whitespace."),
getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 78.", "WIDTH"),
getopts::optopt("g", "goal", "Goal width, default ~0.92*WIDTH. Must be less than WIDTH.", "GOAL"),
getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 79.", "WIDTH"),
getopts::optopt("g", "goal", "Goal width, default ~0.94*WIDTH. Must be less than WIDTH.", "GOAL"),
getopts::optflag("q", "quick", "Break lines more quickly at the expense of a potentially more ragged appearance."),
getopts::optopt("T", "tab-width", "Treat tabs as TABWIDTH spaces for determining line length, default 8. Note that this is used only for calculating line lengths; tabs are preserved in the output.", "TABWIDTH"),
@@ -96,6 +101,7 @@ pub fn uumain(args: Vec<String>) -> int {
tagged : false,
mail : false,
uniform : false,
quick : false,
split_only : false,
use_prefix : false,
prefix : String::new(),
@@ -103,8 +109,8 @@ pub fn uumain(args: Vec<String>) -> int {
use_anti_prefix : false,
anti_prefix : String::new(),
xanti_prefix : false,
width : 78,
goal : 72,
width : 79,
goal : 74,
tabwidth : 8,
};
@@ -112,6 +118,7 @@ pub fn uumain(args: Vec<String>) -> int {
if matches.opt_present("c") { fmt_opts.crown = true; fmt_opts.tagged = false; }
if matches.opt_present("m") { fmt_opts.mail = true; }
if matches.opt_present("u") { fmt_opts.uniform = true; }
if matches.opt_present("q") { fmt_opts.quick = true; }
if matches.opt_present("s") { fmt_opts.split_only = true; fmt_opts.crown = false; fmt_opts.tagged = false; }
if matches.opt_present("x") { fmt_opts.xprefix = true; }
if matches.opt_present("X") { fmt_opts.xanti_prefix = true; }
@@ -139,7 +146,7 @@ pub fn uumain(args: Vec<String>) -> int {
Some(t) => t,
None => { crash!(1, "Invalid WIDTH specification: `{}'", s); }
};
fmt_opts.goal = std::cmp::min(fmt_opts.width * 92 / 100, fmt_opts.width - 4);
fmt_opts.goal = cmp::min(fmt_opts.width * 94 / 100, fmt_opts.width - 3);
}
None => ()
};
@@ -152,7 +159,7 @@ pub fn uumain(args: Vec<String>) -> int {
None => { crash!(1, "Invalid GOAL specification: `{}'", s); }
};
if !matches.opt_present("w") {
fmt_opts.width = std::cmp::max(fmt_opts.goal * 100 / 92, fmt_opts.goal + 4);
fmt_opts.width = cmp::max(fmt_opts.goal * 100 / 94, fmt_opts.goal + 3);
} else if fmt_opts.goal > fmt_opts.width {
crash!(1, "GOAL cannot be greater than WIDTH.");
}
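Worked through with the new defaults (arithmetic only, not part of the diff): leaving WIDTH at 79 gives goal = min(79 * 94 / 100, 79 - 3) = min(74, 76) = 74, exactly the new default goal; conversely, specifying only GOAL=74 gives width = max(74 * 100 / 94, 74 + 3) = max(78, 77) = 78, one less than the default 79 because of integer division.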
@@ -189,7 +196,7 @@ pub fn uumain(args: Vec<String>) -> int {
let mut fp =
match open_file(i) {
Err(e) => {
show_warning!("{}: {}",i,e);
show_warning!("{}: {}", i, e);
continue;
}
Ok(f) => f

fmt/linebreak.rs

@@ -9,6 +9,10 @@
use FmtOptions;
use parasplit::{Paragraph, ParaWords, WordInfo};
use std::i64;
use std::cmp;
use std::mem;
use std::num;
struct BreakArgs<'a> {
opts : &'a FmtOptions,
@@ -21,8 +25,16 @@ struct BreakArgs<'a> {
impl<'a> BreakArgs<'a> {
#[inline(always)]
fn compute_width(&self, pre: uint, post: uint, posn: uint) -> uint {
post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn
fn compute_width<'b>(&self, winfo: &WordInfo<'b>, posn: uint, fresh: bool) -> uint {
if fresh {
0
} else {
let post = winfo.after_tab;
match winfo.before_tab {
None => post,
Some(pre) => post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn
}
}
}
}
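Evaluating compute_width on concrete values (illustration only, not in the diff): with tabwidth 8, before_tab = Some(3), after_tab = 2 and posn = 10, the tab branch yields 2 + ((3 + 10) / 8 + 1) * 8 - 10 = 2 + 16 - 10 = 8, where 16 is the first tab stop past column 13; with fresh set it returns 0, and with before_tab = None it returns just after_tab.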
@@ -73,91 +85,366 @@ pub fn break_lines(para: &Paragraph, opts: &FmtOptions, ostream: &mut Box<Writer
ostream : ostream
};
break_simple(&mut pWords_words, &mut break_args);
if opts.quick || para.mail_header {
break_simple(pWords_words, &mut break_args);
} else {
break_knuth_plass(pWords_words, &mut break_args);
}
}
/*
* break_simple implements the "tight" breaking algorithm: print words until
* maxlength would be exceeded, then print a linebreak and indent and continue.
* Note that any first line indent should already have been printed before
* calling this function, and the displayed length of said indent passed as
* args.init_len
*/
fn break_simple<'a,T: Iterator<&'a WordInfo<'a>>>(iter: &'a mut T, args: &mut BreakArgs<'a>) {
// break_simple implements a "greedy" breaking algorithm: print words until
// maxlength would be exceeded, then print a linebreak and indent and continue.
fn break_simple<'a, T: Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) {
iter.fold((args.init_len, false), |l, winfo| accum_words_simple(args, l, winfo));
silent_unwrap!(args.ostream.write_char('\n'));
}
#[inline(always)]
fn accum_words_simple<'a>(args: &mut BreakArgs<'a>, (l, prev_punct): (uint, bool), winfo: &'a WordInfo<'a>) -> (uint, bool) {
// compute the length of this word, considering how tabs will expand at this position on the line
let wlen = winfo.word_nchars +
if winfo.before_tab.is_some() {
args.compute_width(winfo.before_tab.unwrap(), winfo.after_tab, l)
} else {
winfo.after_tab
};
let wlen = winfo.word_nchars + args.compute_width(winfo, l, false);
let splen =
if args.uniform || winfo.new_line {
if winfo.sentence_start || (winfo.new_line && prev_punct) { 2 }
else { 1 }
} else {
0
};
let slen = compute_slen(args.uniform, winfo.new_line, winfo.sentence_start, prev_punct);
if l + wlen + splen > args.opts.width {
let wtrim = winfo.word.slice_from(winfo.word_start);
silent_unwrap!(args.ostream.write_char('\n'));
silent_unwrap!(args.ostream.write(args.indent_str.as_bytes()));
silent_unwrap!(args.ostream.write(wtrim.as_bytes()));
(args.indent_len + wtrim.len(), winfo.ends_punct)
if l + wlen + slen > args.opts.width {
write_newline(args.indent_str, args.ostream);
write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream);
(args.indent_len + winfo.word_nchars, winfo.ends_punct)
} else {
if splen == 2 { silent_unwrap!(args.ostream.write(" ".as_bytes())); }
else if splen == 1 { silent_unwrap!(args.ostream.write_char(' ')) }
silent_unwrap!(args.ostream.write(winfo.word.as_bytes()));
(l + wlen + splen, winfo.ends_punct)
write_with_spaces(winfo.word, slen, args.ostream);
(l + wlen + slen, winfo.ends_punct)
}
}
#[allow(dead_code)]
enum PreviousBreak<'a> {
ParaStart,
PrevBreak(&'a LineBreak<'a>)
// break_knuth_plass implements an "optimal" breaking algorithm in the style of
// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software,
// Practice and Experience. Vol. 11, No. 11, November 1981.
// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf
fn break_knuth_plass<'a, T: Clone + Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) {
// run the algorithm to get the breakpoints
let breakpoints = find_kp_breakpoints(iter.clone(), args);
// iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it)
let (mut prev_punct, mut fresh) =
breakpoints.iter().rev().fold((false, false), |(mut prev_punct, mut fresh), &(next_break, break_before)| {
if fresh {
write_newline(args.indent_str, args.ostream);
}
// at each breakpoint, keep emitting words until we find the word matching this breakpoint
for winfo in iter {
let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform,
winfo.new_line, winfo.sentence_start, prev_punct);
fresh = false;
prev_punct = winfo.ends_punct;
// We find identical breakpoints here by comparing addresses of the references.
// This is OK because the backing vector is not mutating once we are linebreaking.
if winfo as *_ == next_break as *_ {
// OK, we found the matching word
if break_before {
write_newline(args.indent_str, args.ostream);
write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream);
} else {
// breaking after this word, so that means "fresh" is true for the next iteration
write_with_spaces(word, slen, args.ostream);
fresh = true;
}
break;
} else {
write_with_spaces(word, slen, args.ostream);
}
}
(prev_punct, fresh)
});
// after the last linebreak, write out the rest of the final line.
for winfo in iter {
if fresh {
write_newline(args.indent_str, args.ostream);
}
let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform,
winfo.new_line, winfo.sentence_start, prev_punct);
prev_punct = winfo.ends_punct;
fresh = false;
write_with_spaces(word, slen, args.ostream);
}
silent_unwrap!(args.ostream.write_char('\n'));
}
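break_knuth_plass only walks a breakpoint list; the optimization itself lives in find_kp_breakpoints below. As a reference point, here is a compact, hypothetical sketch of the same dynamic-programming idea in current Rust. It is not the commit's code: it uses squared slack as the per-line cost instead of the full demerit function defined further down, and scans back over candidate line starts instead of carrying a set of active breaks.

// Hypothetical simplified breaker: every gap is one space, a line's cost is
// its squared slack, and the last line is free.
// Returns, for each output line, the index one past its final word.
fn kp_breaks(words: &[&str], width: usize) -> Vec<usize> {
    let n = words.len();
    let mut cost = vec![u64::MAX; n + 1]; // cost[i]: best cost for words[..i]
    let mut prev = vec![0usize; n + 1];   // prev[i]: start of the last line in that layout
    cost[0] = 0;
    for i in 1..=n {
        let mut len = 0;
        for j in (0..i).rev() {
            // the candidate last line holds words[j..i]
            len += words[j].chars().count() + if j + 1 < i { 1 } else { 0 };
            if len > width {
                break;
            }
            // the final line is not penalized for coming up short
            let slack = if i == n { 0 } else { (width - len) as u64 };
            let c = cost[j].saturating_add(slack * slack);
            if c < cost[i] {
                cost[i] = c;
                prev[i] = j;
            }
        }
        if cost[i] == u64::MAX {
            // over-long word: give it a line of its own, much like the
            // restart_active_breaks fallback below
            cost[i] = cost[i - 1];
            prev[i] = i - 1;
        }
    }
    // chase the prev pointers back, as build_best_path does below
    let mut breaks = vec![n];
    while *breaks.last().unwrap() > 0 {
        let j = prev[*breaks.last().unwrap()];
        breaks.push(j);
    }
    breaks.reverse();
    breaks.retain(|&b| b > 0);
    breaks
}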
#[allow(dead_code)]
struct LineBreak<'a> {
prev : PreviousBreak<'a>,
breakafter : &'a str,
demerits : uint
prev : uint,
linebreak : Option<&'a WordInfo<'a>>,
break_before : bool,
demerits : i64,
prev_rat : f32,
length : uint,
fresh : bool
}
// when comparing two LineBreaks, compare their demerits
#[allow(dead_code)]
impl<'a> PartialEq for LineBreak<'a> {
fn eq(&self, other: &LineBreak) -> bool {
self.demerits == other.demerits
fn find_kp_breakpoints<'a, T: Iterator<&'a WordInfo<'a>>>(iter: T, args: &BreakArgs<'a>) -> Vec<(&'a WordInfo<'a>, bool)> {
let mut iter = iter.peekable();
// set up the initial null linebreak
let mut linebreaks = vec!(LineBreak {
prev : 0,
linebreak : None,
break_before : false,
demerits : 0,
prev_rat : 0.0f32,
length : args.init_len,
fresh : false
});
// this vec holds the current active linebreaks; next_ holds the breaks that will be active for the next word
let active_breaks = &mut vec!(0);
let next_active_breaks = &mut vec!();
let stretch = (args.opts.width - args.opts.goal) as int;
let minlength = args.opts.goal - stretch as uint;
let mut new_linebreaks = vec!();
let mut is_sentence_start = false;
let mut least_demerits = 0;
loop {
let w =
match iter.next() {
None => break,
Some(w) => w
};
// if this is the last word, we don't add additional demerits for this break
let (is_last_word, is_sentence_end) =
match iter.peek() {
None => (true, true),
Some(&&WordInfo { sentence_start: st, new_line: nl, .. }) => (false, st || (nl && w.ends_punct))
};
// should we be adding extra space at the beginning of the next sentence?
let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false);
let mut ld_new = i64::MAX;
let mut ld_next = i64::MAX;
let mut ld_idx = 0;
new_linebreaks.clear();
next_active_breaks.clear();
// go through each active break, extending it and possibly adding a new active
// break if we are above the minimum required length
for &i in active_breaks.iter() {
let active = linebreaks.get_mut(i);
// normalize demerits to avoid overflow, and record if this is the least
active.demerits -= least_demerits;
if active.demerits < ld_next {
ld_next = active.demerits;
ld_idx = i;
}
// get the new length
let tlen = w.word_nchars + args.compute_width(w, active.length, active.fresh) + slen + active.length;
// if tlen is longer than args.opts.width, we drop this break from the active list
// otherwise, we extend the break, and possibly add a new break at this point
if tlen <= args.opts.width {
// this break will still be active next time
next_active_breaks.push(i);
// we can put this word on this line
active.fresh = false;
active.length = tlen;
// if we're above the minlength, we can also consider breaking here
if tlen >= minlength {
let (new_demerits, new_ratio) =
if is_last_word {
// there is no penalty for the final line's length
(0, 0.0)
} else {
compute_demerits((args.opts.goal - tlen) as int, stretch, w.word_nchars as int, active.prev_rat)
};
// do not even consider adding a line that has too many demerits
// also, try to detect overflow by checking signum
let total_demerits = new_demerits + active.demerits;
if new_demerits < BAD_INFTY_SQ && total_demerits < ld_new && num::signum(active.demerits) <= num::signum(new_demerits) {
ld_new = total_demerits;
new_linebreaks.push(LineBreak {
prev : i,
linebreak : Some(w),
break_before : false,
demerits : total_demerits,
prev_rat : new_ratio,
length : args.indent_len,
fresh : true
});
}
}
}
}
// if we generated any new linebreaks, add the last one to the list
// the last one is always the best because we don't add to new_linebreaks unless
// it's better than the best one so far
match new_linebreaks.pop() {
None => (),
Some(lb) => {
next_active_breaks.push(linebreaks.len());
linebreaks.push(lb);
}
}
if next_active_breaks.is_empty() {
// every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx
let new_break = restart_active_breaks(args, linebreaks.get(ld_idx), ld_idx, w, slen, minlength);
next_active_breaks.push(linebreaks.len());
linebreaks.push(new_break);
least_demerits = 0;
} else {
// next time around, normalize out the demerits fields
// on active linebreaks to make overflow less likely
least_demerits = cmp::max(ld_next, 0);
}
// swap in new list of active breaks
mem::swap(active_breaks, next_active_breaks);
// If this was the last word in a sentence, the next one must be the first in the next.
is_sentence_start = is_sentence_end;
}
// return the best path
build_best_path(&linebreaks, active_breaks)
}
#[inline(always)]
fn build_best_path<'a>(paths: &Vec<LineBreak<'a>>, active: &Vec<uint>) -> Vec<(&'a WordInfo<'a>, bool)> {
let mut breakwords = vec!();
// of the active paths, we select the one with the fewest demerits
let mut best_idx = match active.iter().min_by(|&&a| paths.get(a).demerits) {
None => crash!(1, "Failed to find a k-p linebreak solution. This should never happen."),
Some(&s) => s
};
// now, chase the pointers back through the break list, recording
// the words at which we should break
loop {
let next_best = paths.get(best_idx);
match next_best.linebreak {
None => return breakwords,
Some(prev) => {
breakwords.push((prev, next_best.break_before));
best_idx = next_best.prev
}
}
}
}
// NOTE "less than" in this case means "worse", i.e., more demerits
#[allow(dead_code)]
impl<'a> PartialOrd for LineBreak<'a> {
fn lt(&self, other: &LineBreak) -> bool {
self.demerits > other.demerits
// "infinite" badness is more like (1+BAD_INFTY)^2 because of how demerits are computed
static BAD_INFTY: i64 = 10000000;
static BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
// badness = BAD_MULT * abs(r) ^ 3
static BAD_MULT: f32 = 100.0;
// DR_MULT is multiplier for delta-R between lines
static DR_MULT: f32 = 600.0;
// DL_MULT is penalty multiplier for short words at end of line
static DL_MULT: f32 = 300.0;
#[inline(always)]
fn compute_demerits(delta_len: int, stretch: int, wlen: int, prev_rat: f32) -> (i64, f32) {
// how much stretch are we using?
let ratio =
if delta_len == 0 {
0.0f32
} else {
delta_len as f32 / stretch as f32
};
// compute badness given the stretch ratio
let bad_linelen =
if num::abs(ratio) > 1.0f32 {
BAD_INFTY
} else {
(BAD_MULT * num::abs(num::pow(ratio, 3))) as i64
};
// we penalize lines ending in really short words
let bad_wordlen =
if wlen >= stretch {
0
} else {
(DL_MULT * num::abs(num::pow((stretch - wlen) as f32 / (stretch - 1) as f32, 3))) as i64
};
// we penalize lines that have very different ratios from previous lines
let bad_deltaR = (DR_MULT * num::abs(num::pow((ratio - prev_rat) / 2.0, 3))) as i64;
let demerits = num::pow(1 + bad_linelen + bad_wordlen + bad_deltaR, 2);
(demerits, ratio)
}
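Plugging numbers into compute_demerits (illustration only): with width 79 and goal 74, stretch is 5. A non-final line ending 2 columns short of the goal has ratio = 2/5 = 0.4, so bad_linelen = (100.0 * 0.4^3) as i64 = 6; a final word of at least 5 characters makes bad_wordlen = 0; and if the previous line's ratio was 0.0, bad_deltaR = (600.0 * 0.2^3) as i64 = 4. The demerits returned are (1 + 6 + 0 + 4)^2 = 121, along with the new ratio 0.4.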
#[inline(always)]
fn restart_active_breaks<'a>(args: &BreakArgs<'a>, active: &LineBreak<'a>, act_idx: uint, w: &'a WordInfo<'a>, slen: uint, min: uint) -> LineBreak<'a> {
let (break_before, line_length) =
if active.fresh {
// never break before a word if that word would be the first on a line
(false, args.indent_len)
} else {
// choose the lesser evil: breaking too early, or breaking too late
let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh);
let underlen: int = (min - active.length) as int;
let overlen: int = ((wlen + slen + active.length) - args.opts.width) as int;
if overlen > underlen {
// break early, put this word on the next line
(true, args.indent_len + w.word_nchars)
} else {
(false, args.indent_len)
}
};
// restart the linebreak. This will be our only active path.
LineBreak {
prev : act_idx,
linebreak : Some(w),
break_before : break_before,
demerits : 0, // this is the only active break, so we can reset the demerit count
prev_rat : if break_before { 1.0 } else { -1.0 },
length : line_length,
fresh : !break_before
}
}
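A concrete case of that trade-off (numbers only, not part of the diff): with width 79, goal 74 and hence min = 69, suppose the surviving break has length 72 and the next word needs 10 columns plus one space. Then underlen = 69 - 72 = -3 and overlen = (10 + 1 + 72) - 79 = 4, so overlen > underlen and the restart breaks before the word. If instead the line had only reached 60 and the word needed 25 columns, underlen = 9 would beat overlen = 7 and the over-long line would keep the word.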
// we have to satisfy Eq to implement Ord
#[allow(dead_code)]
impl<'a> Eq for LineBreak<'a> {}
// NOTE again here we reverse the ordering:
// if other has more demerits, self is Greater
#[allow(dead_code)]
impl<'a> Ord for LineBreak<'a> {
fn cmp(&self, other: &LineBreak) -> Ordering {
other.demerits.cmp(&self.demerits)
// Number of spaces to add before a word, based on mode, newline, sentence start.
#[inline(always)]
fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> uint {
if uniform || newline {
if start || (newline && punct) {
2
} else {
1
}
} else {
0
}
}
// If we're on a fresh line, slen=0 and we slice off leading whitespace.
// Otherwise, compute slen and leave whitespace alone.
#[inline(always)]
fn slice_if_fresh<'a>(fresh: bool, word: &'a str, start: uint, uniform: bool, newline: bool, sstart: bool, punct: bool) -> (uint, &'a str) {
if fresh {
(0, word.slice_from(start))
} else {
(compute_slen(uniform, newline, sstart, punct), word)
}
}
// Write a newline and add the indent.
#[inline(always)]
fn write_newline(indent: &str, ostream: &mut Box<Writer>) {
silent_unwrap!(ostream.write_char('\n'));
silent_unwrap!(ostream.write(indent.as_bytes()));
}
// Write the word, along with slen spaces.
#[inline(always)]
fn write_with_spaces(word: &str, slen: uint, ostream: &mut Box<Writer>) {
if slen == 2 {
silent_unwrap!(ostream.write(" ".as_bytes()));
} else if slen == 1 {
silent_unwrap!(ostream.write_char(' '));
}
silent_unwrap!(ostream.write(word.as_bytes()));
}

fmt/parasplit.rs

@@ -13,6 +13,21 @@ use std::slice::Items;
use std::str::CharRange;
use FileOrStdReader;
use FmtOptions;
use charwidth;
#[inline(always)]
fn char_width(c: char) -> uint {
if (c as uint) < 0xA0 {
// if it is ASCII, call it exactly 1 wide (including control chars)
// calling control chars' widths 1 is consistent with OpenBSD fmt
1
} else {
// otherwise, get the unicode width
// note that we shouldn't actually get None here because only c < 0xA0
// can return None, but for safety and future-proofing we do it this way
charwidth::width(c).unwrap_or(1)
}
}
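To make the effect concrete (values read off the charwidth table, no new code): char_width('a') is 1; a combining mark such as U+0301 falls in the '\u0300'..'\u036f' range and contributes 0 columns; a CJK ideograph such as U+4E2D falls in '\u4e00'..'\u9fcc' and contributes 2; and a control character such as U+0007 counts as 1 here even though charwidth::width returns None for it.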
// lines with PSKIP, lacking PREFIX, or which are entirely blank are
// NoFormatLines; otherwise, they are FormatLines
@@ -117,7 +132,7 @@ impl<'a> FileLines<'a> {
indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth;
} else {
// non-tab character
indent_len += 1;
indent_len += char_width(c);
}
}
(indent_end, prefix_len, indent_len)
@@ -196,7 +211,7 @@ pub struct Paragraph {
// an iterator producing a stream of paragraphs from a stream of lines
// given a set of options.
pub struct ParagraphStream<'a> {
lines : Peekable<Line,FileLines<'a>>,
lines : Peekable<Line, FileLines<'a>>,
next_mail : bool,
opts : &'a FmtOptions,
}
@@ -238,8 +253,8 @@ impl<'a> ParagraphStream<'a> {
}
}
impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
fn next(&mut self) -> Option<Result<Paragraph,String>> {
impl<'a> Iterator<Result<Paragraph, String>> for ParagraphStream<'a> {
fn next(&mut self) -> Option<Result<Paragraph, String>> {
// return a NoFormatLine in an Err; it should immediately be output
let noformat =
match self.lines.peek() {
@@ -396,39 +411,37 @@ impl<'a> ParaWords<'a> {
// no extra spacing for mail headers; always exactly 1 space
// safe to trim_left on every line of a mail header, since the
// first line is guaranteed not to have any spaces
self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo {
self.words.extend(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo {
word : x,
word_start : 0,
word_nchars : x.char_len(),
word_nchars : x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped)
before_tab : None,
after_tab : 0,
sentence_start : false,
ends_punct : false,
new_line : false
}).collect());
}));
} else {
// first line
self.words.push_all_move(
self.words.extend(
if self.opts.crown || self.opts.tagged {
// crown and tagged mode has the "init" in the first line, so slice from there
WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
} else {
// otherwise we slice from the indent
WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
}.collect());
});
if self.para.lines.len() > 1 {
let indent_end = self.para.indent_end;
let opts = self.opts;
self.words.push_all_move(
self.para.lines.iter().skip(1)
.flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end)))
.collect());
self.words.extend(
self.para.lines.iter().skip(1).flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end))));
}
}
}
pub fn words(&'a self) -> Items<'a,WordInfo<'a>> { return self.words.iter() }
pub fn words(&'a self) -> Items<'a, WordInfo<'a>> { return self.words.iter() }
}
struct WordSplit<'a> {
@@ -516,7 +529,7 @@ impl<'a> Iterator<WordInfo<'a>> for WordSplit<'a> {
let mut word_nchars = 0;
self.position =
match self.string.slice_from(word_start)
.find(|x: char| if !x.is_whitespace() { word_nchars += 1; false } else { true }) {
.find(|x: char| if !x.is_whitespace() { word_nchars += char_width(x); false } else { true }) {
None => self.length,
Some(s) => s + word_start
};