mirror of
https://github.com/uutils/coreutils
synced 2024-11-15 09:27:21 +00:00
Merge pull request #318 from kwantam/master
fmt: Knuth-Plass implementation; common: add unicode char_width function
This commit is contained in:
commit
760be3f9e1
4 changed files with 563 additions and 86 deletions
170
fmt/charwidth.rs
Normal file
170
fmt/charwidth.rs
Normal file
|
@ -0,0 +1,170 @@
|
|||
/*
|
||||
* This file is part of `fmt` from the uutils coreutils package.
|
||||
*
|
||||
* (c) kwantam <kwantam@gmail.com>
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
fn bsearch_range_value_table(c: char, r: &'static [(char, char, uint)]) -> uint {
|
||||
match r.bsearch(|&(lo, hi, _)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
else { Greater }
|
||||
}) {
|
||||
Some(idx) => {
|
||||
let (_, _, result) = r[idx];
|
||||
result
|
||||
}
|
||||
None => 1
|
||||
}
|
||||
}
|
||||
|
||||
pub fn width(c: char) -> Option<uint> {
|
||||
match c as uint {
|
||||
_c @ 0 => Some(0), // null is zero width
|
||||
cu if cu < 0x20 => None, // control sequences have no width
|
||||
cu if cu < 0x7F => Some(1), // ASCII
|
||||
cu if cu < 0xA0 => None, // more control sequences
|
||||
_ => Some(bsearch_range_value_table(c, charwidth_table))
|
||||
}
|
||||
}
|
||||
|
||||
// character width table. Based on Markus Kuhn's free wcwidth() implementation,
|
||||
// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
|
||||
static charwidth_table : &'static [(char, char, uint)] = &[
|
||||
('\u0300', '\u036f', 0), ('\u0483', '\u0487', 0), ('\u0488', '\u0489', 0), ('\u0591',
|
||||
'\u05bd', 0), ('\u05bf', '\u05bf', 0), ('\u05c1', '\u05c2', 0), ('\u05c4', '\u05c5', 0),
|
||||
('\u05c7', '\u05c7', 0), ('\u0600', '\u0605', 0), ('\u0610', '\u061a', 0), ('\u061c',
|
||||
'\u061c', 0), ('\u064b', '\u065f', 0), ('\u0670', '\u0670', 0), ('\u06d6', '\u06dc', 0),
|
||||
('\u06dd', '\u06dd', 0), ('\u06df', '\u06e4', 0), ('\u06e7', '\u06e8', 0), ('\u06ea',
|
||||
'\u06ed', 0), ('\u070f', '\u070f', 0), ('\u0711', '\u0711', 0), ('\u0730', '\u074a', 0),
|
||||
('\u07a6', '\u07b0', 0), ('\u07eb', '\u07f3', 0), ('\u0816', '\u0819', 0), ('\u081b',
|
||||
'\u0823', 0), ('\u0825', '\u0827', 0), ('\u0829', '\u082d', 0), ('\u0859', '\u085b', 0),
|
||||
('\u08e4', '\u0902', 0), ('\u093a', '\u093a', 0), ('\u093c', '\u093c', 0), ('\u0941',
|
||||
'\u0948', 0), ('\u094d', '\u094d', 0), ('\u0951', '\u0957', 0), ('\u0962', '\u0963', 0),
|
||||
('\u0981', '\u0981', 0), ('\u09bc', '\u09bc', 0), ('\u09c1', '\u09c4', 0), ('\u09cd',
|
||||
'\u09cd', 0), ('\u09e2', '\u09e3', 0), ('\u0a01', '\u0a02', 0), ('\u0a3c', '\u0a3c', 0),
|
||||
('\u0a41', '\u0a51', 0), ('\u0a70', '\u0a71', 0), ('\u0a75', '\u0a82', 0), ('\u0abc',
|
||||
'\u0abc', 0), ('\u0ac1', '\u0ac8', 0), ('\u0acd', '\u0acd', 0), ('\u0ae2', '\u0ae3', 0),
|
||||
('\u0b01', '\u0b01', 0), ('\u0b3c', '\u0b3c', 0), ('\u0b3f', '\u0b3f', 0), ('\u0b41',
|
||||
'\u0b44', 0), ('\u0b4d', '\u0b56', 0), ('\u0b62', '\u0b63', 0), ('\u0b82', '\u0b82', 0),
|
||||
('\u0bc0', '\u0bc0', 0), ('\u0bcd', '\u0bcd', 0), ('\u0c00', '\u0c00', 0), ('\u0c3e',
|
||||
'\u0c40', 0), ('\u0c46', '\u0c56', 0), ('\u0c62', '\u0c63', 0), ('\u0c81', '\u0c81', 0),
|
||||
('\u0cbc', '\u0cbc', 0), ('\u0cbf', '\u0cbf', 0), ('\u0cc6', '\u0cc6', 0), ('\u0ccc',
|
||||
'\u0ccd', 0), ('\u0ce2', '\u0ce3', 0), ('\u0d01', '\u0d01', 0), ('\u0d41', '\u0d44', 0),
|
||||
('\u0d4d', '\u0d4d', 0), ('\u0d62', '\u0d63', 0), ('\u0dca', '\u0dca', 0), ('\u0dd2',
|
||||
'\u0dd6', 0), ('\u0e31', '\u0e31', 0), ('\u0e34', '\u0e3a', 0), ('\u0e47', '\u0e4e', 0),
|
||||
('\u0eb1', '\u0eb1', 0), ('\u0eb4', '\u0ebc', 0), ('\u0ec8', '\u0ecd', 0), ('\u0f18',
|
||||
'\u0f19', 0), ('\u0f35', '\u0f35', 0), ('\u0f37', '\u0f37', 0), ('\u0f39', '\u0f39', 0),
|
||||
('\u0f71', '\u0f7e', 0), ('\u0f80', '\u0f84', 0), ('\u0f86', '\u0f87', 0), ('\u0f8d',
|
||||
'\u0fbc', 0), ('\u0fc6', '\u0fc6', 0), ('\u102d', '\u1030', 0), ('\u1032', '\u1037', 0),
|
||||
('\u1039', '\u103a', 0), ('\u103d', '\u103e', 0), ('\u1058', '\u1059', 0), ('\u105e',
|
||||
'\u1060', 0), ('\u1071', '\u1074', 0), ('\u1082', '\u1082', 0), ('\u1085', '\u1086', 0),
|
||||
('\u108d', '\u108d', 0), ('\u109d', '\u109d', 0), ('\u1100', '\u115f', 2), ('\u1160',
|
||||
'\u11ff', 0), ('\u135d', '\u135f', 0), ('\u1712', '\u1714', 0), ('\u1732', '\u1734', 0),
|
||||
('\u1752', '\u1753', 0), ('\u1772', '\u1773', 0), ('\u17b4', '\u17b5', 0), ('\u17b7',
|
||||
'\u17bd', 0), ('\u17c6', '\u17c6', 0), ('\u17c9', '\u17d3', 0), ('\u17dd', '\u17dd', 0),
|
||||
('\u180b', '\u180d', 0), ('\u180e', '\u180e', 0), ('\u18a9', '\u18a9', 0), ('\u1920',
|
||||
'\u1922', 0), ('\u1927', '\u1928', 0), ('\u1932', '\u1932', 0), ('\u1939', '\u193b', 0),
|
||||
('\u1a17', '\u1a18', 0), ('\u1a1b', '\u1a1b', 0), ('\u1a56', '\u1a56', 0), ('\u1a58',
|
||||
'\u1a60', 0), ('\u1a62', '\u1a62', 0), ('\u1a65', '\u1a6c', 0), ('\u1a73', '\u1a7f', 0),
|
||||
('\u1ab0', '\u1abd', 0), ('\u1abe', '\u1abe', 0), ('\u1b00', '\u1b03', 0), ('\u1b34',
|
||||
'\u1b34', 0), ('\u1b36', '\u1b3a', 0), ('\u1b3c', '\u1b3c', 0), ('\u1b42', '\u1b42', 0),
|
||||
('\u1b6b', '\u1b73', 0), ('\u1b80', '\u1b81', 0), ('\u1ba2', '\u1ba5', 0), ('\u1ba8',
|
||||
'\u1ba9', 0), ('\u1bab', '\u1bad', 0), ('\u1be6', '\u1be6', 0), ('\u1be8', '\u1be9', 0),
|
||||
('\u1bed', '\u1bed', 0), ('\u1bef', '\u1bf1', 0), ('\u1c2c', '\u1c33', 0), ('\u1c36',
|
||||
'\u1c37', 0), ('\u1cd0', '\u1cd2', 0), ('\u1cd4', '\u1ce0', 0), ('\u1ce2', '\u1ce8', 0),
|
||||
('\u1ced', '\u1ced', 0), ('\u1cf4', '\u1cf4', 0), ('\u1cf8', '\u1cf9', 0), ('\u1dc0',
|
||||
'\u1dff', 0), ('\u200b', '\u200f', 0), ('\u202a', '\u202e', 0), ('\u2060', '\u206f', 0),
|
||||
('\u20d0', '\u20dc', 0), ('\u20dd', '\u20e0', 0), ('\u20e1', '\u20e1', 0), ('\u20e2',
|
||||
'\u20e4', 0), ('\u20e5', '\u20f0', 0), ('\u2329', '\u2329', 2), ('\u232a', '\u232a', 2),
|
||||
('\u2cef', '\u2cf1', 0), ('\u2d7f', '\u2d7f', 0), ('\u2de0', '\u2dff', 0), ('\u2e80',
|
||||
'\u2e99', 2), ('\u2e9b', '\u2ef3', 2), ('\u2f00', '\u2fd5', 2), ('\u2ff0', '\u2ffb', 2),
|
||||
('\u3000', '\u3000', 2), ('\u3001', '\u3003', 2), ('\u3004', '\u3004', 2), ('\u3005',
|
||||
'\u3005', 2), ('\u3006', '\u3006', 2), ('\u3007', '\u3007', 2), ('\u3008', '\u3008', 2),
|
||||
('\u3009', '\u3009', 2), ('\u300a', '\u300a', 2), ('\u300b', '\u300b', 2), ('\u300c',
|
||||
'\u300c', 2), ('\u300d', '\u300d', 2), ('\u300e', '\u300e', 2), ('\u300f', '\u300f', 2),
|
||||
('\u3010', '\u3010', 2), ('\u3011', '\u3011', 2), ('\u3012', '\u3013', 2), ('\u3014',
|
||||
'\u3014', 2), ('\u3015', '\u3015', 2), ('\u3016', '\u3016', 2), ('\u3017', '\u3017', 2),
|
||||
('\u3018', '\u3018', 2), ('\u3019', '\u3019', 2), ('\u301a', '\u301a', 2), ('\u301b',
|
||||
'\u301b', 2), ('\u301c', '\u301c', 2), ('\u301d', '\u301d', 2), ('\u301e', '\u301f', 2),
|
||||
('\u3020', '\u3020', 2), ('\u3021', '\u3029', 2), ('\u302a', '\u302d', 0), ('\u302a',
|
||||
'\u302d', 2), ('\u302e', '\u302f', 2), ('\u3030', '\u3030', 2), ('\u3031', '\u3035', 2),
|
||||
('\u3036', '\u3037', 2), ('\u3038', '\u303a', 2), ('\u303b', '\u303b', 2), ('\u303c',
|
||||
'\u303c', 2), ('\u303d', '\u303d', 2), ('\u303e', '\u303e', 2), ('\u3041', '\u3096', 2),
|
||||
('\u3099', '\u309a', 0), ('\u3099', '\u309a', 2), ('\u309b', '\u309c', 2), ('\u309d',
|
||||
'\u309e', 2), ('\u309f', '\u309f', 2), ('\u30a0', '\u30a0', 2), ('\u30a1', '\u30fa', 2),
|
||||
('\u30fb', '\u30fb', 2), ('\u30fc', '\u30fe', 2), ('\u30ff', '\u30ff', 2), ('\u3105',
|
||||
'\u312d', 2), ('\u3131', '\u318e', 2), ('\u3190', '\u3191', 2), ('\u3192', '\u3195', 2),
|
||||
('\u3196', '\u319f', 2), ('\u31a0', '\u31ba', 2), ('\u31c0', '\u31e3', 2), ('\u31f0',
|
||||
'\u31ff', 2), ('\u3200', '\u321e', 2), ('\u3220', '\u3229', 2), ('\u322a', '\u3247', 2),
|
||||
('\u3250', '\u3250', 2), ('\u3251', '\u325f', 2), ('\u3260', '\u327f', 2), ('\u3280',
|
||||
'\u3289', 2), ('\u328a', '\u32b0', 2), ('\u32b1', '\u32bf', 2), ('\u32c0', '\u32fe', 2),
|
||||
('\u3300', '\u33ff', 2), ('\u3400', '\u4db5', 2), ('\u4db6', '\u4dbf', 2), ('\u4e00',
|
||||
'\u9fcc', 2), ('\u9fcd', '\u9fff', 2), ('\ua000', '\ua014', 2), ('\ua015', '\ua015', 2),
|
||||
('\ua016', '\ua48c', 2), ('\ua490', '\ua4c6', 2), ('\ua66f', '\ua66f', 0), ('\ua670',
|
||||
'\ua672', 0), ('\ua674', '\ua67d', 0), ('\ua69f', '\ua69f', 0), ('\ua6f0', '\ua6f1', 0),
|
||||
('\ua802', '\ua802', 0), ('\ua806', '\ua806', 0), ('\ua80b', '\ua80b', 0), ('\ua825',
|
||||
'\ua826', 0), ('\ua8c4', '\ua8c4', 0), ('\ua8e0', '\ua8f1', 0), ('\ua926', '\ua92d', 0),
|
||||
('\ua947', '\ua951', 0), ('\ua960', '\ua97c', 2), ('\ua980', '\ua982', 0), ('\ua9b3',
|
||||
'\ua9b3', 0), ('\ua9b6', '\ua9b9', 0), ('\ua9bc', '\ua9bc', 0), ('\ua9e5', '\ua9e5', 0),
|
||||
('\uaa29', '\uaa2e', 0), ('\uaa31', '\uaa32', 0), ('\uaa35', '\uaa36', 0), ('\uaa43',
|
||||
'\uaa43', 0), ('\uaa4c', '\uaa4c', 0), ('\uaa7c', '\uaa7c', 0), ('\uaab0', '\uaab0', 0),
|
||||
('\uaab2', '\uaab4', 0), ('\uaab7', '\uaab8', 0), ('\uaabe', '\uaabf', 0), ('\uaac1',
|
||||
'\uaac1', 0), ('\uaaec', '\uaaed', 0), ('\uaaf6', '\uaaf6', 0), ('\uabe5', '\uabe5', 0),
|
||||
('\uabe8', '\uabe8', 0), ('\uabed', '\uabed', 0), ('\uac00', '\ud7a3', 2), ('\uf900',
|
||||
'\ufa6d', 2), ('\ufa6e', '\ufa6f', 2), ('\ufa70', '\ufad9', 2), ('\ufada', '\ufaff', 2),
|
||||
('\ufb1e', '\ufb1e', 0), ('\ufe00', '\ufe0f', 0), ('\ufe10', '\ufe16', 2), ('\ufe17',
|
||||
'\ufe17', 2), ('\ufe18', '\ufe18', 2), ('\ufe19', '\ufe19', 2), ('\ufe20', '\ufe2d', 0),
|
||||
('\ufe30', '\ufe30', 2), ('\ufe31', '\ufe32', 2), ('\ufe33', '\ufe34', 2), ('\ufe35',
|
||||
'\ufe35', 2), ('\ufe36', '\ufe36', 2), ('\ufe37', '\ufe37', 2), ('\ufe38', '\ufe38', 2),
|
||||
('\ufe39', '\ufe39', 2), ('\ufe3a', '\ufe3a', 2), ('\ufe3b', '\ufe3b', 2), ('\ufe3c',
|
||||
'\ufe3c', 2), ('\ufe3d', '\ufe3d', 2), ('\ufe3e', '\ufe3e', 2), ('\ufe3f', '\ufe3f', 2),
|
||||
('\ufe40', '\ufe40', 2), ('\ufe41', '\ufe41', 2), ('\ufe42', '\ufe42', 2), ('\ufe43',
|
||||
'\ufe43', 2), ('\ufe44', '\ufe44', 2), ('\ufe45', '\ufe46', 2), ('\ufe47', '\ufe47', 2),
|
||||
('\ufe48', '\ufe48', 2), ('\ufe49', '\ufe4c', 2), ('\ufe4d', '\ufe4f', 2), ('\ufe50',
|
||||
'\ufe52', 2), ('\ufe54', '\ufe57', 2), ('\ufe58', '\ufe58', 2), ('\ufe59', '\ufe59', 2),
|
||||
('\ufe5a', '\ufe5a', 2), ('\ufe5b', '\ufe5b', 2), ('\ufe5c', '\ufe5c', 2), ('\ufe5d',
|
||||
'\ufe5d', 2), ('\ufe5e', '\ufe5e', 2), ('\ufe5f', '\ufe61', 2), ('\ufe62', '\ufe62', 2),
|
||||
('\ufe63', '\ufe63', 2), ('\ufe64', '\ufe66', 2), ('\ufe68', '\ufe68', 2), ('\ufe69',
|
||||
'\ufe69', 2), ('\ufe6a', '\ufe6b', 2), ('\ufeff', '\ufeff', 0), ('\uff01', '\uff03', 2),
|
||||
('\uff04', '\uff04', 2), ('\uff05', '\uff07', 2), ('\uff08', '\uff08', 2), ('\uff09',
|
||||
'\uff09', 2), ('\uff0a', '\uff0a', 2), ('\uff0b', '\uff0b', 2), ('\uff0c', '\uff0c', 2),
|
||||
('\uff0d', '\uff0d', 2), ('\uff0e', '\uff0f', 2), ('\uff10', '\uff19', 2), ('\uff1a',
|
||||
'\uff1b', 2), ('\uff1c', '\uff1e', 2), ('\uff1f', '\uff20', 2), ('\uff21', '\uff3a', 2),
|
||||
('\uff3b', '\uff3b', 2), ('\uff3c', '\uff3c', 2), ('\uff3d', '\uff3d', 2), ('\uff3e',
|
||||
'\uff3e', 2), ('\uff3f', '\uff3f', 2), ('\uff40', '\uff40', 2), ('\uff41', '\uff5a', 2),
|
||||
('\uff5b', '\uff5b', 2), ('\uff5c', '\uff5c', 2), ('\uff5d', '\uff5d', 2), ('\uff5e',
|
||||
'\uff5e', 2), ('\uff5f', '\uff5f', 2), ('\uff60', '\uff60', 2), ('\uffe0', '\uffe1', 2),
|
||||
('\uffe2', '\uffe2', 2), ('\uffe3', '\uffe3', 2), ('\uffe4', '\uffe4', 2), ('\uffe5',
|
||||
'\uffe6', 2), ('\ufff9', '\ufffb', 0), ('\U000101fd', '\U000101fd', 0), ('\U000102e0',
|
||||
'\U000102e0', 0), ('\U00010376', '\U0001037a', 0), ('\U00010a01', '\U00010a0f', 0),
|
||||
('\U00010a38', '\U00010a3f', 0), ('\U00010ae5', '\U00010ae6', 0), ('\U00011001',
|
||||
'\U00011001', 0), ('\U00011038', '\U00011046', 0), ('\U0001107f', '\U00011081', 0),
|
||||
('\U000110b3', '\U000110b6', 0), ('\U000110b9', '\U000110ba', 0), ('\U000110bd',
|
||||
'\U000110bd', 0), ('\U00011100', '\U00011102', 0), ('\U00011127', '\U0001112b', 0),
|
||||
('\U0001112d', '\U00011134', 0), ('\U00011173', '\U00011173', 0), ('\U00011180',
|
||||
'\U00011181', 0), ('\U000111b6', '\U000111be', 0), ('\U0001122f', '\U00011231', 0),
|
||||
('\U00011234', '\U00011234', 0), ('\U00011236', '\U00011237', 0), ('\U000112df',
|
||||
'\U000112df', 0), ('\U000112e3', '\U000112ea', 0), ('\U00011301', '\U00011301', 0),
|
||||
('\U0001133c', '\U0001133c', 0), ('\U00011340', '\U00011340', 0), ('\U00011366',
|
||||
'\U00011374', 0), ('\U000114b3', '\U000114b8', 0), ('\U000114ba', '\U000114ba', 0),
|
||||
('\U000114bf', '\U000114c0', 0), ('\U000114c2', '\U000114c3', 0), ('\U000115b2',
|
||||
'\U000115b5', 0), ('\U000115bc', '\U000115bd', 0), ('\U000115bf', '\U000115c0', 0),
|
||||
('\U00011633', '\U0001163a', 0), ('\U0001163d', '\U0001163d', 0), ('\U0001163f',
|
||||
'\U00011640', 0), ('\U000116ab', '\U000116ab', 0), ('\U000116ad', '\U000116ad', 0),
|
||||
('\U000116b0', '\U000116b5', 0), ('\U000116b7', '\U000116b7', 0), ('\U00016af0',
|
||||
'\U00016af4', 0), ('\U00016b30', '\U00016b36', 0), ('\U00016f8f', '\U00016f92', 0),
|
||||
('\U0001b000', '\U0001b001', 2), ('\U0001bc9d', '\U0001bc9e', 0), ('\U0001bca0',
|
||||
'\U0001bca3', 0), ('\U0001d167', '\U0001d169', 0), ('\U0001d173', '\U0001d17a', 0),
|
||||
('\U0001d17b', '\U0001d182', 0), ('\U0001d185', '\U0001d18b', 0), ('\U0001d1aa',
|
||||
'\U0001d1ad', 0), ('\U0001d242', '\U0001d244', 0), ('\U0001e8d0', '\U0001e8d6', 0),
|
||||
('\U0001f200', '\U0001f202', 2), ('\U0001f210', '\U0001f23a', 2), ('\U0001f240',
|
||||
'\U0001f248', 2), ('\U0001f250', '\U0001f251', 2), ('\U00020000', '\U0002a6d6', 2),
|
||||
('\U0002a6d7', '\U0002a6ff', 2), ('\U0002a700', '\U0002b734', 2), ('\U0002b735',
|
||||
'\U0002b73f', 2), ('\U0002b740', '\U0002b81d', 2), ('\U0002b81e', '\U0002f7ff', 2),
|
||||
('\U0002f800', '\U0002fa1d', 2), ('\U0002fa1e', '\U0002fffd', 2), ('\U00030000',
|
||||
'\U0003fffd', 2), ('\U000e0001', '\U000e007f', 0), ('\U000e0100', '\U000e01ef', 0)
|
||||
];
|
25
fmt/fmt.rs
25
fmt/fmt.rs
|
@ -1,4 +1,4 @@
|
|||
#![crate_id(name="fmt", vers="0.0.2", author="kwantam")]
|
||||
#![crate_id(name="fmt", vers="0.0.3", author="kwantam")]
|
||||
/*
|
||||
* This file is part of `fmt` from the uutils coreutils package.
|
||||
*
|
||||
|
@ -13,6 +13,7 @@
|
|||
extern crate core;
|
||||
extern crate getopts;
|
||||
|
||||
use std::cmp;
|
||||
use std::io::{BufferedReader, BufferedWriter, File, IoResult};
|
||||
use std::io::stdio::{stdin_raw, stdout_raw};
|
||||
use linebreak::break_lines;
|
||||
|
@ -31,10 +32,11 @@ macro_rules! silent_unwrap(
|
|||
mod util;
|
||||
mod linebreak;
|
||||
mod parasplit;
|
||||
mod charwidth;
|
||||
|
||||
// program's NAME and VERSION are used for -V and -h
|
||||
static NAME: &'static str = "fmt";
|
||||
static VERSION: &'static str = "0.0.2";
|
||||
static VERSION: &'static str = "0.0.3";
|
||||
|
||||
struct FmtOptions {
|
||||
crown : bool,
|
||||
|
@ -48,6 +50,7 @@ struct FmtOptions {
|
|||
anti_prefix : String,
|
||||
xanti_prefix : bool,
|
||||
uniform : bool,
|
||||
quick : bool,
|
||||
width : uint,
|
||||
goal : uint,
|
||||
tabwidth : uint,
|
||||
|
@ -68,8 +71,10 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
getopts::optflag("x", "exact-prefix", "PREFIX must match at the beginning of the line with no preceding whitespace."),
|
||||
getopts::optflag("X", "exact-skip-prefix", "PSKIP must match at the beginning of the line with no preceding whitespace."),
|
||||
|
||||
getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 78.", "WIDTH"),
|
||||
getopts::optopt("g", "goal", "Goal width, default ~0.92*WIDTH. Must be less than WIDTH.", "GOAL"),
|
||||
getopts::optopt("w", "width", "Fill output lines up to a maximum of WIDTH columns, default 79.", "WIDTH"),
|
||||
getopts::optopt("g", "goal", "Goal width, default ~0.94*WIDTH. Must be less than WIDTH.", "GOAL"),
|
||||
|
||||
getopts::optflag("q", "quick", "Break lines more quickly at the expense of a potentially more ragged appearance."),
|
||||
|
||||
getopts::optopt("T", "tab-width", "Treat tabs as TABWIDTH spaces for determining line length, default 8. Note that this is used only for calculating line lengths; tabs are preserved in the output.", "TABWIDTH"),
|
||||
|
||||
|
@ -96,6 +101,7 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
tagged : false,
|
||||
mail : false,
|
||||
uniform : false,
|
||||
quick : false,
|
||||
split_only : false,
|
||||
use_prefix : false,
|
||||
prefix : String::new(),
|
||||
|
@ -103,8 +109,8 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
use_anti_prefix : false,
|
||||
anti_prefix : String::new(),
|
||||
xanti_prefix : false,
|
||||
width : 78,
|
||||
goal : 72,
|
||||
width : 79,
|
||||
goal : 74,
|
||||
tabwidth : 8,
|
||||
};
|
||||
|
||||
|
@ -112,6 +118,7 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
if matches.opt_present("c") { fmt_opts.crown = true; fmt_opts.tagged = false; }
|
||||
if matches.opt_present("m") { fmt_opts.mail = true; }
|
||||
if matches.opt_present("u") { fmt_opts.uniform = true; }
|
||||
if matches.opt_present("q") { fmt_opts.quick = true; }
|
||||
if matches.opt_present("s") { fmt_opts.split_only = true; fmt_opts.crown = false; fmt_opts.tagged = false; }
|
||||
if matches.opt_present("x") { fmt_opts.xprefix = true; }
|
||||
if matches.opt_present("X") { fmt_opts.xanti_prefix = true; }
|
||||
|
@ -139,7 +146,7 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
Some(t) => t,
|
||||
None => { crash!(1, "Invalid WIDTH specification: `{}'", s); }
|
||||
};
|
||||
fmt_opts.goal = std::cmp::min(fmt_opts.width * 92 / 100, fmt_opts.width - 4);
|
||||
fmt_opts.goal = cmp::min(fmt_opts.width * 94 / 100, fmt_opts.width - 3);
|
||||
}
|
||||
None => ()
|
||||
};
|
||||
|
@ -152,7 +159,7 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
None => { crash!(1, "Invalid GOAL specification: `{}'", s); }
|
||||
};
|
||||
if !matches.opt_present("w") {
|
||||
fmt_opts.width = std::cmp::max(fmt_opts.goal * 100 / 92, fmt_opts.goal + 4);
|
||||
fmt_opts.width = cmp::max(fmt_opts.goal * 100 / 94, fmt_opts.goal + 3);
|
||||
} else if fmt_opts.goal > fmt_opts.width {
|
||||
crash!(1, "GOAL cannot be greater than WIDTH.");
|
||||
}
|
||||
|
@ -189,7 +196,7 @@ pub fn uumain(args: Vec<String>) -> int {
|
|||
let mut fp =
|
||||
match open_file(i) {
|
||||
Err(e) => {
|
||||
show_warning!("{}: {}",i,e);
|
||||
show_warning!("{}: {}", i, e);
|
||||
continue;
|
||||
}
|
||||
Ok(f) => f
|
||||
|
|
411
fmt/linebreak.rs
411
fmt/linebreak.rs
|
@ -9,6 +9,10 @@
|
|||
|
||||
use FmtOptions;
|
||||
use parasplit::{Paragraph, ParaWords, WordInfo};
|
||||
use std::i64;
|
||||
use std::cmp;
|
||||
use std::mem;
|
||||
use std::num;
|
||||
|
||||
struct BreakArgs<'a> {
|
||||
opts : &'a FmtOptions,
|
||||
|
@ -21,8 +25,16 @@ struct BreakArgs<'a> {
|
|||
|
||||
impl<'a> BreakArgs<'a> {
|
||||
#[inline(always)]
|
||||
fn compute_width(&self, pre: uint, post: uint, posn: uint) -> uint {
|
||||
post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn
|
||||
fn compute_width<'b>(&self, winfo: &WordInfo<'b>, posn: uint, fresh: bool) -> uint {
|
||||
if fresh {
|
||||
0
|
||||
} else {
|
||||
let post = winfo.after_tab;
|
||||
match winfo.before_tab {
|
||||
None => post,
|
||||
Some(pre) => post + ((pre + posn) / self.opts.tabwidth + 1) * self.opts.tabwidth - posn
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -73,91 +85,366 @@ pub fn break_lines(para: &Paragraph, opts: &FmtOptions, ostream: &mut Box<Writer
|
|||
ostream : ostream
|
||||
};
|
||||
|
||||
break_simple(&mut pWords_words, &mut break_args);
|
||||
if opts.quick || para.mail_header {
|
||||
break_simple(pWords_words, &mut break_args);
|
||||
} else {
|
||||
break_knuth_plass(pWords_words, &mut break_args);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* break_simple implements the "tight" breaking algorithm: print words until
|
||||
* maxlength would be exceeded, then print a linebreak and indent and continue.
|
||||
* Note that any first line indent should already have been printed before
|
||||
* calling this function, and the displayed length of said indent passed as
|
||||
* args.init_len
|
||||
*/
|
||||
fn break_simple<'a,T: Iterator<&'a WordInfo<'a>>>(iter: &'a mut T, args: &mut BreakArgs<'a>) {
|
||||
// break_simple implements a "greedy" breaking algorithm: print words until
|
||||
// maxlength would be exceeded, then print a linebreak and indent and continue.
|
||||
fn break_simple<'a, T: Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) {
|
||||
iter.fold((args.init_len, false), |l, winfo| accum_words_simple(args, l, winfo));
|
||||
silent_unwrap!(args.ostream.write_char('\n'));
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn accum_words_simple<'a>(args: &mut BreakArgs<'a>, (l, prev_punct): (uint, bool), winfo: &'a WordInfo<'a>) -> (uint, bool) {
|
||||
// compute the length of this word, considering how tabs will expand at this position on the line
|
||||
let wlen = winfo.word_nchars +
|
||||
if winfo.before_tab.is_some() {
|
||||
args.compute_width(winfo.before_tab.unwrap(), winfo.after_tab, l)
|
||||
} else {
|
||||
winfo.after_tab
|
||||
};
|
||||
let wlen = winfo.word_nchars + args.compute_width(winfo, l, false);
|
||||
|
||||
let splen =
|
||||
if args.uniform || winfo.new_line {
|
||||
if winfo.sentence_start || (winfo.new_line && prev_punct) { 2 }
|
||||
else { 1 }
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let slen = compute_slen(args.uniform, winfo.new_line, winfo.sentence_start, prev_punct);
|
||||
|
||||
if l + wlen + splen > args.opts.width {
|
||||
let wtrim = winfo.word.slice_from(winfo.word_start);
|
||||
silent_unwrap!(args.ostream.write_char('\n'));
|
||||
silent_unwrap!(args.ostream.write(args.indent_str.as_bytes()));
|
||||
silent_unwrap!(args.ostream.write(wtrim.as_bytes()));
|
||||
(args.indent_len + wtrim.len(), winfo.ends_punct)
|
||||
if l + wlen + slen > args.opts.width {
|
||||
write_newline(args.indent_str, args.ostream);
|
||||
write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream);
|
||||
(args.indent_len + winfo.word_nchars, winfo.ends_punct)
|
||||
} else {
|
||||
if splen == 2 { silent_unwrap!(args.ostream.write(" ".as_bytes())); }
|
||||
else if splen == 1 { silent_unwrap!(args.ostream.write_char(' ')) }
|
||||
silent_unwrap!(args.ostream.write(winfo.word.as_bytes()));
|
||||
(l + wlen + splen, winfo.ends_punct)
|
||||
write_with_spaces(winfo.word, slen, args.ostream);
|
||||
(l + wlen + slen, winfo.ends_punct)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
enum PreviousBreak<'a> {
|
||||
ParaStart,
|
||||
PrevBreak(&'a LineBreak<'a>)
|
||||
// break_knuth_plass implements an "optimal" breaking algorithm in the style of
|
||||
// Knuth, D.E., and Plass, M.F. "Breaking Paragraphs into Lines." in Software,
|
||||
// Practice and Experience. Vol. 11, No. 11, November 1981.
|
||||
// http://onlinelibrary.wiley.com/doi/10.1002/spe.4380111102/pdf
|
||||
fn break_knuth_plass<'a, T: Clone + Iterator<&'a WordInfo<'a>>>(mut iter: T, args: &mut BreakArgs<'a>) {
|
||||
// run the algorithm to get the breakpoints
|
||||
let breakpoints = find_kp_breakpoints(iter.clone(), args);
|
||||
|
||||
// iterate through the breakpoints (note that breakpoints is in reverse break order, so we .rev() it
|
||||
let (mut prev_punct, mut fresh) =
|
||||
breakpoints.iter().rev().fold((false, false), |(mut prev_punct, mut fresh), &(next_break, break_before)| {
|
||||
if fresh {
|
||||
write_newline(args.indent_str, args.ostream);
|
||||
}
|
||||
// at each breakpoint, keep emitting words until we find the word matching this breakpoint
|
||||
for winfo in iter {
|
||||
let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform,
|
||||
winfo.new_line, winfo.sentence_start, prev_punct);
|
||||
fresh = false;
|
||||
prev_punct = winfo.ends_punct;
|
||||
|
||||
// We find identical breakpoints here by comparing addresses of the references.
|
||||
// This is OK because the backing vector is not mutating once we are linebreaking.
|
||||
if winfo as *_ == next_break as *_ {
|
||||
// OK, we found the matching word
|
||||
if break_before {
|
||||
write_newline(args.indent_str, args.ostream);
|
||||
write_with_spaces(winfo.word.slice_from(winfo.word_start), 0, args.ostream);
|
||||
} else {
|
||||
// breaking after this word, so that means "fresh" is true for the next iteration
|
||||
write_with_spaces(word, slen, args.ostream);
|
||||
fresh = true;
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
write_with_spaces(word, slen, args.ostream);
|
||||
}
|
||||
}
|
||||
(prev_punct, fresh)
|
||||
});
|
||||
|
||||
// after the last linebreak, write out the rest of the final line.
|
||||
for winfo in iter {
|
||||
if fresh {
|
||||
write_newline(args.indent_str, args.ostream);
|
||||
}
|
||||
let (slen, word) = slice_if_fresh(fresh, winfo.word, winfo.word_start, args.uniform,
|
||||
winfo.new_line, winfo.sentence_start, prev_punct);
|
||||
prev_punct = winfo.ends_punct;
|
||||
fresh = false;
|
||||
write_with_spaces(word, slen, args.ostream);
|
||||
}
|
||||
silent_unwrap!(args.ostream.write_char('\n'));
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct LineBreak<'a> {
|
||||
prev : PreviousBreak<'a>,
|
||||
breakafter : &'a str,
|
||||
demerits : uint
|
||||
prev : uint,
|
||||
linebreak : Option<&'a WordInfo<'a>>,
|
||||
break_before : bool,
|
||||
demerits : i64,
|
||||
prev_rat : f32,
|
||||
length : uint,
|
||||
fresh : bool
|
||||
}
|
||||
|
||||
// when comparing two LineBreaks, compare their demerits
|
||||
#[allow(dead_code)]
|
||||
impl<'a> PartialEq for LineBreak<'a> {
|
||||
fn eq(&self, other: &LineBreak) -> bool {
|
||||
self.demerits == other.demerits
|
||||
fn find_kp_breakpoints<'a, T: Iterator<&'a WordInfo<'a>>>(iter: T, args: &BreakArgs<'a>) -> Vec<(&'a WordInfo<'a>, bool)> {
|
||||
let mut iter = iter.peekable();
|
||||
// set up the initial null linebreak
|
||||
let mut linebreaks = vec!(LineBreak {
|
||||
prev : 0,
|
||||
linebreak : None,
|
||||
break_before : false,
|
||||
demerits : 0,
|
||||
prev_rat : 0.0f32,
|
||||
length : args.init_len,
|
||||
fresh : false
|
||||
});
|
||||
// this vec holds the current active linebreaks; next_ holds the breaks that will be active for the next word
|
||||
let active_breaks = &mut vec!(0);
|
||||
let next_active_breaks = &mut vec!();
|
||||
|
||||
let stretch = (args.opts.width - args.opts.goal) as int;
|
||||
let minlength = args.opts.goal - stretch as uint;
|
||||
let mut new_linebreaks = vec!();
|
||||
let mut is_sentence_start = false;
|
||||
let mut least_demerits = 0;
|
||||
loop {
|
||||
let w =
|
||||
match iter.next() {
|
||||
None => break,
|
||||
Some(w) => w
|
||||
};
|
||||
|
||||
// if this is the last word, we don't add additional demerits for this break
|
||||
let (is_last_word, is_sentence_end) =
|
||||
match iter.peek() {
|
||||
None => (true, true),
|
||||
Some(&&WordInfo { sentence_start: st, new_line: nl, .. }) => (false, st || (nl && w.ends_punct))
|
||||
};
|
||||
|
||||
// should we be adding extra space at the beginning of the next sentence?
|
||||
let slen = compute_slen(args.uniform, w.new_line, is_sentence_start, false);
|
||||
|
||||
let mut ld_new = i64::MAX;
|
||||
let mut ld_next = i64::MAX;
|
||||
let mut ld_idx = 0;
|
||||
new_linebreaks.clear();
|
||||
next_active_breaks.clear();
|
||||
// go through each active break, extending it and possibly adding a new active
|
||||
// break if we are above the minimum required length
|
||||
for &i in active_breaks.iter() {
|
||||
let active = linebreaks.get_mut(i);
|
||||
// normalize demerits to avoid overflow, and record if this is the least
|
||||
active.demerits -= least_demerits;
|
||||
if active.demerits < ld_next {
|
||||
ld_next = active.demerits;
|
||||
ld_idx = i;
|
||||
}
|
||||
|
||||
// get the new length
|
||||
let tlen = w.word_nchars + args.compute_width(w, active.length, active.fresh) + slen + active.length;
|
||||
|
||||
// if tlen is longer than args.opts.width, we drop this break from the active list
|
||||
// otherwise, we extend the break, and possibly add a new break at this point
|
||||
if tlen <= args.opts.width {
|
||||
// this break will still be active next time
|
||||
next_active_breaks.push(i);
|
||||
// we can put this word on this line
|
||||
active.fresh = false;
|
||||
active.length = tlen;
|
||||
|
||||
// if we're above the minlength, we can also consider breaking here
|
||||
if tlen >= minlength {
|
||||
let (new_demerits, new_ratio) =
|
||||
if is_last_word {
|
||||
// there is no penalty for the final line's length
|
||||
(0, 0.0)
|
||||
} else {
|
||||
compute_demerits((args.opts.goal - tlen) as int, stretch, w.word_nchars as int, active.prev_rat)
|
||||
};
|
||||
|
||||
// do not even consider adding a line that has too many demerits
|
||||
// also, try to detect overflow by checking signum
|
||||
let total_demerits = new_demerits + active.demerits;
|
||||
if new_demerits < BAD_INFTY_SQ && total_demerits < ld_new && num::signum(active.demerits) <= num::signum(new_demerits) {
|
||||
ld_new = total_demerits;
|
||||
new_linebreaks.push(LineBreak {
|
||||
prev : i,
|
||||
linebreak : Some(w),
|
||||
break_before : false,
|
||||
demerits : total_demerits,
|
||||
prev_rat : new_ratio,
|
||||
length : args.indent_len,
|
||||
fresh : true
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we generated any new linebreaks, add the last one to the list
|
||||
// the last one is always the best because we don't add to new_linebreaks unless
|
||||
// it's better than the best one so far
|
||||
match new_linebreaks.pop() {
|
||||
None => (),
|
||||
Some(lb) => {
|
||||
next_active_breaks.push(linebreaks.len());
|
||||
linebreaks.push(lb);
|
||||
}
|
||||
}
|
||||
|
||||
if next_active_breaks.is_empty() {
|
||||
// every potential linebreak is too long! choose the linebreak with the least demerits, ld_idx
|
||||
let new_break = restart_active_breaks(args, linebreaks.get(ld_idx), ld_idx, w, slen, minlength);
|
||||
next_active_breaks.push(linebreaks.len());
|
||||
linebreaks.push(new_break);
|
||||
least_demerits = 0;
|
||||
} else {
|
||||
// next time around, normalize out the demerits fields
|
||||
// on active linebreaks to make overflow less likely
|
||||
least_demerits = cmp::max(ld_next, 0);
|
||||
}
|
||||
// swap in new list of active breaks
|
||||
mem::swap(active_breaks, next_active_breaks);
|
||||
// If this was the last word in a sentence, the next one must be the first in the next.
|
||||
is_sentence_start = is_sentence_end;
|
||||
}
|
||||
|
||||
// return the best path
|
||||
build_best_path(&linebreaks, active_breaks)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn build_best_path<'a>(paths: &Vec<LineBreak<'a>>, active: &Vec<uint>) -> Vec<(&'a WordInfo<'a>, bool)> {
|
||||
let mut breakwords = vec!();
|
||||
// of the active paths, we select the one with the fewest demerits
|
||||
let mut best_idx = match active.iter().min_by(|&&a| paths.get(a).demerits) {
|
||||
None => crash!(1, "Failed to find a k-p linebreak solution. This should never happen."),
|
||||
Some(&s) => s
|
||||
};
|
||||
|
||||
// now, chase the pointers back through the break list, recording
|
||||
// the words at which we should break
|
||||
loop {
|
||||
let next_best = paths.get(best_idx);
|
||||
match next_best.linebreak {
|
||||
None => return breakwords,
|
||||
Some(prev) => {
|
||||
breakwords.push((prev, next_best.break_before));
|
||||
best_idx = next_best.prev
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE "less than" in this case means "worse", i.e., more demerits
|
||||
#[allow(dead_code)]
|
||||
impl<'a> PartialOrd for LineBreak<'a> {
|
||||
fn lt(&self, other: &LineBreak) -> bool {
|
||||
self.demerits > other.demerits
|
||||
// Tuning constants for the Knuth-Plass demerit computation below.
// "infinite" badness is more like (1+BAD_INFTY)^2 because of how demerits are computed
static BAD_INFTY: i64 = 10000000;
static BAD_INFTY_SQ: i64 = BAD_INFTY * BAD_INFTY;
// badness = BAD_MULT * abs(r) ^ 3, where r is the line's stretch ratio
static BAD_MULT: f32 = 100.0;
// DR_MULT is multiplier for delta-R (change in stretch ratio) between lines
static DR_MULT: f32 = 600.0;
// DL_MULT is penalty multiplier for short words at end of line
static DL_MULT: f32 = 300.0;
|
||||
|
||||
#[inline(always)]
|
||||
fn compute_demerits(delta_len: int, stretch: int, wlen: int, prev_rat: f32) -> (i64, f32) {
|
||||
// how much stretch are we using?
|
||||
let ratio =
|
||||
if delta_len == 0 {
|
||||
0.0f32
|
||||
} else {
|
||||
delta_len as f32 / stretch as f32
|
||||
};
|
||||
|
||||
// compute badness given the stretch ratio
|
||||
let bad_linelen =
|
||||
if num::abs(ratio) > 1.0f32 {
|
||||
BAD_INFTY
|
||||
} else {
|
||||
(BAD_MULT * num::abs(num::pow(ratio, 3))) as i64
|
||||
};
|
||||
|
||||
// we penalize lines ending in really short words
|
||||
let bad_wordlen =
|
||||
if wlen >= stretch {
|
||||
0
|
||||
} else {
|
||||
(DL_MULT * num::abs(num::pow((stretch - wlen) as f32 / (stretch - 1) as f32, 3))) as i64
|
||||
};
|
||||
|
||||
// we penalize lines that have very different ratios from previous lines
|
||||
let bad_deltaR = (DR_MULT * num::abs(num::pow((ratio - prev_rat) / 2.0, 3))) as i64;
|
||||
|
||||
let demerits = num::pow(1 + bad_linelen + bad_wordlen + bad_deltaR, 2);
|
||||
|
||||
(demerits, ratio)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn restart_active_breaks<'a>(args: &BreakArgs<'a>, active: &LineBreak<'a>, act_idx: uint, w: &'a WordInfo<'a>, slen: uint, min: uint) -> LineBreak<'a> {
|
||||
let (break_before, line_length) =
|
||||
if active.fresh {
|
||||
// never break before a word if that word would be the first on a line
|
||||
(false, args.indent_len)
|
||||
} else {
|
||||
// choose the lesser evil: breaking too early, or breaking too late
|
||||
let wlen = w.word_nchars + args.compute_width(w, active.length, active.fresh);
|
||||
let underlen: int = (min - active.length) as int;
|
||||
let overlen: int = ((wlen + slen + active.length) - args.opts.width) as int;
|
||||
if overlen > underlen {
|
||||
// break early, put this word on the next line
|
||||
(true, args.indent_len + w.word_nchars)
|
||||
} else {
|
||||
(false, args.indent_len)
|
||||
}
|
||||
};
|
||||
|
||||
// restart the linebreak. This will be our only active path.
|
||||
LineBreak {
|
||||
prev : act_idx,
|
||||
linebreak : Some(w),
|
||||
break_before : break_before,
|
||||
demerits : 0, // this is the only active break, so we can reset the demerit count
|
||||
prev_rat : if break_before { 1.0 } else { -1.0 },
|
||||
length : line_length,
|
||||
fresh : !break_before
|
||||
}
|
||||
}
|
||||
|
||||
// we have to satisfy Eq to implement Ord; LineBreak needs no equality
// logic of its own, so the marker impl is empty
#[allow(dead_code)]
impl<'a> Eq for LineBreak<'a> {}
|
||||
|
||||
// NOTE again here we reverse the ordering:
|
||||
// if other has more demerits, self is Greater
|
||||
#[allow(dead_code)]
|
||||
impl<'a> Ord for LineBreak<'a> {
|
||||
fn cmp(&self, other: &LineBreak) -> Ordering {
|
||||
other.demerits.cmp(&self.demerits)
|
||||
// Number of spaces to add before a word, based on mode, newline, sentence start.
|
||||
#[inline(always)]
|
||||
fn compute_slen(uniform: bool, newline: bool, start: bool, punct: bool) -> uint {
|
||||
if uniform || newline {
|
||||
if start || (newline && punct) {
|
||||
2
|
||||
} else {
|
||||
1
|
||||
}
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
|
||||
// If we're on a fresh line, slen=0 and we slice off leading whitespace.
|
||||
// Otherwise, compute slen and leave whitespace alone.
|
||||
#[inline(always)]
|
||||
fn slice_if_fresh<'a>(fresh: bool, word: &'a str, start: uint, uniform: bool, newline: bool, sstart: bool, punct: bool) -> (uint, &'a str) {
|
||||
if fresh {
|
||||
(0, word.slice_from(start))
|
||||
} else {
|
||||
(compute_slen(uniform, newline, sstart, punct), word)
|
||||
}
|
||||
}
|
||||
|
||||
// Write a newline and add the indent.
#[inline(always)]
fn write_newline(indent: &str, ostream: &mut Box<Writer>) {
    // terminate the current line, then emit the indent so the next
    // word starts at the correct column; write errors are swallowed
    // by silent_unwrap! (consistent with the rest of this file)
    silent_unwrap!(ostream.write_char('\n'));
    silent_unwrap!(ostream.write(indent.as_bytes()));
}
|
||||
|
||||
// Write the word, along with slen spaces.
|
||||
#[inline(always)]
|
||||
fn write_with_spaces(word: &str, slen: uint, ostream: &mut Box<Writer>) {
|
||||
if slen == 2 {
|
||||
silent_unwrap!(ostream.write(" ".as_bytes()));
|
||||
} else if slen == 1 {
|
||||
silent_unwrap!(ostream.write_char(' '));
|
||||
}
|
||||
silent_unwrap!(ostream.write(word.as_bytes()));
|
||||
}
|
||||
|
|
|
@ -13,6 +13,21 @@ use std::slice::Items;
|
|||
use std::str::CharRange;
|
||||
use FileOrStdReader;
|
||||
use FmtOptions;
|
||||
use charwidth;
|
||||
|
||||
#[inline(always)]
|
||||
fn char_width(c: char) -> uint {
|
||||
if (c as uint) < 0xA0 {
|
||||
// if it is ASCII, call it exactly 1 wide (including control chars)
|
||||
// calling control chars' widths 1 is consistent with OpenBSD fmt
|
||||
1
|
||||
} else {
|
||||
// otherwise, get the unicode width
|
||||
// note that we shouldn't actually get None here because only c < 0xA0
|
||||
// can return None, but for safety and future-proofing we do it this way
|
||||
charwidth::width(c).unwrap_or(1)
|
||||
}
|
||||
}
|
||||
|
||||
// lines with PSKIP, lacking PREFIX, or which are entirely blank are
|
||||
// NoFormatLines; otherwise, they are FormatLines
|
||||
|
@ -117,7 +132,7 @@ impl<'a> FileLines<'a> {
|
|||
indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth;
|
||||
} else {
|
||||
// non-tab character
|
||||
indent_len += 1;
|
||||
indent_len += char_width(c);
|
||||
}
|
||||
}
|
||||
(indent_end, prefix_len, indent_len)
|
||||
|
@ -196,7 +211,7 @@ pub struct Paragraph {
|
|||
// an iterator producing a stream of paragraphs from a stream of lines
|
||||
// given a set of options.
|
||||
pub struct ParagraphStream<'a> {
|
||||
lines : Peekable<Line,FileLines<'a>>,
|
||||
lines : Peekable<Line, FileLines<'a>>,
|
||||
next_mail : bool,
|
||||
opts : &'a FmtOptions,
|
||||
}
|
||||
|
@ -238,8 +253,8 @@ impl<'a> ParagraphStream<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator<Result<Paragraph,String>> for ParagraphStream<'a> {
|
||||
fn next(&mut self) -> Option<Result<Paragraph,String>> {
|
||||
impl<'a> Iterator<Result<Paragraph, String>> for ParagraphStream<'a> {
|
||||
fn next(&mut self) -> Option<Result<Paragraph, String>> {
|
||||
// return a NoFormatLine in an Err; it should immediately be output
|
||||
let noformat =
|
||||
match self.lines.peek() {
|
||||
|
@ -396,39 +411,37 @@ impl<'a> ParaWords<'a> {
|
|||
// no extra spacing for mail headers; always exactly 1 space
|
||||
// safe to trim_left on every line of a mail header, since the
|
||||
// first line is guaranteed not to have any spaces
|
||||
self.words.push_all_move(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo {
|
||||
self.words.extend(self.para.lines.iter().flat_map(|x| x.as_slice().words()).map(|x| WordInfo {
|
||||
word : x,
|
||||
word_start : 0,
|
||||
word_nchars : x.char_len(),
|
||||
word_nchars : x.len(), // OK for mail headers; only ASCII allowed (unicode is escaped)
|
||||
before_tab : None,
|
||||
after_tab : 0,
|
||||
sentence_start : false,
|
||||
ends_punct : false,
|
||||
new_line : false
|
||||
}).collect());
|
||||
}));
|
||||
} else {
|
||||
// first line
|
||||
self.words.push_all_move(
|
||||
self.words.extend(
|
||||
if self.opts.crown || self.opts.tagged {
|
||||
// crown and tagged mode has the "init" in the first line, so slice from there
|
||||
WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.init_end))
|
||||
} else {
|
||||
// otherwise we slice from the indent
|
||||
WordSplit::new(self.opts, self.para.lines.get(0).as_slice().slice_from(self.para.indent_end))
|
||||
}.collect());
|
||||
});
|
||||
|
||||
if self.para.lines.len() > 1 {
|
||||
let indent_end = self.para.indent_end;
|
||||
let opts = self.opts;
|
||||
self.words.push_all_move(
|
||||
self.para.lines.iter().skip(1)
|
||||
.flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end)))
|
||||
.collect());
|
||||
self.words.extend(
|
||||
self.para.lines.iter().skip(1).flat_map(|x| WordSplit::new(opts, x.as_slice().slice_from(indent_end))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn words(&'a self) -> Items<'a,WordInfo<'a>> { return self.words.iter() }
|
||||
pub fn words(&'a self) -> Items<'a, WordInfo<'a>> { return self.words.iter() }
|
||||
}
|
||||
|
||||
struct WordSplit<'a> {
|
||||
|
@ -516,7 +529,7 @@ impl<'a> Iterator<WordInfo<'a>> for WordSplit<'a> {
|
|||
let mut word_nchars = 0;
|
||||
self.position =
|
||||
match self.string.slice_from(word_start)
|
||||
.find(|x: char| if !x.is_whitespace() { word_nchars += 1; false } else { true }) {
|
||||
.find(|x: char| if !x.is_whitespace() { word_nchars += char_width(x); false } else { true }) {
|
||||
None => self.length,
|
||||
Some(s) => s + word_start
|
||||
};
|
||||
|
|
Loading…
Reference in a new issue