coreutils/fmt/charwidth.rs
kwantam 8be67f7d4d fmt Knuth-Plass implementation; unicode char_width
fmt:
- Implemented Knuth-Plass optimal linebreaking strategy.
- Added commandline switch -q for "quick" (greedy) split
  mode that does not use Knuth-Plass.
- Right now, Knuth-Plass runs about half as fast. It also
  uses more memory.
- Updated fmt to use char_width (see below) instead of
  assuming each character width is 1.
- Use i64 for demerits instead of int in K-P, since int is
  pointer sized and will only be 32 bits on some
  architectures.
- incremented version number
- Incorporated improvements suggested by huonw and Arcterus.
  - K-P uses indices of linebreaks vector instead of raw
    pointers. This gets rid of a lot of allocation of boxes
    and improves safety to boot.
- Added a support module for computing displayed widths of unicode
  strings based on Markus Kuhn's free implementation at
    http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
- This is in `charwidth.rs`, but this is a temporary measure
  until the Char trait implements .width(). I am submitting
  a PR for this soon, and the code in charwidth() is what's
  generated libcore.

closes #223
2014-06-30 19:09:22 -04:00

170 lines
13 KiB
Rust

/*
* This file is part of `fmt` from the uutils coreutils package.
*
* (c) kwantam <kwantam@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
fn bsearch_range_value_table(c: char, r: &'static [(char, char, uint)]) -> uint {
match r.bsearch(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Some(idx) => {
let (_, _, result) = r[idx];
result
}
None => 1
}
}
pub fn width(c: char) -> Option<uint> {
match c as uint {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, charwidth_table))
}
}
// character width table. Based on Markus Kuhn's free wcwidth() implementation,
// http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
static charwidth_table : &'static [(char, char, uint)] = &[
('\u0300', '\u036f', 0), ('\u0483', '\u0487', 0), ('\u0488', '\u0489', 0), ('\u0591',
'\u05bd', 0), ('\u05bf', '\u05bf', 0), ('\u05c1', '\u05c2', 0), ('\u05c4', '\u05c5', 0),
('\u05c7', '\u05c7', 0), ('\u0600', '\u0605', 0), ('\u0610', '\u061a', 0), ('\u061c',
'\u061c', 0), ('\u064b', '\u065f', 0), ('\u0670', '\u0670', 0), ('\u06d6', '\u06dc', 0),
('\u06dd', '\u06dd', 0), ('\u06df', '\u06e4', 0), ('\u06e7', '\u06e8', 0), ('\u06ea',
'\u06ed', 0), ('\u070f', '\u070f', 0), ('\u0711', '\u0711', 0), ('\u0730', '\u074a', 0),
('\u07a6', '\u07b0', 0), ('\u07eb', '\u07f3', 0), ('\u0816', '\u0819', 0), ('\u081b',
'\u0823', 0), ('\u0825', '\u0827', 0), ('\u0829', '\u082d', 0), ('\u0859', '\u085b', 0),
('\u08e4', '\u0902', 0), ('\u093a', '\u093a', 0), ('\u093c', '\u093c', 0), ('\u0941',
'\u0948', 0), ('\u094d', '\u094d', 0), ('\u0951', '\u0957', 0), ('\u0962', '\u0963', 0),
('\u0981', '\u0981', 0), ('\u09bc', '\u09bc', 0), ('\u09c1', '\u09c4', 0), ('\u09cd',
'\u09cd', 0), ('\u09e2', '\u09e3', 0), ('\u0a01', '\u0a02', 0), ('\u0a3c', '\u0a3c', 0),
('\u0a41', '\u0a51', 0), ('\u0a70', '\u0a71', 0), ('\u0a75', '\u0a82', 0), ('\u0abc',
'\u0abc', 0), ('\u0ac1', '\u0ac8', 0), ('\u0acd', '\u0acd', 0), ('\u0ae2', '\u0ae3', 0),
('\u0b01', '\u0b01', 0), ('\u0b3c', '\u0b3c', 0), ('\u0b3f', '\u0b3f', 0), ('\u0b41',
'\u0b44', 0), ('\u0b4d', '\u0b56', 0), ('\u0b62', '\u0b63', 0), ('\u0b82', '\u0b82', 0),
('\u0bc0', '\u0bc0', 0), ('\u0bcd', '\u0bcd', 0), ('\u0c00', '\u0c00', 0), ('\u0c3e',
'\u0c40', 0), ('\u0c46', '\u0c56', 0), ('\u0c62', '\u0c63', 0), ('\u0c81', '\u0c81', 0),
('\u0cbc', '\u0cbc', 0), ('\u0cbf', '\u0cbf', 0), ('\u0cc6', '\u0cc6', 0), ('\u0ccc',
'\u0ccd', 0), ('\u0ce2', '\u0ce3', 0), ('\u0d01', '\u0d01', 0), ('\u0d41', '\u0d44', 0),
('\u0d4d', '\u0d4d', 0), ('\u0d62', '\u0d63', 0), ('\u0dca', '\u0dca', 0), ('\u0dd2',
'\u0dd6', 0), ('\u0e31', '\u0e31', 0), ('\u0e34', '\u0e3a', 0), ('\u0e47', '\u0e4e', 0),
('\u0eb1', '\u0eb1', 0), ('\u0eb4', '\u0ebc', 0), ('\u0ec8', '\u0ecd', 0), ('\u0f18',
'\u0f19', 0), ('\u0f35', '\u0f35', 0), ('\u0f37', '\u0f37', 0), ('\u0f39', '\u0f39', 0),
('\u0f71', '\u0f7e', 0), ('\u0f80', '\u0f84', 0), ('\u0f86', '\u0f87', 0), ('\u0f8d',
'\u0fbc', 0), ('\u0fc6', '\u0fc6', 0), ('\u102d', '\u1030', 0), ('\u1032', '\u1037', 0),
('\u1039', '\u103a', 0), ('\u103d', '\u103e', 0), ('\u1058', '\u1059', 0), ('\u105e',
'\u1060', 0), ('\u1071', '\u1074', 0), ('\u1082', '\u1082', 0), ('\u1085', '\u1086', 0),
('\u108d', '\u108d', 0), ('\u109d', '\u109d', 0), ('\u1100', '\u115f', 2), ('\u1160',
'\u11ff', 0), ('\u135d', '\u135f', 0), ('\u1712', '\u1714', 0), ('\u1732', '\u1734', 0),
('\u1752', '\u1753', 0), ('\u1772', '\u1773', 0), ('\u17b4', '\u17b5', 0), ('\u17b7',
'\u17bd', 0), ('\u17c6', '\u17c6', 0), ('\u17c9', '\u17d3', 0), ('\u17dd', '\u17dd', 0),
('\u180b', '\u180d', 0), ('\u180e', '\u180e', 0), ('\u18a9', '\u18a9', 0), ('\u1920',
'\u1922', 0), ('\u1927', '\u1928', 0), ('\u1932', '\u1932', 0), ('\u1939', '\u193b', 0),
('\u1a17', '\u1a18', 0), ('\u1a1b', '\u1a1b', 0), ('\u1a56', '\u1a56', 0), ('\u1a58',
'\u1a60', 0), ('\u1a62', '\u1a62', 0), ('\u1a65', '\u1a6c', 0), ('\u1a73', '\u1a7f', 0),
('\u1ab0', '\u1abd', 0), ('\u1abe', '\u1abe', 0), ('\u1b00', '\u1b03', 0), ('\u1b34',
'\u1b34', 0), ('\u1b36', '\u1b3a', 0), ('\u1b3c', '\u1b3c', 0), ('\u1b42', '\u1b42', 0),
('\u1b6b', '\u1b73', 0), ('\u1b80', '\u1b81', 0), ('\u1ba2', '\u1ba5', 0), ('\u1ba8',
'\u1ba9', 0), ('\u1bab', '\u1bad', 0), ('\u1be6', '\u1be6', 0), ('\u1be8', '\u1be9', 0),
('\u1bed', '\u1bed', 0), ('\u1bef', '\u1bf1', 0), ('\u1c2c', '\u1c33', 0), ('\u1c36',
'\u1c37', 0), ('\u1cd0', '\u1cd2', 0), ('\u1cd4', '\u1ce0', 0), ('\u1ce2', '\u1ce8', 0),
('\u1ced', '\u1ced', 0), ('\u1cf4', '\u1cf4', 0), ('\u1cf8', '\u1cf9', 0), ('\u1dc0',
'\u1dff', 0), ('\u200b', '\u200f', 0), ('\u202a', '\u202e', 0), ('\u2060', '\u206f', 0),
('\u20d0', '\u20dc', 0), ('\u20dd', '\u20e0', 0), ('\u20e1', '\u20e1', 0), ('\u20e2',
'\u20e4', 0), ('\u20e5', '\u20f0', 0), ('\u2329', '\u2329', 2), ('\u232a', '\u232a', 2),
('\u2cef', '\u2cf1', 0), ('\u2d7f', '\u2d7f', 0), ('\u2de0', '\u2dff', 0), ('\u2e80',
'\u2e99', 2), ('\u2e9b', '\u2ef3', 2), ('\u2f00', '\u2fd5', 2), ('\u2ff0', '\u2ffb', 2),
('\u3000', '\u3000', 2), ('\u3001', '\u3003', 2), ('\u3004', '\u3004', 2), ('\u3005',
'\u3005', 2), ('\u3006', '\u3006', 2), ('\u3007', '\u3007', 2), ('\u3008', '\u3008', 2),
('\u3009', '\u3009', 2), ('\u300a', '\u300a', 2), ('\u300b', '\u300b', 2), ('\u300c',
'\u300c', 2), ('\u300d', '\u300d', 2), ('\u300e', '\u300e', 2), ('\u300f', '\u300f', 2),
('\u3010', '\u3010', 2), ('\u3011', '\u3011', 2), ('\u3012', '\u3013', 2), ('\u3014',
'\u3014', 2), ('\u3015', '\u3015', 2), ('\u3016', '\u3016', 2), ('\u3017', '\u3017', 2),
('\u3018', '\u3018', 2), ('\u3019', '\u3019', 2), ('\u301a', '\u301a', 2), ('\u301b',
'\u301b', 2), ('\u301c', '\u301c', 2), ('\u301d', '\u301d', 2), ('\u301e', '\u301f', 2),
('\u3020', '\u3020', 2), ('\u3021', '\u3029', 2), ('\u302a', '\u302d', 0), ('\u302a',
'\u302d', 2), ('\u302e', '\u302f', 2), ('\u3030', '\u3030', 2), ('\u3031', '\u3035', 2),
('\u3036', '\u3037', 2), ('\u3038', '\u303a', 2), ('\u303b', '\u303b', 2), ('\u303c',
'\u303c', 2), ('\u303d', '\u303d', 2), ('\u303e', '\u303e', 2), ('\u3041', '\u3096', 2),
('\u3099', '\u309a', 0), ('\u3099', '\u309a', 2), ('\u309b', '\u309c', 2), ('\u309d',
'\u309e', 2), ('\u309f', '\u309f', 2), ('\u30a0', '\u30a0', 2), ('\u30a1', '\u30fa', 2),
('\u30fb', '\u30fb', 2), ('\u30fc', '\u30fe', 2), ('\u30ff', '\u30ff', 2), ('\u3105',
'\u312d', 2), ('\u3131', '\u318e', 2), ('\u3190', '\u3191', 2), ('\u3192', '\u3195', 2),
('\u3196', '\u319f', 2), ('\u31a0', '\u31ba', 2), ('\u31c0', '\u31e3', 2), ('\u31f0',
'\u31ff', 2), ('\u3200', '\u321e', 2), ('\u3220', '\u3229', 2), ('\u322a', '\u3247', 2),
('\u3250', '\u3250', 2), ('\u3251', '\u325f', 2), ('\u3260', '\u327f', 2), ('\u3280',
'\u3289', 2), ('\u328a', '\u32b0', 2), ('\u32b1', '\u32bf', 2), ('\u32c0', '\u32fe', 2),
('\u3300', '\u33ff', 2), ('\u3400', '\u4db5', 2), ('\u4db6', '\u4dbf', 2), ('\u4e00',
'\u9fcc', 2), ('\u9fcd', '\u9fff', 2), ('\ua000', '\ua014', 2), ('\ua015', '\ua015', 2),
('\ua016', '\ua48c', 2), ('\ua490', '\ua4c6', 2), ('\ua66f', '\ua66f', 0), ('\ua670',
'\ua672', 0), ('\ua674', '\ua67d', 0), ('\ua69f', '\ua69f', 0), ('\ua6f0', '\ua6f1', 0),
('\ua802', '\ua802', 0), ('\ua806', '\ua806', 0), ('\ua80b', '\ua80b', 0), ('\ua825',
'\ua826', 0), ('\ua8c4', '\ua8c4', 0), ('\ua8e0', '\ua8f1', 0), ('\ua926', '\ua92d', 0),
('\ua947', '\ua951', 0), ('\ua960', '\ua97c', 2), ('\ua980', '\ua982', 0), ('\ua9b3',
'\ua9b3', 0), ('\ua9b6', '\ua9b9', 0), ('\ua9bc', '\ua9bc', 0), ('\ua9e5', '\ua9e5', 0),
('\uaa29', '\uaa2e', 0), ('\uaa31', '\uaa32', 0), ('\uaa35', '\uaa36', 0), ('\uaa43',
'\uaa43', 0), ('\uaa4c', '\uaa4c', 0), ('\uaa7c', '\uaa7c', 0), ('\uaab0', '\uaab0', 0),
('\uaab2', '\uaab4', 0), ('\uaab7', '\uaab8', 0), ('\uaabe', '\uaabf', 0), ('\uaac1',
'\uaac1', 0), ('\uaaec', '\uaaed', 0), ('\uaaf6', '\uaaf6', 0), ('\uabe5', '\uabe5', 0),
('\uabe8', '\uabe8', 0), ('\uabed', '\uabed', 0), ('\uac00', '\ud7a3', 2), ('\uf900',
'\ufa6d', 2), ('\ufa6e', '\ufa6f', 2), ('\ufa70', '\ufad9', 2), ('\ufada', '\ufaff', 2),
('\ufb1e', '\ufb1e', 0), ('\ufe00', '\ufe0f', 0), ('\ufe10', '\ufe16', 2), ('\ufe17',
'\ufe17', 2), ('\ufe18', '\ufe18', 2), ('\ufe19', '\ufe19', 2), ('\ufe20', '\ufe2d', 0),
('\ufe30', '\ufe30', 2), ('\ufe31', '\ufe32', 2), ('\ufe33', '\ufe34', 2), ('\ufe35',
'\ufe35', 2), ('\ufe36', '\ufe36', 2), ('\ufe37', '\ufe37', 2), ('\ufe38', '\ufe38', 2),
('\ufe39', '\ufe39', 2), ('\ufe3a', '\ufe3a', 2), ('\ufe3b', '\ufe3b', 2), ('\ufe3c',
'\ufe3c', 2), ('\ufe3d', '\ufe3d', 2), ('\ufe3e', '\ufe3e', 2), ('\ufe3f', '\ufe3f', 2),
('\ufe40', '\ufe40', 2), ('\ufe41', '\ufe41', 2), ('\ufe42', '\ufe42', 2), ('\ufe43',
'\ufe43', 2), ('\ufe44', '\ufe44', 2), ('\ufe45', '\ufe46', 2), ('\ufe47', '\ufe47', 2),
('\ufe48', '\ufe48', 2), ('\ufe49', '\ufe4c', 2), ('\ufe4d', '\ufe4f', 2), ('\ufe50',
'\ufe52', 2), ('\ufe54', '\ufe57', 2), ('\ufe58', '\ufe58', 2), ('\ufe59', '\ufe59', 2),
('\ufe5a', '\ufe5a', 2), ('\ufe5b', '\ufe5b', 2), ('\ufe5c', '\ufe5c', 2), ('\ufe5d',
'\ufe5d', 2), ('\ufe5e', '\ufe5e', 2), ('\ufe5f', '\ufe61', 2), ('\ufe62', '\ufe62', 2),
('\ufe63', '\ufe63', 2), ('\ufe64', '\ufe66', 2), ('\ufe68', '\ufe68', 2), ('\ufe69',
'\ufe69', 2), ('\ufe6a', '\ufe6b', 2), ('\ufeff', '\ufeff', 0), ('\uff01', '\uff03', 2),
('\uff04', '\uff04', 2), ('\uff05', '\uff07', 2), ('\uff08', '\uff08', 2), ('\uff09',
'\uff09', 2), ('\uff0a', '\uff0a', 2), ('\uff0b', '\uff0b', 2), ('\uff0c', '\uff0c', 2),
('\uff0d', '\uff0d', 2), ('\uff0e', '\uff0f', 2), ('\uff10', '\uff19', 2), ('\uff1a',
'\uff1b', 2), ('\uff1c', '\uff1e', 2), ('\uff1f', '\uff20', 2), ('\uff21', '\uff3a', 2),
('\uff3b', '\uff3b', 2), ('\uff3c', '\uff3c', 2), ('\uff3d', '\uff3d', 2), ('\uff3e',
'\uff3e', 2), ('\uff3f', '\uff3f', 2), ('\uff40', '\uff40', 2), ('\uff41', '\uff5a', 2),
('\uff5b', '\uff5b', 2), ('\uff5c', '\uff5c', 2), ('\uff5d', '\uff5d', 2), ('\uff5e',
'\uff5e', 2), ('\uff5f', '\uff5f', 2), ('\uff60', '\uff60', 2), ('\uffe0', '\uffe1', 2),
('\uffe2', '\uffe2', 2), ('\uffe3', '\uffe3', 2), ('\uffe4', '\uffe4', 2), ('\uffe5',
'\uffe6', 2), ('\ufff9', '\ufffb', 0), ('\U000101fd', '\U000101fd', 0), ('\U000102e0',
'\U000102e0', 0), ('\U00010376', '\U0001037a', 0), ('\U00010a01', '\U00010a0f', 0),
('\U00010a38', '\U00010a3f', 0), ('\U00010ae5', '\U00010ae6', 0), ('\U00011001',
'\U00011001', 0), ('\U00011038', '\U00011046', 0), ('\U0001107f', '\U00011081', 0),
('\U000110b3', '\U000110b6', 0), ('\U000110b9', '\U000110ba', 0), ('\U000110bd',
'\U000110bd', 0), ('\U00011100', '\U00011102', 0), ('\U00011127', '\U0001112b', 0),
('\U0001112d', '\U00011134', 0), ('\U00011173', '\U00011173', 0), ('\U00011180',
'\U00011181', 0), ('\U000111b6', '\U000111be', 0), ('\U0001122f', '\U00011231', 0),
('\U00011234', '\U00011234', 0), ('\U00011236', '\U00011237', 0), ('\U000112df',
'\U000112df', 0), ('\U000112e3', '\U000112ea', 0), ('\U00011301', '\U00011301', 0),
('\U0001133c', '\U0001133c', 0), ('\U00011340', '\U00011340', 0), ('\U00011366',
'\U00011374', 0), ('\U000114b3', '\U000114b8', 0), ('\U000114ba', '\U000114ba', 0),
('\U000114bf', '\U000114c0', 0), ('\U000114c2', '\U000114c3', 0), ('\U000115b2',
'\U000115b5', 0), ('\U000115bc', '\U000115bd', 0), ('\U000115bf', '\U000115c0', 0),
('\U00011633', '\U0001163a', 0), ('\U0001163d', '\U0001163d', 0), ('\U0001163f',
'\U00011640', 0), ('\U000116ab', '\U000116ab', 0), ('\U000116ad', '\U000116ad', 0),
('\U000116b0', '\U000116b5', 0), ('\U000116b7', '\U000116b7', 0), ('\U00016af0',
'\U00016af4', 0), ('\U00016b30', '\U00016b36', 0), ('\U00016f8f', '\U00016f92', 0),
('\U0001b000', '\U0001b001', 2), ('\U0001bc9d', '\U0001bc9e', 0), ('\U0001bca0',
'\U0001bca3', 0), ('\U0001d167', '\U0001d169', 0), ('\U0001d173', '\U0001d17a', 0),
('\U0001d17b', '\U0001d182', 0), ('\U0001d185', '\U0001d18b', 0), ('\U0001d1aa',
'\U0001d1ad', 0), ('\U0001d242', '\U0001d244', 0), ('\U0001e8d0', '\U0001e8d6', 0),
('\U0001f200', '\U0001f202', 2), ('\U0001f210', '\U0001f23a', 2), ('\U0001f240',
'\U0001f248', 2), ('\U0001f250', '\U0001f251', 2), ('\U00020000', '\U0002a6d6', 2),
('\U0002a6d7', '\U0002a6ff', 2), ('\U0002a700', '\U0002b734', 2), ('\U0002b735',
'\U0002b73f', 2), ('\U0002b740', '\U0002b81d', 2), ('\U0002b81e', '\U0002f7ff', 2),
('\U0002f800', '\U0002fa1d', 2), ('\U0002fa1e', '\U0002fffd', 2), ('\U00030000',
'\U0003fffd', 2), ('\U000e0001', '\U000e007f', 0), ('\U000e0100', '\U000e01ef', 0)
];