mirror of
https://github.com/uutils/coreutils
synced 2024-12-13 14:52:41 +00:00
Implement a fast path for character counting in wc.
When wc is invoked with only the -m flag, we only need to count the number of Unicode characters in the input. In order to do so, we don't actually need to decode the input bytes into characters. Rather, we can simply count the number of non-continuation bytes in the UTF-8 stream, since every character will contain exactly one non-continuation byte. On my laptop, this speeds up `wc -m odyssey1024.txt` from 745ms to 109ms.
This commit is contained in:
parent
ba24565b60
commit
13762cae05
2 changed files with 25 additions and 1 deletions
|
@ -134,3 +134,26 @@ pub(crate) fn count_bytes_and_lines_fast<R: Read>(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io::Error>) {
|
||||
/// Mask of the value bits of a continuation byte
|
||||
const CONT_MASK: u8 = 0b0011_1111u8;
|
||||
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
|
||||
const TAG_CONT_U8: u8 = 0b1000_0000u8;
|
||||
|
||||
let mut total = WordCount::default();
|
||||
let mut buf = [0; BUF_SIZE];
|
||||
loop {
|
||||
match handle.read(&mut buf) {
|
||||
Ok(0) => return (total, None),
|
||||
Ok(n) => {
|
||||
total.chars += buf[..n]
|
||||
.iter()
|
||||
.filter(|&&byte| (byte & !CONT_MASK) != TAG_CONT_U8)
|
||||
.count();
|
||||
}
|
||||
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||
Err(e) => return (total, Some(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@ extern crate uucore;
|
|||
mod count_fast;
|
||||
mod countable;
|
||||
mod word_count;
|
||||
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
|
||||
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast, count_chars_fast};
|
||||
use countable::WordCountable;
|
||||
use unicode_width::UnicodeWidthChar;
|
||||
use utf8::{BufReadDecoder, BufReadDecoderError};
|
||||
|
@ -315,6 +315,7 @@ fn word_count_from_reader<T: WordCountable>(
|
|||
) {
|
||||
// Specialize scanning loop to improve the performance.
|
||||
(false, false, false, false, false) => unreachable!(),
|
||||
(false, true, false, false, false) => count_chars_fast(&mut reader),
|
||||
(true, false, false, false, false) => {
|
||||
// Fast path when only show_bytes is true.
|
||||
let (bytes, error) = count_bytes_fast(&mut reader);
|
||||
|
|
Loading…
Reference in a new issue