Implement a fast path for character counting in wc.

When wc is invoked with only the -m flag, we only need to count the
number of Unicode characters in the input. In order to do so, we don't
actually need to decode the input bytes into characters. Rather, we can
simply count the number of non-continuation bytes in the UTF-8 stream,
since every character will contain exactly one non-continuation byte.

On my laptop, this speeds up `wc -m odyssey1024.txt` from 745ms to
109ms.
This commit is contained in:
Owen Anderson 2022-07-20 22:33:03 -07:00
parent ba24565b60
commit 13762cae05
2 changed files with 25 additions and 1 deletions

View file

@ -134,3 +134,26 @@ pub(crate) fn count_bytes_and_lines_fast<R: Read>(
}
}
}
/// Counts the characters in a UTF-8 byte stream without decoding it.
///
/// Every byte that is not a UTF-8 continuation byte (`10xx_xxxx`) is counted
/// as the start of one character; the bytes are never decoded, so malformed
/// input is not detected here.
///
/// Reads from `handle` until `read` reports end of input (`Ok(0)`). Reads that
/// fail with `ErrorKind::Interrupted` are retried. Any other read error stops
/// the scan.
///
/// Returns the tally (only the `chars` field is updated; the rest keeps its
/// `WordCount::default()` value) together with the error that ended the scan,
/// or `None` when the end of input was reached normally. On error the tally
/// still reflects everything counted up to that point.
pub(crate) fn count_chars_fast<R: Read>(handle: &mut R) -> (WordCount, Option<io::Error>) {
    /// Selects the two tag bits at the top of a byte.
    const TAG_MASK: u8 = 0b1100_0000;
    /// Tag bits carried by a continuation byte (`10xx_xxxx`).
    const CONT_TAG: u8 = 0b1000_0000;

    let mut counts = WordCount::default();
    let mut chunk = [0u8; BUF_SIZE];
    loop {
        // Resolve the read first so the counting below only sees valid data.
        let filled = match handle.read(&mut chunk) {
            Ok(0) => return (counts, None),
            Ok(len) => len,
            // An interrupted read transferred nothing; simply try again.
            Err(err) if err.kind() == ErrorKind::Interrupted => continue,
            Err(err) => return (counts, Some(err)),
        };

        // Each non-continuation byte contributes exactly one character.
        let starts: usize = chunk[..filled]
            .iter()
            .map(|&byte| usize::from(byte & TAG_MASK != CONT_TAG))
            .sum();
        counts.chars += starts;
    }
}

View file

@ -13,7 +13,7 @@ extern crate uucore;
mod count_fast;
mod countable;
mod word_count;
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast};
use count_fast::{count_bytes_and_lines_fast, count_bytes_fast, count_chars_fast};
use countable::WordCountable;
use unicode_width::UnicodeWidthChar;
use utf8::{BufReadDecoder, BufReadDecoderError};
@ -315,6 +315,7 @@ fn word_count_from_reader<T: WordCountable>(
) {
// Specialize scanning loop to improve the performance.
(false, false, false, false, false) => unreachable!(),
(false, true, false, false, false) => count_chars_fast(&mut reader),
(true, false, false, false, false) => {
// Fast path when only show_bytes is true.
let (bytes, error) = count_bytes_fast(&mut reader);