Size: count unicode graphemes as single char (#2482)

This commit is contained in:
Chris Gillespie 2020-09-02 09:54:00 -07:00 committed by GitHub
parent 47c5346934
commit 666e6a7b57
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 32 additions and 14 deletions

1
Cargo.lock generated
View file

@ -3093,6 +3093,7 @@ dependencies = [
"trash",
"typetag",
"umask",
"unicode-segmentation",
"unicode-xid",
"url 2.1.1",
"users",

View file

@ -87,6 +87,7 @@ termcolor = "1.1.0"
toml = "0.5.6"
typetag = "0.1.5"
umask = "1.0.0"
unicode-segmentation = "1.6.0"
unicode-xid = "0.2.1"
uuid_crate = {package = "uuid", version = "0.8.1", features = ["v4"], optional = true}
which = {version = "4.0.2", optional = true}

View file

@ -1,8 +1,11 @@
extern crate unicode_segmentation;
use crate::commands::WholeStreamCommand;
use crate::prelude::*;
use indexmap::indexmap;
use nu_errors::ShellError;
use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value};
use unicode_segmentation::UnicodeSegmentation;
pub struct Size;
@ -29,7 +32,8 @@ impl WholeStreamCommand for Size {
}
fn examples(&self) -> Vec<Example> {
vec![Example {
vec![
Example {
description: "Count the number of words in a string",
example: r#"echo "There are seven words in this sentence" | size"#,
result: Some(vec![UntaggedValue::row(indexmap! {
@ -39,7 +43,19 @@ impl WholeStreamCommand for Size {
"bytes".to_string() => UntaggedValue::int(38).into(),
})
.into()]),
}]
},
Example {
description: "Counts unicode characters correctly in a string",
example: r#"echo "Amélie Amelie" | size"#,
result: Some(vec![UntaggedValue::row(indexmap! {
"lines".to_string() => UntaggedValue::int(0).into(),
"words".to_string() => UntaggedValue::int(2).into(),
"chars".to_string() => UntaggedValue::int(13).into(),
"bytes".to_string() => UntaggedValue::int(15).into(),
})
.into()]),
},
]
}
}
@ -72,15 +88,15 @@ fn count(contents: &str, tag: impl Into<Tag>) -> Value {
let bytes = contents.len() as i64;
let mut end_of_word = true;
for c in contents.chars() {
for c in UnicodeSegmentation::graphemes(contents, true) {
chars += 1;
match c {
'\n' => {
"\n" => {
lines += 1;
end_of_word = true;
}
' ' => end_of_word = true,
" " => end_of_word = true,
_ => {
if end_of_word {
words += 1;