Size: count unicode graphemes as single char (#2482)

This commit is contained in:
Chris Gillespie 2020-09-02 09:54:00 -07:00 committed by GitHub
parent 47c5346934
commit 666e6a7b57
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 32 additions and 14 deletions

1
Cargo.lock generated
View file

@ -3093,6 +3093,7 @@ dependencies = [
"trash", "trash",
"typetag", "typetag",
"umask", "umask",
"unicode-segmentation",
"unicode-xid", "unicode-xid",
"url 2.1.1", "url 2.1.1",
"users", "users",

View file

@ -87,6 +87,7 @@ termcolor = "1.1.0"
toml = "0.5.6" toml = "0.5.6"
typetag = "0.1.5" typetag = "0.1.5"
umask = "1.0.0" umask = "1.0.0"
unicode-segmentation = "1.6.0"
unicode-xid = "0.2.1" unicode-xid = "0.2.1"
uuid_crate = {package = "uuid", version = "0.8.1", features = ["v4"], optional = true} uuid_crate = {package = "uuid", version = "0.8.1", features = ["v4"], optional = true}
which = {version = "4.0.2", optional = true} which = {version = "4.0.2", optional = true}

View file

@ -1,8 +1,11 @@
extern crate unicode_segmentation;
use crate::commands::WholeStreamCommand; use crate::commands::WholeStreamCommand;
use crate::prelude::*; use crate::prelude::*;
use indexmap::indexmap; use indexmap::indexmap;
use nu_errors::ShellError; use nu_errors::ShellError;
use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value}; use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value};
use unicode_segmentation::UnicodeSegmentation;
pub struct Size; pub struct Size;
@ -29,17 +32,30 @@ impl WholeStreamCommand for Size {
} }
fn examples(&self) -> Vec<Example> { fn examples(&self) -> Vec<Example> {
vec![Example { vec![
description: "Count the number of words in a string", Example {
example: r#"echo "There are seven words in this sentence" | size"#, description: "Count the number of words in a string",
result: Some(vec![UntaggedValue::row(indexmap! { example: r#"echo "There are seven words in this sentence" | size"#,
"lines".to_string() => UntaggedValue::int(0).into(), result: Some(vec![UntaggedValue::row(indexmap! {
"words".to_string() => UntaggedValue::int(7).into(), "lines".to_string() => UntaggedValue::int(0).into(),
"chars".to_string() => UntaggedValue::int(38).into(), "words".to_string() => UntaggedValue::int(7).into(),
"bytes".to_string() => UntaggedValue::int(38).into(), "chars".to_string() => UntaggedValue::int(38).into(),
}) "bytes".to_string() => UntaggedValue::int(38).into(),
.into()]), })
}] .into()]),
},
Example {
description: "Counts unicode characters correctly in a string",
example: r#"echo "Amélie Amelie" | size"#,
result: Some(vec![UntaggedValue::row(indexmap! {
"lines".to_string() => UntaggedValue::int(0).into(),
"words".to_string() => UntaggedValue::int(2).into(),
"chars".to_string() => UntaggedValue::int(13).into(),
"bytes".to_string() => UntaggedValue::int(15).into(),
})
.into()]),
},
]
} }
} }
@ -72,15 +88,15 @@ fn count(contents: &str, tag: impl Into<Tag>) -> Value {
let bytes = contents.len() as i64; let bytes = contents.len() as i64;
let mut end_of_word = true; let mut end_of_word = true;
for c in contents.chars() { for c in UnicodeSegmentation::graphemes(contents, true) {
chars += 1; chars += 1;
match c { match c {
'\n' => { "\n" => {
lines += 1; lines += 1;
end_of_word = true; end_of_word = true;
} }
' ' => end_of_word = true, " " => end_of_word = true,
_ => { _ => {
if end_of_word { if end_of_word {
words += 1; words += 1;