Size: count Unicode graphemes as single char (#2482)

2024-12-27 21:43:09 +00:00 · 2020-09-02 09:54:00 -07:00 · 2020-09-02 09:54:00 -07:00 · 666e6a7b57
commit 666e6a7b57
parent 47c5346934
3 changed files with 32 additions and 14 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3093,6 +3093,7 @@ dependencies = [
 "trash",
 "typetag",
 "umask",
+ "unicode-segmentation",
 "unicode-xid",
 "url 2.1.1",
 "users",
--- a/crates/nu-cli/Cargo.toml
+++ b/crates/nu-cli/Cargo.toml
@ -87,6 +87,7 @@ termcolor = "1.1.0"
 toml = "0.5.6"
 typetag = "0.1.5"
 umask = "1.0.0"
+unicode-segmentation = "1.6.0"
 unicode-xid = "0.2.1"
 uuid_crate = {package = "uuid", version = "0.8.1", features = ["v4"], optional = true}
 which = {version = "4.0.2", optional = true}
--- a/crates/nu-cli/src/commands/size.rs
+++ b/crates/nu-cli/src/commands/size.rs
@ -1,8 +1,11 @@
+extern crate unicode_segmentation;
+
 use crate::commands::WholeStreamCommand;
 use crate::prelude::*;
 use indexmap::indexmap;
 use nu_errors::ShellError;
 use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value};
+use unicode_segmentation::UnicodeSegmentation;

 pub struct Size;

@ -29,17 +32,30 @@ impl WholeStreamCommand for Size {
    }

    fn examples(&self) -> Vec<Example> {
-        vec![Example {
-            description: "Count the number of words in a string",
-            example: r#"echo "There are seven words in this sentence" | size"#,
-            result: Some(vec![UntaggedValue::row(indexmap! {
-                "lines".to_string() => UntaggedValue::int(0).into(),
-                "words".to_string() => UntaggedValue::int(7).into(),
-                "chars".to_string() => UntaggedValue::int(38).into(),
-                "bytes".to_string() => UntaggedValue::int(38).into(),
-            })
-            .into()]),
-        }]
+        vec![
+            Example {
+                description: "Count the number of words in a string",
+                example: r#"echo "There are seven words in this sentence" | size"#,
+                result: Some(vec![UntaggedValue::row(indexmap! {
+                        "lines".to_string() => UntaggedValue::int(0).into(),
+                        "words".to_string() => UntaggedValue::int(7).into(),
+                        "chars".to_string() => UntaggedValue::int(38).into(),
+                        "bytes".to_string() => UntaggedValue::int(38).into(),
+                })
+                .into()]),
+            },
+            Example {
+                description: "Counts unicode characters correctly in a string",
+                example: r#"echo "Amélie Amelie" | size"#,
+                result: Some(vec![UntaggedValue::row(indexmap! {
+                        "lines".to_string() => UntaggedValue::int(0).into(),
+                        "words".to_string() => UntaggedValue::int(2).into(),
+                        "chars".to_string() => UntaggedValue::int(13).into(),
+                        "bytes".to_string() => UntaggedValue::int(15).into(),
+                })
+                .into()]),
+            },
+        ]
    }
 }

@ -72,15 +88,15 @@ fn count(contents: &str, tag: impl Into<Tag>) -> Value {
    let bytes = contents.len() as i64;
    let mut end_of_word = true;

-    for c in contents.chars() {
+    for c in UnicodeSegmentation::graphemes(contents, true) {
        chars += 1;

        match c {
-            '\n' => {
+            "\n" => {
                lines += 1;
                end_of_word = true;
            }
-            ' ' => end_of_word = true,
+            " " => end_of_word = true,
            _ => {
                if end_of_word {
                    words += 1;