add a split words command (#6363)

* add a split words command * changed regex
2024-12-27 21:43:09 +00:00 · 2022-08-19 12:55:54 -05:00 · 2022-08-19 12:55:54 -05:00 · 99c42582fe
commit 99c42582fe
parent 5a56d47f25
3 changed files with 324 additions and 0 deletions
--- a/crates/nu-command/src/default_context.rs
+++ b/crates/nu-command/src/default_context.rs
@ -184,6 +184,7 @@ pub fn create_default_context() -> EngineState {
            SplitChars,
            SplitColumn,
            SplitRow,
            SplitWords,
            Str,
            StrCamelCase,
            StrCapitalize,
--- a/crates/nu-command/src/strings/split/mod.rs
+++ b/crates/nu-command/src/strings/split/mod.rs
@ -3,9 +3,11 @@ pub mod column;
 pub mod command;
 pub mod list;
 pub mod row;
 pub mod words;
 pub use chars::SubCommand as SplitChars;
 pub use column::SubCommand as SplitColumn;
 pub use command::SplitCommand as Split;
 pub use list::SubCommand as SplitList;
 pub use row::SubCommand as SplitRow;
 pub use words::SubCommand as SplitWords;
--- a/crates/nu-command/src/strings/split/words.rs
+++ b/crates/nu-command/src/strings/split/words.rs
@ -0,0 +1,321 @@
 use fancy_regex::Regex;
 use nu_engine::CallExt;
 use nu_protocol::{
    ast::Call,
    engine::{Command, EngineState, Stack},
    Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
 };
 #[derive(Clone)]
 pub struct SubCommand;
 impl Command for SubCommand {
    fn name(&self) -> &str {
        "split words"
    }
    fn signature(&self) -> Signature {
        Signature::build("split words")
            .category(Category::Strings)
            // .switch(
            //     "ignore-hyphenated",
            //     "ignore hyphenated words, splitting at the hyphen",
            //     Some('i'),
            // )
            // .switch(
            //     "ignore-apostrophes",
            //     "ignore apostrophes in words by removing them",
            //     Some('a'),
            // )
            // .switch(
            //     "ignore-punctuation",
            //     "ignore punctuation around words by removing them",
            //     Some('p'),
            // )
            .named(
                "min-word-length",
                SyntaxShape::Int,
                "The minimum word length",
                Some('l'),
            )
    }
    fn usage(&self) -> &str {
        "Split a string's words into separate rows"
    }
    fn search_terms(&self) -> Vec<&str> {
        vec!["word", "separate", "divide"]
    }
    fn examples(&self) -> Vec<Example> {
        vec![
            Example {
                description: "Split the string's words into separate rows",
                example: "'hello world' | split words",
                result: Some(Value::List {
                    vals: vec![Value::test_string("hello"), Value::test_string("world")],
                    span: Span::test_data(),
                }),
            },
            Example {
                description:
                    "Split the string's words, of at least 3 characters, into separate rows",
                example: "'hello to the world' | split words -l 3",
                result: Some(Value::List {
                    vals: vec![
                        Value::test_string("hello"),
                        Value::test_string("the"),
                        Value::test_string("world"),
                    ],
                    span: Span::test_data(),
                }),
            },
        ]
    }
    fn run(
        &self,
        engine_state: &EngineState,
        stack: &mut Stack,
        call: &Call,
        input: PipelineData,
    ) -> Result<nu_protocol::PipelineData, nu_protocol::ShellError> {
        split_words(engine_state, stack, call, input)
    }
 }
 fn split_words(
    engine_state: &EngineState,
    stack: &mut Stack,
    call: &Call,
    input: PipelineData,
 ) -> Result<nu_protocol::PipelineData, nu_protocol::ShellError> {
    let span = call.head;
    // let ignore_hyphenated = call.has_flag("ignore-hyphenated");
    // let ignore_apostrophes = call.has_flag("ignore-apostrophes");
    // let ignore_punctuation = call.has_flag("ignore-punctuation");
    let word_length: Option<usize> = call.get_flag(engine_state, stack, "min-word-length")?;
    input.flat_map(
        move |x| split_words_helper(&x, word_length, span),
        engine_state.ctrlc.clone(),
    )
 }
 fn split_words_helper(v: &Value, word_length: Option<usize>, span: Span) -> Vec<Value> {
    // There are some options here with this regex.
    // [^A-Za-z\'] = do not match uppercase or lowercase letters or apostrophes
    // [^[:alpha:]\'] = do not match any uppercase or lowercase letters or apostrophes
    // [^\p{L}\'] = do not match any unicode uppercase or lowercase letters or apostrophes
    // Let's go with the unicode one in hopes that it works on more than just ascii characters
    let regex_replace = Regex::new(r"[^\p{L}\']").expect("regular expression error");
    match v.span() {
        Ok(v_span) => {
            if let Ok(s) = v.as_string() {
                // let splits = s.unicode_words();
                // let words = trim_to_words(s);
                // let words: Vec<&str> = s.split_whitespace().collect();
                let replaced_string = regex_replace.replace_all(&s, " ").to_string();
                replaced_string
                    .split(' ')
                    .filter_map(|s| {
                        if s.trim() != "" {
                            if let Some(len) = word_length {
                                if s.chars().count() >= len {
                                    Some(Value::string(s, v_span))
                                } else {
                                    None
                                }
                            } else {
                                Some(Value::string(s, v_span))
                            }
                        } else {
                            None
                        }
                    })
                    .collect()
            } else {
                vec![Value::Error {
                    error: ShellError::PipelineMismatch("string".into(), span, v_span),
                }]
            }
        }
        Err(error) => vec![Value::Error { error }],
    }
 }
 // original at least 1 char long
 // curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{1,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
 // benchmark INCLUDING DOWNLOAD: 1sec 253ms 91µs 511ns
 //    1839 the
 //     942 and
 //     811 to
 //     695 a
 //     638 of
 //     610 it
 //     553 she
 //     546 i
 //     486 you
 //     462 said
 // original at least 2 chars long
 // curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{2,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
 //    1839 the
 //     942 and
 //     811 to
 //     638 of
 //     610 it
 //     553 she
 //     486 you
 //     462 said
 //     435 in
 //     403 alice
 // The regex below replaces every character that is not A-Z, a-z, or an apostrophe with a space
 // ❯ $contents | str replace "[^A-Za-z\']" " " -a | split row ' ' | where ($it | str length) > 1 | uniq -i -c | sort-by count --reverse | first 10
 // benchmark: 1sec 775ms 471µs 600ns
 // ╭───┬───────┬───────╮
 // │ # │ value │ count │
 // ├───┼───────┼───────┤
 // │ 0 │ the   │  1839 │
 // │ 1 │ and   │   942 │
 // │ 2 │ to    │   811 │
 // │ 3 │ of    │   638 │
 // │ 4 │ it    │   610 │
 // │ 5 │ she   │   553 │
 // │ 6 │ you   │   486 │
 // │ 7 │ said  │   462 │
 // │ 8 │ in    │   435 │
 // │ 9 │ alice │   403 │
 // ╰───┴───────┴───────╯
 // $alice |str replace "[^A-Za-z\']" " " -a | split row ' ' | uniq -i -c | sort-by count --reverse | first 10
 // benchmark: 1sec 518ms 701µs 200ns
 // ╭───┬───────┬───────╮
 // │ # │ value │ count │
 // ├───┼───────┼───────┤
 // │ 0 │ the   │  1839 │
 // │ 1 │ and   │   942 │
 // │ 2 │ to    │   811 │
 // │ 3 │ a     │   695 │
 // │ 4 │ of    │   638 │
 // │ 5 │ it    │   610 │
 // │ 6 │ she   │   553 │
 // │ 7 │ i     │   546 │
 // │ 8 │ you   │   486 │
 // │ 9 │ said  │   462 │
 // ├───┼───────┼───────┤
 // │ # │ value │ count │
 // ╰───┴───────┴───────╯
 // s.unicode_words()
 // $alice | str downcase | split words | sort | uniq -c | sort-by count | reverse | first 10
 // benchmark: 4sec 965ms 285µs 800ns
 // ╭───┬───────┬───────╮
 // │ # │ value │ count │
 // ├───┼───────┼───────┤
 // │ 0 │ the   │  1839 │
 // │ 1 │ and   │   941 │
 // │ 2 │ to    │   811 │
 // │ 3 │ a     │   695 │
 // │ 4 │ of    │   638 │
 // │ 5 │ it    │   542 │
 // │ 6 │ she   │   538 │
 // │ 7 │ said  │   460 │
 // │ 8 │ in    │   434 │
 // │ 9 │ you   │   426 │
 // ├───┼───────┼───────┤
 // │ # │ value │ count │
 // ╰───┴───────┴───────╯
 // trim_to_words
 // benchmark: 5sec 992ms 76µs 200ns
 // ╭───┬───────┬───────╮
 // │ # │ value │ count │
 // ├───┼───────┼───────┤
 // │ 0 │ the   │  1829 │
 // │ 1 │ and   │   918 │
 // │ 2 │ to    │   801 │
 // │ 3 │ a     │   689 │
 // │ 4 │ of    │   632 │
 // │ 5 │ she   │   537 │
 // │ 6 │ it    │   493 │
 // │ 7 │ said  │   457 │
 // │ 8 │ in    │   430 │
 // │ 9 │ you   │   413 │
 // ├───┼───────┼───────┤
 // │ # │ value │ count │
 // ╰───┴───────┴───────╯
 // fn trim_to_words(content: String) -> std::vec::Vec<std::string::String> {
 //     let content: Vec<String> = content
 //         .to_lowercase()
 //         .replace(&['-'][..], " ")
 //         //should 's be replaced?
 //         .replace("'s", "")
 //         .replace(
 //             &[
 //                 '(', ')', ',', '\"', '.', ';', ':', '=', '[', ']', '{', '}', '-', '_', '/', '\'',
 //                 '’', '?', '!', '“', '‘',
 //             ][..],
 //             "",
 //         )
 //         .split_whitespace()
 //         .map(String::from)
 //         .collect::<Vec<String>>();
 //     content
 // }
 // split_whitespace()
 // benchmark: 9sec 379ms 790µs 900ns
 // ╭───┬───────┬───────╮
 // │ # │ value │ count │
 // ├───┼───────┼───────┤
 // │ 0 │ the   │  1683 │
 // │ 1 │ and   │   783 │
 // │ 2 │ to    │   778 │
 // │ 3 │ a     │   667 │
 // │ 4 │ of    │   605 │
 // │ 5 │ she   │   485 │
 // │ 6 │ said  │   416 │
 // │ 7 │ in    │   406 │
 // │ 8 │ it    │   357 │
 // │ 9 │ was   │   329 │
 // ├───┼───────┼───────┤
 // │ # │ value │ count │
 // ╰───┴───────┴───────╯
 // current implementation (regex replace, then split on ' ')
 // $alice | str downcase | split words | uniq -c | sort-by count --reverse | first 10
 // benchmark: 1sec 481ms 604µs 700ns
 // ╭───┬───────┬───────╮
 // │ # │ value │ count │
 // ├───┼───────┼───────┤
 // │ 0 │ the   │  1839 │
 // │ 1 │ and   │   942 │
 // │ 2 │ to    │   811 │
 // │ 3 │ a     │   695 │
 // │ 4 │ of    │   638 │
 // │ 5 │ it    │   610 │
 // │ 6 │ she   │   553 │
 // │ 7 │ i     │   546 │
 // │ 8 │ you   │   486 │
 // │ 9 │ said  │   462 │
 // ├───┼───────┼───────┤
 // │ # │ value │ count │
 // ╰───┴───────┴───────╯
 #[cfg(test)]
 mod test {
    use super::SubCommand;

    /// Runs the crate's `test_examples` helper against this command.
    #[test]
    fn test_examples() {
        crate::test_examples(SubCommand)
    }
 }