From 6a2539534f48e69ec7a63274b41387c9288f0e5f Mon Sep 17 00:00:00 2001
From: Antoine Stevan <44101798+amtoine@users.noreply.github.com>
Date: Fri, 20 Oct 2023 11:34:55 +0200
Subject: [PATCH] deprecate `size` to `str size` (#10772)

related to
- https://discord.com/channels/601130461678272522/614613939334152217/1164530991931605062

# Description
it appears `size` is a command that operates on `string`s only and gives
the user information about the chars, graphemes and bytes of a string.
this looks like a command that should be a subcommand to `str` :smirk:

this PR
- adds `str size`
- deprecates `size`

`size` is planned to be removed in 0.88

# User-Facing Changes
`str size` can be used for the same result as `size`.

# Tests + Formatting

# After Submitting
write a removal PR for `size`
---
 crates/nu-command/src/default_context.rs   |   1 +
 crates/nu-command/src/strings/size.rs      |  10 +
 crates/nu-command/src/strings/str_/mod.rs  |   2 +
 crates/nu-command/src/strings/str_/size.rs | 398 +++++++++++++++++++++
 4 files changed, 411 insertions(+)
 create mode 100644 crates/nu-command/src/strings/str_/size.rs

diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs
index a94ee8e1d7..9b78a8a87e 100644
--- a/crates/nu-command/src/default_context.rs
+++ b/crates/nu-command/src/default_context.rs
@@ -190,6 +190,7 @@ pub fn add_shell_command_context(mut engine_state: EngineState) -> EngineState {
             StrIndexOf,
             StrLength,
             StrReverse,
+            StrSize,
             StrStartsWith,
             StrSubstring,
             StrTrim,
diff --git a/crates/nu-command/src/strings/size.rs b/crates/nu-command/src/strings/size.rs
index 45b934a19f..918d4aca75 100644
--- a/crates/nu-command/src/strings/size.rs
+++ b/crates/nu-command/src/strings/size.rs
@@ -40,6 +40,16 @@ impl Command for Size {
         call: &Call,
         input: PipelineData,
     ) -> Result<PipelineData, ShellError> {
+        nu_protocol::report_error_new(
+            engine_state,
+            &ShellError::GenericError(
+                "Deprecated command".into(),
+                "`size` is deprecated and will be removed in 0.88.".into(),
+                Some(call.head),
+                Some("Use `str size` instead".into()),
+                vec![],
+            ),
+        );
         size(engine_state, call, input)
     }
 
diff --git a/crates/nu-command/src/strings/str_/mod.rs b/crates/nu-command/src/strings/str_/mod.rs
index f246720c14..f5c705f712 100644
--- a/crates/nu-command/src/strings/str_/mod.rs
+++ b/crates/nu-command/src/strings/str_/mod.rs
@@ -8,6 +8,7 @@ mod join;
 mod length;
 mod replace;
 mod reverse;
+mod size;
 mod starts_with;
 mod substring;
 mod trim;
@@ -22,6 +23,7 @@ pub use join::*;
 pub use length::SubCommand as StrLength;
 pub use replace::SubCommand as StrReplace;
 pub use reverse::SubCommand as StrReverse;
+pub use size::SubCommand as StrSize;
 pub use starts_with::SubCommand as StrStartsWith;
 pub use substring::SubCommand as StrSubstring;
 pub use trim::Trim as StrTrim;
diff --git a/crates/nu-command/src/strings/str_/size.rs b/crates/nu-command/src/strings/str_/size.rs
new file mode 100644
index 0000000000..4cfce108b1
--- /dev/null
+++ b/crates/nu-command/src/strings/str_/size.rs
@@ -0,0 +1,398 @@
+use fancy_regex::Regex;
+use nu_protocol::ast::Call;
+use nu_protocol::engine::{Command, EngineState, Stack};
+use nu_protocol::{
+    record, Category, Example, PipelineData, Record, ShellError, Signature, Span, Type, Value,
+};
+use std::collections::BTreeMap;
+use std::{fmt, str};
+use unicode_segmentation::UnicodeSegmentation;
+
+// borrowed liberally from here https://github.com/dead10ck/uwc
+pub type Counted = BTreeMap<Counter, usize>;
+
+#[derive(Clone)]
+pub struct SubCommand;
+
+impl Command for SubCommand {
+    fn name(&self) -> &str {
+        "str size"
+    }
+
+    fn signature(&self) -> Signature {
+        Signature::build("str size")
+            .category(Category::Strings)
+            .input_output_types(vec![(Type::String, Type::Record(vec![]))])
+    }
+
+    fn usage(&self) -> &str {
+        "Gather word count statistics on the text."
+    }
+
+    fn search_terms(&self) -> Vec<&str> {
+        vec!["count", "word", "character", "unicode", "wc"]
+    }
+
+    fn run(
+        &self,
+        engine_state: &EngineState,
+        _stack: &mut Stack,
+        call: &Call,
+        input: PipelineData,
+    ) -> Result<PipelineData, ShellError> {
+        size(engine_state, call, input)
+    }
+
+    fn examples(&self) -> Vec<Example> {
+        vec![
+            Example {
+                description: "Count the number of words in a string",
+                example: r#""There are seven words in this sentence" | str size"#,
+                result: Some(Value::test_record(Record {
+                    cols: vec![
+                        "lines".into(),
+                        "words".into(),
+                        "bytes".into(),
+                        "chars".into(),
+                        "graphemes".into(),
+                    ],
+                    vals: vec![
+                        Value::test_int(1),
+                        Value::test_int(7),
+                        Value::test_int(38),
+                        Value::test_int(38),
+                        Value::test_int(38),
+                    ],
+                })),
+            },
+            Example {
+                description: "Counts unicode characters",
+                example: r#"'今天天气真好' | str size "#,
+                result: Some(Value::test_record(Record {
+                    cols: vec![
+                        "lines".into(),
+                        "words".into(),
+                        "bytes".into(),
+                        "chars".into(),
+                        "graphemes".into(),
+                    ],
+                    vals: vec![
+                        Value::test_int(1),
+                        Value::test_int(6),
+                        Value::test_int(18),
+                        Value::test_int(6),
+                        Value::test_int(6),
+                    ],
+                })),
+            },
+            Example {
+                description: "Counts Unicode characters correctly in a string",
+                example: r#""Amélie Amelie" | str size"#,
+                result: Some(Value::test_record(Record {
+                    cols: vec![
+                        "lines".into(),
+                        "words".into(),
+                        "bytes".into(),
+                        "chars".into(),
+                        "graphemes".into(),
+                    ],
+                    vals: vec![
+                        Value::test_int(1),
+                        Value::test_int(2),
+                        Value::test_int(15),
+                        Value::test_int(14),
+                        Value::test_int(13),
+                    ],
+                })),
+            },
+        ]
+    }
+}
+
+fn size(
+    engine_state: &EngineState,
+    call: &Call,
+    input: PipelineData,
+) -> Result<PipelineData, ShellError> {
+    let span = call.head;
+    // This doesn't match explicit nulls
+    if matches!(input, PipelineData::Empty) {
+        return Err(ShellError::PipelineEmpty { dst_span: span });
+    }
+    input.map(
+        move |v| {
+            let value_span = v.span();
+            // First, obtain the span. If this fails, propagate the error that results.
+            if let Value::Error { error, .. } = v {
+                return Value::error(*error, span);
+            }
+            // Now, check if it's a string.
+            match v.as_string() {
+                Ok(s) => counter(&s, span),
+                Err(_) => Value::error(
+                    ShellError::PipelineMismatch {
+                        exp_input_type: "string".into(),
+                        dst_span: span,
+                        src_span: value_span,
+                    },
+                    span,
+                ),
+            }
+        },
+        engine_state.ctrlc.clone(),
+    )
+}
+
+fn counter(contents: &str, span: Span) -> Value {
+    let counts = uwc_count(&ALL_COUNTERS[..], contents);
+
+    fn get_count(counts: &BTreeMap<Counter, usize>, counter: Counter, span: Span) -> Value {
+        Value::int(counts.get(&counter).copied().unwrap_or(0) as i64, span)
+    }
+
+    let record = record! {
+        "lines" => get_count(&counts, Counter::Lines, span),
+        "words" => get_count(&counts, Counter::Words, span),
+        "bytes" => get_count(&counts, Counter::Bytes, span),
+        "chars" => get_count(&counts, Counter::CodePoints, span),
+        "graphemes" => get_count(&counts, Counter::GraphemeClusters, span),
+    };
+
+    Value::record(record, span)
+}
+
+/// Take all the counts in `other_counts` and sum them into `accum`.
+// pub fn sum_counts(accum: &mut Counted, other_counts: &Counted) {
+//     for (counter, count) in other_counts {
+//         let entry = accum.entry(*counter).or_insert(0);
+//         *entry += count;
+//     }
+// }
+
+/// Sums all the `Counted` instances into a new one.
+// pub fn sum_all_counts<'a, I>(counts: I) -> Counted
+// where
+//     I: IntoIterator<Item = &'a Counted>,
+// {
+//     let mut totals = BTreeMap::new();
+//     for counts in counts {
+//         sum_counts(&mut totals, counts);
+//     }
+//     totals
+// }
+
+/// Something that counts things in `&str`s.
+pub trait Count {
+    /// Counts something in the given `&str`.
+    fn count(&self, s: &str) -> usize;
+}
+
+impl Count for Counter {
+    fn count(&self, s: &str) -> usize {
+        match *self {
+            Counter::GraphemeClusters => s.graphemes(true).count(),
+            Counter::Bytes => s.len(),
+            Counter::Lines => {
+                const LF: &str = "\n"; // 0xe0000a
+                const CR: &str = "\r"; // 0xe0000d
+                const CRLF: &str = "\r\n"; // 0xe00d0a
+                const NEL: &str = "\u{0085}"; // 0x00c285
+                const FF: &str = "\u{000C}"; // 0x00000c
+                const LS: &str = "\u{2028}"; // 0xe280a8
+                const PS: &str = "\u{2029}"; // 0xe280a9
+
+                // use regex here because it can search for CRLF first and not duplicate the count
+                let line_ending_types = [CRLF, LF, CR, NEL, FF, LS, PS];
+                let pattern = &line_ending_types.join("|");
+                let newline_pattern = Regex::new(pattern).expect("Unable to create regex");
+                let line_endings = newline_pattern
+                    .find_iter(s)
+                    .map(|f| match f {
+                        Ok(mat) => mat.as_str().to_string(),
+                        Err(_) => "".to_string(),
+                    })
+                    .collect::<Vec<String>>();
+
+                let has_line_ending_suffix =
+                    line_ending_types.iter().any(|&suffix| s.ends_with(suffix));
+                // eprintln!("suffix = {}", has_line_ending_suffix);
+
+                if has_line_ending_suffix {
+                    line_endings.len()
+                } else {
+                    line_endings.len() + 1
+                }
+            }
+            Counter::Words => s.unicode_words().count(),
+            Counter::CodePoints => s.chars().count(),
+        }
+    }
+}
+
+/// Different types of counters.
+#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Copy, Clone)]
+pub enum Counter {
+    /// Counts lines.
+    Lines,
+
+    /// Counts words.
+    Words,
+
+    /// Counts the total number of bytes.
+    Bytes,
+
+    /// Counts grapheme clusters. The input is required to be valid UTF-8.
+    GraphemeClusters,
+
+    /// Counts unicode code points
+    CodePoints,
+}
+
+/// A convenience array of all counter types.
+pub const ALL_COUNTERS: [Counter; 5] = [
+    Counter::GraphemeClusters,
+    Counter::Bytes,
+    Counter::Lines,
+    Counter::Words,
+    Counter::CodePoints,
+];
+
+impl fmt::Display for Counter {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let s = match *self {
+            Counter::GraphemeClusters => "graphemes",
+            Counter::Bytes => "bytes",
+            Counter::Lines => "lines",
+            Counter::Words => "words",
+            Counter::CodePoints => "codepoints",
+        };
+
+        write!(f, "{s}")
+    }
+}
+
+/// Counts the given `Counter`s in the given `&str`.
+pub fn uwc_count<'a, I>(counters: I, s: &str) -> Counted
+where
+    I: IntoIterator<Item = &'a Counter>,
+{
+    let mut counts: Counted = counters.into_iter().map(|c| (*c, c.count(s))).collect();
+    if let Some(lines) = counts.get_mut(&Counter::Lines) {
+        if s.is_empty() {
+            // If s is empty, indeed, the count is 0
+            *lines = 0;
+        } else if *lines == 0 && !s.is_empty() {
+            // If s is not empty and the count is 0, it means there
+            // is a line without a line ending, so let's make it 1
+            *lines = 1;
+        } else {
+            // no change, whatever the count is, is right
+        }
+    }
+    counts
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_examples() {
+        use crate::test_examples;
+
+        test_examples(SubCommand {})
+    }
+}
+
+#[test]
+fn test_one_newline() {
+    let s = "\n".to_string();
+    let counts = uwc_count(&ALL_COUNTERS[..], &s);
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::Lines, 1);
+    correct_counts.insert(Counter::Words, 0);
+    correct_counts.insert(Counter::GraphemeClusters, 1);
+    correct_counts.insert(Counter::Bytes, 1);
+    correct_counts.insert(Counter::CodePoints, 1);
+
+    assert_eq!(correct_counts, counts);
+}
+
+#[test]
+fn test_count_counts_lines() {
+    // const LF: &str = "\n"; // 0xe0000a
+    // const CR: &str = "\r"; // 0xe0000d
+    // const CRLF: &str = "\r\n"; // 0xe00d0a
+    const NEL: &str = "\u{0085}"; // 0x00c285
+    const FF: &str = "\u{000C}"; // 0x00000c
+    const LS: &str = "\u{2028}"; // 0xe280a8
+    const PS: &str = "\u{2029}"; // 0xe280a9
+
+    // * \r\n is a single grapheme cluster
+    // * trailing newlines are counted
+    // * NEL is 2 bytes
+    // * FF is 1 byte
+    // * LS is 3 bytes
+    // * PS is 3 bytes
+    let mut s = String::from("foo\r\nbar\n\nbaz");
+    s += NEL;
+    s += "quux";
+    s += FF;
+    s += LS;
+    s += "xi";
+    s += PS;
+    s += "\n";
+
+    let counts = uwc_count(&ALL_COUNTERS[..], &s);
+
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::Lines, 8);
+    correct_counts.insert(Counter::Words, 5);
+    correct_counts.insert(Counter::GraphemeClusters, 23);
+    correct_counts.insert(Counter::Bytes, 29);
+
+    // one more than grapheme clusters because of \r\n
+    correct_counts.insert(Counter::CodePoints, 24);
+
+    assert_eq!(correct_counts, counts);
+}
+
+#[test]
+fn test_count_counts_words() {
+    let i_can_eat_glass = "Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα.";
+    let s = String::from(i_can_eat_glass);
+
+    let counts = uwc_count(&ALL_COUNTERS[..], &s);
+
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::GraphemeClusters, 50);
+    correct_counts.insert(Counter::Lines, 1);
+    correct_counts.insert(Counter::Bytes, i_can_eat_glass.len());
+    correct_counts.insert(Counter::Words, 9);
+    correct_counts.insert(Counter::CodePoints, 50);
+
+    assert_eq!(correct_counts, counts);
+}
+
+#[test]
+fn test_count_counts_codepoints() {
+    // these are NOT the same! One is e + ́́ , and one is é, a single codepoint
+    let one = "é";
+    let two = "é";
+
+    let counters = [Counter::CodePoints];
+
+    let counts = uwc_count(&counters[..], one);
+
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::CodePoints, 1);
+
+    assert_eq!(correct_counts, counts);
+
+    let counts = uwc_count(&counters[..], two);
+
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::CodePoints, 2);
+
+    assert_eq!(correct_counts, counts);
+}