diff --git a/src/uu/tr/src/operation.rs b/src/uu/tr/src/operation.rs index 960ab7ada..2ff43b2a5 100644 --- a/src/uu/tr/src/operation.rs +++ b/src/uu/tr/src/operation.rs @@ -26,10 +26,132 @@ mod unicode_table { pub static BLANK: &'static [char] = &[SPACE, HT]; } +struct Repeat(char); + +impl Repeat { + fn new(element: char) -> Repeat { + Repeat(element) + } +} + +impl Iterator for Repeat { + type Item = char; + + fn next(&mut self) -> Option { + Some(self.0) + } + + fn last(self) -> Option { + Some(self.0) + } + + fn any(&mut self, mut f: F) -> bool + where + Self: Sized, + F: FnMut(Self::Item) -> bool, + { + f(self.0) + } +} + +fn truncate_iterator(input: Option) -> impl Fn((usize, T)) -> Option { + move |(idx, c)| match input { + Some(s) => match s.cmp(&idx) { + std::cmp::Ordering::Greater => Some(c), + _ => None, + }, + None => Some(c), + } +} + +#[derive(Debug, Clone, Copy)] pub enum Sequence { Char(char), - CharRange(Box>), + CharRange(u32, u32), CharStar(char), + CharRepeat(char, usize), + Alnum, + Alpha, + Blank, + Control, + Digit, + Graph, + Lower, + Print, + Punct, + Space, + Upper, + Xdigit, +} + +impl Sequence { + pub fn flatten(&self) -> Box> { + match self { + Sequence::Char(c) => Box::new(std::iter::once(*c)), + Sequence::CharRange(l, r) => Box::new((*l..=*r).flat_map(char::from_u32)), + Sequence::CharStar(c) => Box::new(Repeat::new(*c)), + Sequence::CharRepeat(c, n) => Box::new(Repeat::new(*c).take(*n)), + Sequence::Alnum => Box::new(('0'..='9').chain('A'..='Z').chain('a'..='z')), + Sequence::Alpha => Box::new(('A'..='Z').chain('a'..='z')), + Sequence::Blank => Box::new(unicode_table::BLANK.into_iter().cloned()), + Sequence::Control => Box::new( + (0..=31) + .chain(std::iter::once(127)) + .flat_map(char::from_u32), + ), + Sequence::Digit => Box::new('0'..='9'), + Sequence::Graph => Box::new( + (48..=57) // digit + .chain(65..=90) // uppercase + .chain(97..=122) // lowercase + // punctuations + .chain(33..=47) + .chain(58..=64) + .chain(91..=96) + .chain(123..=126) + .chain(std::iter::once(32)) // space + .flat_map(char::from_u32), + ), + Sequence::Lower => Box::new('a'..='z'), + Sequence::Print => Box::new( + (48..=57) // digit + .chain(65..=90) // uppercase + .chain(97..=122) // lowercase + // punctuations + .chain(33..=47) + .chain(58..=64) + .chain(91..=96) + .chain(123..=126) + .flat_map(char::from_u32), + ), + Sequence::Punct => Box::new( + (33..=47) + .chain(58..=64) + .chain(91..=96) + .chain(123..=126) + .flat_map(char::from_u32), + ), + Sequence::Space => Box::new(unicode_table::SPACES.into_iter().cloned()), + Sequence::Upper => Box::new('A'..='Z'), + Sequence::Xdigit => Box::new(('0'..='9').chain('A'..='F').chain('a'..='f')), + } + } + + pub fn last(&self) -> Option { + match self { + Sequence::CharStar(c) => Some(*c), + // TODO: Can be optimized further... + rest => rest.flatten().last(), + } + } + + pub fn len(&self) -> Option { + match self { + Sequence::CharStar(_) => None, + // TODO: Is there a fix for this? + rest => Some(rest.flatten().count()), + } + } } impl Sequence { @@ -70,16 +192,6 @@ impl Sequence { .unwrap() } - pub fn dissolve(self) -> Box> { - match self { - Sequence::Char(c) => Box::new(std::iter::once(c)), - Sequence::CharRange(r) => r, - Sequence::CharStar(c) => Box::new(std::iter::repeat(c)), - } - } - - /// Sequence parsers - fn parse_char(input: &str) -> IResult<&str, Sequence> { anychar(input).map(|(l, r)| (l, Sequence::Char(r))) } @@ -115,7 +227,7 @@ impl Sequence { separated_pair(anychar, tag("-"), anychar)(input).map(|(l, (a, b))| { (l, { let (start, end) = (u32::from(a), u32::from(b)); - Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) + Sequence::CharRange(start, end) }) }) } @@ -129,7 +241,7 @@ impl Sequence { .map(|(l, (a, b))| { (l, { let (start, end) = (u32::from(a), u32::from(b)); - Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) + Sequence::CharRange(start, end) }) }) } @@ -143,7 +255,7 @@ impl Sequence { .map(|(l, (a, b))| { (l, { let (start, end) = (u32::from_str_radix(a, 8).unwrap(), u32::from(b)); - Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) + Sequence::CharRange(start, end) }) }) } @@ -157,7 +269,7 @@ impl Sequence { .map(|(l, (a, b))| { (l, { let (start, end) = (u32::from(a), u32::from_str_radix(b, 8).unwrap()); - Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) + Sequence::CharRange(start, end) }) }) } @@ -174,7 +286,7 @@ impl Sequence { u32::from_str_radix(a, 8).unwrap(), u32::from_str_radix(b, 8).unwrap(), ); - Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) + Sequence::CharRange(start, end) }) }) } @@ -189,136 +301,55 @@ impl Sequence { separated_pair(anychar, tag("*"), digit1), tag("]"), )(input) - .map(|(l, (c, n))| { - ( - l, - Sequence::CharRange(Box::new(std::iter::repeat(c).take(n.parse().unwrap()))), - ) - }) + .map(|(l, (c, n))| (l, Sequence::CharRepeat(c, n.parse().unwrap()))) } fn parse_alnum(input: &str) -> IResult<&str, Sequence> { - tag("[:alnum:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new(('0'..='9').chain('A'..='Z').chain('a'..='z'))), - ) - }) + tag("[:alnum:]")(input).map(|(l, _)| (l, Sequence::Alnum)) } fn parse_alpha(input: &str) -> IResult<&str, Sequence> { - tag("[:alpha:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new(('A'..='Z').chain('a'..='z'))), - ) - }) + tag("[:alpha:]")(input).map(|(l, _)| (l, Sequence::Alpha)) } fn parse_blank(input: &str) -> IResult<&str, Sequence> { - tag("[:blank:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new(unicode_table::BLANK.into_iter().cloned())), - ) - }) + tag("[:blank:]")(input).map(|(l, _)| (l, Sequence::Blank)) } fn parse_control(input: &str) -> IResult<&str, Sequence> { - tag("[:cntrl:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new( - (0..=31) - .chain(std::iter::once(127)) - .flat_map(char::from_u32), - )), - ) - }) + tag("[:cntrl:]")(input).map(|(l, _)| (l, Sequence::Control)) } fn parse_digit(input: &str) -> IResult<&str, Sequence> { - tag("[:digit:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('0'..='9')))) + tag("[:digit:]")(input).map(|(l, _)| (l, Sequence::Digit)) } fn parse_graph(input: &str) -> IResult<&str, Sequence> { - tag("[:graph:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new( - (48..=57) // digit - .chain(65..=90) // uppercase - .chain(97..=122) // lowercase - // punctuations - .chain(33..=47) - .chain(58..=64) - .chain(91..=96) - .chain(123..=126) - .chain(std::iter::once(32)) // space - .flat_map(char::from_u32), - )), - ) - }) + tag("[:graph:]")(input).map(|(l, _)| (l, Sequence::Graph)) } fn parse_lower(input: &str) -> IResult<&str, Sequence> { - tag("[:lower:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('a'..='z')))) + tag("[:lower:]")(input).map(|(l, _)| (l, Sequence::Lower)) } fn parse_print(input: &str) -> IResult<&str, Sequence> { - tag("[:print:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new( - (48..=57) // digit - .chain(65..=90) // uppercase - .chain(97..=122) // lowercase - // punctuations - .chain(33..=47) - .chain(58..=64) - .chain(91..=96) - .chain(123..=126) - .flat_map(char::from_u32), - )), - ) - }) + tag("[:print:]")(input).map(|(l, _)| (l, Sequence::Print)) } fn parse_punct(input: &str) -> IResult<&str, Sequence> { - tag("[:punct:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new( - (33..=47) - .chain(58..=64) - .chain(91..=96) - .chain(123..=126) - .flat_map(char::from_u32), - )), - ) - }) + tag("[:punct:]")(input).map(|(l, _)| (l, Sequence::Punct)) } fn parse_space(input: &str) -> IResult<&str, Sequence> { - tag("[:space:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new(unicode_table::SPACES.into_iter().cloned())), - ) - }) + tag("[:space:]")(input).map(|(l, _)| (l, Sequence::Space)) } fn parse_upper(input: &str) -> IResult<&str, Sequence> { - tag("[:upper:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('A'..='Z')))) + tag("[:upper:]")(input).map(|(l, _)| (l, Sequence::Upper)) } fn parse_xdigit(input: &str) -> IResult<&str, Sequence> { - tag("[:xdigit:]")(input).map(|(l, _)| { - ( - l, - Sequence::CharRange(Box::new(('0'..='9').chain('A'..='F').chain('a'..='f'))), - ) - }) + tag("[:xdigit:]")(input).map(|(l, _)| (l, Sequence::Xdigit)) } fn parse_char_equal(input: &str) -> IResult<&str, Sequence> { @@ -339,10 +370,7 @@ pub struct DeleteOperation { impl DeleteOperation { pub fn new(set: Vec, complement_flag: bool) -> DeleteOperation { DeleteOperation { - set: set - .into_iter() - .flat_map(Sequence::dissolve) - .collect::>(), + set: set.iter().flat_map(Sequence::flatten).collect::>(), complement_flag, } } @@ -355,21 +383,30 @@ impl SymbolTranslator for DeleteOperation { } } -#[derive(Debug)] pub struct TranslateOperationComplement { iter: u32, set1: Vec, - set2: Vec, + set2: Box>, fallback: char, translation_map: HashMap, } impl TranslateOperationComplement { - fn new(set1: Vec, set2: Vec, fallback: char) -> TranslateOperationComplement { + fn new( + set1: Vec, + set2: Vec, + set1_truncate_length: Option, + fallback: char, + ) -> TranslateOperationComplement { TranslateOperationComplement { iter: 0, - set1, - set2: set2.into_iter().rev().collect(), + set1: set1 + .iter() + .flat_map(Sequence::flatten) + .enumerate() + .filter_map(truncate_iterator(set1_truncate_length)) + .collect(), + set2: Box::new(set2.into_iter().flat_map(|c| Sequence::flatten(&c))), fallback, translation_map: HashMap::new(), } @@ -382,61 +419,83 @@ pub struct TranslateOperationStandard { } impl TranslateOperationStandard { - fn new(set1: Vec, set2: Vec, fallback: char) -> TranslateOperationStandard { + fn new( + set1: Vec, + set2: Vec, + set1_truncate_length: Option, + fallback: char, + ) -> TranslateOperationStandard { TranslateOperationStandard { translation_map: set1 - .into_iter() - .zip(set2.into_iter().chain(std::iter::repeat(fallback))) + .iter() + .flat_map(Sequence::flatten) + .zip( + set2.iter() + .flat_map(Sequence::flatten) + .chain(Repeat(fallback)), + ) + .enumerate() + .filter_map(truncate_iterator(set1_truncate_length)) .collect::>(), } } } -#[derive(Debug)] pub enum TranslateOperation { Standard(TranslateOperationStandard), Complement(TranslateOperationComplement), } impl TranslateOperation { - fn next_complement_char(mut iter: u32, ignore_list: &[char]) -> (u32, char) { - while (char::from_u32(iter).is_none() - || ignore_list - .iter() - .map(|c| u32::from(*c)) - .any(|c| iter.eq(&c))) - && iter.ne(&u32::MAX) - { - iter = iter.saturating_add(1) - } - (iter.saturating_add(1), char::from_u32(iter).unwrap()) + fn next_complement_char(iter: u32, ignore_list: &[char]) -> (u32, char) { + (iter..) + .filter_map(char::from_u32) + .filter(|c| !ignore_list.iter().any(|s| s.eq(c))) + .map(|c| (u32::from(c) + 1, c)) + .next() + .expect("exhausted all possible characters") } } impl TranslateOperation { pub fn new( - pset1: Vec, - pset2: Vec, + set1: Vec, + set2: Vec, truncate_set1: bool, complement: bool, ) -> TranslateOperation { - // TODO: Only some translation is acceptable i.e. uppercase/lowercase transform. - let mut set1 = pset1 - .into_iter() - .flat_map(Sequence::dissolve) - .collect::>(); - let set2 = pset2 - .into_iter() - .flat_map(Sequence::dissolve) - .collect::>(); - let fallback = set2.last().cloned().unwrap(); - if truncate_set1 { - set1.truncate(set2.len()); - } - if complement { - TranslateOperation::Complement(TranslateOperationComplement::new(set1, set2, fallback)) + let fallback = set2 + .iter() + .rev() + .next() + .map(Sequence::last) + .flatten() + .unwrap(); + let set1_truncate_length = if truncate_set1 { + set2.iter() + .map(Sequence::len) + .reduce(|a, b| match (a, b) { + (Some(l), Some(r)) => Some(l + r), + _ => None, + }) + .flatten() } else { - TranslateOperation::Standard(TranslateOperationStandard::new(set1, set2, fallback)) + None + }; + if complement { + TranslateOperation::Complement(TranslateOperationComplement::new( + set1, + set2, + set1_truncate_length, + fallback, + )) + } else { + TranslateOperation::Standard(TranslateOperationStandard::new( + set1, + set2, + set1_truncate_length, + fallback, + )) } } } @@ -466,7 +525,7 @@ impl SymbolTranslator for TranslateOperation { Some(*c) } else { while translation_map.get(¤t).is_none() { - if let Some(p) = set2.pop() { + if let Some(p) = set2.next() { let (next_index, next_value) = TranslateOperation::next_complement_char(*iter, &*set1); *iter = next_index; @@ -484,18 +543,15 @@ impl SymbolTranslator for TranslateOperation { #[derive(Debug, Clone)] pub struct SqueezeOperation { - squeeze_set: Vec, + set1: Vec, complement: bool, previous: Option, } impl SqueezeOperation { - pub fn new(squeeze_set: Vec, complement: bool) -> SqueezeOperation { + pub fn new(set1: Vec, complement: bool) -> SqueezeOperation { SqueezeOperation { - squeeze_set: squeeze_set - .into_iter() - .flat_map(Sequence::dissolve) - .collect(), + set1: set1.iter().flat_map(Sequence::flatten).collect(), complement, previous: None, } @@ -505,7 +561,7 @@ impl SqueezeOperation { impl SymbolTranslator for SqueezeOperation { fn translate(&mut self, current: char) -> Option { if self.complement { - let next = if self.squeeze_set.iter().any(|c| c.eq(¤t)) { + let next = if self.set1.iter().any(|c| c.eq(¤t)) { Some(current) } else { match self.previous { @@ -526,7 +582,7 @@ impl SymbolTranslator for SqueezeOperation { self.previous = Some(current); next } else { - let next = if self.squeeze_set.iter().any(|c| c.eq(¤t)) { + let next = if self.set1.iter().any(|c| c.eq(¤t)) { match self.previous { Some(v) if v == current => None, _ => Some(current), @@ -542,7 +598,7 @@ impl SymbolTranslator for SqueezeOperation { pub fn translate_input(input: &mut R, output: &mut W, mut translator: T) where - T: SymbolTranslator + Debug, + T: SymbolTranslator, R: BufRead, W: Write, {