Attempting to fix star expansion

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>
This commit is contained in:
Hanif Bin Ariffin 2021-07-24 22:06:19 +08:00
parent d5dbedb2e4
commit 279a7cf6b3

View file

@ -26,10 +26,132 @@ mod unicode_table {
pub static BLANK: &'static [char] = &[SPACE, HT]; pub static BLANK: &'static [char] = &[SPACE, HT];
} }
/// Endless iterator that hands out one fixed character on every step.
///
/// `last` and `any` are overridden so they answer immediately from the
/// stored character instead of stepping through a stream that never ends.
struct Repeat(char);

impl Repeat {
    /// Builds an iterator that repeats `element` forever.
    fn new(element: char) -> Repeat {
        Self(element)
    }
}

impl Iterator for Repeat {
    type Item = char;

    /// Always yields the stored character; the iterator is never exhausted.
    fn next(&mut self) -> Option<char> {
        let Repeat(repeated) = *self;
        Some(repeated)
    }

    /// Reports the stored character as the final item without iterating.
    fn last(self) -> Option<char> {
        let Repeat(repeated) = self;
        Some(repeated)
    }

    /// Evaluates `f` exactly once against the stored character.
    fn any<F>(&mut self, mut f: F) -> bool
    where
        Self: Sized,
        F: FnMut(char) -> bool,
    {
        let Repeat(repeated) = *self;
        f(repeated)
    }
}
/// Builds a closure for `filter_map` over an enumerated iterator that keeps
/// only the leading items.
///
/// With `input` set to `Some(limit)`, the closure passes through items whose
/// index is strictly below `limit` and drops the rest. With `None`, every
/// item is kept.
fn truncate_iterator<T>(input: Option<usize>) -> impl Fn((usize, T)) -> Option<T> {
    move |(position, item)| {
        let keep = input.map_or(true, |limit| position < limit);
        if keep {
            Some(item)
        } else {
            None
        }
    }
}
#[derive(Debug, Clone, Copy)]
pub enum Sequence { pub enum Sequence {
Char(char), Char(char),
CharRange(Box<dyn Iterator<Item = char>>), CharRange(u32, u32),
CharStar(char), CharStar(char),
CharRepeat(char, usize),
Alnum,
Alpha,
Blank,
Control,
Digit,
Graph,
Lower,
Print,
Punct,
Space,
Upper,
Xdigit,
}
impl Sequence {
/// Expands this sequence into the characters it stands for.
///
/// Each call builds a fresh boxed iterator from the variant's own data, so
/// the sequence can be flattened any number of times. `CharStar` produces
/// an endless stream; `CharRepeat(c, n)` yields `c` exactly `n` times.
///
/// The numeric ranges used for the character classes are ASCII code points.
/// Following the POSIX class definitions, `Graph` contains the visible
/// characters only (alphanumerics and punctuation), while `Print` is the
/// same set plus the space character.
pub fn flatten(&self) -> Box<dyn Iterator<Item = char>> {
    match self {
        Sequence::Char(c) => Box::new(std::iter::once(*c)),
        // Values in the range that are not valid `char`s are skipped.
        Sequence::CharRange(l, r) => Box::new((*l..=*r).filter_map(char::from_u32)),
        Sequence::CharStar(c) => Box::new(Repeat::new(*c)),
        Sequence::CharRepeat(c, n) => Box::new(Repeat::new(*c).take(*n)),
        Sequence::Alnum => Box::new(('0'..='9').chain('A'..='Z').chain('a'..='z')),
        Sequence::Alpha => Box::new(('A'..='Z').chain('a'..='z')),
        Sequence::Blank => Box::new(unicode_table::BLANK.iter().copied()),
        Sequence::Control => Box::new(
            (0..=31)
                .chain(std::iter::once(127)) // DEL
                .filter_map(char::from_u32),
        ),
        Sequence::Digit => Box::new('0'..='9'),
        // Visible characters only: the space character is NOT part of
        // `[:graph:]`.
        Sequence::Graph => Box::new(
            (48..=57) // digit
                .chain(65..=90) // uppercase
                .chain(97..=122) // lowercase
                // punctuations
                .chain(33..=47)
                .chain(58..=64)
                .chain(91..=96)
                .chain(123..=126)
                .filter_map(char::from_u32),
        ),
        Sequence::Lower => Box::new('a'..='z'),
        // `[:print:]` is `[:graph:]` plus the space character.
        Sequence::Print => Box::new(
            (48..=57) // digit
                .chain(65..=90) // uppercase
                .chain(97..=122) // lowercase
                // punctuations
                .chain(33..=47)
                .chain(58..=64)
                .chain(91..=96)
                .chain(123..=126)
                .chain(std::iter::once(32)) // space
                .filter_map(char::from_u32),
        ),
        Sequence::Punct => Box::new(
            (33..=47)
                .chain(58..=64)
                .chain(91..=96)
                .chain(123..=126)
                .filter_map(char::from_u32),
        ),
        Sequence::Space => Box::new(unicode_table::SPACES.iter().copied()),
        Sequence::Upper => Box::new('A'..='Z'),
        Sequence::Xdigit => Box::new(('0'..='9').chain('A'..='F').chain('a'..='f')),
    }
}
/// Returns the final character this sequence expands to, if any.
///
/// `CharStar` is answered directly from its character; every other variant
/// is expanded with `flatten` and walked to its last item.
pub fn last(&self) -> Option<char> {
    if let Sequence::CharStar(repeated) = self {
        return Some(*repeated);
    }
    // TODO: Can be optimized further...
    self.flatten().last()
}
/// Returns how many characters this sequence expands to.
///
/// `CharStar` has no fixed length and reports `None`; every other variant
/// is expanded with `flatten` and its items are counted.
pub fn len(&self) -> Option<usize> {
    if matches!(self, Sequence::CharStar(_)) {
        None
    } else {
        // TODO: Is there a fix for this?
        Some(self.flatten().count())
    }
}
} }
impl Sequence { impl Sequence {
@ -70,16 +192,6 @@ impl Sequence {
.unwrap() .unwrap()
} }
pub fn dissolve(self) -> Box<dyn Iterator<Item = char>> {
match self {
Sequence::Char(c) => Box::new(std::iter::once(c)),
Sequence::CharRange(r) => r,
Sequence::CharStar(c) => Box::new(std::iter::repeat(c)),
}
}
/// Sequence parsers
fn parse_char(input: &str) -> IResult<&str, Sequence> { fn parse_char(input: &str) -> IResult<&str, Sequence> {
anychar(input).map(|(l, r)| (l, Sequence::Char(r))) anychar(input).map(|(l, r)| (l, Sequence::Char(r)))
} }
@ -115,7 +227,7 @@ impl Sequence {
separated_pair(anychar, tag("-"), anychar)(input).map(|(l, (a, b))| { separated_pair(anychar, tag("-"), anychar)(input).map(|(l, (a, b))| {
(l, { (l, {
let (start, end) = (u32::from(a), u32::from(b)); let (start, end) = (u32::from(a), u32::from(b));
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) Sequence::CharRange(start, end)
}) })
}) })
} }
@ -129,7 +241,7 @@ impl Sequence {
.map(|(l, (a, b))| { .map(|(l, (a, b))| {
(l, { (l, {
let (start, end) = (u32::from(a), u32::from(b)); let (start, end) = (u32::from(a), u32::from(b));
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) Sequence::CharRange(start, end)
}) })
}) })
} }
@ -143,7 +255,7 @@ impl Sequence {
.map(|(l, (a, b))| { .map(|(l, (a, b))| {
(l, { (l, {
let (start, end) = (u32::from_str_radix(a, 8).unwrap(), u32::from(b)); let (start, end) = (u32::from_str_radix(a, 8).unwrap(), u32::from(b));
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) Sequence::CharRange(start, end)
}) })
}) })
} }
@ -157,7 +269,7 @@ impl Sequence {
.map(|(l, (a, b))| { .map(|(l, (a, b))| {
(l, { (l, {
let (start, end) = (u32::from(a), u32::from_str_radix(b, 8).unwrap()); let (start, end) = (u32::from(a), u32::from_str_radix(b, 8).unwrap());
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) Sequence::CharRange(start, end)
}) })
}) })
} }
@ -174,7 +286,7 @@ impl Sequence {
u32::from_str_radix(a, 8).unwrap(), u32::from_str_radix(a, 8).unwrap(),
u32::from_str_radix(b, 8).unwrap(), u32::from_str_radix(b, 8).unwrap(),
); );
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32))) Sequence::CharRange(start, end)
}) })
}) })
} }
@ -189,136 +301,55 @@ impl Sequence {
separated_pair(anychar, tag("*"), digit1), separated_pair(anychar, tag("*"), digit1),
tag("]"), tag("]"),
)(input) )(input)
.map(|(l, (c, n))| { .map(|(l, (c, n))| (l, Sequence::CharRepeat(c, n.parse().unwrap())))
(
l,
Sequence::CharRange(Box::new(std::iter::repeat(c).take(n.parse().unwrap()))),
)
})
} }
fn parse_alnum(input: &str) -> IResult<&str, Sequence> { fn parse_alnum(input: &str) -> IResult<&str, Sequence> {
tag("[:alnum:]")(input).map(|(l, _)| { tag("[:alnum:]")(input).map(|(l, _)| (l, Sequence::Alnum))
(
l,
Sequence::CharRange(Box::new(('0'..='9').chain('A'..='Z').chain('a'..='z'))),
)
})
} }
fn parse_alpha(input: &str) -> IResult<&str, Sequence> { fn parse_alpha(input: &str) -> IResult<&str, Sequence> {
tag("[:alpha:]")(input).map(|(l, _)| { tag("[:alpha:]")(input).map(|(l, _)| (l, Sequence::Alpha))
(
l,
Sequence::CharRange(Box::new(('A'..='Z').chain('a'..='z'))),
)
})
} }
fn parse_blank(input: &str) -> IResult<&str, Sequence> { fn parse_blank(input: &str) -> IResult<&str, Sequence> {
tag("[:blank:]")(input).map(|(l, _)| { tag("[:blank:]")(input).map(|(l, _)| (l, Sequence::Blank))
(
l,
Sequence::CharRange(Box::new(unicode_table::BLANK.into_iter().cloned())),
)
})
} }
fn parse_control(input: &str) -> IResult<&str, Sequence> { fn parse_control(input: &str) -> IResult<&str, Sequence> {
tag("[:cntrl:]")(input).map(|(l, _)| { tag("[:cntrl:]")(input).map(|(l, _)| (l, Sequence::Control))
(
l,
Sequence::CharRange(Box::new(
(0..=31)
.chain(std::iter::once(127))
.flat_map(char::from_u32),
)),
)
})
} }
fn parse_digit(input: &str) -> IResult<&str, Sequence> { fn parse_digit(input: &str) -> IResult<&str, Sequence> {
tag("[:digit:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('0'..='9')))) tag("[:digit:]")(input).map(|(l, _)| (l, Sequence::Digit))
} }
fn parse_graph(input: &str) -> IResult<&str, Sequence> { fn parse_graph(input: &str) -> IResult<&str, Sequence> {
tag("[:graph:]")(input).map(|(l, _)| { tag("[:graph:]")(input).map(|(l, _)| (l, Sequence::Graph))
(
l,
Sequence::CharRange(Box::new(
(48..=57) // digit
.chain(65..=90) // uppercase
.chain(97..=122) // lowercase
// punctuations
.chain(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.chain(std::iter::once(32)) // space
.flat_map(char::from_u32),
)),
)
})
} }
fn parse_lower(input: &str) -> IResult<&str, Sequence> { fn parse_lower(input: &str) -> IResult<&str, Sequence> {
tag("[:lower:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('a'..='z')))) tag("[:lower:]")(input).map(|(l, _)| (l, Sequence::Lower))
} }
fn parse_print(input: &str) -> IResult<&str, Sequence> { fn parse_print(input: &str) -> IResult<&str, Sequence> {
tag("[:print:]")(input).map(|(l, _)| { tag("[:print:]")(input).map(|(l, _)| (l, Sequence::Print))
(
l,
Sequence::CharRange(Box::new(
(48..=57) // digit
.chain(65..=90) // uppercase
.chain(97..=122) // lowercase
// punctuations
.chain(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.flat_map(char::from_u32),
)),
)
})
} }
fn parse_punct(input: &str) -> IResult<&str, Sequence> { fn parse_punct(input: &str) -> IResult<&str, Sequence> {
tag("[:punct:]")(input).map(|(l, _)| { tag("[:punct:]")(input).map(|(l, _)| (l, Sequence::Punct))
(
l,
Sequence::CharRange(Box::new(
(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.flat_map(char::from_u32),
)),
)
})
} }
fn parse_space(input: &str) -> IResult<&str, Sequence> { fn parse_space(input: &str) -> IResult<&str, Sequence> {
tag("[:space:]")(input).map(|(l, _)| { tag("[:space:]")(input).map(|(l, _)| (l, Sequence::Space))
(
l,
Sequence::CharRange(Box::new(unicode_table::SPACES.into_iter().cloned())),
)
})
} }
fn parse_upper(input: &str) -> IResult<&str, Sequence> { fn parse_upper(input: &str) -> IResult<&str, Sequence> {
tag("[:upper:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('A'..='Z')))) tag("[:upper:]")(input).map(|(l, _)| (l, Sequence::Upper))
} }
fn parse_xdigit(input: &str) -> IResult<&str, Sequence> { fn parse_xdigit(input: &str) -> IResult<&str, Sequence> {
tag("[:xdigit:]")(input).map(|(l, _)| { tag("[:xdigit:]")(input).map(|(l, _)| (l, Sequence::Xdigit))
(
l,
Sequence::CharRange(Box::new(('0'..='9').chain('A'..='F').chain('a'..='f'))),
)
})
} }
fn parse_char_equal(input: &str) -> IResult<&str, Sequence> { fn parse_char_equal(input: &str) -> IResult<&str, Sequence> {
@ -339,10 +370,7 @@ pub struct DeleteOperation {
impl DeleteOperation { impl DeleteOperation {
pub fn new(set: Vec<Sequence>, complement_flag: bool) -> DeleteOperation { pub fn new(set: Vec<Sequence>, complement_flag: bool) -> DeleteOperation {
DeleteOperation { DeleteOperation {
set: set set: set.iter().flat_map(Sequence::flatten).collect::<Vec<_>>(),
.into_iter()
.flat_map(Sequence::dissolve)
.collect::<Vec<_>>(),
complement_flag, complement_flag,
} }
} }
@ -355,21 +383,30 @@ impl SymbolTranslator for DeleteOperation {
} }
} }
#[derive(Debug)]
pub struct TranslateOperationComplement { pub struct TranslateOperationComplement {
iter: u32, iter: u32,
set1: Vec<char>, set1: Vec<char>,
set2: Vec<char>, set2: Box<dyn Iterator<Item = char>>,
fallback: char, fallback: char,
translation_map: HashMap<char, char>, translation_map: HashMap<char, char>,
} }
impl TranslateOperationComplement { impl TranslateOperationComplement {
fn new(set1: Vec<char>, set2: Vec<char>, fallback: char) -> TranslateOperationComplement { fn new(
set1: Vec<Sequence>,
set2: Vec<Sequence>,
set1_truncate_length: Option<usize>,
fallback: char,
) -> TranslateOperationComplement {
TranslateOperationComplement { TranslateOperationComplement {
iter: 0, iter: 0,
set1, set1: set1
set2: set2.into_iter().rev().collect(), .iter()
.flat_map(Sequence::flatten)
.enumerate()
.filter_map(truncate_iterator(set1_truncate_length))
.collect(),
set2: Box::new(set2.into_iter().flat_map(|c| Sequence::flatten(&c))),
fallback, fallback,
translation_map: HashMap::new(), translation_map: HashMap::new(),
} }
@ -382,61 +419,83 @@ pub struct TranslateOperationStandard {
} }
impl TranslateOperationStandard { impl TranslateOperationStandard {
fn new(set1: Vec<char>, set2: Vec<char>, fallback: char) -> TranslateOperationStandard { fn new(
set1: Vec<Sequence>,
set2: Vec<Sequence>,
set1_truncate_length: Option<usize>,
fallback: char,
) -> TranslateOperationStandard {
TranslateOperationStandard { TranslateOperationStandard {
translation_map: set1 translation_map: set1
.into_iter() .iter()
.zip(set2.into_iter().chain(std::iter::repeat(fallback))) .flat_map(Sequence::flatten)
.zip(
set2.iter()
.flat_map(Sequence::flatten)
.chain(Repeat(fallback)),
)
.enumerate()
.filter_map(truncate_iterator(set1_truncate_length))
.collect::<HashMap<_, _>>(), .collect::<HashMap<_, _>>(),
} }
} }
} }
#[derive(Debug)]
pub enum TranslateOperation { pub enum TranslateOperation {
Standard(TranslateOperationStandard), Standard(TranslateOperationStandard),
Complement(TranslateOperationComplement), Complement(TranslateOperationComplement),
} }
impl TranslateOperation { impl TranslateOperation {
fn next_complement_char(mut iter: u32, ignore_list: &[char]) -> (u32, char) { fn next_complement_char(iter: u32, ignore_list: &[char]) -> (u32, char) {
while (char::from_u32(iter).is_none() (iter..)
|| ignore_list .filter_map(char::from_u32)
.iter() .filter(|c| !ignore_list.iter().any(|s| s.eq(c)))
.map(|c| u32::from(*c)) .map(|c| (u32::from(c) + 1, c))
.any(|c| iter.eq(&c))) .next()
&& iter.ne(&u32::MAX) .expect("exhausted all possible characters")
{
iter = iter.saturating_add(1)
}
(iter.saturating_add(1), char::from_u32(iter).unwrap())
} }
} }
impl TranslateOperation { impl TranslateOperation {
pub fn new( pub fn new(
pset1: Vec<Sequence>, set1: Vec<Sequence>,
pset2: Vec<Sequence>, set2: Vec<Sequence>,
truncate_set1: bool, truncate_set1: bool,
complement: bool, complement: bool,
) -> TranslateOperation { ) -> TranslateOperation {
// TODO: Only some translation is acceptable i.e. uppercase/lowercase transform. let fallback = set2
let mut set1 = pset1 .iter()
.into_iter() .rev()
.flat_map(Sequence::dissolve) .next()
.collect::<Vec<_>>(); .map(Sequence::last)
let set2 = pset2 .flatten()
.into_iter() .unwrap();
.flat_map(Sequence::dissolve) let set1_truncate_length = if truncate_set1 {
.collect::<Vec<_>>(); set2.iter()
let fallback = set2.last().cloned().unwrap(); .map(Sequence::len)
if truncate_set1 { .reduce(|a, b| match (a, b) {
set1.truncate(set2.len()); (Some(l), Some(r)) => Some(l + r),
} _ => None,
if complement { })
TranslateOperation::Complement(TranslateOperationComplement::new(set1, set2, fallback)) .flatten()
} else { } else {
TranslateOperation::Standard(TranslateOperationStandard::new(set1, set2, fallback)) None
};
if complement {
TranslateOperation::Complement(TranslateOperationComplement::new(
set1,
set2,
set1_truncate_length,
fallback,
))
} else {
TranslateOperation::Standard(TranslateOperationStandard::new(
set1,
set2,
set1_truncate_length,
fallback,
))
} }
} }
} }
@ -466,7 +525,7 @@ impl SymbolTranslator for TranslateOperation {
Some(*c) Some(*c)
} else { } else {
while translation_map.get(&current).is_none() { while translation_map.get(&current).is_none() {
if let Some(p) = set2.pop() { if let Some(p) = set2.next() {
let (next_index, next_value) = let (next_index, next_value) =
TranslateOperation::next_complement_char(*iter, &*set1); TranslateOperation::next_complement_char(*iter, &*set1);
*iter = next_index; *iter = next_index;
@ -484,18 +543,15 @@ impl SymbolTranslator for TranslateOperation {
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct SqueezeOperation { pub struct SqueezeOperation {
squeeze_set: Vec<char>, set1: Vec<char>,
complement: bool, complement: bool,
previous: Option<char>, previous: Option<char>,
} }
impl SqueezeOperation { impl SqueezeOperation {
pub fn new(squeeze_set: Vec<Sequence>, complement: bool) -> SqueezeOperation { pub fn new(set1: Vec<Sequence>, complement: bool) -> SqueezeOperation {
SqueezeOperation { SqueezeOperation {
squeeze_set: squeeze_set set1: set1.iter().flat_map(Sequence::flatten).collect(),
.into_iter()
.flat_map(Sequence::dissolve)
.collect(),
complement, complement,
previous: None, previous: None,
} }
@ -505,7 +561,7 @@ impl SqueezeOperation {
impl SymbolTranslator for SqueezeOperation { impl SymbolTranslator for SqueezeOperation {
fn translate(&mut self, current: char) -> Option<char> { fn translate(&mut self, current: char) -> Option<char> {
if self.complement { if self.complement {
let next = if self.squeeze_set.iter().any(|c| c.eq(&current)) { let next = if self.set1.iter().any(|c| c.eq(&current)) {
Some(current) Some(current)
} else { } else {
match self.previous { match self.previous {
@ -526,7 +582,7 @@ impl SymbolTranslator for SqueezeOperation {
self.previous = Some(current); self.previous = Some(current);
next next
} else { } else {
let next = if self.squeeze_set.iter().any(|c| c.eq(&current)) { let next = if self.set1.iter().any(|c| c.eq(&current)) {
match self.previous { match self.previous {
Some(v) if v == current => None, Some(v) if v == current => None,
_ => Some(current), _ => Some(current),
@ -542,7 +598,7 @@ impl SymbolTranslator for SqueezeOperation {
pub fn translate_input<T, R, W>(input: &mut R, output: &mut W, mut translator: T) pub fn translate_input<T, R, W>(input: &mut R, output: &mut W, mut translator: T)
where where
T: SymbolTranslator + Debug, T: SymbolTranslator,
R: BufRead, R: BufRead,
W: Write, W: Write,
{ {