Attempting to fix star expansion

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>
This commit is contained in:
Hanif Bin Ariffin 2021-07-24 22:06:19 +08:00
parent d5dbedb2e4
commit 279a7cf6b3

View file

@ -26,10 +26,132 @@ mod unicode_table {
pub static BLANK: &'static [char] = &[SPACE, HT];
}
/// Endless iterator that hands out the same `char` on every call to `next`.
struct Repeat(char);

impl Repeat {
    /// Creates a `Repeat` that yields `element` forever.
    fn new(element: char) -> Self {
        Self(element)
    }
}

impl Iterator for Repeat {
    type Item = char;

    /// Always produces the stored character; the stream never ends.
    fn next(&mut self) -> Option<Self::Item> {
        let Repeat(ch) = *self;
        Some(ch)
    }

    /// Answers with the stored character directly instead of draining an
    /// iterator that would never finish.
    fn last(self) -> Option<Self::Item> {
        let Repeat(ch) = self;
        Some(ch)
    }

    /// Calls `f` exactly once on the stored character and returns its result;
    /// every element is the same, so one probe is all this override performs.
    fn any<F>(&mut self, mut f: F) -> bool
    where
        Self: Sized,
        F: FnMut(Self::Item) -> bool,
    {
        let Repeat(ch) = *self;
        f(ch)
    }
}
/// Builds a filter for `(index, item)` pairs that keeps an item only while
/// its index is strictly below the optional limit.
///
/// With `None` every item is kept. With `Some(limit)`, any pair whose index
/// is `>= limit` maps to `None`, which makes the closure suitable for
/// `enumerate().filter_map(..)`.
fn truncate_iterator<T>(input: Option<usize>) -> impl Fn((usize, T)) -> Option<T> {
    move |(idx, item): (usize, T)| {
        let keep = match input {
            Some(limit) => idx < limit,
            None => true,
        };
        if keep {
            Some(item)
        } else {
            None
        }
    }
}
/// One parsed element of a character set. `Sequence::flatten` turns each
/// variant into the characters it stands for.
#[derive(Debug, Clone, Copy)]
pub enum Sequence {
/// A single literal character.
Char(char),
// NOTE(review): this boxed-iterator form shares its name with the
// `CharRange(u32, u32)` variant below and a `Box<dyn Iterator>` cannot be
// `Copy`; it looks like the pre-change line of this diff — confirm and drop.
CharRange(Box<dyn Iterator<Item = char>>),
/// Inclusive range given as start and end code points; `flatten` skips
/// values that `char::from_u32` rejects.
CharRange(u32, u32),
/// A character repeated without end; `len` reports `None` for it.
CharStar(char),
/// A character repeated exactly the given number of times.
CharRepeat(char, usize),
/// Produced by the parser for `[:alnum:]`.
Alnum,
/// Produced by the parser for `[:alpha:]`.
Alpha,
/// Produced by the parser for `[:blank:]`.
Blank,
/// Produced by the parser for `[:cntrl:]`.
Control,
/// Produced by the parser for `[:digit:]`.
Digit,
/// Produced by the parser for `[:graph:]`.
Graph,
/// Produced by the parser for `[:lower:]`.
Lower,
/// Produced by the parser for `[:print:]`.
Print,
/// Produced by the parser for `[:punct:]`.
Punct,
/// Produced by the parser for `[:space:]`.
Space,
/// Produced by the parser for `[:upper:]`.
Upper,
/// Produced by the parser for `[:xdigit:]`.
Xdigit,
}
impl Sequence {
    /// Code points of the ASCII graphic characters (digits, letters and
    /// punctuation), i.e. everything printable except the space.
    fn ascii_graphic_code_points() -> impl Iterator<Item = u32> {
        (48..=57) // digit
            .chain(65..=90) // uppercase
            .chain(97..=122) // lowercase
            // punctuations
            .chain(33..=47)
            .chain(58..=64)
            .chain(91..=96)
            .chain(123..=126)
    }

    /// Expands this sequence into the characters it denotes, in set order.
    ///
    /// `CharStar` produces an endless stream, so callers must bound it
    /// themselves (see [`Sequence::len`], which reports `None` for it).
    pub fn flatten(&self) -> Box<dyn Iterator<Item = char>> {
        match self {
            Sequence::Char(c) => Box::new(std::iter::once(*c)),
            // Code points that are not valid scalar values are skipped.
            Sequence::CharRange(l, r) => Box::new((*l..=*r).flat_map(char::from_u32)),
            Sequence::CharStar(c) => Box::new(Repeat::new(*c)),
            Sequence::CharRepeat(c, n) => Box::new(Repeat::new(*c).take(*n)),
            Sequence::Alnum => Box::new(('0'..='9').chain('A'..='Z').chain('a'..='z')),
            Sequence::Alpha => Box::new(('A'..='Z').chain('a'..='z')),
            Sequence::Blank => Box::new(unicode_table::BLANK.iter().copied()),
            Sequence::Control => Box::new(
                (0..=31)
                    .chain(std::iter::once(127))
                    .flat_map(char::from_u32),
            ),
            Sequence::Digit => Box::new('0'..='9'),
            // `[:graph:]` is every printable character EXCEPT the space.
            Sequence::Graph => {
                Box::new(Self::ascii_graphic_code_points().flat_map(char::from_u32))
            }
            Sequence::Lower => Box::new('a'..='z'),
            // `[:print:]` is `[:graph:]` plus the space character.
            Sequence::Print => Box::new(
                Self::ascii_graphic_code_points()
                    .chain(std::iter::once(32)) // space
                    .flat_map(char::from_u32),
            ),
            Sequence::Punct => Box::new(
                (33..=47)
                    .chain(58..=64)
                    .chain(91..=96)
                    .chain(123..=126)
                    .flat_map(char::from_u32),
            ),
            Sequence::Space => Box::new(unicode_table::SPACES.iter().copied()),
            Sequence::Upper => Box::new('A'..='Z'),
            Sequence::Xdigit => Box::new(('0'..='9').chain('A'..='F').chain('a'..='f')),
        }
    }

    /// Returns the final character of the expansion, or `None` when the
    /// expansion is empty (for example `CharRepeat(_, 0)`).
    ///
    /// Single characters and repetitions are answered without iterating;
    /// for `CharStar` the repeated character itself is returned.
    pub fn last(&self) -> Option<char> {
        match self {
            Sequence::Char(c) | Sequence::CharStar(c) => Some(*c),
            Sequence::CharRepeat(c, n) => {
                if *n > 0 {
                    Some(*c)
                } else {
                    None
                }
            }
            rest => rest.flatten().last(),
        }
    }

    /// Returns how many characters the expansion contains, or `None` for
    /// the unbounded `CharStar`.
    ///
    /// Single characters and repetitions are answered without iterating;
    /// the remaining variants are counted by walking their expansion.
    pub fn len(&self) -> Option<usize> {
        match self {
            Sequence::CharStar(_) => None,
            Sequence::Char(_) => Some(1),
            Sequence::CharRepeat(_, n) => Some(*n),
            rest => Some(rest.flatten().count()),
        }
    }
}
impl Sequence {
@ -70,16 +192,6 @@ impl Sequence {
.unwrap()
}
/// Consumes the sequence and hands back its characters as a boxed iterator:
/// a one-shot iterator for `Char`, the stored iterator itself for
/// `CharRange`, and an endless repetition for `CharStar`.
pub fn dissolve(self) -> Box<dyn Iterator<Item = char>> {
    match self {
        Sequence::CharRange(chars) => chars,
        Sequence::CharStar(ch) => Box::new(std::iter::repeat(ch)),
        Sequence::Char(ch) => Box::new(std::iter::once(ch)),
    }
}
/// Sequence parsers
///
/// Takes one character from `input` with `anychar` and wraps it in
/// `Sequence::Char`, returning it together with the remaining input.
/// A failure from `anychar` is passed through unchanged.
fn parse_char(input: &str) -> IResult<&str, Sequence> {
    let (rest, ch) = anychar(input)?;
    Ok((rest, Sequence::Char(ch)))
}
@ -115,7 +227,7 @@ impl Sequence {
separated_pair(anychar, tag("-"), anychar)(input).map(|(l, (a, b))| {
(l, {
let (start, end) = (u32::from(a), u32::from(b));
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32)))
Sequence::CharRange(start, end)
})
})
}
@ -129,7 +241,7 @@ impl Sequence {
.map(|(l, (a, b))| {
(l, {
let (start, end) = (u32::from(a), u32::from(b));
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32)))
Sequence::CharRange(start, end)
})
})
}
@ -143,7 +255,7 @@ impl Sequence {
.map(|(l, (a, b))| {
(l, {
let (start, end) = (u32::from_str_radix(a, 8).unwrap(), u32::from(b));
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32)))
Sequence::CharRange(start, end)
})
})
}
@ -157,7 +269,7 @@ impl Sequence {
.map(|(l, (a, b))| {
(l, {
let (start, end) = (u32::from(a), u32::from_str_radix(b, 8).unwrap());
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32)))
Sequence::CharRange(start, end)
})
})
}
@ -174,7 +286,7 @@ impl Sequence {
u32::from_str_radix(a, 8).unwrap(),
u32::from_str_radix(b, 8).unwrap(),
);
Sequence::CharRange(Box::new((start..=end).filter_map(std::char::from_u32)))
Sequence::CharRange(start, end)
})
})
}
@ -189,136 +301,55 @@ impl Sequence {
separated_pair(anychar, tag("*"), digit1),
tag("]"),
)(input)
.map(|(l, (c, n))| {
(
l,
Sequence::CharRange(Box::new(std::iter::repeat(c).take(n.parse().unwrap()))),
)
})
.map(|(l, (c, n))| (l, Sequence::CharRepeat(c, n.parse().unwrap())))
}
fn parse_alnum(input: &str) -> IResult<&str, Sequence> {
tag("[:alnum:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(('0'..='9').chain('A'..='Z').chain('a'..='z'))),
)
})
tag("[:alnum:]")(input).map(|(l, _)| (l, Sequence::Alnum))
}
fn parse_alpha(input: &str) -> IResult<&str, Sequence> {
tag("[:alpha:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(('A'..='Z').chain('a'..='z'))),
)
})
tag("[:alpha:]")(input).map(|(l, _)| (l, Sequence::Alpha))
}
fn parse_blank(input: &str) -> IResult<&str, Sequence> {
tag("[:blank:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(unicode_table::BLANK.into_iter().cloned())),
)
})
tag("[:blank:]")(input).map(|(l, _)| (l, Sequence::Blank))
}
fn parse_control(input: &str) -> IResult<&str, Sequence> {
tag("[:cntrl:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(
(0..=31)
.chain(std::iter::once(127))
.flat_map(char::from_u32),
)),
)
})
tag("[:cntrl:]")(input).map(|(l, _)| (l, Sequence::Control))
}
fn parse_digit(input: &str) -> IResult<&str, Sequence> {
tag("[:digit:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('0'..='9'))))
tag("[:digit:]")(input).map(|(l, _)| (l, Sequence::Digit))
}
fn parse_graph(input: &str) -> IResult<&str, Sequence> {
tag("[:graph:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(
(48..=57) // digit
.chain(65..=90) // uppercase
.chain(97..=122) // lowercase
// punctuations
.chain(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.chain(std::iter::once(32)) // space
.flat_map(char::from_u32),
)),
)
})
tag("[:graph:]")(input).map(|(l, _)| (l, Sequence::Graph))
}
fn parse_lower(input: &str) -> IResult<&str, Sequence> {
tag("[:lower:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('a'..='z'))))
tag("[:lower:]")(input).map(|(l, _)| (l, Sequence::Lower))
}
fn parse_print(input: &str) -> IResult<&str, Sequence> {
tag("[:print:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(
(48..=57) // digit
.chain(65..=90) // uppercase
.chain(97..=122) // lowercase
// punctuations
.chain(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.flat_map(char::from_u32),
)),
)
})
tag("[:print:]")(input).map(|(l, _)| (l, Sequence::Print))
}
fn parse_punct(input: &str) -> IResult<&str, Sequence> {
tag("[:punct:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(
(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.flat_map(char::from_u32),
)),
)
})
tag("[:punct:]")(input).map(|(l, _)| (l, Sequence::Punct))
}
fn parse_space(input: &str) -> IResult<&str, Sequence> {
tag("[:space:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(unicode_table::SPACES.into_iter().cloned())),
)
})
tag("[:space:]")(input).map(|(l, _)| (l, Sequence::Space))
}
fn parse_upper(input: &str) -> IResult<&str, Sequence> {
tag("[:upper:]")(input).map(|(l, _)| (l, Sequence::CharRange(Box::new('A'..='Z'))))
tag("[:upper:]")(input).map(|(l, _)| (l, Sequence::Upper))
}
fn parse_xdigit(input: &str) -> IResult<&str, Sequence> {
tag("[:xdigit:]")(input).map(|(l, _)| {
(
l,
Sequence::CharRange(Box::new(('0'..='9').chain('A'..='F').chain('a'..='f'))),
)
})
tag("[:xdigit:]")(input).map(|(l, _)| (l, Sequence::Xdigit))
}
fn parse_char_equal(input: &str) -> IResult<&str, Sequence> {
@ -339,10 +370,7 @@ pub struct DeleteOperation {
impl DeleteOperation {
pub fn new(set: Vec<Sequence>, complement_flag: bool) -> DeleteOperation {
DeleteOperation {
set: set
.into_iter()
.flat_map(Sequence::dissolve)
.collect::<Vec<_>>(),
set: set.iter().flat_map(Sequence::flatten).collect::<Vec<_>>(),
complement_flag,
}
}
@ -355,21 +383,30 @@ impl SymbolTranslator for DeleteOperation {
}
}
#[derive(Debug)]
pub struct TranslateOperationComplement {
iter: u32,
set1: Vec<char>,
set2: Vec<char>,
set2: Box<dyn Iterator<Item = char>>,
fallback: char,
translation_map: HashMap<char, char>,
}
impl TranslateOperationComplement {
fn new(set1: Vec<char>, set2: Vec<char>, fallback: char) -> TranslateOperationComplement {
fn new(
set1: Vec<Sequence>,
set2: Vec<Sequence>,
set1_truncate_length: Option<usize>,
fallback: char,
) -> TranslateOperationComplement {
TranslateOperationComplement {
iter: 0,
set1,
set2: set2.into_iter().rev().collect(),
set1: set1
.iter()
.flat_map(Sequence::flatten)
.enumerate()
.filter_map(truncate_iterator(set1_truncate_length))
.collect(),
set2: Box::new(set2.into_iter().flat_map(|c| Sequence::flatten(&c))),
fallback,
translation_map: HashMap::new(),
}
@ -382,61 +419,83 @@ pub struct TranslateOperationStandard {
}
impl TranslateOperationStandard {
fn new(set1: Vec<char>, set2: Vec<char>, fallback: char) -> TranslateOperationStandard {
fn new(
set1: Vec<Sequence>,
set2: Vec<Sequence>,
set1_truncate_length: Option<usize>,
fallback: char,
) -> TranslateOperationStandard {
TranslateOperationStandard {
translation_map: set1
.into_iter()
.zip(set2.into_iter().chain(std::iter::repeat(fallback)))
.iter()
.flat_map(Sequence::flatten)
.zip(
set2.iter()
.flat_map(Sequence::flatten)
.chain(Repeat(fallback)),
)
.enumerate()
.filter_map(truncate_iterator(set1_truncate_length))
.collect::<HashMap<_, _>>(),
}
}
}
#[derive(Debug)]
/// A translation in one of two modes; `TranslateOperation::new` picks the
/// variant from its `complement` flag.
pub enum TranslateOperation {
/// Wraps a `TranslateOperationStandard`; built when `complement` is false.
Standard(TranslateOperationStandard),
/// Wraps a `TranslateOperationComplement`; built when `complement` is true.
Complement(TranslateOperationComplement),
}
impl TranslateOperation {
fn next_complement_char(mut iter: u32, ignore_list: &[char]) -> (u32, char) {
while (char::from_u32(iter).is_none()
|| ignore_list
.iter()
.map(|c| u32::from(*c))
.any(|c| iter.eq(&c)))
&& iter.ne(&u32::MAX)
{
iter = iter.saturating_add(1)
}
(iter.saturating_add(1), char::from_u32(iter).unwrap())
fn next_complement_char(iter: u32, ignore_list: &[char]) -> (u32, char) {
(iter..)
.filter_map(char::from_u32)
.filter(|c| !ignore_list.iter().any(|s| s.eq(c)))
.map(|c| (u32::from(c) + 1, c))
.next()
.expect("exhausted all possible characters")
}
}
impl TranslateOperation {
pub fn new(
pset1: Vec<Sequence>,
pset2: Vec<Sequence>,
set1: Vec<Sequence>,
set2: Vec<Sequence>,
truncate_set1: bool,
complement: bool,
) -> TranslateOperation {
// TODO: Only some translation is acceptable i.e. uppercase/lowercase transform.
let mut set1 = pset1
.into_iter()
.flat_map(Sequence::dissolve)
.collect::<Vec<_>>();
let set2 = pset2
.into_iter()
.flat_map(Sequence::dissolve)
.collect::<Vec<_>>();
let fallback = set2.last().cloned().unwrap();
if truncate_set1 {
set1.truncate(set2.len());
}
if complement {
TranslateOperation::Complement(TranslateOperationComplement::new(set1, set2, fallback))
let fallback = set2
.iter()
.rev()
.next()
.map(Sequence::last)
.flatten()
.unwrap();
let set1_truncate_length = if truncate_set1 {
set2.iter()
.map(Sequence::len)
.reduce(|a, b| match (a, b) {
(Some(l), Some(r)) => Some(l + r),
_ => None,
})
.flatten()
} else {
TranslateOperation::Standard(TranslateOperationStandard::new(set1, set2, fallback))
None
};
if complement {
TranslateOperation::Complement(TranslateOperationComplement::new(
set1,
set2,
set1_truncate_length,
fallback,
))
} else {
TranslateOperation::Standard(TranslateOperationStandard::new(
set1,
set2,
set1_truncate_length,
fallback,
))
}
}
}
@ -466,7 +525,7 @@ impl SymbolTranslator for TranslateOperation {
Some(*c)
} else {
while translation_map.get(&current).is_none() {
if let Some(p) = set2.pop() {
if let Some(p) = set2.next() {
let (next_index, next_value) =
TranslateOperation::next_complement_char(*iter, &*set1);
*iter = next_index;
@ -484,18 +543,15 @@ impl SymbolTranslator for TranslateOperation {
#[derive(Debug, Clone)]
pub struct SqueezeOperation {
squeeze_set: Vec<char>,
set1: Vec<char>,
complement: bool,
previous: Option<char>,
}
impl SqueezeOperation {
pub fn new(squeeze_set: Vec<Sequence>, complement: bool) -> SqueezeOperation {
pub fn new(set1: Vec<Sequence>, complement: bool) -> SqueezeOperation {
SqueezeOperation {
squeeze_set: squeeze_set
.into_iter()
.flat_map(Sequence::dissolve)
.collect(),
set1: set1.iter().flat_map(Sequence::flatten).collect(),
complement,
previous: None,
}
@ -505,7 +561,7 @@ impl SqueezeOperation {
impl SymbolTranslator for SqueezeOperation {
fn translate(&mut self, current: char) -> Option<char> {
if self.complement {
let next = if self.squeeze_set.iter().any(|c| c.eq(&current)) {
let next = if self.set1.iter().any(|c| c.eq(&current)) {
Some(current)
} else {
match self.previous {
@ -526,7 +582,7 @@ impl SymbolTranslator for SqueezeOperation {
self.previous = Some(current);
next
} else {
let next = if self.squeeze_set.iter().any(|c| c.eq(&current)) {
let next = if self.set1.iter().any(|c| c.eq(&current)) {
match self.previous {
Some(v) if v == current => None,
_ => Some(current),
@ -542,7 +598,7 @@ impl SymbolTranslator for SqueezeOperation {
pub fn translate_input<T, R, W>(input: &mut R, output: &mut W, mut translator: T)
where
T: SymbolTranslator + Debug,
T: SymbolTranslator,
R: BufRead,
W: Write,
{