Reimplemented everything using new expansion module

Signed-off-by: Hanif Bin Ariffin <hanif.ariffin.4326@gmail.com>
This commit is contained in:
Hanif Bin Ariffin 2021-07-18 14:09:26 +08:00
parent c4e04c5384
commit 05d2973510
3 changed files with 46 additions and 355 deletions

View file

@ -1,146 +0,0 @@
// * This file is part of the uutils coreutils package.
// *
// * (c) Michael Gehring <mg@ebfe.org>
// * (c) kwantam <kwantam@gmail.com>
// * * 2015-04-28 ~ created `expand` module to eliminate most allocs during setup
// *
// * For the full copyright and license information, please view the LICENSE
// * file that was distributed with this source code.
// spell-checker:ignore (ToDO) allocs slen unesc
use std::char::from_u32;
use std::cmp::min;
use std::iter::Peekable;
use std::ops::RangeInclusive;
/// Decode a single backslash escape into the character it stands for.
///
/// `s` is the text immediately following the `\`; it must not be empty
/// (an empty `s` panics).
///
/// The result pairs the decoded character with the number of characters of
/// `s` that were used: always 1 for a single-letter escape, and 1 to 3 for an
/// octal escape, which spans the leading run of octal digits (at most three).
/// A character that is neither an octal digit nor a known escape letter
/// stands for itself.
#[inline]
fn parse_sequence(s: &str) -> (char, usize) {
    const OCTAL: u32 = 8;
    const MAX_OCTAL_DIGITS: usize = 3;

    let lead = s.chars().next().expect("invalid escape: empty string");

    if !lead.is_digit(OCTAL) {
        // Single-letter escape (or an unknown letter, which maps to itself).
        let decoded = match lead {
            'a' => '\u{07}',
            'b' => '\u{08}',
            'f' => '\u{0c}',
            'v' => '\u{0b}',
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            other => other,
        };
        return (decoded, 1);
    }

    // Octal escape: accumulate the leading digits, three bits per digit.
    let (value, digits) = s
        .chars()
        .take(MAX_OCTAL_DIGITS)
        .take_while(|ch| ch.is_digit(OCTAL))
        .fold((0u32, 0usize), |(acc, count), ch| {
            ((acc << 3) | ch.to_digit(OCTAL).unwrap(), count + 1)
        });
    (from_u32(value).expect("invalid octal escape"), digits)
}
/// Iterator over the characters of a set string with backslash escapes
/// decoded on the fly.
///
/// `string` always holds the part of the input that has not been consumed yet.
struct Unescape<'a> {
    string: &'a str,
}

impl<'a> Iterator for Unescape<'a> {
    type Item = char;

    /// The lower bound is 1 whenever any input remains and 0 otherwise; no
    /// upper bound is reported.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let slen = self.string.len();
        (min(slen, 1), None)
    }

    /// Yields the next character, decoding a leading backslash escape if
    /// there is one. A backslash that is the very last byte of the input is
    /// yielded as a literal backslash.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let head = self.string.chars().next()?;

        let (ch, advance) = match head {
            '\\' if self.string.len() > 1 => {
                // `\` is a single byte, so slicing at 1 is always on a char boundary.
                let rest = &self.string[1..];
                let (decoded, consumed_chars) = parse_sequence(rest);
                // `parse_sequence` counts characters, but the slice below is
                // indexed in bytes. Convert, so that an escaped multi-byte
                // character (e.g. `\é`) does not split a code point and panic.
                let consumed_bytes: usize = rest
                    .chars()
                    .take(consumed_chars)
                    .map(char::len_utf8)
                    .sum();
                (decoded, 1 + consumed_bytes)
            }
            // Not an escape: the character itself, advancing by its own width.
            plain => (plain, plain.len_utf8()),
        };

        self.string = &self.string[advance..];
        Some(ch)
    }
}
/// Iterator that expands a set string into its individual characters:
/// backslash escapes are decoded and `first-last` ranges are expanded to every
/// character from `first` to `last` inclusive.
pub struct ExpandSet<'a> {
    /// Code points still to be yielded from the most recently parsed range
    /// (everything after `first`, which is yielded immediately).
    range: RangeInclusive<u32>,
    /// Unescaped view of the remaining input.
    unesc: Peekable<Unescape<'a>>,
}

impl<'a> Iterator for ExpandSet<'a> {
    type Item = char;

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.unesc.size_hint()
    }

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Drain any pending range first, skipping code points that are not
        // valid `char`s.
        if let Some(c) = self.range.find_map(from_u32) {
            return Some(c);
        }

        let first = self.unesc.next()?;

        // A `-` only denotes a range when something follows it. After a
        // successful peek, `Peekable` reports the peeked item plus the inner
        // lower bound, and `Unescape` reports 1 while input remains, so a
        // lower bound above 1 means there is a character after the `-`.
        if self.unesc.peek() == Some(&'-') && self.unesc.size_hint().0 > 1 {
            self.unesc.next(); // the '-'
            let last = self
                .unesc
                .next()
                .expect("size_hint guaranteed a character after '-'");
            // `first` is returned right now; the rest of the range is queued.
            self.range = first as u32 + 1..=last as u32;
        }

        Some(first)
    }
}

impl<'a> ExpandSet<'a> {
    /// Creates an expander over the set string `s`.
    #[inline]
    pub fn new(s: &'a str) -> ExpandSet<'a> {
        ExpandSet {
            // Start with a genuinely empty range. `0..=0` is NOT empty (it
            // contains 0) and would make every set begin with a spurious '\0'.
            range: RangeInclusive::new(1, 0),
            unesc: Unescape { string: s }.peekable(),
        }
    }
}

View file

@ -422,7 +422,7 @@ impl SqueezeOperationNew {
impl SymbolTranslatorNew for SqueezeOperationNew { impl SymbolTranslatorNew for SqueezeOperationNew {
fn translate(&mut self, current: char) -> Option<char> { fn translate(&mut self, current: char) -> Option<char> {
if self.complement { if self.complement {
if self.squeeze_set.iter().any(|c| c.eq(&current)) { let next = if self.squeeze_set.iter().any(|c| c.eq(&current)) {
Some(current) Some(current)
} else { } else {
match self.previous { match self.previous {
@ -439,33 +439,35 @@ impl SymbolTranslatorNew for SqueezeOperationNew {
Some(current) Some(current)
} }
} }
} };
self.previous = Some(current);
next
} else { } else {
if self.squeeze_set.iter().any(|c| c.eq(&current)) { let next = if self.squeeze_set.iter().any(|c| c.eq(&current)) {
match self.previous { match self.previous {
Some(v) => { Some(v) => {
if v.eq(&current) { if v.eq(&current) {
None None
} else { } else {
self.previous = Some(current);
Some(current) Some(current)
} }
} }
None => { None => Some(current),
self.previous = Some(current);
Some(current)
}
} }
} else { } else {
Some(current) Some(current)
} };
self.previous = Some(current);
next
} }
} }
} }
pub fn translate_input_new<T>(input: &mut dyn BufRead, output: &mut dyn Write, mut translator: T) pub fn translate_input_new<T, R, W>(input: &mut R, output: &mut W, mut translator: T)
where where
T: SymbolTranslatorNew, T: SymbolTranslatorNew,
R: BufRead,
W: Write,
{ {
let mut buf = String::new(); let mut buf = String::new();
let mut output_buf = String::new(); let mut output_buf = String::new();

View file

@ -14,22 +14,18 @@
extern crate uucore; extern crate uucore;
extern crate nom; extern crate nom;
mod expand;
mod operation; mod operation;
use bit_set::BitSet;
use clap::{crate_version, App, Arg}; use clap::{crate_version, App, Arg};
use fnv::FnvHashMap; use nom::AsBytes;
use operation::{translate_input_new, Sequence, SqueezeOperationNew, TranslateOperationNew}; use operation::{translate_input_new, Sequence, SqueezeOperationNew, TranslateOperationNew};
use std::io::{stdin, stdout, BufRead, BufWriter, Write}; use std::io::{stdin, stdout, BufReader, BufWriter};
use crate::{expand::ExpandSet, operation::DeleteOperationNew}; use crate::operation::DeleteOperationNew;
use uucore::InvalidEncodingHandling; use uucore::InvalidEncodingHandling;
static ABOUT: &str = "translate or delete characters"; static ABOUT: &str = "translate or delete characters";
const BUFFER_LEN: usize = 1024;
mod options { mod options {
pub const COMPLEMENT: &str = "complement"; pub const COMPLEMENT: &str = "complement";
pub const DELETE: &str = "delete"; pub const DELETE: &str = "delete";
@ -38,190 +34,6 @@ mod options {
pub const SETS: &str = "sets"; pub const SETS: &str = "sets";
} }
/// A per-character operation applied to the input by `translate_input`.
trait SymbolTranslator {
/// Decides what to emit for the input character `c`.
///
/// `prev_c` is the character most recently emitted by the caller (the caller
/// starts it at NUL). Returning `Some(ch)` emits `ch`; returning `None` drops
/// the character from the output.
fn translate(&self, c: char, prev_c: char) -> Option<char>;
}
/// Character filter driven by membership in a set.
struct DeleteOperation {
    /// The set, stored as code points.
    bset: BitSet,
    /// Flips which side of the set is kept.
    complement: bool,
}

impl SymbolTranslator for DeleteOperation {
    /// Keeps `c` exactly when its membership in `bset` equals `complement`:
    /// without `complement` members are dropped, with it non-members are
    /// dropped. The previous character is ignored.
    fn translate(&self, c: char, _prev_c: char) -> Option<char> {
        let listed = self.bset.contains(c as usize);
        match (self.complement, listed) {
            (true, true) | (false, false) => Some(c),
            _ => None,
        }
    }
}
/// Collapses runs of a repeated character into a single occurrence.
struct SqueezeOperation {
    /// Code points of the squeeze set.
    squeeze_set: BitSet,
    /// When set, characters *outside* `squeeze_set` are the ones squeezed.
    complement: bool,
}

impl SqueezeOperation {
    /// Builds the operation from an expanded set and the complement flag.
    fn new(squeeze_set: ExpandSet, complement: bool) -> SqueezeOperation {
        let squeeze_set: BitSet = squeeze_set.map(|ch| ch as usize).collect();
        SqueezeOperation {
            squeeze_set,
            complement,
        }
    }
}

impl SymbolTranslator for SqueezeOperation {
    /// Drops `c` when it repeats `prev_c` and is subject to squeezing;
    /// otherwise passes it through unchanged.
    fn translate(&self, c: char, prev_c: char) -> Option<char> {
        // Only a repeat of the previous character can ever be squeezed.
        if prev_c != c {
            return Some(c);
        }
        let in_set = self.squeeze_set.contains(c as usize);
        if self.complement != in_set {
            None
        } else {
            Some(c)
        }
    }
}
/// Deletion and squeezing combined into a single pass.
struct DeleteAndSqueezeOperation {
    /// Code points of the delete set.
    delete_set: BitSet,
    /// Code points of the squeeze set.
    squeeze_set: BitSet,
    /// Inverts the delete test only; the squeeze test is unaffected.
    complement: bool,
}

impl DeleteAndSqueezeOperation {
    /// Builds the operation from the two expanded sets and the complement flag.
    fn new(
        delete_set: ExpandSet,
        squeeze_set: ExpandSet,
        complement: bool,
    ) -> DeleteAndSqueezeOperation {
        let delete_set: BitSet = delete_set.map(|ch| ch as usize).collect();
        let squeeze_set: BitSet = squeeze_set.map(|ch| ch as usize).collect();
        DeleteAndSqueezeOperation {
            delete_set,
            squeeze_set,
            complement,
        }
    }
}

impl SymbolTranslator for DeleteAndSqueezeOperation {
    /// Drops `c` if the (possibly complemented) delete test selects it, or if
    /// it repeats `prev_c` and belongs to the squeeze set; keeps it otherwise.
    fn translate(&self, c: char, prev_c: char) -> Option<char> {
        let code = c as usize;

        let deleted = self.complement != self.delete_set.contains(code);
        if deleted {
            return None;
        }

        let squeezed = prev_c == c && self.squeeze_set.contains(code);
        if squeezed {
            None
        } else {
            Some(c)
        }
    }
}
/// Maps characters of a first set onto characters of a second set.
struct TranslateOperation {
    /// Replacement for each character of the first set, keyed by code point.
    translate_map: FnvHashMap<usize, char>,
    /// When set, characters *not* in the map are replaced by `s2_last` and
    /// mapped characters are left alone.
    complement: bool,
    /// Final character of the second set (or the last one consumed from it).
    s2_last: char,
}

impl TranslateOperation {
    /// Pairs up `set1` with `set2` position by position.
    ///
    /// Once `set2` runs out, remaining `set1` characters map to themselves if
    /// `truncate` is set, and to the last character taken from `set2`
    /// otherwise (`'_'` if `set2` produced nothing at all).
    fn new(
        set1: ExpandSet,
        set2: &mut ExpandSet,
        truncate: bool,
        complement: bool,
    ) -> TranslateOperation {
        let mut translate_map = FnvHashMap::default();
        let mut s2_prev = '_';

        for ch in set1 {
            let target = match set2.next() {
                // set2 exhausted and truncation requested: identity mapping.
                None if truncate => ch,
                next => {
                    s2_prev = next.unwrap_or(s2_prev);
                    s2_prev
                }
            };
            translate_map.insert(ch as usize, target);
        }

        // Whatever is left of set2 decides the complement replacement.
        let s2_last = set2.last().unwrap_or(s2_prev);

        TranslateOperation {
            translate_map,
            complement,
            s2_last,
        }
    }
}

impl SymbolTranslator for TranslateOperation {
    /// Always returns `Some`: the mapped character, the character itself, or
    /// `s2_last`, depending on `complement` and whether `c` is in the map.
    fn translate(&self, c: char, _prev_c: char) -> Option<char> {
        let mapped = self.translate_map.get(&(c as usize));
        let out = match (self.complement, mapped) {
            (true, Some(_)) => c,
            (true, None) => self.s2_last,
            (false, Some(&replacement)) => replacement,
            (false, None) => c,
        };
        Some(out)
    }
}
/// Translation followed by squeezing of the translated output.
struct TranslateAndSqueezeOperation {
    translate: TranslateOperation,
    squeeze: SqueezeOperation,
}

impl TranslateAndSqueezeOperation {
    /// Builds both stages from `sets[0]` and `sets[1]` (panics if either is
    /// missing). The squeeze stage uses the first set when `complement` is
    /// set and the second set otherwise.
    fn new(sets: Vec<String>, truncate: bool, complement: bool) -> TranslateAndSqueezeOperation {
        let first = sets[0].as_str();
        let second = sets[1].as_str();
        let squeeze_source = if complement { first } else { second };

        let mut set2 = ExpandSet::new(second);
        let translate =
            TranslateOperation::new(ExpandSet::new(first), &mut set2, truncate, complement);
        let squeeze = SqueezeOperation::new(ExpandSet::new(squeeze_source), complement);

        TranslateAndSqueezeOperation { translate, squeeze }
    }
}

impl SymbolTranslator for TranslateAndSqueezeOperation {
    /// Translates `c`, then lets the squeeze stage decide whether the
    /// translated character is a repeat of `prev_c`.
    fn translate(&self, c: char, prev_c: char) -> Option<char> {
        // The translate stage never returns `None`, so this `unwrap()` cannot
        // panic; it ignores its previous-character argument.
        let translated = self.translate.translate(c, '\0').unwrap();
        self.squeeze.translate(translated, prev_c)
    }
}
/// Streams `input` to `output` line by line, passing every character through
/// `translator`.
///
/// Characters for which the translator returns `None` are dropped. Reading
/// stops at end of input or at the first read error; a failed write panics.
fn translate_input<T: SymbolTranslator>(
    input: &mut dyn BufRead,
    output: &mut dyn Write,
    translator: T,
) {
    let mut buf = String::with_capacity(BUFFER_LEN + 4);
    let mut output_buf = String::with_capacity(BUFFER_LEN + 4);

    // The last character emitted so far. It is kept across lines so that
    // squeezing also works over line boundaries (e.g. collapsing runs of
    // '\n'); resetting it for every line would make each line's first
    // character look like the start of a fresh run.
    let mut prev_c = 0 as char;

    while let Ok(length) = input.read_line(&mut buf) {
        if length == 0 {
            break;
        }

        let filtered = buf.chars().filter_map(|c| {
            let res = translator.translate(c, prev_c);
            // Track the post-translate character so that a squeeze stage
            // running after a translate stage compares translated output.
            if let Some(rc) = res {
                prev_c = rc;
            }
            res
        });
        output_buf.extend(filtered);
        output.write_all(output_buf.as_bytes()).unwrap();

        buf.clear();
        output_buf.clear();
    }
}
fn get_usage() -> String { fn get_usage() -> String {
format!("{} [OPTION]... SET1 [SET2]", executable!()) format!("{} [OPTION]... SET1 [SET2]", executable!())
} }
@ -280,12 +92,19 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
if delete_flag { if delete_flag {
if squeeze_flag { if squeeze_flag {
let op = DeleteAndSqueezeOperation::new( let mut delete_buffer = vec![];
ExpandSet::new(sets[0].as_ref()), {
ExpandSet::new(sets[1].as_ref()), let mut delete_writer = BufWriter::new(&mut delete_buffer);
complement_flag, let delete_op =
); DeleteOperationNew::new(Sequence::parse_set_string(&sets[0]), complement_flag);
translate_input(&mut locked_stdin, &mut buffered_stdout, op); translate_input_new(&mut locked_stdin, &mut delete_writer, delete_op);
}
{
let mut squeeze_reader = BufReader::new(delete_buffer.as_bytes());
let squeeze_op =
SqueezeOperationNew::new(Sequence::parse_set_string(&sets[1]), complement_flag);
translate_input_new(&mut squeeze_reader, &mut buffered_stdout, squeeze_op);
}
} else { } else {
let op = DeleteOperationNew::new(Sequence::parse_set_string(&sets[0]), complement_flag); let op = DeleteOperationNew::new(Sequence::parse_set_string(&sets[0]), complement_flag);
translate_input_new(&mut locked_stdin, &mut buffered_stdout, op); translate_input_new(&mut locked_stdin, &mut buffered_stdout, op);
@ -294,10 +113,26 @@ pub fn uumain(args: impl uucore::Args) -> i32 {
if sets.len() < 2 { if sets.len() < 2 {
let op = let op =
SqueezeOperationNew::new(Sequence::parse_set_string(&sets[0]), complement_flag); SqueezeOperationNew::new(Sequence::parse_set_string(&sets[0]), complement_flag);
translate_input_new(&mut locked_stdin, &mut buffered_stdout, op); translate_input_new(&mut locked_stdin, &mut buffered_stdout, op);
} else { } else {
let op = TranslateAndSqueezeOperation::new(sets, truncate_set1_flag, complement_flag); let mut translate_buffer = vec![];
translate_input(&mut locked_stdin, &mut buffered_stdout, op); {
let mut writer = BufWriter::new(&mut translate_buffer);
let translate_op = TranslateOperationNew::new(
Sequence::parse_set_string(&sets[0]),
Sequence::parse_set_string(&sets[1]),
truncate_set1_flag,
complement_flag,
);
translate_input_new(&mut locked_stdin, &mut writer, translate_op);
}
{
let mut reader = BufReader::new(translate_buffer.as_bytes());
let squeeze_op =
SqueezeOperationNew::new(Sequence::parse_set_string(&sets[1]), false);
translate_input_new(&mut reader, &mut buffered_stdout, squeeze_op);
}
} }
} else { } else {
let op = TranslateOperationNew::new( let op = TranslateOperationNew::new(