Finish fast decode/encode by folding in Z85

This commit is contained in:
Andrew Liebenow 2024-09-22 08:28:20 -05:00
parent 5cd050665d
commit cdebd24733
5 changed files with 242 additions and 282 deletions

View file

@ -7,14 +7,12 @@
use clap::{crate_version, Arg, ArgAction, Command}; use clap::{crate_version, Arg, ArgAction, Command};
use std::fs::File; use std::fs::File;
use std::io::{stdout, Read, Write}; use std::io::{BufReader, Read, Stdin};
use std::io::{BufReader, Stdin};
use std::path::Path; use std::path::Path;
use uucore::display::Quotable; use uucore::display::Quotable;
use uucore::encoding::{decode_z_eight_five, encode_z_eight_five, BASE2LSBF, BASE2MSBF};
use uucore::encoding::{ use uucore::encoding::{
for_fast_encode::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER}, for_fast_encode::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER},
wrap_print, EncodeError, Format, Format, ZEightFiveWrapper, BASE2LSBF, BASE2MSBF,
}; };
use uucore::error::{FromIo, UResult, USimpleError, UUsageError}; use uucore::error::{FromIo, UResult, USimpleError, UUsageError};
use uucore::format_usage; use uucore::format_usage;
@ -172,130 +170,134 @@ pub fn handle_input<R: Read>(
// "The quick brown fox" // "The quick brown fox"
// is 19 characters, which is not divisible by 3, so its Base64 representation has padding: // is 19 characters, which is not divisible by 3, so its Base64 representation has padding:
// "VGhlIHF1aWNrIGJyb3duIGZveA==" // "VGhlIHF1aWNrIGJyb3duIGZveA=="
// The encoding logic in this function depends on these constants being correct, so do not modify //
// them. Performance can be tuned by multiplying these numbers by a different multiple (see // The encoding performed by `fast_encode` depends on these constants being correct. Performance can be tuned by
// `DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above). // multiplying these numbers by a different multiple (see `DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above).
const BASE16_UN_PADDED_MULTIPLE: usize = 1_usize; const BASE16_UN_PADDED_MULTIPLE: usize = 1_usize;
const BASE2_UN_PADDED_MULTIPLE: usize = 1_usize; const BASE2_UN_PADDED_MULTIPLE: usize = 1_usize;
const BASE32_UN_PADDED_MULTIPLE: usize = 5_usize; const BASE32_UN_PADDED_MULTIPLE: usize = 5_usize;
const BASE64_UN_PADDED_MULTIPLE: usize = 3_usize; const BASE64_UN_PADDED_MULTIPLE: usize = 3_usize;
const Z85_UN_PADDED_MULTIPLE: usize = 4_usize;
const BASE16_ENCODE_IN_CHUNKS_OF_SIZE: usize = // Similar to above, but for decoding
BASE16_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
const BASE2_ENCODE_IN_CHUNKS_OF_SIZE: usize =
BASE2_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
const BASE32_ENCODE_IN_CHUNKS_OF_SIZE: usize =
BASE32_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
const BASE64_ENCODE_IN_CHUNKS_OF_SIZE: usize =
BASE64_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
const BASE16_VALID_DECODING_MULTIPLE: usize = 2_usize; const BASE16_VALID_DECODING_MULTIPLE: usize = 2_usize;
const BASE2_VALID_DECODING_MULTIPLE: usize = 8_usize; const BASE2_VALID_DECODING_MULTIPLE: usize = 8_usize;
const BASE32_VALID_DECODING_MULTIPLE: usize = 8_usize; const BASE32_VALID_DECODING_MULTIPLE: usize = 8_usize;
const BASE64_VALID_DECODING_MULTIPLE: usize = 4_usize; const BASE64_VALID_DECODING_MULTIPLE: usize = 4_usize;
const Z85_VALID_DECODING_MULTIPLE: usize = 5_usize;
const BASE16_DECODE_IN_CHUNKS_OF_SIZE: usize =
BASE16_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
const BASE2_DECODE_IN_CHUNKS_OF_SIZE: usize =
BASE2_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
const BASE32_DECODE_IN_CHUNKS_OF_SIZE: usize =
BASE32_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
const BASE64_DECODE_IN_CHUNKS_OF_SIZE: usize =
BASE64_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
if decode { if decode {
let encoding_and_decode_in_chunks_of_size_and_alphabet: (_, _, &[u8]) = match format { let (encoding, valid_decoding_multiple, alphabet): (_, _, &[u8]) = match format {
// Use naive approach for Z85, since the crate being used doesn't have the API needed // Use naive approach (now only semi-naive) for Z85, since the crate being used doesn't have the API
// needed
Format::Z85 => { Format::Z85 => {
let result = match decode_z_eight_five(input, ignore_garbage) { // spell-checker:disable-next-line
Ok(ve) => { let alphabet = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#";
if stdout().write_all(&ve).is_err() {
// on windows console, writing invalid utf8 returns an error
return Err(USimpleError::new(
1_i32,
"error: cannot write non-utf8 data",
));
}
Ok(()) fast_decode::fast_decode(
} input,
Err(_) => Err(USimpleError::new(1_i32, "error: invalid input")), (
}; ZEightFiveWrapper {},
Z85_VALID_DECODING_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE,
alphabet,
),
ignore_garbage,
)?;
return result; return Ok(());
} }
// For these, use faster, new decoding logic // For these, use faster, new decoding logic
Format::Base16 => ( Format::Base16 => (
HEXUPPER, HEXUPPER,
BASE16_DECODE_IN_CHUNKS_OF_SIZE, BASE16_VALID_DECODING_MULTIPLE,
// spell-checker:disable-next-line
b"0123456789ABCDEF", b"0123456789ABCDEF",
), ),
Format::Base2Lsbf => (BASE2LSBF, BASE2_DECODE_IN_CHUNKS_OF_SIZE, b"01"), Format::Base2Lsbf => (
Format::Base2Msbf => (BASE2MSBF, BASE2_DECODE_IN_CHUNKS_OF_SIZE, b"01"), BASE2LSBF,
BASE2_VALID_DECODING_MULTIPLE,
// spell-checker:disable-next-line
b"01",
),
Format::Base2Msbf => (
BASE2MSBF,
BASE2_VALID_DECODING_MULTIPLE,
// spell-checker:disable-next-line
b"01",
),
Format::Base32 => ( Format::Base32 => (
BASE32, BASE32,
BASE32_DECODE_IN_CHUNKS_OF_SIZE, BASE32_VALID_DECODING_MULTIPLE,
// spell-checker:disable-next-line
b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=", b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=",
), ),
Format::Base32Hex => ( Format::Base32Hex => (
BASE32HEX, BASE32HEX,
BASE32_DECODE_IN_CHUNKS_OF_SIZE, BASE32_VALID_DECODING_MULTIPLE,
// spell-checker:disable-next-line // spell-checker:disable-next-line
b"0123456789ABCDEFGHIJKLMNOPQRSTUV=", b"0123456789ABCDEFGHIJKLMNOPQRSTUV=",
), ),
Format::Base64 => ( Format::Base64 => (
BASE64, BASE64,
BASE64_DECODE_IN_CHUNKS_OF_SIZE, BASE64_VALID_DECODING_MULTIPLE,
// spell-checker:disable-next-line
b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=+/", b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=+/",
), ),
Format::Base64Url => ( Format::Base64Url => (
BASE64URL, BASE64URL,
BASE64_DECODE_IN_CHUNKS_OF_SIZE, BASE64_VALID_DECODING_MULTIPLE,
// spell-checker:disable-next-line
b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=_-", b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789=_-",
), ),
}; };
fast_decode::fast_decode( fast_decode::fast_decode(
input, input,
encoding_and_decode_in_chunks_of_size_and_alphabet, (
encoding,
valid_decoding_multiple * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE,
alphabet,
),
ignore_garbage, ignore_garbage,
)?; )?;
Ok(()) Ok(())
} else { } else {
let encoding_and_encode_in_chunks_of_size = match format { let (encoding, un_padded_multiple) = match format {
// Use naive approach for Z85, since the crate being used doesn't have the API needed // Use naive approach for Z85 (now only semi-naive), since the crate being used doesn't have the API
// needed
Format::Z85 => { Format::Z85 => {
let result = match encode_z_eight_five(input) { fast_encode::fast_encode(
Ok(st) => { input,
wrap_print(&st, wrap.unwrap_or(WRAP_DEFAULT))?; (
ZEightFiveWrapper {},
Z85_UN_PADDED_MULTIPLE * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE,
),
wrap,
)?;
Ok(()) return Ok(());
}
Err(EncodeError::InvalidInput) => {
Err(USimpleError::new(1_i32, "error: invalid input"))
}
Err(_) => Err(USimpleError::new(
1_i32,
"error: invalid input (length must be multiple of 4 characters)",
)),
};
return result;
} }
// For these, use faster, new encoding logic // For these, use faster, new encoding logic
Format::Base16 => (HEXUPPER, BASE16_ENCODE_IN_CHUNKS_OF_SIZE), Format::Base16 => (HEXUPPER, BASE16_UN_PADDED_MULTIPLE),
Format::Base2Lsbf => (BASE2LSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), Format::Base2Lsbf => (BASE2LSBF, BASE2_UN_PADDED_MULTIPLE),
Format::Base2Msbf => (BASE2MSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE), Format::Base2Msbf => (BASE2MSBF, BASE2_UN_PADDED_MULTIPLE),
Format::Base32 => (BASE32, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), Format::Base32 => (BASE32, BASE32_UN_PADDED_MULTIPLE),
Format::Base32Hex => (BASE32HEX, BASE32_ENCODE_IN_CHUNKS_OF_SIZE), Format::Base32Hex => (BASE32HEX, BASE32_UN_PADDED_MULTIPLE),
Format::Base64 => (BASE64, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), Format::Base64 => (BASE64, BASE64_UN_PADDED_MULTIPLE),
Format::Base64Url => (BASE64URL, BASE64_ENCODE_IN_CHUNKS_OF_SIZE), Format::Base64Url => (BASE64URL, BASE64_UN_PADDED_MULTIPLE),
}; };
fast_encode::fast_encode(input, encoding_and_encode_in_chunks_of_size, wrap)?; fast_encode::fast_encode(
input,
(
encoding,
un_padded_multiple * DECODE_AND_ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE,
),
wrap,
)?;
Ok(()) Ok(())
} }
@ -306,29 +308,18 @@ mod fast_encode {
use std::{ use std::{
collections::VecDeque, collections::VecDeque,
io::{self, ErrorKind, Read, StdoutLock, Write}, io::{self, ErrorKind, Read, StdoutLock, Write},
num::{NonZero, NonZeroUsize},
}; };
use uucore::{ use uucore::{
encoding::for_fast_encode::Encoding, encoding::SupportsFastEncode,
error::{UResult, USimpleError}, error::{UResult, USimpleError},
}; };
struct LineWrapping { struct LineWrapping {
line_length: usize, line_length: NonZeroUsize,
print_buffer: Vec<u8>, print_buffer: Vec<u8>,
} }
// Start of helper functions
// Adapted from `encode_append` in the "data-encoding" crate
/// Encodes `input` with `encoding` and appends the encoded bytes to the back of `output`.
fn encode_append_vec_deque(encoding: &Encoding, input: &[u8], output: &mut VecDeque<u8>) {
    // Everything before this index is already-encoded data that must be left untouched
    let already_encoded = output.len();
    let newly_encoded = encoding.encode_len(input.len());

    // Grow the deque with zeroed placeholder bytes, which `encode_mut` overwrites below
    output.resize(already_encoded + newly_encoded, 0_u8);

    // `encode_mut` writes into a single slice, so the deque is made contiguous first
    let (_, destination) = output.make_contiguous().split_at_mut(already_encoded);

    encoding.encode_mut(input, destination);
}
fn write_without_line_breaks( fn write_without_line_breaks(
encoded_buffer: &mut VecDeque<u8>, encoded_buffer: &mut VecDeque<u8>,
stdout_lock: &mut StdoutLock, stdout_lock: &mut StdoutLock,
@ -358,9 +349,7 @@ mod fast_encode {
stdout_lock: &mut StdoutLock, stdout_lock: &mut StdoutLock,
is_cleanup: bool, is_cleanup: bool,
) -> io::Result<()> { ) -> io::Result<()> {
let line_length_usize = *line_length; let line_length_usize = line_length.get();
assert!(line_length_usize > 0_usize);
let make_contiguous_result = encoded_buffer.make_contiguous(); let make_contiguous_result = encoded_buffer.make_contiguous();
@ -420,9 +409,9 @@ mod fast_encode {
// https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L1710 // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L1710
// //
/// `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied /// `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied
pub fn fast_encode<R: Read>( pub fn fast_encode<R: Read, S: SupportsFastEncode>(
input: &mut R, input: &mut R,
(encoding, encode_in_chunks_of_size): (Encoding, usize), (supports_fast_encode, encode_in_chunks_of_size): (S, usize),
line_wrap: Option<usize>, line_wrap: Option<usize>,
) -> UResult<()> { ) -> UResult<()> {
/// Rust uses 8 kibibytes /// Rust uses 8 kibibytes
@ -435,12 +424,12 @@ mod fast_encode {
Some(0_usize) => None, Some(0_usize) => None,
// A custom line wrapping value was passed // A custom line wrapping value was passed
Some(an) => Some(LineWrapping { Some(an) => Some(LineWrapping {
line_length: an, line_length: NonZero::new(an).unwrap(),
print_buffer: Vec::<u8>::new(), print_buffer: Vec::<u8>::new(),
}), }),
// Line wrapping was not set, so the default is used // Line wrapping was not set, so the default is used
None => Some(LineWrapping { None => Some(LineWrapping {
line_length: WRAP_DEFAULT, line_length: NonZero::new(WRAP_DEFAULT).unwrap(),
print_buffer: Vec::<u8>::new(), print_buffer: Vec::<u8>::new(),
}), }),
}; };
@ -493,11 +482,10 @@ mod fast_encode {
// Encode the old unencoded data and the stolen bytes, and add the result to // Encode the old unencoded data and the stolen bytes, and add the result to
// `encoded_buffer` // `encoded_buffer`
encode_append_vec_deque( supports_fast_encode.encode_to_vec_deque(
&encoding,
leftover_buffer.make_contiguous(), leftover_buffer.make_contiguous(),
&mut encoded_buffer, &mut encoded_buffer,
); )?;
// Reset `leftover_buffer` // Reset `leftover_buffer`
leftover_buffer.clear(); leftover_buffer.clear();
@ -515,7 +503,7 @@ mod fast_encode {
for sl in chunks_exact { for sl in chunks_exact {
assert!(sl.len() == encode_in_chunks_of_size); assert!(sl.len() == encode_in_chunks_of_size);
encode_append_vec_deque(&encoding, sl, &mut encoded_buffer); supports_fast_encode.encode_to_vec_deque(sl, &mut encoded_buffer)?;
} }
leftover_buffer.extend(remainder); leftover_buffer.extend(remainder);
@ -544,11 +532,8 @@ mod fast_encode {
// `input` has finished producing data, so the data remaining in the buffers needs to be encoded and printed // `input` has finished producing data, so the data remaining in the buffers needs to be encoded and printed
{ {
// Encode all remaining unencoded bytes, placing them in `encoded_buffer` // Encode all remaining unencoded bytes, placing them in `encoded_buffer`
encode_append_vec_deque( supports_fast_encode
&encoding, .encode_to_vec_deque(leftover_buffer.make_contiguous(), &mut encoded_buffer)?;
leftover_buffer.make_contiguous(),
&mut encoded_buffer,
);
// Write all data in `encoded_buffer` to stdout // Write all data in `encoded_buffer` to stdout
// `is_cleanup` triggers special cleanup-only logic // `is_cleanup` triggers special cleanup-only logic
@ -567,41 +552,45 @@ mod fast_encode {
mod fast_decode { mod fast_decode {
use std::io::{self, ErrorKind, Read, StdoutLock, Write}; use std::io::{self, ErrorKind, Read, StdoutLock, Write};
use uucore::{ use uucore::{
encoding::{alphabet_to_table, for_fast_encode::Encoding}, encoding::SupportsFastDecode,
error::{UResult, USimpleError}, error::{UResult, USimpleError},
}; };
// Data used to filter garbage bytes out of the input before it is decoded
struct FilteringData {
// Lookup table indexed by byte value: `true` means the byte is kept (it is not garbage)
table: [bool; 256_usize],
}
// Start of helper functions // Start of helper functions
// Adapted from `decode` in the "data-encoding" crate pub fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256_usize] {
fn decode_into_vec(encoding: &Encoding, input: &[u8], output: &mut Vec<u8>) -> UResult<()> { // If "ignore_garbage" is enabled, all characters outside the alphabet are ignored
let decode_len_result = match encoding.decode_len(input.len()) { // If it is not enabled, only '\n' and '\r' are ignored
Ok(us) => us, if ignore_garbage {
Err(de) => { // Note: "false" here
return Err(USimpleError::new(1_i32, format!("{de}"))); let mut table = [false; 256_usize];
}
};
let output_len = output.len(); // Pass through no characters except those in the alphabet
for ue in alphabet {
let us = usize::from(*ue);
output.resize(output_len + decode_len_result, 0_u8); // Should not have been set yet
assert!(!table[us]);
match encoding.decode_mut(input, &mut (output[output_len..])) { table[us] = true;
Ok(us) => {
// See:
// https://docs.rs/data-encoding/latest/data_encoding/struct.Encoding.html#method.decode_mut
// "Returns the length of the decoded output. This length may be smaller than the output length if the input contained padding or ignored characters. The output bytes after the returned length are not initialized and should not be read."
output.truncate(output_len + us);
} }
Err(_de) => {
return Err(USimpleError::new(1_i32, "error: invalid input".to_owned())); table
} else {
// Note: "true" here
let mut table = [true; 256_usize];
// Pass through all characters except '\n' and '\r'
for ue in [b'\n', b'\r'] {
let us = usize::from(ue);
// Should not have been set yet
assert!(table[us]);
table[us] = false;
} }
table
} }
Ok(())
} }
fn write_to_stdout( fn write_to_stdout(
@ -619,9 +608,9 @@ mod fast_decode {
/// `encoding`, `decode_in_chunks_of_size`, and `alphabet` are passed in a tuple to indicate that they are /// `encoding`, `decode_in_chunks_of_size`, and `alphabet` are passed in a tuple to indicate that they are
/// logically tied /// logically tied
pub fn fast_decode<R: Read>( pub fn fast_decode<R: Read, S: SupportsFastDecode>(
input: &mut R, input: &mut R,
(encoding, decode_in_chunks_of_size, alphabet): (Encoding, usize, &[u8]), (supports_fast_decode, decode_in_chunks_of_size, alphabet): (S, usize, &[u8]),
ignore_garbage: bool, ignore_garbage: bool,
) -> UResult<()> { ) -> UResult<()> {
/// Rust uses 8 kibibytes /// Rust uses 8 kibibytes
@ -634,30 +623,12 @@ mod fast_decode {
// passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also // passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also
// allows execution to stay on the happy path in "data-encoding": // allows execution to stay on the happy path in "data-encoding":
// https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756 // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756
let (encoding_to_use, filter_data_option) = { // Update: it is not even worth it to use "data-encoding"'s ignore functionality when "ignore_garbage" is
if ignore_garbage { // false.
// Note that the alphabet constants above already include the padding characters // Note that the alphabet constants above already include the padding characters
// TODO // TODO
// Precompute this // Precompute this
let table = alphabet_to_table(alphabet); let table = alphabet_to_table(alphabet, ignore_garbage);
(encoding, Some(FilteringData { table }))
} else {
let mut sp = encoding.specification();
// '\n' and '\r' are always ignored
sp.ignore = "\n\r".to_owned();
let en = match sp.encoding() {
Ok(en) => en,
Err(sp) => {
return Err(USimpleError::new(1_i32, format!("{sp}")));
}
};
(en, None)
}
};
// Start of buffers // Start of buffers
// Data that was read from stdin // Data that was read from stdin
@ -689,40 +660,24 @@ mod fast_decode {
// The part of `input_buffer` that was actually filled by the call to `read` // The part of `input_buffer` that was actually filled by the call to `read`
let read_buffer = &input_buffer[..bytes_read_from_input]; let read_buffer = &input_buffer[..bytes_read_from_input];
if let Some(fi) = &filter_data_option { // First just scan the data for the happy path
let FilteringData { table } = fi; // Note: this happy path check has not been validated with performance testing
let found_garbage = read_buffer.iter().any(|ue| {
// Garbage, since it was not found in the table
!table[usize::from(*ue)]
});
let table_to_owned = table.to_owned(); if found_garbage {
non_garbage_buffer.clear();
// First just scan the data for the happy path
// Note: this happy path check has not been validated with performance testing
let mut found_garbage = false;
for ue in read_buffer { for ue in read_buffer {
if table_to_owned[usize::from(*ue)] { if table[usize::from(*ue)] {
// Not garbage, since it was found in the table // Not garbage, since it was found in the table
continue; non_garbage_buffer.push(*ue);
} else {
found_garbage = true;
break;
} }
} }
if found_garbage { non_garbage_buffer.as_slice()
non_garbage_buffer.clear();
for ue in read_buffer {
if table_to_owned[usize::from(*ue)] {
// Not garbage, since it was found in the table
non_garbage_buffer.push(*ue);
}
}
non_garbage_buffer.as_slice()
} else {
read_buffer
}
} else { } else {
read_buffer read_buffer
} }
@ -751,11 +706,8 @@ mod fast_decode {
// Decode the old un-decoded data and the stolen bytes, and add the result to // Decode the old un-decoded data and the stolen bytes, and add the result to
// `decoded_buffer` // `decoded_buffer`
decode_into_vec( supports_fast_decode
&encoding_to_use, .decode_into_vec(&leftover_buffer, &mut decoded_buffer)?;
&leftover_buffer,
&mut decoded_buffer,
)?;
// Reset `leftover_buffer` // Reset `leftover_buffer`
leftover_buffer.clear(); leftover_buffer.clear();
@ -773,7 +725,7 @@ mod fast_decode {
for sl in chunks_exact { for sl in chunks_exact {
assert!(sl.len() == decode_in_chunks_of_size); assert!(sl.len() == decode_in_chunks_of_size);
decode_into_vec(&encoding_to_use, sl, &mut decoded_buffer)?; supports_fast_decode.decode_into_vec(sl, &mut decoded_buffer)?;
} }
leftover_buffer.extend(remainder); leftover_buffer.extend(remainder);
@ -797,7 +749,7 @@ mod fast_decode {
// `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed // `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed
{ {
// Decode all remaining encoded bytes, placing them in `decoded_buffer` // Decode all remaining encoded bytes, placing them in `decoded_buffer`
decode_into_vec(&encoding_to_use, &leftover_buffer, &mut decoded_buffer)?; supports_fast_decode.decode_into_vec(&leftover_buffer, &mut decoded_buffer)?;
// Write all data in `decoded_buffer` to stdout // Write all data in `decoded_buffer` to stdout
write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?; write_to_stdout(&mut decoded_buffer, &mut stdout_lock)?;

View file

@ -111,7 +111,7 @@ where
OutputFormat::Hexadecimal => sum_hex, OutputFormat::Hexadecimal => sum_hex,
OutputFormat::Base64 => match options.algo_name { OutputFormat::Base64 => match options.algo_name {
ALGORITHM_OPTIONS_CRC | ALGORITHM_OPTIONS_SYSV | ALGORITHM_OPTIONS_BSD => sum_hex, ALGORITHM_OPTIONS_CRC | ALGORITHM_OPTIONS_SYSV | ALGORITHM_OPTIONS_BSD => sum_hex,
_ => encoding::encode_base_six_four(&hex::decode(sum_hex).unwrap()), _ => encoding::for_cksum::BASE64.encode(&hex::decode(sum_hex).unwrap()),
}, },
}; };
// The BSD checksum output is 5 digit integer // The BSD checksum output is 5 digit integer

View file

@ -77,7 +77,7 @@ default = []
backup-control = [] backup-control = []
colors = [] colors = []
checksum = ["data-encoding", "thiserror", "regex", "sum"] checksum = ["data-encoding", "thiserror", "regex", "sum"]
encoding = ["data-encoding", "data-encoding-macro", "z85", "thiserror"] encoding = ["data-encoding", "data-encoding-macro", "z85"]
entries = ["libc"] entries = ["libc"]
fs = ["dunce", "libc", "winapi-util", "windows-sys"] fs = ["dunce", "libc", "winapi-util", "windows-sys"]
fsext = ["libc", "windows-sys"] fsext = ["libc", "windows-sys"]

View file

@ -3,25 +3,20 @@
// For the full copyright and license information, please view the LICENSE // For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code. // file that was distributed with this source code.
// spell-checker:ignore (strings) ABCDEFGHIJKLMNOPQRSTUVWXYZ ABCDEFGHIJKLMNOPQRSTUV
// spell-checker:ignore (encodings) lsbf msbf // spell-checker:ignore (encodings) lsbf msbf
use data_encoding::{Encoding, BASE64}; use crate::error::{UResult, USimpleError};
use data_encoding::Encoding;
use data_encoding_macro::new_encoding; use data_encoding_macro::new_encoding;
use std::{ use std::collections::VecDeque;
error::Error,
io::{self, Read, Write},
};
// Re-export for the faster encoding logic // Re-export for the faster encoding logic
pub mod for_fast_encode { pub mod for_fast_encode {
pub use data_encoding::*; pub use data_encoding::*;
} }
#[derive(Debug)] pub mod for_cksum {
pub enum EncodeError { pub use data_encoding::BASE64;
Z85InputLenNotMultipleOf4,
InvalidInput,
} }
#[derive(Clone, Copy)] #[derive(Clone, Copy)]
@ -46,96 +41,104 @@ pub const BASE2MSBF: Encoding = new_encoding! {
bit_order: MostSignificantFirst, bit_order: MostSignificantFirst,
}; };
pub fn encode_base_six_four(input: &[u8]) -> String { pub struct ZEightFiveWrapper {}
BASE64.encode(input)
pub trait SupportsFastEncode {
fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque<u8>) -> UResult<()>;
} }
pub fn decode_z_eight_five<R: Read>( impl SupportsFastEncode for ZEightFiveWrapper {
mut input: R, fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque<u8>) -> UResult<()> {
ignore_garbage: bool, // According to the spec we should not accept inputs whose len is not a multiple of 4.
) -> Result<Vec<u8>, Box<dyn Error>> { // However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them.
const Z_EIGHT_FIVE_ALPHABET: &[u8; 85_usize] = if input.len() % 4_usize != 0_usize {
b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#"; return Err(USimpleError::new(
1_i32,
"error: invalid input (length must be multiple of 4 characters)".to_owned(),
));
}
let mut buf = Vec::<u8>::new(); let string = z85::encode(input);
input.read_to_end(&mut buf)?; output.extend(string.as_bytes());
if ignore_garbage { Ok(())
let table = alphabet_to_table(Z_EIGHT_FIVE_ALPHABET); }
buf.retain(|&ue| table[usize::from(ue)]);
} else {
buf.retain(|&ue| ue != b'\n' && ue != b'\r');
};
// The z85 crate implements a padded encoding by using a leading '#' which is otherwise not allowed.
// We manually check for a leading '#' and return an error ourselves.
let vec = if buf.starts_with(b"#") {
return Err(Box::from("'#' character at index 0 is invalid".to_owned()));
} else {
z85::decode(buf)?
};
Ok(vec)
} }
pub fn encode_z_eight_five<R: Read>(mut input: R) -> Result<String, EncodeError> { impl SupportsFastEncode for Encoding {
let mut buf = Vec::<u8>::new(); // Adapted from `encode_append` in the "data-encoding" crate
fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque<u8>) -> UResult<()> {
let output_len = output.len();
match input.read_to_end(&mut buf) { output.resize(output_len + self.encode_len(input.len()), 0_u8);
Ok(_) => {
let buf_slice = buf.as_slice();
// According to the spec we should not accept inputs whose len is not a multiple of 4. let make_contiguous_result = output.make_contiguous();
// However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them.
if buf_slice.len() % 4_usize == 0_usize { self.encode_mut(input, &mut (make_contiguous_result[output_len..]));
Ok(z85::encode(buf_slice))
} else { Ok(())
Err(EncodeError::Z85InputLenNotMultipleOf4) }
}
/// Abstraction over the decoders that can be driven chunk-by-chunk by the fast decoding logic.
///
/// Implemented in this file for `ZEightFiveWrapper` (Z85) and for `data_encoding::Encoding`.
pub trait SupportsFastDecode {
/// Decodes `input` and appends the decoded bytes to the end of `output`.
///
/// Returns an error if `input` is not valid encoded data for the implementing format.
fn decode_into_vec(&self, input: &[u8], output: &mut Vec<u8>) -> UResult<()>;
}
impl SupportsFastDecode for ZEightFiveWrapper {
fn decode_into_vec(&self, input: &[u8], output: &mut Vec<u8>) -> UResult<()> {
if input.first() == Some(&b'#') {
return Err(USimpleError::new(1_i32, "error: invalid input".to_owned()));
}
// According to the spec we should not accept inputs whose len is not a multiple of 4.
// However, the z85 crate implements a padded encoding and accepts such inputs. We have to manually check for them.
if input.len() % 4_usize != 0_usize {
return Err(USimpleError::new(
1_i32,
"error: invalid input (length must be multiple of 4 characters)".to_owned(),
));
};
let decode_result = match z85::decode(input) {
Ok(ve) => ve,
Err(_de) => {
return Err(USimpleError::new(1_i32, "error: invalid input".to_owned()));
}
};
output.extend_from_slice(&decode_result);
Ok(())
}
}
impl SupportsFastDecode for Encoding {
// Adapted from `decode` in the "data-encoding" crate
fn decode_into_vec(&self, input: &[u8], output: &mut Vec<u8>) -> UResult<()> {
let decode_len_result = match self.decode_len(input.len()) {
Ok(us) => us,
Err(_de) => {
return Err(USimpleError::new(1_i32, "error: invalid input".to_owned()));
}
};
let output_len = output.len();
output.resize(output_len + decode_len_result, 0_u8);
match self.decode_mut(input, &mut (output[output_len..])) {
Ok(us) => {
// See:
// https://docs.rs/data-encoding/latest/data_encoding/struct.Encoding.html#method.decode_mut
// "Returns the length of the decoded output. This length may be smaller than the output length if the input contained padding or ignored characters. The output bytes after the returned length are not initialized and should not be read."
output.truncate(output_len + us);
}
Err(_de) => {
return Err(USimpleError::new(1_i32, "error: invalid input".to_owned()));
} }
} }
Err(_) => Err(EncodeError::InvalidInput),
Ok(())
} }
} }
/// Writes `res` to stdout, breaking it into lines of at most `line_wrap` bytes.
///
/// A `line_wrap` of 0 disables wrapping: `res` is written as-is, with no trailing newline. Otherwise every
/// emitted line (including the last one) is terminated by a newline.
///
/// # Errors
///
/// Returns any I/O error reported while writing to stdout.
pub fn wrap_print(res: &str, line_wrap: usize) -> io::Result<()> {
    let stdout = io::stdout();
    let mut out = stdout.lock();

    // Wrapping disabled: emit everything in a single write
    if line_wrap == 0 {
        out.write_all(res.as_bytes())?;

        return Ok(());
    }

    // Peel one line at a time off the front of the not-yet-printed text
    let mut remaining = res;

    while !remaining.is_empty() {
        let line_len = line_wrap.min(remaining.len());
        let (line, tail) = remaining.split_at(line_len);

        writeln!(out, "{line}")?;

        remaining = tail;
    }

    Ok(())
}
/// Builds a 256-entry membership table for `alphabet`: entry `b` is `true` exactly when byte `b` occurs in
/// `alphabet`.
///
/// # Panics
///
/// Panics if `alphabet` contains the same byte more than once.
pub fn alphabet_to_table(alphabet: &[u8]) -> [bool; 256_usize] {
    let mut table = [false; 256_usize];

    alphabet
        .iter()
        .map(|&byte| usize::from(byte))
        .for_each(|index| {
            let already_present = std::mem::replace(&mut table[index], true);

            // Each byte of the alphabet must be seen only once
            assert!(!already_present);
        });

    table
}

View file

@ -7,13 +7,18 @@
use crate::common::util::TestScenario; use crate::common::util::TestScenario;
#[test] #[test]
fn test_z85_not_padded() { fn test_z85_not_padded_decode() {
// The z85 crate deviates from the standard in some cases; we have to catch those // The z85 crate deviates from the standard in some cases; we have to catch those
new_ucmd!() new_ucmd!()
.args(&["--z85", "-d"]) .args(&["--z85", "-d"])
.pipe_in("##########") .pipe_in("##########")
.fails() .fails()
.stderr_only("basenc: error: invalid input\n"); .stderr_only("basenc: error: invalid input\n");
}
#[test]
fn test_z85_not_padded_encode() {
// The z85 crate deviates from the standard in some cases; we have to catch those
new_ucmd!() new_ucmd!()
.args(&["--z85"]) .args(&["--z85"])
.pipe_in("123") .pipe_in("123")