basenc: perform faster, streaming encoding

Improve the performance, both in memory and time, of the encoding performed by the basenc (except in --z85 mode), base32, and base64 programs. These programs now perform encoding in a buffered/streaming manner, so encoding is not constrained by the amount of available memory.
2024-12-12 14:22:41 +00:00 · 2024-09-20 14:34:18 -05:00 · 2024-09-20 14:34:18 -05:00 · 846cf06272
commit 846cf06272
parent 50f99580b4
4 changed files with 416 additions and 63 deletions
--- a/src/uu/base32/src/base_common.rs
+++ b/src/uu/base32/src/base_common.rs
@ -3,22 +3,31 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.

-use std::io::{stdout, Read, Write};
+// spell-checker:ignore HEXUPPER Lsbf Msbf

+use clap::{crate_version, Arg, ArgAction, Command};
+use std::fs::File;
+use std::io::{stdout, Read, Write};
+use std::io::{BufReader, Stdin};
+use std::path::Path;
 use uucore::display::Quotable;
-use uucore::encoding::{wrap_print, Data, EncodeError, Format};
+use uucore::encoding::{
+    for_fast_encode::{BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER},
+    wrap_print, Data, EncodeError, Format,
+};
+use uucore::encoding::{BASE2LSBF, BASE2MSBF};
 use uucore::error::{FromIo, UResult, USimpleError, UUsageError};
 use uucore::format_usage;

-use std::fs::File;
-use std::io::{BufReader, Stdin};
-use std::path::Path;
+pub const BASE_CMD_PARSE_ERROR: i32 = 1_i32;

-use clap::{crate_version, Arg, ArgAction, Command};
+/// Encoded output will be formatted in lines of this length (the last line can be shorter)
+///
+/// Other implementations default to 76
+///
+/// This default is only used if no "-w"/"--wrap" argument is passed
+const WRAP_DEFAULT: usize = 76_usize;

-pub static BASE_CMD_PARSE_ERROR: i32 = 1;
-
-// Config.
 pub struct Config {
    pub decode: bool,
    pub ignore_garbage: bool,
@ -118,7 +127,7 @@ pub fn base_app(about: &'static str, usage: &str) -> Command {
                .short('w')
                .long(options::WRAP)
                .value_name("COLS")
-                .help("wrap encoded lines after COLS character (default 76, 0 to disable wrapping)")
+                .help(format!("wrap encoded lines after COLS character (default {WRAP_DEFAULT}, 0 to disable wrapping)"))
                .overrides_with(options::WRAP),
        )
        // "multiple" arguments are used to check whether there is more than one
@ -147,17 +156,43 @@ pub fn get_input<'a>(config: &Config, stdin_ref: &'a Stdin) -> UResult<Box<dyn R
 pub fn handle_input<R: Read>(
    input: &mut R,
    format: Format,
-    line_wrap: Option<usize>,
+    wrap: Option<usize>,
    ignore_garbage: bool,
    decode: bool,
 ) -> UResult<()> {
-    let mut data = Data::new(input, format).ignore_garbage(ignore_garbage);
-    if let Some(wrap) = line_wrap {
-        data = data.line_wrap(wrap);
-    }
+    const ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024_usize;
+
+    // These constants indicate that inputs with lengths divisible by these numbers will have no padding characters
+    // after encoding.
+    // For instance:
+    // "The quick brown"
+    // is 15 characters (divisible by 3), so it is encoded in Base64 without padding:
+    // "VGhlIHF1aWNrIGJyb3du"
+    // While:
+    // "The quick brown fox"
+    // is 19 characters, which is not divisible by 3, so its Base64 representation has padding:
+    // "VGhlIHF1aWNrIGJyb3duIGZveA=="
+    // The encoding logic in this function depends on these constants being correct, so do not modify
+    // them. Performance can be tuned by multiplying these numbers by a different multiple (see
+    // `ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE` above).
+    const BASE16_UN_PADDED_MULTIPLE: usize = 1_usize;
+    const BASE2_UN_PADDED_MULTIPLE: usize = 1_usize;
+    const BASE32_UN_PADDED_MULTIPLE: usize = 5_usize;
+    const BASE64_UN_PADDED_MULTIPLE: usize = 3_usize;
+
+    const BASE16_ENCODE_IN_CHUNKS_OF_SIZE: usize =
+        BASE16_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
+    const BASE2_ENCODE_IN_CHUNKS_OF_SIZE: usize =
+        BASE2_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
+    const BASE32_ENCODE_IN_CHUNKS_OF_SIZE: usize =
+        BASE32_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
+    const BASE64_ENCODE_IN_CHUNKS_OF_SIZE: usize =
+        BASE64_UN_PADDED_MULTIPLE * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;

    if decode {
-        match data.decode() {
+        let mut data = Data::new(input, format);
+
+        match data.decode(ignore_garbage) {
            Ok(s) => {
                // Silent the warning as we want to the error message
                #[allow(clippy::question_mark)]
@ -170,16 +205,307 @@ pub fn handle_input<R: Read>(
            Err(_) => Err(USimpleError::new(1, "error: invalid input")),
        }
    } else {
-        match data.encode() {
-            Ok(s) => {
-                wrap_print(&data, &s);
-                Ok(())
+        #[allow(clippy::identity_op)]
+        let encoding_and_encode_in_chunks_of_size = match format {
+            // Use naive approach for Z85, since the crate being used doesn't have the API needed
+            Format::Z85 => {
+                let mut data = Data::new(input, format);
+
+                let result = match data.encode() {
+                    Ok(st) => {
+                        wrap_print(&st, wrap.unwrap_or(WRAP_DEFAULT))?;
+
+                        Ok(())
+                    }
+                    Err(EncodeError::InvalidInput) => {
+                        Err(USimpleError::new(1, "error: invalid input"))
+                    }
+                    Err(_) => Err(USimpleError::new(
+                        1,
+                        "error: invalid input (length must be multiple of 4 characters)",
+                    )),
+                };
+
+                return result;
            }
-            Err(EncodeError::InvalidInput) => Err(USimpleError::new(1, "error: invalid input")),
-            Err(_) => Err(USimpleError::new(
-                1,
-                "error: invalid input (length must be multiple of 4 characters)",
-            )),
-        }
+
+            // For these, use faster, new encoding logic
+            Format::Base16 => (&HEXUPPER, BASE16_ENCODE_IN_CHUNKS_OF_SIZE),
+            Format::Base2Lsbf => (&BASE2LSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE),
+            Format::Base2Msbf => (&BASE2MSBF, BASE2_ENCODE_IN_CHUNKS_OF_SIZE),
+            Format::Base32 => (&BASE32, BASE32_ENCODE_IN_CHUNKS_OF_SIZE),
+            Format::Base32Hex => (&BASE32HEX, BASE32_ENCODE_IN_CHUNKS_OF_SIZE),
+            Format::Base64 => (&BASE64, BASE64_ENCODE_IN_CHUNKS_OF_SIZE),
+            Format::Base64Url => (&BASE64URL, BASE64_ENCODE_IN_CHUNKS_OF_SIZE),
+        };
+
+        fast_encode::fast_encode(input, encoding_and_encode_in_chunks_of_size, wrap)?;
+
+        Ok(())
+    }
+}
+
+mod fast_encode {
+    use crate::base_common::WRAP_DEFAULT;
+    use std::{
+        collections::VecDeque,
+        io::{self, ErrorKind, Read, StdoutLock, Write},
+    };
+    use uucore::{
+        encoding::for_fast_encode::Encoding,
+        error::{UResult, USimpleError},
+    };
+
+    struct LineWrapping {
+        line_length: usize,
+        print_buffer: Vec<u8>,
+    }
+
+    // Start of helper functions
+    // Adapted from `encode_append` in the "data-encoding" crate
+    fn encode_append_vec_deque(encoding: &Encoding, input: &[u8], output: &mut VecDeque<u8>) {
+        let output_len = output.len();
+
+        output.resize(output_len + encoding.encode_len(input.len()), 0_u8);
+
+        let make_contiguous_result = output.make_contiguous();
+
+        encoding.encode_mut(input, &mut (make_contiguous_result[output_len..]));
+    }
+
+    fn write_without_line_breaks(
+        encoded_buffer: &mut VecDeque<u8>,
+        stdout_lock: &mut StdoutLock,
+        is_cleanup: bool,
+    ) -> io::Result<()> {
+        // TODO
+        // `encoded_buffer` only has to be a VecDeque if line wrapping is enabled
+        // (`make_contiguous` should be a no-op here)
+        // Refactoring could avoid this call
+        stdout_lock.write_all(encoded_buffer.make_contiguous())?;
+
+        if is_cleanup {
+            stdout_lock.write_all(b"\n")?;
+        } else {
+            encoded_buffer.truncate(0_usize);
+        }
+
+        Ok(())
+    }
+
+    fn write_with_line_breaks(
+        &mut LineWrapping {
+            ref line_length,
+            ref mut print_buffer,
+        }: &mut LineWrapping,
+        encoded_buffer: &mut VecDeque<u8>,
+        stdout_lock: &mut StdoutLock,
+        is_cleanup: bool,
+    ) -> io::Result<()> {
+        let line_length_usize = *line_length;
+
+        assert!(line_length_usize > 0_usize);
+
+        let number_of_lines = encoded_buffer.len() / line_length_usize;
+
+        // How many bytes to take from the front of `encoded_buffer` and then write to stdout
+        let number_of_bytes_to_drain = number_of_lines * line_length_usize;
+
+        let line_wrap_size_minus_one = line_length_usize - 1_usize;
+
+        let mut i = 0_usize;
+
+        for ue in encoded_buffer.drain(0_usize..number_of_bytes_to_drain) {
+            print_buffer.push(ue);
+
+            if i == line_wrap_size_minus_one {
+                print_buffer.push(b'\n');
+
+                i = 0_usize;
+            } else {
+                i += 1_usize;
+            }
+        }
+
+        stdout_lock.write_all(print_buffer)?;
+
+        if is_cleanup {
+            if encoded_buffer.is_empty() {
+                // Do not write a newline in this case, because two trailing newlines should never be printed
+            } else {
+                // Print the partial line, since this is cleanup and no more data is coming
+                stdout_lock.write_all(encoded_buffer.make_contiguous())?;
+                stdout_lock.write_all(b"\n")?;
+            }
+        } else {
+            print_buffer.truncate(0_usize);
+        }
+
+        Ok(())
+    }
+
+    fn write_to_stdout(
+        line_wrapping_option: &mut Option<LineWrapping>,
+        encoded_buffer: &mut VecDeque<u8>,
+        stdout_lock: &mut StdoutLock,
+        is_cleanup: bool,
+    ) -> io::Result<()> {
+        // Write all data in `encoded_buffer` to stdout
+        if let &mut Some(ref mut li) = line_wrapping_option {
+            write_with_line_breaks(li, encoded_buffer, stdout_lock, is_cleanup)?;
+        } else {
+            write_without_line_breaks(encoded_buffer, stdout_lock, is_cleanup)?;
+        }
+
+        Ok(())
+    }
+    // End of helper functions
+
+    // TODO
+    // It turns out the crate being used already supports line wrapping:
+    // https://docs.rs/data-encoding/latest/data_encoding/struct.Specification.html#wrap-output-when-encoding-1
+    // Check if that crate's line wrapping is faster than the wrapping being performed in this function
+    // `encoding` and `encode_in_chunks_of_size` are passed in a tuple to indicate that they are logically tied
+    pub fn fast_encode<R: Read>(
+        input: &mut R,
+        (encoding, encode_in_chunks_of_size): (&Encoding, usize),
+        line_wrap: Option<usize>,
+    ) -> UResult<()> {
+        /// Rust uses 8 kibibytes
+        ///
+        /// https://github.com/rust-lang/rust/blob/1a5a2240bc1b8cf0bcce7acb946c78d6493a4fd3/library/std/src/sys_common/io.rs#L3
+        const INPUT_BUFFER_SIZE: usize = 8_usize * 1_024_usize;
+
+        let mut line_wrapping_option = match line_wrap {
+            // Line wrapping is disabled because "-w"/"--wrap" was passed with "0"
+            Some(0_usize) => None,
+            // A custom line wrapping value was passed
+            Some(an) => Some(LineWrapping {
+                line_length: an,
+                print_buffer: Vec::<u8>::new(),
+            }),
+            // Line wrapping was not set, so the default is used
+            None => Some(LineWrapping {
+                line_length: WRAP_DEFAULT,
+                print_buffer: Vec::<u8>::new(),
+            }),
+        };
+
+        // Start of buffers
+        // Data that was read from stdin
+        let mut input_buffer = vec![0_u8; INPUT_BUFFER_SIZE];
+
+        assert!(!input_buffer.is_empty());
+
+        // Data that was read from stdin but has not been encoded yet
+        let mut leftover_buffer = VecDeque::<u8>::new();
+
+        // Encoded data that needs to be written to stdout
+        let mut encoded_buffer = VecDeque::<u8>::new();
+        // End of buffers
+
+        let mut stdout_lock = io::stdout().lock();
+
+        loop {
+            match input.read(&mut input_buffer) {
+                Ok(bytes_read_from_input) => {
+                    if bytes_read_from_input == 0_usize {
+                        break;
+                    }
+
+                    // The part of `input_buffer` that was actually filled by the call to `read`
+                    let read_buffer = &input_buffer[0_usize..bytes_read_from_input];
+
+                    // How many bytes to steal from `read_buffer` to get `leftover_buffer` to the right size
+                    let bytes_to_steal = encode_in_chunks_of_size - leftover_buffer.len();
+
+                    if bytes_to_steal > bytes_read_from_input {
+                        // Do not have enough data to encode a chunk, so copy data to `leftover_buffer` and read more
+                        leftover_buffer.extend(read_buffer);
+
+                        continue;
+                    }
+
+                    // Encode data in chunks, then place it in `encoded_buffer`
+                    {
+                        let bytes_to_chunk = if bytes_to_steal > 0 {
+                            let (stolen_bytes, rest_of_read_buffer) =
+                                read_buffer.split_at(bytes_to_steal);
+
+                            leftover_buffer.extend(stolen_bytes);
+
+                            // After appending the stolen bytes to `leftover_buffer`, it should be the right size
+                            assert!(leftover_buffer.len() == encode_in_chunks_of_size);
+
+                            // Encode the old unencoded data and the stolen bytes, and add the result to
+                            // `encoded_buffer`
+                            encode_append_vec_deque(
+                                encoding,
+                                leftover_buffer.make_contiguous(),
+                                &mut encoded_buffer,
+                            );
+
+                            // Reset `leftover_buffer`
+                            leftover_buffer.truncate(0_usize);
+
+                            rest_of_read_buffer
+                        } else {
+                            // Do not need to steal bytes from `read_buffer`
+                            read_buffer
+                        };
+
+                        let chunks_exact = bytes_to_chunk.chunks_exact(encode_in_chunks_of_size);
+
+                        let remainder = chunks_exact.remainder();
+
+                        for sl in chunks_exact {
+                            assert!(sl.len() == encode_in_chunks_of_size);
+
+                            encode_append_vec_deque(encoding, sl, &mut encoded_buffer);
+                        }
+
+                        leftover_buffer.extend(remainder);
+                    }
+
+                    // Write all data in `encoded_buffer` to stdout
+                    write_to_stdout(
+                        &mut line_wrapping_option,
+                        &mut encoded_buffer,
+                        &mut stdout_lock,
+                        false,
+                    )?;
+                }
+                Err(er) => {
+                    if er.kind() == ErrorKind::Interrupted {
+                        // TODO
+                        // Retry reading?
+                    }
+
+                    return Err(USimpleError::new(1_i32, format!("read error: {er}")));
+                }
+            }
+        }
+
+        // Cleanup
+        // `input` has finished producing data, so the data remaining in the buffers needs to be encoded and printed
+        {
+            // Encode all remaining unencoded bytes, placing it in `encoded_buffer`
+            encode_append_vec_deque(
+                encoding,
+                leftover_buffer.make_contiguous(),
+                &mut encoded_buffer,
+            );
+
+            // Write all data in `encoded_buffer` to stdout
+            // `is_cleanup` triggers special cleanup-only logic
+            write_to_stdout(
+                &mut line_wrapping_option,
+                &mut encoded_buffer,
+                &mut stdout_lock,
+                true,
+            )?;
+        }
+
+        Ok(())
    }
 }
--- a/src/uucore/src/lib/features/encoding.rs
+++ b/src/uucore/src/lib/features/encoding.rs
@ -6,13 +6,19 @@
 // spell-checker:ignore (strings) ABCDEFGHIJKLMNOPQRSTUVWXYZ ABCDEFGHIJKLMNOPQRSTUV
 // spell-checker:ignore (encodings) lsbf msbf hexupper

-use std::io::{self, Read, Write};
-
+use self::Format::*;
 use data_encoding::{Encoding, BASE32, BASE32HEX, BASE64, BASE64URL, HEXUPPER};
 use data_encoding_macro::new_encoding;
+use std::io::{self, Read, Write};
+
 #[cfg(feature = "thiserror")]
 use thiserror::Error;

+// Re-export for the faster encoding logic
+pub mod for_fast_encode {
+    pub use data_encoding::*;
+}
+
 #[derive(Debug, Error)]
 pub enum DecodeError {
    #[error("{}", _0)]
@ -42,13 +48,12 @@ pub enum Format {
    Base2Msbf,
    Z85,
 }
-use self::Format::*;

-const BASE2LSBF: Encoding = new_encoding! {
+pub const BASE2LSBF: Encoding = new_encoding! {
    symbols: "01",
    bit_order: LeastSignificantFirst,
 };
-const BASE2MSBF: Encoding = new_encoding! {
+pub const BASE2MSBF: Encoding = new_encoding! {
    symbols: "01",
    bit_order: MostSignificantFirst,
 };
@ -96,8 +101,6 @@ pub fn decode(f: Format, input: &[u8]) -> DecodeResult {
 }

 pub struct Data<R: Read> {
-    line_wrap: usize,
-    ignore_garbage: bool,
    input: R,
    format: Format,
    alphabet: &'static [u8],
@ -106,8 +109,6 @@ pub struct Data<R: Read> {
 impl<R: Read> Data<R> {
    pub fn new(input: R, format: Format) -> Self {
        Self {
-            line_wrap: 76,
-            ignore_garbage: false,
            input,
            format,
            alphabet: match format {
@ -123,22 +124,10 @@ impl<R: Read> Data<R> {
        }
    }

-    #[must_use]
-    pub fn line_wrap(mut self, wrap: usize) -> Self {
-        self.line_wrap = wrap;
-        self
-    }
-
-    #[must_use]
-    pub fn ignore_garbage(mut self, ignore: bool) -> Self {
-        self.ignore_garbage = ignore;
-        self
-    }
-
-    pub fn decode(&mut self) -> DecodeResult {
+    pub fn decode(&mut self, ignore_garbage: bool) -> DecodeResult {
        let mut buf = vec![];
        self.input.read_to_end(&mut buf)?;
-        if self.ignore_garbage {
+        if ignore_garbage {
            buf.retain(|c| self.alphabet.contains(c));
        } else {
            buf.retain(|&c| c != b'\r' && c != b'\n');
@ -155,24 +144,27 @@ impl<R: Read> Data<R> {
    }
 }

-// NOTE: this will likely be phased out at some point
-pub fn wrap_print<R: Read>(data: &Data<R>, res: &str) {
+pub fn wrap_print(res: &str, line_wrap: usize) -> io::Result<()> {
    let stdout = io::stdout();
-    wrap_write(stdout.lock(), data.line_wrap, res).unwrap();
-}

-pub fn wrap_write<W: Write>(mut writer: W, line_wrap: usize, res: &str) -> io::Result<()> {
-    use std::cmp::min;
+    let mut stdout_lock = stdout.lock();

    if line_wrap == 0 {
-        return write!(writer, "{res}");
-    }
+        stdout_lock.write_all(res.as_bytes())?;
+    } else {
+        let res_len = res.len();

-    let mut start = 0;
-    while start < res.len() {
-        let end = min(start + line_wrap, res.len());
-        writeln!(writer, "{}", &res[start..end])?;
-        start = end;
+        let mut start = 0;
+
+        while start < res_len {
+            let start_plus_line_wrap = start + line_wrap;
+
+            let end = start_plus_line_wrap.min(res_len);
+
+            writeln!(stdout_lock, "{}", &res[start..end])?;
+
+            start = end;
+        }
    }

    Ok(())
--- a/tests/by-util/test_base64.rs
+++ b/tests/by-util/test_base64.rs
@ -146,3 +146,36 @@ fn test_base64_file_not_found() {
        .fails()
        .stderr_only("base64: a.txt: No such file or directory\n");
 }
+
+#[test]
+fn test_no_repeated_trailing_newline() {
+    new_ucmd!()
+        .args(&["--wrap", "10", "--", "-"])
+        .pipe_in("The quick brown fox jumps over the lazy dog.")
+        .succeeds()
+        .stdout_only(
+            "\
+VGhlIHF1aW
+NrIGJyb3du
+IGZveCBqdW
+1wcyBvdmVy
+IHRoZSBsYX
+p5IGRvZy4=
+",
+        );
+}
+
+#[test]
+fn test_wrap_default() {
+    new_ucmd!()
+        .args(&["--", "-"])
+        .pipe_in("The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.")
+        .succeeds()
+        .stdout_only(
+            "\
+VGhlIHF1aWNrIGJyb3duIGZveCBqdW1wcyBvdmVyIHRoZSBsYXp5IGRvZy4gVGhlIHF1aWNrIGJy
+b3duIGZveCBqdW1wcyBvdmVyIHRoZSBsYXp5IGRvZy4gVGhlIHF1aWNrIGJyb3duIGZveCBqdW1w
+cyBvdmVyIHRoZSBsYXp5IGRvZy4=
+",
+        );
+}
--- a/tests/by-util/test_basenc.rs
+++ b/tests/by-util/test_basenc.rs
@ -26,7 +26,9 @@ fn test_invalid_input() {
    let error_message = if cfg!(windows) {
        "basenc: .: Permission denied\n"
    } else {
-        "basenc: error: invalid input\n"
+        // TODO
+        // Other implementations do not show " (os error 21)"
+        "basenc: read error: Is a directory (os error 21)\n"
    };
    new_ucmd!()
        .args(&["--base32", "."])