No description
Find a file
bors e15b6f9005 Auto merge of #125679 - clarfonthey:escape_ascii, r=joboet
Optimize `escape_ascii` using a lookup table

Based upon my suggestion here: https://github.com/rust-lang/rust/pull/125340#issuecomment-2130441817

Effectively, we can take advantage of the fact that ASCII only needs 7 bits to make the eighth bit store whether the value should be escaped or not. This adds a 256-byte lookup table, but 256 bytes *should* be small enough that very few people will mind, according to my probably not incontrovertible opinion.

The generated assembly isn't clearly better (although has fewer branches), so, I decided to benchmark on three inputs: first on a random 200KiB, then on `/bin/cat`, then on `Cargo.toml` for this repo. In all cases, the generated code ran faster on my machine. (an old i7-8700)

But, if you want to try my benchmarking code for yourself:

<details><summary>Criterion code below. Replace <code>/home/ltdk/rustsrc</code> with the appropriate directory.</summary>

```rust
#![feature(ascii_char)]
#![feature(ascii_char_variants)]
#![feature(const_option)]
#![feature(let_chains)]
use core::ascii;
use core::ops::Range;
use criterion::{criterion_group, criterion_main, Criterion};
use rand::{thread_rng, Rng};

const HEX_DIGITS: [ascii::Char; 16] = *b"0123456789abcdef".as_ascii().unwrap();

#[inline]
const fn backslash<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) {
    const { assert!(N >= 2) };

    let mut output = [ascii::Char::Null; N];

    output[0] = ascii::Char::ReverseSolidus;
    output[1] = a;

    (output, 0..2)
}

#[inline]
const fn hex_escape<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) {
    const { assert!(N >= 4) };

    let mut output = [ascii::Char::Null; N];

    let hi = HEX_DIGITS[(byte >> 4) as usize];
    let lo = HEX_DIGITS[(byte & 0xf) as usize];

    output[0] = ascii::Char::ReverseSolidus;
    output[1] = ascii::Char::SmallX;
    output[2] = hi;
    output[3] = lo;

    (output, 0..4)
}

#[inline]
const fn verbatim<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) {
    const { assert!(N >= 1) };

    let mut output = [ascii::Char::Null; N];

    output[0] = a;

    (output, 0..1)
}

/// Escapes an ASCII character.
///
/// Returns a buffer and the length of the escaped representation.
const fn escape_ascii_old<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) {
    const { assert!(N >= 4) };

    match byte {
        b'\t' => backslash(ascii::Char::SmallT),
        b'\r' => backslash(ascii::Char::SmallR),
        b'\n' => backslash(ascii::Char::SmallN),
        b'\\' => backslash(ascii::Char::ReverseSolidus),
        b'\'' => backslash(ascii::Char::Apostrophe),
        b'\"' => backslash(ascii::Char::QuotationMark),
        0x00..=0x1F => hex_escape(byte),
        _ => match ascii::Char::from_u8(byte) {
            Some(a) => verbatim(a),
            None => hex_escape(byte),
        },
    }
}

/// Escapes an ASCII character.
///
/// Returns a buffer and the length of the escaped representation.
const fn escape_ascii_new<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) {
    /// Lookup table helps us determine how to display character.
    ///
    /// Since ASCII characters will always be 7 bits, we can exploit this to store the 8th bit to
    /// indicate whether the result is escaped or unescaped.
    ///
    /// We additionally use 0x80 (escaped NUL character) to indicate hex-escaped bytes, since
    /// escaped NUL will not occur.
    const LOOKUP: [u8; 256] = {
        let mut arr = [0; 256];
        let mut idx = 0;
        loop {
            arr[idx as usize] = match idx {
                // use 8th bit to indicate escaped
                b'\t' => 0x80 | b't',
                b'\r' => 0x80 | b'r',
                b'\n' => 0x80 | b'n',
                b'\\' => 0x80 | b'\\',
                b'\'' => 0x80 | b'\'',
                b'"' => 0x80 | b'"',

                // use NUL to indicate hex-escaped
                0x00..=0x1F | 0x7F..=0xFF => 0x80 | b'\0',

                _ => idx,
            };
            if idx == 255 {
                break;
            }
            idx += 1;
        }
        arr
    };

    let lookup = LOOKUP[byte as usize];

    // 8th bit indicates escape
    let lookup_escaped = lookup & 0x80 != 0;

    // SAFETY: We explicitly mask out the eighth bit to get a 7-bit ASCII character.
    let lookup_ascii = unsafe { ascii::Char::from_u8_unchecked(lookup & 0x7F) };

    if lookup_escaped {
        // NUL indicates hex-escaped
        if matches!(lookup_ascii, ascii::Char::Null) {
            hex_escape(byte)
        } else {
            backslash(lookup_ascii)
        }
    } else {
        verbatim(lookup_ascii)
    }
}

fn escape_bytes(bytes: &[u8], f: impl Fn(u8) -> ([ascii::Char; 4], Range<u8>)) -> Vec<ascii::Char> {
    let mut vec = Vec::new();
    for b in bytes {
        let (buf, range) = f(*b);
        vec.extend_from_slice(&buf[range.start as usize..range.end as usize]);
    }
    vec
}

pub fn criterion_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("escape_ascii");

    group.sample_size(1000);

    let rand_200k = &mut [0; 200 * 1024];
    thread_rng().fill(&mut rand_200k[..]);
    let cat = include_bytes!("/bin/cat");
    let cargo_toml = include_bytes!("/home/ltdk/rustsrc/Cargo.toml");

    group.bench_function("old_rand", |b| {
        b.iter(|| escape_bytes(rand_200k, escape_ascii_old));
    });
    group.bench_function("new_rand", |b| {
        b.iter(|| escape_bytes(rand_200k, escape_ascii_new));
    });

    group.bench_function("old_bin", |b| {
        b.iter(|| escape_bytes(cat, escape_ascii_old));
    });
    group.bench_function("new_bin", |b| {
        b.iter(|| escape_bytes(cat, escape_ascii_new));
    });

    group.bench_function("old_cargo_toml", |b| {
        b.iter(|| escape_bytes(cargo_toml, escape_ascii_old));
    });
    group.bench_function("new_cargo_toml", |b| {
        b.iter(|| escape_bytes(cargo_toml, escape_ascii_new));
    });

    group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
```

</details>

My benchmark results:

```
escape_ascii/old_rand   time:   [1.6965 ms 1.7006 ms 1.7053 ms]
Found 22 outliers among 1000 measurements (2.20%)
  4 (0.40%) high mild
  18 (1.80%) high severe
escape_ascii/new_rand   time:   [1.6749 ms 1.6953 ms 1.7158 ms]
Found 38 outliers among 1000 measurements (3.80%)
  38 (3.80%) high mild
escape_ascii/old_bin    time:   [224.59 µs 225.40 µs 226.33 µs]
Found 39 outliers among 1000 measurements (3.90%)
  17 (1.70%) high mild
  22 (2.20%) high severe
escape_ascii/new_bin    time:   [164.86 µs 165.63 µs 166.58 µs]
Found 107 outliers among 1000 measurements (10.70%)
  43 (4.30%) high mild
  64 (6.40%) high severe
escape_ascii/old_cargo_toml
                        time:   [23.397 µs 23.699 µs 24.014 µs]
Found 204 outliers among 1000 measurements (20.40%)
  21 (2.10%) high mild
  183 (18.30%) high severe
escape_ascii/new_cargo_toml
                        time:   [16.404 µs 16.438 µs 16.483 µs]
Found 88 outliers among 1000 measurements (8.80%)
  56 (5.60%) high mild
  32 (3.20%) high severe
```

Random: 1.7006ms => 1.6953ms (<1% speedup)
Binary: 225.40µs => 165.63µs (26% speedup)
Text: 23.699µs => 16.438µs (30% speedup)
2024-10-13 14:05:50 +00:00
.cargo fix: Fix generated markers not being patchable in package.json 2024-06-08 12:54:43 +02:00
.github Use macos-13 runners and bump MACOSX_DEPLOYMENT_TARGET 2024-10-08 13:00:58 +03:00
.vscode feat: use vscode log format for client logs 2024-07-27 21:43:35 -07:00
assets Automatically change text color in logo based on dark mode 2022-03-06 23:06:53 +11:00
bench_data Spelling 2023-04-19 09:45:55 -04:00
crates Reserve guarded string literals (RFC 3593) 2024-10-08 18:21:16 -06:00
docs internal: add JSON formatting for hprof 2024-10-04 11:26:15 -04:00
editors/code Auto merge of #18219 - Veykril:veykril/push-ytnzuvtoswqz, r=Veykril 2024-10-01 10:36:27 +00:00
lib fix: Don't panic lsp writer thread on dropped receiver 2024-09-06 09:13:00 +02:00
xtask Parse builtin#asm expressions 2024-09-04 14:09:03 +02:00
.editorconfig add max_line_length to .editorconfig 2024-01-20 17:14:00 +03:00
.git-blame-ignore-revs internal: Don't allocate autoderef steps when not needed 2024-08-25 13:12:07 +02:00
.gitattributes Fix .gitattributes for test_data 2022-07-24 14:05:35 +02:00
.gitignore internal: add "Shuffle Crate Graph" command 2021-12-07 16:37:19 +01:00
.typos.toml Parse builtin#asm expressions 2024-09-04 14:09:03 +02:00
Cargo.lock Update cc to 1.1.22 2024-09-27 17:50:06 +00:00
Cargo.toml Require rust 1.81 2024-09-25 17:02:17 -04:00
clippy.toml Lint debug prints and disallowed types with clippy 2024-02-01 17:57:27 +01:00
CONTRIBUTING.md Add CONTRIBUTING.md 2024-04-18 10:24:40 +02:00
LICENSE-APACHE Drop Apache license appendices 2024-08-27 14:52:34 +03:00
LICENSE-MIT Licenses 2018-01-10 22:47:04 +03:00
PRIVACY.md Update privacy note 2021-12-23 14:04:15 +02:00
README.md Add CONTRIBUTING.md 2024-04-18 10:24:40 +02:00
rust-bors.toml Prepare for rust-bors 2023-10-05 15:26:09 +03:00
rust-version Preparing for merge from rust-lang/rust 2024-10-08 14:25:24 +03:00
rustfmt.toml Shuffle hir-expand things around 2024-01-26 19:28:39 +01:00
triagebot.toml Update triagebot.toml 2024-06-27 10:32:13 +02:00

rust-analyzer logo

rust-analyzer is a modular compiler frontend for the Rust language. It is a part of a larger rls-2.0 effort to create excellent IDE support for Rust.

Quick Start

https://rust-analyzer.github.io/manual.html#installation

Documentation

If you want to contribute to rust-analyzer check out the CONTRIBUTING.md or if you are just curious about how things work under the hood, check the ./docs/dev folder.

If you want to use rust-analyzer's language server with your editor of choice, check the manual folder. It also contains some tips & tricks to help you be more productive when using rust-analyzer.

Security and Privacy

See the corresponding sections of the manual.

Communication

For usage and troubleshooting requests, please use "IDEs and Editors" category of the Rust forum:

https://users.rust-lang.org/c/ide/14

For questions about development and implementation, join rust-analyzer working group on Zulip:

https://rust-lang.zulipchat.com/#narrow/stream/185405-t-compiler.2Frust-analyzer

License

rust-analyzer is primarily distributed under the terms of both the MIT license and the Apache License (Version 2.0).

See LICENSE-APACHE and LICENSE-MIT for details.