mirror of
https://github.com/fish-shell/fish-shell
synced 2025-01-14 14:03:58 +00:00
Further optimize unescape_yaml_fish_2_0()
This hot function dominates the flamegraphs for the completions thread, and any optimizations are worthwhile. A variety of different approaches were tested and benchmarked against real-world fish-history file inputs and this is the one that won out across all rustc target-cpu variations tried. Benchmarks and code at https://github.com/mqudsi/fish-yaml-unescape-benchmark
This commit is contained in:
parent
9ab1ec2a9e
commit
bcb1e2ed85
1 changed files with 60 additions and 52 deletions
|
@ -250,43 +250,64 @@ fn escape_yaml_fish_2_0(s: &mut Vec<u8>) {
|
|||
replace_all(s, b"\n", b"\\n"); // replace newline with backslash + literal n
|
||||
}
|
||||
|
||||
/// This function is called frequently, so it ought to be fast.
|
||||
fn unescape_yaml_fish_2_0(s: &mut Vec<u8>) {
|
||||
let mut cursor = 0;
|
||||
while cursor < s.len() {
|
||||
// Look for a backslash.
|
||||
let Some(backslash) = s[cursor..].iter().position(|&c| c == b'\\') else {
|
||||
// No more backslashes
|
||||
break;
|
||||
};
|
||||
|
||||
// Add back the start offset
|
||||
let backslash = backslash + cursor;
|
||||
|
||||
// Backslash found. Maybe we'll do something about it.
|
||||
let Some(escaped_char) = s.get(backslash + 1) else {
|
||||
// Backslash was final character
|
||||
break;
|
||||
};
|
||||
|
||||
match escaped_char {
|
||||
b'\\' => {
|
||||
// Two backslashes in a row. Delete the second one.
|
||||
s.remove(backslash + 1);
|
||||
}
|
||||
b'n' => {
|
||||
// Backslash + n. Replace with a newline.
|
||||
s.splice(backslash..(backslash + 2), [b'\n']);
|
||||
}
|
||||
_ => {
|
||||
// Unknown backslash escape, keep as-is
|
||||
}
|
||||
};
|
||||
|
||||
// The character at index backslash has now been made whole; start at the next
|
||||
// character.
|
||||
cursor = backslash + 1;
|
||||
#[inline(always)]
|
||||
/// Unescapes the fish-specific yaml variant, if it requires it.
|
||||
fn maybe_unescape_yaml_fish_2_0(s: &[u8]) -> Cow<[u8]> {
|
||||
// This is faster than s.contains(b'\\') and can be auto-vectorized to SIMD. See benchmark note
|
||||
// on unescape_yaml_fish_2_0().
|
||||
if !s
|
||||
.into_iter()
|
||||
.copied()
|
||||
.fold(false, |acc, b| acc | (b == b'\\'))
|
||||
{
|
||||
return s.into();
|
||||
}
|
||||
unescape_yaml_fish_2_0(s).into()
|
||||
}
|
||||
|
||||
/// Unescapes the fish-specific yaml variant. Use [`maybe_unescape_yaml_fish_2_0()`] if you're not
|
||||
/// positive the input contains an escape.
|
||||
|
||||
// This function is called on every input event and shows up heavily in all flamegraphs.
|
||||
// Various approaches were benchmarked against real-world fish-history files on lines with escapes,
|
||||
// and this implementation (chunk_loop_box) won out. Make changes with care!
|
||||
//
|
||||
// Benchmarks and code: https://github.com/mqudsi/fish-yaml-unescape-benchmark
|
||||
fn unescape_yaml_fish_2_0(s: &[u8]) -> Vec<u8> {
|
||||
// Safety: we never read from this box; we only write to it then read what we've written.
|
||||
// Rationale: This function is in a very hot loop and this benchmarks around 8% faster on
|
||||
// real-world escaped yaml samples from the fish history file.
|
||||
//
|
||||
// This is a very long way around of writing `Box::new_uninit_slice(s.len())`, which
|
||||
// requires the unstablized nightly-only feature new_unit (#63291). It optimizes away.
|
||||
let mut result: Box<[u8]> = std::iter::repeat_with(std::mem::MaybeUninit::uninit)
|
||||
.take(s.len())
|
||||
.map(|b| unsafe { b.assume_init() })
|
||||
.collect();
|
||||
let mut chars = s.iter().copied();
|
||||
let mut src_idx = 0;
|
||||
let mut dst_idx = 0;
|
||||
loop {
|
||||
// While inspecting the asm reveals the compiler does not elide the bounds check from
|
||||
// the writes to `result`, benchmarking shows that using `result.get_unchecked_mut()`
|
||||
// everywhere does not result in a statistically significant improvement to the
|
||||
// performance of this function.
|
||||
let to_copy = chars.by_ref().take_while(|b| *b != b'\\').count();
|
||||
result[dst_idx..][..to_copy].copy_from_slice(&s[src_idx..][..to_copy]);
|
||||
dst_idx += to_copy;
|
||||
|
||||
match chars.next() {
|
||||
Some(b'\\') => result[dst_idx] = b'\\',
|
||||
Some(b'n') => result[dst_idx] = b'\n',
|
||||
_ => break,
|
||||
}
|
||||
src_idx += to_copy + 2;
|
||||
dst_idx += 1;
|
||||
}
|
||||
|
||||
let mut result = Vec::from(result);
|
||||
result.truncate(dst_idx);
|
||||
result
|
||||
}
|
||||
|
||||
/// Read one line, stripping off any newline, returning the number of bytes consumed.
|
||||
|
@ -320,23 +341,11 @@ fn extract_prefix_and_unescape_yaml(line: &[u8]) -> Option<(Cow<[u8]>, Cow<[u8]>
|
|||
let value = split.next()?;
|
||||
debug_assert!(split.next().is_none());
|
||||
|
||||
let key: Cow<[u8]> = if key.iter().copied().any(|b| b == b'\\') {
|
||||
let mut key = key.to_owned();
|
||||
unescape_yaml_fish_2_0(&mut key);
|
||||
key.into()
|
||||
} else {
|
||||
key.into()
|
||||
};
|
||||
let key = maybe_unescape_yaml_fish_2_0(key);
|
||||
|
||||
// Skip a space after the : if necessary.
|
||||
let value = trim_start(value);
|
||||
let value: Cow<[u8]> = if value.iter().copied().any(|b| b == b'\\') {
|
||||
let mut value = value.to_owned();
|
||||
unescape_yaml_fish_2_0(&mut value);
|
||||
value.into()
|
||||
} else {
|
||||
value.into()
|
||||
};
|
||||
let value = maybe_unescape_yaml_fish_2_0(value);
|
||||
|
||||
Some((key, value))
|
||||
}
|
||||
|
@ -399,8 +408,7 @@ fn decode_item_fish_2_0(mut data: &[u8]) -> Option<HistoryItem> {
|
|||
// We're going to consume this line.
|
||||
data = &data[advance..];
|
||||
|
||||
let mut line = line.to_owned();
|
||||
unescape_yaml_fish_2_0(&mut line);
|
||||
let line = maybe_unescape_yaml_fish_2_0(&line);
|
||||
paths.push(str2wcstring(&line));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue