nushell/crates/nu-table/src/wrap.rs
Stefan Holderbach 9c7feb2b19
Reduce table allocs: only strip ANSI if necessary (#4378)
For the width calculations for table layout the `strip_ansi` function
has to be called frequently. By checking for the ASCII control chars
(0x00 to 0x1f) except `\n` that are stripped by `strip_ansi_escapes` the number of
necessary allocations can be reduced significantly for the simple case
of text not containing ANSI escapes.

**Benchmark:**

```
nu -c "for i in 0..1000 { ls } | flatten | table"
```

**Allocation reduction**

Running on the nushell repo root as the directory, this change reduces the
allocation volume by approximately 400 MB

(Measured run via KDE heaptrack)
**Speed improvement to output**

Measured via `/usr/bin/time -v`

*before*

```
Command being timed: "./eager_nu -c for i in 0..1000 {ls} | flatten | table"
	User time (seconds): 0.87
	System time (seconds): 0.14
	Percent of CPU this job got: 87%
	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:01.16
	Average shared text size (kbytes): 0
	Average unshared data size (kbytes): 0
	Average stack size (kbytes): 0
	Average total size (kbytes): 0
	Maximum resident set size (kbytes): 18888
	Average resident set size (kbytes): 0
	Major (requiring I/O) page faults: 0
	Minor (reclaiming a frame) page faults: 4809
	Voluntary context switches: 38
	Involuntary context switches: 14
	Swaps: 0
	File system inputs: 0
	File system outputs: 0
	Socket messages sent: 0
	Socket messages received: 0
	Signals delivered: 0
	Page size (bytes): 4096
	Exit status: 0
```

*after*

```
Command being timed: "./lazy_nu -c for i in 0..1000 {ls} | flatten | table"
	User time (seconds): 0.63
	System time (seconds): 0.14
	Percent of CPU this job got: 80%
	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.97
	Average shared text size (kbytes): 0
	Average unshared data size (kbytes): 0
	Average stack size (kbytes): 0
	Average total size (kbytes): 0
	Maximum resident set size (kbytes): 18660
	Average resident set size (kbytes): 0
	Major (requiring I/O) page faults: 0
	Minor (reclaiming a frame) page faults: 5149
	Voluntary context switches: 24
	Involuntary context switches: 5
	Swaps: 0
	File system inputs: 0
	File system outputs: 0
	Socket messages sent: 0
	Socket messages received: 0
	Signals delivered: 0
	Page size (bytes): 4096
	Exit status: 0
```
2022-02-08 17:43:32 -06:00

320 lines
9.6 KiB
Rust

use crate::table::TextStyle;
use ansi_cut::AnsiCut;
use nu_ansi_term::Style;
use std::borrow::Cow;
use std::collections::HashMap;
use std::{fmt::Display, iter::Iterator};
use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};
#[derive(Debug, Clone, Copy)]
pub enum Alignment {
Left,
Center,
Right,
}
#[derive(Debug)]
pub struct Subline {
pub subline: String,
pub width: usize,
}
#[derive(Debug)]
pub struct Line {
pub sublines: Vec<Subline>,
pub width: usize,
}
#[derive(Debug, Clone)]
pub struct WrappedLine {
pub line: String,
pub width: usize,
}
#[derive(Debug, Clone)]
pub struct WrappedCell {
pub lines: Vec<WrappedLine>,
pub max_width: usize,
pub style: TextStyle,
}
impl Display for Line {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut first = true;
for subline in &self.sublines {
if !first {
write!(f, " ")?;
} else {
first = false;
}
write!(f, "{}", subline.subline)?;
}
Ok(())
}
}
/// Removes ANSI escape codes and some ASCII control characters
///
/// Keeps `\n` removes `\r`, `\t` etc.
///
/// If parsing fails silently returns the input string
fn strip_ansi(string: &str) -> Cow<str> {
// Check if any ascii control character except LF(0x0A = 10) is present,
// which will be stripped. Includes the primary start of ANSI sequences ESC
// (0x1B = decimal 27)
if string.bytes().any(|x| matches!(x, 0..=9 | 11..=31)) {
if let Ok(stripped) = strip_ansi_escapes::strip(string) {
if let Ok(new_string) = String::from_utf8(stripped) {
return Cow::Owned(new_string);
}
}
}
// Else case includes failures to parse!
Cow::Borrowed(string)
}
// fn special_width(astring: &str) -> usize {
// // remove the zwj's '\u{200d}'
// // remove the fe0f's
// let stripped_string: String = {
// if let Ok(bytes) = strip_ansi_escapes::strip(astring) {
// String::from_utf8_lossy(&bytes).to_string()
// } else {
// astring.to_string()
// }
// };
// let no_zwj = stripped_string.replace('\u{200d}', "");
// let no_fe0f = no_zwj.replace('\u{fe0f}', "");
// UnicodeWidthStr::width(&no_fe0f[..])
// }
pub fn split_sublines(input: &str) -> Vec<Vec<Subline>> {
input
.split_terminator('\n')
.map(|line| {
line.split_terminator(' ')
.map(|x| Subline {
subline: x.to_string(),
width: {
// We've tried UnicodeWidthStr::width(x), UnicodeSegmentation::graphemes(x, true).count()
// and x.chars().count() with all types of combinations. Currently, it appears that
// getting the max of char count and Unicode width seems to produce the best layout.
// However, it's not perfect.
// let c = x.chars().count();
// let u = UnicodeWidthStr::width(x);
// std::cmp::min(c, u)
// let c = strip_ansi(x).chars().count();
// let u = special_width(x);
// std::cmp::max(c, u)
let stripped = strip_ansi(x);
let c = stripped.chars().count();
let u = stripped.width();
std::cmp::max(c, u)
},
})
.collect::<Vec<_>>()
})
.collect::<Vec<_>>()
}
pub fn column_width(input: &[Vec<Subline>]) -> usize {
let mut max = 0;
for line in input {
let mut total = 0;
let mut first = true;
for inp in line {
if !first {
// Account for the space
total += 1;
} else {
first = false;
}
total += inp.width;
}
if total > max {
max = total;
}
}
max
}
fn split_word(cell_width: usize, word: &str) -> Vec<Subline> {
let mut output = vec![];
let mut current_width = 0;
let mut start_index = 0;
let mut end_index;
let word_no_ansi = strip_ansi(word);
for c in word_no_ansi.char_indices() {
if let Some(width) = c.1.width() {
end_index = c.0;
if current_width + width > cell_width {
output.push(Subline {
subline: word.cut(start_index..end_index),
width: current_width,
});
start_index = c.0;
current_width = width;
} else {
current_width += width;
}
}
}
if start_index != word_no_ansi.len() {
output.push(Subline {
subline: word.cut(start_index..),
width: current_width,
});
}
output
}
pub fn wrap(
cell_width: usize,
mut input: impl Iterator<Item = Subline>,
color_hm: &HashMap<String, Style>,
re_leading: &regex::Regex,
re_trailing: &regex::Regex,
) -> (Vec<WrappedLine>, usize) {
let mut lines = vec![];
let mut current_line: Vec<Subline> = vec![];
let mut current_width = 0;
let mut first = true;
let mut max_width = 0;
let lead_trail_space_bg_color = color_hm
.get("leading_trailing_space_bg")
.unwrap_or(&Style::default())
.to_owned();
loop {
match input.next() {
Some(item) => {
if !first {
current_width += 1;
} else {
first = false;
}
if item.width + current_width > cell_width {
// If this is a really long single word, we need to split the word
if current_line.len() == 1 && current_width > cell_width {
max_width = cell_width;
let sublines = split_word(cell_width, &current_line[0].subline);
for subline in sublines {
let width = subline.width;
lines.push(Line {
sublines: vec![subline],
width,
});
}
first = true;
current_width = item.width;
current_line = vec![item];
} else {
if !current_line.is_empty() {
lines.push(Line {
sublines: current_line,
width: current_width,
});
}
first = true;
current_width = item.width;
current_line = vec![item];
max_width = std::cmp::max(max_width, current_width);
}
} else {
current_width += item.width;
current_line.push(item);
}
}
None => {
if current_width > cell_width {
// We need to break up the last word
let sublines = split_word(cell_width, &current_line[0].subline);
for subline in sublines {
let width = subline.width;
lines.push(Line {
sublines: vec![subline],
width,
});
}
} else if current_width > 0 {
lines.push(Line {
sublines: current_line,
width: current_width,
});
}
break;
}
}
}
let mut current_max = 0;
let mut output = vec![];
for line in lines {
let mut current_line_width = 0;
let mut first = true;
let mut current_line = String::new();
for subline in line.sublines {
if !first {
current_line_width += 1 + subline.width;
current_line.push(' ');
} else {
first = false;
current_line_width = subline.width;
}
current_line.push_str(&subline.subline);
}
if current_line_width > current_max {
current_max = current_line_width;
}
// highlight leading and trailing spaces so they stand out.
let mut bg_color_string = Style::default().prefix().to_string();
// right now config settings can only set foreground colors so, in this
// instance we take the foreground color and make it a background color
if let Some(bg) = lead_trail_space_bg_color.foreground {
bg_color_string = Style::default().on(bg).prefix().to_string()
};
if let Some(leading_match) = re_leading.find(&current_line.clone()) {
String::insert_str(
&mut current_line,
leading_match.end(),
nu_ansi_term::ansi::RESET,
);
String::insert_str(&mut current_line, leading_match.start(), &bg_color_string);
}
if let Some(trailing_match) = re_trailing.find(&current_line.clone()) {
String::insert_str(&mut current_line, trailing_match.start(), &bg_color_string);
current_line += nu_ansi_term::ansi::RESET;
}
output.push(WrappedLine {
line: current_line,
width: current_line_width,
});
}
(output, current_max)
}