Refactor lines command (#11685)

# Description
This PR uses `str::lines` to simplify the `lines` command (and one other
section of code). This has two main benefits:
1. We no longer need to use regex to split on lines, as `str::lines`
splits on `\r\n` or `\n`.
2. We no longer need to handle blank empty lines at the end. E.g.,
`str::lines` results in `["text"]` for both `"test\n"` and `"text"`.

These changes give a slight boost to performance for the following
benchmarks:
1. lines of `Value::String`:
    ```nushell
    let data = open Cargo.lock
    1..10000 | each { $data | timeit { lines } } | math avg 
    ```
    current main: 392µs
    this PR: 270µs
2. lines of external stream:
    ```nushell
    1..10000 | each { open Cargo.lock | timeit { lines } } | math avg 
    ```
    current main: 794µs
    this PR: 489µs
This commit is contained in:
Ian Manske 2024-01-30 21:56:19 +00:00 committed by GitHub
parent c371d1a535
commit cf9813cbf8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 59 additions and 117 deletions

View file

@ -1,14 +1,12 @@
use std::collections::VecDeque;
use nu_engine::CallExt;
use nu_protocol::ast::Call;
use nu_protocol::engine::{Command, EngineState, Stack};
use nu_protocol::{
Category, Example, IntoInterruptiblePipelineData, PipelineData, RawStream, ShellError,
Signature, Span, Type, Value,
Category, Example, IntoInterruptiblePipelineData, IntoPipelineData, PipelineData, RawStream,
ShellError, Signature, Span, Type, Value,
};
use once_cell::sync::Lazy;
// regex can be replaced with fancy-regex once it supports `split()`
// https://github.com/fancy-regex/fancy-regex/issues/104
use regex::Regex;
#[derive(Clone)]
pub struct Lines;
@ -39,38 +37,24 @@ impl Command for Lines {
let ctrlc = engine_state.ctrlc.clone();
let skip_empty = call.has_flag(engine_state, stack, "skip-empty")?;
// match \r\n or \n
static LINE_BREAK_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex"));
let span = input.span().unwrap_or(call.head);
match input {
#[allow(clippy::needless_collect)]
// Collect is needed because the string may not live long enough for
// the Rc structure to continue using it. If split could take ownership
// of the split values, then this wouldn't be needed
PipelineData::Value(Value::String { val, .. }, ..) => {
let mut lines = LINE_BREAK_REGEX
.split(&val)
.map(|s| s.to_string())
.collect::<Vec<String>>();
let lines = if skip_empty {
val.lines()
.filter_map(|s| {
if s.trim().is_empty() {
None
} else {
Some(Value::string(s, span))
}
})
.collect()
} else {
val.lines().map(|s| Value::string(s, span)).collect()
};
// if the last one is empty, remove it, as it was just
// a newline at the end of the input we got
if let Some(last) = lines.last() {
if last.is_empty() {
lines.pop();
}
}
let iter = lines.into_iter().filter_map(move |s| {
if skip_empty && s.trim().is_empty() {
None
} else {
Some(Value::string(s, span))
}
});
Ok(iter.into_pipeline_data(engine_state.ctrlc.clone()))
Ok(Value::list(lines, span).into_pipeline_data())
}
PipelineData::Empty => Ok(PipelineData::Empty),
PipelineData::ListStream(stream, ..) => {
@ -79,26 +63,17 @@ impl Command for Lines {
.filter_map(move |value| {
let span = value.span();
if let Value::String { val, .. } = value {
let mut lines = LINE_BREAK_REGEX
.split(&val)
.filter_map(|s| {
if skip_empty && s.trim().is_empty() {
None
} else {
Some(s.to_string())
}
})
.collect::<Vec<_>>();
// if the last one is empty, remove it, as it was just
// a newline at the end of the input we got
if let Some(last) = lines.last() {
if last.is_empty() {
lines.pop();
}
}
Some(lines.into_iter().map(move |x| Value::string(x, span)))
Some(
val.lines()
.filter_map(|s| {
if skip_empty && s.trim().is_empty() {
None
} else {
Some(Value::string(s, span))
}
})
.collect::<Vec<_>>(),
)
} else {
None
}
@ -124,11 +99,7 @@ impl Command for Lines {
stdout: Some(stream),
..
} => Ok(RawStreamLinesAdapter::new(stream, head, skip_empty)
.enumerate()
.map(move |(_idx, x)| match x {
Ok(x) => x,
Err(err) => Value::error(err, head),
})
.map(move |x| x.unwrap_or_else(|err| Value::error(err, head)))
.into_pipeline_data(ctrlc)),
}
}
@ -152,38 +123,30 @@ struct RawStreamLinesAdapter {
skip_empty: bool,
span: Span,
incomplete_line: String,
queue: Vec<String>,
queue: VecDeque<String>,
}
impl Iterator for RawStreamLinesAdapter {
type Item = Result<Value, ShellError>;
fn next(&mut self) -> Option<Self::Item> {
static LINE_BREAK_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex"));
loop {
if !self.queue.is_empty() {
let s = self.queue.remove(0usize);
if let Some(s) = self.queue.pop_front() {
if self.skip_empty && s.trim().is_empty() {
continue;
}
return Some(Ok(Value::string(s, self.span)));
} else {
// inner is complete, feed out remaining state
if self.inner_complete {
if !self.incomplete_line.is_empty() {
let r = Some(Ok(Value::string(
self.incomplete_line.to_string(),
return if self.incomplete_line.is_empty() {
None
} else {
Some(Ok(Value::string(
std::mem::take(&mut self.incomplete_line),
self.span,
)));
self.incomplete_line = String::new();
return r;
}
return None;
)))
};
}
// pull more data from inner
@ -196,36 +159,29 @@ impl Iterator for RawStreamLinesAdapter {
Value::String { val, .. } => {
self.span = span;
let mut lines = LINE_BREAK_REGEX
.split(&val)
.map(|s| s.to_string())
.collect::<Vec<_>>();
let mut lines = val.lines();
// handle incomplete line from previous
if !self.incomplete_line.is_empty() {
if let Some(first) = lines.first() {
let new_incomplete_line =
self.incomplete_line.to_string() + first.as_str();
lines.splice(0..1, vec![new_incomplete_line]);
self.incomplete_line = String::new();
}
}
// store incomplete line from current
if let Some(last) = lines.last() {
if last.is_empty() {
// we ended on a line ending
lines.pop();
} else {
// incomplete line, save for next time
if let Some(s) = lines.pop() {
self.incomplete_line = s;
}
if let Some(first) = lines.next() {
self.incomplete_line.push_str(first);
self.queue.push_back(std::mem::take(
&mut self.incomplete_line,
));
}
}
// save completed lines
self.queue.append(&mut lines);
self.queue.extend(lines.map(String::from));
if !val.ends_with('\n') {
// incomplete line, save for next time
// if `val` and `incomplete_line` were empty,
// then pop will return none
if let Some(s) = self.queue.pop_back() {
self.incomplete_line = s;
}
}
}
// Propagate errors by explicitly matching them before the final case.
Value::Error { error, .. } => return Some(Err(*error)),
@ -256,7 +212,7 @@ impl RawStreamLinesAdapter {
span,
skip_empty,
incomplete_line: String::new(),
queue: Vec::<String>::new(),
queue: VecDeque::new(),
inner_complete: false,
}
}

View file

@ -4,7 +4,6 @@
use std::fmt::{Display, LowerExp};
use std::io;
use std::io::{BufRead, BufReader};
use std::num::FpCategory;
use super::error::{Error, ErrorCode, Result};
@ -1034,20 +1033,7 @@ where
T: ser::Serialize,
{
let vec = to_vec(value)?;
let string = remove_json_whitespace(vec);
Ok(string)
}
fn remove_json_whitespace(v: Vec<u8>) -> String {
let reader = BufReader::new(&v[..]);
let mut output = String::new();
for line in reader.lines() {
match line {
Ok(line) => output.push_str(line.trim().trim_end()),
_ => {
eprintln!("Error removing JSON whitespace");
}
}
}
output
let string = String::from_utf8(vec)?;
let output = string.lines().map(str::trim).collect();
Ok(output)
}