Refactor lines command (#11685)

# Description
This PR uses `str::lines` to simplify the `lines` command (and one other
section of code). This has two main benefits:
1. We no longer need to use regex to split on lines, as `str::lines`
splits on `\r\n` or `\n`.
2. We no longer need to handle blank empty lines at the end. E.g.,
`str::lines` results in `["text"]` for both `"test\n"` and `"text"`.

These changes give a slight boost to performance for the following
benchmarks:
1. lines of `Value::String`:
    ```nushell
    let data = open Cargo.lock
    1..10000 | each { $data | timeit { lines } } | math avg 
    ```
    current main: 392µs
    this PR: 270µs
2. lines of external stream:
    ```nushell
    1..10000 | each { open Cargo.lock | timeit { lines } } | math avg 
    ```
    current main: 794µs
    this PR: 489µs
This commit is contained in:
Ian Manske 2024-01-30 21:56:19 +00:00 committed by GitHub
parent c371d1a535
commit cf9813cbf8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 59 additions and 117 deletions

View file

@ -1,14 +1,12 @@
use std::collections::VecDeque;
use nu_engine::CallExt; use nu_engine::CallExt;
use nu_protocol::ast::Call; use nu_protocol::ast::Call;
use nu_protocol::engine::{Command, EngineState, Stack}; use nu_protocol::engine::{Command, EngineState, Stack};
use nu_protocol::{ use nu_protocol::{
Category, Example, IntoInterruptiblePipelineData, PipelineData, RawStream, ShellError, Category, Example, IntoInterruptiblePipelineData, IntoPipelineData, PipelineData, RawStream,
Signature, Span, Type, Value, ShellError, Signature, Span, Type, Value,
}; };
use once_cell::sync::Lazy;
// regex can be replaced with fancy-regex once it supports `split()`
// https://github.com/fancy-regex/fancy-regex/issues/104
use regex::Regex;
#[derive(Clone)] #[derive(Clone)]
pub struct Lines; pub struct Lines;
@ -39,38 +37,24 @@ impl Command for Lines {
let ctrlc = engine_state.ctrlc.clone(); let ctrlc = engine_state.ctrlc.clone();
let skip_empty = call.has_flag(engine_state, stack, "skip-empty")?; let skip_empty = call.has_flag(engine_state, stack, "skip-empty")?;
// match \r\n or \n
static LINE_BREAK_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex"));
let span = input.span().unwrap_or(call.head); let span = input.span().unwrap_or(call.head);
match input { match input {
#[allow(clippy::needless_collect)]
// Collect is needed because the string may not live long enough for
// the Rc structure to continue using it. If split could take ownership
// of the split values, then this wouldn't be needed
PipelineData::Value(Value::String { val, .. }, ..) => { PipelineData::Value(Value::String { val, .. }, ..) => {
let mut lines = LINE_BREAK_REGEX let lines = if skip_empty {
.split(&val) val.lines()
.map(|s| s.to_string()) .filter_map(|s| {
.collect::<Vec<String>>(); if s.trim().is_empty() {
// if the last one is empty, remove it, as it was just
// a newline at the end of the input we got
if let Some(last) = lines.last() {
if last.is_empty() {
lines.pop();
}
}
let iter = lines.into_iter().filter_map(move |s| {
if skip_empty && s.trim().is_empty() {
None None
} else { } else {
Some(Value::string(s, span)) Some(Value::string(s, span))
} }
}); })
.collect()
} else {
val.lines().map(|s| Value::string(s, span)).collect()
};
Ok(iter.into_pipeline_data(engine_state.ctrlc.clone())) Ok(Value::list(lines, span).into_pipeline_data())
} }
PipelineData::Empty => Ok(PipelineData::Empty), PipelineData::Empty => Ok(PipelineData::Empty),
PipelineData::ListStream(stream, ..) => { PipelineData::ListStream(stream, ..) => {
@ -79,26 +63,17 @@ impl Command for Lines {
.filter_map(move |value| { .filter_map(move |value| {
let span = value.span(); let span = value.span();
if let Value::String { val, .. } = value { if let Value::String { val, .. } = value {
let mut lines = LINE_BREAK_REGEX Some(
.split(&val) val.lines()
.filter_map(|s| { .filter_map(|s| {
if skip_empty && s.trim().is_empty() { if skip_empty && s.trim().is_empty() {
None None
} else { } else {
Some(s.to_string()) Some(Value::string(s, span))
} }
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>(),
)
// if the last one is empty, remove it, as it was just
// a newline at the end of the input we got
if let Some(last) = lines.last() {
if last.is_empty() {
lines.pop();
}
}
Some(lines.into_iter().map(move |x| Value::string(x, span)))
} else { } else {
None None
} }
@ -124,11 +99,7 @@ impl Command for Lines {
stdout: Some(stream), stdout: Some(stream),
.. ..
} => Ok(RawStreamLinesAdapter::new(stream, head, skip_empty) } => Ok(RawStreamLinesAdapter::new(stream, head, skip_empty)
.enumerate() .map(move |x| x.unwrap_or_else(|err| Value::error(err, head)))
.map(move |(_idx, x)| match x {
Ok(x) => x,
Err(err) => Value::error(err, head),
})
.into_pipeline_data(ctrlc)), .into_pipeline_data(ctrlc)),
} }
} }
@ -152,38 +123,30 @@ struct RawStreamLinesAdapter {
skip_empty: bool, skip_empty: bool,
span: Span, span: Span,
incomplete_line: String, incomplete_line: String,
queue: Vec<String>, queue: VecDeque<String>,
} }
impl Iterator for RawStreamLinesAdapter { impl Iterator for RawStreamLinesAdapter {
type Item = Result<Value, ShellError>; type Item = Result<Value, ShellError>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
static LINE_BREAK_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex"));
loop { loop {
if !self.queue.is_empty() { if let Some(s) = self.queue.pop_front() {
let s = self.queue.remove(0usize);
if self.skip_empty && s.trim().is_empty() { if self.skip_empty && s.trim().is_empty() {
continue; continue;
} }
return Some(Ok(Value::string(s, self.span))); return Some(Ok(Value::string(s, self.span)));
} else { } else {
// inner is complete, feed out remaining state // inner is complete, feed out remaining state
if self.inner_complete { if self.inner_complete {
if !self.incomplete_line.is_empty() { return if self.incomplete_line.is_empty() {
let r = Some(Ok(Value::string( None
self.incomplete_line.to_string(), } else {
Some(Ok(Value::string(
std::mem::take(&mut self.incomplete_line),
self.span, self.span,
))); )))
self.incomplete_line = String::new(); };
return r;
}
return None;
} }
// pull more data from inner // pull more data from inner
@ -196,36 +159,29 @@ impl Iterator for RawStreamLinesAdapter {
Value::String { val, .. } => { Value::String { val, .. } => {
self.span = span; self.span = span;
let mut lines = LINE_BREAK_REGEX let mut lines = val.lines();
.split(&val)
.map(|s| s.to_string())
.collect::<Vec<_>>();
// handle incomplete line from previous // handle incomplete line from previous
if !self.incomplete_line.is_empty() { if !self.incomplete_line.is_empty() {
if let Some(first) = lines.first() { if let Some(first) = lines.next() {
let new_incomplete_line = self.incomplete_line.push_str(first);
self.incomplete_line.to_string() + first.as_str(); self.queue.push_back(std::mem::take(
lines.splice(0..1, vec![new_incomplete_line]); &mut self.incomplete_line,
self.incomplete_line = String::new(); ));
}
}
// store incomplete line from current
if let Some(last) = lines.last() {
if last.is_empty() {
// we ended on a line ending
lines.pop();
} else {
// incomplete line, save for next time
if let Some(s) = lines.pop() {
self.incomplete_line = s;
}
} }
} }
// save completed lines // save completed lines
self.queue.append(&mut lines); self.queue.extend(lines.map(String::from));
if !val.ends_with('\n') {
// incomplete line, save for next time
// if `val` and `incomplete_line` were empty,
// then pop will return none
if let Some(s) = self.queue.pop_back() {
self.incomplete_line = s;
}
}
} }
// Propagate errors by explicitly matching them before the final case. // Propagate errors by explicitly matching them before the final case.
Value::Error { error, .. } => return Some(Err(*error)), Value::Error { error, .. } => return Some(Err(*error)),
@ -256,7 +212,7 @@ impl RawStreamLinesAdapter {
span, span,
skip_empty, skip_empty,
incomplete_line: String::new(), incomplete_line: String::new(),
queue: Vec::<String>::new(), queue: VecDeque::new(),
inner_complete: false, inner_complete: false,
} }
} }

View file

@ -4,7 +4,6 @@
use std::fmt::{Display, LowerExp}; use std::fmt::{Display, LowerExp};
use std::io; use std::io;
use std::io::{BufRead, BufReader};
use std::num::FpCategory; use std::num::FpCategory;
use super::error::{Error, ErrorCode, Result}; use super::error::{Error, ErrorCode, Result};
@ -1034,20 +1033,7 @@ where
T: ser::Serialize, T: ser::Serialize,
{ {
let vec = to_vec(value)?; let vec = to_vec(value)?;
let string = remove_json_whitespace(vec); let string = String::from_utf8(vec)?;
Ok(string) let output = string.lines().map(str::trim).collect();
} Ok(output)
fn remove_json_whitespace(v: Vec<u8>) -> String {
let reader = BufReader::new(&v[..]);
let mut output = String::new();
for line in reader.lines() {
match line {
Ok(line) => output.push_str(line.trim().trim_end()),
_ => {
eprintln!("Error removing JSON whitespace");
}
}
}
output
} }