Handle mixed LF+CRLF in lines (#7316)

This closes #4989. Previously, `lines` could not correctly split text input
that mixed CRLF _and_ LF line breaks: it chose a single separator for the
whole input (`\r\n` if it appeared anywhere, otherwise `\n`).

### Before:

![image](https://user-images.githubusercontent.com/26268125/205207685-b25da9e1-19fa-4abb-8ab2-0dd216c63fc0.png)

### After:


![image](https://user-images.githubusercontent.com/26268125/205207808-9f687242-a8c2-4b79-a12c-38b0583d8d52.png)
This commit is contained in:
Reilly Wood 2022-12-02 08:30:26 -08:00 committed by GitHub
parent 3ac36879e0
commit ee5a387300
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 32 additions and 17 deletions

1
Cargo.lock generated
View file

@ -2638,6 +2638,7 @@ dependencies = [
"rand 0.8.5",
"rayon",
"reedline",
"regex",
"reqwest",
"roxmltree",
"rstest",

View file

@ -68,6 +68,7 @@ powierza-coefficient = "1.0.1"
quick-xml = "0.25"
rand = "0.8"
rayon = "1.5.1"
regex = "1.6.0"
reqwest = {version = "0.11", features = ["blocking", "json"] }
roxmltree = "0.16.0"
rust-embed = "6.3.0"

View file

@ -4,6 +4,10 @@ use nu_protocol::{
Category, Example, IntoInterruptiblePipelineData, PipelineData, RawStream, ShellError,
Signature, Span, Type, Value,
};
use once_cell::sync::Lazy;
// regex can be replaced with fancy-regex once it supports `split()`
// https://github.com/fancy-regex/fancy-regex/issues/104
use regex::Regex;
/// Unit type that carries the `Command` implementation which splits string
/// input into separate lines on `\r\n` or `\n`.
#[derive(Clone)]
pub struct Lines;
@ -34,16 +38,18 @@ impl Command for Lines {
let head = call.head;
let ctrlc = engine_state.ctrlc.clone();
let skip_empty = call.has_flag("skip-empty");
// match \r\n or \n
static LINE_BREAK_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex"));
match input {
#[allow(clippy::needless_collect)]
// Collect is needed because the string may not live long enough for
// the Rc structure to continue using it. If split could take ownership
// of the split values, then this wouldn't be needed
PipelineData::Value(Value::String { val, span }, ..) => {
let split_char = if val.contains("\r\n") { "\r\n" } else { "\n" };
let mut lines = val
.split(split_char)
let mut lines = LINE_BREAK_REGEX
.split(&val)
.map(|s| s.to_string())
.collect::<Vec<String>>();
@ -66,18 +72,12 @@ impl Command for Lines {
Ok(iter.into_pipeline_data(engine_state.ctrlc.clone()))
}
PipelineData::ListStream(stream, ..) => {
let mut split_char = "\n";
let iter = stream
.into_iter()
.filter_map(move |value| {
if let Value::String { val, span } = value {
if split_char != "\r\n" && val.contains("\r\n") {
split_char = "\r\n";
}
let mut lines = val
.split(split_char)
let mut lines = LINE_BREAK_REGEX
.split(&val)
.filter_map(|s| {
if skip_empty && s.trim().is_empty() {
None
@ -153,6 +153,9 @@ impl Iterator for RawStreamLinesAdapter {
type Item = Result<Value, ShellError>;
fn next(&mut self) -> Option<Self::Item> {
static LINE_BREAK_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex"));
loop {
if !self.queue.is_empty() {
let s = self.queue.remove(0usize);
@ -188,11 +191,8 @@ impl Iterator for RawStreamLinesAdapter {
Value::String { val, span } => {
self.span = span;
let split_char =
if val.contains("\r\n") { "\r\n" } else { "\n" };
let mut lines = val
.split(split_char)
let mut lines = LINE_BREAK_REGEX
.split(&val)
.map(|s| s.to_string())
.collect::<Vec<_>>();

View file

@ -48,3 +48,16 @@ fn lines_multi_value_split() {
assert_eq!(actual.out, "6");
}
/// Checks that `lines` handles LF and CRLF line breaks within the same input:
/// a string containing one `\n` and one `\r\n` is piped through
/// `lines | length`, and the reported count must be 3.
#[test]
fn lines_mixed_line_endings() {
    let result = nu!(
        cwd: "tests/fixtures/formats", pipeline(
r#"
"foo\nbar\r\nquux" | lines | length
"#
    ));

    assert_eq!(result.out, "3");
}