Merge pull request #1825 from cbjadwani/uniq-utf8-issues

uniq: utf-8 issues
This commit is contained in:
Sylvestre Ledru 2021-03-16 21:18:31 +01:00 committed by GitHub
commit 618d4a4fa5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 31 additions and 19 deletions

View file

@ -10,7 +10,7 @@ extern crate uucore;
use clap::{App, Arg, ArgMatches};
use std::fs::File;
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write};
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Result, Write};
use std::path::Path;
use std::str::FromStr;
@ -61,8 +61,7 @@ impl Uniq {
let delimiters = &self.delimiters;
let line_terminator = self.get_line_terminator();
for io_line in reader.split(line_terminator) {
let line = String::from_utf8(crash_if_err!(1, io_line)).unwrap();
for line in reader.split(line_terminator).map(get_line_string) {
if !lines.is_empty() && self.cmp_keys(&lines[0], &line) {
let print_delimiter = delimiters == &Delimiters::Prepend
|| (delimiters == &Delimiters::Separate && first_line_printed);
@ -80,22 +79,19 @@ impl Uniq {
fn skip_fields<'a>(&self, line: &'a str) -> &'a str {
if let Some(skip_fields) = self.skip_fields {
if line.split_whitespace().count() > skip_fields {
let mut field = 0;
let mut i = 0;
while field < skip_fields && i < line.len() {
while i < line.len() && line.chars().nth(i).unwrap().is_whitespace() {
i += 1;
}
while i < line.len() && !line.chars().nth(i).unwrap().is_whitespace() {
i += 1;
}
field += 1;
let mut i = 0;
let mut char_indices = line.char_indices();
for _ in 0..skip_fields {
if char_indices.find(|(_, c)| !c.is_whitespace()) == None {
return "";
}
match char_indices.find(|(_, c)| c.is_whitespace()) {
None => return "",
Some((next_field_i, _)) => i = next_field_i,
}
&line[i..]
} else {
""
}
&line[i..]
} else {
line
}
@ -199,6 +195,11 @@ impl Uniq {
}
}
fn get_line_string(io_line: Result<Vec<u8>>) -> String {
let line_bytes = crash_if_err!(1, io_line);
crash_if_err!(1, String::from_utf8(line_bytes))
}
fn opt_parsed<T: FromStr>(opt_name: &str, matches: &ArgMatches) -> Option<T> {
matches.value_of(opt_name).map(|arg_str| {
let opt_val: Option<T> = arg_str.parse().ok();

View file

@ -138,3 +138,12 @@ fn test_stdin_zero_terminated() {
.run()
.stdout_is_fixture("sorted-zero-terminated.expected");
}
// Checks that feeding `uniq` an input file that is not valid UTF-8 makes the
// command fail with a decoding error message rather than succeeding.
#[test]
fn test_invalid_utf8() {
new_ucmd!()
// Fixture file name; presumably the fixture whose second line holds the raw
// bytes 0xCC 0xCD — verify against the fixtures directory.
.arg("not-utf8-sequence.txt")
.run()
.failure()
// NOTE(review): "index 0" looks like an offset relative to the offending line,
// which would fit lines being decoded one at a time — confirm against the reader loop.
// `stderr_only` presumably also requires stdout to be empty — confirm in the test harness.
.stderr_only("uniq: error: invalid utf-8 sequence of 1 bytes from index 0");
}

View file

@ -0,0 +1,2 @@
Next line contains two bytes - 0xCC and 0xCD - which are not a valid utf-8 sequence
ÌÍ

View file

@ -1,2 +1,2 @@
aaa aa a
aaa ⟪⟫ a
aa a

View file

@ -1,4 +1,4 @@
aaa aa a
aaa ⟪⟫ a
ZZZ aa a
ZZZ aa a
ZZZ bb a