mirror of
https://github.com/uutils/coreutils
synced 2024-11-16 17:58:06 +00:00
Merge pull request #1825 from cbjadwani/uniq-utf8-issues
uniq: utf-8 issues
This commit is contained in:
commit
618d4a4fa5
5 changed files with 31 additions and 19 deletions
|
@ -10,7 +10,7 @@ extern crate uucore;
|
|||
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use std::fs::File;
|
||||
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Write};
|
||||
use std::io::{stdin, stdout, BufRead, BufReader, BufWriter, Read, Result, Write};
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
|
@ -61,8 +61,7 @@ impl Uniq {
|
|||
let delimiters = &self.delimiters;
|
||||
let line_terminator = self.get_line_terminator();
|
||||
|
||||
for io_line in reader.split(line_terminator) {
|
||||
let line = String::from_utf8(crash_if_err!(1, io_line)).unwrap();
|
||||
for line in reader.split(line_terminator).map(get_line_string) {
|
||||
if !lines.is_empty() && self.cmp_keys(&lines[0], &line) {
|
||||
let print_delimiter = delimiters == &Delimiters::Prepend
|
||||
|| (delimiters == &Delimiters::Separate && first_line_printed);
|
||||
|
@ -80,22 +79,19 @@ impl Uniq {
|
|||
|
||||
fn skip_fields<'a>(&self, line: &'a str) -> &'a str {
|
||||
if let Some(skip_fields) = self.skip_fields {
|
||||
if line.split_whitespace().count() > skip_fields {
|
||||
let mut field = 0;
|
||||
let mut i = 0;
|
||||
while field < skip_fields && i < line.len() {
|
||||
while i < line.len() && line.chars().nth(i).unwrap().is_whitespace() {
|
||||
i += 1;
|
||||
}
|
||||
while i < line.len() && !line.chars().nth(i).unwrap().is_whitespace() {
|
||||
i += 1;
|
||||
}
|
||||
field += 1;
|
||||
let mut i = 0;
|
||||
let mut char_indices = line.char_indices();
|
||||
for _ in 0..skip_fields {
|
||||
if char_indices.find(|(_, c)| !c.is_whitespace()) == None {
|
||||
return "";
|
||||
}
|
||||
match char_indices.find(|(_, c)| c.is_whitespace()) {
|
||||
None => return "",
|
||||
|
||||
Some((next_field_i, _)) => i = next_field_i,
|
||||
}
|
||||
&line[i..]
|
||||
} else {
|
||||
""
|
||||
}
|
||||
&line[i..]
|
||||
} else {
|
||||
line
|
||||
}
|
||||
|
@ -199,6 +195,11 @@ impl Uniq {
|
|||
}
|
||||
}
|
||||
|
||||
fn get_line_string(io_line: Result<Vec<u8>>) -> String {
|
||||
let line_bytes = crash_if_err!(1, io_line);
|
||||
crash_if_err!(1, String::from_utf8(line_bytes))
|
||||
}
|
||||
|
||||
fn opt_parsed<T: FromStr>(opt_name: &str, matches: &ArgMatches) -> Option<T> {
|
||||
matches.value_of(opt_name).map(|arg_str| {
|
||||
let opt_val: Option<T> = arg_str.parse().ok();
|
||||
|
|
|
@ -138,3 +138,12 @@ fn test_stdin_zero_terminated() {
|
|||
.run()
|
||||
.stdout_is_fixture("sorted-zero-terminated.expected");
|
||||
}
|
||||
|
||||
// Feeding `uniq` a fixture that contains bytes which are not valid UTF-8 must
// make the command fail, printing only the decoding error on stderr.
#[test]
fn test_invalid_utf8() {
    // Exact message expected on stderr for the `not-utf8-sequence.txt` fixture.
    let expected_stderr = "uniq: error: invalid utf-8 sequence of 1 bytes from index 0";

    new_ucmd!()
        .arg("not-utf8-sequence.txt")
        .run()
        .failure()
        .stderr_only(expected_stderr);
}
|
||||
|
|
2
tests/fixtures/uniq/not-utf8-sequence.txt
vendored
Normal file
2
tests/fixtures/uniq/not-utf8-sequence.txt
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
Next line contains two bytes - 0xCC and 0xCD - which are not a valid utf-8 sequence
|
||||
ÌÍ
|
2
tests/fixtures/uniq/skip-2-fields.expected
vendored
2
tests/fixtures/uniq/skip-2-fields.expected
vendored
|
@ -1,2 +1,2 @@
|
|||
aaa aa a
|
||||
aaa ⟪⟫ a
|
||||
aa a
|
||||
|
|
2
tests/fixtures/uniq/skip-fields.txt
vendored
2
tests/fixtures/uniq/skip-fields.txt
vendored
|
@ -1,4 +1,4 @@
|
|||
aaa aa a
|
||||
aaa ⟪⟫ a
|
||||
ZZZ aa a
|
||||
ZZZ aa a
|
||||
ZZZ bb a
|
||||
|
|
Loading…
Reference in a new issue