2017-12-13 21:02:42 +00:00
|
|
|
#![crate_name = "uu_join"]
|
|
|
|
|
|
|
|
/*
 * This file is part of the uutils coreutils package.
 *
 * (c) Konstantin Pospelov <kupospelov@gmail.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
|
|
|
|
|
2017-12-17 13:36:46 +00:00
|
|
|
extern crate clap;
|
2017-12-13 21:02:42 +00:00
|
|
|
|
|
|
|
#[macro_use]
|
|
|
|
extern crate uucore;
|
|
|
|
|
|
|
|
use std::fs::File;
|
2017-12-16 16:26:25 +00:00
|
|
|
use std::io::{BufRead, BufReader, Lines, Stdin, stdin};
|
2017-12-13 21:02:42 +00:00
|
|
|
use std::cmp::Ordering;
|
2017-12-17 13:36:46 +00:00
|
|
|
use clap::{App, Arg};
|
2017-12-13 21:02:42 +00:00
|
|
|
|
|
|
|
static NAME: &'static str = "join";
|
|
|
|
static VERSION: &'static str = env!("CARGO_PKG_VERSION");
|
|
|
|
|
|
|
|
/// Selects one of the two input files, or neither of them.
#[derive(PartialEq)]
enum FileNum {
    /// Neither input file.
    None,
    /// The first input file.
    File1,
    /// The second input file.
    File2,
}
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
/// Describes how an input line is divided into fields.
#[derive(Copy, Clone)]
enum Sep {
    /// Fields are delimited by this exact character.
    Char(char),
    /// The line is not divided: the whole line is one field.
    Line,
    /// Fields are delimited by runs of whitespace.
    Whitespaces,
}
|
|
|
|
|
2017-12-13 21:02:42 +00:00
|
|
|
struct Settings {
|
|
|
|
key1: usize,
|
|
|
|
key2: usize,
|
|
|
|
print_unpaired: FileNum,
|
|
|
|
ignore_case: bool,
|
2017-12-24 13:22:48 +00:00
|
|
|
separator: Sep,
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Default for Settings {
|
|
|
|
fn default() -> Settings {
|
|
|
|
Settings {
|
|
|
|
key1: 0,
|
|
|
|
key2: 0,
|
|
|
|
print_unpaired: FileNum::None,
|
|
|
|
ignore_case: false,
|
2017-12-24 13:22:48 +00:00
|
|
|
separator: Sep::Whitespaces,
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
struct Line {
|
|
|
|
fields: Vec<String>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Line {
|
2017-12-24 13:22:48 +00:00
|
|
|
fn new(string: String, separator: Sep) -> Line {
|
|
|
|
let fields = match separator {
|
|
|
|
Sep::Whitespaces => string.split_whitespace().map(String::from).collect(),
|
|
|
|
Sep::Char(sep) => string.split(sep).map(String::from).collect(),
|
|
|
|
Sep::Line => vec![string],
|
|
|
|
};
|
|
|
|
|
|
|
|
Line { fields }
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Get field at index.
|
|
|
|
fn get_field(&self, index: usize) -> &str {
|
|
|
|
if index < self.fields.len() {
|
|
|
|
&self.fields[index]
|
|
|
|
} else {
|
|
|
|
""
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
/// Print each field except the one at the index.
|
|
|
|
fn print_fields(&self, index: usize, separator: char) {
|
|
|
|
for i in 0..self.fields.len() {
|
2017-12-13 21:02:42 +00:00
|
|
|
if i != index {
|
2017-12-24 13:22:48 +00:00
|
|
|
print!("{}{}", separator, self.fields[i]);
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-16 16:26:25 +00:00
|
|
|
struct State<'a> {
|
2017-12-13 21:02:42 +00:00
|
|
|
key: usize,
|
|
|
|
print_unpaired: bool,
|
2017-12-16 16:26:25 +00:00
|
|
|
lines: Lines<Box<BufRead + 'a>>,
|
2017-12-13 21:02:42 +00:00
|
|
|
seq: Vec<Line>,
|
|
|
|
}
|
|
|
|
|
2017-12-16 16:26:25 +00:00
|
|
|
impl<'a> State<'a> {
|
|
|
|
fn new(name: &str, stdin: &'a Stdin, key: usize, print_unpaired: bool) -> State<'a> {
|
|
|
|
let f = if name == "-" {
|
|
|
|
Box::new(stdin.lock()) as Box<BufRead>
|
2017-12-13 21:02:42 +00:00
|
|
|
} else {
|
|
|
|
match File::open(name) {
|
2017-12-16 16:26:25 +00:00
|
|
|
Ok(file) => Box::new(BufReader::new(file)) as Box<BufRead>,
|
2017-12-13 21:02:42 +00:00
|
|
|
Err(err) => crash!(1, "{}: {}", name, err),
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
State {
|
|
|
|
key: key,
|
|
|
|
print_unpaired: print_unpaired,
|
2017-12-16 16:26:25 +00:00
|
|
|
lines: f.lines(),
|
2017-12-13 21:02:42 +00:00
|
|
|
seq: Vec::new(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Compare the key fields of the two current lines.
|
|
|
|
fn compare(&self, other: &State, ignore_case: bool) -> Ordering {
|
|
|
|
let key1 = self.seq[0].get_field(self.key);
|
|
|
|
let key2 = other.seq[0].get_field(other.key);
|
|
|
|
|
|
|
|
compare(key1, key2, ignore_case)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Skip the current unpaired line.
|
2017-12-24 13:22:48 +00:00
|
|
|
fn skip_line(&mut self, read_sep: Sep, write_sep: char) {
|
2017-12-13 21:02:42 +00:00
|
|
|
if self.print_unpaired {
|
2017-12-24 13:22:48 +00:00
|
|
|
self.print_unpaired_line(&self.seq[0], write_sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
match self.read_line(read_sep) {
|
2017-12-16 13:11:39 +00:00
|
|
|
Some(line) => self.seq[0] = line,
|
2017-12-16 16:26:25 +00:00
|
|
|
None => self.seq.clear(),
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Keep reading line sequence until the key does not change, return
|
|
|
|
/// the first line whose key differs.
|
2017-12-24 13:22:48 +00:00
|
|
|
fn extend(&mut self, read_sep: Sep, ignore_case: bool) -> Option<Line> {
|
|
|
|
while let Some(line) = self.read_line(read_sep) {
|
2017-12-13 21:02:42 +00:00
|
|
|
let diff = compare(
|
|
|
|
self.seq[0].get_field(self.key),
|
|
|
|
line.get_field(self.key),
|
|
|
|
ignore_case,
|
|
|
|
);
|
|
|
|
|
|
|
|
if diff == Ordering::Equal {
|
|
|
|
self.seq.push(line);
|
|
|
|
} else {
|
|
|
|
return Some(line);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Combine two line sequences.
|
2017-12-24 13:22:48 +00:00
|
|
|
fn combine(&self, other: &State, write_sep: char) {
|
2017-12-13 21:02:42 +00:00
|
|
|
let key = self.seq[0].get_field(self.key);
|
|
|
|
|
|
|
|
for line1 in &self.seq {
|
|
|
|
for line2 in &other.seq {
|
|
|
|
print!("{}", key);
|
2017-12-24 13:22:48 +00:00
|
|
|
line1.print_fields(self.key, write_sep);
|
|
|
|
line2.print_fields(other.key, write_sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
println!();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Reset with the next line.
|
|
|
|
fn reset(&mut self, next_line: Option<Line>) {
|
|
|
|
self.seq.clear();
|
|
|
|
|
|
|
|
if let Some(line) = next_line {
|
|
|
|
self.seq.push(line);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn has_line(&self) -> bool {
|
|
|
|
!self.seq.is_empty()
|
|
|
|
}
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
fn initialize(&mut self, read_sep: Sep) {
|
|
|
|
if let Some(line) = self.read_line(read_sep) {
|
2017-12-13 21:02:42 +00:00
|
|
|
self.seq.push(line);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
fn finalize(&mut self, read_sep: Sep, write_sep: char) {
|
2017-12-13 21:02:42 +00:00
|
|
|
if self.has_line() && self.print_unpaired {
|
2017-12-24 13:22:48 +00:00
|
|
|
self.print_unpaired_line(&self.seq[0], write_sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
while let Some(line) = self.read_line(read_sep) {
|
|
|
|
self.print_unpaired_line(&line, write_sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
fn read_line(&mut self, sep: Sep) -> Option<Line> {
|
2017-12-13 21:02:42 +00:00
|
|
|
match self.lines.next() {
|
2017-12-24 13:22:48 +00:00
|
|
|
Some(value) => Some(Line::new(crash_if_err!(1, value), sep)),
|
2017-12-13 21:02:42 +00:00
|
|
|
None => None,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
fn print_unpaired_line(&self, line: &Line, sep: char) {
|
2017-12-13 21:02:42 +00:00
|
|
|
print!("{}", line.get_field(self.key));
|
2017-12-24 13:22:48 +00:00
|
|
|
line.print_fields(self.key, sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
println!();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn uumain(args: Vec<String>) -> i32 {
|
2017-12-17 13:36:46 +00:00
|
|
|
let matches = App::new(NAME)
|
|
|
|
.version(VERSION)
|
|
|
|
.about(
|
|
|
|
"For each pair of input lines with identical join fields, write a line to
|
|
|
|
standard output. The default join field is the first, delimited by blanks.
|
|
|
|
|
|
|
|
When FILE1 or FILE2 (not both) is -, read standard input.")
|
|
|
|
.help_message("display this help and exit")
|
|
|
|
.version_message("display version and exit")
|
|
|
|
.arg(Arg::with_name("a")
|
|
|
|
.short("a")
|
|
|
|
.takes_value(true)
|
|
|
|
.possible_values(&["1", "2"])
|
|
|
|
.value_name("FILENUM")
|
|
|
|
.help("also print unpairable lines from file FILENUM, where
|
|
|
|
FILENUM is 1 or 2, corresponding to FILE1 or FILE2"))
|
|
|
|
.arg(Arg::with_name("i")
|
|
|
|
.short("i")
|
|
|
|
.long("ignore-case")
|
|
|
|
.help("ignore differences in case when comparing fields"))
|
|
|
|
.arg(Arg::with_name("j")
|
|
|
|
.short("j")
|
|
|
|
.takes_value(true)
|
|
|
|
.value_name("FIELD")
|
|
|
|
.help("equivalent to '-1 FIELD -2 FIELD'"))
|
2017-12-24 13:22:48 +00:00
|
|
|
.arg(Arg::with_name("t")
|
|
|
|
.short("t")
|
|
|
|
.takes_value(true)
|
|
|
|
.value_name("CHAR")
|
|
|
|
.help("use CHAR as input and output field separator"))
|
2017-12-17 13:36:46 +00:00
|
|
|
.arg(Arg::with_name("1")
|
|
|
|
.short("1")
|
|
|
|
.takes_value(true)
|
|
|
|
.value_name("FIELD")
|
|
|
|
.help("join on this FIELD of file 1"))
|
|
|
|
.arg(Arg::with_name("2")
|
|
|
|
.short("2")
|
|
|
|
.takes_value(true)
|
|
|
|
.value_name("FIELD")
|
|
|
|
.help("join on this FIELD of file 2"))
|
|
|
|
.arg(Arg::with_name("file1")
|
|
|
|
.required(true)
|
|
|
|
.value_name("FILE1")
|
|
|
|
.hidden(true))
|
|
|
|
.arg(Arg::with_name("file2")
|
|
|
|
.required(true)
|
|
|
|
.value_name("FILE2")
|
|
|
|
.hidden(true))
|
|
|
|
.get_matches_from(args);
|
|
|
|
|
|
|
|
let keys = parse_field_number(matches.value_of("j"));
|
|
|
|
let key1 = parse_field_number(matches.value_of("1"));
|
|
|
|
let key2 = parse_field_number(matches.value_of("2"));
|
2017-12-13 21:02:42 +00:00
|
|
|
|
2017-12-17 13:36:46 +00:00
|
|
|
let mut settings: Settings = Default::default();
|
|
|
|
settings.print_unpaired = match matches.value_of("a") {
|
2017-12-13 21:02:42 +00:00
|
|
|
Some(value) => {
|
2017-12-17 13:36:46 +00:00
|
|
|
match value {
|
2017-12-13 21:02:42 +00:00
|
|
|
"1" => FileNum::File1,
|
|
|
|
"2" => FileNum::File2,
|
|
|
|
value => crash!(1, "invalid file number: {}", value),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
None => FileNum::None,
|
|
|
|
};
|
2017-12-17 13:36:46 +00:00
|
|
|
settings.ignore_case = matches.is_present("i");
|
2017-12-13 21:02:42 +00:00
|
|
|
settings.key1 = get_field_number(keys, key1);
|
|
|
|
settings.key2 = get_field_number(keys, key2);
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
if let Some(value) = matches.value_of("t") {
|
|
|
|
settings.separator = match value.len() {
|
|
|
|
0 => Sep::Line,
|
|
|
|
1 => Sep::Char(value.chars().nth(0).unwrap()),
|
|
|
|
_ => crash!(1, "multi-character tab {}", value),
|
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2017-12-17 13:36:46 +00:00
|
|
|
let file1 = matches.value_of("file1").unwrap();
|
|
|
|
let file2 = matches.value_of("file2").unwrap();
|
2017-12-13 21:02:42 +00:00
|
|
|
|
2017-12-17 13:36:46 +00:00
|
|
|
if file1 == "-" && file2 == "-" {
|
2017-12-13 21:02:42 +00:00
|
|
|
crash!(1, "both files cannot be standard input");
|
|
|
|
}
|
|
|
|
|
2017-12-17 13:36:46 +00:00
|
|
|
exec(file1, file2, &settings)
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
|
2017-12-17 13:36:46 +00:00
|
|
|
fn exec(file1: &str, file2: &str, settings: &Settings) -> i32 {
|
2017-12-16 16:26:25 +00:00
|
|
|
let stdin = stdin();
|
|
|
|
|
2017-12-13 21:02:42 +00:00
|
|
|
let mut state1 = State::new(
|
2017-12-17 13:36:46 +00:00
|
|
|
&file1,
|
2017-12-16 16:26:25 +00:00
|
|
|
&stdin,
|
2017-12-13 21:02:42 +00:00
|
|
|
settings.key1,
|
|
|
|
settings.print_unpaired == FileNum::File1,
|
|
|
|
);
|
|
|
|
|
|
|
|
let mut state2 = State::new(
|
2017-12-17 13:36:46 +00:00
|
|
|
&file2,
|
2017-12-16 16:26:25 +00:00
|
|
|
&stdin,
|
2017-12-13 21:02:42 +00:00
|
|
|
settings.key2,
|
|
|
|
settings.print_unpaired == FileNum::File2,
|
|
|
|
);
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
let write_sep = match settings.separator {
|
|
|
|
Sep::Char(sep) => sep,
|
|
|
|
_ => ' ',
|
|
|
|
};
|
|
|
|
|
|
|
|
state1.initialize(settings.separator);
|
|
|
|
state2.initialize(settings.separator);
|
2017-12-13 21:02:42 +00:00
|
|
|
|
|
|
|
while state1.has_line() && state2.has_line() {
|
|
|
|
let diff = state1.compare(&state2, settings.ignore_case);
|
|
|
|
|
|
|
|
match diff {
|
|
|
|
Ordering::Less => {
|
2017-12-24 13:22:48 +00:00
|
|
|
state1.skip_line(settings.separator, write_sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
Ordering::Greater => {
|
2017-12-24 13:22:48 +00:00
|
|
|
state2.skip_line(settings.separator, write_sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
}
|
|
|
|
Ordering::Equal => {
|
2017-12-24 13:22:48 +00:00
|
|
|
let next_line1 = state1.extend(settings.separator, settings.ignore_case);
|
|
|
|
let next_line2 = state2.extend(settings.separator, settings.ignore_case);
|
2017-12-13 21:02:42 +00:00
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
state1.combine(&state2, write_sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
|
|
|
|
state1.reset(next_line1);
|
|
|
|
state2.reset(next_line2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-12-24 13:22:48 +00:00
|
|
|
state1.finalize(settings.separator, write_sep);
|
|
|
|
state2.finalize(settings.separator, write_sep);
|
2017-12-13 21:02:42 +00:00
|
|
|
|
|
|
|
0
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Check that keys for both files and for a particular file are not
|
|
|
|
/// contradictory and return the zero-based key index.
|
|
|
|
fn get_field_number(keys: Option<usize>, key: Option<usize>) -> usize {
|
|
|
|
if let Some(keys) = keys {
|
|
|
|
if let Some(key) = key {
|
|
|
|
if keys != key {
|
|
|
|
crash!(1, "incompatible join fields {}, {}", keys, key);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return keys - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
match key {
|
|
|
|
Some(key) => key - 1,
|
|
|
|
None => 0,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Parse the specified field string as a natural number and return it.
|
2017-12-17 13:36:46 +00:00
|
|
|
fn parse_field_number(value: Option<&str>) -> Option<usize> {
|
2017-12-13 21:02:42 +00:00
|
|
|
match value {
|
|
|
|
Some(value) => {
|
|
|
|
match value.parse() {
|
|
|
|
Ok(result) if result > 0 => Some(result),
|
|
|
|
_ => crash!(1, "invalid field number: '{}'", value),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
None => None,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Order two field values.
///
/// With `ignore_case` set, both fields are converted to lowercase before
/// being compared; otherwise they are compared as they are.
fn compare(field1: &str, field2: &str, ignore_case: bool) -> Ordering {
    if ignore_case {
        field1.to_lowercase().cmp(&field2.to_lowercase())
    } else {
        field1.cmp(field2)
    }
}
|