Adding JSON lines file support to dataframes (#9291)

# Description
Provides the ability to read and write [JSON
lines](https://jsonlines.org/) files. This is accomplished by exposing
the support already in Polars.
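
Both the reader and the writer go through Polars' JSON-lines ("NDJSON") support, as the diff below shows. For context, here is a minimal standalone sketch of that Polars API; the file names, the schema-inference length, and the explicit `with_json_format` call on the writer are illustrative (the new `dfr to-jsonl` command relies on the writer's default format), and a polars build with the `json` feature is assumed:

```rust
use std::fs::File;
use std::io::{BufReader, BufWriter};

use polars::prelude::{JsonFormat, JsonReader, JsonWriter, PolarsResult, SerReader, SerWriter};

fn main() -> PolarsResult<()> {
    // Read: every line of the input file is one JSON object, i.e. one dataframe row.
    let input = File::open("input.jsonl")?;
    let mut df = JsonReader::new(BufReader::new(input))
        .with_json_format(JsonFormat::JsonLines)
        .infer_schema_len(Some(100)) // scan up to 100 lines to infer column types
        .finish()?;

    // Write: serialize the dataframe back out, one JSON object per line.
    let output = File::create("output.jsonl")?;
    JsonWriter::new(BufWriter::new(output))
        .with_json_format(JsonFormat::JsonLines)
        .finish(&mut df)?;

    Ok(())
}
```

The `dfr open` and `dfr to-jsonl` changes below wrap essentially these calls in nushell's command plumbing.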

## Opening a JSON lines file 
<img width="1668" alt="Screenshot 2023-05-25 at 5 25 30 PM"
src="https://github.com/nushell/nushell/assets/56345/3b213c3d-eea1-440a-8425-4ce4b39ab7d1">

## Saving a dataframe to a JSON lines file
<img width="848" alt="Screenshot 2023-05-25 at 5 15 57 PM"
src="https://github.com/nushell/nushell/assets/56345/56089990-e14b-4f01-b676-5abab9333d7e">
Jack Wright, 2023-05-26 14:32:37 -07:00 (committed by GitHub)
parent 5f92fd20e9
commit 8144926dc7
3 changed files with 142 additions and 3 deletions

@@ -24,6 +24,7 @@ mod take;
 mod to_arrow;
 mod to_csv;
 mod to_df;
+mod to_json_lines;
 mod to_nu;
 mod to_parquet;
 mod with_column;
@@ -56,6 +57,7 @@ pub use take::TakeDF;
 pub use to_arrow::ToArrow;
 pub use to_csv::ToCSV;
 pub use to_df::ToDataFrame;
+pub use to_json_lines::ToJsonLines;
 pub use to_nu::ToNu;
 pub use to_parquet::ToParquet;
 pub use with_column::WithColumn;
@@ -98,6 +100,7 @@ pub fn add_eager_decls(working_set: &mut StateWorkingSet) {
         ToDataFrame,
         ToNu,
         ToParquet,
+        ToJsonLines,
         WithColumn
     );
 }

@@ -9,8 +9,8 @@ use nu_protocol::{
 use std::{fs::File, io::BufReader, path::PathBuf};
 
 use polars::prelude::{
-    CsvEncoding, CsvReader, IpcReader, JsonReader, LazyCsvReader, LazyFileListReader, LazyFrame,
-    ParallelStrategy, ParquetReader, ScanArgsIpc, ScanArgsParquet, SerReader,
+    CsvEncoding, CsvReader, IpcReader, JsonFormat, JsonReader, LazyCsvReader, LazyFileListReader,
+    LazyFrame, ParallelStrategy, ParquetReader, ScanArgsIpc, ScanArgsParquet, SerReader,
 };
 
 #[derive(Clone)]
@@ -22,7 +22,7 @@ impl Command for OpenDataFrame {
     }
 
     fn usage(&self) -> &str {
-        "Opens CSV, JSON, arrow, or parquet file to create dataframe."
+        "Opens CSV, JSON, JSON lines, arrow, or parquet file to create dataframe."
     }
 
     fn signature(&self) -> Signature {
@@ -118,6 +118,7 @@ fn command(
         "parquet" => from_parquet(engine_state, stack, call),
         "ipc" | "arrow" => from_ipc(engine_state, stack, call),
         "json" => from_json(engine_state, stack, call),
+        "jsonl" => from_jsonl(engine_state, stack, call),
         _ => Err(ShellError::FileNotFoundCustom(
             format!("{msg}. Supported values: csv, tsv, parquet, ipc, arrow, json"),
             blamed,
@@ -299,6 +300,44 @@ fn from_json(
     Ok(df.into_value(call.head))
 }
 
+fn from_jsonl(
+    engine_state: &EngineState,
+    stack: &mut Stack,
+    call: &Call,
+) -> Result<Value, ShellError> {
+    let infer_schema: Option<usize> = call.get_flag(engine_state, stack, "infer-schema")?;
+    let file: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
+    let file = File::open(&file.item).map_err(|e| {
+        ShellError::GenericError(
+            "Error opening file".into(),
+            e.to_string(),
+            Some(file.span),
+            None,
+            Vec::new(),
+        )
+    })?;
+
+    let buf_reader = BufReader::new(file);
+
+    let reader = JsonReader::new(buf_reader)
+        .with_json_format(JsonFormat::JsonLines)
+        .infer_schema_len(infer_schema);
+
+    let df: NuDataFrame = reader
+        .finish()
+        .map_err(|e| {
+            ShellError::GenericError(
+                "Json lines reader error".into(),
+                format!("{e:?}"),
+                Some(call.head),
+                None,
+                Vec::new(),
+            )
+        })?
+        .into();
+
+    Ok(df.into_value(call.head))
+}
+
 fn from_csv(
     engine_state: &EngineState,
     stack: &mut Stack,

@@ -0,0 +1,97 @@
+use std::{fs::File, io::BufWriter, path::PathBuf};
+
+use nu_engine::CallExt;
+use nu_protocol::{
+    ast::Call,
+    engine::{Command, EngineState, Stack},
+    Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape, Type, Value,
+};
+use polars::prelude::{JsonWriter, SerWriter};
+
+use super::super::values::NuDataFrame;
+
+#[derive(Clone)]
+pub struct ToJsonLines;
+
+impl Command for ToJsonLines {
+    fn name(&self) -> &str {
+        "dfr to-jsonl"
+    }
+
+    fn usage(&self) -> &str {
+        "Saves dataframe to a JSON lines file."
+    }
+
+    fn signature(&self) -> Signature {
+        Signature::build(self.name())
+            .required("file", SyntaxShape::Filepath, "file path to save dataframe")
+            .input_type(Type::Custom("dataframe".into()))
+            .output_type(Type::Any)
+            .category(Category::Custom("dataframe".into()))
+    }
+
+    fn examples(&self) -> Vec<Example> {
+        vec![Example {
+            description: "Saves dataframe to JSON lines file",
+            example: "[[a b]; [1 2] [3 4]] | dfr into-df | dfr to-jsonl test.jsonl",
+            result: None,
+        }]
+    }
+
+    fn run(
+        &self,
+        engine_state: &EngineState,
+        stack: &mut Stack,
+        call: &Call,
+        input: PipelineData,
+    ) -> Result<PipelineData, ShellError> {
+        command(engine_state, stack, call, input)
+    }
+}
+
+fn command(
+    engine_state: &EngineState,
+    stack: &mut Stack,
+    call: &Call,
+    input: PipelineData,
+) -> Result<PipelineData, ShellError> {
+    let file_name: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
+    let mut df = NuDataFrame::try_from_pipeline(input, call.head)?;
+
+    let file = File::create(&file_name.item).map_err(|e| {
+        ShellError::GenericError(
+            "Error with file name".into(),
+            e.to_string(),
+            Some(file_name.span),
+            None,
+            Vec::new(),
+        )
+    })?;
+    let buf_writer = BufWriter::new(file);
+
+    JsonWriter::new(buf_writer)
+        .finish(df.as_mut())
+        .map_err(|e| {
+            ShellError::GenericError(
+                "Error saving file".into(),
+                e.to_string(),
+                Some(file_name.span),
+                None,
+                Vec::new(),
+            )
+        })?;
+
+    let file_value = Value::String {
+        val: format!("saved {:?}", &file_name.item),
+        span: file_name.span,
+    };
+
+    Ok(PipelineData::Value(
+        Value::List {
+            vals: vec![file_value],
+            span: call.head,
+        },
+        None,
+    ))
+}
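
One detail worth noting: `dfr to-jsonl` calls `JsonWriter::finish` without selecting a format explicitly, so it relies on the Polars default producing one JSON object per line; adding an explicit `with_json_format(JsonFormat::JsonLines)` call would pin that behavior if the default ever changes.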