Document lexer (#2865)

* Update dependencies

* Document the lexer and lightly improve its names

The bulk of this pull request adds a substantial amount of new inline
documentation for the lexer. Along the way, I made a few minor changes
to the names in the lexer, most of which were internal.

The main change that affects other files is renaming `group` to `block`,
since the function is actually parsing a block (a list of groups).

* Fix rustfmt

* Update lock

Co-authored-by: Jonathan Turner <jonathandturner@users.noreply.github.com>
Co-authored-by: Jonathan Turner <jonathan.d.turner@gmail.com>
This commit is contained in:
Yehuda Katz 2021-01-06 19:03:00 -08:00 committed by GitHub
parent eb62fd466e
commit f410fb6689
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 262 additions and 121 deletions

44
Cargo.lock generated
View file

@ -282,7 +282,7 @@ dependencies = [
"memchr", "memchr",
"num_cpus", "num_cpus",
"once_cell", "once_cell",
"pin-project-lite 0.2.0", "pin-project-lite 0.2.1",
"pin-utils", "pin-utils",
"slab 0.4.2", "slab 0.4.2",
"wasm-bindgen-futures 0.4.19", "wasm-bindgen-futures 0.4.19",
@ -801,9 +801,9 @@ dependencies = [
[[package]] [[package]]
name = "const_fn" name = "const_fn"
version = "0.4.4" version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd51eab21ab4fd6a3bf889e2d0958c0a6e3a61ad04260325e919e652a2a62826" checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6"
[[package]] [[package]]
name = "constant_time_eq" name = "constant_time_eq"
@ -1742,7 +1742,7 @@ dependencies = [
"futures-io", "futures-io",
"memchr", "memchr",
"parking", "parking",
"pin-project-lite 0.2.0", "pin-project-lite 0.2.1",
"waker-fn", "waker-fn",
] ]
@ -1813,7 +1813,7 @@ dependencies = [
"futures-sink", "futures-sink",
"futures-task", "futures-task",
"memchr", "memchr",
"pin-project 1.0.2", "pin-project 1.0.3",
"pin-utils", "pin-utils",
"proc-macro-hack", "proc-macro-hack",
"proc-macro-nested", "proc-macro-nested",
@ -2322,7 +2322,7 @@ dependencies = [
"httparse", "httparse",
"httpdate", "httpdate",
"itoa", "itoa",
"pin-project 1.0.2", "pin-project 1.0.3",
"socket2", "socket2",
"tokio 0.2.24", "tokio 0.2.24",
"tower-service", "tower-service",
@ -4016,11 +4016,11 @@ dependencies = [
[[package]] [[package]]
name = "pin-project" name = "pin-project"
version = "1.0.2" version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ccc2237c2c489783abd8c4c80e5450fc0e98644555b1364da68cc29aa151ca7" checksum = "5a83804639aad6ba65345661744708855f9fbcb71176ea8d28d05aeb11d975e7"
dependencies = [ dependencies = [
"pin-project-internal 1.0.2", "pin-project-internal 1.0.3",
] ]
[[package]] [[package]]
@ -4036,9 +4036,9 @@ dependencies = [
[[package]] [[package]]
name = "pin-project-internal" name = "pin-project-internal"
version = "1.0.2" version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8e8d2bf0b23038a4424865103a4df472855692821aab4e4f5c3312d461d9e5f" checksum = "b7bcc46b8f73443d15bc1c5fecbb315718491fa9187fa483f0e359323cde8b3a"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -4053,9 +4053,9 @@ checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b"
[[package]] [[package]]
name = "pin-project-lite" name = "pin-project-lite"
version = "0.2.0" version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b063f57ec186e6140e2b8b6921e5f1bd89c7356dda5b33acc5401203ca6131c" checksum = "e36743d754ccdf9954c2e352ce2d4b106e024c814f6499c2dadff80da9a442d8"
[[package]] [[package]]
name = "pin-utils" name = "pin-utils"
@ -4210,9 +4210,9 @@ dependencies = [
[[package]] [[package]]
name = "ptree" name = "ptree"
version = "0.3.0" version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "828735579562f9be5e3a605016076cc47d7da3c29bf40aa44da28f161cb7f3c0" checksum = "95fd400102d16e4e90e0735c0eb1808ae569a4e62fb8e65a7d1e700611cae6ae"
dependencies = [ dependencies = [
"ansi_term 0.12.1", "ansi_term 0.12.1",
"atty", "atty",
@ -4547,7 +4547,7 @@ dependencies = [
"mime_guess", "mime_guess",
"native-tls", "native-tls",
"percent-encoding 2.1.0", "percent-encoding 2.1.0",
"pin-project-lite 0.2.0", "pin-project-lite 0.2.1",
"serde 1.0.118", "serde 1.0.118",
"serde_urlencoded 0.7.0", "serde_urlencoded 0.7.0",
"tokio 0.2.24", "tokio 0.2.24",
@ -4851,7 +4851,7 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6"
dependencies = [ dependencies = [
"semver-parser 0.10.1", "semver-parser 0.10.2",
] ]
[[package]] [[package]]
@ -4862,9 +4862,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]] [[package]]
name = "semver-parser" name = "semver-parser"
version = "0.10.1" version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42ef146c2ad5e5f4b037cd6ce2ebb775401729b19a82040c1beac9d36c7d1428" checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7"
dependencies = [ dependencies = [
"pest", "pest",
] ]
@ -4985,9 +4985,9 @@ dependencies = [
[[package]] [[package]]
name = "serde_yaml" name = "serde_yaml"
version = "0.8.14" version = "0.8.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7baae0a99f1a324984bcdc5f0718384c1f69775f1c7eec8b859b71b443e3fd7" checksum = "971be8f6e4d4a47163b405a3df70d14359186f9ab0f3a3ec37df144ca1ce089f"
dependencies = [ dependencies = [
"dtoa", "dtoa",
"linked-hash-map 0.5.3", "linked-hash-map 0.5.3",
@ -5787,7 +5787,7 @@ checksum = "9f47026cdc4080c07e49b37087de021820269d996f581aac150ef9e5583eefe3"
dependencies = [ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
"log 0.4.11", "log 0.4.11",
"pin-project-lite 0.2.0", "pin-project-lite 0.2.1",
"tracing-core", "tracing-core",
] ]

View file

@ -408,7 +408,7 @@ mod tests {
#[quickcheck] #[quickcheck]
fn quickcheck_parse(data: String) -> bool { fn quickcheck_parse(data: String) -> bool {
let (tokens, err) = nu_parser::lex(&data, 0); let (tokens, err) = nu_parser::lex(&data, 0);
let (lite_block, err2) = nu_parser::group(tokens); let (lite_block, err2) = nu_parser::block(tokens);
if err.is_none() && err2.is_none() { if err.is_none() && err2.is_none() {
let context = crate::evaluation_context::EvaluationContext::basic().unwrap(); let context = crate::evaluation_context::EvaluationContext::basic().unwrap();
let _ = nu_parser::classify_block(&lite_block, &context.scope); let _ = nu_parser::classify_block(&lite_block, &context.scope);

View file

@ -256,7 +256,7 @@ pub fn completion_location(line: &str, block: &Block, pos: usize) -> Vec<Complet
mod tests { mod tests {
use super::*; use super::*;
use nu_parser::{classify_block, group, lex, ParserScope}; use nu_parser::{block, classify_block, lex, ParserScope};
use nu_protocol::{Signature, SyntaxShape}; use nu_protocol::{Signature, SyntaxShape};
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
@ -307,7 +307,7 @@ mod tests {
pos: usize, pos: usize,
) -> Vec<LocationType> { ) -> Vec<LocationType> {
let (tokens, _) = lex(line, 0); let (tokens, _) = lex(line, 0);
let (lite_block, _) = group(tokens); let (lite_block, _) = block(tokens);
scope.enter_scope(); scope.enter_scope();
let (block, _) = classify_block(&lite_block, scope); let (block, _) = classify_block(&lite_block, scope);

View file

@ -207,7 +207,7 @@ fn parse_line(line: &str, ctx: &EvaluationContext) -> Result<ClassifiedBlock, Sh
if let Some(err) = err { if let Some(err) = err {
return Err(err.into()); return Err(err.into());
} }
let (lite_result, err) = nu_parser::group(lite_result); let (lite_result, err) = nu_parser::block(lite_result);
if let Some(err) = err { if let Some(err) = err {
return Err(err.into()); return Err(err.into());
} }

View file

@ -123,7 +123,7 @@ impl rustyline::validate::Validator for NuValidator {
} }
} }
let (_, err) = nu_parser::group(tokens); let (_, err) = nu_parser::block(tokens);
if let Some(err) = err { if let Some(err) = err {
if let nu_errors::ParseErrorReason::Eof { .. } = err.reason() { if let nu_errors::ParseErrorReason::Eof { .. } = err.reason() {

View file

@ -17,14 +17,14 @@ bigdecimal = {version = "0.2.0", features = ["serde"]}
codespan-reporting = {version = "0.11.0", features = ["serialization"]} codespan-reporting = {version = "0.11.0", features = ["serialization"]}
derive-new = "0.5.8" derive-new = "0.5.8"
getset = "0.1.1" getset = "0.1.1"
num-bigint = {version = "0.3.0", features = ["serde"]} num-bigint = {version = "0.3.1", features = ["serde"]}
num-traits = "0.2.12" num-traits = "0.2.14"
serde = {version = "1.0.115", features = ["derive"]} serde = {version = "1.0.118", features = ["derive"]}
# implement conversions # implement conversions
glob = "0.3.0" glob = "0.3.0"
serde_json = "1.0.57" serde_json = "1.0.61"
serde_yaml = "0.8.13" serde_yaml = "0.8.15"
toml = "0.5.6" toml = "0.5.8"
[build-dependencies] [build-dependencies]

View file

@ -12,12 +12,12 @@ version = "0.25.1"
bigdecimal = {version = "0.2.0", features = ["serde"]} bigdecimal = {version = "0.2.0", features = ["serde"]}
codespan-reporting = "0.11.0" codespan-reporting = "0.11.0"
derive-new = "0.5.8" derive-new = "0.5.8"
indexmap = {version = "1.6.0", features = ["serde-1"]} indexmap = {version = "1.6.1", features = ["serde-1"]}
log = "0.4.11" log = "0.4.11"
num-bigint = {version = "0.3.0", features = ["serde"]} num-bigint = {version = "0.3.1", features = ["serde"]}
num-traits = "0.2.12" num-traits = "0.2.14"
serde = "1.0.115" serde = "1.0.118"
shellexpand = "2.0.0" shellexpand = "2.1.0"
nu-errors = {version = "0.25.1", path = "../nu-errors"} nu-errors = {version = "0.25.1", path = "../nu-errors"}
nu-protocol = {version = "0.25.1", path = "../nu-protocol"} nu-protocol = {version = "0.25.1", path = "../nu-protocol"}

View file

@ -20,12 +20,21 @@ impl Token {
#[derive(Debug)] #[derive(Debug)]
pub enum TokenContents { pub enum TokenContents {
Bare(String), /// A baseline token is an atomic chunk of source code. This means that the
/// token contains the entirety of string literals, as well as the entirety
/// of sections delimited by paired delimiters.
///
/// For example, if the token begins with `{`, the baseline token continues
/// until the closing `}` (after taking comments and string literals into
/// consideration).
Baseline(String),
Pipe, Pipe,
Semicolon, Semicolon,
EOL, EOL,
} }
/// A `LiteCommand` is a list of words that will get meaning when processed by
/// the parser.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct LiteCommand { pub struct LiteCommand {
pub parts: Vec<Spanned<String>>, pub parts: Vec<Spanned<String>>,
@ -39,6 +48,11 @@ impl LiteCommand {
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.parts.is_empty() self.parts.is_empty()
} }
pub fn has_content(&self) -> bool {
!self.is_empty()
}
pub fn push(&mut self, item: Spanned<String>) { pub fn push(&mut self, item: Spanned<String>) {
self.parts.push(item) self.parts.push(item)
} }
@ -60,6 +74,7 @@ impl LiteCommand {
} }
} }
/// A `LitePipeline` is a series of `LiteCommand`s, separated by `|`.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct LitePipeline { pub struct LitePipeline {
pub commands: Vec<LiteCommand>, pub commands: Vec<LiteCommand>,
@ -75,12 +90,19 @@ impl LitePipeline {
pub fn new() -> Self { pub fn new() -> Self {
Self { commands: vec![] } Self { commands: vec![] }
} }
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.commands.is_empty() self.commands.is_empty()
} }
pub fn has_content(&self) -> bool {
!self.commands.is_empty()
}
pub fn push(&mut self, item: LiteCommand) { pub fn push(&mut self, item: LiteCommand) {
self.commands.push(item) self.commands.push(item)
} }
pub(crate) fn span(&self) -> Span { pub(crate) fn span(&self) -> Span {
let start = if !self.commands.is_empty() { let start = if !self.commands.is_empty() {
self.commands[0].span().start() self.commands[0].span().start()
@ -96,6 +118,7 @@ impl LitePipeline {
} }
} }
/// A `LiteGroup` is a series of `LitePipeline`s, separated by `;`.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct LiteGroup { pub struct LiteGroup {
pub pipelines: Vec<LitePipeline>, pub pipelines: Vec<LitePipeline>,
@ -111,12 +134,19 @@ impl LiteGroup {
pub fn new() -> Self { pub fn new() -> Self {
Self { pipelines: vec![] } Self { pipelines: vec![] }
} }
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.pipelines.is_empty() self.pipelines.is_empty()
} }
pub fn has_content(&self) -> bool {
!self.pipelines.is_empty()
}
pub fn push(&mut self, item: LitePipeline) { pub fn push(&mut self, item: LitePipeline) {
self.pipelines.push(item) self.pipelines.push(item)
} }
pub fn is_comment(&self) -> bool { pub fn is_comment(&self) -> bool {
if !self.is_empty() if !self.is_empty()
&& !self.pipelines[0].is_empty() && !self.pipelines[0].is_empty()
@ -128,6 +158,7 @@ impl LiteGroup {
false false
} }
} }
#[cfg(test)] #[cfg(test)]
pub(crate) fn span(&self) -> Span { pub(crate) fn span(&self) -> Span {
let start = if !self.pipelines.is_empty() { let start = if !self.pipelines.is_empty() {
@ -144,6 +175,7 @@ impl LiteGroup {
} }
} }
/// A `LiteBlock` is a series of `LiteGroup`s, separated by newlines.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct LiteBlock { pub struct LiteBlock {
pub block: Vec<LiteGroup>, pub block: Vec<LiteGroup>,
@ -153,12 +185,15 @@ impl LiteBlock {
pub fn new(block: Vec<LiteGroup>) -> Self { pub fn new(block: Vec<LiteGroup>) -> Self {
Self { block } Self { block }
} }
pub fn is_empty(&self) -> bool { pub fn is_empty(&self) -> bool {
self.block.is_empty() self.block.is_empty()
} }
pub fn push(&mut self, item: LiteGroup) { pub fn push(&mut self, item: LiteGroup) {
self.block.push(item) self.block.push(item)
} }
#[cfg(test)] #[cfg(test)]
pub(crate) fn span(&self) -> Span { pub(crate) fn span(&self) -> Span {
let start = if !self.block.is_empty() { let start = if !self.block.is_empty() {
@ -173,29 +208,6 @@ impl LiteBlock {
Span::new(start, 0) Span::new(start, 0)
} }
} }
pub fn head(&self) -> Option<Spanned<String>> {
if let Some(group) = self.block.get(0) {
if let Some(pipeline) = group.pipelines.get(0) {
if let Some(command) = pipeline.commands.get(0) {
if let Some(head) = command.parts.get(0) {
return Some(head.clone());
}
}
}
}
None
}
pub fn remove_head(&mut self) {
if let Some(group) = self.block.get_mut(0) {
if let Some(pipeline) = group.pipelines.get_mut(0) {
if let Some(command) = pipeline.commands.get_mut(0) {
if !command.parts.is_empty() {
command.parts.remove(0);
}
}
}
}
}
} }
#[derive(Clone, Copy)] #[derive(Clone, Copy)]
@ -205,9 +217,9 @@ enum BlockKind {
SquareBracket, SquareBracket,
} }
impl From<BlockKind> for char { impl BlockKind {
fn from(bk: BlockKind) -> char { fn closing(self) -> char {
match bk { match self {
BlockKind::Paren => ')', BlockKind::Paren => ')',
BlockKind::SquareBracket => ']', BlockKind::SquareBracket => ']',
BlockKind::CurlyBracket => '}', BlockKind::CurlyBracket => '}',
@ -215,93 +227,143 @@ impl From<BlockKind> for char {
} }
} }
/// Finds the extents of a bare (un-classified) token, returning the string with its associated span, /// Finds the extents of a baseline token, returning the string with its
/// along with any parse error that was discovered along the way. /// associated span, along with any parse error that was discovered along the
/// Bare tokens are unparsed content separated by spaces or a command separator (like pipe or semicolon) /// way.
/// Bare tokens may be surrounded by quotes (single, double, or backtick) or braces (square, paren, curly) ///
pub fn bare(src: &mut Input, span_offset: usize) -> (Spanned<String>, Option<ParseError>) { /// Baseline tokens are unparsed content separated by spaces or a command
let mut bare = String::new(); /// separator (like pipe or semicolon). Baseline tokens may be surrounded by
/// quotes (single, double, or backtick) or braces (square, paren, curly)
///
/// Baseline tokens may be further processed based on the needs of the syntax
/// shape that encounters them. They are still lightly lexed. For example, if a
/// baseline token begins with `{`, the entire token will continue until the
/// closing `}`, taking comments into consideration.
pub fn baseline(src: &mut Input, span_offset: usize) -> (Spanned<String>, Option<ParseError>) {
let mut token_contents = String::new();
let start_offset = if let Some((pos, _)) = src.peek() { let start_offset = if let Some((pos, _)) = src.peek() {
*pos *pos
} else { } else {
0 0
}; };
let mut inside_quote: Option<char> = None; // This variable tracks the starting character of a string literal, so that
// we remain inside the string literal lexer mode until we encounter the
// closing quote.
let mut quote_start: Option<char> = None;
// This Vec tracks paired delimiters
let mut block_level: Vec<BlockKind> = vec![]; let mut block_level: Vec<BlockKind> = vec![];
// A baseline token is terminated if it's not nested inside of a paired
// delimiter and the next character is one of: `|`, `;`, `#` or any
// whitespace.
fn is_termination(block_level: &[BlockKind], c: char) -> bool {
block_level.is_empty() && (c.is_whitespace() || c == '|' || c == ';' || c == '#')
}
// The process of slurping up a baseline token repeats:
//
// - String literal, which begins with `'`, `"` or `\``, and continues until
// the same character is encountered again.
// - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
// the matching closing delimiter is found, skipping comments and string
// literals.
// - When not nested inside of a delimiter pair, when a terminating
// character (whitespace, `|`, `;` or `#`) is encountered, the baseline
// token is done.
// - Otherwise, accumulate the character into the current baseline token.
while let Some((_, c)) = src.peek() { while let Some((_, c)) = src.peek() {
let c = *c; let c = *c;
if inside_quote.is_some() {
if Some(c) == inside_quote { if quote_start.is_some() {
inside_quote = None; // If we encountered the closing quote character for the current
// string, we're done with the current string.
if Some(c) == quote_start {
quote_start = None;
} }
} else if c == '\'' || c == '"' || c == '`' { } else if c == '\'' || c == '"' || c == '`' {
inside_quote = Some(c); // We encountered the opening quote of a string literal.
quote_start = Some(c);
} else if c == '[' { } else if c == '[' {
// We encountered an opening `[` delimiter.
block_level.push(BlockKind::SquareBracket); block_level.push(BlockKind::SquareBracket);
} else if c == ']' { } else if c == ']' {
// We encountered a closing `]` delimiter. Pop off the opening `[`
// delimiter.
if let Some(BlockKind::SquareBracket) = block_level.last() { if let Some(BlockKind::SquareBracket) = block_level.last() {
let _ = block_level.pop(); let _ = block_level.pop();
} }
} else if c == '{' { } else if c == '{' {
// We encountered an opening `{` delimiter.
block_level.push(BlockKind::CurlyBracket); block_level.push(BlockKind::CurlyBracket);
} else if c == '}' { } else if c == '}' {
// We encountered a closing `}` delimiter. Pop off the opening `{`.
if let Some(BlockKind::CurlyBracket) = block_level.last() { if let Some(BlockKind::CurlyBracket) = block_level.last() {
let _ = block_level.pop(); let _ = block_level.pop();
} }
} else if c == '(' { } else if c == '(' {
// We encountered an opening `(` delimiter.
block_level.push(BlockKind::Paren); block_level.push(BlockKind::Paren);
} else if c == ')' { } else if c == ')' {
// We encountered a closing `)` delimiter. Pop off the opening `(`.
if let Some(BlockKind::Paren) = block_level.last() { if let Some(BlockKind::Paren) = block_level.last() {
let _ = block_level.pop(); let _ = block_level.pop();
} }
} else if block_level.is_empty() && (c.is_whitespace() || c == '|' || c == ';' || c == '#') } else if is_termination(&block_level, c) {
{
break; break;
} }
bare.push(c);
// Otherwise, accumulate the character into the current token.
token_contents.push(c);
// Consume the character.
let _ = src.next(); let _ = src.next();
} }
let span = Span::new( let span = Span::new(
start_offset + span_offset, start_offset + span_offset,
start_offset + span_offset + bare.len(), start_offset + span_offset + token_contents.len(),
); );
// If there are still unclosed opening delimiters, close them and add
// synthetic closing characters to the accumulated token.
if let Some(block) = block_level.last() { if let Some(block) = block_level.last() {
let delim: char = (*block).into(); let delim: char = (*block).closing();
let cause = ParseError::unexpected_eof(delim.to_string(), span); let cause = ParseError::unexpected_eof(delim.to_string(), span);
while let Some(bk) = block_level.pop() { while let Some(bk) = block_level.pop() {
bare.push(bk.into()); token_contents.push(bk.closing());
} }
return (bare.spanned(span), Some(cause)); return (token_contents.spanned(span), Some(cause));
} }
if let Some(delimiter) = inside_quote { if let Some(delimiter) = quote_start {
// The non-lite parse trims quotes on both sides, so we add the expected quote so that // The non-lite parse trims quotes on both sides, so we add the expected quote so that
// anyone wanting to consume this partial parse (e.g., completions) will be able to get // anyone wanting to consume this partial parse (e.g., completions) will be able to get
// correct information from the non-lite parse. // correct information from the non-lite parse.
bare.push(delimiter); token_contents.push(delimiter);
return ( return (
bare.spanned(span), token_contents.spanned(span),
Some(ParseError::unexpected_eof(delimiter.to_string(), span)), Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
); );
} }
if bare.is_empty() { // If we didn't accumulate any characters, it's an unexpected error.
if token_contents.is_empty() {
return ( return (
bare.spanned(span), token_contents.spanned(span),
Some(ParseError::unexpected_eof("command".to_string(), span)), Some(ParseError::unexpected_eof("command".to_string(), span)),
); );
} }
(bare.spanned(span), None) (token_contents.spanned(span), None)
} }
/// We encountered a `#` character. Keep consuming characters until we encounter
/// a newline character (but don't consume it).
fn skip_comment(input: &mut Input) { fn skip_comment(input: &mut Input) {
while let Some((_, c)) = input.peek() { while let Some((_, c)) = input.peek() {
if *c == '\n' || *c == '\r' { if *c == '\n' || *c == '\r' {
@ -311,39 +373,75 @@ fn skip_comment(input: &mut Input) {
} }
} }
pub fn group(tokens: Vec<Token>) -> (LiteBlock, Option<ParseError>) { /// Try to parse a list of tokens into a block.
pub fn block(tokens: Vec<Token>) -> (LiteBlock, Option<ParseError>) {
// Accumulate chunks of tokens into groups.
let mut groups = vec![]; let mut groups = vec![];
// The current group
let mut group = LiteGroup::new(); let mut group = LiteGroup::new();
// The current pipeline
let mut pipeline = LitePipeline::new(); let mut pipeline = LitePipeline::new();
// The current command
let mut command = LiteCommand::new(); let mut command = LiteCommand::new();
let mut prev_token: Option<Token> = None; let mut prev_token: Option<Token> = None;
// The parsing process repeats:
//
// - newline (`\n` or `\r`)
// - pipes (`|`)
// - semicolon
for token in tokens { for token in tokens {
match &token.contents { match &token.contents {
TokenContents::EOL => { TokenContents::EOL => {
// We encountered a newline character. If the last token on the
// current line is a `|`, continue the current group on the next
// line. Otherwise, close up the current group by rolling up the
// current command into the current pipeline, and then roll up
// the current pipeline into the group.
// If the last token on the current line is a `|`, the group
// continues on the next line.
if let Some(prev) = &prev_token { if let Some(prev) = &prev_token {
if let TokenContents::Pipe = prev.contents { if let TokenContents::Pipe = prev.contents {
continue; continue;
} }
} }
if !command.is_empty() {
// If we have an open command, push it into the current
// pipeline.
if command.has_content() {
pipeline.push(command); pipeline.push(command);
command = LiteCommand::new(); command = LiteCommand::new();
} }
if !pipeline.is_empty() {
// If we have an open pipeline, push it into the current group.
if pipeline.has_content() {
group.push(pipeline); group.push(pipeline);
pipeline = LitePipeline::new(); pipeline = LitePipeline::new();
} }
if !group.is_empty() {
// If we have an open group, accumulate it into `groups`.
if group.has_content() {
groups.push(group); groups.push(group);
group = LiteGroup::new(); group = LiteGroup::new();
} }
} }
TokenContents::Pipe => { TokenContents::Pipe => {
if !command.is_empty() { // We encountered a pipe (`|`) character, which terminates a
// command.
// If the current command has content, accumulate it into
// the current pipeline and start a new command.
if command.has_content() {
pipeline.push(command); pipeline.push(command);
command = LiteCommand::new(); command = LiteCommand::new();
} else { } else {
// If the current command doesn't have content, return an
// error that indicates that the `|` was unexpected.
return ( return (
LiteBlock::new(groups), LiteBlock::new(groups),
Some(ParseError::extra_tokens( Some(ParseError::extra_tokens(
@ -353,31 +451,49 @@ pub fn group(tokens: Vec<Token>) -> (LiteBlock, Option<ParseError>) {
} }
} }
TokenContents::Semicolon => { TokenContents::Semicolon => {
if !command.is_empty() { // We encountered a semicolon (`;`) character, which terminates
// a pipeline.
// If the current command has content, accumulate it into the
// current pipeline and start a new command.
if command.has_content() {
pipeline.push(command); pipeline.push(command);
command = LiteCommand::new(); command = LiteCommand::new();
} }
if !pipeline.is_empty() {
// If the current pipeline has content, accumulate it into the
// current group and start a new pipeline.
if pipeline.has_content() {
group.push(pipeline); group.push(pipeline);
pipeline = LitePipeline::new(); pipeline = LitePipeline::new();
} }
} }
TokenContents::Bare(bare) => { TokenContents::Baseline(bare) => {
// We encountered a baseline (unclassified) token. Accumulate it into
// the current command as a string.
command.push(bare.to_string().spanned(token.span)); command.push(bare.to_string().spanned(token.span));
} }
} }
prev_token = Some(token); prev_token = Some(token);
} }
if !command.is_empty() {
// If the current command has content, accumulate it into the current pipeline.
if command.has_content() {
pipeline.push(command); pipeline.push(command);
} }
if !pipeline.is_empty() {
// If the current pipeline has content, accumulate it into the current group.
if pipeline.has_content() {
group.push(pipeline); group.push(pipeline);
} }
if !group.is_empty() {
// If the current group has content, accumulate it into the list of groups.
if group.has_content() {
groups.push(group); groups.push(group);
} }
// Return a new LiteBlock with the accumulated list of groups.
(LiteBlock::new(groups), None) (LiteBlock::new(groups), None)
} }
@ -385,35 +501,51 @@ pub fn group(tokens: Vec<Token>) -> (LiteBlock, Option<ParseError>) {
/// semicolons, pipes, etc from external bare values (values that haven't been classified further) /// semicolons, pipes, etc from external bare values (values that haven't been classified further)
/// Takes in a string and an offset, which is used to offset the spans created (for when this function is used to parse inner strings) /// Takes in a string and an offset, which is used to offset the spans created (for when this function is used to parse inner strings)
pub fn lex(input: &str, span_offset: usize) -> (Vec<Token>, Option<ParseError>) { pub fn lex(input: &str, span_offset: usize) -> (Vec<Token>, Option<ParseError>) {
// Break the input slice into an iterator of Unicode characters.
let mut char_indices = input.char_indices().peekable(); let mut char_indices = input.char_indices().peekable();
let mut error = None; let mut error = None;
let mut output = vec![]; let mut output = vec![];
let mut is_complete = true; let mut is_complete = true;
// The lexing process repeats. One character of lookahead is sufficient to decide what to do next.
//
// - `|`: the token is either `|` token or a `||` token
// - `;`: the token is a semicolon
// - `\n` or `\r`: the token is an EOL (end of line) token
// - other whitespace: ignored
// - `#` the token starts a line comment, which contains all of the subsequent characters until the next EOL
// - anything else: the token is a baseline (unclassified) token
while let Some((idx, c)) = char_indices.peek() { while let Some((idx, c)) = char_indices.peek() {
if *c == '|' { if *c == '|' {
// If the next character is `|`, it's either `|` or `||`.
let idx = *idx; let idx = *idx;
let prev_idx = idx; let prev_idx = idx;
let _ = char_indices.next(); let _ = char_indices.next();
// If the next character is `|`, we're looking at a `||`.
if let Some((idx, c)) = char_indices.peek() { if let Some((idx, c)) = char_indices.peek() {
if *c == '|' { if *c == '|' {
// we have '||' instead of '|'
let idx = *idx; let idx = *idx;
let _ = char_indices.next(); let _ = char_indices.next();
output.push(Token::new( output.push(Token::new(
TokenContents::Bare("||".into()), TokenContents::Baseline("||".into()),
Span::new(span_offset + prev_idx, span_offset + idx + 1), Span::new(span_offset + prev_idx, span_offset + idx + 1),
)); ));
continue; continue;
} }
} }
// Otherwise, it's just a regular `|` token.
output.push(Token::new( output.push(Token::new(
TokenContents::Pipe, TokenContents::Pipe,
Span::new(span_offset + idx, span_offset + idx + 1), Span::new(span_offset + idx, span_offset + idx + 1),
)); ));
is_complete = false; is_complete = false;
} else if *c == ';' { } else if *c == ';' {
// If the next character is a `;`, we're looking at a semicolon token.
if !is_complete && error.is_none() { if !is_complete && error.is_none() {
error = Some(ParseError::extra_tokens( error = Some(ParseError::extra_tokens(
";".to_string().spanned(Span::new(*idx, idx + 1)), ";".to_string().spanned(Span::new(*idx, idx + 1)),
@ -426,6 +558,8 @@ pub fn lex(input: &str, span_offset: usize) -> (Vec<Token>, Option<ParseError>)
Span::new(span_offset + idx, span_offset + idx + 1), Span::new(span_offset + idx, span_offset + idx + 1),
)); ));
} else if *c == '\n' || *c == '\r' { } else if *c == '\n' || *c == '\r' {
// If the next character is a newline, we're looking at an EOL (end of line) token.
let idx = *idx; let idx = *idx;
let _ = char_indices.next(); let _ = char_indices.next();
output.push(Token::new( output.push(Token::new(
@ -433,17 +567,24 @@ pub fn lex(input: &str, span_offset: usize) -> (Vec<Token>, Option<ParseError>)
Span::new(span_offset + idx, span_offset + idx + 1), Span::new(span_offset + idx, span_offset + idx + 1),
)); ));
} else if *c == '#' { } else if *c == '#' {
// If the next character is `#`, we're at the beginning of a line
// comment. The comment continues until the next newline.
skip_comment(&mut char_indices); skip_comment(&mut char_indices);
} else if c.is_whitespace() { } else if c.is_whitespace() {
// If the next character is non-newline whitespace, skip it.
let _ = char_indices.next(); let _ = char_indices.next();
} else { } else {
let (result, err) = bare(&mut char_indices, span_offset); // Otherwise, try to consume an unclassified token.
let (result, err) = baseline(&mut char_indices, span_offset);
if error.is_none() { if error.is_none() {
error = err; error = err;
} }
is_complete = true; is_complete = true;
let Spanned { item, span } = result; let Spanned { item, span } = result;
output.push(Token::new(TokenContents::Bare(item), span)); output.push(Token::new(TokenContents::Baseline(item), span));
} }
} }
@ -605,7 +746,7 @@ mod tests {
fn pipeline() { fn pipeline() {
let (result, err) = lex("cmd1 | cmd2 ; deploy", 0); let (result, err) = lex("cmd1 | cmd2 ; deploy", 0);
assert!(err.is_none()); assert!(err.is_none());
let (result, err) = group(result); let (result, err) = block(result);
assert!(err.is_none()); assert!(err.is_none());
assert_eq!(result.span(), span(0, 20)); assert_eq!(result.span(), span(0, 20));
assert_eq!(result.block[0].pipelines[0].span(), span(0, 11)); assert_eq!(result.block[0].pipelines[0].span(), span(0, 11));
@ -616,7 +757,7 @@ mod tests {
fn simple_1() { fn simple_1() {
let (result, err) = lex("foo", 0); let (result, err) = lex("foo", 0);
assert!(err.is_none()); assert!(err.is_none());
let (result, err) = group(result); let (result, err) = block(result);
assert!(err.is_none()); assert!(err.is_none());
assert_eq!(result.block.len(), 1); assert_eq!(result.block.len(), 1);
assert_eq!(result.block[0].pipelines.len(), 1); assert_eq!(result.block[0].pipelines.len(), 1);
@ -632,7 +773,7 @@ mod tests {
fn simple_offset() { fn simple_offset() {
let (result, err) = lex("foo", 10); let (result, err) = lex("foo", 10);
assert!(err.is_none()); assert!(err.is_none());
let (result, err) = group(result); let (result, err) = block(result);
assert!(err.is_none()); assert!(err.is_none());
assert_eq!(result.block[0].pipelines.len(), 1); assert_eq!(result.block[0].pipelines.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands.len(), 1); assert_eq!(result.block[0].pipelines[0].commands.len(), 1);
@ -647,7 +788,7 @@ mod tests {
fn incomplete_result() { fn incomplete_result() {
let (result, err) = lex("my_command \"foo' --test", 10); let (result, err) = lex("my_command \"foo' --test", 10);
assert!(matches!(err.unwrap().reason(), nu_errors::ParseErrorReason::Eof { .. })); assert!(matches!(err.unwrap().reason(), nu_errors::ParseErrorReason::Eof { .. }));
let (result, _) = group(result); let (result, _) = block(result);
assert_eq!(result.block.len(), 1); assert_eq!(result.block.len(), 1);
assert_eq!(result.block[0].pipelines.len(), 1); assert_eq!(result.block[0].pipelines.len(), 1);

View file

@ -6,7 +6,7 @@ mod scope;
mod shapes; mod shapes;
mod signature; mod signature;
pub use lex::{group, lex, LiteBlock, LiteCommand, LiteGroup, LitePipeline}; pub use lex::{block, lex, LiteBlock, LiteCommand, LiteGroup, LitePipeline};
pub use parse::{classify_block, garbage, parse, parse_full_column_path, parse_math_expression}; pub use parse::{classify_block, garbage, parse, parse_full_column_path, parse_math_expression};
pub use path::expand_ndots; pub use path::expand_ndots;
pub use scope::ParserScope; pub use scope::ParserScope;

View file

@ -13,7 +13,7 @@ use nu_source::{Span, Spanned, SpannedItem};
use num_bigint::BigInt; use num_bigint::BigInt;
//use crate::errors::{ParseError, ParseResult}; //use crate::errors::{ParseError, ParseResult};
use crate::lex::{group, lex, LiteBlock, LiteCommand, LitePipeline}; use crate::lex::{block, lex, LiteBlock, LiteCommand, LitePipeline};
use crate::path::expand_path; use crate::path::expand_path;
use crate::scope::ParserScope; use crate::scope::ParserScope;
use bigdecimal::BigDecimal; use bigdecimal::BigDecimal;
@ -393,7 +393,7 @@ fn parse_invocation(
if err.is_some() { if err.is_some() {
return (garbage(lite_arg.span), err); return (garbage(lite_arg.span), err);
}; };
let (lite_block, err) = group(tokens); let (lite_block, err) = block(tokens);
if err.is_some() { if err.is_some() {
return (garbage(lite_arg.span), err); return (garbage(lite_arg.span), err);
}; };
@ -719,7 +719,7 @@ fn parse_table(
return (garbage(lite_inner.span()), err); return (garbage(lite_inner.span()), err);
} }
let (lite_header, err) = group(tokens); let (lite_header, err) = block(tokens);
if err.is_some() { if err.is_some() {
return (garbage(lite_inner.span()), err); return (garbage(lite_inner.span()), err);
} }
@ -742,7 +742,7 @@ fn parse_table(
if err.is_some() { if err.is_some() {
return (garbage(arg.span), err); return (garbage(arg.span), err);
} }
let (lite_cell, err) = group(tokens); let (lite_cell, err) = block(tokens);
if err.is_some() { if err.is_some() {
return (garbage(arg.span), err); return (garbage(arg.span), err);
} }
@ -873,7 +873,7 @@ fn parse_arg(
return (garbage(lite_arg.span), err); return (garbage(lite_arg.span), err);
} }
let (lite_block, err) = group(tokens); let (lite_block, err) = block(tokens);
if err.is_some() { if err.is_some() {
return (garbage(lite_arg.span), err); return (garbage(lite_arg.span), err);
} }
@ -928,7 +928,7 @@ fn parse_arg(
return (garbage(lite_arg.span), err); return (garbage(lite_arg.span), err);
} }
let (lite_block, err) = group(tokens); let (lite_block, err) = block(tokens);
if err.is_some() { if err.is_some() {
return (garbage(lite_arg.span), err); return (garbage(lite_arg.span), err);
} }
@ -1156,7 +1156,7 @@ fn parse_parenthesized_expression(
return (garbage(lite_arg.span), err); return (garbage(lite_arg.span), err);
} }
let (lite_block, err) = group(tokens); let (lite_block, err) = block(tokens);
if err.is_some() { if err.is_some() {
return (garbage(lite_arg.span), err); return (garbage(lite_arg.span), err);
} }
@ -2178,7 +2178,7 @@ fn parse_definition(call: &LiteCommand, scope: &dyn ParserScope) -> Option<Parse
if err.is_some() { if err.is_some() {
return err; return err;
}; };
let (lite_block, err) = group(tokens); let (lite_block, err) = block(tokens);
if err.is_some() { if err.is_some() {
return err; return err;
}; };
@ -2338,7 +2338,7 @@ pub fn parse(
if error.is_some() { if error.is_some() {
return (Block::basic(), error); return (Block::basic(), error);
} }
let (lite_block, error) = group(output); let (lite_block, error) = block(output);
if error.is_some() { if error.is_some() {
return (Block::basic(), error); return (Block::basic(), error);
} }

View file

@ -15,7 +15,7 @@ byte-unit = "4.0.9"
chrono = {version = "0.4.15", features = ["serde"]} chrono = {version = "0.4.15", features = ["serde"]}
derive-new = "0.5.8" derive-new = "0.5.8"
getset = "0.1.1" getset = "0.1.1"
indexmap = {version = "1.6.0", features = ["serde-1"]} indexmap = {version = "1.6.1", features = ["serde-1"]}
log = "0.4.11" log = "0.4.11"
nu-errors = {path = "../nu-errors", version = "0.25.1"} nu-errors = {path = "../nu-errors", version = "0.25.1"}
nu-source = {path = "../nu-source", version = "0.25.1"} nu-source = {path = "../nu-source", version = "0.25.1"}

View file

@ -13,7 +13,7 @@ doctest = false
derive-new = "0.5.8" derive-new = "0.5.8"
getset = "0.1.1" getset = "0.1.1"
pretty = "0.5.2" pretty = "0.5.2"
serde = {version = "1.0.115", features = ["derive"]} serde = {version = "1.0.118", features = ["derive"]}
termcolor = "1.1.0" termcolor = "1.1.2"
[build-dependencies] [build-dependencies]