// Source file: nushell/crates/nu-parser/src/lex.rs (Rust; listed as 348 lines, 11 KiB by the repository viewer)
use crate::{ParseError, Span};
/// Classification of a token produced by the lexer.
#[derive(Debug, PartialEq, Eq)]
pub enum TokenContents {
    /// An unclassified item; also used for the `||` operator.
    Item,
    /// A `#` line comment.
    Comment,
    /// A single `|`.
    Pipe,
    /// A `;`.
    Semicolon,
    /// An end-of-line byte (`\n` or `\r`).
    Eol,
}
/// A lexed token: what kind of token it is and the span it occupies.
#[derive(Debug, PartialEq, Eq)]
pub struct Token {
    /// The kind of token.
    pub contents: TokenContents,
    /// The span covered by the token.
    pub span: Span,
}
impl Token {
pub fn new(contents: TokenContents, span: Span) -> Token {
Token { contents, span }
}
}
/// The kinds of paired delimiter the lexer keeps track of while nested.
#[derive(Clone, Copy, Debug)]
pub enum BlockKind {
    /// A block closed by `)`.
    Paren,
    /// A block closed by `}`.
    CurlyBracket,
    /// A block closed by `]`.
    SquareBracket,
}

impl BlockKind {
    /// Returns the byte that closes this kind of block.
    fn closing(self) -> u8 {
        match self {
            Self::Paren => b')',
            Self::CurlyBracket => b'}',
            Self::SquareBracket => b']',
        }
    }
}
// A baseline token is terminated if it's not nested inside of a paired
// delimiter and the next character is one of: `|`, `;`, `#` or any
// whitespace.
2021-07-16 22:11:15 +00:00
fn is_item_terminator(
block_level: &[BlockKind],
c: u8,
additional_whitespace: &[u8],
special_tokens: &[u8],
) -> bool {
2021-06-30 01:42:56 +00:00
block_level.is_empty()
2021-07-05 22:58:56 +00:00
&& (c == b' '
|| c == b'\t'
|| c == b'\n'
|| c == b'\r'
2021-07-05 22:58:56 +00:00
|| c == b'|'
|| c == b';'
|| c == b'#'
2021-07-16 22:11:15 +00:00
|| additional_whitespace.contains(&c)
|| special_tokens.contains(&c))
2021-07-16 21:55:12 +00:00
}
// A special token is one that is a byte that stands alone as its own token. For example
// when parsing a signature you may want to have `:` be able to separate tokens and also
// to be handled as its own token to notify you you're about to parse a type in the example
// `foo:bar`
2021-07-16 22:11:15 +00:00
fn is_special_item(block_level: &[BlockKind], c: u8, special_tokens: &[u8]) -> bool {
block_level.is_empty() && special_tokens.contains(&c)
2021-06-30 01:42:56 +00:00
}
2021-07-05 22:58:56 +00:00
pub fn lex_item(
input: &[u8],
curr_offset: &mut usize,
span_offset: usize,
2021-07-16 22:11:15 +00:00
additional_whitespace: &[u8],
special_tokens: &[u8],
2021-07-05 22:58:56 +00:00
) -> (Span, Option<ParseError>) {
2021-06-30 01:42:56 +00:00
// This variable tracks the starting character of a string literal, so that
// we remain inside the string literal lexer mode until we encounter the
// closing quote.
let mut quote_start: Option<u8> = None;
let mut in_comment = false;
let token_start = *curr_offset;
// This Vec tracks paired delimiters
let mut block_level: Vec<BlockKind> = vec![];
// The process of slurping up a baseline token repeats:
//
// - String literal, which begins with `'`, `"` or `\``, and continues until
// the same character is encountered again.
// - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
// the matching closing delimiter is found, skipping comments and string
// literals.
// - When not nested inside of a delimiter pair, when a terminating
// character (whitespace, `|`, `;` or `#`) is encountered, the baseline
// token is done.
// - Otherwise, accumulate the character into the current baseline token.
while let Some(c) = input.get(*curr_offset) {
let c = *c;
if quote_start.is_some() {
// If we encountered the closing quote character for the current
// string, we're done with the current string.
if Some(c) == quote_start {
quote_start = None;
}
} else if c == b'#' {
2021-07-16 22:11:15 +00:00
if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
2021-06-30 01:42:56 +00:00
break;
}
in_comment = true;
} else if c == b'\n' {
in_comment = false;
2021-07-16 22:11:15 +00:00
if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
2021-06-30 01:42:56 +00:00
break;
}
} else if in_comment {
2021-07-16 22:11:15 +00:00
if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
2021-06-30 01:42:56 +00:00
break;
}
2021-07-16 22:11:15 +00:00
} else if is_special_item(&block_level, c, special_tokens) && token_start == *curr_offset {
2021-07-16 21:55:12 +00:00
*curr_offset += 1;
break;
2021-06-30 01:42:56 +00:00
} else if c == b'\'' || c == b'"' {
// We encountered the opening quote of a string literal.
quote_start = Some(c);
} else if c == b'[' {
// We encountered an opening `[` delimiter.
block_level.push(BlockKind::SquareBracket);
} else if c == b']' {
// We encountered a closing `]` delimiter. Pop off the opening `[`
// delimiter.
if let Some(BlockKind::SquareBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == b'{' {
// We encountered an opening `{` delimiter.
block_level.push(BlockKind::CurlyBracket);
} else if c == b'}' {
// We encountered a closing `}` delimiter. Pop off the opening `{`.
if let Some(BlockKind::CurlyBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == b'(' {
// We enceountered an opening `(` delimiter.
block_level.push(BlockKind::Paren);
} else if c == b')' {
// We encountered a closing `)` delimiter. Pop off the opening `(`.
if let Some(BlockKind::Paren) = block_level.last() {
let _ = block_level.pop();
}
2021-07-16 22:11:15 +00:00
} else if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
2021-06-30 01:42:56 +00:00
break;
}
*curr_offset += 1;
}
let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
2021-06-30 01:42:56 +00:00
// If there is still unclosed opening delimiters, close them and add
// synthetic closing characters to the accumulated token.
if let Some(block) = block_level.last() {
let delim = block.closing();
2021-07-29 22:56:51 +00:00
let cause = ParseError::UnexpectedEof(
(delim as char).to_string(),
Span {
start: span.end - 1,
end: span.end,
},
);
2021-06-30 01:42:56 +00:00
return (span, Some(cause));
}
if let Some(delim) = quote_start {
// The non-lite parse trims quotes on both sides, so we add the expected quote so that
// anyone wanting to consume this partial parse (e.g., completions) will be able to get
// correct information from the non-lite parse.
return (
span,
Some(ParseError::UnexpectedEof((delim as char).to_string(), span)),
);
}
// If we didn't accumulate any characters, it's an unexpected error.
if *curr_offset - token_start == 0 {
return (
span,
Some(ParseError::UnexpectedEof("command".to_string(), span)),
);
}
(span, None)
}
pub fn lex(
input: &[u8],
span_offset: usize,
2021-07-16 22:11:15 +00:00
additional_whitespace: &[u8],
special_tokens: &[u8],
2021-06-30 01:42:56 +00:00
) -> (Vec<Token>, Option<ParseError>) {
let mut error = None;
let mut curr_offset = 0;
2021-06-30 01:42:56 +00:00
let mut output = vec![];
let mut is_complete = true;
while let Some(c) = input.get(curr_offset) {
let c = *c;
if c == b'|' {
// If the next character is `|`, it's either `|` or `||`.
let idx = curr_offset;
let prev_idx = idx;
curr_offset += 1;
// If the next character is `|`, we're looking at a `||`.
if let Some(c) = input.get(curr_offset) {
if *c == b'|' {
let idx = curr_offset;
curr_offset += 1;
output.push(Token::new(
TokenContents::Item,
Span::new(span_offset + prev_idx, span_offset + idx + 1),
2021-06-30 01:42:56 +00:00
));
continue;
}
}
// Otherwise, it's just a regular `|` token.
output.push(Token::new(
TokenContents::Pipe,
Span::new(span_offset + idx, span_offset + idx + 1),
2021-06-30 01:42:56 +00:00
));
is_complete = false;
} else if c == b';' {
// If the next character is a `;`, we're looking at a semicolon token.
if !is_complete && error.is_none() {
error = Some(ParseError::ExtraTokens(Span::new(
curr_offset,
curr_offset + 1,
)));
}
let idx = curr_offset;
curr_offset += 1;
output.push(Token::new(
TokenContents::Semicolon,
Span::new(span_offset + idx, span_offset + idx + 1),
2021-06-30 01:42:56 +00:00
));
} else if c == b'\n' || c == b'\r' {
// If the next character is a newline, we're looking at an EOL (end of line) token.
let idx = curr_offset;
curr_offset += 1;
2021-07-16 22:11:15 +00:00
if !additional_whitespace.contains(&c) {
output.push(Token::new(
TokenContents::Eol,
Span::new(span_offset + idx, span_offset + idx + 1),
));
2021-06-30 01:42:56 +00:00
}
} else if c == b'#' {
// If the next character is `#`, we're at the beginning of a line
// comment. The comment continues until the next newline.
let mut start = curr_offset;
while let Some(input) = input.get(curr_offset) {
curr_offset += 1;
if *input == b'\n' || *input == b'\r' {
2021-06-30 01:42:56 +00:00
output.push(Token::new(
TokenContents::Comment,
Span::new(start, curr_offset),
2021-06-30 01:42:56 +00:00
));
start = curr_offset;
break;
}
}
if start != curr_offset {
output.push(Token::new(
TokenContents::Comment,
Span::new(span_offset + start, span_offset + curr_offset),
2021-06-30 01:42:56 +00:00
));
}
2021-07-16 22:11:15 +00:00
} else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
2021-06-30 01:42:56 +00:00
// If the next character is non-newline whitespace, skip it.
curr_offset += 1;
} else {
// Otherwise, try to consume an unclassified token.
2021-07-16 22:11:15 +00:00
let (span, err) = lex_item(
input,
&mut curr_offset,
span_offset,
2021-07-16 22:11:15 +00:00
additional_whitespace,
special_tokens,
);
2021-06-30 01:42:56 +00:00
if error.is_none() {
error = err;
}
is_complete = true;
output.push(Token::new(TokenContents::Item, span));
}
}
(output, error)
}
#[cfg(test)]
mod lex_tests {
    use super::*;

    /// A simple statement lexes without error.
    #[test]
    fn lex_basic() {
        let file = b"let x = 4";

        let output = lex(file, 0, &[], &[]);

        assert!(output.1.is_none());
    }

    /// A newline between statements produces an `Eol` token at its position.
    #[test]
    fn lex_newline() {
        let file = b"let x = 300\nlet y = 500;";

        let output = lex(file, 0, &[], &[]);

        // Shown by the test harness only when the assertion below fails.
        println!("{:#?}", output.0);
        assert!(output.0.contains(&Token {
            contents: TokenContents::Eol,
            span: Span { start: 11, end: 12 }
        }));
    }

    /// Empty input yields no tokens and no error.
    #[test]
    fn lex_empty() {
        let file = b"";

        let output = lex(file, 0, &[], &[]);

        assert!(output.0.is_empty());
        assert!(output.1.is_none());
    }

    /// Item and pipe spans are shifted by the requested span offset.
    #[test]
    fn lex_pipe_with_span_offset() {
        let file = b"a | b";

        let output = lex(file, 10, &[], &[]);

        assert!(output.1.is_none());
        assert_eq!(
            output.0,
            vec![
                Token {
                    contents: TokenContents::Item,
                    span: Span { start: 10, end: 11 }
                },
                Token {
                    contents: TokenContents::Pipe,
                    span: Span { start: 12, end: 13 }
                },
                Token {
                    contents: TokenContents::Item,
                    span: Span { start: 14, end: 15 }
                },
            ]
        );
    }
}