// Mirror of https://github.com/nushell/nushell
// (synced 2025-01-08 19:29:08 +00:00; 320 lines, 9.7 KiB, Rust).
|
use crate::{ParseError, Span};
|
||
|
|
||
|
/// The kind of a [`Token`] produced by [`lex`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenContents {
    /// An unclassified item: whatever `lex_item` consumed, or a `||` pair.
    Item,
    /// A `#` line comment; its span runs through the terminating newline
    /// when there is one.
    Comment,
    /// A single `|`.
    Pipe,
    /// A `;`.
    Semicolon,
    /// A `\n` or `\r` end-of-line byte.
    Eol,
}
|
||
|
|
||
|
#[derive(Debug, PartialEq, Eq)]
|
||
|
pub struct Token {
|
||
|
pub contents: TokenContents,
|
||
|
pub span: Span,
|
||
|
}
|
||
|
|
||
|
impl Token {
|
||
|
pub fn new(contents: TokenContents, span: Span) -> Token {
|
||
|
Token { contents, span }
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// The kinds of paired delimiters that `lex_item` tracks while it scans a
/// single item.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BlockKind {
    /// Opened by `(`, closed by `)`.
    Paren,
    /// Opened by `{`, closed by `}`.
    CurlyBracket,
    /// Opened by `[`, closed by `]`.
    SquareBracket,
}
|
||
|
|
||
|
impl BlockKind {
|
||
|
fn closing(self) -> u8 {
|
||
|
match self {
|
||
|
BlockKind::Paren => b')',
|
||
|
BlockKind::SquareBracket => b']',
|
||
|
BlockKind::CurlyBracket => b'}',
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Selects how [`lex`] behaves.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum LexMode {
    /// The only mode at present; in this mode `lex` emits an `Eol` token for
    /// every `\n` or `\r` it meets.
    Normal,
}
|
||
|
|
||
|
// A baseline token is terminated if it's not nested inside of a paired
|
||
|
// delimiter and the next character is one of: `|`, `;`, `#` or any
|
||
|
// whitespace.
|
||
|
fn is_item_terminator(block_level: &[BlockKind], c: u8) -> bool {
|
||
|
block_level.is_empty()
|
||
|
&& (c == b' ' || c == b'\t' || c == b'\n' || c == b'|' || c == b';' || c == b'#')
|
||
|
}
|
||
|
|
||
|
pub fn lex_item(
|
||
|
input: &[u8],
|
||
|
curr_offset: &mut usize,
|
||
|
file_id: usize,
|
||
|
) -> (Span, Option<ParseError>) {
|
||
|
// This variable tracks the starting character of a string literal, so that
|
||
|
// we remain inside the string literal lexer mode until we encounter the
|
||
|
// closing quote.
|
||
|
let mut quote_start: Option<u8> = None;
|
||
|
|
||
|
let mut in_comment = false;
|
||
|
|
||
|
let token_start = *curr_offset;
|
||
|
|
||
|
// This Vec tracks paired delimiters
|
||
|
let mut block_level: Vec<BlockKind> = vec![];
|
||
|
|
||
|
// The process of slurping up a baseline token repeats:
|
||
|
//
|
||
|
// - String literal, which begins with `'`, `"` or `\``, and continues until
|
||
|
// the same character is encountered again.
|
||
|
// - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
|
||
|
// the matching closing delimiter is found, skipping comments and string
|
||
|
// literals.
|
||
|
// - When not nested inside of a delimiter pair, when a terminating
|
||
|
// character (whitespace, `|`, `;` or `#`) is encountered, the baseline
|
||
|
// token is done.
|
||
|
// - Otherwise, accumulate the character into the current baseline token.
|
||
|
while let Some(c) = input.get(*curr_offset) {
|
||
|
let c = *c;
|
||
|
|
||
|
if quote_start.is_some() {
|
||
|
// If we encountered the closing quote character for the current
|
||
|
// string, we're done with the current string.
|
||
|
if Some(c) == quote_start {
|
||
|
quote_start = None;
|
||
|
}
|
||
|
} else if c == b'#' {
|
||
|
if is_item_terminator(&block_level, c) {
|
||
|
break;
|
||
|
}
|
||
|
in_comment = true;
|
||
|
} else if c == b'\n' {
|
||
|
in_comment = false;
|
||
|
if is_item_terminator(&block_level, c) {
|
||
|
break;
|
||
|
}
|
||
|
} else if in_comment {
|
||
|
if is_item_terminator(&block_level, c) {
|
||
|
break;
|
||
|
}
|
||
|
} else if c == b'\'' || c == b'"' {
|
||
|
// We encountered the opening quote of a string literal.
|
||
|
quote_start = Some(c);
|
||
|
} else if c == b'[' {
|
||
|
// We encountered an opening `[` delimiter.
|
||
|
block_level.push(BlockKind::SquareBracket);
|
||
|
} else if c == b']' {
|
||
|
// We encountered a closing `]` delimiter. Pop off the opening `[`
|
||
|
// delimiter.
|
||
|
if let Some(BlockKind::SquareBracket) = block_level.last() {
|
||
|
let _ = block_level.pop();
|
||
|
}
|
||
|
} else if c == b'{' {
|
||
|
// We encountered an opening `{` delimiter.
|
||
|
block_level.push(BlockKind::CurlyBracket);
|
||
|
} else if c == b'}' {
|
||
|
// We encountered a closing `}` delimiter. Pop off the opening `{`.
|
||
|
if let Some(BlockKind::CurlyBracket) = block_level.last() {
|
||
|
let _ = block_level.pop();
|
||
|
}
|
||
|
} else if c == b'(' {
|
||
|
// We enceountered an opening `(` delimiter.
|
||
|
block_level.push(BlockKind::Paren);
|
||
|
} else if c == b')' {
|
||
|
// We encountered a closing `)` delimiter. Pop off the opening `(`.
|
||
|
if let Some(BlockKind::Paren) = block_level.last() {
|
||
|
let _ = block_level.pop();
|
||
|
}
|
||
|
} else if is_item_terminator(&block_level, c) {
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
*curr_offset += 1;
|
||
|
}
|
||
|
|
||
|
let span = Span::new(token_start, *curr_offset, file_id);
|
||
|
|
||
|
// If there is still unclosed opening delimiters, close them and add
|
||
|
// synthetic closing characters to the accumulated token.
|
||
|
if let Some(block) = block_level.last() {
|
||
|
let delim = block.closing();
|
||
|
let cause = ParseError::UnexpectedEof((delim as char).to_string(), span);
|
||
|
|
||
|
return (span, Some(cause));
|
||
|
}
|
||
|
|
||
|
if let Some(delim) = quote_start {
|
||
|
// The non-lite parse trims quotes on both sides, so we add the expected quote so that
|
||
|
// anyone wanting to consume this partial parse (e.g., completions) will be able to get
|
||
|
// correct information from the non-lite parse.
|
||
|
return (
|
||
|
span,
|
||
|
Some(ParseError::UnexpectedEof((delim as char).to_string(), span)),
|
||
|
);
|
||
|
}
|
||
|
|
||
|
// If we didn't accumulate any characters, it's an unexpected error.
|
||
|
if *curr_offset - token_start == 0 {
|
||
|
return (
|
||
|
span,
|
||
|
Some(ParseError::UnexpectedEof("command".to_string(), span)),
|
||
|
);
|
||
|
}
|
||
|
|
||
|
(span, None)
|
||
|
}
|
||
|
|
||
|
pub fn lex(
|
||
|
input: &[u8],
|
||
|
file_id: usize,
|
||
|
span_offset: usize,
|
||
|
lex_mode: LexMode,
|
||
|
) -> (Vec<Token>, Option<ParseError>) {
|
||
|
let mut error = None;
|
||
|
|
||
|
let mut curr_offset = span_offset;
|
||
|
|
||
|
let mut output = vec![];
|
||
|
let mut is_complete = true;
|
||
|
|
||
|
while let Some(c) = input.get(curr_offset) {
|
||
|
let c = *c;
|
||
|
if c == b'|' {
|
||
|
// If the next character is `|`, it's either `|` or `||`.
|
||
|
|
||
|
let idx = curr_offset;
|
||
|
let prev_idx = idx;
|
||
|
curr_offset += 1;
|
||
|
|
||
|
// If the next character is `|`, we're looking at a `||`.
|
||
|
if let Some(c) = input.get(curr_offset) {
|
||
|
if *c == b'|' {
|
||
|
let idx = curr_offset;
|
||
|
curr_offset += 1;
|
||
|
output.push(Token::new(
|
||
|
TokenContents::Item,
|
||
|
Span::new(span_offset + prev_idx, span_offset + idx + 1, file_id),
|
||
|
));
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Otherwise, it's just a regular `|` token.
|
||
|
output.push(Token::new(
|
||
|
TokenContents::Pipe,
|
||
|
Span::new(span_offset + idx, span_offset + idx + 1, file_id),
|
||
|
));
|
||
|
is_complete = false;
|
||
|
} else if c == b';' {
|
||
|
// If the next character is a `;`, we're looking at a semicolon token.
|
||
|
|
||
|
if !is_complete && error.is_none() {
|
||
|
error = Some(ParseError::ExtraTokens(Span::new(
|
||
|
curr_offset,
|
||
|
curr_offset + 1,
|
||
|
file_id,
|
||
|
)));
|
||
|
}
|
||
|
let idx = curr_offset;
|
||
|
curr_offset += 1;
|
||
|
output.push(Token::new(
|
||
|
TokenContents::Semicolon,
|
||
|
Span::new(idx, idx + 1, file_id),
|
||
|
));
|
||
|
} else if c == b'\n' || c == b'\r' {
|
||
|
// If the next character is a newline, we're looking at an EOL (end of line) token.
|
||
|
|
||
|
let idx = curr_offset;
|
||
|
curr_offset += 1;
|
||
|
if lex_mode == LexMode::Normal {
|
||
|
output.push(Token::new(
|
||
|
TokenContents::Eol,
|
||
|
Span::new(idx, idx + 1, file_id),
|
||
|
));
|
||
|
}
|
||
|
} else if c == b'#' {
|
||
|
// If the next character is `#`, we're at the beginning of a line
|
||
|
// comment. The comment continues until the next newline.
|
||
|
let mut start = curr_offset;
|
||
|
|
||
|
while let Some(input) = input.get(curr_offset) {
|
||
|
curr_offset += 1;
|
||
|
if *input == b'\n' {
|
||
|
output.push(Token::new(
|
||
|
TokenContents::Comment,
|
||
|
Span::new(start, curr_offset, file_id),
|
||
|
));
|
||
|
start = curr_offset;
|
||
|
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if start != curr_offset {
|
||
|
output.push(Token::new(
|
||
|
TokenContents::Comment,
|
||
|
Span::new(start, curr_offset, file_id),
|
||
|
));
|
||
|
}
|
||
|
} else if c == b' ' || c == b'\t' {
|
||
|
// If the next character is non-newline whitespace, skip it.
|
||
|
curr_offset += 1;
|
||
|
} else {
|
||
|
// Otherwise, try to consume an unclassified token.
|
||
|
|
||
|
let (span, err) = lex_item(input, &mut curr_offset, file_id);
|
||
|
if error.is_none() {
|
||
|
error = err;
|
||
|
}
|
||
|
is_complete = true;
|
||
|
output.push(Token::new(TokenContents::Item, span));
|
||
|
}
|
||
|
}
|
||
|
(output, error)
|
||
|
}
|
||
|
|
||
|
#[cfg(test)]
mod lex_tests {
    use super::*;

    /// Builds the expected token of kind `contents` at `start..end` in file 0.
    fn token(contents: TokenContents, start: usize, end: usize) -> Token {
        Token {
            contents,
            span: Span {
                start,
                end,
                file_id: 0,
            },
        }
    }

    #[test]
    fn lex_basic() {
        let file = b"let x = 4";

        let (tokens, err) = lex(file, 0, 0, LexMode::Normal);

        assert!(err.is_none());
        assert_eq!(
            tokens,
            vec![
                token(TokenContents::Item, 0, 3),
                token(TokenContents::Item, 4, 5),
                token(TokenContents::Item, 6, 7),
                token(TokenContents::Item, 8, 9),
            ]
        );
    }

    #[test]
    fn lex_newline() {
        let file = b"let x = 300\nlet y = 500;";

        let (tokens, err) = lex(file, 0, 0, LexMode::Normal);

        assert!(err.is_none());
        assert!(tokens.contains(&token(TokenContents::Eol, 11, 12)));
        assert!(tokens.contains(&token(TokenContents::Semicolon, 23, 24)));
    }

    #[test]
    fn lex_pipe() {
        let file = b"ls | where";

        let (tokens, err) = lex(file, 0, 0, LexMode::Normal);

        assert!(err.is_none());
        assert_eq!(
            tokens,
            vec![
                token(TokenContents::Item, 0, 2),
                token(TokenContents::Pipe, 3, 4),
                token(TokenContents::Item, 5, 10),
            ]
        );
    }

    #[test]
    fn lex_empty() {
        let file = b"";

        let (tokens, err) = lex(file, 0, 0, LexMode::Normal);

        assert!(tokens.is_empty());
        assert!(err.is_none());
    }
}
|