use crate::{ParseError, Span}; #[derive(Debug, PartialEq, Eq)] pub enum TokenContents { Item, Comment, Pipe, Semicolon, Eol, } #[derive(Debug, PartialEq, Eq)] pub struct Token { pub contents: TokenContents, pub span: Span, } impl Token { pub fn new(contents: TokenContents, span: Span) -> Token { Token { contents, span } } } #[derive(Clone, Copy, Debug)] pub enum BlockKind { Paren, CurlyBracket, SquareBracket, } impl BlockKind { fn closing(self) -> u8 { match self { BlockKind::Paren => b')', BlockKind::SquareBracket => b']', BlockKind::CurlyBracket => b'}', } } } // A baseline token is terminated if it's not nested inside of a paired // delimiter and the next character is one of: `|`, `;`, `#` or any // whitespace. fn is_item_terminator( block_level: &[BlockKind], c: u8, additional_whitespace: &[u8], special_tokens: &[u8], ) -> bool { block_level.is_empty() && (c == b' ' || c == b'\t' || c == b'\n' || c == b'|' || c == b';' || c == b'#' || additional_whitespace.contains(&c) || special_tokens.contains(&c)) } // A special token is one that is a byte that stands alone as its own token. For example // when parsing a signature you may want to have `:` be able to separate tokens and also // to be handled as its own token to notify you you're about to parse a type in the example // `foo:bar` fn is_special_item(block_level: &[BlockKind], c: u8, special_tokens: &[u8]) -> bool { block_level.is_empty() && special_tokens.contains(&c) } pub fn lex_item( input: &[u8], curr_offset: &mut usize, additional_whitespace: &[u8], special_tokens: &[u8], ) -> (Span, Option) { // This variable tracks the starting character of a string literal, so that // we remain inside the string literal lexer mode until we encounter the // closing quote. let mut quote_start: Option = None; let mut in_comment = false; let token_start = *curr_offset; // This Vec tracks paired delimiters let mut block_level: Vec = vec![]; // The process of slurping up a baseline token repeats: // // - String literal, which begins with `'`, `"` or `\``, and continues until // the same character is encountered again. // - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until // the matching closing delimiter is found, skipping comments and string // literals. // - When not nested inside of a delimiter pair, when a terminating // character (whitespace, `|`, `;` or `#`) is encountered, the baseline // token is done. // - Otherwise, accumulate the character into the current baseline token. while let Some(c) = input.get(*curr_offset) { let c = *c; if quote_start.is_some() { // If we encountered the closing quote character for the current // string, we're done with the current string. if Some(c) == quote_start { quote_start = None; } } else if c == b'#' { if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) { break; } in_comment = true; } else if c == b'\n' { in_comment = false; if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) { break; } } else if in_comment { if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) { break; } } else if is_special_item(&block_level, c, special_tokens) && token_start == *curr_offset { *curr_offset += 1; break; } else if c == b'\'' || c == b'"' { // We encountered the opening quote of a string literal. quote_start = Some(c); } else if c == b'[' { // We encountered an opening `[` delimiter. block_level.push(BlockKind::SquareBracket); } else if c == b']' { // We encountered a closing `]` delimiter. Pop off the opening `[` // delimiter. if let Some(BlockKind::SquareBracket) = block_level.last() { let _ = block_level.pop(); } } else if c == b'{' { // We encountered an opening `{` delimiter. block_level.push(BlockKind::CurlyBracket); } else if c == b'}' { // We encountered a closing `}` delimiter. Pop off the opening `{`. if let Some(BlockKind::CurlyBracket) = block_level.last() { let _ = block_level.pop(); } } else if c == b'(' { // We enceountered an opening `(` delimiter. block_level.push(BlockKind::Paren); } else if c == b')' { // We encountered a closing `)` delimiter. Pop off the opening `(`. if let Some(BlockKind::Paren) = block_level.last() { let _ = block_level.pop(); } } else if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) { break; } *curr_offset += 1; } let span = Span::new(token_start, *curr_offset); // If there is still unclosed opening delimiters, close them and add // synthetic closing characters to the accumulated token. if let Some(block) = block_level.last() { let delim = block.closing(); let cause = ParseError::UnexpectedEof((delim as char).to_string(), span); return (span, Some(cause)); } if let Some(delim) = quote_start { // The non-lite parse trims quotes on both sides, so we add the expected quote so that // anyone wanting to consume this partial parse (e.g., completions) will be able to get // correct information from the non-lite parse. return ( span, Some(ParseError::UnexpectedEof((delim as char).to_string(), span)), ); } // If we didn't accumulate any characters, it's an unexpected error. if *curr_offset - token_start == 0 { return ( span, Some(ParseError::UnexpectedEof("command".to_string(), span)), ); } (span, None) } pub fn lex( input: &[u8], span_offset: usize, additional_whitespace: &[u8], special_tokens: &[u8], ) -> (Vec, Option) { let mut error = None; let mut curr_offset = span_offset; let mut output = vec![]; let mut is_complete = true; while let Some(c) = input.get(curr_offset) { let c = *c; if c == b'|' { // If the next character is `|`, it's either `|` or `||`. let idx = curr_offset; let prev_idx = idx; curr_offset += 1; // If the next character is `|`, we're looking at a `||`. if let Some(c) = input.get(curr_offset) { if *c == b'|' { let idx = curr_offset; curr_offset += 1; output.push(Token::new( TokenContents::Item, Span::new(span_offset + prev_idx, span_offset + idx + 1), )); continue; } } // Otherwise, it's just a regular `|` token. output.push(Token::new( TokenContents::Pipe, Span::new(span_offset + idx, span_offset + idx + 1), )); is_complete = false; } else if c == b';' { // If the next character is a `;`, we're looking at a semicolon token. if !is_complete && error.is_none() { error = Some(ParseError::ExtraTokens(Span::new( curr_offset, curr_offset + 1, ))); } let idx = curr_offset; curr_offset += 1; output.push(Token::new( TokenContents::Semicolon, Span::new(idx, idx + 1), )); } else if c == b'\n' || c == b'\r' { // If the next character is a newline, we're looking at an EOL (end of line) token. let idx = curr_offset; curr_offset += 1; if !additional_whitespace.contains(&c) { output.push(Token::new(TokenContents::Eol, Span::new(idx, idx + 1))); } } else if c == b'#' { // If the next character is `#`, we're at the beginning of a line // comment. The comment continues until the next newline. let mut start = curr_offset; while let Some(input) = input.get(curr_offset) { curr_offset += 1; if *input == b'\n' { output.push(Token::new( TokenContents::Comment, Span::new(start, curr_offset), )); start = curr_offset; break; } } if start != curr_offset { output.push(Token::new( TokenContents::Comment, Span::new(start, curr_offset), )); } } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) { // If the next character is non-newline whitespace, skip it. curr_offset += 1; } else { // Otherwise, try to consume an unclassified token. let (span, err) = lex_item( input, &mut curr_offset, additional_whitespace, special_tokens, ); if error.is_none() { error = err; } is_complete = true; output.push(Token::new(TokenContents::Item, span)); } } (output, error) } #[cfg(test)] mod lex_tests { use super::*; #[test] fn lex_basic() { let file = b"let x = 4"; let output = lex(file, 0, &[], &[]); assert!(output.1.is_none()); } #[test] fn lex_newline() { let file = b"let x = 300\nlet y = 500;"; let output = lex(file, 0, &[], &[]); println!("{:#?}", output.0); assert!(output.0.contains(&Token { contents: TokenContents::Eol, span: Span { start: 11, end: 12 } })); } #[test] fn lex_empty() { let file = b""; let output = lex(file, 0, &[], &[]); assert!(output.0.is_empty()); assert!(output.1.is_none()); } }