From 8b9d145dea17dc28d83fae23b5be63233483ec6d Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Sat, 18 Dec 2021 15:31:50 +0300
Subject: [PATCH] soa all the things

---
 .../src/{lexer_token.rs => lexed_str.rs}     | 85 +++++++++++++------
 crates/parser/src/lib.rs                     |  4 +-
 crates/parser/src/tests.rs                   | 17 ++--
 crates/parser/src/tokens.rs                  |  3 +-
 4 files changed, 75 insertions(+), 34 deletions(-)
 rename crates/parser/src/{lexer_token.rs => lexed_str.rs} (81%)

diff --git a/crates/parser/src/lexer_token.rs b/crates/parser/src/lexed_str.rs
similarity index 81%
rename from crates/parser/src/lexer_token.rs
rename to crates/parser/src/lexed_str.rs
index a9134639d2..595b607229 100644
--- a/crates/parser/src/lexer_token.rs
+++ b/crates/parser/src/lexed_str.rs
@@ -4,48 +4,55 @@
 //! on tokens which originated from text. Macros, eg, can synthesize tokes out
 //! of thin air. So, ideally, lexer should be an orthogonal crate. It is however
 //! convenient to include a text-based lexer here!
+//!
+//! Note that these tokens, unlike the tokens we feed into the parser, do
+//! include info about comments and whitespace.
 use crate::{
     SyntaxKind::{self, *},
     T,
 };

-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct LexerToken {
-    pub kind: SyntaxKind,
-    pub len: usize,
-    pub error: Option<String>,
+pub struct LexedStr<'a> {
+    text: &'a str,
+    kind: Vec<SyntaxKind>,
+    start: Vec<u32>,
+    error: Vec<LexError>,
 }

-impl LexerToken {
-    pub fn new(kind: SyntaxKind, len: usize) -> Self {
-        Self { kind, len, error: None }
-    }
+struct LexError {
+    msg: String,
+    token: u32,
+}
+
+impl<'a> LexedStr<'a> {
+    pub fn new(text: &'a str) -> LexedStr<'a> {
+        let mut res = LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() };

-    /// Lexes text as a sequence of tokens.
-    pub fn tokenize(text: &str) -> Vec<LexerToken> {
-        let mut res = Vec::new();
         let mut offset = 0;
-
         if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
-            res.push(LexerToken::new(SHEBANG, shebang_len));
+            res.push(SHEBANG, offset);
             offset = shebang_len
         };
-
         for token in rustc_lexer::tokenize(&text[offset..]) {
             let token_text = &text[offset..][..token.len];
-            offset += token.len;

             let (kind, err) = from_rustc(&token.kind, token_text);
-            let mut token = LexerToken::new(kind, token.len);
-            token.error = err.map(|it| it.to_string());
-            res.push(token);
+            res.push(kind, offset);
+            offset += token.len;
+
+            if let Some(err) = err {
+                let token = res.len() as u32;
+                let msg = err.to_string();
+                res.error.push(LexError { msg, token });
+            }
         }
+        res.push(EOF, offset);

         res
     }

-    /// Lexes text as a single token. Returns `None` if there's leftover text.
-    pub fn from_str(text: &str) -> Option<LexerToken> {
+
+    pub fn single_token(text: &'a str) -> Option<SyntaxKind> {
         if text.is_empty() {
             return None;
         }
@@ -56,10 +63,40 @@ impl LexerToken {
         }

         let (kind, err) = from_rustc(&token.kind, text);
+        if err.is_some() {
+            return None;
+        }

-        let mut token = LexerToken::new(kind, token.len);
-        token.error = err.map(|it| it.to_string());
-        Some(token)
+        Some(kind)
+    }
+
+    pub fn as_str(&self) -> &str {
+        self.text
+    }
+
+    pub fn len(&self) -> usize {
+        self.kind.len() - 1
+    }
+
+    pub fn kind(&self, i: usize) -> SyntaxKind {
+        assert!(i < self.len());
+        self.kind[i]
+    }
+    pub fn text(&self, i: usize) -> &str {
+        assert!(i < self.len());
+        let lo = self.start[i] as usize;
+        let hi = self.start[i + 1] as usize;
+        &self.text[lo..hi]
+    }
+    pub fn error(&self, i: usize) -> Option<&str> {
+        assert!(i < self.len());
+        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
+        Some(self.error[err].msg.as_str())
+    }
+
+    fn push(&mut self, kind: SyntaxKind, offset: usize) {
+        self.kind.push(kind);
+        self.start.push(offset as u32);
     }
 }
diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs
index 448f22185d..dc02ae6e83 100644
--- a/crates/parser/src/lib.rs
+++ b/crates/parser/src/lib.rs
@@ -18,7 +18,7 @@
 //! [`Parser`]: crate::parser::Parser

 #![allow(rustdoc::private_intra_doc_links)]

-mod lexer_token;
+mod lexed_str;
 mod token_set;
 mod syntax_kind;
 mod event;
@@ -31,7 +31,7 @@ mod tests;

 pub(crate) use token_set::TokenSet;

-pub use crate::{lexer_token::LexerToken, syntax_kind::SyntaxKind, tokens::Tokens};
+pub use crate::{lexed_str::LexedStr, syntax_kind::SyntaxKind, tokens::Tokens};

 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParseError(pub Box<String>);
diff --git a/crates/parser/src/tests.rs b/crates/parser/src/tests.rs
index f323eba5e4..ebba992561 100644
--- a/crates/parser/src/tests.rs
+++ b/crates/parser/src/tests.rs
@@ -6,7 +6,7 @@ use std::{

 use expect_test::expect_file;

-use crate::LexerToken;
+use crate::LexedStr;

 #[test]
 fn valid_lexes_input() {
@@ -25,13 +25,16 @@ fn invalid_lexes_input() {
 }

 fn lex(text: &str) -> String {
+    let lexed = LexedStr::new(text);
+
     let mut res = String::new();
-    let mut offset = 0;
-    for token in LexerToken::tokenize(text) {
-        let token_text = &text[offset..][..token.len];
-        offset += token.len;
-        let err = token.error.map(|err| format!(" error: {}", err)).unwrap_or_default();
-        writeln!(res, "{:?} {:?}{}", token.kind, token_text, err).unwrap();
+    for i in 0..lexed.len() {
+        let kind = lexed.kind(i);
+        let text = lexed.text(i);
+        let error = lexed.error(i);
+
+        let error = error.map(|err| format!(" error: {}", err)).unwrap_or_default();
+        writeln!(res, "{:?} {:?}{}", kind, text, error).unwrap();
     }
     res
 }
diff --git a/crates/parser/src/tokens.rs b/crates/parser/src/tokens.rs
index 1c0672492d..4fc2361add 100644
--- a/crates/parser/src/tokens.rs
+++ b/crates/parser/src/tokens.rs
@@ -1,7 +1,8 @@
 //! Input for the parser -- a sequence of tokens.
 //!
 //! As of now, parser doesn't have access to the *text* of the tokens, and makes
-//! decisions based solely on their classification.
+//! decisions based solely on their classification. Unlike `LexerToken`, the
+//! `Tokens` doesn't include whitespace and comments.

 use crate::SyntaxKind;
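
For context, the patch replaces the per-token `LexerToken` struct with `LexedStr`, a struct-of-arrays layout: token kinds and start offsets live in parallel vectors over the original text, lexer errors sit in a separate sparse list keyed by token index, and a trailing EOF entry lets `text(i)` slice between `start[i]` and `start[i + 1]`. Below is a minimal sketch of driving the new API, mirroring the updated `lex` helper in `tests.rs`; the `dump` function name and its output format are illustrative, not part of the patch.

    use parser::LexedStr;

    // Walk every lexed token (including whitespace and comments) and render
    // its kind, its quoted text, and any lexer error attached to it.
    fn dump(text: &str) -> String {
        let lexed = LexedStr::new(text);
        let mut out = String::new();
        for i in 0..lexed.len() {
            out.push_str(&format!("{:?} {:?}", lexed.kind(i), lexed.text(i)));
            if let Some(err) = lexed.error(i) {
                out.push_str(&format!(" error: {}", err));
            }
            out.push('\n');
        }
        out
    }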