soa all the things

Aleksey Kladov 2021-12-18 15:31:50 +03:00
parent 799941e05e
commit 8b9d145dea
4 changed files with 75 additions and 34 deletions
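The change replaces the array-of-structs token representation (a `Vec<LexerToken>`, where every token owns its kind, length, and optional error) with a struct-of-arrays `LexedStr`: one vector per field, with each token's text recovered from consecutive start offsets. A minimal sketch of the layout idea, using made-up simplified types rather than the real `SyntaxKind`:

    // Illustrative only; `Kind` stands in for the real SyntaxKind.
    #[derive(Debug, Clone, Copy, PartialEq)]
    enum Kind { Ident, Whitespace }

    // Struct of arrays: parallel vectors plus a trailing offset, so the
    // i-th token's text is text[start[i]..start[i + 1]].
    struct Lexed<'a> {
        text: &'a str,
        kind: Vec<Kind>,
        start: Vec<u32>,
    }

    impl<'a> Lexed<'a> {
        fn token_text(&self, i: usize) -> &str {
            &self.text[self.start[i] as usize..self.start[i + 1] as usize]
        }
    }

    fn main() {
        let lexed = Lexed {
            text: "foo bar",
            kind: vec![Kind::Ident, Kind::Whitespace, Kind::Ident],
            start: vec![0, 3, 4, 7],
        };
        assert_eq!(lexed.token_text(2), "bar");
        assert_eq!(lexed.kind[2], Kind::Ident);
    }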


@@ -4,48 +4,55 @@
 //! on tokens which originated from text. Macros, eg, can synthesize tokes out
 //! of thin air. So, ideally, lexer should be an orthogonal crate. It is however
 //! convenient to include a text-based lexer here!
+//!
+//! Note that these tokens, unlike the tokens we feed into the parser, do
+//! include info about comments and whitespace.

 use crate::{
     SyntaxKind::{self, *},
     T,
 };

-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct LexerToken {
-    pub kind: SyntaxKind,
-    pub len: usize,
-    pub error: Option<String>,
+pub struct LexedStr<'a> {
+    text: &'a str,
+    kind: Vec<SyntaxKind>,
+    start: Vec<u32>,
+    error: Vec<LexError>,
 }

-impl LexerToken {
-    pub fn new(kind: SyntaxKind, len: usize) -> Self {
-        Self { kind, len, error: None }
-    }
+struct LexError {
+    msg: String,
+    token: u32,
+}

-    /// Lexes text as a sequence of tokens.
-    pub fn tokenize(text: &str) -> Vec<LexerToken> {
-        let mut res = Vec::new();
+impl<'a> LexedStr<'a> {
+    pub fn new(text: &'a str) -> LexedStr<'a> {
+        let mut res = LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() };
+
         let mut offset = 0;
         if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
-            res.push(LexerToken::new(SHEBANG, shebang_len));
+            res.push(SHEBANG, offset);
             offset = shebang_len
         };
         for token in rustc_lexer::tokenize(&text[offset..]) {
             let token_text = &text[offset..][..token.len];
-            offset += token.len;

             let (kind, err) = from_rustc(&token.kind, token_text);
-            let mut token = LexerToken::new(kind, token.len);
-            token.error = err.map(|it| it.to_string());
-            res.push(token);
+            res.push(kind, offset);
+            offset += token.len;
+
+            if let Some(err) = err {
+                let token = res.len() as u32;
+                let msg = err.to_string();
+                res.error.push(LexError { msg, token });
+            }
         }
+        res.push(EOF, offset);
         res
     }

-    /// Lexes text as a single token. Returns `None` if there's leftover text.
-    pub fn from_str(text: &str) -> Option<LexerToken> {
+    pub fn single_token(text: &'a str) -> Option<SyntaxKind> {
         if text.is_empty() {
             return None;
         }
@@ -56,10 +63,40 @@ impl LexerToken {
         }

         let (kind, err) = from_rustc(&token.kind, text);
+        if err.is_some() {
+            return None;
+        }

-        let mut token = LexerToken::new(kind, token.len);
-        token.error = err.map(|it| it.to_string());
-        Some(token)
+        Some(kind)
+    }
+
+    pub fn as_str(&self) -> &str {
+        self.text
+    }
+
+    pub fn len(&self) -> usize {
+        self.kind.len() - 1
+    }
+
+    pub fn kind(&self, i: usize) -> SyntaxKind {
+        assert!(i < self.len());
+        self.kind[i]
+    }
+
+    pub fn text(&self, i: usize) -> &str {
+        assert!(i < self.len());
+        let lo = self.start[i] as usize;
+        let hi = self.start[i + 1] as usize;
+        &self.text[lo..hi]
+    }
+
+    pub fn error(&self, i: usize) -> Option<&str> {
+        assert!(i < self.len());
+        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
+        Some(self.error[err].msg.as_str())
+    }
+
+    fn push(&mut self, kind: SyntaxKind, offset: usize) {
+        self.kind.push(kind);
+        self.start.push(offset as u32);
     }
 }
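In the new API everything is index-based: `len()` excludes the trailing EOF entry pushed at the end of `new`, `text(i)` slices the original input between consecutive start offsets, and `error(i)` looks the token up in the sorted error table. A sketch of the intended call pattern (the input string is made up; the `lex` test helper further down does essentially the same thing):

    let lexed = LexedStr::new("fn foo() {}");
    for i in 0..lexed.len() {
        // kind(i), text(i) and error(i) all describe the same token.
        println!("{:?} {:?} {:?}", lexed.kind(i), lexed.text(i), lexed.error(i));
    }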


@@ -18,7 +18,7 @@
 //! [`Parser`]: crate::parser::Parser

 #![allow(rustdoc::private_intra_doc_links)]

-mod lexer_token;
+mod lexed_str;
 mod token_set;
 mod syntax_kind;
 mod event;
@@ -31,7 +31,7 @@ mod tests;

 pub(crate) use token_set::TokenSet;

-pub use crate::{lexer_token::LexerToken, syntax_kind::SyntaxKind, tokens::Tokens};
+pub use crate::{lexed_str::LexedStr, syntax_kind::SyntaxKind, tokens::Tokens};

 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParseError(pub Box<String>);


@@ -6,7 +6,7 @@ use std::{

 use expect_test::expect_file;

-use crate::LexerToken;
+use crate::LexedStr;

 #[test]
 fn valid_lexes_input() {
@@ -25,13 +25,16 @@ fn invalid_lexes_input() {
 }

 fn lex(text: &str) -> String {
+    let lexed = LexedStr::new(text);
+
     let mut res = String::new();
-    let mut offset = 0;
-    for token in LexerToken::tokenize(text) {
-        let token_text = &text[offset..][..token.len];
-        offset += token.len;
-        let err = token.error.map(|err| format!(" error: {}", err)).unwrap_or_default();
-        writeln!(res, "{:?} {:?}{}", token.kind, token_text, err).unwrap();
+    for i in 0..lexed.len() {
+        let kind = lexed.kind(i);
+        let text = lexed.text(i);
+        let error = lexed.error(i);
+
+        let error = error.map(|err| format!(" error: {}", err)).unwrap_or_default();
+        writeln!(res, "{:?} {:?}{}", kind, text, error).unwrap();
     }
     res
 }
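For reference, this helper dumps one token per line: the `Debug` form of the kind, the quoted token text, and an error suffix when present. Assuming the usual rust-analyzer `SyntaxKind` names (they are not shown in this diff), an input like `fn foo` would come out roughly as:

    FN_KW "fn"
    WHITESPACE " "
    IDENT "foo"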


@@ -1,7 +1,8 @@
 //! Input for the parser -- a sequence of tokens.
 //!
 //! As of now, parser doesn't have access to the *text* of the tokens, and makes
-//! decisions based solely on their classification.
+//! decisions based solely on their classification. Unlike `LexerToken`, the
+//! `Tokens` doesn't include whitespace and comments.

 use crate::SyntaxKind;
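Since the parser never sees whitespace or comments, something has to drop trivia when turning lexer output into parser input; that conversion is not part of this diff. A hypothetical sketch of the filtering step, assuming `SyntaxKind::is_trivia()` covers whitespace and comments:

    // Not code from this commit: collect only non-trivia kinds for the
    // parser, remembering which lexer token each one came from.
    fn to_parser_input(lexed: &LexedStr<'_>) -> (Vec<SyntaxKind>, Vec<usize>) {
        let mut kinds = Vec::new();
        let mut source_index = Vec::new();
        for i in 0..lexed.len() {
            let kind = lexed.kind(i);
            if kind.is_trivia() {
                continue; // whitespace and comments never reach the parser
            }
            kinds.push(kind);
            source_index.push(i);
        }
        (kinds, source_index)
    }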