mirror of https://github.com/rust-lang/rust-analyzer
synced 2025-01-13 13:48:50 +00:00

soa all the things

commit 8b9d145dea (parent 799941e05e)
4 changed files with 75 additions and 34 deletions
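The point of the change: the lexer's output moves from an array of structs (a `Vec<LexerToken>`, each token carrying its own `kind`, `len`, and mostly-empty `error`) to a struct of arrays (`LexedStr`), with one flat vector per field and a sparse side table for the rare errors. A minimal sketch of the layout change, with placeholder types rather than the commit's own:

    // Before: one struct per token, errors stored inline ("array of structs").
    struct AosToken {
        kind: u16,             // placeholder for SyntaxKind
        len: usize,
        error: Option<String>, // rarely Some, but sized into every token
    }

    // After: one flat vector per field ("struct of arrays"); errors move
    // to a sparse table keyed by token index.
    struct SoaTokens {
        kind: Vec<u16>,
        start: Vec<u32>,           // start offsets instead of lengths
        error: Vec<(u32, String)>, // (token index, message), sorted by index
    }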
crates/parser/src/lexer_token.rs → crates/parser/src/lexed_str.rs
@@ -4,48 +4,55 @@
 //! on tokens which originated from text. Macros, eg, can synthesize tokens out
 //! of thin air. So, ideally, lexer should be an orthogonal crate. It is however
 //! convenient to include a text-based lexer here!
+//!
+//! Note that these tokens, unlike the tokens we feed into the parser, do
+//! include info about comments and whitespace.
 
 use crate::{
     SyntaxKind::{self, *},
     T,
 };
 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct LexerToken {
-    pub kind: SyntaxKind,
-    pub len: usize,
-    pub error: Option<String>,
+pub struct LexedStr<'a> {
+    text: &'a str,
+    kind: Vec<SyntaxKind>,
+    start: Vec<u32>,
+    error: Vec<LexError>,
 }
 
-impl LexerToken {
-    pub fn new(kind: SyntaxKind, len: usize) -> Self {
-        Self { kind, len, error: None }
+struct LexError {
+    msg: String,
+    token: u32,
 }
 
-    /// Lexes text as a sequence of tokens.
-    pub fn tokenize(text: &str) -> Vec<LexerToken> {
-        let mut res = Vec::new();
-        let mut offset = 0;
+impl<'a> LexedStr<'a> {
+    pub fn new(text: &'a str) -> LexedStr<'a> {
+        let mut res = LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() };
 
+        let mut offset = 0;
         if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
-            res.push(LexerToken::new(SHEBANG, shebang_len));
+            res.push(SHEBANG, offset);
             offset = shebang_len
         };
 
         for token in rustc_lexer::tokenize(&text[offset..]) {
             let token_text = &text[offset..][..token.len];
-            offset += token.len;
 
             let (kind, err) = from_rustc(&token.kind, token_text);
-            let mut token = LexerToken::new(kind, token.len);
-            token.error = err.map(|it| it.to_string());
-            res.push(token);
+            res.push(kind, offset);
+            offset += token.len;
+
+            if let Some(err) = err {
+                let token = res.len() as u32;
+                let msg = err.to_string();
+                res.error.push(LexError { msg, token });
+            }
         }
 
+        res.push(EOF, offset);
         res
     }
-    /// Lexes text as a single token. Returns `None` if there's leftover text.
-    pub fn from_str(text: &str) -> Option<LexerToken> {
+
+    pub fn single_token(text: &'a str) -> Option<SyntaxKind> {
         if text.is_empty() {
             return None;
         }
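Two invariants of the new representation are worth spelling out. Tokens no longer carry a length: `start[i]` records each token's start offset, and the final `res.push(EOF, offset)` adds a sentinel entry, so the text of token `i` is always `text[start[i]..start[i + 1]]`. The sentinel is also why `len()` in the next hunk is `kind.len() - 1`, and why `res.len()` in the error-recording branch above, called after `res.push`, is exactly the index of the token just pushed. A runnable sketch of the offset arithmetic, assuming that sentinel invariant (not the commit's code):

    // Sketch: recover a token's text from start offsets plus an EOF sentinel.
    fn token_text<'a>(text: &'a str, start: &[u32], i: usize) -> &'a str {
        // start has one more entry than there are real tokens, so
        // start[i + 1] is valid for every real token index i.
        &text[start[i] as usize..start[i + 1] as usize]
    }

    fn main() {
        let text = "let x";
        // tokens "let", " ", "x" start at 0, 3, 4; the EOF sentinel sits at 5
        let start = [0u32, 3, 4, 5];
        assert_eq!(token_text(text, &start, 0), "let");
        assert_eq!(token_text(text, &start, 2), "x");
    }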
@@ -56,10 +63,40 @@ impl LexerToken
         }
 
         let (kind, err) = from_rustc(&token.kind, text);
-        let mut token = LexerToken::new(kind, token.len);
-        token.error = err.map(|it| it.to_string());
-        Some(token)
+        if err.is_some() {
+            return None;
+        }
+
+        Some(kind)
+    }
+
+    pub fn as_str(&self) -> &str {
+        self.text
+    }
+
+    pub fn len(&self) -> usize {
+        self.kind.len() - 1
+    }
+
+    pub fn kind(&self, i: usize) -> SyntaxKind {
+        assert!(i < self.len());
+        self.kind[i]
+    }
+
+    pub fn text(&self, i: usize) -> &str {
+        assert!(i < self.len());
+        let lo = self.start[i] as usize;
+        let hi = self.start[i + 1] as usize;
+        &self.text[lo..hi]
+    }
+
+    pub fn error(&self, i: usize) -> Option<&str> {
+        assert!(i < self.len());
+        let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
+        Some(self.error[err].msg.as_str())
+    }
+
+    fn push(&mut self, kind: SyntaxKind, offset: usize) {
+        self.kind.push(kind);
+        self.start.push(offset as u32);
     }
 }
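Because `new` records errors in token order, the `error` vector is sorted by token index by construction; that is what makes the `binary_search_by_key` in `error()` correct. A standalone sketch of the same lookup (names are illustrative, not the commit's):

    struct LexError {
        msg: String,
        token: u32, // index of the offending token
    }

    // Valid only because errors are appended in increasing token order.
    fn error_of(errors: &[LexError], i: usize) -> Option<&str> {
        let idx = errors.binary_search_by_key(&(i as u32), |e| e.token).ok()?;
        Some(errors[idx].msg.as_str())
    }

    fn main() {
        let errors = vec![
            LexError { msg: "unterminated string".into(), token: 3 },
            LexError { msg: "unknown token".into(), token: 7 },
        ];
        assert_eq!(error_of(&errors, 3), Some("unterminated string"));
        assert_eq!(error_of(&errors, 4), None);
    }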
crates/parser/src/lib.rs
@@ -18,7 +18,7 @@
 //! [`Parser`]: crate::parser::Parser
 #![allow(rustdoc::private_intra_doc_links)]
 
-mod lexer_token;
+mod lexed_str;
 mod token_set;
 mod syntax_kind;
 mod event;
@@ -31,7 +31,7 @@ mod tests;
 
 pub(crate) use token_set::TokenSet;
 
-pub use crate::{lexer_token::LexerToken, syntax_kind::SyntaxKind, tokens::Tokens};
+pub use crate::{lexed_str::LexedStr, syntax_kind::SyntaxKind, tokens::Tokens};
 
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ParseError(pub Box<String>);
crates/parser/src/tests.rs
@@ -6,7 +6,7 @@ use std::{
 
 use expect_test::expect_file;
 
-use crate::LexerToken;
+use crate::LexedStr;
 
 #[test]
 fn valid_lexes_input() {
@@ -25,13 +25,16 @@ fn invalid_lexes_input() {
 }
 
 fn lex(text: &str) -> String {
+    let lexed = LexedStr::new(text);
+
     let mut res = String::new();
-    let mut offset = 0;
-    for token in LexerToken::tokenize(text) {
-        let token_text = &text[offset..][..token.len];
-        offset += token.len;
-        let err = token.error.map(|err| format!(" error: {}", err)).unwrap_or_default();
-        writeln!(res, "{:?} {:?}{}", token.kind, token_text, err).unwrap();
+    for i in 0..lexed.len() {
+        let kind = lexed.kind(i);
+        let text = lexed.text(i);
+        let error = lexed.error(i);
+
+        let error = error.map(|err| format!(" error: {}", err)).unwrap_or_default();
+        writeln!(res, "{:?} {:?}{}", kind, text, error).unwrap();
     }
     res
 }
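The updated test shows the new calling convention: iterate indices and pull each field through an accessor, instead of destructuring per-token structs. `single_token` covers the one-token case; a hypothetical call site (assuming `parser::LexedStr` and `parser::SyntaxKind` are in scope; `is_single_ident` is a made-up helper):

    use parser::{LexedStr, SyntaxKind};

    // Made-up helper: does `name` lex as exactly one identifier token?
    fn is_single_ident(name: &str) -> bool {
        matches!(LexedStr::single_token(name), Some(SyntaxKind::IDENT))
    }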
crates/parser/src/tokens.rs
@@ -1,7 +1,8 @@
 //! Input for the parser -- a sequence of tokens.
 //!
 //! As of now, parser doesn't have access to the *text* of the tokens, and makes
-//! decisions based solely on their classification.
+//! decisions based solely on their classification. Unlike `LexerToken`, the
+//! `Tokens` doesn't include whitespace and comments.
 
 use crate::SyntaxKind;
 