mirror of
https://github.com/rust-lang/rust-analyzer
synced 2025-01-25 19:35:06 +00:00
Merge #11046
11046: internal: move all the lexing to the parser crate r=matklad a=matklad
bors r+
🤖
Co-authored-by: Aleksey Kladov <aleksey.kladov@gmail.com>
This commit is contained in:
commit
9f1a3ae5ab
233 changed files with 971 additions and 1019 deletions
5
Cargo.lock
generated
5
Cargo.lock
generated
|
@ -609,6 +609,7 @@ dependencies = [
|
|||
"hir",
|
||||
"ide_db",
|
||||
"itertools",
|
||||
"parser",
|
||||
"profile",
|
||||
"rustc-hash",
|
||||
"sourcegen",
|
||||
|
@ -654,6 +655,7 @@ dependencies = [
|
|||
"itertools",
|
||||
"limit",
|
||||
"once_cell",
|
||||
"parser",
|
||||
"profile",
|
||||
"rayon",
|
||||
"rustc-hash",
|
||||
|
@ -695,6 +697,7 @@ dependencies = [
|
|||
"hir",
|
||||
"ide_db",
|
||||
"itertools",
|
||||
"parser",
|
||||
"rustc-hash",
|
||||
"syntax",
|
||||
"test_utils",
|
||||
|
@ -1066,7 +1069,9 @@ name = "parser"
|
|||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"drop_bomb",
|
||||
"expect-test",
|
||||
"limit",
|
||||
"rustc-ap-rustc_lexer",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
@ -16,6 +16,7 @@ itertools = "0.10.0"
|
|||
either = "1.6.1"
|
||||
|
||||
stdx = { path = "../stdx", version = "0.0.0" }
|
||||
parser = { path = "../parser", version = "0.0.0" }
|
||||
syntax = { path = "../syntax", version = "0.0.0" }
|
||||
text_edit = { path = "../text_edit", version = "0.0.0" }
|
||||
profile = { path = "../profile", version = "0.0.0" }
|
||||
|
|
|
@ -135,7 +135,7 @@ fn normalize(name: &str) -> Option<String> {
|
|||
}
|
||||
|
||||
fn is_valid_name(name: &str) -> bool {
|
||||
match syntax::lex_single_syntax_kind(name) {
|
||||
match parser::LexedStr::single_token(name) {
|
||||
Some((syntax::SyntaxKind::IDENT, _error)) => true,
|
||||
_ => false,
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ arrayvec = "0.7"
|
|||
indexmap = "1.7"
|
||||
|
||||
stdx = { path = "../stdx", version = "0.0.0" }
|
||||
parser = { path = "../parser", version = "0.0.0" }
|
||||
syntax = { path = "../syntax", version = "0.0.0" }
|
||||
text_edit = { path = "../text_edit", version = "0.0.0" }
|
||||
base_db = { path = "../base_db", version = "0.0.0" }
|
||||
|
|
|
@ -28,7 +28,7 @@ use hir::{AsAssocItem, FieldSource, HasSource, InFile, ModuleSource, Semantics};
|
|||
use stdx::never;
|
||||
use syntax::{
|
||||
ast::{self, HasName},
|
||||
lex_single_syntax_kind, AstNode, SyntaxKind, TextRange, T,
|
||||
AstNode, SyntaxKind, TextRange, T,
|
||||
};
|
||||
use text_edit::{TextEdit, TextEditBuilder};
|
||||
|
||||
|
@ -490,7 +490,7 @@ pub enum IdentifierKind {
|
|||
|
||||
impl IdentifierKind {
|
||||
pub fn classify(new_name: &str) -> Result<IdentifierKind> {
|
||||
match lex_single_syntax_kind(new_name) {
|
||||
match parser::LexedStr::single_token(new_name) {
|
||||
Some(res) => match res {
|
||||
(SyntaxKind::IDENT, _) => Ok(IdentifierKind::Ident),
|
||||
(T![_], _) => Ok(IdentifierKind::Underscore),
|
||||
|
|
|
@ -16,6 +16,7 @@ rustc-hash = "1.1.0"
|
|||
itertools = "0.10.0"
|
||||
|
||||
text_edit = { path = "../text_edit", version = "0.0.0" }
|
||||
parser = { path = "../parser", version = "0.0.0" }
|
||||
syntax = { path = "../syntax", version = "0.0.0" }
|
||||
ide_db = { path = "../ide_db", version = "0.0.0" }
|
||||
hir = { path = "../hir", version = "0.0.0" }
|
||||
|
|
|
@ -256,19 +256,13 @@ fn validate_rule(rule: &SsrRule) -> Result<(), SsrError> {
|
|||
}
|
||||
|
||||
fn tokenize(source: &str) -> Result<Vec<Token>, SsrError> {
|
||||
let mut start = 0;
|
||||
let (raw_tokens, errors) = syntax::tokenize(source);
|
||||
if let Some(first_error) = errors.first() {
|
||||
let lexed = parser::LexedStr::new(source);
|
||||
if let Some((_, first_error)) = lexed.errors().next() {
|
||||
bail!("Failed to parse pattern: {}", first_error);
|
||||
}
|
||||
let mut tokens: Vec<Token> = Vec::new();
|
||||
for raw_token in raw_tokens {
|
||||
let token_len = usize::from(raw_token.len);
|
||||
tokens.push(Token {
|
||||
kind: raw_token.kind,
|
||||
text: SmolStr::new(&source[start..start + token_len]),
|
||||
});
|
||||
start += token_len;
|
||||
for i in 0..lexed.len() {
|
||||
tokens.push(Token { kind: lexed.kind(i), text: lexed.text(i).into() });
|
||||
}
|
||||
Ok(tokens)
|
||||
}
|
||||
|
|
|
@ -4,10 +4,9 @@ use parser::{ParseError, TreeSink};
|
|||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
use syntax::{
|
||||
ast::{self, make::tokens::doc_comment},
|
||||
tokenize, AstToken, Parse, PreorderWithTokens, SmolStr, SyntaxElement, SyntaxKind,
|
||||
AstToken, Parse, PreorderWithTokens, SmolStr, SyntaxElement, SyntaxKind,
|
||||
SyntaxKind::*,
|
||||
SyntaxNode, SyntaxToken, SyntaxTreeBuilder, TextRange, TextSize, Token as RawToken, WalkEvent,
|
||||
T,
|
||||
SyntaxNode, SyntaxToken, SyntaxTreeBuilder, TextRange, TextSize, WalkEvent, T,
|
||||
};
|
||||
use tt::buffer::{Cursor, TokenBuffer};
|
||||
|
||||
|
@ -69,15 +68,14 @@ pub fn token_tree_to_syntax_node(
|
|||
|
||||
/// Convert a string to a `TokenTree`
|
||||
pub fn parse_to_token_tree(text: &str) -> Option<(tt::Subtree, TokenMap)> {
|
||||
let (tokens, errors) = tokenize(text);
|
||||
if !errors.is_empty() {
|
||||
let lexed = parser::LexedStr::new(text);
|
||||
if lexed.errors().next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut conv = RawConvertor {
|
||||
text,
|
||||
offset: TextSize::default(),
|
||||
inner: tokens.iter(),
|
||||
lexed: lexed,
|
||||
pos: 0,
|
||||
id_alloc: TokenIdAlloc {
|
||||
map: Default::default(),
|
||||
global_offset: TextSize::default(),
|
||||
|
@ -146,7 +144,7 @@ fn convert_tokens<C: TokenConvertor>(conv: &mut C) -> tt::Subtree {
|
|||
Some(it) => it,
|
||||
};
|
||||
|
||||
let k: SyntaxKind = token.kind();
|
||||
let k: SyntaxKind = token.kind(&conv);
|
||||
if k == COMMENT {
|
||||
if let Some(tokens) = conv.convert_doc_comment(&token) {
|
||||
// FIXME: There has to be a better way to do this
|
||||
|
@ -199,19 +197,19 @@ fn convert_tokens<C: TokenConvertor>(conv: &mut C) -> tt::Subtree {
|
|||
} else {
|
||||
let spacing = match conv.peek() {
|
||||
Some(next)
|
||||
if next.kind().is_trivia()
|
||||
|| next.kind() == T!['[']
|
||||
|| next.kind() == T!['{']
|
||||
|| next.kind() == T!['('] =>
|
||||
if next.kind(&conv).is_trivia()
|
||||
|| next.kind(&conv) == T!['[']
|
||||
|| next.kind(&conv) == T!['{']
|
||||
|| next.kind(&conv) == T!['('] =>
|
||||
{
|
||||
tt::Spacing::Alone
|
||||
}
|
||||
Some(next) if next.kind().is_punct() && next.kind() != UNDERSCORE => {
|
||||
Some(next) if next.kind(&conv).is_punct() && next.kind(&conv) != UNDERSCORE => {
|
||||
tt::Spacing::Joint
|
||||
}
|
||||
_ => tt::Spacing::Alone,
|
||||
};
|
||||
let char = match token.to_char() {
|
||||
let char = match token.to_char(&conv) {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
panic!("Token from lexer must be single char: token = {:#?}", token);
|
||||
|
@ -222,7 +220,7 @@ fn convert_tokens<C: TokenConvertor>(conv: &mut C) -> tt::Subtree {
|
|||
} else {
|
||||
macro_rules! make_leaf {
|
||||
($i:ident) => {
|
||||
tt::$i { id: conv.id_alloc().alloc(range), text: token.to_text() }.into()
|
||||
tt::$i { id: conv.id_alloc().alloc(range), text: token.to_text(conv) }.into()
|
||||
};
|
||||
}
|
||||
let leaf: tt::Leaf = match k {
|
||||
|
@ -243,7 +241,7 @@ fn convert_tokens<C: TokenConvertor>(conv: &mut C) -> tt::Subtree {
|
|||
|
||||
let r = TextRange::at(range.start() + char_unit, range.len() - char_unit);
|
||||
let ident = tt::Leaf::from(tt::Ident {
|
||||
text: SmolStr::new(&token.to_text()[1..]),
|
||||
text: SmolStr::new(&token.to_text(conv)[1..]),
|
||||
id: conv.id_alloc().alloc(r),
|
||||
});
|
||||
result.push(ident.into());
|
||||
|
@ -392,22 +390,21 @@ impl TokenIdAlloc {
|
|||
|
||||
/// A raw token (straight from the lexer) convertor
|
||||
struct RawConvertor<'a> {
|
||||
text: &'a str,
|
||||
offset: TextSize,
|
||||
lexed: parser::LexedStr<'a>,
|
||||
pos: usize,
|
||||
id_alloc: TokenIdAlloc,
|
||||
inner: std::slice::Iter<'a, RawToken>,
|
||||
}
|
||||
|
||||
trait SrcToken: std::fmt::Debug {
|
||||
fn kind(&self) -> SyntaxKind;
|
||||
trait SrcToken<Ctx>: std::fmt::Debug {
|
||||
fn kind(&self, ctx: &Ctx) -> SyntaxKind;
|
||||
|
||||
fn to_char(&self) -> Option<char>;
|
||||
fn to_char(&self, ctx: &Ctx) -> Option<char>;
|
||||
|
||||
fn to_text(&self) -> SmolStr;
|
||||
fn to_text(&self, ctx: &Ctx) -> SmolStr;
|
||||
}
|
||||
|
||||
trait TokenConvertor {
|
||||
type Token: SrcToken;
|
||||
trait TokenConvertor: Sized {
|
||||
type Token: SrcToken<Self>;
|
||||
|
||||
fn convert_doc_comment(&self, token: &Self::Token) -> Option<Vec<tt::TokenTree>>;
|
||||
|
||||
|
@ -418,42 +415,45 @@ trait TokenConvertor {
|
|||
fn id_alloc(&mut self) -> &mut TokenIdAlloc;
|
||||
}
|
||||
|
||||
impl<'a> SrcToken for (&'a RawToken, &'a str) {
|
||||
fn kind(&self) -> SyntaxKind {
|
||||
self.0.kind
|
||||
impl<'a> SrcToken<RawConvertor<'a>> for usize {
|
||||
fn kind(&self, ctx: &RawConvertor<'a>) -> SyntaxKind {
|
||||
ctx.lexed.kind(*self)
|
||||
}
|
||||
|
||||
fn to_char(&self) -> Option<char> {
|
||||
self.1.chars().next()
|
||||
fn to_char(&self, ctx: &RawConvertor<'a>) -> Option<char> {
|
||||
ctx.lexed.text(*self).chars().next()
|
||||
}
|
||||
|
||||
fn to_text(&self) -> SmolStr {
|
||||
self.1.into()
|
||||
fn to_text(&self, ctx: &RawConvertor<'_>) -> SmolStr {
|
||||
ctx.lexed.text(*self).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenConvertor for RawConvertor<'a> {
|
||||
type Token = (&'a RawToken, &'a str);
|
||||
type Token = usize;
|
||||
|
||||
fn convert_doc_comment(&self, token: &Self::Token) -> Option<Vec<tt::TokenTree>> {
|
||||
convert_doc_comment(&doc_comment(token.1))
|
||||
fn convert_doc_comment(&self, token: &usize) -> Option<Vec<tt::TokenTree>> {
|
||||
let text = self.lexed.text(*token);
|
||||
convert_doc_comment(&doc_comment(text))
|
||||
}
|
||||
|
||||
fn bump(&mut self) -> Option<(Self::Token, TextRange)> {
|
||||
let token = self.inner.next()?;
|
||||
let range = TextRange::at(self.offset, token.len);
|
||||
self.offset += token.len;
|
||||
if self.pos == self.lexed.len() {
|
||||
return None;
|
||||
}
|
||||
let token = self.pos;
|
||||
self.pos += 1;
|
||||
let range = self.lexed.text_range(token);
|
||||
let range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
|
||||
|
||||
Some(((token, &self.text[range]), range))
|
||||
Some((token, range))
|
||||
}
|
||||
|
||||
fn peek(&self) -> Option<Self::Token> {
|
||||
let token = self.inner.as_slice().get(0);
|
||||
|
||||
token.map(|it| {
|
||||
let range = TextRange::at(self.offset, it.len);
|
||||
(it, &self.text[range])
|
||||
})
|
||||
if self.pos == self.lexed.len() {
|
||||
return None;
|
||||
}
|
||||
Some(self.pos)
|
||||
}
|
||||
|
||||
fn id_alloc(&mut self) -> &mut TokenIdAlloc {
|
||||
|
@ -523,17 +523,17 @@ impl SynToken {
|
|||
}
|
||||
}
|
||||
|
||||
impl SrcToken for SynToken {
|
||||
fn kind(&self) -> SyntaxKind {
|
||||
impl<'a> SrcToken<Convertor<'a>> for SynToken {
|
||||
fn kind(&self, _ctx: &Convertor<'a>) -> SyntaxKind {
|
||||
self.token().kind()
|
||||
}
|
||||
fn to_char(&self) -> Option<char> {
|
||||
fn to_char(&self, _ctx: &Convertor<'a>) -> Option<char> {
|
||||
match self {
|
||||
SynToken::Ordinary(_) => None,
|
||||
SynToken::Punch(it, i) => it.text().chars().nth((*i).into()),
|
||||
}
|
||||
}
|
||||
fn to_text(&self) -> SmolStr {
|
||||
fn to_text(&self, _ctx: &Convertor<'a>) -> SmolStr {
|
||||
self.token().text().into()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
//! Convert macro-by-example tokens which are specific to macro expansion into a
|
||||
//! format that works for our parser.
|
||||
|
||||
use syntax::{lex_single_syntax_kind, SyntaxKind, SyntaxKind::*, T};
|
||||
use syntax::{SyntaxKind, SyntaxKind::*, T};
|
||||
use tt::buffer::TokenBuffer;
|
||||
|
||||
pub(crate) fn to_parser_tokens(buffer: &TokenBuffer) -> parser::Tokens {
|
||||
|
@ -35,7 +35,7 @@ pub(crate) fn to_parser_tokens(buffer: &TokenBuffer) -> parser::Tokens {
|
|||
let is_negated = lit.text.starts_with('-');
|
||||
let inner_text = &lit.text[if is_negated { 1 } else { 0 }..];
|
||||
|
||||
let kind = lex_single_syntax_kind(inner_text)
|
||||
let kind = parser::LexedStr::single_token(inner_text)
|
||||
.map(|(kind, _error)| kind)
|
||||
.filter(|kind| {
|
||||
kind.is_literal()
|
||||
|
|
|
@ -11,5 +11,8 @@ doctest = false
|
|||
|
||||
[dependencies]
|
||||
drop_bomb = "0.1.4"
|
||||
|
||||
rustc_lexer = { version = "725.0.0", package = "rustc-ap-rustc_lexer" }
|
||||
limit = { path = "../limit", version = "0.0.0" }
|
||||
|
||||
[dev-dependencies]
|
||||
expect-test = "1.2"
|
||||
|
|
300
crates/parser/src/lexed_str.rs
Normal file
300
crates/parser/src/lexed_str.rs
Normal file
|
@ -0,0 +1,300 @@
|
|||
//! Lexing `&str` into a sequence of Rust tokens.
|
||||
//!
|
||||
//! Note that strictly speaking the parser in this crate is not required to work
|
||||
//! on tokens which originated from text. Macros, e.g., can synthesize tokens out
|
||||
//! of thin air. So, ideally, the lexer should be an orthogonal crate. It is, however,
|
||||
//! convenient to include a text-based lexer here!
|
||||
//!
|
||||
//! Note that these tokens, unlike the tokens we feed into the parser, do
|
||||
//! include info about comments and whitespace.
|
||||
|
||||
use std::ops;
|
||||
|
||||
use crate::{
|
||||
SyntaxKind::{self, *},
|
||||
T,
|
||||
};
|
||||
|
||||
pub struct LexedStr<'a> {
|
||||
text: &'a str,
|
||||
kind: Vec<SyntaxKind>,
|
||||
start: Vec<u32>,
|
||||
error: Vec<LexError>,
|
||||
}
|
||||
|
||||
struct LexError {
|
||||
msg: String,
|
||||
token: u32,
|
||||
}
|
||||
|
||||
impl<'a> LexedStr<'a> {
|
||||
pub fn new(text: &'a str) -> LexedStr<'a> {
|
||||
let mut res = LexedStr { text, kind: Vec::new(), start: Vec::new(), error: Vec::new() };
|
||||
|
||||
let mut offset = 0;
|
||||
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
|
||||
res.push(SHEBANG, offset);
|
||||
offset = shebang_len
|
||||
};
|
||||
for token in rustc_lexer::tokenize(&text[offset..]) {
|
||||
let token_text = &text[offset..][..token.len];
|
||||
|
||||
let (kind, err) = from_rustc(&token.kind, token_text);
|
||||
res.push(kind, offset);
|
||||
offset += token.len;
|
||||
|
||||
if let Some(err) = err {
|
||||
let token = res.len() as u32;
|
||||
let msg = err.to_string();
|
||||
res.error.push(LexError { msg, token });
|
||||
}
|
||||
}
|
||||
res.push(EOF, offset);
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
pub fn single_token(text: &'a str) -> Option<(SyntaxKind, Option<String>)> {
|
||||
if text.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let token = rustc_lexer::first_token(text);
|
||||
if token.len != text.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (kind, err) = from_rustc(&token.kind, text);
|
||||
Some((kind, err.map(|it| it.to_owned())))
|
||||
}
|
||||
|
||||
pub fn as_str(&self) -> &str {
|
||||
self.text
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.kind.len() - 1
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
pub fn kind(&self, i: usize) -> SyntaxKind {
|
||||
assert!(i < self.len());
|
||||
self.kind[i]
|
||||
}
|
||||
|
||||
pub fn text(&self, i: usize) -> &str {
|
||||
self.range_text(i..i + 1)
|
||||
}
|
||||
pub fn range_text(&self, r: ops::Range<usize>) -> &str {
|
||||
assert!(r.start < r.end && r.end <= self.len());
|
||||
let lo = self.start[r.start] as usize;
|
||||
let hi = self.start[r.end] as usize;
|
||||
&self.text[lo..hi]
|
||||
}
|
||||
|
||||
// Naming is hard.
|
||||
pub fn text_range(&self, i: usize) -> ops::Range<usize> {
|
||||
assert!(i < self.len());
|
||||
let lo = self.start[i] as usize;
|
||||
let hi = self.start[i + 1] as usize;
|
||||
lo..hi
|
||||
}
|
||||
pub fn text_start(&self, i: usize) -> usize {
|
||||
assert!(i <= self.len());
|
||||
self.start[i] as usize
|
||||
}
|
||||
pub fn text_len(&self, i: usize) -> usize {
|
||||
assert!(i < self.len());
|
||||
let r = self.text_range(i);
|
||||
r.end - r.start
|
||||
}
|
||||
|
||||
pub fn error(&self, i: usize) -> Option<&str> {
|
||||
assert!(i < self.len());
|
||||
let err = self.error.binary_search_by_key(&(i as u32), |i| i.token).ok()?;
|
||||
Some(self.error[err].msg.as_str())
|
||||
}
|
||||
|
||||
pub fn errors(&self) -> impl Iterator<Item = (usize, &str)> + '_ {
|
||||
self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
|
||||
}
|
||||
|
||||
pub fn to_tokens(&self) -> crate::Tokens {
|
||||
let mut res = crate::Tokens::default();
|
||||
let mut was_joint = false;
|
||||
for i in 0..self.len() {
|
||||
let kind = self.kind(i);
|
||||
if kind.is_trivia() {
|
||||
was_joint = false
|
||||
} else {
|
||||
if kind == SyntaxKind::IDENT {
|
||||
let token_text = self.text(i);
|
||||
let contextual_kw = SyntaxKind::from_contextual_keyword(token_text)
|
||||
.unwrap_or(SyntaxKind::IDENT);
|
||||
res.push_ident(contextual_kw);
|
||||
} else {
|
||||
if was_joint {
|
||||
res.was_joint();
|
||||
}
|
||||
res.push(kind);
|
||||
}
|
||||
was_joint = true;
|
||||
}
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
fn push(&mut self, kind: SyntaxKind, offset: usize) {
|
||||
self.kind.push(kind);
|
||||
self.start.push(offset as u32);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `SyntaxKind` and an optional tokenize error message.
|
||||
fn from_rustc(
|
||||
kind: &rustc_lexer::TokenKind,
|
||||
token_text: &str,
|
||||
) -> (SyntaxKind, Option<&'static str>) {
|
||||
// A note on an intended tradeoff:
|
||||
// We drop some useful information here (see patterns with double dots `..`)
|
||||
// Storing that info in `SyntaxKind` is not possible due to its layout requirements of
|
||||
// being `u16` that come from `rowan::SyntaxKind`.
|
||||
let mut err = "";
|
||||
|
||||
let syntax_kind = {
|
||||
match kind {
|
||||
rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
|
||||
rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
|
||||
if !terminated {
|
||||
err = "Missing trailing `*/` symbols to terminate the block comment";
|
||||
}
|
||||
COMMENT
|
||||
}
|
||||
|
||||
rustc_lexer::TokenKind::Whitespace => WHITESPACE,
|
||||
|
||||
rustc_lexer::TokenKind::Ident if token_text == "_" => UNDERSCORE,
|
||||
rustc_lexer::TokenKind::Ident => SyntaxKind::from_keyword(token_text).unwrap_or(IDENT),
|
||||
|
||||
rustc_lexer::TokenKind::RawIdent => IDENT,
|
||||
rustc_lexer::TokenKind::Literal { kind, .. } => return from_rustc_literal(kind),
|
||||
|
||||
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
|
||||
if *starts_with_number {
|
||||
err = "Lifetime name cannot start with a number";
|
||||
}
|
||||
LIFETIME_IDENT
|
||||
}
|
||||
|
||||
rustc_lexer::TokenKind::Semi => T![;],
|
||||
rustc_lexer::TokenKind::Comma => T![,],
|
||||
rustc_lexer::TokenKind::Dot => T![.],
|
||||
rustc_lexer::TokenKind::OpenParen => T!['('],
|
||||
rustc_lexer::TokenKind::CloseParen => T![')'],
|
||||
rustc_lexer::TokenKind::OpenBrace => T!['{'],
|
||||
rustc_lexer::TokenKind::CloseBrace => T!['}'],
|
||||
rustc_lexer::TokenKind::OpenBracket => T!['['],
|
||||
rustc_lexer::TokenKind::CloseBracket => T![']'],
|
||||
rustc_lexer::TokenKind::At => T![@],
|
||||
rustc_lexer::TokenKind::Pound => T![#],
|
||||
rustc_lexer::TokenKind::Tilde => T![~],
|
||||
rustc_lexer::TokenKind::Question => T![?],
|
||||
rustc_lexer::TokenKind::Colon => T![:],
|
||||
rustc_lexer::TokenKind::Dollar => T![$],
|
||||
rustc_lexer::TokenKind::Eq => T![=],
|
||||
rustc_lexer::TokenKind::Bang => T![!],
|
||||
rustc_lexer::TokenKind::Lt => T![<],
|
||||
rustc_lexer::TokenKind::Gt => T![>],
|
||||
rustc_lexer::TokenKind::Minus => T![-],
|
||||
rustc_lexer::TokenKind::And => T![&],
|
||||
rustc_lexer::TokenKind::Or => T![|],
|
||||
rustc_lexer::TokenKind::Plus => T![+],
|
||||
rustc_lexer::TokenKind::Star => T![*],
|
||||
rustc_lexer::TokenKind::Slash => T![/],
|
||||
rustc_lexer::TokenKind::Caret => T![^],
|
||||
rustc_lexer::TokenKind::Percent => T![%],
|
||||
rustc_lexer::TokenKind::Unknown => ERROR,
|
||||
}
|
||||
};
|
||||
|
||||
let err = if err.is_empty() { None } else { Some(err) };
|
||||
(syntax_kind, err)
|
||||
}
|
||||
|
||||
fn from_rustc_literal(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) {
|
||||
let mut err = "";
|
||||
|
||||
let syntax_kind = match *kind {
|
||||
rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
|
||||
if empty_int {
|
||||
err = "Missing digits after the integer base prefix";
|
||||
}
|
||||
INT_NUMBER
|
||||
}
|
||||
rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
|
||||
if empty_exponent {
|
||||
err = "Missing digits after the exponent symbol";
|
||||
}
|
||||
FLOAT_NUMBER
|
||||
}
|
||||
rustc_lexer::LiteralKind::Char { terminated } => {
|
||||
if !terminated {
|
||||
err = "Missing trailing `'` symbol to terminate the character literal";
|
||||
}
|
||||
CHAR
|
||||
}
|
||||
rustc_lexer::LiteralKind::Byte { terminated } => {
|
||||
if !terminated {
|
||||
err = "Missing trailing `'` symbol to terminate the byte literal";
|
||||
}
|
||||
BYTE
|
||||
}
|
||||
rustc_lexer::LiteralKind::Str { terminated } => {
|
||||
if !terminated {
|
||||
err = "Missing trailing `\"` symbol to terminate the string literal";
|
||||
}
|
||||
STRING
|
||||
}
|
||||
rustc_lexer::LiteralKind::ByteStr { terminated } => {
|
||||
if !terminated {
|
||||
err = "Missing trailing `\"` symbol to terminate the byte string literal";
|
||||
}
|
||||
BYTE_STRING
|
||||
}
|
||||
rustc_lexer::LiteralKind::RawStr { err: raw_str_err, .. } => {
|
||||
if let Some(raw_str_err) = raw_str_err {
|
||||
err = match raw_str_err {
|
||||
rustc_lexer::RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw string literal",
|
||||
rustc_lexer::RawStrError::NoTerminator { expected, found, .. } => if expected == found {
|
||||
"Missing trailing `\"` to terminate the raw string literal"
|
||||
} else {
|
||||
"Missing trailing `\"` with `#` symbols to terminate the raw string literal"
|
||||
},
|
||||
rustc_lexer::RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols",
|
||||
};
|
||||
};
|
||||
STRING
|
||||
}
|
||||
rustc_lexer::LiteralKind::RawByteStr { err: raw_str_err, .. } => {
|
||||
if let Some(raw_str_err) = raw_str_err {
|
||||
err = match raw_str_err {
|
||||
rustc_lexer::RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw byte string literal",
|
||||
rustc_lexer::RawStrError::NoTerminator { expected, found, .. } => if expected == found {
|
||||
"Missing trailing `\"` to terminate the raw byte string literal"
|
||||
} else {
|
||||
"Missing trailing `\"` with `#` symbols to terminate the raw byte string literal"
|
||||
},
|
||||
rustc_lexer::RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols",
|
||||
};
|
||||
};
|
||||
|
||||
BYTE_STRING
|
||||
}
|
||||
};
|
||||
|
||||
let err = if err.is_empty() { None } else { Some(err) };
|
||||
(syntax_kind, err)
|
||||
}
|
|
@ -18,6 +18,7 @@
|
|||
//! [`Parser`]: crate::parser::Parser
|
||||
#![allow(rustdoc::private_intra_doc_links)]
|
||||
|
||||
mod lexed_str;
|
||||
mod token_set;
|
||||
mod syntax_kind;
|
||||
mod event;
|
||||
|
@ -25,9 +26,12 @@ mod parser;
|
|||
mod grammar;
|
||||
mod tokens;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub(crate) use token_set::TokenSet;
|
||||
|
||||
pub use crate::{syntax_kind::SyntaxKind, tokens::Tokens};
|
||||
pub use crate::{lexed_str::LexedStr, syntax_kind::SyntaxKind, tokens::Tokens};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct ParseError(pub Box<String>);
|
||||
|
|
71
crates/parser/src/tests.rs
Normal file
71
crates/parser/src/tests.rs
Normal file
|
@ -0,0 +1,71 @@
|
|||
use std::{
|
||||
fmt::Write,
|
||||
fs,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use expect_test::expect_file;
|
||||
|
||||
use crate::LexedStr;
|
||||
|
||||
/// Lexes every `.rs` fixture under `test_data/lexer/ok` and compares the
/// token dump with the sibling `.txt` expectation file.
#[test]
fn valid_lexes_input() {
    for fixture in TestCase::list("lexer/ok") {
        let dump = lex(&fixture.text);
        expect_file![fixture.txt].assert_eq(&dump)
    }
}
|
||||
|
||||
/// Lexes every `.rs` fixture under `test_data/lexer/err` and compares the
/// token dump (including per-token error messages) with the sibling `.txt`
/// expectation file.
#[test]
fn invalid_lexes_input() {
    for fixture in TestCase::list("lexer/err") {
        let dump = lex(&fixture.text);
        expect_file![fixture.txt].assert_eq(&dump)
    }
}
|
||||
|
||||
fn lex(text: &str) -> String {
|
||||
let lexed = LexedStr::new(text);
|
||||
|
||||
let mut res = String::new();
|
||||
for i in 0..lexed.len() {
|
||||
let kind = lexed.kind(i);
|
||||
let text = lexed.text(i);
|
||||
let error = lexed.error(i);
|
||||
|
||||
let error = error.map(|err| format!(" error: {}", err)).unwrap_or_default();
|
||||
writeln!(res, "{:?} {:?}{}", kind, text, error).unwrap();
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord)]
|
||||
struct TestCase {
|
||||
rs: PathBuf,
|
||||
txt: PathBuf,
|
||||
text: String,
|
||||
}
|
||||
|
||||
impl TestCase {
|
||||
fn list(path: &'static str) -> Vec<TestCase> {
|
||||
let crate_root_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
|
||||
let test_data_dir = crate_root_dir.join("test_data");
|
||||
let dir = test_data_dir.join(path);
|
||||
|
||||
let mut res = Vec::new();
|
||||
let read_dir = fs::read_dir(&dir)
|
||||
.unwrap_or_else(|err| panic!("can't `read_dir` {}: {}", dir.display(), err));
|
||||
for file in read_dir {
|
||||
let file = file.unwrap();
|
||||
let path = file.path();
|
||||
if path.extension().unwrap_or_default() == "rs" {
|
||||
let rs = path;
|
||||
let txt = rs.with_extension("txt");
|
||||
let text = fs::read_to_string(&rs).unwrap();
|
||||
res.push(TestCase { rs, txt, text });
|
||||
}
|
||||
}
|
||||
res.sort();
|
||||
res
|
||||
}
|
||||
}
|
|
@ -1,7 +1,8 @@
|
|||
//! Input for the parser -- a sequence of tokens.
|
||||
//!
|
||||
//! As of now, parser doesn't have access to the *text* of the tokens, and makes
|
||||
//! decisions based solely on their classification.
|
||||
//! decisions based solely on their classification. Unlike `LexedStr`, the
|
||||
//! `Tokens` doesn't include whitespace and comments.
|
||||
|
||||
use crate::SyntaxKind;
|
||||
|
||||
|
|
48
crates/parser/test_data/lexer/err/empty_exponent.txt
Normal file
48
crates/parser/test_data/lexer/err/empty_exponent.txt
Normal file
|
@ -0,0 +1,48 @@
|
|||
FLOAT_NUMBER "0e" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "0E" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n\n"
|
||||
FLOAT_NUMBER "42e+" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42e-" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42E+" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42E-" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n\n"
|
||||
INT_NUMBER "42"
|
||||
DOT "."
|
||||
IDENT "e"
|
||||
PLUS "+"
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "42"
|
||||
DOT "."
|
||||
IDENT "e"
|
||||
MINUS "-"
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "42"
|
||||
DOT "."
|
||||
IDENT "E"
|
||||
PLUS "+"
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "42"
|
||||
DOT "."
|
||||
IDENT "E"
|
||||
MINUS "-"
|
||||
WHITESPACE "\n\n"
|
||||
FLOAT_NUMBER "42.2e+" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42.2e-" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42.2E+" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42.2E-" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n\n"
|
||||
FLOAT_NUMBER "42.2e+f32" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42.2e-f32" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42.2E+f32" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
||||
FLOAT_NUMBER "42.2E-f32" error: Missing digits after the exponent symbol
|
||||
WHITESPACE "\n"
|
26
crates/parser/test_data/lexer/err/empty_int.txt
Normal file
26
crates/parser/test_data/lexer/err/empty_int.txt
Normal file
|
@ -0,0 +1,26 @@
|
|||
INT_NUMBER "0b" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "0o" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "0x" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n\n"
|
||||
INT_NUMBER "0b_" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "0o_" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "0x_" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n\n"
|
||||
INT_NUMBER "0bnoDigit" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "0onoDigit" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "0xnoDigit" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n\n"
|
||||
INT_NUMBER "0xG" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "0xg" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n\n"
|
||||
INT_NUMBER "0x_g" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
||||
INT_NUMBER "0x_G" error: Missing digits after the integer base prefix
|
||||
WHITESPACE "\n"
|
|
@ -0,0 +1,4 @@
|
|||
LIFETIME_IDENT "'1" error: Lifetime name cannot start with a number
|
||||
WHITESPACE "\n"
|
||||
LIFETIME_IDENT "'1lifetime" error: Lifetime name cannot start with a number
|
||||
WHITESPACE "\n"
|
|
@ -0,0 +1 @@
|
|||
COMMENT "/*" error: Missing trailing `*/` symbols to terminate the block comment
|
|
@ -0,0 +1 @@
|
|||
COMMENT "/* comment\n" error: Missing trailing `*/` symbols to terminate the block comment
|
|
@ -0,0 +1 @@
|
|||
BYTE "b'" error: Missing trailing `'` symbol to terminate the byte literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "b\"" error: Missing trailing `"` symbol to terminate the byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "b\"\\x7f" error: Missing trailing `"` symbol to terminate the byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "b\"🦀" error: Missing trailing `"` symbol to terminate the byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "b\"\\" error: Missing trailing `"` symbol to terminate the byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "b\"\\\"" error: Missing trailing `"` symbol to terminate the byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "b\"\\n" error: Missing trailing `"` symbol to terminate the byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "b\" " error: Missing trailing `"` symbol to terminate the byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "b\"\\u{20AA}" error: Missing trailing `"` symbol to terminate the byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE "b'\\x7f" error: Missing trailing `'` symbol to terminate the byte literal
|
|
@ -0,0 +1 @@
|
|||
BYTE "b'🦀" error: Missing trailing `'` symbol to terminate the byte literal
|
|
@ -0,0 +1 @@
|
|||
BYTE "b'\\" error: Missing trailing `'` symbol to terminate the byte literal
|
|
@ -0,0 +1 @@
|
|||
BYTE "b'\\n" error: Missing trailing `'` symbol to terminate the byte literal
|
|
@ -0,0 +1 @@
|
|||
BYTE "b'\\'" error: Missing trailing `'` symbol to terminate the byte literal
|
|
@ -0,0 +1 @@
|
|||
BYTE "b' " error: Missing trailing `'` symbol to terminate the byte literal
|
|
@ -0,0 +1 @@
|
|||
BYTE "b'\\u{20AA}" error: Missing trailing `'` symbol to terminate the byte literal
|
|
@ -0,0 +1 @@
|
|||
CHAR "'" error: Missing trailing `'` symbol to terminate the character literal
|
|
@ -0,0 +1 @@
|
|||
CHAR "'\\x7f" error: Missing trailing `'` symbol to terminate the character literal
|
|
@ -0,0 +1 @@
|
|||
CHAR "'🦀" error: Missing trailing `'` symbol to terminate the character literal
|
|
@ -0,0 +1 @@
|
|||
CHAR "'\\" error: Missing trailing `'` symbol to terminate the character literal
|
|
@ -0,0 +1 @@
|
|||
CHAR "'\\n" error: Missing trailing `'` symbol to terminate the character literal
|
|
@ -0,0 +1 @@
|
|||
CHAR "'\\'" error: Missing trailing `'` symbol to terminate the character literal
|
|
@ -0,0 +1 @@
|
|||
CHAR "' " error: Missing trailing `'` symbol to terminate the character literal
|
|
@ -0,0 +1 @@
|
|||
CHAR "'\\u{20AA}" error: Missing trailing `'` symbol to terminate the character literal
|
|
@ -0,0 +1 @@
|
|||
COMMENT "/* /* /*\n" error: Missing trailing `*/` symbols to terminate the block comment
|
|
@ -0,0 +1 @@
|
|||
COMMENT "/** /*! /* comment */ */\n" error: Missing trailing `*/` symbols to terminate the block comment
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "br##\"" error: Missing trailing `"` with `#` symbols to terminate the raw byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "br##\"\\x7f" error: Missing trailing `"` with `#` symbols to terminate the raw byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "br##\"🦀" error: Missing trailing `"` with `#` symbols to terminate the raw byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "br##\"\\" error: Missing trailing `"` with `#` symbols to terminate the raw byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "br##\"\\n" error: Missing trailing `"` with `#` symbols to terminate the raw byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "br##\" " error: Missing trailing `"` with `#` symbols to terminate the raw byte string literal
|
|
@ -0,0 +1 @@
|
|||
BYTE_STRING "br##\"\\u{20AA}" error: Missing trailing `"` with `#` symbols to terminate the raw byte string literal
|
|
@ -0,0 +1 @@
|
|||
STRING "r##\"" error: Missing trailing `"` with `#` symbols to terminate the raw string literal
|
|
@ -0,0 +1 @@
|
|||
STRING "r##\"\\x7f" error: Missing trailing `"` with `#` symbols to terminate the raw string literal
|
|
@ -0,0 +1 @@
|
|||
STRING "r##\"🦀" error: Missing trailing `"` with `#` symbols to terminate the raw string literal
|
|
@ -0,0 +1 @@
|
|||
STRING "r##\"\\" error: Missing trailing `"` with `#` symbols to terminate the raw string literal
|
|
@ -0,0 +1 @@
|
|||
STRING "r##\"\\n" error: Missing trailing `"` with `#` symbols to terminate the raw string literal
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue