diff --git a/Cargo.lock b/Cargo.lock index 2ca5899c77..44ef223557 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -609,6 +609,7 @@ dependencies = [ "hir", "ide_db", "itertools", + "parser", "profile", "rustc-hash", "sourcegen", @@ -654,6 +655,7 @@ dependencies = [ "itertools", "limit", "once_cell", + "parser", "profile", "rayon", "rustc-hash", @@ -695,6 +697,7 @@ dependencies = [ "hir", "ide_db", "itertools", + "parser", "rustc-hash", "syntax", "test_utils", diff --git a/crates/ide_assists/Cargo.toml b/crates/ide_assists/Cargo.toml index 4d97e9150e..3cd186fdf5 100644 --- a/crates/ide_assists/Cargo.toml +++ b/crates/ide_assists/Cargo.toml @@ -16,6 +16,7 @@ itertools = "0.10.0" either = "1.6.1" stdx = { path = "../stdx", version = "0.0.0" } +parser = { path = "../parser", version = "0.0.0" } syntax = { path = "../syntax", version = "0.0.0" } text_edit = { path = "../text_edit", version = "0.0.0" } profile = { path = "../profile", version = "0.0.0" } diff --git a/crates/ide_assists/src/utils/suggest_name.rs b/crates/ide_assists/src/utils/suggest_name.rs index 2021db3aba..f91b2fe44e 100644 --- a/crates/ide_assists/src/utils/suggest_name.rs +++ b/crates/ide_assists/src/utils/suggest_name.rs @@ -135,7 +135,7 @@ fn normalize(name: &str) -> Option { } fn is_valid_name(name: &str) -> bool { - match syntax::lex_single_syntax_kind(name) { + match parser::LexedStr::single_token(name) { Some((syntax::SyntaxKind::IDENT, _error)) => true, _ => false, } diff --git a/crates/ide_db/Cargo.toml b/crates/ide_db/Cargo.toml index ea20f5372c..cfcf9f56c8 100644 --- a/crates/ide_db/Cargo.toml +++ b/crates/ide_db/Cargo.toml @@ -22,6 +22,7 @@ arrayvec = "0.7" indexmap = "1.7" stdx = { path = "../stdx", version = "0.0.0" } +parser = { path = "../parser", version = "0.0.0" } syntax = { path = "../syntax", version = "0.0.0" } text_edit = { path = "../text_edit", version = "0.0.0" } base_db = { path = "../base_db", version = "0.0.0" } diff --git a/crates/ide_db/src/rename.rs b/crates/ide_db/src/rename.rs index 60160f9553..188499db72 100644 --- a/crates/ide_db/src/rename.rs +++ b/crates/ide_db/src/rename.rs @@ -28,7 +28,7 @@ use hir::{AsAssocItem, FieldSource, HasSource, InFile, ModuleSource, Semantics}; use stdx::never; use syntax::{ ast::{self, HasName}, - lex_single_syntax_kind, AstNode, SyntaxKind, TextRange, T, + AstNode, SyntaxKind, TextRange, T, }; use text_edit::{TextEdit, TextEditBuilder}; @@ -490,7 +490,7 @@ pub enum IdentifierKind { impl IdentifierKind { pub fn classify(new_name: &str) -> Result { - match lex_single_syntax_kind(new_name) { + match parser::LexedStr::single_token(new_name) { Some(res) => match res { (SyntaxKind::IDENT, _) => Ok(IdentifierKind::Ident), (T![_], _) => Ok(IdentifierKind::Underscore), diff --git a/crates/ide_ssr/Cargo.toml b/crates/ide_ssr/Cargo.toml index efa8fd243a..9a8221ac6c 100644 --- a/crates/ide_ssr/Cargo.toml +++ b/crates/ide_ssr/Cargo.toml @@ -16,6 +16,7 @@ rustc-hash = "1.1.0" itertools = "0.10.0" text_edit = { path = "../text_edit", version = "0.0.0" } +parser = { path = "../parser", version = "0.0.0" } syntax = { path = "../syntax", version = "0.0.0" } ide_db = { path = "../ide_db", version = "0.0.0" } hir = { path = "../hir", version = "0.0.0" } diff --git a/crates/ide_ssr/src/parsing.rs b/crates/ide_ssr/src/parsing.rs index ed7c033e27..ae7d5b4bf1 100644 --- a/crates/ide_ssr/src/parsing.rs +++ b/crates/ide_ssr/src/parsing.rs @@ -256,19 +256,13 @@ fn validate_rule(rule: &SsrRule) -> Result<(), SsrError> { } fn tokenize(source: &str) -> Result, SsrError> { - let mut start = 0; 
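The `suggest_name` and `rename` hunks above migrate from `syntax::lex_single_syntax_kind` to `parser::LexedStr::single_token`. As a minimal sketch (not part of the patch), the call pattern they now rely on looks like this, assuming the `parser` and `syntax` dependencies added in the Cargo.toml hunks:

```rust
// Sketch only: `single_token` returns `None` unless the input is exactly one token,
// so a name is a valid identifier iff that single token is an IDENT.
fn is_single_ident(name: &str) -> bool {
    matches!(
        parser::LexedStr::single_token(name),
        Some((syntax::SyntaxKind::IDENT, _error))
    )
}
```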
- let (raw_tokens, errors) = syntax::tokenize(source); - if let Some(first_error) = errors.first() { + let lexed = parser::LexedStr::new(source); + if let Some((_, first_error)) = lexed.errors().next() { bail!("Failed to parse pattern: {}", first_error); } let mut tokens: Vec = Vec::new(); - for raw_token in raw_tokens { - let token_len = usize::from(raw_token.len); - tokens.push(Token { - kind: raw_token.kind, - text: SmolStr::new(&source[start..start + token_len]), - }); - start += token_len; + for i in 0..lexed.len() { + tokens.push(Token { kind: lexed.kind(i), text: lexed.text(i).into() }); } Ok(tokens) } diff --git a/crates/mbe/src/syntax_bridge.rs b/crates/mbe/src/syntax_bridge.rs index 28a23f6be2..109842b0cd 100644 --- a/crates/mbe/src/syntax_bridge.rs +++ b/crates/mbe/src/syntax_bridge.rs @@ -4,10 +4,9 @@ use parser::{ParseError, TreeSink}; use rustc_hash::{FxHashMap, FxHashSet}; use syntax::{ ast::{self, make::tokens::doc_comment}, - tokenize, AstToken, Parse, PreorderWithTokens, SmolStr, SyntaxElement, SyntaxKind, + AstToken, Parse, PreorderWithTokens, SmolStr, SyntaxElement, SyntaxKind, SyntaxKind::*, - SyntaxNode, SyntaxToken, SyntaxTreeBuilder, TextRange, TextSize, Token as RawToken, WalkEvent, - T, + SyntaxNode, SyntaxToken, SyntaxTreeBuilder, TextRange, TextSize, WalkEvent, T, }; use tt::buffer::{Cursor, TokenBuffer}; @@ -69,15 +68,14 @@ pub fn token_tree_to_syntax_node( /// Convert a string to a `TokenTree` pub fn parse_to_token_tree(text: &str) -> Option<(tt::Subtree, TokenMap)> { - let (tokens, errors) = tokenize(text); - if !errors.is_empty() { + let lexed = parser::LexedStr::new(text); + if lexed.errors().next().is_some() { return None; } let mut conv = RawConvertor { - text, - offset: TextSize::default(), - inner: tokens.iter(), + lexed: lexed, + pos: 0, id_alloc: TokenIdAlloc { map: Default::default(), global_offset: TextSize::default(), @@ -146,7 +144,7 @@ fn convert_tokens(conv: &mut C) -> tt::Subtree { Some(it) => it, }; - let k: SyntaxKind = token.kind(); + let k: SyntaxKind = token.kind(&conv); if k == COMMENT { if let Some(tokens) = conv.convert_doc_comment(&token) { // FIXME: There has to be a better way to do this @@ -199,19 +197,19 @@ fn convert_tokens(conv: &mut C) -> tt::Subtree { } else { let spacing = match conv.peek() { Some(next) - if next.kind().is_trivia() - || next.kind() == T!['['] - || next.kind() == T!['{'] - || next.kind() == T!['('] => + if next.kind(&conv).is_trivia() + || next.kind(&conv) == T!['['] + || next.kind(&conv) == T!['{'] + || next.kind(&conv) == T!['('] => { tt::Spacing::Alone } - Some(next) if next.kind().is_punct() && next.kind() != UNDERSCORE => { + Some(next) if next.kind(&conv).is_punct() && next.kind(&conv) != UNDERSCORE => { tt::Spacing::Joint } _ => tt::Spacing::Alone, }; - let char = match token.to_char() { + let char = match token.to_char(&conv) { Some(c) => c, None => { panic!("Token from lexer must be single char: token = {:#?}", token); @@ -222,7 +220,7 @@ fn convert_tokens(conv: &mut C) -> tt::Subtree { } else { macro_rules! 
make_leaf { ($i:ident) => { - tt::$i { id: conv.id_alloc().alloc(range), text: token.to_text() }.into() + tt::$i { id: conv.id_alloc().alloc(range), text: token.to_text(conv) }.into() }; } let leaf: tt::Leaf = match k { @@ -243,7 +241,7 @@ fn convert_tokens(conv: &mut C) -> tt::Subtree { let r = TextRange::at(range.start() + char_unit, range.len() - char_unit); let ident = tt::Leaf::from(tt::Ident { - text: SmolStr::new(&token.to_text()[1..]), + text: SmolStr::new(&token.to_text(conv)[1..]), id: conv.id_alloc().alloc(r), }); result.push(ident.into()); @@ -392,22 +390,21 @@ impl TokenIdAlloc { /// A Raw Token (straightly from lexer) convertor struct RawConvertor<'a> { - text: &'a str, - offset: TextSize, + lexed: parser::LexedStr<'a>, + pos: usize, id_alloc: TokenIdAlloc, - inner: std::slice::Iter<'a, RawToken>, } -trait SrcToken: std::fmt::Debug { - fn kind(&self) -> SyntaxKind; +trait SrcToken: std::fmt::Debug { + fn kind(&self, ctx: &Ctx) -> SyntaxKind; - fn to_char(&self) -> Option; + fn to_char(&self, ctx: &Ctx) -> Option; - fn to_text(&self) -> SmolStr; + fn to_text(&self, ctx: &Ctx) -> SmolStr; } -trait TokenConvertor { - type Token: SrcToken; +trait TokenConvertor: Sized { + type Token: SrcToken; fn convert_doc_comment(&self, token: &Self::Token) -> Option>; @@ -418,42 +415,45 @@ trait TokenConvertor { fn id_alloc(&mut self) -> &mut TokenIdAlloc; } -impl<'a> SrcToken for (&'a RawToken, &'a str) { - fn kind(&self) -> SyntaxKind { - self.0.kind +impl<'a> SrcToken> for usize { + fn kind(&self, ctx: &RawConvertor<'a>) -> SyntaxKind { + ctx.lexed.kind(*self) } - fn to_char(&self) -> Option { - self.1.chars().next() + fn to_char(&self, ctx: &RawConvertor<'a>) -> Option { + ctx.lexed.text(*self).chars().next() } - fn to_text(&self) -> SmolStr { - self.1.into() + fn to_text(&self, ctx: &RawConvertor<'_>) -> SmolStr { + ctx.lexed.text(*self).into() } } impl<'a> TokenConvertor for RawConvertor<'a> { - type Token = (&'a RawToken, &'a str); + type Token = usize; - fn convert_doc_comment(&self, token: &Self::Token) -> Option> { - convert_doc_comment(&doc_comment(token.1)) + fn convert_doc_comment(&self, token: &usize) -> Option> { + let text = self.lexed.text(*token); + convert_doc_comment(&doc_comment(text)) } fn bump(&mut self) -> Option<(Self::Token, TextRange)> { - let token = self.inner.next()?; - let range = TextRange::at(self.offset, token.len); - self.offset += token.len; + if self.pos == self.lexed.len() { + return None; + } + let token = self.pos; + self.pos += 1; + let range = self.lexed.text_range(token); + let range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); - Some(((token, &self.text[range]), range)) + Some((token, range)) } fn peek(&self) -> Option { - let token = self.inner.as_slice().get(0); - - token.map(|it| { - let range = TextRange::at(self.offset, it.len); - (it, &self.text[range]) - }) + if self.pos == self.lexed.len() { + return None; + } + Some(self.pos) } fn id_alloc(&mut self) -> &mut TokenIdAlloc { @@ -523,17 +523,17 @@ impl SynToken { } } -impl SrcToken for SynToken { - fn kind(&self) -> SyntaxKind { +impl<'a> SrcToken> for SynToken { + fn kind(&self, _ctx: &Convertor<'a>) -> SyntaxKind { self.token().kind() } - fn to_char(&self) -> Option { + fn to_char(&self, _ctx: &Convertor<'a>) -> Option { match self { SynToken::Ordinary(_) => None, SynToken::Punch(it, i) => it.text().chars().nth((*i).into()), } } - fn to_text(&self) -> SmolStr { + fn to_text(&self, _ctx: &Convertor<'a>) -> SmolStr { self.token().text().into() } } diff 
--git a/crates/mbe/src/to_parser_tokens.rs b/crates/mbe/src/to_parser_tokens.rs index 644689f432..f419c78d46 100644 --- a/crates/mbe/src/to_parser_tokens.rs +++ b/crates/mbe/src/to_parser_tokens.rs @@ -1,7 +1,7 @@ //! Convert macro-by-example tokens which are specific to macro expansion into a //! format that works for our parser. -use syntax::{lex_single_syntax_kind, SyntaxKind, SyntaxKind::*, T}; +use syntax::{SyntaxKind, SyntaxKind::*, T}; use tt::buffer::TokenBuffer; pub(crate) fn to_parser_tokens(buffer: &TokenBuffer) -> parser::Tokens { @@ -35,7 +35,7 @@ pub(crate) fn to_parser_tokens(buffer: &TokenBuffer) -> parser::Tokens { let is_negated = lit.text.starts_with('-'); let inner_text = &lit.text[if is_negated { 1 } else { 0 }..]; - let kind = lex_single_syntax_kind(inner_text) + let kind = parser::LexedStr::single_token(inner_text) .map(|(kind, _error)| kind) .filter(|kind| { kind.is_literal() diff --git a/crates/parser/src/lexed_str.rs b/crates/parser/src/lexed_str.rs index 9c5d27f51d..1ef29b5210 100644 --- a/crates/parser/src/lexed_str.rs +++ b/crates/parser/src/lexed_str.rs @@ -8,6 +8,8 @@ //! Note that these tokens, unlike the tokens we feed into the parser, do //! include info about comments and whitespace. +use std::ops; + use crate::{ SyntaxKind::{self, *}, T, @@ -52,7 +54,7 @@ impl<'a> LexedStr<'a> { res } - pub fn single_token(text: &'a str) -> Option { + pub fn single_token(text: &'a str) -> Option<(SyntaxKind, Option)> { if text.is_empty() { return None; } @@ -63,11 +65,7 @@ impl<'a> LexedStr<'a> { } let (kind, err) = from_rustc(&token.kind, text); - if err.is_some() { - return None; - } - - Some(kind) + Some((kind, err.map(|it| it.to_owned()))) } pub fn as_str(&self) -> &str { @@ -78,16 +76,40 @@ impl<'a> LexedStr<'a> { self.kind.len() - 1 } + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + pub fn kind(&self, i: usize) -> SyntaxKind { assert!(i < self.len()); self.kind[i] } pub fn text(&self, i: usize) -> &str { + self.range_text(i..i + 1) + } + pub fn range_text(&self, r: ops::Range) -> &str { + assert!(r.start < r.end && r.end <= self.len()); + let lo = self.start[r.start] as usize; + let hi = self.start[r.end] as usize; + &self.text[lo..hi] + } + + // Naming is hard. 
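Note the behavioural change to `single_token` above: a lone malformed token is no longer rejected with `None`; the lexer error now comes back alongside the kind as an `Option<String>`. A hedged sketch of consuming that result (the input below is only an illustrative malformed literal):

```rust
// Sketch: the lexer error is reported instead of being swallowed.
fn classify_single_token(text: &str) {
    match parser::LexedStr::single_token(text) {
        Some((kind, Some(err))) => eprintln!("single {:?} token, but malformed: {}", kind, err),
        Some((kind, None)) => println!("well-formed single {:?} token", kind),
        None => println!("empty input, or more than one token"),
    }
}
```

For example, `classify_single_token("0b")` should hit the malformed arm: `0b` lexes as a single `INT_NUMBER` with a "missing digits after the integer base prefix" style error.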
+ pub fn text_range(&self, i: usize) -> ops::Range { assert!(i < self.len()); let lo = self.start[i] as usize; let hi = self.start[i + 1] as usize; - &self.text[lo..hi] + lo..hi + } + pub fn text_start(&self, i: usize) -> usize { + assert!(i <= self.len()); + self.start[i] as usize + } + pub fn text_len(&self, i: usize) -> usize { + assert!(i < self.len()); + let r = self.text_range(i); + r.end - r.start } pub fn error(&self, i: usize) -> Option<&str> { @@ -96,6 +118,10 @@ impl<'a> LexedStr<'a> { Some(self.error[err].msg.as_str()) } + pub fn errors(&self) -> impl Iterator + '_ { + self.error.iter().map(|it| (it.token as usize, it.msg.as_str())) + } + pub fn to_tokens(&self) -> crate::Tokens { let mut res = crate::Tokens::default(); let mut was_joint = false; diff --git a/crates/syntax/src/lib.rs b/crates/syntax/src/lib.rs index 07817bfc0d..65a6b7ac4e 100644 --- a/crates/syntax/src/lib.rs +++ b/crates/syntax/src/lib.rs @@ -48,7 +48,6 @@ use text_edit::Indel; pub use crate::{ ast::{AstNode, AstToken}, - parsing::lexer::{lex_single_syntax_kind, tokenize, Token}, ptr::{AstPtr, SyntaxNodePtr}, syntax_error::SyntaxError, syntax_node::{ diff --git a/crates/syntax/src/parsing.rs b/crates/syntax/src/parsing.rs index 865e146482..cba1ddde85 100644 --- a/crates/syntax/src/parsing.rs +++ b/crates/syntax/src/parsing.rs @@ -1,7 +1,6 @@ //! Lexing, bridging to parser (which does the actual parsing) and //! incremental reparsing. -pub(crate) mod lexer; mod text_tree_sink; mod reparsing; @@ -10,18 +9,17 @@ use text_tree_sink::TextTreeSink; use crate::{syntax_node::GreenNode, AstNode, SyntaxError, SyntaxNode}; -pub(crate) use crate::parsing::{lexer::*, reparsing::incremental_reparse}; +pub(crate) use crate::parsing::reparsing::incremental_reparse; pub(crate) fn parse_text(text: &str) -> (GreenNode, Vec) { - let (lexer_tokens, lexer_errors) = tokenize(text); - let parser_tokens = to_parser_tokens(text, &lexer_tokens); + let lexed = parser::LexedStr::new(text); + let parser_tokens = lexed.to_tokens(); - let mut tree_sink = TextTreeSink::new(text, &lexer_tokens); + let mut tree_sink = TextTreeSink::new(lexed); parser::parse_source_file(&parser_tokens, &mut tree_sink); - let (tree, mut parser_errors) = tree_sink.finish(); - parser_errors.extend(lexer_errors); + let (tree, parser_errors) = tree_sink.finish(); (tree, parser_errors) } @@ -31,14 +29,13 @@ pub(crate) fn parse_text_as( text: &str, entry_point: parser::ParserEntryPoint, ) -> Result { - let (lexer_tokens, lexer_errors) = tokenize(text); - if !lexer_errors.is_empty() { + let lexed = parser::LexedStr::new(text); + if lexed.errors().next().is_some() { return Err(()); } + let parser_tokens = lexed.to_tokens(); - let parser_tokens = to_parser_tokens(text, &lexer_tokens); - - let mut tree_sink = TextTreeSink::new(text, &lexer_tokens); + let mut tree_sink = TextTreeSink::new(lexed); // TextTreeSink assumes that there's at least some root node to which it can attach errors and // tokens. We arbitrarily give it a SourceFile. 
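The rewritten `parse_text`/`parse_text_as` above boil down to: lex, inspect `errors()`, then hand `to_tokens()` to the parser. A sketch of that flow outside the tree-sink machinery, using only the `LexedStr` methods introduced in this patch (it mirrors `parse_text_as`, which bails on the first lexer error):

```rust
// Sketch: lex, bail on the first lexer error, otherwise produce parser tokens.
fn lex_for_parser(text: &str) -> Result<parser::Tokens, String> {
    let lexed = parser::LexedStr::new(text);
    if let Some((i, msg)) = lexed.errors().next() {
        return Err(format!("lexer error in token {} at {:?}: {}", i, lexed.text_range(i), msg));
    }
    Ok(lexed.to_tokens()) // trivia handling is the parser/sink's job from here on
}
```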
@@ -54,29 +51,3 @@ pub(crate) fn parse_text_as( SyntaxNode::new_root(tree).first_child().and_then(T::cast).ok_or(()) } - -pub(crate) fn to_parser_tokens(text: &str, lexer_tokens: &[lexer::Token]) -> ::parser::Tokens { - let mut off = 0; - let mut res = parser::Tokens::default(); - let mut was_joint = false; - for t in lexer_tokens { - if t.kind.is_trivia() { - was_joint = false; - } else { - if t.kind == SyntaxKind::IDENT { - let token_text = &text[off..][..usize::from(t.len)]; - let contextual_kw = - SyntaxKind::from_contextual_keyword(token_text).unwrap_or(SyntaxKind::IDENT); - res.push_ident(contextual_kw); - } else { - if was_joint { - res.was_joint(); - } - res.push(t.kind); - } - was_joint = true; - } - off += usize::from(t.len); - } - res -} diff --git a/crates/syntax/src/parsing/lexer.rs b/crates/syntax/src/parsing/lexer.rs deleted file mode 100644 index d94f5f067d..0000000000 --- a/crates/syntax/src/parsing/lexer.rs +++ /dev/null @@ -1,249 +0,0 @@ -//! Lexer analyzes raw input string and produces lexemes (tokens). -//! It is just a bridge to `rustc_lexer`. - -use std::convert::TryInto; - -use rustc_lexer::RawStrError; - -use crate::{ - SyntaxError, - SyntaxKind::{self, *}, - TextRange, TextSize, T, -}; - -/// A token of Rust source. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Token { - /// The kind of token. - pub kind: SyntaxKind, - /// The length of the token. - pub len: TextSize, -} - -/// Break a string up into its component tokens. -/// Beware that it checks for shebang first and its length contributes to resulting -/// tokens offsets. -pub fn tokenize(text: &str) -> (Vec, Vec) { - // non-empty string is a precondition of `rustc_lexer::strip_shebang()`. - if text.is_empty() { - return Default::default(); - } - - let mut tokens = Vec::new(); - let mut errors = Vec::new(); - - let mut offset = match rustc_lexer::strip_shebang(text) { - Some(shebang_len) => { - tokens.push(Token { kind: SHEBANG, len: shebang_len.try_into().unwrap() }); - shebang_len - } - None => 0, - }; - - let text_without_shebang = &text[offset..]; - - for rustc_token in rustc_lexer::tokenize(text_without_shebang) { - let token_len: TextSize = rustc_token.len.try_into().unwrap(); - let token_range = TextRange::at(offset.try_into().unwrap(), token_len); - - let (syntax_kind, err_message) = - rustc_token_kind_to_syntax_kind(&rustc_token.kind, &text[token_range]); - - tokens.push(Token { kind: syntax_kind, len: token_len }); - - if let Some(err_message) = err_message { - errors.push(SyntaxError::new(err_message, token_range)); - } - - offset += rustc_token.len; - } - - (tokens, errors) -} - -/// Returns `SyntaxKind` and `Option` if `text` parses as a single token. -/// -/// Returns `None` if the string contains zero *or two or more* tokens. -/// The token is malformed if the returned error is not `None`. -/// -/// Beware that unescape errors are not checked at tokenization time. -pub fn lex_single_syntax_kind(text: &str) -> Option<(SyntaxKind, Option)> { - let (first_token, err) = lex_first_token(text)?; - if first_token.len != TextSize::of(text) { - return None; - } - Some((first_token.kind, err)) -} - -/// Returns `SyntaxKind` and `Option` of the first token -/// encountered at the beginning of the string. -/// -/// Returns `None` if the string contains zero tokens or if the token was parsed -/// with an error. -/// The token is malformed if the returned error is not `None`. -/// -/// Beware that unescape errors are not checked at tokenization time. 
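The `to_parser_tokens` helper and the `Token { kind, len }` lexer being deleted here worked off a flat token list of lengths plus a running text offset; the `LexedStr` that replaces them is indexed by token position instead. A minimal sketch of that indexing model, assuming only the accessors added in `lexed_str.rs`:

```rust
// Sketch: position-indexed access replaces the `Vec<Token>` + running-offset pattern.
fn dump_tokens(source: &str) {
    let lexed = parser::LexedStr::new(source);
    for i in 0..lexed.len() {
        // `kind`, `text_range` and `text` are all keyed by the token index.
        println!("{:?} {:?} {:?}", lexed.kind(i), lexed.text_range(i), lexed.text(i));
    }
}
```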
-fn lex_first_token(text: &str) -> Option<(Token, Option)> { - // non-empty string is a precondition of `rustc_lexer::first_token()`. - if text.is_empty() { - return None; - } - - let rustc_token = rustc_lexer::first_token(text); - let (syntax_kind, err_message) = rustc_token_kind_to_syntax_kind(&rustc_token.kind, text); - - let token = Token { kind: syntax_kind, len: rustc_token.len.try_into().unwrap() }; - let optional_error = err_message - .map(|err_message| SyntaxError::new(err_message, TextRange::up_to(TextSize::of(text)))); - - Some((token, optional_error)) -} - -/// Returns `SyntaxKind` and an optional tokenize error message. -fn rustc_token_kind_to_syntax_kind( - rustc_token_kind: &rustc_lexer::TokenKind, - token_text: &str, -) -> (SyntaxKind, Option<&'static str>) { - // A note on an intended tradeoff: - // We drop some useful information here (see patterns with double dots `..`) - // Storing that info in `SyntaxKind` is not possible due to its layout requirements of - // being `u16` that come from `rowan::SyntaxKind`. - - let syntax_kind = { - match rustc_token_kind { - rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT, - - rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated: true } => COMMENT, - rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated: false } => { - return ( - COMMENT, - Some("Missing trailing `*/` symbols to terminate the block comment"), - ); - } - - rustc_lexer::TokenKind::Whitespace => WHITESPACE, - - rustc_lexer::TokenKind::Ident => { - if token_text == "_" { - UNDERSCORE - } else { - SyntaxKind::from_keyword(token_text).unwrap_or(IDENT) - } - } - - rustc_lexer::TokenKind::RawIdent => IDENT, - rustc_lexer::TokenKind::Literal { kind, .. } => return match_literal_kind(kind), - - rustc_lexer::TokenKind::Lifetime { starts_with_number: false } => LIFETIME_IDENT, - rustc_lexer::TokenKind::Lifetime { starts_with_number: true } => { - return (LIFETIME_IDENT, Some("Lifetime name cannot start with a number")) - } - - rustc_lexer::TokenKind::Semi => T![;], - rustc_lexer::TokenKind::Comma => T![,], - rustc_lexer::TokenKind::Dot => T![.], - rustc_lexer::TokenKind::OpenParen => T!['('], - rustc_lexer::TokenKind::CloseParen => T![')'], - rustc_lexer::TokenKind::OpenBrace => T!['{'], - rustc_lexer::TokenKind::CloseBrace => T!['}'], - rustc_lexer::TokenKind::OpenBracket => T!['['], - rustc_lexer::TokenKind::CloseBracket => T![']'], - rustc_lexer::TokenKind::At => T![@], - rustc_lexer::TokenKind::Pound => T![#], - rustc_lexer::TokenKind::Tilde => T![~], - rustc_lexer::TokenKind::Question => T![?], - rustc_lexer::TokenKind::Colon => T![:], - rustc_lexer::TokenKind::Dollar => T![$], - rustc_lexer::TokenKind::Eq => T![=], - rustc_lexer::TokenKind::Bang => T![!], - rustc_lexer::TokenKind::Lt => T![<], - rustc_lexer::TokenKind::Gt => T![>], - rustc_lexer::TokenKind::Minus => T![-], - rustc_lexer::TokenKind::And => T![&], - rustc_lexer::TokenKind::Or => T![|], - rustc_lexer::TokenKind::Plus => T![+], - rustc_lexer::TokenKind::Star => T![*], - rustc_lexer::TokenKind::Slash => T![/], - rustc_lexer::TokenKind::Caret => T![^], - rustc_lexer::TokenKind::Percent => T![%], - rustc_lexer::TokenKind::Unknown => ERROR, - } - }; - - return (syntax_kind, None); - - fn match_literal_kind(kind: &rustc_lexer::LiteralKind) -> (SyntaxKind, Option<&'static str>) { - let mut err = ""; - let syntax_kind = match *kind { - rustc_lexer::LiteralKind::Int { empty_int, base: _ } => { - if empty_int { - err = "Missing digits after the integer base prefix"; - } - INT_NUMBER 
- } - rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => { - if empty_exponent { - err = "Missing digits after the exponent symbol"; - } - FLOAT_NUMBER - } - rustc_lexer::LiteralKind::Char { terminated } => { - if !terminated { - err = "Missing trailing `'` symbol to terminate the character literal"; - } - CHAR - } - rustc_lexer::LiteralKind::Byte { terminated } => { - if !terminated { - err = "Missing trailing `'` symbol to terminate the byte literal"; - } - BYTE - } - rustc_lexer::LiteralKind::Str { terminated } => { - if !terminated { - err = "Missing trailing `\"` symbol to terminate the string literal"; - } - STRING - } - rustc_lexer::LiteralKind::ByteStr { terminated } => { - if !terminated { - err = "Missing trailing `\"` symbol to terminate the byte string literal"; - } - BYTE_STRING - } - rustc_lexer::LiteralKind::RawStr { err: raw_str_err, .. } => { - if let Some(raw_str_err) = raw_str_err { - err = match raw_str_err { - RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw string literal", - RawStrError::NoTerminator { expected, found, .. } => if expected == found { - "Missing trailing `\"` to terminate the raw string literal" - } else { - "Missing trailing `\"` with `#` symbols to terminate the raw string literal" - }, - RawStrError::TooManyDelimiters { .. } => "Too many `#` symbols: raw strings may be delimited by up to 65535 `#` symbols", - }; - }; - STRING - } - rustc_lexer::LiteralKind::RawByteStr { err: raw_str_err, .. } => { - if let Some(raw_str_err) = raw_str_err { - err = match raw_str_err { - RawStrError::InvalidStarter { .. } => "Missing `\"` symbol after `#` symbols to begin the raw byte string literal", - RawStrError::NoTerminator { expected, found, .. } => if expected == found { - "Missing trailing `\"` to terminate the raw byte string literal" - } else { - "Missing trailing `\"` with `#` symbols to terminate the raw byte string literal" - }, - RawStrError::TooManyDelimiters { .. 
} => "Too many `#` symbols: raw byte strings may be delimited by up to 65535 `#` symbols", - }; - }; - - BYTE_STRING - } - }; - - let err = if err.is_empty() { None } else { Some(err) }; - - (syntax_kind, err) - } -} diff --git a/crates/syntax/src/parsing/reparsing.rs b/crates/syntax/src/parsing/reparsing.rs index 62f39a9347..e9567a838c 100644 --- a/crates/syntax/src/parsing/reparsing.rs +++ b/crates/syntax/src/parsing/reparsing.rs @@ -10,11 +10,7 @@ use parser::Reparser; use text_edit::Indel; use crate::{ - parsing::{ - lexer::{lex_single_syntax_kind, tokenize, Token}, - text_tree_sink::TextTreeSink, - to_parser_tokens, - }, + parsing::text_tree_sink::TextTreeSink, syntax_node::{GreenNode, GreenToken, NodeOrToken, SyntaxElement, SyntaxNode}, SyntaxError, SyntaxKind::*, @@ -53,7 +49,7 @@ fn reparse_token( } let mut new_text = get_text_after_edit(prev_token.clone().into(), edit); - let (new_token_kind, new_err) = lex_single_syntax_kind(&new_text)?; + let (new_token_kind, new_err) = parser::LexedStr::single_token(&new_text)?; if new_token_kind != prev_token_kind || (new_token_kind == IDENT && is_contextual_kw(&new_text)) @@ -66,7 +62,7 @@ fn reparse_token( // `b` no longer remains an identifier, but becomes a part of byte string literal if let Some(next_char) = root.text().char_at(prev_token.text_range().end()) { new_text.push(next_char); - let token_with_next_char = lex_single_syntax_kind(&new_text); + let token_with_next_char = parser::LexedStr::single_token(&new_text); if let Some((_kind, _error)) = token_with_next_char { return None; } @@ -74,9 +70,10 @@ fn reparse_token( } let new_token = GreenToken::new(rowan::SyntaxKind(prev_token_kind.into()), &new_text); + let range = TextRange::up_to(TextSize::of(&new_text)); Some(( prev_token.replace_with(new_token), - new_err.into_iter().collect(), + new_err.into_iter().map(|msg| SyntaxError::new(msg, range)).collect(), prev_token.text_range(), )) } @@ -91,17 +88,17 @@ fn reparse_block( let (node, reparser) = find_reparsable_node(root, edit.delete)?; let text = get_text_after_edit(node.clone().into(), edit); - let (lexer_tokens, new_lexer_errors) = tokenize(&text); - if !is_balanced(&lexer_tokens) { + let lexed = parser::LexedStr::new(text.as_str()); + let parser_tokens = lexed.to_tokens(); + if !is_balanced(&lexed) { return None; } - let parser_tokens = to_parser_tokens(&text, &lexer_tokens); - let mut tree_sink = TextTreeSink::new(&text, &lexer_tokens); + let mut tree_sink = TextTreeSink::new(lexed); + reparser.parse(&parser_tokens, &mut tree_sink); - let (green, mut new_parser_errors) = tree_sink.finish(); - new_parser_errors.extend(new_lexer_errors); + let (green, new_parser_errors) = tree_sink.finish(); Some((node.replace_with(green), new_parser_errors, node.text_range())) } @@ -131,16 +128,13 @@ fn find_reparsable_node(node: &SyntaxNode, range: TextRange) -> Option<(SyntaxNo }) } -fn is_balanced(tokens: &[Token]) -> bool { - if tokens.is_empty() - || tokens.first().unwrap().kind != T!['{'] - || tokens.last().unwrap().kind != T!['}'] - { +fn is_balanced(lexed: &parser::LexedStr<'_>) -> bool { + if lexed.is_empty() || lexed.kind(0) != T!['{'] || lexed.kind(lexed.len() - 1) != T!['}'] { return false; } let mut balance = 0usize; - for t in &tokens[1..tokens.len() - 1] { - match t.kind { + for i in 1..lexed.len() - 1 { + match lexed.kind(i) { T!['{'] => balance += 1, T!['}'] => { balance = match balance.checked_sub(1) { diff --git a/crates/syntax/src/parsing/text_tree_sink.rs b/crates/syntax/src/parsing/text_tree_sink.rs index 
c1792199fd..c9e7feb965 100644 --- a/crates/syntax/src/parsing/text_tree_sink.rs +++ b/crates/syntax/src/parsing/text_tree_sink.rs @@ -2,25 +2,22 @@ use std::mem; -use parser::{ParseError, TreeSink}; +use parser::{LexedStr, ParseError, TreeSink}; use crate::{ ast, - parsing::Token, syntax_node::GreenNode, SyntaxError, SyntaxKind::{self, *}, - SyntaxTreeBuilder, TextRange, TextSize, + SyntaxTreeBuilder, TextRange, }; /// Bridges the parser with our specific syntax tree representation. /// /// `TextTreeSink` also handles attachment of trivia (whitespace) to nodes. pub(crate) struct TextTreeSink<'a> { - text: &'a str, - tokens: &'a [Token], - text_pos: TextSize, - token_pos: usize, + lexed: LexedStr<'a>, + pos: usize, state: State, inner: SyntaxTreeBuilder, } @@ -39,12 +36,7 @@ impl<'a> TreeSink for TextTreeSink<'a> { State::Normal => (), } self.eat_trivias(); - let n_tokens = n_tokens as usize; - let len = self.tokens[self.token_pos..self.token_pos + n_tokens] - .iter() - .map(|it| it.len) - .sum::(); - self.do_token(kind, len, n_tokens); + self.do_token(kind, n_tokens as usize); } fn start_node(&mut self, kind: SyntaxKind) { @@ -60,20 +52,12 @@ impl<'a> TreeSink for TextTreeSink<'a> { } let n_trivias = - self.tokens[self.token_pos..].iter().take_while(|it| it.kind.is_trivia()).count(); - let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias]; - let mut trivia_end = - self.text_pos + leading_trivias.iter().map(|it| it.len).sum::(); - - let n_attached_trivias = { - let leading_trivias = leading_trivias.iter().rev().map(|it| { - let next_end = trivia_end - it.len; - let range = TextRange::new(next_end, trivia_end); - trivia_end = next_end; - (it.kind, &self.text[range]) - }); - n_attached_trivias(kind, leading_trivias) - }; + (self.pos..self.lexed.len()).take_while(|&it| self.lexed.kind(it).is_trivia()).count(); + let leading_trivias = self.pos..self.pos + n_trivias; + let n_attached_trivias = n_attached_trivias( + kind, + leading_trivias.rev().map(|it| (self.lexed.kind(it), self.lexed.text(it))), + ); self.eat_n_trivias(n_trivias - n_attached_trivias); self.inner.start_node(kind); self.eat_n_trivias(n_attached_trivias); @@ -88,20 +72,14 @@ impl<'a> TreeSink for TextTreeSink<'a> { } fn error(&mut self, error: ParseError) { - self.inner.error(error, self.text_pos); + let text_pos = self.lexed.text_start(self.pos).try_into().unwrap(); + self.inner.error(error, text_pos); } } impl<'a> TextTreeSink<'a> { - pub(super) fn new(text: &'a str, tokens: &'a [Token]) -> Self { - Self { - text, - tokens, - text_pos: 0.into(), - token_pos: 0, - state: State::PendingStart, - inner: SyntaxTreeBuilder::default(), - } + pub(super) fn new(lexed: parser::LexedStr<'a>) -> Self { + Self { lexed, pos: 0, state: State::PendingStart, inner: SyntaxTreeBuilder::default() } } pub(super) fn finish_eof(mut self) -> (GreenNode, Vec, bool) { @@ -113,8 +91,17 @@ impl<'a> TextTreeSink<'a> { State::PendingStart | State::Normal => unreachable!(), } - let (node, errors) = self.inner.finish_raw(); - let is_eof = self.token_pos == self.tokens.len(); + let (node, mut errors) = self.inner.finish_raw(); + for (i, err) in self.lexed.errors() { + let text_range = self.lexed.text_range(i); + let text_range = TextRange::new( + text_range.start.try_into().unwrap(), + text_range.end.try_into().unwrap(), + ); + errors.push(SyntaxError::new(err, text_range)) + } + + let is_eof = self.pos == self.lexed.len(); (node, errors, is_eof) } @@ -125,27 +112,26 @@ impl<'a> TextTreeSink<'a> { } fn eat_trivias(&mut self) { - while 
let Some(&token) = self.tokens.get(self.token_pos) { - if !token.kind.is_trivia() { + while self.pos < self.lexed.len() { + let kind = self.lexed.kind(self.pos); + if !kind.is_trivia() { break; } - self.do_token(token.kind, token.len, 1); + self.do_token(kind, 1); } } fn eat_n_trivias(&mut self, n: usize) { for _ in 0..n { - let token = self.tokens[self.token_pos]; - assert!(token.kind.is_trivia()); - self.do_token(token.kind, token.len, 1); + let kind = self.lexed.kind(self.pos); + assert!(kind.is_trivia()); + self.do_token(kind, 1); } } - fn do_token(&mut self, kind: SyntaxKind, len: TextSize, n_tokens: usize) { - let range = TextRange::at(self.text_pos, len); - let text = &self.text[range]; - self.text_pos += len; - self.token_pos += n_tokens; + fn do_token(&mut self, kind: SyntaxKind, n_tokens: usize) { + let text = &self.lexed.range_text(self.pos..self.pos + n_tokens); + self.pos += n_tokens; self.inner.token(kind, text); } } diff --git a/crates/syntax/src/tests.rs b/crates/syntax/src/tests.rs index 022db39f33..69c5b1cd35 100644 --- a/crates/syntax/src/tests.rs +++ b/crates/syntax/src/tests.rs @@ -3,7 +3,6 @@ mod sourcegen_ast; mod ast_src; use std::{ - fmt::Write, fs, path::{Path, PathBuf}, }; @@ -13,25 +12,7 @@ use expect_test::expect_file; use rayon::prelude::*; use test_utils::{bench, bench_fixture, project_root}; -use crate::{ast, fuzz, tokenize, AstNode, SourceFile, SyntaxError, TextRange, TextSize, Token}; - -#[test] -fn lexer_tests() { - // FIXME: - // * Add tests for unicode escapes in byte-character and [raw]-byte-string literals - // * Add tests for unescape errors - - dir_tests(&test_data_dir(), &["lexer/ok"], "txt", |text, path| { - let (tokens, errors) = tokenize(text); - assert_errors_are_absent(&errors, path); - dump_tokens_and_errors(&tokens, &errors, text) - }); - dir_tests(&test_data_dir(), &["lexer/err"], "txt", |text, path| { - let (tokens, errors) = tokenize(text); - assert_errors_are_present(&errors, path); - dump_tokens_and_errors(&tokens, &errors, text) - }); -} +use crate::{ast, fuzz, AstNode, SourceFile, SyntaxError}; #[test] fn parse_smoke_test() { @@ -206,22 +187,6 @@ fn assert_errors_are_absent(errors: &[SyntaxError], path: &Path) { ); } -fn dump_tokens_and_errors(tokens: &[Token], errors: &[SyntaxError], text: &str) -> String { - let mut acc = String::new(); - let mut offset: TextSize = 0.into(); - for token in tokens { - let token_len = token.len; - let token_text = &text[TextRange::at(offset, token.len)]; - offset += token.len; - writeln!(acc, "{:?} {:?} {:?}", token.kind, token_len, token_text).unwrap(); - } - for err in errors { - writeln!(acc, "> error{:?} token({:?}) msg({})", err.range(), &text[err.range()], err) - .unwrap(); - } - acc -} - fn fragment_parser_dir_test(ok_paths: &[&str], err_paths: &[&str], f: F) where T: crate::AstNode,
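Since `LexedStr` reports errors as `(token index, message)` pairs rather than ready-made `SyntaxError`s, callers such as the reworked `TextTreeSink::finish_eof` above convert them back. A sketch of that conversion as a free helper; the function itself is hypothetical (the patch does this inline), but it uses only APIs shown in the diff:

```rust
use std::convert::TryInto;

// Hypothetical helper: turn LexedStr lexer errors into syntax::SyntaxError values,
// the way the reworked finish_eof does inline.
fn lexer_errors(lexed: &parser::LexedStr<'_>) -> Vec<syntax::SyntaxError> {
    lexed
        .errors()
        .map(|(i, msg)| {
            let r = lexed.text_range(i); // byte range of the offending token
            let range = syntax::TextRange::new(
                r.start.try_into().unwrap(), // usize -> TextSize
                r.end.try_into().unwrap(),
            );
            syntax::SyntaxError::new(msg, range)
        })
        .collect()
}
```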