From 75761c0e47d8c20a490a3d61ea64d2413d3c3570 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Mon, 22 Jul 2019 17:47:33 +0300 Subject: [PATCH] add rustc_lexer --- Cargo.lock | 10 ++ crates/ra_syntax/Cargo.toml | 1 + crates/ra_syntax/src/parsing/lexer.rs | 108 +++++++++++++++++- .../tests/data/lexer/0004_numbers.txt | 12 +- .../tests/data/lexer/0014_unclosed_char.txt | 2 +- .../parser/err/0002_duplicate_shebang.txt | 38 +++++- .../{err => ok}/0030_string_suffixes.rs | 0 .../{err => ok}/0030_string_suffixes.txt | 13 +-- 8 files changed, 159 insertions(+), 25 deletions(-) rename crates/ra_syntax/tests/data/parser/{err => ok}/0030_string_suffixes.rs (100%) rename crates/ra_syntax/tests/data/parser/{err => ok}/0030_string_suffixes.txt (86%) diff --git a/Cargo.lock b/Cargo.lock index 8feaf27ecb..d5474d6e28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1308,12 +1308,21 @@ dependencies = [ "serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "ra_rustc_lexer" +version = "0.1.0-pre.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "ra_syntax" version = "0.1.0" dependencies = [ "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "ra_parser 0.1.0", + "ra_rustc_lexer 0.1.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)", "ra_text_edit 0.1.0", "rowan 0.6.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)", "smol_str 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)", @@ -2250,6 +2259,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum proptest 0.9.4 (registry+https://github.com/rust-lang/crates.io-index)" = "cf147e022eacf0c8a054ab864914a7602618adba841d800a9a9868a5237a529f" "checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0" "checksum quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" +"checksum ra_rustc_lexer 0.1.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e8d92772f822978a6c9c4657aa61af439e4e635180628b3354049b283b749f1e" "checksum ra_vfs 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fb7cd4e302032c5ab514f1c01c89727cd96fd950dd36f9ebee9252df45d9fb1a" "checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" "checksum rand 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d47eab0e83d9693d40f825f86948aa16eff6750ead4bdffc4ab95b8b3a7f052c" diff --git a/crates/ra_syntax/Cargo.toml b/crates/ra_syntax/Cargo.toml index 97b6b047fa..9ef8dee5d6 100644 --- a/crates/ra_syntax/Cargo.toml +++ b/crates/ra_syntax/Cargo.toml @@ -11,6 +11,7 @@ repository = "https://github.com/rust-analyzer/rust-analyzer" unicode-xid = "0.1.0" itertools = "0.8.0" rowan = "0.6.0-pre.1" +ra_rustc_lexer = { version = "0.1.0-pre.1", features = [ "unicode-xid" ] } # ideally, `serde` should be enabled by `ra_lsp_server`, but we enable it here # to reduce number of compilations diff --git a/crates/ra_syntax/src/parsing/lexer.rs b/crates/ra_syntax/src/parsing/lexer.rs index 60cf37047d..1c818fdf41 100644 --- a/crates/ra_syntax/src/parsing/lexer.rs +++ b/crates/ra_syntax/src/parsing/lexer.rs @@ -30,19 +30,119 @@ pub struct Token { /// Break a string up into its component tokens pub fn tokenize(text: &str) -> Vec { + if text.is_empty() { + return vec![]; + } let mut text = text; let mut acc = Vec::new(); + if let Some(len) = ra_rustc_lexer::strip_shebang(text) { + acc.push(Token { kind: SHEBANG, len: TextUnit::from_usize(len) }); + text = &text[len..]; + } while !text.is_empty() { - let token = next_token(text); + let rustc_token = ra_rustc_lexer::first_token(text); + macro_rules! decompose { + ($t1:expr, $t2:expr) => {{ + acc.push(Token { kind: $t1, len: 1.into() }); + acc.push(Token { kind: $t2, len: 1.into() }); + text = &text[2..]; + continue; + }}; + ($t1:expr, $t2:expr, $t3:expr) => {{ + acc.push(Token { kind: $t1, len: 1.into() }); + acc.push(Token { kind: $t2, len: 1.into() }); + acc.push(Token { kind: $t3, len: 1.into() }); + text = &text[3..]; + continue; + }}; + } + let kind = match rustc_token.kind { + ra_rustc_lexer::TokenKind::LineComment => COMMENT, + ra_rustc_lexer::TokenKind::BlockComment { .. } => COMMENT, + ra_rustc_lexer::TokenKind::Whitespace => WHITESPACE, + ra_rustc_lexer::TokenKind::Ident => { + let token_text = &text[..rustc_token.len]; + if token_text == "_" { + UNDERSCORE + } else { + SyntaxKind::from_keyword(&text[..rustc_token.len]).unwrap_or(IDENT) + } + } + ra_rustc_lexer::TokenKind::RawIdent => IDENT, + ra_rustc_lexer::TokenKind::Literal { kind, .. } => match kind { + ra_rustc_lexer::LiteralKind::Int { .. } => INT_NUMBER, + ra_rustc_lexer::LiteralKind::Float { .. } => FLOAT_NUMBER, + ra_rustc_lexer::LiteralKind::Char { .. } => CHAR, + ra_rustc_lexer::LiteralKind::Byte { .. } => BYTE, + ra_rustc_lexer::LiteralKind::Str { .. } => STRING, + ra_rustc_lexer::LiteralKind::ByteStr { .. } => BYTE_STRING, + ra_rustc_lexer::LiteralKind::RawStr { .. } => RAW_STRING, + ra_rustc_lexer::LiteralKind::RawByteStr { .. } => RAW_BYTE_STRING, + }, + ra_rustc_lexer::TokenKind::Lifetime { .. } => LIFETIME, + ra_rustc_lexer::TokenKind::Semi => SEMI, + ra_rustc_lexer::TokenKind::Comma => COMMA, + ra_rustc_lexer::TokenKind::DotDotDot => decompose!(DOT, DOT, DOT), + ra_rustc_lexer::TokenKind::DotDotEq => decompose!(DOT, DOT, EQ), + ra_rustc_lexer::TokenKind::DotDot => decompose!(DOT, DOT), + ra_rustc_lexer::TokenKind::Dot => DOT, + ra_rustc_lexer::TokenKind::OpenParen => L_PAREN, + ra_rustc_lexer::TokenKind::CloseParen => R_PAREN, + ra_rustc_lexer::TokenKind::OpenBrace => L_CURLY, + ra_rustc_lexer::TokenKind::CloseBrace => R_CURLY, + ra_rustc_lexer::TokenKind::OpenBracket => L_BRACK, + ra_rustc_lexer::TokenKind::CloseBracket => R_BRACK, + ra_rustc_lexer::TokenKind::At => AT, + ra_rustc_lexer::TokenKind::Pound => POUND, + ra_rustc_lexer::TokenKind::Tilde => TILDE, + ra_rustc_lexer::TokenKind::Question => QUESTION, + ra_rustc_lexer::TokenKind::ColonColon => decompose!(COLON, COLON), + ra_rustc_lexer::TokenKind::Colon => COLON, + ra_rustc_lexer::TokenKind::Dollar => DOLLAR, + ra_rustc_lexer::TokenKind::EqEq => decompose!(EQ, EQ), + ra_rustc_lexer::TokenKind::Eq => EQ, + ra_rustc_lexer::TokenKind::FatArrow => decompose!(EQ, R_ANGLE), + ra_rustc_lexer::TokenKind::Ne => decompose!(EXCL, EQ), + ra_rustc_lexer::TokenKind::Not => EXCL, + ra_rustc_lexer::TokenKind::Le => decompose!(L_ANGLE, EQ), + ra_rustc_lexer::TokenKind::LArrow => decompose!(COLON, MINUS), + ra_rustc_lexer::TokenKind::Lt => L_ANGLE, + ra_rustc_lexer::TokenKind::ShlEq => decompose!(L_ANGLE, L_ANGLE, EQ), + ra_rustc_lexer::TokenKind::Shl => decompose!(L_ANGLE, L_ANGLE), + ra_rustc_lexer::TokenKind::Ge => decompose!(R_ANGLE, EQ), + ra_rustc_lexer::TokenKind::Gt => R_ANGLE, + ra_rustc_lexer::TokenKind::ShrEq => decompose!(R_ANGLE, R_ANGLE, EQ), + ra_rustc_lexer::TokenKind::Shr => decompose!(R_ANGLE, R_ANGLE), + ra_rustc_lexer::TokenKind::RArrow => decompose!(MINUS, R_ANGLE), + ra_rustc_lexer::TokenKind::Minus => MINUS, + ra_rustc_lexer::TokenKind::MinusEq => decompose!(MINUS, EQ), + ra_rustc_lexer::TokenKind::And => AMP, + ra_rustc_lexer::TokenKind::AndAnd => decompose!(AMP, AMP), + ra_rustc_lexer::TokenKind::AndEq => decompose!(AMP, EQ), + ra_rustc_lexer::TokenKind::Or => PIPE, + ra_rustc_lexer::TokenKind::OrOr => decompose!(PIPE, PIPE), + ra_rustc_lexer::TokenKind::OrEq => decompose!(PIPE, EQ), + ra_rustc_lexer::TokenKind::PlusEq => decompose!(PLUS, EQ), + ra_rustc_lexer::TokenKind::Plus => PLUS, + ra_rustc_lexer::TokenKind::StarEq => decompose!(STAR, EQ), + ra_rustc_lexer::TokenKind::Star => STAR, + ra_rustc_lexer::TokenKind::SlashEq => decompose!(SLASH, EQ), + ra_rustc_lexer::TokenKind::Slash => SLASH, + ra_rustc_lexer::TokenKind::CaretEq => decompose!(CARET, EQ), + ra_rustc_lexer::TokenKind::Caret => CARET, + ra_rustc_lexer::TokenKind::PercentEq => decompose!(PERCENT, EQ), + ra_rustc_lexer::TokenKind::Percent => PERCENT, + ra_rustc_lexer::TokenKind::Unknown => ERROR, + }; + let token = Token { kind, len: TextUnit::from_usize(rustc_token.len) }; acc.push(token); - let len: u32 = token.len.into(); - text = &text[len as usize..]; + text = &text[rustc_token.len..]; } acc } /// Get the next token from a string -pub fn next_token(text: &str) -> Token { +fn next_token(text: &str) -> Token { assert!(!text.is_empty()); let mut ptr = Ptr::new(text); let c = ptr.bump().unwrap(); diff --git a/crates/ra_syntax/tests/data/lexer/0004_numbers.txt b/crates/ra_syntax/tests/data/lexer/0004_numbers.txt index 39988aedcd..7bb89b8ae8 100644 --- a/crates/ra_syntax/tests/data/lexer/0004_numbers.txt +++ b/crates/ra_syntax/tests/data/lexer/0004_numbers.txt @@ -12,9 +12,9 @@ INT_NUMBER 2 "0_" WHITESPACE 1 " " FLOAT_NUMBER 2 "0." WHITESPACE 1 " " -INT_NUMBER 2 "0e" +FLOAT_NUMBER 2 "0e" WHITESPACE 1 " " -INT_NUMBER 2 "0E" +FLOAT_NUMBER 2 "0E" WHITESPACE 1 " " INT_NUMBER 2 "0z" WHITESPACE 1 "\n" @@ -32,9 +32,9 @@ INT_NUMBER 6 "0_1279" WHITESPACE 1 " " FLOAT_NUMBER 6 "0.1279" WHITESPACE 1 " " -INT_NUMBER 6 "0e1279" +FLOAT_NUMBER 6 "0e1279" WHITESPACE 1 " " -INT_NUMBER 6 "0E1279" +FLOAT_NUMBER 6 "0E1279" WHITESPACE 1 "\n" INT_NUMBER 1 "0" DOT 1 "." @@ -47,9 +47,7 @@ IDENT 3 "foo" L_PAREN 1 "(" R_PAREN 1 ")" WHITESPACE 1 "\n" -INT_NUMBER 2 "0e" -PLUS 1 "+" -INT_NUMBER 1 "1" +FLOAT_NUMBER 4 "0e+1" WHITESPACE 1 "\n" INT_NUMBER 1 "0" DOT 1 "." diff --git a/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt b/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt index 812dfbc18d..737a300ee7 100644 --- a/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt +++ b/crates/ra_syntax/tests/data/lexer/0014_unclosed_char.txt @@ -1 +1 @@ -CHAR 2 "\'1" +LIFETIME 2 "\'1" diff --git a/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt b/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt index 76d186a3ce..84867026f4 100644 --- a/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt +++ b/crates/ra_syntax/tests/data/parser/err/0002_duplicate_shebang.txt @@ -1,7 +1,39 @@ SOURCE_FILE@[0; 42) SHEBANG@[0; 20) "#!/use/bin/env rusti" WHITESPACE@[20; 21) "\n" - ERROR@[21; 41) - SHEBANG@[21; 41) "#!/use/bin/env rusti" + ATTR@[21; 23) + POUND@[21; 22) "#" + EXCL@[22; 23) "!" + ERROR@[23; 24) + SLASH@[23; 24) "/" + USE_ITEM@[24; 28) + USE_KW@[24; 27) "use" + ERROR@[27; 28) + SLASH@[27; 28) "/" + MACRO_CALL@[28; 31) + PATH@[28; 31) + PATH_SEGMENT@[28; 31) + NAME_REF@[28; 31) + IDENT@[28; 31) "bin" + ERROR@[31; 32) + SLASH@[31; 32) "/" + MACRO_CALL@[32; 41) + PATH@[32; 35) + PATH_SEGMENT@[32; 35) + NAME_REF@[32; 35) + IDENT@[32; 35) "env" + WHITESPACE@[35; 36) " " + NAME@[36; 41) + IDENT@[36; 41) "rusti" WHITESPACE@[41; 42) "\n" -error 21: expected an item +error 23: expected `[` +error 23: expected an item +error 27: expected one of `*`, `::`, `{`, `self`, `super` or an indentifier +error 28: expected SEMI +error 31: expected EXCL +error 31: expected `{`, `[`, `(` +error 31: expected SEMI +error 31: expected an item +error 35: expected EXCL +error 41: expected `{`, `[`, `(` +error 41: expected SEMI diff --git a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs similarity index 100% rename from crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.rs rename to crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.rs diff --git a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt similarity index 86% rename from crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt rename to crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt index b0acfa5d25..4f7e809c5d 100644 --- a/crates/ra_syntax/tests/data/parser/err/0030_string_suffixes.txt +++ b/crates/ra_syntax/tests/data/parser/ok/0030_string_suffixes.txt @@ -11,7 +11,7 @@ SOURCE_FILE@[0; 112) BLOCK@[10; 111) L_CURLY@[10; 11) "{" WHITESPACE@[11; 16) "\n " - LET_STMT@[16; 27) + LET_STMT@[16; 31) LET_KW@[16; 19) "let" WHITESPACE@[19; 20) " " PLACEHOLDER_PAT@[20; 21) @@ -19,14 +19,8 @@ SOURCE_FILE@[0; 112) WHITESPACE@[21; 22) " " EQ@[22; 23) "=" WHITESPACE@[23; 24) " " - LITERAL@[24; 27) - CHAR@[24; 27) "\'c\'" - EXPR_STMT@[27; 31) - PATH_EXPR@[27; 30) - PATH@[27; 30) - PATH_SEGMENT@[27; 30) - NAME_REF@[27; 30) - IDENT@[27; 30) "u32" + LITERAL@[24; 30) + CHAR@[24; 30) "\'c\'u32" SEMI@[30; 31) ";" WHITESPACE@[31; 36) "\n " LET_STMT@[36; 60) @@ -67,4 +61,3 @@ SOURCE_FILE@[0; 112) WHITESPACE@[109; 110) "\n" R_CURLY@[110; 111) "}" WHITESPACE@[111; 112) "\n" -error 27: expected SEMI