mirror of
https://github.com/rust-lang/rust-analyzer
synced 2025-01-14 14:13:58 +00:00
Auto merge of #17037 - davidsemakula:token-set-collisions, r=Veykril
internal: improve `TokenSet` implementation and add reserved keywords

The current `TokenSet` type represents "A bit-set of `SyntaxKind`s" as a newtype `u128`. Internally, the flag for each `SyntaxKind` variant in the bit-set is set as the n-th LSB (least significant bit) via a bit-wise left shift operation, where n is the discriminant.

Edit: This is problematic because there are currently ~121 token `SyntaxKind`s, so adding new token kinds for missing reserved keywords increases the number of token `SyntaxKind`s above 128, thus making this ["mask"](https://github.com/rust-lang/rust-analyzer/blob/7a8374c162/crates/parser/src/token_set.rs#L31-L33) operation overflow.

~~This is problematic because there are currently 266 `SyntaxKind`s, so this ["mask"](https://github.com/rust-lang/rust-analyzer/blob/7a8374c162/crates/parser/src/token_set.rs#L31-L33) operation silently overflows in release mode.~~ ~~This leads to a single flag/bit in the bit-set being shared by multiple `SyntaxKind`s~~.

This PR:
- Changes the wrapped type for `TokenSet` from `u128` to `[u64; 3]` ~~`[u*; N]` (currently `[u16; 17]`) where `u*` can be any desirable unsigned integer type and `N` is the minimum array length needed to represent all token `SyntaxKind`s without any collisions~~.
- Edit: Add an assertion that `TokenSet`s only include token `SyntaxKind`s.
- Edit: Add ~7 missing [reserved keywords](https://doc.rust-lang.org/stable/reference/keywords.html#reserved-keywords).
- ~~Moves the definition of the `TokenSet` type to grammar codegen in xtask, so that `N` is adjusted automatically (depending on the chosen `u*` "base" type) when new `SyntaxKind`s are added~~.
- ~~Updates the `token_set_works_for_tokens` unit test to include the `__LAST` `SyntaxKind` as a way of catching overflows in tests.~~

~~Currently `u16` is arbitrarily chosen as the `u*` "base" type mostly because it strikes a good balance (IMO) between unused bits and readability of the generated `TokenSet` code (especially the [`union` method](https://github.com/rust-lang/rust-analyzer/blob/7a8374c162/crates/parser/src/token_set.rs#L26-L28)), but I'm open to other suggestions or a better methodology for choosing the `u*` type.~~

~~I considered using a third-party crate for the bit-set, but a direct implementation seems simple enough without adding any new dependencies. I'm not strongly opposed to using a third-party crate though, if that's preferred.~~

~~Finally, I haven't had the chance to review issues to figure out whether there are any parser issues caused by collisions due to the current implementation that may be fixed by this PR - I just stumbled upon the issue while adding "new" keywords to solve #16858~~

Edit: fixes #16858
This commit is contained in:
commit
e64610dbbe
4 changed files with 101 additions and 20 deletions
|
@ -1620,4 +1620,50 @@ mod foo {
|
||||||
"#,
|
"#,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn preserve_raw_identifiers_strict() {
|
||||||
|
check_assist(
|
||||||
|
auto_import,
|
||||||
|
r"
|
||||||
|
r#as$0
|
||||||
|
|
||||||
|
pub mod ffi_mod {
|
||||||
|
pub fn r#as() {};
|
||||||
|
}
|
||||||
|
",
|
||||||
|
r"
|
||||||
|
use ffi_mod::r#as;
|
||||||
|
|
||||||
|
r#as
|
||||||
|
|
||||||
|
pub mod ffi_mod {
|
||||||
|
pub fn r#as() {};
|
||||||
|
}
|
||||||
|
",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn preserve_raw_identifiers_reserved() {
|
||||||
|
check_assist(
|
||||||
|
auto_import,
|
||||||
|
r"
|
||||||
|
r#abstract$0
|
||||||
|
|
||||||
|
pub mod ffi_mod {
|
||||||
|
pub fn r#abstract() {};
|
||||||
|
}
|
||||||
|
",
|
||||||
|
r"
|
||||||
|
use ffi_mod::r#abstract;
|
||||||
|
|
||||||
|
r#abstract
|
||||||
|
|
||||||
|
pub mod ffi_mod {
|
||||||
|
pub fn r#abstract() {};
|
||||||
|
}
|
||||||
|
",
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -4,34 +4,48 @@ use crate::SyntaxKind;
|
||||||
|
|
||||||
/// A bit-set of `SyntaxKind`s
|
/// A bit-set of `SyntaxKind`s
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
pub(crate) struct TokenSet(u128);
|
pub(crate) struct TokenSet([u64; 3]);
|
||||||
|
|
||||||
|
/// `TokenSet`s should only include token `SyntaxKind`s, so the discriminant of any passed/included
|
||||||
|
/// `SyntaxKind` must *not* be greater than that of the last token `SyntaxKind`.
|
||||||
|
/// See #17037.
|
||||||
|
const LAST_TOKEN_KIND_DISCRIMINANT: usize = SyntaxKind::SHEBANG as usize;
|
||||||
|
|
||||||
impl TokenSet {
|
impl TokenSet {
|
||||||
pub(crate) const EMPTY: TokenSet = TokenSet(0);
|
pub(crate) const EMPTY: TokenSet = TokenSet([0; 3]);
|
||||||
|
|
||||||
pub(crate) const fn new(kinds: &[SyntaxKind]) -> TokenSet {
|
pub(crate) const fn new(kinds: &[SyntaxKind]) -> TokenSet {
|
||||||
let mut res = 0u128;
|
let mut res = [0; 3];
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
while i < kinds.len() {
|
while i < kinds.len() {
|
||||||
res |= mask(kinds[i]);
|
let discriminant = kinds[i] as usize;
|
||||||
|
debug_assert!(
|
||||||
|
discriminant <= LAST_TOKEN_KIND_DISCRIMINANT,
|
||||||
|
"Expected a token `SyntaxKind`"
|
||||||
|
);
|
||||||
|
let idx = discriminant / 64;
|
||||||
|
res[idx] |= 1 << (discriminant % 64);
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
TokenSet(res)
|
TokenSet(res)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) const fn union(self, other: TokenSet) -> TokenSet {
|
pub(crate) const fn union(self, other: TokenSet) -> TokenSet {
|
||||||
TokenSet(self.0 | other.0)
|
TokenSet([self.0[0] | other.0[0], self.0[1] | other.0[1], self.0[2] | other.0[2]])
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) const fn contains(&self, kind: SyntaxKind) -> bool {
|
pub(crate) const fn contains(&self, kind: SyntaxKind) -> bool {
|
||||||
self.0 & mask(kind) != 0
|
let discriminant = kind as usize;
|
||||||
|
debug_assert!(
|
||||||
|
discriminant <= LAST_TOKEN_KIND_DISCRIMINANT,
|
||||||
|
"Expected a token `SyntaxKind`"
|
||||||
|
);
|
||||||
|
let idx = discriminant / 64;
|
||||||
|
let mask = 1 << (discriminant % 64);
|
||||||
|
self.0[idx] & mask != 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const fn mask(kind: SyntaxKind) -> u128 {
|
|
||||||
1u128 << (kind as usize)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn token_set_works_for_tokens() {
|
fn token_set_works_for_tokens() {
|
||||||
use crate::SyntaxKind::*;
|
use crate::SyntaxKind::*;
|
||||||
|
|
|
@ -65,11 +65,11 @@ pub(crate) const KINDS_SRC: KindsSrc<'_> = KindsSrc {
|
||||||
(">>=", "SHREQ"),
|
(">>=", "SHREQ"),
|
||||||
],
|
],
|
||||||
keywords: &[
|
keywords: &[
|
||||||
"as", "async", "await", "box", "break", "const", "continue", "crate", "do", "dyn", "else",
|
"abstract", "as", "async", "await", "become", "box", "break", "const", "continue", "crate",
|
||||||
"enum", "extern", "false", "fn", "for", "if", "impl", "in", "let", "loop", "macro",
|
"do", "dyn", "else", "enum", "extern", "false", "final", "fn", "for", "if", "impl", "in",
|
||||||
"match", "mod", "move", "mut", "pub", "ref", "return", "become", "self", "Self", "static",
|
"let", "loop", "macro", "match", "mod", "move", "mut", "override", "priv", "pub", "ref",
|
||||||
"struct", "super", "trait", "true", "try", "type", "unsafe", "use", "where", "while",
|
"return", "self", "Self", "static", "struct", "super", "trait", "true", "try", "type",
|
||||||
"yield",
|
"typeof", "unsafe", "unsized", "use", "virtual", "where", "while", "yield",
|
||||||
],
|
],
|
||||||
contextual_keywords: &[
|
contextual_keywords: &[
|
||||||
"auto",
|
"auto",
|
||||||
|
|
Loading…
Reference in a new issue