mirror of
https://github.com/rust-lang/rust-analyzer
synced 2024-12-25 20:43:21 +00:00
Cleanup character classes
This commit is contained in:
parent
171baf4c48
commit
770ecd4ecd
8 changed files with 73 additions and 31 deletions
|
@ -1,6 +1,8 @@
|
||||||
Grammar(
|
Grammar(
|
||||||
syntax_kinds: [
|
syntax_kinds: [
|
||||||
|
"ERROR",
|
||||||
"IDENT",
|
"IDENT",
|
||||||
|
"UNDERSCORE",
|
||||||
"WHITESPACE",
|
"WHITESPACE",
|
||||||
]
|
]
|
||||||
)
|
)
|
22
src/lexer/classes.rs
Normal file
22
src/lexer/classes.rs
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
use unicode_xid::UnicodeXID;
|
||||||
|
|
||||||
|
pub fn is_ident_start(c: char) -> bool {
|
||||||
|
(c >= 'a' && c <= 'z')
|
||||||
|
|| (c >= 'A' && c <= 'Z')
|
||||||
|
|| c == '_'
|
||||||
|
|| (c > '\x7f' && UnicodeXID::is_xid_start(c))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_ident_continue(c: char) -> bool {
|
||||||
|
(c >= 'a' && c <= 'z')
|
||||||
|
|| (c >= 'A' && c <= 'Z')
|
||||||
|
|| (c >= '0' && c <= '9')
|
||||||
|
|| c == '_'
|
||||||
|
|| (c > '\x7f' && UnicodeXID::is_xid_continue(c))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if `c` has the Unicode `Pattern_White_Space` property.
///
/// This is the whitespace set used by Rust's lexical grammar. It differs from
/// `char::is_whitespace` (the Unicode `White_Space` property): for example
/// U+00A0 NO-BREAK SPACE is *not* accepted here, while the direction marks
/// U+200E and U+200F *are*.
///
/// The set is small and fixed, so it is spelled out explicitly instead of
/// depending on an external Unicode table.
pub fn is_whitespace(c: char) -> bool {
    match c {
        // Usual ASCII suspects
        '\u{0009}' // horizontal tab
        | '\u{000A}' // line feed
        | '\u{000B}' // vertical tab
        | '\u{000C}' // form feed
        | '\u{000D}' // carriage return
        | '\u{0020}' // space
        // NEXT LINE from latin1
        | '\u{0085}'
        // Bidi markers
        | '\u{200E}' // LEFT-TO-RIGHT MARK
        | '\u{200F}' // RIGHT-TO-LEFT MARK
        // Dedicated whitespace characters from Unicode
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR
        => true,
        _ => false,
    }
}
|
|
@ -1,11 +1,12 @@
|
||||||
use unicode_xid::UnicodeXID;
|
|
||||||
|
|
||||||
use {Token, SyntaxKind};
|
use {Token, SyntaxKind};
|
||||||
use syntax_kinds::*;
|
use syntax_kinds::*;
|
||||||
|
|
||||||
mod ptr;
|
mod ptr;
|
||||||
use self::ptr::Ptr;
|
use self::ptr::Ptr;
|
||||||
|
|
||||||
|
mod classes;
|
||||||
|
use self::classes::*;
|
||||||
|
|
||||||
pub fn next_token(text: &str) -> Token {
|
pub fn next_token(text: &str) -> Token {
|
||||||
assert!(!text.is_empty());
|
assert!(!text.is_empty());
|
||||||
let mut ptr = Ptr::new(text);
|
let mut ptr = Ptr::new(text);
|
||||||
|
@ -19,38 +20,20 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
|
||||||
// Note: r as in r" or r#" is part of a raw string literal,
|
// Note: r as in r" or r#" is part of a raw string literal,
|
||||||
// b as in b' is part of a byte literal.
|
// b as in b' is part of a byte literal.
|
||||||
// They are not identifiers, and are handled further down.
|
// They are not identifiers, and are handled further down.
|
||||||
let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
|
let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
|
||||||
if ident_start {
|
if ident_start {
|
||||||
loop {
|
ptr.bump_while(is_ident_continue);
|
||||||
match ptr.next() {
|
return IDENT;
|
||||||
Some(c) if ident_continue(c) => {
|
|
||||||
ptr.bump();
|
|
||||||
},
|
|
||||||
_ => break,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if is_whitespace(c) {
|
||||||
|
ptr.bump_while(is_whitespace);
|
||||||
|
return WHITESPACE;
|
||||||
}
|
}
|
||||||
IDENT
|
|
||||||
} else {
|
|
||||||
WHITESPACE
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn ident_start(c: char) -> bool {
|
return ERROR
|
||||||
(c >= 'a' && c <= 'z')
|
|
||||||
|| (c >= 'A' && c <= 'Z')
|
|
||||||
|| c == '_'
|
|
||||||
|| (c > '\x7f' && UnicodeXID::is_xid_start(c))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ident_continue(c: char) -> bool {
|
|
||||||
(c >= 'a' && c <= 'z')
|
|
||||||
|| (c >= 'A' && c <= 'Z')
|
|
||||||
|| (c >= '0' && c <= '9')
|
|
||||||
|| c == '_'
|
|
||||||
|| (c > '\x7f' && UnicodeXID::is_xid_continue(c))
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
|
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
|
||||||
match (c, c1, c2) {
|
match (c, c1, c2) {
|
||||||
('r', Some('"'), _) |
|
('r', Some('"'), _) |
|
||||||
|
|
|
@ -32,6 +32,17 @@ impl<'s> Ptr<'s> {
|
||||||
Some(ch)
|
Some(ch)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) {
|
||||||
|
loop {
|
||||||
|
match self.next() {
|
||||||
|
Some(c) if pred(c) => {
|
||||||
|
self.bump();
|
||||||
|
},
|
||||||
|
_ => return,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn chars(&self) -> Chars {
|
fn chars(&self) -> Chars {
|
||||||
self.text[self.len.0 as usize ..].chars()
|
self.text[self.len.0 as usize ..].chars()
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,15 @@
|
||||||
// Generated from grammar.ron
|
// Generated from grammar.ron
|
||||||
use tree::{SyntaxKind, SyntaxInfo};
|
use tree::{SyntaxKind, SyntaxInfo};
|
||||||
|
|
||||||
pub const IDENT: SyntaxKind = SyntaxKind(0);
|
pub const ERROR: SyntaxKind = SyntaxKind(0);
|
||||||
pub const WHITESPACE: SyntaxKind = SyntaxKind(1);
|
pub const IDENT: SyntaxKind = SyntaxKind(1);
|
||||||
|
pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
|
||||||
|
pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
|
||||||
|
|
||||||
static INFOS: [SyntaxInfo; 2] = [
|
static INFOS: [SyntaxInfo; 4] = [
|
||||||
|
SyntaxInfo { name: "ERROR" },
|
||||||
SyntaxInfo { name: "IDENT" },
|
SyntaxInfo { name: "IDENT" },
|
||||||
|
SyntaxInfo { name: "UNDERSCORE" },
|
||||||
SyntaxInfo { name: "WHITESPACE" },
|
SyntaxInfo { name: "WHITESPACE" },
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|
4
tests/data/lexer/0002_whitespace.rs
Normal file
4
tests/data/lexer/0002_whitespace.rs
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
a b c
|
||||||
|
d
|
||||||
|
|
||||||
|
e f
|
12
tests/data/lexer/0002_whitespace.txt
Normal file
12
tests/data/lexer/0002_whitespace.txt
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
IDENT 1
|
||||||
|
WHITESPACE 1
|
||||||
|
IDENT 1
|
||||||
|
WHITESPACE 2
|
||||||
|
IDENT 1
|
||||||
|
WHITESPACE 1
|
||||||
|
IDENT 1
|
||||||
|
WHITESPACE 2
|
||||||
|
IDENT 1
|
||||||
|
WHITESPACE 1
|
||||||
|
IDENT 1
|
||||||
|
WHITESPACE 1
|
4
validation.md
Normal file
4
validation.md
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
Fixmes:
|
||||||
|
|
||||||
|
* Fix `is_whitespace`, add more tests
|
||||||
|
|
Loading…
Reference in a new issue