Cleanup character classes

This commit is contained in:
Aleksey Kladov 2017-12-30 00:48:47 +03:00
parent 171baf4c48
commit 770ecd4ecd
8 changed files with 73 additions and 31 deletions

View file

@ -1,6 +1,8 @@
Grammar(
syntax_kinds: [
"ERROR",
"IDENT",
"UNDERSCORE",
"WHITESPACE",
]
)

22
src/lexer/classes.rs Normal file
View file

@ -0,0 +1,22 @@
use unicode_xid::UnicodeXID;
/// Reports whether `c` may begin an identifier.
///
/// ASCII letters and `_` are accepted directly; a character above `'\x7f'`
/// is accepted when `UnicodeXID::is_xid_start` accepts it.
pub fn is_ident_start(c: char) -> bool {
    if c <= '\x7f' {
        // ASCII range: decided by plain comparisons, the XID check is never consulted.
        return c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    }
    UnicodeXID::is_xid_start(c)
}
/// Reports whether `c` may appear after the first character of an identifier.
///
/// ASCII letters, ASCII digits and `_` are accepted directly; a character
/// above `'\x7f'` is accepted when `UnicodeXID::is_xid_continue` accepts it.
pub fn is_ident_continue(c: char) -> bool {
    if c > '\x7f' {
        // Outside ASCII none of the explicit ranges can match, so only the XID check applies.
        return UnicodeXID::is_xid_continue(c);
    }
    let letter = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    let digit = '0' <= c && c <= '9';
    letter || digit || c == '_'
}
/// Reports whether `c` is treated as whitespace.
///
/// At present this is exactly the answer given by `char::is_whitespace`.
pub fn is_whitespace(c: char) -> bool {
    // FIXME: use is_pattern_whitespace
    // https://github.com/behnam/rust-unic/issues/192
    char::is_whitespace(c)
}

View file

@ -1,11 +1,12 @@
use unicode_xid::UnicodeXID;
use {Token, SyntaxKind};
use syntax_kinds::*;
mod ptr;
use self::ptr::Ptr;
mod classes;
use self::classes::*;
pub fn next_token(text: &str) -> Token {
assert!(!text.is_empty());
let mut ptr = Ptr::new(text);
@ -19,38 +20,20 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
// Note: r as in r" or r#" is part of a raw string literal,
// b as in b' is part of a byte literal.
// They are not identifiers, and are handled further down.
let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
if ident_start {
loop {
match ptr.next() {
Some(c) if ident_continue(c) => {
ptr.bump();
},
_ => break,
ptr.bump_while(is_ident_continue);
return IDENT;
}
if is_whitespace(c) {
ptr.bump_while(is_whitespace);
return WHITESPACE;
}
IDENT
} else {
WHITESPACE
}
}
fn ident_start(c: char) -> bool {
(c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| c == '_'
|| (c > '\x7f' && UnicodeXID::is_xid_start(c))
return ERROR
}
/// Reports whether `c` may appear after the first character of an identifier:
/// an ASCII letter, an ASCII digit, `_`, or a character above `'\x7f'` that
/// `UnicodeXID::is_xid_continue` accepts.
fn ident_continue(c: char) -> bool {
    if c <= '\x7f' {
        // ASCII range: the XID check is never reached for these characters.
        let alpha = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
        let numeric = '0' <= c && c <= '9';
        return alpha || numeric || c == '_';
    }
    UnicodeXID::is_xid_continue(c)
}
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
match (c, c1, c2) {
('r', Some('"'), _) |

View file

@ -32,6 +32,17 @@ impl<'s> Ptr<'s> {
Some(ch)
}
/// Calls `self.bump()` repeatedly for as long as `self.next()` yields a
/// character that satisfies `pred`.
///
/// Stops as soon as `self.next()` returns `None` or a character for which
/// `pred` is false; that character is not bumped.
pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) {
    while let Some(c) = self.next() {
        if !pred(c) {
            break;
        }
        self.bump();
    }
}
/// Returns a `Chars` iterator over the part of `self.text` that begins at
/// the index held in `self.len.0`.
fn chars(&self) -> Chars {
    let start = self.len.0 as usize;
    let rest = &self.text[start..];
    rest.chars()
}

View file

@ -1,11 +1,15 @@
// Generated from grammar.ron
use tree::{SyntaxKind, SyntaxInfo};
pub const IDENT: SyntaxKind = SyntaxKind(0);
pub const WHITESPACE: SyntaxKind = SyntaxKind(1);
pub const ERROR: SyntaxKind = SyntaxKind(0);
pub const IDENT: SyntaxKind = SyntaxKind(1);
pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
static INFOS: [SyntaxInfo; 2] = [
static INFOS: [SyntaxInfo; 4] = [
SyntaxInfo { name: "ERROR" },
SyntaxInfo { name: "IDENT" },
SyntaxInfo { name: "UNDERSCORE" },
SyntaxInfo { name: "WHITESPACE" },
];

View file

@ -0,0 +1,4 @@
a b c
d
e f

View file

@ -0,0 +1,12 @@
IDENT 1
WHITESPACE 1
IDENT 1
WHITESPACE 2
IDENT 1
WHITESPACE 1
IDENT 1
WHITESPACE 2
IDENT 1
WHITESPACE 1
IDENT 1
WHITESPACE 1

4
validation.md Normal file
View file

@ -0,0 +1,4 @@
Fixmes:
* Fix `is_whitespace`, add more tests