Cleanup character classes

This commit is contained in:
Aleksey Kladov 2017-12-30 00:48:47 +03:00
parent 171baf4c48
commit 770ecd4ecd
8 changed files with 73 additions and 31 deletions

View file

@ -1,6 +1,8 @@
Grammar( Grammar(
syntax_kinds: [ syntax_kinds: [
"ERROR",
"IDENT", "IDENT",
"UNDERSCORE",
"WHITESPACE", "WHITESPACE",
] ]
) )

22
src/lexer/classes.rs Normal file
View file

@ -0,0 +1,22 @@
use unicode_xid::UnicodeXID;
/// Reports whether `c` may begin an identifier.
///
/// ASCII letters and `_` are accepted directly; any character above
/// `'\x7f'` is accepted when `UnicodeXID::is_xid_start` says so.
pub fn is_ident_start(c: char) -> bool {
    if c <= '\x7f' {
        // ASCII range: decided locally, the Unicode predicate is never consulted.
        c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
    } else {
        UnicodeXID::is_xid_start(c)
    }
}
/// Reports whether `c` may appear after the first character of an identifier.
///
/// ASCII letters, ASCII digits and `_` are accepted directly; any character
/// above `'\x7f'` is accepted when `UnicodeXID::is_xid_continue` says so.
pub fn is_ident_continue(c: char) -> bool {
    if c <= '\x7f' {
        // ASCII range: decided locally, the Unicode predicate is never consulted.
        c == '_'
            || ('a' <= c && c <= 'z')
            || ('A' <= c && c <= 'Z')
            || ('0' <= c && c <= '9')
    } else {
        UnicodeXID::is_xid_continue(c)
    }
}
/// Reports whether `c` is treated as whitespace by the lexer.
///
/// For now this simply defers to the standard library's `char::is_whitespace`.
pub fn is_whitespace(c: char) -> bool {
    // FIXME: use is_pattern_whitespace
    // https://github.com/behnam/rust-unic/issues/192
    char::is_whitespace(c)
}

View file

@ -1,11 +1,12 @@
use unicode_xid::UnicodeXID;
use {Token, SyntaxKind}; use {Token, SyntaxKind};
use syntax_kinds::*; use syntax_kinds::*;
mod ptr; mod ptr;
use self::ptr::Ptr; use self::ptr::Ptr;
mod classes;
use self::classes::*;
pub fn next_token(text: &str) -> Token { pub fn next_token(text: &str) -> Token {
assert!(!text.is_empty()); assert!(!text.is_empty());
let mut ptr = Ptr::new(text); let mut ptr = Ptr::new(text);
@ -19,38 +20,20 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
// Note: r as in r" or r#" is part of a raw string literal, // Note: r as in r" or r#" is part of a raw string literal,
// b as in b' is part of a byte literal. // b as in b' is part of a byte literal.
// They are not identifiers, and are handled further down. // They are not identifiers, and are handled further down.
let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
if ident_start { if ident_start {
loop { ptr.bump_while(is_ident_continue);
match ptr.next() { return IDENT;
Some(c) if ident_continue(c) => {
ptr.bump();
},
_ => break,
}
}
IDENT
} else {
WHITESPACE
}
} }
fn ident_start(c: char) -> bool { if is_whitespace(c) {
(c >= 'a' && c <= 'z') ptr.bump_while(is_whitespace);
|| (c >= 'A' && c <= 'Z') return WHITESPACE;
|| c == '_'
|| (c > '\x7f' && UnicodeXID::is_xid_start(c))
} }
fn ident_continue(c: char) -> bool { return ERROR
(c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| (c >= '0' && c <= '9')
|| c == '_'
|| (c > '\x7f' && UnicodeXID::is_xid_continue(c))
} }
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
match (c, c1, c2) { match (c, c1, c2) {
('r', Some('"'), _) | ('r', Some('"'), _) |

View file

@ -32,6 +32,17 @@ impl<'s> Ptr<'s> {
Some(ch) Some(ch)
} }
/// Calls `bump` repeatedly for as long as `next` yields a character that
/// satisfies `pred`.
///
/// Stops at the first character rejected by `pred`, or when `next` returns
/// `None`. NOTE(review): this relies on `next` not advancing the pointer by
/// itself — confirm against the rest of `Ptr`.
pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) {
    while let Some(ch) = self.next() {
        if !pred(ch) {
            break;
        }
        self.bump();
    }
}
fn chars(&self) -> Chars { fn chars(&self) -> Chars {
self.text[self.len.0 as usize ..].chars() self.text[self.len.0 as usize ..].chars()
} }

View file

@ -1,11 +1,15 @@
// Generated from grammar.ron // Generated from grammar.ron
use tree::{SyntaxKind, SyntaxInfo}; use tree::{SyntaxKind, SyntaxInfo};
pub const IDENT: SyntaxKind = SyntaxKind(0); pub const ERROR: SyntaxKind = SyntaxKind(0);
pub const WHITESPACE: SyntaxKind = SyntaxKind(1); pub const IDENT: SyntaxKind = SyntaxKind(1);
pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
static INFOS: [SyntaxInfo; 2] = [ static INFOS: [SyntaxInfo; 4] = [
SyntaxInfo { name: "ERROR" },
SyntaxInfo { name: "IDENT" }, SyntaxInfo { name: "IDENT" },
SyntaxInfo { name: "UNDERSCORE" },
SyntaxInfo { name: "WHITESPACE" }, SyntaxInfo { name: "WHITESPACE" },
]; ];

View file

@ -0,0 +1,4 @@
a b c
d
e f

View file

@ -0,0 +1,12 @@
IDENT 1
WHITESPACE 1
IDENT 1
WHITESPACE 2
IDENT 1
WHITESPACE 1
IDENT 1
WHITESPACE 2
IDENT 1
WHITESPACE 1
IDENT 1
WHITESPACE 1

4
validation.md Normal file
View file

@ -0,0 +1,4 @@
Fixmes:
* Fix `is_whitespace`, add more tests