Lexer: start numbers

This commit is contained in:
Aleksey Kladov 2017-12-30 15:22:40 +03:00
parent 8103772a10
commit ddc637c161
8 changed files with 176 additions and 12 deletions

View file

@ -4,5 +4,7 @@ Grammar(
"IDENT", "IDENT",
"UNDERSCORE", "UNDERSCORE",
"WHITESPACE", "WHITESPACE",
"INT_NUMBER",
"FLOAT_NUMBER",
] ]
) )

View file

@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool {
//https://github.com/behnam/rust-unic/issues/192 //https://github.com/behnam/rust-unic/issues/192
c.is_whitespace() c.is_whitespace()
} }
/// Returns `true` when `c` is one of the ASCII decimal digits `'0'` through `'9'`.
///
/// Non-ASCII digits (for example Arabic-Indic numerals) are rejected.
pub fn is_dec_digit(c: char) -> bool {
    c.is_ascii_digit()
}

View file

@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
// They are not identifiers, and are handled further down. // They are not identifiers, and are handled further down.
let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
if ident_start { if ident_start {
let is_single_letter = match ptr.next() { return scan_ident(c, ptr);
None => true,
Some(c) if !is_ident_continue(c) => true,
_ => false,
};
if is_single_letter {
return if c == '_' { UNDERSCORE } else { IDENT };
}
ptr.bump_while(is_ident_continue);
return IDENT;
} }
if is_whitespace(c) { if is_whitespace(c) {
@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
return WHITESPACE; return WHITESPACE;
} }
if is_dec_digit(c) {
return scan_number(c, ptr);
}
ERROR ERROR
} }
/// Scans an identifier-like token that starts with `c`.
///
/// `ptr.next()` is inspected as the character following `c`. When it is an
/// identifier-continue character, every following identifier-continue
/// character is consumed and the token is `IDENT`. Otherwise the token is the
/// single character `c`: `UNDERSCORE` for `_`, `IDENT` for anything else.
fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
    let has_continuation = ptr.next().map_or(false, is_ident_continue);
    if has_continuation {
        ptr.bump_while(is_ident_continue);
        return IDENT;
    }
    // A lone character: only a bare `_` gets its own token kind.
    if c == '_' {
        UNDERSCORE
    } else {
        IDENT
    }
}
/// Scans a numeric literal whose first character is the decimal digit `c` and
/// classifies it as `INT_NUMBER` or `FLOAT_NUMBER`.
///
/// `ptr.next()` is inspected as the character following `c`. Digits are not
/// checked against the literal's base here; the scan only decides how far the
/// token extends and whether it looks like an integer or a float.
fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind {
    if c != '0' {
        scan_digits(ptr, false);
    } else {
        match ptr.next() {
            // `0b` / `0o`: skip the base marker, then take decimal digits and `_`.
            Some('b') | Some('o') => {
                ptr.bump();
                scan_digits(ptr, false);
            }
            // `0x`: skip the base marker, then take hex digits and `_`.
            Some('x') => {
                ptr.bump();
                scan_digits(ptr, true);
            }
            // NOTE(review): hex digits are allowed in this decimal branch, so a
            // following `e`/`E` is consumed as a digit and e.g. `0e1279` ends
            // up as a single INT_NUMBER rather than a float. Presumably this
            // should pass `false`; confirm, and update the expected lexer
            // output together with any change.
            Some('0'...'9') | Some('_') | Some('.') | Some('e') | Some('E') => {
                scan_digits(ptr, true);
            }
            // A lone `0` (end of input or any other character follows).
            _ => return INT_NUMBER,
        }
    }

    // A `.` starts a fractional part unless it begins a range (`0..2`) or a
    // field/method access (`12.foo()`), in which case the integer ends here.
    let has_fraction =
        ptr.next_is('.') && !ptr.nnext_is('.') && !ptr.nnext_is_p(is_ident_start);
    if has_fraction {
        // Consume the dot, then whatever digits and exponent follow it; both
        // are optional, so `0.` on its own is still a float.
        ptr.bump();
        scan_digits(ptr, false);
        scan_float_exponent(ptr);
        return FLOAT_NUMBER;
    }

    // Without a fractional part, an exponent marker still makes it a float.
    match ptr.next() {
        Some('e') | Some('E') => {
            scan_float_exponent(ptr);
            FLOAT_NUMBER
        }
        _ => INT_NUMBER,
    }
}
fn scan_digits(ptr: &mut Ptr, allow_hex: bool) {
while let Some(c) = ptr.next() {
match c {
'_' | '0'...'9' => {
ptr.bump();
}
'a'...'f' | 'A' ... 'F' if allow_hex => {
ptr.bump();
}
_ => return
}
}
}
/// Consumes an optional float exponent: `e` or `E`, an optional `+`/`-` sign,
/// then any run of decimal digits and `_`.
///
/// Does nothing when the next character is not an exponent marker. The digits
/// after the marker are not required, so a bare `e` is consumed on its own.
fn scan_float_exponent(ptr: &mut Ptr) {
    match ptr.next() {
        Some('e') | Some('E') => {}
        _ => return,
    }
    ptr.bump();

    let has_sign = match ptr.next() {
        Some('-') | Some('+') => true,
        _ => false,
    };
    if has_sign {
        ptr.bump();
    }

    scan_digits(ptr, false);
}
fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
match (c, c1, c2) { match (c, c1, c2) {
('r', Some('"'), _) | ('r', Some('"'), _) |

View file

@ -26,6 +26,18 @@ impl<'s> Ptr<'s> {
chars.next() chars.next()
} }
/// Returns `true` when `self.next()` yields exactly `c`; `false` for any
/// other character or when there is none.
pub fn next_is(&self, c: char) -> bool {
    match self.next() {
        Some(ch) => ch == c,
        None => false,
    }
}
/// Returns `true` when `self.nnext()` (presumably the character after the
/// next one) yields exactly `c`; `false` otherwise or when there is none.
pub fn nnext_is(&self, c: char) -> bool {
    self.nnext().map_or(false, |ch| ch == c)
}
/// Returns `true` when `self.nnext()` yields a character for which the
/// predicate `p` holds; `false` when `p` rejects it or there is no character.
pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool {
    self.nnext().map_or(false, p)
}
pub fn bump(&mut self) -> Option<char> { pub fn bump(&mut self) -> Option<char> {
let ch = self.chars().next()?; let ch = self.chars().next()?;
self.len += TextUnit::len_of_char(ch); self.len += TextUnit::len_of_char(ch);

View file

@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0);
pub const IDENT: SyntaxKind = SyntaxKind(1); pub const IDENT: SyntaxKind = SyntaxKind(1);
pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
pub const WHITESPACE: SyntaxKind = SyntaxKind(3); pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
pub const INT_NUMBER: SyntaxKind = SyntaxKind(4);
pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5);
static INFOS: [SyntaxInfo; 4] = [ static INFOS: [SyntaxInfo; 6] = [
SyntaxInfo { name: "ERROR" }, SyntaxInfo { name: "ERROR" },
SyntaxInfo { name: "IDENT" }, SyntaxInfo { name: "IDENT" },
SyntaxInfo { name: "UNDERSCORE" }, SyntaxInfo { name: "UNDERSCORE" },
SyntaxInfo { name: "WHITESPACE" }, SyntaxInfo { name: "WHITESPACE" },
SyntaxInfo { name: "INT_NUMBER" },
SyntaxInfo { name: "FLOAT_NUMBER" },
]; ];
pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo { pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {

View file

@ -0,0 +1,7 @@
0 0b 0o 0x 00 0_ 0. 0e 0E 0z
01790 0b1790 0o1790 0x1790aAbBcCdDeEfF 001279 0_1279 0.1279 0e1279 0E1279
0..2
0.foo()
0e+1
0.e+1
0.0E-2

View file

@ -0,0 +1,62 @@
INT_NUMBER 1
WHITESPACE 1
INT_NUMBER 2
WHITESPACE 1
INT_NUMBER 2
WHITESPACE 1
INT_NUMBER 2
WHITESPACE 1
INT_NUMBER 2
WHITESPACE 1
INT_NUMBER 2
WHITESPACE 1
FLOAT_NUMBER 2
WHITESPACE 1
INT_NUMBER 2
WHITESPACE 1
INT_NUMBER 2
WHITESPACE 1
INT_NUMBER 1
IDENT 1
WHITESPACE 1
INT_NUMBER 5
WHITESPACE 1
INT_NUMBER 6
WHITESPACE 1
INT_NUMBER 6
WHITESPACE 1
INT_NUMBER 18
WHITESPACE 1
INT_NUMBER 6
WHITESPACE 1
INT_NUMBER 6
WHITESPACE 1
FLOAT_NUMBER 6
WHITESPACE 1
INT_NUMBER 6
WHITESPACE 1
INT_NUMBER 6
WHITESPACE 1
INT_NUMBER 1
ERROR 1
ERROR 1
INT_NUMBER 1
WHITESPACE 1
INT_NUMBER 1
ERROR 1
IDENT 3
ERROR 1
ERROR 1
WHITESPACE 1
INT_NUMBER 2
ERROR 1
INT_NUMBER 1
WHITESPACE 1
INT_NUMBER 1
ERROR 1
IDENT 1
ERROR 1
INT_NUMBER 1
WHITESPACE 1
FLOAT_NUMBER 6
WHITESPACE 1

View file

@ -1,5 +1,7 @@
Fixmes: Fixmes:
* Fix `is_whitespace`, add more test * Fix `is_whitespace`, add more tests
* Add more thorough tests for idents for XID_Start & XID_Continue * Add more thorough tests for idents for XID_Start & XID_Continue
* Validate that float and integer literals use digits only of the appropriate
base, and are in range