From ddc637c16120fb352183698f635fc93a68580f7b Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Sat, 30 Dec 2017 15:22:40 +0300 Subject: [PATCH] Lexer: start numbers --- grammar.ron | 2 + src/lexer/classes.rs | 4 ++ src/lexer/mod.rs | 91 ++++++++++++++++++++++++++++---- src/lexer/ptr.rs | 12 +++++ src/syntax_kinds.rs | 6 ++- tests/data/lexer/0004_number.rs | 7 +++ tests/data/lexer/0004_number.txt | 62 ++++++++++++++++++++++ validation.md | 4 +- 8 files changed, 176 insertions(+), 12 deletions(-) create mode 100644 tests/data/lexer/0004_number.rs create mode 100644 tests/data/lexer/0004_number.txt diff --git a/grammar.ron b/grammar.ron index 49b9c527c1..a86fe693fc 100644 --- a/grammar.ron +++ b/grammar.ron @@ -4,5 +4,7 @@ Grammar( "IDENT", "UNDERSCORE", "WHITESPACE", + "INT_NUMBER", + "FLOAT_NUMBER", ] ) \ No newline at end of file diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs index 7cc050bde4..4235d2648a 100644 --- a/src/lexer/classes.rs +++ b/src/lexer/classes.rs @@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool { //https://github.com/behnam/rust-unic/issues/192 c.is_whitespace() } + +pub fn is_dec_digit(c: char) -> bool { + '0' <= c && c <= '9' +} diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 83a411cdd0..afbbee4d0d 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { // They are not identifiers, and are handled further down. let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); if ident_start { - let is_single_letter = match ptr.next() { - None => true, - Some(c) if !is_ident_continue(c) => true, - _ => false, - }; - if is_single_letter { - return if c == '_' { UNDERSCORE } else { IDENT }; - } - ptr.bump_while(is_ident_continue); - return IDENT; + return scan_ident(c, ptr); } if is_whitespace(c) { @@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { return WHITESPACE; } + if is_dec_digit(c) { + return scan_number(c, ptr); + } + ERROR } +fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { + let is_single_letter = match ptr.next() { + None => true, + Some(c) if !is_ident_continue(c) => true, + _ => false, + }; + if is_single_letter { + return if c == '_' { UNDERSCORE } else { IDENT }; + } + ptr.bump_while(is_ident_continue); + IDENT +} + +fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { + if c == '0' { + match ptr.next().unwrap_or('\0') { + 'b' | 'o' => { + ptr.bump(); + scan_digits(ptr, false); + } + 'x' => { + ptr.bump(); + scan_digits(ptr, true); + } + '0'...'9' | '_' | '.' | 'e' | 'E' => { + scan_digits(ptr, true); + } + _ => return INT_NUMBER, + } + } else { + scan_digits(ptr, false); + } + + // might be a float, but don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) { + // might have stuff after the ., and if it does, it needs to start + // with a number + ptr.bump(); + scan_digits(ptr, false); + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + // it might be a float if it has an exponent + if ptr.next_is('e') || ptr.next_is('E') { + scan_float_exponent(ptr); + return FLOAT_NUMBER; + } + INT_NUMBER +} + +fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { + while let Some(c) = ptr.next() { + match c { + '_' | '0'...'9' => { + ptr.bump(); + } + 'a'...'f' | 'A' ... 'F' if allow_hex => { + ptr.bump(); + } + _ => return + } + } +} + +fn scan_float_exponent(ptr: &mut Ptr) { + if ptr.next_is('e') || ptr.next_is('E') { + ptr.bump(); + if ptr.next_is('-') || ptr.next_is('+') { + ptr.bump(); + } + scan_digits(ptr, false); + } +} + fn string_literal_start(c: char, c1: Option, c2: Option) -> bool { match (c, c1, c2) { ('r', Some('"'), _) | diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index e8aa6f37b4..d441b826bc 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs @@ -26,6 +26,18 @@ impl<'s> Ptr<'s> { chars.next() } + pub fn next_is(&self, c: char) -> bool { + self.next() == Some(c) + } + + pub fn nnext_is(&self, c: char) -> bool { + self.nnext() == Some(c) + } + + pub fn nnext_is_p bool>(&self, p: P) -> bool { + self.nnext().map(p) == Some(true) + } + pub fn bump(&mut self) -> Option { let ch = self.chars().next()?; self.len += TextUnit::len_of_char(ch); diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs index b9b47a2ede..bd1265bdea 100644 --- a/src/syntax_kinds.rs +++ b/src/syntax_kinds.rs @@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0); pub const IDENT: SyntaxKind = SyntaxKind(1); pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); pub const WHITESPACE: SyntaxKind = SyntaxKind(3); +pub const INT_NUMBER: SyntaxKind = SyntaxKind(4); +pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5); -static INFOS: [SyntaxInfo; 4] = [ +static INFOS: [SyntaxInfo; 6] = [ SyntaxInfo { name: "ERROR" }, SyntaxInfo { name: "IDENT" }, SyntaxInfo { name: "UNDERSCORE" }, SyntaxInfo { name: "WHITESPACE" }, + SyntaxInfo { name: "INT_NUMBER" }, + SyntaxInfo { name: "FLOAT_NUMBER" }, ]; pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo { diff --git a/tests/data/lexer/0004_number.rs b/tests/data/lexer/0004_number.rs new file mode 100644 index 0000000000..af53ff2cd1 --- /dev/null +++ b/tests/data/lexer/0004_number.rs @@ -0,0 +1,7 @@ +0 0b 0o 0x 00 0_ 0. 0e 0E 0z +01790 0b1790 0o1790 0x1790aAbBcCdDeEfF 001279 0_1279 0.1279 0e1279 0E1279 +0..2 +0.foo() +0e+1 +0.e+1 +0.0E-2 diff --git a/tests/data/lexer/0004_number.txt b/tests/data/lexer/0004_number.txt new file mode 100644 index 0000000000..e9ad8410d7 --- /dev/null +++ b/tests/data/lexer/0004_number.txt @@ -0,0 +1,62 @@ +INT_NUMBER 1 +WHITESPACE 1 +INT_NUMBER 2 +WHITESPACE 1 +INT_NUMBER 2 +WHITESPACE 1 +INT_NUMBER 2 +WHITESPACE 1 +INT_NUMBER 2 +WHITESPACE 1 +INT_NUMBER 2 +WHITESPACE 1 +FLOAT_NUMBER 2 +WHITESPACE 1 +INT_NUMBER 2 +WHITESPACE 1 +INT_NUMBER 2 +WHITESPACE 1 +INT_NUMBER 1 +IDENT 1 +WHITESPACE 1 +INT_NUMBER 5 +WHITESPACE 1 +INT_NUMBER 6 +WHITESPACE 1 +INT_NUMBER 6 +WHITESPACE 1 +INT_NUMBER 18 +WHITESPACE 1 +INT_NUMBER 6 +WHITESPACE 1 +INT_NUMBER 6 +WHITESPACE 1 +FLOAT_NUMBER 6 +WHITESPACE 1 +INT_NUMBER 6 +WHITESPACE 1 +INT_NUMBER 6 +WHITESPACE 1 +INT_NUMBER 1 +ERROR 1 +ERROR 1 +INT_NUMBER 1 +WHITESPACE 1 +INT_NUMBER 1 +ERROR 1 +IDENT 3 +ERROR 1 +ERROR 1 +WHITESPACE 1 +INT_NUMBER 2 +ERROR 1 +INT_NUMBER 1 +WHITESPACE 1 +INT_NUMBER 1 +ERROR 1 +IDENT 1 +ERROR 1 +INT_NUMBER 1 +WHITESPACE 1 +FLOAT_NUMBER 6 +WHITESPACE 1 diff --git a/validation.md b/validation.md index 3706760ba4..b21ffebd54 100644 --- a/validation.md +++ b/validation.md @@ -1,5 +1,7 @@ Fixmes: -* Fix `is_whitespace`, add more test +* Fix `is_whitespace`, add more tests * Add more thorough tests for idents for XID_Start & XID_Continue +* Validate that float and integer literals use digits only of the appropriate + base, and are in range