From ddc637c16120fb352183698f635fc93a68580f7b Mon Sep 17 00:00:00 2001
From: Aleksey Kladov <aleksey.kladov@gmail.com>
Date: Sat, 30 Dec 2017 15:22:40 +0300
Subject: [PATCH] Lexer: start numbers

---
 grammar.ron                      |  2 +
 src/lexer/classes.rs             |  4 ++
 src/lexer/mod.rs                 | 91 ++++++++++++++++++++++++++++----
 src/lexer/ptr.rs                 | 12 +++++
 src/syntax_kinds.rs              |  6 ++-
 tests/data/lexer/0004_number.rs  |  7 +++
 tests/data/lexer/0004_number.txt | 62 ++++++++++++++++++++++
 validation.md                    |  4 +-
 8 files changed, 176 insertions(+), 12 deletions(-)
 create mode 100644 tests/data/lexer/0004_number.rs
 create mode 100644 tests/data/lexer/0004_number.txt

diff --git a/grammar.ron b/grammar.ron
index 49b9c527c1..a86fe693fc 100644
--- a/grammar.ron
+++ b/grammar.ron
@@ -4,5 +4,7 @@ Grammar(
         "IDENT",
         "UNDERSCORE",
         "WHITESPACE",
+        "INT_NUMBER",
+        "FLOAT_NUMBER",
     ]
 )
\ No newline at end of file
diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs
index 7cc050bde4..4235d2648a 100644
--- a/src/lexer/classes.rs
+++ b/src/lexer/classes.rs
@@ -20,3 +20,7 @@ pub fn is_whitespace(c: char) -> bool {
     //https://github.com/behnam/rust-unic/issues/192
     c.is_whitespace()
 }
+
+pub fn is_dec_digit(c: char) -> bool { // true iff `c` is an ASCII digit, '0' through '9'
+    '0' <= c && c <= '9'
+}
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
index 83a411cdd0..afbbee4d0d 100644
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@@ -22,16 +22,7 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
     // They are not identifiers, and are handled further down.
     let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
     if ident_start {
-        let is_single_letter = match ptr.next() {
-            None => true,
-            Some(c) if !is_ident_continue(c) => true,
-            _ => false,
-        };
-        if is_single_letter {
-            return if c == '_' { UNDERSCORE } else { IDENT };
-        }
-        ptr.bump_while(is_ident_continue);
-        return IDENT;
+        return scan_ident(c, ptr);
     }
 
     if is_whitespace(c) {
@@ -39,9 +30,89 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
         return WHITESPACE;
     }
 
+    if is_dec_digit(c) {
+        return scan_number(c, ptr);
+    }
+
     ERROR
 }
 
+fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { // `c`: first char of the identifier
+    let is_single_letter = match ptr.next() { // look at the char that follows `c`
+        None => true, // end of input
+        Some(c) if !is_ident_continue(c) => true, // next char cannot extend the identifier
+        _ => false,
+    };
+    if is_single_letter {
+        return if c == '_' { UNDERSCORE } else { IDENT }; // a lone `_` gets its own kind
+    }
+    ptr.bump_while(is_ident_continue); // presumably skips the remaining ident chars (helper not shown here)
+    IDENT
+}
+
+fn scan_number(c: char, ptr: &mut Ptr) -> SyntaxKind { // `c`: the leading decimal digit
+    if c == '0' {
+        match ptr.next().unwrap_or('\0') { // '\0' stands in for end of input
+            'b' | 'o' => { // binary / octal prefix; digits are not checked against the base here
+                ptr.bump();
+                scan_digits(ptr, false);
+            }
+            'x' => { // hex prefix
+                ptr.bump();
+                scan_digits(ptr, true);
+            }
+            '0'...'9' | '_' | '.' | 'e' | 'E' => {
+                scan_digits(ptr, true); // NOTE(review): hex allowed, so `0e1` is eaten as INT digits — intended?
+            }
+            _ => return INT_NUMBER, // a lone `0`; whatever follows (e.g. the `z` in `0z`) is a new token
+        }
+    } else {
+        scan_digits(ptr, false); // non-zero leading digit: plain decimal run
+    }
+
+    // A `.` makes this a float, unless it is followed by another `.` (a
+    // range, `0..2`) or by an identifier start (field/method access,
+    // `12.foo()`); in those cases the `.` is left for the next token.
+    if ptr.next_is('.') && !(ptr.nnext_is('.') || ptr.nnext_is_p(is_ident_start)) {
+        // consume the `.`, then an optional run of fractional digits and an
+        // optional exponent; a bare `0.` is already a FLOAT_NUMBER
+        ptr.bump();
+        scan_digits(ptr, false);
+        scan_float_exponent(ptr);
+        return FLOAT_NUMBER;
+    }
+    // no `.`, but an exponent marker also makes it a float (e.g. `1e3`)
+    if ptr.next_is('e') || ptr.next_is('E') {
+        scan_float_exponent(ptr);
+        return FLOAT_NUMBER;
+    }
+    INT_NUMBER
+}
+
+fn scan_digits(ptr: &mut Ptr, allow_hex: bool) { // consumes a possibly empty run of digits and `_`
+    while let Some(c) = ptr.next() { // stops at end of input
+        match c {
+            '_' | '0'...'9' => { // always accepted, whatever base the caller has in mind
+                ptr.bump();
+            }
+            'a'...'f' | 'A' ... 'F' if allow_hex => { // hex letters only when the caller opts in
+                ptr.bump();
+            }
+            _ => return // the first non-digit char is left unconsumed
+        }
+    }
+}
+
+fn scan_float_exponent(ptr: &mut Ptr) { // consumes an optional exponent: `e`/`E`, optional sign, digits
+    if ptr.next_is('e') || ptr.next_is('E') { // otherwise nothing is consumed
+        ptr.bump();
+        if ptr.next_is('-') || ptr.next_is('+') { // optional sign
+            ptr.bump();
+        }
+        scan_digits(ptr, false); // the digit run may be empty; that is not rejected here
+    }
+}
+
 fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
     match (c, c1, c2) {
         ('r', Some('"'), _) |
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
index e8aa6f37b4..d441b826bc 100644
--- a/src/lexer/ptr.rs
+++ b/src/lexer/ptr.rs
@@ -26,6 +26,18 @@ impl<'s> Ptr<'s> {
         chars.next()
     }
 
+    pub fn next_is(&self, c: char) -> bool { // does `next()` yield exactly `c`? false on `None`
+        self.next() == Some(c)
+    }
+
+    pub fn nnext_is(&self, c: char) -> bool { // does `nnext()` yield exactly `c`? false on `None`
+        self.nnext() == Some(c)
+    }
+
+    pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool { // does `nnext()` yield a char satisfying `p`?
+        self.nnext().map(p) == Some(true) // `None` (no such char) counts as false
+    }
+
     pub fn bump(&mut self) -> Option<char> {
         let ch = self.chars().next()?;
         self.len += TextUnit::len_of_char(ch);
diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs
index b9b47a2ede..bd1265bdea 100644
--- a/src/syntax_kinds.rs
+++ b/src/syntax_kinds.rs
@@ -5,12 +5,16 @@ pub const ERROR: SyntaxKind = SyntaxKind(0);
 pub const IDENT: SyntaxKind = SyntaxKind(1);
 pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
 pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
+pub const INT_NUMBER: SyntaxKind = SyntaxKind(4);
+pub const FLOAT_NUMBER: SyntaxKind = SyntaxKind(5);
 
-static INFOS: [SyntaxInfo; 4] = [
+static INFOS: [SyntaxInfo; 6] = [
     SyntaxInfo { name: "ERROR" },
     SyntaxInfo { name: "IDENT" },
     SyntaxInfo { name: "UNDERSCORE" },
     SyntaxInfo { name: "WHITESPACE" },
+    SyntaxInfo { name: "INT_NUMBER" },
+    SyntaxInfo { name: "FLOAT_NUMBER" },
 ];
 
 pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo {
diff --git a/tests/data/lexer/0004_number.rs b/tests/data/lexer/0004_number.rs
new file mode 100644
index 0000000000..af53ff2cd1
--- /dev/null
+++ b/tests/data/lexer/0004_number.rs
@@ -0,0 +1,7 @@
+0 0b 0o 0x 00 0_ 0. 0e 0E 0z
+01790 0b1790 0o1790 0x1790aAbBcCdDeEfF 001279 0_1279 0.1279 0e1279 0E1279
+0..2
+0.foo()
+0e+1
+0.e+1
+0.0E-2
diff --git a/tests/data/lexer/0004_number.txt b/tests/data/lexer/0004_number.txt
new file mode 100644
index 0000000000..e9ad8410d7
--- /dev/null
+++ b/tests/data/lexer/0004_number.txt
@@ -0,0 +1,62 @@
+INT_NUMBER 1
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+FLOAT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 2
+WHITESPACE 1
+INT_NUMBER 1
+IDENT 1
+WHITESPACE 1
+INT_NUMBER 5
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 18
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+FLOAT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 6
+WHITESPACE 1
+INT_NUMBER 1
+ERROR 1
+ERROR 1
+INT_NUMBER 1
+WHITESPACE 1
+INT_NUMBER 1
+ERROR 1
+IDENT 3
+ERROR 1
+ERROR 1
+WHITESPACE 1
+INT_NUMBER 2
+ERROR 1
+INT_NUMBER 1
+WHITESPACE 1
+INT_NUMBER 1
+ERROR 1
+IDENT 1
+ERROR 1
+INT_NUMBER 1
+WHITESPACE 1
+FLOAT_NUMBER 6
+WHITESPACE 1
diff --git a/validation.md b/validation.md
index 3706760ba4..b21ffebd54 100644
--- a/validation.md
+++ b/validation.md
@@ -1,5 +1,7 @@
 Fixmes:
 
-* Fix `is_whitespace`, add more test
+* Fix `is_whitespace`, add more tests
 * Add more thorough tests for idents for XID_Start & XID_Continue
+* Validate that float and integer literals use only digits valid for their
+  base, and are in range