Cleanup character classes

2024-12-25 12:33:33 +00:00 · 2017-12-30 00:48:47 +03:00 · 2017-12-30 00:48:47 +03:00 · 770ecd4ecd
commit 770ecd4ecd
parent 171baf4c48
8 changed files with 73 additions and 31 deletions
--- a/grammar.ron
+++ b/grammar.ron
@ -1,6 +1,8 @@
 Grammar(
    syntax_kinds: [
+        "ERROR",
        "IDENT",
+        "UNDERSCORE",
        "WHITESPACE",
    ]
 )
--- a/src/lexer/classes.rs
+++ b/src/lexer/classes.rs
@ -0,0 +1,22 @@
+use unicode_xid::UnicodeXID;
+
+/// Returns `true` if `c` belongs to the identifier-start character class.
+///
+/// ASCII letters (`a`-`z`, `A`-`Z`) and `_` are accepted directly. Any
+/// character above `'\x7f'` is accepted when `UnicodeXID::is_xid_start`
+/// reports it as an XID start character.
+pub fn is_ident_start(c: char) -> bool {
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || c == '_'
+        // Only characters above '\x7f' reach the Unicode XID check.
+        || (c > '\x7f' && UnicodeXID::is_xid_start(c))
+}
+
+/// Returns `true` if `c` belongs to the identifier-continue character class.
+///
+/// ASCII letters (`a`-`z`, `A`-`Z`), ASCII digits (`0`-`9`) and `_` are
+/// accepted directly. Any character above `'\x7f'` is accepted when
+/// `UnicodeXID::is_xid_continue` reports it as an XID continue character.
+pub fn is_ident_continue(c: char) -> bool {
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || (c >= '0' && c <= '9')
+        || c == '_'
+        // Only characters above '\x7f' reach the Unicode XID check.
+        || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
+}
+
+/// Returns `true` if `c` is whitespace as defined by `char::is_whitespace`.
+pub fn is_whitespace(c: char) -> bool {
+    // FIXME: use is_pattern_whitespace
+    // https://github.com/behnam/rust-unic/issues/192
+    c.is_whitespace()
+}
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@ -1,11 +1,12 @@
-use unicode_xid::UnicodeXID;
-
 use {Token, SyntaxKind};
 use syntax_kinds::*;

 mod ptr;
 use self::ptr::Ptr;

+mod classes;
+use self::classes::*;
+
 pub fn next_token(text: &str) -> Token {
    assert!(!text.is_empty());
    let mut ptr = Ptr::new(text);
@ -19,38 +20,20 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
    // Note: r as in r" or r#" is part of a raw string literal,
    // b as in b' is part of a byte literal.
    // They are not identifiers, and are handled further down.
-    let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
+    let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
    if ident_start {
-        loop {
-            match ptr.next() {
-                Some(c) if ident_continue(c) => {
-                    ptr.bump();
-                },
-                _ => break,
-            }
-        }
-        IDENT
-    } else {
-        WHITESPACE
-    }
+        ptr.bump_while(is_ident_continue);
+        return IDENT;
    }

-fn ident_start(c: char) -> bool {
-    (c >= 'a' && c <= 'z')
-        || (c >= 'A' && c <= 'Z')
-        || c == '_'
-        || (c > '\x7f' && UnicodeXID::is_xid_start(c))
+    if is_whitespace(c) {
+        ptr.bump_while(is_whitespace);
+        return WHITESPACE;
    }

-fn ident_continue(c: char) -> bool {
-    (c >= 'a' && c <= 'z')
-        || (c >= 'A' && c <= 'Z')
-        || (c >= '0' && c <= '9')
-        || c == '_'
-        || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
+    return ERROR
 }

-
 fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match (c, c1, c2) {
        ('r', Some('"'), _) |
--- a/src/lexer/ptr.rs
+++ b/src/lexer/ptr.rs
@ -32,6 +32,17 @@ impl<'s> Ptr<'s> {
        Some(ch)
    }

+    /// Repeatedly calls `self.bump()` while `self.next()` yields a character
+    /// that satisfies `pred`.
+    ///
+    /// Returns as soon as `self.next()` yields `None` or a character for
+    /// which `pred` returns `false`; `bump` is not called for that character.
+    // NOTE(review): presumably `next` peeks without consuming and `bump`
+    // consumes one character -- confirm against their definitions in `Ptr`.
+    pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) {
+        loop {
+            match self.next() {
+                Some(c) if pred(c) => {
+                    self.bump();
+                },
+                // `None`, or a character rejected by `pred`: stop here.
+                _ => return,
+            }
+        }
+    }
+
    fn chars(&self) -> Chars {
        self.text[self.len.0 as usize ..].chars()
    }
--- a/src/syntax_kinds.rs
+++ b/src/syntax_kinds.rs
@ -1,11 +1,15 @@
 // Generated from grammar.ron
 use tree::{SyntaxKind, SyntaxInfo};

-pub const IDENT: SyntaxKind = SyntaxKind(0);
-pub const WHITESPACE: SyntaxKind = SyntaxKind(1);
+pub const ERROR: SyntaxKind = SyntaxKind(0);
+pub const IDENT: SyntaxKind = SyntaxKind(1);
+pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
+pub const WHITESPACE: SyntaxKind = SyntaxKind(3);

-static INFOS: [SyntaxInfo; 2] = [
+static INFOS: [SyntaxInfo; 4] = [
+    SyntaxInfo { name: "ERROR" },
    SyntaxInfo { name: "IDENT" },
+    SyntaxInfo { name: "UNDERSCORE" },
    SyntaxInfo { name: "WHITESPACE" },
 ];

--- a/tests/data/lexer/0002_whitespace.rs
+++ b/tests/data/lexer/0002_whitespace.rs
@ -0,0 +1,4 @@
+a b  c
+d
+
+e	f
--- a/tests/data/lexer/0002_whitespace.txt
+++ b/tests/data/lexer/0002_whitespace.txt
@ -0,0 +1,12 @@
+IDENT 1
+WHITESPACE 1
+IDENT 1
+WHITESPACE 2
+IDENT 1
+WHITESPACE 1
+IDENT 1
+WHITESPACE 2
+IDENT 1
+WHITESPACE 1
+IDENT 1
+WHITESPACE 1
--- a/validation.md
+++ b/validation.md
@ -0,0 +1,4 @@
+Fixmes:
+
+* Fix `is_whitespace`, add more tests
+