Cleanup character classes

2024-12-25 20:43:21 +00:00 · 2017-12-30 00:48:47 +03:00 · 2017-12-30 00:48:47 +03:00 · 770ecd4ecd
commit 770ecd4ecd
parent 171baf4c48
8 changed files with 73 additions and 31 deletions
--- a/grammar.ron
+++ b/grammar.ron
@ -1,6 +1,8 @@
 Grammar(
    syntax_kinds: [
        "ERROR",
        "IDENT",
        "UNDERSCORE",
        "WHITESPACE",
    ]
 )
--- a/src/lexer/classes.rs
+++ b/src/lexer/classes.rs
@ -0,0 +1,22 @@
 use unicode_xid::UnicodeXID;
 /// Reports whether `c` may begin an identifier.
 ///
 /// ASCII letters and `_` are accepted directly. Any character above
 /// `'\x7f'` is accepted only when `UnicodeXID::is_xid_start` agrees;
 /// every other ASCII character is rejected.
 pub fn is_ident_start(c: char) -> bool {
    match c {
        // ASCII fast path: no Unicode table lookup required.
        'a'..='z' | 'A'..='Z' | '_' => true,
        // Only non-ASCII characters are handed to the XID check.
        _ => c > '\x7f' && UnicodeXID::is_xid_start(c),
    }
 }
 /// Reports whether `c` may appear after the first character of an
 /// identifier.
 ///
 /// ASCII letters, ASCII digits and `_` are accepted directly. Any
 /// character above `'\x7f'` is accepted only when
 /// `UnicodeXID::is_xid_continue` agrees.
 pub fn is_ident_continue(c: char) -> bool {
    match c {
        // ASCII fast path: no Unicode table lookup required.
        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
        // Only non-ASCII characters are handed to the XID check.
        _ => c > '\x7f' && UnicodeXID::is_xid_continue(c),
    }
 }
 /// Reports whether `c` is treated as whitespace by the lexer.
 ///
 /// Currently this is exactly the standard library's `char::is_whitespace`.
 pub fn is_whitespace(c: char) -> bool {
    // FIXME: switch to is_pattern_whitespace once it is available, see
    // https://github.com/behnam/rust-unic/issues/192
    char::is_whitespace(c)
 }
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@ -1,11 +1,12 @@
 use unicode_xid::UnicodeXID;
 use {Token, SyntaxKind};
 use syntax_kinds::*;
 mod ptr;
 use self::ptr::Ptr;
 mod classes;
 use self::classes::*;
 pub fn next_token(text: &str) -> Token {
    assert!(!text.is_empty());
    let mut ptr = Ptr::new(text);
@ -19,38 +20,20 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
    // Note: r as in r" or r#" is part of a raw string literal,
    // b as in b' is part of a byte literal.
    // They are not identifiers, and are handled further down.
-    let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
+    let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
    if ident_start {
-        loop {
+        ptr.bump_while(is_ident_continue);
-            match ptr.next() {
+        return IDENT;
                Some(c) if ident_continue(c) => {
                    ptr.bump();
                },
                _ => break,
    }
    if is_whitespace(c) {
        ptr.bump_while(is_whitespace);
        return WHITESPACE;
    }
        IDENT
    } else {
        WHITESPACE
    }
 }
-fn ident_start(c: char) -> bool {
+    return ERROR
    (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || c == '_'
        || (c > '\x7f' && UnicodeXID::is_xid_start(c))
 }
 /// Reports whether `c` may appear after the first character of an
 /// identifier: ASCII letters, ASCII digits, `_`, or a character above
 /// `'\x7f'` for which `UnicodeXID::is_xid_continue` returns true.
 fn ident_continue(c: char) -> bool {
    // ASCII fast path: letters, digits and the underscore.
    if c == '_' || c.is_ascii_alphanumeric() {
        return true;
    }
    // Remaining ASCII characters are rejected without a table lookup.
    c > '\x7f' && UnicodeXID::is_xid_continue(c)
 }
 fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match (c, c1, c2) {
        ('r', Some('"'), _) |
--- a/src/lexer/ptr.rs
+++ b/src/lexer/ptr.rs
@ -32,6 +32,17 @@ impl<'s> Ptr<'s> {
        Some(ch)
    }
    /// Calls `self.bump()` repeatedly for as long as `self.next()` yields
    /// a character that satisfies `pred`.
    ///
    /// Stops, without bumping, as soon as `self.next()` returns `None` or
    /// a character for which `pred` is false.
    pub fn bump_while<F: Fn(char) -> bool>(&mut self, pred: F) {
        while let Some(c) = self.next() {
            if !pred(c) {
                break;
            }
            self.bump();
        }
    }
    fn chars(&self) -> Chars {
        self.text[self.len.0 as usize ..].chars()
    }
--- a/src/syntax_kinds.rs
+++ b/src/syntax_kinds.rs
@ -1,11 +1,15 @@
 // Generated from grammar.ron
 use tree::{SyntaxKind, SyntaxInfo};
-pub const IDENT: SyntaxKind = SyntaxKind(0);
+pub const ERROR: SyntaxKind = SyntaxKind(0);
-pub const WHITESPACE: SyntaxKind = SyntaxKind(1);
+pub const IDENT: SyntaxKind = SyntaxKind(1);
 pub const UNDERSCORE: SyntaxKind = SyntaxKind(2);
 pub const WHITESPACE: SyntaxKind = SyntaxKind(3);
-static INFOS: [SyntaxInfo; 2] = [
+static INFOS: [SyntaxInfo; 4] = [
    SyntaxInfo { name: "ERROR" },
    SyntaxInfo { name: "IDENT" },
    SyntaxInfo { name: "UNDERSCORE" },
    SyntaxInfo { name: "WHITESPACE" },
 ];
--- a/tests/data/lexer/0002_whitespace.rs
+++ b/tests/data/lexer/0002_whitespace.rs
@ -0,0 +1,4 @@
 a b  c
 d
 e	f
--- a/tests/data/lexer/0002_whitespace.txt
+++ b/tests/data/lexer/0002_whitespace.txt
@ -0,0 +1,12 @@
 IDENT 1
 WHITESPACE 1
 IDENT 1
 WHITESPACE 2
 IDENT 1
 WHITESPACE 1
 IDENT 1
 WHITESPACE 2
 IDENT 1
 WHITESPACE 1
 IDENT 1
 WHITESPACE 1
--- a/validation.md
+++ b/validation.md
@ -0,0 +1,4 @@
 Fixmes:
 * Fix `is_whitespace`, add more tests
		`@ -0,0 +1,4 @@`
							`Fixmes:`

							* Fix `is_whitespace`, add more tests