From 770ecd4ecd61f04597d9478001848b703d915cce Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Sat, 30 Dec 2017 00:48:47 +0300 Subject: [PATCH] Cleanup character classes --- grammar.ron | 2 ++ src/lexer/classes.rs | 22 ++++++++++++++++ src/lexer/mod.rs | 39 ++++++++-------------------- src/lexer/ptr.rs | 11 ++++++++ src/syntax_kinds.rs | 10 ++++--- tests/data/lexer/0002_whitespace.rs | 4 +++ tests/data/lexer/0002_whitespace.txt | 12 +++++++++ validation.md | 4 +++ 8 files changed, 73 insertions(+), 31 deletions(-) create mode 100644 src/lexer/classes.rs create mode 100644 tests/data/lexer/0002_whitespace.rs create mode 100644 tests/data/lexer/0002_whitespace.txt create mode 100644 validation.md diff --git a/grammar.ron b/grammar.ron index 18c382536e..49b9c527c1 100644 --- a/grammar.ron +++ b/grammar.ron @@ -1,6 +1,8 @@ Grammar( syntax_kinds: [ + "ERROR", "IDENT", + "UNDERSCORE", "WHITESPACE", ] ) \ No newline at end of file diff --git a/src/lexer/classes.rs b/src/lexer/classes.rs new file mode 100644 index 0000000000..7cc050bde4 --- /dev/null +++ b/src/lexer/classes.rs @@ -0,0 +1,22 @@ +use unicode_xid::UnicodeXID; + +pub fn is_ident_start(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_start(c)) +} + +pub fn is_ident_continue(c: char) -> bool { + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + || c == '_' + || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) +} + +pub fn is_whitespace(c: char) -> bool { + //FIXME: use is_pattern_whitespace + //https://github.com/behnam/rust-unic/issues/192 + c.is_whitespace() +} diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 136afb7b80..dd3e2896d2 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -1,11 +1,12 @@ -use unicode_xid::UnicodeXID; - use {Token, SyntaxKind}; use syntax_kinds::*; mod ptr; use self::ptr::Ptr; +mod classes; +use self::classes::*; + pub fn next_token(text: &str) -> Token { assert!(!text.is_empty()); let mut ptr = Ptr::new(text); @@ -19,38 +20,20 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { // Note: r as in r" or r#" is part of a raw string literal, // b as in b' is part of a byte literal. // They are not identifiers, and are handled further down. - let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); + let ident_start = is_ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext()); if ident_start { - loop { - match ptr.next() { - Some(c) if ident_continue(c) => { - ptr.bump(); - }, - _ => break, - } - } - IDENT - } else { - WHITESPACE + ptr.bump_while(is_ident_continue); + return IDENT; } -} -fn ident_start(c: char) -> bool { - (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || c == '_' - || (c > '\x7f' && UnicodeXID::is_xid_start(c)) -} + if is_whitespace(c) { + ptr.bump_while(is_whitespace); + return WHITESPACE; + } -fn ident_continue(c: char) -> bool { - (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || (c >= '0' && c <= '9') - || c == '_' - || (c > '\x7f' && UnicodeXID::is_xid_continue(c)) + return ERROR } - fn string_literal_start(c: char, c1: Option, c2: Option) -> bool { match (c, c1, c2) { ('r', Some('"'), _) | diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index 4638dac213..e8aa6f37b4 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs @@ -32,6 +32,17 @@ impl<'s> Ptr<'s> { Some(ch) } + pub fn bump_while bool>(&mut self, pred: F) { + loop { + match self.next() { + Some(c) if pred(c) => { + self.bump(); + }, + _ => return, + } + } + } + fn chars(&self) -> Chars { self.text[self.len.0 as usize ..].chars() } diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs index 421cae15ac..b9b47a2ede 100644 --- a/src/syntax_kinds.rs +++ b/src/syntax_kinds.rs @@ -1,11 +1,15 @@ // Generated from grammar.ron use tree::{SyntaxKind, SyntaxInfo}; -pub const IDENT: SyntaxKind = SyntaxKind(0); -pub const WHITESPACE: SyntaxKind = SyntaxKind(1); +pub const ERROR: SyntaxKind = SyntaxKind(0); +pub const IDENT: SyntaxKind = SyntaxKind(1); +pub const UNDERSCORE: SyntaxKind = SyntaxKind(2); +pub const WHITESPACE: SyntaxKind = SyntaxKind(3); -static INFOS: [SyntaxInfo; 2] = [ +static INFOS: [SyntaxInfo; 4] = [ + SyntaxInfo { name: "ERROR" }, SyntaxInfo { name: "IDENT" }, + SyntaxInfo { name: "UNDERSCORE" }, SyntaxInfo { name: "WHITESPACE" }, ]; diff --git a/tests/data/lexer/0002_whitespace.rs b/tests/data/lexer/0002_whitespace.rs new file mode 100644 index 0000000000..08fce1418a --- /dev/null +++ b/tests/data/lexer/0002_whitespace.rs @@ -0,0 +1,4 @@ +a b c +d + +e f diff --git a/tests/data/lexer/0002_whitespace.txt b/tests/data/lexer/0002_whitespace.txt new file mode 100644 index 0000000000..4b9885e4a2 --- /dev/null +++ b/tests/data/lexer/0002_whitespace.txt @@ -0,0 +1,12 @@ +IDENT 1 +WHITESPACE 1 +IDENT 1 +WHITESPACE 2 +IDENT 1 +WHITESPACE 1 +IDENT 1 +WHITESPACE 2 +IDENT 1 +WHITESPACE 1 +IDENT 1 +WHITESPACE 1 diff --git a/validation.md b/validation.md new file mode 100644 index 0000000000..9cfec5309f --- /dev/null +++ b/validation.md @@ -0,0 +1,4 @@ +Fixmes: + +* Fix `is_whitespace`, add more test +