From 171baf4c4863f035384c6c63a5f0ce531b01cf9d Mon Sep 17 00:00:00 2001
From: Aleksey Kladov
Date: Fri, 29 Dec 2017 23:33:04 +0300
Subject: [PATCH] Simple identifier lexer

---
 Cargo.toml                      |  2 ++
 src/lexer.rs                    | 10 ------
 src/lexer/mod.rs                | 64 +++++++++++++++++++++++++++++++++
 src/lexer/ptr.rs                | 38 ++++++++++++++++++++
 src/lib.rs                      |  2 ++
 src/text.rs                     | 31 +++++++++++++++-
 tests/data/lexer/0001_hello.txt |  2 +-
 tests/lexer.rs                  | 16 +++++----
 8 files changed, 146 insertions(+), 19 deletions(-)
 delete mode 100644 src/lexer.rs
 create mode 100644 src/lexer/mod.rs
 create mode 100644 src/lexer/ptr.rs

diff --git a/Cargo.toml b/Cargo.toml
index 0afd4d3277..063d52211d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,8 @@ version = "0.1.0"
 authors = ["Aleksey Kladov "]
 
 [dependencies]
+unicode-xid = "0.1.0"
+
 serde = "1.0.26"
 serde_derive = "1.0.26"
 file = "1.1.1"
diff --git a/src/lexer.rs b/src/lexer.rs
deleted file mode 100644
index cda9fe2b2d..0000000000
--- a/src/lexer.rs
+++ /dev/null
@@ -1,10 +0,0 @@
-use {Token, TextUnit};
-use syntax_kinds::*;
-
-pub fn next_token(text: &str) -> Token {
-    let c = text.chars().next().unwrap();
-    Token {
-        kind: IDENT,
-        len: TextUnit::len_of_char(c),
-    }
-}
\ No newline at end of file
diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs
new file mode 100644
index 0000000000..136afb7b80
--- /dev/null
+++ b/src/lexer/mod.rs
@@ -0,0 +1,64 @@
+use unicode_xid::UnicodeXID;
+
+use {Token, SyntaxKind};
+use syntax_kinds::*;
+
+mod ptr;
+use self::ptr::Ptr;
+
+pub fn next_token(text: &str) -> Token {
+    assert!(!text.is_empty());
+    let mut ptr = Ptr::new(text);
+    let c = ptr.bump().unwrap();
+    let kind = next_token_inner(c, &mut ptr);
+    let len = ptr.into_len();
+    Token { kind, len }
+}
+
+fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
+    // Note: r as in r" or r#" is part of a raw string literal,
+    // b as in b' is part of a byte literal.
+    // They are not identifiers, and are handled further down.
+    let ident_start = ident_start(c) && !string_literal_start(c, ptr.next(), ptr.nnext());
+    if ident_start {
+        loop {
+            match ptr.next() {
+                Some(c) if ident_continue(c) => {
+                    ptr.bump();
+                },
+                _ => break,
+            }
+        }
+        IDENT
+    } else {
+        WHITESPACE
+    }
+}
+
+fn ident_start(c: char) -> bool {
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || c == '_'
+        || (c > '\x7f' && UnicodeXID::is_xid_start(c))
+}
+
+fn ident_continue(c: char) -> bool {
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || (c >= '0' && c <= '9')
+        || c == '_'
+        || (c > '\x7f' && UnicodeXID::is_xid_continue(c))
+}
+
+
+fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
+    match (c, c1, c2) {
+        ('r', Some('"'), _) |
+        ('r', Some('#'), _) |
+        ('b', Some('"'), _) |
+        ('b', Some('\''), _) |
+        ('b', Some('r'), Some('"')) |
+        ('b', Some('r'), Some('#')) => true,
+        _ => false
+    }
+}
diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs
new file mode 100644
index 0000000000..4638dac213
--- /dev/null
+++ b/src/lexer/ptr.rs
@@ -0,0 +1,38 @@
+use {TextUnit};
+
+use std::str::Chars;
+
+pub(crate) struct Ptr<'s> {
+    text: &'s str,
+    len: TextUnit,
+}
+
+impl<'s> Ptr<'s> {
+    pub fn new(text: &'s str) -> Ptr<'s> {
+        Ptr { text, len: TextUnit::new(0) }
+    }
+
+    pub fn into_len(self) -> TextUnit {
+        self.len
+    }
+
+    pub fn next(&self) -> Option<char> {
+        self.chars().next()
+    }
+
+    pub fn nnext(&self) -> Option<char> {
+        let mut chars = self.chars();
+        chars.next()?;
+        chars.next()
+    }
+
+    pub fn bump(&mut self) -> Option<char> {
+        let ch = self.chars().next()?;
+        self.len += TextUnit::len_of_char(ch);
+        Some(ch)
+    }
+
+    fn chars(&self) -> Chars {
+        self.text[self.len.0 as usize ..].chars()
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 4385c03254..3b9dbc8f74 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,5 @@
+extern crate unicode_xid;
+
 mod text;
 mod tree;
 mod lexer;
diff --git a/src/text.rs b/src/text.rs
index 5297275ed4..31e67b4560 100644
--- a/src/text.rs
+++ b/src/text.rs
@@ -1,7 +1,10 @@
 use std::fmt;
+use std::ops;
 
 #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct TextUnit(u32);
+pub struct TextUnit(
+    pub(crate) u32
+);
 
 impl TextUnit {
     pub fn len_of_char(c: char) -> TextUnit {
@@ -30,3 +33,29 @@ impl From<TextUnit> for u32 {
         tu.0
     }
 }
+
+impl ops::Add for TextUnit {
+    type Output = TextUnit;
+    fn add(self, rhs: TextUnit) -> TextUnit {
+        TextUnit(self.0 + rhs.0)
+    }
+}
+
+impl ops::AddAssign for TextUnit {
+    fn add_assign(&mut self, rhs: TextUnit) {
+        self.0 += rhs.0
+    }
+}
+
+impl ops::Sub for TextUnit {
+    type Output = TextUnit;
+    fn sub(self, rhs: TextUnit) -> TextUnit {
+        TextUnit(self.0 - rhs.0)
+    }
+}
+
+impl ops::SubAssign for TextUnit {
+    fn sub_assign(&mut self, rhs: TextUnit) {
+        self.0 -= rhs.0
+    }
+}
\ No newline at end of file
diff --git a/tests/data/lexer/0001_hello.txt b/tests/data/lexer/0001_hello.txt
index 5bec9be806..e0b6a1f109 100644
--- a/tests/data/lexer/0001_hello.txt
+++ b/tests/data/lexer/0001_hello.txt
@@ -1,3 +1,3 @@
 IDENT 5
 WHITESPACE 1
-IDENT 5
\ No newline at end of file
+IDENT 5
diff --git a/tests/lexer.rs b/tests/lexer.rs
index a27e7c3954..a3c8916b1e 100644
--- a/tests/lexer.rs
+++ b/tests/lexer.rs
@@ -41,13 +41,15 @@ fn lexer_test_case(path: &Path) {
         dump_tokens(&tokens)
     };
     let expected = file::get_text(&path.with_extension("txt")).unwrap();
-
-    assert_diff!(
-        expected.as_str(),
-        actual.as_str(),
-        "\n",
-        0
-    )
+    let expected = expected.as_str();
+    let actual = actual.as_str();
+    if expected == actual {
+        return
+    }
+    if expected.trim() == actual.trim() {
+        panic!("Whitespace difference!")
+    }
+    assert_diff!(expected, actual, "\n", 0)
 }
 
 fn tokenize(text: &str) -> Vec<Token> {