From d6a922459ed3ebc77ba5d79cd65144078f43e321 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Sun, 31 Dec 2017 10:41:42 +0300 Subject: [PATCH] Lexer: basic chars & lifetimes --- grammar.ron | 2 ++ src/lexer/mod.rs | 45 ++++++++++++++++++++++++++++++++++++++++++++- src/lexer/ptr.rs | 4 ++++ src/syntax_kinds.rs | 6 +++++- validation.md | 1 + 5 files changed, 56 insertions(+), 2 deletions(-) diff --git a/grammar.ron b/grammar.ron index b707248f3f..995d71f814 100644 --- a/grammar.ron +++ b/grammar.ron @@ -32,5 +32,7 @@ Grammar( "FAT_ARROW", "NEQ", "NOT", + "CHAR", + "LIFETIME", ] ) \ No newline at end of file diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 24c14add01..3f277bd2b5 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -34,7 +34,9 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { } if is_dec_digit(c) { - return scan_number(c, ptr); + let kind = scan_number(c, ptr); + scan_literal_suffix(ptr); + return kind; } // One-byte tokens. @@ -98,6 +100,8 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind { } _ => NOT, }, + + // '\'' => scan_char_or_lifetime(ptr), _ => (), } ERROR @@ -116,6 +120,45 @@ fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind { IDENT } +fn scan_char_or_lifetime(ptr: &mut Ptr) -> SyntaxKind { + // Either a character constant 'a' OR a lifetime name 'abc + let c = match ptr.bump() { + Some(c) => c, + None => return CHAR, // TODO: error reporting is upper in the stack + }; + + // If the character is an ident start not followed by another single + // quote, then this is a lifetime name: + if is_ident_start(c) && !ptr.next_is('\'') { + while ptr.next_is_p(is_ident_continue) { + ptr.bump(); + } + + // lifetimes shouldn't end with a single quote + // if we find one, then this is an invalid character literal + if ptr.next_is('\'') { + ptr.bump(); + return CHAR; + } + return LIFETIME; + } + scan_char_or_byte(ptr); + if !ptr.next_is('\'') { + return CHAR; // TODO: error reporting + } + ptr.bump(); + 
scan_literal_suffix(ptr); + CHAR +} + +fn scan_literal_suffix(ptr: &mut Ptr) { + +} + +fn scan_char_or_byte(ptr: &mut Ptr) { + ptr.bump(); +} + fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool { match (c, c1, c2) { ('r', Some('"'), _) | diff --git a/src/lexer/ptr.rs b/src/lexer/ptr.rs index b380117e63..2f759119af 100644 --- a/src/lexer/ptr.rs +++ b/src/lexer/ptr.rs @@ -34,6 +34,10 @@ impl<'s> Ptr<'s> { self.nnext() == Some(c) } + pub fn next_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool { + self.next().map(p) == Some(true) + } + pub fn nnext_is_p<P: Fn(char) -> bool>(&self, p: P) -> bool { self.nnext().map(p) == Some(true) } diff --git a/src/syntax_kinds.rs b/src/syntax_kinds.rs index 040ac1257d..4c023757ba 100644 --- a/src/syntax_kinds.rs +++ b/src/syntax_kinds.rs @@ -33,8 +33,10 @@ pub const EQEQ: SyntaxKind = SyntaxKind(28); pub const FAT_ARROW: SyntaxKind = SyntaxKind(29); pub const NEQ: SyntaxKind = SyntaxKind(30); pub const NOT: SyntaxKind = SyntaxKind(31); +pub const CHAR: SyntaxKind = SyntaxKind(32); +pub const LIFETIME: SyntaxKind = SyntaxKind(33); -static INFOS: [SyntaxInfo; 32] = [ +static INFOS: [SyntaxInfo; 34] = [ SyntaxInfo { name: "ERROR" }, SyntaxInfo { name: "IDENT" }, SyntaxInfo { name: "UNDERSCORE" }, @@ -67,6 +69,8 @@ static INFOS: [SyntaxInfo; 32] = [ SyntaxInfo { name: "FAT_ARROW" }, SyntaxInfo { name: "NEQ" }, SyntaxInfo { name: "NOT" }, + SyntaxInfo { name: "CHAR" }, + SyntaxInfo { name: "LIFETIME" }, ]; pub(crate) fn syntax_info(kind: SyntaxKind) -> &'static SyntaxInfo { diff --git a/validation.md b/validation.md index b21ffebd54..a38b4a96e4 100644 --- a/validation.md +++ b/validation.md @@ -4,4 +4,5 @@ Fixmes: * Add more thorough tests for idents for XID_Start & XID_Continue * Validate that float and integer literals use digits only of the appropriate base, and are in range +* Validation for unclosed char literal