Lexer: extract string lexing to a separate file

2024-12-25 12:33:33 +00:00 · 2017-12-31 13:32:00 +03:00 · 2017-12-31 13:32:00 +03:00 · f1a840cc38
commit f1a840cc38
parent 9d5138bf11
2 changed files with 88 additions and 49 deletions
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@ -10,6 +10,9 @@ use self::classes::*;
 mod numbers;
 use self::numbers::scan_number;
 mod strings;
 use self::strings::{string_literal_start, scan_char, scan_byte_char_or_string};
 pub fn next_token(text: &str) -> Token {
    assert!(!text.is_empty());
    let mut ptr = Ptr::new(text);
@ -101,7 +104,26 @@ fn next_token_inner(c: char, ptr: &mut Ptr) -> SyntaxKind {
            _ => NOT,
        },
-        '\'' => return scan_char_or_lifetime(ptr),
+        // If the character is an ident start not followed by another single
        // quote, then this is a lifetime name:
        '\'' => return if ptr.next_is_p(is_ident_start) && !ptr.nnext_is('\'') {
            ptr.bump();
            while ptr.next_is_p(is_ident_continue) {
                ptr.bump();
            }
            // lifetimes shouldn't end with a single quote
            // if we find one, then this is an invalid character literal
            if ptr.next_is('\'') {
                ptr.bump();
                return CHAR; // TODO: error reporting
            }
            LIFETIME
        } else {
            scan_char(ptr);
            scan_literal_suffix(ptr);
            CHAR
        },
        'b' => return scan_byte_char_or_string(ptr),
        _ => (),
    }
    ERROR
@ -120,57 +142,9 @@ fn scan_ident(c: char, ptr: &mut Ptr) -> SyntaxKind {
    IDENT
 }
 /// Decides whether the token being lexed is a character constant (`'a'`) or
 /// a lifetime name (`'abc`) and consumes the rest of it.
 ///
 /// Returns `LIFETIME` only when an identifier is read that is not closed by a
 /// single quote; every other outcome, including malformed input, is `CHAR`.
 /// Presumably the caller has already consumed the opening quote (verify
 /// against the call site).
 fn scan_char_or_lifetime(ptr: &mut Ptr) -> SyntaxKind {
    let first = match ptr.bump() {
        // TODO: error reporting is upper in the stack
        None => return CHAR,
        Some(ch) => ch,
    };

    // Only an identifier-start character that is NOT immediately followed by
    // a single quote can begin a lifetime name.
    let lifetime_candidate = is_ident_start(first) && !ptr.next_is('\'');
    if !lifetime_candidate {
        scan_char_or_byte(ptr);
        // TODO: error reporting when the closing quote is missing
        if ptr.next_is('\'') {
            ptr.bump();
            scan_literal_suffix(ptr);
        }
        return CHAR;
    }

    // Swallow the remainder of the identifier.
    while ptr.next_is_p(is_ident_continue) {
        ptr.bump();
    }

    // A lifetime must not be terminated by a single quote; if one follows,
    // the whole thing is treated as an (invalid) character literal.
    if ptr.next_is('\'') {
        ptr.bump();
        CHAR
    } else {
        LIFETIME
    }
 }
 /// Consumes an optional literal suffix located at the current position.
 ///
 /// A suffix has the shape of an identifier: one identifier-start character
 /// followed by any number of identifier-continue characters. When the next
 /// character is not an identifier start, nothing is consumed.
 fn scan_literal_suffix(ptr: &mut Ptr) {
    // Without this guard, continue-only characters (presumably digits) that
    // directly follow a literal would be absorbed into it even though they
    // cannot begin a suffix.
    if !ptr.next_is_p(is_ident_start) {
        return;
    }
    ptr.bump();
    ptr.bump_while(is_ident_continue);
 }
 /// Consumes the payload of a character or byte literal.
 ///
 /// Currently this is a single `bump` call, whatever the character is.
 fn scan_char_or_byte(ptr: &mut Ptr) {
    // FIXME: deal with escape sequences
    let _ = ptr.bump();
 }
 /// Reports whether `c`, followed by the look-ahead characters `c1` and `c2`,
 /// opens a prefixed literal rather than an identifier.
 ///
 /// Accepted openings: `r"`, `r#`, `b"`, `b'`, `br"` and `br#`.
 fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match c {
        // Raw string: `r"` or `r#`.
        'r' => c1 == Some('"') || c1 == Some('#'),
        'b' => match c1 {
            // Byte string or byte character: `b"` or `b'`.
            Some('"') | Some('\'') => true,
            // Raw byte string: `br"` or `br#`.
            Some('r') => c2 == Some('"') || c2 == Some('#'),
            _ => false,
        },
        _ => false,
    }
 }
--- a/src/lexer/strings.rs
+++ b/src/lexer/strings.rs
@ -0,0 +1,65 @@
 use {SyntaxKind};
 use syntax_kinds::*;
 use lexer::ptr::Ptr;
 /// Reports whether `c`, followed by the look-ahead characters `c1` and `c2`,
 /// opens a prefixed literal rather than an identifier.
 ///
 /// Accepted openings: `r"`, `r#`, `b"`, `b'`, `br"` and `br#`.
 pub(crate) fn string_literal_start(c: char, c1: Option<char>, c2: Option<char>) -> bool {
    match c {
        // Raw string: `r"` or `r#`.
        'r' => c1 == Some('"') || c1 == Some('#'),
        'b' => match c1 {
            // Byte string or byte character: `b"` or `b'`.
            Some('"') | Some('\'') => true,
            // Raw byte string: `br"` or `br#`.
            Some('r') => c2 == Some('"') || c2 == Some('#'),
            _ => false,
        },
        _ => false,
    }
 }
 /// Consumes the body of a character literal and, when present, its closing
 /// single quote.
 ///
 /// Nothing is reported on malformed input: the function simply stops early
 /// when the input is exhausted or the closing quote is missing.
 ///
 /// NOTE(review): one character is bumped here and `scan_char_or_byte` bumps
 /// again in its current form, so two characters are consumed before the
 /// closing-quote check. Confirm which of them is meant to be the payload.
 pub(crate) fn scan_char(ptr: &mut Ptr) {
    // TODO: error reporting is upper in the stack (input ended)
    if ptr.bump().is_some() {
        scan_char_or_byte(ptr);
        // TODO: error reporting (no closing quote)
        if ptr.next_is('\'') {
            ptr.bump();
        }
    }
 }
 pub(crate) fn scan_byte_char_or_string(ptr: &mut Ptr) -> SyntaxKind {
    // unwrapping and not-exhaustive match are ok
    // because of string_literal_start
    let c = ptr.bump().unwrap();
    match c {
        '\'' => {
            scan_byte(ptr);
            CHAR
        }
        '"' => {
            scan_byte_string(ptr);
            CHAR
        }
        'r' => {
            scan_raw_byte_string(ptr);
            CHAR
        }
        _ => unreachable!(),
    }
 }
 // Scans the rest of a byte character literal once `b'` has been consumed by
 // `scan_byte_char_or_string`.
 // Stub: the body is empty, so no input is consumed yet.
 fn scan_byte(ptr: &mut Ptr) {
 }
 // Scans the rest of a byte string literal once `b"` has been consumed by
 // `scan_byte_char_or_string`.
 // Stub: the body is empty, so no input is consumed yet.
 fn scan_byte_string(ptr: &mut Ptr) {
 }
 // Scans the rest of a raw byte string literal once `br` has been consumed by
 // `scan_byte_char_or_string`.
 // Stub: the body is empty, so no input is consumed yet.
 fn scan_raw_byte_string(ptr: &mut Ptr) {
 }
 /// Consumes the payload of a character or byte literal.
 ///
 /// Currently this is a single `bump` call, whatever the character is.
 fn scan_char_or_byte(ptr: &mut Ptr) {
    // FIXME: deal with escape sequences
    let _ = ptr.bump();
 }