dioxus/packages/core-macro/styles/string/lexer.rs

503 lines
14 KiB
Rust
Raw Normal View History

//! Parse the various css types from strings directly (avoid pulling in syn if working at runtime)
//!
//! Differences to spec:
//! - Exponential floats are not supported for now.
use std::{char, fmt, iter};
/// U+FFFD REPLACEMENT CHARACTER, substituted by `unescape` for escapes that
/// decode to zero, a surrogate, or a value past the last Unicode code point.
const REPLACEMENT_CHAR: char = '\u{FFFD}';
/// A half-open range `start..end` of byte offsets into the lexer's source.
#[derive(Debug, Clone, Copy, PartialEq)]
#[non_exhaustive] // Keeps users of the crate from building a `Span` by hand.
pub struct Span {
    /// Offset of the first byte covered (inclusive).
    start: usize,
    /// Offset one past the last byte covered (exclusive).
    end: usize,
}
impl Span {
    /// Build the span `start..end`.
    ///
    /// # Panics
    ///
    /// Panics unless `start < end`; an empty or reversed span is never valid.
    fn new(start: usize, end: usize) -> Self {
        assert!(start < end, "end must be greater than start");
        Self { start, end }
    }

    /// Number of bytes covered by the span.
    pub fn len(&self) -> usize {
        let Span { start, end } = *self;
        end - start
    }
}
/// Error describing a character the lexer refuses to accept in its input.
#[derive(Debug)]
pub struct InvalidChar {
    /// The offending character.
    ch: char,
    /// Position at which `ch` was found.
    pos: usize,
}
impl fmt::Display for InvalidChar {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"invalid character `{}` found at position {}",
self.ch.escape_debug(),
self.pos
)
}
}
/// Tokenizer state: the borrowed source text plus how far it has been consumed.
#[derive(Debug)]
pub struct Lexer<'src> {
    /// The complete input being tokenized.
    src: &'src str,
    /// Byte offset into `src` of the next unconsumed character.
    cursor: usize,
}
impl<'src> Lexer<'src> {
pub fn new(src: &'src str) -> Result<Lexer<'src>, InvalidChar> {
// Check that the user has already replaced characters as specified at
// https://www.w3.org/TR/css-syntax-3/#input-preprocessing
for (pos, ch) in src.char_indices() {
if ch == '\r' || ch == '\u{d}' || ch == '\0' {
return Err(InvalidChar { ch, pos });
}
}
Ok(Lexer { src, cursor: 0 })
}
fn len(&self) -> usize {
self.src.len()
}
fn remaining(&self) -> usize {
self.src.len() - self.cursor
}
pub fn next_token(&mut self) -> Option<Token> {
match self.peek() {
Some(token) => {
self.consume(&token);
Some(token)
}
None => None,
}
}
pub fn peek(&self) -> Option<Token> {
// https://www.w3.org/TR/css-syntax-3/#tokenizer-definitions
if let Some(comment) = self.comment() {
return Some(comment);
}
if let Some(tok) = self.whitespace() {
return Some(tok);
}
if let Some(tok) = self.string() {
return Some(tok);
}
match self.chars().next() {
Some(other) => Some(Token::new(
TokenKind::Error,
Span::new(self.cursor, self.cursor + other.len_utf8()),
)),
None => None,
}
}
pub fn peek_n(&self, n: usize) -> Option<Token> {
todo!()
}
pub fn is_empty(&self) -> bool {
todo!() //self.peek().is_none()
}
pub fn resolve_span(&self, span: Span) -> &'src str {
if span.end > self.len() {
panic!("End of requested span is past the end of the source");
}
&self.src[span.start..span.end]
}
/// Create another independent lexer at the given start point
fn fork(&self) -> Lexer {
Lexer {
src: self.src,
cursor: self.cursor,
}
}
pub fn consume(&mut self, tok: &Token) {
assert!(
tok.len() <= self.remaining(),
"trying to consume a token that would be bigger \
than all remaining text"
);
self.cursor += tok.len();
}
/// Resolve a position from cursor to position from start of src
fn resolve_pos(&self, pos: usize) -> usize {
self.cursor + pos
}
/// Create a span from the current position with the given length
fn span(&self, len: usize) -> Span {
debug_assert!(self.cursor + len <= self.len());
Span::new(self.cursor, self.cursor + len)
}
/// Create a span from the current position to the end
fn span_to_end(&self) -> Span {
Span::new(self.cursor, self.len())
}
/// Iterate over the remaining chars of the input
fn chars(&self) -> std::str::Chars {
self.src[self.cursor..].chars()
}
/// Iterate over the remaining chars of the input
fn char_indices(&self) -> std::str::CharIndices {
self.src[self.cursor..].char_indices()
}
/// Parse a comment
fn comment(&self) -> Option<Token> {
let mut ch_iter = self.char_indices().peekable();
if let Some((_, '/')) = ch_iter.next() {
if let Some((_, '*')) = ch_iter.next() {
loop {
match ch_iter.next() {
Some((_, '*')) => {
if let Some((idx, '/')) = ch_iter.peek() {
return Some(Token {
kind: TokenKind::Comment,
span: self.span(*idx + '/'.len_utf8()),
});
}
}
None => {
return Some(Token::new(
TokenKind::UnclosedComment,
self.span_to_end(),
));
}
_ => (),
}
}
}
}
None
}
/// Parse whitespace
fn whitespace(&self) -> Option<Token> {
let mut ch_iter = self.chars();
let mut len = match ch_iter.next() {
Some(ch) if ch.is_ascii_whitespace() => ch.len_utf8(),
_ => return None,
};
loop {
match ch_iter.next() {
Some(ch) if ch.is_ascii_whitespace() => len += ch.len_utf8(),
_ => break,
}
}
Some(Token {
kind: TokenKind::Whitespace,
span: self.span(len),
})
}
/// Parse either a single or double quoted string
fn string(&self) -> Option<Token> {
let mut ch_iter = self.char_indices().fuse().peekable();
let delim = match ch_iter.next() {
Some((_, '"')) => '"',
Some((_, '\'')) => '\'',
_ => return None,
};
let mut decoded_string = String::new();
loop {
match ch_iter.next() {
Some((end, ch)) if ch == delim => {
return Some(Token {
kind: TokenKind::String(decoded_string),
span: self.span(end + 1), // '"'.len_utf8() == 1
});
}
Some((end, '\n')) => {
return Some(Token {
kind: TokenKind::BadString(decoded_string),
span: self.span(end + 1), // '\n'.len_utf8() == 1
});
}
Some((_, '\\')) => match ch_iter.peek() {
Some((_, ch)) => {
if *ch == '\n' {
// do nothing - skip the backslash and newline.
ch_iter.next().unwrap();
} else if let Some(decoded_ch) = unescape(&mut ch_iter) {
decoded_string.push(decoded_ch);
} else {
decoded_string.push(ch_iter.next().unwrap().1);
}
}
None => {
// The spec says not to add the last '\'.
// a bad string will be returned on next pass
ch_iter.next().unwrap();
}
},
Some((_, ch)) => decoded_string.push(ch),
None => {
return Some(Token {
kind: TokenKind::BadString(decoded_string),
span: self.span_to_end(),
})
}
}
}
}
/*
fn hash(&self) -> Option<Token> {
let mut iter = self.char_indices();
match iter.next() {
Some((_, '#')) => (),
None => return None,
};
match iter.next() {
Some((_, '\\')) => {}
_ => Some(Token {
kind: TokenKind::Delim('#'),
span: self.span(1),
}),
}
}
*/
}
impl<'src> Iterator for Lexer<'src> {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
self.next_token()
}
}
/// One lexed token: what it is and which part of the source it covers.
#[derive(PartialEq, Debug)]
#[non_exhaustive]
pub struct Token {
    /// The kind of token, including any decoded payload.
    pub kind: TokenKind,
    /// The range of the source this token was read from.
    pub span: Span,
}
impl Token {
fn new(kind: TokenKind, span: Span) -> Self {
Token { kind, span }
}
pub fn len(&self) -> usize {
self.span.len()
}
}
/// The kinds of token a [`Token`] can be.
#[derive(PartialEq, Debug)]
pub enum TokenKind {
    Ident,
    Function,
    At,
    Hash,
    /// A properly terminated quoted string; holds the decoded contents.
    String(String),
    /// A string cut short before its closing quote; holds what was decoded
    /// up to that point.
    BadString(String),
    Url,
    BadUrl,
    /// A delimiter, carrying the delimiter character.
    Delim(char),
    Number,
    Percentage,
    Dimension,
    /// A run of whitespace.
    Whitespace,
    /// `<!--`
    CDO,
    /// `-->`
    CDC,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// A complete `/* ... */` comment.
    Comment,
    /// A comment that was opened but never closed.
    UnclosedComment,
    /// Could not parse the next token.
    Error,
}
// Helpers
/// Decode a hexadecimal escape (one to six hex digits, e.g. "ffffff") from
/// the front of `input`.
///
/// For example `"5c" => '\'`. The digits are consumed, along with one
/// whitespace character directly after them if there is one. Returns `None`
/// and consumes nothing when the next character is not a hex digit. Values
/// that are zero, a surrogate, or beyond the last code point decode to
/// `REPLACEMENT_CHAR`.
fn unescape(input: &mut iter::Peekable<impl Iterator<Item = (usize, char)>>) -> Option<char> {
    const MAX_HEX_DIGITS: usize = 6;

    let mut code_point: u32 = 0;
    let mut digits = 0;
    while digits < MAX_HEX_DIGITS {
        // `to_digit(16)` accepts exactly the ASCII hex digits.
        let value = match input.peek().and_then(|&(_, ch)| ch.to_digit(16)) {
            Some(value) => value,
            None => break,
        };
        input.next();
        debug_assert!(code_point & 0xf000_0000 == 0); // make sure we don't overflow
        code_point = (code_point << 4) + value;
        digits += 1;
    }
    if digits == 0 {
        return None;
    }

    // A single whitespace char terminating the escape belongs to it.
    if let Some(&(_, ch)) = input.peek() {
        if ch.is_ascii_whitespace() {
            input.next();
        }
    }

    let is_surrogate = (0xd800..0xe000).contains(&code_point);
    let decoded = if code_point == 0 || is_surrogate || code_point >= 0x11_0000 {
        REPLACEMENT_CHAR
    } else {
        // Everything else in range is a valid scalar value.
        char::from_u32(code_point).unwrap()
    };
    Some(decoded)
}
#[cfg(test)]
mod test {
    use super::{Lexer, Token, TokenKind};

    /// Lex `src` and return the lexer together with the first token it yields.
    fn first_token<'a>(src: &'a str) -> (Lexer<'a>, Token) {
        let mut lexer = Lexer::new(src).expect("test input must already be preprocessed");
        let token = lexer
            .next_token()
            .expect("test input must contain at least one token");
        (lexer, token)
    }

    #[test]
    fn comment() {
        let src = "/* a valid comment */";
        let (lexer, token) = first_token(src);
        assert_eq!(token.kind, TokenKind::Comment);
        assert_eq!(lexer.resolve_span(token.span), src);
        assert_eq!(token.span.len(), 21);

        let src = "/* a comment";
        let (lexer, token) = first_token(src);
        assert_eq!(token.kind, TokenKind::UnclosedComment);
        assert_eq!(lexer.resolve_span(token.span), src);
        assert_eq!(token.span.len(), 12);

        // A `/` that does not open a comment is not lexed as anything yet.
        let (_, token) = first_token("/!* not a comment");
        assert_eq!(token.kind, TokenKind::Error);
    }

    #[test]
    fn string() {
        // Covers hex escapes (`\64`, `\e9` plus its terminating space) and a
        // backslash-newline continuation.
        let (_, token) = first_token("\" a vali\\64\\e9 \\\n string \"");
        assert_eq!(token.kind, TokenKind::String(" a validé string ".to_string()));
        assert_eq!(token.span.len(), 26);

        let (_, token) = first_token("' a valid string '");
        assert_eq!(token.kind, TokenKind::String(" a valid string ".to_string()));
        assert_eq!(token.span.len(), 18);

        let (_, token) = first_token("\" a string");
        assert_eq!(token.kind, TokenKind::BadString(" a string".to_string()));
        assert_eq!(token.span.len(), 10);
    }

    #[test]
    fn whitespace() {
        let src = "\n\t ";
        let (lexer, token) = first_token(src);
        assert_eq!(token.kind, TokenKind::Whitespace);
        assert_eq!(lexer.resolve_span(token.span), src);
        assert_eq!(token.span.len(), 3);
    }

    #[test]
    fn escape() {
        let mut iter = "e9".char_indices().peekable();
        assert_eq!(super::unescape(&mut iter), Some('é'));
    }
}