123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502 |
- //! Parse the various css types from strings directly (avoid pulling in syn if working at runtime)
- //!
- //! Differences to spec:
- //! - Exponential floats are not supported for now.
- use std::{char, fmt, iter};
/// U+FFFD REPLACEMENT CHARACTER.
///
/// Written as an explicit escape so the value cannot be confused with (or damaged by) an
/// encoding error in the source file itself.
const REPLACEMENT_CHAR: char = '\u{FFFD}';
/// A non-empty, half-open byte range `[start, end)`.
///
/// The fields and the constructor are private, so code outside this module can only obtain a
/// `Span` from values produced here; it can inspect it through the accessor methods.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[non_exhaustive] // Don't allow user to create
pub struct Span {
    /// Inclusive
    start: usize,
    /// Exclusive
    end: usize,
}

impl Span {
    /// Create a span covering `start..end`.
    ///
    /// # Panics
    ///
    /// Panics if `end` is not strictly greater than `start`; empty spans are never created.
    fn new(start: usize, end: usize) -> Self {
        assert!(end > start, "end must be greater than start");
        Span { start, end }
    }

    /// Offset of the first position covered by the span (inclusive).
    pub fn start(&self) -> usize {
        self.start
    }

    /// Offset one past the last position covered by the span (exclusive).
    pub fn end(&self) -> usize {
        self.end
    }

    /// Number of positions covered by the span.
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Whether the span covers nothing.
    ///
    /// Provided for API completeness alongside [`Span::len`]; a span built through the
    /// constructor always has `end > start`, so this is `false` for every such span.
    pub fn is_empty(&self) -> bool {
        self.end == self.start
    }
}
/// Error describing a character that is not allowed in the lexer's input.
///
/// Returned by [`Lexer::new`] for the first offending character it finds.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvalidChar {
    /// The offending character.
    ch: char,
    /// Byte offset of `ch` from the start of the source.
    pos: usize,
}

impl InvalidChar {
    /// The character that was rejected.
    pub fn ch(&self) -> char {
        self.ch
    }

    /// Byte offset of the rejected character from the start of the source.
    pub fn pos(&self) -> usize {
        self.pos
    }
}

impl fmt::Display for InvalidChar {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `escape_debug` keeps control characters such as `\r` or `\0` readable in the message.
        write!(
            f,
            "invalid character `{}` found at position {}",
            self.ch.escape_debug(),
            self.pos
        )
    }
}

// Implementing `Error` lets callers use `?` with `Box<dyn Error>` and similar error types.
impl std::error::Error for InvalidChar {}
- #[derive(Debug)]
- pub struct Lexer<'src> {
- src: &'src str,
- cursor: usize,
- }
- impl<'src> Lexer<'src> {
- pub fn new(src: &'src str) -> Result<Lexer<'src>, InvalidChar> {
- // Check that the user has already replaced characters as specified at
- // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
- for (pos, ch) in src.char_indices() {
- if ch == '\r' || ch == '\u{d}' || ch == '\0' {
- return Err(InvalidChar { ch, pos });
- }
- }
- Ok(Lexer { src, cursor: 0 })
- }
- fn len(&self) -> usize {
- self.src.len()
- }
- fn remaining(&self) -> usize {
- self.src.len() - self.cursor
- }
- pub fn next_token(&mut self) -> Option<Token> {
- match self.peek() {
- Some(token) => {
- self.consume(&token);
- Some(token)
- }
- None => None,
- }
- }
- pub fn peek(&self) -> Option<Token> {
- // https://www.w3.org/TR/css-syntax-3/#tokenizer-definitions
- if let Some(comment) = self.comment() {
- return Some(comment);
- }
- if let Some(tok) = self.whitespace() {
- return Some(tok);
- }
- if let Some(tok) = self.string() {
- return Some(tok);
- }
- match self.chars().next() {
- Some(other) => Some(Token::new(
- TokenKind::Error,
- Span::new(self.cursor, self.cursor + other.len_utf8()),
- )),
- None => None,
- }
- }
- pub fn peek_n(&self, n: usize) -> Option<Token> {
- todo!()
- }
- pub fn is_empty(&self) -> bool {
- todo!() //self.peek().is_none()
- }
- pub fn resolve_span(&self, span: Span) -> &'src str {
- if span.end > self.len() {
- panic!("End of requested span is past the end of the source");
- }
- &self.src[span.start..span.end]
- }
- /// Create another independent lexer at the given start point
- fn fork(&self) -> Lexer {
- Lexer {
- src: self.src,
- cursor: self.cursor,
- }
- }
- pub fn consume(&mut self, tok: &Token) {
- assert!(
- tok.len() <= self.remaining(),
- "trying to consume a token that would be bigger \
- than all remaining text"
- );
- self.cursor += tok.len();
- }
- /// Resolve a position from cursor to position from start of src
- fn resolve_pos(&self, pos: usize) -> usize {
- self.cursor + pos
- }
- /// Create a span from the current position with the given length
- fn span(&self, len: usize) -> Span {
- debug_assert!(self.cursor + len <= self.len());
- Span::new(self.cursor, self.cursor + len)
- }
- /// Create a span from the current position to the end
- fn span_to_end(&self) -> Span {
- Span::new(self.cursor, self.len())
- }
- /// Iterate over the remaining chars of the input
- fn chars(&self) -> std::str::Chars {
- self.src[self.cursor..].chars()
- }
- /// Iterate over the remaining chars of the input
- fn char_indices(&self) -> std::str::CharIndices {
- self.src[self.cursor..].char_indices()
- }
- /// Parse a comment
- fn comment(&self) -> Option<Token> {
- let mut ch_iter = self.char_indices().peekable();
- if let Some((_, '/')) = ch_iter.next() {
- if let Some((_, '*')) = ch_iter.next() {
- loop {
- match ch_iter.next() {
- Some((_, '*')) => {
- if let Some((idx, '/')) = ch_iter.peek() {
- return Some(Token {
- kind: TokenKind::Comment,
- span: self.span(*idx + '/'.len_utf8()),
- });
- }
- }
- None => {
- return Some(Token::new(
- TokenKind::UnclosedComment,
- self.span_to_end(),
- ));
- }
- _ => (),
- }
- }
- }
- }
- None
- }
- /// Parse whitespace
- fn whitespace(&self) -> Option<Token> {
- let mut ch_iter = self.chars();
- let mut len = match ch_iter.next() {
- Some(ch) if ch.is_ascii_whitespace() => ch.len_utf8(),
- _ => return None,
- };
- loop {
- match ch_iter.next() {
- Some(ch) if ch.is_ascii_whitespace() => len += ch.len_utf8(),
- _ => break,
- }
- }
- Some(Token {
- kind: TokenKind::Whitespace,
- span: self.span(len),
- })
- }
- /// Parse either a single or double quoted string
- fn string(&self) -> Option<Token> {
- let mut ch_iter = self.char_indices().fuse().peekable();
- let delim = match ch_iter.next() {
- Some((_, '"')) => '"',
- Some((_, '\'')) => '\'',
- _ => return None,
- };
- let mut decoded_string = String::new();
- loop {
- match ch_iter.next() {
- Some((end, ch)) if ch == delim => {
- return Some(Token {
- kind: TokenKind::String(decoded_string),
- span: self.span(end + 1), // '"'.len_utf8() == 1
- });
- }
- Some((end, '\n')) => {
- return Some(Token {
- kind: TokenKind::BadString(decoded_string),
- span: self.span(end + 1), // '\n'.len_utf8() == 1
- });
- }
- Some((_, '\\')) => match ch_iter.peek() {
- Some((_, ch)) => {
- if *ch == '\n' {
- // do nothing - skip the backslash and newline.
- ch_iter.next().unwrap();
- } else if let Some(decoded_ch) = unescape(&mut ch_iter) {
- decoded_string.push(decoded_ch);
- } else {
- decoded_string.push(ch_iter.next().unwrap().1);
- }
- }
- None => {
- // The spec says not to add the last '\'.
- // a bad string will be returned on next pass
- ch_iter.next().unwrap();
- }
- },
- Some((_, ch)) => decoded_string.push(ch),
- None => {
- return Some(Token {
- kind: TokenKind::BadString(decoded_string),
- span: self.span_to_end(),
- })
- }
- }
- }
- }
- /*
- fn hash(&self) -> Option<Token> {
- let mut iter = self.char_indices();
- match iter.next() {
- Some((_, '#')) => (),
- None => return None,
- };
- match iter.next() {
- Some((_, '\\')) => {}
- _ => Some(Token {
- kind: TokenKind::Delim('#'),
- span: self.span(1),
- }),
- }
- }
- */
- }
- impl<'src> Iterator for Lexer<'src> {
- type Item = Token;
- fn next(&mut self) -> Option<Self::Item> {
- self.next_token()
- }
- }
- #[derive(Debug, PartialEq)]
- #[non_exhaustive]
- pub struct Token {
- pub kind: TokenKind,
- pub span: Span,
- }
- impl Token {
- fn new(kind: TokenKind, span: Span) -> Self {
- Token { kind, span }
- }
- pub fn len(&self) -> usize {
- self.span.len()
- }
- }
/// The kinds of token the lexer can describe.
///
/// The punctuation variants are documented with the literal text they stand for. That text is
/// wrapped in backticks so rustdoc renders it verbatim (a bare `<!--` would otherwise start
/// an HTML comment and hide the documentation).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    Ident,
    Function,
    At,
    Hash,
    /// A complete quoted string; the payload is the decoded contents without the quotes.
    String(String),
    /// A string that was cut short; the payload is whatever was decoded before it ended.
    BadString(String),
    Url,
    BadUrl,
    Delim(char),
    Number,
    Percentage,
    Dimension,
    Whitespace,
    /// `<!--`
    CDO,
    /// `-->`
    CDC,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// A `/* ... */` comment.
    Comment,
    /// A comment that was opened with `/*` but never closed.
    UnclosedComment,
    /// Could not parse the next token
    Error,
}
- // Helpers
- /// Hex to char (up to 6 characters, e.g. "ffffff").
- ///
- /// For example `"5c" => '\'`. Returns None if first char is not hex. Consumes the hex values.
- fn unescape(input: &mut iter::Peekable<impl Iterator<Item = (usize, char)>>) -> Option<char> {
- fn hex_acc(acc: &mut u32, next: char) {
- debug_assert!(*acc & 0xf0000000 == 0); // make sure we don't overflow
- (*acc) = (*acc << 4) + next.to_digit(16).unwrap()
- }
- let (_, ch) = match input.peek() {
- Some((idx, ch)) if ch.is_ascii_hexdigit() => input.next().unwrap(),
- _ => return None,
- };
- let mut acc = 0;
- let mut count = 0;
- hex_acc(&mut acc, ch);
- // Here we use that the length of all valid hexdigits in utf8 is 1.
- while count < 5
- && input
- .peek()
- .map(|(_, ch)| ch.is_ascii_hexdigit())
- .unwrap_or(false)
- {
- let ch = input.next().unwrap().1;
- hex_acc(&mut acc, ch);
- count += 1;
- }
- // consume a whitespace char if it's there
- if input
- .peek()
- .map(|(_, ch)| ch.is_ascii_whitespace())
- .unwrap_or(false)
- {
- input.next().unwrap();
- }
- // maybe we could just directly use `char::from_u32(acc).unwrap_or(REPLACEMENT_CHAR)`
- // null, surrogate, or too big
- Some(
- if acc == 0 || (acc >= 0xd800 && acc < 0xe000) || acc >= 0x110000 {
- REPLACEMENT_CHAR
- } else {
- char::from_u32(acc).unwrap() // there should be no other invalid chars.
- },
- )
- }
#[cfg(test)]
mod test {
    use super::{Lexer, Token, TokenKind};

    /// Closed comments, unclosed comments, and input that only looks like a comment.
    #[test]
    fn comment() {
        let mut input = Lexer::new("/* a valid comment */").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::Comment,
                span,
            }) => {
                assert_eq!(input.resolve_span(span), "/* a valid comment */");
                assert_eq!(span.len(), 21);
            }
            other => panic!("expected a comment, got {:?}", other),
        };

        let mut input = Lexer::new("/* a comment").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::UnclosedComment,
                span,
            }) => {
                assert_eq!(input.resolve_span(span), "/* a comment");
                assert_eq!(span.len(), 12);
            }
            other => panic!("expected an unclosed comment, got {:?}", other),
        };

        // A lone `/` is not something the lexer recognises, so it must come back as an error.
        let mut input = Lexer::new("/!* not a comment").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::Error,
                ..
            }) => {}
            other => panic!("expected an error token, got {:?}", other),
        };
    }

    /// Double-quoted strings with escapes, single-quoted strings, and unterminated strings.
    #[test]
    fn string() {
        // Covers hex escapes (`\64`, `\e9` followed by a space) and an escaped newline.
        let mut input = Lexer::new("\" a vali\\64\\e9 \\\n string \"").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::String(s),
                span,
            }) => {
                assert_eq!(s, " a validé string ");
                assert_eq!(span.len(), 26);
            }
            other => panic!("expected a string, got {:?}", other),
        };

        let mut input = Lexer::new("' a valid string '").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::String(s),
                span,
            }) => {
                assert_eq!(s, " a valid string ");
                assert_eq!(span.len(), 18);
            }
            other => panic!("expected a string, got {:?}", other),
        };

        let mut input = Lexer::new("\" a string").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::BadString(s),
                span,
            }) => {
                assert_eq!(s, " a string");
                assert_eq!(span.len(), 10);
            }
            other => panic!("expected a bad string, got {:?}", other),
        };
    }

    /// A run of mixed whitespace is lexed as a single token.
    #[test]
    fn whitespace() {
        let mut input = Lexer::new("\n\t ").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::Whitespace,
                span,
            }) => {
                assert_eq!(input.resolve_span(span), "\n\t ");
                assert_eq!(span.len(), 3);
            }
            other => panic!("expected whitespace, got {:?}", other),
        };
    }

    /// A bare hex escape decodes to the matching character.
    #[test]
    fn escape() {
        let mut iter = "e9".char_indices().peekable();
        assert_eq!(super::unescape(&mut iter), Some('é'));
    }
}
|