//! Parse the various css types from strings directly (avoid pulling in syn if working at runtime)
//!
//! Differences to spec:
//! - Exponential floats are not supported for now.
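//!
//! A minimal usage sketch (illustrative only; so far the lexer produces
//! comments, whitespace, and strings, and falls back to an `Error` token for
//! anything else):
//!
//! ```ignore
//! let mut lexer = Lexer::new("/* hi */ 'world'").unwrap();
//! while let Some(tok) = lexer.next_token() {
//!     // `resolve_span` maps a token's span back to the source text.
//!     println!("{:?}: {:?}", tok.kind, lexer.resolve_span(tok.span));
//! }
//! ```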
use std::{char, fmt, iter};

const REPLACEMENT_CHAR: char = '�';

#[derive(Copy, Clone, Debug, PartialEq)]
#[non_exhaustive] // Don't allow user to create
pub struct Span {
    /// Inclusive
    start: usize,
    /// Exclusive
    end: usize,
}

impl Span {
    fn new(start: usize, end: usize) -> Self {
        assert!(end > start, "end must be greater than start");
        Span { start, end }
    }

    pub fn len(&self) -> usize {
        self.end - self.start
    }
}

#[derive(Debug)]
pub struct InvalidChar {
    ch: char,
    pos: usize,
}

impl fmt::Display for InvalidChar {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "invalid character `{}` found at position {}",
            self.ch.escape_debug(),
            self.pos
        )
    }
}

#[derive(Debug)]
pub struct Lexer<'src> {
    src: &'src str,
    cursor: usize,
}
impl<'src> Lexer<'src> {
    pub fn new(src: &'src str) -> Result<Lexer<'src>, InvalidChar> {
        // Check that the user has already replaced characters as specified at
        // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
        // (see the `preprocess` sketch after this impl block).
        for (pos, ch) in src.char_indices() {
            // Carriage return, form feed ('\u{c}'), and NUL must all have
            // been replaced during preprocessing.
            if ch == '\r' || ch == '\u{c}' || ch == '\0' {
                return Err(InvalidChar { ch, pos });
            }
        }
        Ok(Lexer { src, cursor: 0 })
    }
    fn len(&self) -> usize {
        self.src.len()
    }

    fn remaining(&self) -> usize {
        self.src.len() - self.cursor
    }

    pub fn next_token(&mut self) -> Option<Token> {
        match self.peek() {
            Some(token) => {
                self.consume(&token);
                Some(token)
            }
            None => None,
        }
    }

    pub fn peek(&self) -> Option<Token> {
        // https://www.w3.org/TR/css-syntax-3/#tokenizer-definitions
        if let Some(comment) = self.comment() {
            return Some(comment);
        }
        if let Some(tok) = self.whitespace() {
            return Some(tok);
        }
        if let Some(tok) = self.string() {
            return Some(tok);
        }
        match self.chars().next() {
            Some(other) => Some(Token::new(
                TokenKind::Error,
                Span::new(self.cursor, self.cursor + other.len_utf8()),
            )),
            None => None,
        }
    }
    pub fn peek_n(&self, n: usize) -> Option<Token> {
        todo!()
    }

    pub fn is_empty(&self) -> bool {
        todo!() //self.peek().is_none()
    }

    pub fn resolve_span(&self, span: Span) -> &'src str {
        if span.end > self.len() {
            panic!("End of requested span is past the end of the source");
        }
        &self.src[span.start..span.end]
    }
    /// Create another independent lexer starting at the current position
    fn fork(&self) -> Lexer {
        Lexer {
            src: self.src,
            cursor: self.cursor,
        }
    }
    pub fn consume(&mut self, tok: &Token) {
        assert!(
            tok.len() <= self.remaining(),
            "trying to consume a token that would be bigger \
             than all remaining text"
        );
        self.cursor += tok.len();
    }

    /// Resolve a position from cursor to position from start of src
    fn resolve_pos(&self, pos: usize) -> usize {
        self.cursor + pos
    }

    /// Create a span from the current position with the given length
    fn span(&self, len: usize) -> Span {
        debug_assert!(self.cursor + len <= self.len());
        Span::new(self.cursor, self.cursor + len)
    }

    /// Create a span from the current position to the end
    fn span_to_end(&self) -> Span {
        Span::new(self.cursor, self.len())
    }

    /// Iterate over the remaining chars of the input
    fn chars(&self) -> std::str::Chars {
        self.src[self.cursor..].chars()
    }

    /// Iterate over the remaining chars of the input, with byte offsets
    /// relative to the cursor
    fn char_indices(&self) -> std::str::CharIndices {
        self.src[self.cursor..].char_indices()
    }
    /// Parse a comment
    fn comment(&self) -> Option<Token> {
        let mut ch_iter = self.char_indices().peekable();
        if let Some((_, '/')) = ch_iter.next() {
            if let Some((_, '*')) = ch_iter.next() {
                loop {
                    match ch_iter.next() {
                        Some((_, '*')) => {
                            if let Some((idx, '/')) = ch_iter.peek() {
                                return Some(Token {
                                    kind: TokenKind::Comment,
                                    span: self.span(*idx + '/'.len_utf8()),
                                });
                            }
                        }
                        None => {
                            return Some(Token::new(
                                TokenKind::UnclosedComment,
                                self.span_to_end(),
                            ));
                        }
                        _ => (),
                    }
                }
            }
        }
        None
    }
    /// Parse whitespace
    fn whitespace(&self) -> Option<Token> {
        let mut ch_iter = self.chars();
        let mut len = match ch_iter.next() {
            Some(ch) if ch.is_ascii_whitespace() => ch.len_utf8(),
            _ => return None,
        };
        loop {
            match ch_iter.next() {
                Some(ch) if ch.is_ascii_whitespace() => len += ch.len_utf8(),
                _ => break,
            }
        }
        Some(Token {
            kind: TokenKind::Whitespace,
            span: self.span(len),
        })
    }
    /// Parse either a single or double quoted string
    fn string(&self) -> Option<Token> {
        let mut ch_iter = self.char_indices().fuse().peekable();
        let delim = match ch_iter.next() {
            Some((_, '"')) => '"',
            Some((_, '\'')) => '\'',
            _ => return None,
        };
        let mut decoded_string = String::new();
        loop {
            match ch_iter.next() {
                Some((end, ch)) if ch == delim => {
                    return Some(Token {
                        kind: TokenKind::String(decoded_string),
                        span: self.span(end + 1), // '"'.len_utf8() == 1
                    });
                }
                Some((end, '\n')) => {
                    return Some(Token {
                        kind: TokenKind::BadString(decoded_string),
                        span: self.span(end + 1), // '\n'.len_utf8() == 1
                    });
                }
                Some((_, '\\')) => match ch_iter.peek() {
                    Some((_, ch)) => {
                        if *ch == '\n' {
                            // An escaped newline is a line continuation:
                            // skip both the backslash and the newline.
                            ch_iter.next().unwrap();
                        } else if let Some(decoded_ch) = unescape(&mut ch_iter) {
                            decoded_string.push(decoded_ch);
                        } else {
                            // Not a hex escape: the escaped char stands for itself.
                            decoded_string.push(ch_iter.next().unwrap().1);
                        }
                    }
                    None => {
                        // Input ends just after the backslash. The spec says
                        // not to add the trailing '\'; there is nothing left
                        // to consume, so the next iteration of the loop
                        // returns a bad string.
                    }
                },
                Some((_, ch)) => decoded_string.push(ch),
                None => {
                    return Some(Token {
                        kind: TokenKind::BadString(decoded_string),
                        span: self.span_to_end(),
                    })
                }
            }
        }
    }
    /*
    fn hash(&self) -> Option<Token> {
        let mut iter = self.char_indices();
        match iter.next() {
            Some((_, '#')) => (),
            None => return None,
        };
        match iter.next() {
            Some((_, '\\')) => {}
            _ => Some(Token {
                kind: TokenKind::Delim('#'),
                span: self.span(1),
            }),
        }
    }
    */
}
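
// A sketch of the input preprocessing referenced in `Lexer::new`. This helper
// is hypothetical, not part of the lexer's API: per
// https://www.w3.org/TR/css-syntax-3/#input-preprocessing, CR, CRLF, and FF
// become LF, and NUL becomes U+FFFD, after which `Lexer::new` accepts the text.
#[allow(dead_code)]
fn preprocess(src: &str) -> String {
    let mut out = String::with_capacity(src.len());
    let mut chars = src.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '\r' => {
                // A CRLF pair collapses to a single LF.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                out.push('\n');
            }
            '\u{c}' => out.push('\n'),
            '\0' => out.push(REPLACEMENT_CHAR),
            other => out.push(other),
        }
    }
    out
}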
impl<'src> Iterator for Lexer<'src> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}
#[derive(Debug, PartialEq)]
#[non_exhaustive]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

impl Token {
    fn new(kind: TokenKind, span: Span) -> Self {
        Token { kind, span }
    }

    pub fn len(&self) -> usize {
        self.span.len()
    }
}

#[derive(Debug, PartialEq)]
pub enum TokenKind {
    Ident,
    Function,
    At,
    Hash,
    String(String),
    BadString(String),
    Url,
    BadUrl,
    Delim(char),
    Number,
    Percentage,
    Dimension,
    Whitespace,
    /// `<!--`
    CDO,
    /// `-->`
    CDC,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    Comment,
    UnclosedComment,
    /// Could not parse the next token
    Error,
}
// Helpers

/// Hex to char (up to 6 characters, e.g. "ffffff").
///
/// For example `"5c" => '\\'`. Returns None if first char is not hex. Consumes the hex values.
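///
/// An illustrative sketch of the intended behavior (this function is private,
/// so the example is not compiled as a doc-test):
/// ```ignore
/// let mut it = "5c rest".char_indices().peekable();
/// assert_eq!(unescape(&mut it), Some('\\'));
/// // A single whitespace char after the escape is consumed as well.
/// assert_eq!(it.next(), Some((3, 'r')));
/// ```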
fn unescape(input: &mut iter::Peekable<impl Iterator<Item = (usize, char)>>) -> Option<char> {
    fn hex_acc(acc: &mut u32, next: char) {
        debug_assert!(*acc & 0xf0000000 == 0); // make sure we don't overflow
        (*acc) = (*acc << 4) + next.to_digit(16).unwrap()
    }

    let (_, ch) = match input.peek() {
        Some((_, ch)) if ch.is_ascii_hexdigit() => input.next().unwrap(),
        _ => return None,
    };
    let mut acc = 0;
    let mut count = 0;
    hex_acc(&mut acc, ch);
    // Here we use that the length of all valid hexdigits in utf8 is 1.
    while count < 5
        && input
            .peek()
            .map(|(_, ch)| ch.is_ascii_hexdigit())
            .unwrap_or(false)
    {
        let ch = input.next().unwrap().1;
        hex_acc(&mut acc, ch);
        count += 1;
    }
    // consume a whitespace char if it's there
    if input
        .peek()
        .map(|(_, ch)| ch.is_ascii_whitespace())
        .unwrap_or(false)
    {
        input.next().unwrap();
    }
    // maybe we could just directly use `char::from_u32(acc).unwrap_or(REPLACEMENT_CHAR)`
    // null, surrogate, or too big
    Some(
        if acc == 0 || (acc >= 0xd800 && acc < 0xe000) || acc >= 0x110000 {
            REPLACEMENT_CHAR
        } else {
            char::from_u32(acc).unwrap() // there should be no other invalid chars.
        },
    )
}
#[cfg(test)]
mod test {
    use super::{Lexer, Token, TokenKind};

    #[test]
    fn comment() {
        let mut input = Lexer::new("/* a valid comment */").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::Comment,
                span,
            }) => {
                assert_eq!(
                    input.resolve_span(span),
                    "/* a valid comment */".to_string()
                );
                assert_eq!(span.len(), 21);
            }
            _ => panic!("not a comment"),
        };
        let mut input = Lexer::new("/* a comment").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::UnclosedComment,
                span,
            }) => {
                assert_eq!(input.resolve_span(span), "/* a comment".to_string());
                assert_eq!(span.len(), 12);
            }
            _ => panic!("not a comment"),
        };
        let mut input = Lexer::new("/!* not a comment").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::Error,
                ..
            }) => {}
            _ => panic!("expected an error token"),
        };
    }
    #[test]
    fn string() {
        let mut input = Lexer::new("\" a vali\\64\\e9 \\\n string \"").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::String(s),
                span,
            }) => {
                assert_eq!(s, " a validé string ".to_string());
                assert_eq!(span.len(), 26);
            }
            _ => panic!("not a string"),
        };
        let mut input = Lexer::new("' a valid string '").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::String(s),
                span,
            }) => {
                assert_eq!(s, " a valid string ".to_string());
                assert_eq!(span.len(), 18);
            }
            _ => panic!("not a string"),
        };
        let mut input = Lexer::new("\" a string").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::BadString(s),
                span,
            }) => {
                assert_eq!(s, " a string".to_string());
                assert_eq!(span.len(), 10);
            }
            _ => panic!("not a string"),
        };
    }
    #[test]
    fn whitespace() {
        let mut input = Lexer::new("\n\t ").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::Whitespace,
                span,
            }) => {
                assert_eq!(input.resolve_span(span), "\n\t ".to_string());
                assert_eq!(span.len(), 3);
            }
            _ => panic!("not whitespace"),
        };
    }
    #[test]
    fn escape() {
        let mut iter = "e9".char_indices().peekable();
        assert_eq!(super::unescape(&mut iter), Some('é'));
    }
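
    // A sketch of one more case worth pinning down (behavior as implemented
    // in `string` above): an unescaped newline inside a string yields
    // `BadString`, with the span running up to and including the newline.
    #[test]
    fn bad_string_newline() {
        let mut input = Lexer::new("' broken\nrest'").unwrap();
        match input.next_token() {
            Some(Token {
                kind: TokenKind::BadString(s),
                span,
            }) => {
                assert_eq!(s, " broken".to_string());
                assert_eq!(span.len(), 9); // up to and including the newline
            }
            _ => panic!("expected a bad string"),
        };
    }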
}