|
25 | 25 | #![deny(unstable_features)] |
26 | 26 | // tidy-alphabetical-end |
27 | 27 |
|
28 | | -mod cursor; |
29 | | - |
30 | 28 | #[cfg(test)] |
31 | 29 | mod tests; |
32 | 30 |
|
| 31 | +use std::str::Chars; |
| 32 | + |
33 | 33 | use LiteralKind::*; |
34 | 34 | use TokenKind::*; |
35 | | -use cursor::EOF_CHAR; |
36 | | -pub use cursor::{Cursor, FrontmatterAllowed}; |
37 | 35 | pub use unicode_ident::UNICODE_VERSION; |
38 | 36 | use unicode_properties::UnicodeEmoji; |
39 | 37 |
|
@@ -407,7 +405,129 @@ pub fn is_ident(string: &str) -> bool { |
407 | 405 | } |
408 | 406 | } |
409 | 407 |
|
410 | | -impl Cursor<'_> { |
/// Whether frontmatter is permitted for a given [`Cursor`].
///
/// The value is chosen by the caller of [`Cursor::new`] and stored on the
/// cursor unchanged.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FrontmatterAllowed {
    /// Frontmatter is permitted.
    Yes,
    /// Frontmatter is not permitted.
    No,
}
| 412 | + |
| 413 | +/// Peekable iterator over a char sequence. |
| 414 | +/// |
| 415 | +/// Next characters can be peeked via `first` method, |
| 416 | +/// and position can be shifted forward via `bump` method. |
| 417 | +pub struct Cursor<'a> { |
| 418 | + len_remaining: usize, |
| 419 | + /// Iterator over chars. Slightly faster than a &str. |
| 420 | + chars: Chars<'a>, |
| 421 | + pub(crate) frontmatter_allowed: FrontmatterAllowed, |
| 422 | + #[cfg(debug_assertions)] |
| 423 | + prev: char, |
| 424 | +} |
| 425 | + |
/// Sentinel returned by the peeking methods when there is no character at the
/// requested position. A NUL can also occur in real input, so seeing this
/// value does not by itself prove the end of input was reached.
const EOF_CHAR: char = '\0';
| 427 | + |
| 428 | +impl<'a> Cursor<'a> { |
| 429 | + pub fn new(input: &'a str, frontmatter_allowed: FrontmatterAllowed) -> Cursor<'a> { |
| 430 | + Cursor { |
| 431 | + len_remaining: input.len(), |
| 432 | + chars: input.chars(), |
| 433 | + frontmatter_allowed, |
| 434 | + #[cfg(debug_assertions)] |
| 435 | + prev: EOF_CHAR, |
| 436 | + } |
| 437 | + } |
| 438 | + |
| 439 | + pub fn as_str(&self) -> &'a str { |
| 440 | + self.chars.as_str() |
| 441 | + } |
| 442 | + |
| 443 | + /// Returns the last eaten symbol (or `'\0'` in release builds). |
| 444 | + /// (For debug assertions only.) |
| 445 | + pub(crate) fn prev(&self) -> char { |
| 446 | + #[cfg(debug_assertions)] |
| 447 | + { |
| 448 | + self.prev |
| 449 | + } |
| 450 | + |
| 451 | + #[cfg(not(debug_assertions))] |
| 452 | + { |
| 453 | + EOF_CHAR |
| 454 | + } |
| 455 | + } |
| 456 | + |
| 457 | + /// Peeks the next symbol from the input stream without consuming it. |
| 458 | + /// If requested position doesn't exist, `EOF_CHAR` is returned. |
| 459 | + /// However, getting `EOF_CHAR` doesn't always mean actual end of file, |
| 460 | + /// it should be checked with `is_eof` method. |
| 461 | + pub fn first(&self) -> char { |
| 462 | + // `.next()` optimizes better than `.nth(0)` |
| 463 | + self.chars.clone().next().unwrap_or(EOF_CHAR) |
| 464 | + } |
| 465 | + |
| 466 | + /// Peeks the second symbol from the input stream without consuming it. |
| 467 | + pub(crate) fn second(&self) -> char { |
| 468 | + // `.next()` optimizes better than `.nth(1)` |
| 469 | + let mut iter = self.chars.clone(); |
| 470 | + iter.next(); |
| 471 | + iter.next().unwrap_or(EOF_CHAR) |
| 472 | + } |
| 473 | + |
| 474 | + /// Peeks the third symbol from the input stream without consuming it. |
| 475 | + pub fn third(&self) -> char { |
| 476 | + // `.next()` optimizes better than `.nth(2)` |
| 477 | + let mut iter = self.chars.clone(); |
| 478 | + iter.next(); |
| 479 | + iter.next(); |
| 480 | + iter.next().unwrap_or(EOF_CHAR) |
| 481 | + } |
| 482 | + |
| 483 | + /// Checks if there is nothing more to consume. |
| 484 | + pub(crate) fn is_eof(&self) -> bool { |
| 485 | + self.chars.as_str().is_empty() |
| 486 | + } |
| 487 | + |
| 488 | + /// Returns amount of already consumed symbols. |
| 489 | + pub(crate) fn pos_within_token(&self) -> u32 { |
| 490 | + (self.len_remaining - self.chars.as_str().len()) as u32 |
| 491 | + } |
| 492 | + |
| 493 | + /// Resets the number of bytes consumed to 0. |
| 494 | + pub(crate) fn reset_pos_within_token(&mut self) { |
| 495 | + self.len_remaining = self.chars.as_str().len(); |
| 496 | + } |
| 497 | + |
| 498 | + /// Moves to the next character. |
| 499 | + pub(crate) fn bump(&mut self) -> Option<char> { |
| 500 | + let c = self.chars.next()?; |
| 501 | + |
| 502 | + #[cfg(debug_assertions)] |
| 503 | + { |
| 504 | + self.prev = c; |
| 505 | + } |
| 506 | + |
| 507 | + Some(c) |
| 508 | + } |
| 509 | + |
| 510 | + /// Moves to a substring by a number of bytes. |
| 511 | + pub(crate) fn bump_bytes(&mut self, n: usize) { |
| 512 | + self.chars = self.as_str()[n..].chars(); |
| 513 | + } |
| 514 | + |
| 515 | + /// Eats symbols while predicate returns true or until the end of file is reached. |
| 516 | + pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { |
| 517 | + // It was tried making optimized version of this for eg. line comments, but |
| 518 | + // LLVM can inline all of this and compile it down to fast iteration over bytes. |
| 519 | + while predicate(self.first()) && !self.is_eof() { |
| 520 | + self.bump(); |
| 521 | + } |
| 522 | + } |
| 523 | + |
| 524 | + pub(crate) fn eat_until(&mut self, byte: u8) { |
| 525 | + self.chars = match memchr::memchr(byte, self.as_str().as_bytes()) { |
| 526 | + Some(index) => self.as_str()[index..].chars(), |
| 527 | + None => "".chars(), |
| 528 | + } |
| 529 | + } |
| 530 | + |
411 | 531 | /// Parses a token from the input string. |
412 | 532 | pub fn advance_token(&mut self) -> Token { |
413 | 533 | let Some(first_char) = self.bump() else { |
|
0 commit comments