From 0825b357d894d1486deecd9294a15da1a32a3441 Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 11:39:39 +0300 Subject: [PATCH 1/9] librustc_lexer: Add methods "first" and "second" to the "Cursor" --- src/librustc_lexer/src/cursor.rs | 10 ++++++++ src/librustc_lexer/src/lib.rs | 40 ++++++++++++++++---------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/librustc_lexer/src/cursor.rs b/src/librustc_lexer/src/cursor.rs index 73d305c6d4fe2..13d0b07d98bae 100644 --- a/src/librustc_lexer/src/cursor.rs +++ b/src/librustc_lexer/src/cursor.rs @@ -45,6 +45,16 @@ impl<'a> Cursor<'a> { self.chars().nth(n).unwrap_or(EOF_CHAR) } + /// Peeks the next symbol from the input stream without consuming it. + pub(crate) fn first(&self) -> char { + self.nth_char(0) + } + + /// Peeks the second symbol from the input stream without consuming it. + pub(crate) fn second(&self) -> char { + self.nth_char(1) + } + /// Checks if there is nothing more to consume. pub(crate) fn is_eof(&self) -> bool { self.chars.as_str().is_empty() diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index d55ef46d7506e..6e2e0c44e0a42 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -248,7 +248,7 @@ impl Cursor<'_> { let first_char = self.bump().unwrap(); let token_kind = match first_char { // Slash, comment or block comment. - '/' => match self.nth_char(0) { + '/' => match self.first() { '/' => self.line_comment(), '*' => self.block_comment(), _ => Slash, @@ -257,8 +257,8 @@ impl Cursor<'_> { // Whitespace sequence. c if is_whitespace(c) => self.whitespace(), - // Raw string literal or identifier. - 'r' => match (self.nth_char(0), self.nth_char(1)) { + // Raw identifier, raw string literal or identifier. 
+ 'r' => match (self.first(), self.second()) { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { let (n_hashes, started, terminated) = self.raw_double_quoted_string(); @@ -273,7 +273,7 @@ impl Cursor<'_> { }, // Byte literal, byte string literal, raw byte string literal or identifier. - 'b' => match (self.nth_char(0), self.nth_char(1)) { + 'b' => match (self.first(), self.second()) { ('\'', _) => { self.bump(); let terminated = self.single_quoted_string(); @@ -366,7 +366,7 @@ impl Cursor<'_> { } fn line_comment(&mut self) -> TokenKind { - debug_assert!(self.prev() == '/' && self.nth_char(0) == '/'); + debug_assert!(self.prev() == '/' && self.first() == '/'); self.bump(); loop { match self.nth_char(0) { @@ -381,16 +381,16 @@ impl Cursor<'_> { } fn block_comment(&mut self) -> TokenKind { - debug_assert!(self.prev() == '/' && self.nth_char(0) == '*'); + debug_assert!(self.prev() == '/' && self.first() == '*'); self.bump(); let mut depth = 1usize; while let Some(c) = self.bump() { match c { - '/' if self.nth_char(0) == '*' => { + '/' if self.first() == '*' => { self.bump(); depth += 1; } - '*' if self.nth_char(0) == '/' => { + '*' if self.first() == '/' => { self.bump(); depth -= 1; if depth == 0 { @@ -418,8 +418,8 @@ impl Cursor<'_> { fn raw_ident(&mut self) -> TokenKind { debug_assert!( self.prev() == 'r' - && self.nth_char(0) == '#' - && is_id_start(self.nth_char(1)) + && self.first() == '#' + && is_id_start(self.second()) ); self.bump(); self.bump(); @@ -442,7 +442,7 @@ impl Cursor<'_> { let mut base = Base::Decimal; if first_digit == '0' { // Attempt to parse encoding base. 
- let has_digits = match self.nth_char(0) { + let has_digits = match self.first() { 'b' => { base = Base::Binary; self.bump(); @@ -476,20 +476,20 @@ impl Cursor<'_> { self.eat_decimal_digits(); }; - match self.nth_char(0) { + match self.first() { // Don't be greedy if this is actually an // integer literal followed by field/method access or a range pattern // (`0..2` and `12.foo()`) - '.' if self.nth_char(1) != '.' - && !is_id_start(self.nth_char(1)) => + '.' if self.second() != '.' + && !is_id_start(self.second()) => { // might have stuff after the ., and if it does, it needs to start // with a number self.bump(); let mut empty_exponent = false; - if self.nth_char(0).is_digit(10) { + if self.first().is_digit(10) { self.eat_decimal_digits(); - match self.nth_char(0) { + match self.first() { 'e' | 'E' => { self.bump(); empty_exponent = self.float_exponent().is_err() @@ -556,7 +556,7 @@ impl Cursor<'_> { // Parse until either quotes are terminated or error is detected. let mut first = true; loop { - match self.nth_char(0) { + match self.first() { // Probably beginning of the comment, which we don't want to include // to the error report. 
'/' if !first => break, @@ -643,7 +643,7 @@ impl Cursor<'_> { fn eat_decimal_digits(&mut self) -> bool { let mut has_digits = false; loop { - match self.nth_char(0) { + match self.first() { '_' => { self.bump(); } @@ -660,7 +660,7 @@ impl Cursor<'_> { fn eat_hexadecimal_digits(&mut self) -> bool { let mut has_digits = false; loop { - match self.nth_char(0) { + match self.first() { '_' => { self.bump(); } @@ -676,7 +676,7 @@ impl Cursor<'_> { fn float_exponent(&mut self) -> Result<(), ()> { debug_assert!(self.prev() == 'e' || self.prev() == 'E'); - if self.nth_char(0) == '-' || self.nth_char(0) == '+' { + if self.first() == '-' || self.first() == '+' { self.bump(); } if self.eat_decimal_digits() { Ok(()) } else { Err(()) } From 72767a805679b40c1884f1051b67cc43b46fc4e8 Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 11:42:08 +0300 Subject: [PATCH 2/9] librustc_lexer: Introduce "eat_while" and "eat_identifier" methods --- src/librustc_lexer/src/lib.rs | 50 ++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 6e2e0c44e0a42..576b4ff5ed606 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -368,15 +368,7 @@ impl Cursor<'_> { fn line_comment(&mut self) -> TokenKind { debug_assert!(self.prev() == '/' && self.first() == '/'); self.bump(); - loop { - match self.nth_char(0) { - '\n' => break, - EOF_CHAR if self.is_eof() => break, - _ => { - self.bump(); - } - } - } + self.eat_while(|c| c != '\n'); LineComment } @@ -409,9 +401,7 @@ impl Cursor<'_> { fn whitespace(&mut self) -> TokenKind { debug_assert!(is_whitespace(self.prev())); - while is_whitespace(self.nth_char(0)) { - self.bump(); - } + self.eat_while(is_whitespace); Whitespace } @@ -421,19 +411,17 @@ impl Cursor<'_> { && self.first() == '#' && is_id_start(self.second()) ); + // Eat "#" symbol. 
self.bump(); - self.bump(); - while is_id_continue(self.nth_char(0)) { - self.bump(); - } + // Eat the identifier part of RawIdent. + self.eat_identifier(); RawIdent } fn ident(&mut self) -> TokenKind { debug_assert!(is_id_start(self.prev())); - while is_id_continue(self.nth_char(0)) { - self.bump(); - } + // Start is already eaten, eat the rest of identifier. + self.eat_while(is_id_continue); Ident } @@ -682,15 +670,33 @@ impl Cursor<'_> { if self.eat_decimal_digits() { Ok(()) } else { Err(()) } } - // Eats the suffix if it's an identifier. + // Eats the suffix of the literal, e.g. "_u8". fn eat_literal_suffix(&mut self) { - if !is_id_start(self.nth_char(0)) { + self.eat_identifier(); + } + + // Eats the identifier. + fn eat_identifier(&mut self) { + if !is_id_start(self.first()) { return; } self.bump(); - while is_id_continue(self.nth_char(0)) { + self.eat_while(is_id_continue); + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + /// Returns amount of eaten symbols. 
+ fn eat_while<F>(&mut self, mut predicate: F) -> usize + where + F: FnMut(char) -> bool + { + let mut eaten: usize = 0; + while predicate(self.first()) && !self.is_eof() { + eaten += 1; self.bump(); } + + eaten } } From e0c45f7ee7b1c3882d08e9b71e753e3251c2dff1 Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 11:43:47 +0300 Subject: [PATCH 3/9] librustc_lexer: Make "eat_float_exponent" return bool instead of result --- src/librustc_lexer/src/lib.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 576b4ff5ed606..92d99f377136b 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -480,7 +480,7 @@ impl Cursor<'_> { match self.first() { 'e' | 'E' => { self.bump(); - empty_exponent = self.float_exponent().is_err() + empty_exponent = !self.eat_float_exponent(); } _ => (), } @@ -489,7 +489,7 @@ impl Cursor<'_> { } 'e' | 'E' => { self.bump(); - let empty_exponent = self.float_exponent().is_err(); + let empty_exponent = !self.eat_float_exponent(); Float { base, empty_exponent } } _ => Int { base, empty_int: false }, @@ -662,12 +662,14 @@ impl Cursor<'_> { has_digits } - fn float_exponent(&mut self) -> Result<(), ()> { + /// Eats the float exponent. Returns true if at least one digit was met, + /// and returns false otherwise. + fn eat_float_exponent(&mut self) -> bool { debug_assert!(self.prev() == 'e' || self.prev() == 'E'); if self.first() == '-' || self.first() == '+' { self.bump(); } - if self.eat_decimal_digits() { Ok(()) } else { Err(()) } + self.eat_decimal_digits() } // Eats the suffix of the literal, e.g. "_u8". 
From 649a5247f58a2cdba58b63e48403b55cf7bf8bdb Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 12:54:23 +0300 Subject: [PATCH 4/9] librustc_lexer: Simplify "single_quoted_string" method --- src/librustc_lexer/src/lib.rs | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 92d99f377136b..2edf3bd78efb9 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -537,26 +537,30 @@ impl Cursor<'_> { fn single_quoted_string(&mut self) -> bool { debug_assert!(self.prev() == '\''); - // Parse `'''` as a single char literal. - if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' { + // Check if it's a one-symbol literal. + if self.second() == '\'' && self.first() != '\\' { self.bump(); + self.bump(); + return true; } + + // Literal has more than one symbol. + // Parse until either quotes are terminated or error is detected. - let mut first = true; loop { match self.first() { - // Probably beginning of the comment, which we don't want to include - // to the error report. - '/' if !first => break, - // Newline without following '\'' means unclosed quote, stop parsing. - '\n' if self.nth_char(1) != '\'' => break, - // End of file, stop parsing. - EOF_CHAR if self.is_eof() => break, // Quotes are terminated, finish parsing. '\'' => { self.bump(); return true; } + // Probably beginning of the comment, which we don't want to include + // to the error report. + '/' => break, + // Newline without following '\'' means unclosed quote, stop parsing. + '\n' if self.second() != '\'' => break, + // End of file, stop parsing. + EOF_CHAR if self.is_eof() => break, // Escaped slash is considered one character, so bump twice. '\\' => { self.bump(); @@ -567,8 +571,8 @@ impl Cursor<'_> { self.bump(); } } - first = false; } + // String was not terminated. 
false } From d6f722d79cf01f8305288235d82e13a6863711f5 Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 12:55:05 +0300 Subject: [PATCH 5/9] librustc_lexer: Simplify "double_quoted_string" method --- src/librustc_lexer/src/lib.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 2edf3bd78efb9..6653ddc307226 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -580,20 +580,20 @@ impl Cursor<'_> { /// if string is terminated. fn double_quoted_string(&mut self) -> bool { debug_assert!(self.prev() == '"'); - loop { - match self.nth_char(0) { + while let Some(c) = self.bump() { + match c { '"' => { - self.bump(); return true; } - EOF_CHAR if self.is_eof() => return false, - '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => { + '\\' if self.first() == '\\' || self.first() == '"' => { + // Bump again to skip escaped character. self.bump(); } _ => (), } - self.bump(); } + // End of file reached. + false } /// Eats the double-quoted string and returns a tuple of From 6e350bd9990c600ed68ebac13ff8a89cade98fb2 Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 12:55:50 +0300 Subject: [PATCH 6/9] librustc_lexer: Simplify "raw_double_quoted_string" method --- src/librustc_lexer/src/lib.rs | 59 ++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 6653ddc307226..f66e6641abc24 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -600,36 +600,45 @@ impl Cursor<'_> { /// (amount of the '#' symbols, raw string started, raw string terminated) fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) { debug_assert!(self.prev() == 'r'); + let mut started: bool = false; + let mut finished: bool = false; + // Count opening '#' symbols. 
- let n_hashes = { - let mut acc: usize = 0; - loop { - match self.bump() { - Some('#') => acc += 1, - Some('"') => break acc, - None | Some(_) => return (acc, false, false), - } + let n_hashes = self.eat_while(|c| c == '#'); + + // Check that string is started. + match self.bump() { + Some('"') => started = true, + _ => return (n_hashes, started, finished), + } + + // Skip the string contents and on each '#' character met, check if this is + // a raw string termination. + while !finished { + self.eat_while(|c| c != '"'); + + if self.is_eof() { + return (n_hashes, started, finished); } - }; - // Skip the string itself and check that amount of closing '#' - // symbols is equal to the amount of opening ones. - loop { - match self.bump() { - Some('"') => { - let mut acc = n_hashes; - while self.nth_char(0) == '#' && acc > 0 { - self.bump(); - acc -= 1; - } - if acc == 0 { - return (n_hashes, true, true); - } + // Eat closing double quote. + self.bump(); + + // Check that amount of closing '#' symbols + // is equal to the amount of opening ones. 
+ let mut hashes_left = n_hashes; + let is_closing_hash = |c| { + if c == '#' && hashes_left != 0 { + hashes_left -= 1; + true + } else { + false } - Some(_) => (), - None => return (n_hashes, true, false), - } + }; + finished = self.eat_while(is_closing_hash) == n_hashes; } + + (n_hashes, started, finished) } fn eat_decimal_digits(&mut self) -> bool { From ecd26739d45837ee21fe0e2941f957086fbf6a47 Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 12:56:49 +0300 Subject: [PATCH 7/9] librustc_lexer: Simplify "lifetime_or_char" method --- src/librustc_lexer/src/lib.rs | 67 +++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index f66e6641abc24..655619bc72212 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -498,41 +498,48 @@ impl Cursor<'_> { fn lifetime_or_char(&mut self) -> TokenKind { debug_assert!(self.prev() == '\''); - let mut starts_with_number = false; - - // Check if the first symbol after '\'' is a valid identifier - // character or a number (not a digit followed by '\''). - if (is_id_start(self.nth_char(0)) - || self.nth_char(0).is_digit(10) && { - starts_with_number = true; - true - }) - && self.nth_char(1) != '\'' - { - self.bump(); - // Skip the identifier. - while is_id_continue(self.nth_char(0)) { - self.bump(); - } + let can_be_a_lifetime = if self.second() == '\'' { + // It's surely not a lifetime. + false + } else { + // If the first symbol is valid for identifier, it can be a lifetime. + // Also check if it's a number for a better error reporting (so '0 will + // be reported as invalid lifetime and not as unterminated char literal). 
+ is_id_start(self.first()) || self.first().is_digit(10) + }; - return if self.nth_char(0) == '\'' { - self.bump(); - let kind = Char { terminated: true }; - Literal { kind, suffix_start: self.len_consumed() } - } else { - Lifetime { starts_with_number } - }; + if !can_be_a_lifetime { + let terminated = self.single_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Char { terminated }; + return Literal { kind, suffix_start }; } - // This is not a lifetime (checked above), parse a char literal. - let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); - if terminated { - self.eat_literal_suffix(); + // Either a lifetime or a character literal with + // length greater than 1. + + let starts_with_number = self.first().is_digit(10); + + // Skip the literal contents. + // First symbol can be a number (which isn't a valid identifier start), + // so skip it without any checks. + self.bump(); + self.eat_while(is_id_continue); + + // Check if after skipping literal contents we've met a closing + // single quote (which means that user attempted to create a + // string with single quotes). 
+ if self.first() == '\'' { + self.bump(); + let kind = Char { terminated: true }; + return Literal { kind, suffix_start: self.len_consumed() }; } - let kind = Char { terminated }; - return Literal { kind, suffix_start }; + + return Lifetime { starts_with_number }; } fn single_quoted_string(&mut self) -> bool { From e8b8d2a7257b83698f095f51e2f5127176fc8816 Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 12:57:12 +0300 Subject: [PATCH 8/9] librustc_lexer: Reorder imports in lib.rs --- src/librustc_lexer/src/lib.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 655619bc72212..c50808adec1dd 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -18,6 +18,8 @@ mod cursor; pub mod unescape; use crate::cursor::{Cursor, EOF_CHAR}; +use self::TokenKind::*; +use self::LiteralKind::*; /// Parsed token. /// It doesn't contain information about data that has been parsed, @@ -116,7 +118,6 @@ pub enum TokenKind { /// Unknown token, not expected by the lexer, e.g. "№" Unknown, } -use self::TokenKind::*; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum LiteralKind { @@ -137,7 +138,6 @@ pub enum LiteralKind { /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" RawByteStr { n_hashes: usize, started: bool, terminated: bool }, } -use self::LiteralKind::*; /// Base of numeric literal encoding according to its prefix. #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -241,7 +241,6 @@ pub fn is_id_continue(c: char) -> bool { || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) } - impl Cursor<'_> { /// Parses a token from the input string. 
fn advance_token(&mut self) -> Token { From 31735b02c95510f2e236ebd773b02e84ee6e1a5b Mon Sep 17 00:00:00 2001 From: Igor Aleksanov Date: Sun, 3 Nov 2019 12:57:59 +0300 Subject: [PATCH 9/9] librustc_lexer: Make nth_char method private --- src/librustc_lexer/src/cursor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/librustc_lexer/src/cursor.rs b/src/librustc_lexer/src/cursor.rs index 13d0b07d98bae..ed0911379c4b3 100644 --- a/src/librustc_lexer/src/cursor.rs +++ b/src/librustc_lexer/src/cursor.rs @@ -41,7 +41,7 @@ impl<'a> Cursor<'a> { /// If requested position doesn't exist, `EOF_CHAR` is returned. /// However, getting `EOF_CHAR` doesn't always mean actual end of file, /// it should be checked with `is_eof` method. - pub(crate) fn nth_char(&self, n: usize) -> char { + fn nth_char(&self, n: usize) -> char { self.chars().nth(n).unwrap_or(EOF_CHAR) }