Skip to content

Commit b4acb11

Browse files
committed
Auto merge of #76170 - matklad:notrivia, r=petrochenkov
Remove trivia tokens r? @ghost
2 parents e36e4bd + fabd8a6 commit b4acb11

File tree

7 files changed

+71
-111
lines changed

7 files changed

+71
-111
lines changed

compiler/rustc_ast/src/token.rs

+3-15
Original file line number | Diff line number | Diff line change
@@ -251,17 +251,6 @@ pub enum TokenKind {
251251
/// similarly to symbols in string literal tokens.
252252
DocComment(CommentKind, ast::AttrStyle, Symbol),
253253

254-
// Junk. These carry no data because we don't really care about the data
255-
// they *would* carry, and don't really want to allocate a new ident for
256-
// them. Instead, users could extract that from the associated span.
257-
/// Whitespace.
258-
Whitespace,
259-
/// A comment.
260-
Comment,
261-
Shebang(Symbol),
262-
/// A completely invalid token which should be skipped.
263-
Unknown(Symbol),
264-
265254
Eof,
266255
}
267256

@@ -331,7 +320,7 @@ impl Token {
331320

332321
/// Some token that will be thrown away later.
333322
pub fn dummy() -> Self {
334-
Token::new(TokenKind::Whitespace, DUMMY_SP)
323+
Token::new(TokenKind::Question, DUMMY_SP)
335324
}
336325

337326
/// Recovers a `Token` from an `Ident`. This creates a raw identifier if necessary.
@@ -360,7 +349,7 @@ impl Token {
360349
pub fn is_op(&self) -> bool {
361350
match self.kind {
362351
OpenDelim(..) | CloseDelim(..) | Literal(..) | DocComment(..) | Ident(..)
363-
| Lifetime(..) | Interpolated(..) | Whitespace | Comment | Shebang(..) | Eof => false,
352+
| Lifetime(..) | Interpolated(..) | Eof => false,
364353
_ => true,
365354
}
366355
}
@@ -676,8 +665,7 @@ impl Token {
676665
Le | EqEq | Ne | Ge | AndAnd | OrOr | Tilde | BinOpEq(..) | At | DotDotDot
677666
| DotDotEq | Comma | Semi | ModSep | RArrow | LArrow | FatArrow | Pound | Dollar
678667
| Question | OpenDelim(..) | CloseDelim(..) | Literal(..) | Ident(..)
679-
| Lifetime(..) | Interpolated(..) | DocComment(..) | Whitespace | Comment
680-
| Shebang(..) | Unknown(..) | Eof => return None,
668+
| Lifetime(..) | Interpolated(..) | DocComment(..) | Eof => return None,
681669
};
682670

683671
Some(Token::new(kind, self.span.to(joint.span)))

compiler/rustc_ast_pretty/src/pprust.rs

-4
Original file line number | Diff line number | Diff line change
@@ -289,10 +289,6 @@ fn token_kind_to_string_ext(tok: &TokenKind, convert_dollar_crate: Option<Span>)
289289
doc_comment_to_string(comment_kind, attr_style, data)
290290
}
291291
token::Eof => "<eof>".to_string(),
292-
token::Whitespace => " ".to_string(),
293-
token::Comment => "/* */".to_string(),
294-
token::Shebang(s) => format!("/* shebang: {}*/", s),
295-
token::Unknown(s) => s.to_string(),
296292

297293
token::Interpolated(ref nt) => nonterminal_to_string(nt),
298294
}

compiler/rustc_expand/src/proc_macro_server.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -189,7 +189,7 @@ impl FromInternal<(TreeAndJoint, &'_ ParseSess, &'_ mut Vec<Self>)>
189189
}
190190

191191
OpenDelim(..) | CloseDelim(..) => unreachable!(),
192-
Whitespace | Comment | Shebang(..) | Unknown(..) | Eof => unreachable!(),
192+
Eof => unreachable!(),
193193
}
194194
}
195195
}

compiler/rustc_parse/src/lexer/mod.rs

+53-60
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
use rustc_ast::ast::AttrStyle;
22
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
3+
use rustc_ast::tokenstream::IsJoint;
34
use rustc_data_structures::sync::Lrc;
45
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
56
use rustc_lexer::Base;
@@ -65,42 +66,46 @@ impl<'a> StringReader<'a> {
6566
self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
6667
}
6768

68-
/// Returns the next token, including trivia like whitespace or comments.
69-
fn next_token(&mut self) -> Token {
69+
/// Returns the next token, and info about preceding whitespace, if any.
70+
fn next_token(&mut self) -> (IsJoint, Token) {
71+
let mut is_joint = IsJoint::Joint;
72+
73+
// Skip `#!` at the start of the file
7074
let start_src_index = self.src_index(self.pos);
7175
let text: &str = &self.src[start_src_index..self.end_src_index];
72-
73-
if text.is_empty() {
74-
let span = self.mk_sp(self.pos, self.pos);
75-
return Token::new(token::Eof, span);
76+
let is_beginning_of_file = self.pos == self.start_pos;
77+
if is_beginning_of_file {
78+
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
79+
self.pos = self.pos + BytePos::from_usize(shebang_len);
80+
is_joint = IsJoint::NonJoint;
81+
}
7682
}
7783

78-
{
79-
let is_beginning_of_file = self.pos == self.start_pos;
80-
if is_beginning_of_file {
81-
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
82-
let start = self.pos;
83-
self.pos = self.pos + BytePos::from_usize(shebang_len);
84+
// Skip trivial (whitespace & comments) tokens
85+
loop {
86+
let start_src_index = self.src_index(self.pos);
87+
let text: &str = &self.src[start_src_index..self.end_src_index];
8488

85-
let sym = self.symbol_from(start + BytePos::from_usize("#!".len()));
86-
let kind = token::Shebang(sym);
87-
88-
let span = self.mk_sp(start, self.pos);
89-
return Token::new(kind, span);
90-
}
89+
if text.is_empty() {
90+
let span = self.mk_sp(self.pos, self.pos);
91+
return (is_joint, Token::new(token::Eof, span));
9192
}
92-
}
9393

94-
let token = rustc_lexer::first_token(text);
94+
let token = rustc_lexer::first_token(text);
9595

96-
let start = self.pos;
97-
self.pos = self.pos + BytePos::from_usize(token.len);
96+
let start = self.pos;
97+
self.pos = self.pos + BytePos::from_usize(token.len);
9898

99-
debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start));
99+
debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
100100

101-
let kind = self.cook_lexer_token(token.kind, start);
102-
let span = self.mk_sp(start, self.pos);
103-
Token::new(kind, span)
101+
match self.cook_lexer_token(token.kind, start) {
102+
Some(kind) => {
103+
let span = self.mk_sp(start, self.pos);
104+
return (is_joint, Token::new(kind, span));
105+
}
106+
None => is_joint = IsJoint::NonJoint,
107+
}
108+
}
104109
}
105110

106111
/// Report a fatal lexical error with a given span.
@@ -140,19 +145,16 @@ impl<'a> StringReader<'a> {
140145
/// Turns simple `rustc_lexer::TokenKind` enum into a rich
141146
/// `librustc_ast::TokenKind`. This turns strings into interned
142147
/// symbols and runs additional validation.
143-
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
144-
match token {
148+
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
149+
Some(match token {
145150
rustc_lexer::TokenKind::LineComment { doc_style } => {
146-
match doc_style {
147-
Some(doc_style) => {
148-
// Opening delimiter of the length 3 is not included into the symbol.
149-
let content_start = start + BytePos(3);
150-
let content = self.str_from(content_start);
151+
// Skip non-doc comments
152+
let doc_style = doc_style?;
151153

152-
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
153-
}
154-
None => token::Comment,
155-
}
154+
// Opening delimiter of the length 3 is not included into the symbol.
155+
let content_start = start + BytePos(3);
156+
let content = self.str_from(content_start);
157+
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
156158
}
157159
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
158160
if !terminated {
@@ -171,20 +173,18 @@ impl<'a> StringReader<'a> {
171173
.emit();
172174
FatalError.raise();
173175
}
174-
match doc_style {
175-
Some(doc_style) => {
176-
// Opening delimiter of the length 3 and closing delimiter of the length 2
177-
// are not included into the symbol.
178-
let content_start = start + BytePos(3);
179-
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
180-
let content = self.str_from_to(content_start, content_end);
181-
182-
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
183-
}
184-
None => token::Comment,
185-
}
176+
177+
// Skip non-doc comments
178+
let doc_style = doc_style?;
179+
180+
// Opening delimiter of the length 3 and closing delimiter of the length 2
181+
// are not included into the symbol.
182+
let content_start = start + BytePos(3);
183+
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
184+
let content = self.str_from_to(content_start, content_end);
185+
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
186186
}
187-
rustc_lexer::TokenKind::Whitespace => token::Whitespace,
187+
rustc_lexer::TokenKind::Whitespace => return None,
188188
rustc_lexer::TokenKind::Ident | rustc_lexer::TokenKind::RawIdent => {
189189
let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent;
190190
let mut ident_start = start;
@@ -282,12 +282,11 @@ impl<'a> StringReader<'a> {
282282
// this should be inside `rustc_lexer`. However, we should first remove compound
283283
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
284284
// as there will be less overall work to do this way.
285-
let token = unicode_chars::check_for_substitution(self, start, c, &mut err)
286-
.unwrap_or_else(|| token::Unknown(self.symbol_from(start)));
285+
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
287286
err.emit();
288-
token
287+
token?
289288
}
290-
}
289+
})
291290
}
292291

293292
fn cook_doc_comment(
@@ -450,12 +449,6 @@ impl<'a> StringReader<'a> {
450449
self.str_from_to(start, self.pos)
451450
}
452451

453-
/// Creates a Symbol from a given offset to the current offset.
454-
fn symbol_from(&self, start: BytePos) -> Symbol {
455-
debug!("taking an ident from {:?} to {:?}", start, self.pos);
456-
Symbol::intern(self.str_from(start))
457-
}
458-
459452
/// As symbol_from, with an explicit endpoint.
460453
fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
461454
debug!("taking an ident from {:?} to {:?}", start, end);

compiler/rustc_parse/src/lexer/tokentrees.rs

+13-23
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@ impl<'a> StringReader<'a> {
1616
let mut tt_reader = TokenTreesReader {
1717
string_reader: self,
1818
token: Token::dummy(),
19-
joint_to_prev: Joint,
2019
open_braces: Vec::new(),
2120
unmatched_braces: Vec::new(),
2221
matching_delim_spans: Vec::new(),
@@ -32,7 +31,6 @@ impl<'a> StringReader<'a> {
3231
struct TokenTreesReader<'a> {
3332
string_reader: StringReader<'a>,
3433
token: Token,
35-
joint_to_prev: IsJoint,
3634
/// Stack of open delimiters and their spans. Used for error message.
3735
open_braces: Vec<(token::DelimToken, Span)>,
3836
unmatched_braces: Vec<UnmatchedBrace>,
@@ -53,7 +51,7 @@ impl<'a> TokenTreesReader<'a> {
5351
fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> {
5452
let mut buf = TokenStreamBuilder::default();
5553

56-
self.real_token();
54+
self.bump();
5755
while self.token != token::Eof {
5856
buf.push(self.parse_token_tree()?);
5957
}
@@ -126,7 +124,7 @@ impl<'a> TokenTreesReader<'a> {
126124

127125
// Parse the open delimiter.
128126
self.open_braces.push((delim, self.token.span));
129-
self.real_token();
127+
self.bump();
130128

131129
// Parse the token trees within the delimiters.
132130
// We stop at any delimiter so we can try to recover if the user
@@ -171,7 +169,7 @@ impl<'a> TokenTreesReader<'a> {
171169
));
172170
}
173171
// Parse the closing delimiter.
174-
self.real_token();
172+
self.bump();
175173
}
176174
// Incorrect delimiter.
177175
token::CloseDelim(other) => {
@@ -217,7 +215,7 @@ impl<'a> TokenTreesReader<'a> {
217215
// bar(baz(
218216
// } // Incorrect delimiter but matches the earlier `{`
219217
if !self.open_braces.iter().any(|&(b, _)| b == other) {
220-
self.real_token();
218+
self.bump();
221219
}
222220
}
223221
token::Eof => {
@@ -264,27 +262,19 @@ impl<'a> TokenTreesReader<'a> {
264262
}
265263
_ => {
266264
let tt = TokenTree::Token(self.token.take());
267-
self.real_token();
268-
let is_joint = self.joint_to_prev == Joint && self.token.is_op();
269-
Ok((tt, if is_joint { Joint } else { NonJoint }))
265+
let mut is_joint = self.bump();
266+
if !self.token.is_op() {
267+
is_joint = NonJoint;
268+
}
269+
Ok((tt, is_joint))
270270
}
271271
}
272272
}
273273

274-
fn real_token(&mut self) {
275-
self.joint_to_prev = Joint;
276-
loop {
277-
let token = self.string_reader.next_token();
278-
match token.kind {
279-
token::Whitespace | token::Comment | token::Shebang(_) | token::Unknown(_) => {
280-
self.joint_to_prev = NonJoint;
281-
}
282-
_ => {
283-
self.token = token;
284-
return;
285-
}
286-
}
287-
}
274+
fn bump(&mut self) -> IsJoint {
275+
let (joint_to_prev, token) = self.string_reader.next_token();
276+
self.token = token;
277+
joint_to_prev
288278
}
289279
}
290280

compiler/rustc_parse/src/lexer/unicode_chars.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -303,7 +303,7 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
303303
// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
304304
// fancier error recovery to it, as there will be less overall work to do this way.
305305
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
306-
(' ', "Space", Some(token::Whitespace)),
306+
(' ', "Space", None),
307307
('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
308308
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
309309
(',', "Comma", Some(token::Comma)),

compiler/rustc_parse/src/lib.rs

-7
Original file line number | Diff line number | Diff line change
@@ -348,9 +348,6 @@ pub fn tokenstream_probably_equal_for_proc_macro(
348348
| token::CloseDelim(DelimToken::NoDelim)
349349
// The pretty printer collapses many semicolons into one.
350350
| token::Semi
351-
// The pretty printer collapses whitespace arbitrarily and can
352-
// introduce whitespace from `NoDelim`.
353-
| token::Whitespace
354351
// The pretty printer can turn `$crate` into `::crate_name`
355352
| token::ModSep = token.kind {
356353
return false;
@@ -506,8 +503,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool {
506503
| (&Pound, &Pound)
507504
| (&Dollar, &Dollar)
508505
| (&Question, &Question)
509-
| (&Whitespace, &Whitespace)
510-
| (&Comment, &Comment)
511506
| (&Eof, &Eof) => true,
512507

513508
(&BinOp(a), &BinOp(b)) | (&BinOpEq(a), &BinOpEq(b)) => a == b,
@@ -516,8 +511,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool {
516511

517512
(&DocComment(a1, a2, a3), &DocComment(b1, b2, b3)) => a1 == b1 && a2 == b2 && a3 == b3,
518513

519-
(&Shebang(a), &Shebang(b)) => a == b,
520-
521514
(&Literal(a), &Literal(b)) => a == b,
522515

523516
(&Lifetime(a), &Lifetime(b)) => a == b,

0 commit comments

Comments (0)