Skip to content

Commit b4acb11

Browse files
committed
Auto merge of #76170 - matklad:notrivia, r=petrochenkov
Remove trivia tokens r? @ghost
2 parents e36e4bd + fabd8a6 commit b4acb11

File tree

7 files changed

+71
-111
lines changed

7 files changed

+71
-111
lines changed

compiler/rustc_ast/src/token.rs

+3-15
Original file line number | Diff line number | Diff line change
@@ -251,17 +251,6 @@ pub enum TokenKind {
251251
/// similarly to symbols in string literal tokens.
252252
DocComment(CommentKind, ast::AttrStyle, Symbol),
253253

254-
// Junk. These carry no data because we don't really care about the data
255-
// they *would* carry, and don't really want to allocate a new ident for
256-
// them. Instead, users could extract that from the associated span.
257-
/// Whitespace.
258-
Whitespace,
259-
/// A comment.
260-
Comment,
261-
Shebang(Symbol),
262-
/// A completely invalid token which should be skipped.
263-
Unknown(Symbol),
264-
265254
Eof,
266255
}
267256

@@ -331,7 +320,7 @@ impl Token {
331320

332321
/// Some token that will be thrown away later.
333322
pub fn dummy() -> Self {
334-
Token::new(TokenKind::Whitespace, DUMMY_SP)
323+
Token::new(TokenKind::Question, DUMMY_SP)
335324
}
336325

337326
/// Recovers a `Token` from an `Ident`. This creates a raw identifier if necessary.
@@ -360,7 +349,7 @@ impl Token {
360349
pub fn is_op(&self) -> bool {
361350
match self.kind {
362351
OpenDelim(..) | CloseDelim(..) | Literal(..) | DocComment(..) | Ident(..)
363-
| Lifetime(..) | Interpolated(..) | Whitespace | Comment | Shebang(..) | Eof => false,
352+
| Lifetime(..) | Interpolated(..) | Eof => false,
364353
_ => true,
365354
}
366355
}
@@ -676,8 +665,7 @@ impl Token {
676665
Le | EqEq | Ne | Ge | AndAnd | OrOr | Tilde | BinOpEq(..) | At | DotDotDot
677666
| DotDotEq | Comma | Semi | ModSep | RArrow | LArrow | FatArrow | Pound | Dollar
678667
| Question | OpenDelim(..) | CloseDelim(..) | Literal(..) | Ident(..)
679-
| Lifetime(..) | Interpolated(..) | DocComment(..) | Whitespace | Comment
680-
| Shebang(..) | Unknown(..) | Eof => return None,
668+
| Lifetime(..) | Interpolated(..) | DocComment(..) | Eof => return None,
681669
};
682670

683671
Some(Token::new(kind, self.span.to(joint.span)))

compiler/rustc_ast_pretty/src/pprust.rs

-4
Original file line number | Diff line number | Diff line change
@@ -289,10 +289,6 @@ fn token_kind_to_string_ext(tok: &TokenKind, convert_dollar_crate: Option<Span>)
289289
doc_comment_to_string(comment_kind, attr_style, data)
290290
}
291291
token::Eof => "<eof>".to_string(),
292-
token::Whitespace => " ".to_string(),
293-
token::Comment => "/* */".to_string(),
294-
token::Shebang(s) => format!("/* shebang: {}*/", s),
295-
token::Unknown(s) => s.to_string(),
296292

297293
token::Interpolated(ref nt) => nonterminal_to_string(nt),
298294
}

compiler/rustc_expand/src/proc_macro_server.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -189,7 +189,7 @@ impl FromInternal<(TreeAndJoint, &'_ ParseSess, &'_ mut Vec<Self>)>
189189
}
190190

191191
OpenDelim(..) | CloseDelim(..) => unreachable!(),
192-
Whitespace | Comment | Shebang(..) | Unknown(..) | Eof => unreachable!(),
192+
Eof => unreachable!(),
193193
}
194194
}
195195
}

compiler/rustc_parse/src/lexer/mod.rs

+53-60
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
use rustc_ast::ast::AttrStyle;
22
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
3+
use rustc_ast::tokenstream::IsJoint;
34
use rustc_data_structures::sync::Lrc;
45
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
56
use rustc_lexer::Base;
@@ -65,42 +66,46 @@ impl<'a> StringReader<'a> {
6566
self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
6667
}
6768

68-
/// Returns the next token, including trivia like whitespace or comments.
69-
fn next_token(&mut self) -> Token {
69+
/// Returns the next token, and info about preceding whitespace, if any.
70+
fn next_token(&mut self) -> (IsJoint, Token) {
71+
let mut is_joint = IsJoint::Joint;
72+
73+
// Skip `#!` at the start of the file
7074
let start_src_index = self.src_index(self.pos);
7175
let text: &str = &self.src[start_src_index..self.end_src_index];
72-
73-
if text.is_empty() {
74-
let span = self.mk_sp(self.pos, self.pos);
75-
return Token::new(token::Eof, span);
76+
let is_beginning_of_file = self.pos == self.start_pos;
77+
if is_beginning_of_file {
78+
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
79+
self.pos = self.pos + BytePos::from_usize(shebang_len);
80+
is_joint = IsJoint::NonJoint;
81+
}
7682
}
7783

78-
{
79-
let is_beginning_of_file = self.pos == self.start_pos;
80-
if is_beginning_of_file {
81-
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
82-
let start = self.pos;
83-
self.pos = self.pos + BytePos::from_usize(shebang_len);
84+
// Skip trivial (whitespace & comments) tokens
85+
loop {
86+
let start_src_index = self.src_index(self.pos);
87+
let text: &str = &self.src[start_src_index..self.end_src_index];
8488

85-
let sym = self.symbol_from(start + BytePos::from_usize("#!".len()));
86-
let kind = token::Shebang(sym);
87-
88-
let span = self.mk_sp(start, self.pos);
89-
return Token::new(kind, span);
90-
}
89+
if text.is_empty() {
90+
let span = self.mk_sp(self.pos, self.pos);
91+
return (is_joint, Token::new(token::Eof, span));
9192
}
92-
}
9393

94-
let token = rustc_lexer::first_token(text);
94+
let token = rustc_lexer::first_token(text);
9595

96-
let start = self.pos;
97-
self.pos = self.pos + BytePos::from_usize(token.len);
96+
let start = self.pos;
97+
self.pos = self.pos + BytePos::from_usize(token.len);
9898

99-
debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start));
99+
debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
100100

101-
let kind = self.cook_lexer_token(token.kind, start);
102-
let span = self.mk_sp(start, self.pos);
103-
Token::new(kind, span)
101+
match self.cook_lexer_token(token.kind, start) {
102+
Some(kind) => {
103+
let span = self.mk_sp(start, self.pos);
104+
return (is_joint, Token::new(kind, span));
105+
}
106+
None => is_joint = IsJoint::NonJoint,
107+
}
108+
}
104109
}
105110

106111
/// Report a fatal lexical error with a given span.
@@ -140,19 +145,16 @@ impl<'a> StringReader<'a> {
140145
/// Turns simple `rustc_lexer::TokenKind` enum into a rich
141146
/// `librustc_ast::TokenKind`. This turns strings into interned
142147
/// symbols and runs additional validation.
143-
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
144-
match token {
148+
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
149+
Some(match token {
145150
rustc_lexer::TokenKind::LineComment { doc_style } => {
146-
match doc_style {
147-
Some(doc_style) => {
148-
// Opening delimiter of the length 3 is not included into the symbol.
149-
let content_start = start + BytePos(3);
150-
let content = self.str_from(content_start);
151+
// Skip non-doc comments
152+
let doc_style = doc_style?;
151153

152-
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
153-
}
154-
None => token::Comment,
155-
}
154+
// Opening delimiter of the length 3 is not included into the symbol.
155+
let content_start = start + BytePos(3);
156+
let content = self.str_from(content_start);
157+
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
156158
}
157159
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
158160
if !terminated {
@@ -171,20 +173,18 @@ impl<'a> StringReader<'a> {
171173
.emit();
172174
FatalError.raise();
173175
}
174-
match doc_style {
175-
Some(doc_style) => {
176-
// Opening delimiter of the length 3 and closing delimiter of the length 2
177-
// are not included into the symbol.
178-
let content_start = start + BytePos(3);
179-
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
180-
let content = self.str_from_to(content_start, content_end);
181-
182-
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
183-
}
184-
None => token::Comment,
185-
}
176+
177+
// Skip non-doc comments
178+
let doc_style = doc_style?;
179+
180+
// Opening delimiter of the length 3 and closing delimiter of the length 2
181+
// are not included into the symbol.
182+
let content_start = start + BytePos(3);
183+
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
184+
let content = self.str_from_to(content_start, content_end);
185+
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
186186
}
187-
rustc_lexer::TokenKind::Whitespace => token::Whitespace,
187+
rustc_lexer::TokenKind::Whitespace => return None,
188188
rustc_lexer::TokenKind::Ident | rustc_lexer::TokenKind::RawIdent => {
189189
let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent;
190190
let mut ident_start = start;
@@ -282,12 +282,11 @@ impl<'a> StringReader<'a> {
282282
// this should be inside `rustc_lexer`. However, we should first remove compound
283283
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
284284
// as there will be less overall work to do this way.
285-
let token = unicode_chars::check_for_substitution(self, start, c, &mut err)
286-
.unwrap_or_else(|| token::Unknown(self.symbol_from(start)));
285+
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
287286
err.emit();
288-
token
287+
token?
289288
}
290-
}
289+
})
291290
}
292291

293292
fn cook_doc_comment(
@@ -450,12 +449,6 @@ impl<'a> StringReader<'a> {
450449
self.str_from_to(start, self.pos)
451450
}
452451

453-
/// Creates a Symbol from a given offset to the current offset.
454-
fn symbol_from(&self, start: BytePos) -> Symbol {
455-
debug!("taking an ident from {:?} to {:?}", start, self.pos);
456-
Symbol::intern(self.str_from(start))
457-
}
458-
459452
/// As symbol_from, with an explicit endpoint.
460453
fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
461454
debug!("taking an ident from {:?} to {:?}", start, end);

compiler/rustc_parse/src/lexer/tokentrees.rs

+13-23
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@ impl<'a> StringReader<'a> {
1616
let mut tt_reader = TokenTreesReader {
1717
string_reader: self,
1818
token: Token::dummy(),
19-
joint_to_prev: Joint,
2019
open_braces: Vec::new(),
2120
unmatched_braces: Vec::new(),
2221
matching_delim_spans: Vec::new(),
@@ -32,7 +31,6 @@ impl<'a> StringReader<'a> {
3231
struct TokenTreesReader<'a> {
3332
string_reader: StringReader<'a>,
3433
token: Token,
35-
joint_to_prev: IsJoint,
3634
/// Stack of open delimiters and their spans. Used for error message.
3735
open_braces: Vec<(token::DelimToken, Span)>,
3836
unmatched_braces: Vec<UnmatchedBrace>,
@@ -53,7 +51,7 @@ impl<'a> TokenTreesReader<'a> {
5351
fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> {
5452
let mut buf = TokenStreamBuilder::default();
5553

56-
self.real_token();
54+
self.bump();
5755
while self.token != token::Eof {
5856
buf.push(self.parse_token_tree()?);
5957
}
@@ -126,7 +124,7 @@ impl<'a> TokenTreesReader<'a> {
126124

127125
// Parse the open delimiter.
128126
self.open_braces.push((delim, self.token.span));
129-
self.real_token();
127+
self.bump();
130128

131129
// Parse the token trees within the delimiters.
132130
// We stop at any delimiter so we can try to recover if the user
@@ -171,7 +169,7 @@ impl<'a> TokenTreesReader<'a> {
171169
));
172170
}
173171
// Parse the closing delimiter.
174-
self.real_token();
172+
self.bump();
175173
}
176174
// Incorrect delimiter.
177175
token::CloseDelim(other) => {
@@ -217,7 +215,7 @@ impl<'a> TokenTreesReader<'a> {
217215
// bar(baz(
218216
// } // Incorrect delimiter but matches the earlier `{`
219217
if !self.open_braces.iter().any(|&(b, _)| b == other) {
220-
self.real_token();
218+
self.bump();
221219
}
222220
}
223221
token::Eof => {
@@ -264,27 +262,19 @@ impl<'a> TokenTreesReader<'a> {
264262
}
265263
_ => {
266264
let tt = TokenTree::Token(self.token.take());
267-
self.real_token();
268-
let is_joint = self.joint_to_prev == Joint && self.token.is_op();
269-
Ok((tt, if is_joint { Joint } else { NonJoint }))
265+
let mut is_joint = self.bump();
266+
if !self.token.is_op() {
267+
is_joint = NonJoint;
268+
}
269+
Ok((tt, is_joint))
270270
}
271271
}
272272
}
273273

274-
fn real_token(&mut self) {
275-
self.joint_to_prev = Joint;
276-
loop {
277-
let token = self.string_reader.next_token();
278-
match token.kind {
279-
token::Whitespace | token::Comment | token::Shebang(_) | token::Unknown(_) => {
280-
self.joint_to_prev = NonJoint;
281-
}
282-
_ => {
283-
self.token = token;
284-
return;
285-
}
286-
}
287-
}
274+
fn bump(&mut self) -> IsJoint {
275+
let (joint_to_prev, token) = self.string_reader.next_token();
276+
self.token = token;
277+
joint_to_prev
288278
}
289279
}
290280

compiler/rustc_parse/src/lexer/unicode_chars.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -303,7 +303,7 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
303303
// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
304304
// fancier error recovery to it, as there will be less overall work to do this way.
305305
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
306-
(' ', "Space", Some(token::Whitespace)),
306+
(' ', "Space", None),
307307
('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
308308
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
309309
(',', "Comma", Some(token::Comma)),

compiler/rustc_parse/src/lib.rs

-7
Original file line number | Diff line number | Diff line change
@@ -348,9 +348,6 @@ pub fn tokenstream_probably_equal_for_proc_macro(
348348
| token::CloseDelim(DelimToken::NoDelim)
349349
// The pretty printer collapses many semicolons into one.
350350
| token::Semi
351-
// The pretty printer collapses whitespace arbitrarily and can
352-
// introduce whitespace from `NoDelim`.
353-
| token::Whitespace
354351
// The pretty printer can turn `$crate` into `::crate_name`
355352
| token::ModSep = token.kind {
356353
return false;
@@ -506,8 +503,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool {
506503
| (&Pound, &Pound)
507504
| (&Dollar, &Dollar)
508505
| (&Question, &Question)
509-
| (&Whitespace, &Whitespace)
510-
| (&Comment, &Comment)
511506
| (&Eof, &Eof) => true,
512507

513508
(&BinOp(a), &BinOp(b)) | (&BinOpEq(a), &BinOpEq(b)) => a == b,
@@ -516,8 +511,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool {
516511

517512
(&DocComment(a1, a2, a3), &DocComment(b1, b2, b3)) => a1 == b1 && a2 == b2 && a3 == b3,
518513

519-
(&Shebang(a), &Shebang(b)) => a == b,
520-
521514
(&Literal(a), &Literal(b)) => a == b,
522515

523516
(&Lifetime(a), &Lifetime(b)) => a == b,

0 commit comments

Comments (0)