Skip to content

Commit 46d0ca0

Browse files
committed
Auto merge of #60261 - matklad:one-escape, r=petrochenkov
introduce unescape module. A WIP PR to gauge early feedback. Currently, we deal with escape sequences twice: once when we [lex](https://github.com/rust-lang/rust/blob/112f7e9ac564e2cfcfc13d599c8376a219fde1bc/src/libsyntax/parse/lexer/mod.rs#L928-L1065) a string, and a second time when we [unescape](https://github.com/rust-lang/rust/blob/112f7e9ac564e2cfcfc13d599c8376a219fde1bc/src/libsyntax/parse/mod.rs#L313-L366) literals. Note that we also produce different sets of diagnostics in these two cases. This PR aims to remove this duplication by introducing a new `unescape` module as a single source of truth for character escaping rules. I think this would be a useful cleanup by itself, but I also need this for #59706. In the current state, the PR has an `unescape` module which fully (modulo bugs) deals with string and char literals. I am quite happy about the state of this module. What this PR doesn't have yet: * [x] handling of byte and byte string literals (should be simple to add) * [x] good diagnostics * [x] actual removal of code from lexer (giant `scan_char_or_byte` should go away completely) * [x] performance check * [x] general cleanup of the new code. Diagnostics will be the most labor-consuming bit here, but they are mostly a question of just correctly adjusting spans to sub-tokens. The current setup for diagnostics is that `unescape` produces a plain old `enum` with various problems, and they are rendered into `Handler` separately. This bit is not actually required (it is possible to just pass the `Handler` in), but I like the separation between diagnostics and logic this approach imposes, and such separation should again be useful for #59706. cc @eddyb, @petrochenkov
2 parents 40bd145 + 1835cbe commit 46d0ca0

28 files changed

+1048
-785
lines changed

src/librustc_errors/diagnostic_builder.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ impl<'a> DiagnosticBuilder<'a> {
184184
) -> &mut Self);
185185
forward!(pub fn warn(&mut self, msg: &str) -> &mut Self);
186186
forward!(pub fn span_warn<S: Into<MultiSpan>>(&mut self, sp: S, msg: &str) -> &mut Self);
187-
forward!(pub fn help(&mut self , msg: &str) -> &mut Self);
187+
forward!(pub fn help(&mut self, msg: &str) -> &mut Self);
188188
forward!(pub fn span_help<S: Into<MultiSpan>>(&mut self,
189189
sp: S,
190190
msg: &str,

src/libsyntax/ext/base.rs

+1
Original file line numberDiff line numberDiff line change
@@ -998,6 +998,7 @@ pub fn expr_to_spanned_string<'a>(
998998
Err(match expr.node {
999999
ast::ExprKind::Lit(ref l) => match l.node {
10001000
ast::LitKind::Str(s, style) => return Ok(respan(expr.span, (s, style))),
1001+
ast::LitKind::Err(_) => None,
10011002
_ => Some(cx.struct_span_err(l.span, err_msg))
10021003
},
10031004
ast::ExprKind::Err => None,

src/libsyntax/parse/lexer/mod.rs

+142-443
Large diffs are not rendered by default.

src/libsyntax/parse/mod.rs

+47-222
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ use log::debug;
1818

1919
use rustc_data_structures::fx::FxHashSet;
2020
use std::borrow::Cow;
21-
use std::iter;
2221
use std::path::{Path, PathBuf};
2322
use std::str;
2423

@@ -34,6 +33,11 @@ pub mod diagnostics;
3433

3534
pub mod classify;
3635

36+
pub(crate) mod unescape;
37+
use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte};
38+
39+
pub(crate) mod unescape_error_reporting;
40+
3741
/// Info about a parsing session.
3842
pub struct ParseSess {
3943
pub span_diagnostic: Handler,
@@ -307,133 +311,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
307311
Parser::new(sess, stream, None, true, false)
308312
}
309313

310-
/// Parses a string representing a character literal into its final form.
311-
/// Rather than just accepting/rejecting a given literal, unescapes it as
312-
/// well. Can take any slice prefixed by a character escape. Returns the
313-
/// character and the number of characters consumed.
314-
fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
315-
use std::char;
316-
317-
// Handle non-escaped chars first.
318-
if lit.as_bytes()[0] != b'\\' {
319-
// If the first byte isn't '\\' it might part of a multi-byte char, so
320-
// get the char with chars().
321-
let c = lit.chars().next().unwrap();
322-
return (c, 1);
323-
}
324-
325-
// Handle escaped chars.
326-
match lit.as_bytes()[1] as char {
327-
'"' => ('"', 2),
328-
'n' => ('\n', 2),
329-
'r' => ('\r', 2),
330-
't' => ('\t', 2),
331-
'\\' => ('\\', 2),
332-
'\'' => ('\'', 2),
333-
'0' => ('\0', 2),
334-
'x' => {
335-
let v = u32::from_str_radix(&lit[2..4], 16).unwrap();
336-
let c = char::from_u32(v).unwrap();
337-
(c, 4)
338-
}
339-
'u' => {
340-
assert_eq!(lit.as_bytes()[2], b'{');
341-
let idx = lit.find('}').unwrap();
342-
343-
// All digits and '_' are ascii, so treat each byte as a char.
344-
let mut v: u32 = 0;
345-
for c in lit[3..idx].bytes() {
346-
let c = char::from(c);
347-
if c != '_' {
348-
let x = c.to_digit(16).unwrap();
349-
v = v.checked_mul(16).unwrap().checked_add(x).unwrap();
350-
}
351-
}
352-
let c = char::from_u32(v).unwrap_or_else(|| {
353-
if let Some((span, diag)) = diag {
354-
let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
355-
if v > 0x10FFFF {
356-
diag.help("unicode escape must be at most 10FFFF").emit();
357-
} else {
358-
diag.help("unicode escape must not be a surrogate").emit();
359-
}
360-
}
361-
'\u{FFFD}'
362-
});
363-
(c, (idx + 1) as isize)
364-
}
365-
_ => panic!("lexer should have rejected a bad character escape {}", lit)
366-
}
367-
}
368-
369-
/// Parses a string representing a string literal into its final form. Does unescaping.
370-
fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
371-
debug!("str_lit: given {}", lit.escape_default());
372-
let mut res = String::with_capacity(lit.len());
373-
374-
let error = |i| format!("lexer should have rejected {} at {}", lit, i);
375-
376-
/// Eat everything up to a non-whitespace.
377-
fn eat<'a>(it: &mut iter::Peekable<str::CharIndices<'a>>) {
378-
loop {
379-
match it.peek().map(|x| x.1) {
380-
Some(' ') | Some('\n') | Some('\r') | Some('\t') => {
381-
it.next();
382-
},
383-
_ => { break; }
384-
}
385-
}
386-
}
387-
388-
let mut chars = lit.char_indices().peekable();
389-
while let Some((i, c)) = chars.next() {
390-
match c {
391-
'\\' => {
392-
let ch = chars.peek().unwrap_or_else(|| {
393-
panic!("{}", error(i))
394-
}).1;
395-
396-
if ch == '\n' {
397-
eat(&mut chars);
398-
} else if ch == '\r' {
399-
chars.next();
400-
let ch = chars.peek().unwrap_or_else(|| {
401-
panic!("{}", error(i))
402-
}).1;
403-
404-
if ch != '\n' {
405-
panic!("lexer accepted bare CR");
406-
}
407-
eat(&mut chars);
408-
} else {
409-
// otherwise, a normal escape
410-
let (c, n) = char_lit(&lit[i..], diag);
411-
for _ in 0..n - 1 { // we don't need to move past the first \
412-
chars.next();
413-
}
414-
res.push(c);
415-
}
416-
},
417-
'\r' => {
418-
let ch = chars.peek().unwrap_or_else(|| {
419-
panic!("{}", error(i))
420-
}).1;
421-
422-
if ch != '\n' {
423-
panic!("lexer accepted bare CR");
424-
}
425-
chars.next();
426-
res.push('\n');
427-
}
428-
c => res.push(c),
429-
}
430-
}
431-
432-
res.shrink_to_fit(); // probably not going to do anything, unless there was an escape.
433-
debug!("parse_str_lit: returning {}", res);
434-
res
435-
}
436-
437314
/// Parses a string representing a raw string literal into its final form. The
438315
/// only operation this does is convert embedded CRLF into a single LF.
439316
fn raw_str_lit(lit: &str) -> String {
@@ -476,9 +353,21 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
476353
use ast::LitKind;
477354

478355
match lit {
479-
token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
480-
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),
481-
token::Err(i) => (true, Some(LitKind::Err(i))),
356+
token::Byte(i) => {
357+
let lit_kind = match unescape_byte(&i.as_str()) {
358+
Ok(c) => LitKind::Byte(c),
359+
Err(_) => LitKind::Err(i),
360+
};
361+
(true, Some(lit_kind))
362+
},
363+
token::Char(i) => {
364+
let lit_kind = match unescape_char(&i.as_str()) {
365+
Ok(c) => LitKind::Char(c),
366+
Err(_) => LitKind::Err(i),
367+
};
368+
(true, Some(lit_kind))
369+
},
370+
token::Err(i) => (true, Some(LitKind::Err(i))),
482371

483372
// There are some valid suffixes for integer and float literals,
484373
// so all the handling is done internally.
@@ -490,10 +379,22 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
490379
// reuse the symbol from the Token. Otherwise, we must generate a
491380
// new symbol because the string in the LitKind is different to the
492381
// string in the Token.
382+
let mut has_error = false;
493383
let s = &sym.as_str();
494384
if s.as_bytes().iter().any(|&c| c == b'\\' || c == b'\r') {
495-
sym = Symbol::intern(&str_lit(s, diag));
385+
let mut buf = String::with_capacity(s.len());
386+
unescape_str(s, &mut |_, unescaped_char| {
387+
match unescaped_char {
388+
Ok(c) => buf.push(c),
389+
Err(_) => has_error = true,
390+
}
391+
});
392+
if has_error {
393+
return (true, Some(LitKind::Err(sym)));
394+
}
395+
sym = Symbol::intern(&buf)
496396
}
397+
497398
(true, Some(LitKind::Str(sym, ast::StrStyle::Cooked)))
498399
}
499400
token::StrRaw(mut sym, n) => {
@@ -505,7 +406,20 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
505406
(true, Some(LitKind::Str(sym, ast::StrStyle::Raw(n))))
506407
}
507408
token::ByteStr(i) => {
508-
(true, Some(LitKind::ByteStr(byte_str_lit(&i.as_str()))))
409+
let s = &i.as_str();
410+
let mut buf = Vec::with_capacity(s.len());
411+
let mut has_error = false;
412+
unescape_byte_str(s, &mut |_, unescaped_byte| {
413+
match unescaped_byte {
414+
Ok(c) => buf.push(c),
415+
Err(_) => has_error = true,
416+
}
417+
});
418+
if has_error {
419+
return (true, Some(LitKind::Err(i)));
420+
}
421+
buf.shrink_to_fit();
422+
(true, Some(LitKind::ByteStr(Lrc::new(buf))))
509423
}
510424
token::ByteStrRaw(i, _) => {
511425
(true, Some(LitKind::ByteStr(Lrc::new(i.to_string().into_bytes()))))
@@ -560,95 +474,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
560474
filtered_float_lit(Symbol::intern(s), suffix, diag)
561475
}
562476

563-
/// Parses a string representing a byte literal into its final form. Similar to `char_lit`.
564-
fn byte_lit(lit: &str) -> (u8, usize) {
565-
let err = |i| format!("lexer accepted invalid byte literal {} step {}", lit, i);
566-
567-
if lit.len() == 1 {
568-
(lit.as_bytes()[0], 1)
569-
} else {
570-
assert_eq!(lit.as_bytes()[0], b'\\', "{}", err(0));
571-
let b = match lit.as_bytes()[1] {
572-
b'"' => b'"',
573-
b'n' => b'\n',
574-
b'r' => b'\r',
575-
b't' => b'\t',
576-
b'\\' => b'\\',
577-
b'\'' => b'\'',
578-
b'0' => b'\0',
579-
_ => {
580-
match u64::from_str_radix(&lit[2..4], 16).ok() {
581-
Some(c) =>
582-
if c > 0xFF {
583-
panic!(err(2))
584-
} else {
585-
return (c as u8, 4)
586-
},
587-
None => panic!(err(3))
588-
}
589-
}
590-
};
591-
(b, 2)
592-
}
593-
}
594-
595-
fn byte_str_lit(lit: &str) -> Lrc<Vec<u8>> {
596-
let mut res = Vec::with_capacity(lit.len());
597-
598-
let error = |i| panic!("lexer should have rejected {} at {}", lit, i);
599-
600-
/// Eat everything up to a non-whitespace.
601-
fn eat<I: Iterator<Item=(usize, u8)>>(it: &mut iter::Peekable<I>) {
602-
loop {
603-
match it.peek().map(|x| x.1) {
604-
Some(b' ') | Some(b'\n') | Some(b'\r') | Some(b'\t') => {
605-
it.next();
606-
},
607-
_ => { break; }
608-
}
609-
}
610-
}
611-
612-
// byte string literals *must* be ASCII, but the escapes don't have to be
613-
let mut chars = lit.bytes().enumerate().peekable();
614-
loop {
615-
match chars.next() {
616-
Some((i, b'\\')) => {
617-
match chars.peek().unwrap_or_else(|| error(i)).1 {
618-
b'\n' => eat(&mut chars),
619-
b'\r' => {
620-
chars.next();
621-
if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
622-
panic!("lexer accepted bare CR");
623-
}
624-
eat(&mut chars);
625-
}
626-
_ => {
627-
// otherwise, a normal escape
628-
let (c, n) = byte_lit(&lit[i..]);
629-
// we don't need to move past the first \
630-
for _ in 0..n - 1 {
631-
chars.next();
632-
}
633-
res.push(c);
634-
}
635-
}
636-
},
637-
Some((i, b'\r')) => {
638-
if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
639-
panic!("lexer accepted bare CR");
640-
}
641-
chars.next();
642-
res.push(b'\n');
643-
}
644-
Some((_, c)) => res.push(c),
645-
None => break,
646-
}
647-
}
648-
649-
Lrc::new(res)
650-
}
651-
652477
fn integer_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
653478
-> Option<ast::LitKind> {
654479
// s can only be ascii, byte indexing is fine

0 commit comments

Comments
 (0)