Skip to content

Commit 6a5ad7f

Browse files
committed
Make lexer::unescape use iterators inside instead of callbacks.
1 parent 8239a37 commit 6a5ad7f

File tree

1 file changed

+121
-76
lines changed

1 file changed

+121
-76
lines changed

compiler/rustc_lexer/src/unescape.rs

+121-76
Original file line number | Diff line number | Diff line change
@@ -95,13 +95,14 @@ where
9595
let res = unescape_char_or_byte(&mut chars, mode);
9696
callback(0..(src.len() - chars.as_str().len()), res);
9797
}
98-
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
99-
RawStr | RawByteStr => check_raw_common(src, mode, callback),
100-
RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
101-
if let Ok('\0') = result {
102-
result = Err(EscapeError::NulInCStr);
98+
Str | ByteStr => Unescape::new(src, |chars| scan_escape(chars, mode))
99+
.for_each(|(res, r)| callback(r, res)),
100+
RawStr | RawByteStr => check_raw_common(src, mode).for_each(|(res, r)| callback(r, res)),
101+
RawCStr => check_raw_common(src, mode).for_each(|(mut res, r)| {
102+
if let Ok('\0') = res {
103+
res = Err(EscapeError::NulInCStr);
103104
}
104-
callback(r, result)
105+
callback(r, res);
105106
}),
106107
CStr => unreachable!(),
107108
}
@@ -147,12 +148,13 @@ where
147148
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
148149
{
149150
match mode {
150-
CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
151-
if let Ok(MixedUnit::Char('\0')) = result {
152-
result = Err(EscapeError::NulInCStr);
151+
CStr => Unescape::new(src, |chars| scan_escape(chars, mode)).for_each(|(mut res, r)| {
152+
if let Ok(MixedUnit::Char('\0')) = res {
153+
res = Err(EscapeError::NulInCStr);
153154
}
154-
callback(r, result)
155+
callback(r, res);
155156
}),
157+
156158
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
157159
}
158160
}
@@ -301,7 +303,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
301303
}
302304

303305
break std::char::from_u32(value).ok_or({
304-
if value > 0x10FFFF {
306+
if value > char::MAX as u32 {
305307
EscapeError::OutOfRangeUnicodeEscape
306308
} else {
307309
EscapeError::LoneSurrogateUnicodeEscape
@@ -340,94 +342,137 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
340342
Ok(res)
341343
}
342344

343-
/// Takes a contents of a string literal (without quotes) and produces a
344-
/// sequence of escaped characters or errors.
345-
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
346-
where
347-
F: FnMut(Range<usize>, Result<T, EscapeError>),
345+
/// Iterator that removes string continuations and interprets other backslash-escapes
346+
struct Unescape<'s, T: From<char> + From<u8>, F: FnMut(&mut Chars<'_>) -> Result<T, EscapeError>> {
347+
state: State,
348+
chars: Chars<'s>,
349+
pos: usize,
350+
scan_escape: F,
351+
}
352+
353+
/// States for `Unescape` iterator state machine
354+
enum State {
355+
Start,
356+
UnskippedWhitespace(usize),
357+
}
358+
359+
impl<T: From<char> + From<u8>, F: FnMut(&mut Chars<'_>) -> Result<T, EscapeError>> Iterator
360+
for Unescape<'_, T, F>
348361
{
349-
let mut chars = src.chars();
350-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
362+
type Item = (Result<T, EscapeError>, Range<usize>);
351363

352-
// The `start` and `end` computation here is complicated because
353-
// `skip_ascii_whitespace` makes us to skip over chars without counting
354-
// them in the range computation.
355-
while let Some(c) = chars.next() {
356-
let start = src.len() - chars.as_str().len() - c.len_utf8();
357-
let res = match c {
358-
'\\' => {
359-
match chars.clone().next() {
360-
Some('\n') => {
361-
// Rust language specification requires us to skip whitespaces
362-
// if unescaped '\' character is followed by '\n'.
363-
// For details see [Rust language reference]
364+
fn next(&mut self) -> Option<Self::Item> {
365+
match self.state {
366+
State::Start => self.start(),
367+
State::UnskippedWhitespace(end) => self.unskipped_whitespace(end),
368+
}
369+
}
370+
}
371+
372+
impl<'s, T: From<char> + From<u8>, F: FnMut(&mut Chars<'_>) -> Result<T, EscapeError>>
373+
Unescape<'s, T, F>
374+
{
375+
pub(crate) fn new(s: &'s str, scan_escape: F) -> Self {
376+
Self { state: State::Start, chars: s.chars(), pos: 0, scan_escape }
377+
}
378+
379+
fn start(&mut self) -> Option<<Self as Iterator>::Item> {
380+
if let Some(c) = self.chars.next() {
381+
match c {
382+
'\\' => {
383+
// peek
384+
if Some('\n') == self.chars.clone().next() {
385+
assert_eq!(Some('\n'), self.chars.next());
386+
// skip whitespace for backslash newline, see [Rust language reference]
364387
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
365-
skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
366-
callback(range, Err(err))
367-
});
368-
continue;
388+
self.skip_whitespace()
389+
} else {
390+
let mut chars_for_escape = self.chars.clone();
391+
let res = (self.scan_escape)(&mut chars);
392+
let used = self.chars.as_str().len() - chars_for_escape.as_str().len();
393+
let range = self.pos..self.pos + used + 1;
394+
self.pos += used + 1;
395+
Some((res, range))
369396
}
370-
_ => scan_escape::<T>(&mut chars, mode),
397+
}
398+
c => {
399+
let res = match c {
400+
'"' => Err(EscapeError::EscapeOnlyChar),
401+
'\r' => Err(EscapeError::BareCarriageReturn),
402+
c => Ok(c),
403+
};
404+
let end = self.pos + c.len_utf8();
405+
let range = self.pos..end;
406+
self.pos = end;
407+
Some((res.map(T::from), range))
371408
}
372409
}
373-
'"' => Err(EscapeError::EscapeOnlyChar),
374-
'\r' => Err(EscapeError::BareCarriageReturn),
375-
_ => ascii_check(c, allow_unicode_chars).map(T::from),
376-
};
377-
let end = src.len() - chars.as_str().len();
378-
callback(start..end, res);
410+
} else {
411+
None
412+
}
379413
}
380-
}
381414

382-
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
383-
where
384-
F: FnMut(Range<usize>, EscapeError),
385-
{
386-
let tail = chars.as_str();
387-
let first_non_space = tail
388-
.bytes()
389-
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
390-
.unwrap_or(tail.len());
391-
if tail[1..first_non_space].contains('\n') {
392-
// The +1 accounts for the escaping slash.
393-
let end = start + first_non_space + 1;
394-
callback(start..end, EscapeError::MultipleSkippedLinesWarning);
415+
/// Skip ASCII whitespace, except for the formfeed character
416+
/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
417+
/// Warns on unescaped newline and following non-ASCII whitespace.
418+
fn skip_whitespace(&mut self) -> Option<<Self as Iterator>::Item> {
419+
// the escaping slash and newline characters add 2 bytes
420+
let mut end = self.pos + 2;
421+
let mut contains_nl = false;
422+
// manual next_if loop
423+
while let Some(c) = self.chars.clone().next() {
424+
if c.is_ascii_whitespace() && c != '\x0c' {
425+
let _ = self.chars.next();
426+
end += 1;
427+
contains_nl = contains_nl || c == '\n';
428+
} else {
429+
break;
430+
}
431+
}
432+
if contains_nl {
433+
self.state = State::UnskippedWhitespace(end);
434+
Some((Err(EscapeError::MultipleSkippedLinesWarning), self.pos..end))
435+
} else {
436+
self.unskipped_whitespace(end)
437+
}
395438
}
396-
let tail = &tail[first_non_space..];
397-
if let Some(c) = tail.chars().next() {
398-
if c.is_whitespace() {
399-
// For error reporting, we would like the span to contain the character that was not
400-
// skipped. The +1 is necessary to account for the leading \ that started the escape.
401-
let end = start + first_non_space + c.len_utf8() + 1;
402-
callback(start..end, EscapeError::UnskippedWhitespaceWarning);
439+
440+
/// Helper for `skip_whitespace`
441+
fn unskipped_whitespace(&mut self, end: usize) -> Option<<Self as Iterator>::Item> {
442+
self.state = State::Start;
443+
// peek
444+
if let Some(c) = self.chars.clone().next() {
445+
let range = self.pos..end + c.len_utf8();
446+
self.pos = end;
447+
if c.is_whitespace() {
448+
// for error reporting, include the character that was not skipped in the span
449+
Some((Err(EscapeError::UnskippedWhitespaceWarning), range))
450+
} else {
451+
self.start()
452+
}
453+
} else {
454+
None
403455
}
404456
}
405-
*chars = tail.chars();
406457
}
407458

408459
/// Takes a contents of a string literal (without quotes) and produces a
409460
/// sequence of characters or errors.
410461
/// NOTE: Raw strings do not perform any explicit character escaping, here we
411462
/// only produce errors on bare CR.
412-
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
413-
where
414-
F: FnMut(Range<usize>, Result<char, EscapeError>),
415-
{
416-
let mut chars = src.chars();
463+
fn check_raw_common(
464+
src: &str,
465+
mode: Mode,
466+
) -> impl Iterator<Item = (Result<char, EscapeError>, Range<usize>)> + '_ {
417467
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
418468

419-
// The `start` and `end` computation here matches the one in
420-
// `unescape_non_raw_common` for consistency, even though this function
421-
// doesn't have to worry about skipping any chars.
422-
while let Some(c) = chars.next() {
423-
let start = src.len() - chars.as_str().len() - c.len_utf8();
469+
src.char_indices().map(move |(pos, c)| {
424470
let res = match c {
425471
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
426472
_ => ascii_check(c, allow_unicode_chars),
427473
};
428-
let end = src.len() - chars.as_str().len();
429-
callback(start..end, res);
430-
}
474+
(res, pos..pos + c.len_utf8())
475+
})
431476
}
432477

433478
#[inline]

0 commit comments

Comments
 (0)