Skip to content

Commit ccd8ed6

Browse files
committed
Make lexer::unescape use iterators inside instead of callbacks.
1 parent 8239a37 commit ccd8ed6

File tree

1 file changed

+127
-78
lines changed

1 file changed

+127
-78
lines changed

compiler/rustc_lexer/src/unescape.rs

+127-78
Original file line numberDiff line numberDiff line change
@@ -95,13 +95,14 @@ where
9595
let res = unescape_char_or_byte(&mut chars, mode);
9696
callback(0..(src.len() - chars.as_str().len()), res);
9797
}
98-
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
99-
RawStr | RawByteStr => check_raw_common(src, mode, callback),
100-
RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
101-
if let Ok('\0') = result {
102-
result = Err(EscapeError::NulInCStr);
103-
}
104-
callback(r, result)
98+
Str | ByteStr => Unescape::new(src, |chars| scan_escape(chars, mode))
99+
.for_each(|(res, r)| callback(r, res)),
100+
RawStr | RawByteStr => check_raw_common(src, mode).for_each(|(res, r)| callback(r, res)),
101+
RawCStr => check_raw_common(src, mode).for_each(|(res, r)| {
102+
callback(r, match res {
103+
Ok('\0') => Err(EscapeError::NulInCStr),
104+
_ => res,
105+
});
105106
}),
106107
CStr => unreachable!(),
107108
}
@@ -147,12 +148,13 @@ where
147148
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
148149
{
149150
match mode {
150-
CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
151-
if let Ok(MixedUnit::Char('\0')) = result {
152-
result = Err(EscapeError::NulInCStr);
153-
}
154-
callback(r, result)
151+
CStr => Unescape::new(src, |chars| scan_escape(chars, mode)).for_each(|(res, r)| {
152+
callback(r, match res {
153+
Ok(MixedUnit::Char('\0')) => Err(EscapeError::NulInCStr),
154+
_ => res,
155+
});
155156
}),
157+
156158
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
157159
}
158160
}
@@ -301,7 +303,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
301303
}
302304

303305
break std::char::from_u32(value).ok_or({
304-
if value > 0x10FFFF {
306+
if value > char::MAX as u32 {
305307
EscapeError::OutOfRangeUnicodeEscape
306308
} else {
307309
EscapeError::LoneSurrogateUnicodeEscape
@@ -340,94 +342,141 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
340342
Ok(res)
341343
}
342344

343-
/// Takes a contents of a string literal (without quotes) and produces a
344-
/// sequence of escaped characters or errors.
345-
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
346-
where
347-
F: FnMut(Range<usize>, Result<T, EscapeError>),
345+
/// Iterator that removes string continuations and interprets other backslash-escapes
346+
struct Unescape<
347+
'chars,
348+
T: From<char> + From<u8>,
349+
F: FnMut(&mut Chars<'_>) -> Result<T, EscapeError>,
350+
> {
351+
state: State,
352+
chars: Chars<'chars>,
353+
pos: usize,
354+
scan_escape: F,
355+
}
356+
357+
/// States for `Unescape` iterator state machine
358+
enum State {
359+
Start,
360+
UnskippedWhitespace(usize),
361+
}
362+
363+
impl<T: From<char> + From<u8>, F: FnMut(&mut Chars<'_>) -> Result<T, EscapeError>> Iterator
364+
for Unescape<'_, T, F>
348365
{
349-
let mut chars = src.chars();
350-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
366+
type Item = (Result<T, EscapeError>, Range<usize>);
351367

352-
// The `start` and `end` computation here is complicated because
353-
// `skip_ascii_whitespace` makes us to skip over chars without counting
354-
// them in the range computation.
355-
while let Some(c) = chars.next() {
356-
let start = src.len() - chars.as_str().len() - c.len_utf8();
357-
let res = match c {
358-
'\\' => {
359-
match chars.clone().next() {
360-
Some('\n') => {
361-
// Rust language specification requires us to skip whitespaces
362-
// if unescaped '\' character is followed by '\n'.
363-
// For details see [Rust language reference]
368+
fn next(&mut self) -> Option<Self::Item> {
369+
match self.state {
370+
State::Start => self.start(),
371+
State::UnskippedWhitespace(end) => self.unskipped_whitespace(end),
372+
}
373+
}
374+
}
375+
376+
impl<'s, T: From<char> + From<u8>, F: FnMut(&mut Chars<'_>) -> Result<T, EscapeError>>
377+
Unescape<'s, T, F>
378+
{
379+
pub(crate) fn new(s: &'s str, scan_escape: F) -> Self {
380+
Self { state: State::Start, chars: s.chars(), pos: 0, scan_escape }
381+
}
382+
383+
fn start(&mut self) -> Option<<Self as Iterator>::Item> {
384+
if let Some(c) = self.chars.next() {
385+
match c {
386+
'\\' => {
387+
// peek
388+
if Some('\n') == self.chars.clone().next() {
389+
assert_eq!(Some('\n'), self.chars.next());
390+
// skip whitespace for backslash newline, see [Rust language reference]
364391
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
365-
skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
366-
callback(range, Err(err))
367-
});
368-
continue;
392+
self.skip_whitespace()
393+
} else {
394+
let mut chars = self.chars.clone();
395+
let res = (self.scan_escape)(&mut chars);
396+
// Bytes consumed by `scan_escape`: compare the untouched iterator with the advanced clone.
let used = self.chars.as_str().len() - chars.as_str().len();
// Commit the clone so the escape body is not yielded again as ordinary characters.
self.chars = chars;
397+
let range = self.pos..self.pos + used + 1;
398+
self.pos += used + 1;
399+
Some((res, range))
369400
}
370-
_ => scan_escape::<T>(&mut chars, mode),
401+
}
402+
c => {
403+
let res = match c {
404+
'"' => Err(EscapeError::EscapeOnlyChar),
405+
'\r' => Err(EscapeError::BareCarriageReturn),
406+
c => Ok(c),
407+
};
408+
let end = self.pos + c.len_utf8();
409+
let range = self.pos..end;
410+
self.pos = end;
411+
Some((res.map(T::from), range))
371412
}
372413
}
373-
'"' => Err(EscapeError::EscapeOnlyChar),
374-
'\r' => Err(EscapeError::BareCarriageReturn),
375-
_ => ascii_check(c, allow_unicode_chars).map(T::from),
376-
};
377-
let end = src.len() - chars.as_str().len();
378-
callback(start..end, res);
414+
} else {
415+
None
416+
}
379417
}
380-
}
381418

382-
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
383-
where
384-
F: FnMut(Range<usize>, EscapeError),
385-
{
386-
let tail = chars.as_str();
387-
let first_non_space = tail
388-
.bytes()
389-
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
390-
.unwrap_or(tail.len());
391-
if tail[1..first_non_space].contains('\n') {
392-
// The +1 accounts for the escaping slash.
393-
let end = start + first_non_space + 1;
394-
callback(start..end, EscapeError::MultipleSkippedLinesWarning);
419+
/// Skip ASCII whitespace, except for the formfeed character
420+
/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
421+
/// Warns when more than one newline is skipped, and when the skipped run is
/// followed by a whitespace character that is not skipped.
422+
fn skip_whitespace(&mut self) -> Option<<Self as Iterator>::Item> {
423+
// the escaping slash and newline characters add 2 bytes
424+
let mut end = self.pos + 2;
425+
let mut contains_nl = false;
426+
// manual next_if loop
427+
while let Some(c) = self.chars.clone().next() {
428+
if c.is_ascii_whitespace() && c != '\x0c' {
429+
let _ = self.chars.next();
430+
end += 1;
431+
contains_nl = contains_nl || c == '\n';
432+
} else {
433+
break;
434+
}
435+
}
436+
if contains_nl {
437+
self.state = State::UnskippedWhitespace(end);
438+
Some((Err(EscapeError::MultipleSkippedLinesWarning), self.pos..end))
439+
} else {
440+
self.unskipped_whitespace(end)
441+
}
395442
}
396-
let tail = &tail[first_non_space..];
397-
if let Some(c) = tail.chars().next() {
398-
if c.is_whitespace() {
399-
// For error reporting, we would like the span to contain the character that was not
400-
// skipped. The +1 is necessary to account for the leading \ that started the escape.
401-
let end = start + first_non_space + c.len_utf8() + 1;
402-
callback(start..end, EscapeError::UnskippedWhitespaceWarning);
443+
444+
/// Helper for `skip_whitespace`
445+
fn unskipped_whitespace(&mut self, end: usize) -> Option<<Self as Iterator>::Item> {
446+
self.state = State::Start;
447+
// peek
448+
if let Some(c) = self.chars.clone().next() {
449+
let range = self.pos..end + c.len_utf8();
450+
self.pos = end;
451+
if c.is_whitespace() {
452+
// for error reporting, include the character that was not skipped in the span
453+
Some((Err(EscapeError::UnskippedWhitespaceWarning), range))
454+
} else {
455+
self.start()
456+
}
457+
} else {
458+
None
403459
}
404460
}
405-
*chars = tail.chars();
406461
}
407462

408463
/// Takes the contents of a string literal (without quotes) and produces a
409464
/// sequence of characters or errors.
410465
/// NOTE: Raw strings do not perform any explicit character escaping, here we
411466
/// only produce errors on bare CR.
412-
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
413-
where
414-
F: FnMut(Range<usize>, Result<char, EscapeError>),
415-
{
416-
let mut chars = src.chars();
467+
fn check_raw_common(
468+
src: &str,
469+
mode: Mode,
470+
) -> impl Iterator<Item = (Result<char, EscapeError>, Range<usize>)> + '_ {
417471
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
418472

419-
// The `start` and `end` computation here matches the one in
420-
// `unescape_non_raw_common` for consistency, even though this function
421-
// doesn't have to worry about skipping any chars.
422-
while let Some(c) = chars.next() {
423-
let start = src.len() - chars.as_str().len() - c.len_utf8();
473+
src.char_indices().map(move |(pos, c)| {
424474
let res = match c {
425475
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
426476
_ => ascii_check(c, allow_unicode_chars),
427477
};
428-
let end = src.len() - chars.as_str().len();
429-
callback(start..end, res);
430-
}
478+
(res, pos..pos + c.len_utf8())
479+
})
431480
}
432481

433482
#[inline]

0 commit comments

Comments
 (0)