Skip to content

Commit a21e7df

Browse files
committed
Make lexer::unescape use iterators inside instead of callbacks.
1 parent 8239a37 commit a21e7df

File tree

1 file changed

+131
-81
lines changed

1 file changed

+131
-81
lines changed

compiler/rustc_lexer/src/unescape.rs

+131-81
Original file line number | Diff line number | Diff line change
@@ -95,13 +95,13 @@ where
9595
let res = unescape_char_or_byte(&mut chars, mode);
9696
callback(0..(src.len() - chars.as_str().len()), res);
9797
}
98-
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
99-
RawStr | RawByteStr => check_raw_common(src, mode, callback),
100-
RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
101-
if let Ok('\0') = result {
102-
result = Err(EscapeError::NulInCStr);
98+
Str | ByteStr => Unescape::new(src, mode).for_each(|(res, r)| callback(r, res)),
99+
RawStr | RawByteStr => check_raw_common(src, mode).for_each(|(res, r)| callback(r, res)),
100+
RawCStr => check_raw_common(src, mode).for_each(|(mut res, r)| {
101+
if let Ok('\0') = res {
102+
res = Err(EscapeError::NulInCStr);
103103
}
104-
callback(r, result)
104+
callback(r, res);
105105
}),
106106
CStr => unreachable!(),
107107
}
@@ -147,12 +147,13 @@ where
147147
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
148148
{
149149
match mode {
150-
CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
151-
if let Ok(MixedUnit::Char('\0')) = result {
152-
result = Err(EscapeError::NulInCStr);
150+
CStr => Unescape::new(src, mode).for_each(|(mut res, r)| {
151+
if let Ok(MixedUnit::Char('\0')) = res {
152+
res = Err(EscapeError::NulInCStr);
153153
}
154-
callback(r, result)
154+
callback(r, res);
155155
}),
156+
156157
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
157158
}
158159
}
@@ -301,7 +302,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
301302
}
302303

303304
break std::char::from_u32(value).ok_or({
304-
if value > 0x10FFFF {
305+
if value > char::MAX as u32 {
305306
EscapeError::OutOfRangeUnicodeEscape
306307
} else {
307308
EscapeError::LoneSurrogateUnicodeEscape
@@ -340,94 +341,143 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
340341
Ok(res)
341342
}
342343

343-
/// Takes a contents of a string literal (without quotes) and produces a
344-
/// sequence of escaped characters or errors.
345-
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
346-
where
347-
F: FnMut(Range<usize>, Result<T, EscapeError>),
348-
{
349-
let mut chars = src.chars();
350-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
344+
/// Iterator that removes string continuations and interprets other backslash-escapes
345+
struct Unescape<'s, T: From<char> + From<u8>> {
346+
state: State,
347+
chars: Chars<'s>,
348+
pos: usize,
349+
mode: Mode,
350+
phantom: std::marker::PhantomData<T>,
351+
}
351352

352-
// The `start` and `end` computation here is complicated because
353-
// `skip_ascii_whitespace` makes us to skip over chars without counting
354-
// them in the range computation.
355-
while let Some(c) = chars.next() {
356-
let start = src.len() - chars.as_str().len() - c.len_utf8();
357-
let res = match c {
358-
'\\' => {
359-
match chars.clone().next() {
360-
Some('\n') => {
361-
// Rust language specification requires us to skip whitespaces
362-
// if unescaped '\' character is followed by '\n'.
363-
// For details see [Rust language reference]
353+
/// States for `Unescape` iterator state machine
354+
enum State {
355+
Start,
356+
UnskippedWhitespace(usize),
357+
}
358+
359+
impl<T: From<char> + From<u8>> Iterator for Unescape<'_, T> {
360+
type Item = (Result<T, EscapeError>, Range<usize>);
361+
362+
fn next(&mut self) -> Option<Self::Item> {
363+
match self.state {
364+
State::Start => self.start(),
365+
State::UnskippedWhitespace(end) => self.unskipped_whitespace(end),
366+
}
367+
}
368+
}
369+
370+
impl<'s, T: From<char> + From<u8>> Unescape<'s, T> {
371+
pub(crate) fn new(s: &'s str, mode: Mode) -> Self {
372+
Self {
373+
state: State::Start,
374+
chars: s.chars(),
375+
pos: 0,
376+
mode,
377+
phantom: std::marker::PhantomData,
378+
}
379+
}
380+
381+
fn start(&mut self) -> Option<<Self as Iterator>::Item> {
382+
if let Some(c) = self.chars.next() {
383+
match c {
384+
'\\' => {
385+
// peek
386+
if Some('\n') == self.chars.clone().next() {
387+
assert_eq!(Some('\n'), self.chars.next());
388+
// skip whitespace for backslash newline, see [Rust language reference]
364389
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
365-
skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
366-
callback(range, Err(err))
367-
});
368-
continue;
390+
self.skip_whitespace()
391+
} else {
392+
let mut chars_clone = self.chars.clone();
393+
let res = scan_escape(&mut chars_clone, self.mode);
394+
let bytes_diff = self.chars.as_str().len() - chars_clone.as_str().len();
395+
let end = self.pos + 1 + bytes_diff;
396+
self.chars = chars_clone;
397+
let range = self.pos..end;
398+
self.pos = end;
399+
Some((res, range))
369400
}
370-
_ => scan_escape::<T>(&mut chars, mode),
401+
}
402+
c => {
403+
let res = match c {
404+
'"' => Err(EscapeError::EscapeOnlyChar),
405+
'\r' => Err(EscapeError::BareCarriageReturn),
406+
c => ascii_check(c, self.mode.allow_unicode_chars()).map(T::from),
407+
};
408+
let end = self.pos + c.len_utf8();
409+
let range = self.pos..end;
410+
self.pos = end;
411+
Some((res, range))
371412
}
372413
}
373-
'"' => Err(EscapeError::EscapeOnlyChar),
374-
'\r' => Err(EscapeError::BareCarriageReturn),
375-
_ => ascii_check(c, allow_unicode_chars).map(T::from),
376-
};
377-
let end = src.len() - chars.as_str().len();
378-
callback(start..end, res);
414+
} else {
415+
None
416+
}
379417
}
380-
}
381418

382-
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
383-
where
384-
F: FnMut(Range<usize>, EscapeError),
385-
{
386-
let tail = chars.as_str();
387-
let first_non_space = tail
388-
.bytes()
389-
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
390-
.unwrap_or(tail.len());
391-
if tail[1..first_non_space].contains('\n') {
392-
// The +1 accounts for the escaping slash.
393-
let end = start + first_non_space + 1;
394-
callback(start..end, EscapeError::MultipleSkippedLinesWarning);
419+
/// Skip ASCII whitespace, except for the formfeed character
420+
/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
421+
/// Warns on unescaped newline and following non-ASCII whitespace.
422+
fn skip_whitespace(&mut self) -> Option<<Self as Iterator>::Item> {
423+
// the escaping slash and newline characters add 2 bytes
424+
let mut end = self.pos + 2;
425+
let mut contains_nl = false;
426+
// manual next_if loop
427+
loop {
428+
let mut chars_clone = self.chars.clone();
429+
match chars_clone.next() {
430+
Some(c) if c.is_ascii_whitespace() && c != '\x0c' => {
431+
self.chars = chars_clone;
432+
end += 1;
433+
contains_nl = contains_nl || c == '\n';
434+
}
435+
_ => break,
436+
}
437+
}
438+
if contains_nl {
439+
self.state = State::UnskippedWhitespace(end);
440+
Some((Err(EscapeError::MultipleSkippedLinesWarning), self.pos..end))
441+
} else {
442+
self.unskipped_whitespace(end)
443+
}
395444
}
396-
let tail = &tail[first_non_space..];
397-
if let Some(c) = tail.chars().next() {
398-
if c.is_whitespace() {
399-
// For error reporting, we would like the span to contain the character that was not
400-
// skipped. The +1 is necessary to account for the leading \ that started the escape.
401-
let end = start + first_non_space + c.len_utf8() + 1;
402-
callback(start..end, EscapeError::UnskippedWhitespaceWarning);
445+
446+
/// Helper for `skip_whitespace`
447+
fn unskipped_whitespace(&mut self, end: usize) -> Option<<Self as Iterator>::Item> {
448+
self.state = State::Start;
449+
// peek
450+
if let Some(c) = self.chars.clone().next() {
451+
let range = self.pos..end + c.len_utf8();
452+
self.pos = end;
453+
if c.is_whitespace() {
454+
// for error reporting, include the character that was not skipped in the span
455+
Some((Err(EscapeError::UnskippedWhitespaceWarning), range))
456+
} else {
457+
self.start()
458+
}
459+
} else {
460+
None
403461
}
404462
}
405-
*chars = tail.chars();
406463
}
407464

408-
/// Takes a contents of a string literal (without quotes) and produces a
409-
/// sequence of characters or errors.
410-
/// NOTE: Raw strings do not perform any explicit character escaping, here we
411-
/// only produce errors on bare CR.
412-
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
413-
where
414-
F: FnMut(Range<usize>, Result<char, EscapeError>),
415-
{
416-
let mut chars = src.chars();
465+
/// Takes the contents of a raw string literal (without quotes) and produces an
466+
/// iterator of characters or errors.
467+
/// NOTE: Raw strings don't do any unescaping, but do produce errors on bare CR.
468+
fn check_raw_common(
469+
src: &str,
470+
mode: Mode,
471+
) -> impl Iterator<Item = (Result<char, EscapeError>, Range<usize>)> + '_ {
417472
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
418473

419-
// The `start` and `end` computation here matches the one in
420-
// `unescape_non_raw_common` for consistency, even though this function
421-
// doesn't have to worry about skipping any chars.
422-
while let Some(c) = chars.next() {
423-
let start = src.len() - chars.as_str().len() - c.len_utf8();
474+
src.char_indices().map(move |(pos, c)| {
424475
let res = match c {
425476
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
426477
_ => ascii_check(c, allow_unicode_chars),
427478
};
428-
let end = src.len() - chars.as_str().len();
429-
callback(start..end, res);
430-
}
479+
(res, pos..pos + c.len_utf8())
480+
})
431481
}
432482

433483
#[inline]

0 commit comments

Comments
 (0)