@@ -95,13 +95,14 @@ where
95
95
let res = unescape_char_or_byte ( & mut chars, mode) ;
96
96
callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
97
97
}
98
- Str | ByteStr => unescape_non_raw_common ( src, mode, callback) ,
99
- RawStr | RawByteStr => check_raw_common ( src, mode, callback) ,
100
- RawCStr => check_raw_common ( src, mode, & mut |r, mut result| {
101
- if let Ok ( '\0' ) = result {
102
- result = Err ( EscapeError :: NulInCStr ) ;
98
+ Str | ByteStr => Unescape :: new ( src, |chars| scan_escape ( chars, mode) )
99
+ . for_each ( |( res, r) | callback ( r, res) ) ,
100
+ RawStr | RawByteStr => check_raw_common ( src, mode) . for_each ( |( res, r) | callback ( r, res) ) ,
101
+ RawCStr => check_raw_common ( src, mode) . for_each ( |( mut res, r) | {
102
+ if let Ok ( '\0' ) = res {
103
+ res = Err ( EscapeError :: NulInCStr ) ;
103
104
}
104
- callback ( r, result )
105
+ callback ( r, res ) ;
105
106
} ) ,
106
107
CStr => unreachable ! ( ) ,
107
108
}
@@ -147,12 +148,13 @@ where
147
148
F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
148
149
{
149
150
match mode {
150
- CStr => unescape_non_raw_common ( src, mode , & mut |r , mut result | {
151
- if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
152
- result = Err ( EscapeError :: NulInCStr ) ;
151
+ CStr => Unescape :: new ( src, |chars| scan_escape ( chars , mode ) ) . for_each ( | ( mut res , r ) | {
152
+ if let Ok ( MixedUnit :: Char ( '\0' ) ) = res {
153
+ res = Err ( EscapeError :: NulInCStr ) ;
153
154
}
154
- callback ( r, result )
155
+ callback ( r, res ) ;
155
156
} ) ,
157
+
156
158
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable ! ( ) ,
157
159
}
158
160
}
@@ -301,7 +303,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
301
303
}
302
304
303
305
break std:: char:: from_u32 ( value) . ok_or ( {
304
- if value > 0x10FFFF {
306
+ if value > char :: MAX as u32 {
305
307
EscapeError :: OutOfRangeUnicodeEscape
306
308
} else {
307
309
EscapeError :: LoneSurrogateUnicodeEscape
@@ -340,94 +342,137 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
340
342
Ok ( res)
341
343
}
342
344
343
- /// Takes a contents of a string literal (without quotes) and produces a
344
- /// sequence of escaped characters or errors.
345
- fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
346
- where
347
- F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
345
+ /// Iterator that removes string continuations and interprets other backslash-escapes
346
+ struct Unescape < ' s , T : From < char > + From < u8 > , F : FnMut ( & mut Chars < ' _ > ) -> Result < T , EscapeError > > {
347
+ state : State ,
348
+ chars : Chars < ' s > ,
349
+ pos : usize ,
350
+ scan_escape : F ,
351
+ }
352
+
353
+ /// States for `Unescape` iterator state machine
354
+ enum State {
355
+ Start ,
356
+ UnskippedWhitespace ( usize ) ,
357
+ }
358
+
359
+ impl < T : From < char > + From < u8 > , F : FnMut ( & mut Chars < ' _ > ) -> Result < T , EscapeError > > Iterator
360
+ for Unescape < ' _ , T , F >
348
361
{
349
- let mut chars = src. chars ( ) ;
350
- let allow_unicode_chars = mode. allow_unicode_chars ( ) ; // get this outside the loop
362
+ type Item = ( Result < T , EscapeError > , Range < usize > ) ;
351
363
352
- // The `start` and `end` computation here is complicated because
353
- // `skip_ascii_whitespace` makes us to skip over chars without counting
354
- // them in the range computation.
355
- while let Some ( c) = chars. next ( ) {
356
- let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
357
- let res = match c {
358
- '\\' => {
359
- match chars. clone ( ) . next ( ) {
360
- Some ( '\n' ) => {
361
- // Rust language specification requires us to skip whitespaces
362
- // if unescaped '\' character is followed by '\n'.
363
- // For details see [Rust language reference]
364
+ fn next ( & mut self ) -> Option < Self :: Item > {
365
+ match self . state {
366
+ State :: Start => self . start ( ) ,
367
+ State :: UnskippedWhitespace ( end) => self . unskipped_whitespace ( end) ,
368
+ }
369
+ }
370
+ }
371
+
372
+ impl < ' s , T : From < char > + From < u8 > , F : FnMut ( & mut Chars < ' _ > ) -> Result < T , EscapeError > >
373
+ Unescape < ' s , T , F >
374
+ {
375
+ pub ( crate ) fn new ( s : & ' s str , scan_escape : F ) -> Self {
376
+ Self { state : State :: Start , chars : s. chars ( ) , pos : 0 , scan_escape }
377
+ }
378
+
379
+ fn start ( & mut self ) -> Option < <Self as Iterator >:: Item > {
380
+ if let Some ( c) = self . chars . next ( ) {
381
+ match c {
382
+ '\\' => {
383
+ // peek
384
+ if Some ( '\n' ) == self . chars . clone ( ) . next ( ) {
385
+ assert_eq ! ( Some ( '\n' ) , self . chars. next( ) ) ;
386
+ // skip whitespace for backslash newline, see [Rust language reference]
364
387
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
365
- skip_ascii_whitespace ( & mut chars, start, & mut |range, err| {
366
- callback ( range, Err ( err) )
367
- } ) ;
368
- continue ;
388
+ self . skip_whitespace ( )
389
+ } else {
390
+ let mut chars_for_escape = self . chars . clone ( ) ;
391
+ let res = ( self . scan_escape ) ( & mut chars) ;
392
+ let used = self . chars . as_str ( ) . len ( ) - chars_for_escape. as_str ( ) . len ( ) ;
393
+ let range = self . pos ..self . pos + used + 1 ;
394
+ self . pos += used + 1 ;
395
+ Some ( ( res, range) )
369
396
}
370
- _ => scan_escape :: < T > ( & mut chars, mode) ,
397
+ }
398
+ c => {
399
+ let res = match c {
400
+ '"' => Err ( EscapeError :: EscapeOnlyChar ) ,
401
+ '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
402
+ c => Ok ( c) ,
403
+ } ;
404
+ let end = self . pos + c. len_utf8 ( ) ;
405
+ let range = self . pos ..end;
406
+ self . pos = end;
407
+ Some ( ( res. map ( T :: from) , range) )
371
408
}
372
409
}
373
- '"' => Err ( EscapeError :: EscapeOnlyChar ) ,
374
- '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
375
- _ => ascii_check ( c, allow_unicode_chars) . map ( T :: from) ,
376
- } ;
377
- let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
378
- callback ( start..end, res) ;
410
+ } else {
411
+ None
412
+ }
379
413
}
380
- }
381
414
382
- fn skip_ascii_whitespace < F > ( chars : & mut Chars < ' _ > , start : usize , callback : & mut F )
383
- where
384
- F : FnMut ( Range < usize > , EscapeError ) ,
385
- {
386
- let tail = chars. as_str ( ) ;
387
- let first_non_space = tail
388
- . bytes ( )
389
- . position ( |b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r' )
390
- . unwrap_or ( tail. len ( ) ) ;
391
- if tail[ 1 ..first_non_space] . contains ( '\n' ) {
392
- // The +1 accounts for the escaping slash.
393
- let end = start + first_non_space + 1 ;
394
- callback ( start..end, EscapeError :: MultipleSkippedLinesWarning ) ;
415
+ /// Skip ASCII whitespace, except for the formfeed character
416
+ /// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
417
+ /// Warns on unescaped newline and following non-ASCII whitespace.
418
+ fn skip_whitespace ( & mut self ) -> Option < <Self as Iterator >:: Item > {
419
+ // the escaping slash and newline characters add 2 bytes
420
+ let mut end = self . pos + 2 ;
421
+ let mut contains_nl = false ;
422
+ // manual next_if loop
423
+ while let Some ( c) = self . chars . clone ( ) . next ( ) {
424
+ if c. is_ascii_whitespace ( ) && c != '\x0c' {
425
+ let _ = self . chars . next ( ) ;
426
+ end += 1 ;
427
+ contains_nl = contains_nl || c == '\n' ;
428
+ } else {
429
+ break ;
430
+ }
431
+ }
432
+ if contains_nl {
433
+ self . state = State :: UnskippedWhitespace ( end) ;
434
+ Some ( ( Err ( EscapeError :: MultipleSkippedLinesWarning ) , self . pos ..end) )
435
+ } else {
436
+ self . unskipped_whitespace ( end)
437
+ }
395
438
}
396
- let tail = & tail[ first_non_space..] ;
397
- if let Some ( c) = tail. chars ( ) . next ( ) {
398
- if c. is_whitespace ( ) {
399
- // For error reporting, we would like the span to contain the character that was not
400
- // skipped. The +1 is necessary to account for the leading \ that started the escape.
401
- let end = start + first_non_space + c. len_utf8 ( ) + 1 ;
402
- callback ( start..end, EscapeError :: UnskippedWhitespaceWarning ) ;
439
+
440
+ /// Helper for `skip_whitespace`
441
+ fn unskipped_whitespace ( & mut self , end : usize ) -> Option < <Self as Iterator >:: Item > {
442
+ self . state = State :: Start ;
443
+ // peek
444
+ if let Some ( c) = self . chars . clone ( ) . next ( ) {
445
+ let range = self . pos ..end + c. len_utf8 ( ) ;
446
+ self . pos = end;
447
+ if c. is_whitespace ( ) {
448
+ // for error reporting, include the character that was not skipped in the span
449
+ Some ( ( Err ( EscapeError :: UnskippedWhitespaceWarning ) , range) )
450
+ } else {
451
+ self . start ( )
452
+ }
453
+ } else {
454
+ None
403
455
}
404
456
}
405
- * chars = tail. chars ( ) ;
406
457
}
407
458
408
459
/// Takes a contents of a string literal (without quotes) and produces a
409
460
/// sequence of characters or errors.
410
461
/// NOTE: Raw strings do not perform any explicit character escaping, here we
411
462
/// only produce errors on bare CR.
412
- fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F )
413
- where
414
- F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
415
- {
416
- let mut chars = src. chars ( ) ;
463
+ fn check_raw_common (
464
+ src : & str ,
465
+ mode : Mode ,
466
+ ) -> impl Iterator < Item = ( Result < char , EscapeError > , Range < usize > ) > + ' _ {
417
467
let allow_unicode_chars = mode. allow_unicode_chars ( ) ; // get this outside the loop
418
468
419
- // The `start` and `end` computation here matches the one in
420
- // `unescape_non_raw_common` for consistency, even though this function
421
- // doesn't have to worry about skipping any chars.
422
- while let Some ( c) = chars. next ( ) {
423
- let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
469
+ src. char_indices ( ) . map ( move |( pos, c) | {
424
470
let res = match c {
425
471
'\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
426
472
_ => ascii_check ( c, allow_unicode_chars) ,
427
473
} ;
428
- let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
429
- callback ( start..end, res) ;
430
- }
474
+ ( res, pos..pos + c. len_utf8 ( ) )
475
+ } )
431
476
}
432
477
433
478
#[ inline]
0 commit comments