@@ -18,7 +18,6 @@ use log::debug;
18
18
19
19
use rustc_data_structures:: fx:: FxHashSet ;
20
20
use std:: borrow:: Cow ;
21
- use std:: iter;
22
21
use std:: path:: { Path , PathBuf } ;
23
22
use std:: str;
24
23
@@ -34,6 +33,11 @@ pub mod diagnostics;
34
33
35
34
pub mod classify;
36
35
36
+ pub ( crate ) mod unescape;
37
+ use unescape:: { unescape_str, unescape_char, unescape_byte_str, unescape_byte} ;
38
+
39
+ pub ( crate ) mod unescape_error_reporting;
40
+
37
41
/// Info about a parsing session.
38
42
pub struct ParseSess {
39
43
pub span_diagnostic : Handler ,
@@ -307,133 +311,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
307
311
Parser :: new ( sess, stream, None , true , false )
308
312
}
309
313
310
- /// Parses a string representing a character literal into its final form.
311
- /// Rather than just accepting/rejecting a given literal, unescapes it as
312
- /// well. Can take any slice prefixed by a character escape. Returns the
313
- /// character and the number of characters consumed.
314
- fn char_lit ( lit : & str , diag : Option < ( Span , & Handler ) > ) -> ( char , isize ) {
315
- use std:: char;
316
-
317
- // Handle non-escaped chars first.
318
- if lit. as_bytes ( ) [ 0 ] != b'\\' {
319
- // If the first byte isn't '\\' it might part of a multi-byte char, so
320
- // get the char with chars().
321
- let c = lit. chars ( ) . next ( ) . unwrap ( ) ;
322
- return ( c, 1 ) ;
323
- }
324
-
325
- // Handle escaped chars.
326
- match lit. as_bytes ( ) [ 1 ] as char {
327
- '"' => ( '"' , 2 ) ,
328
- 'n' => ( '\n' , 2 ) ,
329
- 'r' => ( '\r' , 2 ) ,
330
- 't' => ( '\t' , 2 ) ,
331
- '\\' => ( '\\' , 2 ) ,
332
- '\'' => ( '\'' , 2 ) ,
333
- '0' => ( '\0' , 2 ) ,
334
- 'x' => {
335
- let v = u32:: from_str_radix ( & lit[ 2 ..4 ] , 16 ) . unwrap ( ) ;
336
- let c = char:: from_u32 ( v) . unwrap ( ) ;
337
- ( c, 4 )
338
- }
339
- 'u' => {
340
- assert_eq ! ( lit. as_bytes( ) [ 2 ] , b'{' ) ;
341
- let idx = lit. find ( '}' ) . unwrap ( ) ;
342
-
343
- // All digits and '_' are ascii, so treat each byte as a char.
344
- let mut v: u32 = 0 ;
345
- for c in lit[ 3 ..idx] . bytes ( ) {
346
- let c = char:: from ( c) ;
347
- if c != '_' {
348
- let x = c. to_digit ( 16 ) . unwrap ( ) ;
349
- v = v. checked_mul ( 16 ) . unwrap ( ) . checked_add ( x) . unwrap ( ) ;
350
- }
351
- }
352
- let c = char:: from_u32 ( v) . unwrap_or_else ( || {
353
- if let Some ( ( span, diag) ) = diag {
354
- let mut diag = diag. struct_span_err ( span, "invalid unicode character escape" ) ;
355
- if v > 0x10FFFF {
356
- diag. help ( "unicode escape must be at most 10FFFF" ) . emit ( ) ;
357
- } else {
358
- diag. help ( "unicode escape must not be a surrogate" ) . emit ( ) ;
359
- }
360
- }
361
- '\u{FFFD}'
362
- } ) ;
363
- ( c, ( idx + 1 ) as isize )
364
- }
365
- _ => panic ! ( "lexer should have rejected a bad character escape {}" , lit)
366
- }
367
- }
368
-
369
- /// Parses a string representing a string literal into its final form. Does unescaping.
370
- fn str_lit ( lit : & str , diag : Option < ( Span , & Handler ) > ) -> String {
371
- debug ! ( "str_lit: given {}" , lit. escape_default( ) ) ;
372
- let mut res = String :: with_capacity ( lit. len ( ) ) ;
373
-
374
- let error = |i| format ! ( "lexer should have rejected {} at {}" , lit, i) ;
375
-
376
- /// Eat everything up to a non-whitespace.
377
- fn eat < ' a > ( it : & mut iter:: Peekable < str:: CharIndices < ' a > > ) {
378
- loop {
379
- match it. peek ( ) . map ( |x| x. 1 ) {
380
- Some ( ' ' ) | Some ( '\n' ) | Some ( '\r' ) | Some ( '\t' ) => {
381
- it. next ( ) ;
382
- } ,
383
- _ => { break ; }
384
- }
385
- }
386
- }
387
-
388
- let mut chars = lit. char_indices ( ) . peekable ( ) ;
389
- while let Some ( ( i, c) ) = chars. next ( ) {
390
- match c {
391
- '\\' => {
392
- let ch = chars. peek ( ) . unwrap_or_else ( || {
393
- panic ! ( "{}" , error( i) )
394
- } ) . 1 ;
395
-
396
- if ch == '\n' {
397
- eat ( & mut chars) ;
398
- } else if ch == '\r' {
399
- chars. next ( ) ;
400
- let ch = chars. peek ( ) . unwrap_or_else ( || {
401
- panic ! ( "{}" , error( i) )
402
- } ) . 1 ;
403
-
404
- if ch != '\n' {
405
- panic ! ( "lexer accepted bare CR" ) ;
406
- }
407
- eat ( & mut chars) ;
408
- } else {
409
- // otherwise, a normal escape
410
- let ( c, n) = char_lit ( & lit[ i..] , diag) ;
411
- for _ in 0 ..n - 1 { // we don't need to move past the first \
412
- chars. next ( ) ;
413
- }
414
- res. push ( c) ;
415
- }
416
- } ,
417
- '\r' => {
418
- let ch = chars. peek ( ) . unwrap_or_else ( || {
419
- panic ! ( "{}" , error( i) )
420
- } ) . 1 ;
421
-
422
- if ch != '\n' {
423
- panic ! ( "lexer accepted bare CR" ) ;
424
- }
425
- chars. next ( ) ;
426
- res. push ( '\n' ) ;
427
- }
428
- c => res. push ( c) ,
429
- }
430
- }
431
-
432
- res. shrink_to_fit ( ) ; // probably not going to do anything, unless there was an escape.
433
- debug ! ( "parse_str_lit: returning {}" , res) ;
434
- res
435
- }
436
-
437
314
/// Parses a string representing a raw string literal into its final form. The
438
315
/// only operation this does is convert embedded CRLF into a single LF.
439
316
fn raw_str_lit ( lit : & str ) -> String {
@@ -476,9 +353,21 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
476
353
use ast:: LitKind ;
477
354
478
355
match lit {
479
- token:: Byte ( i) => ( true , Some ( LitKind :: Byte ( byte_lit ( & i. as_str ( ) ) . 0 ) ) ) ,
480
- token:: Char ( i) => ( true , Some ( LitKind :: Char ( char_lit ( & i. as_str ( ) , diag) . 0 ) ) ) ,
481
- token:: Err ( i) => ( true , Some ( LitKind :: Err ( i) ) ) ,
356
+ token:: Byte ( i) => {
357
+ let lit_kind = match unescape_byte ( & i. as_str ( ) ) {
358
+ Ok ( c) => LitKind :: Byte ( c) ,
359
+ Err ( _) => LitKind :: Err ( i) ,
360
+ } ;
361
+ ( true , Some ( lit_kind) )
362
+ } ,
363
+ token:: Char ( i) => {
364
+ let lit_kind = match unescape_char ( & i. as_str ( ) ) {
365
+ Ok ( c) => LitKind :: Char ( c) ,
366
+ Err ( _) => LitKind :: Err ( i) ,
367
+ } ;
368
+ ( true , Some ( lit_kind) )
369
+ } ,
370
+ token:: Err ( i) => ( true , Some ( LitKind :: Err ( i) ) ) ,
482
371
483
372
// There are some valid suffixes for integer and float literals,
484
373
// so all the handling is done internally.
@@ -490,10 +379,22 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
490
379
// reuse the symbol from the Token. Otherwise, we must generate a
491
380
// new symbol because the string in the LitKind is different to the
492
381
// string in the Token.
382
+ let mut has_error = false ;
493
383
let s = & sym. as_str ( ) ;
494
384
if s. as_bytes ( ) . iter ( ) . any ( |& c| c == b'\\' || c == b'\r' ) {
495
- sym = Symbol :: intern ( & str_lit ( s, diag) ) ;
385
+ let mut buf = String :: with_capacity ( s. len ( ) ) ;
386
+ unescape_str ( s, & mut |_, unescaped_char| {
387
+ match unescaped_char {
388
+ Ok ( c) => buf. push ( c) ,
389
+ Err ( _) => has_error = true ,
390
+ }
391
+ } ) ;
392
+ if has_error {
393
+ return ( true , Some ( LitKind :: Err ( sym) ) ) ;
394
+ }
395
+ sym = Symbol :: intern ( & buf)
496
396
}
397
+
497
398
( true , Some ( LitKind :: Str ( sym, ast:: StrStyle :: Cooked ) ) )
498
399
}
499
400
token:: StrRaw ( mut sym, n) => {
@@ -505,7 +406,20 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
505
406
( true , Some ( LitKind :: Str ( sym, ast:: StrStyle :: Raw ( n) ) ) )
506
407
}
507
408
token:: ByteStr ( i) => {
508
- ( true , Some ( LitKind :: ByteStr ( byte_str_lit ( & i. as_str ( ) ) ) ) )
409
+ let s = & i. as_str ( ) ;
410
+ let mut buf = Vec :: with_capacity ( s. len ( ) ) ;
411
+ let mut has_error = false ;
412
+ unescape_byte_str ( s, & mut |_, unescaped_byte| {
413
+ match unescaped_byte {
414
+ Ok ( c) => buf. push ( c) ,
415
+ Err ( _) => has_error = true ,
416
+ }
417
+ } ) ;
418
+ if has_error {
419
+ return ( true , Some ( LitKind :: Err ( i) ) ) ;
420
+ }
421
+ buf. shrink_to_fit ( ) ;
422
+ ( true , Some ( LitKind :: ByteStr ( Lrc :: new ( buf) ) ) )
509
423
}
510
424
token:: ByteStrRaw ( i, _) => {
511
425
( true , Some ( LitKind :: ByteStr ( Lrc :: new ( i. to_string ( ) . into_bytes ( ) ) ) ) )
@@ -560,95 +474,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
560
474
filtered_float_lit ( Symbol :: intern ( s) , suffix, diag)
561
475
}
562
476
563
- /// Parses a string representing a byte literal into its final form. Similar to `char_lit`.
564
- fn byte_lit ( lit : & str ) -> ( u8 , usize ) {
565
- let err = |i| format ! ( "lexer accepted invalid byte literal {} step {}" , lit, i) ;
566
-
567
- if lit. len ( ) == 1 {
568
- ( lit. as_bytes ( ) [ 0 ] , 1 )
569
- } else {
570
- assert_eq ! ( lit. as_bytes( ) [ 0 ] , b'\\' , "{}" , err( 0 ) ) ;
571
- let b = match lit. as_bytes ( ) [ 1 ] {
572
- b'"' => b'"' ,
573
- b'n' => b'\n' ,
574
- b'r' => b'\r' ,
575
- b't' => b'\t' ,
576
- b'\\' => b'\\' ,
577
- b'\'' => b'\'' ,
578
- b'0' => b'\0' ,
579
- _ => {
580
- match u64:: from_str_radix ( & lit[ 2 ..4 ] , 16 ) . ok ( ) {
581
- Some ( c) =>
582
- if c > 0xFF {
583
- panic ! ( err( 2 ) )
584
- } else {
585
- return ( c as u8 , 4 )
586
- } ,
587
- None => panic ! ( err( 3 ) )
588
- }
589
- }
590
- } ;
591
- ( b, 2 )
592
- }
593
- }
594
-
595
- fn byte_str_lit ( lit : & str ) -> Lrc < Vec < u8 > > {
596
- let mut res = Vec :: with_capacity ( lit. len ( ) ) ;
597
-
598
- let error = |i| panic ! ( "lexer should have rejected {} at {}" , lit, i) ;
599
-
600
- /// Eat everything up to a non-whitespace.
601
- fn eat < I : Iterator < Item =( usize , u8 ) > > ( it : & mut iter:: Peekable < I > ) {
602
- loop {
603
- match it. peek ( ) . map ( |x| x. 1 ) {
604
- Some ( b' ' ) | Some ( b'\n' ) | Some ( b'\r' ) | Some ( b'\t' ) => {
605
- it. next ( ) ;
606
- } ,
607
- _ => { break ; }
608
- }
609
- }
610
- }
611
-
612
- // byte string literals *must* be ASCII, but the escapes don't have to be
613
- let mut chars = lit. bytes ( ) . enumerate ( ) . peekable ( ) ;
614
- loop {
615
- match chars. next ( ) {
616
- Some ( ( i, b'\\' ) ) => {
617
- match chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 {
618
- b'\n' => eat ( & mut chars) ,
619
- b'\r' => {
620
- chars. next ( ) ;
621
- if chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 != b'\n' {
622
- panic ! ( "lexer accepted bare CR" ) ;
623
- }
624
- eat ( & mut chars) ;
625
- }
626
- _ => {
627
- // otherwise, a normal escape
628
- let ( c, n) = byte_lit ( & lit[ i..] ) ;
629
- // we don't need to move past the first \
630
- for _ in 0 ..n - 1 {
631
- chars. next ( ) ;
632
- }
633
- res. push ( c) ;
634
- }
635
- }
636
- } ,
637
- Some ( ( i, b'\r' ) ) => {
638
- if chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 != b'\n' {
639
- panic ! ( "lexer accepted bare CR" ) ;
640
- }
641
- chars. next ( ) ;
642
- res. push ( b'\n' ) ;
643
- }
644
- Some ( ( _, c) ) => res. push ( c) ,
645
- None => break ,
646
- }
647
- }
648
-
649
- Lrc :: new ( res)
650
- }
651
-
652
477
fn integer_lit ( s : & str , suffix : Option < Symbol > , diag : Option < ( Span , & Handler ) > )
653
478
-> Option < ast:: LitKind > {
654
479
// s can only be ascii, byte indexing is fine
0 commit comments