@@ -18,7 +18,6 @@ use log::debug;
1818
1919use rustc_data_structures:: fx:: FxHashSet ;
2020use std:: borrow:: Cow ;
21- use std:: iter;
2221use std:: path:: { Path , PathBuf } ;
2322use std:: str;
2423
@@ -33,6 +32,11 @@ pub mod attr;
3332
3433pub mod classify;
3534
35+ pub ( crate ) mod unescape;
36+ use unescape:: { unescape_str, unescape_char, unescape_byte_str, unescape_byte, EscapeError } ;
37+
38+ pub ( crate ) mod unescape_error_reporting;
39+
3640/// Info about a parsing session.
3741pub struct ParseSess {
3842 pub span_diagnostic : Handler ,
@@ -306,133 +310,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
306310 Parser :: new ( sess, stream, None , true , false )
307311}
308312
309- /// Parses a string representing a character literal into its final form.
310- /// Rather than just accepting/rejecting a given literal, unescapes it as
311- /// well. Can take any slice prefixed by a character escape. Returns the
312- /// character and the number of characters consumed.
313- fn char_lit ( lit : & str , diag : Option < ( Span , & Handler ) > ) -> ( char , isize ) {
314- use std:: char;
315-
316- // Handle non-escaped chars first.
317- if lit. as_bytes ( ) [ 0 ] != b'\\' {
318- // If the first byte isn't '\\' it might part of a multi-byte char, so
319- // get the char with chars().
320- let c = lit. chars ( ) . next ( ) . unwrap ( ) ;
321- return ( c, 1 ) ;
322- }
323-
324- // Handle escaped chars.
325- match lit. as_bytes ( ) [ 1 ] as char {
326- '"' => ( '"' , 2 ) ,
327- 'n' => ( '\n' , 2 ) ,
328- 'r' => ( '\r' , 2 ) ,
329- 't' => ( '\t' , 2 ) ,
330- '\\' => ( '\\' , 2 ) ,
331- '\'' => ( '\'' , 2 ) ,
332- '0' => ( '\0' , 2 ) ,
333- 'x' => {
334- let v = u32:: from_str_radix ( & lit[ 2 ..4 ] , 16 ) . unwrap ( ) ;
335- let c = char:: from_u32 ( v) . unwrap ( ) ;
336- ( c, 4 )
337- }
338- 'u' => {
339- assert_eq ! ( lit. as_bytes( ) [ 2 ] , b'{' ) ;
340- let idx = lit. find ( '}' ) . unwrap ( ) ;
341-
342- // All digits and '_' are ascii, so treat each byte as a char.
343- let mut v: u32 = 0 ;
344- for c in lit[ 3 ..idx] . bytes ( ) {
345- let c = char:: from ( c) ;
346- if c != '_' {
347- let x = c. to_digit ( 16 ) . unwrap ( ) ;
348- v = v. checked_mul ( 16 ) . unwrap ( ) . checked_add ( x) . unwrap ( ) ;
349- }
350- }
351- let c = char:: from_u32 ( v) . unwrap_or_else ( || {
352- if let Some ( ( span, diag) ) = diag {
353- let mut diag = diag. struct_span_err ( span, "invalid unicode character escape" ) ;
354- if v > 0x10FFFF {
355- diag. help ( "unicode escape must be at most 10FFFF" ) . emit ( ) ;
356- } else {
357- diag. help ( "unicode escape must not be a surrogate" ) . emit ( ) ;
358- }
359- }
360- '\u{FFFD}'
361- } ) ;
362- ( c, ( idx + 1 ) as isize )
363- }
364- _ => panic ! ( "lexer should have rejected a bad character escape {}" , lit)
365- }
366- }
367-
368- /// Parses a string representing a string literal into its final form. Does unescaping.
369- fn str_lit ( lit : & str , diag : Option < ( Span , & Handler ) > ) -> String {
370- debug ! ( "str_lit: given {}" , lit. escape_default( ) ) ;
371- let mut res = String :: with_capacity ( lit. len ( ) ) ;
372-
373- let error = |i| format ! ( "lexer should have rejected {} at {}" , lit, i) ;
374-
375- /// Eat everything up to a non-whitespace.
376- fn eat < ' a > ( it : & mut iter:: Peekable < str:: CharIndices < ' a > > ) {
377- loop {
378- match it. peek ( ) . map ( |x| x. 1 ) {
379- Some ( ' ' ) | Some ( '\n' ) | Some ( '\r' ) | Some ( '\t' ) => {
380- it. next ( ) ;
381- } ,
382- _ => { break ; }
383- }
384- }
385- }
386-
387- let mut chars = lit. char_indices ( ) . peekable ( ) ;
388- while let Some ( ( i, c) ) = chars. next ( ) {
389- match c {
390- '\\' => {
391- let ch = chars. peek ( ) . unwrap_or_else ( || {
392- panic ! ( "{}" , error( i) )
393- } ) . 1 ;
394-
395- if ch == '\n' {
396- eat ( & mut chars) ;
397- } else if ch == '\r' {
398- chars. next ( ) ;
399- let ch = chars. peek ( ) . unwrap_or_else ( || {
400- panic ! ( "{}" , error( i) )
401- } ) . 1 ;
402-
403- if ch != '\n' {
404- panic ! ( "lexer accepted bare CR" ) ;
405- }
406- eat ( & mut chars) ;
407- } else {
408- // otherwise, a normal escape
409- let ( c, n) = char_lit ( & lit[ i..] , diag) ;
410- for _ in 0 ..n - 1 { // we don't need to move past the first \
411- chars. next ( ) ;
412- }
413- res. push ( c) ;
414- }
415- } ,
416- '\r' => {
417- let ch = chars. peek ( ) . unwrap_or_else ( || {
418- panic ! ( "{}" , error( i) )
419- } ) . 1 ;
420-
421- if ch != '\n' {
422- panic ! ( "lexer accepted bare CR" ) ;
423- }
424- chars. next ( ) ;
425- res. push ( '\n' ) ;
426- }
427- c => res. push ( c) ,
428- }
429- }
430-
431- res. shrink_to_fit ( ) ; // probably not going to do anything, unless there was an escape.
432- debug ! ( "parse_str_lit: returning {}" , res) ;
433- res
434- }
435-
436313/// Parses a string representing a raw string literal into its final form. The
437314/// only operation this does is convert embedded CRLF into a single LF.
438315fn raw_str_lit ( lit : & str ) -> String {
@@ -475,9 +352,23 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
475352 use ast:: LitKind ;
476353
477354 match lit {
478- token:: Byte ( i) => ( true , Some ( LitKind :: Byte ( byte_lit ( & i. as_str ( ) ) . 0 ) ) ) ,
479- token:: Char ( i) => ( true , Some ( LitKind :: Char ( char_lit ( & i. as_str ( ) , diag) . 0 ) ) ) ,
480- token:: Err ( i) => ( true , Some ( LitKind :: Err ( i) ) ) ,
355+ token:: Byte ( i) => {
356+ let lit_kind = match unescape_byte ( & i. as_str ( ) ) {
357+ Ok ( c) => LitKind :: Byte ( c) ,
358+ Err ( ( _, EscapeError :: MoreThanOneChar ) ) => LitKind :: Err ( i) ,
359+ Err ( _) => LitKind :: Byte ( 0 ) ,
360+ } ;
361+ ( true , Some ( lit_kind) )
362+ } ,
363+ token:: Char ( i) => {
364+ let lit_kind = match unescape_char ( & i. as_str ( ) ) {
365+ Ok ( c) => LitKind :: Char ( c) ,
366+ Err ( ( _, EscapeError :: MoreThanOneChar ) ) => LitKind :: Err ( i) ,
367+ Err ( _) => LitKind :: Char ( '\u{FFFD}' ) ,
368+ } ;
369+ ( true , Some ( lit_kind) )
370+ } ,
371+ token:: Err ( i) => ( true , Some ( LitKind :: Err ( i) ) ) ,
481372
482373 // There are some valid suffixes for integer and float literals,
483374 // so all the handling is done internally.
@@ -491,7 +382,14 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
491382 // string in the Token.
492383 let s = & sym. as_str ( ) ;
493384 if s. as_bytes ( ) . iter ( ) . any ( |& c| c == b'\\' || c == b'\r' ) {
494- sym = Symbol :: intern ( & str_lit ( s, diag) ) ;
385+ let mut buf = String :: with_capacity ( s. len ( ) ) ;
386+ unescape_str ( s, & mut |_, unescaped_char| {
387+ match unescaped_char {
388+ Ok ( c) => buf. push ( c) ,
389+ Err ( _) => buf. push ( '\u{FFFD}' ) ,
390+ }
391+ } ) ;
392+ sym = Symbol :: intern ( & buf)
495393 }
496394 ( true , Some ( LitKind :: Str ( sym, ast:: StrStyle :: Cooked ) ) )
497395 }
@@ -504,7 +402,16 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
504402 ( true , Some ( LitKind :: Str ( sym, ast:: StrStyle :: Raw ( n) ) ) )
505403 }
506404 token:: ByteStr ( i) => {
507- ( true , Some ( LitKind :: ByteStr ( byte_str_lit ( & i. as_str ( ) ) ) ) )
405+ let s = & i. as_str ( ) ;
406+ let mut buf = Vec :: with_capacity ( s. len ( ) ) ;
407+ unescape_byte_str ( s, & mut |_, unescaped_byte| {
408+ match unescaped_byte {
409+ Ok ( c) => buf. push ( c) ,
410+ Err ( _) => buf. push ( 0 ) ,
411+ }
412+ } ) ;
413+ buf. shrink_to_fit ( ) ;
414+ ( true , Some ( LitKind :: ByteStr ( Lrc :: new ( buf) ) ) )
508415 }
509416 token:: ByteStrRaw ( i, _) => {
510417 ( true , Some ( LitKind :: ByteStr ( Lrc :: new ( i. to_string ( ) . into_bytes ( ) ) ) ) )
@@ -559,95 +466,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
559466 filtered_float_lit ( Symbol :: intern ( s) , suffix, diag)
560467}
561468
562- /// Parses a string representing a byte literal into its final form. Similar to `char_lit`.
563- fn byte_lit ( lit : & str ) -> ( u8 , usize ) {
564- let err = |i| format ! ( "lexer accepted invalid byte literal {} step {}" , lit, i) ;
565-
566- if lit. len ( ) == 1 {
567- ( lit. as_bytes ( ) [ 0 ] , 1 )
568- } else {
569- assert_eq ! ( lit. as_bytes( ) [ 0 ] , b'\\' , "{}" , err( 0 ) ) ;
570- let b = match lit. as_bytes ( ) [ 1 ] {
571- b'"' => b'"' ,
572- b'n' => b'\n' ,
573- b'r' => b'\r' ,
574- b't' => b'\t' ,
575- b'\\' => b'\\' ,
576- b'\'' => b'\'' ,
577- b'0' => b'\0' ,
578- _ => {
579- match u64:: from_str_radix ( & lit[ 2 ..4 ] , 16 ) . ok ( ) {
580- Some ( c) =>
581- if c > 0xFF {
582- panic ! ( err( 2 ) )
583- } else {
584- return ( c as u8 , 4 )
585- } ,
586- None => panic ! ( err( 3 ) )
587- }
588- }
589- } ;
590- ( b, 2 )
591- }
592- }
593-
594- fn byte_str_lit ( lit : & str ) -> Lrc < Vec < u8 > > {
595- let mut res = Vec :: with_capacity ( lit. len ( ) ) ;
596-
597- let error = |i| panic ! ( "lexer should have rejected {} at {}" , lit, i) ;
598-
599- /// Eat everything up to a non-whitespace.
600- fn eat < I : Iterator < Item =( usize , u8 ) > > ( it : & mut iter:: Peekable < I > ) {
601- loop {
602- match it. peek ( ) . map ( |x| x. 1 ) {
603- Some ( b' ' ) | Some ( b'\n' ) | Some ( b'\r' ) | Some ( b'\t' ) => {
604- it. next ( ) ;
605- } ,
606- _ => { break ; }
607- }
608- }
609- }
610-
611- // byte string literals *must* be ASCII, but the escapes don't have to be
612- let mut chars = lit. bytes ( ) . enumerate ( ) . peekable ( ) ;
613- loop {
614- match chars. next ( ) {
615- Some ( ( i, b'\\' ) ) => {
616- match chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 {
617- b'\n' => eat ( & mut chars) ,
618- b'\r' => {
619- chars. next ( ) ;
620- if chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 != b'\n' {
621- panic ! ( "lexer accepted bare CR" ) ;
622- }
623- eat ( & mut chars) ;
624- }
625- _ => {
626- // otherwise, a normal escape
627- let ( c, n) = byte_lit ( & lit[ i..] ) ;
628- // we don't need to move past the first \
629- for _ in 0 ..n - 1 {
630- chars. next ( ) ;
631- }
632- res. push ( c) ;
633- }
634- }
635- } ,
636- Some ( ( i, b'\r' ) ) => {
637- if chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 != b'\n' {
638- panic ! ( "lexer accepted bare CR" ) ;
639- }
640- chars. next ( ) ;
641- res. push ( b'\n' ) ;
642- }
643- Some ( ( _, c) ) => res. push ( c) ,
644- None => break ,
645- }
646- }
647-
648- Lrc :: new ( res)
649- }
650-
651469fn integer_lit ( s : & str , suffix : Option < Symbol > , diag : Option < ( Span , & Handler ) > )
652470 -> Option < ast:: LitKind > {
653471 // s can only be ascii, byte indexing is fine
0 commit comments