Auto merge of rust-lang#131656 - richard-uk1:move_empty_exponent_to_rustc_session, r=<try>

bors · bors · commit c0ae1973f0fe · 2025-02-28T21:28:48.000Z
Move some invalid exponent detection into `rustc_session`. This PR moves part of the exponent checks from `rustc_lexer`/`rustc_parser` into `rustc_session`. This change does not affect which programs are accepted by the compiler, or the diagnostics that are reported, with one main exception. That exception is that floats or ints with suffixes beginning with `e` are rejected *after* the token stream is passed to proc macros, rather than being rejected by the parser as was previously the case. This gives proc macro authors more consistent access to numeric literals: currently a proc macro could interpret `1m` or `30s` but not `7eggs` or `3em`. After this change all are handled the same. The lexer will still reject input that contains `e` followed by a digit, `+`/`-`, or `_` when these are not followed by a valid integer literal (digits and `_`), but this doesn't affect macro authors who just want to access alphabetic suffixes. This PR is a continuation of rust-lang#79912. It also solves exactly the same problem as [rust-lang#111628](rust-lang#111628). Exponents that begin with an arbitrarily long run of underscores are handled without read-ahead by tracking the exponent start in case the exponent turns out to be invalid, so the suffix start is correct. This is very much an edge case (the user would have to write something like `1e_______________23`), but it is nevertheless handled correctly. Also adds tests for various edge cases and improves diagnostics marginally. r: `@petrochenkov`, since they reviewed rust-lang#79912.
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
@@ -194,7 +194,7 @@ pub enum DocStyle {
 pub enum LiteralKind {
     /// `12_u8`, `0o100`, `0b120i99`, `1f32`.
     Int { base: Base, empty_int: bool },
-    /// `12.34f32`, `1e3`, but not `1f32`.
+    /// `12.34f32`, `1e3` and `1e+`, but not `1f32` or `1em`.
     Float { base: Base, empty_exponent: bool },
     /// `'a'`, `'\\'`, `'''`, `';`
     Char { terminated: bool },
@@ -409,8 +409,8 @@ impl Cursor<'_> {
 
             // Numeric literal.
             c @ '0'..='9' => {
-                let literal_kind = self.number(c);
-                let suffix_start = self.pos_within_token();
+                let (literal_kind, suffix_start) = self.number(c);
+                let suffix_start = suffix_start.unwrap_or(self.pos_within_token());
                 self.eat_literal_suffix();
                 TokenKind::Literal { kind: literal_kind, suffix_start }
             }
@@ -606,7 +606,9 @@ impl Cursor<'_> {
         }
     }
 
-    fn number(&mut self, first_digit: char) -> LiteralKind {
+    /// Parses a number and in `.1` returns the offset of the literal suffix if
+    /// different from the current position on return.
+    fn number(&mut self, first_digit: char) -> (LiteralKind, Option<u32>) {
         debug_assert!('0' <= self.prev() && self.prev() <= '9');
         let mut base = Base::Decimal;
         if first_digit == '0' {
@@ -616,21 +618,21 @@ impl Cursor<'_> {
                     base = Base::Binary;
                     self.bump();
                     if !self.eat_decimal_digits() {
-                        return Int { base, empty_int: true };
+                        return (Int { base, empty_int: true }, None);
                     }
                 }
                 'o' => {
                     base = Base::Octal;
                     self.bump();
                     if !self.eat_decimal_digits() {
-                        return Int { base, empty_int: true };
+                        return (Int { base, empty_int: true }, None);
                     }
                 }
                 'x' => {
                     base = Base::Hexadecimal;
                     self.bump();
                     if !self.eat_hexadecimal_digits() {
-                        return Int { base, empty_int: true };
+                        return (Int { base, empty_int: true }, None);
                     }
                 }
                 // Not a base prefix; consume additional digits.
@@ -642,40 +644,79 @@ impl Cursor<'_> {
                 '.' | 'e' | 'E' => {}
 
                 // Just a 0.
-                _ => return Int { base, empty_int: false },
+                _ => return (Int { base, empty_int: false }, None),
             }
         } else {
             // No base prefix, parse number in the usual way.
             self.eat_decimal_digits();
         };
 
-        match self.first() {
+        match (self.first(), self.second()) {
             // Don't be greedy if this is actually an
             // integer literal followed by field/method access or a range pattern
             // (`0..2` and `12.foo()`)
-            '.' if self.second() != '.' && !is_id_start(self.second()) => {
-                // might have stuff after the ., and if it does, it needs to start
-                // with a number
+            ('.', second) if second != '.' && !is_id_start(second) => {
                 self.bump();
+                self.eat_decimal_digits();
+
                 let mut empty_exponent = false;
-                if self.first().is_ascii_digit() {
-                    self.eat_decimal_digits();
-                    match self.first() {
-                        'e' | 'E' => {
-                            self.bump();
-                            empty_exponent = !self.eat_float_exponent();
-                        }
-                        _ => (),
+                let suffix_start = match (self.first(), self.second()) {
+                    ('e' | 'E', '_') => self.eat_underscore_exponent(),
+                    ('e' | 'E', '0'..='9' | '+' | '-') => {
+                        // Definitely an exponent (which still can be empty).
+                        self.bump();
+                        empty_exponent = !self.eat_float_exponent();
+                        None
                     }
+                    _ => None,
+                };
+                (Float { base, empty_exponent }, suffix_start)
+            }
+            ('e' | 'E', '_') => {
+                match self.eat_underscore_exponent() {
+                    Some(suffix_start) => {
+                        // The suffix begins at `e`, meaning the number is an integer.
+                        (Int { base, empty_int: false }, Some(suffix_start))
+                    }
+                    None => (Float { base, empty_exponent: false }, None),
                 }
-                Float { base, empty_exponent }
             }
-            'e' | 'E' => {
+            ('e' | 'E', '0'..='9' | '+' | '-') => {
+                // Definitely an exponent (which still can be empty).
                 self.bump();
                 let empty_exponent = !self.eat_float_exponent();
-                Float { base, empty_exponent }
+                (Float { base, empty_exponent }, None)
             }
-            _ => Int { base, empty_int: false },
+            _ => (Int { base, empty_int: false }, None),
+        }
+    }
+
+    /// Try to find and eat an exponent
+    ///
+    /// Assumes the first character is `e`/`E` and second is `_`, and consumes
+    /// `e`/`E` followed by all consecutive `_`s.
+    ///
+    /// Returns `Some` if no exponent was found. In this case, the suffix is partially
+    /// consumed, and began at the return value.
+    fn eat_underscore_exponent(&mut self) -> Option<u32> {
+        debug_assert!(matches!(self.first(), 'e' | 'E'));
+        debug_assert!(matches!(self.second(), '_'));
+        let suffix_start = self.pos_within_token();
+
+        // check if series of `_` is ended by a digit. If yes
+        // include it in the number as exponent. If no include
+        // it in suffix.
+        self.bump();
+        while matches!(self.first(), '_') {
+            self.bump();
+        }
+        // If we find a digit, then the exponential was valid
+        // so the suffix will start at the cursor as usual.
+        if self.first().is_ascii_digit() {
+            self.eat_decimal_digits();
+            None
+        } else {
+            Some(suffix_start)
         }
     }
 
@@ -924,6 +965,7 @@ impl Cursor<'_> {
         }
     }
 
+    /// Returns `true` if a digit was consumed (rather than just '_'s).
     fn eat_decimal_digits(&mut self) -> bool {
         let mut has_digits = false;
         loop {
@@ -961,20 +1003,20 @@ impl Cursor<'_> {
     /// Eats the float exponent. Returns true if at least one digit was met,
     /// and returns false otherwise.
     fn eat_float_exponent(&mut self) -> bool {
-        debug_assert!(self.prev() == 'e' || self.prev() == 'E');
+        debug_assert!(matches!(self.prev(), 'e' | 'E'));
         if self.first() == '-' || self.first() == '+' {
             self.bump();
         }
         self.eat_decimal_digits()
     }
 
-    // Eats the suffix of the literal, e.g. "u8".
+    /// Eats the suffix of the literal, e.g. "u8".
     fn eat_literal_suffix(&mut self) {
-        self.eat_identifier();
+        self.eat_identifier()
     }
 
-    // Eats the identifier. Note: succeeds on `_`, which isn't a valid
-    // identifier.
+    /// Eats the identifier. Note: succeeds on `_`, which isn't a valid
+    /// identifier.
     fn eat_identifier(&mut self) {
         if !is_id_start(self.first()) {
             return;
diff --git a/compiler/rustc_session/messages.ftl b/compiler/rustc_session/messages.ftl
@@ -14,6 +14,8 @@ session_embed_source_insufficient_dwarf_version = `-Zembed-source=y` requires at
 
 session_embed_source_requires_debug_info = `-Zembed-source=y` requires debug information to be enabled
 
+session_empty_float_exponent = expected at least one digit in exponent
+
 session_expr_parentheses_needed = parentheses are required to parse this as an expression
 
 session_failed_to_create_profiler = failed to create profiler: {$err}
diff --git a/compiler/rustc_session/src/errors.rs b/compiler/rustc_session/src/errors.rs
@@ -377,6 +377,10 @@ pub fn report_lit_error(
         s.len() > 1 && s.starts_with(first_chars) && s[1..].chars().all(|c| c.is_ascii_digit())
     }
 
+    fn looks_like_empty_exponent(s: &str) -> bool {
+        s.len() == 1 && matches!(s.chars().next(), Some('e' | 'E'))
+    }
+
     // Try to lowercase the prefix if the prefix and suffix are valid.
     fn fix_base_capitalisation(prefix: &str, suffix: &str) -> Option<String> {
         let mut chars = suffix.chars();
@@ -409,6 +413,8 @@ pub fn report_lit_error(
             if looks_like_width_suffix(&['i', 'u'], suf) {
                 // If it looks like a width, try to be helpful.
                 dcx.emit_err(InvalidIntLiteralWidth { span, width: suf[1..].into() })
+            } else if looks_like_empty_exponent(suf) {
+                dcx.emit_err(EmptyFloatExponent { span })
             } else if let Some(fixed) = fix_base_capitalisation(lit.symbol.as_str(), suf) {
                 dcx.emit_err(InvalidNumLiteralBasePrefix { span, fixed })
             } else {
@@ -420,6 +426,8 @@ pub fn report_lit_error(
             if looks_like_width_suffix(&['f'], suf) {
                 // If it looks like a width, try to be helpful.
                 dcx.emit_err(InvalidFloatLiteralWidth { span, width: suf[1..].to_string() })
+            } else if looks_like_empty_exponent(suf) {
+                dcx.emit_err(EmptyFloatExponent { span })
             } else {
                 dcx.emit_err(InvalidFloatLiteralSuffix { span, suffix: suf.to_string() })
             }
@@ -489,3 +497,10 @@ pub(crate) struct SoftFloatIgnored;
 #[note]
 #[note(session_soft_float_deprecated_issue)]
 pub(crate) struct SoftFloatDeprecated;
+
+#[derive(Diagnostic)]
+#[diag(session_empty_float_exponent)]
+pub(crate) struct EmptyFloatExponent {
+    #[primary_span]
+    pub span: Span,
+}
diff --git a/tests/ui/consts/const-eval/issue-104390.stderr b/tests/ui/consts/const-eval/issue-104390.stderr
@@ -1,39 +1,3 @@
-error: expected at least one digit in exponent
-  --> $DIR/issue-104390.rs:1:27
-   |
-LL | fn f1() -> impl Sized { & 2E }
-   |                           ^^
-
-error: expected at least one digit in exponent
-  --> $DIR/issue-104390.rs:2:28
-   |
-LL | fn f2() -> impl Sized { && 2E }
-   |                            ^^
-
-error: expected at least one digit in exponent
-  --> $DIR/issue-104390.rs:3:29
-   |
-LL | fn f3() -> impl Sized { &'a 2E }
-   |                             ^^
-
-error: expected at least one digit in exponent
-  --> $DIR/issue-104390.rs:5:34
-   |
-LL | fn f4() -> impl Sized { &'static 2E }
-   |                                  ^^
-
-error: expected at least one digit in exponent
-  --> $DIR/issue-104390.rs:7:28
-   |
-LL | fn f5() -> impl Sized { *& 2E }
-   |                            ^^
-
-error: expected at least one digit in exponent
-  --> $DIR/issue-104390.rs:8:29
-   |
-LL | fn f6() -> impl Sized { &'_ 2E }
-   |                             ^^
-
 error: borrow expressions cannot be annotated with lifetimes
   --> $DIR/issue-104390.rs:3:25
    |
@@ -76,5 +40,41 @@ LL - fn f6() -> impl Sized { &'_ 2E }
 LL + fn f6() -> impl Sized { &2E }
    |
 
+error: expected at least one digit in exponent
+  --> $DIR/issue-104390.rs:1:27
+   |
+LL | fn f1() -> impl Sized { & 2E }
+   |                           ^^
+
+error: expected at least one digit in exponent
+  --> $DIR/issue-104390.rs:2:28
+   |
+LL | fn f2() -> impl Sized { && 2E }
+   |                            ^^
+
+error: expected at least one digit in exponent
+  --> $DIR/issue-104390.rs:3:29
+   |
+LL | fn f3() -> impl Sized { &'a 2E }
+   |                             ^^
+
+error: expected at least one digit in exponent
+  --> $DIR/issue-104390.rs:5:34
+   |
+LL | fn f4() -> impl Sized { &'static 2E }
+   |                                  ^^
+
+error: expected at least one digit in exponent
+  --> $DIR/issue-104390.rs:7:28
+   |
+LL | fn f5() -> impl Sized { *& 2E }
+   |                            ^^
+
+error: expected at least one digit in exponent
+  --> $DIR/issue-104390.rs:8:29
+   |
+LL | fn f6() -> impl Sized { &'_ 2E }
+   |                             ^^
+
 error: aborting due to 9 previous errors
 
diff --git a/tests/ui/consts/issue-91434.stderr b/tests/ui/consts/issue-91434.stderr
@@ -1,15 +1,15 @@
-error: expected at least one digit in exponent
-  --> $DIR/issue-91434.rs:2:11
-   |
-LL |     [9; [[9E; h]]];
-   |           ^^
-
 error[E0425]: cannot find value `h` in this scope
   --> $DIR/issue-91434.rs:2:15
    |
 LL |     [9; [[9E; h]]];
    |               ^ not found in this scope
 
+error: expected at least one digit in exponent
+  --> $DIR/issue-91434.rs:2:11
+   |
+LL |     [9; [[9E; h]]];
+   |           ^^
+
 error: aborting due to 2 previous errors
 
 For more information about this error, try `rustc --explain E0425`.
diff --git a/tests/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr b/tests/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr
@@ -1,9 +1,3 @@
-error: expected at least one digit in exponent
-  --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:47
-   |
-LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
-   |                                               ^^^^^^
-
 error: unknown start of token: \u{2212}
   --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
    |
@@ -16,5 +10,11 @@ LL - const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹
 LL + const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
    |
 
+error: expected at least one digit in exponent
+  --> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:47
+   |
+LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
+   |                                               ^^^^^^
+
 error: aborting due to 2 previous errors
 
diff --git a/tests/ui/lexer/custom-suffixes-exponent-like.rs b/tests/ui/lexer/custom-suffixes-exponent-like.rs
@@ -0,0 +1,16 @@
+const _A: f64 = 1em;
+    //~^ ERROR invalid suffix `em` for number literal
+const _B: f64 = 1e0m;
+    //~^ ERROR invalid suffix `m` for float literal
+const _C: f64 = 1e_______________0m;
+    //~^ ERROR invalid suffix `m` for float literal
+const _D: f64 = 1e_______________m;
+    //~^ ERROR invalid suffix `e_______________m` for number literal
+
+// None of the above patterns should generate an error when used in a macro
+macro_rules! do_nothing {
+    ($($toks:tt)*) => {};
+}
+do_nothing!(1em 1e0m 1e_______________0m 1e_______________m);
+
+fn main() {}
diff --git a/tests/ui/lexer/custom-suffixes-exponent-like.stderr b/tests/ui/lexer/custom-suffixes-exponent-like.stderr
diff --git a/tests/ui/lexer/custom-suffixes.rs b/tests/ui/lexer/custom-suffixes.rs