Split impl Debug for str into ASCII/Unicode chunks

Swatinem · Swatinem · commit 1b43527934ee · 2024-05-12T14:48:52.000+02:00
Instead of having a single loop that iterates over the string's `char`s,
this splits the implementation into separate loops for ASCII and non-ASCII runs,
and uses more optimized, byte-based code for the ASCII-only case.
diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs
@@ -2399,26 +2399,68 @@ impl Display for bool {
 impl Debug for str {
     fn fmt(&self, f: &mut Formatter<'_>) -> Result {
         f.write_char('"')?;
-        let mut from = 0;
-        for (i, c) in self.char_indices() {
-            // a fast path for ASCII chars that do not need escapes:
-            if matches!(c, ' '..='~') && !matches!(c, '\\' | '\"') {
-                continue;
-            }
 
-            let esc = c.escape_debug_ext(EscapeDebugExtArgs {
-                escape_grapheme_extended: true,
-                escape_single_quote: false,
-                escape_double_quote: true,
-            });
-            // If char needs escaping, flush backlog so far and write, else skip
-            if esc.len() != 1 {
-                f.write_str(&self[from..i])?;
+        // byte range of `self` that has been scanned, needs no escaping, and has not been written yet
+        let mut printable_range = 0..0;
+
+        // the outer loop here splits the string into alternating ASCII-only and non-ASCII chunks,
+        // which are then processed separately, to enable a fast path for the ASCII-only chunk.
+        let mut rest = self.as_bytes();
+        while rest.len() > 0 {
+            let mut ascii_bytes: &[u8];
+            let unicode_bytes: &[u8];
+
+            // first, handle an ASCII-only prefix
+            let non_ascii_position = rest.iter().position(|&b| b >= 0x80).unwrap_or(rest.len());
+            // SAFETY: `position` yields an index below `rest.len()` and the fallback is `rest.len()` itself, so the split is in bounds; it lands before the first non-ASCII byte (or at the end), which is a char boundary
+            (ascii_bytes, rest) = unsafe { rest.split_at_unchecked(non_ascii_position) };
+
+            fn needs_escape(b: u8) -> bool {
+                b > 0x7E || b < 0x20 || b == b'\\' || b == b'"'
+            }
+            while let Some(escape_position) = ascii_bytes.iter().position(|&b| needs_escape(b)) {
+                printable_range.end += escape_position; // extend over the printable bytes preceding the escape
+                f.write_str(&self[printable_range.clone()])?;
+
+                let c = ascii_bytes[escape_position] as char;
+                let esc = c.escape_debug_ext(EscapeDebugExtArgs {
+                    escape_grapheme_extended: true,
+                    escape_single_quote: false,
+                    escape_double_quote: true,
+                });
                 Display::fmt(&esc, f)?;
-                from = i + c.len_utf8();
+
+                ascii_bytes = &ascii_bytes[escape_position + 1..];
+                printable_range = (printable_range.end + 1)..(printable_range.end + 1); // restart as an empty range just past the escaped byte
+            }
+            printable_range.end += ascii_bytes.len(); // whatever ASCII remains contains nothing that needs escaping
+
+            // then, handle a non-ASCII prefix
+            let ascii_position = rest.iter().position(|&b| b < 0x80).unwrap_or(rest.len());
+            // SAFETY: same reasoning as above: the index is in bounds, and an ASCII byte (or the end of the string) is always at a char boundary
+            (unicode_bytes, rest) = unsafe { rest.split_at_unchecked(ascii_position) };
+            // SAFETY: `unicode_bytes` is a sub-slice of the valid UTF-8 in `self`, cut at char boundaries on both ends
+            let unicode_prefix = unsafe { crate::str::from_utf8_unchecked(unicode_bytes) };
+
+            for c in unicode_prefix.chars() {
+                // SAFETY: every byte of `unicode_prefix` is >= 0x80, so each decoded char is non-ASCII
+                unsafe { crate::hint::assert_unchecked(c as u32 >= 0x80) };
+                let esc = c.escape_debug_ext(EscapeDebugExtArgs {
+                    escape_grapheme_extended: true,
+                    escape_single_quote: false,
+                    escape_double_quote: true,
+                });
+                if esc.len() != 1 { // the char needs escaping: flush the printable backlog, then write the escape
+                    f.write_str(&self[printable_range.clone()])?;
+                    Display::fmt(&esc, f)?;
+                    printable_range.start = printable_range.end + c.len_utf8(); // skip past the escaped char
+                }
+                printable_range.end += c.len_utf8();
             }
         }
-        f.write_str(&self[from..])?;
+
+        f.write_str(&self[printable_range])?;
+
         f.write_char('"')
     }
 }