Skip to content

Commit 1b43527

Browse files
committed
Split impl Debug for str into ASCII/Unicode chunks
Instead of having a single loop that walks the string one `char` at a time, this splits the implementation into separate ASCII and non-ASCII loops, and uses more optimized byte-wise code for the ASCII-only case.
1 parent 51ecc64 commit 1b43527

File tree

1 file changed

+58
-16
lines changed

1 file changed

+58
-16
lines changed

library/core/src/fmt/mod.rs

+58-16
Original file line numberDiff line numberDiff line change
@@ -2399,26 +2399,68 @@ impl Display for bool {
23992399
impl Debug for str {
24002400
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
24012401
f.write_char('"')?;
2402-
let mut from = 0;
2403-
for (i, c) in self.char_indices() {
2404-
// a fast path for ASCII chars that do not need escapes:
2405-
if matches!(c, ' '..='~') && !matches!(c, '\\' | '\"') {
2406-
continue;
2407-
}
24082402

2409-
let esc = c.escape_debug_ext(EscapeDebugExtArgs {
2410-
escape_grapheme_extended: true,
2411-
escape_single_quote: false,
2412-
escape_double_quote: true,
2413-
});
2414-
// If char needs escaping, flush backlog so far and write, else skip
2415-
if esc.len() != 1 {
2416-
f.write_str(&self[from..i])?;
2403+
// byte range of `self` that is known to be printable but has not been written out yet
2404+
let mut printable_range = 0..0;
2405+
2406+
// the outer loop here splits the string into alternating ASCII-only and non-ASCII chunks,
2407+
// which are then processed separately, to enable a fast path for the ASCII-only chunk.
2408+
let mut rest = self.as_bytes();
2409+
while rest.len() > 0 {
2410+
let mut ascii_bytes: &[u8];
2411+
let unicode_bytes: &[u8];
2412+
2413+
// first, handle an ascii-only prefix
2414+
let non_ascii_position = rest.iter().position(|&b| b >= 0x80).unwrap_or(rest.len());
2415+
// SAFETY: the position comes from `position()` (or is `rest.len()`), so it is within bounds; every byte before it is ASCII, so it is at a char boundary
2416+
(ascii_bytes, rest) = unsafe { rest.split_at_unchecked(non_ascii_position) };
2417+
2418+
fn needs_escape(b: u8) -> bool {
2419+
b > 0x7E || b < 0x20 || b == b'\\' || b == b'"'
2420+
}
2421+
while let Some(escape_position) = ascii_bytes.iter().position(|&b| needs_escape(b)) {
2422+
printable_range.end += escape_position;
2423+
f.write_str(&self[printable_range.clone()])?;
2424+
2425+
let c = ascii_bytes[escape_position] as char;
2426+
let esc = c.escape_debug_ext(EscapeDebugExtArgs {
2427+
escape_grapheme_extended: true,
2428+
escape_single_quote: false,
2429+
escape_double_quote: true,
2430+
});
24172431
Display::fmt(&esc, f)?;
2418-
from = i + c.len_utf8();
2432+
2433+
ascii_bytes = &ascii_bytes[escape_position + 1..];
2434+
printable_range = (printable_range.end + 1)..(printable_range.end + 1);
2435+
}
2436+
printable_range.end += ascii_bytes.len();
2437+
2438+
// then, handle a non-ASCII prefix
2439+
let ascii_position = rest.iter().position(|&b| b < 0x80).unwrap_or(rest.len());
2440+
// SAFETY: the position comes from `position()` (or is `rest.len()`), so it is within bounds; it points at an ASCII byte or the end of the string, so it is at a char boundary
2441+
(unicode_bytes, rest) = unsafe { rest.split_at_unchecked(ascii_position) };
2442+
// SAFETY: the prefix starts and ends at char boundaries of a valid `str`, so it is a valid UTF-8 sequence
2443+
let unicode_prefix = unsafe { crate::str::from_utf8_unchecked(unicode_bytes) };
2444+
2445+
for c in unicode_prefix.chars() {
2446+
// SAFETY: the slice contains only non-ASCII bytes, so every char decoded from it is >= 0x80
2447+
unsafe { crate::hint::assert_unchecked(c as u32 >= 0x80) };
2448+
let esc = c.escape_debug_ext(EscapeDebugExtArgs {
2449+
escape_grapheme_extended: true,
2450+
escape_single_quote: false,
2451+
escape_double_quote: true,
2452+
});
2453+
if esc.len() != 1 {
2454+
f.write_str(&self[printable_range.clone()])?;
2455+
Display::fmt(&esc, f)?;
2456+
printable_range.start = printable_range.end + c.len_utf8();
2457+
}
2458+
printable_range.end += c.len_utf8();
24192459
}
24202460
}
2421-
f.write_str(&self[from..])?;
2461+
2462+
f.write_str(&self[printable_range])?;
2463+
24222464
f.write_char('"')
24232465
}
24242466
}

0 commit comments

Comments
 (0)