Skip to content

Commit 40cf1f9

Browse files
committed
optimize str::iter::Chars::advance_by
this avoids part of the char decoding work by not looking at utf8 continuation bytes
1 parent 3f55e86 commit 40cf1f9

File tree

2 files changed

+61
-0
lines changed

2 files changed

+61
-0
lines changed

library/alloc/tests/str.rs

+11
Original file line numberDiff line numberDiff line change
@@ -1170,6 +1170,17 @@ fn test_iterator() {
11701170
assert_eq!(s.chars().count(), v.len());
11711171
}
11721172

1173+
#[test]
fn test_iterator_advance() {
    // Predominantly multi-byte text, so skipping by character count has to
    // step over UTF-8 continuation bytes.
    let text = "「赤錆」と呼ばれる鉄錆は、水の存在下での鉄の自然酸化によって生じる、オキシ水酸化鉄(III) 等の(含水)酸化物粒子の疎な凝集膜であるとみなせる。";
    let expected: Vec<char> = text.chars().collect();
    let mut cursor = text.chars();

    // Skipping a single char leaves the iterator positioned on index 1.
    cursor.advance_by(1).unwrap();
    assert_eq!(cursor.next(), Some(expected[1]));

    // Two chars are consumed at this point; skipping 33 more (which meets the
    // 32-char threshold of the chunked path) must land on index 35.
    cursor.advance_by(33).unwrap();
    assert_eq!(cursor.next(), Some(expected[35]));
}
1183+
11731184
#[test]
11741185
fn test_rev_iterator() {
11751186
let s = "ศไทย中华Việt Nam";

library/core/src/str/iter.rs

+50
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce};
88
use crate::ops::Try;
99
use crate::option;
1010
use crate::slice::{self, Split as SliceSplit};
11+
use core::num::NonZeroUsize;
1112

1213
use super::from_utf8_unchecked;
1314
use super::pattern::Pattern;
@@ -49,6 +50,55 @@ impl<'a> Iterator for Chars<'a> {
4950
super::count::count_chars(self.as_str())
5051
}
5152

53+
#[inline]
54+
fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> {
55+
const CHUNK_SIZE: usize = 32;
56+
57+
if remainder >= CHUNK_SIZE {
58+
let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>();
59+
let mut bytes_skipped: usize = 0;
60+
61+
while remainder > CHUNK_SIZE
62+
&& let Some(chunk) = chunks.next()
63+
{
64+
bytes_skipped += CHUNK_SIZE;
65+
66+
let mut start_bytes = [false; CHUNK_SIZE];
67+
68+
for i in 0..CHUNK_SIZE {
69+
start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]);
70+
}
71+
72+
remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize;
73+
}
74+
75+
// SAFETY: The amount of bytes exists since we just iterated over them,
76+
// so advance_by will succeed.
77+
unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() };
78+
79+
// skip trailing continuation bytes
80+
while self.iter.len() > 0 {
81+
let b = self.iter.as_slice()[0];
82+
if !super::validations::utf8_is_cont_byte(b) {
83+
break;
84+
}
85+
// SAFETY: We just peeked at the byte, therefore it exists
86+
unsafe { self.iter.advance_by(1).unwrap_unchecked() };
87+
}
88+
}
89+
90+
while (remainder > 0) && (self.iter.len() > 0) {
91+
remainder -= 1;
92+
let b = self.iter.as_slice()[0];
93+
let slurp = super::validations::utf8_char_width(b);
94+
// SAFETY: utf8 validity requires that the string must contain
95+
// the continuation bytes (if any)
96+
unsafe { self.iter.advance_by(slurp).unwrap_unchecked() };
97+
}
98+
99+
NonZeroUsize::new(remainder).map_or(Ok(()), Err)
100+
}
101+
52102
#[inline]
53103
fn size_hint(&self) -> (usize, Option<usize>) {
54104
let len = self.iter.len();

0 commit comments

Comments
 (0)