optimize str::iter::Chars::advance_by

the8472 · the8472 · commit 40cf1f925762 · 2023-11-27T22:06:35.000+01:00
this avoids part of the char decoding work by not looking at utf8 continuation bytes
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs
@@ -1170,6 +1170,17 @@ fn test_iterator() {
     assert_eq!(s.chars().count(), v.len());
 }
 
+#[test]
+fn test_iterator_advance() {
+    let s = "「赤錆」と呼ばれる鉄錆は、水の存在下での鉄の自然酸化によって生じる、オキシ水酸化鉄(III) 等の（含水）酸化物粒子の疎な凝集膜であるとみなせる。";
+    let chars: Vec<char> = s.chars().collect();
+    let mut it = s.chars();
+    it.advance_by(1).unwrap();
+    assert_eq!(it.next(), Some(chars[1]));
+    it.advance_by(33).unwrap();
+    assert_eq!(it.next(), Some(chars[35]));
+}
+
 #[test]
 fn test_rev_iterator() {
     let s = "ศไทย中华Việt Nam";
diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs
@@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce};
 use crate::ops::Try;
 use crate::option;
 use crate::slice::{self, Split as SliceSplit};
+use core::num::NonZeroUsize;
 
 use super::from_utf8_unchecked;
 use super::pattern::Pattern;
@@ -49,6 +50,55 @@ impl<'a> Iterator for Chars<'a> {
         super::count::count_chars(self.as_str())
     }
 
+    #[inline]
+    fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> {
+        const CHUNK_SIZE: usize = 32;
+
+        if remainder >= CHUNK_SIZE {
+            let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>();
+            let mut bytes_skipped: usize = 0;
+
+            while remainder > CHUNK_SIZE
+                && let Some(chunk) = chunks.next()
+            {
+                bytes_skipped += CHUNK_SIZE;
+
+                let mut start_bytes = [false; CHUNK_SIZE];
+
+                for i in 0..CHUNK_SIZE {
+                    start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]);
+                }
+
+                remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize;
+            }
+
+            // SAFETY: The amount of bytes exists since we just iterated over them,
+            // so advance_by will succeed.
+            unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() };
+
+            // skip trailing continuation bytes
+            while self.iter.len() > 0 {
+                let b = self.iter.as_slice()[0];
+                if !super::validations::utf8_is_cont_byte(b) {
+                    break;
+                }
+                // SAFETY: We just peeked at the byte, therefore it exists
+                unsafe { self.iter.advance_by(1).unwrap_unchecked() };
+            }
+        }
+
+        while (remainder > 0) && (self.iter.len() > 0) {
+            remainder -= 1;
+            let b = self.iter.as_slice()[0];
+            let slurp = super::validations::utf8_char_width(b);
+            // SAFETY: utf8 validity requires that the string must contain
+            // the continuation bytes (if any)
+            unsafe { self.iter.advance_by(slurp).unwrap_unchecked() };
+        }
+
+        NonZeroUsize::new(remainder).map_or(Ok(()), Err)
+    }
+
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
         let len = self.iter.len();