Skip to content

Commit adddd38

Browse files
committed
Encode core::str::CharSearcher::utf8_size as enum
1 parent e927184 commit adddd38

File tree

1 file changed

+50
-13
lines changed

1 file changed

+50
-13
lines changed

library/core/src/str/pattern.rs

+50-13
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,40 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
348348
// Impl for char
349349
/////////////////////////////////////////////////////////////////////////////
350350

351+
/// Number of bytes a `char` occupies when encoded as UTF-8.
///
/// Each discriminant equals the byte count it names, so `size as usize`
/// yields the encoded length directly. Use [`Utf8Size::index`] to fetch the
/// last encoded byte; it performs the `- 1` adjustment internally.
#[derive(Clone, Copy, Debug)]
enum Utf8Size {
    One = 1,
    Two = 2,
    Three = 3,
    Four = 4,
}

impl Utf8Size {
    /// Converts a byte count into a `Utf8Size`.
    ///
    /// Returns `None` unless `size` is in `1..=4`.
    fn new(size: usize) -> Option<Self> {
        match size {
            1 => Some(Self::One),
            2 => Some(Self::Two),
            3 => Some(Self::Three),
            4 => Some(Self::Four),
            _ => None,
        }
    }

    /// Converts a byte count into a `Utf8Size` without checking the range.
    ///
    /// # Safety
    ///
    /// `size` must be more than `0` and less than `5`.
    unsafe fn new_unchecked(size: usize) -> Self {
        // SAFETY: the caller guarantees `size` is in `1..=4`, for which
        // `new` always returns `Some`.
        unsafe { Self::new(size).unwrap_unchecked() }
    }

    /// Returns a reference to the last byte of a UTF-8 encoding of this
    /// size stored at the start of `arr`, i.e. `arr[size - 1]`.
    fn index(self, arr: &[u8; 4]) -> &u8 {
        // SAFETY: discriminants are in `1..=4`, so `self as usize - 1` is in
        // `0..=3`, which is always in bounds for a 4-element array.
        unsafe { arr.get_unchecked(self as usize - 1) }
    }
}
384+
351385
/// Associated type for `<char as Pattern<'a>>::Searcher`.
352386
#[derive(Clone, Debug)]
353387
pub struct CharSearcher<'a> {
@@ -368,9 +402,8 @@ pub struct CharSearcher<'a> {
368402
/// The character being searched for
369403
needle: char,
370404

371-
// safety invariant: `utf8_size` must be less than 5
372405
/// The number of bytes `needle` takes up when encoded in utf8.
373-
utf8_size: usize,
406+
utf8_size: Utf8Size,
374407
/// A utf8 encoded copy of the `needle`
375408
utf8_encoded: [u8; 4],
376409
}
@@ -413,8 +446,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
413446
// get the haystack after the last character found
414447
let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
415448
// the last byte of the utf8 encoded needle
416-
// SAFETY: we have an invariant that `utf8_size < 5`
417-
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
449+
let last_byte = *self.utf8_size.index(&self.utf8_encoded);
418450
if let Some(index) = memchr::memchr(last_byte, bytes) {
419451
// The new finger is the index of the byte we found,
420452
// plus one, since we memchr'd for the last byte of the character.
@@ -434,10 +466,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
434466
// find something. When we find something the `finger` will be set
435467
// to a UTF8 boundary.
436468
self.finger += index + 1;
437-
if self.finger >= self.utf8_size {
438-
let found_char = self.finger - self.utf8_size;
469+
470+
let utf8_size = self.utf8_size as usize;
471+
if self.finger >= utf8_size {
472+
let found_char = self.finger - utf8_size;
439473
if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
440-
if slice == &self.utf8_encoded[0..self.utf8_size] {
474+
if slice == &self.utf8_encoded[0..utf8_size] {
441475
return Some((found_char, self.finger));
442476
}
443477
}
@@ -481,8 +515,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
481515
// get the haystack up to but not including the last character searched
482516
let bytes = haystack.get(self.finger..self.finger_back)?;
483517
// the last byte of the utf8 encoded needle
484-
// SAFETY: we have an invariant that `utf8_size < 5`
485-
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
518+
let last_byte = *self.utf8_size.index(&self.utf8_encoded);
486519
if let Some(index) = memchr::memrchr(last_byte, bytes) {
487520
// we searched a slice that was offset by self.finger,
488521
// add self.finger to recoup the original index
@@ -493,14 +526,15 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
493526
// char in the paradigm of reverse iteration). For
494527
// multibyte chars we need to skip down by the number of more
495528
// bytes they have than ASCII
496-
let shift = self.utf8_size - 1;
529+
let utf8_size = self.utf8_size as usize;
530+
let shift = utf8_size - 1;
497531
if index >= shift {
498532
let found_char = index - shift;
499-
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
500-
if slice == &self.utf8_encoded[0..self.utf8_size] {
533+
if let Some(slice) = haystack.get(found_char..(found_char + utf8_size)) {
534+
if slice == &self.utf8_encoded[0..utf8_size] {
501535
// move finger to before the character found (i.e., at its start index)
502536
self.finger_back = found_char;
503-
return Some((self.finger_back, self.finger_back + self.utf8_size));
537+
return Some((self.finger_back, self.finger_back + utf8_size));
504538
}
505539
}
506540
}
@@ -543,6 +577,9 @@ impl<'a> Pattern<'a> for char {
543577
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
544578
let mut utf8_encoded = [0; 4];
545579
let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
580+
581+
// SAFETY: `encode_utf8` always writes between 1 and 4 bytes, so `utf8_size` is more than 0 and less than 5
582+
let utf8_size = unsafe { Utf8Size::new_unchecked(utf8_size) };
546583
CharSearcher {
547584
haystack,
548585
finger: 0,

0 commit comments

Comments
 (0)