Skip to content

Commit 71006bd

Browse files
author
Ulrik Sverdrup
committed
StrSearcher: Use a trait to specialize the two-way algorithm by use case
Use a trait so that the two-way algorithm can provide both the fast search, which skips directly to each match, and the slower search, which emits `Reject` intervals regularly. The latter is important for uses of `next_reject`.
1 parent a6dd203 commit 71006bd

File tree

1 file changed

+133
-57
lines changed

1 file changed

+133
-57
lines changed

src/libcore/str/pattern.rs

+133-57
Original file line numberDiff line numberDiff line change
@@ -544,11 +544,7 @@ pub struct StrSearcher<'a, 'b> {
544544
#[derive(Clone, Debug)]
545545
enum StrSearcherImpl {
546546
Empty(EmptyNeedle),
547-
TwoWay {
548-
last_match_fw: Option<(usize, usize)>,
549-
last_match_bw: Option<(usize, usize)>,
550-
searcher: TwoWaySearcher,
551-
}
547+
TwoWay(TwoWaySearcher),
552548
}
553549

554550
#[derive(Clone, Debug)]
@@ -576,11 +572,9 @@ impl<'a, 'b> StrSearcher<'a, 'b> {
576572
StrSearcher {
577573
haystack: haystack,
578574
needle: needle,
579-
searcher: StrSearcherImpl::TwoWay {
580-
last_match_fw: None,
581-
last_match_bw: None,
582-
searcher: TwoWaySearcher::new(needle.as_bytes(), haystack.len())
583-
},
575+
searcher: StrSearcherImpl::TwoWay(
576+
TwoWaySearcher::new(needle.as_bytes(), haystack.len())
577+
),
584578
}
585579
}
586580
}
@@ -606,39 +600,55 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
606600
}
607601
}
608602
}
609-
StrSearcherImpl::TwoWay { ref mut last_match_fw, ref mut searcher, .. } => {
603+
StrSearcherImpl::TwoWay(ref mut searcher) => {
610604
// TwoWaySearcher produces valid *Match* indices that split at char boundaries
611605
// as long as it does correct matching and that haystack and needle are
612606
// valid UTF-8
613-
// *Rejects* fall on the same indices (the intervals between matches)
614-
// so they are always on character boundaries.
615-
if let Some((a, b)) = last_match_fw.take() {
616-
return SearchStep::Match(a, b);
607+
// *Rejects* from the algorithm can fall on any indices, but we will walk them
608+
// manually to the next character boundary, so that they are UTF-8 safe.
609+
if searcher.position == self.haystack.len() {
610+
return SearchStep::Done;
617611
}
618-
let last_pos = searcher.position;
619612
let is_long = searcher.memory == usize::MAX;
620-
let next_match = searcher.next(self.haystack.as_bytes(),
621-
self.needle.as_bytes(),
622-
is_long);
623-
match next_match {
624-
None => if last_pos != self.haystack.len() {
625-
SearchStep::Reject(last_pos, self.haystack.len())
626-
} else {
627-
SearchStep::Done
628-
},
629-
Some((a, b)) => {
630-
if a == last_pos {
631-
SearchStep::Match(a, b)
632-
} else {
633-
*last_match_fw = Some((a, b));
634-
SearchStep::Reject(last_pos, a)
613+
match searcher.next::<RejectAndMatch>(self.haystack.as_bytes(),
614+
self.needle.as_bytes(),
615+
is_long)
616+
{
617+
SearchStep::Reject(a, mut b) => {
618+
// skip to next char boundary
619+
while !self.haystack.is_char_boundary(b) {
620+
b += 1;
635621
}
622+
searcher.position = cmp::max(b, searcher.position);
623+
SearchStep::Reject(a, b)
636624
}
625+
otherwise => otherwise,
637626
}
638627
}
639628
}
640629
}
641630

631+
#[inline]
632+
fn next_match(&mut self) -> Option<(usize, usize)> {
633+
match self.searcher {
634+
StrSearcherImpl::Empty(..) => {
635+
loop {
636+
match self.next() {
637+
SearchStep::Match(a, b) => return Some((a, b)),
638+
SearchStep::Done => return None,
639+
SearchStep::Reject(..) => { }
640+
}
641+
}
642+
}
643+
StrSearcherImpl::TwoWay(ref mut searcher) => {
644+
let is_long = searcher.memory == usize::MAX;
645+
searcher.next::<MatchOnly>(self.haystack.as_bytes(),
646+
self.needle.as_bytes(),
647+
is_long)
648+
}
649+
}
650+
}
651+
642652
}
643653
unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
644654
#[inline]
@@ -657,31 +667,45 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
657667
}
658668
}
659669
}
660-
StrSearcherImpl::TwoWay { ref mut last_match_bw, ref mut searcher, .. } => {
661-
if let Some((a, b)) = last_match_bw.take() {
662-
return SearchStep::Match(a, b);
670+
StrSearcherImpl::TwoWay(ref mut searcher) => {
671+
if searcher.end == 0 {
672+
return SearchStep::Done;
663673
}
664-
let last_end = searcher.end;
665-
let next_match = searcher.next_back(self.haystack.as_bytes(),
666-
self.needle.as_bytes());
667-
match next_match {
668-
None => if last_end != 0 {
669-
SearchStep::Reject(0, last_end)
670-
} else {
671-
SearchStep::Done
672-
},
673-
Some((a, b)) => {
674-
if b == last_end {
675-
SearchStep::Match(a, b)
676-
} else {
677-
*last_match_bw = Some((a, b));
678-
SearchStep::Reject(b, last_end)
674+
match searcher.next_back::<RejectAndMatch>(self.haystack.as_bytes(),
675+
self.needle.as_bytes())
676+
{
677+
SearchStep::Reject(mut a, b) => {
678+
// step back to the previous char boundary (we are searching in reverse)
679+
while !self.haystack.is_char_boundary(a) {
680+
a -= 1;
679681
}
682+
searcher.end = cmp::min(a, searcher.end);
683+
SearchStep::Reject(a, b)
680684
}
685+
otherwise => otherwise,
681686
}
682687
}
683688
}
684689
}
690+
691+
#[inline]
692+
fn next_match_back(&mut self) -> Option<(usize, usize)> {
693+
match self.searcher {
694+
StrSearcherImpl::Empty(..) => {
695+
loop {
696+
match self.next_back() {
697+
SearchStep::Match(a, b) => return Some((a, b)),
698+
SearchStep::Done => return None,
699+
SearchStep::Reject(..) => { }
700+
}
701+
}
702+
}
703+
StrSearcherImpl::TwoWay(ref mut searcher) => {
704+
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
705+
self.needle.as_bytes())
706+
}
707+
}
708+
}
685709
}
686710

687711
/// The internal state of an iterator that searches for matches of a substring
@@ -831,14 +855,21 @@ impl TwoWaySearcher {
831855
// How far we can jump when we encounter a mismatch is all based on the fact
832856
// that (u, v) is a critical factorization for the needle.
833857
#[inline]
834-
fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
835-
-> Option<(usize, usize)> {
858+
fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
859+
-> S::Output
860+
where S: TwoWayStrategy
861+
{
836862
// `next()` uses `self.position` as its cursor
863+
let old_pos = self.position;
837864
'search: loop {
838865
// Check that we have room to search in
839-
if self.position + needle.len() > haystack.len() {
866+
if needle.len() > haystack.len() - self.position {
840867
self.position = haystack.len();
841-
return None;
868+
return S::rejecting(old_pos, self.position);
869+
}
870+
871+
if S::use_early_reject() && old_pos != self.position {
872+
return S::rejecting(old_pos, self.position);
842873
}
843874

844875
// Quickly skip by large portions unrelated to our substring
@@ -884,7 +915,7 @@ impl TwoWaySearcher {
884915
self.memory = 0; // set to needle.len() - self.period for overlapping matches
885916
}
886917

887-
return Some((match_pos, match_pos + needle.len()));
918+
return S::matching(match_pos, match_pos + needle.len());
888919
}
889920
}
890921

@@ -902,14 +933,22 @@ impl TwoWaySearcher {
902933
// a reversed haystack with a reversed needle, and the above paragraph shows
903934
// that the precomputed parameters can be left alone.
904935
#[inline]
905-
fn next_back(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(usize, usize)> {
936+
fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8])
937+
-> S::Output
938+
where S: TwoWayStrategy
939+
{
906940
// `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
907941
// are independent.
942+
let old_end = self.end;
908943
'search: loop {
909944
// Check that we have room to search in
910945
if needle.len() > self.end {
911946
self.end = 0;
912-
return None;
947+
return S::rejecting(0, old_end);
948+
}
949+
950+
if S::use_early_reject() && old_end != self.end {
951+
return S::rejecting(self.end, old_end);
913952
}
914953

915954
// Quickly skip by large portions unrelated to our substring
@@ -939,7 +978,7 @@ impl TwoWaySearcher {
939978
// Note: sub self.period instead of needle.len() to have overlapping matches
940979
self.end -= needle.len();
941980

942-
return Some((match_pos, match_pos + needle.len()));
981+
return S::matching(match_pos, match_pos + needle.len());
943982
}
944983
}
945984

@@ -987,3 +1026,40 @@ impl TwoWaySearcher {
9871026
(left.wrapping_add(1), period)
9881027
}
9891028
}
1029+
1030+
// TwoWayStrategy allows the algorithm to either skip non-matches as quickly
1031+
// as possible, or to work in a mode where it emits Rejects relatively quickly.
1032+
trait TwoWayStrategy {
1033+
type Output;
1034+
fn use_early_reject() -> bool;
1035+
fn rejecting(usize, usize) -> Self::Output;
1036+
fn matching(usize, usize) -> Self::Output;
1037+
}
1038+
1039+
/// Skip to match intervals as quickly as possible
1040+
enum MatchOnly { }
1041+
1042+
impl TwoWayStrategy for MatchOnly {
1043+
type Output = Option<(usize, usize)>;
1044+
1045+
#[inline]
1046+
fn use_early_reject() -> bool { false }
1047+
#[inline]
1048+
fn rejecting(_a: usize, _b: usize) -> Self::Output { None }
1049+
#[inline]
1050+
fn matching(a: usize, b: usize) -> Self::Output { Some((a, b)) }
1051+
}
1052+
1053+
/// Emit Rejects regularly
1054+
enum RejectAndMatch { }
1055+
1056+
impl TwoWayStrategy for RejectAndMatch {
1057+
type Output = SearchStep;
1058+
1059+
#[inline]
1060+
fn use_early_reject() -> bool { true }
1061+
#[inline]
1062+
fn rejecting(a: usize, b: usize) -> Self::Output { SearchStep::Reject(a, b) }
1063+
#[inline]
1064+
fn matching(a: usize, b: usize) -> Self::Output { SearchStep::Match(a, b) }
1065+
}

0 commit comments

Comments
 (0)