@@ -544,11 +544,7 @@ pub struct StrSearcher<'a, 'b> {
544
544
#[ derive( Clone , Debug ) ]
545
545
enum StrSearcherImpl {
546
546
Empty ( EmptyNeedle ) ,
547
- TwoWay {
548
- last_match_fw : Option < ( usize , usize ) > ,
549
- last_match_bw : Option < ( usize , usize ) > ,
550
- searcher : TwoWaySearcher ,
551
- }
547
+ TwoWay ( TwoWaySearcher ) ,
552
548
}
553
549
554
550
#[ derive( Clone , Debug ) ]
@@ -576,11 +572,9 @@ impl<'a, 'b> StrSearcher<'a, 'b> {
576
572
StrSearcher {
577
573
haystack : haystack,
578
574
needle : needle,
579
- searcher : StrSearcherImpl :: TwoWay {
580
- last_match_fw : None ,
581
- last_match_bw : None ,
582
- searcher : TwoWaySearcher :: new ( needle. as_bytes ( ) , haystack. len ( ) )
583
- } ,
575
+ searcher : StrSearcherImpl :: TwoWay (
576
+ TwoWaySearcher :: new ( needle. as_bytes ( ) , haystack. len ( ) )
577
+ ) ,
584
578
}
585
579
}
586
580
}
@@ -606,39 +600,55 @@ unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
606
600
}
607
601
}
608
602
}
609
- StrSearcherImpl :: TwoWay { ref mut last_match_fw , ref mut searcher, .. } => {
603
+ StrSearcherImpl :: TwoWay ( ref mut searcher) => {
610
604
// TwoWaySearcher produces valid *Match* indices that split at char boundaries
611
605
// as long as it does correct matching and that haystack and needle are
612
606
// valid UTF-8
613
- // *Rejects* fall on the same indices (the intervals between matches)
614
- // so they are always on character boundaries .
615
- if let Some ( ( a , b ) ) = last_match_fw . take ( ) {
616
- return SearchStep :: Match ( a , b ) ;
607
+ // *Rejects* from the algorithm can fall on any indices, but we will walk them
608
+ // manually to the next character boundary, so that they are utf-8 safe .
609
+ if searcher . position == self . haystack . len ( ) {
610
+ return SearchStep :: Done ;
617
611
}
618
- let last_pos = searcher. position ;
619
612
let is_long = searcher. memory == usize:: MAX ;
620
- let next_match = searcher. next ( self . haystack . as_bytes ( ) ,
621
- self . needle . as_bytes ( ) ,
622
- is_long) ;
623
- match next_match {
624
- None => if last_pos != self . haystack . len ( ) {
625
- SearchStep :: Reject ( last_pos, self . haystack . len ( ) )
626
- } else {
627
- SearchStep :: Done
628
- } ,
629
- Some ( ( a, b) ) => {
630
- if a == last_pos {
631
- SearchStep :: Match ( a, b)
632
- } else {
633
- * last_match_fw = Some ( ( a, b) ) ;
634
- SearchStep :: Reject ( last_pos, a)
613
+ match searcher. next :: < RejectAndMatch > ( self . haystack . as_bytes ( ) ,
614
+ self . needle . as_bytes ( ) ,
615
+ is_long)
616
+ {
617
+ SearchStep :: Reject ( a, mut b) => {
618
+ // skip to next char boundary
619
+ while !self . haystack . is_char_boundary ( b) {
620
+ b += 1 ;
635
621
}
622
+ searcher. position = cmp:: max ( b, searcher. position ) ;
623
+ SearchStep :: Reject ( a, b)
636
624
}
625
+ otherwise => otherwise,
637
626
}
638
627
}
639
628
}
640
629
}
641
630
631
+ #[ inline]
632
+ fn next_match ( & mut self ) -> Option < ( usize , usize ) > {
633
+ match self . searcher {
634
+ StrSearcherImpl :: Empty ( ..) => {
635
+ loop {
636
+ match self . next ( ) {
637
+ SearchStep :: Match ( a, b) => return Some ( ( a, b) ) ,
638
+ SearchStep :: Done => return None ,
639
+ SearchStep :: Reject ( ..) => { }
640
+ }
641
+ }
642
+ }
643
+ StrSearcherImpl :: TwoWay ( ref mut searcher) => {
644
+ let is_long = searcher. memory == usize:: MAX ;
645
+ searcher. next :: < MatchOnly > ( self . haystack . as_bytes ( ) ,
646
+ self . needle . as_bytes ( ) ,
647
+ is_long)
648
+ }
649
+ }
650
+ }
651
+
642
652
}
643
653
unsafe impl < ' a , ' b > ReverseSearcher < ' a > for StrSearcher < ' a , ' b > {
644
654
#[ inline]
@@ -657,31 +667,45 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
657
667
}
658
668
}
659
669
}
660
- StrSearcherImpl :: TwoWay { ref mut last_match_bw , ref mut searcher, .. } => {
661
- if let Some ( ( a , b ) ) = last_match_bw . take ( ) {
662
- return SearchStep :: Match ( a , b ) ;
670
+ StrSearcherImpl :: TwoWay ( ref mut searcher) => {
671
+ if searcher . end == 0 {
672
+ return SearchStep :: Done ;
663
673
}
664
- let last_end = searcher. end ;
665
- let next_match = searcher. next_back ( self . haystack . as_bytes ( ) ,
666
- self . needle . as_bytes ( ) ) ;
667
- match next_match {
668
- None => if last_end != 0 {
669
- SearchStep :: Reject ( 0 , last_end)
670
- } else {
671
- SearchStep :: Done
672
- } ,
673
- Some ( ( a, b) ) => {
674
- if b == last_end {
675
- SearchStep :: Match ( a, b)
676
- } else {
677
- * last_match_bw = Some ( ( a, b) ) ;
678
- SearchStep :: Reject ( b, last_end)
674
+ match searcher. next_back :: < RejectAndMatch > ( self . haystack . as_bytes ( ) ,
675
+ self . needle . as_bytes ( ) )
676
+ {
677
+ SearchStep :: Reject ( mut a, b) => {
678
+ // skip back to the previous char boundary
679
+ while !self . haystack . is_char_boundary ( a) {
680
+ a -= 1 ;
679
681
}
682
+ searcher. end = cmp:: min ( a, searcher. end ) ;
683
+ SearchStep :: Reject ( a, b)
680
684
}
685
+ otherwise => otherwise,
681
686
}
682
687
}
683
688
}
684
689
}
690
+
691
+ #[ inline]
692
+ fn next_match_back ( & mut self ) -> Option < ( usize , usize ) > {
693
+ match self . searcher {
694
+ StrSearcherImpl :: Empty ( ..) => {
695
+ loop {
696
+ match self . next_back ( ) {
697
+ SearchStep :: Match ( a, b) => return Some ( ( a, b) ) ,
698
+ SearchStep :: Done => return None ,
699
+ SearchStep :: Reject ( ..) => { }
700
+ }
701
+ }
702
+ }
703
+ StrSearcherImpl :: TwoWay ( ref mut searcher) => {
704
+ searcher. next_back :: < MatchOnly > ( self . haystack . as_bytes ( ) ,
705
+ self . needle . as_bytes ( ) )
706
+ }
707
+ }
708
+ }
685
709
}
686
710
687
711
/// The internal state of an iterator that searches for matches of a substring
@@ -831,14 +855,21 @@ impl TwoWaySearcher {
831
855
// How far we can jump when we encounter a mismatch is all based on the fact
832
856
// that (u, v) is a critical factorization for the needle.
833
857
#[ inline]
834
- fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , long_period : bool )
835
- -> Option < ( usize , usize ) > {
858
+ fn next < S > ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , long_period : bool )
859
+ -> S :: Output
860
+ where S : TwoWayStrategy
861
+ {
836
862
// `next()` uses `self.position` as its cursor
863
+ let old_pos = self . position ;
837
864
' search: loop {
838
865
// Check that we have room to search in
839
- if self . position + needle. len ( ) > haystack. len ( ) {
866
+ if needle. len ( ) > haystack. len ( ) - self . position {
840
867
self . position = haystack. len ( ) ;
841
- return None ;
868
+ return S :: rejecting ( old_pos, self . position ) ;
869
+ }
870
+
871
+ if S :: use_early_reject ( ) && old_pos != self . position {
872
+ return S :: rejecting ( old_pos, self . position ) ;
842
873
}
843
874
844
875
// Quickly skip by large portions unrelated to our substring
@@ -884,7 +915,7 @@ impl TwoWaySearcher {
884
915
self . memory = 0 ; // set to needle.len() - self.period for overlapping matches
885
916
}
886
917
887
- return Some ( ( match_pos, match_pos + needle. len ( ) ) ) ;
918
+ return S :: matching ( match_pos, match_pos + needle. len ( ) ) ;
888
919
}
889
920
}
890
921
@@ -902,14 +933,22 @@ impl TwoWaySearcher {
902
933
// a reversed haystack with a reversed needle, and the above paragraph shows
903
934
// that the precomputed parameters can be left alone.
904
935
#[ inline]
905
- fn next_back ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] ) -> Option < ( usize , usize ) > {
936
+ fn next_back < S > ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] )
937
+ -> S :: Output
938
+ where S : TwoWayStrategy
939
+ {
906
940
// `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
907
941
// are independent.
942
+ let old_end = self . end ;
908
943
' search: loop {
909
944
// Check that we have room to search in
910
945
if needle. len ( ) > self . end {
911
946
self . end = 0 ;
912
- return None ;
947
+ return S :: rejecting ( 0 , old_end) ;
948
+ }
949
+
950
+ if S :: use_early_reject ( ) && old_end != self . end {
951
+ return S :: rejecting ( self . end , old_end) ;
913
952
}
914
953
915
954
// Quickly skip by large portions unrelated to our substring
@@ -939,7 +978,7 @@ impl TwoWaySearcher {
939
978
// Note: sub self.period instead of needle.len() to have overlapping matches
940
979
self . end -= needle. len ( ) ;
941
980
942
- return Some ( ( match_pos, match_pos + needle. len ( ) ) ) ;
981
+ return S :: matching ( match_pos, match_pos + needle. len ( ) ) ;
943
982
}
944
983
}
945
984
@@ -987,3 +1026,40 @@ impl TwoWaySearcher {
987
1026
( left. wrapping_add ( 1 ) , period)
988
1027
}
989
1028
}
1029
+
1030
// TwoWayStrategy allows the algorithm to either skip non-matches as quickly
// as possible, or to work in a mode where it emits Rejects relatively quickly.
//
// It is used purely as a type parameter of `TwoWaySearcher::next` and
// `TwoWaySearcher::next_back`; all of its functions are associated functions
// with no receiver.
trait TwoWayStrategy {
    /// What the search functions return when run with this strategy.
    type Output;

    /// When `true`, the search loop returns a reject as soon as its cursor
    /// has moved, instead of carrying on until the next match.
    fn use_early_reject() -> bool;

    /// Builds the output for a rejected index pair `(a, b)`.
    // Parameters are named: anonymous trait-method parameters are deprecated
    // and rejected outright from the 2018 edition on.
    fn rejecting(a: usize, b: usize) -> Self::Output;

    /// Builds the output for a matched index pair `(a, b)`.
    fn matching(a: usize, b: usize) -> Self::Output;
}
1038
+
1039
+ /// Skip to match intervals as quickly as possible
1040
+ enum MatchOnly { }
1041
+
1042
+ impl TwoWayStrategy for MatchOnly {
1043
+ type Output = Option < ( usize , usize ) > ;
1044
+
1045
+ #[ inline]
1046
+ fn use_early_reject ( ) -> bool { false }
1047
+ #[ inline]
1048
+ fn rejecting ( _a : usize , _b : usize ) -> Self :: Output { None }
1049
+ #[ inline]
1050
+ fn matching ( a : usize , b : usize ) -> Self :: Output { Some ( ( a, b) ) }
1051
+ }
1052
+
1053
+ /// Emit Rejects regularly
1054
+ enum RejectAndMatch { }
1055
+
1056
+ impl TwoWayStrategy for RejectAndMatch {
1057
+ type Output = SearchStep ;
1058
+
1059
+ #[ inline]
1060
+ fn use_early_reject ( ) -> bool { true }
1061
+ #[ inline]
1062
+ fn rejecting ( a : usize , b : usize ) -> Self :: Output { SearchStep :: Reject ( a, b) }
1063
+ #[ inline]
1064
+ fn matching ( a : usize , b : usize ) -> Self :: Output { SearchStep :: Match ( a, b) }
1065
+ }
0 commit comments