15
15
#![ doc( primitive = "str" ) ]
16
16
#![ stable( feature = "rust1" , since = "1.0.0" ) ]
17
17
18
- use self :: OldSearcher :: { TwoWay , TwoWayLong } ;
19
18
use self :: pattern:: Pattern ;
20
19
use self :: pattern:: { Searcher , ReverseSearcher , DoubleEndedSearcher } ;
21
20
22
21
use char:: CharExt ;
23
22
use clone:: Clone ;
24
- use cmp:: { self , Eq } ;
23
+ use cmp:: Eq ;
25
24
use convert:: AsRef ;
26
25
use default:: Default ;
27
26
use fmt;
@@ -33,7 +32,6 @@ use option::Option::{self, None, Some};
33
32
use raw:: { Repr , Slice } ;
34
33
use result:: Result :: { self , Ok , Err } ;
35
34
use slice:: { self , SliceExt } ;
36
- use usize;
37
35
38
36
pub mod pattern;
39
37
@@ -870,301 +868,6 @@ impl<'a> DoubleEndedIterator for LinesAny<'a> {
870
868
}
871
869
}
872
870
873
- /// The internal state of an iterator that searches for matches of a substring
874
- /// within a larger string using two-way search
875
- #[ derive( Clone ) ]
876
- struct TwoWaySearcher {
877
- // constants
878
- crit_pos : usize ,
879
- period : usize ,
880
- byteset : u64 ,
881
-
882
- // variables
883
- position : usize ,
884
- memory : usize
885
- }
886
-
887
- /*
888
- This is the Two-Way search algorithm, which was introduced in the paper:
889
- Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
890
-
891
- Here's some background information.
892
-
893
- A *word* is a string of symbols. The *length* of a word should be a familiar
894
- notion, and here we denote it for any word x by |x|.
895
- (We also allow for the possibility of the *empty word*, a word of length zero).
896
-
897
- If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
898
- *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
899
- For example, both 1 and 2 are periods for the string "aa". As another example,
900
- the only period of the string "abcd" is 4.
901
-
902
- We denote by period(x) the *smallest* period of x (provided that x is non-empty).
903
- This is always well-defined since every non-empty word x has at least one period,
904
- |x|. We sometimes call this *the period* of x.
905
-
906
- If u, v and x are words such that x = uv, where uv is the concatenation of u and
907
- v, then we say that (u, v) is a *factorization* of x.
908
-
909
- Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
910
- that both of the following hold
911
-
912
- - either w is a suffix of u or u is a suffix of w
913
- - either w is a prefix of v or v is a prefix of w
914
-
915
- then w is said to be a *repetition* for the factorization (u, v).
916
-
917
- Just to unpack this, there are four possibilities here. Let w = "abc". Then we
918
- might have:
919
-
920
- - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
921
- - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
922
- - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
923
- - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
924
-
925
- Note that the word vu is a repetition for any factorization (u,v) of x = uv,
926
- so every factorization has at least one repetition.
927
-
928
- If x is a string and (u, v) is a factorization for x, then a *local period* for
929
- (u, v) is an integer r such that there is some word w such that |w| = r and w is
930
- a repetition for (u, v).
931
-
932
- We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
933
- call this *the local period* of (u, v). Provided that x = uv is non-empty, this
934
- is well-defined (because every factorization has at least one repetition, as
934
- noted above).
936
-
937
- It can be proven that the following is an equivalent definition of a local period
938
- for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
939
- all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
940
- defined. (i.e. i >= 0 and i + r < |x|).
941
-
942
- Using the above reformulation, it is easy to prove that
943
-
944
- 1 <= local_period(u, v) <= period(uv)
945
-
946
- A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
947
- *critical factorization*.
948
-
949
- The algorithm hinges on the following theorem, which is stated without proof:
950
-
951
- **Critical Factorization Theorem** Any word x has at least one critical
952
- factorization (u, v) such that |u| < period(x).
953
-
954
- The purpose of maximal_suffix is to find such a critical factorization.
955
-
956
- */
957
- impl TwoWaySearcher {
958
- #[ allow( dead_code) ]
959
- fn new ( needle : & [ u8 ] ) -> TwoWaySearcher {
960
- let ( crit_pos_false, period_false) = TwoWaySearcher :: maximal_suffix ( needle, false ) ;
961
- let ( crit_pos_true, period_true) = TwoWaySearcher :: maximal_suffix ( needle, true ) ;
962
-
963
- let ( crit_pos, period) =
964
- if crit_pos_false > crit_pos_true {
965
- ( crit_pos_false, period_false)
966
- } else {
967
- ( crit_pos_true, period_true)
968
- } ;
969
-
970
- // This isn't in the original algorithm, as far as I'm aware.
971
- let byteset = needle. iter ( )
972
- . fold ( 0 , |a, & b| ( 1 << ( ( b & 0x3f ) as usize ) ) | a) ;
973
-
974
- // A particularly readable explanation of what's going on here can be found
975
- // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
976
- // see the code for "Algorithm CP" on p. 323.
977
- //
978
- // What's going on is we have some critical factorization (u, v) of the
979
- // needle, and we want to determine whether u is a suffix of
980
- // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
981
- // "Algorithm CP2", which is optimized for when the period of the needle
982
- // is large.
983
- if & needle[ ..crit_pos] == & needle[ period.. period + crit_pos] {
984
- TwoWaySearcher {
985
- crit_pos : crit_pos,
986
- period : period,
987
- byteset : byteset,
988
-
989
- position : 0 ,
990
- memory : 0
991
- }
992
- } else {
993
- TwoWaySearcher {
994
- crit_pos : crit_pos,
995
- period : cmp:: max ( crit_pos, needle. len ( ) - crit_pos) + 1 ,
996
- byteset : byteset,
997
-
998
- position : 0 ,
999
- memory : usize:: MAX // Dummy value to signify that the period is long
1000
- }
1001
- }
1002
- }
1003
-
1004
- // One of the main ideas of Two-Way is that we factorize the needle into
1005
- // two halves, (u, v), and begin trying to find v in the haystack by scanning
1006
- // left to right. If v matches, we try to match u by scanning right to left.
1007
- // How far we can jump when we encounter a mismatch is all based on the fact
1008
- // that (u, v) is a critical factorization for the needle.
1009
- #[ inline]
1010
- fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , long_period : bool )
1011
- -> Option < ( usize , usize ) > {
1012
- ' search: loop {
1013
- // Check that we have room to search in
1014
- if self . position + needle. len ( ) > haystack. len ( ) {
1015
- return None ;
1016
- }
1017
-
1018
- // Quickly skip by large portions unrelated to our substring
1019
- if ( self . byteset >>
1020
- ( ( haystack[ self . position + needle. len ( ) - 1 ] & 0x3f )
1021
- as usize ) ) & 1 == 0 {
1022
- self . position += needle. len ( ) ;
1023
- if !long_period {
1024
- self . memory = 0 ;
1025
- }
1026
- continue ' search;
1027
- }
1028
-
1029
- // See if the right part of the needle matches
1030
- let start = if long_period { self . crit_pos }
1031
- else { cmp:: max ( self . crit_pos , self . memory ) } ;
1032
- for i in start..needle. len ( ) {
1033
- if needle[ i] != haystack[ self . position + i] {
1034
- self . position += i - self . crit_pos + 1 ;
1035
- if !long_period {
1036
- self . memory = 0 ;
1037
- }
1038
- continue ' search;
1039
- }
1040
- }
1041
-
1042
- // See if the left part of the needle matches
1043
- let start = if long_period { 0 } else { self . memory } ;
1044
- for i in ( start..self . crit_pos ) . rev ( ) {
1045
- if needle[ i] != haystack[ self . position + i] {
1046
- self . position += self . period ;
1047
- if !long_period {
1048
- self . memory = needle. len ( ) - self . period ;
1049
- }
1050
- continue ' search;
1051
- }
1052
- }
1053
-
1054
- // We have found a match!
1055
- let match_pos = self . position ;
1056
- self . position += needle. len ( ) ; // add self.period for all matches
1057
- if !long_period {
1058
- self . memory = 0 ; // set to needle.len() - self.period for all matches
1059
- }
1060
- return Some ( ( match_pos, match_pos + needle. len ( ) ) ) ;
1061
- }
1062
- }
1063
-
1064
- // Computes a critical factorization (u, v) of `arr`.
1065
- // Specifically, returns (i, p), where i is the starting index of v in some
1066
- // critical factorization (u, v) and p = period(v)
1067
- #[ inline]
1068
- #[ allow( dead_code) ]
1069
- #[ allow( deprecated) ]
1070
- fn maximal_suffix ( arr : & [ u8 ] , reversed : bool ) -> ( usize , usize ) {
1071
- let mut left: usize = !0 ; // Corresponds to i in the paper
1072
- let mut right = 0 ; // Corresponds to j in the paper
1073
- let mut offset = 1 ; // Corresponds to k in the paper
1074
- let mut period = 1 ; // Corresponds to p in the paper
1075
-
1076
- while right + offset < arr. len ( ) {
1077
- let a;
1078
- let b;
1079
- if reversed {
1080
- a = arr[ left. wrapping_add ( offset) ] ;
1081
- b = arr[ right + offset] ;
1082
- } else {
1083
- a = arr[ right + offset] ;
1084
- b = arr[ left. wrapping_add ( offset) ] ;
1085
- }
1086
- if a < b {
1087
- // Suffix is smaller, period is entire prefix so far.
1088
- right += offset;
1089
- offset = 1 ;
1090
- period = right. wrapping_sub ( left) ;
1091
- } else if a == b {
1092
- // Advance through repetition of the current period.
1093
- if offset == period {
1094
- right += offset;
1095
- offset = 1 ;
1096
- } else {
1097
- offset += 1 ;
1098
- }
1099
- } else {
1100
- // Suffix is larger, start over from current location.
1101
- left = right;
1102
- right += 1 ;
1103
- offset = 1 ;
1104
- period = 1 ;
1105
- }
1106
- }
1107
- ( left. wrapping_add ( 1 ) , period)
1108
- }
1109
- }
1110
-
1111
- /// The internal state of an iterator that searches for matches of a substring
1112
- /// within a larger string using a dynamically chosen search algorithm
1113
- #[ derive( Clone ) ]
1114
- // NB: This is kept around for convenience because
1115
- // it is planned to be used again in the future
1116
- enum OldSearcher {
1117
- TwoWay ( TwoWaySearcher ) ,
1118
- TwoWayLong ( TwoWaySearcher ) ,
1119
- }
1120
-
1121
- impl OldSearcher {
1122
- #[ allow( dead_code) ]
1123
- fn new ( haystack : & [ u8 ] , needle : & [ u8 ] ) -> OldSearcher {
1124
- if needle. is_empty ( ) {
1125
- // Handle specially
1126
- unimplemented ! ( )
1127
- // FIXME: Tune this.
1128
- // FIXME(#16715): This unsigned integer addition will probably not
1129
- // overflow because that would mean that the memory almost solely
1130
- // consists of the needle. Needs #16715 to be formally fixed.
1131
- } else if needle. len ( ) + 20 > haystack. len ( ) {
1132
- // Use naive searcher
1133
- unimplemented ! ( )
1134
- } else {
1135
- let searcher = TwoWaySearcher :: new ( needle) ;
1136
- if searcher. memory == usize:: MAX { // If the period is long
1137
- TwoWayLong ( searcher)
1138
- } else {
1139
- TwoWay ( searcher)
1140
- }
1141
- }
1142
- }
1143
- }
1144
-
1145
- #[ derive( Clone ) ]
1146
- // NB: This is kept around for convenience because
1147
- // it is planned to be used again in the future
1148
- struct OldMatchIndices < ' a , ' b > {
1149
- // constants
1150
- haystack : & ' a str ,
1151
- needle : & ' b str ,
1152
- searcher : OldSearcher
1153
- }
1154
-
1155
- impl < ' a , ' b > OldMatchIndices < ' a , ' b > {
1156
- #[ inline]
1157
- #[ allow( dead_code) ]
1158
- fn next ( & mut self ) -> Option < ( usize , usize ) > {
1159
- match self . searcher {
1160
- TwoWay ( ref mut searcher)
1161
- => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) , false ) ,
1162
- TwoWayLong ( ref mut searcher)
1163
- => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) , true ) ,
1164
- }
1165
- }
1166
- }
1167
-
1168
871
/*
1169
872
Section: Comparing strings
1170
873
*/
0 commit comments