1
- //! Damerau-Levenshtein distances.
1
+ //! Edit distances.
2
2
//!
3
- //! The [Damerau-Levenshtein distance] is a metric for measuring the difference between two strings.
4
- //! This implementation is a restricted version of the algorithm, as it does not permit modifying
5
- //! characters that have already been transposed.
3
+ //! The [edit distance] is a metric for measuring the difference between two strings.
6
4
//!
7
- //! [Damerau-Levenshtein distance]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
5
+ //! [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
6
+
7
+ // The current implementation is the restricted Damerau-Levenshtein algorithm. It is restricted
8
+ // because it does not permit modifying characters that have already been transposed. The specific
9
+ // algorithm should not matter to the caller of the methods, which is why it is not noted in the
10
+ // documentation.
8
11
9
12
use crate :: symbol:: Symbol ;
10
13
use std:: { cmp, mem} ;
11
14
12
15
#[ cfg( test) ]
13
16
mod tests;
14
17
15
- /// Finds the restricted Damerau-Levenshtein distance between two strings. Characters that have
16
- /// already been transposed may not be modified.
18
+ /// Finds the [edit distance] between two strings.
19
+ ///
20
+ /// Returns `None` if the distance exceeds the limit.
17
21
///
18
- /// Returns None if the distance exceeds the limit.
19
- pub fn lev_distance ( a : & str , b : & str , limit : usize ) -> Option < usize > {
22
+ /// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
23
+ pub fn edit_distance ( a : & str , b : & str , limit : usize ) -> Option < usize > {
20
24
let mut a = & a. chars ( ) . collect :: < Vec < _ > > ( ) [ ..] ;
21
25
let mut b = & b. chars ( ) . collect :: < Vec < _ > > ( ) [ ..] ;
22
26
@@ -95,29 +99,29 @@ pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
95
99
}
96
100
97
101
/// Provides a word similarity score between two words that accounts for substrings being more
98
- /// meaningful than a typical Levenshtein distance. The lower the score, the closer the match.
99
- /// 0 is an identical match.
102
+ /// meaningful than a typical edit distance. The lower the score, the closer the match. 0 is an
103
+ /// identical match.
100
104
///
101
- /// Uses the Levenshtein distance between the two strings and removes the cost of the length
102
- /// difference. If this is 0 then it is either a substring match or a full word match, in the
103
- /// substring match case we detect this and return `1`. To prevent finding meaningless substrings,
104
- /// e.g. "in" in "shrink", we only perform this subtraction of length difference if one of the words
105
- /// is not greater than twice the length of the other. For cases where the words are close in size
106
- /// but not an exact substring then the cost of the length difference is discounted by half.
105
+ /// Uses the edit distance between the two strings and removes the cost of the length difference.
106
+ /// If this is 0 then it is either a substring match or a full word match; in the substring match
107
+ /// case we detect this and return `1`. To prevent finding meaningless substrings, e.g. "in" in
108
+ /// "shrink", we only perform this subtraction of length difference if one of the words is not
109
+ /// greater than twice the length of the other. For cases where the words are close in size but not
110
+ /// an exact substring then the cost of the length difference is discounted by half.
107
111
///
108
112
/// Returns `None` if the distance exceeds the limit.
109
- pub fn lev_distance_with_substrings ( a : & str , b : & str , limit : usize ) -> Option < usize > {
113
+ pub fn edit_distance_with_substrings ( a : & str , b : & str , limit : usize ) -> Option < usize > {
110
114
let n = a. chars ( ) . count ( ) ;
111
115
let m = b. chars ( ) . count ( ) ;
112
116
113
117
// Check one isn't less than half the length of the other. If this is true then there is a
114
118
// big difference in length.
115
119
let big_len_diff = ( n * 2 ) < m || ( m * 2 ) < n;
116
120
let len_diff = if n < m { m - n } else { n - m } ;
117
- let lev = lev_distance ( a, b, limit + len_diff) ?;
121
+ let distance = edit_distance ( a, b, limit + len_diff) ?;
118
122
119
123
// This is the crux, subtracting length difference means exact substring matches will now be 0
120
- let score = lev - len_diff;
124
+ let score = distance - len_diff;
121
125
122
126
// If the score is 0 but the words have different lengths then it's a substring match not a full
123
127
// word match
@@ -136,12 +140,12 @@ pub fn lev_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<us
136
140
137
141
/// Finds the best match for given word in the given iterator where substrings are meaningful.
138
142
///
139
- /// A version of [`find_best_match_for_name`] that uses [`lev_distance_with_substrings`] as the score
140
- /// for word similarity. This takes an optional distance limit which defaults to one-third of the
141
- /// given word.
143
+ /// A version of [`find_best_match_for_name`] that uses [`edit_distance_with_substrings`] as the
144
+ /// score for word similarity. This takes an optional distance limit which defaults to one-third of
145
+ /// the given word.
142
146
///
143
- /// Besides the modified Levenshtein, we use case insensitive comparison to improve accuracy
144
- /// on an edge case with a lower(upper)case letters mismatch.
147
+ /// We use case-insensitive comparison to improve accuracy in the edge case where the letters
148
+ /// differ only in their case (lowercase vs. uppercase).
145
149
pub fn find_best_match_for_name_with_substrings (
146
150
candidates : & [ Symbol ] ,
147
151
lookup : Symbol ,
@@ -156,8 +160,8 @@ pub fn find_best_match_for_name_with_substrings(
156
160
/// an optional limit for the maximum allowable edit distance, which defaults
157
161
/// to one-third of the given word.
158
162
///
159
- /// Besides Levenshtein, we use case insensitive comparison to improve accuracy
160
- /// on an edge case with a lower(upper)case letters mismatch.
163
+ /// We use case-insensitive comparison to improve accuracy in the edge case where the letters
164
+ /// differ only in their case (lowercase vs. uppercase).
161
165
pub fn find_best_match_for_name (
162
166
candidates : & [ Symbol ] ,
163
167
lookup : Symbol ,
@@ -178,7 +182,7 @@ fn find_best_match_for_name_impl(
178
182
179
183
// Priority of matches:
180
184
// 1. Exact case insensitive match
181
- // 2. Levenshtein distance match
185
+ // 2. Edit distance match
182
186
// 3. Sorted word match
183
187
if let Some ( c) = candidates. iter ( ) . find ( |c| c. as_str ( ) . to_uppercase ( ) == lookup_uppercase) {
184
188
return Some ( * c) ;
@@ -188,9 +192,9 @@ fn find_best_match_for_name_impl(
188
192
let mut best = None ;
189
193
for c in candidates {
190
194
match if use_substring_score {
191
- lev_distance_with_substrings ( lookup, c. as_str ( ) , dist)
195
+ edit_distance_with_substrings ( lookup, c. as_str ( ) , dist)
192
196
} else {
193
- lev_distance ( lookup, c. as_str ( ) , dist)
197
+ edit_distance ( lookup, c. as_str ( ) , dist)
194
198
} {
195
199
Some ( 0 ) => return Some ( * c) ,
196
200
Some ( d) => {
0 commit comments