1
- use std:: cmp;
1
+ use std:: { cmp, mem } ;
2
2
3
/// Finds the [edit distance] between two strings.
///
/// Both inputs are lowercased before comparison, so a difference in
/// capitalization alone costs nothing. Insertions, deletions and substitutions
/// each cost one edit, and so does swapping two adjacent characters.
///
/// Returns `None` if the distance exceeds `limit`.
///
/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
    // Comparing the strings lowercased makes a difference in capitalization
    // closer than a completely different letter. Otherwise `CHECK` would be as
    // far away from `check` as it is from `build` (both with a distance of 5),
    // and every single-letter shortcut (e.g. `b` or `c`) would be equally far
    // from any capital single-letter entry (all with a distance of 1).
    let a_chars: Vec<char> = a.to_lowercase().chars().collect();
    let b_chars: Vec<char> = b.to_lowercase().chars().collect();
    let mut a = a_chars.as_slice();
    let mut b = b_chars.as_slice();

    // Ensure that `b` is the shorter string, minimizing memory use.
    if a.len() < b.len() {
        mem::swap(&mut a, &mut b);
    }

    // The length difference is a lower bound on the distance, so if it already
    // exceeds the limit we can return early.
    let min_dist = a.len() - b.len();
    if min_dist > limit {
        return None;
    }

    // Strip the common prefix; it cannot contribute to the distance.
    let prefix_len = a.iter().zip(b).take_while(|(x, y)| x == y).count();
    a = &a[prefix_len..];
    b = &b[prefix_len..];

    // Strip the common suffix of what remains.
    let suffix_len = a
        .iter()
        .rev()
        .zip(b.iter().rev())
        .take_while(|(x, y)| x == y)
        .count();
    a = &a[..a.len() - suffix_len];
    b = &b[..b.len() - suffix_len];

    // If either string is empty, the distance is the length of the other.
    // `b` is the shorter string, so there is no need to check `a`. Both slices
    // were trimmed by the same amount, so `a.len()` still equals `min_dist`.
    if b.is_empty() {
        return Some(min_dist);
    }

    // Three rolling rows of the dynamic-programming table. `prev_prev` starts
    // with a placeholder value that is never read: the transposition case
    // below only consults it from the second row onwards (`i > 1`).
    let mut prev_prev = vec![usize::MAX; b.len() + 1];
    let mut prev = (0..=b.len()).collect::<Vec<_>>();
    let mut current = vec![0; b.len() + 1];

    // Row by row.
    for i in 1..=a.len() {
        current[0] = i;
        let a_idx = i - 1;

        // Column by column.
        for j in 1..=b.len() {
            let b_idx = j - 1;

            // There is no cost to substitute a character with itself.
            let substitution_cost = usize::from(a[a_idx] != b[b_idx]);

            let mut best = cmp::min(
                // deletion
                prev[j] + 1,
                cmp::min(
                    // insertion
                    current[j - 1] + 1,
                    // substitution
                    prev[j - 1] + substitution_cost,
                ),
            );

            if i > 1 && j > 1 && a[a_idx] == b[b_idx - 1] && a[a_idx - 1] == b[b_idx] {
                // transposition
                best = cmp::min(best, prev_prev[j - 2] + 1);
            }

            current[j] = best;
        }

        // Rotate the buffers, reusing the memory.
        [prev_prev, prev, current] = [prev, current, prev_prev];
    }

    // `prev` because we already rotated the buffers.
    let distance = prev[b.len()];
    (distance <= limit).then_some(distance)
}
44
95
45
96
/// Find the closest element from `iter` matching `choice`. The `key` callback
@@ -51,8 +102,7 @@ pub fn closest<'a, T>(
51
102
) -> Option < T > {
52
103
// Only consider candidates with a lev_distance of 3 or less so we don't
53
104
// suggest out-of-the-blue options.
54
- iter. map ( |e| ( lev_distance ( choice, key ( & e) ) , e) )
55
- . filter ( |& ( d, _) | d < 4 )
105
+ iter. filter_map ( |e| Some ( ( lev_distance ( choice, key ( & e) , 3 ) ?, e) ) )
56
106
. min_by_key ( |t| t. 0 )
57
107
. map ( |t| t. 1 )
58
108
}
@@ -78,16 +128,16 @@ fn test_lev_distance() {
78
128
. filter_map ( from_u32)
79
129
. map ( |i| i. to_string ( ) )
80
130
{
81
- assert_eq ! ( lev_distance( & c, & c) , 0 ) ;
131
+ assert_eq ! ( lev_distance( & c, & c, usize :: MAX ) , Some ( 0 ) ) ;
82
132
}
83
133
84
134
let a = "\n Märy häd ä little lämb\n \n Little lämb\n " ;
85
135
let b = "\n Mary häd ä little lämb\n \n Little lämb\n " ;
86
136
let c = "Mary häd ä little lämb\n \n Little lämb\n " ;
87
- assert_eq ! ( lev_distance( a, b) , 1 ) ;
88
- assert_eq ! ( lev_distance( b, a) , 1 ) ;
89
- assert_eq ! ( lev_distance( a, c) , 2 ) ;
90
- assert_eq ! ( lev_distance( c, a) , 2 ) ;
91
- assert_eq ! ( lev_distance( b, c) , 1 ) ;
92
- assert_eq ! ( lev_distance( c, b) , 1 ) ;
137
+ assert_eq ! ( lev_distance( a, b, usize :: MAX ) , Some ( 1 ) ) ;
138
+ assert_eq ! ( lev_distance( b, a, usize :: MAX ) , Some ( 1 ) ) ;
139
+ assert_eq ! ( lev_distance( a, c, usize :: MAX ) , Some ( 2 ) ) ;
140
+ assert_eq ! ( lev_distance( c, a, usize :: MAX ) , Some ( 2 ) ) ;
141
+ assert_eq ! ( lev_distance( b, c, usize :: MAX ) , Some ( 1 ) ) ;
142
+ assert_eq ! ( lev_distance( c, b, usize :: MAX ) , Some ( 1 ) ) ;
93
143
}
0 commit comments