Skip to content

Commit f026409

Browse files
committed
Use restricted Damerau-Levenshtein algorithm
1 parent 7bf43f0 commit f026409

File tree

3 files changed

+88
-38
lines changed

3 files changed

+88
-38
lines changed

src/cargo/core/resolver/errors.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,8 +308,7 @@ pub(super) fn activation_error(
308308
candidates.dedup_by(|a, b| a.name() == b.name());
309309
let mut candidates: Vec<_> = candidates
310310
.iter()
311-
.map(|n| (lev_distance(&*new_dep.package_name(), &*n.name()), n))
312-
.filter(|&(d, _)| d < 4)
311+
.filter_map(|n| Some((lev_distance(&*new_dep.package_name(), &*n.name(), 3)?, n)))
313312
.collect();
314313
candidates.sort_by_key(|o| o.0);
315314
let mut msg: String;

src/cargo/core/workspace.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,8 +1245,9 @@ impl<'cfg> Workspace<'cfg> {
12451245
optional_dependency_names_per_member.insert(member, optional_dependency_names_raw);
12461246
}
12471247

1248-
let levenshtein_test =
1249-
|a: InternedString, b: InternedString| lev_distance(a.as_str(), b.as_str()) < 4;
1248+
let levenshtein_test = |a: InternedString, b: InternedString| {
1249+
lev_distance(a.as_str(), b.as_str(), 3).is_some()
1250+
};
12501251

12511252
let suggestions: Vec<_> = cli_features
12521253
.features

src/cargo/util/lev_distance.rs

Lines changed: 84 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,96 @@
1-
use std::cmp;
1+
use std::{cmp, mem};
22

3-
pub fn lev_distance(me: &str, t: &str) -> usize {
3+
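/// Finds the case-insensitive [edit distance] between two strings, counting an adjacent transposition as a single edit (restricted Damerau-Levenshtein).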
4+
///
5+
/// Returns `None` if the distance exceeds the limit.
6+
///
7+
/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
8+
pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
49
// Comparing the lowercased strings makes a difference in capitalization count as a smaller distance
510
// than a completely different letter. Otherwise `CHECK` is as far away from `check` as it
611
// is from `build` (both with a distance of 5). For a single letter shortcut (e.g. `b` or `c`), they will
712
// all be as far away from any capital single letter entry (all with a distance of 1).
813
// By first lowercasing the strings, `C` and `c` are closer than `C` and `b`, for example.
9-
let me = me.to_lowercase();
10-
let t = t.to_lowercase();
14+
let a = a.to_lowercase();
15+
let b = b.to_lowercase();
1116

12-
let t_len = t.chars().count();
13-
if me.is_empty() {
14-
return t_len;
17+
let mut a = &a.chars().collect::<Vec<_>>()[..];
18+
let mut b = &b.chars().collect::<Vec<_>>()[..];
19+
20+
// Ensure that `b` is the shorter string, minimizing memory use.
21+
if a.len() < b.len() {
22+
mem::swap(&mut a, &mut b);
1523
}
16-
if t.is_empty() {
17-
return me.chars().count();
24+
25+
let min_dist = a.len() - b.len();
26+
// If we know the limit will be exceeded, we can return early.
27+
if min_dist > limit {
28+
return None;
1829
}
1930

20-
let mut dcol = (0..=t_len).collect::<Vec<_>>();
21-
let mut t_last = 0;
31+
// Strip common prefix.
32+
while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first()) {
33+
if a_char != b_char {
34+
break;
35+
}
36+
a = a_rest;
37+
b = b_rest;
38+
}
39+
// Strip common suffix.
40+
while let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last()) {
41+
if a_char != b_char {
42+
break;
43+
}
44+
a = a_rest;
45+
b = b_rest;
46+
}
2247

23-
for (i, sc) in me.chars().enumerate() {
24-
let mut current = i;
25-
dcol[0] = current + 1;
48+
// If either string is empty, the distance is the length of the other.
49+
// We know that `b` is the shorter string, so we don't need to check `a`.
50+
if b.len() == 0 {
51+
return Some(min_dist);
52+
}
2653

27-
for (j, tc) in t.chars().enumerate() {
28-
let next = dcol[j + 1];
54+
let mut prev_prev = vec![usize::MAX; b.len() + 1];
55+
let mut prev = (0..=b.len()).collect::<Vec<_>>();
56+
let mut current = vec![0; b.len() + 1];
2957

30-
if sc == tc {
31-
dcol[j + 1] = current;
32-
} else {
33-
dcol[j + 1] = cmp::min(current, next);
34-
dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
35-
}
58+
// row by row
59+
for i in 1..=a.len() {
60+
current[0] = i;
61+
let a_idx = i - 1;
62+
63+
// column by column
64+
for j in 1..=b.len() {
65+
let b_idx = j - 1;
66+
67+
// There is no cost to substitute a character with itself.
68+
let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 };
3669

37-
current = next;
38-
t_last = j;
70+
current[j] = cmp::min(
71+
// deletion
72+
prev[j] + 1,
73+
cmp::min(
74+
// insertion
75+
current[j - 1] + 1,
76+
// substitution
77+
prev[j - 1] + substitution_cost,
78+
),
79+
);
80+
81+
if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) {
82+
// transposition
83+
current[j] = cmp::min(current[j], prev_prev[j - 2] + 1);
84+
}
3985
}
86+
87+
// Rotate the buffers, reusing the memory.
88+
[prev_prev, prev, current] = [prev, current, prev_prev];
4089
}
4190

42-
dcol[t_last + 1]
91+
// `prev` because we already rotated the buffers.
92+
let distance = prev[b.len()];
93+
(distance <= limit).then_some(distance)
4394
}
4495

4596
/// Find the closest element from `iter` matching `choice`. The `key` callback
@@ -51,8 +102,7 @@ pub fn closest<'a, T>(
51102
) -> Option<T> {
52103
// Only consider candidates with a lev_distance of 3 or less so we don't
53104
// suggest out-of-the-blue options.
54-
iter.map(|e| (lev_distance(choice, key(&e)), e))
55-
.filter(|&(d, _)| d < 4)
105+
iter.filter_map(|e| Some((lev_distance(choice, key(&e), 3)?, e)))
56106
.min_by_key(|t| t.0)
57107
.map(|t| t.1)
58108
}
@@ -78,16 +128,16 @@ fn test_lev_distance() {
78128
.filter_map(from_u32)
79129
.map(|i| i.to_string())
80130
{
81-
assert_eq!(lev_distance(&c, &c), 0);
131+
assert_eq!(lev_distance(&c, &c, usize::MAX), Some(0));
82132
}
83133

84134
let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
85135
let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
86136
let c = "Mary häd ä little lämb\n\nLittle lämb\n";
87-
assert_eq!(lev_distance(a, b), 1);
88-
assert_eq!(lev_distance(b, a), 1);
89-
assert_eq!(lev_distance(a, c), 2);
90-
assert_eq!(lev_distance(c, a), 2);
91-
assert_eq!(lev_distance(b, c), 1);
92-
assert_eq!(lev_distance(c, b), 1);
137+
assert_eq!(lev_distance(a, b, usize::MAX), Some(1));
138+
assert_eq!(lev_distance(b, a, usize::MAX), Some(1));
139+
assert_eq!(lev_distance(a, c, usize::MAX), Some(2));
140+
assert_eq!(lev_distance(c, a, usize::MAX), Some(2));
141+
assert_eq!(lev_distance(b, c, usize::MAX), Some(1));
142+
assert_eq!(lev_distance(c, b, usize::MAX), Some(1));
93143
}

0 commit comments

Comments
 (0)