@@ -33,89 +33,35 @@ inline ui32 LevenshteinDistance(TString word1, TString word2) {
33
33
return dist[size1][size2];
34
34
}
35
35
36
- template <typename Type>
37
36
class FuzzySearcher {
38
- struct WordHit {
39
- bool Contains;
40
- ui32 LengthDifference;
41
- ui32 LevenshteinDistance;
42
- Type Data;
43
-
44
- WordHit (bool contains, ui32 lengthDifference, ui32 levenshteinDistance, Type data)
45
- : Contains(contains)
46
- , LengthDifference(lengthDifference)
47
- , LevenshteinDistance(levenshteinDistance)
48
- , Data(data)
49
- {}
50
-
51
- bool operator <(const WordHit& other) const {
52
- if (this ->Contains && !other.Contains ) {
53
- return true ;
54
- }
55
- if (this ->Contains && other.Contains ) {
56
- return this ->LengthDifference < other.LengthDifference ;
57
- }
58
- return this ->LevenshteinDistance < other.LevenshteinDistance ;
59
- }
60
-
61
- bool operator >(const WordHit& other) const {
62
- if (!this ->Contains && other.Contains ) {
63
- return true ;
64
- }
65
- if (this ->Contains && other.Contains ) {
66
- return this ->LengthDifference > other.LengthDifference ;
67
- }
68
- return this ->LevenshteinDistance > other.LevenshteinDistance ;
69
- }
70
- };
71
-
72
- static WordHit CalculateWordHit (TString searchWord, TString testWord, Type testData) {
73
- searchWord = to_lower (searchWord);
74
- testWord = to_lower (testWord);
75
- if (testWord.Contains (searchWord)) {
76
- return {1 , static_cast <ui32>(testWord.length () - searchWord.length ()), 0 , testData};
37
+ static size_t CalculateWordHit (const TString& searchWord, const TString& testWord) {
38
+ size_t findPos = testWord.find (searchWord);
39
+ if (findPos != TString::npos) {
40
+ return testWord.size () - searchWord.size () + findPos;
77
41
} else {
78
- ui32 levenshteinDistance = LevenshteinDistance (searchWord, testWord);
79
- return {0 , 0 , levenshteinDistance, testData};
42
+ return 1000 * LevenshteinDistance (searchWord, testWord);
80
43
}
81
44
}
82
45
83
46
public:
84
- THashMap<TString, Type> Dictionary;
85
-
86
- FuzzySearcher (const THashMap<TString, Type>& dictionary)
87
- : Dictionary(dictionary) {}
88
-
89
- FuzzySearcher (const TVector<TString>& words) {
90
- for (const auto & word : words) {
91
- Dictionary[word] = word;
47
+ template <typename Type>
48
+ static std::vector<const Type*> Search (const std::vector<Type>& dictionary, const TString& searchWord, ui32 limit = 10 ) {
49
+ TString search = to_lower (searchWord);
50
+ std::vector<std::pair<size_t , size_t >> hits; // {distance, index}
51
+ hits.reserve (dictionary.size ());
52
+ for (size_t index = 0 ; index < dictionary.size (); ++index) {
53
+ hits.emplace_back (CalculateWordHit (search, to_lower (TString (dictionary[index]))), index);
92
54
}
93
- }
94
-
95
- TVector<Type> Search (const TString& searchWord, ui32 limit = 10 ) {
96
- auto cmp = [](const WordHit& left, const WordHit& right) {
97
- return left < right;
98
- };
99
- std::priority_queue<WordHit, TVector<WordHit>, decltype (cmp)> queue (cmp);
100
-
101
- for (const auto & [word, data]: Dictionary) {
102
- auto wordHit = CalculateWordHit (searchWord, word, data);
103
- if (queue.size () < limit) {
104
- queue.emplace (wordHit);
105
- } else if (queue.size () > 0 && wordHit < queue.top ()) {
106
- queue.pop ();
107
- queue.emplace (wordHit);
108
- }
55
+ std::sort (hits.begin (), hits.end ());
56
+ if (hits.size () > limit) {
57
+ hits.resize (limit);
109
58
}
110
-
111
- TVector<Type> results;
112
- while (!queue.empty ()) {
113
- results.emplace_back (queue.top ().Data );
114
- queue.pop ();
59
+ std::vector<const Type*> result;
60
+ result.reserve (hits.size ());
61
+ for (const auto & hit : hits) {
62
+ result.emplace_back (&dictionary[hit.second ]);
115
63
}
116
-
117
- std::reverse (results.begin (), results.end ());
118
- return results;
64
+ return result;
119
65
}
120
66
};
121
67
0 commit comments