Skip to content

Commit b81087b

Browse files
vityaman authored and robot-piglet committed
YQL-19747 Normalize names for ranking and filtering
I was too lazy to search for the most frequently used name among names that are equivalent under the relation `(a ~ b) iff (NormalizeName(a) = NormalizeName(b))`, because it seems that the names we receive from the JSONs are canonized and are therefore in the style preferred by the YQL language designers. But because of duplicates in `statements_opensource.json` we have, for example, both `IGNORETYPEV3` and `IGNORE_TYPE_V3` in the candidates list. I think that we should just remove `IGNORETYPEV3` from the JSON. --- - Related to #9056 - Related to vityaman#21 --- Pull Request resolved: ytsaurus/ytsaurus#1229 commit_hash:fe73374ae27df1fcacb0adccda930ec98ed1d7a6
1 parent 60665d3 commit b81087b

File tree

10 files changed

+183
-22
lines changed

10 files changed

+183
-22
lines changed

yql/essentials/sql/v1/complete/name/static/frequency.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "frequency.h"
22

3+
#include "name_index.h"
4+
35
#include <library/cpp/json/json_reader.h>
46
#include <library/cpp/resource/resource.h>
57

@@ -54,7 +56,7 @@ namespace NSQLComplete {
5456
}
5557
};
5658

57-
TFrequencyData Convert(TVector<TFrequencyItem> items) {
59+
TFrequencyData Convert(TVector<TFrequencyItem> items, auto normalize) {
5860
TFrequencyData data;
5961
for (auto& item : items) {
6062
if (item.Parent == Json.Parent.Pragma ||
@@ -65,7 +67,7 @@ namespace NSQLComplete {
6567
item.Parent == Json.Parent.Module ||
6668
item.Parent == Json.Parent.ReadHint ||
6769
item.Parent == Json.Parent.InsertHint) {
68-
item.Rule = ToLowerUTF8(item.Rule);
70+
item.Rule = normalize(item.Rule);
6971
}
7072

7173
if (item.Parent == Json.Parent.Pragma) {
@@ -89,14 +91,24 @@ namespace NSQLComplete {
8991
return data;
9092
}
9193

94+
// Parses the JSON frequency list contained in `text` and converts it into
// TFrequencyData. `normalize` is forwarded unchanged to Convert, which
// applies it to the rule names of the item kinds it rewrites.
TFrequencyData ParseJsonFrequencyData(const TStringBuf text, auto normalize) {
    auto items = TFrequencyItem::ParseListFromJsonText(text);
    return Convert(std::move(items), normalize);
}
97+
9298
TFrequencyData ParseJsonFrequencyData(const TStringBuf text) {
93-
return Convert(TFrequencyItem::ParseListFromJsonText(text));
99+
return ParseJsonFrequencyData(text, NormalizeName);
94100
}
95101

96102
TFrequencyData LoadFrequencyData() {
97103
TString text;
98104
Y_ENSURE(NResource::FindExact("rules_corr_basic.json", &text));
99-
return ParseJsonFrequencyData(text);
105+
return ParseJsonFrequencyData(text, NormalizeName);
106+
}
107+
108+
// Loads the bundled "rules_corr_basic.json" resource and parses it with
// UnchangedName as the normalizer, so rule names keep their original
// spelling. Fails via Y_ENSURE when the resource cannot be found.
TFrequencyData LoadFrequencyDataForPrunning() {
    TString json;
    Y_ENSURE(NResource::FindExact("rules_corr_basic.json", &json));
    return ParseJsonFrequencyData(json, UnchangedName);
}
101113

102114
} // namespace NSQLComplete

yql/essentials/sql/v1/complete/name/static/frequency.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,6 @@ namespace NSQLComplete {
1717

1818
TFrequencyData LoadFrequencyData();
1919

20+
TFrequencyData LoadFrequencyDataForPrunning();
21+
2022
} // namespace NSQLComplete

yql/essentials/sql/v1/complete/name/static/frequency_ut.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Y_UNIT_TEST_SUITE(FrequencyTests) {
3131
},
3232
.Hints = {
3333
{"columns", 826110},
34-
{"column_groups", 225},
34+
{"columngroups", 225},
3535
},
3636
};
3737

yql/essentials/sql/v1/complete/name/static/json_name_set.cpp

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#include "name_service.h"
22

3+
#include "frequency.h"
4+
#include "name_index.h"
5+
36
#include <library/cpp/json/json_reader.h>
47
#include <library/cpp/resource/resource.h>
58

@@ -78,15 +81,52 @@ namespace NSQLComplete {
7881
return hints;
7982
}
8083

84+
// Collapses names that share the same NormalizeName() result into a single
// representative per group.
//
// Within a group the spelling with the highest entry in `frequency` is kept;
// names missing from `frequency` count as 0. The relative order of
// equal-frequency names is whatever Sort leaves it as. The order of the
// returned vector follows the iteration order of the intermediate hash map.
TVector<TString> Pruned(TVector<TString> names, const THashMap<TString, size_t>& frequency) {
    using TWeightedName = std::tuple<TString, size_t>;

    THashMap<TString, TVector<TWeightedName>> groups;
    for (auto& entry : BuildNameIndex(std::move(names), NormalizeName)) {
        // Look the weight up before the original spelling is moved away.
        const size_t* weight = frequency.FindPtr(entry.Original);
        groups[entry.Normalized].emplace_back(
            std::move(entry.Original),
            weight != nullptr ? *weight : size_t(0));
    }

    TVector<TString> pruned;
    pruned.reserve(groups.size());
    for (auto& [_, group] : groups) {
        Y_ASSERT(!group.empty());
        // Ascending by frequency, so the most frequent spelling ends up last.
        Sort(group, [](const TWeightedName& lhs, const TWeightedName& rhs) {
            return std::get<1>(lhs) < std::get<1>(rhs);
        });
        pruned.emplace_back(std::move(std::get<0>(group.back())));
    }
    return pruned;
}
109+
110+
// Prunes every category of `names` so that only one spelling per
// normalized name remains, using the frequency data returned by
// LoadFrequencyDataForPrunning() to choose between spellings.
//
// Each container is moved into the per-vector Pruned() overload and
// replaced by its result, so no name list is copied along the way.
NameSet Pruned(NameSet names) {
    const auto frequency = LoadFrequencyDataForPrunning();

    names.Pragmas = Pruned(std::move(names.Pragmas), frequency.Pragmas);
    names.Types = Pruned(std::move(names.Types), frequency.Types);
    names.Functions = Pruned(std::move(names.Functions), frequency.Functions);

    // The statement kind is not needed here; all hint lists share one
    // frequency table.
    for (auto& [_, hints] : names.Hints) {
        hints = Pruned(std::move(hints), frequency.Hints);
    }
    return names;
}
120+
81121
NameSet MakeDefaultNameSet() {
82-
return {
122+
return Pruned({
83123
.Pragmas = ParsePragmas(LoadJsonResource("pragmas_opensource.json")),
84124
.Types = ParseTypes(LoadJsonResource("types.json")),
85125
.Functions = Merge(
86126
ParseFunctions(LoadJsonResource("sql_functions.json")),
87127
ParseUdfs(LoadJsonResource("udfs_basic.json"))),
88128
.Hints = ParseHints(LoadJsonResource("statements_opensource.json")),
89-
};
129+
});
90130
}
91131

92132
} // namespace NSQLComplete
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#include "name_index.h"
2+
3+
#include <yql/essentials/core/sql_types/normalize_name.h>
4+
5+
#include <util/charset/utf8.h>
6+
7+
namespace NSQLComplete {
8+
9+
TString NormalizeName(const TString& name) {
10+
return NYql::NormalizeName(name);
11+
}
12+
13+
TString LowerizeName(const TString& name) {
14+
return ToLowerUTF8(name);
15+
}
16+
17+
TString UnchangedName(const TString& name) {
18+
return name;
19+
}
20+
21+
} // namespace NSQLComplete
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#pragma once
2+
3+
#include <yql/essentials/sql/v1/complete/text/case.h>
4+
5+
#include <util/generic/string.h>
6+
#include <util/generic/vector.h>
7+
#include <util/generic/algorithm.h>
8+
9+
namespace NSQLComplete {
10+
11+
struct TNameIndexEntry {
12+
TString Normalized;
13+
TString Original;
14+
};
15+
16+
using TNameIndex = TVector<TNameIndexEntry>;
17+
18+
inline bool NameIndexCompare(const TNameIndexEntry& lhs, const TNameIndexEntry& rhs) {
19+
return NoCaseCompare(lhs.Normalized, rhs.Normalized);
20+
}
21+
22+
inline auto NameIndexCompareLimit(size_t limit) {
23+
return [cmp = NoCaseCompareLimit(limit)](const TNameIndexEntry& lhs, const TNameIndexEntry& rhs) {
24+
return cmp(lhs.Normalized, rhs.Normalized);
25+
};
26+
}
27+
28+
// Builds a name index from `originals`.
//
// Every name is paired with `normalize(name)`, and the resulting entries are
// sorted with NameIndexCompare (i.e. by normalized key).
//
// `originals` is taken by value so each string can be moved into its entry.
// `normalize` is any callable that accepts a TString and returns something
// convertible to TString.
TNameIndex BuildNameIndex(TVector<TString> originals, auto normalize) {
    TNameIndex index;
    // The final size is known up front; avoid repeated reallocation.
    index.reserve(originals.size());

    for (auto& original : originals) {
        // Compute the key first: `original` is moved from right after.
        TString normalized = normalize(original);
        index.emplace_back(TNameIndexEntry{
            .Normalized = std::move(normalized),
            .Original = std::move(original),
        });
    }

    Sort(index, NameIndexCompare);
    return index;
}
41+
42+
TString NormalizeName(const TString& name);
43+
44+
TString LowerizeName(const TString& name);
45+
46+
TString UnchangedName(const TString& name);
47+
48+
} // namespace NSQLComplete

yql/essentials/sql/v1/complete/name/static/name_service.cpp

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,29 @@
11
#include "name_service.h"
22

3+
#include "name_index.h"
34
#include "ranking.h"
45

56
#include <yql/essentials/sql/v1/complete/text/case.h>
67

78
namespace NSQLComplete {
89

10+
// Returns views of the original spellings of all entries in `index` whose
// normalized key matches the normalized `prefix` on its first
// `NormalizeName(prefix).size()` characters, as decided by
// NameIndexCompareLimit.
//
// `index` must already be ordered consistently with that comparator
// (BuildNameIndex sorts with NameIndexCompare). The returned TStringBufs
// point into `index`, hence Y_LIFETIME_BOUND: they must not outlive it.
const TVector<TStringBuf> FilteredByPrefix(const TString& prefix, const TNameIndex& index Y_LIFETIME_BOUND) {
    // Probe entry: only the normalized key takes part in the comparison.
    const TNameIndexEntry probe = {
        .Normalized = NormalizeName(prefix),
        .Original = "",
    };

    auto matches = std::ranges::equal_range(
        std::begin(index), std::end(index),
        probe, NameIndexCompareLimit(probe.Normalized.size()));

    TVector<TStringBuf> filtered;
    // The matched subrange is sized, so allocate once.
    filtered.reserve(matches.size());
    for (const TNameIndexEntry& entry : matches) {
        filtered.emplace_back(TStringBuf(entry.Original));
    }
    return filtered;
}
26+
927
const TVector<TStringBuf> FilteredByPrefix(
1028
const TString& prefix,
1129
const TVector<TString>& sorted Y_LIFETIME_BOUND) {
@@ -55,15 +73,18 @@ namespace NSQLComplete {
5573
class TStaticNameService: public INameService {
5674
public:
5775
explicit TStaticNameService(NameSet names, IRanking::TPtr ranking)
58-
: NameSet_(std::move(names))
76+
: Pragmas_(BuildNameIndex(std::move(names.Pragmas), NormalizeName))
77+
, Types_(BuildNameIndex(std::move(names.Types), NormalizeName))
78+
, Functions_(BuildNameIndex(std::move(names.Functions), NormalizeName))
79+
, Hints_([hints = std::move(names.Hints)] {
80+
THashMap<EStatementKind, TNameIndex> index;
81+
for (auto& [k, hints] : hints) {
82+
index.emplace(k, BuildNameIndex(std::move(hints), NormalizeName));
83+
}
84+
return index;
85+
}())
5986
, Ranking_(std::move(ranking))
6087
{
61-
Sort(NameSet_.Pragmas, NoCaseCompare);
62-
Sort(NameSet_.Types, NoCaseCompare);
63-
Sort(NameSet_.Functions, NoCaseCompare);
64-
for (auto& [_, hints] : NameSet_.Hints) {
65-
Sort(hints, NoCaseCompare);
66-
}
6788
}
6889

6990
TFuture<TNameResponse> Lookup(TNameRequest request) override {
@@ -76,27 +97,27 @@ namespace NSQLComplete {
7697

7798
if (request.Constraints.Pragma) {
7899
auto prefix = Prefixed(request.Prefix, ".", *request.Constraints.Pragma);
79-
auto names = FilteredByPrefix(prefix, NameSet_.Pragmas);
100+
auto names = FilteredByPrefix(prefix, Pragmas_);
80101
AppendAs<TPragmaName>(response.RankedNames, names);
81102
}
82103

83104
if (request.Constraints.Type) {
84105
AppendAs<TTypeName>(
85106
response.RankedNames,
86-
FilteredByPrefix(request.Prefix, NameSet_.Types));
107+
FilteredByPrefix(request.Prefix, Types_));
87108
}
88109

89110
if (request.Constraints.Function) {
90111
auto prefix = Prefixed(request.Prefix, "::", *request.Constraints.Function);
91-
auto names = FilteredByPrefix(prefix, NameSet_.Functions);
112+
auto names = FilteredByPrefix(prefix, Functions_);
92113
AppendAs<TFunctionName>(response.RankedNames, names);
93114
}
94115

95116
if (request.Constraints.Hint) {
96117
const auto stmt = request.Constraints.Hint->Statement;
97118
AppendAs<THintName>(
98119
response.RankedNames,
99-
FilteredByPrefix(request.Prefix, NameSet_.Hints[stmt]));
120+
FilteredByPrefix(request.Prefix, Hints_[stmt]));
100121
}
101122

102123
Ranking_->CropToSortedPrefix(response.RankedNames, request.Limit);
@@ -109,7 +130,10 @@ namespace NSQLComplete {
109130
}
110131

111132
private:
112-
NameSet NameSet_;
133+
TNameIndex Pragmas_;
134+
TNameIndex Types_;
135+
TNameIndex Functions_;
136+
THashMap<EStatementKind, TNameIndex> Hints_;
113137
IRanking::TPtr Ranking_;
114138
};
115139

yql/essentials/sql/v1/complete/name/static/ranking.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
#include <yql/essentials/sql/v1/complete/name/name_service.h>
66

7+
#include <yql/essentials/core/sql_types/normalize_name.h>
8+
79
#include <util/charset/utf8.h>
810

911
namespace NSQLComplete {
@@ -57,7 +59,7 @@ namespace NSQLComplete {
5759
return std::visit([this](const auto& name) -> size_t {
5860
using T = std::decay_t<decltype(name)>;
5961

60-
auto content = ToLowerUTF8(ContentView(name));
62+
auto content = NYql::NormalizeName(ContentView(name));
6163

6264
if constexpr (std::is_same_v<T, TKeyword>) {
6365
if (auto weight = Frequency_.Keywords.FindPtr(content)) {

yql/essentials/sql/v1/complete/name/static/ya.make

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@ LIBRARY()
33
SRCS(
44
frequency.cpp
55
json_name_set.cpp
6+
name_index.cpp
67
name_service.cpp
78
ranking.cpp
89
)
910

1011
PEERDIR(
12+
yql/essentials/core/sql_types
1113
yql/essentials/sql/v1/complete/name
1214
yql/essentials/sql/v1/complete/text
1315
)

yql/essentials/sql/v1/complete/sql_complete_ut.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,6 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {
605605
{
606606
TVector<TCandidate> expected = {
607607
{HintName, "IGNORE_TYPE_V3"},
608-
{HintName, "IGNORETYPEV3"},
609608
};
610609
UNIT_ASSERT_VALUES_EQUAL(Complete(engine, {"REDUCE a WITH ig"}), expected);
611610
}
@@ -642,6 +641,17 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {
642641
UNIT_ASSERT_GE(Complete(engine, {"SELECT "}).size(), 55);
643642
}
644643

644+
// Completing the prefix "ignoret" after WITH must offer the hint
// IGNORE_TYPE_V3 and nothing else: the prefix carries no underscore, so it
// can only match through name normalization, and the default name set must
// yield a single spelling of that hint.
Y_UNIT_TEST(NameNormalization) {
    auto names = MakeDefaultNameSet();
    auto engine = MakeSqlCompletionEngine(
        MakePureLexerSupplier(),
        MakeStaticNameService(std::move(names), MakeDefaultRanking()));

    TVector<TCandidate> expected = {
        {HintName, "IGNORE_TYPE_V3"},
    };
    UNIT_ASSERT_VALUES_EQUAL(Complete(engine, {"REDUCE a WITH ignoret"}), expected);
}
654+
645655
Y_UNIT_TEST(Ranking) {
646656
TFrequencyData frequency = {
647657
.Keywords = {
@@ -715,7 +725,7 @@ Y_UNIT_TEST_SUITE(SqlCompleteTests) {
715725
{HintName, "XLOCK"},
716726
{HintName, "UNORDERED"},
717727
{Keyword, "COLUMNS"},
718-
{HintName, "FORCEINFERSCHEMA"},
728+
{HintName, "FORCE_INFER_SCHEMA"},
719729
};
720730
UNIT_ASSERT_VALUES_EQUAL(CompleteTop(expected.size(), engine, {"SELECT * FROM a WITH "}), expected);
721731
}

0 commit comments

Comments
 (0)