Skip to content

Commit 529314d

Browse files
committed
Build index builders doc and naming (#18830)
1 parent b84ad24 commit 529314d

16 files changed

+519
-323
lines changed

ydb/core/base/table_index.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,4 +189,16 @@ bool IsBuildImplTable(std::string_view tableName) {
189189
|| tableName.ends_with(NTableVectorKmeansTreeIndex::BuildSuffix1);
190190
}
191191

192+
static constexpr TClusterId PostingParentFlag = (1ull << 63ull);
193+
194+
// Note: if cluster id is too big, something is wrong with cluster enumeration
// Checks, via Y_ENSURE, that `parent` does not have the PostingParentFlag bit set.
// PostingParentFlag is the highest bit of the 64-bit cluster id (1 << 63), so only
// ids >= 2^63 can trip this check.
195+
void EnsureNoPostingParentFlag(TClusterId parent) {
196+
// The flag bit is reserved as a marker; a plain cluster id must never occupy it.
Y_ENSURE((parent & PostingParentFlag) == 0);
197+
}
198+
199+
// Returns `parent` with the PostingParentFlag bit OR-ed in.
// The input is first passed through EnsureNoPostingParentFlag, so an id that
// already carries the flag fails that check instead of being silently accepted.
TClusterId SetPostingParentFlag(TClusterId parent) {
200+
EnsureNoPostingParentFlag(parent);
201+
return (parent | PostingParentFlag);
202+
}
203+
192204
}

ydb/core/base/table_index.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,12 @@ bool IsImplTable(std::string_view tableName);
3838
bool IsBuildImplTable(std::string_view tableName);
3939

4040
using TClusterId = ui64;
41-
4241
inline constexpr auto ClusterIdType = Ydb::Type::UINT64;
4342
inline constexpr const char* ClusterIdTypeName = "Uint64";
4443

44+
void EnsureNoPostingParentFlag(TClusterId parent);
45+
46+
TClusterId SetPostingParentFlag(TClusterId parent);
47+
4548
}
4649
}

ydb/core/protos/tx_datashard.proto

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1552,14 +1552,12 @@ message TEvLocalKMeansRequest {
15521552
optional uint32 K = 10;
15531553
optional uint32 NeedsRounds = 14;
15541554

1555-
// id of parent cluster
15561555
optional uint64 ParentFrom = 15;
15571556
optional uint64 ParentTo = 21;
1558-
// [Child ... Child + K * (ParentFrom - ParentTo + 1)) ids reserved for this kmeans clusters
15591557
optional uint64 Child = 16;
15601558

15611559
optional string LevelName = 17;
1562-
optional string PostingName = 18;
1560+
optional string OutputName = 18;
15631561

15641562
optional string EmbeddingColumn = 19;
15651563
repeated string DataColumns = 20;
@@ -1608,7 +1606,7 @@ message TEvReshuffleKMeansRequest {
16081606
// centroids of clusters
16091607
repeated string Clusters = 12;
16101608

1611-
optional string PostingName = 13;
1609+
optional string OutputName = 13;
16121610

16131611
optional string EmbeddingColumn = 14;
16141612
repeated string DataColumns = 15;
@@ -1656,7 +1654,7 @@ message TEvPrefixKMeansRequest {
16561654
optional uint64 Child = 11;
16571655

16581656
optional string LevelName = 12;
1659-
optional string PostingName = 13;
1657+
optional string OutputName = 13;
16601658
optional string PrefixName = 14;
16611659

16621660
optional string EmbeddingColumn = 15;

ydb/core/tx/datashard/build_index/common_helper.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <ydb/library/actors/core/log.h>
66

77
namespace NKikimr::NDataShard {
8+
using namespace NTableIndex;
89

910
#define LOG_T(stream) LOG_TRACE_S (*TlsActivationContext, NKikimrServices::BUILD_INDEX, stream)
1011
#define LOG_D(stream) LOG_DEBUG_S (*TlsActivationContext, NKikimrServices::BUILD_INDEX, stream)

ydb/core/tx/datashard/build_index/kmeans_helper.cpp

Lines changed: 46 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
namespace NKikimr::NDataShard::NKMeans {
77

8-
TTableRange CreateRangeFrom(const TUserTable& table, NTableIndex::TClusterId parent, TCell& from, TCell& to) {
8+
TTableRange CreateRangeFrom(const TUserTable& table, TClusterId parent, TCell& from, TCell& to) {
99
if (parent == 0) {
1010
return table.GetTableRange();
1111
}
@@ -28,7 +28,26 @@ NTable::TLead CreateLeadFrom(const TTableRange& range) {
2828
return lead;
2929
}
3030

31-
void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row) {
31+
// Appends one row to `buffer` whose key is the cell pair (parent, child) and whose
// value is a single cell holding `embedding`.
//   parent         - written as the first key cell, unchanged.
//   child          - written as the second key cell; see isPostingLevel.
//   embedding      - stored as the only value cell.
//   isPostingLevel - when true, `child` is stored with the posting flag set;
//                    when false, `child` is only checked to not carry that flag.
void AddRowToLevel(TBufferData& buffer, TClusterId parent, TClusterId child, const TString& embedding, bool isPostingLevel) {
32+
if (isPostingLevel) {
33+
// SetPostingParentFlag also verifies that the flag was not already set.
child = SetPostingParentFlag(child);
34+
} else {
35+
EnsureNoPostingParentFlag(child);
36+
}
37+
38+
// Key: (parent, child), with child possibly flagged above.
std::array<TCell, 2> pk;
39+
pk[0] = TCell::Make(parent);
40+
pk[1] = TCell::Make(child);
41+
42+
// Value: the embedding as a single cell.
std::array<TCell, 1> data;
43+
data[0] = TCell{embedding};
44+
45+
buffer.AddRow(TSerializedCellVec{pk}, TSerializedCellVec::Serialize(data));
46+
}
47+
48+
void AddRowMainToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row) {
49+
EnsureNoPostingParentFlag(parent);
50+
3251
std::array<TCell, 1> cells;
3352
cells[0] = TCell::Make(parent);
3453
auto pk = TSerializedCellVec::Serialize(cells);
@@ -37,9 +56,10 @@ void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArra
3756
TSerializedCellVec{key});
3857
}
3958

40-
void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
41-
ui32 dataPos)
59+
void AddRowMainToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 dataPos)
4260
{
61+
parent = SetPostingParentFlag(parent);
62+
4363
std::array<TCell, 1> cells;
4464
cells[0] = TCell::Make(parent);
4565
auto pk = TSerializedCellVec::Serialize(cells);
@@ -48,9 +68,10 @@ void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TAr
4868
TSerializedCellVec{key});
4969
}
5070

51-
void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
52-
ui32 prefixColumns)
71+
void AddRowBuildToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 prefixColumns)
5372
{
73+
EnsureNoPostingParentFlag(parent);
74+
5475
std::array<TCell, 1> cells;
5576
cells[0] = TCell::Make(parent);
5677
auto pk = TSerializedCellVec::Serialize(cells);
@@ -59,9 +80,10 @@ void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArr
5980
TSerializedCellVec{key});
6081
}
6182

62-
void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
63-
ui32 dataPos, ui32 prefixColumns)
83+
void AddRowBuildToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 dataPos, ui32 prefixColumns)
6484
{
85+
parent = SetPostingParentFlag(parent);
86+
6587
std::array<TCell, 1> cells;
6688
cells[0] = TCell::Make(parent);
6789
auto pk = TSerializedCellVec::Serialize(cells);
@@ -70,45 +92,44 @@ void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TA
7092
TSerializedCellVec{key});
7193
}
7294

73-
TTags MakeUploadTags(const TUserTable& table, const TProtoStringType& embedding,
74-
const google::protobuf::RepeatedPtrField<TProtoStringType>& data, ui32& embeddingPos,
75-
ui32& dataPos, NTable::TTag& embeddingTag)
95+
// Builds the list of column tags for the embedding column and the data columns.
//   table        - source of the column-name -> tag mapping (via GetAllTags).
//   embedding    - name of the embedding column; tags.at() is used, so the name
//                  must be present in the mapping.
//   data         - names of the data columns; each is looked up with tags.at().
//   embeddingPos - out: set to the index of `embedding` inside `data` only when
//                  `embedding` is one of the data columns; otherwise left untouched.
//   dataPos      - out: set to 0 only in that same case; otherwise left untouched.
//   embeddingTag - out: always set to the tag of the embedding column.
// Returns the tags of all data columns in order, preceded by the embedding tag
// when the embedding column is not itself listed in `data`.
TTags MakeScanTags(const TUserTable& table, const TProtoStringType& embedding,
96+
const google::protobuf::RepeatedPtrField<TProtoStringType>& data, ui32& embeddingPos,
97+
ui32& dataPos, NTable::TTag& embeddingTag)
7698
{
7799
auto tags = GetAllTags(table);
78-
TTags uploadTags;
79-
uploadTags.reserve(1 + data.size());
100+
TTags result;
101+
// Room for every data column plus, at most, one extra slot for the embedding.
result.reserve(1 + data.size());
80102
embeddingTag = tags.at(embedding);
81103
if (auto it = std::find(data.begin(), data.end(), embedding); it != data.end()) {
82104
// The embedding is already among the data columns: do not add it a second time.
embeddingPos = it - data.begin();
83105
dataPos = 0;
84106
} else {
85-
uploadTags.push_back(embeddingTag);
107+
// The embedding is not a data column: it goes first, ahead of the data tags.
result.push_back(embeddingTag);
86108
}
87109
for (const auto& column : data) {
88-
uploadTags.push_back(tags.at(column));
110+
result.push_back(tags.at(column));
89111
}
90-
return uploadTags;
112+
return result;
91113
}
92114

93-
std::shared_ptr<NTxProxy::TUploadTypes>
94-
MakeUploadTypes(const TUserTable& table, NKikimrTxDataShard::EKMeansState uploadState,
95-
const TProtoStringType& embedding, const google::protobuf::RepeatedPtrField<TProtoStringType>& data,
96-
ui32 prefixColumns)
115+
std::shared_ptr<NTxProxy::TUploadTypes> MakeOutputTypes(const TUserTable& table, NKikimrTxDataShard::EKMeansState uploadState,
116+
const TProtoStringType& embedding, const google::protobuf::RepeatedPtrField<TProtoStringType>& data,
117+
ui32 prefixColumns)
97118
{
98119
auto types = GetAllTypes(table);
99120

100-
auto uploadTypes = std::make_shared<NTxProxy::TUploadTypes>();
101-
uploadTypes->reserve(1 + 1 + std::min((table.KeyColumnTypes.size() - prefixColumns) + data.size(), types.size()));
121+
auto result = std::make_shared<NTxProxy::TUploadTypes>();
122+
result->reserve(1 + 1 + std::min((table.KeyColumnTypes.size() - prefixColumns) + data.size(), types.size()));
102123

103124
Ydb::Type type;
104125
type.set_type_id(NTableIndex::ClusterIdType);
105-
uploadTypes->emplace_back(NTableIndex::NTableVectorKmeansTreeIndex::ParentColumn, type);
126+
result->emplace_back(NTableIndex::NTableVectorKmeansTreeIndex::ParentColumn, type);
106127

107128
auto addType = [&](const auto& column) {
108129
auto it = types.find(column);
109130
if (it != types.end()) {
110131
NScheme::ProtoFromTypeInfo(it->second, type);
111-
uploadTypes->emplace_back(it->first, type);
132+
result->emplace_back(it->first, type);
112133
types.erase(it);
113134
}
114135
};
@@ -133,7 +154,7 @@ MakeUploadTypes(const TUserTable& table, NKikimrTxDataShard::EKMeansState upload
133154
Y_ASSERT(false);
134155

135156
}
136-
return uploadTypes;
157+
return result;
137158
}
138159

139160
}

ydb/core/tx/datashard/build_index/kmeans_helper.h

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Y_PURE_FUNCTION TTriWayDotProduct<TRes> CosineImpl(const ui8* lhs, const ui8* rh
4444
return {static_cast<TRes>(ll), static_cast<TRes>(lr), static_cast<TRes>(rr)};
4545
}
4646

47-
TTableRange CreateRangeFrom(const TUserTable& table, NTableIndex::TClusterId parent, TCell& from, TCell& to);
47+
TTableRange CreateRangeFrom(const TUserTable& table, TClusterId parent, TCell& from, TCell& to);
4848

4949
NTable::TLead CreateLeadFrom(const TTableRange& range);
5050

@@ -138,25 +138,23 @@ struct TMaxInnerProductSimilarity : TMetric<TCoord> {
138138
}
139139
};
140140

141-
void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row);
141+
void AddRowToLevel(TBufferData& buffer, TClusterId parent, TClusterId child, const TString& embedding, bool isPostingLevel);
142142

143-
void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
144-
ui32 dataPos);
143+
void AddRowMainToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row);
145144

146-
void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
147-
ui32 prefixColumns = 1);
145+
void AddRowMainToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 dataPos);
148146

149-
void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row,
150-
ui32 dataPos, ui32 prefixColumns = 1);
147+
void AddRowBuildToBuild(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 prefixColumns = 1);
151148

152-
TTags MakeUploadTags(const TUserTable& table, const TProtoStringType& embedding,
153-
const google::protobuf::RepeatedPtrField<TProtoStringType>& data, ui32& embeddingPos,
154-
ui32& dataPos, NTable::TTag& embeddingTag);
149+
void AddRowBuildToPosting(TBufferData& buffer, TClusterId parent, TArrayRef<const TCell> key, TArrayRef<const TCell> row, ui32 dataPos, ui32 prefixColumns = 1);
155150

156-
std::shared_ptr<NTxProxy::TUploadTypes>
157-
MakeUploadTypes(const TUserTable& table, NKikimrTxDataShard::EKMeansState uploadState,
158-
const TProtoStringType& embedding, const google::protobuf::RepeatedPtrField<TProtoStringType>& data,
159-
ui32 prefixColumns = 0);
151+
TTags MakeScanTags(const TUserTable& table, const TProtoStringType& embedding,
152+
const google::protobuf::RepeatedPtrField<TProtoStringType>& data, ui32& embeddingPos,
153+
ui32& dataPos, NTable::TTag& embeddingTag);
154+
155+
std::shared_ptr<NTxProxy::TUploadTypes> MakeOutputTypes(const TUserTable& table, NKikimrTxDataShard::EKMeansState uploadState,
156+
const TProtoStringType& embedding, const google::protobuf::RepeatedPtrField<TProtoStringType>& data,
157+
ui32 prefixColumns = 0);
160158

161159
void MakeScan(auto& record, const auto& createScan, const auto& badRequest)
162160
{
@@ -410,16 +408,16 @@ class TClusters {
410408
return true;
411409
}
412410

413-
ui32 FindCluster(TArrayRef<const TCell> row, NTable::TPos embeddingPos)
411+
std::optional<ui32> FindCluster(TArrayRef<const TCell> row, NTable::TPos embeddingPos)
414412
{
415413
Y_ASSERT(embeddingPos < row.size());
416414
const auto embedding = row.at(embeddingPos).AsRef();
417415
if (!IsExpectedSize<TCoord>(embedding, Dimensions)) {
418-
return Max<ui32>();
416+
return {};
419417
}
420418

421419
auto min = TMetric::Init();
422-
ui32 closest = Max<ui32>();
420+
std::optional<ui32> closest = {};
423421
for (size_t i = 0; const auto& cluster : Clusters) {
424422
auto distance = TMetric::Distance(cluster.data(), embedding.data(), Dimensions);
425423
if (distance < min) {

0 commit comments

Comments
 (0)