Skip to content

Commit 02fadfa

Browse files
authored
Replace uint32 with uint64 for cluster id in vector index (#14576)
1 parent 78556e0 commit 02fadfa

File tree

18 files changed

+123
-108
lines changed

18 files changed

+123
-108
lines changed

ydb/core/base/table_index.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#pragma once
22

3+
#include <ydb/public/api/protos/ydb_value.pb.h>
4+
#include <ydb/public/lib/scheme_types/scheme_type_id.h>
35
#include <ydb/core/protos/flat_scheme_op.pb.h>
46

57
#include <util/generic/hash_set.h>
@@ -35,5 +37,10 @@ std::span<const std::string_view> GetImplTables(NKikimrSchemeOp::EIndexType inde
3537
bool IsImplTable(std::string_view tableName);
3638
bool IsBuildImplTable(std::string_view tableName);
3739

40+
using TClusterId = ui64;
41+
42+
inline constexpr auto ClusterIdType = Ydb::Type::UINT64;
43+
inline constexpr const char* ClusterIdTypeName = "Uint64";
44+
3845
}
3946
}

ydb/core/kqp/opt/logical/kqp_opt_log_indexes.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,10 +425,10 @@ TExprBase DoRewriteTopSortOverKMeansTree(
425425

426426
// TODO(mbkkt) How to inline construction of these constants to construction of readLevel0?
427427
auto fromValues = ctx.Builder(pos)
428-
.Callable("Uint32").Atom(0, "0", TNodeFlags::Default).Seal()
428+
.Callable(NTableIndex::ClusterIdTypeName).Atom(0, "0", TNodeFlags::Default).Seal()
429429
.Build();
430430
auto toValues = ctx.Builder(pos)
431-
.Callable("Uint32").Atom(0, "1", TNodeFlags::Default).Seal()
431+
.Callable(NTableIndex::ClusterIdTypeName).Atom(0, "1", TNodeFlags::Default).Seal()
432432
.Build();
433433

434434
auto levelLambda = [&] {

ydb/core/protos/tx_datashard.proto

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1548,10 +1548,10 @@ message TEvLocalKMeansRequest {
15481548
optional uint32 NeedsRounds = 14;
15491549

15501550
// range of parent cluster ids: [ParentFrom ... ParentTo]
1551-
optional uint32 ParentFrom = 15;
1552-
optional uint32 ParentTo = 21;
1551+
optional uint64 ParentFrom = 15;
1552+
optional uint64 ParentTo = 21;
15531553
// [Child ... Child + K * (ParentTo - ParentFrom + 1)) ids reserved for these kmeans clusters
1554-
optional uint32 Child = 16;
1554+
optional uint64 Child = 16;
15551555

15561556
optional string LevelName = 17;
15571557
optional string PostingName = 18;
@@ -1599,9 +1599,9 @@ message TEvReshuffleKMeansRequest {
15991599
optional TEvLocalKMeansRequest.EState Upload = 9;
16001600

16011601
// id of parent cluster
1602-
optional uint32 Parent = 10;
1602+
optional uint64 Parent = 10;
16031603
// [Child ... Child + ClustersSize) ids of these kmeans clusters
1604-
optional uint32 Child = 11;
1604+
optional uint64 Child = 11;
16051605
// centroids of clusters
16061606
repeated string Clusters = 12;
16071607

ydb/core/tx/datashard/datashard_ut_local_kmeans.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include <ydb/core/base/table_index.h>
12
#include <ydb/core/testlib/test_client.h>
23
#include <ydb/core/tx/datashard/ut_common/datashard_ut_common.h>
34
#include <ydb/core/tx/schemeshard/schemeshard.h>
@@ -91,7 +92,7 @@ Y_UNIT_TEST_SUITE (TTxDataShardLocalKMeansScan) {
9192
}
9293

9394
static std::tuple<TString, TString> DoLocalKMeans(
94-
Tests::TServer::TPtr server, TActorId sender, ui32 parent, ui64 seed, ui64 k,
95+
Tests::TServer::TPtr server, TActorId sender, NTableIndex::TClusterId parent, ui64 seed, ui64 k,
9596
NKikimrTxDataShard::TEvLocalKMeansRequest::EState upload, VectorIndexSettings::VectorType type,
9697
VectorIndexSettings::Metric metric)
9798
{
@@ -185,8 +186,8 @@ Y_UNIT_TEST_SUITE (TTxDataShardLocalKMeansScan) {
185186
{
186187
options.AllowSystemColumnNames(true);
187188
options.Columns({
188-
{ParentColumn, "Uint32", true, true},
189-
{IdColumn, "Uint32", true, true},
189+
{ParentColumn, NTableIndex::ClusterIdTypeName, true, true},
190+
{IdColumn, NTableIndex::ClusterIdTypeName, true, true},
190191
{CentroidColumn, "String", false, true},
191192
});
192193
CreateShardedTable(server, sender, "/Root", "table-level", options);
@@ -196,7 +197,7 @@ Y_UNIT_TEST_SUITE (TTxDataShardLocalKMeansScan) {
196197
{
197198
options.AllowSystemColumnNames(true);
198199
options.Columns({
199-
{ParentColumn, "Uint32", true, true},
200+
{ParentColumn, NTableIndex::ClusterIdTypeName, true, true},
200201
{"key", "Uint32", true, true},
201202
{"data", "String", false, false},
202203
});
@@ -208,7 +209,7 @@ Y_UNIT_TEST_SUITE (TTxDataShardLocalKMeansScan) {
208209
{
209210
options.AllowSystemColumnNames(true);
210211
options.Columns({
211-
{ParentColumn, "Uint32", true, true},
212+
{ParentColumn, NTableIndex::ClusterIdTypeName, true, true},
212213
{"key", "Uint32", true, true},
213214
{"embedding", "String", false, false},
214215
{"data", "String", false, false},

ydb/core/tx/datashard/datashard_ut_reshuffle_kmeans.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include <ydb/core/base/table_index.h>
12
#include <ydb/core/testlib/test_client.h>
23
#include <ydb/core/tx/datashard/ut_common/datashard_ut_common.h>
34
#include <ydb/core/tx/schemeshard/schemeshard.h>
@@ -84,7 +85,7 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
8485
}
8586
}
8687

87-
static TString DoReshuffleKMeans(Tests::TServer::TPtr server, TActorId sender, ui32 parent,
88+
static TString DoReshuffleKMeans(Tests::TServer::TPtr server, TActorId sender, NTableIndex::TClusterId parent,
8889
const std::vector<TString>& level,
8990
NKikimrTxDataShard::TEvLocalKMeansRequest::EState upload,
9091
VectorIndexSettings::VectorType type, VectorIndexSettings::Metric metric)
@@ -171,7 +172,7 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
171172
{
172173
options.AllowSystemColumnNames(true);
173174
options.Columns({
174-
{ParentColumn, "Uint32", true, true},
175+
{ParentColumn, NTableIndex::ClusterIdTypeName, true, true},
175176
{"key", "Uint32", true, true},
176177
{"data", "String", false, false},
177178
});
@@ -183,7 +184,7 @@ Y_UNIT_TEST_SUITE (TTxDataShardReshuffleKMeansScan) {
183184
{
184185
options.AllowSystemColumnNames(true);
185186
options.Columns({
186-
{ParentColumn, "Uint32", true, true},
187+
{ParentColumn, NTableIndex::ClusterIdTypeName, true, true},
187188
{"key", "Uint32", true, true},
188189
{"embedding", "String", false, false},
189190
{"data", "String", false, false},

ydb/core/tx/datashard/kmeans_helper.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
namespace NKikimr::NDataShard::NKMeans {
77

8-
TTableRange CreateRangeFrom(const TUserTable& table, ui32 parent, TCell& from, TCell& to) {
8+
TTableRange CreateRangeFrom(const TUserTable& table, NTableIndex::TClusterId parent, TCell& from, TCell& to) {
99
if (parent == 0) {
1010
return table.GetTableRange();
1111
}
@@ -28,15 +28,15 @@ NTable::TLead CreateLeadFrom(const TTableRange& range) {
2828
return lead;
2929
}
3030

31-
void AddRowMain2Build(TBufferData& buffer, ui32 parent, TArrayRef<const TCell> key, const NTable::TRowState& row) {
31+
void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, const NTable::TRowState& row) {
3232
std::array<TCell, 1> cells;
3333
cells[0] = TCell::Make(parent);
3434
auto pk = TSerializedCellVec::Serialize(cells);
3535
TSerializedCellVec::UnsafeAppendCells(key, pk);
3636
buffer.AddRow(TSerializedCellVec{key}, TSerializedCellVec{std::move(pk)}, TSerializedCellVec::Serialize(*row));
3737
}
3838

39-
void AddRowMain2Posting(TBufferData& buffer, ui32 parent, TArrayRef<const TCell> key, const NTable::TRowState& row,
39+
void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, const NTable::TRowState& row,
4040
ui32 dataPos)
4141
{
4242
std::array<TCell, 1> cells;
@@ -47,15 +47,15 @@ void AddRowMain2Posting(TBufferData& buffer, ui32 parent, TArrayRef<const TCell>
4747
TSerializedCellVec::Serialize((*row).Slice(dataPos)));
4848
}
4949

50-
void AddRowBuild2Build(TBufferData& buffer, ui32 parent, TArrayRef<const TCell> key, const NTable::TRowState& row) {
50+
void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, const NTable::TRowState& row) {
5151
std::array<TCell, 1> cells;
5252
cells[0] = TCell::Make(parent);
5353
auto pk = TSerializedCellVec::Serialize(cells);
5454
TSerializedCellVec::UnsafeAppendCells(key.Slice(1), pk);
5555
buffer.AddRow(TSerializedCellVec{key}, TSerializedCellVec{std::move(pk)}, TSerializedCellVec::Serialize(*row));
5656
}
5757

58-
void AddRowBuild2Posting(TBufferData& buffer, ui32 parent, TArrayRef<const TCell> key, const NTable::TRowState& row,
58+
void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, const NTable::TRowState& row,
5959
ui32 dataPos)
6060
{
6161
std::array<TCell, 1> cells;
@@ -96,7 +96,7 @@ MakeUploadTypes(const TUserTable& table, NKikimrTxDataShard::TEvLocalKMeansReque
9696
uploadTypes->reserve(1 + 1 + std::min(table.KeyColumnTypes.size() + data.size(), types.size()));
9797

9898
Ydb::Type type;
99-
type.set_type_id(Ydb::Type::UINT32);
99+
type.set_type_id(NTableIndex::ClusterIdType);
100100
uploadTypes->emplace_back(NTableIndex::NTableVectorKmeansTreeIndex::ParentColumn, type);
101101

102102
auto addType = [&](const auto& column) {

ydb/core/tx/datashard/kmeans_helper.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include <ydb/core/base/table_index.h>
34
#include <ydb/core/tx/datashard/buffer_data.h>
45
#include <ydb/core/tx/datashard/datashard_user_table.h>
56
#include <ydb/core/tx/datashard/range_ops.h>
@@ -48,7 +49,7 @@ Y_PURE_FUNCTION TTriWayDotProduct<TRes> CosineImpl(const ui8* lhs, const ui8* rh
4849
return {static_cast<TRes>(ll), static_cast<TRes>(lr), static_cast<TRes>(rr)};
4950
}
5051

51-
TTableRange CreateRangeFrom(const TUserTable& table, ui32 parent, TCell& from, TCell& to);
52+
TTableRange CreateRangeFrom(const TUserTable& table, NTableIndex::TClusterId parent, TCell& from, TCell& to);
5253

5354
NTable::TLead CreateLeadFrom(const TTableRange& range);
5455

@@ -200,14 +201,14 @@ ui32 FeedEmbedding(const TCalculation<TMetric>& calculation, std::span<const TSt
200201
return calculation.FindClosest(clusters, embedding);
201202
}
202203

203-
void AddRowMain2Build(TBufferData& buffer, ui32 parent, TArrayRef<const TCell> key, const NTable::TRowState& row);
204+
void AddRowMain2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, const NTable::TRowState& row);
204205

205-
void AddRowMain2Posting(TBufferData& buffer, ui32 parent, TArrayRef<const TCell> key, const NTable::TRowState& row,
206+
void AddRowMain2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, const NTable::TRowState& row,
206207
ui32 dataPos);
207208

208-
void AddRowBuild2Build(TBufferData& buffer, ui32 parent, TArrayRef<const TCell> key, const NTable::TRowState& row);
209+
void AddRowBuild2Build(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, const NTable::TRowState& row);
209210

210-
void AddRowBuild2Posting(TBufferData& buffer, ui32 parent, TArrayRef<const TCell> key, const NTable::TRowState& row,
211+
void AddRowBuild2Posting(TBufferData& buffer, NTableIndex::TClusterId parent, TArrayRef<const TCell> key, const NTable::TRowState& row,
211212
ui32 dataPos);
212213

213214
TTags MakeUploadTags(const TUserTable& table, const TProtoStringType& embedding,

ydb/core/tx/datashard/local_kmeans.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ class TLocalKMeansScanBase: public TActor<TLocalKMeansScanBase>, public NTable::
8686
protected:
8787
using EState = NKikimrTxDataShard::TEvLocalKMeansRequest;
8888

89-
ui32 Parent = 0;
90-
ui32 Child = 0;
89+
NTableIndex::TClusterId Parent = 0;
90+
NTableIndex::TClusterId Child = 0;
9191

9292
ui32 Round = 0;
9393
ui32 MaxRounds = 0;
@@ -156,7 +156,7 @@ class TLocalKMeansScanBase: public TActor<TLocalKMeansScanBase>, public NTable::
156156
return NKikimrServices::TActivity::LOCAL_KMEANS_SCAN_ACTOR;
157157
}
158158

159-
TLocalKMeansScanBase(ui64 buildId, const TUserTable& table, TLead&& lead, ui32 parent, ui32 child,
159+
TLocalKMeansScanBase(ui64 buildId, const TUserTable& table, TLead&& lead, NTableIndex::TClusterId parent, NTableIndex::TClusterId child,
160160
const NKikimrTxDataShard::TEvLocalKMeansRequest& request,
161161
std::shared_ptr<TResult> result)
162162
: TActor{&TThis::StateWork}
@@ -180,7 +180,7 @@ class TLocalKMeansScanBase: public TActor<TLocalKMeansScanBase>, public NTable::
180180
// upload types
181181
if (Ydb::Type type; State <= EState::KMEANS) {
182182
TargetTypes = std::make_shared<NTxProxy::TUploadTypes>(3);
183-
type.set_type_id(Ydb::Type::UINT32);
183+
type.set_type_id(NTableIndex::ClusterIdType);
184184
(*TargetTypes)[0] = {NTableIndex::NTableVectorKmeansTreeIndex::ParentColumn, type};
185185
(*TargetTypes)[1] = {NTableIndex::NTableVectorKmeansTreeIndex::IdColumn, type};
186186
type.set_type_id(Ydb::Type::STRING);
@@ -382,7 +382,7 @@ class TLocalKMeansScan final: public TLocalKMeansScanBase, private TCalculation<
382382
std::vector<TAggregatedCluster> AggregatedClusters;
383383

384384
public:
385-
TLocalKMeansScan(ui64 buildId, const TUserTable& table, TLead&& lead, ui32 parent, ui32 child, NKikimrTxDataShard::TEvLocalKMeansRequest& request,
385+
TLocalKMeansScan(ui64 buildId, const TUserTable& table, TLead&& lead, NTableIndex::TClusterId parent, NTableIndex::TClusterId child, NKikimrTxDataShard::TEvLocalKMeansRequest& request,
386386
std::shared_ptr<TResult> result)
387387
: TLocalKMeansScanBase{buildId, table, std::move(lead), parent, child, request, std::move(result)}
388388
{

ydb/core/tx/datashard/reshuffle_kmeans.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ class TReshuffleKMeansScanBase: public TActor<TReshuffleKMeansScanBase>, public
2727
protected:
2828
using EState = NKikimrTxDataShard::TEvLocalKMeansRequest;
2929

30-
ui32 Parent = 0;
31-
ui32 Child = 0;
30+
NTableIndex::TClusterId Parent = 0;
31+
NTableIndex::TClusterId Child = 0;
3232

3333
ui32 K = 0;
3434

ydb/core/tx/schemeshard/schemeshard__init.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4523,20 +4523,20 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45234523

45244524
// read kmeans tree state
45254525
{
4526-
auto rowset = db.Table<Schema::KMeansTreeState>().Range().Select();
4526+
auto rowset = db.Table<Schema::KMeansTreeProgress>().Range().Select();
45274527
if (!rowset.IsReady()) {
45284528
return false;
45294529
}
45304530

45314531
while (!rowset.EndOfSet()) {
4532-
TIndexBuildId id = rowset.GetValue<Schema::KMeansTreeState::Id>();
4532+
TIndexBuildId id = rowset.GetValue<Schema::KMeansTreeProgress::Id>();
45334533
const auto* buildInfoPtr = Self->IndexBuilds.FindPtr(id);
45344534
Y_VERIFY_S(buildInfoPtr, "BuildIndex not found: id# " << id);
45354535
auto& buildInfo = *buildInfoPtr->Get();
45364536
buildInfo.KMeans.Set(
4537-
rowset.GetValue<Schema::KMeansTreeState::Level>(),
4538-
rowset.GetValue<Schema::KMeansTreeState::Parent>(),
4539-
rowset.GetValue<Schema::KMeansTreeState::State>()
4537+
rowset.GetValue<Schema::KMeansTreeProgress::Level>(),
4538+
rowset.GetValue<Schema::KMeansTreeProgress::Parent>(),
4539+
rowset.GetValue<Schema::KMeansTreeProgress::State>()
45404540
);
45414541
buildInfo.Sample.Rows.reserve(buildInfo.KMeans.K * 2);
45424542

0 commit comments

Comments
 (0)