@@ -125,43 +125,35 @@ class TClusters: public IClusters {
125
125
using TSum = TMetric::TSum;
126
126
using TEmbedding = TVector<TSum>;
127
127
128
- ui32 InitK = 0 ;
129
- ui32 K = 0 ;
130
128
// Number of coordinates per vector; fixed at construction.
const ui32 Dimensions = 0 ;
129
// Round limit passed to the constructor; Debug() prints round progress only when it is non-zero.
+ const ui32 MaxRounds = 0 ;
130
// Type byte passed to the constructor.
// NOTE(review): no read of TypeByte is visible in this chunk - confirm where it is used.
+ const ui8 TypeByte = 0 ;
131
131
132
132
// One string per cluster.
TVector<TString> Clusters;
133
133
// Size of each cluster, indexed the same way as Clusters.
TVector<ui64> ClusterSizes;
134
-
135
- struct TAggregatedCluster {
136
- TEmbedding Cluster;
137
- ui64 Size = 0 ;
138
- };
139
- TVector<TAggregatedCluster> AggregatedClusters;
134
// Per-cluster coordinate sums and accumulated weights for the next round;
// both are filled by AggregateToCluster().
+ TVector<TEmbedding> NextClusters;
135
+ TVector<ui64> NextClusterSizes;
140
136
141
137
// Current round number: set by SetRound(), incremented by NextRound(), reset to 0 by Clear().
ui32 Round = 0 ;
142
- ui32 MaxRounds = 0 ;
143
138
144
139
public:
145
// Constructor: copies the dimension count, the round limit and the type byte
// into the const members Dimensions, MaxRounds and TypeByte. The body is empty.
// (The `-` line is the previous single-argument signature shown by the diff.)
- TClusters (ui32 dimensions)
140
+ TClusters (ui32 dimensions, ui32 maxRounds, ui8 typeByte )
146
141
: Dimensions(dimensions)
142
+ , MaxRounds(maxRounds)
143
+ , TypeByte(typeByte)
147
144
{
148
145
}
149
146
150
- void Init (ui32 k, ui32 maxRounds) override {
151
- InitK = k;
152
- K = k;
153
- MaxRounds = maxRounds;
154
- }
155
-
156
- ui32 GetK () const override {
157
- return K;
147
// Overwrites the current round counter with `round`; no other state is touched.
+ void SetRound (ui32 round) override {
148
+ Round = round;
158
149
}
159
150
160
151
// Builds a short human-readable status string.
// In the `+` version the cluster count is taken from Clusters.size(), and the
// "Round: current / max" part is appended only when MaxRounds is non-zero.
// The `-` lines are the previous implementation, which read the removed member K.
TString Debug () const override {
161
- if (!MaxRounds) {
162
- return TStringBuilder () << " K: " << K;
152
+ auto sb = TStringBuilder () << " K: " << Clusters.size ();
153
+ if (MaxRounds) {
154
+ sb << " Round: " << Round << " / " << MaxRounds;
163
155
}
164
- return TStringBuilder () << " K: " << K << " Round: " << Round << " / " << MaxRounds ;
156
+ return sb ;
165
157
}
166
158
167
159
const TVector<TString>& GetClusters () const override {
@@ -172,11 +164,19 @@ class TClusters: public IClusters {
172
164
return ClusterSizes;
173
165
}
174
166
167
// Read-only access to the per-cluster weights accumulated for the next round.
// Returns a reference to the member, so it stays valid only while this object
// lives and reflects later modifications of NextClusterSizes.
+ const TVector<ui64>& GetNextClusterSizes () const override {
168
+ return NextClusterSizes;
169
+ }
170
+
171
// Sets the recorded size of cluster `num` to `size`.
// Uses at(), so an out-of-range `num` is reported by the container's bounds check
// instead of writing out of bounds.
// NOTE(review): `virtual` is redundant next to `override`; the sibling overrides omit it.
+ virtual void SetClusterSize (ui32 num, ui64 size) override {
172
+ ClusterSizes.at (num) = size;
173
+ }
174
+
175
175
// Resets the object to an empty state: drops the clusters, their sizes and the
// next-round accumulators, and sets Round back to 0.
// Dimensions, MaxRounds and TypeByte are const and therefore keep their values.
// The `-` lines reset members (K, AggregatedClusters) that this diff removes.
void Clear () override {
176
- K = InitK;
177
176
Clusters.clear ();
178
177
ClusterSizes.clear ();
179
- AggregatedClusters.clear ();
178
+ NextClusterSizes.clear ();
179
+ NextClusters.clear ();
180
180
Round = 0 ;
181
181
}
182
182
@@ -189,40 +189,37 @@ class TClusters: public IClusters {
189
189
return false ;
190
190
}
191
191
}
192
- Clusters = newClusters;
193
- K = newClusters.size ();
194
- return true ;
195
- }
196
-
197
- void InitAggregatedClusters () override {
198
- AggregatedClusters.resize (K);
199
- ClusterSizes.resize (K, 0 );
200
- for (auto & aggregate : AggregatedClusters) {
201
- aggregate.Cluster .resize (Dimensions, 0 );
192
+ Clusters = std::move (newClusters);
193
+ ClusterSizes.clear ();
194
+ ClusterSizes.resize (Clusters.size ());
195
+ NextClusterSizes.clear ();
196
+ NextClusterSizes.resize (Clusters.size ());
197
+ NextClusters.clear ();
198
+ NextClusters.resize (Clusters.size ());
199
+ for (auto & aggregate : NextClusters) {
200
+ aggregate.resize (Dimensions, 0 );
202
201
}
203
- Round = 1 ;
202
+ return true ;
204
203
}
205
204
206
205
bool RecomputeClusters () override {
207
- Y_ENSURE (K >= 1 );
208
206
ui64 vectorCount = 0 ;
209
207
ui64 reassignedCount = 0 ;
210
- for (size_t i = 0 ; auto & aggregate : AggregatedClusters) {
211
- vectorCount += aggregate.Size ;
208
+ for (size_t i = 0 ; auto & aggregate : NextClusters) {
209
+ auto newSize = NextClusterSizes[i];
210
+ vectorCount += newSize;
212
211
213
- auto & clusterSize = ClusterSizes[i];
214
- reassignedCount += clusterSize < aggregate.Size ? aggregate.Size - clusterSize : 0 ;
215
- clusterSize = aggregate.Size ;
212
+ auto clusterSize = ClusterSizes[i];
213
+ reassignedCount += clusterSize < newSize ? newSize - clusterSize : 0 ;
216
214
217
- if (aggregate.Size != 0 ) {
218
- this ->Fill (Clusters[i], aggregate.Cluster .data (), aggregate.Size );
219
- Y_ENSURE (aggregate.Size == 0 );
215
+ if (newSize != 0 ) {
216
+ this ->Fill (Clusters[i], aggregate.data (), newSize);
220
217
}
221
218
++i;
222
219
}
223
- Y_ENSURE (vectorCount >= K);
220
+
224
221
Y_ENSURE (reassignedCount <= vectorCount);
225
- if (K == 1 ) {
222
+ if (Clusters. size () == 1 ) {
226
223
return true ;
227
224
}
228
225
@@ -232,7 +229,6 @@ class TClusters: public IClusters {
232
229
last = changes < MinVectorsNeedsReassigned;
233
230
}
234
231
if (!last) {
235
- ++Round;
236
232
return false ;
237
233
}
238
234
return true ;
@@ -251,6 +247,25 @@ class TClusters: public IClusters {
251
247
Clusters.erase (Clusters.begin () + w, Clusters.end ());
252
248
}
253
249
250
// Finishes the current round and prepares the next one.
// Steps: recompute the clusters from the accumulated data, make the accumulated
// sizes the current sizes, then call RemoveEmptyClusters().
// Returns true when RecomputeClusters() reported the last round; in that case the
// coordinate accumulators are released. Otherwise increments Round, re-creates
// zero-filled accumulators (one per remaining cluster, Dimensions entries each)
// and returns false.
+ bool NextRound () override {
251
+ bool isLast = RecomputeClusters ();
252
// The move happens only after RecomputeClusters() has read both size vectors.
+ ClusterSizes = std::move (NextClusterSizes);
253
// NOTE(review): presumably compacts Clusters (and ClusterSizes) - only its tail is visible here; confirm.
+ RemoveEmptyClusters ();
254
+ if (isLast) {
// NOTE(review): NextClusterSizes is left in its moved-from state on this path;
// only NextClusters is explicitly cleared.
255
+ NextClusters.clear ();
256
+ return true ;
257
+ }
258
+ ++Round;
// clear() followed by resize() guarantees every element is value-initialized to zero.
259
+ NextClusterSizes.clear ();
260
+ NextClusterSizes.resize (Clusters.size ());
261
+ NextClusters.clear ();
262
+ NextClusters.resize (Clusters.size ());
263
+ for (auto & aggregate : NextClusters) {
264
+ aggregate.resize (Dimensions, 0 );
265
+ }
266
+ return false ;
267
+ }
268
+
254
269
std::optional<ui32> FindCluster (TArrayRef<const TCell> row, ui32 embeddingPos) override {
255
270
Y_ENSURE (embeddingPos < row.size ());
256
271
const auto embedding = row.at (embeddingPos).AsRef ();
@@ -271,16 +286,17 @@ class TClusters: public IClusters {
271
286
return closest;
272
287
}
273
288
274
// Adds one embedding to the next-round accumulator of cluster `pos`.
// In the `+` version every coordinate is converted to TSum and multiplied by
// `weight` before being added, and the pending size of the cluster grows by
// `weight`. `pos` is bounds-checked through at(), and the embedding length is
// verified with Y_ENSURE(IsExpectedSize(...)) before any coordinate is read.
// The `-` lines are the previous unweighted version working on AggregatedClusters.
// NOTE(review): GetCoords() is defined outside this view; presumably it yields
// Dimensions coordinates, which is what keeps `coords` inside the accumulator - confirm.
// NOTE(review): `(TSum)coord` is a C-style cast, and the size check runs after the
// at(pos) lookup.
- void AggregateToCluster (ui32 pos, const char * embedding) override {
275
- auto & aggregate = AggregatedClusters[pos];
276
- auto * coords = aggregate.Cluster .data ();
277
- for (auto coord : this ->GetCoords (embedding)) {
278
- *coords++ += coord;
289
+ void AggregateToCluster (ui32 pos, const TArrayRef<const char >& embedding, ui64 weight) override {
290
+ auto & aggregate = NextClusters.at (pos);
291
+ auto * coords = aggregate.data ();
292
+ Y_ENSURE (IsExpectedSize (embedding));
293
+ for (auto coord : this ->GetCoords (embedding.data ())) {
294
+ *coords++ += (TSum)coord * weight;
279
295
}
280
- ++aggregate. Size ;
296
+ NextClusterSizes. at (pos) += weight ;
281
297
}
282
298
283
// Returns true when `data` is exactly 1 byte plus Dimensions coordinates of
// sizeof(TCoord) bytes each.
// NOTE(review): the extra byte is presumably the trailing/leading vector type
// marker - confirm against the serialization format.
// The diff only changes the parameter from by-value to const reference.
- bool IsExpectedSize (TArrayRef<const char > data) override {
299
+ bool IsExpectedSize (const TArrayRef<const char >& data) override {
284
300
return data.size () == 1 + sizeof (TCoord) * Dimensions;
285
301
}
286
302
@@ -295,36 +311,37 @@ class TClusters: public IClusters {
295
311
296
312
// Writes into `d` the element-wise quotient of the sums in `embedding` by `c`.
// Requires c > 0 (enforced with Y_ENSURE). The number of coordinates written is
// determined by the range GetData() returns for the buffer of `d`.
// With the `+` lines neither `c` nor the sums are reset; the `-` lines zeroed
// both as a side effect.
// NOTE(review): `c` is still taken by non-const reference although the `+`
// version no longer modifies it.
void Fill (TString& d, TSum* embedding, ui64& c) {
297
313
Y_ENSURE (c > 0 );
298
- const auto count = static_cast <TSum>(std::exchange (c, 0 ) );
314
+ const auto count = static_cast <TSum>(c );
299
315
auto data = GetData (d.MutRef ().data ());
300
316
for (auto & coord : data) {
301
317
coord = *embedding / count;
302
- * embedding++ = 0 ;
318
+ embedding++;
303
319
}
304
320
}
305
321
};
306
322
307
- std::unique_ptr<IClusters> CreateClusters (const Ydb::Table::VectorIndexSettings& settings, TString& error) {
323
+ std::unique_ptr<IClusters> CreateClusters (const Ydb::Table::VectorIndexSettings& settings, ui32 maxRounds, TString& error) {
308
324
if (settings.vector_dimension () < 1 ) {
309
325
error = " Dimension of vector should be at least one" ;
310
326
return nullptr ;
311
327
}
312
328
329
+ const ui8 typeVal = (ui8)settings.vector_type ();
313
330
const ui32 dim = settings.vector_dimension ();
314
331
315
332
auto handleMetric = [&]<typename T>() -> std::unique_ptr<IClusters> {
316
333
switch (settings.metric ()) {
317
334
case Ydb::Table::VectorIndexSettings::SIMILARITY_INNER_PRODUCT:
318
- return std::make_unique<TClusters<TMaxInnerProductSimilarity<T>>>(dim);
335
+ return std::make_unique<TClusters<TMaxInnerProductSimilarity<T>>>(dim, maxRounds, typeVal );
319
336
case Ydb::Table::VectorIndexSettings::SIMILARITY_COSINE:
320
337
case Ydb::Table::VectorIndexSettings::DISTANCE_COSINE:
321
338
// We don't need a separate implementation for distance,
322
339
// because the clusters will be the same as for similarity
323
- return std::make_unique<TClusters<TCosineSimilarity<T>>>(dim);
340
+ return std::make_unique<TClusters<TCosineSimilarity<T>>>(dim, maxRounds, typeVal );
324
341
case Ydb::Table::VectorIndexSettings::DISTANCE_MANHATTAN:
325
- return std::make_unique<TClusters<TL1Distance<T>>>(dim);
342
+ return std::make_unique<TClusters<TL1Distance<T>>>(dim, maxRounds, typeVal );
326
343
case Ydb::Table::VectorIndexSettings::DISTANCE_EUCLIDEAN:
327
- return std::make_unique<TClusters<TL2Distance<T>>>(dim);
344
+ return std::make_unique<TClusters<TL2Distance<T>>>(dim, maxRounds, typeVal );
328
345
default :
329
346
error = " Wrong similarity" ;
330
347
break ;
0 commit comments