Skip to content

Commit 6a79e27

Browse files
committed
Extract TIndexBuildInfo::TKMeans functions from .h to .cpp (#20531)
1 parent f721ad0 commit 6a79e27

File tree

2 files changed

+183
-152
lines changed

2 files changed

+183
-152
lines changed

ydb/core/tx/schemeshard/schemeshard_info_types.cpp

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2214,6 +2214,168 @@ void TIndexBuildInfo::SerializeToProto([[maybe_unused]] TSchemeShard* ss, NKikim
22142214
}
22152215
}
22162216

2217+
// Returns the last parent cluster id of the current level (inclusive bound):
// the id immediately preceding ChildBegin.
ui64 TIndexBuildInfo::TKMeans::ParentEnd() const noexcept {
    const ui64 lastParent = ChildBegin - 1;
    return lastParent;
}
2220+
// Returns the last child cluster id of the current level (inclusive bound):
// ChildBegin plus the number of children, minus one.
ui64 TIndexBuildInfo::TKMeans::ChildEnd() const noexcept {
    const ui64 children = ChildCount();
    return ChildBegin + children - 1;
}
2223+
2224+
// Returns how many parent ids the inclusive range [ParentBegin, ParentEnd()] contains.
ui64 TIndexBuildInfo::TKMeans::ParentCount() const noexcept {
    const ui64 lastParent = ParentEnd();
    return lastParent - ParentBegin + 1;
}
2227+
// Returns the number of child ids of the current level: K per parent.
ui64 TIndexBuildInfo::TKMeans::ChildCount() const noexcept {
    const ui64 parents = ParentCount();
    return parents * K;
}
2230+
2231+
// Renders the k-means progress fields as a single-line, human-readable string:
// state, level out of total levels, K, round, the parent and child id ranges
// (begin..current..end, both ends inclusive) and the table size.
TString TIndexBuildInfo::TKMeans::DebugString() const {
    TStringBuilder out;
    out << "{ ";
    out << "State = " << State;
    out << ", Level = " << Level << " / " << Levels;
    out << ", K = " << K;
    out << ", Round = " << Round;
    out << ", Parent = [" << ParentBegin << ".." << Parent << ".." << ParentEnd() << "]";
    out << ", Child = [" << ChildBegin << ".." << Child << ".." << ChildEnd() << "]";
    out << ", TableSize = " << TableSize;
    out << " }";
    return out;
}
2243+
2244+
bool TIndexBuildInfo::TKMeans::NeedsAnotherLevel() const noexcept {
2245+
return Level < Levels;
2246+
}
2247+
bool TIndexBuildInfo::TKMeans::NeedsAnotherParent() const noexcept {
2248+
return Parent < ParentEnd();
2249+
}
2250+
2251+
bool TIndexBuildInfo::TKMeans::NextParent() noexcept {
2252+
if (!NeedsAnotherParent()) {
2253+
return false;
2254+
}
2255+
++Parent;
2256+
Child += K;
2257+
return true;
2258+
}
2259+
2260+
bool TIndexBuildInfo::TKMeans::NextLevel() noexcept {
2261+
if (!NeedsAnotherLevel()) {
2262+
return false;
2263+
}
2264+
NextLevel(ChildCount());
2265+
return true;
2266+
}
2267+
2268+
// Switches to the next level after the prefix stage, reserving
// 2 * TableSize * shards ids for it, and places Parent on the last parent id.
// Requires that another level is still needed (enforced by Y_ENSURE).
void TIndexBuildInfo::TKMeans::PrefixIndexDone(ui64 shards) {
    Y_ENSURE(NeedsAnotherLevel());
    // There are two worst cases; in both of them a single shard contains TableSize rows:
    // 1. every row has a unique prefix (*), so one id is needed per row (parent, id in the prefix table);
    // 2. every unique prefix has size K, so there are TableSize/K parents plus TableSize children.
    // (*) this does not work at the moment, because a prefix currently must have
    //     at least K embeddings, but that is a bug.
    const ui64 reservedIds = (2 * TableSize) * shards;
    NextLevel(reservedIds);
    Parent = ParentEnd();
}
2277+
2278+
// Overwrites the progress fields in one call.
//   level, round             - stored into Level and Round
//   parentBegin, parent      - start of the parent id range and the current parent
//   childBegin, child        - start of the child id range and the current child
//   state                    - raw integer, converted to EState with static_cast
//   tableSize                - stored into TableSize
// Levels and K are not modified here.
void TIndexBuildInfo::TKMeans::Set(ui32 level,
    NTableIndex::TClusterId parentBegin, NTableIndex::TClusterId parent,
    NTableIndex::TClusterId childBegin, NTableIndex::TClusterId child,
    ui32 state, ui64 tableSize, ui32 round)
{
    // Stage of the build.
    State = static_cast<EState>(state);
    Level = level;
    Round = round;

    // Parent id range position.
    ParentBegin = parentBegin;
    Parent = parent;

    // Child id range position.
    ChildBegin = childBegin;
    Child = child;

    TableSize = tableSize;
}
2291+
2292+
// Picks the upload mode for the current level.
// Level == 1 selects the UPLOAD_MAIN_* variants, any other level the UPLOAD_BUILD_* ones;
// the *_TO_BUILD variant is used while another level is still needed, *_TO_POSTING otherwise.
NKikimrTxDataShard::EKMeansState TIndexBuildInfo::TKMeans::GetUpload() const {
    using EUpload = NKikimrTxDataShard::EKMeansState;

    const bool firstLevel = (Level == 1);
    const bool moreLevels = NeedsAnotherLevel();

    if (firstLevel) {
        return moreLevels ? EUpload::UPLOAD_MAIN_TO_BUILD : EUpload::UPLOAD_MAIN_TO_POSTING;
    }
    return moreLevels ? EUpload::UPLOAD_BUILD_TO_BUILD : EUpload::UPLOAD_BUILD_TO_POSTING;
}
2307+
2308+
// Returns the name of the table the current level writes to.
// The name starts from PostingTable; when needsBuildTable is set or another level
// is still needed, a build suffix is appended: BuildSuffix0 for odd levels,
// BuildSuffix1 for even levels.
TString TIndexBuildInfo::TKMeans::WriteTo(bool needsBuildTable) const {
    using namespace NTableIndex::NTableVectorKmeansTreeIndex;

    TString name = PostingTable;
    const bool useBuildTable = needsBuildTable || NeedsAnotherLevel();
    if (!useBuildTable) {
        return name;
    }

    const bool oddLevel = (Level % 2 != 0);
    if (oddLevel) {
        name += BuildSuffix0;
    } else {
        name += BuildSuffix1;
    }
    return name;
}
2316+
2317+
// Returns the name of the build table the current level reads from.
// Only valid for Level > 1 (enforced by Y_ENSURE). The name is PostingTable plus
// BuildSuffix1 for odd levels and BuildSuffix0 for even levels.
TString TIndexBuildInfo::TKMeans::ReadFrom() const {
    Y_ENSURE(Level > 1);
    using namespace NTableIndex::NTableVectorKmeansTreeIndex;

    const bool oddLevel = (Level % 2 != 0);
    TString name = PostingTable;
    if (oddLevel) {
        name += BuildSuffix1;
    } else {
        name += BuildSuffix0;
    }
    return name;
}
2324+
2325+
std::pair<NTableIndex::TClusterId, NTableIndex::TClusterId> TIndexBuildInfo::TKMeans::RangeToBorders(const TSerializedTableRange& range) const {
2326+
const NTableIndex::TClusterId minParent = ParentBegin;
2327+
const NTableIndex::TClusterId maxParent = ParentEnd();
2328+
const NTableIndex::TClusterId parentFrom = [&, from = range.From.GetCells()] {
2329+
if (!from.empty()) {
2330+
if (!from[0].IsNull()) {
2331+
return from[0].AsValue<NTableIndex::TClusterId>() + static_cast<NTableIndex::TClusterId>(from.size() == 1);
2332+
}
2333+
}
2334+
return minParent;
2335+
}();
2336+
const NTableIndex::TClusterId parentTo = [&, to = range.To.GetCells()] {
2337+
if (!to.empty()) {
2338+
if (!to[0].IsNull()) {
2339+
return to[0].AsValue<NTableIndex::TClusterId>() - static_cast<NTableIndex::TClusterId>(to.size() != 1 && to[1].IsNull());
2340+
}
2341+
}
2342+
return maxParent;
2343+
}();
2344+
Y_ENSURE(minParent <= parentFrom, "minParent(" << minParent << ") > parentFrom(" << parentFrom << ") " << DebugString());
2345+
Y_ENSURE(parentFrom <= parentTo, "parentFrom(" << parentFrom << ") > parentTo(" << parentTo << ") " << DebugString());
2346+
Y_ENSURE(parentTo <= maxParent, "parentTo(" << parentTo << ") > maxParent(" << maxParent << ") " << DebugString());
2347+
return {parentFrom, parentTo};
2348+
}
2349+
2350+
// Formats a key range for logs as "{ From: <border>, To: <border> }".
// Each border is rendered as:
//   "inf"   - no cells;
//   "-inf"  - first cell is null;
//   otherwise "{ count: N ... }" with the cell count and, for Level > 1, the parent id
//   from the first cell plus ", pk: null" when a second cell exists and is null.
TString TIndexBuildInfo::TKMeans::RangeToDebugStr(const TSerializedTableRange& range) const {
    const auto describeBorder = [this](const TSerializedCellVec& border) -> TString {
        const auto cells = border.GetCells();
        if (cells.empty()) {
            return "inf";
        }
        if (cells[0].IsNull()) {
            return "-inf";
        }

        TStringBuilder text;
        text << "{ count: " << cells.size();
        if (Level > 1) {
            text << ", parent: " << cells[0].AsValue<NTableIndex::TClusterId>();
            const bool nullPk = (cells.size() != 1 && cells[1].IsNull());
            if (nullPk) {
                text << ", pk: null";
            }
        }
        text << " }";
        return text;
    };

    const TString fromStr = describeBorder(range.From);
    const TString toStr = describeBorder(range.To);

    TStringBuilder result;
    result << "{ From: " << fromStr << ", To: " << toStr << " }";
    return result;
}
2370+
2371+
// Shifts the id ranges one level down and increments Level.
// The previous child range start becomes the new parent range start, the new child
// range starts childCount ids after it, and Parent/Child are reset to the starts
// of their respective ranges.
void TIndexBuildInfo::TKMeans::NextLevel(ui64 childCount) noexcept {
    const auto levelStart = ChildBegin;

    ParentBegin = levelStart;
    Parent = levelStart;

    ChildBegin = levelStart + childCount;
    Child = ChildBegin;

    Level += 1;
}
2378+
22172379
void TIndexBuildInfo::AddParent(const TSerializedTableRange& range, TShardIdx shard) {
22182380
// For Parent == 0 only single kmeans needed, so there are two options:
22192381
// 1. It fits entirely in the single shard => local kmeans for single shard

ydb/core/tx/schemeshard/schemeshard_info_types.h

Lines changed: 21 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -3062,167 +3062,36 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
30623062

30633063
ui64 TableSize = 0;
30643064

3065-
ui64 ParentEnd() const noexcept { // included
3066-
return ChildBegin - 1;
3067-
}
3068-
ui64 ChildEnd() const noexcept { // included
3069-
return ChildBegin + ChildCount() - 1;
3070-
}
3065+
ui64 ParentEnd() const noexcept;
3066+
ui64 ChildEnd() const noexcept;
30713067

3072-
ui64 ParentCount() const noexcept {
3073-
return ParentEnd() - ParentBegin + 1;
3074-
}
3075-
ui64 ChildCount() const noexcept {
3076-
return ParentCount() * K;
3077-
}
3068+
ui64 ParentCount() const noexcept;
3069+
ui64 ChildCount() const noexcept;
30783070

3079-
TString DebugString() const {
3080-
return TStringBuilder()
3081-
<< "{ "
3082-
<< "State = " << State
3083-
<< ", Level = " << Level << " / " << Levels
3084-
<< ", K = " << K
3085-
<< ", Round = " << Round
3086-
<< ", Parent = [" << ParentBegin << ".." << Parent << ".." << ParentEnd() << "]"
3087-
<< ", Child = [" << ChildBegin << ".." << Child << ".." << ChildEnd() << "]"
3088-
<< ", TableSize = " << TableSize
3089-
<< " }";
3090-
}
3071+
TString DebugString() const;
30913072

3092-
bool NeedsAnotherLevel() const noexcept {
3093-
return Level < Levels;
3094-
}
3095-
bool NeedsAnotherParent() const noexcept {
3096-
return Parent < ParentEnd();
3097-
}
3073+
bool NeedsAnotherLevel() const noexcept;
3074+
bool NeedsAnotherParent() const noexcept;
3075+
bool NextParent() noexcept;
3076+
bool NextLevel() noexcept;
3077+
void PrefixIndexDone(ui64 shards);
30983078

3099-
bool NextParent() noexcept {
3100-
if (!NeedsAnotherParent()) {
3101-
return false;
3102-
}
3103-
++Parent;
3104-
Child += K;
3105-
return true;
3106-
}
3079+
void Set(ui32 level,
3080+
NTableIndex::TClusterId parentBegin, NTableIndex::TClusterId parent,
3081+
NTableIndex::TClusterId childBegin, NTableIndex::TClusterId child,
3082+
ui32 state, ui64 tableSize, ui32 round);
31073083

3108-
bool NextLevel() noexcept {
3109-
if (!NeedsAnotherLevel()) {
3110-
return false;
3111-
}
3112-
NextLevel(ChildCount());
3113-
return true;
3114-
}
3084+
NKikimrTxDataShard::EKMeansState GetUpload() const;
31153085

3116-
void PrefixIndexDone(ui64 shards) {
3117-
Y_ENSURE(NeedsAnotherLevel());
3118-
// There's two worst cases, but in both one shard contains TableSize rows
3119-
// 1. all rows have unique prefix (*), in such case we need 1 id for each row (parent, id in prefix table)
3120-
// 2. all unique prefixes have size K, so we have TableSize/K parents + TableSize childs
3121-
// * it doesn't work now, because now prefix should have at least K embeddings, but it's bug
3122-
NextLevel((2 * TableSize) * shards);
3123-
Parent = ParentEnd();
3124-
}
3125-
3126-
void Set(ui32 level,
3127-
NTableIndex::TClusterId parentBegin, NTableIndex::TClusterId parent,
3128-
NTableIndex::TClusterId childBegin, NTableIndex::TClusterId child,
3129-
ui32 state, ui64 tableSize, ui32 round) {
3130-
Level = level;
3131-
Round = round;
3132-
ParentBegin = parentBegin;
3133-
Parent = parent;
3134-
ChildBegin = childBegin;
3135-
Child = child;
3136-
State = static_cast<EState>(state);
3137-
TableSize = tableSize;
3138-
}
3139-
3140-
NKikimrTxDataShard::EKMeansState GetUpload() const {
3141-
if (Level == 1) {
3142-
if (NeedsAnotherLevel()) {
3143-
return NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_BUILD;
3144-
} else {
3145-
return NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING;
3146-
}
3147-
} else {
3148-
if (NeedsAnotherLevel()) {
3149-
return NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_BUILD;
3150-
} else {
3151-
return NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING;
3152-
}
3153-
}
3154-
}
3086+
TString WriteTo(bool needsBuildTable = false) const;
3087+
TString ReadFrom() const;
31553088

3156-
TString WriteTo(bool needsBuildTable = false) const {
3157-
using namespace NTableIndex::NTableVectorKmeansTreeIndex;
3158-
TString name = PostingTable;
3159-
if (needsBuildTable || NeedsAnotherLevel()) {
3160-
name += Level % 2 != 0 ? BuildSuffix0 : BuildSuffix1;
3161-
}
3162-
return name;
3163-
}
3164-
TString ReadFrom() const {
3165-
Y_ENSURE(Level > 1);
3166-
using namespace NTableIndex::NTableVectorKmeansTreeIndex;
3167-
TString name = PostingTable;
3168-
name += Level % 2 != 0 ? BuildSuffix1 : BuildSuffix0;
3169-
return name;
3170-
}
3171-
3172-
std::pair<NTableIndex::TClusterId, NTableIndex::TClusterId> RangeToBorders(const TSerializedTableRange& range) const {
3173-
const NTableIndex::TClusterId minParent = ParentBegin;
3174-
const NTableIndex::TClusterId maxParent = ParentEnd();
3175-
const NTableIndex::TClusterId parentFrom = [&, from = range.From.GetCells()] {
3176-
if (!from.empty()) {
3177-
if (!from[0].IsNull()) {
3178-
return from[0].AsValue<NTableIndex::TClusterId>() + static_cast<NTableIndex::TClusterId>(from.size() == 1);
3179-
}
3180-
}
3181-
return minParent;
3182-
}();
3183-
const NTableIndex::TClusterId parentTo = [&, to = range.To.GetCells()] {
3184-
if (!to.empty()) {
3185-
if (!to[0].IsNull()) {
3186-
return to[0].AsValue<NTableIndex::TClusterId>() - static_cast<NTableIndex::TClusterId>(to.size() != 1 && to[1].IsNull());
3187-
}
3188-
}
3189-
return maxParent;
3190-
}();
3191-
Y_ENSURE(minParent <= parentFrom, "minParent(" << minParent << ") > parentFrom(" << parentFrom << ") " << DebugString());
3192-
Y_ENSURE(parentFrom <= parentTo, "parentFrom(" << parentFrom << ") > parentTo(" << parentTo << ") " << DebugString());
3193-
Y_ENSURE(parentTo <= maxParent, "parentTo(" << parentTo << ") > maxParent(" << maxParent << ") " << DebugString());
3194-
return {parentFrom, parentTo};
3195-
}
3196-
3197-
TString RangeToDebugStr(const TSerializedTableRange& range) const {
3198-
auto toStr = [&](const TSerializedCellVec& v) -> TString {
3199-
const auto cells = v.GetCells();
3200-
if (cells.empty()) {
3201-
return "inf";
3202-
}
3203-
if (cells[0].IsNull()) {
3204-
return "-inf";
3205-
}
3206-
auto str = TStringBuilder{} << "{ count: " << cells.size();
3207-
if (Level > 1) {
3208-
str << ", parent: " << cells[0].AsValue<NTableIndex::TClusterId>();
3209-
if (cells.size() != 1 && cells[1].IsNull()) {
3210-
str << ", pk: null";
3211-
}
3212-
}
3213-
return str << " }";
3214-
};
3215-
return TStringBuilder{} << "{ From: " << toStr(range.From) << ", To: " << toStr(range.To) << " }";
3216-
}
3089+
std::pair<NTableIndex::TClusterId, NTableIndex::TClusterId> RangeToBorders(const TSerializedTableRange& range) const;
3090+
3091+
TString RangeToDebugStr(const TSerializedTableRange& range) const;
32173092

32183093
private:
3219-
void NextLevel(ui64 childCount) noexcept {
3220-
ParentBegin = ChildBegin;
3221-
Parent = ParentBegin;
3222-
ChildBegin = ParentBegin + childCount;
3223-
Child = ChildBegin;
3224-
++Level;
3225-
}
3094+
void NextLevel(ui64 childCount) noexcept;
32263095
};
32273096
TKMeans KMeans;
32283097

0 commit comments

Comments
 (0)