@@ -3062,167 +3062,36 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
3062
3062
3063
3063
ui64 TableSize = 0 ;
3064
3064
3065
- ui64 ParentEnd () const noexcept { // included
3066
- return ChildBegin - 1 ;
3067
- }
3068
- ui64 ChildEnd () const noexcept { // included
3069
- return ChildBegin + ChildCount () - 1 ;
3070
- }
3065
+ ui64 ParentEnd () const noexcept ;
3066
+ ui64 ChildEnd () const noexcept ;
3071
3067
3072
- ui64 ParentCount () const noexcept {
3073
- return ParentEnd () - ParentBegin + 1 ;
3074
- }
3075
- ui64 ChildCount () const noexcept {
3076
- return ParentCount () * K;
3077
- }
3068
+ ui64 ParentCount () const noexcept ;
3069
+ ui64 ChildCount () const noexcept ;
3078
3070
3079
- TString DebugString () const {
3080
- return TStringBuilder ()
3081
- << " { "
3082
- << " State = " << State
3083
- << " , Level = " << Level << " / " << Levels
3084
- << " , K = " << K
3085
- << " , Round = " << Round
3086
- << " , Parent = [" << ParentBegin << " .." << Parent << " .." << ParentEnd () << " ]"
3087
- << " , Child = [" << ChildBegin << " .." << Child << " .." << ChildEnd () << " ]"
3088
- << " , TableSize = " << TableSize
3089
- << " }" ;
3090
- }
3071
+ TString DebugString () const ;
3091
3072
3092
- bool NeedsAnotherLevel () const noexcept {
3093
- return Level < Levels;
3094
- }
3095
- bool NeedsAnotherParent () const noexcept {
3096
- return Parent < ParentEnd ();
3097
- }
3073
+ bool NeedsAnotherLevel () const noexcept ;
3074
+ bool NeedsAnotherParent () const noexcept ;
3075
+ bool NextParent () noexcept ;
3076
+ bool NextLevel () noexcept ;
3077
+ void PrefixIndexDone (ui64 shards);
3098
3078
3099
- bool NextParent () noexcept {
3100
- if (!NeedsAnotherParent ()) {
3101
- return false ;
3102
- }
3103
- ++Parent;
3104
- Child += K;
3105
- return true ;
3106
- }
3079
+ void Set (ui32 level,
3080
+ NTableIndex::TClusterId parentBegin, NTableIndex::TClusterId parent,
3081
+ NTableIndex::TClusterId childBegin, NTableIndex::TClusterId child,
3082
+ ui32 state, ui64 tableSize, ui32 round);
3107
3083
3108
- bool NextLevel () noexcept {
3109
- if (!NeedsAnotherLevel ()) {
3110
- return false ;
3111
- }
3112
- NextLevel (ChildCount ());
3113
- return true ;
3114
- }
3084
+ NKikimrTxDataShard::EKMeansState GetUpload () const ;
3115
3085
3116
- void PrefixIndexDone (ui64 shards) {
3117
- Y_ENSURE (NeedsAnotherLevel ());
3118
- // There's two worst cases, but in both one shard contains TableSize rows
3119
- // 1. all rows have unique prefix (*), in such case we need 1 id for each row (parent, id in prefix table)
3120
- // 2. all unique prefixes have size K, so we have TableSize/K parents + TableSize childs
3121
- // * it doesn't work now, because now prefix should have at least K embeddings, but it's bug
3122
- NextLevel ((2 * TableSize) * shards);
3123
- Parent = ParentEnd ();
3124
- }
3125
-
3126
- void Set (ui32 level,
3127
- NTableIndex::TClusterId parentBegin, NTableIndex::TClusterId parent,
3128
- NTableIndex::TClusterId childBegin, NTableIndex::TClusterId child,
3129
- ui32 state, ui64 tableSize, ui32 round) {
3130
- Level = level;
3131
- Round = round;
3132
- ParentBegin = parentBegin;
3133
- Parent = parent;
3134
- ChildBegin = childBegin;
3135
- Child = child;
3136
- State = static_cast <EState>(state);
3137
- TableSize = tableSize;
3138
- }
3139
-
3140
- NKikimrTxDataShard::EKMeansState GetUpload () const {
3141
- if (Level == 1 ) {
3142
- if (NeedsAnotherLevel ()) {
3143
- return NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_BUILD;
3144
- } else {
3145
- return NKikimrTxDataShard::EKMeansState::UPLOAD_MAIN_TO_POSTING;
3146
- }
3147
- } else {
3148
- if (NeedsAnotherLevel ()) {
3149
- return NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_BUILD;
3150
- } else {
3151
- return NKikimrTxDataShard::EKMeansState::UPLOAD_BUILD_TO_POSTING;
3152
- }
3153
- }
3154
- }
3086
+ TString WriteTo (bool needsBuildTable = false ) const ;
3087
+ TString ReadFrom () const ;
3155
3088
3156
- TString WriteTo (bool needsBuildTable = false ) const {
3157
- using namespace NTableIndex ::NTableVectorKmeansTreeIndex;
3158
- TString name = PostingTable;
3159
- if (needsBuildTable || NeedsAnotherLevel ()) {
3160
- name += Level % 2 != 0 ? BuildSuffix0 : BuildSuffix1;
3161
- }
3162
- return name;
3163
- }
3164
- TString ReadFrom () const {
3165
- Y_ENSURE (Level > 1 );
3166
- using namespace NTableIndex ::NTableVectorKmeansTreeIndex;
3167
- TString name = PostingTable;
3168
- name += Level % 2 != 0 ? BuildSuffix1 : BuildSuffix0;
3169
- return name;
3170
- }
3171
-
3172
- std::pair<NTableIndex::TClusterId, NTableIndex::TClusterId> RangeToBorders (const TSerializedTableRange& range) const {
3173
- const NTableIndex::TClusterId minParent = ParentBegin;
3174
- const NTableIndex::TClusterId maxParent = ParentEnd ();
3175
- const NTableIndex::TClusterId parentFrom = [&, from = range.From .GetCells ()] {
3176
- if (!from.empty ()) {
3177
- if (!from[0 ].IsNull ()) {
3178
- return from[0 ].AsValue <NTableIndex::TClusterId>() + static_cast <NTableIndex::TClusterId>(from.size () == 1 );
3179
- }
3180
- }
3181
- return minParent;
3182
- }();
3183
- const NTableIndex::TClusterId parentTo = [&, to = range.To .GetCells ()] {
3184
- if (!to.empty ()) {
3185
- if (!to[0 ].IsNull ()) {
3186
- return to[0 ].AsValue <NTableIndex::TClusterId>() - static_cast <NTableIndex::TClusterId>(to.size () != 1 && to[1 ].IsNull ());
3187
- }
3188
- }
3189
- return maxParent;
3190
- }();
3191
- Y_ENSURE (minParent <= parentFrom, " minParent(" << minParent << " ) > parentFrom(" << parentFrom << " ) " << DebugString ());
3192
- Y_ENSURE (parentFrom <= parentTo, " parentFrom(" << parentFrom << " ) > parentTo(" << parentTo << " ) " << DebugString ());
3193
- Y_ENSURE (parentTo <= maxParent, " parentTo(" << parentTo << " ) > maxParent(" << maxParent << " ) " << DebugString ());
3194
- return {parentFrom, parentTo};
3195
- }
3196
-
3197
- TString RangeToDebugStr (const TSerializedTableRange& range) const {
3198
- auto toStr = [&](const TSerializedCellVec& v) -> TString {
3199
- const auto cells = v.GetCells ();
3200
- if (cells.empty ()) {
3201
- return " inf" ;
3202
- }
3203
- if (cells[0 ].IsNull ()) {
3204
- return " -inf" ;
3205
- }
3206
- auto str = TStringBuilder{} << " { count: " << cells.size ();
3207
- if (Level > 1 ) {
3208
- str << " , parent: " << cells[0 ].AsValue <NTableIndex::TClusterId>();
3209
- if (cells.size () != 1 && cells[1 ].IsNull ()) {
3210
- str << " , pk: null" ;
3211
- }
3212
- }
3213
- return str << " }" ;
3214
- };
3215
- return TStringBuilder{} << " { From: " << toStr (range.From ) << " , To: " << toStr (range.To ) << " }" ;
3216
- }
3089
+ std::pair<NTableIndex::TClusterId, NTableIndex::TClusterId> RangeToBorders (const TSerializedTableRange& range) const ;
3090
+
3091
+ TString RangeToDebugStr (const TSerializedTableRange& range) const ;
3217
3092
3218
3093
private:
3219
- void NextLevel (ui64 childCount) noexcept {
3220
- ParentBegin = ChildBegin;
3221
- Parent = ParentBegin;
3222
- ChildBegin = ParentBegin + childCount;
3223
- Child = ChildBegin;
3224
- ++Level;
3225
- }
3094
+ void NextLevel (ui64 childCount) noexcept ;
3226
3095
};
3227
3096
TKMeans KMeans;
3228
3097
0 commit comments