@@ -140,17 +140,29 @@ TSerializedCellVec DoFindSplitKey(const TVector<std::pair<TSerializedCellVec, ui
140
140
auto loIt = std::upper_bound (keysHist.begin (), keysHist.end (), total*0.1 , fnValueLess);
141
141
auto hiIt = std::upper_bound (keysHist.begin (), keysHist.end (), total*0.9 , fnValueLess);
142
142
143
- auto fnCmp = [&keyColumnTypes, prefixSize] (const auto & bucket1, const auto & bucket2) {
144
- return CompareTypedCellVectors (bucket1.first .GetCells ().data (), bucket2.first .GetCells ().data (),
145
- keyColumnTypes.data (),
146
- std::min (bucket1.first .GetCells ().size (), prefixSize), std::min (bucket2.first .GetCells ().size (), prefixSize));
143
+ // Three-way comparison of two histogram entries by key prefix: each key (entry.first) is clamped to at most prefixSize cells and only the cells common to both clamped prefixes are passed to CompareTypedCellVectors.
144
+ auto comparePrefix = [&keyColumnTypes] (const auto & entry1, const auto & entry2, const size_t prefixSize) {
145
+ const auto & key1cells = entry1.first .GetCells ();
146
+ const auto clampedSize1 = std::min (key1cells.size (), prefixSize);
147
+
148
+ const auto & key2cells = entry2.first .GetCells ();
149
+ const auto clampedSize2 = std::min (key2cells.size (), prefixSize);
150
+
151
+ int cmp = CompareTypedCellVectors (key1cells.data (), key2cells.data (), keyColumnTypes.data (), std::min (clampedSize1, clampedSize2));
152
+ if (cmp == 0 && clampedSize1 != clampedSize2) {
153
+ // Common cells are equal but the clamped prefixes differ in length: the shorter prefix is treated as padded with +inf, so it compares as the bigger one (+1 when entry1 is shorter, -1 when entry2 is shorter).
154
+ cmp = (clampedSize1 < clampedSize2) ? +1 : -1 ;
155
+ }
156
+ return cmp;
147
157
};
148
158
149
159
// Check that the half key is not equal to the low and high keys
150
- if (fnCmp (*halfIt, *loIt) == 0 )
160
+ if (comparePrefix (*halfIt, *loIt, prefixSize ) == 0 ) {
151
161
return TSerializedCellVec ();
152
- if (fnCmp (*halfIt, *hiIt) == 0 )
162
+ }
163
+ if (comparePrefix (*halfIt, *hiIt, prefixSize) == 0 ) {
153
164
return TSerializedCellVec ();
165
+ }
154
166
155
167
// Build split key by leaving the prefix and extending it with NULLs
156
168
TVector<TCell> splitKey (halfIt->first .GetCells ().begin (), halfIt->first .GetCells ().end ());
@@ -170,10 +182,17 @@ TSerializedCellVec ChooseSplitKeyByKeySample(const NKikimrTableStats::THistogram
170
182
keysHist.emplace_back (std::make_pair (TSerializedCellVec (bucket.GetKey ()), bucket.GetValue ()));
171
183
}
172
184
173
- auto fnCmp = [&keyColumnTypes] (const auto & key1, const auto & key2) {
174
- return CompareTypedCellVectors (key1.first .GetCells ().data (), key2.first .GetCells ().data (),
175
- keyColumnTypes.data (),
176
- key1.first .GetCells ().size (), key2.first .GetCells ().size ());
185
+ // Three-way comparison of two histogram entries by their full keys (entry.first): the cells common to both keys are compared with CompareTypedCellVectors, and key length breaks ties.
186
+ auto fnCmp = [&keyColumnTypes] (const auto & entry1, const auto & entry2) {
187
+ const auto & key1cells = entry1.first .GetCells ();
188
+ const auto & key2cells = entry2.first .GetCells ();
189
+ const auto minKeySize = std::min (key1cells.size (), key2cells.size ());
190
+ int cmp = CompareTypedCellVectors (key1cells.data (), key2cells.data (), keyColumnTypes.data (), minKeySize);
191
+ if (cmp == 0 && key1cells.size () != key2cells.size ()) {
192
+ // Common cells are equal but the keys differ in length: the shorter key is treated as padded with +inf, so it compares as the bigger one (+1 when entry1 is shorter, -1 when entry2 is shorter).
193
+ cmp = (key1cells.size () < key2cells.size ()) ? +1 : -1 ;
194
+ }
195
+ return cmp;
177
196
};
178
197
179
198
Sort (keysHist, [&fnCmp] (const auto & key1, const auto & key2) { return fnCmp (key1, key2) < 0 ; });
0 commit comments