Skip to content

Commit 3deaccb

Browse files
authored
25-1: schemeshard: fix split key selection from access sample (#18362)
Fix crash when selecting split key from key access sample containing entries of different sizes. Size differences could occur when exact key accesses are mixed with key prefix operations (such as range reads). But the key comparator used was asymmetrical, allowing comparison of full keys to prefixes but not vice versa.
2 parents af0f7f8 + 0df111c commit 3deaccb

File tree

3 files changed

+564
-10
lines changed

3 files changed

+564
-10
lines changed

ydb/core/tx/schemeshard/schemeshard__table_stats_histogram.cpp

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -140,17 +140,29 @@ TSerializedCellVec DoFindSplitKey(const TVector<std::pair<TSerializedCellVec, ui
140140
auto loIt = std::upper_bound(keysHist.begin(), keysHist.end(), total*0.1, fnValueLess);
141141
auto hiIt = std::upper_bound(keysHist.begin(), keysHist.end(), total*0.9, fnValueLess);
142142

143-
auto fnCmp = [&keyColumnTypes, prefixSize] (const auto& bucket1, const auto& bucket2) {
144-
return CompareTypedCellVectors(bucket1.first.GetCells().data(), bucket2.first.GetCells().data(),
145-
keyColumnTypes.data(),
146-
std::min(bucket1.first.GetCells().size(), prefixSize), std::min(bucket2.first.GetCells().size(), prefixSize));
143+
// compare histogram entries by key prefixes
144+
auto comparePrefix = [&keyColumnTypes] (const auto& entry1, const auto& entry2, const size_t prefixSize) {
145+
const auto& key1cells = entry1.first.GetCells();
146+
const auto clampedSize1 = std::min(key1cells.size(), prefixSize);
147+
148+
const auto& key2cells = entry2.first.GetCells();
149+
const auto clampedSize2 = std::min(key2cells.size(), prefixSize);
150+
151+
int cmp = CompareTypedCellVectors(key1cells.data(), key2cells.data(), keyColumnTypes.data(), std::min(clampedSize1, clampedSize2));
152+
if (cmp == 0 && clampedSize1 != clampedSize2) {
153+
// smaller key prefix is filled with +inf => always bigger
154+
cmp = (clampedSize1 < clampedSize2) ? +1 : -1;
155+
}
156+
return cmp;
147157
};
148158

149159
// Check if half key is no equal to low and high keys
150-
if (fnCmp(*halfIt, *loIt) == 0)
160+
if (comparePrefix(*halfIt, *loIt, prefixSize) == 0) {
151161
return TSerializedCellVec();
152-
if (fnCmp(*halfIt, *hiIt) == 0)
162+
}
163+
if (comparePrefix(*halfIt, *hiIt, prefixSize) == 0) {
153164
return TSerializedCellVec();
165+
}
154166

155167
// Build split key by leaving the prefix and extending it with NULLs
156168
TVector<TCell> splitKey(halfIt->first.GetCells().begin(), halfIt->first.GetCells().end());
@@ -170,10 +182,17 @@ TSerializedCellVec ChooseSplitKeyByKeySample(const NKikimrTableStats::THistogram
170182
keysHist.emplace_back(std::make_pair(TSerializedCellVec(bucket.GetKey()), bucket.GetValue()));
171183
}
172184

173-
auto fnCmp = [&keyColumnTypes] (const auto& key1, const auto& key2) {
174-
return CompareTypedCellVectors(key1.first.GetCells().data(), key2.first.GetCells().data(),
175-
keyColumnTypes.data(),
176-
key1.first.GetCells().size(), key2.first.GetCells().size());
185+
// compare histogram entries by keys
186+
auto fnCmp = [&keyColumnTypes] (const auto& entry1, const auto& entry2) {
187+
const auto& key1cells = entry1.first.GetCells();
188+
const auto& key2cells = entry2.first.GetCells();
189+
const auto minKeySize = std::min(key1cells.size(), key2cells.size());
190+
int cmp = CompareTypedCellVectors(key1cells.data(), key2cells.data(), keyColumnTypes.data(), minKeySize);
191+
if (cmp == 0 && key1cells.size() != key2cells.size()) {
192+
// smaller key is filled with +inf => always bigger
193+
cmp = (key1cells.size() < key2cells.size()) ? +1 : -1;
194+
}
195+
return cmp;
177196
};
178197

179198
Sort(keysHist, [&fnCmp] (const auto& key1, const auto& key2) { return fnCmp(key1, key2) < 0; });

0 commit comments

Comments
 (0)