Skip to content

Commit f4f3e42

Browse files
authored
Merge cs from main 4 (#18771)
2 parents e73ef57 + 31f78b6 commit f4f3e42

File tree

38 files changed

+564
-269
lines changed

38 files changed

+564
-269
lines changed

.github/config/muted_ya.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -169,6 +169,7 @@ ydb/tests/functional/tpc/large test_tpcds.py.TestTpcdsS1.test_tpcds[87]
169169
ydb/tests/functional/tpc/large test_tpcds.py.TestTpcdsS1.test_tpcds[93]
170170
ydb/tests/functional/tpc/large test_tpcds.py.TestTpcdsS1.test_tpcds[9]
171171
ydb/tests/olap sole chunk chunk
172+
ydb/tests/olap test_quota_exhaustion.py.TestYdbWorkload.test
172173
ydb/tests/olap/column_family/compression sole chunk chunk
173174
ydb/tests/olap/scenario sole chunk chunk
174175
ydb/tests/olap/scenario test_alter_compression.py.TestAlterCompression.test[alter_compression]

ydb/core/formats/arrow/reader/batch_iterator.h

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ class TBatchIterator {
1111
TRWSortableBatchPosition VersionColumns;
1212
i64 RecordsCount;
1313
int ReverseSortKff;
14+
YDB_READONLY(ui64, SourceId, 0);
1415

1516
std::shared_ptr<NArrow::TColumnFilter> Filter;
1617
std::shared_ptr<NArrow::TColumnFilter::TIterator> FilterIterator;
@@ -50,14 +51,17 @@ class TBatchIterator {
5051
}
5152

5253
template <class TDataContainer>
53-
TBatchIterator(std::shared_ptr<TDataContainer> batch, std::shared_ptr<NArrow::TColumnFilter> filter,
54-
const std::vector<std::string>& keyColumns, const std::vector<std::string>& dataColumns, const bool reverseSort, const std::vector<std::string>& versionColumnNames)
54+
TBatchIterator(std::shared_ptr<TDataContainer> batch, std::shared_ptr<NArrow::TColumnFilter> filter, const arrow::Schema& keySchema,
55+
const arrow::Schema& dataSchema, const bool reverseSort, const std::vector<std::string>& versionColumnNames, const ui64 sourceId)
5556
: ControlPointFlag(false)
56-
, KeyColumns(batch, 0, keyColumns, dataColumns, reverseSort)
57+
, KeyColumns(batch, 0, keySchema.field_names(), dataSchema.field_names(), reverseSort)
5758
, VersionColumns(batch, 0, versionColumnNames, {}, false)
5859
, RecordsCount(batch->num_rows())
5960
, ReverseSortKff(reverseSort ? -1 : 1)
61+
, SourceId(sourceId)
6062
, Filter(filter) {
63+
AFL_VERIFY(KeyColumns.IsSameSortingSchema(keySchema))("batch", KeyColumns.DebugJson())("schema", keySchema.ToString());
64+
AFL_VERIFY(KeyColumns.IsSameDataSchema(dataSchema))("batch", KeyColumns.DebugJson())("schema", dataSchema.ToString());
6165
Y_ABORT_UNLESS(KeyColumns.InitPosition(GetFirstPosition()));
6266
Y_ABORT_UNLESS(VersionColumns.InitPosition(GetFirstPosition()));
6367
if (Filter) {

ydb/core/formats/arrow/reader/merger.cpp

Lines changed: 1 addition & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
namespace NKikimr::NArrow::NMerger {
77

88
void TMergePartialStream::PutControlPoint(const TSortableBatchPosition& point, const bool deepCopy) {
9-
AFL_VERIFY(point.IsSameSortingSchema(SortSchema))("point", point.DebugJson())("schema", SortSchema->ToString());
9+
AFL_VERIFY(point.IsSameSortingSchema(*SortSchema))("point", point.DebugJson())("schema", SortSchema->ToString());
1010
Y_ABORT_UNLESS(point.IsReverseSort() == Reverse);
1111
Y_ABORT_UNLESS(++ControlPoints == 1);
1212

@@ -37,39 +37,6 @@ void TMergePartialStream::CheckSequenceInDebug(const TRWSortableBatchPosition& n
3737
#endif
3838
}
3939

40-
bool TMergePartialStream::DrainToControlPoint(TRecordBatchBuilder& builder, const bool includeFinish, std::optional<TCursor>* lastResultPosition) {
41-
AFL_VERIFY(ControlPoints == 1);
42-
Y_ABORT_UNLESS((ui32)DataSchema->num_fields() == builder.GetBuildersCount());
43-
builder.ValidateDataSchema(DataSchema);
44-
bool cpReachedFlag = false;
45-
std::shared_ptr<TSortableScanData> resultScanData;
46-
ui64 resultPosition;
47-
while (SortHeap.Size() && !cpReachedFlag && !builder.IsBufferExhausted()) {
48-
if (SortHeap.Current().IsControlPoint()) {
49-
auto keyColumns = SortHeap.Current().GetKeyColumns().BuildSortingCursor();
50-
RemoveControlPoint();
51-
cpReachedFlag = true;
52-
if (SortHeap.Empty() || !includeFinish || SortHeap.Current().GetKeyColumns().Compare(keyColumns) == std::partial_ordering::greater) {
53-
if (lastResultPosition && resultScanData) {
54-
*lastResultPosition = resultScanData->BuildCursor(resultPosition);
55-
}
56-
return true;
57-
}
58-
}
59-
60-
DrainCurrentPosition(&builder, &resultScanData, &resultPosition);
61-
}
62-
if (lastResultPosition && resultScanData) {
63-
*lastResultPosition = resultScanData->BuildCursor(resultPosition);
64-
}
65-
return cpReachedFlag;
66-
}
67-
68-
bool TMergePartialStream::DrainCurrentTo(TRecordBatchBuilder& builder, const TSortableBatchPosition& readTo, const bool includeFinish, std::optional<TCursor>* lastResultPosition) {
69-
PutControlPoint(readTo, false);
70-
return DrainToControlPoint(builder, includeFinish, lastResultPosition);
71-
}
72-
7340
std::shared_ptr<arrow::Table> TMergePartialStream::SingleSourceDrain(const TSortableBatchPosition& readTo, const bool includeFinish, std::optional<TCursor>* lastResultPosition) {
7441
std::shared_ptr<arrow::Table> result;
7542
if (SortHeap.Empty()) {
@@ -143,53 +110,6 @@ std::shared_ptr<arrow::Table> TMergePartialStream::SingleSourceDrain(const TSort
143110
return result;
144111
}
145112

146-
void TMergePartialStream::DrainAll(TRecordBatchBuilder& builder) {
147-
Y_ABORT_UNLESS((ui32)DataSchema->num_fields() == builder.GetBuildersCount());
148-
while (SortHeap.Size()) {
149-
DrainCurrentPosition(&builder, nullptr, nullptr);
150-
}
151-
}
152-
153-
void TMergePartialStream::DrainCurrentPosition(TRecordBatchBuilder* builder, std::shared_ptr<TSortableScanData>* resultScanData, ui64* resultPosition) {
154-
Y_ABORT_UNLESS(SortHeap.Size());
155-
Y_ABORT_UNLESS(!SortHeap.Current().IsControlPoint());
156-
if (!SortHeap.Current().IsDeleted()) {
157-
// AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("key_add", SortHeap.Current().GetKeyColumns().DebugJson().GetStringRobust());
158-
if (builder) {
159-
builder->AddRecord(SortHeap.Current().GetKeyColumns());
160-
}
161-
if (resultScanData && resultPosition) {
162-
*resultScanData = SortHeap.Current().GetKeyColumns().GetSorting();
163-
*resultPosition = SortHeap.Current().GetKeyColumns().GetPosition();
164-
}
165-
} else {
166-
// AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("key_skip", SortHeap.Current().GetKeyColumns().DebugJson().GetStringRobust());
167-
}
168-
CheckSequenceInDebug(SortHeap.Current().GetKeyColumns());
169-
const ui64 startPosition = SortHeap.Current().GetKeyColumns().GetPosition();
170-
const TSortableScanData* startSorting = SortHeap.Current().GetKeyColumns().GetSorting().get();
171-
const TSortableScanData* startVersion = SortHeap.Current().GetVersionColumns().GetSorting().get();
172-
bool isFirst = true;
173-
while (SortHeap.Size() && (isFirst || SortHeap.Current().GetKeyColumns().Compare(*startSorting, startPosition) == std::partial_ordering::equivalent)) {
174-
if (!isFirst) {
175-
// AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("key_skip1", SortHeap.Current().GetKeyColumns().DebugJson().GetStringRobust());
176-
auto& anotherIterator = SortHeap.Current();
177-
if (PossibleSameVersionFlag) {
178-
AFL_VERIFY(anotherIterator.GetVersionColumns().Compare(*startVersion, startPosition) != std::partial_ordering::greater)
179-
("r", startVersion->BuildCursor(startPosition).DebugJson())("a", anotherIterator.GetVersionColumns().DebugJson())
180-
("key", startSorting->BuildCursor(startPosition).DebugJson());
181-
} else {
182-
AFL_VERIFY(anotherIterator.GetVersionColumns().Compare(*startVersion, startPosition) == std::partial_ordering::less)
183-
("r", startVersion->BuildCursor(startPosition).DebugJson())("a", anotherIterator.GetVersionColumns().DebugJson())
184-
("key", startSorting->BuildCursor(startPosition).DebugJson());
185-
}
186-
}
187-
SortHeap.Next();
188-
isFirst = false;
189-
}
190-
SortHeap.CleanFinished();
191-
}
192-
193113
std::vector<std::shared_ptr<arrow::RecordBatch>> TMergePartialStream::DrainAllParts(const TIntervalPositions& positions,
194114
const std::vector<std::shared_ptr<arrow::Field>>& resultFields)
195115
{

ydb/core/formats/arrow/reader/merger.h

Lines changed: 140 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
11
#pragma once
2-
#include "position.h"
3-
#include "heap.h"
4-
#include "result_builder.h"
52
#include "batch_iterator.h"
3+
#include "heap.h"
4+
#include "position.h"
65

76
#include <ydb/core/formats/arrow/arrow_filter.h>
87

98
namespace NKikimr::NArrow::NMerger {
109

10+
template <typename T>
11+
concept MergeResultBuilder = requires(const T& constT, T& mutT, const std::shared_ptr<arrow::Schema>& schema, const TBatchIterator& cursor) {
12+
{ constT.IsBufferExhausted() } -> std::same_as<bool>;
13+
{ constT.ValidateDataSchema(schema) } -> std::same_as<void>;
14+
{ mutT.AddRecord(cursor) } -> std::same_as<void>;
15+
{ mutT.SkipRecord(cursor) } -> std::same_as<void>;
16+
};
17+
1118
class TMergePartialStream {
1219
private:
1320
#ifndef NDEBUG
@@ -19,6 +26,7 @@ class TMergePartialStream {
1926
std::shared_ptr<arrow::Schema> DataSchema;
2027
const bool Reverse;
2128
const std::vector<std::string> VersionColumnNames;
29+
std::optional<TCursor> MaxVersion;
2230
ui32 ControlPoints = 0;
2331

2432
TSortingHeap<TBatchIterator> SortHeap;
@@ -34,19 +42,92 @@ class TMergePartialStream {
3442
return result;
3543
}
3644

37-
void DrainCurrentPosition(TRecordBatchBuilder* builder, std::shared_ptr<TSortableScanData>* resultScanData, ui64* resultPosition);
45+
template <MergeResultBuilder TBuilder>
46+
[[nodiscard]] bool DrainCurrentPosition(TBuilder* builder, std::shared_ptr<TSortableScanData>* resultScanData, ui64* resultPosition) {
47+
Y_ABORT_UNLESS(SortHeap.Size());
48+
Y_ABORT_UNLESS(!SortHeap.Current().IsControlPoint());
49+
CheckSequenceInDebug(SortHeap.Current().GetKeyColumns());
50+
51+
const ui64 startPosition = SortHeap.Current().GetKeyColumns().GetPosition();
52+
const TSortableScanData* startSorting = SortHeap.Current().GetKeyColumns().GetSorting().get();
53+
const TSortableScanData* startVersion = SortHeap.Current().GetVersionColumns().GetSorting().get();
54+
55+
if (MaxVersion) {
56+
bool skippedPk = false;
57+
while (SortHeap.Size() && SortHeap.Current().GetVersionColumns().Compare(*MaxVersion) == std::partial_ordering::greater && !skippedPk) {
58+
if (builder) {
59+
builder->SkipRecord(SortHeap.Current());
60+
}
61+
SortHeap.Next();
62+
if (SortHeap.Empty() ||
63+
SortHeap.Current().GetKeyColumns().Compare(*startSorting, startPosition) != std::partial_ordering::equivalent) {
64+
skippedPk = true;
65+
}
66+
}
67+
if (skippedPk) {
68+
SortHeap.CleanFinished();
69+
return false;
70+
}
71+
}
72+
73+
bool foundResult = false;
74+
if (!SortHeap.Current().IsDeleted()) {
75+
foundResult = true;
76+
// AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("key_add", SortHeap.Current().GetKeyColumns().DebugJson().GetStringRobust());
77+
if (builder) {
78+
builder->AddRecord(SortHeap.Current());
79+
}
80+
if (resultScanData && resultPosition) {
81+
*resultScanData = SortHeap.Current().GetKeyColumns().GetSorting();
82+
*resultPosition = SortHeap.Current().GetKeyColumns().GetPosition();
83+
}
84+
} else {
85+
// AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("key_skip", SortHeap.Current().GetKeyColumns().DebugJson().GetStringRobust());
86+
if (builder) {
87+
builder->SkipRecord(SortHeap.Current());
88+
}
89+
}
90+
SortHeap.Next();
91+
92+
while (
93+
SortHeap.Size() && (SortHeap.Current().GetKeyColumns().Compare(*startSorting, startPosition) == std::partial_ordering::equivalent)) {
94+
if (builder) {
95+
builder->SkipRecord(SortHeap.Current());
96+
}
97+
// AFL_ERROR(NKikimrServices::TX_COLUMNSHARD)("key_skip1", SortHeap.Current().GetKeyColumns().DebugJson().GetStringRobust());
98+
auto& anotherIterator = SortHeap.Current();
99+
if (PossibleSameVersionFlag) {
100+
AFL_VERIFY(anotherIterator.GetVersionColumns().Compare(*startVersion, startPosition) != std::partial_ordering::greater)
101+
("r", startVersion->BuildCursor(startPosition).DebugJson())("a", anotherIterator.GetVersionColumns().DebugJson())(
102+
"key", startSorting->BuildCursor(startPosition).DebugJson());
103+
} else {
104+
AFL_VERIFY(anotherIterator.GetVersionColumns().Compare(*startVersion, startPosition) == std::partial_ordering::less)
105+
("r", startVersion->BuildCursor(startPosition).DebugJson())("a", anotherIterator.GetVersionColumns().DebugJson())(
106+
"key", startSorting->BuildCursor(startPosition).DebugJson());
107+
}
108+
SortHeap.Next();
109+
}
110+
SortHeap.CleanFinished();
111+
return foundResult;
112+
}
38113

39114
void CheckSequenceInDebug(const TRWSortableBatchPosition& nextKeyColumnsPosition);
40-
bool DrainCurrentTo(TRecordBatchBuilder& builder, const TSortableBatchPosition& readTo, const bool includeFinish,
41-
std::optional<TCursor>* lastResultPosition = nullptr);
115+
116+
template <MergeResultBuilder TBuilder>
117+
bool DrainCurrentTo(TBuilder& builder, const TSortableBatchPosition& readTo, const bool includeFinish,
118+
std::optional<TCursor>* lastResultPosition = nullptr) {
119+
PutControlPoint(readTo, false);
120+
return DrainToControlPoint(builder, includeFinish, lastResultPosition);
121+
}
42122

43123
public:
44-
TMergePartialStream(std::shared_ptr<arrow::Schema> sortSchema, std::shared_ptr<arrow::Schema> dataSchema, const bool reverse, const std::vector<std::string>& versionColumnNames)
124+
TMergePartialStream(std::shared_ptr<arrow::Schema> sortSchema, std::shared_ptr<arrow::Schema> dataSchema, const bool reverse,
125+
const std::vector<std::string>& versionColumnNames, const std::optional<TCursor>& maxVersion)
45126
: SortSchema(sortSchema)
46127
, DataSchema(dataSchema)
47128
, Reverse(reverse)
48129
, VersionColumnNames(versionColumnNames)
49-
{
130+
, MaxVersion(maxVersion) {
50131
Y_ABORT_UNLESS(SortSchema);
51132
Y_ABORT_UNLESS(SortSchema->num_fields());
52133
Y_ABORT_UNLESS(!DataSchema || DataSchema->num_fields());
@@ -78,25 +159,67 @@ class TMergePartialStream {
78159
}
79160

80161
template <class TDataContainer>
81-
void AddSource(const std::shared_ptr<TDataContainer>& batch, const std::shared_ptr<NArrow::TColumnFilter>& filter) {
162+
void AddSource(const std::shared_ptr<TDataContainer>& batch, const std::shared_ptr<NArrow::TColumnFilter>& filter,
163+
const std::optional<ui64> sourceIdExt = std::nullopt) {
164+
const ui64 sourceId = sourceIdExt.value_or(SortHeap.Size());
82165
if (!batch || !batch->num_rows()) {
83166
return;
84167
}
85-
// Y_DEBUG_ABORT_UNLESS(NArrow::IsSorted(batch, SortSchema));
168+
// Y_DEBUG_ABORT_UNLESS(NArrow::IsSorted(batch, SortSchema));
86169
const bool isDenyFilter = filter && filter->IsTotalDenyFilter();
87170
auto filterImpl = (!filter || filter->IsTotalAllowFilter()) ? nullptr : filter;
88-
SortHeap.Push(TBatchIterator(batch, filterImpl, SortSchema->field_names(), (!isDenyFilter && DataSchema) ? DataSchema->field_names() : std::vector<std::string>(), Reverse, VersionColumnNames));
171+
static const arrow::Schema emptySchema = arrow::Schema(arrow::FieldVector());
172+
TBatchIterator iterator(
173+
batch, filterImpl, *SortSchema, (!isDenyFilter && DataSchema) ? *DataSchema : emptySchema, Reverse, VersionColumnNames, sourceId);
174+
if (MaxVersion) {
175+
MaxVersion->ValidateSchema(*iterator.GetVersionColumns().GetSorting());
176+
}
177+
SortHeap.Push(std::move(iterator));
89178
}
90179

91180
bool IsEmpty() const {
92181
return !SortHeap.Size();
93182
}
94183

95-
void DrainAll(TRecordBatchBuilder& builder);
96-
std::shared_ptr<arrow::Table> SingleSourceDrain(const TSortableBatchPosition& readTo, const bool includeFinish, std::optional<TCursor>* lastResultPosition = nullptr);
97-
bool DrainToControlPoint(TRecordBatchBuilder& builder, const bool includeFinish, std::optional<TCursor>* lastResultPosition = nullptr);
98-
std::vector<std::shared_ptr<arrow::RecordBatch>> DrainAllParts(const TIntervalPositions& positions,
99-
const std::vector<std::shared_ptr<arrow::Field>>& resultFields);
184+
template <MergeResultBuilder TBuilder>
185+
void DrainAll(TBuilder& builder) {
186+
builder.ValidateDataSchema(DataSchema);
187+
while (SortHeap.Size()) {
188+
Y_UNUSED(DrainCurrentPosition(&builder, nullptr, nullptr));
189+
}
190+
}
191+
std::shared_ptr<arrow::Table> SingleSourceDrain(
192+
const TSortableBatchPosition& readTo, const bool includeFinish, std::optional<TCursor>* lastResultPosition = nullptr);
193+
std::vector<std::shared_ptr<arrow::RecordBatch>> DrainAllParts(
194+
const TIntervalPositions& positions, const std::vector<std::shared_ptr<arrow::Field>>& resultFields);
195+
196+
template <MergeResultBuilder TBuilder>
197+
bool DrainToControlPoint(TBuilder& builder, const bool includeFinish, std::optional<TCursor>* lastResultPosition = nullptr) {
198+
AFL_VERIFY(ControlPoints == 1);
199+
builder.ValidateDataSchema(DataSchema);
200+
bool cpReachedFlag = false;
201+
std::shared_ptr<TSortableScanData> resultScanData;
202+
ui64 resultPosition;
203+
while (SortHeap.Size() && !cpReachedFlag && !builder.IsBufferExhausted()) {
204+
if (SortHeap.Current().IsControlPoint()) {
205+
auto keyColumns = SortHeap.Current().GetKeyColumns().BuildSortingCursor();
206+
RemoveControlPoint();
207+
cpReachedFlag = true;
208+
if (SortHeap.Empty() || !includeFinish ||
209+
SortHeap.Current().GetKeyColumns().Compare(keyColumns) == std::partial_ordering::greater) {
210+
if (lastResultPosition && resultScanData) {
211+
*lastResultPosition = resultScanData->BuildCursor(resultPosition);
212+
}
213+
return true;
214+
}
215+
}
216+
Y_UNUSED(DrainCurrentPosition(&builder, &resultScanData, &resultPosition));
217+
}
218+
if (lastResultPosition && resultScanData) {
219+
*lastResultPosition = resultScanData->BuildCursor(resultPosition);
220+
}
221+
return cpReachedFlag;
222+
}
100223
};
101224

102-
}
225+
} // namespace NKikimr::NArrow::NMerger

ydb/core/formats/arrow/reader/position.cpp

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,14 +32,14 @@ std::optional<TSortableBatchPosition::TFoundPosition> TSortableBatchPosition::Fi
3232
};
3333

3434
{
35-
AFL_VERIFY(guard.InitSortingPosition(posStart));
35+
AFL_VERIFY(guard.InitSortingPosition(posStart))("start", posStart)("finish", posFinish);
3636
auto cmp = position.Compare(forFound);
3737
if (cond(cmp)) {
3838
return TFoundPosition(posStart, cmp);
3939
}
4040
}
4141
{
42-
AFL_VERIFY(guard.InitSortingPosition(posFinish));
42+
AFL_VERIFY(guard.InitSortingPosition(posFinish))("start", posStart)("finish", posFinish);
4343
auto cmp = position.Compare(forFound);
4444
if (!cond(cmp)) {
4545
return std::nullopt;
@@ -266,6 +266,15 @@ std::partial_ordering TCursor::Compare(const TCursor& item) const {
266266
return std::partial_ordering::equivalent;
267267
}
268268

269+
void TCursor::ValidateSchema(const TSortableScanData& position) const {
270+
AFL_VERIFY(position.GetFields().size() == PositionAddress.size());
271+
for (ui64 i = 0; i < PositionAddress.size(); ++i) {
272+
const auto& posType = position.GetFields()[i]->type();
273+
const auto& cursorType = PositionAddress[i].GetArray()->type();
274+
AFL_VERIFY(posType->Equals(cursorType))("pos", posType->ToString())("cursor", cursorType->ToString());
275+
}
276+
}
277+
269278
void TCursor::AppendPositionTo(const std::vector<std::unique_ptr<arrow::ArrayBuilder>>& builders, ui64* recordSize) const {
270279
AFL_VERIFY(builders.size() == PositionAddress.size());
271280
for (ui32 i = 0; i < PositionAddress.size(); ++i) {

0 commit comments

Comments
 (0)