Skip to content

Commit b5dba78

Browse files
dont create temporary sparsed array on merging (#8174)
1 parent 92ff6c1 commit b5dba78

File tree

13 files changed

+317
-133
lines changed

13 files changed

+317
-133
lines changed

ydb/core/formats/arrow/accessor/abstract/accessor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class IChunkedArray {
7676
}
7777

7878
ui32 GetLocalIndex(const ui32 position) const {
79-
AFL_VERIFY(Contains(position));
79+
AFL_VERIFY(Contains(position))("pos", position)("start", GlobalStartPosition);
8080
return position - GlobalStartPosition;
8181
}
8282

ydb/core/formats/arrow/accessor/sparsed/accessor.cpp

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -59,17 +59,14 @@ TSparsedArray::TSparsedArray(const IChunkedArray& defaultArray, const std::share
5959
pos = current->GetAddress().GetGlobalFinishPosition();
6060
AFL_VERIFY(pos <= GetRecordsCount());
6161
}
62-
std::vector<std::shared_ptr<arrow::Field>> fields = { std::make_shared<arrow::Field>("index", arrow::uint32()),
63-
std::make_shared<arrow::Field>("value", GetDataType()) };
64-
auto schema = std::make_shared<arrow::Schema>(fields);
6562
std::vector<std::shared_ptr<arrow::Array>> columns = { NArrow::TStatusValidator::GetValid(builderIndex->Finish()),
6663
NArrow::TStatusValidator::GetValid(builderValue->Finish()) };
67-
records = arrow::RecordBatch::Make(schema, sparsedRecordsCount, columns);
64+
records = arrow::RecordBatch::Make(BuildSchema(GetDataType()), sparsedRecordsCount, columns);
6865
AFL_VERIFY_DEBUG(records->ValidateFull().ok());
6966
return true;
7067
}));
7168
AFL_VERIFY(records);
72-
Records.emplace_back(TSparsedArrayChunk(0, GetRecordsCount(), records, DefaultValue));
69+
Records.emplace_back(0, GetRecordsCount(), records, DefaultValue);
7370
}
7471

7572
std::vector<NKikimr::NArrow::NAccessor::TChunkedArraySerialized> TSparsedArray::DoSplitBySizes(
@@ -136,27 +133,44 @@ ui32 TSparsedArray::GetLastIndex(const std::shared_ptr<arrow::RecordBatch>& batc
136133
return ui32Column->Value(ui32Column->length() - 1);
137134
}
138135

136+
namespace {
137+
static thread_local THashMap<TString, std::shared_ptr<arrow::RecordBatch>> SimpleBatchesCache;
138+
}
139+
140+
NKikimr::NArrow::NAccessor::TSparsedArrayChunk TSparsedArray::MakeDefaultChunk(
141+
const std::shared_ptr<arrow::Scalar>& defaultValue, const std::shared_ptr<arrow::DataType>& type, const ui32 recordsCount) {
142+
auto it = SimpleBatchesCache.find(type->ToString());
143+
if (it == SimpleBatchesCache.end()) {
144+
it = SimpleBatchesCache.emplace(type->ToString(), NArrow::MakeEmptyBatch(BuildSchema(type))).first;
145+
AFL_VERIFY(it->second->ValidateFull().ok());
146+
}
147+
return TSparsedArrayChunk(0, recordsCount, it->second, defaultValue);
148+
}
149+
139150
IChunkedArray::TLocalDataAddress TSparsedArrayChunk::GetChunk(
140151
const std::optional<IChunkedArray::TCommonChunkAddress>& /*chunkCurrent*/, const ui64 position, const ui32 chunkIdx) const {
141-
auto it = RemapExternalToInternal.upper_bound(position);
152+
const auto predCompare = [](const ui32 position, const TInternalChunkInfo& item) {
153+
return position < item.GetStartExt();
154+
};
155+
auto it = std::upper_bound(RemapExternalToInternal.begin(), RemapExternalToInternal.end(), position, predCompare);
142156
AFL_VERIFY(it != RemapExternalToInternal.begin());
143157
--it;
144-
if (it->second.GetIsDefault()) {
158+
if (it->GetIsDefault()) {
145159
return IChunkedArray::TLocalDataAddress(
146-
NArrow::TThreadSimpleArraysCache::Get(ColValue->type(), DefaultValue, it->second.GetSize()), StartPosition + it->first, chunkIdx);
160+
NArrow::TThreadSimpleArraysCache::Get(ColValue->type(), DefaultValue, it->GetSize()), StartPosition + it->GetStartExt(), chunkIdx);
147161
} else {
148162
return IChunkedArray::TLocalDataAddress(
149-
ColValue->Slice(it->second.GetStart(), it->second.GetSize()), StartPosition + it->first, chunkIdx);
163+
ColValue->Slice(it->GetStartInt(), it->GetSize()), StartPosition + it->GetStartExt(), chunkIdx);
150164
}
151165
}
152166

153167
std::vector<std::shared_ptr<arrow::Array>> TSparsedArrayChunk::GetChunkedArray() const {
154168
std::vector<std::shared_ptr<arrow::Array>> chunks;
155169
for (auto&& i : RemapExternalToInternal) {
156-
if (i.second.GetIsDefault()) {
157-
chunks.emplace_back(NArrow::TThreadSimpleArraysCache::Get(ColValue->type(), DefaultValue, i.second.GetSize()));
170+
if (i.GetIsDefault()) {
171+
chunks.emplace_back(NArrow::TThreadSimpleArraysCache::Get(ColValue->type(), DefaultValue, i.GetSize()));
158172
} else {
159-
chunks.emplace_back(ColValue->Slice(i.second.GetStart(), i.second.GetSize()));
173+
chunks.emplace_back(ColValue->Slice(i.GetStartInt(), i.GetSize()));
160174
}
161175
}
162176
return chunks;
@@ -189,23 +203,26 @@ TSparsedArrayChunk::TSparsedArrayChunk(const ui32 posStart, const ui32 recordsCo
189203
for (ui32 idx = 0; idx < UI32ColIndex->length(); ++idx) {
190204
if (nextIndex != UI32ColIndex->Value(idx)) {
191205
if (idx - startIndexInt) {
192-
AFL_VERIFY(RemapExternalToInternal.emplace(startIndexExt, TInternalChunkInfo(startIndexInt, idx - startIndexInt, false)).second);
206+
RemapExternalToInternal.emplace_back(startIndexExt, startIndexInt, idx - startIndexInt, false);
193207
}
194-
AFL_VERIFY(RemapExternalToInternal.emplace(nextIndex, TInternalChunkInfo(0, UI32ColIndex->Value(idx) - nextIndex, true)).second);
208+
RemapExternalToInternal.emplace_back(nextIndex, 0, UI32ColIndex->Value(idx) - nextIndex, true);
195209
startIndexExt = UI32ColIndex->Value(idx);
196210
startIndexInt = idx;
197211
}
198212
nextIndex = UI32ColIndex->Value(idx) + 1;
199213
}
200214
if (UI32ColIndex->length() > startIndexInt) {
201-
AFL_VERIFY(RemapExternalToInternal.emplace(startIndexExt, TInternalChunkInfo(startIndexInt, UI32ColIndex->length() - startIndexInt, false)).second);
215+
RemapExternalToInternal.emplace_back(startIndexExt, startIndexInt, UI32ColIndex->length() - startIndexInt, false);
202216
}
203217
if (nextIndex != RecordsCount) {
204-
AFL_VERIFY(RemapExternalToInternal.emplace(nextIndex, TInternalChunkInfo(0, RecordsCount - nextIndex, true)).second);
218+
RemapExternalToInternal.emplace_back(nextIndex, 0, RecordsCount - nextIndex, true);
205219
}
206220
ui32 count = 0;
207221
for (auto&& i : RemapExternalToInternal) {
208-
count += i.second.GetSize();
222+
count += i.GetSize();
223+
}
224+
for (ui32 i = 0; i + 1 < RemapExternalToInternal.size(); ++i) {
225+
AFL_VERIFY(RemapExternalToInternal[i + 1].GetStartExt() == RemapExternalToInternal[i].GetStartExt() + RemapExternalToInternal[i].GetSize());
209226
}
210227
AFL_VERIFY(count == RecordsCount)("count", count)("records_count", RecordsCount);
211228
AFL_VERIFY(ColValue);
@@ -256,7 +273,7 @@ void TSparsedArray::TBuilder::AddChunk(const ui32 recordsCount, const std::share
256273
auto* arr = static_cast<const arrow::UInt32Array*>(data->column(0).get());
257274
AFL_VERIFY(arr->Value(arr->length() - 1) < recordsCount)("val", arr->Value(arr->length() - 1))("count", recordsCount);
258275
}
259-
Chunks.emplace_back(TSparsedArrayChunk(RecordsCount, recordsCount, data, DefaultValue));
276+
Chunks.emplace_back(RecordsCount, recordsCount, data, DefaultValue);
260277
RecordsCount += recordsCount;
261278
}
262279

ydb/core/formats/arrow/accessor/sparsed/accessor.h

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22
#include <ydb/core/formats/arrow/accessor/abstract/accessor.h>
3+
#include <ydb/core/formats/arrow/arrow_helpers.h>
34

45
#include <ydb/library/accessor/accessor.h>
56

@@ -9,7 +10,7 @@
910

1011
namespace NKikimr::NArrow::NAccessor {
1112

12-
class TSparsedArrayChunk {
13+
class TSparsedArrayChunk: public TMoveOnly {
1314
private:
1415
YDB_READONLY(ui32, RecordsCount, 0);
1516
YDB_READONLY(ui32, StartPosition, 0);
@@ -24,20 +25,26 @@ class TSparsedArrayChunk {
2425

2526
class TInternalChunkInfo {
2627
private:
27-
YDB_READONLY(ui32, Start, 0);
28+
YDB_READONLY(ui32, StartExt, 0);
29+
YDB_READONLY(ui32, StartInt, 0);
2830
YDB_READONLY(ui32, Size, 0);
2931
YDB_READONLY(bool, IsDefault, false);
3032

3133
public:
32-
TInternalChunkInfo(const ui32 start, const ui32 size, const bool defaultFlag)
33-
: Start(start)
34+
TInternalChunkInfo(const ui32 startExt, const ui32 startInt, const ui32 size, const bool defaultFlag)
35+
: StartExt(startExt)
36+
, StartInt(startInt)
3437
, Size(size)
3538
, IsDefault(defaultFlag) {
3639
AFL_VERIFY(Size);
3740
}
41+
42+
bool operator<(const TInternalChunkInfo& item) const {
43+
return StartExt < item.StartExt;
44+
}
3845
};
3946

40-
std::map<ui32, TInternalChunkInfo> RemapExternalToInternal;
47+
std::vector<TInternalChunkInfo> RemapExternalToInternal;
4148

4249
public:
4350
ui32 GetFinishPosition() const {
@@ -87,8 +94,7 @@ class TSparsedArray: public IChunkedArray {
8794
virtual std::vector<TChunkedArraySerialized> DoSplitBySizes(
8895
const TColumnSaver& saver, const TString& fullSerializedData, const std::vector<ui64>& splitSizes) override;
8996

90-
virtual TLocalDataAddress DoGetLocalData(
91-
const std::optional<TCommonChunkAddress>& chunkCurrent, const ui64 position) const override {
97+
virtual TLocalDataAddress DoGetLocalData(const std::optional<TCommonChunkAddress>& chunkCurrent, const ui64 position) const override {
9298
ui32 currentIdx = 0;
9399
for (ui32 i = 0; i < Records.size(); ++i) {
94100
if (currentIdx <= position && position < currentIdx + Records[i].GetRecordsCount()) {
@@ -115,38 +121,48 @@ class TSparsedArray: public IChunkedArray {
115121
return bytes;
116122
}
117123

118-
TSparsedArray(std::vector<TSparsedArrayChunk>&& data, const std::shared_ptr<arrow::Scalar>& /*defaultValue*/,
124+
TSparsedArray(std::vector<TSparsedArrayChunk>&& data, const std::shared_ptr<arrow::Scalar>& defaultValue,
119125
const std::shared_ptr<arrow::DataType>& type, const ui32 recordsCount)
120126
: TBase(recordsCount, EType::SparsedArray, type)
127+
, DefaultValue(defaultValue)
121128
, Records(std::move(data)) {
122129
}
123130

124131
static ui32 GetLastIndex(const std::shared_ptr<arrow::RecordBatch>& batch);
125132

133+
static std::shared_ptr<arrow::Schema> BuildSchema(const std::shared_ptr<arrow::DataType>& type) {
134+
std::vector<std::shared_ptr<arrow::Field>> fields = { std::make_shared<arrow::Field>("index", arrow::uint32()),
135+
std::make_shared<arrow::Field>("value", type) };
136+
return std::make_shared<arrow::Schema>(fields);
137+
}
138+
139+
static TSparsedArrayChunk MakeDefaultChunk(
140+
const std::shared_ptr<arrow::Scalar>& defaultValue, const std::shared_ptr<arrow::DataType>& type, const ui32 recordsCount);
141+
126142
public:
127143
TSparsedArray(const IChunkedArray& defaultArray, const std::shared_ptr<arrow::Scalar>& defaultValue);
128-
TSparsedArray(const std::shared_ptr<arrow::Scalar>& defaultValue,
129-
const std::shared_ptr<arrow::DataType>& type, const ui32 recordsCount)
130-
: TSparsedArray({}, defaultValue, type, recordsCount)
131-
{
132-
144+
145+
TSparsedArray(const std::shared_ptr<arrow::Scalar>& defaultValue, const std::shared_ptr<arrow::DataType>& type, const ui32 recordsCount)
146+
: TBase(recordsCount, EType::SparsedArray, type)
147+
, DefaultValue(defaultValue) {
148+
Records.emplace_back(MakeDefaultChunk(defaultValue, type, recordsCount));
133149
}
134150

135151
virtual std::shared_ptr<arrow::Scalar> DoGetScalar(const ui32 index) const override {
136-
auto chunk = GetSparsedChunk(index);
152+
auto& chunk = GetSparsedChunk(index);
137153
return chunk.GetScalar(index - chunk.GetStartPosition());
138154
}
139155

140-
TSparsedArrayChunk GetSparsedChunk(const ui64 position) const {
141-
ui32 currentIdx = 0;
142-
for (ui32 i = 0; i < Records.size(); ++i) {
143-
if (currentIdx <= position && position < currentIdx + Records[i].GetRecordsCount()) {
144-
return Records[i];
145-
}
146-
currentIdx += Records[i].GetRecordsCount();
147-
}
148-
AFL_VERIFY(false);
149-
return Records.back();
156+
const TSparsedArrayChunk& GetSparsedChunk(const ui64 position) const {
157+
const auto pred = [](const ui64 position, const TSparsedArrayChunk& item) {
158+
return position < item.GetStartPosition();
159+
};
160+
auto it = std::upper_bound(Records.begin(), Records.end(), position, pred);
161+
AFL_VERIFY(it != Records.begin());
162+
--it;
163+
AFL_VERIFY(position < it->GetStartPosition() + it->GetRecordsCount());
164+
AFL_VERIFY(it->GetStartPosition() <= position);
165+
return *it;
150166
}
151167

152168
class TBuilder {

ydb/core/formats/arrow/arrow_helpers.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,38 @@ bool ScalarLess(const arrow::Scalar& x, const arrow::Scalar& y) {
589589
return ScalarCompare(x, y) < 0;
590590
}
591591

592+
bool ColumnEqualsScalar(
593+
const std::shared_ptr<arrow::Array>& c, const ui32 position, const std::shared_ptr<arrow::Scalar>& s) {
594+
AFL_VERIFY(c);
595+
if (!s) {
596+
return c->IsNull(position) ;
597+
}
598+
AFL_VERIFY(c->type()->Equals(s->type))("s", s->type->ToString())("c", c->type()->ToString());
599+
600+
return SwitchTypeImpl<bool, 0>(c->type()->id(), [&](const auto& type) {
601+
using TWrap = std::decay_t<decltype(type)>;
602+
using TScalar = typename arrow::TypeTraits<typename TWrap::T>::ScalarType;
603+
using TArrayType = typename arrow::TypeTraits<typename TWrap::T>::ArrayType;
604+
using TValue = std::decay_t<decltype(static_cast<const TScalar&>(*s).value)>;
605+
606+
if constexpr (arrow::has_string_view<typename TWrap::T>()) {
607+
const auto& cval = static_cast<const TArrayType&>(*c).GetView(position);
608+
const auto& sval = static_cast<const TScalar&>(*s).value;
609+
AFL_VERIFY(sval);
610+
TStringBuf cBuf(reinterpret_cast<const char*>(cval.data()), cval.size());
611+
TStringBuf sBuf(reinterpret_cast<const char*>(sval->data()), sval->size());
612+
return cBuf == sBuf;
613+
}
614+
if constexpr (std::is_arithmetic_v<TValue>) {
615+
const auto cval = static_cast<const TArrayType&>(*c).GetView(position);
616+
const auto sval = static_cast<const TScalar&>(*s).value;
617+
return (cval == sval);
618+
}
619+
Y_ABORT_UNLESS(false); // TODO: non primitive types
620+
return false;
621+
});
622+
}
623+
592624
int ScalarCompare(const arrow::Scalar& x, const arrow::Scalar& y) {
593625
Y_VERIFY_S(x.type->Equals(y.type), x.type->ToString() + " vs " + y.type->ToString());
594626

ydb/core/formats/arrow/arrow_helpers.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,10 @@ bool IsGoodScalar(const std::shared_ptr<arrow::Scalar>& x);
9999
int ScalarCompare(const arrow::Scalar& x, const arrow::Scalar& y);
100100
int ScalarCompare(const std::shared_ptr<arrow::Scalar>& x, const std::shared_ptr<arrow::Scalar>& y);
101101
int ScalarCompareNullable(const std::shared_ptr<arrow::Scalar>& x, const std::shared_ptr<arrow::Scalar>& y);
102-
std::partial_ordering ColumnsCompare(const std::vector<std::shared_ptr<arrow::Array>>& x, const ui32 xRow, const std::vector<std::shared_ptr<arrow::Array>>& y, const ui32 yRow);
102+
std::partial_ordering ColumnsCompare(
103+
const std::vector<std::shared_ptr<arrow::Array>>& x, const ui32 xRow, const std::vector<std::shared_ptr<arrow::Array>>& y, const ui32 yRow);
104+
bool ColumnEqualsScalar(
105+
const std::shared_ptr<arrow::Array>& c, const ui32 position, const std::shared_ptr<arrow::Scalar>& s);
103106
bool ScalarLess(const std::shared_ptr<arrow::Scalar>& x, const std::shared_ptr<arrow::Scalar>& y);
104107
bool ScalarLess(const arrow::Scalar& x, const arrow::Scalar& y);
105108
std::shared_ptr<arrow::RecordBatch> ReallocateBatch(std::shared_ptr<arrow::RecordBatch> original);

ydb/core/tx/columnshard/engines/changes/compaction/abstract/merger.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ void IColumnMerger::Start(const std::vector<std::shared_ptr<NArrow::NAccessor::I
66
AFL_VERIFY(!Started);
77
Started = true;
88
for (auto&& i : input) {
9+
if (!i) {
10+
continue;
11+
}
912
AFL_VERIFY(i->GetDataType()->id() == Context.GetResultField()->type()->id())("input", i->GetDataType()->ToString())(
1013
"result", Context.GetResultField()->ToString());
1114
}

ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ std::vector<TWritePortionInfoWithBlobsResult> TMerger::Execute(const std::shared
3131

3232
ui32 idx = 0;
3333
for (auto&& batch : Batches) {
34-
AFL_VERIFY(batch->GetColumnsCount() == resultFiltered->GetColumnsCount())("data", batch->GetColumnsCount())(
35-
"schema", resultFiltered->GetColumnsCount());
3634
{
3735
NArrow::NConstruction::IArrayBuilder::TPtr column =
3836
std::make_shared<NArrow::NConstruction::TSimpleArrayConstructor<NArrow::NConstruction::TIntConstFiller<arrow::UInt16Type>>>(
@@ -53,9 +51,31 @@ std::vector<TWritePortionInfoWithBlobsResult> TMerger::Execute(const std::shared
5351

5452
std::vector<std::map<ui32, std::vector<TColumnPortionResult>>> chunkGroups;
5553
chunkGroups.resize(batchResults.size());
56-
for (auto&& columnId : resultFiltered->GetColumnIds()) {
57-
NActors::TLogContextGuard logGuard(
58-
NActors::TLogContextBuilder::Build()("field_name", resultFiltered->GetIndexInfo().GetColumnName(columnId)));
54+
55+
using TColumnData = std::vector<std::shared_ptr<NArrow::NAccessor::IChunkedArray>>;
56+
THashMap<ui32, TColumnData> columnsData;
57+
{
58+
ui32 batchIdx = 0;
59+
for (auto&& p : Batches) {
60+
ui32 columnIdx = 0;
61+
for (auto&& i : p->GetSchema()->GetFields()) {
62+
const std::optional<ui32> columnId = resultFiltered->GetIndexInfo().GetColumnIdOptional(i->name());
63+
if (columnId) {
64+
auto it = columnsData.find(*columnId);
65+
if (it == columnsData.end()) {
66+
it = columnsData.emplace(*columnId, TColumnData(Batches.size())).first;
67+
}
68+
it->second[batchIdx] = p->GetColumnVerified(columnIdx);
69+
}
70+
++columnIdx;
71+
}
72+
++batchIdx;
73+
}
74+
}
75+
76+
for (auto&& [columnId, columnData] : columnsData) {
77+
const TString& columnName = resultFiltered->GetIndexInfo().GetColumnName(columnId);
78+
NActors::TLogContextGuard logGuard(NActors::TLogContextBuilder::Build()("field_name", columnName));
5979
auto columnInfo = stats->GetColumnInfo(columnId);
6080

6181
TColumnMergeContext commonContext(
@@ -72,16 +92,7 @@ std::vector<TWritePortionInfoWithBlobsResult> TMerger::Execute(const std::shared
7292
AFL_VERIFY(!!merger)("problem", "cannot create merger")(
7393
"class_name", commonContext.GetLoader()->GetAccessorConstructor().GetClassName());
7494

75-
{
76-
std::vector<std::shared_ptr<NArrow::NAccessor::IChunkedArray>> parts;
77-
for (auto&& p : Batches) {
78-
parts.emplace_back(p->GetColumnVerified(resultFiltered->GetFieldIndex(columnId)));
79-
}
80-
81-
merger->Start(parts);
82-
}
83-
84-
std::map<std::string, std::vector<NCompaction::TColumnPortionResult>> columnChunks;
95+
merger->Start(columnData);
8596
ui32 batchIdx = 0;
8697
for (auto&& batchResult : batchResults) {
8798
const ui32 portionRecordsCountLimit =

0 commit comments

Comments
 (0)