Skip to content

Commit 555ebf3

Browse files
improve chunks splitter performance (#7665)
1 parent: 3734eda · commit: 555ebf3

File tree

1 file changed

+20 additions, -8 deletions

ydb/core/formats/arrow/arrow_helpers.cpp

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -934,25 +934,37 @@ std::vector<std::shared_ptr<arrow::RecordBatch>> SliceToRecordBatches(const std:
934934
}
935935
std::sort(positions.begin(), positions.end());
936936
positions.erase(std::unique(positions.begin(), positions.end()), positions.end());
937-
937+
AFL_VERIFY(positions.size() > 1)("size", positions.size())("positions", JoinSeq(",", positions));
938938
std::vector<std::vector<std::shared_ptr<arrow::Array>>> slicedData;
939939
slicedData.resize(positions.size() - 1);
940-
{
941-
for (auto&& i : t->columns()) {
942-
for (ui32 idx = 0; idx + 1 < positions.size(); ++idx) {
943-
auto slice = i->Slice(positions[idx], positions[idx + 1] - positions[idx]);
944-
AFL_VERIFY(slice->num_chunks() == 1);
945-
slicedData[idx].emplace_back(slice->chunks().front());
940+
for (auto&& i : t->columns()) {
941+
ui32 currentPosition = 0;
942+
auto it = i->chunks().begin();
943+
ui32 length = (*it)->length();
944+
for (ui32 idx = 0; idx + 1 < positions.size(); ++idx) {
945+
auto chunk = (*it)->Slice(positions[idx] - currentPosition, positions[idx + 1] - positions[idx]);
946+
AFL_VERIFY_DEBUG(chunk->length() == positions[idx + 1] - positions[idx])("length", chunk->length())(
947+
"delta", positions[idx + 1] - positions[idx]);
948+
AFL_VERIFY_DEBUG(chunk->length())("delta", positions[idx + 1] - positions[idx]);
949+
if (positions[idx + 1] - currentPosition == length) {
950+
if (++it != i->chunks().end()) {
951+
length = (*it)->length();
952+
}
953+
currentPosition = positions[idx + 1];
946954
}
955+
slicedData[idx].emplace_back(chunk);
947956
}
948957
}
949958
std::vector<std::shared_ptr<arrow::RecordBatch>> result;
950959
ui32 count = 0;
951960
for (auto&& i : slicedData) {
961+
AFL_VERIFY_DEBUG(i.size());
962+
AFL_VERIFY_DEBUG(i.front()->length());
952963
result.emplace_back(arrow::RecordBatch::Make(t->schema(), i.front()->length(), i));
953964
count += result.back()->num_rows();
954965
}
955-
AFL_VERIFY(count == t->num_rows())("count", count)("t", t->num_rows());
966+
AFL_VERIFY(count == t->num_rows())("count", count)("t", t->num_rows())("sd_size", slicedData.size())("columns", t->num_columns())(
967+
"schema", t->schema()->ToString());
956968
return result;
957969
}
958970

0 commit comments

Comments
 (0)