Skip to content

Commit 2f62452

Browse files
bloom ngramms speed up (#12982)
1 parent 24d1f57 commit 2f62452

File tree

28 files changed

+345
-154
lines changed

28 files changed

+345
-154
lines changed

ydb/core/formats/arrow/hash/calcer.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99

1010
namespace NKikimr::NArrow::NHash {
1111

12-
void TXX64::AppendField(const std::shared_ptr<arrow::Scalar>& scalar, NXX64::TStreamStringHashCalcer& hashCalcer) {
12+
namespace {
13+
template <class TStreamCalcer>
14+
void AppendFieldImpl(const std::shared_ptr<arrow::Scalar>& scalar, TStreamCalcer& hashCalcer) {
1315
AFL_VERIFY(scalar);
1416
NArrow::SwitchType(scalar->type->id(), [&](const auto& type) {
1517
using TWrap = std::decay_t<decltype(type)>;
@@ -28,7 +30,8 @@ void TXX64::AppendField(const std::shared_ptr<arrow::Scalar>& scalar, NXX64::TSt
2830
});
2931
}
3032

31-
void TXX64::AppendField(const std::shared_ptr<arrow::Array>& array, const int row, NArrow::NHash::NXX64::TStreamStringHashCalcer& hashCalcer) {
33+
template <class TStreamCalcer>
34+
void AppendFieldImpl(const std::shared_ptr<arrow::Array>& array, const int row, TStreamCalcer& hashCalcer) {
3235
NArrow::SwitchType(array->type_id(), [&](const auto& type) {
3336
using TWrap = std::decay_t<decltype(type)>;
3437
using T = typename TWrap::T;
@@ -49,6 +52,24 @@ void TXX64::AppendField(const std::shared_ptr<arrow::Array>& array, const int ro
4952
});
5053
}
5154

55+
} // namespace
56+
57+
void TXX64::AppendField(const std::shared_ptr<arrow::Scalar>& scalar, NXX64::TStreamStringHashCalcer& hashCalcer) {
58+
AppendFieldImpl(scalar, hashCalcer);
59+
}
60+
61+
void TXX64::AppendField(const std::shared_ptr<arrow::Scalar>& scalar, NXX64::TStreamStringHashCalcer_H3& hashCalcer) {
62+
AppendFieldImpl(scalar, hashCalcer);
63+
}
64+
65+
void TXX64::AppendField(const std::shared_ptr<arrow::Array>& array, const int row, NArrow::NHash::NXX64::TStreamStringHashCalcer& hashCalcer) {
66+
AppendFieldImpl(array, row, hashCalcer);
67+
}
68+
69+
void TXX64::AppendField(const std::shared_ptr<arrow::Array>& array, const int row, NArrow::NHash::NXX64::TStreamStringHashCalcer_H3& hashCalcer) {
70+
AppendFieldImpl(array, row, hashCalcer);
71+
}
72+
5273
std::optional<std::vector<ui64>> TXX64::Execute(const std::shared_ptr<arrow::RecordBatch>& batch) const {
5374
std::vector<std::shared_ptr<arrow::Array>> columns = GetColumns(batch);
5475
if (columns.empty()) {

ydb/core/formats/arrow/hash/calcer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ class TXX64 {
6767

6868
static void AppendField(const std::shared_ptr<arrow::Array>& array, const int row, NXX64::TStreamStringHashCalcer& hashCalcer);
6969
static void AppendField(const std::shared_ptr<arrow::Scalar>& scalar, NXX64::TStreamStringHashCalcer& hashCalcer);
70+
static void AppendField(const std::shared_ptr<arrow::Array>& array, const int row, NXX64::TStreamStringHashCalcer_H3& hashCalcer);
71+
static void AppendField(const std::shared_ptr<arrow::Scalar>& scalar, NXX64::TStreamStringHashCalcer_H3& hashCalcer);
7072
static ui64 CalcHash(const std::shared_ptr<arrow::Scalar>& scalar);
7173
std::optional<std::vector<ui64>> Execute(const std::shared_ptr<arrow::RecordBatch>& batch) const;
7274

ydb/core/formats/arrow/permutations.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
#include <ydb/library/actors/core/log.h>
1212

1313
#include <contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h>
14-
#include <contrib/libs/xxhash/xxhash.h>
1514

1615
namespace NKikimr::NArrow {
1716

ydb/core/kqp/ut/olap/indexes_ut.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -482,7 +482,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
482482
}
483483
{
484484
ResetZeroLevel(csController);
485-
ui32 requestsCount = 100;
485+
ui32 requestsCount = 300;
486486
for (ui32 i = 0; i < requestsCount; ++i) {
487487
const ui32 idx = RandomNumber<ui32>(uids.size());
488488
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
@@ -494,12 +494,12 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
494494
};
495495
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
496496
}
497-
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)("approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
497+
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)("approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
498498
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
499499
}
500500
{
501501
ResetZeroLevel(csController);
502-
ui32 requestsCount = 100;
502+
ui32 requestsCount = 300;
503503
for (ui32 i = 0; i < requestsCount; ++i) {
504504
const ui32 idx = RandomNumber<ui32>(uids.size());
505505
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
@@ -511,13 +511,13 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
511511
};
512512
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
513513
}
514-
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)(
514+
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)(
515515
"approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
516516
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
517517
}
518518
{
519519
ResetZeroLevel(csController);
520-
ui32 requestsCount = 100;
520+
ui32 requestsCount = 300;
521521
for (ui32 i = 0; i < requestsCount; ++i) {
522522
const ui32 idx = RandomNumber<ui32>(uids.size());
523523
const auto query = [](const TString& res, const TString& uid, const ui32 level) {
@@ -529,7 +529,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
529529
};
530530
ExecuteSQL(query(resourceIds[idx], uids[idx], levels[idx]), "[[1u;]]");
531531
}
532-
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart > 1)(
532+
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() - SkipStart)(
533533
"approved", csController->GetIndexesApprovedOnSelect().Val() - ApproveStart)(
534534
"skipped", csController->GetIndexesSkippingOnSelect().Val() - SkipStart);
535535
}

ydb/core/tx/columnshard/engines/changes/compaction/merger.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ std::vector<TWritePortionInfoWithBlobsResult> TMerger::Execute(const std::shared
147147
for (auto&& i : packs) {
148148
TGeneralSerializedSlice slicePrimary(std::move(i));
149149
auto dataWithSecondary = resultFiltered->GetIndexInfo()
150-
.AppendIndexes(slicePrimary.GetPortionChunksToHash(), SaverContext.GetStoragesManager())
150+
.AppendIndexes(slicePrimary.GetPortionChunksToHash(), SaverContext.GetStoragesManager(), slicePrimary.GetRecordsCount())
151151
.DetachResult();
152152
TGeneralSerializedSlice slice(dataWithSecondary.GetExternalData(), schemaDetails, Context.Counters.SplitterCounters);
153153

ydb/core/tx/columnshard/engines/portions/read_with_blobs.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ std::optional<TWritePortionInfoWithBlobsResult> TReadPortionInfoWithBlobs::SyncP
124124
TIndexInfo::TSecondaryData secondaryData;
125125
secondaryData.MutableExternalData() = entityChunksNew;
126126
for (auto&& i : to->GetIndexInfo().GetIndexes()) {
127-
to->GetIndexInfo().AppendIndex(entityChunksNew, i.first, storages, secondaryData).Validate();
127+
to->GetIndexInfo().AppendIndex(entityChunksNew, i.first, storages, source.PortionInfo.GetPortionInfo().GetRecordsCount(), secondaryData).Validate();
128128
}
129129

130130
const NSplitter::TEntityGroups groups = source.PortionInfo.GetPortionInfo().GetEntityGroupsByStorageId(targetTier, *storages, to->GetIndexInfo());

ydb/core/tx/columnshard/engines/reader/common_reader/iterator/fetching.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@ class TFetchingStepSignals: public NColumnShard::TCommonCountersOwner {
2727
public:
2828
TFetchingStepSignals(NColumnShard::TCommonCountersOwner&& owner)
2929
: TBase(std::move(owner))
30-
, DurationCounter(TBase::GetDeriviative("duration_ms"))
31-
, BytesCounter(TBase::GetDeriviative("bytes_ms")) {
30+
, DurationCounter(TBase::GetDeriviative("Duration/Us"))
31+
, BytesCounter(TBase::GetDeriviative("Bytes/Count")) {
3232
}
3333

3434
void AddDuration(const TDuration d) const {
35-
DurationCounter->Add(d.MilliSeconds());
35+
DurationCounter->Add(d.MicroSeconds());
3636
}
3737

3838
void AddBytes(const ui32 v) const {
@@ -56,7 +56,7 @@ class TFetchingStepsSignalsCollection: public NColumnShard::TCommonCountersOwner
5656

5757
public:
5858
TFetchingStepsSignalsCollection()
59-
: TBase("scan_steps") {
59+
: TBase("ScanSteps") {
6060
}
6161

6262
static TFetchingStepSignals GetSignals(const TString& name) {

ydb/core/tx/columnshard/engines/scheme/index_info.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -412,11 +412,11 @@ std::shared_ptr<arrow::Scalar> TIndexInfo::GetColumnExternalDefaultValueVerified
412412
}
413413

414414
NKikimr::TConclusionStatus TIndexInfo::AppendIndex(const THashMap<ui32, std::vector<std::shared_ptr<IPortionDataChunk>>>& originalData,
415-
const ui32 indexId, const std::shared_ptr<IStoragesManager>& operators, TSecondaryData& result) const {
415+
const ui32 indexId, const std::shared_ptr<IStoragesManager>& operators, const ui32 recordsCount, TSecondaryData& result) const {
416416
auto it = Indexes.find(indexId);
417417
AFL_VERIFY(it != Indexes.end());
418418
auto& index = it->second;
419-
std::shared_ptr<IPortionDataChunk> chunk = index->BuildIndex(originalData, *this);
419+
std::shared_ptr<IPortionDataChunk> chunk = index->BuildIndex(originalData, recordsCount, *this);
420420
auto opStorage = operators->GetOperatorVerified(index->GetStorageId());
421421
if ((i64)chunk->GetPackedSize() > opStorage->GetBlobSplitSettings().GetMaxBlobSize()) {
422422
return TConclusionStatus::Fail("blob size for secondary data (" + ::ToString(indexId) + ") bigger than limit (" +

ydb/core/tx/columnshard/engines/scheme/index_info.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -313,11 +313,11 @@ struct TIndexInfo: public IIndexInfo {
313313
};
314314

315315
[[nodiscard]] TConclusion<TSecondaryData> AppendIndexes(const THashMap<ui32, std::vector<std::shared_ptr<IPortionDataChunk>>>& primaryData,
316-
const std::shared_ptr<IStoragesManager>& operators) const {
316+
const std::shared_ptr<IStoragesManager>& operators, const ui32 recordsCount) const {
317317
TSecondaryData result;
318318
result.MutableExternalData() = primaryData;
319319
for (auto&& i : Indexes) {
320-
auto conclusion = AppendIndex(primaryData, i.first, operators, result);
320+
auto conclusion = AppendIndex(primaryData, i.first, operators, recordsCount, result);
321321
if (conclusion.IsFail()) {
322322
return conclusion;
323323
}
@@ -329,7 +329,7 @@ struct TIndexInfo: public IIndexInfo {
329329
std::shared_ptr<NIndexes::NCountMinSketch::TIndexMeta> GetIndexMetaCountMinSketch(const std::set<ui32>& columnIds) const;
330330

331331
[[nodiscard]] TConclusionStatus AppendIndex(const THashMap<ui32, std::vector<std::shared_ptr<IPortionDataChunk>>>& originalData,
332-
const ui32 indexId, const std::shared_ptr<IStoragesManager>& operators, TSecondaryData& result) const;
332+
const ui32 indexId, const std::shared_ptr<IStoragesManager>& operators, const ui32 recordsCount, TSecondaryData& result) const;
333333

334334
/// Returns an id of the column located by name. The name should exists in the schema.
335335
ui32 GetColumnIdVerified(const std::string& name) const;

ydb/core/tx/columnshard/engines/scheme/indexes/abstract/meta.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ class IIndexMeta {
3131
YDB_READONLY(ui32, IndexId, 0);
3232
YDB_READONLY(TString, StorageId, IStoragesManager::DefaultStorageId);
3333
protected:
34-
virtual std::shared_ptr<IPortionDataChunk> DoBuildIndex(const THashMap<ui32, std::vector<std::shared_ptr<IPortionDataChunk>>>& data, const TIndexInfo& indexInfo) const = 0;
34+
virtual std::shared_ptr<IPortionDataChunk> DoBuildIndex(const THashMap<ui32, std::vector<std::shared_ptr<IPortionDataChunk>>>& data,
35+
const ui32 recordsCount, const TIndexInfo& indexInfo) const = 0;
3536
virtual void DoFillIndexCheckers(const std::shared_ptr<NRequest::TDataForIndexesCheckers>& info, const NSchemeShard::TOlapSchema& schema) const = 0;
3637
virtual bool DoDeserializeFromProto(const NKikimrSchemeOp::TOlapIndexDescription& proto) = 0;
3738
virtual void DoSerializeToProto(NKikimrSchemeOp::TOlapIndexDescription& proto) const = 0;
@@ -67,8 +68,8 @@ class IIndexMeta {
6768

6869
virtual ~IIndexMeta() = default;
6970

70-
std::shared_ptr<IPortionDataChunk> BuildIndex(const THashMap<ui32, std::vector<std::shared_ptr<IPortionDataChunk>>>& data, const TIndexInfo& indexInfo) const {
71-
return DoBuildIndex(data, indexInfo);
71+
std::shared_ptr<IPortionDataChunk> BuildIndex(const THashMap<ui32, std::vector<std::shared_ptr<IPortionDataChunk>>>& data, const ui32 recordsCount, const TIndexInfo& indexInfo) const {
72+
return DoBuildIndex(data, recordsCount, indexInfo);
7273
}
7374

7475
void FillIndexCheckers(const std::shared_ptr<NRequest::TDataForIndexesCheckers>& info, const NSchemeShard::TOlapSchema& schema) const {

0 commit comments

Comments
 (0)