Skip to content

Commit 1215fa4

Browse files
indexes data extractor for providing subcolumns info (#15175)
1 parent d08419b commit 1215fa4

File tree

43 files changed

+1044
-346
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1044
-346
lines changed

.github/config/muted_ya.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ ydb/core/kqp/ut/olap KqpOlapBlobsSharing.MultipleSplitsWithRestartsWhenWait
3939
ydb/core/kqp/ut/olap KqpOlapBlobsSharing.TableReshardingConsistency64
4040
ydb/core/kqp/ut/olap KqpOlapBlobsSharing.TableReshardingModuloN
4141
ydb/core/kqp/ut/olap KqpOlapBlobsSharing.UpsertWhileSplitTest
42+
ydb/core/kqp/ut/olap KqpOlapJson.BloomIndexesVariants
4243
ydb/core/kqp/ut/olap KqpOlapSysView.StatsSysViewBytesDictActualization
4344
ydb/core/kqp/ut/olap KqpOlapSysView.StatsSysViewBytesDictStatActualization
4445
ydb/core/kqp/ut/olap KqpOlapWrite.TierDraftsGCWithRestart

ydb/core/formats/arrow/accessor/sub_columns/stats.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,14 @@ class TDictStats {
2424
std::shared_ptr<arrow::UInt8Array> AccessorType;
2525

2626
public:
27+
ui32 GetFilledValuesCount() const {
28+
ui32 result = 0;
29+
for (ui32 i = 0; i < (ui32)DataRecordsCount->length(); ++i) {
30+
result += DataRecordsCount->Value(i);
31+
}
32+
return result;
33+
}
34+
2735
NJson::TJsonValue DebugJson() const {
2836
NJson::TJsonValue result = NJson::JSON_MAP;
2937
result.InsertValue("key_names", NArrow::DebugJson(DataNames, 1000000, 1000000)["data"]);

ydb/core/kqp/ut/olap/indexes_ut.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
9393
auto alterQuery =
9494
TStringBuilder() <<
9595
R"(ALTER OBJECT `/Root/olapStore` (TYPE TABLESTORE) SET (ACTION=UPSERT_INDEX, NAME=index_resource_id, TYPE=BLOOM_FILTER,
96-
FEATURES=`{"column_names" : ["resource_id", "level"], "false_positive_probability" : 0.05}`);
96+
FEATURES=`{"column_names" : ["resource_id"], "false_positive_probability" : 0.05}`);
9797
)";
9898
auto session = tableClient.CreateSession().GetValueSync().GetSession();
9999
auto alterResult = session.ExecuteSchemeQuery(alterQuery).GetValueSync();
@@ -128,6 +128,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
128128
Cerr << csController->GetIndexesSkippingOnSelect().Val() << " / " << csController->GetIndexesApprovedOnSelect().Val() << Endl;
129129
CompareYson(result, R"([[0u;]])");
130130
AFL_VERIFY(csController->GetIndexesSkippedNoData().Val() == 0);
131+
AFL_VERIFY(csController->GetIndexesApprovedOnSelect().Val() == 0);
131132
AFL_VERIFY(csController->GetIndexesApprovedOnSelect().Val() < csController->GetIndexesSkippingOnSelect().Val())
132133
("approve", csController->GetIndexesApprovedOnSelect().Val())("skip", csController->GetIndexesSkippingOnSelect().Val());
133134
}
@@ -432,7 +433,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
432433
auto alterQuery =
433434
TStringBuilder() << Sprintf(
434435
R"(ALTER OBJECT `/Root/olapStore` (TYPE TABLESTORE) SET (ACTION=UPSERT_INDEX, NAME=index_resource_id, TYPE=BLOOM_FILTER,
435-
FEATURES=`{"column_names" : ["resource_id", "level"], "false_positive_probability" : 0.05, "storage_id" : "%s"}`);
436+
FEATURES=`{"column_names" : ["resource_id"], "false_positive_probability" : 0.05, "storage_id" : "%s"}`);
436437
)",
437438
StorageId.data());
438439
auto session = tableClient.CreateSession().GetValueSync().GetSession();
@@ -474,7 +475,7 @@ Y_UNIT_TEST_SUITE(KqpOlapIndexes) {
474475

475476
AFL_VERIFY(csController->GetIndexesSkippingOnSelect().Val() == 0);
476477
AFL_VERIFY(csController->GetIndexesApprovedOnSelect().Val() == 0);
477-
csController->WaitCompactions(TDuration::Seconds(25));
478+
csController->WaitCompactions(TDuration::Seconds(5));
478479
// important checker for control compactions (<=21) and control indexes constructed (>=21)
479480
AFL_VERIFY(csController->GetCompactionStartedCounter().Val() == 3)("count", csController->GetCompactionStartedCounter().Val());
480481

ydb/core/kqp/ut/olap/json_ut.cpp

Lines changed: 119 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,21 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
6565

6666
class TSelectCommand: public ICommand {
6767
private:
68-
const TString Command;
69-
const TString Compare;
68+
TString Command;
69+
TString Compare;
70+
std::optional<ui64> ExpectIndexSkip;
71+
std::optional<ui64> ExpectIndexNoData;
72+
std::optional<ui64> ExpectIndexApprove;
73+
ui64 IndexSkipStart = 0;
74+
ui64 IndexNoDataStart = 0;
75+
ui64 IndexApproveStart = 0;
76+
7077
virtual TConclusionStatus DoExecute(TKikimrRunner& kikimr) override {
78+
auto controller = NYDBTest::TControllers::GetControllerAs<NYDBTest::NColumnShard::TController>();
79+
AFL_VERIFY(controller);
80+
IndexSkipStart = controller->GetIndexesSkippingOnSelect().Val();
81+
IndexApproveStart = controller->GetIndexesApprovedOnSelect().Val();
82+
IndexNoDataStart = controller->GetIndexesSkippedNoData().Val();
7183
Cerr << "EXECUTE: " << Command << Endl;
7284
auto session = kikimr.GetTableClient().CreateSession().GetValueSync().GetSession();
7385
auto it = kikimr.GetQueryClient().StreamExecuteQuery(Command, NYdb::NQuery::TTxControl::BeginTx().CommitTx()).ExtractValueSync();
@@ -78,14 +90,76 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
7890
Cerr << "OUTPUT: " << output << Endl;
7991
CompareYson(output, Compare);
8092
}
93+
const ui32 skip = controller->GetIndexesSkippingOnSelect().Val() - IndexSkipStart;
94+
const ui32 noData = controller->GetIndexesSkippedNoData().Val() - IndexNoDataStart;
95+
const ui32 approves = controller->GetIndexesApprovedOnSelect().Val() - IndexApproveStart;
96+
Cerr << noData << "/" << skip << "/" << approves << Endl;
97+
if (ExpectIndexSkip) {
98+
AFL_VERIFY(skip == *ExpectIndexSkip)("expect", ExpectIndexSkip)("real", skip)(
99+
"current", controller->GetIndexesSkippingOnSelect().Val())(
100+
"pred", IndexSkipStart);
101+
}
102+
if (ExpectIndexNoData) {
103+
AFL_VERIFY(noData == *ExpectIndexNoData)("expect", ExpectIndexNoData)("real", noData)(
104+
"current", controller->GetIndexesSkippedNoData().Val())(
105+
"pred", IndexNoDataStart);
106+
}
107+
if (ExpectIndexApprove) {
108+
AFL_VERIFY(approves == *ExpectIndexApprove)("expect", ExpectIndexApprove)("real", approves)(
109+
"current", controller->GetIndexesApprovedOnSelect().Val())("pred", IndexApproveStart);
110+
}
81111
return TConclusionStatus::Success();
82112
}
83113

84114
public:
85-
TSelectCommand(const TString& command, const TString& compare)
86-
: Command(command)
87-
, Compare(compare) {
115+
bool DeserializeFromString(const TString& info) {
116+
auto lines = StringSplitter(info).SplitBySet("\n").ToList<TString>();
117+
std::optional<ui32> state;
118+
for (auto&& l : lines) {
119+
l = Strip(l);
120+
if (l.StartsWith("READ:")) {
121+
l = l.substr(5);
122+
state = 0;
123+
} else if (l.StartsWith("EXPECTED:")) {
124+
l = l.substr(9);
125+
state = 1;
126+
} else if (l.StartsWith("IDX_ND_SKIP_APPROVE:")) {
127+
state = 2;
128+
l = l.substr(20);
129+
} else {
130+
AFL_VERIFY(state)("line", l);
131+
}
132+
133+
if (*state == 0) {
134+
Command += l;
135+
} else if (*state == 1) {
136+
Compare += l;
137+
} else if (*state == 2) {
138+
auto idxExpectations = StringSplitter(l).SplitBySet(" ,.;").SkipEmpty().ToList<TString>();
139+
AFL_VERIFY(idxExpectations.size() == 3)("size", idxExpectations.size())("string", l);
140+
if (idxExpectations[0] != "{}") {
141+
ui32 res;
142+
AFL_VERIFY(TryFromString<ui32>(idxExpectations[0], res))("string", l);
143+
ExpectIndexNoData = res;
144+
}
145+
if (idxExpectations[1] != "{}") {
146+
ui32 res;
147+
AFL_VERIFY(TryFromString<ui32>(idxExpectations[1], res))("string", l);
148+
ExpectIndexSkip = res;
149+
}
150+
if (idxExpectations[2] != "{}") {
151+
ui32 res;
152+
AFL_VERIFY(TryFromString<ui32>(idxExpectations[2], res))("string", l);
153+
ExpectIndexApprove = res;
154+
}
155+
} else {
156+
AFL_VERIFY(false)("line", l);
157+
}
158+
}
159+
return true;
88160
}
161+
162+
TSelectCommand() = default;
89163
};
90164

91165
class TStopCompactionCommand: public ICommand {
@@ -167,9 +241,7 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
167241

168242
public:
169243
TScriptExecutor(const std::vector<std::shared_ptr<ICommand>>& commands)
170-
: Commands(commands)
171-
{
172-
244+
: Commands(commands) {
173245
}
174246
void Execute() {
175247
NKikimrConfig::TAppConfig appConfig;
@@ -198,23 +270,9 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
198270
command = command.substr(5);
199271
return std::make_shared<TDataCommand>(command);
200272
} else if (command.StartsWith("READ:")) {
201-
auto lines = StringSplitter(command.substr(5)).SplitBySet("\n").ToList<TString>();
202-
int step = 0;
203-
TString request;
204-
TString expectation;
205-
for (auto&& i : lines) {
206-
i = Strip(i);
207-
if (i.StartsWith("EXPECTED:")) {
208-
step = 1;
209-
i = i.substr(9);
210-
}
211-
if (step == 0) {
212-
request += i;
213-
} else if (step == 1) {
214-
expectation += i;
215-
}
216-
}
217-
return std::make_shared<TSelectCommand>(request, expectation);
273+
auto result = std::make_shared<TSelectCommand>();
274+
AFL_VERIFY(result->DeserializeFromString(command));
275+
return result;
218276
} else if (command.StartsWith("WAIT_COMPACTION")) {
219277
return std::make_shared<TWaitCompactionCommand>();
220278
} else if (command.StartsWith("STOP_COMPACTION")) {
@@ -290,7 +348,6 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
290348
for (auto&& i : Scripts) {
291349
i.Execute();
292350
}
293-
294351
}
295352
};
296353

@@ -636,7 +693,7 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
636693
)";
637694
TScriptVariator(script).Execute();
638695
}
639-
/*
696+
640697
Y_UNIT_TEST(BloomIndexesVariants) {
641698
TString script = R"(
642699
STOP_COMPACTION
@@ -648,7 +705,7 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
648705
PRIMARY KEY (Col1)
649706
)
650707
PARTITION BY HASH(Col1)
651-
WITH (STORE = COLUMN, AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = $$1|2$$);
708+
WITH (STORE = COLUMN, AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = $$2$$);
652709
------
653710
SCHEMA:
654711
ALTER OBJECT `/Root/ColumnTable` (TYPE TABLE) SET (ACTION=UPSERT_OPTIONS, `SCAN_READER_POLICY_NAME`=`SIMPLE`)
@@ -657,34 +714,56 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
657714
ALTER OBJECT `/Root/ColumnTable` (TYPE TABLE) SET (ACTION=ALTER_COLUMN, NAME=Col2, `DATA_ACCESSOR_CONSTRUCTOR.CLASS_NAME`=`SUB_COLUMNS`,
658715
`COLUMNS_LIMIT`=`$$0|1|1024$$`, `SPARSED_DETECTOR_KFF`=`$$0|10|1000$$`, `MEM_LIMIT_CHUNK`=`$$0|100|1000000$$`, `OTHERS_ALLOWED_FRACTION`=`$$0|0.5$$`)
659716
------
660-
SCHEMA:
661-
ALTER OBJECT `/Root/ColumnTable` (TYPE TABLE) SET (ACTION=ALTER_COLUMN, NAME=Col2, `DATA_ACCESSOR_CONSTRUCTOR.CLASS_NAME`=`SUB_COLUMNS`,
662-
`COLUMNS_LIMIT`=`$$0|1|1024$$`, `SPARSED_DETECTOR_KFF`=`$$0|10|1000$$`, `MEM_LIMIT_CHUNK`=`$$0|100|1000000$$`, `OTHERS_ALLOWED_FRACTION`=`$$0|0.5$$`)
663-
------
664717
DATA:
665-
REPLACE INTO `/Root/ColumnTable` (Col1, Col2) VALUES(1u, JsonDocument('{"a" : "a1"}')), (2u, JsonDocument('{"a" : "a2"}')),
666-
(3u, JsonDocument('{"b" : "b3"}')), (4u, JsonDocument('{"b" : "b4", "a" : "a4"}'))
718+
REPLACE INTO `/Root/ColumnTable` (Col1, Col2) VALUES(1u, JsonDocument('{"a.b.c" : "a1"}')), (2u, JsonDocument('{"a.b.c" : "a2"}')),
719+
(3u, JsonDocument('{"b.c.d" : "b3"}')), (4u, JsonDocument('{"b.c.d" : "b4", "a" : "a4"}'))
667720
------
668721
DATA:
669-
REPLACE INTO `/Root/ColumnTable` (Col1, Col2) VALUES(11u, JsonDocument('{"a" : "1a1"}')), (12u, JsonDocument('{"a" : "1a2"}')),
670-
(13u, JsonDocument('{"b" : "1b3"}')), (14u, JsonDocument('{"b" : "1b4", "a" : "a4"}'))
722+
REPLACE INTO `/Root/ColumnTable` (Col1, Col2) VALUES(11u, JsonDocument('{"a.b.c" : "1a1"}')), (12u, JsonDocument('{"a.b.c" : "1a2"}')),
723+
(13u, JsonDocument('{"b.c.d" : "1b3"}')), (14u, JsonDocument('{"b.c.d" : "1b4", "a" : "a4"}'))
671724
------
672725
SCHEMA:
673726
ALTER OBJECT `/Root/ColumnTable` (TYPE TABLE) SET (ACTION=UPSERT_INDEX, NAME=a_index, TYPE=BLOOM_FILTER,
674-
FEATURES=`{"column_names" : ["Col2"], "data_extractor" : {"class_name" : "SUB_COLUMN", "sub_column_name" : "a"}, "false_positive_probability" : 0.05}`)
727+
FEATURES=`{"column_names" : ["Col2"], "false_positive_probability" : 0.01}`)
728+
------
729+
SCHEMA:
730+
ALTER OBJECT `/Root/ColumnTable` (TYPE TABLE) SET (ACTION=UPSERT_INDEX, NAME=index_ngramm_b, TYPE=BLOOM_NGRAMM_FILTER,
731+
FEATURES=`{"column_name" : "Col2", "ngramm_size" : 3, "hashes_count" : 2, "filter_size_bytes" : 4096,
732+
"records_count" : 1024, "data_extractor" : {"class_name" : "SUB_COLUMN", "sub_column_name" : "b.c.d"}}`);
675733
------
676734
DATA:
677735
REPLACE INTO `/Root/ColumnTable` (Col1) VALUES(10u)
678736
------
679737
ONE_ACTUALIZATION
680738
------
681-
READ: SELECT * FROM `/Root/ColumnTable` WHERE JSON_VALUE(Col2, "$.a") = "1a1" ORDER BY Col1;
682-
EXPECTED: [[11u;["{\"a\":\"1a1\"}"]]]
739+
READ: SELECT * FROM `/Root/ColumnTable` WHERE JSON_VALUE(Col2, "$.\"a.b.c\"") = "a1" ORDER BY Col1;
740+
EXPECTED: [[1u;["{\"a.b.c\":\"a1\"}"]]]
741+
IDX_ND_SKIP_APPROVE: 0, 3, 1
742+
------
743+
READ: SELECT * FROM `/Root/ColumnTable` WHERE JSON_VALUE(Col2, "$.\"a.b.c\"") = "1a1" ORDER BY Col1;
744+
EXPECTED: [[11u;["{\"a.b.c\":\"1a1\"}"]]]
745+
IDX_ND_SKIP_APPROVE: 0, 3, 1
746+
------
747+
READ: SELECT * FROM `/Root/ColumnTable` WHERE JSON_VALUE(Col2, "$.\"b.c.d\"") = "1b4" ORDER BY Col1;
748+
EXPECTED: [[14u;["{\"a\":\"a4\",\"b.c.d\":\"1b4\"}"]]]
749+
IDX_ND_SKIP_APPROVE: 0, 3, 1
750+
------
751+
READ: SELECT * FROM `/Root/ColumnTable` WHERE JSON_VALUE(Col2, "$.\"b.c.d\"") = "1b5" ORDER BY Col1;
752+
EXPECTED: []
753+
IDX_ND_SKIP_APPROVE: 0, 4, 0
754+
------
755+
READ: SELECT * FROM `/Root/ColumnTable` WHERE JSON_VALUE(Col2, "$.\"b.c.d\"") like "1b3" ORDER BY Col1;
756+
EXPECTED: [[13u;["{\"b.c.d\":\"1b3\"}"]]]
757+
IDX_ND_SKIP_APPROVE: 0, 3, 1
758+
------
759+
READ: SELECT * FROM `/Root/ColumnTable` WHERE JSON_VALUE(Col2, "$.\"b.c.d\"") like "1b5" ORDER BY Col1;
760+
EXPECTED: []
761+
IDX_ND_SKIP_APPROVE: 0, 4, 0
683762
684763
)";
685764
TScriptVariator(script).Execute();
686765
}
687-
*/
766+
688767
Y_UNIT_TEST(SwitchAccessorCompactionVariants) {
689768
TString script = R"(
690769
STOP_COMPACTION
@@ -765,7 +844,6 @@ Y_UNIT_TEST_SUITE(KqpOlapJson) {
765844
)";
766845
TScriptVariator(script).Execute();
767846
}
768-
769847
}
770848

771849
} // namespace NKikimr::NKqp

ydb/core/protos/flat_scheme_op.proto

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,9 +388,25 @@ message TOlapColumnDescription {
388388
optional string ColumnFamilyName = 15;
389389
}
390390

391+
message TIndexDataExtractor {
392+
optional string ClassName = 1;
393+
message TDefault {
394+
}
395+
396+
message TSubColumn {
397+
optional string SubColumnName = 1;
398+
}
399+
400+
oneof Implementation {
401+
TDefault Default = 20;
402+
TSubColumn SubColumn = 21;
403+
}
404+
}
405+
391406
message TRequestedBloomFilter {
392407
optional double FalsePositiveProbability = 1 [default = 0.1];
393408
repeated string ColumnNames = 3;
409+
optional TIndexDataExtractor DataExtractor = 4;
394410
}
395411

396412
message TRequestedBloomNGrammFilter {
@@ -399,6 +415,7 @@ message TRequestedBloomNGrammFilter {
399415
optional uint32 HashesCount = 3;
400416
optional string ColumnName = 4;
401417
optional uint32 RecordsCount = 5;
418+
optional TIndexDataExtractor DataExtractor = 6;
402419
}
403420

404421
message TRequestedMaxIndex {
@@ -428,6 +445,7 @@ message TBloomFilter {
428445
optional double FalsePositiveProbability = 1 [default = 0.1];
429446
optional uint64 MaxBytesCount = 2 [default = 8196];
430447
repeated uint32 ColumnIds = 3;
448+
optional TIndexDataExtractor DataExtractor = 4;
431449
}
432450

433451
message TBloomNGrammFilter {
@@ -436,6 +454,7 @@ message TBloomNGrammFilter {
436454
optional uint32 HashesCount = 3;
437455
optional uint32 ColumnId = 4;
438456
optional uint32 RecordsCount = 5;
457+
optional TIndexDataExtractor DataExtractor = 6;
439458
}
440459

441460
message TMaxIndex {
@@ -1204,7 +1223,7 @@ message TBackupTask {
12041223

12051224
// currently available only for s3:
12061225
optional TCompressionOptions Compression = 13;
1207-
optional bool EnableChecksums = 16;
1226+
optional bool EnableChecksums = 16;
12081227
optional bool EnablePermissions = 18;
12091228
}
12101229

ydb/core/tx/columnshard/bg_tasks/abstract/status_channel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ class TStatusChannelContainer: public NBackgroundTasks::TInterfaceStringContaine
7878
using TBase::TBase;
7979
bool DeserializeFromString(const TString& data) {
8080
if (!TBase::DeserializeFromString(data)) {
81-
Initialize(TFakeStatusChannel::GetClassNameStatic());
81+
AFL_VERIFY(Initialize(TFakeStatusChannel::GetClassNameStatic()));
8282
return false;
8383
}
8484
return true;

0 commit comments

Comments
 (0)