Skip to content

Commit 13965ec

Browse files
authored
YQL: Fix lineage for flatten columns (#8908)
1 parent b91f3b1 commit 13965ec

File tree

9 files changed: 159 additions & 28 deletions

ydb/library/yql/core/services/yql_lineage.cpp

Lines changed: 52 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,8 @@ class TLineageScanner {
298298
}
299299

300300
TMaybe<TFieldsLineage> ScanExprLineage(const TExprNode& node, const TExprNode* arg, const TLineage* src,
301-
TNodeMap<TMaybe<TFieldsLineage>>& visited) {
301+
TNodeMap<TMaybe<TFieldsLineage>>& visited,
302+
const THashMap<const TExprNode*, TString>& flattenColumns) {
302303
if (&node == arg) {
303304
return Nothing();
304305
}
@@ -308,6 +309,10 @@ class TLineageScanner {
308309
return it->second;
309310
}
310311

312+
if (auto itFlatten = flattenColumns.find(&node); itFlatten != flattenColumns.end()) {
313+
return it->second = *(*src->Fields).FindPtr(itFlatten->second);
314+
}
315+
311316
if (node.IsCallable("Member")) {
312317
if (&node.Head() == arg && src) {
313318
return it->second = *(*src->Fields).FindPtr(node.Tail().Content());
@@ -325,7 +330,7 @@ class TLineageScanner {
325330
}
326331
}
327332

328-
auto inner = ScanExprLineage(node.Head(), arg, src, visited);
333+
auto inner = ScanExprLineage(node.Head(), arg, src, visited, {});
329334
if (!inner) {
330335
return Nothing();
331336
}
@@ -365,7 +370,7 @@ class TLineageScanner {
365370
continue;
366371
}
367372

368-
auto inner = ScanExprLineage(*child, arg, src, visited);
373+
auto inner = ScanExprLineage(*child, arg, src, visited, {});
369374
if (!inner) {
370375
return Nothing();
371376
}
@@ -392,10 +397,11 @@ class TLineageScanner {
392397
}
393398

394399
void MergeLineageFromUsedFields(const TExprNode& expr, const TExprNode& arg, const TLineage& src,
395-
TFieldLineageSet& dst, const TString& newTransforms = "") {
400+
TFieldLineageSet& dst, const THashMap<const TExprNode*, TString>& flattenColumns,
401+
const TString& newTransforms = "") {
396402

397403
TNodeMap<TMaybe<TFieldsLineage>> visited;
398-
auto res = ScanExprLineage(expr, &arg, &src, visited);
404+
auto res = ScanExprLineage(expr, &arg, &src, visited, flattenColumns);
399405
if (!res) {
400406
for (const auto& f : *src.Fields) {
401407
for (const auto& i: f.second.Items) {
@@ -410,7 +416,8 @@ class TLineageScanner {
410416
}
411417

412418
void MergeLineageFromUsedFields(const TExprNode& expr, const TExprNode& arg, const TLineage& src,
413-
TFieldsLineage& dst, bool produceStruct, const TString& newTransforms = "") {
419+
TFieldsLineage& dst, bool produceStruct, const THashMap<const TExprNode*, TString>& flattenColumns,
420+
const TString& newTransforms = "") {
414421
if (produceStruct) {
415422
auto root = &expr;
416423
while (root->IsCallable("Just")) {
@@ -427,7 +434,7 @@ class TLineageScanner {
427434
for (const auto& x : root->Children()) {
428435
auto fieldName = x->Head().Content();
429436
auto& s = (*dst.StructItems)[fieldName];
430-
MergeLineageFromUsedFields(x->Tail(), arg, src, s, newTransforms);
437+
MergeLineageFromUsedFields(x->Tail(), arg, src, s, flattenColumns, newTransforms);
431438
}
432439
} else if (root->IsCallable("Member") && &root->Head() == &arg) {
433440
auto fieldName = root->Tail().Content();
@@ -436,11 +443,11 @@ class TLineageScanner {
436443
}
437444
}
438445

439-
MergeLineageFromUsedFields(expr, arg, src, dst.Items, newTransforms);
446+
MergeLineageFromUsedFields(expr, arg, src, dst.Items, flattenColumns, newTransforms);
440447
}
441448

442449
void FillStructLineage(TLineage& lineage, const TExprNode* value, const TExprNode& arg, const TLineage& innerLineage,
443-
const TTypeAnnotationNode* extType) {
450+
const TTypeAnnotationNode* extType, const THashMap<const TExprNode*, TString>& flattenColumns) {
444451
TMaybe<TString> oneField;
445452
if (value && value->IsCallable("Member") && &value->Head() == &arg) {
446453
TString field(value->Tail().Content());
@@ -462,8 +469,8 @@ class TLineageScanner {
462469
TLineage left, right;
463470
left.Fields.ConstructInPlace();
464471
right.Fields.ConstructInPlace();
465-
FillStructLineage(left, value->Child(1), arg, innerLineage, extType);
466-
FillStructLineage(right, value->Child(2), arg, innerLineage, extType);
472+
FillStructLineage(left, value->Child(1), arg, innerLineage, extType, {});
473+
FillStructLineage(right, value->Child(2), arg, innerLineage, extType, {});
467474
for (const auto& f : *left.Fields) {
468475
auto& res = (*lineage.Fields)[f.first];
469476
res.Items.insert(f.second.Items.begin(), f.second.Items.end());
@@ -483,7 +490,7 @@ class TLineageScanner {
483490
auto& res = (*lineage.Fields)[field];
484491
const auto& expr = child->Tail();
485492
TString newTransforms;
486-
auto root = &expr;
493+
const TExprNode* root = &expr;
487494
while (root->IsCallable("Just")) {
488495
root = &root->Head();
489496
}
@@ -492,7 +499,7 @@ class TLineageScanner {
492499
newTransforms = "Copy";
493500
}
494501

495-
MergeLineageFromUsedFields(expr, arg, innerLineage, res, true, newTransforms);
502+
MergeLineageFromUsedFields(expr, arg, innerLineage, res, true, flattenColumns, newTransforms);
496503
}
497504

498505
return;
@@ -526,13 +533,30 @@ class TLineageScanner {
526533
const auto& lambda = node.Tail();
527534
const auto& arg = lambda.Head().Head();
528535
const auto& body = lambda.Tail();
529-
const TExprNode* value;
536+
THashMap<const TExprNode*, TString> flattenColumns;
537+
const TExprNode* value = &body.Tail();
530538
if (body.IsCallable({"OptionalIf", "FlatListIf"})) {
531539
value = &body.Tail();
532540
} else if (body.IsCallable("Just")) {
533541
value = &body.Head();
534542
} else if (body.IsCallable({"FlatMap", "OrderedFlatMap"})) {
535-
value = &body.Head();
543+
if (lambda.GetTypeAnn()->GetKind() == ETypeAnnotationKind::List) {
544+
value = &body;
545+
while(value->IsCallable({"FlatMap", "OrderedFlatMap"})) {
546+
if (value->Head().IsCallable("Member") && &value->Head().Head() == &arg) {
547+
TString field(value->Head().Tail().Content());
548+
flattenColumns.emplace(value->Tail().Head().HeadPtr().Get(), field);
549+
}
550+
value = &value->Tail().Tail();
551+
}
552+
if (value->IsCallable("Just")) {
553+
value = &value->Head();
554+
} else if (value->IsCallable({"OptionalIf", "FlatListIf"})) {
555+
value = &value->Tail();
556+
}
557+
} else {
558+
value = &body.Head();
559+
}
536560
} else {
537561
Warning(body);
538562
return;
@@ -544,7 +568,7 @@ class TLineageScanner {
544568
}
545569

546570
lineage.Fields.ConstructInPlace();
547-
FillStructLineage(lineage, value, arg, innerLineage, GetSeqItemType(body.GetTypeAnn()));
571+
FillStructLineage(lineage, value, arg, innerLineage, GetSeqItemType(body.GetTypeAnn()), flattenColumns);
548572
}
549573

550574
void HandleAggregate(TLineage& lineage, const TExprNode& node) {
@@ -578,12 +602,12 @@ class TLineageScanner {
578602
// merge all used fields from init/update handlers
579603
auto initHandler = payload->Child(1)->Child(1);
580604
auto updateHandler = payload->Child(1)->Child(2);
581-
MergeLineageFromUsedFields(initHandler->Tail(), initHandler->Head().Head(), innerLineage, source, false);
582-
MergeLineageFromUsedFields(updateHandler->Tail(), updateHandler->Head().Head(), innerLineage, source, false);
605+
MergeLineageFromUsedFields(initHandler->Tail(), initHandler->Head().Head(), innerLineage, source, false, {});
606+
MergeLineageFromUsedFields(updateHandler->Tail(), updateHandler->Head().Head(), innerLineage, source, false, {});
583607
} else if (payload->Child(1)->IsCallable("AggApply")) {
584608
auto extractHandler = payload->Child(1)->Child(2);
585609
bool produceStruct = payload->Child(1)->Head().Content() == "some";
586-
MergeLineageFromUsedFields(extractHandler->Tail(), extractHandler->Head().Head(), innerLineage, source, produceStruct);
610+
MergeLineageFromUsedFields(extractHandler->Tail(), extractHandler->Head().Head(), innerLineage, source, produceStruct, {});
587611
} else {
588612
Warning(*payload->Child(1));
589613
lineage.Fields.Clear();
@@ -612,7 +636,7 @@ class TLineageScanner {
612636
}
613637

614638
lineage.Fields.ConstructInPlace();
615-
FillStructLineage(lineage, nullptr, arg, innerLineage, GetSeqItemType(body.GetTypeAnn()));
639+
FillStructLineage(lineage, nullptr, arg, innerLineage, GetSeqItemType(body.GetTypeAnn()), {});
616640
}
617641

618642
void HandlePartitionByKeys(TLineage& lineage, const TExprNode& node) {
@@ -630,7 +654,7 @@ class TLineageScanner {
630654
}
631655

632656
lineage.Fields.ConstructInPlace();
633-
FillStructLineage(lineage, nullptr, arg, innerLineage, GetSeqItemType(body.GetTypeAnn()));
657+
FillStructLineage(lineage, nullptr, arg, innerLineage, GetSeqItemType(body.GetTypeAnn()), {});
634658
}
635659

636660
void HandleExtend(TLineage& lineage, const TExprNode& node) {
@@ -709,8 +733,8 @@ class TLineageScanner {
709733
auto& res = (*lineage.Fields)[sessionColumn->Content()];
710734
const auto& initHandler = node.Child(4)->Child(2);
711735
const auto& updateHandler = node.Child(4)->Child(2);
712-
MergeLineageFromUsedFields(initHandler->Tail(), initHandler->Head().Head(), innerLineage, res, false);
713-
MergeLineageFromUsedFields(updateHandler->Tail(), updateHandler->Head().Head(), innerLineage, res, false);
736+
MergeLineageFromUsedFields(initHandler->Tail(), initHandler->Head().Head(), innerLineage, res, false, {});
737+
MergeLineageFromUsedFields(updateHandler->Tail(), updateHandler->Head().Head(), innerLineage, res, false, {});
714738
}
715739
}
716740

@@ -730,12 +754,12 @@ class TLineageScanner {
730754
} else if (list->Tail().IsCallable({"Lag","Lead","Rank","DenseRank","PercentRank"})) {
731755
const auto& lambda = list->Tail().Child(1);
732756
bool produceStruct = list->Tail().IsCallable({"Lag","Lead"});
733-
MergeLineageFromUsedFields(lambda->Tail(), lambda->Head().Head(), innerLineage, res, produceStruct);
757+
MergeLineageFromUsedFields(lambda->Tail(), lambda->Head().Head(), innerLineage, res, produceStruct, {});
734758
} else if (list->Tail().IsCallable("WindowTraits")) {
735759
const auto& initHandler = list->Tail().Child(1);
736760
const auto& updateHandler = list->Tail().Child(2);
737-
MergeLineageFromUsedFields(initHandler->Tail(), initHandler->Head().Head(), innerLineage, res, false);
738-
MergeLineageFromUsedFields(updateHandler->Tail(), updateHandler->Head().Head(), innerLineage, res, false);
761+
MergeLineageFromUsedFields(initHandler->Tail(), initHandler->Head().Head(), innerLineage, res, false, {});
762+
MergeLineageFromUsedFields(updateHandler->Tail(), updateHandler->Head().Head(), innerLineage, res, false, {});
739763
} else {
740764
lineage.Fields.Clear();
741765
return;
@@ -850,15 +874,15 @@ class TLineageScanner {
850874
if (child->IsCallable("AsStruct")) {
851875
for (const auto& f : child->Children()) {
852876
TNodeMap<TMaybe<TFieldsLineage>> visited;
853-
auto res = ScanExprLineage(f->Tail(), nullptr, nullptr, visited);
877+
auto res = ScanExprLineage(f->Tail(), nullptr, nullptr, visited, {});
854878
if (res) {
855879
auto name = f->Head().Content();
856880
(*lineage.Fields)[name].MergeFrom(*res);
857881
}
858882
}
859883
} else {
860884
TNodeMap<TMaybe<TFieldsLineage>> visited;
861-
auto res = ScanExprLineage(*child, nullptr, nullptr, visited);
885+
auto res = ScanExprLineage(*child, nullptr, nullptr, visited, {});
862886
if (res) {
863887
for (const auto& i : structType->GetItems()) {
864888
if (i->GetName().StartsWith("_yql_sys_")) {

ydb/library/yql/tests/sql/dq_file/part15/canondata/result.json

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1576,6 +1576,28 @@
15761576
}
15771577
],
15781578
"test.test[limit-limit_over_sort_desc_in_subquery--Results]": [],
1579+
"test.test[lineage-flatten_by--Analyze]": [
1580+
{
1581+
"checksum": "de38a224e0104e35a4d3f64f505d9158",
1582+
"size": 7782,
1583+
"uri": "https://{canondata_backend}/1773845/4743168c84575c5ee74764d6369a8a7b6f309d6e/resource.tar.gz#test.test_lineage-flatten_by--Analyze_/plan.txt"
1584+
}
1585+
],
1586+
"test.test[lineage-flatten_by--Debug]": [
1587+
{
1588+
"checksum": "df47e3dc178c04a8c1c6fc0a83120230",
1589+
"size": 3759,
1590+
"uri": "https://{canondata_backend}/1773845/4743168c84575c5ee74764d6369a8a7b6f309d6e/resource.tar.gz#test.test_lineage-flatten_by--Debug_/opt.yql_patched"
1591+
}
1592+
],
1593+
"test.test[lineage-flatten_by--Plan]": [
1594+
{
1595+
"checksum": "de38a224e0104e35a4d3f64f505d9158",
1596+
"size": 7782,
1597+
"uri": "https://{canondata_backend}/1773845/4743168c84575c5ee74764d6369a8a7b6f309d6e/resource.tar.gz#test.test_lineage-flatten_by--Plan_/plan.txt"
1598+
}
1599+
],
1600+
"test.test[lineage-flatten_by--Results]": [],
15791601
"test.test[lineage-grouping_sets--Analyze]": [
15801602
{
15811603
"checksum": "7cd08ec1563a4f59f7b1dd446b2dd421",

ydb/library/yql/tests/sql/hybrid_file/part9/canondata/result.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1525,6 +1525,20 @@
15251525
"uri": "https://{canondata_backend}/1809005/2a59475dc877549ac4197a291aacd77d92f24ab4/resource.tar.gz#test.test_limit-empty_input_after_limit-default.txt-Plan_/plan.txt"
15261526
}
15271527
],
1528+
"test.test[lineage-flatten_by--Debug]": [
1529+
{
1530+
"checksum": "ca67fb8416e26fcc6941474954bc8efa",
1531+
"size": 3102,
1532+
"uri": "https://{canondata_backend}/1900335/8db5941a4ed2bc94d6ae42d0eae7b6c741fa5a59/resource.tar.gz#test.test_lineage-flatten_by--Debug_/opt.yql_patched"
1533+
}
1534+
],
1535+
"test.test[lineage-flatten_by--Plan]": [
1536+
{
1537+
"checksum": "d8e99e1cc64bfe7d765d01c4f3c575e8",
1538+
"size": 8880,
1539+
"uri": "https://{canondata_backend}/1900335/8db5941a4ed2bc94d6ae42d0eae7b6c741fa5a59/resource.tar.gz#test.test_lineage-flatten_by--Plan_/plan.txt"
1540+
}
1541+
],
15281542
"test.test[match_recognize-alerts_without_order-default.txt-Debug]": [
15291543
{
15301544
"checksum": "acba759d95a9b70640e6418dc1febb2d",

ydb/library/yql/tests/sql/sql2yql/canondata/result.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10597,6 +10597,13 @@
1059710597
"uri": "https://{canondata_backend}/1784826/8212a6594777651314d94a2e2f95179c0016604c/resource.tar.gz#test_sql2yql.test_lineage-error_type_/sql.yql"
1059810598
}
1059910599
],
10600+
"test_sql2yql.test[lineage-flatten_by]": [
10601+
{
10602+
"checksum": "a761432fec83da9adc9a7828296bda6f",
10603+
"size": 4072,
10604+
"uri": "https://{canondata_backend}/1937367/b35833bd1950efa4b6fa264900a396b8f3f198a8/resource.tar.gz#test_sql2yql.test_lineage-flatten_by_/sql.yql"
10605+
}
10606+
],
1060010607
"test_sql2yql.test[lineage-flatten_list_nested_lambda]": [
1060110608
{
1060210609
"checksum": "1405a87aecd4676d7955fff219819b5f",
@@ -30253,6 +30260,13 @@
3025330260
"uri": "https://{canondata_backend}/1784826/8212a6594777651314d94a2e2f95179c0016604c/resource.tar.gz#test_sql_format.test_lineage-error_type_/formatted.sql"
3025430261
}
3025530262
],
30263+
"test_sql_format.test[lineage-flatten_by]": [
30264+
{
30265+
"checksum": "3f32f309ac009b3158e11e36cc0a92b7",
30266+
"size": 451,
30267+
"uri": "https://{canondata_backend}/1937367/b35833bd1950efa4b6fa264900a396b8f3f198a8/resource.tar.gz#test_sql_format.test_lineage-flatten_by_/formatted.sql"
30268+
}
30269+
],
3025630270
"test_sql_format.test[lineage-flatten_list_nested_lambda]": [
3025730271
{
3025830272
"checksum": "3fdec3c3ffc5993a6088aa56eac4fcea",
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
in Input input_list_2.txt
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
use plato;
2+
3+
$subquery1 =
4+
SELECT
5+
key, subkey, z
6+
FROM Input
7+
FLATTEN LIST BY value as z;
8+
9+
$subquery2 =
10+
SELECT
11+
key, subkey, value as z, value2
12+
FROM Input
13+
FLATTEN LIST BY (value, value2);
14+
15+
INSERT INTO @tmp1 WITH TRUNCATE
16+
SELECT *
17+
FROM $subquery1;
18+
19+
INSERT INTO @tmp2 WITH TRUNCATE
20+
SELECT *
21+
FROM $subquery2;
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{"key"="075";"subkey"="1";"value"=["abc";"cde"];"value2"=["efg"; "ghj"]};
2+
{"key"="020";"subkey"="3";"value"=["qqq";"ttt"];"value2"=["ppp";"rrr"]};
3+
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"_yql_row_spec"={
3+
"Type"=["StructType";[
4+
["key";["DataType";"String";];];
5+
["subkey";["DataType";"String";];];
6+
["value";["ListType";["DataType";"String";];];];
7+
["value2";["ListType";["DataType";"String";];];];
8+
];];
9+
}
10+
}

ydb/library/yql/tests/sql/yt_native_file/part15/canondata/result.json

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,28 @@
14321432
"uri": "https://{canondata_backend}/1903280/4c77300cd3fef018d27d7f75b6ff956e63258b21/resource.tar.gz#test.test_limit-limit_over_sort_desc_in_subquery--Results_/results.txt"
14331433
}
14341434
],
1435+
"test.test[lineage-flatten_by--Debug]": [
1436+
{
1437+
"checksum": "b9673e0336e9f6e9bfa2f44fc8b88803",
1438+
"size": 3035,
1439+
"uri": "https://{canondata_backend}/1775319/264e08443d41b710cd528563fbaa24c32c366555/resource.tar.gz#test.test_lineage-flatten_by--Debug_/opt.yql"
1440+
}
1441+
],
1442+
"test.test[lineage-flatten_by--Lineage]": [
1443+
{
1444+
"checksum": "f4929578f1fe2fce5f566df8020cc0ad",
1445+
"size": 3001,
1446+
"uri": "https://{canondata_backend}/1775319/264e08443d41b710cd528563fbaa24c32c366555/resource.tar.gz#test.test_lineage-flatten_by--Lineage_/results.txt"
1447+
}
1448+
],
1449+
"test.test[lineage-flatten_by--Plan]": [
1450+
{
1451+
"checksum": "093e3952e33d4d1b806cce1781e9e189",
1452+
"size": 8880,
1453+
"uri": "https://{canondata_backend}/1775319/264e08443d41b710cd528563fbaa24c32c366555/resource.tar.gz#test.test_lineage-flatten_by--Plan_/plan.txt"
1454+
}
1455+
],
1456+
"test.test[lineage-flatten_by--Results]": [],
14351457
"test.test[lineage-grouping_sets--Debug]": [
14361458
{
14371459
"checksum": "9d2798e2536159bea2cb8dc1a8089078",

0 commit comments

Comments
 (0)