Skip to content

Commit 78d52ac

Browse files
zverevgenyzverevgeny
authored and committed
Primitives for case insensitive simple pattern match
commit_hash:5f4bdb090c2f60459073e3e95ccd39ec58b95232 Conflicts: yql/essentials/sql/v1/context.cpp
1 parent 243a4e6 commit 78d52ac

File tree

15 files changed

+266
-13
lines changed

15 files changed

+266
-13
lines changed

yql/essentials/core/expr_nodes/yql_expr_nodes.json

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,6 +1253,11 @@
12531253
"Base": "TCoCompare",
12541254
"Match": {"Type": "Callable", "Name": "=="}
12551255
},
1256+
{
1257+
"Name": "TCoCmpEqualsIgnoreCase",
1258+
"Base": "TCoCompare",
1259+
"Match": {"Type": "Callable", "Name": "EqualsIgnoreCase"}
1260+
},
12561261
{
12571262
"Name": "TCoCmpNotEqual",
12581263
"Base": "TCoCompare",
@@ -1273,16 +1278,32 @@
12731278
"Base": "TCoCompare",
12741279
"Match": {"Type": "Callable", "Name": "StartsWith"}
12751280
},
1281+
{
1282+
"Name": "TCoCmpStartsWithIgnoreCase",
1283+
"Base": "TCoCompare",
1284+
"Match": {"Type": "Callable", "Name": "StartsWithIgnoreCase"}
1285+
},
12761286
{
12771287
"Name": "TCoCmpEndsWith",
12781288
"Base": "TCoCompare",
12791289
"Match": {"Type": "Callable", "Name": "EndsWith"}
12801290
},
1291+
{
1292+
"Name": "TCoCmpEndsWithIgnoreCase",
1293+
"Base": "TCoCompare",
1294+
"Match": {"Type": "Callable", "Name": "EndsWithIgnoreCase"}
1295+
},
1296+
12811297
{
12821298
"Name": "TCoCmpStringContains",
12831299
"Base": "TCoCompare",
12841300
"Match": {"Type": "Callable", "Name": "StringContains"}
12851301
},
1302+
{
1303+
"Name": "TCoCmpStringContainsIgnoreCase",
1304+
"Base": "TCoCompare",
1305+
"Match": {"Type": "Callable", "Name": "StringContainsIgnoreCase"}
1306+
},
12861307
{
12871308
"Name": "TCoInc",
12881309
"Base": "TCallable",

yql/essentials/core/peephole_opt/yql_opt_peephole_physical.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8356,6 +8356,69 @@ TExprNode::TPtr ExpandSqlCompare(const TExprNode::TPtr& node, TExprContext& ctx)
83568356

83578357
return node;
83588358
}
8359+
// Peephole expansion for the case-insensitive simple-pattern callables
// EqualsIgnoreCase, StartsWithIgnoreCase, EndsWithIgnoreCase and
// StringContainsIgnoreCase.
//
// The callable is rewritten into
//   Apply(AssumeStrict(Udf("Re2.Match", (pattern, options))), <first argument>)
// where `pattern` is Apply(Udf("Re2.PatternFromLike"), <literal>) built from a
// LIKE-style pattern string, and `options` is
// NamedApply(Udf("Re2.Options"), (), AsStruct(("CaseSensitive", Bool("false")))).
//
// node - the callable to expand; child 0 is the value being matched, child 1
//        carries the text to match against.
// ctx  - expression context used to build the replacement nodes.
// Returns the newly built expression; fails via YQL_ENSURE when the callable
// name is not one of the four listed above.
TExprNode::TPtr ExpandContainsIgnoreCase(const TExprNode::TPtr& node, TExprContext& ctx) {
    YQL_CLOG(DEBUG, CorePeepHole) << "Expand " << node->Content();

    // The text is taken from the first child of the second argument, and the
    // second argument's callable name is reused below to wrap the pattern.
    // NOTE(review): this assumes the second argument is a literal data
    // constructor holding a single atom (presumably String/Utf8) - confirm that
    // every producer of these callables guarantees it.
    const TString part{node->Child(1)->Child(0)->Content()};

    // NOTE(review): `part` is spliced into a LIKE pattern without escaping; if it
    // can contain '%' or '_' they would presumably act as wildcards once passed
    // through Re2.PatternFromLike - confirm against the SQL front-end.
    const auto name = node->Content();
    TString pattern;
    if (name == "EqualsIgnoreCase") {
        pattern = part;
    } else if (name == "StartsWithIgnoreCase") {
        pattern = part + "%";
    } else if (name == "EndsWithIgnoreCase") {
        pattern = "%" + part;
    } else if (name == "StringContainsIgnoreCase") {
        pattern = "%" + part + "%";
    } else {
        // Report the offending callable name instead of a bare literal.
        YQL_ENSURE(false, "Unknown IgnoreCase node: " << name);
    }

    const auto pos = node->Pos();

    // Re2.PatternFromLike(<literal of the same kind as the original argument>)
    auto patternExpr = ctx.Builder(pos)
        .Callable("Apply")
            .Callable(0, "Udf")
                .Atom(0, "Re2.PatternFromLike")
            .Seal()
            .Callable(1, node->Child(1)->Content())
                .Atom(0, pattern)
            .Seal()
        .Seal()
        .Build();

    // Re2.Options(CaseSensitive := false), passed as a named argument.
    auto optionsExpr = ctx.Builder(pos)
        .Callable("NamedApply")
            .Callable(0, "Udf")
                .Atom(0, "Re2.Options")
            .Seal()
            .List(1)
            .Seal()
            .Callable(2, "AsStruct")
                .List(0)
                    .Atom(0, "CaseSensitive")
                    .Callable(1, "Bool")
                        .Atom(0, "false", TNodeFlags::Default)
                    .Seal()
                .Seal()
            .Seal()
        .Seal()
        .Build();

    // Re2.Match configured with (pattern, options), applied to the first argument.
    auto result = ctx.Builder(pos)
        .Callable("Apply")
            .Callable(0, "AssumeStrict")
                .Callable(0, "Udf")
                    .Atom(0, "Re2.Match")
                    .List(1)
                        .Add(0, patternExpr)
                        .Add(1, optionsExpr)
                    .Seal()
                .Seal()
            .Seal()
            .Add(1, node->Child(0))
        .Seal()
        .Build();

    return result;
}
83598422

83608423
template <bool Equals>
83618424
TExprNode::TPtr ExpandAggrEqual(const TExprNode::TPtr& node, TExprContext& ctx) {
@@ -8680,6 +8743,10 @@ struct TPeepHoleRules {
86808743
{"EmptyIterator", &DropDependsOnFromEmptyIterator},
86818744
{"Version", &ExpandVersion},
86828745
{RightName, &ExpandRightOverCons},
8746+
{"EqualsIgnoreCase", &ExpandContainsIgnoreCase},
8747+
{"StartsWithIgnoreCase", &ExpandContainsIgnoreCase},
8748+
{"EndsWithIgnoreCase", &ExpandContainsIgnoreCase},
8749+
{"StringContainsIgnoreCase", &ExpandContainsIgnoreCase},
86838750
};
86848751

86858752
const TExtPeepHoleOptimizerMap CommonStageExtRules = {

yql/essentials/core/type_ann/type_ann_core.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3322,7 +3322,9 @@ namespace NTypeAnnImpl {
33223322
return IGraphTransformer::TStatus::Error;
33233323
}
33243324

3325-
if (IsNull(input->Head()) || IsNull(input->Tail())) {
3325+
const auto ignoreNulls = input->Content().ends_with("IgnoreCase");
3326+
3327+
if ((IsNull(input->Head()) || IsNull(input->Tail())) && !ignoreNulls) {
33263328
output = MakeBoolNothing(input->Pos(), ctx.Expr);
33273329
return IGraphTransformer::TStatus::Repeat;
33283330
}
@@ -3342,17 +3344,19 @@ namespace NTypeAnnImpl {
33423344
}
33433345
bool isOptional = false;
33443346
const TDataExprType* dataType = nullptr;
3345-
if (!IsDataOrOptionalOfData(type, isOptional, dataType) ||
3346-
!(dataType->GetSlot() == EDataSlot::String || dataType->GetSlot() == EDataSlot::Utf8))
3347+
if ((!IsDataOrOptionalOfData(type, isOptional, dataType) ||
3348+
!(dataType->GetSlot() == EDataSlot::String || dataType->GetSlot() == EDataSlot::Utf8) ||
3349+
dataType->IsOptionalOrNull()) &&
3350+
(!IsNull(*type) && ignoreNulls)
3351+
)
33473352
{
33483353
ctx.Expr.AddError(TIssue(ctx.Expr.GetPosition(child->Pos()), TStringBuilder()
33493354
<< "Expected (optional) string/utf8 or corresponding Pg type, but got: " << *child->GetTypeAnn()));
33503355
return IGraphTransformer::TStatus::Error;
33513356
}
33523357
hasOptionals = hasOptionals || isOptional;
33533358
}
3354-
3355-
if (hasOptionals)
3359+
if (hasOptionals && !ignoreNulls)
33563360
input->SetTypeAnn(ctx.Expr.MakeType<TOptionalExprType>(ctx.Expr.MakeType<TDataExprType>(EDataSlot::Bool)));
33573361
else
33583362
input->SetTypeAnn(ctx.Expr.MakeType<TDataExprType>(EDataSlot::Bool));
@@ -12419,6 +12423,7 @@ template <NKikimr::NUdf::EDataSlot DataSlot>
1241912423
Functions["GreaterOrEqual"] = &CompareWrapper<false>;
1242012424
Functions["=="] = &CompareWrapper<true>;
1242112425
Functions["Equal"] = &CompareWrapper<true>;
12426+
Functions["EqualsIgnoreCase"] = &WithWrapper;
1242212427
Functions["!="] = &CompareWrapper<true>;
1242312428
Functions["NotEqual"] = &CompareWrapper<true>;
1242412429
Functions["Inc"] = &IncDecWrapper<true>;
@@ -12482,8 +12487,11 @@ template <NKikimr::NUdf::EDataSlot DataSlot>
1248212487
Functions["Find"] = &FindWrapper;
1248312488
Functions["RFind"] = &FindWrapper;
1248412489
Functions["StartsWith"] = &WithWrapper;
12490+
Functions["StartsWithIgnoreCase"] = &WithWrapper;
1248512491
Functions["EndsWith"] = &WithWrapper;
12492+
Functions["EndsWithIgnoreCase"] = &WithWrapper;
1248612493
Functions["StringContains"] = &WithWrapper;
12494+
Functions["StringContainsIgnoreCase"] = &WithWrapper;
1248712495
Functions["ByteAt"] = &ByteAtWrapper;
1248812496
Functions["ListIf"] = &ListIfWrapper;
1248912497
Functions["AsList"] = &AsListWrapper<false>;

yql/essentials/sql/v1/context.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ THashMap<TStringBuf, TPragmaField> CTX_PRAGMA_FIELDS = {
6868
{"DistinctOverWindow", &TContext::DistinctOverWindow},
6969
{"EmitUnionMerge", &TContext::EmitUnionMerge},
7070
{"SeqMode", &TContext::SeqMode},
71+
{"OptimizeSimpleILIKE", &TContext::OptimizeSimpleIlike}
7172
};
7273

7374
typedef TMaybe<bool> TContext::*TPragmaMaybeField;

yql/essentials/sql/v1/context.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ namespace NSQLTranslationV1 {
382382
bool DistinctOverWindow = false;
383383
bool SeqMode = false;
384384
bool EmitUnionMerge = false;
385+
bool OptimizeSimpleIlike = false;
385386
TVector<size_t> ForAllStatementsParts;
386387

387388
TMaybe<TString> Engine;

yql/essentials/sql/v1/node.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3179,7 +3179,10 @@ TNodePtr BuildBinaryOp(TContext& ctx, TPosition pos, const TString& opName, TNod
31793179
return nullptr;
31803180
}
31813181

3182-
static const THashSet<TStringBuf> nullSafeOps = {"IsDistinctFrom", "IsNotDistinctFrom"};
3182+
static const THashSet<TStringBuf> nullSafeOps = {
3183+
"IsDistinctFrom", "IsNotDistinctFrom",
3184+
"EqualsIgnoreCase", "StartsWithIgnoreCase", "EndsWithIgnoreCase", "StringContainsIgnoreCase"
3185+
};
31833186
if (!nullSafeOps.contains(opName)) {
31843187
const bool bothArgNull = a->IsNull() && b->IsNull();
31853188
const bool oneArgNull = a->IsNull() || b->IsNull();

yql/essentials/sql/v1/sql_expression.cpp

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1798,19 +1798,25 @@ TNodePtr TSqlExpression::SubExpr(const TRule_xor_subexpr& node, const TTrailingQ
17981798
return nullptr;
17991799
}
18001800

1801-
if (opName == "like" || mayIgnoreCase) {
1801+
if ((opName == "like") || mayIgnoreCase || Ctx.OptimizeSimpleIlike) {
18021802
// TODO: expand LIKE in optimizers - we can analyze argument types there
1803+
const bool useIgnoreCaseOp = (opName == "ilike") && !mayIgnoreCase;
1804+
const auto& equalOp = useIgnoreCaseOp ? "EqualsIgnoreCase" : "==";
1805+
const auto& startsWithOp = useIgnoreCaseOp ? "StartsWithIgnoreCase" : "StartsWith";
1806+
const auto& endsWithOp = useIgnoreCaseOp ? "EndsWithIgnoreCase" : "EndsWith";
1807+
const auto& containsOp = useIgnoreCaseOp ? "StringContainsIgnoreCase" : "StringContains";
1808+
18031809
YQL_ENSURE(!components.empty());
18041810
const auto& first = components.front();
18051811
if (components.size() == 1 && first.IsSimple) {
18061812
// no '%'s and '_'s in pattern
18071813
YQL_ENSURE(first.Prefix == first.Suffix);
1808-
isMatch = BuildBinaryOp(Ctx, pos, "==", res, BuildLiteralRawString(pos, first.Suffix, isUtf8));
1814+
isMatch = BuildBinaryOp(Ctx, pos, equalOp, res, BuildLiteralRawString(pos, first.Suffix, isUtf8));
18091815
} else if (!first.Prefix.empty()) {
18101816
const TString& prefix = first.Prefix;
18111817
TNodePtr prefixMatch;
18121818
if (Ctx.EmitStartsWith) {
1813-
prefixMatch = BuildBinaryOp(Ctx, pos, "StartsWith", res, BuildLiteralRawString(pos, prefix, isUtf8));
1819+
prefixMatch = BuildBinaryOp(Ctx, pos, startsWithOp, res, BuildLiteralRawString(pos, prefix, isUtf8));
18141820
} else {
18151821
prefixMatch = BuildBinaryOp(Ctx, pos, ">=", res, BuildLiteralRawString(pos, prefix, isUtf8));
18161822
auto upperBound = isUtf8 ? NextValidUtf8(prefix) : NextLexicographicString(prefix);
@@ -1835,7 +1841,7 @@ TNodePtr TSqlExpression::SubExpr(const TRule_xor_subexpr& node, const TTrailingQ
18351841
TNodePtr sizePred = BuildBinaryOp(Ctx, pos, ">=",
18361842
TNodePtr(new TCallNodeImpl(pos, "Size", { res })),
18371843
TNodePtr(new TLiteralNumberNode<ui32>(pos, "Uint32", ToString(prefix.size() + suffix.size()))));
1838-
TNodePtr suffixMatch = BuildBinaryOp(Ctx, pos, "EndsWith", res, BuildLiteralRawString(pos, suffix, isUtf8));
1844+
TNodePtr suffixMatch = BuildBinaryOp(Ctx, pos, endsWithOp, res, BuildLiteralRawString(pos, suffix, isUtf8));
18391845
isMatch = new TCallNodeImpl(pos, "And", {
18401846
sizePred,
18411847
prefixMatch,
@@ -1850,14 +1856,14 @@ TNodePtr TSqlExpression::SubExpr(const TRule_xor_subexpr& node, const TTrailingQ
18501856
if (components.size() == 3 && components.back().Prefix.empty()) {
18511857
// '%foo%'
18521858
YQL_ENSURE(!components[1].Prefix.empty());
1853-
isMatch = BuildBinaryOp(Ctx, pos, "StringContains", res, BuildLiteralRawString(pos, components[1].Prefix, isUtf8));
1859+
isMatch = BuildBinaryOp(Ctx, pos, containsOp, res, BuildLiteralRawString(pos, components[1].Prefix, isUtf8));
18541860
} else if (components.size() == 2) {
18551861
// '%foo'
1856-
isMatch = BuildBinaryOp(Ctx, pos, "EndsWith", res, BuildLiteralRawString(pos, components[1].Prefix, isUtf8));
1862+
isMatch = BuildBinaryOp(Ctx, pos, endsWithOp, res, BuildLiteralRawString(pos, components[1].Prefix, isUtf8));
18571863
}
18581864
} else if (Ctx.AnsiLike && !components.back().Suffix.empty()) {
18591865
const TString& suffix = components.back().Suffix;
1860-
TNodePtr suffixMatch = BuildBinaryOp(Ctx, pos, "EndsWith", res, BuildLiteralRawString(pos, suffix, isUtf8));
1866+
TNodePtr suffixMatch = BuildBinaryOp(Ctx, pos, endsWithOp, res, BuildLiteralRawString(pos, suffix, isUtf8));
18611867
isMatch = BuildBinaryOp(Ctx, pos, "And", suffixMatch, isMatch);
18621868
}
18631869
// TODO: more StringContains/StartsWith/EndsWith cases?

yql/essentials/sql/v1/sql_query.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3391,6 +3391,12 @@ TNodePtr TSqlQuery::PragmaStatement(const TRule_pragma_stmt& stmt, bool& success
33913391
}
33923392

33933393
Ctx.Engine = *literal;
3394+
} else if (normalizedPragma == "optimizesimpleilike") {
3395+
Ctx.OptimizeSimpleIlike = true;
3396+
Ctx.IncrementMonCounter("sql_pragma", "OptimizeSimpleILIKE");
3397+
} else if (normalizedPragma == "disableoptimizesimpleilike") {
3398+
Ctx.OptimizeSimpleIlike = false;
3399+
Ctx.IncrementMonCounter("sql_pragma", "DisableOptimizeSimpleILIKE");
33943400
} else {
33953401
Error() << "Unknown pragma: " << pragma;
33963402
Ctx.IncrementMonCounter("sql_errors", "UnknownPragma");

yql/essentials/tests/sql/minirun/part5/canondata/result.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,6 +1117,20 @@
11171117
"uri": "https://{canondata_backend}/1942100/1466d7e49a6dc5a8df761a5ac92539095e1a14a0/resource.tar.gz#test.test_library-library_udf--Results_/results.txt"
11181118
}
11191119
],
1120+
"test.test[like-ilike--Debug]": [
1121+
{
1122+
"checksum": "e9edc47e022f94844d6755223a748bbe",
1123+
"size": 5224,
1124+
"uri": "https://{canondata_backend}/1899731/c18b136d437186ca17b897aa3cbd89e03d6ab70e/resource.tar.gz#test.test_like-ilike--Debug_/opt.yql"
1125+
}
1126+
],
1127+
"test.test[like-ilike--Results]": [
1128+
{
1129+
"checksum": "4f140efd3491b22880e294dce1c735c2",
1130+
"size": 9773,
1131+
"uri": "https://{canondata_backend}/1899731/c18b136d437186ca17b897aa3cbd89e03d6ab70e/resource.tar.gz#test.test_like-ilike--Results_/results.txt"
1132+
}
1133+
],
11201134
"test.test[like-like_escape-default.txt-Debug]": [
11211135
{
11221136
"checksum": "9a241374a2f6995712f675e514d7e64a",

yql/essentials/tests/sql/minirun/part6/canondata/result.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,20 @@
847847
"uri": "file://test.test_lambda-list_aggregate_flatmap-default.txt-Results_/extracted"
848848
}
849849
],
850+
"test.test[like-ilike-Ansi-Debug]": [
851+
{
852+
"checksum": "2bf3a064a80a62b0d1e57b4fc19f85aa",
853+
"size": 4198,
854+
"uri": "https://{canondata_backend}/1871102/cf0bf303bf8ddaa5f80dc41d0b1079fd931793f8/resource.tar.gz#test.test_like-ilike-Ansi-Debug_/opt.yql"
855+
}
856+
],
857+
"test.test[like-ilike-Ansi-Results]": [
858+
{
859+
"checksum": "4f140efd3491b22880e294dce1c735c2",
860+
"size": 9773,
861+
"uri": "https://{canondata_backend}/1871102/cf0bf303bf8ddaa5f80dc41d0b1079fd931793f8/resource.tar.gz#test.test_like-ilike-Ansi-Results_/results.txt"
862+
}
863+
],
850864
"test.test[match_recognize-alerts_without_order-default.txt-Debug]": [
851865
{
852866
"checksum": "617ad997b55cab0792da8c20f2fdeb07",

0 commit comments

Comments
 (0)