Skip to content

Commit 7b88770

Browse files
vitstnzverevgeny
authored and committed
YQL-20050 ascii ilike in the peephole
commit_hash:ecabde17ea40a36ec6c992bec2b371529d6cc84c Conflicts: yql/essentials/core/peephole_opt/yql_opt_peephole_physical.cpp
1 parent b6cfc35 commit 7b88770

File tree

2 files changed

+148
-25
lines changed

2 files changed

+148
-25
lines changed

yql/essentials/core/peephole_opt/yql_opt_peephole_physical.cpp

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <yql/essentials/utils/yql_paths.h>
2323

2424
#include <util/generic/xrange.h>
25+
#include <util/string/ascii.h>
2526

2627
#include <library/cpp/svnversion/svnversion.h>
2728
#include <library/cpp/yson/writer.h>
@@ -8357,6 +8358,101 @@ TExprNode::TPtr ExpandSqlCompare(const TExprNode::TPtr& node, TExprContext& ctx)
83578358
return node;
83588359
}
83598360

8361+
// Peephole expansion of the case-insensitive string predicates
// EqualsIgnoreCase, StartsWithIgnoreCase, EndsWithIgnoreCase and
// StringContainsIgnoreCase.
//
// Input shape, as read by this function:
//   child 0 - the value being tested (the haystack);
//   child 1 - a literal callable whose first child atom holds the text to
//             look for; the callable's own name is reused for the pattern
//             literal on the Re2 path.
//
// Result:
//   * Bool(false) when the haystack is typed as Null;
//   * Apply(Udf("String._yql_Ascii<Op>IgnoreCase"), haystack, String(text))
//     when every character of the text is ASCII;
//   * otherwise Apply(AssumeStrict(Udf("Re2.Match", (pattern, options))), haystack),
//     where the pattern comes from Re2.PatternFromLike and the options set
//     CaseSensitive to false.
//
// Fails via YQL_ENSURE when the node name is not one of the four predicates
// above (and the haystack is not Null).
TExprNode::TPtr ExpandContainsIgnoreCase(const TExprNode::TPtr& node, TExprContext& ctx) {
    YQL_CLOG(DEBUG, CorePeepHole) << "Expand " << node->Content();
    const auto pos = node->Pos();

    // Decide the Null case first: nothing below is needed for it, so the
    // literal is not even copied.
    if (node->Child(0)->GetTypeAnn()->GetKind() == ETypeAnnotationKind::Null) {
        return MakeBool<false>(pos, ctx);
    }

    // Classify the predicate once; both lowering paths are driven by this
    // single table instead of two parallel if/else chains.
    const auto name = node->Content();
    TStringBuf asciiUdf;
    bool wildcardBefore = false;
    bool wildcardAfter = false;
    if (name == "EqualsIgnoreCase") {
        asciiUdf = "String._yql_AsciiEqualsIgnoreCase";
    } else if (name == "StartsWithIgnoreCase") {
        asciiUdf = "String._yql_AsciiStartsWithIgnoreCase";
        wildcardAfter = true;
    } else if (name == "EndsWithIgnoreCase") {
        asciiUdf = "String._yql_AsciiEndsWithIgnoreCase";
        wildcardBefore = true;
    } else if (name == "StringContainsIgnoreCase") {
        asciiUdf = "String._yql_AsciiContainsIgnoreCase";
        wildcardBefore = true;
        wildcardAfter = true;
    } else {
        YQL_ENSURE(false, "Unknown IgnoreCase node: " << name);
    }

    const TString part{node->Child(1)->Child(0)->Content()};

    if (AllOf(part, IsAscii)) {
        // Pure ASCII needle: call the dedicated String UDF, no regexp involved.
        return ctx.Builder(pos)
            .Callable("Apply")
                .Callable(0, "Udf")
                    .Atom(0, asciiUdf)
                .Seal()
                .Add(1, node->ChildPtr(0))
                .Callable(2, "String")
                    .Atom(0, part)
                .Seal()
            .Seal()
            .Build();
    }

    // Non-ASCII needle: express the predicate as a LIKE pattern and match it
    // through Re2 with case sensitivity switched off.
    // NOTE(review): `part` is inserted into the pattern verbatim. If it can
    // contain LIKE metacharacters (e.g. '%'), they would presumably be treated
    // as wildcards by Re2.PatternFromLike - confirm that callers only pass
    // plain text or that escaping is handled upstream.
    TString pattern;
    if (wildcardBefore) {
        pattern += "%";
    }
    pattern += part;
    if (wildcardAfter) {
        pattern += "%";
    }

    auto patternExpr = ctx.Builder(pos)
        .Callable("Apply")
            .Callable(0, "Udf")
                .Atom(0, "Re2.PatternFromLike")
            .Seal()
            // Keep the literal kind of the original needle by reusing its callable name.
            .Callable(1, node->Child(1)->Content())
                .Atom(0, pattern)
            .Seal()
        .Seal()
        .Build();

    auto optionsExpr = ctx.Builder(pos)
        .Callable("NamedApply")
            .Callable(0, "Udf")
                .Atom(0, "Re2.Options")
            .Seal()
            .List(1)
            .Seal()
            .Callable(2, "AsStruct")
                .List(0)
                    .Atom(0, "CaseSensitive")
                    .Callable(1, "Bool")
                        .Atom(0, "false", TNodeFlags::Default)
                    .Seal()
                .Seal()
            .Seal()
        .Seal()
        .Build();

    return ctx.Builder(pos)
        .Callable("Apply")
            .Callable(0, "AssumeStrict")
                .Callable(0, "Udf")
                    .Atom(0, "Re2.Match")
                    .List(1)
                        .Add(0, patternExpr)
                        .Add(1, optionsExpr)
                    .Seal()
                .Seal()
            .Seal()
            // Same accessor as on the ASCII path, for consistency.
            .Add(1, node->ChildPtr(0))
        .Seal()
        .Build();
}
8455+
83608456
template <bool Equals>
83618457
TExprNode::TPtr ExpandAggrEqual(const TExprNode::TPtr& node, TExprContext& ctx) {
83628458
if (&node->Head() == &node->Tail()) {

yql/essentials/udfs/common/string/string_udf.cpp

Lines changed: 52 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -166,11 +166,7 @@ namespace {
166166
}
167167

168168
#define STRING_ASCII_CMP_IGNORE_CASE_UDF(udfName, function) \
169-
BEGIN_SIMPLE_STRICT_ARROW_UDF_OPTIONS(T##udfName, \
170-
bool(TOptional<char*>, char*), \
171-
builder.SetMinLangVer(NYql::MakeLangVersion(2025, 2))) \
172-
{ \
173-
Y_UNUSED(valueBuilder); \
169+
TUnboxedValuePod udfName##Impl(const TUnboxedValuePod* args) { \
174170
if (args[0]) { \
175171
const TString haystack(args[0].AsStringRef()); \
176172
const TString needle(args[1].AsStringRef()); \
@@ -197,9 +193,26 @@ namespace {
197193
} \
198194
}; \
199195
\
200-
END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)
196+
BEGIN_SIMPLE_STRICT_ARROW_UDF_OPTIONS(T##udfName, \
197+
bool(TOptional<char*>, char*), \
198+
builder.SetMinLangVer(NYql::MakeLangVersion(2025, 2))) \
199+
{ \
200+
Y_UNUSED(valueBuilder); \
201+
return udfName##Impl(args); \
202+
} \
203+
\
204+
END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do) \
205+
\
206+
BEGIN_SIMPLE_STRICT_ARROW_UDF(T_yql_##udfName, \
207+
bool(TOptional<char*>, char*)) \
208+
{ \
209+
Y_UNUSED(valueBuilder); \
210+
return udfName##Impl(args); \
211+
} \
212+
\
213+
END_SIMPLE_ARROW_UDF(T_yql_##udfName, T##udfName##KernelExec::Do)
201214

202-
#define IS_ASCII_UDF(function) \
215+
#define IS_ASCII_UDF(function) \
203216
BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, bool(TOptional<char*>)) { \
204217
Y_UNUSED(valueBuilder); \
205218
if (args[0]) { \
@@ -493,24 +506,6 @@ namespace {
493506
return AsciiToUpper(a) == AsciiToUpper(b);
494507
}
495508

496-
BEGIN_SIMPLE_STRICT_ARROW_UDF_OPTIONS(TAsciiContainsIgnoreCase, bool(TOptional<char*>, char*),
497-
builder.SetMinLangVer(NYql::MakeLangVersion(2025, 2)))
498-
{
499-
Y_UNUSED(valueBuilder);
500-
if (!args[0]) {
501-
return TUnboxedValuePod(false);
502-
}
503-
504-
const TString haystack(args[0].AsStringRef());
505-
const TString needle(args[1].AsStringRef());
506-
if (haystack.empty()) {
507-
return TUnboxedValuePod(needle.empty());
508-
}
509-
const auto found = std::search(haystack.cbegin(), haystack.cend(),
510-
needle.cbegin(), needle.cend(), IgnoreCaseComparator);
511-
return TUnboxedValuePod(found != haystack.cend());
512-
}
513-
514509
struct TAsciiContainsIgnoreCaseKernelExec
515510
: public TBinaryKernelExec<TAsciiContainsIgnoreCaseKernelExec>
516511
{
@@ -531,8 +526,37 @@ namespace {
531526
}
532527
};
533528

529+
// Scalar implementation shared by the AsciiContainsIgnoreCase UDF variants.
//
// args[0] - the text to search in; when it is empty (falsy) the result is false.
// args[1] - the text to search for.
//
// Returns a boolean value telling whether args[1] occurs in args[0], with
// characters compared through IgnoreCaseComparator.
TUnboxedValuePod AsciiContainsIgnoreCaseImpl(const TUnboxedValuePod* args) {
    const auto& textArg = args[0];
    if (!textArg) {
        return TUnboxedValuePod(false);
    }

    const TString text(textArg.AsStringRef());
    const TString sought(args[1].AsStringRef());

    // An empty text needs its own branch: std::search would return
    // begin == end for it, which the comparison below reports as "not found"
    // even when the sought string is empty too.
    if (text.empty()) {
        return TUnboxedValuePod(sought.empty());
    }

    const auto textEnd = text.cend();
    const bool present = std::search(text.cbegin(), textEnd,
                                     sought.cbegin(), sought.cend(),
                                     IgnoreCaseComparator) != textEnd;
    return TUnboxedValuePod(present);
}
543+
544+
// UDF TAsciiContainsIgnoreCase with signature bool(TOptional<char*>, char*).
// Declared with the option SetMinLangVer(MakeLangVersion(2025, 2)).
// The body only forwards to AsciiContainsIgnoreCaseImpl; valueBuilder is unused.
// TAsciiContainsIgnoreCaseKernelExec::Do is handed to END_SIMPLE_ARROW_UDF
// (presumably the block/Arrow execution path - confirm against the macro).
BEGIN_SIMPLE_STRICT_ARROW_UDF_OPTIONS(TAsciiContainsIgnoreCase, bool(TOptional<char*>, char*),
545+
builder.SetMinLangVer(NYql::MakeLangVersion(2025, 2)))
546+
{
547+
Y_UNUSED(valueBuilder);
548+
return AsciiContainsIgnoreCaseImpl(args);
549+
}
550+
534551
END_SIMPLE_ARROW_UDF(TAsciiContainsIgnoreCase, TAsciiContainsIgnoreCaseKernelExec::Do);
535552

553+
// Second UDF built on the same AsciiContainsIgnoreCaseImpl and the same
// TAsciiContainsIgnoreCaseKernelExec::Do, declared under the T_yql_ prefix and
// without the SetMinLangVer option that TAsciiContainsIgnoreCase carries.
// NOTE(review): presumably this is what the peephole optimizer addresses as
// "String._yql_AsciiContainsIgnoreCase" - confirm against the registration macros.
BEGIN_SIMPLE_STRICT_ARROW_UDF(T_yql_AsciiContainsIgnoreCase, bool(TOptional<char*>, char*))
554+
{
555+
Y_UNUSED(valueBuilder);
556+
return AsciiContainsIgnoreCaseImpl(args);
557+
}
558+
559+
END_SIMPLE_ARROW_UDF(T_yql_AsciiContainsIgnoreCase, TAsciiContainsIgnoreCaseKernelExec::Do);
536560

537561
BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) {
538562
if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef()))
@@ -958,6 +982,7 @@ namespace {
958982
}
959983

960984
#define STRING_REGISTER_UDF(udfName, ...) T##udfName,
985+
#define STRING_OPT_REGISTER_UDF(udfName, ...) T_yql_##udfName,
961986

962987
STRING_UDF_MAP(STRING_UDF)
963988
STRING_UNSAFE_UDF_MAP(STRING_UNSAFE_UDF)
@@ -984,6 +1009,7 @@ namespace {
9841009
STROKA_FIND_UDF_MAP(STRING_REGISTER_UDF)
9851010
STRING_TWO_ARGS_UDF_MAP_DEPRECATED_2025_02(STRING_REGISTER_UDF)
9861011
STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_REGISTER_UDF)
1012+
STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_OPT_REGISTER_UDF)
9871013
IS_ASCII_UDF_MAP(STRING_REGISTER_UDF)
9881014
STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_REGISTER_UDF)
9891015
STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_REGISTER_UDF)
@@ -998,6 +1024,7 @@ namespace {
9981024
TRemoveLast,
9991025
TContains,
10001026
TAsciiContainsIgnoreCase,
1027+
T_yql_AsciiContainsIgnoreCase,
10011028
TFind,
10021029
TReverseFind,
10031030
TSubstring,

0 commit comments

Comments
 (0)