Skip to content

Commit 243a4e6

Browse files
authored
Integrate ASCII case-insensitive substring match in String UDF (#20672)
2 parents 2d2e6e4 + b5fd6f7 commit 243a4e6

File tree

5 files changed

+176
-0
lines changed

5 files changed

+176
-0
lines changed

yql/essentials/udfs/common/string/string_udf.cpp

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,52 @@ namespace {
163163
} \
164164
}
165165

166+
// Generates a two-argument boolean string UDF built on top of `function`.
//   udfName  - suffix used to build every generated identifier
//   function - callable invoked as function(first, second) on TString copies
//              of the two arguments; its result becomes the UDF result
// The expansion defines a scalar helper udfName##Impl, a block kernel
// T##udfName##KernelExec, and two UDFs that share both of them: T##udfName
// and T_yql_##udfName. A null first argument yields false on both the scalar
// and the block path.
#define STRING_ASCII_CMP_IGNORE_CASE_UDF(udfName, function)                    \
    TUnboxedValuePod udfName##Impl(const TUnboxedValuePod* args) {             \
        if (!args[0]) {                                                        \
            return TUnboxedValuePod(false);                                    \
        }                                                                      \
        const TString text(args[0].AsStringRef());                             \
        const TString pattern(args[1].AsStringRef());                          \
        return TUnboxedValuePod(function(text, pattern));                      \
    }                                                                          \
                                                                               \
    struct T##udfName##KernelExec                                              \
        : public TBinaryKernelExec<T##udfName##KernelExec>                     \
    {                                                                          \
        template <typename TSink>                                              \
        static void Process(const IValueBuilder*, TBlockItem arg1,             \
                            TBlockItem arg2, const TSink& sink)                \
        {                                                                      \
            if (!arg1) {                                                       \
                sink(TBlockItem(false));                                       \
                return;                                                        \
            }                                                                  \
            const TString text(arg1.AsStringRef());                            \
            const TString pattern(arg2.AsStringRef());                         \
            sink(TBlockItem(function(text, pattern)));                         \
        }                                                                      \
    };                                                                         \
                                                                               \
    BEGIN_SIMPLE_STRICT_ARROW_UDF(T##udfName,                                  \
                                  bool(TOptional<char*>, char*))               \
    {                                                                          \
        Y_UNUSED(valueBuilder);                                                \
        return udfName##Impl(args);                                            \
    }                                                                          \
                                                                               \
    END_SIMPLE_ARROW_UDF(T##udfName, T##udfName##KernelExec::Do)               \
                                                                               \
    BEGIN_SIMPLE_STRICT_ARROW_UDF(T_yql_##udfName,                             \
                                  bool(TOptional<char*>, char*))               \
    {                                                                          \
        Y_UNUSED(valueBuilder);                                                \
        return udfName##Impl(args);                                            \
    }                                                                          \
                                                                               \
    END_SIMPLE_ARROW_UDF(T_yql_##udfName, T##udfName##KernelExec::Do)
211+
166212
#define IS_ASCII_UDF(function) \
167213
BEGIN_SIMPLE_STRICT_ARROW_UDF(T##function, bool(TOptional<char*>)) { \
168214
Y_UNUSED(valueBuilder); \
@@ -368,6 +414,11 @@ namespace {
368414
XX(HasPrefixIgnoreCase, AsciiHasPrefixIgnoreCase) \
369415
XX(HasSuffixIgnoreCase, AsciiHasSuffixIgnoreCase)
370416

417+
// X-macro table of the ASCII case-insensitive comparison UDFs. Each entry is
// XX(<udf name>, <function>): the same argument pair that
// STRING_ASCII_CMP_IGNORE_CASE_UDF takes, so applying that macro to this
// table defines one UDF per row.
#define STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(XX) \
418+
XX(AsciiStartsWithIgnoreCase, AsciiHasPrefixIgnoreCase) \
419+
XX(AsciiEndsWithIgnoreCase, AsciiHasSuffixIgnoreCase) \
420+
XX(AsciiEqualsIgnoreCase, AsciiEqualsIgnoreCase)
421+
371422
// NOTE: The functions below are marked as deprecated, so block implementation
372423
// is not required for them. Hence, STROKA_UDF provides only the scalar one at
373424
// the moment.
@@ -449,6 +500,60 @@ namespace {
449500

450501
END_SIMPLE_ARROW_UDF(TContains, TContainsKernelExec::Do);
451502

503+
static bool IgnoreCaseComparator(char a, char b) {
504+
return AsciiToUpper(a) == AsciiToUpper(b);
505+
}
506+
507+
struct TAsciiContainsIgnoreCaseKernelExec
508+
: public TBinaryKernelExec<TAsciiContainsIgnoreCaseKernelExec>
509+
{
510+
template <typename TSink>
511+
static void Process(const IValueBuilder*, TBlockItem arg1, TBlockItem arg2, const TSink& sink) {
512+
if (!arg1) {
513+
return sink(TBlockItem(arg2 ? false : true));
514+
}
515+
516+
const TString haystack(arg1.AsStringRef());
517+
const TString needle(arg2.AsStringRef());
518+
if (haystack.empty()) {
519+
return sink(TBlockItem((needle.empty())));
520+
}
521+
const auto found = std::search(haystack.cbegin(), haystack.cend(),
522+
needle.cbegin(), needle.cend(), IgnoreCaseComparator);
523+
sink(TBlockItem(found != haystack.cend()));
524+
}
525+
};
526+
527+
// Scalar implementation shared by the AsciiContainsIgnoreCase UDFs.
//   args[0] - optional haystack; a null value yields false
//   args[1] - needle
// Returns a boolean TUnboxedValuePod telling whether the needle occurs in the
// haystack when characters are compared with IgnoreCaseComparator.
TUnboxedValuePod AsciiContainsIgnoreCaseImpl(const TUnboxedValuePod* args) {
    const bool hasHaystack = static_cast<bool>(args[0]);
    if (!hasHaystack) {
        return TUnboxedValuePod(false);
    }

    const TString source(args[0].AsStringRef());
    const TString pattern(args[1].AsStringRef());

    // An empty source only "contains" an empty pattern; std::search alone
    // cannot express that because begin() == end() for an empty range.
    const bool isContained = source.empty()
        ? pattern.empty()
        : std::search(source.cbegin(), source.cend(),
                      pattern.cbegin(), pattern.cend(),
                      IgnoreCaseComparator) != source.cend();
    return TUnboxedValuePod(isContained);
}
541+
542+
// UDF TAsciiContainsIgnoreCase: bool(TOptional<char*>, char*).
// The scalar body forwards to AsciiContainsIgnoreCaseImpl; the block
// implementation is TAsciiContainsIgnoreCaseKernelExec::Do.
// NOTE(review): the module registration list has this name commented out in
// favour of the T_yql_ variant - confirm before relying on it being callable.
BEGIN_SIMPLE_STRICT_ARROW_UDF(TAsciiContainsIgnoreCase, bool(TOptional<char*>, char*))
543+
{
544+
Y_UNUSED(valueBuilder);
545+
return AsciiContainsIgnoreCaseImpl(args);
546+
}
547+
548+
END_SIMPLE_ARROW_UDF(TAsciiContainsIgnoreCase, TAsciiContainsIgnoreCaseKernelExec::Do);
549+
550+
// UDF T_yql_AsciiContainsIgnoreCase: bool(TOptional<char*>, char*).
// Same scalar body (AsciiContainsIgnoreCaseImpl) and block kernel
// (TAsciiContainsIgnoreCaseKernelExec::Do) as TAsciiContainsIgnoreCase; this
// is the variant that appears in the module registration list.
BEGIN_SIMPLE_STRICT_ARROW_UDF(T_yql_AsciiContainsIgnoreCase, bool(TOptional<char*>, char*))
551+
{
552+
Y_UNUSED(valueBuilder);
553+
return AsciiContainsIgnoreCaseImpl(args);
554+
}
555+
556+
END_SIMPLE_ARROW_UDF(T_yql_AsciiContainsIgnoreCase, TAsciiContainsIgnoreCaseKernelExec::Do);
452557

453558
BEGIN_SIMPLE_STRICT_ARROW_UDF(TReplaceAll, char*(TAutoMap<char*>, char*, char*)) {
454559
if (TString result(args[0].AsStringRef()); SubstGlobal(result, args[1].AsStringRef(), args[2].AsStringRef()))
@@ -874,6 +979,7 @@ namespace {
874979
}
875980

876981
// Registration-list helpers for the *_UDF_MAP tables: each expands one table
// row to a class name followed by a comma; any arguments after udfName are
// ignored.
// STRING_REGISTER_UDF lists the plain T<udfName> class.
#define STRING_REGISTER_UDF(udfName, ...) T##udfName,
982+
// STRING_OPT_REGISTER_UDF lists the T_yql_<udfName> variant instead.
#define STRING_OPT_REGISTER_UDF(udfName, ...) T_yql_##udfName,
877983

878984
STRING_UDF_MAP(STRING_UDF)
879985
STRING_UNSAFE_UDF_MAP(STRING_UNSAFE_UDF)
@@ -882,6 +988,7 @@ namespace {
882988
STROKA_ASCII_CASE_UDF_MAP(STROKA_ASCII_CASE_UDF)
883989
STROKA_FIND_UDF_MAP(STROKA_FIND_UDF)
884990
STRING_TWO_ARGS_UDF_MAP(STRING_TWO_ARGS_UDF)
991+
STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_ASCII_CMP_IGNORE_CASE_UDF)
885992
IS_ASCII_UDF_MAP(IS_ASCII_UDF)
886993

887994
static constexpr ui64 padLim = 1000000;
@@ -898,6 +1005,8 @@ namespace {
8981005
STROKA_ASCII_CASE_UDF_MAP(STRING_REGISTER_UDF)
8991006
STROKA_FIND_UDF_MAP(STRING_REGISTER_UDF)
9001007
STRING_TWO_ARGS_UDF_MAP(STRING_REGISTER_UDF)
1008+
// STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_REGISTER_UDF) is not going to be exposed until 2025.2
1009+
STRING_ASCII_CMP_IGNORE_CASE_UDF_MAP(STRING_OPT_REGISTER_UDF)
9011010
IS_ASCII_UDF_MAP(STRING_REGISTER_UDF)
9021011
STRING_STREAM_PAD_FORMATTER_UDF_MAP(STRING_REGISTER_UDF)
9031012
STRING_STREAM_NUM_FORMATTER_UDF_MAP(STRING_REGISTER_UDF)
@@ -911,6 +1020,8 @@ namespace {
9111020
TRemoveFirst,
9121021
TRemoveLast,
9131022
TContains,
1023+
// TAsciiContainsIgnoreCase is not going to be exposed until 2025.2
1024+
T_yql_AsciiContainsIgnoreCase,
9141025
TFind,
9151026
TReverseFind,
9161027
TSubstring,

yql/essentials/udfs/common/string/test/canondata/result.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
"uri": "file://test.test_AsciiChecks_/results.txt"
55
}
66
],
7+
"test.test[AsciiCmpIgnoreCase]": [
8+
{
9+
"uri": "file://test.test_AsciiCmpIgnoreCase_/extracted"
10+
}
11+
],
712
"test.test[Base32Decode]": [
813
{
914
"uri": "file://test.test_Base32Decode_/results.txt"
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
<tmp_path>/program.sql:<main>: Error: Type annotation
2+
3+
<tmp_path>/program.sql:<main>:2:1: Error: At function: RemovePrefixMembers, At function: Unordered, At function: PersistableRepr, At function: OrderedSqlProject
4+
SELECT
5+
^
6+
<tmp_path>/program.sql:<main>:2:1: Error: At function: SqlProjectItem
7+
SELECT
8+
^
9+
<tmp_path>/program.sql:<main>:6:13: Error: At function: Apply, At function: Udf, At String.AsciiContainsIgnoreCase
10+
String::AsciiContainsIgnoreCase(value, "AS") AS iccontains,
11+
^
12+
<tmp_path>/program.sql:<main>:6:13: Error: Failed to find UDF function: String.AsciiContainsIgnoreCase, reason: Error: Module: String, function: AsciiContainsIgnoreCase, error: Unknown function: AsciiContainsIgnoreCase
13+
String::AsciiContainsIgnoreCase(value, "AS") AS iccontains,
14+
^
15+
<tmp_path>/program.sql:<main>:2:1: Error: At function: SqlProjectItem
16+
SELECT
17+
^
18+
<tmp_path>/program.sql:<main>:8:13: Error: At function: Apply, At function: Udf, At String.AsciiContainsIgnoreCase
19+
String::AsciiContainsIgnoreCase(value, "") AS icempty,
20+
^
21+
<tmp_path>/program.sql:<main>:8:13: Error: Failed to find UDF function: String.AsciiContainsIgnoreCase, reason: Error: Module: String, function: AsciiContainsIgnoreCase, error: Unknown function: AsciiContainsIgnoreCase
22+
String::AsciiContainsIgnoreCase(value, "") AS icempty,
23+
^
24+
<tmp_path>/program.sql:<main>:2:1: Error: At function: SqlProjectItem
25+
SELECT
26+
^
27+
<tmp_path>/program.sql:<main>:10:13: Error: At function: Apply, At function: Udf, At String.AsciiStartsWithIgnoreCase
28+
String::AsciiStartsWithIgnoreCase(value, "AS") AS icstarts,
29+
^
30+
<tmp_path>/program.sql:<main>:10:13: Error: Failed to find UDF function: String.AsciiStartsWithIgnoreCase, reason: Error: Module: String, function: AsciiStartsWithIgnoreCase, error: Unknown function: AsciiStartsWithIgnoreCase
31+
String::AsciiStartsWithIgnoreCase(value, "AS") AS icstarts,
32+
^
33+
<tmp_path>/program.sql:<main>:2:1: Error: At function: SqlProjectItem
34+
SELECT
35+
^
36+
<tmp_path>/program.sql:<main>:12:13: Error: At function: Apply, At function: Udf, At String.AsciiEndsWithIgnoreCase
37+
String::AsciiEndsWithIgnoreCase(value, "AS") AS icends,
38+
^
39+
<tmp_path>/program.sql:<main>:12:13: Error: Failed to find UDF function: String.AsciiEndsWithIgnoreCase, reason: Error: Module: String, function: AsciiEndsWithIgnoreCase, error: Unknown function: AsciiEndsWithIgnoreCase
40+
String::AsciiEndsWithIgnoreCase(value, "AS") AS icends,
41+
^
42+
<tmp_path>/program.sql:<main>:2:1: Error: At function: SqlProjectItem
43+
SELECT
44+
^
45+
<tmp_path>/program.sql:<main>:14:13: Error: At function: Apply, At function: Udf, At String.AsciiEqualsIgnoreCase
46+
String::AsciiEqualsIgnoreCase(value, "FDSA") AS icequals,
47+
^
48+
<tmp_path>/program.sql:<main>:14:13: Error: Failed to find UDF function: String.AsciiEqualsIgnoreCase, reason: Error: Module: String, function: AsciiEqualsIgnoreCase, error: Unknown function: AsciiEqualsIgnoreCase
49+
String::AsciiEqualsIgnoreCase(value, "FDSA") AS icequals,
50+
^
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
xfail
2+
in plato.Input default.in
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
SELECT
2+
value,
3+
String::AsciiContainsIgnoreCase(value, "AS") AS iccontains,
4+
String::AsciiContainsIgnoreCase(value, "") AS icempty,
5+
String::AsciiStartsWithIgnoreCase(value, "AS") AS icstarts,
6+
String::AsciiEndsWithIgnoreCase(value, "AS") AS icends,
7+
String::AsciiEqualsIgnoreCase(value, "FDSA") AS icequals,
8+
FROM Input;

0 commit comments

Comments
 (0)