Skip to content

Commit f7153b7

Browse files
branch-3.0: [feat](function) SUBSTRING_INDEX function delimiter supports dynamic #50149 (#50302)
Cherry-picked from #50149 Co-authored-by: lw112 <131352377+felixwluo@users.noreply.github.com>
1 parent 6344083 commit f7153b7

File tree

8 files changed

+549
-40
lines changed

8 files changed

+549
-40
lines changed

be/src/vec/functions/function_string.h

Lines changed: 56 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1975,24 +1975,42 @@ class FunctionSubstringIndex : public IFunction {
19751975

19761976
const auto* str_col = assert_cast<const ColumnString*>(content_column.get());
19771977

1978-
[[maybe_unused]] const auto& [delimiter_col, delimiter_const] =
1978+
// Handle both constant and non-constant delimiter parameters
1979+
ColumnPtr delimiter_column_ptr;
1980+
bool delimiter_const = false;
1981+
std::tie(delimiter_column_ptr, delimiter_const) =
19791982
unpack_if_const(block.get_by_position(arguments[1]).column);
1980-
auto delimiter = delimiter_col->get_data_at(0);
1981-
int32_t delimiter_size = delimiter.size;
1983+
const auto* delimiter_col = assert_cast<const ColumnString*>(delimiter_column_ptr.get());
19821984

1983-
[[maybe_unused]] const auto& [part_num_col, part_const] =
1985+
ColumnPtr part_num_column_ptr;
1986+
bool part_num_const = false;
1987+
std::tie(part_num_column_ptr, part_num_const) =
19841988
unpack_if_const(block.get_by_position(arguments[2]).column);
1985-
auto part_number = *((int*)part_num_col->get_data_at(0).data);
1989+
const ColumnVector<Int32>* part_num_col =
1990+
assert_cast<const ColumnVector<Int32>*>(part_num_column_ptr.get());
19861991

1987-
if (part_number == 0 || delimiter_size == 0) {
1988-
for (size_t i = 0; i < input_rows_count; ++i) {
1992+
// For constant multi-character delimiters, create StringRef and StringSearch only once
1993+
std::optional<StringRef> const_delimiter_ref;
1994+
std::optional<StringSearch> const_search;
1995+
if (delimiter_const && delimiter_col->get_data_at(0).size > 1) {
1996+
const_delimiter_ref.emplace(delimiter_col->get_data_at(0));
1997+
const_search.emplace(&const_delimiter_ref.value());
1998+
}
1999+
2000+
for (size_t i = 0; i < input_rows_count; ++i) {
2001+
auto str = str_col->get_data_at(i);
2002+
auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 : i);
2003+
int32_t delimiter_size = delimiter.size;
2004+
2005+
auto part_number = part_num_col->get_element(part_num_const ? 0 : i);
2006+
2007+
if (part_number == 0 || delimiter_size == 0) {
19892008
StringOP::push_empty_string(i, res_chars, res_offsets);
2009+
continue;
19902010
}
1991-
} else if (part_number > 0) {
1992-
if (delimiter_size == 1) {
1993-
// If delimiter is a char, use memchr to split
1994-
for (size_t i = 0; i < input_rows_count; ++i) {
1995-
auto str = str_col->get_data_at(i);
2011+
2012+
if (part_number > 0) {
2013+
if (delimiter_size == 1) {
19962014
int32_t offset = -1;
19972015
int32_t num = 0;
19982016
while (num < part_number) {
@@ -2018,18 +2036,23 @@ class FunctionSubstringIndex : public IFunction {
20182036
StringOP::push_value_string(std::string_view(str.data, str.size), i,
20192037
res_chars, res_offsets);
20202038
}
2021-
}
2022-
} else {
2023-
StringRef delimiter_ref(delimiter);
2024-
StringSearch search(&delimiter_ref);
2025-
for (size_t i = 0; i < input_rows_count; ++i) {
2026-
auto str = str_col->get_data_at(i);
2039+
} else {
2040+
// For multi-character delimiters
2041+
// Use pre-created StringRef and StringSearch for constant delimiters
2042+
StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value()
2043+
: StringRef(delimiter);
2044+
const StringSearch* search_ptr = const_search ? &const_search.value() : nullptr;
2045+
StringSearch local_search(&delimiter_ref);
2046+
if (!search_ptr) {
2047+
search_ptr = &local_search;
2048+
}
2049+
20272050
int32_t offset = -delimiter_size;
20282051
int32_t num = 0;
20292052
while (num < part_number) {
20302053
size_t n = str.size - offset - delimiter_size;
20312054
// search for the first match of delimiter_ref in the src string, from str_offset to the end
2032-
const char* pos = search.search(str.data + offset + delimiter_size, n);
2055+
const char* pos = search_ptr->search(str.data + offset + delimiter_size, n);
20332056
if (pos < str.data + str.size) {
20342057
offset = pos - str.data;
20352058
num++;
@@ -2050,21 +2073,25 @@ class FunctionSubstringIndex : public IFunction {
20502073
res_chars, res_offsets);
20512074
}
20522075
}
2053-
}
2054-
} else {
2055-
// if part_number is negative
2056-
part_number = -part_number;
2057-
for (size_t i = 0; i < input_rows_count; ++i) {
2058-
auto str = str_col->get_data_at(i);
2076+
} else {
2077+
int neg_part_number = -part_number;
20592078
auto str_str = str.to_string();
20602079
int32_t offset = str.size;
20612080
int32_t pre_offset = offset;
20622081
int32_t num = 0;
20632082
auto substr = str_str;
2064-
while (num <= part_number && offset >= 0) {
2065-
offset = (int)substr.rfind(delimiter, offset);
2083+
2084+
// Use pre-created StringRef for constant delimiters
2085+
StringRef delimiter_str =
2086+
const_delimiter_ref
2087+
? const_delimiter_ref.value()
2088+
: StringRef(reinterpret_cast<const char*>(delimiter.data),
2089+
delimiter.size);
2090+
2091+
while (num <= neg_part_number && offset >= 0) {
2092+
offset = (int)substr.rfind(delimiter_str, offset);
20662093
if (offset != -1) {
2067-
if (++num == part_number) {
2094+
if (++num == neg_part_number) {
20682095
break;
20692096
}
20702097
pre_offset = offset;
@@ -2076,7 +2103,7 @@ class FunctionSubstringIndex : public IFunction {
20762103
}
20772104
num = (offset == -1 && num != 0) ? num + 1 : num;
20782105

2079-
if (num == part_number) {
2106+
if (num == neg_part_number) {
20802107
if (offset == -1) {
20812108
StringOP::push_value_string(std::string_view(str.data, str.size), i,
20822109
res_chars, res_offsets);

fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
package org.apache.doris.nereids.trees.expressions.functions.scalar;
1919

2020
import org.apache.doris.catalog.FunctionSignature;
21-
import org.apache.doris.nereids.exceptions.AnalysisException;
2221
import org.apache.doris.nereids.trees.expressions.Expression;
2322
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
2423
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
@@ -53,16 +52,6 @@ public SubstringIndex(Expression arg0, Expression arg1, Expression arg2) {
5352
super("substring_index", arg0, arg1, arg2);
5453
}
5554

56-
@Override
57-
public void checkLegalityBeforeTypeCoercion() {
58-
for (int i = 1; i < children.size(); ++i) {
59-
if (!getArgument(i).isConstant()) {
60-
throw new AnalysisException(getName()
61-
+ " function except for the first argument, other parameter must be a constant.");
62-
}
63-
}
64-
}
65-
6655
/**
6756
* withChildren.
6857
*/
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !sql --
3+
1 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB -1 _02|CCC_03|DDD_04|EEE_05|FFF_06
4+
2 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 ccc -1 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06
5+
3 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 DDD -1 _04|EEE_05|FFF_06
6+
4 sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 DDD -1 _04|rfv_05|rgb_06
7+
5 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 eee -1 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06
8+
6 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 A_01 -1 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06
9+
7 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB 1 AAA_01|
10+
8 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | 2 AAA_01|BBB_02
11+
9 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | -2 EEE_05|FFF_06
12+
10 ABC | 1 ABC
13+
11 ABC|DEF | 0
14+
12 ABC 1
15+
13 ABC|DEF|GHI 1
16+
17+
-- !sql --
18+
101 北京市|上海市|广州市|深圳市|成都市 | 2 北京市|上海市
19+
102 北京市|上海市|广州市|深圳市|成都市 | -2 深圳市|成都市
20+
103 北京市|上海市|广州市|深圳市|成都市 上海 -1 市|广州市|深圳市|成都市
21+
104 中国人民共和国 人民 1 中国
22+
105 中国人民共和国 人民 -1 共和国
23+
106 你好,世界!你好,朋友! 你好 1
24+
107 你好,世界!你好,朋友! 你好 -1 ,朋友!
25+
108 你好,世界!你好,朋友! 世界 -1 !你好,朋友!
26+
109 中文|测试|数据 测试 1 中文|
27+
110 中文|测试|数据 测试 -1 |数据
28+
29+
-- !sql --
30+
201 hello😀world😀example 😀 1 hello
31+
202 hello😀world😀example 😀 2 hello😀world
32+
203 hello😀world😀example 😀 -1 example
33+
204 👋👋hello👋world👋 👋 2 👋
34+
205 👋👋hello👋world👋 👋 -2 world👋
35+
36+
-- !sql --
37+
1 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB -1 _02|CCC_03|DDD_04|EEE_05|FFF_06
38+
3 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 DDD -1 _04|EEE_05|FFF_06
39+
7 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB 1 AAA_01|
40+
8 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | 2 AAA_01|BBB_02
41+
101 北京市|上海市|广州市|深圳市|成都市 | 2 北京市|上海市
42+
103 北京市|上海市|广州市|深圳市|成都市 上海 -1 市|广州市|深圳市|成都市
43+
201 hello😀world😀example 😀 1 hello
44+
203 hello😀world😀example 😀 -1 example
45+
46+
-- !sql --
47+
test|test test|test
48+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !sql --
3+
1 BBB AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _02|CCC_03|DDD_04|EEE_05|FFF_06
4+
2 ccc zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06
5+
3 DDD AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _04|EEE_05|FFF_06
6+
4 DDD sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 _04|rfv_05|rgb_06
7+
5 eee cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06
8+
6 A_01 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06
9+
10+
-- !sql --
11+
7 市 北京市|上海市|广州市|深圳市 北京
12+
8 人民 中华人民共和国 中华 共和国
13+
9 分隔符 中文分隔符测试分隔符数据 中文 数据
14+
10 你好 你好,世界!你好,朋友! ,朋友!
15+
16+
-- !sql --
17+
1 field1,field2,field3,field4 , 2 field1,field2
18+
2 field1,field2,field3,field4 , -1 field4
19+
3 AAA_01|BBB_02|CCC_03 | 2 AAA_01|BBB_02
20+
4 AAA_01|BBB_02|CCC_03 | -2 BBB_02|CCC_03
21+
5 中文分隔符测试分隔符数据 分隔符 1 中文
22+
6 中文分隔符测试分隔符数据 分隔符 -1 数据
23+
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !sql --
3+
AAA_01
4+
5+
-- !sql --
6+
CCC_03
7+
8+
-- !sql --
9+
AAA_01|BBB_02
10+
11+
-- !sql --
12+
BBB_02|CCC_03
13+
14+
-- !sql --
15+
AAA_01|BBB_02|CCC_03
16+
17+
-- !sql --
18+
AAA_01|BBB_02|CCC_03
19+
20+
-- !sql --
21+
22+
23+
-- !sql --
24+
25+
26+
-- !sql --
27+
28+
29+
-- !sql --
30+
AAA_01|BBB_02|CCC_03
31+
32+
-- !sql --
33+
AAA_01|BBB_02|CCC_03
34+
35+
-- !sql --
36+
AAA_01
37+
38+
-- !sql --
39+
CCC_03
40+
41+
-- !sql --
42+
_02|CCC_03
43+
44+
-- !sql --
45+
46+
47+
-- !sql --
48+
49+
50+
-- !sql --
51+
北京市|上海市
52+
53+
-- !sql --
54+
北京市
55+
56+
-- !sql --
57+
广州市
58+
59+
-- !sql --
60+
hello
61+
62+
-- !sql --
63+
example
64+
65+
-- !sql --
66+
AAA_01|BBB_02
67+
68+
-- !sql --
69+
AAA_01|BBB_02
70+
71+
-- !sql --
72+
AAA_01|BBB_02
73+
74+
-- !sql --
75+
AAA_01|BBB_02
76+
77+
-- !sql --
78+
中文_
79+
80+
-- !sql --
81+
_02|CCC_03 AAA_01|BBB_02|CCC_03
82+

0 commit comments

Comments
 (0)