@@ -1975,24 +1975,42 @@ class FunctionSubstringIndex : public IFunction {
1975
1975
1976
1976
const auto * str_col = assert_cast<const ColumnString*>(content_column.get ());
1977
1977
1978
- [[maybe_unused]] const auto & [delimiter_col, delimiter_const] =
1978
+ // Handle both constant and non-constant delimiter parameters
1979
+ ColumnPtr delimiter_column_ptr;
1980
+ bool delimiter_const = false ;
1981
+ std::tie (delimiter_column_ptr, delimiter_const) =
1979
1982
unpack_if_const (block.get_by_position (arguments[1 ]).column );
1980
- auto delimiter = delimiter_col->get_data_at (0 );
1981
- int32_t delimiter_size = delimiter.size ;
1983
+ const auto * delimiter_col = assert_cast<const ColumnString*>(delimiter_column_ptr.get ());
1982
1984
1983
- [[maybe_unused]] const auto & [part_num_col, part_const] =
1985
+ ColumnPtr part_num_column_ptr;
1986
+ bool part_num_const = false ;
1987
+ std::tie (part_num_column_ptr, part_num_const) =
1984
1988
unpack_if_const (block.get_by_position (arguments[2 ]).column );
1985
- auto part_number = *((int *)part_num_col->get_data_at (0 ).data );
1989
+ const ColumnVector<Int32>* part_num_col =
1990
+ assert_cast<const ColumnVector<Int32>*>(part_num_column_ptr.get ());
1986
1991
1987
- if (part_number == 0 || delimiter_size == 0 ) {
1988
- for (size_t i = 0 ; i < input_rows_count; ++i) {
1992
+ // For constant multi-character delimiters, create StringRef and StringSearch only once
1993
+ std::optional<StringRef> const_delimiter_ref;
1994
+ std::optional<StringSearch> const_search;
1995
+ if (delimiter_const && delimiter_col->get_data_at (0 ).size > 1 ) {
1996
+ const_delimiter_ref.emplace (delimiter_col->get_data_at (0 ));
1997
+ const_search.emplace (&const_delimiter_ref.value ());
1998
+ }
1999
+
2000
+ for (size_t i = 0 ; i < input_rows_count; ++i) {
2001
+ auto str = str_col->get_data_at (i);
2002
+ auto delimiter = delimiter_col->get_data_at (delimiter_const ? 0 : i);
2003
+ int32_t delimiter_size = delimiter.size ;
2004
+
2005
+ auto part_number = part_num_col->get_element (part_num_const ? 0 : i);
2006
+
2007
+ if (part_number == 0 || delimiter_size == 0 ) {
1989
2008
StringOP::push_empty_string (i, res_chars, res_offsets);
2009
+ continue ;
1990
2010
}
1991
- } else if (part_number > 0 ) {
1992
- if (delimiter_size == 1 ) {
1993
- // If delimiter is a char, use memchr to split
1994
- for (size_t i = 0 ; i < input_rows_count; ++i) {
1995
- auto str = str_col->get_data_at (i);
2011
+
2012
+ if (part_number > 0 ) {
2013
+ if (delimiter_size == 1 ) {
1996
2014
int32_t offset = -1 ;
1997
2015
int32_t num = 0 ;
1998
2016
while (num < part_number) {
@@ -2018,18 +2036,23 @@ class FunctionSubstringIndex : public IFunction {
2018
2036
StringOP::push_value_string (std::string_view (str.data , str.size ), i,
2019
2037
res_chars, res_offsets);
2020
2038
}
2021
- }
2022
- } else {
2023
- StringRef delimiter_ref (delimiter);
2024
- StringSearch search (&delimiter_ref);
2025
- for (size_t i = 0 ; i < input_rows_count; ++i) {
2026
- auto str = str_col->get_data_at (i);
2039
+ } else {
2040
+ // For multi-character delimiters
2041
+ // Use pre-created StringRef and StringSearch for constant delimiters
2042
+ StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value ()
2043
+ : StringRef (delimiter);
2044
+ const StringSearch* search_ptr = const_search ? &const_search.value () : nullptr ;
2045
+ StringSearch local_search (&delimiter_ref);
2046
+ if (!search_ptr) {
2047
+ search_ptr = &local_search;
2048
+ }
2049
+
2027
2050
int32_t offset = -delimiter_size;
2028
2051
int32_t num = 0 ;
2029
2052
while (num < part_number) {
2030
2053
size_t n = str.size - offset - delimiter_size;
2031
2054
// search for the first match of delimiter_ref in the src string, from str_offset to the end
2032
- const char * pos = search. search (str.data + offset + delimiter_size, n);
2055
+ const char * pos = search_ptr-> search (str.data + offset + delimiter_size, n);
2033
2056
if (pos < str.data + str.size ) {
2034
2057
offset = pos - str.data ;
2035
2058
num++;
@@ -2050,21 +2073,25 @@ class FunctionSubstringIndex : public IFunction {
2050
2073
res_chars, res_offsets);
2051
2074
}
2052
2075
}
2053
- }
2054
- } else {
2055
- // if part_number is negative
2056
- part_number = -part_number;
2057
- for (size_t i = 0 ; i < input_rows_count; ++i) {
2058
- auto str = str_col->get_data_at (i);
2076
+ } else {
2077
+ int neg_part_number = -part_number;
2059
2078
auto str_str = str.to_string ();
2060
2079
int32_t offset = str.size ;
2061
2080
int32_t pre_offset = offset;
2062
2081
int32_t num = 0 ;
2063
2082
auto substr = str_str;
2064
- while (num <= part_number && offset >= 0 ) {
2065
- offset = (int )substr.rfind (delimiter, offset);
2083
+
2084
+ // Use pre-created StringRef for constant delimiters
2085
+ StringRef delimiter_str =
2086
+ const_delimiter_ref
2087
+ ? const_delimiter_ref.value ()
2088
+ : StringRef (reinterpret_cast <const char *>(delimiter.data ),
2089
+ delimiter.size );
2090
+
2091
+ while (num <= neg_part_number && offset >= 0 ) {
2092
+ offset = (int )substr.rfind (delimiter_str, offset);
2066
2093
if (offset != -1 ) {
2067
- if (++num == part_number ) {
2094
+ if (++num == neg_part_number ) {
2068
2095
break ;
2069
2096
}
2070
2097
pre_offset = offset;
@@ -2076,7 +2103,7 @@ class FunctionSubstringIndex : public IFunction {
2076
2103
}
2077
2104
num = (offset == -1 && num != 0 ) ? num + 1 : num;
2078
2105
2079
- if (num == part_number ) {
2106
+ if (num == neg_part_number ) {
2080
2107
if (offset == -1 ) {
2081
2108
StringOP::push_value_string (std::string_view (str.data , str.size ), i,
2082
2109
res_chars, res_offsets);
0 commit comments