Skip to content

Commit ce6e979

Browse files
authored
feat(query): add string function: split and split_part (#13303)
* feat(query): add string function: split and split_part
* add some test
1 parent 8908d27 commit ce6e979

File tree

5 files changed

+233
-0
lines changed

5 files changed

+233
-0
lines changed

src/query/functions/src/scalars/string.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use common_expression::types::number::SimpleDomain;
2222
use common_expression::types::number::UInt64Type;
2323
use common_expression::types::string::StringColumn;
2424
use common_expression::types::string::StringColumnBuilder;
25+
use common_expression::types::ArrayType;
2526
use common_expression::types::NumberType;
2627
use common_expression::types::StringType;
2728
use common_expression::vectorize_with_builder_1_arg;
@@ -757,6 +758,73 @@ pub fn register(registry: &mut FunctionRegistry) {
757758
}
758759
}),
759760
);
761+
762+
registry
763+
.register_passthrough_nullable_2_arg::<StringType, StringType, ArrayType<StringType>, _, _>(
764+
"split",
765+
|_, _, _| FunctionDomain::Full,
766+
vectorize_with_builder_2_arg::<StringType, StringType, ArrayType<StringType>>(
767+
|str, sep, output, ctx| match String::from_utf8(str.to_vec()) {
768+
Ok(s) => match String::from_utf8(sep.to_vec()) {
769+
Ok(sep) => {
770+
let res: Vec<&str> = s.split(&sep).collect();
771+
let len = res.len();
772+
let mut builder = StringColumnBuilder::with_capacity(len, len);
773+
for i in res {
774+
builder.put_slice(i.as_bytes());
775+
builder.commit_row();
776+
}
777+
let column = builder.build();
778+
output.builder.append_column(&column);
779+
output.commit_row()
780+
}
781+
Err(e) => {
782+
ctx.set_error(output.len(), e.to_string());
783+
output.commit_row();
784+
}
785+
},
786+
Err(e) => {
787+
ctx.set_error(output.len(), e.to_string());
788+
output.commit_row();
789+
}
790+
},
791+
),
792+
);
793+
794+
registry
795+
.register_passthrough_nullable_3_arg::<StringType, StringType, NumberType<i64>, StringType, _, _>(
796+
"split_part",
797+
|_, _, _, _| FunctionDomain::Full,
798+
vectorize_with_builder_3_arg::<StringType, StringType, NumberType<i64>, StringType>(
799+
|str, sep, part, output, ctx| match String::from_utf8(str.to_vec()) {
800+
Ok(s) => match String::from_utf8(sep.to_vec()) {
801+
Ok(sep) => {
802+
let split: Vec<&str> = s.split(&sep).collect();
803+
let len = split.len();
804+
if part <= len as i64 && part >= -(len as i64) {
805+
let idx = match part.cmp(&(0i64)) {
806+
Ordering::Greater => (part-1) as usize,
807+
Ordering::Less => (len as i64 + part) as usize,
808+
Ordering::Equal => 0
809+
};
810+
let res = split[idx];
811+
output.put_slice(res.as_bytes());
812+
813+
}
814+
output.commit_row();
815+
}
816+
Err(e) => {
817+
ctx.set_error(output.len(), e.to_string());
818+
output.commit_row();
819+
}
820+
},
821+
Err(e) => {
822+
ctx.set_error(output.len(), e.to_string());
823+
output.commit_row();
824+
}
825+
},
826+
),
827+
)
760828
}
761829

762830
pub(crate) mod soundex {

src/query/functions/tests/it/scalars/string.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ fn test_string() {
6161
test_left(file);
6262
test_right(file);
6363
test_substr(file);
64+
test_split(file)
6465
}
6566

6667
fn test_upper(file: &mut impl Write) {
@@ -679,3 +680,24 @@ fn test_substr(file: &mut impl Write) {
679680
),
680681
]);
681682
}
683+
684+
fn test_split(file: &mut impl Write) {
685+
run_ast(file, "split('Sakila', 'il')", &[]);
686+
run_ast(file, "split('sakila', 'a')", &[]);
687+
run_ast(file, "split('abc','b')", &[]);
688+
run_ast(file, "split(str, sep)", &[
689+
(
690+
"str",
691+
StringType::from_data_with_validity(
692+
&["127.0.0.1", "aaa--bbb-BBB--ccc", "cc", "aeeceedeef"],
693+
vec![false, true, true, true],
694+
),
695+
),
696+
(
697+
"sep",
698+
StringType::from_data_with_validity(&[".", "--", "cc", "ee"], vec![
699+
false, true, true, true,
700+
]),
701+
),
702+
]);
703+
}

src/query/functions/tests/it/scalars/testdata/function_list.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3080,6 +3080,10 @@ Functions overloads:
30803080
1 soundex(String NULL) :: String NULL
30813081
0 space(UInt64) :: String
30823082
1 space(UInt64 NULL) :: String NULL
3083+
0 split(String, String) :: Array(String)
3084+
1 split(String NULL, String NULL) :: Array(String) NULL
3085+
0 split_part(String, String, Int64) :: String
3086+
1 split_part(String NULL, String NULL, Int64 NULL) :: String NULL
30833087
0 sqrt(UInt8) :: Float64
30843088
1 sqrt(UInt8 NULL) :: Float64 NULL
30853089
2 sqrt(UInt16) :: Float64

src/query/functions/tests/it/scalars/testdata/string.txt

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3282,3 +3282,54 @@ evaluation (internal):
32823282
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
32833283

32843284

3285+
ast : split('Sakila', 'il')
3286+
raw expr : split('Sakila', 'il')
3287+
checked expr : split<String, String>("Sakila", "il")
3288+
optimized expr : ['Sak', 'a']
3289+
output type : Array(String)
3290+
output domain : [{"Sak"..="a"}]
3291+
output : ['Sak', 'a']
3292+
3293+
3294+
ast : split('sakila', 'a')
3295+
raw expr : split('sakila', 'a')
3296+
checked expr : split<String, String>("sakila", "a")
3297+
optimized expr : ['s', 'kil', '']
3298+
output type : Array(String)
3299+
output domain : [{""..="s"}]
3300+
output : ['s', 'kil', '']
3301+
3302+
3303+
ast : split('abc','b')
3304+
raw expr : split('abc', 'b')
3305+
checked expr : split<String, String>("abc", "b")
3306+
optimized expr : ['a', 'c']
3307+
output type : Array(String)
3308+
output domain : [{"a"..="c"}]
3309+
output : ['a', 'c']
3310+
3311+
3312+
ast : split(str, sep)
3313+
raw expr : split(str::String NULL, sep::String NULL)
3314+
checked expr : split<String NULL, String NULL>(str, sep)
3315+
evaluation:
3316+
+--------+-------------------------------+------------------------+---------------------------+
3317+
| | str | sep | Output |
3318+
+--------+-------------------------------+------------------------+---------------------------+
3319+
| Type | String NULL | String NULL | Array(String) NULL |
3320+
| Domain | {"127.0.0.1"..="cc"} ∪ {NULL} | {"--"..="ee"} ∪ {NULL} | [{""..}] ∪ {NULL} |
3321+
| Row 0 | NULL | NULL | NULL |
3322+
| Row 1 | 'aaa--bbb-BBB--ccc' | '--' | ['aaa', 'bbb-BBB', 'ccc'] |
3323+
| Row 2 | 'cc' | 'cc' | ['', ''] |
3324+
| Row 3 | 'aeeceedeef' | 'ee' | ['a', 'c', 'd', 'f'] |
3325+
+--------+-------------------------------+------------------------+---------------------------+
3326+
evaluation (internal):
3327+
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
3328+
| Column | Data |
3329+
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
3330+
| str | NullableColumn { column: StringColumn { data: 0x3132372e302e302e316161612d2d6262622d4242422d2d636363636361656563656564656566, offsets: [0, 9, 26, 28, 38] }, validity: [0b____1110] } |
3331+
| sep | NullableColumn { column: StringColumn { data: 0x2e2d2d63636565, offsets: [0, 1, 3, 5, 7] }, validity: [0b____1110] } |
3332+
| Output | NullableColumn { column: ArrayColumn { values: StringColumn { data: 0x3132373030316161616262622d42424263636361636466, offsets: [0, 3, 4, 5, 6, 9, 16, 19, 19, 19, 20, 21, 22, 23] }, offsets: [0, 4, 7, 9, 13] }, validity: [0b____1110] } |
3333+
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
3334+
3335+
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
query T
2+
select split('127.0.0.1', '.');
3+
----
4+
['127','0','0','1']
5+
6+
query T
7+
select split('|', '|');
8+
----
9+
['','']
10+
11+
query T
12+
select split('ab', '');
13+
----
14+
['','a','b','']
15+
16+
query T
17+
select split_part('ab', '', 1);
18+
----
19+
(empty)
20+
21+
query T
22+
select split_part('ab', '', 2);
23+
----
24+
a
25+
26+
query T
27+
select split_part('|', '|', 1);
28+
----
29+
(empty)
30+
31+
query T
32+
select split_part(null, null, 1);
33+
----
34+
NULL
35+
36+
query T
37+
select split(null, null);
38+
----
39+
NULL
40+
41+
42+
query TT
43+
select * from
44+
(select 0, split_part('11.22.33', '.', 0) UNION
45+
select 1, split_part('11.22.33', '.', 1) UNION
46+
select 2, split_part('11.22.33', '.', 2) UNION
47+
select 3, split_part('11.22.33', '.', 3) UNION
48+
select 4, split_part('11.22.33', '.', 4) UNION
49+
select -1, split_part('11.22.33', '.', -1) UNION
50+
select -2, split_part('11.22.33', '.', -2) UNION
51+
select -3, split_part('11.22.33', '.', -3) UNION
52+
select -4, split_part('11.22.33', '.', -4)) order by `0`;
53+
----
54+
-4 (empty)
55+
-3 11
56+
-2 22
57+
-1 33
58+
0 11
59+
1 11
60+
2 22
61+
3 33
62+
4 (empty)
63+
64+
statement ok
65+
drop table if exists t;
66+
67+
statement ok
68+
create table t(c1 string);
69+
70+
statement ok
71+
insert into t values('127.0.0.1'), ('127.0.0.2'), ('192.168.1.3.2222')
72+
73+
query T
74+
select split(c1, '.') from t order by c1
75+
----
76+
['127','0','0','1']
77+
['127','0','0','2']
78+
['192','168','1','3','2222']
79+
80+
query T
81+
select split_part(c1, '.', -5), split_part(c1, '.', -4), split_part(c1, '.', 0), split_part(c1, '.', 1), split_part(c1, '.', 4), split_part(c1, '.', 5) from t order by c1
82+
----
83+
(empty) 127 127 127 1 (empty)
84+
(empty) 127 127 127 2 (empty)
85+
192 168 192 192 3 2222
86+
87+
statement ok
88+
drop table t;

0 commit comments

Comments
 (0)