Skip to content

Commit 2f64773

Browse files
authored
chore(query): support domain contains in string type (#15023)
* feat(query): support domain contains in string type
* feat(query): support domain contains in string type
* add random tests
1 parent 9b47de7 commit 2f64773

File tree

4 files changed

+113
-8
lines changed

4 files changed

+113
-8
lines changed

src/query/expression/src/property.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,7 @@ pub trait SimpleDomainCmp {
409409
fn domain_gte(&self, other: &Self) -> FunctionDomain<BooleanType>;
410410
fn domain_lt(&self, other: &Self) -> FunctionDomain<BooleanType>;
411411
fn domain_lte(&self, other: &Self) -> FunctionDomain<BooleanType>;
412+
fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType>;
412413
}
413414

414415
const ALL_TRUE_DOMAIN: BooleanDomain = BooleanDomain {
@@ -477,6 +478,14 @@ impl<T: Ord + PartialOrd> SimpleDomainCmp for SimpleDomain<T> {
477478
FunctionDomain::Full
478479
}
479480
}
481+
482+
fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType> {
483+
if self.min > other.max || self.max < other.min {
484+
FunctionDomain::Domain(ALL_FALSE_DOMAIN)
485+
} else {
486+
FunctionDomain::Full
487+
}
488+
}
480489
}
481490

482491
impl SimpleDomainCmp for StringDomain {
@@ -509,6 +518,11 @@ impl SimpleDomainCmp for StringDomain {
509518
let (d1, d2) = unify_string(self, other);
510519
d1.domain_lte(&d2)
511520
}
521+
522+
fn domain_contains(&self, other: &Self) -> FunctionDomain<BooleanType> {
523+
let (d1, d2) = unify_string(self, other);
524+
d1.domain_contains(&d2)
525+
}
512526
}
513527

514528
pub fn unify_string(

src/query/functions/src/scalars/array.rs

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ use databend_common_expression::FunctionRegistry;
5757
use databend_common_expression::FunctionSignature;
5858
use databend_common_expression::Scalar;
5959
use databend_common_expression::ScalarRef;
60+
use databend_common_expression::SimpleDomainCmp;
6061
use databend_common_expression::SortColumnDescription;
6162
use databend_common_expression::Value;
6263
use databend_common_expression::ValueRef;
@@ -481,11 +482,9 @@ pub fn register(registry: &mut FunctionRegistry) {
481482
registry.register_passthrough_nullable_2_arg::<ArrayType<NumberType<NUM_TYPE>>, NumberType<NUM_TYPE>, BooleanType, _, _>(
482483
"contains",
483484
|_, lhs, rhs| {
484-
let has_true = lhs.is_some_and(|lhs| !(lhs.min > rhs.max || lhs.max < rhs.min));
485-
FunctionDomain::Domain(BooleanDomain {
486-
has_false: true,
487-
has_true,
488-
})
485+
lhs.as_ref().map(|lhs| {
486+
lhs.domain_contains(rhs)
487+
}).unwrap_or(FunctionDomain::Full)
489488
},
490489
|lhs, rhs, _| eval_contains::<NumberType<NUM_TYPE>>(lhs, rhs)
491490
);
@@ -495,9 +494,11 @@ pub fn register(registry: &mut FunctionRegistry) {
495494

496495
registry.register_passthrough_nullable_2_arg::<ArrayType<StringType>, StringType, BooleanType, _, _>(
497496
"contains",
498-
|_, _, _| {
499-
FunctionDomain::Full
500-
},
497+
|_, lhs, rhs| {
498+
lhs.as_ref().map(|lhs| {
499+
lhs.domain_contains(rhs)
500+
}).unwrap_or(FunctionDomain::Full)
501+
},
501502
|lhs, rhs, _| {
502503
match lhs {
503504
ValueRef::Scalar(array) => {

src/query/functions/tests/it/scalars/array.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ fn test_contains(file: &mut impl Write) {
155155

156156
let columns = [
157157
("int8_col", Int8Type::from_data(vec![1i8, 2, 7, 8])),
158+
(
159+
"string_col",
160+
StringType::from_data(vec![r#"1"#, r#"2"#, r#"5"#, r#"1234"#]),
161+
),
158162
(
159163
"nullable_col",
160164
Int64Type::from_data_with_validity(vec![9i64, 10, 11, 12], vec![
@@ -164,6 +168,20 @@ fn test_contains(file: &mut impl Write) {
164168
];
165169

166170
run_ast(file, "int8_col not in (1, 2, 3, 4, 5, null)", &columns);
171+
run_ast(
172+
file,
173+
"contains(['5000', '6000', '7000'], string_col)",
174+
&columns,
175+
);
176+
177+
run_ast(file, "contains(['1', '5'], string_col)", &columns);
178+
179+
run_ast(
180+
file,
181+
"contains(['15000', '6000', '7000'], string_col)",
182+
&columns,
183+
);
184+
167185
run_ast(file, "contains([1,2,null], nullable_col)", &columns);
168186
run_ast(
169187
file,

src/query/functions/tests/it/scalars/testdata/array.txt

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,78 @@ evaluation (internal):
439439
+----------+-----------------------+
440440

441441

442+
ast : contains(['5000', '6000', '7000'], string_col)
443+
raw expr : contains(array('5000', '6000', '7000'), string_col::String)
444+
checked expr : contains<Array(String), String>(array<T0=String><T0, T0, T0>("5000", "6000", "7000"), string_col)
445+
optimized expr : false
446+
evaluation:
447+
+--------+-------------+---------+
448+
| | string_col | Output |
449+
+--------+-------------+---------+
450+
| Type | String | Boolean |
451+
| Domain | {"1"..="5"} | {FALSE} |
452+
| Row 0 | '1' | false |
453+
| Row 1 | '2' | false |
454+
| Row 2 | '5' | false |
455+
| Row 3 | '1234' | false |
456+
+--------+-------------+---------+
457+
evaluation (internal):
458+
+------------+-------------------------------------------------------------------+
459+
| Column | Data |
460+
+------------+-------------------------------------------------------------------+
461+
| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
462+
| Output | Boolean([0b____0000]) |
463+
+------------+-------------------------------------------------------------------+
464+
465+
466+
ast : contains(['1', '5'], string_col)
467+
raw expr : contains(array('1', '5'), string_col::String)
468+
checked expr : contains<Array(String), String>(array<T0=String><T0, T0>("1", "5"), string_col)
469+
optimized expr : contains<Array(String), String>(['1', '5'], string_col)
470+
evaluation:
471+
+--------+-------------+---------------+
472+
| | string_col | Output |
473+
+--------+-------------+---------------+
474+
| Type | String | Boolean |
475+
| Domain | {"1"..="5"} | {FALSE, TRUE} |
476+
| Row 0 | '1' | true |
477+
| Row 1 | '2' | false |
478+
| Row 2 | '5' | true |
479+
| Row 3 | '1234' | false |
480+
+--------+-------------+---------------+
481+
evaluation (internal):
482+
+------------+-------------------------------------------------------------------+
483+
| Column | Data |
484+
+------------+-------------------------------------------------------------------+
485+
| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
486+
| Output | Boolean([0b____0101]) |
487+
+------------+-------------------------------------------------------------------+
488+
489+
490+
ast : contains(['15000', '6000', '7000'], string_col)
491+
raw expr : contains(array('15000', '6000', '7000'), string_col::String)
492+
checked expr : contains<Array(String), String>(array<T0=String><T0, T0, T0>("15000", "6000", "7000"), string_col)
493+
optimized expr : contains<Array(String), String>(['15000', '6000', '7000'], string_col)
494+
evaluation:
495+
+--------+-------------+---------------+
496+
| | string_col | Output |
497+
+--------+-------------+---------------+
498+
| Type | String | Boolean |
499+
| Domain | {"1"..="5"} | {FALSE, TRUE} |
500+
| Row 0 | '1' | false |
501+
| Row 1 | '2' | false |
502+
| Row 2 | '5' | false |
503+
| Row 3 | '1234' | false |
504+
+--------+-------------+---------------+
505+
evaluation (internal):
506+
+------------+-------------------------------------------------------------------+
507+
| Column | Data |
508+
+------------+-------------------------------------------------------------------+
509+
| string_col | StringColumn { data: 0x31323531323334, offsets: [0, 1, 2, 3, 7] } |
510+
| Output | Boolean([0b____0000]) |
511+
+------------+-------------------------------------------------------------------+
512+
513+
442514
ast : contains([1,2,null], nullable_col)
443515
raw expr : contains(array(1, 2, NULL), nullable_col::Int64 NULL)
444516
checked expr : contains<T0=Int64 NULL><Array(T0), T0>(CAST(array<T0=UInt8 NULL><T0, T0, T0>(CAST(1_u8 AS UInt8 NULL), CAST(2_u8 AS UInt8 NULL), CAST(NULL AS UInt8 NULL)) AS Array(Int64 NULL)), nullable_col)

0 commit comments

Comments
 (0)