14
14
15
15
use std:: collections:: BTreeMap ;
16
16
use std:: collections:: HashMap ;
17
+ use std:: hash:: DefaultHasher ;
17
18
use std:: hash:: Hasher ;
18
19
use std:: ops:: ControlFlow ;
19
20
use std:: ops:: Deref ;
@@ -35,12 +36,18 @@ use databend_common_expression::types::BinaryType;
35
36
use databend_common_expression:: types:: Bitmap ;
36
37
use databend_common_expression:: types:: Buffer ;
37
38
use databend_common_expression:: types:: DataType ;
39
+ use databend_common_expression:: types:: DateType ;
38
40
use databend_common_expression:: types:: MapType ;
39
41
use databend_common_expression:: types:: NullableType ;
40
42
use databend_common_expression:: types:: Number ;
41
43
use databend_common_expression:: types:: NumberDataType ;
44
+ use databend_common_expression:: types:: NumberType ;
45
+ use databend_common_expression:: types:: StringType ;
46
+ use databend_common_expression:: types:: TimestampType ;
42
47
use databend_common_expression:: types:: UInt64Type ;
48
+ use databend_common_expression:: types:: ValueType ;
43
49
use databend_common_expression:: visit_expr;
50
+ use databend_common_expression:: with_number_mapped_type;
44
51
use databend_common_expression:: BlockEntry ;
45
52
use databend_common_expression:: Column ;
46
53
use databend_common_expression:: ColumnBuilder ;
@@ -349,6 +356,71 @@ impl BloomIndex {
349
356
Ok ( column)
350
357
}
351
358
359
+ pub fn calculate_digest_by_type ( data_type : & DataType , column : & Column ) -> Result < Vec < u64 > > {
360
+ let inner_type = data_type. remove_nullable ( ) ;
361
+ with_number_mapped_type ! ( |NUM_TYPE | match inner_type {
362
+ DataType :: Number ( NumberDataType :: NUM_TYPE ) => {
363
+ Self :: calculate_nullable_column_digests:: <NumberType <NUM_TYPE >>( column)
364
+ }
365
+ DataType :: String => {
366
+ Self :: calculate_nullable_column_digests:: <StringType >( column)
367
+ }
368
+ DataType :: Date => {
369
+ Self :: calculate_nullable_column_digests:: <DateType >( column)
370
+ }
371
+ DataType :: Timestamp => {
372
+ Self :: calculate_nullable_column_digests:: <TimestampType >( column)
373
+ }
374
+ _ => Err ( ErrorCode :: Internal ( format!(
375
+ "Unsupported data type: {:?}" ,
376
+ data_type
377
+ ) ) ) ,
378
+ } )
379
+ }
380
+
381
+ #[ inline( always) ]
382
+ fn hash_one < T : DFHash > ( v : & T ) -> u64 {
383
+ let mut hasher = DefaultHasher :: default ( ) ;
384
+ DFHash :: hash ( v, & mut hasher) ;
385
+ hasher. finish ( )
386
+ }
387
+
388
+ fn calculate_nullable_column_digests < T : ValueType > ( column : & Column ) -> Result < Vec < u64 > >
389
+ where for < ' a > T :: ScalarRef < ' a > : DFHash {
390
+ let ( column, validity) = if let Column :: Nullable ( box inner) = column {
391
+ let validity = if inner. validity . null_count ( ) == 0 {
392
+ None
393
+ } else {
394
+ Some ( & inner. validity )
395
+ } ;
396
+ ( & inner. column , validity)
397
+ } else {
398
+ ( column, None )
399
+ } ;
400
+
401
+ let capacity = validity. map_or ( column. len ( ) , |v| v. true_count ( ) + 1 ) ;
402
+ let mut result = Vec :: with_capacity ( capacity) ;
403
+ if validity. is_some ( ) {
404
+ result. push ( 0 ) ;
405
+ }
406
+ let column = T :: try_downcast_column ( column) . unwrap ( ) ;
407
+ if let Some ( validity) = validity {
408
+ let column_iter = T :: iter_column ( & column) ;
409
+ let value_iter = column_iter
410
+ . zip ( validity. iter ( ) )
411
+ . filter ( |( _, v) | * v)
412
+ . map ( |( v, _) | v) ;
413
+ for value in value_iter {
414
+ result. push ( Self :: hash_one ( & value) ) ;
415
+ }
416
+ } else {
417
+ for value in T :: iter_column ( & column) {
418
+ result. push ( Self :: hash_one ( & value) ) ;
419
+ }
420
+ }
421
+ Ok ( result)
422
+ }
423
+
352
424
/// calculate digest for column that may have null values
353
425
///
354
426
/// returns (column, validity) where column is the digest of the column
@@ -734,24 +806,8 @@ impl BloomIndexBuilder {
734
806
}
735
807
} ;
736
808
737
- let ( column, validity) =
738
- BloomIndex :: calculate_nullable_column_digest ( & self . func_ctx , & column, & data_type) ?;
739
- // create filter per column
740
- if validity. as_ref ( ) . map ( |v| v. null_count ( ) ) . unwrap_or ( 0 ) > 0 {
741
- let validity = validity. unwrap ( ) ;
742
- let it = column. deref ( ) . iter ( ) . zip ( validity. iter ( ) ) . map (
743
- |( v, b) | {
744
- if !b {
745
- & 0
746
- } else {
747
- v
748
- }
749
- } ,
750
- ) ;
751
- index_column. builder . add_digests ( it) ;
752
- } else {
753
- index_column. builder . add_digests ( column. deref ( ) ) ;
754
- }
809
+ let column = BloomIndex :: calculate_digest_by_type ( & data_type, & column) ?;
810
+ index_column. builder . add_digests ( column. deref ( ) ) ;
755
811
}
756
812
for index_column in self . ngram_columns . iter_mut ( ) {
757
813
let field_type = & block. data_type ( index_column. index ) ;
0 commit comments