13
13
// limitations under the License.
14
14
15
15
use std:: collections:: HashMap ;
16
+ use std:: hash:: Hash ;
17
+ use std:: marker:: PhantomData ;
16
18
17
19
use databend_common_exception:: Result ;
18
- use databend_common_expression:: types:: AccessType ;
20
+ use databend_common_expression:: types:: boolean :: TrueIdxIter ;
19
21
use databend_common_expression:: types:: DataType ;
20
22
use databend_common_expression:: types:: DateType ;
21
- use databend_common_expression:: types:: DecimalColumn ;
22
- use databend_common_expression:: types:: DecimalScalar ;
23
+ use databend_common_expression:: types:: Decimal128Type ;
24
+ use databend_common_expression:: types:: Decimal256Type ;
23
25
use databend_common_expression:: types:: NumberDataType ;
24
26
use databend_common_expression:: types:: NumberType ;
25
27
use databend_common_expression:: types:: StringType ;
26
28
use databend_common_expression:: types:: TimestampType ;
29
+ use databend_common_expression:: types:: ValueType ;
27
30
use databend_common_expression:: with_number_mapped_type;
28
31
use databend_common_expression:: Column ;
29
32
use databend_common_expression:: ColumnId ;
@@ -32,6 +35,7 @@ use databend_common_expression::Scalar;
32
35
use databend_common_expression:: ScalarRef ;
33
36
use databend_common_expression:: TableSchemaRef ;
34
37
use databend_common_expression:: Value ;
38
+ use databend_common_expression:: SELECTIVITY_THRESHOLD ;
35
39
use databend_common_functions:: aggregates:: eval_aggr;
36
40
use databend_storages_common_table_meta:: meta:: ColumnDistinctHLL ;
37
41
use databend_storages_common_table_meta:: meta:: ColumnStatistics ;
@@ -43,19 +47,19 @@ use crate::statistics::Trim;
43
47
44
48
pub struct ColumnStatisticsState {
45
49
col_stats : HashMap < ColumnId , Vec < ColumnStatistics > > ,
46
- distinct_columns : HashMap < ColumnId , ColumnDistinctHLL > ,
50
+ distinct_columns : HashMap < ColumnId , Box < dyn ColumnNDVEstimator > > ,
47
51
}
48
52
49
53
impl ColumnStatisticsState {
50
- pub fn new ( stats_columns : & [ ColumnId ] , distinct_columns : & [ ColumnId ] ) -> Self {
54
+ pub fn new ( stats_columns : & [ ColumnId ] , distinct_columns : & [ ( ColumnId , DataType ) ] ) -> Self {
51
55
let col_stats = stats_columns
52
56
. iter ( )
53
57
. map ( |& col_id| ( col_id, Vec :: new ( ) ) )
54
58
. collect ( ) ;
55
59
56
60
let distinct_columns = distinct_columns
57
61
. iter ( )
58
- . map ( |& col_id| ( col_id, ColumnDistinctHLL :: default ( ) ) )
62
+ . map ( |( col_id, data_type ) | ( * col_id, create_estimator ( data_type ) ) )
59
63
. collect ( ) ;
60
64
61
65
Self {
@@ -80,8 +84,8 @@ impl ColumnStatisticsState {
80
84
in_memory_size as u64 ,
81
85
None ,
82
86
) ;
83
- if let Some ( hll ) = self . distinct_columns . get_mut ( & column_id) {
84
- scalar_update_hll_cardinality ( & s. as_ref ( ) , & data_type , hll ) ;
87
+ if let Some ( estimator ) = self . distinct_columns . get_mut ( & column_id) {
88
+ estimator . update_scalar ( & s. as_ref ( ) ) ;
85
89
}
86
90
self . col_stats . get_mut ( & column_id) . unwrap ( ) . push ( col_stats) ;
87
91
}
@@ -128,8 +132,8 @@ impl ColumnStatisticsState {
128
132
self . col_stats . get_mut ( & column_id) . unwrap ( ) . push ( col_stats) ;
129
133
130
134
// use distinct count calculated by the xor hash function to avoid repetitive operation.
131
- if let Some ( hll ) = self . distinct_columns . get_mut ( & column_id) {
132
- column_update_hll_cardinality ( & col, & data_type , hll ) ;
135
+ if let Some ( estimator ) = self . distinct_columns . get_mut ( & column_id) {
136
+ estimator . update_column ( & col) ;
133
137
}
134
138
}
135
139
}
@@ -146,102 +150,107 @@ impl ColumnStatisticsState {
146
150
let mut col_stats = reduce_column_statistics ( stats) ;
147
151
if let Some ( count) = column_distinct_count. get ( id) {
148
152
col_stats. distinct_of_values = Some ( * count as u64 ) ;
149
- } else if let Some ( hll ) = self . distinct_columns . get ( id) {
150
- col_stats. distinct_of_values = Some ( hll . count ( ) as u64 ) ;
153
+ } else if let Some ( estimator ) = self . distinct_columns . get ( id) {
154
+ col_stats. distinct_of_values = Some ( estimator . finalize ( ) ) ;
151
155
}
152
156
statistics. insert ( * id, col_stats) ;
153
157
}
154
158
Ok ( statistics)
155
159
}
156
160
}
157
161
158
- fn column_update_hll_cardinality ( col : & Column , ty : & DataType , hll : & mut ColumnDistinctHLL ) {
159
- if let DataType :: Nullable ( inner) = ty {
160
- let col = col. as_nullable ( ) . unwrap ( ) ;
161
- for ( i, v) in col. validity . iter ( ) . enumerate ( ) {
162
- if v {
163
- let scalar = unsafe { col. column . index_unchecked ( i) } ;
164
- scalar_update_hll_cardinality ( & scalar, inner, hll) ;
165
- }
166
- }
167
- return ;
168
- }
162
+ pub trait ColumnNDVEstimator : Send + Sync {
163
+ fn update_column ( & mut self , column : & Column ) ;
164
+ fn update_scalar ( & mut self , scalar : & ScalarRef ) ;
165
+ fn finalize ( & self ) -> u64 ;
166
+ }
169
167
170
- with_number_mapped_type ! ( |NUM_TYPE | match ty {
168
+ pub fn create_estimator ( data_type : & DataType ) -> Box < dyn ColumnNDVEstimator > {
169
+ let inner_type = data_type. remove_nullable ( ) ;
170
+ with_number_mapped_type ! ( |NUM_TYPE | match inner_type {
171
171
DataType :: Number ( NumberDataType :: NUM_TYPE ) => {
172
- let col = NumberType :: <NUM_TYPE >:: try_downcast_column( col) . unwrap( ) ;
173
- for v in col. iter( ) {
174
- hll. add_object( v) ;
175
- }
172
+ ColumnNDVEstimatorImpl :: <NumberType <NUM_TYPE >>:: create( )
176
173
}
177
174
DataType :: String => {
178
- let col = StringType :: try_downcast_column( col) . unwrap( ) ;
179
- for v in col. iter( ) {
180
- hll. add_object( & v) ;
181
- }
175
+ ColumnNDVEstimatorImpl :: <StringType >:: create( )
182
176
}
183
177
DataType :: Date => {
184
- let col = DateType :: try_downcast_column( col) . unwrap( ) ;
185
- for v in col. iter( ) {
186
- hll. add_object( v) ;
187
- }
178
+ ColumnNDVEstimatorImpl :: <DateType >:: create( )
188
179
}
189
180
DataType :: Timestamp => {
190
- let col = TimestampType :: try_downcast_column ( col ) . unwrap ( ) ;
191
- for v in col . iter ( ) {
192
- hll . add_object ( v ) ;
193
- }
181
+ ColumnNDVEstimatorImpl :: < TimestampType > :: create ( )
182
+ }
183
+ DataType :: Decimal ( s ) if s . can_carried_by_128 ( ) => {
184
+ ColumnNDVEstimatorImpl :: < Decimal128Type > :: create ( )
194
185
}
195
186
DataType :: Decimal ( _) => {
196
- match col {
197
- Column :: Decimal ( DecimalColumn :: Decimal128 ( col, _) ) => {
198
- for v in col. iter( ) {
199
- hll. add_object( v) ;
200
- }
201
- }
202
- Column :: Decimal ( DecimalColumn :: Decimal256 ( col, _) ) => {
203
- for v in col. iter( ) {
204
- hll. add_object( v) ;
205
- }
206
- }
207
- _ => unreachable!( ) ,
208
- } ;
187
+ ColumnNDVEstimatorImpl :: <Decimal256Type >:: create( )
209
188
}
210
- _ => unreachable!( "Unsupported data type: {:?}" , ty ) ,
211
- } ) ;
189
+ _ => unreachable!( "Unsupported data type: {:?}" , data_type ) ,
190
+ } )
212
191
}
213
192
214
- fn scalar_update_hll_cardinality ( scalar : & ScalarRef , ty : & DataType , hll : & mut ColumnDistinctHLL ) {
215
- if matches ! ( scalar, ScalarRef :: Null ) {
216
- return ;
217
- }
193
+ pub struct ColumnNDVEstimatorImpl < T >
194
+ where
195
+ T : ValueType + Send + Sync ,
196
+ T :: Scalar : Hash ,
197
+ {
198
+ hll : ColumnDistinctHLL ,
199
+ _phantom : PhantomData < T > ,
200
+ }
218
201
219
- let ty = ty. remove_nullable ( ) ;
202
+ impl < T > ColumnNDVEstimatorImpl < T >
203
+ where
204
+ T : ValueType + Send + Sync ,
205
+ T :: Scalar : Hash ,
206
+ {
207
+ pub fn create ( ) -> Box < dyn ColumnNDVEstimator > {
208
+ Box :: new ( Self {
209
+ hll : ColumnDistinctHLL :: new ( ) ,
210
+ _phantom : Default :: default ( ) ,
211
+ } )
212
+ }
213
+ }
220
214
221
- with_number_mapped_type ! ( |NUM_TYPE | match ty {
222
- DataType :: Number ( NumberDataType :: NUM_TYPE ) => {
223
- let val = NumberType :: <NUM_TYPE >:: try_downcast_scalar( scalar) . unwrap( ) ;
224
- hll. add_object( & val) ;
225
- }
226
- DataType :: String => {
227
- let val = StringType :: try_downcast_scalar( scalar) . unwrap( ) ;
228
- hll. add_object( & val) ;
229
- }
230
- DataType :: Date => {
231
- let val = DateType :: try_downcast_scalar( scalar) . unwrap( ) ;
232
- hll. add_object( & val) ;
233
- }
234
- DataType :: Timestamp => {
235
- let val = TimestampType :: try_downcast_scalar( scalar) . unwrap( ) ;
236
- hll. add_object( & val) ;
237
- }
238
- DataType :: Decimal ( _) => {
239
- match scalar {
240
- ScalarRef :: Decimal ( DecimalScalar :: Decimal128 ( v, _) ) => hll. add_object( & v) ,
241
- ScalarRef :: Decimal ( DecimalScalar :: Decimal256 ( v, _) ) => hll. add_object( & v) ,
242
- _ => unreachable!( ) ,
215
+ impl < T > ColumnNDVEstimator for ColumnNDVEstimatorImpl < T >
216
+ where
217
+ T : ValueType + Send + Sync ,
218
+ T :: Scalar : Hash ,
219
+ {
220
+ fn update_column ( & mut self , column : & Column ) {
221
+ if let Column :: Nullable ( box inner) = column {
222
+ let validity_len = inner. validity . len ( ) ;
223
+ let column = T :: try_downcast_column ( & inner. column ) . unwrap ( ) ;
224
+ if inner. validity . true_count ( ) as f64 / validity_len as f64 >= SELECTIVITY_THRESHOLD {
225
+ for ( data, valid) in T :: iter_column ( & column) . zip ( inner. validity . iter ( ) ) {
226
+ if valid {
227
+ self . hll . add_object ( & T :: to_owned_scalar ( data) ) ;
228
+ }
229
+ }
230
+ } else {
231
+ TrueIdxIter :: new ( validity_len, Some ( & inner. validity ) ) . for_each ( |idx| {
232
+ let val = unsafe { T :: index_column_unchecked ( & column, idx) } ;
233
+ self . hll . add_object ( & T :: to_owned_scalar ( val) ) ;
234
+ } )
235
+ }
236
+ } else {
237
+ let column = T :: try_downcast_column ( column) . unwrap ( ) ;
238
+ for value in T :: iter_column ( & column) {
239
+ self . hll . add_object ( & T :: to_owned_scalar ( value) ) ;
243
240
}
244
241
}
245
- _ => unreachable!( "Unsupported data type: {:?}" , ty) ,
246
- } ) ;
242
+ }
243
+
244
+ fn update_scalar ( & mut self , scalar : & ScalarRef ) {
245
+ if matches ! ( scalar, ScalarRef :: Null ) {
246
+ return ;
247
+ }
248
+
249
+ let val = T :: try_downcast_scalar ( scalar) . unwrap ( ) ;
250
+ self . hll . add_object ( & T :: to_owned_scalar ( val) ) ;
251
+ }
252
+
253
+ fn finalize ( & self ) -> u64 {
254
+ self . hll . count ( ) as u64
255
+ }
247
256
}
0 commit comments