Skip to content

Commit ca55812

Browse files
committed
fix
1 parent f0b0d93 commit ca55812

File tree

2 files changed

+78
-19
lines changed

2 files changed

+78
-19
lines changed

src/query/storages/common/index/src/bloom_index.rs

Lines changed: 74 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
use std::collections::BTreeMap;
1616
use std::collections::HashMap;
17+
use std::hash::DefaultHasher;
1718
use std::hash::Hasher;
1819
use std::ops::ControlFlow;
1920
use std::ops::Deref;
@@ -35,12 +36,18 @@ use databend_common_expression::types::BinaryType;
3536
use databend_common_expression::types::Bitmap;
3637
use databend_common_expression::types::Buffer;
3738
use databend_common_expression::types::DataType;
39+
use databend_common_expression::types::DateType;
3840
use databend_common_expression::types::MapType;
3941
use databend_common_expression::types::NullableType;
4042
use databend_common_expression::types::Number;
4143
use databend_common_expression::types::NumberDataType;
44+
use databend_common_expression::types::NumberType;
45+
use databend_common_expression::types::StringType;
46+
use databend_common_expression::types::TimestampType;
4247
use databend_common_expression::types::UInt64Type;
48+
use databend_common_expression::types::ValueType;
4349
use databend_common_expression::visit_expr;
50+
use databend_common_expression::with_number_mapped_type;
4451
use databend_common_expression::BlockEntry;
4552
use databend_common_expression::Column;
4653
use databend_common_expression::ColumnBuilder;
@@ -349,6 +356,71 @@ impl BloomIndex {
349356
Ok(column)
350357
}
351358

359+
pub fn calculate_digest_by_type(data_type: &DataType, column: &Column) -> Result<Vec<u64>> {
360+
let inner_type = data_type.remove_nullable();
361+
with_number_mapped_type!(|NUM_TYPE| match inner_type {
362+
DataType::Number(NumberDataType::NUM_TYPE) => {
363+
Self::calculate_nullable_column_digests::<NumberType<NUM_TYPE>>(column)
364+
}
365+
DataType::String => {
366+
Self::calculate_nullable_column_digests::<StringType>(column)
367+
}
368+
DataType::Date => {
369+
Self::calculate_nullable_column_digests::<DateType>(column)
370+
}
371+
DataType::Timestamp => {
372+
Self::calculate_nullable_column_digests::<TimestampType>(column)
373+
}
374+
_ => Err(ErrorCode::Internal(format!(
375+
"Unsupported data type: {:?}",
376+
data_type
377+
))),
378+
})
379+
}
380+
381+
#[inline(always)]
382+
fn hash_one<T: DFHash>(v: &T) -> u64 {
383+
let mut hasher = DefaultHasher::default();
384+
DFHash::hash(v, &mut hasher);
385+
hasher.finish()
386+
}
387+
388+
fn calculate_nullable_column_digests<T: ValueType>(column: &Column) -> Result<Vec<u64>>
389+
where for<'a> T::ScalarRef<'a>: DFHash {
390+
let (column, validity) = if let Column::Nullable(box inner) = column {
391+
let validity = if inner.validity.null_count() == 0 {
392+
None
393+
} else {
394+
Some(&inner.validity)
395+
};
396+
(&inner.column, validity)
397+
} else {
398+
(column, None)
399+
};
400+
401+
let capacity = validity.map_or(column.len(), |v| v.true_count() + 1);
402+
let mut result = Vec::with_capacity(capacity);
403+
if validity.is_some() {
404+
result.push(0);
405+
}
406+
let column = T::try_downcast_column(column).unwrap();
407+
if let Some(validity) = validity {
408+
let column_iter = T::iter_column(&column);
409+
let value_iter = column_iter
410+
.zip(validity.iter())
411+
.filter(|(_, v)| *v)
412+
.map(|(v, _)| v);
413+
for value in value_iter {
414+
result.push(Self::hash_one(&value));
415+
}
416+
} else {
417+
for value in T::iter_column(&column) {
418+
result.push(Self::hash_one(&value));
419+
}
420+
}
421+
Ok(result)
422+
}
423+
352424
/// calculate digest for column that may have null values
353425
///
354426
/// returns (column, validity) where column is the digest of the column
@@ -734,24 +806,8 @@ impl BloomIndexBuilder {
734806
}
735807
};
736808

737-
let (column, validity) =
738-
BloomIndex::calculate_nullable_column_digest(&self.func_ctx, &column, &data_type)?;
739-
// create filter per column
740-
if validity.as_ref().map(|v| v.null_count()).unwrap_or(0) > 0 {
741-
let validity = validity.unwrap();
742-
let it = column.deref().iter().zip(validity.iter()).map(
743-
|(v, b)| {
744-
if !b {
745-
&0
746-
} else {
747-
v
748-
}
749-
},
750-
);
751-
index_column.builder.add_digests(it);
752-
} else {
753-
index_column.builder.add_digests(column.deref());
754-
}
809+
let column = BloomIndex::calculate_digest_by_type(&data_type, &column)?;
810+
index_column.builder.add_digests(column.deref());
755811
}
756812
for index_column in self.ngram_columns.iter_mut() {
757813
let field_type = &block.data_type(index_column.index);

src/query/storages/fuse/src/io/write/stream/column_statistics.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,10 @@ where
359359
for v in iter {
360360
if matches!(min.partial_cmp(&v), Some(Ordering::Greater)) {
361361
min = v;
362-
} else if matches!(max.partial_cmp(&v), Some(Ordering::Less)) {
362+
continue;
363+
}
364+
365+
if matches!(max.partial_cmp(&v), Some(Ordering::Less)) {
363366
max = v;
364367
}
365368
}

0 commit comments

Comments
 (0)