
Commit 352d865

feat(parquet): prune row groups before reading.
1 parent 6e7f922 commit 352d865

6 files changed: 118 additions & 3 deletions

Cargo.lock

Lines changed: 2 additions & 0 deletions
Generated file; diff not rendered by default.

src/query/catalog/src/plan/pushdown.rs

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ use crate::plan::Projection;
 
 #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
 pub struct PrewhereInfo {
-    /// columns to be ouput be prewhere scan
+    /// columns to be output by prewhere scan
     pub output_columns: Projection,
     /// columns used for prewhere
     pub prewhere_columns: Projection,

src/query/storages/parquet/Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ common-meta-app = { path = "../../../meta/app" }
 common-pipeline-core = { path = "../../pipeline/core" }
 common-sql = { path = "../../sql" }
 common-storage = { path = "../../../common/storage" }
+common-storages-pruner = { path = "../pruner" }
+common-storages-table-meta = { path = "../table-meta" }
 
 async-trait = { version = "0.1.57", package = "async-trait-fn" }
 chrono = { workspace = true }

src/query/storages/parquet/src/parquet_reader/meta.rs

Lines changed: 76 additions & 0 deletions
@@ -12,13 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::collections::HashMap;
+use std::collections::HashSet;
 use std::fs::File;
 
 use common_arrow::arrow::datatypes::Schema as ArrowSchema;
 use common_arrow::arrow::io::parquet::read as pread;
 use common_arrow::parquet::metadata::FileMetaData;
+use common_arrow::parquet::metadata::RowGroupMetaData;
+use common_datavalues::Column;
+use common_datavalues::ColumnRef;
+use common_datavalues::IntoColumn;
+use common_datavalues::UInt64Column;
 use common_exception::ErrorCode;
 use common_exception::Result;
+use common_storages_table_meta::meta::ColumnStatistics;
+use common_storages_table_meta::meta::StatisticsOfColumns;
 
 use crate::ParquetReader;
 
@@ -43,4 +52,71 @@ impl ParquetReader {
         });
         Ok(arrow_schema)
     }
+
+    /// Collect statistics of a batch of row groups of the specified columns.
+    ///
+    /// The returned vector's length is the same as `rgs`.
+    pub fn collect_row_group_stats(
+        schema: &ArrowSchema,
+        rgs: &[RowGroupMetaData],
+        indices: &HashSet<usize>,
+    ) -> Result<Vec<StatisticsOfColumns>> {
+        let mut stats = Vec::with_capacity(rgs.len());
+        let mut stats_of_row_groups = HashMap::with_capacity(rgs.len());
+
+        for index in indices {
+            let field = &schema.fields[*index];
+            let column_stats = pread::statistics::deserialize(field, rgs)?;
+            stats_of_row_groups.insert(*index, BatchStatistics::from(column_stats));
+        }
+
+        for (rg_idx, _) in rgs.iter().enumerate() {
+            let mut cols_stats = HashMap::new();
+            cols_stats.reserve(stats.capacity());
+            for index in indices {
+                let col_stats = stats_of_row_groups[index].get(rg_idx);
+                cols_stats.insert(*index as u32, col_stats);
+            }
+            stats.push(cols_stats);
+        }
+
+        Ok(stats)
+    }
+}
+
+/// A temporary struct to present [`pread::statistics::Statistics`].
+///
+/// Convert the inner fields into Databend data structures.
+pub struct BatchStatistics {
+    pub null_count: UInt64Column,
+    pub distinct_count: UInt64Column,
+    pub min_values: ColumnRef,
+    pub max_values: ColumnRef,
+}
+
+impl BatchStatistics {
+    pub fn get(&self, index: usize) -> ColumnStatistics {
+        ColumnStatistics {
+            min: self.min_values.get(index),
+            max: self.max_values.get(index),
+            null_count: self.null_count.get_u64(index).unwrap(),
+            in_memory_size: 0, // this field is not used.
+            distinct_of_values: self.distinct_count.get_u64(index).ok(),
+        }
+    }
+}
+
+impl From<pread::statistics::Statistics> for BatchStatistics {
+    fn from(stats: pread::statistics::Statistics) -> Self {
+        let null_count = UInt64Column::from_arrow_array(&*stats.null_count);
+        let distinct_count = UInt64Column::from_arrow_array(&*stats.distinct_count);
+        let min_values = stats.min_value.clone().into_column();
+        let max_values = stats.max_value.clone().into_column();
+        Self {
+            null_count,
+            distinct_count,
+            min_values,
+            max_values,
+        }
+    }
 }
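The core of `collect_row_group_stats` above is a transpose: arrow2's `pread::statistics::deserialize` hands back statistics column by column, with one array entry per row group, and the helper regroups them into one column-indexed map per row group. A minimal, self-contained sketch of that regrouping with plain std types (the names and the i64 min/max values here are illustrative, not Databend's actual types):

use std::collections::HashMap;

/// Simplified stand-in for the per-column statistics arrow2 returns:
/// entry `i` of each vector describes row group `i`.
struct ColumnStats {
    mins: Vec<i64>,
    maxs: Vec<i64>,
}

/// Regroup column-major stats into one `column index -> (min, max)` map per row group,
/// mirroring the `Vec<StatisticsOfColumns>` produced above.
fn transpose(
    per_column: &HashMap<usize, ColumnStats>,
    num_row_groups: usize,
) -> Vec<HashMap<usize, (i64, i64)>> {
    (0..num_row_groups)
        .map(|rg| {
            per_column
                .iter()
                .map(|(col, s)| (*col, (s.mins[rg], s.maxs[rg])))
                .collect()
        })
        .collect()
}

fn main() {
    let mut per_column = HashMap::new();
    per_column.insert(0, ColumnStats { mins: vec![1, 8], maxs: vec![5, 42] });

    let per_row_group = transpose(&per_column, 2);
    assert_eq!(per_row_group[1][&0], (8, 42));
}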

src/query/storages/parquet/src/parquet_reader/mod.rs

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ pub struct ParquetReader {
     /// The actual schema used to read parquet. It will be converted to [`common_datavalues::DataSchema`] when output [`common_datablocks::DataBlock`].
     ///
     /// The reason of using [`ArrowSchema`] to read parquet is that
-    /// There are some types that Databend not support such as Timestmap of nanoseconds.
+    /// There are some types that Databend not support such as Timestamp of nanoseconds.
     /// Such types will be convert to supported types after deserialization.
     projected_arrow_schema: ArrowSchema,
     /// [`ColumnLeaves`] corresponding to the `projected_schema`.

src/query/storages/parquet/src/table_function/read.rs

Lines changed: 36 additions & 1 deletion
@@ -15,6 +15,7 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
+use common_arrow::arrow::io::parquet::read as pread;
 use common_catalog::plan::DataSourcePlan;
 use common_catalog::plan::Partitions;
 use common_catalog::plan::PartitionsShuffleKind;
@@ -26,6 +27,7 @@ use common_exception::Result;
 use common_pipeline_core::Pipeline;
 use common_sql::evaluator::EvalNode;
 use common_sql::evaluator::Evaluator;
+use common_storages_pruner::range_pruner;
 
 use super::ParquetTable;
 use super::TableContext;
@@ -134,11 +136,44 @@ impl ParquetTable {
         // `dummy_reader` is only used for prune columns in row groups.
         let (_, _, _, columns_to_read) =
             ParquetReader::do_projection(&plan.source_info.schema().to_arrow(), &columns_to_read)?;
+
+        // Do partitioning at the beginning of the whole pipeline.
+        let push_downs = plan.push_downs.clone();
+        let schema = plan.schema();
         pipeline.set_on_init(move || {
             let mut partitions = Vec::with_capacity(locations.len());
+
+            // Build the row group pruner.
+
+            let filter_expr = push_downs.as_ref().map(|extra| extra.filters.as_slice());
+            let row_group_pruner = range_pruner::new_range_pruner(&ctx_ref, filter_expr, &schema)?;
+
             for location in &locations {
                 let file_meta = ParquetReader::read_meta(location)?;
-                for rg in &file_meta.row_groups {
+                let arrow_schema = pread::infer_schema(&file_meta)?;
+                let mut row_group_pruned = vec![false; file_meta.row_groups.len()];
+
+                // If collecting stats fails or `should_keep` is true, we still read the row group.
+                // Otherwise, the row group will be pruned.
+                if let Ok(row_group_stats) = ParquetReader::collect_row_group_stats(
+                    &arrow_schema,
+                    &file_meta.row_groups,
+                    &columns_to_read,
+                ) {
+                    for (idx, (stats, rg)) in row_group_stats
+                        .iter()
+                        .zip(file_meta.row_groups.iter())
+                        .enumerate()
+                    {
+                        row_group_pruned[idx] =
+                            !row_group_pruner.should_keep(stats, rg.num_rows() as u64);
+                    }
+                }
+
+                for (idx, rg) in file_meta.row_groups.iter().enumerate() {
+                    if row_group_pruned[idx] {
+                        continue;
+                    }
                     let mut column_metas = HashMap::with_capacity(columns_to_read.len());
                     for index in &columns_to_read {
                         let c = &rg.columns()[*index];
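For intuition, the range pruning wired up above boils down to a per-row-group min/max check against the filter: a row group is skipped only when its statistics prove the predicate cannot match any row in it. A hedged, standalone sketch of that decision (plain Rust; this is not the `common-storages-pruner` API, and the `col > threshold` predicate is a made-up example):

/// Min/max of one column within a single row group, as gathered from parquet metadata.
struct ColumnRange {
    min: i64,
    max: i64,
}

/// Keep the row group only if `col > threshold` could match some row,
/// i.e. prune it when even the column maximum fails the predicate.
fn should_keep(stats: &ColumnRange, threshold: i64) -> bool {
    stats.max > threshold
}

fn main() {
    // One entry per row group, analogous to the stats collected per file above.
    let row_groups = vec![
        ColumnRange { min: 1, max: 5 },  // pruned: no value can exceed 10
        ColumnRange { min: 8, max: 42 }, // kept: may contain matching rows
    ];
    println!("rg0 range: [{}, {}]", row_groups[0].min, row_groups[0].max);

    let kept: Vec<usize> = row_groups
        .iter()
        .enumerate()
        .filter(|(_, s)| should_keep(s, 10))
        .map(|(idx, _)| idx)
        .collect();
    assert_eq!(kept, vec![1]);
}

Note that the pruning stays conservative: when statistics cannot be collected for a file, the commit keeps all of its row groups rather than risk dropping rows.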
