Commit 8189ebf

feat: new setting fuse_parquet_read_batch_size
It controls the batch size used when deserializing data blocks of fuse tables stored in the parquet format. The default value of this setting is 8192.
1 parent ba2d1d3 commit 8189ebf
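
The intended wiring, roughly, is that the setting is read once per query and handed to the parquet read path as the deserialization batch size. The sketch below is illustrative only and is not part of this commit; the `settings`, `reader`, and part metadata bindings are assumed to be in scope, while the getter and the reader method are the ones added in the diffs that follow.

    // Hypothetical caller inside the fuse storage crate (a sketch, not committed code).
    // 8192 rows per deserialized DataBlock unless the user overrides the setting.
    let batch_size = settings.get_fuse_parquet_read_batch_size()?;
    let blocks: Vec<DataBlock> = reader.deserialize_parquet_to_blocks(
        part.nums_rows,
        &part.columns_meta,
        column_chunks,
        &part.compression,
        &part.location,
        batch_size,
    )?;

Like other settings declared in settings_default.rs, this one is normally adjusted per session or globally through Databend's usual SET mechanism, within the declared numeric range.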

9 files changed: +270 -151 lines changed

src/query/settings/src/settings_default.rs

Lines changed: 7 additions & 0 deletions
@@ -1242,6 +1242,13 @@ impl DefaultSettings {
                 scope: SettingScope::Both,
                 range: Some(SettingRange::Numeric(0..=1)),
             }),
+            ("fuse_parquet_read_batch_size", DefaultSettingValue {
+                value: UserSettingValue::UInt64(8192),
+                desc: "The batch size while deserializing fuse table with parquet storage format",
+                mode: SettingMode::Both,
+                scope: SettingScope::Both,
+                range: Some(SettingRange::Numeric(0..=1_0000_000)),
+            }),
         ]);

         Ok(Arc::new(DefaultSettings {

src/query/settings/src/settings_getter_setter.rs

Lines changed: 4 additions & 0 deletions
@@ -924,4 +924,8 @@ impl Settings {
     pub fn get_enable_use_vacuum2_to_purge_transient_table_data(&self) -> Result<bool> {
         Ok(self.try_get_u64("use_vacuum2_to_purge_transient_table_data")? == 1)
     }
+
+    pub fn get_fuse_parquet_read_batch_size(&self) -> Result<usize> {
+        Ok(self.try_get_u64("fuse_parquet_read_batch_size")? as usize)
+    }
 }

src/query/storages/fuse/src/io/read/agg_index/agg_index_reader.rs

Lines changed: 8 additions & 1 deletion
@@ -97,7 +97,7 @@ impl AggIndexReader {
         self.index_id
     }

-    pub(super) fn apply_agg_info(&self, block: DataBlock) -> Result<DataBlock> {
+    pub(super) fn apply_agg_info_to_block(&self, block: DataBlock) -> Result<DataBlock> {
         let evaluator = Evaluator::new(&block, &self.func_ctx, &BUILTIN_FUNCTIONS);

         // 1. Filter the block if there is a filter.
@@ -145,4 +145,11 @@ impl AggIndexReader {
             )),
         ))
     }
+
+    pub(super) fn apply_agg_info(&self, block: Vec<DataBlock>) -> Result<Vec<DataBlock>> {
+        block
+            .into_iter()
+            .map(|block| self.apply_agg_info_to_block(block))
+            .collect::<Result<_>>()
+    }
 }

src/query/storages/fuse/src/io/read/agg_index/agg_index_reader_native.rs

Lines changed: 4 additions & 2 deletions
@@ -13,6 +13,7 @@
 // limitations under the License.

 use std::sync::Arc;
+use std::vec;

 use databend_common_exception::Result;
 use databend_common_expression::DataBlock;
@@ -166,7 +167,8 @@ impl AggIndexReader {
             let block = DataBlock::new_from_columns(columns);
             blocks.push(block);
         }
-        let block = DataBlock::concat(&blocks)?;
-        self.apply_agg_info(block)
+        let blocks = self.apply_agg_info(blocks)?;
+
+        DataBlock::concat(blocks.as_slice())
     }
 }

src/query/storages/fuse/src/io/read/agg_index/agg_index_reader_parquet.rs

Lines changed: 4 additions & 2 deletions
@@ -113,15 +113,17 @@ impl AggIndexReader {
         &self,
         part: PartInfoPtr,
         data: BlockReadResult,
-    ) -> Result<DataBlock> {
+        batch_size: usize,
+    ) -> Result<Vec<DataBlock>> {
         let columns_chunks = data.columns_chunks()?;
         let part = FuseBlockPartInfo::from_part(&part)?;
-        let block = self.reader.deserialize_parquet_chunks(
+        let block = self.reader.deserialize_parquet_to_blocks(
             part.nums_rows,
             &part.columns_meta,
             columns_chunks,
             &part.compression,
             &part.location,
+            batch_size,
         )?;

         self.apply_agg_info(block)

src/query/storages/fuse/src/io/read/block/parquet/deserialize.rs

Lines changed: 8 additions & 6 deletions
@@ -19,6 +19,7 @@ use arrow_schema::Schema;
 use databend_common_expression::ColumnId;
 use databend_common_expression::TableSchema;
 use databend_storages_common_table_meta::meta::Compression;
+use itertools::Itertools;
 use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
 use parquet::arrow::parquet_to_arrow_field_levels;
 use parquet::arrow::ArrowSchemaConverter;
@@ -34,7 +35,8 @@ pub fn column_chunks_to_record_batch(
     num_rows: usize,
     column_chunks: &HashMap<ColumnId, DataItem>,
     compression: &Compression,
-) -> databend_common_exception::Result<RecordBatch> {
+    batch_size: usize,
+) -> databend_common_exception::Result<Vec<RecordBatch>> {
     let arrow_schema = Schema::from(original_schema);
     let parquet_schema = ArrowSchemaConverter::new().convert(&arrow_schema)?;

@@ -66,13 +68,13 @@ pub fn column_chunks_to_record_batch(
         ProjectionMask::leaves(&parquet_schema, projection_mask),
         Some(arrow_schema.fields()),
     )?;
-    let mut record_reader = ParquetRecordBatchReader::try_new_with_row_groups(
+    let record_reader = ParquetRecordBatchReader::try_new_with_row_groups(
         &field_levels,
         row_group.as_ref(),
-        num_rows,
+        batch_size,
         None,
     )?;
-    let record = record_reader.next().unwrap()?;
-    assert!(record_reader.next().is_none());
-    Ok(record)
+    let records = record_reader.try_collect()?;
+    // TODO assert the row numbers?
+    Ok(records)
 }
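
The switch from a single `next()` call plus an assert to `try_collect` is what allows more than one record batch per column chunk. Under the old code the reader was created with `num_rows` as the batch size, so one `next()` was guaranteed to return everything; with a smaller `batch_size` the reader can legitimately yield several batches. As a minimal standalone illustration of the itertools behaviour relied on here (a sketch, not code from this commit), `try_collect` drains an iterator of `Result` items into a `Result<Vec<_>, _>`, short-circuiting on the first error:

    use itertools::Itertools;

    fn drain_batches() -> Result<Vec<u32>, String> {
        // Stand-in for ParquetRecordBatchReader, which yields one Result per batch.
        let reader = vec![Ok::<u32, String>(1), Ok(2), Ok(3)].into_iter();
        let batches: Vec<u32> = reader.try_collect()?;
        Ok(batches)
    }

    fn main() {
        assert_eq!(drain_batches().unwrap(), vec![1, 2, 3]);
    }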

src/query/storages/fuse/src/io/read/block/parquet/mod.rs

Lines changed: 88 additions & 52 deletions
@@ -35,6 +35,7 @@ mod adapter;
 mod deserialize;

 pub use adapter::RowGroupImplBuilder;
+use databend_common_exception::Result;
 pub use deserialize::column_chunks_to_record_batch;

 use crate::io::read::block::block_reader_merge_io::DataItem;
@@ -48,17 +49,41 @@ impl BlockReader {
         column_chunks: HashMap<ColumnId, DataItem>,
         compression: &Compression,
         block_path: &str,
-    ) -> databend_common_exception::Result<DataBlock> {
+    ) -> Result<DataBlock> {
+        let mut blocks = self.deserialize_parquet_to_blocks(
+            num_rows,
+            column_metas,
+            column_chunks,
+            compression,
+            block_path,
+            num_rows,
+        )?;
+        // Defensive check: using `num_rows` as batch_size, expects only one block
+        assert_eq!(blocks.len(), 1);
+        Ok(blocks.pop().unwrap())
+    }
+
+    pub(crate) fn deserialize_parquet_to_blocks(
+        &self,
+        num_rows: usize,
+        column_metas: &HashMap<ColumnId, ColumnMeta>,
+        column_chunks: HashMap<ColumnId, DataItem>,
+        compression: &Compression,
+        block_path: &str,
+        batch_size: usize,
+    ) -> Result<Vec<DataBlock>> {
         if column_chunks.is_empty() {
-            return self.build_default_values_block(num_rows);
+            return Ok(vec![self.build_default_values_block(num_rows)?]);
         }
-        let record_batch = column_chunks_to_record_batch(
+
+        let record_batches = column_chunks_to_record_batch(
             &self.original_schema,
             num_rows,
             &column_chunks,
             compression,
+            batch_size,
         )?;
-        let mut columns = Vec::with_capacity(self.projected_schema.fields.len());
+
         let name_paths = column_name_paths(&self.projection, &self.original_schema);

         let array_cache = if self.put_cache {
@@ -67,58 +92,69 @@ impl BlockReader {
             None
         };

-        for ((i, field), column_node) in self
-            .projected_schema
-            .fields
-            .iter()
-            .enumerate()
-            .zip(self.project_column_nodes.iter())
-        {
-            let data_type = field.data_type().into();
-
-            // NOTE, there is something tricky here:
-            // - `column_chunks` always contains data of leaf columns
-            // - here we may processing a nested type field
-            // - But, even if the field being processed is a field with multiple leaf columns
-            //   `column_chunks.get(&field.column_id)` will still return Some(DataItem::_)[^1],
-            //   even if we are getting data from `column_chunks` using a non-leaf
-            //   `column_id` of `projected_schema.fields`
-            //
-            // [^1]: Except in the current block, there is no data stored for the
-            //       corresponding field, and a default value has been declared for
-            //       the corresponding field.
-            //
-            // Yes, it is too obscure, we need to polish it later.
-
-            let value = match column_chunks.get(&field.column_id) {
-                Some(DataItem::RawData(data)) => {
-                    // get the deserialized arrow array, which may be a nested array
-                    let arrow_array = column_by_name(&record_batch, &name_paths[i]);
-                    if !column_node.is_nested {
-                        if let Some(cache) = &array_cache {
-                            let meta = column_metas.get(&field.column_id).unwrap();
-                            let (offset, len) = meta.offset_length();
-                            let key =
-                                TableDataCacheKey::new(block_path, field.column_id, offset, len);
-                            cache.insert(key.into(), (arrow_array.clone(), data.len()));
+        let mut blocks = Vec::with_capacity(record_batches.len());
+
+        for record_batch in record_batches {
+            let mut columns = Vec::with_capacity(self.projected_schema.fields.len());
+            for ((i, field), column_node) in self
+                .projected_schema
+                .fields
+                .iter()
+                .enumerate()
+                .zip(self.project_column_nodes.iter())
+            {
+                let data_type = field.data_type().into();
+
+                // NOTE, there is something tricky here:
+                // - `column_chunks` always contains data of leaf columns
+                // - here we may processing a nested type field
+                // - But, even if the field being processed is a field with multiple leaf columns
+                //   `column_chunks.get(&field.column_id)` will still return Some(DataItem::_)[^1],
+                //   even if we are getting data from `column_chunks` using a non-leaf
+                //   `column_id` of `projected_schema.fields`
+                //
+                // [^1]: Except in the current block, there is no data stored for the
+                //       corresponding field, and a default value has been declared for
+                //       the corresponding field.
+                //
+                // Yes, it is too obscure, we need to polish it later.
+
+                let value = match column_chunks.get(&field.column_id) {
+                    Some(DataItem::RawData(data)) => {
+                        // get the deserialized arrow array, which may be a nested array
+                        let arrow_array = column_by_name(&record_batch, &name_paths[i]);
+                        if !column_node.is_nested {
+                            if let Some(cache) = &array_cache {
+                                let meta = column_metas.get(&field.column_id).unwrap();
+                                let (offset, len) = meta.offset_length();
+                                let key = TableDataCacheKey::new(
+                                    block_path,
+                                    field.column_id,
+                                    offset,
+                                    len,
+                                );
+                                cache.insert(key.into(), (arrow_array.clone(), data.len()));
+                            }
                         }
+                        Value::from_arrow_rs(arrow_array, &data_type)?
                     }
-                    Value::from_arrow_rs(arrow_array, &data_type)?
-                }
-                Some(DataItem::ColumnArray(cached)) => {
-                    if column_node.is_nested {
-                        // a defensive check, should never happen
-                        return Err(ErrorCode::StorageOther(
-                            "unexpected nested field: nested leaf field hits cached",
-                        ));
+                    Some(DataItem::ColumnArray(cached)) => {
+                        if column_node.is_nested {
+                            // a defensive check, should never happen
+                            return Err(ErrorCode::StorageOther(
+                                "unexpected nested field: nested leaf field hits cached",
+                            ));
+                        }
+                        Value::from_arrow_rs(cached.0.clone(), &data_type)?
                     }
-                    Value::from_arrow_rs(cached.0.clone(), &data_type)?
-                }
-                None => Value::Scalar(self.default_vals[i].clone()),
-            };
-            columns.push(BlockEntry::new(data_type, value));
+                    None => Value::Scalar(self.default_vals[i].clone()),
+                };
+                columns.push(BlockEntry::new(data_type, value));
+            }
+            blocks.push(DataBlock::new(columns, num_rows));
         }
-        Ok(DataBlock::new(columns, num_rows))
+
+        Ok(blocks)
     }
 }
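
The wrapper keeps the old single-block contract by passing `num_rows` as the batch size: assuming the record batch reader emits full batches of `batch_size` rows, one column chunk yields ceil(num_rows / batch_size) batches, which is exactly 1 when batch_size equals num_rows. A hypothetical helper (not part of the commit) makes the arithmetic concrete:

    // Not committed code; just illustrates the expected batch count.
    // `usize::div_ceil` needs Rust 1.73 or newer.
    fn expected_batch_count(num_rows: usize, batch_size: usize) -> usize {
        num_rows.div_ceil(batch_size)
    }

    fn main() {
        assert_eq!(expected_batch_count(100_000, 8192), 13); // default setting value
        assert_eq!(expected_batch_count(100_000, 100_000), 1); // num_rows used as batch_size
    }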

src/query/storages/fuse/src/io/read/virtual_column/virtual_column_reader_parquet.rs

Lines changed: 54 additions & 20 deletions
@@ -14,13 +14,17 @@

 use std::collections::HashSet;

+use arrow_array::RecordBatch;
+use databend_common_catalog::plan::VirtualColumnInfo;
 use databend_common_exception::Result;
 use databend_common_expression::eval_function;
 use databend_common_expression::types::DataType;
 use databend_common_expression::BlockEntry;
 use databend_common_expression::Column;
 use databend_common_expression::ColumnId;
 use databend_common_expression::DataBlock;
+use databend_common_expression::FunctionContext;
+use databend_common_expression::TableSchemaRef;
 use databend_common_expression::Value;
 use databend_common_functions::BUILTIN_FUNCTIONS;
 use databend_storages_common_io::MergeIOReader;
@@ -128,30 +132,60 @@ impl VirtualColumnReader {
         ))
     }

-    pub fn deserialize_virtual_columns(
+    pub fn try_create_paster(
         &self,
-        mut data_block: DataBlock,
         virtual_data: Option<VirtualBlockReadResult>,
-    ) -> Result<DataBlock> {
-        let record_batch = virtual_data
-            .map(|virtual_data| {
-                let columns_chunks = virtual_data.data.columns_chunks()?;
-                column_chunks_to_record_batch(
-                    &self.virtual_column_info.schema,
-                    virtual_data.num_rows,
-                    &columns_chunks,
-                    &virtual_data.compression,
-                )
-            })
-            .transpose()?;
-
-        // If the virtual column has already generated, add it directly,
-        // otherwise extract it from the source column
+        batch_size: usize,
+    ) -> Result<VirtualColumnDataModifier> {
+        let chunks = if let Some(virtual_data) = virtual_data {
+            let columns_chunks = virtual_data.data.columns_chunks()?;
+            let chunks = column_chunks_to_record_batch(
+                &self.virtual_column_info.schema,
+                virtual_data.num_rows,
+                &columns_chunks,
+                &virtual_data.compression,
+                batch_size,
+            )?;
+            Some(chunks)
+        } else {
+            None
+        };
+
         let func_ctx = self.ctx.get_function_context()?;
+
+        Ok(VirtualColumnDataModifier {
+            record_batches: chunks,
+            function_context: func_ctx,
+            next_record_batch_index: 0,
+            virtual_column_info: self.virtual_column_info.clone(),
+            source_schema: self.source_schema.clone(),
+        })
+    }
+}
+
+pub struct VirtualColumnDataModifier {
+    record_batches: Option<Vec<RecordBatch>>,
+    next_record_batch_index: usize,
+    function_context: FunctionContext,
+    virtual_column_info: VirtualColumnInfo,
+    source_schema: TableSchemaRef,
+}
+
+impl VirtualColumnDataModifier {
+    pub fn paste_virtual_column(&mut self, mut data_block: DataBlock) -> Result<DataBlock> {
+        let record_batch = if let Some(record_batches) = &self.record_batches {
+            assert!(record_batches.len() > self.next_record_batch_index);
+            Some(&record_batches[self.next_record_batch_index])
+        } else {
+            None
+        };
+
+        self.next_record_batch_index += 1;
+
+        let func_ctx = &self.function_context;
         for virtual_column_field in self.virtual_column_info.virtual_column_fields.iter() {
-            if let Some(arrow_array) = record_batch
-                .as_ref()
-                .and_then(|r| r.column_by_name(&virtual_column_field.name).cloned())
+            if let Some(arrow_array) =
+                record_batch.and_then(|r| r.column_by_name(&virtual_column_field.name).cloned())
             {
                 let data_type: DataType = virtual_column_field.data_type.as_ref().into();
                 let value = Value::Column(Column::from_arrow_rs(arrow_array, &data_type)?);
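
The new `VirtualColumnDataModifier` is stateful: it deserializes the virtual-column data into per-batch record batches up front and advances `next_record_batch_index` on every call, so it has to be created once per part and applied to the deserialized blocks in order. A hypothetical caller might look like the sketch below; only `try_create_paster` and `paste_virtual_column` come from this diff, the surrounding `virtual_reader`, `virtual_data`, `batch_size`, and `blocks` bindings are assumed.

    // Sketch only, not committed code.
    let mut paster = virtual_reader.try_create_paster(virtual_data, batch_size)?;
    for block in blocks {
        // Each call consumes the next cached RecordBatch and appends the
        // virtual columns to the corresponding DataBlock.
        let block = paster.paste_virtual_column(block)?;
        // ... hand the enriched block downstream
    }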
