From 58690aa31e6601807ec1c3bf2d358c3e3ae3d1ca Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 30 Apr 2025 13:11:51 +0800 Subject: [PATCH 01/36] hilbert recluster support block stream write --- .../expression/src/utils/block_thresholds.rs | 4 +- .../interpreter_table_recluster.rs | 56 ++++++++++------ .../builders/builder_hilbert_partition.rs | 45 +++++++++---- .../partition/data_processor_strategy.rs | 12 +++- .../partition/hilbert_partition_exchange.rs | 30 +++++---- .../src/schedulers/fragments/fragmenter.rs | 17 +++-- .../src/schedulers/fragments/plan_fragment.rs | 66 +++++++++++++++++-- .../physical_plans/physical_recluster.rs | 3 +- .../storages/fuse/src/operations/append.rs | 5 +- 9 files changed, 171 insertions(+), 67 deletions(-) diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index f19a26f6dedee..4fd35638cb863 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -152,8 +152,8 @@ impl BlockThresholds { let bytes_per_block = total_bytes.div_ceil(block_num_by_compressed); // Adjust the number of blocks based on block size thresholds. - let max_bytes_per_block = (4 * self.min_bytes_per_block).min(400 * 1024 * 1024); - let min_bytes_per_block = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); + let max_bytes_per_block = self.max_bytes_per_block.min(400 * 1024 * 1024); + let min_bytes_per_block = self.min_bytes_per_block.min(100 * 1024 * 1024); let block_nums = if bytes_per_block > max_bytes_per_block { // Case 1: If the block size is too bigger. total_bytes.div_ceil(max_bytes_per_block) diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index f3c53597b06d7..558be0d8bdbba 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -28,6 +28,7 @@ use databend_common_catalog::table::TableExt; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::type_check::check_function; +use databend_common_expression::types::NumberScalar; use databend_common_expression::DataBlock; use databend_common_expression::Scalar; use databend_common_functions::BUILTIN_FUNCTIONS; @@ -52,6 +53,8 @@ use databend_common_sql::plans::plan_hilbert_sql; use databend_common_sql::plans::replace_with_constant; use databend_common_sql::plans::set_update_stream_columns; use databend_common_sql::plans::BoundColumnRef; +use databend_common_sql::plans::ConstantExpr; +use databend_common_sql::plans::FunctionCall; use databend_common_sql::plans::Plan; use databend_common_sql::plans::ReclusterPlan; use databend_common_sql::IdentifierNormalizer; @@ -325,19 +328,7 @@ impl ReclusterTableInterpreter { block_thresholds.calc_rows_for_recluster(total_rows, total_bytes, total_compressed); // Calculate initial partition count based on data volume and block size - let mut total_partitions = std::cmp::max(total_rows / rows_per_block, 1); - - // Adjust number of partitions according to the block size thresholds - if total_partitions < block_thresholds.block_per_segment - && block_thresholds.check_perfect_segment( - block_thresholds.block_per_segment, // this effectively by-pass the total_blocks criteria - total_rows, - total_bytes, - total_compressed, - ) - { - total_partitions = block_thresholds.block_per_segment; - } + let total_partitions = 
std::cmp::max(total_rows / rows_per_block, 1); warn!( "Do hilbert recluster, total_bytes: {}, total_rows: {}, total_partitions: {}", @@ -439,15 +430,37 @@ impl ReclusterTableInterpreter { // For distributed execution, add an exchange operator to distribute work if is_distributed { + let nodes_num = cluster.nodes.len() as u64; + let scalar_expr = ScalarExpr::FunctionCall(FunctionCall { + span: None, + func_name: "div".to_string(), + params: vec![], + arguments: vec![ + ScalarExpr::FunctionCall(FunctionCall { + span: None, + func_name: "multiply".to_string(), + params: vec![], + arguments: vec![ + ScalarExpr::BoundColumnRef(BoundColumnRef { + span: None, + column: bind_context.columns.last().unwrap().clone(), + }), + ScalarExpr::ConstantExpr(ConstantExpr { + span: None, + value: Scalar::Number(NumberScalar::UInt64(nodes_num)), + }), + ], + }), + ScalarExpr::ConstantExpr(ConstantExpr { + span: None, + value: Scalar::Number(NumberScalar::UInt64(total_partitions as u64)), + }), + ], + }); + // Create an expression for the partition column, // i.e.`range_partition_id(hilbert_range_index({hilbert_keys_str}), [...]) AS _predicate` - let expr = scalar_expr_to_remote_expr( - &ScalarExpr::BoundColumnRef(BoundColumnRef { - span: None, - column: bind_context.columns.last().unwrap().clone(), - }), - plan.output_schema()?.as_ref(), - )?; + let expr = scalar_expr_to_remote_expr(&scalar_expr, plan.output_schema()?.as_ref())?; // Add exchange operator for data distribution, // shuffling data based on the hash of range partition IDs derived from the Hilbert index. @@ -471,7 +484,8 @@ impl ReclusterTableInterpreter { plan_id: 0, input: plan, table_info: table_info.clone(), - num_partitions: total_partitions, + range_start: 0, + range_width: total_partitions, table_meta_timestamps, rows_per_block, })); diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 86b23bfca9238..2d38a1e3b8281 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -15,6 +15,7 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; +use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; @@ -22,6 +23,7 @@ use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationKind; +use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::statistics::ClusterStatsGenerator; use databend_common_storages_fuse::FuseTable; @@ -43,10 +45,12 @@ impl PipelineBuilder { .ctx .build_table_by_table_info(&partition.table_info, None)?; let table = FuseTable::try_from_table(table.as_ref())?; + let enable_stream_writer = self.ctx.get_settings().get_enable_block_stream_write()? 
+ && table.storage_format_as_parquet(); self.main_pipeline.exchange( num_processors, - HilbertPartitionExchange::create(partition.num_partitions), + HilbertPartitionExchange::create(partition.range_start, partition.range_width), ); let settings = self.ctx.get_settings(); @@ -77,26 +81,43 @@ impl PipelineBuilder { &settings, processor_id.fetch_add(1, atomic::Ordering::AcqRel), num_processors, - partition.num_partitions, + partition.range_width, window_spill_settings.clone(), disk_spill.clone(), - CompactStrategy::new(partition.rows_per_block, max_bytes_per_block), + CompactStrategy::new( + partition.rows_per_block, + max_bytes_per_block, + enable_stream_writer, + ), )?, ))) })?; - self.main_pipeline - .add_transform(|transform_input_port, transform_output_port| { - let proc = TransformSerializeBlock::try_create( + if enable_stream_writer { + self.main_pipeline.add_transform(|input, output| { + TransformBlockWriter::try_create( self.ctx.clone(), - transform_input_port, - transform_output_port, + input, + output, table, - ClusterStatsGenerator::default(), - MutationKind::Recluster, partition.table_meta_timestamps, - )?; - proc.into_processor() + false, + ) }) + } else { + self.main_pipeline + .add_transform(|transform_input_port, transform_output_port| { + let proc = TransformSerializeBlock::try_create( + self.ctx.clone(), + transform_input_port, + transform_output_port, + table, + ClusterStatsGenerator::default(), + MutationKind::Recluster, + partition.table_meta_timestamps, + )?; + proc.into_processor() + }) + } } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs index 75793aa415e08..3515858340e89 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs @@ -27,13 +27,19 @@ pub trait DataProcessorStrategy: Send + Sync + 'static { pub struct CompactStrategy { max_bytes_per_block: usize, max_rows_per_block: usize, + enable_stream_writer: bool, } impl CompactStrategy { - pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { + pub fn new( + max_rows_per_block: usize, + max_bytes_per_block: usize, + enable_stream_writer: bool, + ) -> Self { Self { max_bytes_per_block, max_rows_per_block, + enable_stream_writer, } } @@ -50,6 +56,10 @@ impl DataProcessorStrategy for CompactStrategy { const NAME: &'static str = "Compact"; fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + if self.enable_stream_writer { + return Ok(data_blocks); + } + let blocks_num = data_blocks.len(); if blocks_num < 2 { return Ok(data_blocks); diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs index 93a6ce2aa4b6e..16215dded2b15 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs @@ -25,12 +25,13 @@ use databend_common_pipeline_core::processors::Exchange; use crate::pipelines::processors::transforms::WindowPartitionMeta; pub struct HilbertPartitionExchange { - num_partitions: usize, + start: u64, + width: usize, } impl HilbertPartitionExchange { - pub fn 
create(num_partitions: usize) -> Arc { - Arc::new(HilbertPartitionExchange { num_partitions }) + pub fn create(start: u64, width: usize) -> Arc { + Arc::new(HilbertPartitionExchange { start, width }) } } @@ -48,20 +49,25 @@ impl Exchange for HilbertPartitionExchange { // Scatter the data block to different partitions. let indices = range_ids .iter() - .map(|&id| (id % self.num_partitions as u64) as u16) + .map(|&id| (id - self.start) as u16) .collect::>(); data_block.pop_columns(1); - let scatter_indices = - DataBlock::divide_indices_by_scatter_size(&indices, self.num_partitions); + + let scatter_indices = DataBlock::divide_indices_by_scatter_size(&indices, self.width); // Partition the data blocks to different processors. + let base = self.width / n; + let remainder = self.width % n; let mut output_data_blocks = vec![vec![]; n]; - for (partition_id, indices) in scatter_indices.iter().take(self.num_partitions).enumerate() - { - if indices.is_empty() { - continue; + for (partition_id, indices) in scatter_indices.into_iter().take(self.width).enumerate() { + if !indices.is_empty() { + let target = if partition_id < remainder * (base + 1) { + partition_id / (base + 1) + } else { + (partition_id - remainder) / base + }; + let block = data_block.take_with_optimize_size(&indices)?; + output_data_blocks[target].push((partition_id, block)); } - let block = data_block.take_with_optimize_size(indices)?; - output_data_blocks[partition_id % n].push((partition_id, block)); } // Union data blocks for each processor. diff --git a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index 2e7a6e878b819..dbdda532daca8 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -26,6 +26,7 @@ use databend_common_sql::executor::physical_plans::ExchangeSink; use databend_common_sql::executor::physical_plans::ExchangeSource; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::HashJoin; +use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationSource; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::executor::physical_plans::ReplaceInto; @@ -41,7 +42,6 @@ use crate::servers::flight::v1::exchange::DataExchange; use crate::servers::flight::v1::exchange::MergeExchange; use crate::servers::flight::v1::exchange::ShuffleDataExchange; use crate::sessions::QueryContext; -use crate::sql::executor::physical_plans::Mutation; use crate::sql::executor::PhysicalPlan; /// Visitor to split a `PhysicalPlan` into fragments. 
@@ -67,6 +67,7 @@ enum State { Compact, Recluster, Other, + HilbertRecluster, } impl Fragmenter { @@ -170,14 +171,6 @@ impl PhysicalPlanReplacer for Fragmenter { Ok(PhysicalPlan::MutationSource(plan.clone())) } - fn replace_mutation(&mut self, plan: &Mutation) -> Result { - let input = self.replace(&plan.input)?; - Ok(PhysicalPlan::Mutation(Box::new(Mutation { - input: Box::new(input), - ..plan.clone() - }))) - } - fn replace_replace_into(&mut self, plan: &ReplaceInto) -> Result { let input = self.replace(&plan.input)?; self.state = State::ReplaceInto; @@ -209,6 +202,11 @@ impl PhysicalPlanReplacer for Fragmenter { Ok(PhysicalPlan::Recluster(Box::new(plan.clone()))) } + fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { + self.state = State::HilbertRecluster; + Ok(PhysicalPlan::HilbertPartition(Box::new(plan.clone()))) + } + fn replace_compact_source(&mut self, plan: &CompactSource) -> Result { self.state = State::Compact; Ok(PhysicalPlan::CompactSource(Box::new(plan.clone()))) @@ -310,6 +308,7 @@ impl PhysicalPlanReplacer for Fragmenter { State::ReplaceInto => FragmentType::ReplaceInto, State::Compact => FragmentType::Compact, State::Recluster => FragmentType::Recluster, + State::HilbertRecluster => FragmentType::HilbertRecluster, }; self.state = State::Other; let exchange = Self::get_exchange(self.ctx.clone(), &plan)?; diff --git a/src/query/service/src/schedulers/fragments/plan_fragment.rs b/src/query/service/src/schedulers/fragments/plan_fragment.rs index 18f2b35267eb4..e8306854a981f 100644 --- a/src/query/service/src/schedulers/fragments/plan_fragment.rs +++ b/src/query/service/src/schedulers/fragments/plan_fragment.rs @@ -28,6 +28,7 @@ use databend_common_sql::executor::physical_plans::CompactSource; use databend_common_sql::executor::physical_plans::ConstantTableScan; use databend_common_sql::executor::physical_plans::CopyIntoTable; use databend_common_sql::executor::physical_plans::CopyIntoTableSource; +use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationSource; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::executor::physical_plans::ReplaceDeduplicate; @@ -64,6 +65,7 @@ pub enum FragmentType { Compact, Recluster, MutationSource, + HilbertRecluster, } #[derive(Clone)] @@ -136,6 +138,9 @@ impl PlanFragment { FragmentType::Recluster => { self.redistribute_recluster(ctx, &mut fragment_actions)?; } + FragmentType::HilbertRecluster => { + self.redistribute_hilbert(ctx, &mut fragment_actions)?; + } } if let Some(ref exchange) = self.exchange { @@ -376,6 +381,40 @@ impl PlanFragment { Ok(()) } + fn redistribute_hilbert( + &self, + ctx: Arc, + fragment_actions: &mut QueryFragmentActions, + ) -> Result<()> { + let exchange_sink = match &self.plan { + PhysicalPlan::ExchangeSink(plan) => plan, + _ => unreachable!("logic error"), + }; + let hilbert = match exchange_sink.input.as_ref() { + PhysicalPlan::HilbertPartition(plan) => plan, + _ => unreachable!("logic error"), + }; + + let total_ranges = hilbert.range_width; + let executors = Fragmenter::get_executors(ctx); + let num_executors = executors.len(); + let base_width = total_ranges / num_executors; + let remainder = total_ranges % num_executors; + for (executor_idx, executor) in executors.into_iter().enumerate() { + let width = base_width + if executor_idx < remainder { 1 } else { 0 }; + let min = executor_idx * base_width + std::cmp::min(executor_idx, remainder); + let mut plan = self.plan.clone(); 
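+            // Rewrite this executor's copy of the plan so its HilbertPartition covers only the contiguous range [min, min + width) of Hilbert range ids.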
+ let mut replace_hilbert = ReplaceHilbert { + range_width: width, + range_start: min as u64, + }; + plan = replace_hilbert.replace(&plan)?; + fragment_actions.add_action(QueryFragmentAction::create(executor, plan)); + } + + Ok(()) + } + fn reshuffle( executors: Vec, partitions: Vec, @@ -551,8 +590,23 @@ impl PhysicalPlanReplacer for ReplaceReadSource { } } +struct ReplaceHilbert { + range_width: usize, + range_start: u64, +} + +impl PhysicalPlanReplacer for ReplaceHilbert { + fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { + Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { + range_width: self.range_width, + range_start: self.range_start, + ..plan.clone() + }))) + } +} + struct ReplaceRecluster { - pub tasks: Vec, + tasks: Vec, } impl PhysicalPlanReplacer for ReplaceRecluster { @@ -565,7 +619,7 @@ impl PhysicalPlanReplacer for ReplaceRecluster { } struct ReplaceMutationSource { - pub partitions: Partitions, + partitions: Partitions, } impl PhysicalPlanReplacer for ReplaceMutationSource { @@ -578,7 +632,7 @@ impl PhysicalPlanReplacer for ReplaceMutationSource { } struct ReplaceCompactBlock { - pub partitions: Partitions, + partitions: Partitions, } impl PhysicalPlanReplacer for ReplaceCompactBlock { @@ -591,10 +645,10 @@ impl PhysicalPlanReplacer for ReplaceCompactBlock { } struct ReplaceReplaceInto { - pub partitions: Vec<(usize, Location)>, + partitions: Vec<(usize, Location)>, // for standalone mode, slot is None - pub slot: Option, - pub need_insert: bool, + slot: Option, + need_insert: bool, } impl PhysicalPlanReplacer for ReplaceReplaceInto { diff --git a/src/query/sql/src/executor/physical_plans/physical_recluster.rs b/src/query/sql/src/executor/physical_plans/physical_recluster.rs index 9227c86b64199..43236e53766a5 100644 --- a/src/query/sql/src/executor/physical_plans/physical_recluster.rs +++ b/src/query/sql/src/executor/physical_plans/physical_recluster.rs @@ -31,7 +31,8 @@ pub struct HilbertPartition { pub plan_id: u32, pub input: Box, pub table_info: TableInfo, - pub num_partitions: usize, pub table_meta_timestamps: TableMetaTimestamps, pub rows_per_block: usize, + pub range_start: u64, + pub range_width: usize, } diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 9316374128528..e6460a7c247a0 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -40,7 +40,6 @@ use databend_storages_common_table_meta::table::ClusterType; use crate::operations::TransformBlockWriter; use crate::operations::TransformSerializeBlock; use crate::statistics::ClusterStatsGenerator; -use crate::FuseStorageFormat; use crate::FuseTable; impl FuseTable { @@ -50,8 +49,8 @@ impl FuseTable { pipeline: &mut Pipeline, table_meta_timestamps: TableMetaTimestamps, ) -> Result<()> { - let enable_stream_block_write = ctx.get_settings().get_enable_block_stream_write()? - && matches!(self.storage_format, FuseStorageFormat::Parquet); + let enable_stream_block_write = + ctx.get_settings().get_enable_block_stream_write()? 
&& self.storage_format_as_parquet(); if enable_stream_block_write { pipeline.add_transform(|input, output| { TransformBlockWriter::try_create( From 8c77f3592f56704021cbbc2258021c60a8d30e0b Mon Sep 17 00:00:00 2001 From: zhyass Date: Sun, 4 May 2025 02:19:01 +0800 Subject: [PATCH 02/36] fix exchange --- .../interpreter_table_recluster.rs | 2 +- .../builders/builder_hilbert_partition.rs | 1 + .../aggregator/aggregate_exchange_injector.rs | 1 + .../src/schedulers/fragments/fragmenter.rs | 17 +++- .../src/schedulers/fragments/plan_fragment.rs | 2 + .../query_fragment_actions_display.rs | 1 + .../flight/v1/exchange/data_exchange.rs | 17 ++++ .../flight/v1/exchange/exchange_injector.rs | 6 ++ .../flight/v1/exchange/exchange_manager.rs | 15 ++- .../src/servers/flight/v1/exchange/mod.rs | 1 + .../flight/v1/scatter/flight_scatter_mod.rs | 92 +++++++++++++++++++ .../src/servers/flight/v1/scatter/mod.rs | 2 + src/query/sql/src/executor/format.rs | 8 ++ .../sql/src/executor/physical_plans/common.rs | 2 + .../physical_plans/physical_exchange.rs | 8 ++ .../planner/format/display_rel_operator.rs | 1 + .../sql/src/planner/optimizer/ir/format.rs | 1 + .../planner/optimizer/ir/property/enforcer.rs | 1 + .../planner/optimizer/ir/property/property.rs | 8 +- .../optimizers/cascades/cost/model.rs | 2 +- src/query/sql/src/planner/plans/exchange.rs | 4 +- .../storages/fuse/src/operations/append.rs | 1 + .../processors/transform_block_writer.rs | 22 ++++- 23 files changed, 208 insertions(+), 7 deletions(-) create mode 100644 src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 558be0d8bdbba..2f8e77f29738f 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -467,7 +467,7 @@ impl ReclusterTableInterpreter { plan = Box::new(PhysicalPlan::Exchange(Exchange { plan_id: 0, input: plan, - kind: FragmentKind::Normal, + kind: FragmentKind::Modulo, keys: vec![expr], allow_adjust_parallelism: true, ignore_exchange: false, diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 2d38a1e3b8281..6104fefbbb93e 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -99,6 +99,7 @@ impl PipelineBuilder { self.ctx.clone(), input, output, + MutationKind::Recluster, table, partition.table_meta_timestamps, false, diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs index 55688a4347259..40904ea2c8e16 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs @@ -233,6 +233,7 @@ impl ExchangeInjector for AggregateInjector { match exchange { DataExchange::Merge(_) => unreachable!(), DataExchange::Broadcast(_) => unreachable!(), + DataExchange::Modulo(_) => unreachable!(), DataExchange::ShuffleDataExchange(exchange) => { Ok(Arc::new(Box::new(HashTableHashScatter { buckets: exchange.destination_ids.len(), diff --git 
a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index dbdda532daca8..dc267c896dc17 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -15,6 +15,7 @@ use std::sync::Arc; use databend_common_catalog::table_context::TableContext; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_types::NodeInfo; use databend_common_sql::executor::physical_plans::CompactSource; @@ -40,6 +41,7 @@ use crate::schedulers::PlanFragment; use crate::servers::flight::v1::exchange::BroadcastExchange; use crate::servers::flight::v1::exchange::DataExchange; use crate::servers::flight::v1::exchange::MergeExchange; +use crate::servers::flight::v1::exchange::ModuloExchange; use crate::servers::flight::v1::exchange::ShuffleDataExchange; use crate::sessions::QueryContext; use crate::sql::executor::PhysicalPlan; @@ -116,6 +118,15 @@ impl Fragmenter { FragmentKind::Expansive => { Ok(Some(BroadcastExchange::create(Self::get_executors(ctx)))) } + FragmentKind::Modulo => { + if plan.keys.len() != 1 { + return Err(ErrorCode::Internal("Modulo exchange require one key")); + } + Ok(Some(ModuloExchange::create( + Self::get_executors(ctx), + plan.keys[0].clone(), + ))) + } _ => Ok(None), }, _ => Ok(None), @@ -203,8 +214,12 @@ impl PhysicalPlanReplacer for Fragmenter { } fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { + let input = self.replace(&plan.input)?; self.state = State::HilbertRecluster; - Ok(PhysicalPlan::HilbertPartition(Box::new(plan.clone()))) + Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { + input: Box::new(input), + ..plan.clone() + }))) } fn replace_compact_source(&mut self, plan: &CompactSource) -> Result { diff --git a/src/query/service/src/schedulers/fragments/plan_fragment.rs b/src/query/service/src/schedulers/fragments/plan_fragment.rs index e8306854a981f..fab77a79d29f5 100644 --- a/src/query/service/src/schedulers/fragments/plan_fragment.rs +++ b/src/query/service/src/schedulers/fragments/plan_fragment.rs @@ -597,7 +597,9 @@ struct ReplaceHilbert { impl PhysicalPlanReplacer for ReplaceHilbert { fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { + let input = self.replace(&plan.input)?; Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { + input: Box::new(input), range_width: self.range_width, range_start: self.range_start, ..plan.clone() diff --git a/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs b/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs index adb0b6c3bcd18..36d8f0c257eb1 100644 --- a/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs +++ b/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs @@ -72,6 +72,7 @@ impl Display for QueryFragmentActionsWrap<'_> { DataExchange::Merge(_) => writeln!(f, " DataExchange: Merge")?, DataExchange::Broadcast(_) => writeln!(f, " DataExchange: Broadcast")?, DataExchange::ShuffleDataExchange(_) => writeln!(f, " DataExchange: Shuffle")?, + DataExchange::Modulo(_) => writeln!(f, " DataExchange: Modulo")?, } } diff --git a/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs b/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs index f23c7582559a7..0fba30c72ec7b 100644 --- a/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs +++ 
b/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs @@ -19,6 +19,7 @@ pub enum DataExchange { Merge(MergeExchange), Broadcast(BroadcastExchange), ShuffleDataExchange(ShuffleDataExchange), + Modulo(ModuloExchange), } impl DataExchange { @@ -27,6 +28,7 @@ impl DataExchange { DataExchange::Merge(exchange) => vec![exchange.destination_id.clone()], DataExchange::Broadcast(exchange) => exchange.destination_ids.clone(), DataExchange::ShuffleDataExchange(exchange) => exchange.destination_ids.clone(), + DataExchange::Modulo(exchange) => exchange.destination_ids.clone(), } } } @@ -77,3 +79,18 @@ impl BroadcastExchange { DataExchange::Broadcast(BroadcastExchange { destination_ids }) } } + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct ModuloExchange { + pub destination_ids: Vec, + pub shuffle_key: RemoteExpr, +} + +impl ModuloExchange { + pub fn create(destination_ids: Vec, shuffle_key: RemoteExpr) -> DataExchange { + DataExchange::Modulo(ModuloExchange { + destination_ids, + shuffle_key, + }) + } +} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs index 4aa65ba175a83..5b10b4f346960 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs @@ -29,6 +29,7 @@ use crate::servers::flight::v1::exchange::ShuffleExchangeParams; use crate::servers::flight::v1::scatter::BroadcastFlightScatter; use crate::servers::flight::v1::scatter::FlightScatter; use crate::servers::flight::v1::scatter::HashFlightScatter; +use crate::servers::flight::v1::scatter::ModFlightScatter; use crate::sessions::QueryContext; pub trait ExchangeInjector: Send + Sync + 'static { @@ -100,6 +101,11 @@ impl ExchangeInjector for DefaultExchangeInjector { local_pos, )? 
} + DataExchange::Modulo(exchange) => ModFlightScatter::try_create( + ctx.get_function_context()?, + &exchange.shuffle_key, + exchange.destination_ids.len(), + )?, })) } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs index 13a6a57742127..13a65e33ebf08 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs @@ -303,7 +303,7 @@ impl DataExchangeManager { None, Some(config.query.to_rpc_client_tls_config()), ) - .await?, + .await?, ))), false => Ok(FlightClient::new(FlightServiceClient::new( ConnectionFactory::create_rpc_channel(address.to_owned(), None, None).await?, @@ -1011,6 +1011,19 @@ impl FragmentCoordinator { .flight_scatter(&info.query_ctx, data_exchange)?, }), )), + DataExchange::Modulo(exchange) => { + Ok(Some(ExchangeParams::ShuffleExchange(ShuffleExchangeParams { + exchange_injector: exchange_injector.clone(), + schema: self.physical_plan.output_schema()?, + fragment_id: self.fragment_id, + query_id: info.query_id.to_string(), + executor_id: info.current_executor.to_string(), + destination_ids: exchange.destination_ids.to_owned(), + shuffle_scatter: exchange_injector + .flight_scatter(&info.query_ctx, data_exchange)?, + }) + )) + } } } diff --git a/src/query/service/src/servers/flight/v1/exchange/mod.rs b/src/query/service/src/servers/flight/v1/exchange/mod.rs index 194f2cbe1e3e5..ada27909df959 100644 --- a/src/query/service/src/servers/flight/v1/exchange/mod.rs +++ b/src/query/service/src/servers/flight/v1/exchange/mod.rs @@ -32,6 +32,7 @@ pub mod serde; pub use data_exchange::BroadcastExchange; pub use data_exchange::DataExchange; pub use data_exchange::MergeExchange; +pub use data_exchange::ModuloExchange; pub use data_exchange::ShuffleDataExchange; pub use exchange_injector::DefaultExchangeInjector; pub use exchange_injector::ExchangeInjector; diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs new file mode 100644 index 0000000000000..f83fea3f574c2 --- /dev/null +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs @@ -0,0 +1,92 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
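+//! Scatters rows across executors by evaluating `shuffle_key % destination_count` per row; used by the Modulo data exchange introduced for Hilbert recluster.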
+ +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::type_check::check_function; +use databend_common_expression::types::DataType; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberScalar; +use databend_common_expression::DataBlock; +use databend_common_expression::Evaluator; +use databend_common_expression::Expr; +use databend_common_expression::FunctionContext; +use databend_common_expression::RemoteExpr; +use databend_common_expression::Scalar; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use crate::servers::flight::v1::scatter::FlightScatter; + +#[derive(Clone)] +pub struct ModFlightScatter { + scatter_size: usize, + func_ctx: FunctionContext, + expr: Expr, +} + +impl ModFlightScatter { + pub fn try_create( + func_ctx: FunctionContext, + expr: &RemoteExpr, + scatter_size: usize, + ) -> Result> { + let expr = check_function( + None, + "modulo", + &[], + &[ + expr.as_expr(&BUILTIN_FUNCTIONS), + Expr::constant( + Scalar::Number(NumberScalar::UInt64(scatter_size as u64)), + Some(DataType::Number(NumberDataType::UInt64)), + ), + ], + &BUILTIN_FUNCTIONS, + )?; + let return_type = expr.data_type(); + if !matches!(return_type, DataType::Number(NumberDataType::UInt64)) { + return Err(ErrorCode::Internal(format!( + "ModFlightScatter expects modulo expression to return UInt64, but got {:?}", + return_type + ))); + } + + Ok(Box::new(ModFlightScatter { + scatter_size, + func_ctx, + expr, + })) + } +} + +impl FlightScatter for ModFlightScatter { + fn execute(&self, data_block: DataBlock) -> Result> { + let evaluator = Evaluator::new(&data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); + let num = data_block.num_rows(); + + let column = evaluator + .run(&self.expr)? 
+ .into_full_column(&DataType::Number(NumberDataType::UInt64), num); + let indices = column.as_number().unwrap().as_u_int64().unwrap(); + let data_blocks = DataBlock::scatter(&data_block, indices, self.scatter_size)?; + + let block_meta = data_block.get_meta(); + let mut res = Vec::with_capacity(data_blocks.len()); + for data_block in data_blocks { + res.push(data_block.add_meta(block_meta.cloned())?); + } + + Ok(res) + } +} diff --git a/src/query/service/src/servers/flight/v1/scatter/mod.rs b/src/query/service/src/servers/flight/v1/scatter/mod.rs index b5f5f900dab71..2904ed87684ca 100644 --- a/src/query/service/src/servers/flight/v1/scatter/mod.rs +++ b/src/query/service/src/servers/flight/v1/scatter/mod.rs @@ -15,7 +15,9 @@ mod flight_scatter; mod flight_scatter_broadcast; mod flight_scatter_hash; +mod flight_scatter_mod; pub use flight_scatter::FlightScatter; pub use flight_scatter_broadcast::BroadcastFlightScatter; pub use flight_scatter_hash::HashFlightScatter; +pub use flight_scatter_mod::ModFlightScatter; diff --git a/src/query/sql/src/executor/format.rs b/src/query/sql/src/executor/format.rs index 1e3f8879339f3..29f5bc2529dad 100644 --- a/src/query/sql/src/executor/format.rs +++ b/src/query/sql/src/executor/format.rs @@ -1639,6 +1639,14 @@ fn exchange_to_format_tree( ), FragmentKind::Expansive => "Broadcast".to_string(), FragmentKind::Merge => "Merge".to_string(), + FragmentKind::Modulo => format!( + "Modulo({})", + plan.keys + .iter() + .map(|key| { key.as_expr(&BUILTIN_FUNCTIONS).sql_display() }) + .collect::>() + .join(", ") + ), })), to_format_tree(&plan.input, metadata, profs, context)?, ])) diff --git a/src/query/sql/src/executor/physical_plans/common.rs b/src/query/sql/src/executor/physical_plans/common.rs index 545179b4af4d6..10859f8391da1 100644 --- a/src/query/sql/src/executor/physical_plans/common.rs +++ b/src/query/sql/src/executor/physical_plans/common.rs @@ -67,6 +67,8 @@ pub enum FragmentKind { // Broadcast Expansive, Merge, + // Partitioned by a specified expression % node_nums + Modulo, } #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Copy)] diff --git a/src/query/sql/src/executor/physical_plans/physical_exchange.rs b/src/query/sql/src/executor/physical_plans/physical_exchange.rs index 1e831519c415b..b4507942dc8ba 100644 --- a/src/query/sql/src/executor/physical_plans/physical_exchange.rs +++ b/src/query/sql/src/executor/physical_plans/physical_exchange.rs @@ -81,6 +81,14 @@ impl PhysicalPlanBuilder { allow_adjust_parallelism = false; FragmentKind::Merge } + crate::plans::Exchange::Modulo(scalar) => { + let expr = scalar + .type_check(input_schema.as_ref())? 
+ .project_column_ref(|index| input_schema.index_of(&index.to_string()).unwrap()); + let (expr, _) = ConstantFolder::fold(&expr, &self.func_ctx, &BUILTIN_FUNCTIONS); + keys.push(expr.as_remote_expr()); + FragmentKind::Modulo + } }; Ok(PhysicalPlan::Exchange(Exchange { plan_id: 0, diff --git a/src/query/sql/src/planner/format/display_rel_operator.rs b/src/query/sql/src/planner/format/display_rel_operator.rs index 9835bbedf0cdd..a82e84ddc59ac 100644 --- a/src/query/sql/src/planner/format/display_rel_operator.rs +++ b/src/query/sql/src/planner/format/display_rel_operator.rs @@ -397,6 +397,7 @@ fn exchange_to_format_tree(id_humanizer: &I, op: &Exchange) -> F Exchange::Broadcast => "Exchange(Broadcast)", Exchange::Merge => "Exchange(Merge)", Exchange::MergeSort => "Exchange(MergeSort)", + Exchange::Modulo(_) => "Exchange(Modulo)", }; match op { diff --git a/src/query/sql/src/planner/optimizer/ir/format.rs b/src/query/sql/src/planner/optimizer/ir/format.rs index f9613af6b35ef..017c9bec97203 100644 --- a/src/query/sql/src/planner/optimizer/ir/format.rs +++ b/src/query/sql/src/planner/optimizer/ir/format.rs @@ -66,6 +66,7 @@ fn display_rel_op(rel_op: &RelOperator) -> String { Exchange::Broadcast => "Broadcast".to_string(), Exchange::Merge => "Merge".to_string(), Exchange::MergeSort => "MergeSort".to_string(), + Exchange::Modulo(scalar) => format!("Modulo({})", scalar.as_raw_expr()), }) } RelOperator::DummyTableScan(_) => "DummyTableScan".to_string(), diff --git a/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs b/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs index 1229898f5bef4..69abb144e11a4 100644 --- a/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs +++ b/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs @@ -73,6 +73,7 @@ impl Enforcer for DistributionEnforcer { Distribution::Random | Distribution::Any => Err(ErrorCode::Internal( "Cannot enforce random or any distribution", )), + Distribution::Modulo(key) => Ok(Exchange::Modulo(key.clone()).into()), } } } diff --git a/src/query/sql/src/planner/optimizer/ir/property/property.rs b/src/query/sql/src/planner/optimizer/ir/property/property.rs index 9ae23730ca7e7..3eff4f594a2d2 100644 --- a/src/query/sql/src/planner/optimizer/ir/property/property.rs +++ b/src/query/sql/src/planner/optimizer/ir/property/property.rs @@ -92,6 +92,7 @@ pub enum Distribution { Serial, Broadcast, Hash(Vec), + Modulo(Box), } impl Default for Distribution { @@ -110,11 +111,15 @@ impl Distribution { | (Distribution::Random, _) | (Distribution::Serial, Distribution::Serial) | (Distribution::Broadcast, Distribution::Broadcast) - | (Distribution::Hash(_), Distribution::Broadcast) => true, + | (Distribution::Hash(_), Distribution::Broadcast) + | (Distribution::Modulo(_), Distribution::Broadcast) => true, (Distribution::Hash(ref keys), Distribution::Hash(ref other_keys)) => { keys == other_keys } + (Distribution::Modulo(ref key), Distribution::Modulo(ref other_key)) => { + key == other_key + } _ => false, } } @@ -135,6 +140,7 @@ impl Display for Distribution { .collect::>() .join(", ") ), + Distribution::Modulo(ref key) => write!(f, "Modulo({})", key.as_raw_expr()), } } } diff --git a/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs b/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs index 6d39e793a7231..5bd737365a76b 100644 --- a/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs +++ b/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs @@ -158,7 +158,7 @@ 
impl DefaultCostModel { let exchange: Exchange = (*m_expr.plan.clone()).clone().try_into()?; let group = memo.group(m_expr.group_index)?; let cost = match exchange { - Exchange::Hash(_) => { + Exchange::Hash(_) | Exchange::Modulo(_) => { group.stat_info.cardinality * self.network_per_row + group.stat_info.cardinality * self.compute_per_row } diff --git a/src/query/sql/src/planner/plans/exchange.rs b/src/query/sql/src/planner/plans/exchange.rs index a7aca885b2ed1..db8dffd95d8cf 100644 --- a/src/query/sql/src/planner/plans/exchange.rs +++ b/src/query/sql/src/planner/plans/exchange.rs @@ -30,7 +30,8 @@ pub enum Exchange { Hash(Vec), Broadcast, Merge, - MergeSort, // For distributed sort + MergeSort, // For distributed sort + Modulo(Box), // For recluster } impl Operator for Exchange { @@ -49,6 +50,7 @@ impl Operator for Exchange { Exchange::Broadcast => Distribution::Broadcast, Exchange::Merge => Distribution::Serial, Exchange::MergeSort => Distribution::Serial, + Exchange::Modulo(key) => Distribution::Modulo(key.clone()), }, }) } diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index e6460a7c247a0..84bdd8ca77609 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -57,6 +57,7 @@ impl FuseTable { ctx.clone(), input, output, + MutationKind::Insert, self, table_meta_timestamps, false, diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index 7a3615233b8ce..b3d1fd7bb416e 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -24,11 +24,13 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_io::constants::DEFAULT_BLOCK_ROW_COUNT; +use databend_common_metrics::storage::metrics_inc_recluster_write_block_nums; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storage::MutationStatus; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use opendal::Operator; @@ -37,6 +39,8 @@ use crate::io::BlockSerialization; use crate::io::BlockWriter; use crate::io::StreamBlockBuilder; use crate::io::StreamBlockProperties; +use crate::operations::MutationLogEntry; +use crate::operations::MutationLogs; use crate::FuseTable; use crate::FUSE_OPT_KEY_ROW_PER_BLOCK; @@ -54,6 +58,7 @@ pub struct TransformBlockWriter { state: State, input: Arc, output: Arc, + kind: MutationKind, properties: Arc, @@ -76,6 +81,7 @@ impl TransformBlockWriter { ctx: Arc, input: Arc, output: Arc, + kind: MutationKind, table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, with_tid: bool, @@ -89,6 +95,7 @@ impl TransformBlockWriter { state: State::Consume, input, output, + kind, properties, builder: None, dal: table.get_operator(), @@ -273,7 +280,20 @@ impl Processor for TransformBlockWriter { }); } - self.output_data = Some(DataBlock::empty_with_meta(Box::new(extended_block_meta))); + let output = if matches!(self.kind, 
MutationKind::Insert) { + DataBlock::empty_with_meta(Box::new(extended_block_meta)) + } else { + if matches!(self.kind, MutationKind::Recluster) { + metrics_inc_recluster_write_block_nums(); + } + + DataBlock::empty_with_meta(Box::new(MutationLogs { + entries: vec![MutationLogEntry::AppendBlock { + block_meta: Arc::new(extended_block_meta), + }], + })) + }; + self.output_data = Some(output); } _ => return Err(ErrorCode::Internal("It's a bug.")), } From 67bd532d06fca17f6b5095e4fd908617d3100dfb Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 02:16:56 +0800 Subject: [PATCH 03/36] add transform hilbert collect --- .../expression/src/utils/block_thresholds.rs | 49 ++++ .../interpreter_table_recluster.rs | 14 +- .../builders/builder_hilbert_partition.rs | 65 +++-- .../partition/data_processor_strategy.rs | 12 +- .../transforms/window/partition/mod.rs | 2 + .../partition/transform_hilbert_collect.rs | 253 ++++++++++++++++++ .../partition/window_partition_buffer.rs | 121 +++++---- src/query/service/src/spillers/spiller.rs | 18 ++ .../physical_plans/physical_recluster.rs | 1 + .../fuse/src/io/write/stream/block_builder.rs | 10 +- .../storages/fuse/src/operations/append.rs | 1 + .../processors/transform_block_writer.rs | 26 +- 12 files changed, 458 insertions(+), 114 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index 4fd35638cb863..fe72302382d8a 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -166,4 +166,53 @@ impl BlockThresholds { }; total_rows.div_ceil(block_nums.max(1)).max(1) } + + /// Calculates the optimal number of partitions (blocks) based on total data size and row count. + /// + /// # Parameters + /// - `total_rows`: The total number of rows in the data. + /// - `total_bytes`: The total uncompressed size of the data in bytes. + /// - `total_compressed`: The total compressed size of the data in bytes. + /// + /// # Returns + /// - The calculated number of partitions (blocks) needed. + #[inline] + pub fn calc_partitions_for_recluster( + &self, + total_rows: usize, + total_bytes: usize, + total_compressed: usize, + ) -> usize { + // If the data is already compact enough, return a single partition. + if self.check_for_compact(total_rows, total_bytes) + && total_compressed < 2 * self.min_compressed_per_block + { + return 1; + } + + // Estimate the number of blocks based on row count and compressed size. + let by_rows = std::cmp::max(total_rows / self.max_rows_per_block, 1); + let by_compressed = total_compressed / self.max_compressed_per_block; + // If row-based block count is greater, use max rows per block as limit. + if by_rows >= by_compressed { + return by_rows; + } + + // Adjust block count based on byte size thresholds. + let bytes_per_block = total_bytes.div_ceil(by_compressed); + let max_bytes = self.max_bytes_per_block.min(400 * 1024 * 1024); + let min_bytes = max_bytes / 2; + let total_partitions = if bytes_per_block > max_bytes { + // Block size is too large. + total_bytes / max_bytes + } else if bytes_per_block < min_bytes { + // Block size is too small. + total_bytes / min_bytes + } else { + // Block size is acceptable. 
+ by_compressed + }; + + std::cmp::max(total_partitions, 1) + } } diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 2f8e77f29738f..411452a48d5a0 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -323,12 +323,15 @@ impl ReclusterTableInterpreter { let total_rows = recluster_info.removed_statistics.row_count as usize; let total_compressed = recluster_info.removed_statistics.compressed_byte_size as usize; - // Determine rows per block based on data size and compression ratio - let rows_per_block = - block_thresholds.calc_rows_for_recluster(total_rows, total_bytes, total_compressed); - + // Determine rows per block based on data size and compression ratio, // Calculate initial partition count based on data volume and block size - let total_partitions = std::cmp::max(total_rows / rows_per_block, 1); + let total_partitions = block_thresholds.calc_partitions_for_recluster( + total_rows, + total_bytes, + total_compressed, + ); + let bytes_per_block = (total_bytes / total_partitions).max(1); + let rows_per_block = (total_rows / total_partitions).max(1); warn!( "Do hilbert recluster, total_bytes: {}, total_rows: {}, total_partitions: {}", @@ -487,6 +490,7 @@ impl ReclusterTableInterpreter { range_start: 0, range_width: total_partitions, table_meta_timestamps, + bytes_per_block, rows_per_block, })); diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 6104fefbbb93e..fd351cd1e5fb7 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -18,7 +18,6 @@ use std::sync::atomic::AtomicUsize; use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; -use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::executor::physical_plans::HilbertPartition; @@ -27,12 +26,12 @@ use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::statistics::ClusterStatsGenerator; use databend_common_storages_fuse::FuseTable; -use databend_common_storages_fuse::FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD; use databend_storages_common_cache::TempDirManager; use crate::pipelines::memory_settings::MemorySettingsExt; use crate::pipelines::processors::transforms::CompactStrategy; use crate::pipelines::processors::transforms::HilbertPartitionExchange; +use crate::pipelines::processors::transforms::TransformHilbertCollect; use crate::pipelines::processors::transforms::TransformWindowPartitionCollect; use crate::pipelines::PipelineBuilder; use crate::spillers::SpillerDiskConfig; @@ -65,35 +64,25 @@ impl PipelineBuilder { let window_spill_settings = MemorySettings::from_window_settings(&self.ctx)?; let processor_id = AtomicUsize::new(0); - let max_bytes_per_block = std::cmp::min( - 4 * table.get_option( - FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD, - DEFAULT_BLOCK_BUFFER_SIZE, - ), - 400 * 1024 * 1024, - ); - self.main_pipeline.add_transform(|input, output| { - 
Ok(ProcessorPtr::create(Box::new( - TransformWindowPartitionCollect::new( - self.ctx.clone(), - input, - output, - &settings, - processor_id.fetch_add(1, atomic::Ordering::AcqRel), - num_processors, - partition.range_width, - window_spill_settings.clone(), - disk_spill.clone(), - CompactStrategy::new( - partition.rows_per_block, - max_bytes_per_block, - enable_stream_writer, - ), - )?, - ))) - })?; if enable_stream_writer { + self.main_pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformHilbertCollect::new( + self.ctx.clone(), + input, + output, + &settings, + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partition.range_width, + window_spill_settings.clone(), + disk_spill.clone(), + partition.bytes_per_block, + )?, + ))) + })?; + self.main_pipeline.add_transform(|input, output| { TransformBlockWriter::try_create( self.ctx.clone(), @@ -103,9 +92,27 @@ impl PipelineBuilder { table, partition.table_meta_timestamps, false, + Some(partition.bytes_per_block), ) }) } else { + self.main_pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformWindowPartitionCollect::new( + self.ctx.clone(), + input, + output, + &settings, + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partition.range_width, + window_spill_settings.clone(), + disk_spill.clone(), + CompactStrategy::new(partition.rows_per_block, partition.bytes_per_block), + )?, + ))) + })?; + self.main_pipeline .add_transform(|transform_input_port, transform_output_port| { let proc = TransformSerializeBlock::try_create( diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs index 3515858340e89..75793aa415e08 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs @@ -27,19 +27,13 @@ pub trait DataProcessorStrategy: Send + Sync + 'static { pub struct CompactStrategy { max_bytes_per_block: usize, max_rows_per_block: usize, - enable_stream_writer: bool, } impl CompactStrategy { - pub fn new( - max_rows_per_block: usize, - max_bytes_per_block: usize, - enable_stream_writer: bool, - ) -> Self { + pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { Self { max_bytes_per_block, max_rows_per_block, - enable_stream_writer, } } @@ -56,10 +50,6 @@ impl DataProcessorStrategy for CompactStrategy { const NAME: &'static str = "Compact"; fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - if self.enable_stream_writer { - return Ok(data_blocks); - } - let blocks_num = data_blocks.len(); if blocks_num < 2 { return Ok(data_blocks); diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index 5aa4562c98865..96edfcc986434 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ -14,6 +14,7 @@ mod data_processor_strategy; mod hilbert_partition_exchange; +mod transform_hilbert_collect; mod transform_window_partition_collect; mod window_partition_buffer; mod window_partition_exchange; @@ -22,6 +23,7 @@ mod window_partition_partial_top_n_exchange; pub use 
data_processor_strategy::*; pub use hilbert_partition_exchange::*; +pub use transform_hilbert_collect::*; pub use transform_window_partition_collect::*; pub use window_partition_buffer::*; pub use window_partition_exchange::*; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs new file mode 100644 index 0000000000000..17f6fc17eeb65 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs @@ -0,0 +1,253 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_transforms::MemorySettings; +use databend_common_settings::Settings; +use databend_common_storage::DataOperator; + +use super::WindowPartitionBuffer; +use super::WindowPartitionMeta; +use crate::sessions::QueryContext; +use crate::spillers::Spiller; +use crate::spillers::SpillerConfig; +use crate::spillers::SpillerDiskConfig; +use crate::spillers::SpillerType; + +enum State { + Collect, + Flush, + Spill, + Restore, + Concat(Vec), +} + +pub struct TransformHilbertCollect { + input: Arc, + output: Arc, + + immediate_output_blocks: Vec<(usize, DataBlock)>, + output_data_blocks: VecDeque, + + // The partition id is used to map the partition id to the new partition id. + partition_id: Vec, + partition_sizes: Vec, + // The buffer is used to control the memory usage of the window operator. + buffer: WindowPartitionBuffer, + + max_block_size: usize, + // Event variables. + state: State, +} + +impl TransformHilbertCollect { + #[allow(clippy::too_many_arguments)] + pub fn new( + ctx: Arc, + input: Arc, + output: Arc, + settings: &Settings, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + memory_settings: MemorySettings, + disk_spill: Option, + max_block_size: usize, + ) -> Result { + // Calculate the partition ids collected by the processor. + let partitions: Vec = (0..num_partitions) + .filter(|&partition| partition % num_processors == processor_id) + .collect(); + + // Map each partition id to new partition id. 
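+        // e.g. with num_partitions = 8 and num_processors = 4, processor 1 owns global partitions {1, 5}, remapped to local ids {0, 1}.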
+ let mut partition_id = vec![0; num_partitions]; + for (new_partition_id, partition) in partitions.iter().enumerate() { + partition_id[*partition] = new_partition_id; + } + + let location_prefix = ctx.query_id_spill_prefix(); + let spill_config = SpillerConfig { + spiller_type: SpillerType::Window, + location_prefix, + disk_spill, + use_parquet: settings.get_spilling_file_format()?.is_parquet(), + }; + + // Create an inner `Spiller` to spill data. + let operator = DataOperator::instance().spill_operator(); + let spiller = Spiller::create(ctx, operator, spill_config)?; + + // Create the window partition buffer. + let sort_block_size = settings.get_window_partition_sort_block_size()? as usize; + let buffer = WindowPartitionBuffer::new( + spiller, + partitions.len(), + sort_block_size, + memory_settings, + )?; + + Ok(Self { + input, + output, + partition_id, + buffer, + immediate_output_blocks: vec![], + partition_sizes: vec![0; num_partitions], + max_block_size, + output_data_blocks: VecDeque::new(), + state: State::Collect, + }) + } +} + +#[async_trait::async_trait] +impl Processor for TransformHilbertCollect { + fn name(&self) -> String { + "TransformHilbertCollect".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if matches!(self.state, State::Concat(_)) { + return Ok(Event::Sync); + } + + if matches!(self.state, State::Flush | State::Spill | State::Restore) { + return Ok(Event::Async); + } + + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data_blocks.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.need_spill() { + self.state = State::Spill; + return Ok(Event::Async); + } + + if !self.immediate_output_blocks.is_empty() { + self.state = State::Flush; + return Ok(Event::Async); + } + + if self.input.is_finished() { + if !self.buffer.is_empty() { + self.state = State::Restore; + return Ok(Event::Async); + } + + self.output.finish(); + return Ok(Event::Finished); + } + + if self.input.has_data() { + self.collect_data_block()?; + + if self.need_spill() { + self.state = State::Spill; + return Ok(Event::Async); + } + + if !self.immediate_output_blocks.is_empty() { + self.state = State::Flush; + return Ok(Event::Async); + } + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.state, State::Collect) { + State::Concat(blocks) => { + let output = DataBlock::concat(&blocks)?; + self.output_data_blocks.push_back(output); + } + _ => unreachable!(), + } + Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.state, State::Collect) { + State::Spill => { + self.buffer.spill().await?; + } + State::Flush => { + if let Some((partition_id, data_block)) = self.immediate_output_blocks.pop() { + let mut restored_data_blocks = self.buffer.restore_by_id(partition_id).await?; + restored_data_blocks.push(data_block); + self.state = State::Concat(restored_data_blocks); + } + } + State::Restore => { + let restored_data_blocks = self.buffer.restore().await?; + self.output_data_blocks.extend(restored_data_blocks); + } + _ => unreachable!(), + } + Ok(()) + } +} + +impl TransformHilbertCollect { + fn collect_data_block(&mut self) -> Result<()> { + let data_block = self.input.pull_data().unwrap()?; 
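+        // Track the estimated size accumulated per partition: once a partition reaches max_block_size its block is routed to immediate_output_blocks for an immediate flush; otherwise it is buffered (and may be spilled under memory pressure).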
+ if let Some(meta) = data_block + .get_owned_meta() + .and_then(WindowPartitionMeta::downcast_from) + { + for (partition_id, data_block) in meta.partitioned_data.into_iter() { + let new_id = self.partition_id[partition_id]; + self.partition_sizes[new_id] += data_block.estimate_block_size(); + if self.partition_sizes[new_id] >= self.max_block_size { + self.immediate_output_blocks.push((new_id, data_block)); + self.partition_sizes[new_id] = 0; + continue; + } + self.buffer.add_data_block(new_id, data_block); + } + } + Ok(()) + } + + fn need_spill(&mut self) -> bool { + self.buffer.need_spill() + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index bf01acedc586c..0a14b73bc26b5 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -145,75 +145,80 @@ impl WindowPartitionBuffer { while self.next_to_restore_partition_id + 1 < self.num_partitions as isize { self.next_to_restore_partition_id += 1; let partition_id = self.next_to_restore_partition_id as usize; - // Restore large partitions from spilled files. - let mut result = self.spiller.read_spilled_partition(&partition_id).await?; - - // Restore small merged partitions from spilled files. - let spilled_small_partitions = - std::mem::take(&mut self.spilled_small_partitions[partition_id]); - for index in spilled_small_partitions { - let out_of_memory_limit = self.out_of_memory_limit(); - let (merged_partitions, restored, partial_restored) = - &mut self.spilled_merged_partitions[index]; - if *restored { - continue; - } - let MergedPartition { - location, - partitions, - } = merged_partitions; - if out_of_memory_limit || *partial_restored { - if let Some(pos) = partitions.iter().position(|(id, _)| *id == partition_id) { - let data_block = self - .spiller - .read_chunk(location, &partitions[pos].1) - .await?; - self.restored_partition_buffer - .add_data_block(partition_id, data_block); - partitions.remove(pos); - *partial_restored = true; - } - } else { - let partitioned_data = self + let result = self.restore_by_id(partition_id).await?; + if !result.is_empty() { + return Ok(result); + } + } + Ok(vec![]) + } + + pub async fn restore_by_id(&mut self, partition_id: usize) -> Result> { + // Restore large partitions from spilled files. + let mut result = self.spiller.take_spilled_partition(&partition_id).await?; + + // Restore small merged partitions from spilled files. 
+ let spilled_small_partitions = + std::mem::take(&mut self.spilled_small_partitions[partition_id]); + for index in spilled_small_partitions { + let out_of_memory_limit = self.out_of_memory_limit(); + let (merged_partitions, restored, partial_restored) = + &mut self.spilled_merged_partitions[index]; + if *restored { + continue; + } + let MergedPartition { + location, + partitions, + } = merged_partitions; + if out_of_memory_limit || *partial_restored { + if let Some(pos) = partitions.iter().position(|(id, _)| *id == partition_id) { + let data_block = self .spiller - .read_merged_partitions(merged_partitions) + .read_chunk(location, &partitions[pos].1) .await?; - for (partition_id, data_block) in partitioned_data.into_iter() { - self.restored_partition_buffer - .add_data_block(partition_id, data_block); - } - *restored = true; + self.restored_partition_buffer + .add_data_block(partition_id, data_block); + partitions.remove(pos); + *partial_restored = true; } - } - - if !self.partition_buffer.is_partition_empty(partition_id) { - let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); - if let Some(data_blocks) = self - .partition_buffer - .fetch_data_blocks(partition_id, &option)? - { - result.extend(self.concat_data_blocks(data_blocks)?); + } else { + let partitioned_data = self + .spiller + .read_merged_partitions(merged_partitions) + .await?; + for (partition_id, data_block) in partitioned_data.into_iter() { + self.restored_partition_buffer + .add_data_block(partition_id, data_block); } + *restored = true; } + } - if !self - .restored_partition_buffer - .is_partition_empty(partition_id) + if !self.partition_buffer.is_partition_empty(partition_id) { + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + if let Some(data_blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &option)? { - let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); - if let Some(data_blocks) = self - .restored_partition_buffer - .fetch_data_blocks(partition_id, &option)? - { - result.extend(self.concat_data_blocks(data_blocks)?); - } + result.extend(self.concat_data_blocks(data_blocks)?); } + } - if !result.is_empty() { - return Ok(result); + if !self + .restored_partition_buffer + .is_partition_empty(partition_id) + { + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + if let Some(data_blocks) = self + .restored_partition_buffer + .fetch_data_blocks(partition_id, &option)? 
+ { + result.extend(self.concat_data_blocks(data_blocks)?); } } - Ok(vec![]) + + Ok(result) } fn concat_data_blocks(&self, data_blocks: Vec) -> Result> { diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 6c454ab89638b..3627f1b83a117 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -386,6 +386,24 @@ impl Spiller { } } + #[async_backtrace::framed] + /// Read and remove spilled data with partition id + pub async fn take_spilled_partition(&mut self, p_id: &usize) -> Result> { + if let Some(locs) = self.partition_location.remove(p_id) { + let mut spilled_data = Vec::with_capacity(locs.len()); + for loc in locs { + let block = self.read_spilled_file(&loc).await?; + + if block.num_rows() != 0 { + spilled_data.push(block); + } + } + Ok(spilled_data) + } else { + Ok(vec![]) + } + } + pub async fn read_merged_partitions( &self, MergedPartition { diff --git a/src/query/sql/src/executor/physical_plans/physical_recluster.rs b/src/query/sql/src/executor/physical_plans/physical_recluster.rs index 43236e53766a5..0a5520ccb5ba2 100644 --- a/src/query/sql/src/executor/physical_plans/physical_recluster.rs +++ b/src/query/sql/src/executor/physical_plans/physical_recluster.rs @@ -33,6 +33,7 @@ pub struct HilbertPartition { pub table_info: TableInfo, pub table_meta_timestamps: TableMetaTimestamps, pub rows_per_block: usize, + pub bytes_per_block: usize, pub range_start: u64, pub range_width: usize, } diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 27eaf89c6616d..d0b8ca8d5d288 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -233,11 +233,16 @@ impl StreamBlockBuilder { } pub fn need_flush(&self) -> bool { + if let Some(max_block_bytes) = self.properties.max_block_bytes { + if self.block_size >= max_block_bytes { + return true; + } + }; let file_size = self.block_writer.compressed_size(); self.row_count >= self.properties.block_thresholds.min_rows_per_block || self.block_size >= self.properties.block_thresholds.max_bytes_per_block || (file_size >= self.properties.block_thresholds.min_compressed_per_block - && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) + && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) } pub fn write(&mut self, block: DataBlock) -> Result<()> { @@ -350,6 +355,7 @@ pub struct StreamBlockProperties { pub(crate) ctx: Arc, pub(crate) write_settings: WriteSettings, pub(crate) block_thresholds: BlockThresholds, + pub(crate) max_block_bytes: Option, meta_locations: TableMetaLocationGenerator, source_schema: TableSchemaRef, @@ -368,6 +374,7 @@ impl StreamBlockProperties { ctx: Arc, table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, + max_block_bytes: Option, ) -> Result> { // remove virtual computed fields. 
let fields = table @@ -430,6 +437,7 @@ impl StreamBlockProperties { ngram_args, inverted_index_builders, table_meta_timestamps, + max_block_bytes, })) } } diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 84bdd8ca77609..57cc6c8af06ea 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -61,6 +61,7 @@ impl FuseTable { self, table_meta_timestamps, false, + None, ) })?; } else { diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index b3d1fd7bb416e..73a85bf4f52c5 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -71,7 +71,7 @@ pub struct TransformBlockWriter { // Only used in multi table insert table_id: Option, - max_block_size: usize, + max_block_rows: usize, input_data: VecDeque, output_data: Option, } @@ -85,12 +85,14 @@ impl TransformBlockWriter { table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, with_tid: bool, + max_block_bytes: Option, ) -> Result { - let max_block_size = std::cmp::min( + let max_block_rows = std::cmp::min( ctx.get_settings().get_max_block_size()? as usize, table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), ); - let properties = StreamBlockProperties::try_create(ctx, table, table_meta_timestamps)?; + let properties = + StreamBlockProperties::try_create(ctx, table, table_meta_timestamps, max_block_bytes)?; Ok(ProcessorPtr::create(Box::new(TransformBlockWriter { state: State::Consume, input, @@ -105,7 +107,7 @@ impl TransformBlockWriter { input_data_size: 0, input_num_rows: 0, output_data: None, - max_block_size, + max_block_rows, }))) } @@ -118,16 +120,16 @@ impl TransformBlockWriter { Ok(self.builder.as_mut().unwrap()) } - fn calc_max_block_size(&self, block: &DataBlock) -> usize { + fn calc_max_block_rows(&self, block: &DataBlock) -> usize { let min_bytes_per_block = self.properties.block_thresholds.min_bytes_per_block; let block_size = block.estimate_block_size(); if block_size < min_bytes_per_block { - return self.max_block_size; + return self.max_block_rows; } let num_rows = block.num_rows(); let average_row_size = block_size.div_ceil(num_rows); let max_rows = min_bytes_per_block.div_ceil(average_row_size); - self.max_block_size.min(max_rows) + self.max_block_rows.min(max_rows) } } @@ -205,9 +207,13 @@ impl Processor for TransformBlockWriter { block.check_valid()?; self.input_data_size += block.estimate_block_size(); self.input_num_rows += block.num_rows(); - let max_rows_per_block = self.calc_max_block_size(&block); - let blocks = block.split_by_rows_no_tail(max_rows_per_block); - self.input_data.extend(blocks); + if self.properties.max_block_bytes.is_some() { + self.input_data.push_back(block); + } else { + let max_rows_per_block = self.calc_max_block_rows(&block); + let blocks = block.split_by_rows_no_tail(max_rows_per_block); + self.input_data.extend(blocks); + } } State::Serialize => { while let Some(b) = self.input_data.pop_front() { From 3cf0b6fdb6871dc939bfe79a3ba560f92e7f9cf3 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 02:45:14 +0800 Subject: [PATCH 04/36] partial restore --- .../window/partition/transform_hilbert_collect.rs | 3 ++- .../window/partition/window_partition_buffer.rs | 10 +++++++--- 
.../storages/fuse/src/io/write/stream/block_builder.rs | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs index 17f6fc17eeb65..cba5ec06cf0d8 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs @@ -211,7 +211,8 @@ impl Processor for TransformHilbertCollect { } State::Flush => { if let Some((partition_id, data_block)) = self.immediate_output_blocks.pop() { - let mut restored_data_blocks = self.buffer.restore_by_id(partition_id).await?; + let mut restored_data_blocks = + self.buffer.restore_by_id(partition_id, true).await?; restored_data_blocks.push(data_block); self.state = State::Concat(restored_data_blocks); } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index 0a14b73bc26b5..b58bafca0ee9a 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -145,7 +145,7 @@ impl WindowPartitionBuffer { while self.next_to_restore_partition_id + 1 < self.num_partitions as isize { self.next_to_restore_partition_id += 1; let partition_id = self.next_to_restore_partition_id as usize; - let result = self.restore_by_id(partition_id).await?; + let result = self.restore_by_id(partition_id, false).await?; if !result.is_empty() { return Ok(result); } @@ -153,7 +153,11 @@ impl WindowPartitionBuffer { Ok(vec![]) } - pub async fn restore_by_id(&mut self, partition_id: usize) -> Result> { + pub async fn restore_by_id( + &mut self, + partition_id: usize, + partial_restore: bool, + ) -> Result> { // Restore large partitions from spilled files. 
let mut result = self.spiller.take_spilled_partition(&partition_id).await?; @@ -171,7 +175,7 @@ impl WindowPartitionBuffer { location, partitions, } = merged_partitions; - if out_of_memory_limit || *partial_restored { + if out_of_memory_limit || *partial_restored || partial_restore { if let Some(pos) = partitions.iter().position(|(id, _)| *id == partition_id) { let data_block = self .spiller diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index d0b8ca8d5d288..69e81f8dec714 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -242,7 +242,7 @@ impl StreamBlockBuilder { self.row_count >= self.properties.block_thresholds.min_rows_per_block || self.block_size >= self.properties.block_thresholds.max_bytes_per_block || (file_size >= self.properties.block_thresholds.min_compressed_per_block - && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) + && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) } pub fn write(&mut self, block: DataBlock) -> Result<()> { From 82a5457f08532a413107e8d281be084841d5cde1 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 02:52:32 +0800 Subject: [PATCH 05/36] format --- .../servers/flight/v1/exchange/exchange_manager.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs index 13a65e33ebf08..8d96b11c3488d 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs @@ -303,7 +303,7 @@ impl DataExchangeManager { None, Some(config.query.to_rpc_client_tls_config()), ) - .await?, + .await?, ))), false => Ok(FlightClient::new(FlightServiceClient::new( ConnectionFactory::create_rpc_channel(address.to_owned(), None, None).await?, @@ -1011,8 +1011,8 @@ impl FragmentCoordinator { .flight_scatter(&info.query_ctx, data_exchange)?, }), )), - DataExchange::Modulo(exchange) => { - Ok(Some(ExchangeParams::ShuffleExchange(ShuffleExchangeParams { + DataExchange::Modulo(exchange) => Ok(Some(ExchangeParams::ShuffleExchange( + ShuffleExchangeParams { exchange_injector: exchange_injector.clone(), schema: self.physical_plan.output_schema()?, fragment_id: self.fragment_id, @@ -1021,9 +1021,8 @@ impl FragmentCoordinator { destination_ids: exchange.destination_ids.to_owned(), shuffle_scatter: exchange_injector .flight_scatter(&info.query_ctx, data_exchange)?, - }) - )) - } + }, + ))), } } From ac6ca412287eda38293d06d3e50ab2c51d1c47a8 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 18:43:11 +0800 Subject: [PATCH 06/36] add test --- .../ee/src/hilbert_clustering/handler.rs | 22 +++++++++-- .../expression/src/utils/block_thresholds.rs | 2 +- .../expression/tests/it/block_thresholds.rs | 37 ++++++++++++++++--- .../builders/builder_hilbert_partition.rs | 1 + .../partition/transform_hilbert_collect.rs | 12 +++--- 5 files changed, 58 insertions(+), 16 deletions(-) diff --git a/src/query/ee/src/hilbert_clustering/handler.rs b/src/query/ee/src/hilbert_clustering/handler.rs index cebfbadc5947e..c7ee957a77ed9 100644 --- a/src/query/ee/src/hilbert_clustering/handler.rs +++ b/src/query/ee/src/hilbert_clustering/handler.rs @@ -63,7 +63,7 @@ impl HilbertClusteringHandler for RealHilbertClusteringHandler { 
let max_bytes_per_block = fuse_table.get_option( FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD, DEFAULT_BLOCK_BUFFER_SIZE, - ); + ) * 2; let hilbert_min_bytes = std::cmp::max( hilbert_clustering_min_bytes, max_bytes_per_block * block_per_seg, @@ -76,6 +76,7 @@ impl HilbertClusteringHandler for RealHilbertClusteringHandler { let mut checker = ReclusterChecker::new( cluster_key_id, hilbert_min_bytes, + block_per_seg, push_downs.as_ref().is_none_or(|v| v.filters.is_none()), ); 'FOR: for chunk in segment_locations.chunks(chunk_size) { @@ -139,19 +140,29 @@ struct ReclusterChecker { hilbert_min_bytes: usize, total_bytes: usize, + hilbert_min_blocks: usize, + total_blocks: usize, + finished: bool, // Whether the target segments is at the head of snapshot. head_of_snapshot: bool, } impl ReclusterChecker { - fn new(default_cluster_id: u32, hilbert_min_bytes: usize, head_of_snapshot: bool) -> Self { + fn new( + default_cluster_id: u32, + hilbert_min_bytes: usize, + hilbert_min_blocks: usize, + head_of_snapshot: bool, + ) -> Self { Self { segments: vec![], last_segment: None, default_cluster_id, + hilbert_min_blocks, hilbert_min_bytes, total_bytes: 0, + total_blocks: 0, finished: false, head_of_snapshot, } @@ -164,10 +175,14 @@ impl ReclusterChecker { if segment_should_recluster || !self.head_of_snapshot { self.total_bytes += segment.summary.uncompressed_byte_size as usize; + self.total_blocks += segment.summary.block_count as usize; self.segments.push((location.clone(), segment.clone())); } - if !segment_should_recluster || self.total_bytes >= self.hilbert_min_bytes { + if !segment_should_recluster + || (self.total_bytes >= self.hilbert_min_bytes + && self.total_blocks >= self.hilbert_min_blocks) + { if self.check_for_recluster() { self.finished = true; return true; @@ -208,6 +223,7 @@ impl ReclusterChecker { fn reset(&mut self) { self.total_bytes = 0; + self.total_blocks = 0; self.head_of_snapshot = false; self.segments.clear(); } diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index fe72302382d8a..742bfdf489261 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -153,7 +153,7 @@ impl BlockThresholds { let bytes_per_block = total_bytes.div_ceil(block_num_by_compressed); // Adjust the number of blocks based on block size thresholds. let max_bytes_per_block = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes_per_block = self.min_bytes_per_block.min(100 * 1024 * 1024); + let min_bytes_per_block = max_bytes_per_block / 2; let block_nums = if bytes_per_block > max_bytes_per_block { // Case 1: If the block size is too bigger. total_bytes.div_ceil(max_bytes_per_block) diff --git a/src/query/expression/tests/it/block_thresholds.rs b/src/query/expression/tests/it/block_thresholds.rs index 08793eb2a78e4..371a8194f552c 100644 --- a/src/query/expression/tests/it/block_thresholds.rs +++ b/src/query/expression/tests/it/block_thresholds.rs @@ -15,7 +15,7 @@ use databend_common_expression::BlockThresholds; fn default_thresholds() -> BlockThresholds { - BlockThresholds::new(1000, 1_000_000, 100_000, 4) + BlockThresholds::new(1_000, 1_000_000, 100_000, 4) } #[test] @@ -101,14 +101,41 @@ fn test_calc_rows_for_recluster() { ); // Case 1: If the block size is too bigger. 
- let result = t.calc_rows_for_recluster(4_000, 30_000_000, 600_000); - assert_eq!(result, 400); + let result = t.calc_rows_for_recluster(4_500, 30_000_000, 600_000); + assert_eq!(result, 300); // Case 2: If the block size is too smaller. - let result = t.calc_rows_for_recluster(4_000, 2_000_000, 600_000); - assert_eq!(result, 800); + let result = t.calc_rows_for_recluster(4_000, 4_000_000, 600_000); + assert_eq!(result, 1000); // Case 3: use the compressed-based block count. let result = t.calc_rows_for_recluster(4_000, 10_000_000, 600_000); assert_eq!(result, 667); } + +#[test] +fn test_calc_partitions_for_recluster() { + let t = default_thresholds(); + + // compact enough to skip further calculations + assert_eq!(t.calc_partitions_for_recluster(1000, 500_000, 100_000), 1); + + // row-based block count exceeds compressed-based block count, use max rows per block. + assert_eq!( + t.calc_partitions_for_recluster(10_000, 2_000_000, 100_000), + 10 + ); + + // Case 1: If the block size is too bigger. + let result = t.calc_partitions_for_recluster(4_500, 30_000_000, 600_000); + assert_eq!(result, 15); + + // Case 2: If the block size is too smaller. + let result = t.calc_partitions_for_recluster(4_000, 4_000_000, 600_000); + assert_eq!(result, 4); + + // Case 3: use the compressed-based block count. + let result = t.calc_partitions_for_recluster(4_000, 10_000_000, 600_000); + assert_eq!(result, 6); +} + diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index fd351cd1e5fb7..1ffe3e5e2c69b 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -78,6 +78,7 @@ impl PipelineBuilder { partition.range_width, window_spill_settings.clone(), disk_spill.clone(), + partition.rows_per_block, partition.bytes_per_block, )?, ))) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs index cba5ec06cf0d8..9b6928fb58a9b 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs @@ -73,6 +73,7 @@ impl TransformHilbertCollect { num_partitions: usize, memory_settings: MemorySettings, disk_spill: Option, + max_block_rows: usize, max_block_size: usize, ) -> Result { // Calculate the partition ids collected by the processor. @@ -99,13 +100,10 @@ impl TransformHilbertCollect { let spiller = Spiller::create(ctx, operator, spill_config)?; // Create the window partition buffer. - let sort_block_size = settings.get_window_partition_sort_block_size()? as usize; - let buffer = WindowPartitionBuffer::new( - spiller, - partitions.len(), - sort_block_size, - memory_settings, - )?; + let max_block_rows = + max_block_rows.min(settings.get_window_partition_sort_block_size()? 
as usize); + let buffer = + WindowPartitionBuffer::new(spiller, partitions.len(), max_block_rows, memory_settings)?; Ok(Self { input, From f3dbc57d2249a8da8b667d43c0eba1d85d3f75eb Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 19:01:53 +0800 Subject: [PATCH 07/36] fix test --- .../ee/07_hilbert_clustering/07_0000_recluster_final.test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test index b3354e66740f6..6fc4bfbf44841 100644 --- a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test +++ b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test @@ -48,12 +48,12 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); statement ok -alter table t recluster final; +alter table t recluster; query I select count() from fuse_snapshot('test_hilbert','t'); ---- -6 +5 query II select count(a), sum(a) from t; @@ -77,7 +77,7 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); query I select count() from fuse_snapshot('test_hilbert','t'); ---- -9 +8 query II select block_count,row_count from fuse_segment('test_hilbert','t'); @@ -109,7 +109,7 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); query T select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t', 'a,b'); ---- -(a, b) linear {"average_depth":1.4,"average_overlaps":0.4,"block_depth_histogram":{"00001":3,"00002":2},"constant_block_count":0,"total_block_count":5} +(a, b) linear {"average_depth":1.0,"average_overlaps":0.0,"block_depth_histogram":{"00001":5},"constant_block_count":0,"total_block_count":5} # column specified not exist statement error 1065 From 22f2d3a1bc078c557423e1c60e81d4cf9093f3b4 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 19:12:15 +0800 Subject: [PATCH 08/36] format --- src/query/expression/tests/it/block_thresholds.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/query/expression/tests/it/block_thresholds.rs b/src/query/expression/tests/it/block_thresholds.rs index 371a8194f552c..e7ad1304ae3cc 100644 --- a/src/query/expression/tests/it/block_thresholds.rs +++ b/src/query/expression/tests/it/block_thresholds.rs @@ -138,4 +138,3 @@ fn test_calc_partitions_for_recluster() { let result = t.calc_partitions_for_recluster(4_000, 10_000_000, 600_000); assert_eq!(result, 6); } - From f5e0491fd5dee7d3c29e3b5e89216e52daaa494c Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 14 May 2025 02:05:11 +0800 Subject: [PATCH 09/36] add compact strategy --- .../pipelines/processors/transforms/mod.rs | 2 + .../transforms/recluster/compact_strategy.rs | 78 +++++++++++++++++++ .../hilbert_partition_exchange.rs | 4 - .../processors/transforms/recluster/mod.rs | 21 +++++ .../transform_hilbert_collect.rs | 24 +++--- .../partition/data_processor_strategy.rs | 60 -------------- .../transforms/window/partition/mod.rs | 4 - 7 files changed, 114 insertions(+), 79 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs rename src/query/service/src/pipelines/processors/transforms/{window/partition => recluster}/hilbert_partition_exchange.rs (90%) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/mod.rs rename src/query/service/src/pipelines/processors/transforms/{window/partition => 
recluster}/transform_hilbert_collect.rs (90%) diff --git a/src/query/service/src/pipelines/processors/transforms/mod.rs b/src/query/service/src/pipelines/processors/transforms/mod.rs index 80966daa5fa8d..5c7c2264f7a4f 100644 --- a/src/query/service/src/pipelines/processors/transforms/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/mod.rs @@ -17,6 +17,7 @@ pub mod aggregator; mod broadcast; mod hash_join; pub(crate) mod range_join; +mod recluster; mod runtime_pool; mod transform_add_computed_columns; mod transform_add_const_columns; @@ -46,6 +47,7 @@ mod window; pub use broadcast::BroadcastSinkProcessor; pub use broadcast::BroadcastSourceProcessor; pub use hash_join::*; +pub use recluster::*; pub use transform_add_computed_columns::TransformAddComputedColumns; pub use transform_add_const_columns::TransformAddConstColumns; pub use transform_add_internal_columns::TransformAddInternalColumns; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs new file mode 100644 index 0000000000000..bd02855159648 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs @@ -0,0 +1,78 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
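+
+//! The compact strategy buffers the data blocks restored for a partition and
+//! concatenates them once the accumulated row count or estimated byte size
+//! crosses the configured per-block thresholds, so downstream serialization
+//! receives fewer, larger blocks.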
+ +use databend_common_exception::Result; +use databend_common_expression::DataBlock; + +use crate::pipelines::processors::transforms::DataProcessorStrategy; + +pub struct CompactStrategy { + max_bytes_per_block: usize, + max_rows_per_block: usize, +} + +impl CompactStrategy { + pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { + Self { + max_bytes_per_block, + max_rows_per_block, + } + } + + fn concat_blocks(blocks: Vec) -> Result { + DataBlock::concat(&blocks) + } + + fn check_large_enough(&self, rows: usize, bytes: usize) -> bool { + rows >= self.max_rows_per_block || bytes >= self.max_bytes_per_block + } +} + +impl DataProcessorStrategy for CompactStrategy { + const NAME: &'static str = "Compact"; + + fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + let blocks_num = data_blocks.len(); + if blocks_num < 2 { + return Ok(data_blocks); + } + + let mut accumulated_rows = 0; + let mut accumulated_bytes = 0; + let mut pending_blocks = Vec::with_capacity(blocks_num); + let mut staged_blocks = Vec::with_capacity(blocks_num); + let mut result = Vec::with_capacity(blocks_num); + for block in data_blocks { + accumulated_rows += block.num_rows(); + accumulated_bytes += block.estimate_block_size(); + pending_blocks.push(block); + if !self.check_large_enough(accumulated_rows, accumulated_bytes) { + continue; + } + if !staged_blocks.is_empty() { + result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + std::mem::swap(&mut staged_blocks, &mut pending_blocks); + accumulated_rows = 0; + accumulated_bytes = 0; + } + + staged_blocks.append(&mut pending_blocks); + if !staged_blocks.is_empty() { + result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + + Ok(result) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs rename to src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs index 16215dded2b15..c6a79277af909 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs @@ -12,10 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Some variables and functions are named and designed with reference to ClickHouse. -// - https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/Transforms/WindowTransform.h -// - https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/Transforms/WindowTransform.cpp - use std::sync::Arc; use databend_common_exception::Result; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs new file mode 100644 index 0000000000000..aba21e76d3cd2 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -0,0 +1,21 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod compact_strategy; +mod hilbert_partition_exchange; +mod transform_hilbert_collect; + +pub use compact_strategy::CompactStrategy; +pub use hilbert_partition_exchange::HilbertPartitionExchange; +pub use transform_hilbert_collect::TransformHilbertCollect; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs rename to src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs index 9b6928fb58a9b..07740e7b56377 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs @@ -27,8 +27,10 @@ use databend_common_pipeline_transforms::MemorySettings; use databend_common_settings::Settings; use databend_common_storage::DataOperator; -use super::WindowPartitionBuffer; -use super::WindowPartitionMeta; +use crate::pipelines::processors::transforms::CompactStrategy; +use crate::pipelines::processors::transforms::DataProcessorStrategy; +use crate::pipelines::processors::transforms::WindowPartitionBuffer; +use crate::pipelines::processors::transforms::WindowPartitionMeta; use crate::sessions::QueryContext; use crate::spillers::Spiller; use crate::spillers::SpillerConfig; @@ -40,7 +42,7 @@ enum State { Flush, Spill, Restore, - Concat(Vec), + Compact(Vec), } pub struct TransformHilbertCollect { @@ -56,6 +58,7 @@ pub struct TransformHilbertCollect { // The buffer is used to control the memory usage of the window operator. buffer: WindowPartitionBuffer, + compact_strategy: CompactStrategy, max_block_size: usize, // Event variables. state: State, @@ -100,8 +103,6 @@ impl TransformHilbertCollect { let spiller = Spiller::create(ctx, operator, spill_config)?; // Create the window partition buffer. - let max_block_rows = - max_block_rows.min(settings.get_window_partition_sort_block_size()? 
as usize); let buffer = WindowPartitionBuffer::new(spiller, partitions.len(), max_block_rows, memory_settings)?; @@ -113,6 +114,7 @@ impl TransformHilbertCollect { immediate_output_blocks: vec![], partition_sizes: vec![0; num_partitions], max_block_size, + compact_strategy: CompactStrategy::new(max_block_rows, max_block_size), output_data_blocks: VecDeque::new(), state: State::Collect, }) @@ -130,7 +132,7 @@ impl Processor for TransformHilbertCollect { } fn event(&mut self) -> Result { - if matches!(self.state, State::Concat(_)) { + if matches!(self.state, State::Compact(_)) { return Ok(Event::Sync); } @@ -192,9 +194,9 @@ impl Processor for TransformHilbertCollect { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.state, State::Collect) { - State::Concat(blocks) => { - let output = DataBlock::concat(&blocks)?; - self.output_data_blocks.push_back(output); + State::Compact(blocks) => { + let output = self.compact_strategy.process_data_blocks(blocks)?; + self.output_data_blocks.extend(output); } _ => unreachable!(), } @@ -212,12 +214,12 @@ impl Processor for TransformHilbertCollect { let mut restored_data_blocks = self.buffer.restore_by_id(partition_id, true).await?; restored_data_blocks.push(data_block); - self.state = State::Concat(restored_data_blocks); + self.state = State::Compact(restored_data_blocks); } } State::Restore => { let restored_data_blocks = self.buffer.restore().await?; - self.output_data_blocks.extend(restored_data_blocks); + self.state = State::Compact(restored_data_blocks); } _ => unreachable!(), } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs index 75793aa415e08..d0808f1d423ef 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs @@ -24,66 +24,6 @@ pub trait DataProcessorStrategy: Send + Sync + 'static { fn process_data_blocks(&self, data_blocks: Vec) -> Result>; } -pub struct CompactStrategy { - max_bytes_per_block: usize, - max_rows_per_block: usize, -} - -impl CompactStrategy { - pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { - Self { - max_bytes_per_block, - max_rows_per_block, - } - } - - fn concat_blocks(blocks: Vec) -> Result { - DataBlock::concat(&blocks) - } - - fn check_large_enough(&self, rows: usize, bytes: usize) -> bool { - rows >= self.max_rows_per_block || bytes >= self.max_bytes_per_block - } -} - -impl DataProcessorStrategy for CompactStrategy { - const NAME: &'static str = "Compact"; - - fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - let blocks_num = data_blocks.len(); - if blocks_num < 2 { - return Ok(data_blocks); - } - - let mut accumulated_rows = 0; - let mut accumulated_bytes = 0; - let mut pending_blocks = Vec::with_capacity(blocks_num); - let mut staged_blocks = Vec::with_capacity(blocks_num); - let mut result = Vec::with_capacity(blocks_num); - for block in data_blocks { - accumulated_rows += block.num_rows(); - accumulated_bytes += block.estimate_block_size(); - pending_blocks.push(block); - if !self.check_large_enough(accumulated_rows, accumulated_bytes) { - continue; - } - if !staged_blocks.is_empty() { - result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - std::mem::swap(&mut staged_blocks, &mut pending_blocks); - accumulated_rows 
= 0; - accumulated_bytes = 0; - } - - staged_blocks.append(&mut pending_blocks); - if !staged_blocks.is_empty() { - result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - - Ok(result) - } -} - pub struct SortStrategy { sort_desc: Vec, schema: DataSchemaRef, diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index 96edfcc986434..aaa93a459f8b6 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ -13,8 +13,6 @@ // limitations under the License. mod data_processor_strategy; -mod hilbert_partition_exchange; -mod transform_hilbert_collect; mod transform_window_partition_collect; mod window_partition_buffer; mod window_partition_exchange; @@ -22,8 +20,6 @@ mod window_partition_meta; mod window_partition_partial_top_n_exchange; pub use data_processor_strategy::*; -pub use hilbert_partition_exchange::*; -pub use transform_hilbert_collect::*; pub use transform_window_partition_collect::*; pub use window_partition_buffer::*; pub use window_partition_exchange::*; From 504e60f3b7b0890d0fe7e8b15b788f729068d4a9 Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 14 May 2025 09:39:34 +0800 Subject: [PATCH 10/36] fix --- src/query/ee/src/hilbert_clustering/handler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/ee/src/hilbert_clustering/handler.rs b/src/query/ee/src/hilbert_clustering/handler.rs index c7ee957a77ed9..dc116c57ab13b 100644 --- a/src/query/ee/src/hilbert_clustering/handler.rs +++ b/src/query/ee/src/hilbert_clustering/handler.rs @@ -63,7 +63,7 @@ impl HilbertClusteringHandler for RealHilbertClusteringHandler { let max_bytes_per_block = fuse_table.get_option( FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD, DEFAULT_BLOCK_BUFFER_SIZE, - ) * 2; + ); let hilbert_min_bytes = std::cmp::max( hilbert_clustering_min_bytes, max_bytes_per_block * block_per_seg, From 0798c0f5e60b5ca9d658ab9795e4b7c3e8d3b814 Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 15 May 2025 02:12:02 +0800 Subject: [PATCH 11/36] fix test --- .../ee/07_hilbert_clustering/07_0000_recluster_final.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test index 6fc4bfbf44841..c1f3c647936ab 100644 --- a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test +++ b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test @@ -77,7 +77,7 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); query I select count() from fuse_snapshot('test_hilbert','t'); ---- -8 +9 query II select block_count,row_count from fuse_segment('test_hilbert','t'); @@ -109,7 +109,7 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); query T select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t', 'a,b'); ---- -(a, b) linear {"average_depth":1.0,"average_overlaps":0.0,"block_depth_histogram":{"00001":5},"constant_block_count":0,"total_block_count":5} +(a, b) linear {"average_depth":1.4,"average_overlaps":0.4,"block_depth_histogram":{"00001":3,"00002":2},"constant_block_count":0,"total_block_count":5} # column specified not exist statement error 1065 From 
98279ec6bbb39fe7b130eb683332563f25a37302 Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 15 May 2025 10:32:29 +0800 Subject: [PATCH 12/36] fix test --- .../07_hilbert_clustering/07_0000_recluster_final.test | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test index c1f3c647936ab..2416bb509bea1 100644 --- a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test +++ b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test @@ -21,6 +21,9 @@ USE test_hilbert statement ok create or replace table t(a int, b int) cluster by hilbert(a, b) row_per_block=2 block_per_segment=2 block_size_threshold = 18; +statement ok +set enable_block_stream_write = 0 + statement ok set hilbert_clustering_min_bytes = 35; @@ -97,10 +100,10 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); statement ok alter table t recluster final; -query T -select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); +query II +select info:partial_segment_count, info:unclustered_segment_count from clustering_information('test_hilbert','t'); ---- -(b, a) hilbert {"partial_block_count":0,"partial_segment_count":0,"stable_block_count":5,"stable_segment_count":2,"total_block_count":5,"total_segment_count":2,"unclustered_block_count":0,"unclustered_segment_count":0} +0 0 ######################################################## # force eval as linear clustering by specify columns # From d150dc03ea646af7461b4d0f5f297f454c302c09 Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 15 May 2025 16:47:48 +0800 Subject: [PATCH 13/36] fix test --- .../ee/07_hilbert_clustering/07_0000_recluster_final.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test index 2416bb509bea1..4e0822f4589db 100644 --- a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test +++ b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test @@ -110,9 +110,9 @@ select info:partial_segment_count, info:unclustered_segment_count from clusterin ######################################################## query T -select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t', 'a,b'); +select cluster_key, type, info:constant_block_count from clustering_information('test_hilbert','t', 'a,b'); ---- -(a, b) linear {"average_depth":1.4,"average_overlaps":0.4,"block_depth_histogram":{"00001":3,"00002":2},"constant_block_count":0,"total_block_count":5} +(a, b) linear 0 # column specified not exist statement error 1065 From 797177f67f65af894922678263ce3eabe9065828 Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 19 May 2025 12:40:41 +0800 Subject: [PATCH 14/36] spill block build and block write --- .../expression/src/utils/block_thresholds.rs | 4 +- .../builders/builder_hilbert_partition.rs | 66 ++--- .../src/pipelines/builders/builder_window.rs | 8 +- .../transforms/recluster/compact_strategy.rs | 78 ------ .../processors/transforms/recluster/mod.rs | 11 +- ...nge.rs => recluster_partition_exchange.rs} | 20 +- .../recluster/recluster_partition_strategy.rs | 151 +++++++++++ .../recluster/transform_hilbert_collect.rs | 254 ------------------ 
.../transforms/window/partition/mod.rs | 4 +- ...ategy.rs => partition_process_strategy.rs} | 30 ++- .../transform_window_partition_collect.rs | 34 +-- src/query/storages/fuse/src/io/mod.rs | 4 +- .../fuse/src/io/write/block_writer.rs | 8 + .../fuse/src/io/write/bloom_index_writer.rs | 1 + .../src/io/write/inverted_index_writer.rs | 1 + src/query/storages/fuse/src/io/write/mod.rs | 4 +- .../fuse/src/io/write/stream/block_builder.rs | 15 +- .../storages/fuse/src/io/write/stream/mod.rs | 4 +- .../storages/fuse/src/operations/append.rs | 10 +- .../src/operations/common/processors/mod.rs | 1 + .../processors/transform_block_writer.rs | 116 ++++---- 21 files changed, 338 insertions(+), 486 deletions(-) delete mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs rename src/query/service/src/pipelines/processors/transforms/recluster/{hilbert_partition_exchange.rs => recluster_partition_exchange.rs} (78%) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs delete mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs rename src/query/service/src/pipelines/processors/transforms/window/partition/{data_processor_strategy.rs => partition_process_strategy.rs} (75%) diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index 742bfdf489261..01c0631abe124 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -153,7 +153,7 @@ impl BlockThresholds { let bytes_per_block = total_bytes.div_ceil(block_num_by_compressed); // Adjust the number of blocks based on block size thresholds. let max_bytes_per_block = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes_per_block = max_bytes_per_block / 2; + let min_bytes_per_block = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); let block_nums = if bytes_per_block > max_bytes_per_block { // Case 1: If the block size is too bigger. total_bytes.div_ceil(max_bytes_per_block) @@ -201,7 +201,7 @@ impl BlockThresholds { // Adjust block count based on byte size thresholds. let bytes_per_block = total_bytes.div_ceil(by_compressed); let max_bytes = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes = max_bytes / 2; + let min_bytes = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); let total_partitions = if bytes_per_block > max_bytes { // Block size is too large. 
total_bytes / max_bytes diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 1ffe3e5e2c69b..9ae7941e01e9e 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -20,8 +20,10 @@ use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; +use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationKind; +use databend_common_storages_fuse::io::StreamBlockProperties; use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::statistics::ClusterStatsGenerator; @@ -29,10 +31,10 @@ use databend_common_storages_fuse::FuseTable; use databend_storages_common_cache::TempDirManager; use crate::pipelines::memory_settings::MemorySettingsExt; -use crate::pipelines::processors::transforms::CompactStrategy; -use crate::pipelines::processors::transforms::HilbertPartitionExchange; -use crate::pipelines::processors::transforms::TransformHilbertCollect; -use crate::pipelines::processors::transforms::TransformWindowPartitionCollect; +use crate::pipelines::processors::transforms::CompactPartitionStrategy; +use crate::pipelines::processors::transforms::ReclusterPartitionExchange; +use crate::pipelines::processors::transforms::ReclusterPartitionStrategy; +use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::PipelineBuilder; use crate::spillers::SpillerDiskConfig; @@ -49,7 +51,7 @@ impl PipelineBuilder { self.main_pipeline.exchange( num_processors, - HilbertPartitionExchange::create(partition.range_start, partition.range_width), + ReclusterPartitionExchange::create(partition.range_start, partition.range_width), ); let settings = self.ctx.get_settings(); @@ -66,9 +68,15 @@ impl PipelineBuilder { let processor_id = AtomicUsize::new(0); if enable_stream_writer { + let properties = StreamBlockProperties::try_create( + self.ctx.clone(), + table, + partition.table_meta_timestamps, + )?; + self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(Box::new( - TransformHilbertCollect::new( + TransformPartitionCollect::new( self.ctx.clone(), input, output, @@ -78,28 +86,24 @@ impl PipelineBuilder { partition.range_width, window_spill_settings.clone(), disk_spill.clone(), - partition.rows_per_block, - partition.bytes_per_block, + ReclusterPartitionStrategy::new(properties.clone()), )?, ))) })?; - self.main_pipeline.add_transform(|input, output| { - TransformBlockWriter::try_create( + self.main_pipeline.add_async_accumulating_transformer(|| { + TransformBlockWriter::create( self.ctx.clone(), - input, - output, MutationKind::Recluster, table, - partition.table_meta_timestamps, false, - Some(partition.bytes_per_block), ) - }) + }); + Ok(()) } else { self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(Box::new( - TransformWindowPartitionCollect::new( + TransformPartitionCollect::new( self.ctx.clone(), input, output, @@ -109,24 +113,26 @@ impl PipelineBuilder { partition.range_width, window_spill_settings.clone(), disk_spill.clone(), - 
CompactStrategy::new(partition.rows_per_block, partition.bytes_per_block), + CompactPartitionStrategy::new( + partition.rows_per_block, + partition.bytes_per_block, + ), )?, ))) })?; - self.main_pipeline - .add_transform(|transform_input_port, transform_output_port| { - let proc = TransformSerializeBlock::try_create( - self.ctx.clone(), - transform_input_port, - transform_output_port, - table, - ClusterStatsGenerator::default(), - MutationKind::Recluster, - partition.table_meta_timestamps, - )?; - proc.into_processor() - }) + self.main_pipeline.add_transform(|input, output| { + let proc = TransformSerializeBlock::try_create( + self.ctx.clone(), + input, + output, + table, + ClusterStatsGenerator::default(), + MutationKind::Recluster, + partition.table_meta_timestamps, + )?; + proc.into_processor() + }) } } } diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index 187bb25d7dd77..64dbbe0e41e18 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -30,11 +30,11 @@ use databend_storages_common_cache::TempDirManager; use crate::pipelines::memory_settings::MemorySettingsExt; use crate::pipelines::processors::transforms::FrameBound; -use crate::pipelines::processors::transforms::SortStrategy; +use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::processors::transforms::TransformWindow; -use crate::pipelines::processors::transforms::TransformWindowPartitionCollect; use crate::pipelines::processors::transforms::WindowFunctionInfo; use crate::pipelines::processors::transforms::WindowPartitionExchange; +use crate::pipelines::processors::transforms::WindowPartitionStrategy; use crate::pipelines::processors::transforms::WindowPartitionTopNExchange; use crate::pipelines::processors::transforms::WindowSortDesc; use crate::pipelines::PipelineBuilder; @@ -203,14 +203,14 @@ impl PipelineBuilder { let processor_id = AtomicUsize::new(0); self.main_pipeline.add_transform(|input, output| { - let strategy = SortStrategy::try_create( + let strategy = WindowPartitionStrategy::try_create( &settings, sort_desc.clone(), plan_schema.clone(), have_order_col, )?; Ok(ProcessorPtr::create(Box::new( - TransformWindowPartitionCollect::new( + TransformPartitionCollect::new( self.ctx.clone(), input, output, diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs deleted file mode 100644 index bd02855159648..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use databend_common_exception::Result; -use databend_common_expression::DataBlock; - -use crate::pipelines::processors::transforms::DataProcessorStrategy; - -pub struct CompactStrategy { - max_bytes_per_block: usize, - max_rows_per_block: usize, -} - -impl CompactStrategy { - pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { - Self { - max_bytes_per_block, - max_rows_per_block, - } - } - - fn concat_blocks(blocks: Vec) -> Result { - DataBlock::concat(&blocks) - } - - fn check_large_enough(&self, rows: usize, bytes: usize) -> bool { - rows >= self.max_rows_per_block || bytes >= self.max_bytes_per_block - } -} - -impl DataProcessorStrategy for CompactStrategy { - const NAME: &'static str = "Compact"; - - fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - let blocks_num = data_blocks.len(); - if blocks_num < 2 { - return Ok(data_blocks); - } - - let mut accumulated_rows = 0; - let mut accumulated_bytes = 0; - let mut pending_blocks = Vec::with_capacity(blocks_num); - let mut staged_blocks = Vec::with_capacity(blocks_num); - let mut result = Vec::with_capacity(blocks_num); - for block in data_blocks { - accumulated_rows += block.num_rows(); - accumulated_bytes += block.estimate_block_size(); - pending_blocks.push(block); - if !self.check_large_enough(accumulated_rows, accumulated_bytes) { - continue; - } - if !staged_blocks.is_empty() { - result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - std::mem::swap(&mut staged_blocks, &mut pending_blocks); - accumulated_rows = 0; - accumulated_bytes = 0; - } - - staged_blocks.append(&mut pending_blocks); - if !staged_blocks.is_empty() { - result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - - Ok(result) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index aba21e76d3cd2..a3c680958f00b 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod compact_strategy; -mod hilbert_partition_exchange; -mod transform_hilbert_collect; +mod recluster_partition_exchange; +mod recluster_partition_strategy; -pub use compact_strategy::CompactStrategy; -pub use hilbert_partition_exchange::HilbertPartitionExchange; -pub use transform_hilbert_collect::TransformHilbertCollect; +pub use recluster_partition_exchange::ReclusterPartitionExchange; +pub use recluster_partition_strategy::CompactPartitionStrategy; +pub use recluster_partition_strategy::ReclusterPartitionStrategy; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs similarity index 78% rename from src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs rename to src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index c6a79277af909..221d4328ef67a 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -20,19 +20,19 @@ use databend_common_pipeline_core::processors::Exchange; use crate::pipelines::processors::transforms::WindowPartitionMeta; -pub struct HilbertPartitionExchange { +pub struct ReclusterPartitionExchange { start: u64, width: usize, } -impl HilbertPartitionExchange { - pub fn create(start: u64, width: usize) -> Arc { - Arc::new(HilbertPartitionExchange { start, width }) +impl ReclusterPartitionExchange { + pub fn create(start: u64, width: usize) -> Arc { + Arc::new(ReclusterPartitionExchange { start, width }) } } -impl Exchange for HilbertPartitionExchange { - const NAME: &'static str = "Hilbert"; +impl Exchange for ReclusterPartitionExchange { + const NAME: &'static str = "Recluster"; fn partition(&self, data_block: DataBlock, n: usize) -> Result> { let mut data_block = data_block; let range_ids = data_block @@ -51,16 +51,10 @@ impl Exchange for HilbertPartitionExchange { let scatter_indices = DataBlock::divide_indices_by_scatter_size(&indices, self.width); // Partition the data blocks to different processors. - let base = self.width / n; - let remainder = self.width % n; let mut output_data_blocks = vec![vec![]; n]; for (partition_id, indices) in scatter_indices.into_iter().take(self.width).enumerate() { if !indices.is_empty() { - let target = if partition_id < remainder * (base + 1) { - partition_id / (base + 1) - } else { - (partition_id - remainder) / base - }; + let target = (partition_id * n) / self.width; let block = data_block.take_with_optimize_size(&indices)?; output_data_blocks[target].push((partition_id, block)); } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs new file mode 100644 index 0000000000000..7f478c94b8d43 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -0,0 +1,151 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_storages_fuse::io::StreamBlockBuilder; +use databend_common_storages_fuse::io::StreamBlockProperties; + +use crate::pipelines::processors::transforms::PartitionProcessStrategy; + +/// `ReclusterPartitionStrategy` is used when block stream writing is enabled. +/// It incrementally writes blocks using `StreamBlockBuilder`, which allows +/// partial serialization and flush during reclustering (e.g., Hilbert clustering). +pub struct ReclusterPartitionStrategy { + properties: Arc, +} + +impl ReclusterPartitionStrategy { + pub fn new(properties: Arc) -> Self { + Self { properties } + } +} + +impl PartitionProcessStrategy for ReclusterPartitionStrategy { + const NAME: &'static str = "Recluster"; + + fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec { + (0..num_partitions) + .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) + .collect() + } + + /// Stream write each block, and flush it conditionally based on builder status + /// and input size estimation. + fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + let mut input_sizes: usize = data_blocks.iter().map(|b| b.estimate_block_size()).sum(); + let mut input_rows: usize = data_blocks.iter().map(|b| b.num_rows()).sum(); + + let mut result = Vec::new(); + let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; + for block in data_blocks { + input_sizes -= block.estimate_block_size(); + input_rows -= block.num_rows(); + builder.write(block)?; + if builder.need_flush() && self.properties.check_large_enough(input_rows, input_sizes) { + let serialized = builder.finish()?; + result.push(DataBlock::empty_with_meta(Box::new(serialized))); + builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; + } + } + + if !builder.is_empty() { + let serialized = builder.finish()?; + result.push(DataBlock::empty_with_meta(Box::new(serialized))); + } + Ok(result) + } +} + +/// `CompactPartitionStrategy` is used when stream write is NOT enabled. +/// It uses a traditional "accumulate and concat" strategy to build large blocks +/// once input thresholds (row count or size) are exceeded. 
+pub struct CompactPartitionStrategy { + max_bytes_per_block: usize, + max_rows_per_block: usize, +} + +impl CompactPartitionStrategy { + pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { + Self { + max_bytes_per_block, + max_rows_per_block, + } + } + + fn concat_blocks(blocks: Vec) -> Result { + DataBlock::concat(&blocks) + } + + fn check_large_enough(&self, rows: usize, bytes: usize) -> bool { + rows >= self.max_rows_per_block || bytes >= self.max_bytes_per_block + } +} + +impl PartitionProcessStrategy for CompactPartitionStrategy { + const NAME: &'static str = "Compact"; + + fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec { + (0..num_partitions) + .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) + .collect() + } + + /// Collects blocks into batches and merges them via `concat` when size threshold is reached. + fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + let blocks_num = data_blocks.len(); + if blocks_num < 2 { + return Ok(data_blocks); + } + + let mut accumulated_rows = 0; + let mut accumulated_bytes = 0; + let mut pending_blocks = Vec::with_capacity(blocks_num); + let mut staged_blocks = Vec::with_capacity(blocks_num); + let mut result = Vec::with_capacity(blocks_num); + for block in data_blocks { + accumulated_rows += block.num_rows(); + accumulated_bytes += block.estimate_block_size(); + pending_blocks.push(block); + if !self.check_large_enough(accumulated_rows, accumulated_bytes) { + continue; + } + if !staged_blocks.is_empty() { + result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + std::mem::swap(&mut staged_blocks, &mut pending_blocks); + accumulated_rows = 0; + accumulated_bytes = 0; + } + + staged_blocks.append(&mut pending_blocks); + if !staged_blocks.is_empty() { + result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + + Ok(result) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs deleted file mode 100644 index 07740e7b56377..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; -use std::collections::VecDeque; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::MemorySettings; -use databend_common_settings::Settings; -use databend_common_storage::DataOperator; - -use crate::pipelines::processors::transforms::CompactStrategy; -use crate::pipelines::processors::transforms::DataProcessorStrategy; -use crate::pipelines::processors::transforms::WindowPartitionBuffer; -use crate::pipelines::processors::transforms::WindowPartitionMeta; -use crate::sessions::QueryContext; -use crate::spillers::Spiller; -use crate::spillers::SpillerConfig; -use crate::spillers::SpillerDiskConfig; -use crate::spillers::SpillerType; - -enum State { - Collect, - Flush, - Spill, - Restore, - Compact(Vec), -} - -pub struct TransformHilbertCollect { - input: Arc, - output: Arc, - - immediate_output_blocks: Vec<(usize, DataBlock)>, - output_data_blocks: VecDeque, - - // The partition id is used to map the partition id to the new partition id. - partition_id: Vec, - partition_sizes: Vec, - // The buffer is used to control the memory usage of the window operator. - buffer: WindowPartitionBuffer, - - compact_strategy: CompactStrategy, - max_block_size: usize, - // Event variables. - state: State, -} - -impl TransformHilbertCollect { - #[allow(clippy::too_many_arguments)] - pub fn new( - ctx: Arc, - input: Arc, - output: Arc, - settings: &Settings, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - memory_settings: MemorySettings, - disk_spill: Option, - max_block_rows: usize, - max_block_size: usize, - ) -> Result { - // Calculate the partition ids collected by the processor. - let partitions: Vec = (0..num_partitions) - .filter(|&partition| partition % num_processors == processor_id) - .collect(); - - // Map each partition id to new partition id. - let mut partition_id = vec![0; num_partitions]; - for (new_partition_id, partition) in partitions.iter().enumerate() { - partition_id[*partition] = new_partition_id; - } - - let location_prefix = ctx.query_id_spill_prefix(); - let spill_config = SpillerConfig { - spiller_type: SpillerType::Window, - location_prefix, - disk_spill, - use_parquet: settings.get_spilling_file_format()?.is_parquet(), - }; - - // Create an inner `Spiller` to spill data. - let operator = DataOperator::instance().spill_operator(); - let spiller = Spiller::create(ctx, operator, spill_config)?; - - // Create the window partition buffer. 
- let buffer = - WindowPartitionBuffer::new(spiller, partitions.len(), max_block_rows, memory_settings)?; - - Ok(Self { - input, - output, - partition_id, - buffer, - immediate_output_blocks: vec![], - partition_sizes: vec![0; num_partitions], - max_block_size, - compact_strategy: CompactStrategy::new(max_block_rows, max_block_size), - output_data_blocks: VecDeque::new(), - state: State::Collect, - }) - } -} - -#[async_trait::async_trait] -impl Processor for TransformHilbertCollect { - fn name(&self) -> String { - "TransformHilbertCollect".to_string() - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if matches!(self.state, State::Compact(_)) { - return Ok(Event::Sync); - } - - if matches!(self.state, State::Flush | State::Spill | State::Restore) { - return Ok(Event::Async); - } - - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - return Ok(Event::NeedConsume); - } - - if let Some(data_block) = self.output_data_blocks.pop_front() { - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.need_spill() { - self.state = State::Spill; - return Ok(Event::Async); - } - - if !self.immediate_output_blocks.is_empty() { - self.state = State::Flush; - return Ok(Event::Async); - } - - if self.input.is_finished() { - if !self.buffer.is_empty() { - self.state = State::Restore; - return Ok(Event::Async); - } - - self.output.finish(); - return Ok(Event::Finished); - } - - if self.input.has_data() { - self.collect_data_block()?; - - if self.need_spill() { - self.state = State::Spill; - return Ok(Event::Async); - } - - if !self.immediate_output_blocks.is_empty() { - self.state = State::Flush; - return Ok(Event::Async); - } - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Collect) { - State::Compact(blocks) => { - let output = self.compact_strategy.process_data_blocks(blocks)?; - self.output_data_blocks.extend(output); - } - _ => unreachable!(), - } - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Collect) { - State::Spill => { - self.buffer.spill().await?; - } - State::Flush => { - if let Some((partition_id, data_block)) = self.immediate_output_blocks.pop() { - let mut restored_data_blocks = - self.buffer.restore_by_id(partition_id, true).await?; - restored_data_blocks.push(data_block); - self.state = State::Compact(restored_data_blocks); - } - } - State::Restore => { - let restored_data_blocks = self.buffer.restore().await?; - self.state = State::Compact(restored_data_blocks); - } - _ => unreachable!(), - } - Ok(()) - } -} - -impl TransformHilbertCollect { - fn collect_data_block(&mut self) -> Result<()> { - let data_block = self.input.pull_data().unwrap()?; - if let Some(meta) = data_block - .get_owned_meta() - .and_then(WindowPartitionMeta::downcast_from) - { - for (partition_id, data_block) in meta.partitioned_data.into_iter() { - let new_id = self.partition_id[partition_id]; - self.partition_sizes[new_id] += data_block.estimate_block_size(); - if self.partition_sizes[new_id] >= self.max_block_size { - self.immediate_output_blocks.push((new_id, data_block)); - self.partition_sizes[new_id] = 0; - continue; - } - self.buffer.add_data_block(new_id, data_block); - } - } - Ok(()) - } - - fn need_spill(&mut self) -> bool { - self.buffer.need_spill() - } -} diff 
--git a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index aaa93a459f8b6..1418388cf2553 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod data_processor_strategy; +mod partition_process_strategy; mod transform_window_partition_collect; mod window_partition_buffer; mod window_partition_exchange; mod window_partition_meta; mod window_partition_partial_top_n_exchange; -pub use data_processor_strategy::*; +pub use partition_process_strategy::*; pub use transform_window_partition_collect::*; pub use window_partition_buffer::*; pub use window_partition_exchange::*; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs similarity index 75% rename from src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs rename to src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs index d0808f1d423ef..bec3f8a84e91f 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs @@ -19,12 +19,21 @@ use databend_common_expression::SortColumnDescription; use databend_common_pipeline_transforms::sort_merge; use databend_common_settings::Settings; -pub trait DataProcessorStrategy: Send + Sync + 'static { +pub trait PartitionProcessStrategy: Send + Sync + 'static { const NAME: &'static str; + + /// Partition assignment: map partition index to processor via proportional mapping. 
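+    /// For instance (illustrative numbers, not taken from this patch), with
+    /// num_partitions = 8 and num_processors = 3, partition `p` maps to processor
+    /// `(p * 3) / 8`: processor 0 owns {0, 1, 2}, processor 1 owns {3, 4, 5}, and
+    /// processor 2 owns {6, 7}, so each processor receives a contiguous range of
+    /// partition ids of nearly equal size.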
+ fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec; + fn process_data_blocks(&self, data_blocks: Vec) -> Result>; } -pub struct SortStrategy { +pub struct WindowPartitionStrategy { sort_desc: Vec, schema: DataSchemaRef, max_block_size: usize, @@ -33,7 +42,7 @@ pub struct SortStrategy { have_order_col: bool, } -impl SortStrategy { +impl WindowPartitionStrategy { pub fn try_create( settings: &Settings, sort_desc: Vec, @@ -54,8 +63,19 @@ impl SortStrategy { } } -impl DataProcessorStrategy for SortStrategy { - const NAME: &'static str = "Sort"; +impl PartitionProcessStrategy for WindowPartitionStrategy { + const NAME: &'static str = "Window"; + + fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec { + (0..num_partitions) + .filter(|&partition| partition % num_processors == processor_id) + .collect() + } fn process_data_blocks(&self, data_blocks: Vec) -> Result> { let data_blocks = data_blocks diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index 3051a2f0f018c..d1f011404223b 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -33,7 +33,7 @@ use databend_common_storage::DataOperator; use super::WindowPartitionBuffer; use super::WindowPartitionMeta; -use crate::pipelines::processors::transforms::DataProcessorStrategy; +use crate::pipelines::processors::transforms::PartitionProcessStrategy; use crate::sessions::QueryContext; use crate::spillers::Spiller; use crate::spillers::SpillerConfig; @@ -59,7 +59,7 @@ pub enum AsyncStep { Restore, } -pub struct TransformWindowPartitionCollect { +pub struct TransformPartitionCollect { input: Arc, output: Arc, @@ -78,7 +78,7 @@ pub struct TransformWindowPartitionCollect { is_collect_finished: bool, } -impl TransformWindowPartitionCollect { +impl TransformPartitionCollect { pub fn new( ctx: Arc, input: Arc, @@ -92,9 +92,7 @@ impl TransformWindowPartitionCollect { strategy: S, ) -> Result { // Calculate the partition ids collected by the processor. - let partitions: Vec = (0..num_partitions) - .filter(|&partition| partition % num_processors == processor_id) - .collect(); + let partitions = strategy.calc_partitions(processor_id, num_processors, num_partitions); // Map each partition id to new partition id. let mut partition_id = vec![0; num_partitions]; @@ -162,11 +160,7 @@ impl TransformWindowPartitionCollect { } if self.input.has_data() { - Self::collect_data_block( - self.input.pull_data().unwrap()?, - &self.partition_id, - &mut self.buffer, - ); + self.collect_data_block()?; } // Check again. 
flush memory data to external storage if need @@ -209,9 +203,9 @@ impl TransformWindowPartitionCollect { } #[async_trait::async_trait] -impl Processor for TransformWindowPartitionCollect { +impl Processor for TransformPartitionCollect { fn name(&self) -> String { - format!("TransformWindowPartitionCollect({})", S::NAME) + format!("TransformPartitionCollect({})", S::NAME) } fn as_any(&mut self) -> &mut dyn Any { @@ -271,21 +265,19 @@ impl Processor for TransformWindowPartitionCollect } } -impl TransformWindowPartitionCollect { - fn collect_data_block( - data_block: DataBlock, - partition_ids: &[usize], - buffer: &mut WindowPartitionBuffer, - ) { +impl TransformPartitionCollect { + fn collect_data_block(&mut self) -> Result<()> { + let data_block = self.input.pull_data().unwrap()?; if let Some(meta) = data_block .get_owned_meta() .and_then(WindowPartitionMeta::downcast_from) { for (partition_id, data_block) in meta.partitioned_data.into_iter() { - let partition_id = partition_ids[partition_id]; - buffer.add_data_block(partition_id, data_block); + let new_id = self.partition_id[partition_id]; + self.buffer.add_data_block(new_id, data_block); } } + Ok(()) } fn need_spill(&mut self) -> bool { diff --git a/src/query/storages/fuse/src/io/mod.rs b/src/query/storages/fuse/src/io/mod.rs index 63b43a9ff785f..93c695bd06f5c 100644 --- a/src/query/storages/fuse/src/io/mod.rs +++ b/src/query/storages/fuse/src/io/mod.rs @@ -50,8 +50,8 @@ pub use write::CachedMetaWriter; pub use write::InvertedIndexBuilder; pub use write::InvertedIndexWriter; pub use write::MetaWriter; -pub(crate) use write::StreamBlockBuilder; -pub(crate) use write::StreamBlockProperties; +pub use write::StreamBlockBuilder; +pub use write::StreamBlockProperties; pub use write::VirtualColumnBuilder; pub use write::WriteSettings; pub use write::MAX_BLOCK_UNCOMPRESSED_SIZE; diff --git a/src/query/storages/fuse/src/io/write/block_writer.rs b/src/query/storages/fuse/src/io/write/block_writer.rs index 8b9b269327b5f..1264a757e94f8 100644 --- a/src/query/storages/fuse/src/io/write/block_writer.rs +++ b/src/query/storages/fuse/src/io/write/block_writer.rs @@ -20,6 +20,8 @@ use std::time::Instant; use chrono::Utc; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; +use databend_common_expression::BlockMetaInfo; use databend_common_expression::Column; use databend_common_expression::ColumnId; use databend_common_expression::DataBlock; @@ -124,6 +126,7 @@ pub async fn write_data(data: Vec, data_accessor: &Operator, location: &str) Ok(()) } +#[derive(Debug)] pub struct BlockSerialization { pub block_raw_data: Vec, pub block_meta: BlockMeta, @@ -132,6 +135,11 @@ pub struct BlockSerialization { pub virtual_column_state: Option, } +local_block_meta_serde!(BlockSerialization); + +#[typetag::serde(name = "block_serialization_meta")] +impl BlockMetaInfo for BlockSerialization {} + #[derive(Clone)] pub struct BlockBuilder { pub ctx: Arc, diff --git a/src/query/storages/fuse/src/io/write/bloom_index_writer.rs b/src/query/storages/fuse/src/io/write/bloom_index_writer.rs index ec49070a6f08f..738c33ac2f2c3 100644 --- a/src/query/storages/fuse/src/io/write/bloom_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/bloom_index_writer.rs @@ -40,6 +40,7 @@ use opendal::Operator; use crate::io::BlockReader; use crate::FuseStorageFormat; +#[derive(Debug)] pub struct BloomIndexState { pub(crate) data: Vec, pub(crate) size: u64, diff --git 
a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs index 74377a86108cb..8cf0b5f2355f0 100644 --- a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs @@ -121,6 +121,7 @@ pub fn create_inverted_index_builders(table_meta: &TableMeta) -> Vec, pub(crate) size: u64, diff --git a/src/query/storages/fuse/src/io/write/mod.rs b/src/query/storages/fuse/src/io/write/mod.rs index b0af3633055dc..24bf6fd52c042 100644 --- a/src/query/storages/fuse/src/io/write/mod.rs +++ b/src/query/storages/fuse/src/io/write/mod.rs @@ -35,8 +35,8 @@ pub(crate) use inverted_index_writer::InvertedIndexState; pub use inverted_index_writer::InvertedIndexWriter; pub use meta_writer::CachedMetaWriter; pub use meta_writer::MetaWriter; -pub(crate) use stream::StreamBlockBuilder; -pub(crate) use stream::StreamBlockProperties; +pub use stream::StreamBlockBuilder; +pub use stream::StreamBlockProperties; pub use virtual_column_builder::VirtualColumnBuilder; pub use write_settings::WriteSettings; pub use write_settings::MAX_BLOCK_UNCOMPRESSED_SIZE; diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 69e81f8dec714..49473f1fd7032 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -233,16 +233,11 @@ impl StreamBlockBuilder { } pub fn need_flush(&self) -> bool { - if let Some(max_block_bytes) = self.properties.max_block_bytes { - if self.block_size >= max_block_bytes { - return true; - } - }; let file_size = self.block_writer.compressed_size(); self.row_count >= self.properties.block_thresholds.min_rows_per_block || self.block_size >= self.properties.block_thresholds.max_bytes_per_block || (file_size >= self.properties.block_thresholds.min_compressed_per_block - && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) + && self.block_size >= self.properties.block_thresholds.min_bytes_per_block / 2) } pub fn write(&mut self, block: DataBlock) -> Result<()> { @@ -355,7 +350,6 @@ pub struct StreamBlockProperties { pub(crate) ctx: Arc, pub(crate) write_settings: WriteSettings, pub(crate) block_thresholds: BlockThresholds, - pub(crate) max_block_bytes: Option, meta_locations: TableMetaLocationGenerator, source_schema: TableSchemaRef, @@ -374,7 +368,6 @@ impl StreamBlockProperties { ctx: Arc, table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, - max_block_bytes: Option, ) -> Result> { // remove virtual computed fields. 
let fields = table @@ -437,7 +430,11 @@ impl StreamBlockProperties { ngram_args, inverted_index_builders, table_meta_timestamps, - max_block_bytes, })) } + + pub fn check_large_enough(&self, num_rows: usize, data_size: usize) -> bool { + self.block_thresholds + .check_large_enough(num_rows, data_size) + } } diff --git a/src/query/storages/fuse/src/io/write/stream/mod.rs b/src/query/storages/fuse/src/io/write/stream/mod.rs index 26d32ee679582..0c99368220ed4 100644 --- a/src/query/storages/fuse/src/io/write/stream/mod.rs +++ b/src/query/storages/fuse/src/io/write/stream/mod.rs @@ -16,5 +16,5 @@ mod block_builder; mod cluster_statistics; mod column_statistics; -pub(crate) use block_builder::StreamBlockBuilder; -pub(crate) use block_builder::StreamBlockProperties; +pub use block_builder::StreamBlockBuilder; +pub use block_builder::StreamBlockProperties; diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 57cc6c8af06ea..6c9269bd13620 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -37,6 +37,7 @@ use databend_common_sql::executor::physical_plans::MutationKind; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use databend_storages_common_table_meta::table::ClusterType; +use crate::operations::TransformBlockBuilder; use crate::operations::TransformBlockWriter; use crate::operations::TransformSerializeBlock; use crate::statistics::ClusterStatsGenerator; @@ -53,17 +54,18 @@ impl FuseTable { ctx.get_settings().get_enable_block_stream_write()? && self.storage_format_as_parquet(); if enable_stream_block_write { pipeline.add_transform(|input, output| { - TransformBlockWriter::try_create( + TransformBlockBuilder::try_create( ctx.clone(), input, output, - MutationKind::Insert, self, table_meta_timestamps, - false, - None, ) })?; + + pipeline.add_async_accumulating_transformer(|| { + TransformBlockWriter::create(ctx.clone(), MutationKind::Insert, self, false) + }); } else { let block_thresholds = self.get_block_thresholds(); build_compact_block_pipeline(pipeline, block_thresholds)?; diff --git a/src/query/storages/fuse/src/operations/common/processors/mod.rs b/src/query/storages/fuse/src/operations/common/processors/mod.rs index e0e3d3b25f25a..d43c569c14016 100644 --- a/src/query/storages/fuse/src/operations/common/processors/mod.rs +++ b/src/query/storages/fuse/src/operations/common/processors/mod.rs @@ -22,6 +22,7 @@ mod transform_serialize_segment; pub use multi_table_insert_commit::CommitMultiTableInsert; pub use sink_commit::CommitSink; +pub use transform_block_writer::TransformBlockBuilder; pub use transform_block_writer::TransformBlockWriter; pub use transform_merge_commit_meta::TransformMergeCommitMeta; pub use transform_mutation_aggregator::TableMutationAggregator; diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index 73a85bf4f52c5..5af36b0a1f522 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -22,6 +22,7 @@ use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; use 
databend_common_expression::DataBlock; use databend_common_io::constants::DEFAULT_BLOCK_ROW_COUNT; use databend_common_metrics::storage::metrics_inc_recluster_write_block_nums; @@ -30,6 +31,7 @@ use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_transforms::AsyncAccumulatingTransform; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storage::MutationStatus; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -44,65 +46,51 @@ use crate::operations::MutationLogs; use crate::FuseTable; use crate::FUSE_OPT_KEY_ROW_PER_BLOCK; -#[allow(clippy::large_enum_variant)] enum State { Consume, Collect(DataBlock), Serialize, Finalize, Flush, - Write(BlockSerialization), } -pub struct TransformBlockWriter { +pub struct TransformBlockBuilder { state: State, input: Arc, output: Arc, - kind: MutationKind, properties: Arc, + max_block_rows: usize, builder: Option, need_flush: bool, input_data_size: usize, input_num_rows: usize, - dal: Operator, - // Only used in multi table insert - table_id: Option, - - max_block_rows: usize, input_data: VecDeque, output_data: Option, } -impl TransformBlockWriter { +impl TransformBlockBuilder { pub fn try_create( ctx: Arc, input: Arc, output: Arc, - kind: MutationKind, table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, - with_tid: bool, - max_block_bytes: Option, ) -> Result { let max_block_rows = std::cmp::min( ctx.get_settings().get_max_block_size()? as usize, table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), ); - let properties = - StreamBlockProperties::try_create(ctx, table, table_meta_timestamps, max_block_bytes)?; - Ok(ProcessorPtr::create(Box::new(TransformBlockWriter { + let properties = StreamBlockProperties::try_create(ctx, table, table_meta_timestamps)?; + Ok(ProcessorPtr::create(Box::new(TransformBlockBuilder { state: State::Consume, input, output, - kind, properties, builder: None, - dal: table.get_operator(), need_flush: false, - table_id: if with_tid { Some(table.get_id()) } else { None }, input_data: VecDeque::new(), input_data_size: 0, input_num_rows: 0, @@ -134,9 +122,9 @@ impl TransformBlockWriter { } #[async_trait] -impl Processor for TransformBlockWriter { +impl Processor for TransformBlockBuilder { fn name(&self) -> String { - "TransformBlockWriter".to_string() + "TransformBlockBuilder".to_string() } fn as_any(&mut self) -> &mut dyn Any { @@ -144,12 +132,11 @@ impl Processor for TransformBlockWriter { } fn event(&mut self) -> Result { - match &self.state { - State::Collect(_) | State::Serialize | State::Flush | State::Finalize => { - return Ok(Event::Sync) - } - State::Write(_) => return Ok(Event::Async), - _ => {} + if matches!( + self.state, + State::Collect(_) | State::Serialize | State::Flush | State::Finalize + ) { + return Ok(Event::Sync); } if self.output.is_finished() { @@ -169,7 +156,6 @@ impl Processor for TransformBlockWriter { if self.need_flush && self .properties - .block_thresholds .check_large_enough(self.input_num_rows, self.input_data_size) { self.state = State::Flush; @@ -207,13 +193,9 @@ impl Processor for TransformBlockWriter { block.check_valid()?; self.input_data_size += block.estimate_block_size(); self.input_num_rows += block.num_rows(); - if self.properties.max_block_bytes.is_some() { - self.input_data.push_back(block); - } else 
{ - let max_rows_per_block = self.calc_max_block_rows(&block); - let blocks = block.split_by_rows_no_tail(max_rows_per_block); - self.input_data.extend(blocks); - } + let max_rows_per_block = self.calc_max_block_rows(&block); + let blocks = block.split_by_rows_no_tail(max_rows_per_block); + self.input_data.extend(blocks); } State::Serialize => { while let Some(b) = self.input_data.pop_front() { @@ -240,7 +222,7 @@ impl Processor for TransformBlockWriter { let builder = self.builder.take().unwrap(); if !builder.is_empty() { let serialized = builder.finish()?; - self.state = State::Write(serialized); + self.output_data = Some(DataBlock::empty_with_meta(Box::new(serialized))); } self.need_flush = false; } @@ -248,11 +230,41 @@ impl Processor for TransformBlockWriter { } Ok(()) } +} - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Consume) { - State::Write(serialized) => { +pub struct TransformBlockWriter { + kind: MutationKind, + dal: Operator, + ctx: Arc, + // Only used in multi table insert + table_id: Option, +} + +impl TransformBlockWriter { + pub fn create( + ctx: Arc, + kind: MutationKind, + table: &FuseTable, + with_tid: bool, + ) -> Self { + Self { + ctx, + dal: table.get_operator(), + table_id: if with_tid { Some(table.get_id()) } else { None }, + kind, + } + } +} + +#[async_trait::async_trait] +impl AsyncAccumulatingTransform for TransformBlockWriter { + const NAME: &'static str = "TransformBlockWriter"; + + async fn transform(&mut self, data: DataBlock) -> Result> { + debug_assert!(data.is_empty()); + + if let Some(ptr) = data.get_owned_meta() { + if let Some(serialized) = BlockSerialization::downcast_from(ptr) { let extended_block_meta = BlockWriter::write_down(&self.dal, serialized).await?; let bytes = if let Some(draft_virtual_block_meta) = @@ -264,22 +276,19 @@ impl Processor for TransformBlockWriter { extended_block_meta.block_meta.block_size as usize }; - self.properties - .ctx - .get_write_progress() - .incr(&ProgressValues { - rows: extended_block_meta.block_meta.row_count as usize, - bytes, - }); + self.ctx.get_write_progress().incr(&ProgressValues { + rows: extended_block_meta.block_meta.row_count as usize, + bytes, + }); // appending new data block if let Some(tid) = self.table_id { - self.properties.ctx.update_multi_table_insert_status( + self.ctx.update_multi_table_insert_status( tid, extended_block_meta.block_meta.row_count, ); } else { - self.properties.ctx.add_mutation_status(MutationStatus { + self.ctx.add_mutation_status(MutationStatus { insert_rows: extended_block_meta.block_meta.row_count, update_rows: 0, deleted_rows: 0, @@ -299,10 +308,13 @@ impl Processor for TransformBlockWriter { }], })) }; - self.output_data = Some(output); + + return Ok(Some(output)); } - _ => return Err(ErrorCode::Internal("It's a bug.")), } - Ok(()) + + Err(ErrorCode::Internal( + "Cannot downcast meta to BlockSerialization", + )) } } From 62f2093350108405de3c9f748fb9477e2a235e95 Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 20 May 2025 18:42:01 +0800 Subject: [PATCH 15/36] fix test --- src/query/expression/tests/it/block_thresholds.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/query/expression/tests/it/block_thresholds.rs b/src/query/expression/tests/it/block_thresholds.rs index e7ad1304ae3cc..33b1d1b0ff394 100644 --- a/src/query/expression/tests/it/block_thresholds.rs +++ b/src/query/expression/tests/it/block_thresholds.rs @@ -105,8 +105,8 @@ fn 
test_calc_rows_for_recluster() { assert_eq!(result, 300); // Case 2: If the block size is too smaller. - let result = t.calc_rows_for_recluster(4_000, 4_000_000, 600_000); - assert_eq!(result, 1000); + let result = t.calc_rows_for_recluster(4_000, 2_000_000, 600_000); + assert_eq!(result, 800); // Case 3: use the compressed-based block count. let result = t.calc_rows_for_recluster(4_000, 10_000_000, 600_000); @@ -131,8 +131,8 @@ fn test_calc_partitions_for_recluster() { assert_eq!(result, 15); // Case 2: If the block size is too smaller. - let result = t.calc_partitions_for_recluster(4_000, 4_000_000, 600_000); - assert_eq!(result, 4); + let result = t.calc_partitions_for_recluster(4_000, 800_000, 800_000); + assert_eq!(result, 2); // Case 3: use the compressed-based block count. let result = t.calc_partitions_for_recluster(4_000, 10_000_000, 600_000); From e2b02f73c4b320da7054c700cce6f0e74d408116 Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 21 May 2025 02:25:18 +0800 Subject: [PATCH 16/36] add noise for hilbert recluster --- src/query/functions/src/scalars/hilbert.rs | 95 ++++++++++++++++++- .../interpreter_table_recluster.rs | 2 +- src/query/sql/src/planner/binder/ddl/table.rs | 11 ++- 3 files changed, 104 insertions(+), 4 deletions(-) diff --git a/src/query/functions/src/scalars/hilbert.rs b/src/query/functions/src/scalars/hilbert.rs index 060fe5ab97abe..2ba386450b0e0 100644 --- a/src/query/functions/src/scalars/hilbert.rs +++ b/src/query/functions/src/scalars/hilbert.rs @@ -21,24 +21,33 @@ use databend_common_expression::types::BinaryType; use databend_common_expression::types::DataType; use databend_common_expression::types::GenericType; use databend_common_expression::types::NullableType; +use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberType; use databend_common_expression::types::ReturnType; +use databend_common_expression::types::StringType; use databend_common_expression::types::ValueType; +use databend_common_expression::types::ALL_NUMERICS_TYPES; +use databend_common_expression::vectorize_with_builder_1_arg; use databend_common_expression::vectorize_with_builder_2_arg; +use databend_common_expression::with_number_mapped_type; use databend_common_expression::Column; use databend_common_expression::FixedLengthEncoding; use databend_common_expression::Function; use databend_common_expression::FunctionDomain; use databend_common_expression::FunctionEval; use databend_common_expression::FunctionFactory; +use databend_common_expression::FunctionProperty; use databend_common_expression::FunctionRegistry; use databend_common_expression::FunctionSignature; use databend_common_expression::ScalarRef; use databend_common_expression::Value; +use rand::rngs::SmallRng; +use rand::Rng; +use rand::SeedableRng; /// Registers Hilbert curve related functions with the function registry. 
pub fn register(registry: &mut FunctionRegistry) { - // Register the hilbert_range_index function that calculates Hilbert indices for multi-dimensional data + // Register the hilbert_range_index function that calculates Hilbert indices for multidimensional data let factory = FunctionFactory::Closure(Box::new(|_, args_type: &[DataType]| { let args_num = args_type.len(); // The function supports 2, 3, 4, or 5 dimensions (each dimension requires 2 arguments) @@ -97,7 +106,7 @@ pub fn register(registry: &mut FunctionRegistry) { points.push(key); } - // Convert the multi-dimensional point to a Hilbert index + // Convert the multidimensional point to a Hilbert index // This maps the n-dimensional point to a 1-dimensional value let points = points .iter() @@ -153,6 +162,88 @@ pub fn register(registry: &mut FunctionRegistry) { builder.push(id); }), ); + + // We use true randomness by appending a random u8 value at the end of the binary key. + // This introduces noise to break tie cases in clustering keys that are not uniformly distributed. + // Although this may slightly affect the accuracy of range_bound estimation, + // it ensures that Hilbert index + scatter will no longer suffer from data skew. + // Moreover, since the noise is added at the tail, the original order of the keys is preserved. + registry.properties.insert( + "add_noise".to_string(), + FunctionProperty::default().non_deterministic(), + ); + + registry.register_passthrough_nullable_1_arg::( + "add_noise", + |_, _| FunctionDomain::Full, + vectorize_with_builder_1_arg::(|val, builder, _| { + let mut bytes = val.as_bytes().to_vec(); + let mut rng = SmallRng::from_entropy(); + bytes.push(rng.gen::()); + builder.put_slice(&bytes); + builder.commit_row(); + }), + ); + + for ty in ALL_NUMERICS_TYPES { + with_number_mapped_type!(|NUM_TYPE| match ty { + NumberDataType::NUM_TYPE => { + registry + .register_passthrough_nullable_1_arg::, BinaryType, _, _>( + "add_noise", + |_, _| FunctionDomain::Full, + vectorize_with_builder_1_arg::, BinaryType>( + |val, builder, _| { + let mut encoded = val.encode().to_vec(); + let mut rng = SmallRng::from_entropy(); + encoded.push(rng.gen::()); + builder.put_slice(&encoded); + builder.commit_row(); + }, + ), + ); + } + }) + } + + registry.register_passthrough_nullable_2_arg::, BinaryType, _, _>( + "add_noise", + |_, _, _| FunctionDomain::Full, + vectorize_with_builder_2_arg::, BinaryType>( + |val, level, builder, _| { + let mut bytes = val.as_bytes().to_vec(); + let mut rng = SmallRng::from_entropy(); + for _ in 0..level { + bytes.push(rng.gen::()); + } + builder.put_slice(&bytes); + builder.commit_row(); + }, + ), + ); + + for ty in ALL_NUMERICS_TYPES { + with_number_mapped_type!(|NUM_TYPE| match ty { + NumberDataType::NUM_TYPE => { + registry + .register_passthrough_nullable_2_arg::, NumberType, BinaryType, _, _>( + "add_noise", + |_, _, _| FunctionDomain::Full, + vectorize_with_builder_2_arg::, NumberType, BinaryType>( + |val, level, builder, _| { + let mut encoded = val.encode().to_vec(); + let mut rng = SmallRng::from_entropy(); + for _ in 0..level { + encoded.push(rng.gen::()); + } + builder.put_slice(&encoded); + builder.commit_row(); + }, + ), + ); + } + }) + } } /// Calculates the partition ID for a value based on range boundaries. 
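Note: the `add_noise` functions registered above append random bytes to the tail of an order-preserving key encoding so that ties in skewed clustering keys can still be split by range bounds. The following is a minimal standalone sketch of that idea, not the Databend implementation itself; it only assumes the `rand` crate and uses a fixed-length big-endian encoding as the key:

use rand::rngs::SmallRng;
use rand::Rng;
use rand::SeedableRng;

// Append `level` random bytes to the tail of an already order-preserving key.
fn add_noise(key: &[u8], level: usize) -> Vec<u8> {
    let mut rng = SmallRng::from_entropy();
    let mut noisy = key.to_vec();
    for _ in 0..level {
        noisy.push(rng.gen::<u8>());
    }
    noisy
}

fn main() {
    // Big-endian encoding keeps numeric order under byte-wise comparison.
    let low = add_noise(&10u32.to_be_bytes(), 1);
    let high = add_noise(&20u32.to_be_bytes(), 1);
    assert!(low < high, "distinct keys keep their relative order");

    // Equal keys almost always become distinct after the noise byte, so range
    // bounds can split a tie-dominated clustering key across partitions.
    let t1 = add_noise(&7u32.to_be_bytes(), 1);
    let t2 = add_noise(&7u32.to_be_bytes(), 1);
    println!("{t1:?} vs {t2:?}");
}

Because every key has the same length before the noise is appended, distinct keys keep their relative order, while equal keys almost always become distinct and can therefore be spread across range partitions, which is the skew-breaking behaviour the comment in scalars/hilbert.rs describes.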
diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 411452a48d5a0..237a2252ea067 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -652,7 +652,7 @@ impl ReclusterTableInterpreter { "range_bound(1000, {sample_size})({cluster_key_str})" )); - hilbert_keys.push(format!("{table}.{cluster_key_str}, []")); + hilbert_keys.push(format!("{cluster_key_str}, []")); } let hilbert_keys_str = hilbert_keys.join(", "); diff --git a/src/query/sql/src/planner/binder/ddl/table.rs b/src/query/sql/src/planner/binder/ddl/table.rs index 8f95e1d17958f..7d3263a8cdceb 100644 --- a/src/query/sql/src/planner/binder/ddl/table.rs +++ b/src/query/sql/src/planner/binder/ddl/table.rs @@ -149,6 +149,7 @@ use crate::plans::VacuumTemporaryFilesPlan; use crate::BindContext; use crate::DefaultExprBinder; use crate::Planner; +use crate::ScalarExpr; use crate::SelectBuilder; pub(in crate::planner::binder) struct AnalyzeCreateTableResult { @@ -1767,7 +1768,7 @@ impl Binder { let mut cluster_keys = Vec::with_capacity(expr_len); for cluster_expr in cluster_exprs.iter() { - let (cluster_key, _) = scalar_binder.bind(cluster_expr)?; + let (mut cluster_key, _) = scalar_binder.bind(cluster_expr)?; if cluster_key.used_columns().len() != 1 || !cluster_key.evaluable() { return Err(ErrorCode::InvalidClusterKeys(format!( "Cluster by expression `{:#}` is invalid", @@ -1775,6 +1776,14 @@ impl Binder { ))); } + if let ScalarExpr::FunctionCall(func) = &cluster_key { + if func.func_name == "add_noise" && matches!(cluster_type, AstClusterType::Hilbert) + { + debug_assert!(func.arguments.len() == 1); + cluster_key = func.arguments[0].clone(); + } + } + let expr = cluster_key.as_expr()?; if !expr.is_deterministic(&BUILTIN_FUNCTIONS) { return Err(ErrorCode::InvalidClusterKeys(format!( From 81e8dba0254c92d34251f23b3a73a7913ee76d5f Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 5 Jun 2025 03:39:18 +0800 Subject: [PATCH 17/36] update --- src/common/base/src/base/watch_notify.rs | 4 + .../src/aggregates/aggregate_range_bound.rs | 4 +- .../transforms/recluster/builder.rs | 198 ++++++++++++++++++ .../processors/transforms/recluster/mod.rs | 11 + .../recluster/range_bound_sampler.rs | 86 ++++++++ .../recluster/recluster_partition_exchange.rs | 3 +- .../recluster/recluster_sample_state.rs | 140 +++++++++++++ .../recluster/transform_add_order_column.rs | 72 +++++++ .../transform_range_partition_indexer.rs | 164 +++++++++++++++ .../recluster/transform_recluster_collect.rs | 82 ++++++++ 10 files changed, 759 insertions(+), 5 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/builder.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs diff --git a/src/common/base/src/base/watch_notify.rs b/src/common/base/src/base/watch_notify.rs index be05dfc9028c0..c7a677474b63f 100644 --- 
a/src/common/base/src/base/watch_notify.rs +++ b/src/common/base/src/base/watch_notify.rs @@ -44,6 +44,10 @@ impl WatchNotify { pub fn notify_waiters(&self) { let _ = self.tx.send_replace(true); } + + pub fn is_notified(&self) -> bool { + *self.rx.borrow() + } } #[cfg(test)] diff --git a/src/query/functions/src/aggregates/aggregate_range_bound.rs b/src/query/functions/src/aggregates/aggregate_range_bound.rs index 9776caac786c8..2572429300182 100644 --- a/src/query/functions/src/aggregates/aggregate_range_bound.rs +++ b/src/query/functions/src/aggregates/aggregate_range_bound.rs @@ -326,9 +326,7 @@ pub fn try_create_aggregate_range_bound_function( /// For a column with values `(0, 1, 3, 6, 8)` and `partition_num = 3`, the function calculates the /// partition boundaries based on the distribution of the data. The boundaries might be `[1, 6]`. pub fn aggregate_range_bound_function_desc() -> AggregateFunctionDescription { - AggregateFunctionDescription::creator(Box::new( - crate::aggregates::try_create_aggregate_range_bound_function, - )) + AggregateFunctionDescription::creator(Box::new(try_create_aggregate_range_bound_function)) } fn get_partitions( diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs b/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs new file mode 100644 index 0000000000000..0a58f27556f91 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs @@ -0,0 +1,198 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::row::RowConverter as CommonConverter; +use databend_common_expression::types::AccessType; +use databend_common_expression::types::ArgType; +use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; +use databend_common_expression::with_number_mapped_type; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_transforms::sort::CommonRows; +use databend_common_pipeline_transforms::sort::RowConverter; +use databend_common_pipeline_transforms::sort::Rows; +use databend_common_pipeline_transforms::sort::SimpleRowConverter; +use databend_common_pipeline_transforms::sort::SimpleRowsAsc; +use databend_common_pipeline_transforms::AccumulatingTransformer; +use databend_common_pipeline_transforms::Transformer; +use match_template::match_template; + +use crate::pipelines::processors::transforms::recluster::transform_add_order_column::TransformAddOrderColumn; +use crate::pipelines::processors::transforms::recluster::TransformRangePartitionIndexer; +use crate::pipelines::processors::transforms::SampleState; +use crate::pipelines::processors::transforms::TransformReclusterCollect; + +pub struct TransformReclusterBuilder { + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + sample_rate: f64, + seed: u64, +} + +impl TransformReclusterBuilder { + pub fn build_recluster_sample( + &self, + input: Arc, + output: Arc, + ) -> Result> { + self.build_inner(BuilderType::ReclusterSample, input, output, None) + } + + pub fn build_range_partition_indexer( + &self, + input: Arc, + output: Arc, + state: Arc, + ) -> Result> { + self.build_inner( + BuilderType::RangePartitionIndexer, + input, + output, + Some(state), + ) + } + + pub fn build_add_order_column( + &self, + input: Arc, + output: Arc, + ) -> Result> { + self.build_inner(BuilderType::AddOrderColumn, input, output, None) + } + + fn build_inner( + &self, + typ: BuilderType, + input: Arc, + output: Arc, + state: Option>, + ) -> Result> { + let mut build = BuilderInner { + input, + output, + typ, + base: self, + state, + }; + build.select_row_type() + } +} + +enum BuilderType { + AddOrderColumn, + ReclusterSample, + RangePartitionIndexer, +} + +struct BuilderInner<'a> { + input: Arc, + output: Arc, + typ: BuilderType, + base: &'a TransformReclusterBuilder, + state: Option>, +} + +impl BuilderInner<'_> { + pub fn select_row_type(&mut self) -> Result> { + match self.base.sort_desc.as_ref() { + [desc] => { + let schema = self.base.schema.clone(); + let sort_type = schema.field(desc.offset).data_type(); + assert!(desc.asc); + + match_template! 
{ + T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], + match sort_type { + DataType::T => { + self.visit_type::, SimpleRowConverter>() + }, + DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { + NumberDataType::NUM_TYPE => { + self.visit_type::>, SimpleRowConverter>>() + } + }), + _ => self.visit_type::() + } + } + } + _ => self.visit_type::(), + } + } + + fn visit_type(&mut self) -> Result> + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + R::Type: ArgType + Send + Sync, + ::Scalar: Ord + Send + Sync, + { + match self.typ { + BuilderType::AddOrderColumn => self.build_add_order_column::(), + BuilderType::ReclusterSample => self.build_recluster_sample::(), + BuilderType::RangePartitionIndexer => self.build_range_partition_indexer::(), + } + } + + fn build_add_order_column(&mut self) -> Result> + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + { + let inner = TransformAddOrderColumn::::try_new( + self.base.sort_desc.clone(), + self.base.schema.clone(), + )?; + Ok(Transformer::create( + self.input.clone(), + self.output.clone(), + inner, + )) + } + + fn build_range_partition_indexer(&mut self) -> Result> + where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send + Sync, + { + Ok(TransformRangePartitionIndexer::::create( + self.input.clone(), + self.output.clone(), + self.state.clone().unwrap(), + )) + } + + fn build_recluster_sample(&mut self) -> Result> + where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send + Sync, + { + let offset = self.base.schema.fields().len(); + Ok(AccumulatingTransformer::create( + self.input.clone(), + self.output.clone(), + TransformReclusterCollect::::new(offset, self.base.sample_rate, self.base.seed), + )) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index a3c680958f00b..fd1db0c11426c 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -12,9 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod builder; +mod range_bound_sampler; mod recluster_partition_exchange; mod recluster_partition_strategy; +mod recluster_sample_state; +mod transform_add_order_column; +mod transform_range_partition_indexer; +mod transform_recluster_collect; +pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; pub use recluster_partition_strategy::CompactPartitionStrategy; pub use recluster_partition_strategy::ReclusterPartitionStrategy; +pub(crate) use recluster_sample_state::SampleState; +pub use transform_range_partition_indexer::TransformRangePartitionIndexer; +pub(crate) use transform_recluster_collect::ReclusterSampleMeta; +pub use transform_recluster_collect::TransformReclusterCollect; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs new file mode 100644 index 0000000000000..0dfee36475b36 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs @@ -0,0 +1,86 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::marker::PhantomData; + +use databend_common_expression::types::ArgType; +use databend_common_expression::types::ValueType; +use databend_common_expression::DataBlock; +use databend_common_expression::Scalar; +use rand::prelude::SliceRandom; +use rand::prelude::SmallRng; +use rand::SeedableRng; + +pub struct RangeBoundSampler +where T: ValueType +{ + offset: usize, + sample_rate: f64, + rng: SmallRng, + + values: Vec<(u64, Vec)>, + _t: PhantomData, +} + +impl RangeBoundSampler +where T: ValueType +{ + pub fn new(offset: usize, sample_rate: f64, seed: u64) -> Self { + let rng = SmallRng::seed_from_u64(seed); + Self { + offset, + sample_rate, + rng, + values: vec![], + _t: PhantomData, + } + } +} + +impl RangeBoundSampler +where + T: ArgType, + T::Scalar: Ord + Send, +{ + pub fn add_block(&mut self, data: &DataBlock) { + let rows = data.num_rows(); + assert!(rows > 0); + let column = data.get_by_offset(self.offset).to_column(rows); + + let sample_size = std::cmp::max((self.sample_rate * rows as f64).ceil() as usize, 100); + let mut indices = (0..rows).collect::>(); + + let sampled_indices = if rows > sample_size { + indices.shuffle(&mut self.rng); + &indices[..sample_size] + } else { + &indices + }; + + let column = T::try_downcast_column(&column).unwrap(); + let sample_values = sampled_indices + .iter() + .map(|i| { + T::upcast_scalar(T::to_owned_scalar(unsafe { + T::index_column_unchecked(&column, *i) + })) + }) + .collect::>(); + self.values.push((rows as u64, sample_values)); + } + + pub fn sample_values(&mut self) -> Vec<(u64, Vec)> { + std::mem::take(&mut self.values) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index 221d4328ef67a..dd5257850ac9f 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -33,8 +33,7 @@ impl ReclusterPartitionExchange { impl Exchange for ReclusterPartitionExchange { const NAME: &'static str = "Recluster"; - fn partition(&self, data_block: DataBlock, n: usize) -> Result> { - let mut data_block = data_block; + fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { let range_ids = data_block .get_last_column() .as_number() diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs new file mode 100644 index 0000000000000..77c0fa0a9483f --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs @@ -0,0 +1,140 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::sync::RwLock; + +use databend_common_base::base::WatchNotify; +use databend_common_exception::Result; +use databend_common_expression::compare_columns; +use databend_common_expression::types::ArgType; +use databend_common_expression::Scalar; + +pub struct SampleState { + pub inner: RwLock, + pub done: Arc, +} + +impl SampleState { + pub fn new(total_inputs: usize, partitions: usize) -> Arc { + Arc::new(SampleState { + inner: RwLock::new(SampleStateInner { + partitions, + total_inputs, + completed_inputs: 0, + values: vec![], + bounds: vec![], + }), + done: Arc::new(WatchNotify::new()), + }) + } + + pub fn merge_sample(&self, values: Vec<(u64, Vec)>) -> Result<()> + where + T: ArgType, + T::Scalar: Ord, + { + let mut inner = self.inner.write().unwrap(); + inner.completed_inputs += 1; + inner.values.extend_from_slice(&values); + + if inner.completed_inputs >= inner.total_inputs { + inner.determine_bounds::()?; + self.done.notify_waiters(); + } + Ok(()) + } + + pub fn get_bounds(&self) -> Vec + where + T: ArgType, + T::Scalar: Ord, + { + let inner = self.inner.read().unwrap(); + inner + .bounds + .iter() + .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())) + .collect() + } +} + +pub struct SampleStateInner { + partitions: usize, + total_inputs: usize, + + completed_inputs: usize, + bounds: Vec, + + values: Vec<(u64, Vec)>, +} + +impl SampleStateInner { + fn determine_bounds(&mut self) -> Result<()> + where + T: ArgType, + T::Scalar: Ord, + { + if self.partitions < 2 { + return Ok(()); + } + + let (total_samples, total_rows) = self + .values + .iter() + .fold((0, 0), |(acc_samples, acc_rows), (rows, vals)| { + (acc_samples + vals.len(), acc_rows + *rows) + }); + let step = total_rows as f64 / self.partitions as f64; + let values = std::mem::take(&mut self.values); + let mut data = Vec::with_capacity(total_samples); + let mut weights = Vec::with_capacity(total_samples); + + for (num, values) in values.into_iter() { + let weight = num as f64 / values.len() as f64; + values.into_iter().for_each(|v| { + let val = T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap()); + data.push(val); + weights.push(weight); + }); + } + let col = T::upcast_column(T::column_from_vec(data.clone(), &[])); + let indices = compare_columns(vec![col], total_samples)?; + + let mut cum_weight = 0.0; + let mut target = step; + let mut bounds = Vec::with_capacity(self.partitions - 1); + let mut previous_bound = None; + + let mut i = 0; + let mut j = 0; + while i < total_samples && j < self.partitions - 1 { + let idx = indices[i] as usize; + let weight = weights[idx]; + cum_weight += weight; + if cum_weight >= target { + let data = &data[idx]; + if previous_bound.as_ref().is_none_or(|prev| data > prev) { + bounds.push(T::upcast_scalar(data.clone())); + target += step; + j += 1; + previous_bound = Some(data.clone()); + } + } + i += 1; + } + self.bounds = bounds; + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs 
b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs new file mode 100644 index 0000000000000..7b40593e887c3 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs @@ -0,0 +1,72 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::marker::PhantomData; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_expression::Value; +use databend_common_pipeline_transforms::sort::RowConverter; +use databend_common_pipeline_transforms::sort::Rows; +use databend_common_pipeline_transforms::Transform; + +pub struct TransformAddOrderColumn { + row_converter: C, + sort_desc: Arc<[SortColumnDescription]>, + _r: PhantomData, +} + +impl TransformAddOrderColumn +where + R: Rows, + C: RowConverter, +{ + pub fn try_new(sort_desc: Arc<[SortColumnDescription]>, schema: DataSchemaRef) -> Result { + let row_converter = C::create(&sort_desc, schema.clone())?; + Ok(Self { + row_converter, + sort_desc, + _r: PhantomData, + }) + } +} + +impl Transform for TransformAddOrderColumn +where + R: Rows + 'static, + C: RowConverter + Send + 'static, +{ + const NAME: &'static str = "TransformAddOrderColumn"; + + fn transform(&mut self, mut data: DataBlock) -> Result { + let order_by_cols = self + .sort_desc + .iter() + .map(|desc| data.get_by_offset(desc.offset).clone()) + .collect::>(); + let rows = self + .row_converter + .convert(&order_by_cols, data.num_rows())?; + let order_col = rows.to_column(); + data.add_column(BlockEntry { + data_type: order_col.data_type(), + value: Value::Column(order_col), + }); + Ok(data) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs new file mode 100644 index 0000000000000..6d10600366eda --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -0,0 +1,164 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::types::ArgType; +use databend_common_expression::types::DataType; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::UInt64Type; +use databend_common_expression::BlockEntry; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::FromData; +use databend_common_expression::Value; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; + +use crate::pipelines::processors::transforms::ReclusterSampleMeta; +use crate::pipelines::processors::transforms::SampleState; + +pub struct TransformRangePartitionIndexer +where T: ArgType +{ + input: Arc, + output: Arc, + + state: Arc, + input_data: Vec, + output_data: VecDeque, + bounds: Vec, +} + +impl TransformRangePartitionIndexer +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send + Sync, +{ + pub fn create( + input: Arc, + output: Arc, + state: Arc, + ) -> Box { + Box::new(Self { + input, + output, + state, + input_data: vec![], + output_data: VecDeque::new(), + bounds: vec![], + }) + } +} + +#[async_trait::async_trait] +impl Processor for TransformRangePartitionIndexer +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send + Sync, +{ + fn name(&self) -> String { + "TransformRangePartitionIndexer".to_owned() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if !self.input_data.is_empty() { + return Ok(Event::Sync); + } + + if self.input.is_finished() { + assert!(self.state.done.is_notified()); + self.output.finish(); + return Ok(Event::Finished); + } + + if !self.input.has_data() { + self.input.set_need_data(); + return Ok(Event::NeedData); + } + + let mut input_data = self.input.pull_data().unwrap()?; + let meta = input_data + .take_meta() + .and_then(ReclusterSampleMeta::downcast_from) + .expect("require a ReclusterSampleMeta"); + self.input_data = meta.blocks; + self.state.merge_sample::(meta.sample_values)?; + Ok(Event::Async) + } + + fn process(&mut self) -> Result<()> { + if let Some(mut block) = self.input_data.pop() { + let num_rows = block.num_rows(); + let last = block.get_last_column().clone(); + block.pop_columns(1); + let mut builder = Vec::with_capacity(num_rows); + let last_col = T::try_downcast_column(&last.remove_nullable()).unwrap(); + for index in 0..num_rows { + let val = + T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); + let mut low = 0; + let mut high = self.bounds.len(); + while low < high { + let mid = low + ((high - low) / 2); + let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); + if val > bound { + low = mid + 1; + } else { + high = mid; + } + } + builder.push(low as u64); + } + + block.add_column(BlockEntry::new( + DataType::Number(NumberDataType::UInt64), + Value::Column(UInt64Type::from_data(builder)), + )); + self.output_data.push_back(block); + } + 
Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + self.state.done.notified().await; + self.bounds = self.state.get_bounds::(); + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs new file mode 100644 index 0000000000000..3e9fe42a8dee4 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs @@ -0,0 +1,82 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; +use databend_common_expression::types::ArgType; +use databend_common_expression::types::ValueType; +use databend_common_expression::BlockMetaInfo; +use databend_common_expression::DataBlock; +use databend_common_expression::Scalar; +use databend_common_pipeline_transforms::AccumulatingTransform; + +use crate::pipelines::processors::transforms::RangeBoundSampler; + +pub struct TransformReclusterCollect +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send, +{ + input_data: Vec, + sampler: RangeBoundSampler, +} + +impl TransformReclusterCollect +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send, +{ + pub fn new(offset: usize, sample_rate: f64, seed: u64) -> Self { + Self { + input_data: vec![], + sampler: RangeBoundSampler::::new(offset, sample_rate, seed), + } + } +} + +impl AccumulatingTransform for TransformReclusterCollect +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send, +{ + const NAME: &'static str = "TransformReclusterCollect"; + + fn transform(&mut self, data: DataBlock) -> Result> { + self.sampler.add_block(&data); + self.input_data.push(data); + Ok(vec![]) + } + + fn on_finish(&mut self, _output: bool) -> Result> { + let sample_values = self.sampler.sample_values(); + let blocks = std::mem::take(&mut self.input_data); + let meta = ReclusterSampleMeta { + blocks, + sample_values, + }; + + Ok(vec![DataBlock::empty_with_meta(Box::new(meta))]) + } +} + +#[derive(Debug)] +pub struct ReclusterSampleMeta { + pub blocks: Vec, + pub sample_values: Vec<(u64, Vec)>, +} + +local_block_meta_serde!(ReclusterSampleMeta); + +#[typetag::serde(name = "recluster_sample")] +impl BlockMetaInfo for ReclusterSampleMeta {} From e59fe4daeda21e06dcc86f7fdbc12bdb4df5106a Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 6 Jun 2025 19:08:55 +0800 Subject: [PATCH 18/36] linear recluster support block stream writer --- src/query/functions/src/scalars/hilbert.rs | 2 +- src/query/service/src/local/display.rs | 2 +- .../builders/builder_hilbert_partition.rs | 2 + .../pipelines/builders/builder_recluster.rs | 333 ++++++++++++++---- .../transforms/recluster/builder.rs | 198 ----------- .../processors/transforms/recluster/mod.rs | 6 +- .../transform_range_partition_indexer.rs | 2 +- .../recluster/transform_recluster_collect.rs | 1 - 
src/query/settings/src/settings_default.rs | 2 +- .../fuse/src/io/write/stream/block_builder.rs | 28 +- .../src/io/write/stream/cluster_statistics.rs | 47 ++- .../storages/fuse/src/operations/append.rs | 35 +- .../processors/transform_block_writer.rs | 4 +- 13 files changed, 362 insertions(+), 300 deletions(-) delete mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/builder.rs diff --git a/src/query/functions/src/scalars/hilbert.rs b/src/query/functions/src/scalars/hilbert.rs index 2ba386450b0e0..d570b83740c1e 100644 --- a/src/query/functions/src/scalars/hilbert.rs +++ b/src/query/functions/src/scalars/hilbert.rs @@ -267,7 +267,7 @@ fn calc_range_partition_id(val: ScalarRef, arr: &Column) -> u64 { while low < high { let mid = low + ((high - low) / 2); let bound = unsafe { arr.index_unchecked(mid) }; - if val > bound { + if val >= bound { low = mid + 1; } else { high = mid; diff --git a/src/query/service/src/local/display.rs b/src/query/service/src/local/display.rs index 32546082de770..b782accabdd00 100644 --- a/src/query/service/src/local/display.rs +++ b/src/query/service/src/local/display.rs @@ -259,7 +259,7 @@ impl FormatDisplay<'_> { rows_str, self.start.elapsed().as_secs_f64(), humanize_count(stats.total_rows as f64), - HumanBytes(stats.total_rows as u64), + HumanBytes(stats.total_bytes as u64), humanize_count(stats.total_rows as f64 / self.start.elapsed().as_secs_f64()), HumanBytes((stats.total_bytes as f64 / self.start.elapsed().as_secs_f64()) as u64), ); diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 9ae7941e01e9e..aebafaa53566d 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -71,6 +71,8 @@ impl PipelineBuilder { let properties = StreamBlockProperties::try_create( self.ctx.clone(), table, + MutationKind::Recluster, + None, partition.table_meta_timestamps, )?; diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index 05d2d63dd55d7..a5f39011080d8 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -12,29 +12,62 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::atomic; +use std::sync::atomic::AtomicUsize; +use std::sync::Arc; + use databend_common_catalog::plan::DataSourceInfo; use databend_common_catalog::plan::DataSourcePlan; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::row::RowConverter as CommonConverter; +use databend_common_expression::types::AccessType; +use databend_common_expression::types::ArgType; +use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; +use databend_common_expression::with_number_mapped_type; +use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_expression::SortColumnDescription; use databend_common_metrics::storage::metrics_inc_recluster_block_bytes_to_read; use databend_common_metrics::storage::metrics_inc_recluster_block_nums_to_read; use databend_common_metrics::storage::metrics_inc_recluster_row_nums_to_read; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_sources::EmptySource; use databend_common_pipeline_transforms::processors::build_compact_block_no_split_pipeline; use databend_common_pipeline_transforms::processors::TransformPipelineHelper; +use databend_common_pipeline_transforms::sort::CommonRows; +use databend_common_pipeline_transforms::sort::RowConverter; +use databend_common_pipeline_transforms::sort::Rows; +use databend_common_pipeline_transforms::sort::SimpleRowConverter; +use databend_common_pipeline_transforms::sort::SimpleRowsAsc; +use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::evaluator::CompoundBlockOperator; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::StreamContext; use databend_common_storages_factory::Table; +use databend_common_storages_fuse::io::StreamBlockProperties; +use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::TableContext; +use match_template::match_template; use crate::pipelines::builders::SortPipelineBuilder; +use crate::pipelines::processors::transforms::ReclusterPartitionExchange; +use crate::pipelines::processors::transforms::ReclusterPartitionStrategy; +use crate::pipelines::processors::transforms::SampleState; +use crate::pipelines::processors::transforms::TransformAddOrderColumn; use crate::pipelines::processors::transforms::TransformAddStreamColumns; +use crate::pipelines::processors::transforms::TransformPartitionCollect; +use crate::pipelines::processors::transforms::TransformRangePartitionIndexer; +use crate::pipelines::processors::transforms::TransformReclusterCollect; use crate::pipelines::PipelineBuilder; impl PipelineBuilder { @@ -71,7 +104,7 @@ impl PipelineBuilder { let recluster_block_nums = task.parts.len(); let block_thresholds = table.get_block_thresholds(); let table_info = table.get_table_info(); - let schema = table.schema_with_stream(); + let schema = Arc::new(table.schema_with_stream().remove_virtual_computed_fields()); let description = 
task.stats.get_description(&table_info.desc); let plan = DataSourcePlan { source_info: DataSourceInfo::TableSource(table_info.clone()), @@ -117,72 +150,158 @@ impl PipelineBuilder { .add_transformer(|| TransformAddStreamColumns::new(stream_ctx.clone())); } - let cluster_stats_gen = table.get_cluster_stats_gen( - self.ctx.clone(), - task.level + 1, - block_thresholds, - None, - )?; - let operators = cluster_stats_gen.operators.clone(); - if !operators.is_empty() { - let func_ctx2 = cluster_stats_gen.func_ctx.clone(); - self.main_pipeline.add_transformer(move || { - CompoundBlockOperator::new( - operators.clone(), - func_ctx2.clone(), - num_input_columns, - ) - }); - } + let level = task.level + 1; + let enable_stream_writer = + self.ctx.get_settings().get_enable_block_stream_write()? + && table.storage_format_as_parquet(); + if enable_stream_writer { + let properties = StreamBlockProperties::try_create( + self.ctx.clone(), + table, + MutationKind::Recluster, + Some(level), + recluster.table_meta_timestamps, + )?; + let operators = properties.cluster_operators(); + if !operators.is_empty() { + let func_ctx = self.ctx.get_function_context()?; + self.main_pipeline.add_transformer(move || { + CompoundBlockOperator::new( + operators.clone(), + func_ctx.clone(), + num_input_columns, + ) + }); + } + + let fields_with_cluster_key = properties.fields_with_cluster_key(); + let schema = DataSchemaRefExt::create(fields_with_cluster_key); + let sort_descs: Vec<_> = properties + .cluster_key_index() + .iter() + .map(|&offset| SortColumnDescription { + offset, + asc: true, + nulls_first: false, + }) + .collect(); - // construct output fields - let output_fields = cluster_stats_gen.out_fields.clone(); - let schema = DataSchemaRefExt::create(output_fields); - let sort_descs: Vec<_> = cluster_stats_gen - .cluster_key_index - .iter() - .map(|offset| SortColumnDescription { - offset: *offset, - asc: true, - nulls_first: false, - }) - .collect(); - - // merge sort - let sort_block_size = block_thresholds.calc_rows_for_recluster( - task.total_rows, - task.total_bytes, - task.total_compressed, - ); - - let sort_pipeline_builder = - SortPipelineBuilder::create(self.ctx.clone(), schema, sort_descs.into())? - .with_block_size_hit(sort_block_size) - .remove_order_col_at_last(); - // Todo(zhyass): Recluster will no longer perform sort in the near future. - sort_pipeline_builder.build_full_sort_pipeline(&mut self.main_pipeline)?; - - // Compact after merge sort. - let max_threads = self.ctx.get_settings().get_max_threads()? 
as usize; - build_compact_block_no_split_pipeline( - &mut self.main_pipeline, - block_thresholds, - max_threads, - )?; - - self.main_pipeline - .add_transform(|transform_input_port, transform_output_port| { - let proc = TransformSerializeBlock::try_create( + let num_processors = self.main_pipeline.output_len(); + let sample_rate = 0.01; + let partitions = block_thresholds.calc_partitions_for_recluster( + task.total_rows, + task.total_bytes, + task.total_compressed, + ); + let state = SampleState::new(num_processors, partitions); + let recluster_pipeline_builder = + ReclusterPipelineBuilder::create(schema, sort_descs.into(), sample_rate) + .with_state(state); + recluster_pipeline_builder + .build_recluster_sample_pipeline(&mut self.main_pipeline)?; + + self.main_pipeline.exchange( + num_processors, + ReclusterPartitionExchange::create(0, partitions), + ); + let processor_id = AtomicUsize::new(0); + let settings = self.ctx.get_settings(); + let memory_settings = MemorySettings::disable_spill(); + self.main_pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformPartitionCollect::new( + self.ctx.clone(), + input, + output, + &settings, + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partitions, + memory_settings.clone(), + None, + ReclusterPartitionStrategy::new(properties.clone()), + )?, + ))) + })?; + + self.main_pipeline.add_async_accumulating_transformer(|| { + TransformBlockWriter::create( self.ctx.clone(), - transform_input_port, - transform_output_port, - table, - cluster_stats_gen.clone(), MutationKind::Recluster, - recluster.table_meta_timestamps, - )?; - proc.into_processor() - }) + table, + false, + ) + }); + Ok(()) + } else { + let cluster_stats_gen = table.get_cluster_stats_gen( + self.ctx.clone(), + level, + block_thresholds, + None, + )?; + let operators = cluster_stats_gen.operators.clone(); + if !operators.is_empty() { + let func_ctx2 = cluster_stats_gen.func_ctx.clone(); + self.main_pipeline.add_transformer(move || { + CompoundBlockOperator::new( + operators.clone(), + func_ctx2.clone(), + num_input_columns, + ) + }); + } + + // construct output fields + let output_fields = cluster_stats_gen.out_fields.clone(); + let schema = DataSchemaRefExt::create(output_fields); + let sort_descs: Vec<_> = cluster_stats_gen + .cluster_key_index + .iter() + .map(|offset| SortColumnDescription { + offset: *offset, + asc: true, + nulls_first: false, + }) + .collect(); + + // merge sort + let sort_block_size = block_thresholds.calc_rows_for_recluster( + task.total_rows, + task.total_bytes, + task.total_compressed, + ); + + let sort_pipeline_builder = + SortPipelineBuilder::create(self.ctx.clone(), schema, sort_descs.into())? + .with_block_size_hit(sort_block_size) + .remove_order_col_at_last(); + // Todo(zhyass): Recluster will no longer perform sort in the near future. + sort_pipeline_builder.build_full_sort_pipeline(&mut self.main_pipeline)?; + + // Compact after merge sort. + let max_threads = self.ctx.get_settings().get_max_threads()? 
as usize; + build_compact_block_no_split_pipeline( + &mut self.main_pipeline, + block_thresholds, + max_threads, + )?; + + self.main_pipeline.add_transform( + |transform_input_port, transform_output_port| { + let proc = TransformSerializeBlock::try_create( + self.ctx.clone(), + transform_input_port, + transform_output_port, + table, + cluster_stats_gen.clone(), + MutationKind::Recluster, + recluster.table_meta_timestamps, + )?; + proc.into_processor() + }, + ) + } } _ => Err(ErrorCode::Internal( "A node can only execute one recluster task".to_string(), @@ -190,3 +309,89 @@ impl PipelineBuilder { } } } + +struct ReclusterPipelineBuilder { + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + state: Option>, + sample_rate: f64, + seed: u64, +} + +impl ReclusterPipelineBuilder { + fn create( + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + sample_rate: f64, + ) -> Self { + Self { + schema, + sort_desc, + state: None, + sample_rate, + seed: rand::random(), + } + } + + #[allow(unused)] + fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } + + fn with_state(mut self, state: Arc) -> Self { + self.state = Some(state); + self + } + + fn build_recluster_sample_pipeline(&self, pipeline: &mut Pipeline) -> Result<()> { + match self.sort_desc.as_ref() { + [desc] => { + let schema = self.schema.clone(); + let sort_type = schema.field(desc.offset).data_type(); + assert!(desc.asc); + + match_template! { + T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], + match sort_type { + DataType::T => { + self.visit_type::, SimpleRowConverter>(pipeline) + }, + DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { + NumberDataType::NUM_TYPE => { + self.visit_type::>, SimpleRowConverter>>(pipeline) + } + }), + _ => self.visit_type::(pipeline) + } + } + } + _ => self.visit_type::(pipeline), + } + } + + fn visit_type(&self, pipeline: &mut Pipeline) -> Result<()> + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + R::Type: ArgType + Send + Sync, + ::Scalar: Ord + Send + Sync, + { + pipeline.try_add_transformer(|| { + TransformAddOrderColumn::::try_new(self.sort_desc.clone(), self.schema.clone()) + })?; + let offset = self.schema.num_fields(); + pipeline.add_accumulating_transformer(|| { + TransformReclusterCollect::::new(offset, self.sample_rate, self.seed) + }); + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(TransformRangePartitionIndexer::< + R::Type, + >::create( + input, + output, + self.state.clone().unwrap(), + ))) + }) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs b/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs deleted file mode 100644 index 0a58f27556f91..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::row::RowConverter as CommonConverter; -use databend_common_expression::types::AccessType; -use databend_common_expression::types::ArgType; -use databend_common_expression::types::DataType; -use databend_common_expression::types::DateType; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::TimestampType; -use databend_common_expression::with_number_mapped_type; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::SortColumnDescription; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::sort::CommonRows; -use databend_common_pipeline_transforms::sort::RowConverter; -use databend_common_pipeline_transforms::sort::Rows; -use databend_common_pipeline_transforms::sort::SimpleRowConverter; -use databend_common_pipeline_transforms::sort::SimpleRowsAsc; -use databend_common_pipeline_transforms::AccumulatingTransformer; -use databend_common_pipeline_transforms::Transformer; -use match_template::match_template; - -use crate::pipelines::processors::transforms::recluster::transform_add_order_column::TransformAddOrderColumn; -use crate::pipelines::processors::transforms::recluster::TransformRangePartitionIndexer; -use crate::pipelines::processors::transforms::SampleState; -use crate::pipelines::processors::transforms::TransformReclusterCollect; - -pub struct TransformReclusterBuilder { - schema: DataSchemaRef, - sort_desc: Arc<[SortColumnDescription]>, - sample_rate: f64, - seed: u64, -} - -impl TransformReclusterBuilder { - pub fn build_recluster_sample( - &self, - input: Arc, - output: Arc, - ) -> Result> { - self.build_inner(BuilderType::ReclusterSample, input, output, None) - } - - pub fn build_range_partition_indexer( - &self, - input: Arc, - output: Arc, - state: Arc, - ) -> Result> { - self.build_inner( - BuilderType::RangePartitionIndexer, - input, - output, - Some(state), - ) - } - - pub fn build_add_order_column( - &self, - input: Arc, - output: Arc, - ) -> Result> { - self.build_inner(BuilderType::AddOrderColumn, input, output, None) - } - - fn build_inner( - &self, - typ: BuilderType, - input: Arc, - output: Arc, - state: Option>, - ) -> Result> { - let mut build = BuilderInner { - input, - output, - typ, - base: self, - state, - }; - build.select_row_type() - } -} - -enum BuilderType { - AddOrderColumn, - ReclusterSample, - RangePartitionIndexer, -} - -struct BuilderInner<'a> { - input: Arc, - output: Arc, - typ: BuilderType, - base: &'a TransformReclusterBuilder, - state: Option>, -} - -impl BuilderInner<'_> { - pub fn select_row_type(&mut self) -> Result> { - match self.base.sort_desc.as_ref() { - [desc] => { - let schema = self.base.schema.clone(); - let sort_type = schema.field(desc.offset).data_type(); - assert!(desc.asc); - - match_template! 
{ - T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], - match sort_type { - DataType::T => { - self.visit_type::, SimpleRowConverter>() - }, - DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { - NumberDataType::NUM_TYPE => { - self.visit_type::>, SimpleRowConverter>>() - } - }), - _ => self.visit_type::() - } - } - } - _ => self.visit_type::(), - } - } - - fn visit_type(&mut self) -> Result> - where - R: Rows + 'static, - C: RowConverter + Send + 'static, - R::Type: ArgType + Send + Sync, - ::Scalar: Ord + Send + Sync, - { - match self.typ { - BuilderType::AddOrderColumn => self.build_add_order_column::(), - BuilderType::ReclusterSample => self.build_recluster_sample::(), - BuilderType::RangePartitionIndexer => self.build_range_partition_indexer::(), - } - } - - fn build_add_order_column(&mut self) -> Result> - where - R: Rows + 'static, - C: RowConverter + Send + 'static, - { - let inner = TransformAddOrderColumn::::try_new( - self.base.sort_desc.clone(), - self.base.schema.clone(), - )?; - Ok(Transformer::create( - self.input.clone(), - self.output.clone(), - inner, - )) - } - - fn build_range_partition_indexer(&mut self) -> Result> - where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send + Sync, - { - Ok(TransformRangePartitionIndexer::::create( - self.input.clone(), - self.output.clone(), - self.state.clone().unwrap(), - )) - } - - fn build_recluster_sample(&mut self) -> Result> - where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send + Sync, - { - let offset = self.base.schema.fields().len(); - Ok(AccumulatingTransformer::create( - self.input.clone(), - self.output.clone(), - TransformReclusterCollect::::new(offset, self.base.sample_rate, self.base.seed), - )) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index fd1db0c11426c..0f3612043c7d9 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod builder; mod range_bound_sampler; mod recluster_partition_exchange; mod recluster_partition_strategy; @@ -25,7 +24,8 @@ pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; pub use recluster_partition_strategy::CompactPartitionStrategy; pub use recluster_partition_strategy::ReclusterPartitionStrategy; -pub(crate) use recluster_sample_state::SampleState; +pub use recluster_sample_state::SampleState; +pub use transform_add_order_column::TransformAddOrderColumn; pub use transform_range_partition_indexer::TransformRangePartitionIndexer; -pub(crate) use transform_recluster_collect::ReclusterSampleMeta; +pub use transform_recluster_collect::ReclusterSampleMeta; pub use transform_recluster_collect::TransformReclusterCollect; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 6d10600366eda..6b944a487664b 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -137,7 +137,7 @@ where while low < high { let mid = low + ((high - low) / 2); let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); - if val > bound { + if val >= bound { low = mid + 1; } else { high = mid; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs index 3e9fe42a8dee4..3900fd81db6d7 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs @@ -15,7 +15,6 @@ use databend_common_exception::Result; use databend_common_expression::local_block_meta_serde; use databend_common_expression::types::ArgType; -use databend_common_expression::types::ValueType; use databend_common_expression::BlockMetaInfo; use databend_common_expression::DataBlock; use databend_common_expression::Scalar; diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index f5c42c4e05053..4dbc98b2b1d8a 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -860,7 +860,7 @@ impl DefaultSettings { desc: "Sets the maximum byte size of blocks for recluster", mode: SettingMode::Both, scope: SettingScope::Both, - range: Some(SettingRange::Numeric(0..=u64::MAX)), + range: Some(SettingRange::Numeric(0..=80 * 1024 * 1024 * 1024)), }), ("compact_max_block_selection", DefaultSettingValue { value: UserSettingValue::UInt64(10000), diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 49473f1fd7032..50a701c31be93 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -28,6 +28,7 @@ use databend_common_expression::Column; use databend_common_expression::ColumnId; use databend_common_expression::ComputedExpr; use databend_common_expression::DataBlock; +use databend_common_expression::DataField; use databend_common_expression::FieldIndex; use databend_common_expression::TableField; use 
databend_common_expression::TableSchema; @@ -35,6 +36,8 @@ use databend_common_expression::TableSchemaRef; use databend_common_expression::ORIGIN_BLOCK_ROW_NUM_COLUMN_ID; use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; use databend_common_native::write::NativeWriter; +use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::MutationKind; use databend_storages_common_index::BloomIndex; use databend_storages_common_index::BloomIndexBuilder; use databend_storages_common_index::Index; @@ -367,17 +370,24 @@ impl StreamBlockProperties { pub fn try_create( ctx: Arc, table: &FuseTable, + kind: MutationKind, + level: Option, table_meta_timestamps: TableMetaTimestamps, ) -> Result> { // remove virtual computed fields. - let fields = table + let mut fields = table .schema() .fields() .iter() .filter(|f| !matches!(f.computed_expr(), Some(ComputedExpr::Virtual(_)))) .cloned() .collect::>(); - + if !matches!(kind, MutationKind::Insert | MutationKind::Replace) { + // add stream fields. + for stream_column in table.stream_columns().iter() { + fields.push(stream_column.table_field()); + } + } let source_schema = Arc::new(TableSchema { fields, ..table.schema().as_ref().clone() @@ -400,7 +410,7 @@ impl StreamBlockProperties { let inverted_index_builders = create_inverted_index_builders(&table.table_info.meta); let cluster_stats_builder = - ClusterStatisticsBuilder::try_create(table, ctx.clone(), &source_schema)?; + ClusterStatisticsBuilder::try_create(table, ctx.clone(), &source_schema, level)?; let mut stats_columns = vec![]; let mut distinct_columns = vec![]; @@ -437,4 +447,16 @@ impl StreamBlockProperties { self.block_thresholds .check_large_enough(num_rows, data_size) } + + pub fn cluster_operators(&self) -> Vec { + self.cluster_stats_builder.operators() + } + + pub fn fields_with_cluster_key(&self) -> Vec { + self.cluster_stats_builder.out_fields() + } + + pub fn cluster_key_index(&self) -> &Vec { + self.cluster_stats_builder.cluster_key_index() + } } diff --git a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs index 84bba6b663db1..a0bd91888995e 100644 --- a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs @@ -20,9 +20,9 @@ use databend_common_exception::Result; use databend_common_expression::Column; use databend_common_expression::ColumnRef; use databend_common_expression::DataBlock; +use databend_common_expression::DataField; use databend_common_expression::DataSchema; use databend_common_expression::Expr; -use databend_common_expression::FunctionContext; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; use databend_common_functions::aggregates::eval_aggr; @@ -35,12 +35,13 @@ use crate::FuseTable; #[derive(Default, Clone)] pub struct ClusterStatisticsBuilder { + out_fields: Vec, + level: i32, cluster_key_id: u32, cluster_key_index: Vec, extra_key_num: usize, operators: Vec, - func_ctx: FunctionContext, } impl ClusterStatisticsBuilder { @@ -48,6 +49,7 @@ impl ClusterStatisticsBuilder { table: &FuseTable, ctx: Arc, source_schema: &TableSchemaRef, + level: Option, ) -> Result> { let cluster_type = table.cluster_type(); if cluster_type.is_none_or(|v| v == ClusterType::Hilbert) { @@ -55,9 +57,9 @@ impl ClusterStatisticsBuilder { } let input_schema: Arc = DataSchema::from(source_schema).into(); - let input_filed_len = 
input_schema.fields.len(); + let mut out_fields = input_schema.fields().clone(); - let cluster_keys = table.linear_cluster_keys(ctx.clone()); + let cluster_keys = table.linear_cluster_keys(ctx); let mut cluster_key_index = Vec::with_capacity(cluster_keys.len()); let mut extra_key_num = 0; @@ -69,8 +71,11 @@ impl ClusterStatisticsBuilder { let index = match &expr { Expr::ColumnRef(ColumnRef { id, .. }) => *id, _ => { + let cname = format!("{}", expr); + out_fields.push(DataField::new(cname.as_str(), expr.data_type().clone())); exprs.push(expr); - let offset = input_filed_len + extra_key_num; + + let offset = out_fields.len() - 1; extra_key_num += 1; offset } @@ -90,14 +95,26 @@ impl ClusterStatisticsBuilder { cluster_key_id: table.cluster_key_meta.as_ref().unwrap().0, cluster_key_index, extra_key_num, - func_ctx: ctx.get_function_context()?, operators, + out_fields, + level: level.unwrap_or(0), })) } + + pub fn operators(&self) -> Vec { + self.operators.clone() + } + + pub fn out_fields(&self) -> Vec { + self.out_fields.clone() + } + + pub fn cluster_key_index(&self) -> &Vec { + &self.cluster_key_index + } } pub struct ClusterStatisticsState { - level: i32, mins: Vec, maxs: Vec, @@ -107,29 +124,23 @@ pub struct ClusterStatisticsState { impl ClusterStatisticsState { pub fn new(builder: Arc) -> Self { Self { - level: 0, mins: vec![], maxs: vec![], builder, } } - pub fn add_block(&mut self, input: DataBlock) -> Result { + pub fn add_block(&mut self, mut input: DataBlock) -> Result { if self.builder.cluster_key_index.is_empty() { return Ok(input); } let num_rows = input.num_rows(); - let mut block = self - .builder - .operators - .iter() - .try_fold(input, |input, op| op.execute(&self.builder.func_ctx, input))?; let cols = self .builder .cluster_key_index .iter() - .map(|&i| block.get_by_offset(i).to_column()) + .map(|&i| input.get_by_offset(i).to_column()) .collect(); let tuple = Column::Tuple(cols); let (min, _) = eval_aggr("min", vec![], &[tuple.clone()], num_rows, vec![])?; @@ -138,8 +149,8 @@ impl ClusterStatisticsState { assert_eq!(max.len(), 1); self.mins.push(min.index(0).unwrap().to_owned()); self.maxs.push(max.index(0).unwrap().to_owned()); - block.pop_columns(self.builder.extra_key_num); - Ok(block) + input.pop_columns(self.builder.extra_key_num); + Ok(input) } pub fn finalize(self, perfect: bool) -> Result> { @@ -167,7 +178,7 @@ impl ClusterStatisticsState { let level = if min == max && perfect { -1 } else { - self.level + self.builder.level }; Ok(Some(ClusterStatistics { diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 6c9269bd13620..f938768066ce1 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -37,6 +37,7 @@ use databend_common_sql::executor::physical_plans::MutationKind; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use databend_storages_common_table_meta::table::ClusterType; +use crate::io::StreamBlockProperties; use crate::operations::TransformBlockBuilder; use crate::operations::TransformBlockWriter; use crate::operations::TransformSerializeBlock; @@ -53,13 +54,34 @@ impl FuseTable { let enable_stream_block_write = ctx.get_settings().get_enable_block_stream_write()? 
&& self.storage_format_as_parquet(); if enable_stream_block_write { + let properties = StreamBlockProperties::try_create( + ctx.clone(), + self, + MutationKind::Insert, + None, + table_meta_timestamps, + )?; + + let cluster_operators = properties.cluster_operators(); + if !cluster_operators.is_empty() { + let num_input_columns = self.table_info.schema().num_fields(); + let func_ctx = ctx.get_function_context()?; + pipeline.add_transformer(move || { + CompoundBlockOperator::new( + cluster_operators.clone(), + func_ctx.clone(), + num_input_columns, + ) + }); + } + pipeline.add_transform(|input, output| { TransformBlockBuilder::try_create( ctx.clone(), input, output, self, - table_meta_timestamps, + properties.clone(), ) })?; @@ -70,7 +92,7 @@ impl FuseTable { let block_thresholds = self.get_block_thresholds(); build_compact_block_pipeline(pipeline, block_thresholds)?; - let schema = DataSchema::from(self.schema()).into(); + let schema = DataSchema::from(&self.schema().remove_virtual_computed_fields()).into(); let cluster_stats_gen = self.cluster_gen_for_append(ctx.clone(), pipeline, block_thresholds, Some(schema))?; pipeline.add_transform(|input, output| { @@ -103,7 +125,7 @@ impl FuseTable { let operators = cluster_stats_gen.operators.clone(); if !operators.is_empty() { - let num_input_columns = self.table_info.schema().fields().len(); + let num_input_columns = self.table_info.schema().num_fields(); let func_ctx2 = cluster_stats_gen.func_ctx.clone(); let mut builder = pipeline.try_create_transform_pipeline_builder_with_len( move || { @@ -162,7 +184,7 @@ impl FuseTable { let operators = cluster_stats_gen.operators.clone(); if !operators.is_empty() { - let num_input_columns = self.table_info.schema().fields().len(); + let num_input_columns = self.table_info.schema().num_fields(); let func_ctx2 = cluster_stats_gen.func_ctx.clone(); pipeline.add_transformer(move || { @@ -199,8 +221,9 @@ impl FuseTable { return Ok(ClusterStatsGenerator::default()); } - let input_schema = - modified_schema.unwrap_or(DataSchema::from(self.schema_with_stream()).into()); + let input_schema = modified_schema.unwrap_or( + DataSchema::from(&self.schema_with_stream().remove_virtual_computed_fields()).into(), + ); let mut merged = input_schema.fields().clone(); let cluster_keys = self.linear_cluster_keys(ctx.clone()); diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index 5af36b0a1f522..f5f61b8001c4d 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -34,7 +34,6 @@ use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_transforms::AsyncAccumulatingTransform; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storage::MutationStatus; -use databend_storages_common_table_meta::meta::TableMetaTimestamps; use opendal::Operator; use crate::io::BlockSerialization; @@ -77,13 +76,12 @@ impl TransformBlockBuilder { input: Arc, output: Arc, table: &FuseTable, - table_meta_timestamps: TableMetaTimestamps, + properties: Arc, ) -> Result { let max_block_rows = std::cmp::min( ctx.get_settings().get_max_block_size()? 
as usize, table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), ); - let properties = StreamBlockProperties::try_create(ctx, table, table_meta_timestamps)?; Ok(ProcessorPtr::create(Box::new(TransformBlockBuilder { state: State::Consume, input, From 8811b22e4a56681b5bcebdac77ded59a639bc056 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 7 Jun 2025 13:58:21 +0800 Subject: [PATCH 19/36] fix --- src/query/functions/src/scalars/hilbert.rs | 10 ++++---- .../recluster/recluster_sample_state.rs | 24 +++++++++++++++---- .../transform_range_partition_indexer.rs | 15 +++++++++--- 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/query/functions/src/scalars/hilbert.rs b/src/query/functions/src/scalars/hilbert.rs index d570b83740c1e..b57c6aa77a17a 100644 --- a/src/query/functions/src/scalars/hilbert.rs +++ b/src/query/functions/src/scalars/hilbert.rs @@ -257,17 +257,17 @@ pub fn register(registry: &mut FunctionRegistry) { /// /// # Example /// For boundaries [10, 20, 30]: -/// - Values < 10 get partition ID 0 -/// - Values >= 10 and < 20 get partition ID 1 -/// - Values >= 20 and < 30 get partition ID 2 -/// - Values >= 30 get partition ID 3 +/// - Values <= 10 get partition ID 0 +/// - Values > 10 and <= 20 get partition ID 1 +/// - Values > 20 and <= 30 get partition ID 2 +/// - Values > 30 get partition ID 3 fn calc_range_partition_id(val: ScalarRef, arr: &Column) -> u64 { let mut low = 0; let mut high = arr.len(); while low < high { let mid = low + ((high - low) / 2); let bound = unsafe { arr.index_unchecked(mid) }; - if val >= bound { + if val > bound { low = mid + 1; } else { high = mid; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs index 77c0fa0a9483f..12d50653b8b68 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs @@ -35,6 +35,7 @@ impl SampleState { completed_inputs: 0, values: vec![], bounds: vec![], + max_value: None, }), done: Arc::new(WatchNotify::new()), }) @@ -56,17 +57,22 @@ impl SampleState { Ok(()) } - pub fn get_bounds(&self) -> Vec + pub fn get_bounds(&self) -> (Vec, Option) where T: ArgType, T::Scalar: Ord, { let inner = self.inner.read().unwrap(); - inner + let bounds = inner .bounds .iter() .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())) - .collect() + .collect(); + let max_value = inner + .max_value + .as_ref() + .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())); + (bounds, max_value) } } @@ -76,6 +82,7 @@ pub struct SampleStateInner { completed_inputs: usize, bounds: Vec, + max_value: Option, values: Vec<(u64, Vec)>, } @@ -112,6 +119,9 @@ impl SampleStateInner { let col = T::upcast_column(T::column_from_vec(data.clone(), &[])); let indices = compare_columns(vec![col], total_samples)?; + let max_index = indices[total_samples - 1] as usize; + let max_val = data[max_index].clone(); + let mut cum_weight = 0.0; let mut target = step; let mut bounds = Vec::with_capacity(self.partitions - 1); @@ -126,7 +136,13 @@ impl SampleStateInner { if cum_weight >= target { let data = &data[idx]; if previous_bound.as_ref().is_none_or(|prev| data > prev) { - bounds.push(T::upcast_scalar(data.clone())); + if data == &max_val { + self.max_value = Some(T::upcast_scalar(max_val)); + break; + } + + let bound = 
T::upcast_scalar(data.clone()); + bounds.push(bound); target += step; j += 1; previous_bound = Some(data.clone()); diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 6b944a487664b..215e0c977eb8f 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -44,6 +44,7 @@ where T: ArgType input_data: Vec, output_data: VecDeque, bounds: Vec, + max_value: Option, } impl TransformRangePartitionIndexer @@ -63,6 +64,7 @@ where input_data: vec![], output_data: VecDeque::new(), bounds: vec![], + max_value: None, }) } } @@ -124,6 +126,7 @@ where fn process(&mut self) -> Result<()> { if let Some(mut block) = self.input_data.pop() { + let bound_len = self.bounds.len(); let num_rows = block.num_rows(); let last = block.get_last_column().clone(); block.pop_columns(1); @@ -132,12 +135,18 @@ where for index in 0..num_rows { let val = T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); + if self.max_value.as_ref().is_some_and(|v| val >= *v) { + let range_id = bound_len + 1; + builder.push(range_id as u64); + continue; + } + let mut low = 0; - let mut high = self.bounds.len(); + let mut high = bound_len; while low < high { let mid = low + ((high - low) / 2); let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); - if val >= bound { + if val > bound { low = mid + 1; } else { high = mid; @@ -158,7 +167,7 @@ where #[async_backtrace::framed] async fn async_process(&mut self) -> Result<()> { self.state.done.notified().await; - self.bounds = self.state.get_bounds::(); + (self.bounds, self.max_value) = self.state.get_bounds::(); Ok(()) } } From e395e1095289a424700f2289eb9feea32983fe35 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sun, 8 Jun 2025 09:38:38 +0800 Subject: [PATCH 20/36] fix --- .../fuse/operations/mutation/recluster_mutator.rs | 2 ++ .../mutation/mutator/recluster_mutator.rs | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs index b3e8cf59c5a65..bd091a35ef5f2 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs @@ -156,6 +156,7 @@ async fn test_recluster_mutator_block_select() -> Result<()> { cluster_key_id, 1, column_ids, + 1, ); let (_, parts) = mutator .target_select(compact_segments, ReclusterMode::Recluster) @@ -280,6 +281,7 @@ async fn test_safety_for_recluster() -> Result<()> { cluster_key_id, max_tasks, column_ids, + 500, )); let (mode, selected_segs) = mutator.select_segments(&compact_segments, 8)?; // select the blocks with the highest depth. 
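Note on the boundary semantics settled in the hunks above: the bounds act as inclusive upper limits, so `val > bound` advances the lower end of the binary search and a value equal to a bound falls into the partition that ends at it. A minimal standalone sketch of that search (illustrative only, not part of the patch; the free function name is hypothetical):

    fn range_partition_id(val: i64, bounds: &[i64]) -> usize {
        // Inclusive upper bounds: values <= bounds[0] map to 0,
        // values > the last bound map to bounds.len().
        let (mut low, mut high) = (0, bounds.len());
        while low < high {
            let mid = low + (high - low) / 2;
            if val > bounds[mid] {
                low = mid + 1;
            } else {
                high = mid;
            }
        }
        low
    }

    fn main() {
        let bounds = [10, 20, 30];
        // Matches the example in the `calc_range_partition_id` doc comment:
        // <= 10 -> 0, 11..=20 -> 1, 21..=30 -> 2, > 30 -> 3.
        assert_eq!(range_partition_id(10, &bounds), 0);
        assert_eq!(range_partition_id(15, &bounds), 1);
        assert_eq!(range_partition_id(30, &bounds), 2);
        assert_eq!(range_partition_id(31, &bounds), 3);
    }
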
diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index 3c50cd9f83153..0debc7375566c 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -72,6 +72,8 @@ pub struct ReclusterMutator { pub(crate) max_tasks: usize, pub(crate) cluster_key_types: Vec, pub(crate) column_ids: HashSet, + + average_size: usize, } impl ReclusterMutator { @@ -102,6 +104,13 @@ impl ReclusterMutator { // NOTE: The snapshot schema does not contain the stream column. let column_ids = snapshot.schema.to_leaf_column_id_set(); + let average_size = cmp::max( + snapshot + .summary + .uncompressed_byte_size + .div_ceil(snapshot.summary.block_count) as usize, + block_thresholds.max_bytes_per_block / 2, + ); Ok(Self { ctx, schema, @@ -111,6 +120,7 @@ impl ReclusterMutator { max_tasks, cluster_key_types, column_ids, + average_size, }) } @@ -125,6 +135,7 @@ impl ReclusterMutator { cluster_key_id: u32, max_tasks: usize, column_ids: HashSet, + average_size: usize, ) -> Self { Self { ctx, @@ -135,6 +146,7 @@ impl ReclusterMutator { max_tasks, cluster_key_types, column_ids, + average_size, } } @@ -196,8 +208,7 @@ impl ReclusterMutator { .get_recluster_block_size()? .min(avail_memory_usage * 30 / 100) as usize; // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. - let max_blocks_num = - (memory_threshold / self.block_thresholds.max_bytes_per_block).max(2) * self.max_tasks; + let max_blocks_num = (memory_threshold / self.average_size).max(2) * self.max_tasks; let block_per_seg = self.block_thresholds.block_per_segment; // Prepare task generation parameters From e6a2a253d4421d76f923b12c5e126290a779db70 Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 9 Jun 2025 11:21:11 +0800 Subject: [PATCH 21/36] fix test --- .../it/scalars/testdata/function_list.txt | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/query/functions/tests/it/scalars/testdata/function_list.txt b/src/query/functions/tests/it/scalars/testdata/function_list.txt index 06d4012daf92b..1119a04aa7373 100644 --- a/src/query/functions/tests/it/scalars/testdata/function_list.txt +++ b/src/query/functions/tests/it/scalars/testdata/function_list.txt @@ -115,6 +115,50 @@ Functions overloads: 1 add_months(Date NULL, Int64 NULL) :: Date NULL 2 add_months(Timestamp, Int64) :: Timestamp 3 add_months(Timestamp NULL, Int64 NULL) :: Timestamp NULL +0 add_noise(String) :: Binary +1 add_noise(String NULL) :: Binary NULL +2 add_noise(UInt8) :: Binary +3 add_noise(UInt8 NULL) :: Binary NULL +4 add_noise(UInt16) :: Binary +5 add_noise(UInt16 NULL) :: Binary NULL +6 add_noise(UInt32) :: Binary +7 add_noise(UInt32 NULL) :: Binary NULL +8 add_noise(UInt64) :: Binary +9 add_noise(UInt64 NULL) :: Binary NULL +10 add_noise(Int8) :: Binary +11 add_noise(Int8 NULL) :: Binary NULL +12 add_noise(Int16) :: Binary +13 add_noise(Int16 NULL) :: Binary NULL +14 add_noise(Int32) :: Binary +15 add_noise(Int32 NULL) :: Binary NULL +16 add_noise(Int64) :: Binary +17 add_noise(Int64 NULL) :: Binary NULL +18 add_noise(Float32) :: Binary +19 add_noise(Float32 NULL) :: Binary NULL +20 add_noise(Float64) :: Binary +21 add_noise(Float64 NULL) :: Binary NULL +22 add_noise(String, UInt64) :: Binary +23 add_noise(String NULL, UInt64 NULL) :: Binary NULL +24 add_noise(UInt8, UInt64) :: Binary +25 add_noise(UInt8 NULL, 
UInt64 NULL) :: Binary NULL +26 add_noise(UInt16, UInt64) :: Binary +27 add_noise(UInt16 NULL, UInt64 NULL) :: Binary NULL +28 add_noise(UInt32, UInt64) :: Binary +29 add_noise(UInt32 NULL, UInt64 NULL) :: Binary NULL +30 add_noise(UInt64, UInt64) :: Binary +31 add_noise(UInt64 NULL, UInt64 NULL) :: Binary NULL +32 add_noise(Int8, UInt64) :: Binary +33 add_noise(Int8 NULL, UInt64 NULL) :: Binary NULL +34 add_noise(Int16, UInt64) :: Binary +35 add_noise(Int16 NULL, UInt64 NULL) :: Binary NULL +36 add_noise(Int32, UInt64) :: Binary +37 add_noise(Int32 NULL, UInt64 NULL) :: Binary NULL +38 add_noise(Int64, UInt64) :: Binary +39 add_noise(Int64 NULL, UInt64 NULL) :: Binary NULL +40 add_noise(Float32, UInt64) :: Binary +41 add_noise(Float32 NULL, UInt64 NULL) :: Binary NULL +42 add_noise(Float64, UInt64) :: Binary +43 add_noise(Float64 NULL, UInt64 NULL) :: Binary NULL 0 add_quarters(Date, Int64) :: Date 1 add_quarters(Date NULL, Int64 NULL) :: Date NULL 2 add_quarters(Timestamp, Int64) :: Timestamp From fa6f023490fadea28534cacdc8245157430c4d78 Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 9 Jun 2025 20:17:55 +0800 Subject: [PATCH 22/36] fix --- .../fuse/src/operations/mutation/mutator/recluster_mutator.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index 0debc7375566c..85d275bcabef3 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -109,7 +109,7 @@ impl ReclusterMutator { .summary .uncompressed_byte_size .div_ceil(snapshot.summary.block_count) as usize, - block_thresholds.max_bytes_per_block / 2, + block_thresholds.min_bytes_per_block, ); Ok(Self { ctx, @@ -206,7 +206,7 @@ impl ReclusterMutator { settings.get_max_memory_usage()? - GLOBAL_MEM_STAT.get_memory_usage() as u64; let memory_threshold = settings .get_recluster_block_size()? - .min(avail_memory_usage * 30 / 100) as usize; + .min(avail_memory_usage * 50 / 100) as usize; // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. 
let max_blocks_num = (memory_threshold / self.average_size).max(2) * self.max_tasks; let block_per_seg = self.block_thresholds.block_per_segment; From 42ebabafcee4fcf60f2e192eae77256a1c3ea79b Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 9 Jun 2025 22:13:37 +0800 Subject: [PATCH 23/36] fix test --- src/query/storages/fuse/src/operations/recluster.rs | 4 ++++ .../base/09_fuse_engine/09_0008_fuse_optimize_table.test | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/query/storages/fuse/src/operations/recluster.rs b/src/query/storages/fuse/src/operations/recluster.rs index 4ce55ee9b9052..3cbf4cd1f922b 100644 --- a/src/query/storages/fuse/src/operations/recluster.rs +++ b/src/query/storages/fuse/src/operations/recluster.rs @@ -71,6 +71,10 @@ impl FuseTable { return Ok(None); }; + if snapshot.summary.block_count == 0 { + return Ok(None); + } + let mutator = Arc::new(ReclusterMutator::try_create( self, ctx.clone(), diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test index 43a2b262ca2f9..929e042c13122 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test @@ -510,7 +510,7 @@ select segment_count, block_count from fuse_snapshot('db_09_0008', 't9') limit 2 2 2 query I -select a from t9 +select a from t9 order by a ---- 1 2 From bd996f2d1f6dff0cc864a1d7c53aaf90cfd6c38e Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 10 Jun 2025 01:40:19 +0800 Subject: [PATCH 24/36] fix test --- .../expression/src/utils/block_thresholds.rs | 8 ++-- .../expression/tests/it/block_thresholds.rs | 6 +-- .../interpreter_table_recluster.rs | 2 +- .../pipelines/builders/builder_recluster.rs | 16 +++++--- .../recluster/range_bound_sampler.rs | 17 +++----- .../recluster/recluster_partition_strategy.rs | 40 +++++++++++++++---- .../recluster/transform_recluster_collect.rs | 4 +- src/query/settings/src/settings_default.rs | 8 ++-- .../settings/src/settings_getter_setter.rs | 4 +- .../fuse/src/io/write/stream/block_builder.rs | 4 +- .../src/io/write/stream/column_statistics.rs | 2 +- .../storages/fuse/src/operations/append.rs | 14 ++----- .../processors/transform_block_writer.rs | 26 +----------- .../mutation/mutator/recluster_mutator.rs | 19 ++++++--- .../09_0008_fuse_optimize_table.test | 2 +- 15 files changed, 88 insertions(+), 84 deletions(-) diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index 01c0631abe124..66cb1fbabc9e1 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -39,7 +39,7 @@ impl Default for BlockThresholds { max_bytes_per_block: DEFAULT_BLOCK_BUFFER_SIZE * 2, min_bytes_per_block: (DEFAULT_BLOCK_BUFFER_SIZE * 4).div_ceil(5), max_compressed_per_block: DEFAULT_BLOCK_COMPRESSED_SIZE, - min_compressed_per_block: (DEFAULT_BLOCK_COMPRESSED_SIZE * 4).div_ceil(5), + min_compressed_per_block: (DEFAULT_BLOCK_COMPRESSED_SIZE * 3).div_ceil(5), block_per_segment: DEFAULT_BLOCK_PER_SEGMENT, } } @@ -58,7 +58,7 @@ impl BlockThresholds { max_bytes_per_block: bytes_per_block * 2, min_bytes_per_block: (bytes_per_block * 4).div_ceil(5), max_compressed_per_block, - min_compressed_per_block: (max_compressed_per_block * 4).div_ceil(5), + min_compressed_per_block: (max_compressed_per_block * 3).div_ceil(5), block_per_segment, } } 
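A condensed sketch of the clamping rule these hunks tune, as free functions for illustration only (not the BlockThresholds API; the oversized case matches the hunk below):

// Sketch: cap the per-block byte limits before deriving a block count.
fn capped_limits(min_bytes_per_block: usize, max_bytes_per_block: usize) -> (usize, usize) {
    (
        min_bytes_per_block.min(100 * 1024 * 1024),
        max_bytes_per_block.min(400 * 1024 * 1024),
    )
}

// Case 1: blocks would be too large, so split the total by the capped block size.
fn blocks_when_oversized(total_bytes: usize, max_bytes_per_block: usize) -> usize {
    total_bytes.div_ceil(max_bytes_per_block)
}
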
@@ -153,7 +153,7 @@ impl BlockThresholds { let bytes_per_block = total_bytes.div_ceil(block_num_by_compressed); // Adjust the number of blocks based on block size thresholds. let max_bytes_per_block = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes_per_block = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); + let min_bytes_per_block = self.min_bytes_per_block.min(100 * 1024 * 1024); let block_nums = if bytes_per_block > max_bytes_per_block { // Case 1: If the block size is too bigger. total_bytes.div_ceil(max_bytes_per_block) @@ -201,7 +201,7 @@ impl BlockThresholds { // Adjust block count based on byte size thresholds. let bytes_per_block = total_bytes.div_ceil(by_compressed); let max_bytes = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); + let min_bytes = self.min_bytes_per_block.min(100 * 1024 * 1024); let total_partitions = if bytes_per_block > max_bytes { // Block size is too large. total_bytes / max_bytes diff --git a/src/query/expression/tests/it/block_thresholds.rs b/src/query/expression/tests/it/block_thresholds.rs index 33b1d1b0ff394..b7409208386aa 100644 --- a/src/query/expression/tests/it/block_thresholds.rs +++ b/src/query/expression/tests/it/block_thresholds.rs @@ -105,8 +105,8 @@ fn test_calc_rows_for_recluster() { assert_eq!(result, 300); // Case 2: If the block size is too smaller. - let result = t.calc_rows_for_recluster(4_000, 2_000_000, 600_000); - assert_eq!(result, 800); + let result = t.calc_rows_for_recluster(4_000, 1_600_000, 600_000); + assert_eq!(result, 2000); // Case 3: use the compressed-based block count. let result = t.calc_rows_for_recluster(4_000, 10_000_000, 600_000); @@ -131,7 +131,7 @@ fn test_calc_partitions_for_recluster() { assert_eq!(result, 15); // Case 2: If the block size is too smaller. - let result = t.calc_partitions_for_recluster(4_000, 800_000, 800_000); + let result = t.calc_partitions_for_recluster(4_000, 1_600_000, 800_000); assert_eq!(result, 2); // Case 3: use the compressed-based block count. diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 237a2252ea067..602e58df1670a 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -631,7 +631,7 @@ impl ReclusterTableInterpreter { let database = &self.plan.database; let table = &self.plan.table; let settings = self.ctx.get_settings(); - let sample_size = settings.get_hilbert_sample_size_per_block()?; + let sample_size = settings.get_recluster_sample_size_per_block()?; let name_resolution_ctx = NameResolutionContext::try_from(settings.as_ref())?; let ast_exprs = tbl.resolve_cluster_keys(self.ctx.clone()).unwrap(); diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index a5f39011080d8..cbcde2069c441 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -187,7 +187,11 @@ impl PipelineBuilder { .collect(); let num_processors = self.main_pipeline.output_len(); - let sample_rate = 0.01; + let sample_size = self + .ctx + .get_settings() + .get_recluster_sample_size_per_block()? 
+ as usize; let partitions = block_thresholds.calc_partitions_for_recluster( task.total_rows, task.total_bytes, @@ -195,7 +199,7 @@ impl PipelineBuilder { ); let state = SampleState::new(num_processors, partitions); let recluster_pipeline_builder = - ReclusterPipelineBuilder::create(schema, sort_descs.into(), sample_rate) + ReclusterPipelineBuilder::create(schema, sort_descs.into(), sample_size) .with_state(state); recluster_pipeline_builder .build_recluster_sample_pipeline(&mut self.main_pipeline)?; @@ -314,7 +318,7 @@ struct ReclusterPipelineBuilder { schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, state: Option>, - sample_rate: f64, + sample_size: usize, seed: u64, } @@ -322,13 +326,13 @@ impl ReclusterPipelineBuilder { fn create( schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, - sample_rate: f64, + sample_size: usize, ) -> Self { Self { schema, sort_desc, state: None, - sample_rate, + sample_size, seed: rand::random(), } } @@ -382,7 +386,7 @@ impl ReclusterPipelineBuilder { })?; let offset = self.schema.num_fields(); pipeline.add_accumulating_transformer(|| { - TransformReclusterCollect::::new(offset, self.sample_rate, self.seed) + TransformReclusterCollect::::new(offset, self.sample_size, self.seed) }); pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(TransformRangePartitionIndexer::< diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs index 0dfee36475b36..a76417256416f 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs @@ -26,7 +26,7 @@ pub struct RangeBoundSampler where T: ValueType { offset: usize, - sample_rate: f64, + sample_size: usize, rng: SmallRng, values: Vec<(u64, Vec)>, @@ -36,11 +36,11 @@ where T: ValueType impl RangeBoundSampler where T: ValueType { - pub fn new(offset: usize, sample_rate: f64, seed: u64) -> Self { + pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self { let rng = SmallRng::seed_from_u64(seed); Self { offset, - sample_rate, + sample_size, rng, values: vec![], _t: PhantomData, @@ -58,15 +58,10 @@ where assert!(rows > 0); let column = data.get_by_offset(self.offset).to_column(rows); - let sample_size = std::cmp::max((self.sample_rate * rows as f64).ceil() as usize, 100); + let sample_size = std::cmp::min(self.sample_size, rows); let mut indices = (0..rows).collect::>(); - - let sampled_indices = if rows > sample_size { - indices.shuffle(&mut self.rng); - &indices[..sample_size] - } else { - &indices - }; + indices.shuffle(&mut self.rng); + let sampled_indices = &indices[..sample_size]; let column = T::try_downcast_column(&column).unwrap(); let sample_values = sampled_indices diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs index 7f478c94b8d43..269aae2e3abea 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -32,6 +32,10 @@ impl ReclusterPartitionStrategy { pub fn new(properties: Arc) -> Self { Self { properties } } + + fn concat_blocks(blocks: Vec) -> Result { + DataBlock::concat(&blocks) + } } impl 
PartitionProcessStrategy for ReclusterPartitionStrategy { @@ -51,22 +55,44 @@ impl PartitionProcessStrategy for ReclusterPartitionStrategy { /// Stream write each block, and flush it conditionally based on builder status /// and input size estimation. fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - let mut input_sizes: usize = data_blocks.iter().map(|b| b.estimate_block_size()).sum(); - let mut input_rows: usize = data_blocks.iter().map(|b| b.num_rows()).sum(); + let blocks_num = data_blocks.len(); + let mut accumulated_rows = 0; + let mut accumulated_bytes = 0; + let mut pending_blocks = Vec::with_capacity(blocks_num); + let mut staged_blocks = Vec::with_capacity(blocks_num); + let mut compacted = Vec::with_capacity(blocks_num); + for block in data_blocks { + accumulated_rows += block.num_rows(); + accumulated_bytes += block.estimate_block_size(); + pending_blocks.push(block); + if !self + .properties + .check_large_enough(accumulated_rows, accumulated_bytes) + { + continue; + } + if !staged_blocks.is_empty() { + compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + std::mem::swap(&mut staged_blocks, &mut pending_blocks); + accumulated_rows = 0; + accumulated_bytes = 0; + } + staged_blocks.append(&mut pending_blocks); + if !staged_blocks.is_empty() { + compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } let mut result = Vec::new(); let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; - for block in data_blocks { - input_sizes -= block.estimate_block_size(); - input_rows -= block.num_rows(); + for block in compacted { builder.write(block)?; - if builder.need_flush() && self.properties.check_large_enough(input_rows, input_sizes) { + if builder.need_flush() { let serialized = builder.finish()?; result.push(DataBlock::empty_with_meta(Box::new(serialized))); builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; } } - if !builder.is_empty() { let serialized = builder.finish()?; result.push(DataBlock::empty_with_meta(Box::new(serialized))); diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs index 3900fd81db6d7..46684b42b31e3 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs @@ -36,10 +36,10 @@ where T: ArgType + Send + Sync, T::Scalar: Ord + Send, { - pub fn new(offset: usize, sample_rate: f64, seed: u64) -> Self { + pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self { Self { input_data: vec![], - sampler: RangeBoundSampler::::new(offset, sample_rate, seed), + sampler: RangeBoundSampler::::new(offset, sample_size, seed), } } } diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 4dbc98b2b1d8a..a686891024a90 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -755,7 +755,7 @@ impl DefaultSettings { range: Some(SettingRange::Numeric(0..=1)), }), ("enable_distributed_compact", DefaultSettingValue { - value: UserSettingValue::UInt64(0), + value: UserSettingValue::UInt64(1), desc: "Enables distributed execution of table compaction.", mode: SettingMode::Both, scope: SettingScope::Both, @@ -870,7 +870,7 @@ impl DefaultSettings { range: 
Some(SettingRange::Numeric(2..=u64::MAX)), }), ("enable_distributed_recluster", DefaultSettingValue { - value: UserSettingValue::UInt64(0), + value: UserSettingValue::UInt64(1), desc: "Enable distributed execution of table recluster.", mode: SettingMode::Both, scope: SettingScope::Both, @@ -1220,9 +1220,9 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(1..=65535)), }), - ("hilbert_sample_size_per_block", DefaultSettingValue { + ("recluster_sample_size_per_block", DefaultSettingValue { value: UserSettingValue::UInt64(1000), - desc: "Specifies the number of sample points per block used in Hilbert clustering.", + desc: "Specifies the number of sample points per block used in clustering.", mode: SettingMode::Both, scope: SettingScope::Both, range: Some(SettingRange::Numeric(1..=u64::MAX)), diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index a3ef08cc9264b..89061370021da 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -889,8 +889,8 @@ impl Settings { self.try_get_u64("hilbert_num_range_ids") } - pub fn get_hilbert_sample_size_per_block(&self) -> Result { - self.try_get_u64("hilbert_sample_size_per_block") + pub fn get_recluster_sample_size_per_block(&self) -> Result { + self.try_get_u64("recluster_sample_size_per_block") } pub fn get_hilbert_clustering_min_bytes(&self) -> Result { diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 50a701c31be93..30d3b2b45543d 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -238,9 +238,9 @@ impl StreamBlockBuilder { pub fn need_flush(&self) -> bool { let file_size = self.block_writer.compressed_size(); self.row_count >= self.properties.block_thresholds.min_rows_per_block - || self.block_size >= self.properties.block_thresholds.max_bytes_per_block + || self.block_size >= self.properties.block_thresholds.min_bytes_per_block * 2 || (file_size >= self.properties.block_thresholds.min_compressed_per_block - && self.block_size >= self.properties.block_thresholds.min_bytes_per_block / 2) + && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) } pub fn write(&mut self, block: DataBlock) -> Result<()> { diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs index 4c0e3cd715227..8df60aa61f03f 100644 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs @@ -160,7 +160,7 @@ fn column_update_hll_cardinality(col: &Column, ty: &DataType, hll: &mut ColumnDi let col = col.as_nullable().unwrap(); for (i, v) in col.validity.iter().enumerate() { if v { - let scalar = col.column.index(i).unwrap(); + let scalar = unsafe { col.column.index_unchecked(i) }; scalar_update_hll_cardinality(&scalar, inner, hll); } } diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index f938768066ce1..fc14e9589071e 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -51,6 +51,9 @@ impl FuseTable { pipeline: &mut Pipeline, table_meta_timestamps: TableMetaTimestamps, ) -> Result<()> { + let block_thresholds = 
self.get_block_thresholds(); + build_compact_block_pipeline(pipeline, block_thresholds)?; + let enable_stream_block_write = ctx.get_settings().get_enable_block_stream_write()? && self.storage_format_as_parquet(); if enable_stream_block_write { @@ -76,22 +79,13 @@ impl FuseTable { } pipeline.add_transform(|input, output| { - TransformBlockBuilder::try_create( - ctx.clone(), - input, - output, - self, - properties.clone(), - ) + TransformBlockBuilder::try_create(input, output, properties.clone()) })?; pipeline.add_async_accumulating_transformer(|| { TransformBlockWriter::create(ctx.clone(), MutationKind::Insert, self, false) }); } else { - let block_thresholds = self.get_block_thresholds(); - build_compact_block_pipeline(pipeline, block_thresholds)?; - let schema = DataSchema::from(&self.schema().remove_virtual_computed_fields()).into(); let cluster_stats_gen = self.cluster_gen_for_append(ctx.clone(), pipeline, block_thresholds, Some(schema))?; diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index f5f61b8001c4d..52a4e309560c8 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -24,7 +24,6 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; -use databend_common_io::constants::DEFAULT_BLOCK_ROW_COUNT; use databend_common_metrics::storage::metrics_inc_recluster_write_block_nums; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::InputPort; @@ -43,7 +42,6 @@ use crate::io::StreamBlockProperties; use crate::operations::MutationLogEntry; use crate::operations::MutationLogs; use crate::FuseTable; -use crate::FUSE_OPT_KEY_ROW_PER_BLOCK; enum State { Consume, @@ -59,7 +57,6 @@ pub struct TransformBlockBuilder { output: Arc, properties: Arc, - max_block_rows: usize, builder: Option, need_flush: bool, @@ -72,16 +69,10 @@ pub struct TransformBlockBuilder { impl TransformBlockBuilder { pub fn try_create( - ctx: Arc, input: Arc, output: Arc, - table: &FuseTable, properties: Arc, ) -> Result { - let max_block_rows = std::cmp::min( - ctx.get_settings().get_max_block_size()? 
as usize, - table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), - ); Ok(ProcessorPtr::create(Box::new(TransformBlockBuilder { state: State::Consume, input, @@ -93,7 +84,6 @@ impl TransformBlockBuilder { input_data_size: 0, input_num_rows: 0, output_data: None, - max_block_rows, }))) } @@ -105,18 +95,6 @@ impl TransformBlockBuilder { } Ok(self.builder.as_mut().unwrap()) } - - fn calc_max_block_rows(&self, block: &DataBlock) -> usize { - let min_bytes_per_block = self.properties.block_thresholds.min_bytes_per_block; - let block_size = block.estimate_block_size(); - if block_size < min_bytes_per_block { - return self.max_block_rows; - } - let num_rows = block.num_rows(); - let average_row_size = block_size.div_ceil(num_rows); - let max_rows = min_bytes_per_block.div_ceil(average_row_size); - self.max_block_rows.min(max_rows) - } } #[async_trait] @@ -191,9 +169,7 @@ impl Processor for TransformBlockBuilder { block.check_valid()?; self.input_data_size += block.estimate_block_size(); self.input_num_rows += block.num_rows(); - let max_rows_per_block = self.calc_max_block_rows(&block); - let blocks = block.split_by_rows_no_tail(max_rows_per_block); - self.input_data.extend(blocks); + self.input_data.push_back(block); } State::Serialize => { while let Some(b) = self.input_data.pop_front() { diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index 85d275bcabef3..792d4f0c1e85d 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -206,9 +206,9 @@ impl ReclusterMutator { settings.get_max_memory_usage()? - GLOBAL_MEM_STAT.get_memory_usage() as u64; let memory_threshold = settings .get_recluster_block_size()? - .min(avail_memory_usage * 50 / 100) as usize; + .min(avail_memory_usage * 40 / 100) as usize; // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. 
- let max_blocks_num = (memory_threshold / self.average_size).max(2) * self.max_tasks; + let mut max_blocks_per_task = (memory_threshold / self.average_size).max(2); let block_per_seg = self.block_thresholds.block_per_segment; // Prepare task generation parameters @@ -276,8 +276,11 @@ impl ReclusterMutator { } // Select blocks for reclustering based on depth threshold and max block size - let mut selected_idx = - self.fetch_max_depth(points_map, self.depth_threshold, max_blocks_num)?; + let mut selected_idx = self.fetch_max_depth( + points_map, + self.depth_threshold, + max_blocks_per_task * self.max_tasks, + )?; if selected_idx.is_empty() { if level != 0 || small_blocks.len() < 2 { continue; @@ -291,13 +294,19 @@ impl ReclusterMutator { let mut task_compressed = 0; let mut task_indices = Vec::new(); let mut selected_blocks = Vec::new(); + if selected_idx.len() > max_blocks_per_task { + max_blocks_per_task = selected_idx.len().div_ceil(self.max_tasks).max(10); + } for idx in selected_idx { let block = blocks[idx].clone(); let block_size = block.block_size as usize; let row_count = block.row_count as usize; + let selected_len = selected_blocks.len(); // If memory threshold exceeded, generate a new task and reset accumulators - if task_bytes + block_size > memory_threshold && selected_blocks.len() > 1 { + if selected_len > max_blocks_per_task + || (task_bytes + block_size > memory_threshold && selected_len > 1) + { selected_blocks_idx.extend(std::mem::take(&mut task_indices)); tasks.push(self.generate_task( diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test index 929e042c13122..176d17124c5d5 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test @@ -512,10 +512,10 @@ select segment_count, block_count from fuse_snapshot('db_09_0008', 't9') limit 2 query I select a from t9 order by a ---- +-5 1 2 4 --5 statement ok insert into t9 values(-3) From ad818da956e16289d0effd37d6be4b34f5e06eff Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 12 Jun 2025 21:27:22 +0800 Subject: [PATCH 25/36] improve recluster partition --- .../pipelines/builders/builder_recluster.rs | 28 +-- .../processors/transforms/recluster/mod.rs | 2 + .../transform_recluster_partition.rs | 230 ++++++++++++++++++ .../processors/transform_block_writer.rs | 1 + .../mutation/mutator/recluster_mutator.rs | 2 +- 5 files changed, 243 insertions(+), 20 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index cbcde2069c441..853ccbc2b2fef 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -46,7 +46,6 @@ use databend_common_pipeline_transforms::sort::RowConverter; use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::sort::SimpleRowConverter; use databend_common_pipeline_transforms::sort::SimpleRowsAsc; -use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::evaluator::CompoundBlockOperator; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::Recluster; @@ -61,13 
+60,12 @@ use match_template::match_template; use crate::pipelines::builders::SortPipelineBuilder; use crate::pipelines::processors::transforms::ReclusterPartitionExchange; -use crate::pipelines::processors::transforms::ReclusterPartitionStrategy; use crate::pipelines::processors::transforms::SampleState; use crate::pipelines::processors::transforms::TransformAddOrderColumn; use crate::pipelines::processors::transforms::TransformAddStreamColumns; -use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::processors::transforms::TransformRangePartitionIndexer; use crate::pipelines::processors::transforms::TransformReclusterCollect; +use crate::pipelines::processors::transforms::TransformReclusterPartition; use crate::pipelines::PipelineBuilder; impl PipelineBuilder { @@ -209,23 +207,15 @@ impl PipelineBuilder { ReclusterPartitionExchange::create(0, partitions), ); let processor_id = AtomicUsize::new(0); - let settings = self.ctx.get_settings(); - let memory_settings = MemorySettings::disable_spill(); self.main_pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create(Box::new( - TransformPartitionCollect::new( - self.ctx.clone(), - input, - output, - &settings, - processor_id.fetch_add(1, atomic::Ordering::AcqRel), - num_processors, - partitions, - memory_settings.clone(), - None, - ReclusterPartitionStrategy::new(properties.clone()), - )?, - ))) + TransformReclusterPartition::try_create( + input, + output, + properties.clone(), + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partitions, + ) })?; self.main_pipeline.add_async_accumulating_transformer(|| { diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index 0f3612043c7d9..a024e330be25b 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -19,6 +19,7 @@ mod recluster_sample_state; mod transform_add_order_column; mod transform_range_partition_indexer; mod transform_recluster_collect; +mod transform_recluster_partition; pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; @@ -29,3 +30,4 @@ pub use transform_add_order_column::TransformAddOrderColumn; pub use transform_range_partition_indexer::TransformRangePartitionIndexer; pub use transform_recluster_collect::ReclusterSampleMeta; pub use transform_recluster_collect::TransformReclusterCollect; +pub use transform_recluster_partition::TransformReclusterPartition; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs new file mode 100644 index 0000000000000..9bec274cb90f6 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs @@ -0,0 +1,230 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_storages_fuse::io::StreamBlockBuilder; +use databend_common_storages_fuse::io::StreamBlockProperties; + +use crate::pipelines::processors::transforms::WindowPartitionMeta; + +enum Step { + Consume, + Collect, + Flush, +} + +struct PartitionData { + builder: Option, + data_blocks: Vec, + block_size: usize, + block_rows: usize, +} + +impl PartitionData { + fn new() -> Self { + Self { + builder: None, + data_blocks: vec![], + block_size: 0, + block_rows: 0, + } + } + + fn is_empty(&self) -> bool { + self.builder.as_ref().is_none_or(|v| v.is_empty()) && self.data_blocks.is_empty() + } +} + +pub struct TransformReclusterPartition { + input: Arc, + output: Arc, + + properties: Arc, + + // The partition id is used to map the partition id to the new partition id. + partition_id: Vec, + partition_data: Vec, + output_data: VecDeque, + + step: Step, +} + +impl TransformReclusterPartition { + pub fn try_create( + input: Arc, + output: Arc, + properties: Arc, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Result { + let partitions = (0..num_partitions) + .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) + .collect::>(); + let mut partition_id = vec![0; num_partitions]; + let mut partition_data = Vec::with_capacity(num_partitions); + for (new_partition_id, partition) in partitions.iter().enumerate() { + partition_id[*partition] = new_partition_id; + partition_data.push(PartitionData::new()); + } + Ok(ProcessorPtr::create(Box::new( + TransformReclusterPartition { + input, + output, + properties, + partition_id, + partition_data, + output_data: VecDeque::new(), + step: Step::Consume, + }, + ))) + } +} + +impl Processor for TransformReclusterPartition { + fn name(&self) -> String { + "TransformReclusterPartition".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if matches!(self.step, Step::Collect | Step::Flush) { + return Ok(Event::Sync); + } + + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.is_finished() { + if !self.partition_data.is_empty() { + self.step = Step::Flush; + return Ok(Event::Sync); + } + self.output.finish(); + return Ok(Event::Finished); + } + + if self.input.has_data() { + self.step = Step::Collect; + return Ok(Event::Sync); + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.step, Step::Consume) { + Step::Collect => { + let data_block = self.input.pull_data().unwrap()?; + if let Some(meta) = data_block + .get_owned_meta() + .and_then(WindowPartitionMeta::downcast_from) + { + for 
(partition_id, data_block) in meta.partitioned_data.into_iter() { + if data_block.is_empty() { + continue; + } + + let new_id = self.partition_id[partition_id]; + let partition_data = + unsafe { self.partition_data.get_unchecked_mut(new_id) }; + if partition_data.builder.is_none() { + partition_data.builder = Some(StreamBlockBuilder::try_new_with_config( + self.properties.clone(), + )?); + } + let builder = partition_data.builder.as_mut().unwrap(); + if !builder.need_flush() { + builder.write(data_block)?; + } else { + partition_data.block_size += data_block.estimate_block_size(); + partition_data.block_rows += data_block.num_rows(); + partition_data.data_blocks.push(data_block); + + if self.properties.check_large_enough( + partition_data.block_rows, + partition_data.block_size, + ) { + let builder = partition_data.builder.take().unwrap(); + let serialized = builder.finish()?; + self.output_data + .push_back(DataBlock::empty_with_meta(Box::new(serialized))); + + let mut builder = StreamBlockBuilder::try_new_with_config( + self.properties.clone(), + )?; + for block in + std::mem::take(&mut partition_data.data_blocks).into_iter() + { + builder.write(block)?; + } + partition_data.builder = Some(builder); + partition_data.block_rows = 0; + partition_data.block_size = 0; + } + } + } + } + } + Step::Flush => { + while let Some(mut partition_data) = self.partition_data.pop() { + if partition_data.is_empty() { + continue; + } + + let mut builder = if partition_data.builder.is_none() { + StreamBlockBuilder::try_new_with_config(self.properties.clone())? + } else { + partition_data.builder.take().unwrap() + }; + for block in partition_data.data_blocks { + builder.write(block)?; + } + let serialized = builder.finish()?; + self.output_data + .push_back(DataBlock::empty_with_meta(Box::new(serialized))); + break; + } + } + _ => unreachable!(), + } + Ok(()) + } +} diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index 52a4e309560c8..ea5b4b9dc5c0c 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -116,6 +116,7 @@ impl Processor for TransformBlockBuilder { } if self.output.is_finished() { + self.input.finish(); return Ok(Event::Finished); } diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index 792d4f0c1e85d..be706f9d123b9 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -206,7 +206,7 @@ impl ReclusterMutator { settings.get_max_memory_usage()? - GLOBAL_MEM_STAT.get_memory_usage() as u64; let memory_threshold = settings .get_recluster_block_size()? - .min(avail_memory_usage * 40 / 100) as usize; + .min(avail_memory_usage * 30 / 100) as usize; // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. 
let mut max_blocks_per_task = (memory_threshold / self.average_size).max(2); let block_per_seg = self.block_thresholds.block_per_segment; From 22c37b30acd14ea99aa069aa93b236786282d0a1 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 13 Jun 2025 11:36:08 +0800 Subject: [PATCH 26/36] for test --- .../recluster/recluster_partition_exchange.rs | 3 +++ .../recluster/transform_range_partition_indexer.rs | 5 +++++ .../recluster/transform_recluster_partition.rs | 10 ++++++++++ 3 files changed, 18 insertions(+) diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index dd5257850ac9f..9e25119f2d15a 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::sync::Arc; +use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::DataBlock; @@ -34,6 +35,7 @@ impl ReclusterPartitionExchange { impl Exchange for ReclusterPartitionExchange { const NAME: &'static str = "Recluster"; fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { + let start = Instant::now(); let range_ids = data_block .get_last_column() .as_number() @@ -58,6 +60,7 @@ impl Exchange for ReclusterPartitionExchange { output_data_blocks[target].push((partition_id, block)); } } + log::info!("Recluster range exchange: {:?}", start.elapsed()); // Union data blocks for each processor. Ok(output_data_blocks diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 215e0c977eb8f..96027933631cb 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -15,6 +15,7 @@ use std::any::Any; use std::collections::VecDeque; use std::sync::Arc; +use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::types::ArgType; @@ -45,6 +46,8 @@ where T: ArgType output_data: VecDeque, bounds: Vec, max_value: Option, + + start: Instant, } impl TransformRangePartitionIndexer @@ -65,6 +68,7 @@ where output_data: VecDeque::new(), bounds: vec![], max_value: None, + start: Instant::now(), }) } } @@ -121,6 +125,7 @@ where .expect("require a ReclusterSampleMeta"); self.input_data = meta.blocks; self.state.merge_sample::(meta.sample_values)?; + log::info!("Recluster range partition: {:?}", self.start.elapsed()); Ok(Event::Async) } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs index 9bec274cb90f6..1f8749dd4e647 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs @@ -15,6 +15,7 @@ use std::any::Any; use std::collections::VecDeque; use std::sync::Arc; +use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; @@ -68,6 +69,9 @@ 
pub struct TransformReclusterPartition { partition_data: Vec, output_data: VecDeque, + start: Instant, + cnt: usize, + step: Step, } @@ -98,6 +102,8 @@ impl TransformReclusterPartition { partition_data, output_data: VecDeque::new(), step: Step::Consume, + start: Instant::now(), + cnt: 0, }, ))) } @@ -133,6 +139,10 @@ impl Processor for TransformReclusterPartition { if self.input.is_finished() { if !self.partition_data.is_empty() { + if self.cnt == 0 { + log::info!("Recluster: start flush: {:?}", self.start.elapsed()); + } + self.cnt += 1; self.step = Step::Flush; return Ok(Event::Sync); } From 27fa7db6685d3aef628f918318eb548151170388 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 13 Jun 2025 13:20:16 +0800 Subject: [PATCH 27/36] for test --- .../recluster/recluster_partition_exchange.rs | 10 +++++--- .../transform_recluster_partition.rs | 6 +++++ .../mode/standalone/explain/window.test | 24 +++++++++---------- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index 9e25119f2d15a..444c81296de26 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -24,18 +24,22 @@ use crate::pipelines::processors::transforms::WindowPartitionMeta; pub struct ReclusterPartitionExchange { start: u64, width: usize, + start_time: Instant, } impl ReclusterPartitionExchange { pub fn create(start: u64, width: usize) -> Arc { - Arc::new(ReclusterPartitionExchange { start, width }) + Arc::new(ReclusterPartitionExchange { + start, + width, + start_time: Instant::now(), + }) } } impl Exchange for ReclusterPartitionExchange { const NAME: &'static str = "Recluster"; fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { - let start = Instant::now(); let range_ids = data_block .get_last_column() .as_number() @@ -60,7 +64,7 @@ impl Exchange for ReclusterPartitionExchange { output_data_blocks[target].push((partition_id, block)); } } - log::info!("Recluster range exchange: {:?}", start.elapsed()); + log::info!("Recluster range exchange: {:?}", self.start_time.elapsed()); // Union data blocks for each processor. 
Ok(output_data_blocks diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs index 1f8749dd4e647..5a3e19d2b0e3e 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs @@ -162,6 +162,7 @@ impl Processor for TransformReclusterPartition { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.step, Step::Consume) { Step::Collect => { + let start_cost = self.start.elapsed(); let data_block = self.input.pull_data().unwrap()?; if let Some(meta) = data_block .get_owned_meta() @@ -212,6 +213,11 @@ impl Processor for TransformReclusterPartition { } } } + log::info!( + "Recluster: start collect: {:?}, end: {:?}", + start_cost, + self.start.elapsed() + ); } Step::Flush => { while let Some(mut partition_data) = self.partition_data.pop() { diff --git a/tests/sqllogictests/suites/mode/standalone/explain/window.test b/tests/sqllogictests/suites/mode/standalone/explain/window.test index 9846febbd9819..11943d90e42eb 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/window.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/window.test @@ -59,7 +59,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "Resize" ] 8 [ label = "SortPartialTransform" ] @@ -108,7 +108,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "Resize" ] 8 [ label = "SortPartialTransform" ] @@ -429,7 +429,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -457,7 +457,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -486,7 +486,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -510,7 +510,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = 
"LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -534,7 +534,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -559,7 +559,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "Resize" ] 8 [ label = "SortPartialTransform" ] @@ -610,7 +610,7 @@ digraph { 4 [ label = "TransformFilter" ] 5 [ label = "ShufflePartition(Window)" ] 6 [ label = "ShuffleMergePartition(Window)" ] - 7 [ label = "TransformWindowPartitionCollect(Sort)" ] + 7 [ label = "TransformPartitionCollect(Window)" ] 8 [ label = "Transform Window" ] 9 [ label = "Resize" ] 10 [ label = "SortPartialTransform" ] @@ -708,7 +708,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(WindowTopN)" ] 4 [ label = "ShuffleMergePartition(WindowTopN)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "TransformFilter" ] 8 [ label = "LimitTransform" ] @@ -786,12 +786,12 @@ digraph { 1 [ label = "CompoundBlockOperator(Map)" ] 2 [ label = "ShufflePartition(Window)" ] 3 [ label = "ShuffleMergePartition(Window)" ] - 4 [ label = "TransformWindowPartitionCollect(Sort)" ] + 4 [ label = "TransformPartitionCollect(Window)" ] 5 [ label = "Transform Window" ] 6 [ label = "CompoundBlockOperator(Map)" ] 7 [ label = "ShufflePartition(Window)" ] 8 [ label = "ShuffleMergePartition(Window)" ] - 9 [ label = "TransformWindowPartitionCollect(Sort)" ] + 9 [ label = "TransformPartitionCollect(Window)" ] 10 [ label = "Transform Window" ] 11 [ label = "CompoundBlockOperator(Project)" ] 0 -> 1 [ label = "" ] From 8875f5372b05a66290fdd08d770beb6370a51200 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 14 Jun 2025 00:49:24 +0800 Subject: [PATCH 28/36] for test --- .../transform_range_partition_indexer.rs | 68 ++++++++++--------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 96027933631cb..0f8439140f66c 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -130,42 +130,46 @@ where } fn process(&mut self) -> Result<()> { - if let Some(mut block) = self.input_data.pop() { - let bound_len = self.bounds.len(); - let num_rows = block.num_rows(); - let last = block.get_last_column().clone(); - block.pop_columns(1); - let mut builder = Vec::with_capacity(num_rows); - let last_col = T::try_downcast_column(&last.remove_nullable()).unwrap(); - for index in 0..num_rows { - let val = - T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); - if self.max_value.as_ref().is_some_and(|v| val >= *v) { - let range_id = bound_len + 1; - builder.push(range_id as u64); - continue; 
- } + let start = Instant::now(); + let mut block = { + let blocks = std::mem::take(&mut self.input_data); + DataBlock::concat(&blocks)? + }; + + let bound_len = self.bounds.len(); + let num_rows = block.num_rows(); + let last = block.get_last_column().clone(); + block.pop_columns(1); + let mut builder = Vec::with_capacity(num_rows); + let last_col = T::try_downcast_column(&last.remove_nullable()).unwrap(); + for index in 0..num_rows { + let val = T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); + if self.max_value.as_ref().is_some_and(|v| val >= *v) { + let range_id = bound_len + 1; + builder.push(range_id as u64); + continue; + } - let mut low = 0; - let mut high = bound_len; - while low < high { - let mid = low + ((high - low) / 2); - let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); - if val > bound { - low = mid + 1; - } else { - high = mid; - } + let mut low = 0; + let mut high = bound_len; + while low < high { + let mid = low + ((high - low) / 2); + let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); + if val > bound { + low = mid + 1; + } else { + high = mid; } - builder.push(low as u64); } - - block.add_column(BlockEntry::new( - DataType::Number(NumberDataType::UInt64), - Value::Column(UInt64Type::from_data(builder)), - )); - self.output_data.push_back(block); + builder.push(low as u64); } + + block.add_column(BlockEntry::new( + DataType::Number(NumberDataType::UInt64), + Value::Column(UInt64Type::from_data(builder)), + )); + self.output_data.push_back(block); + log::info!("Recluster range output: {:?}", start.elapsed()); Ok(()) } From c807783badd52faff80da0e1261e27f5e61baa80 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 14 Jun 2025 02:36:55 +0800 Subject: [PATCH 29/36] fix --- .../transforms/recluster/range_bound_sampler.rs | 2 +- .../transforms/recluster/transform_add_order_column.rs | 7 +------ .../recluster/transform_range_partition_indexer.rs | 9 +-------- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs index a76417256416f..8e0afd6e647c3 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs @@ -56,7 +56,7 @@ where pub fn add_block(&mut self, data: &DataBlock) { let rows = data.num_rows(); assert!(rows > 0); - let column = data.get_by_offset(self.offset).to_column(rows); + let column = data.get_by_offset(self.offset).to_column(); let sample_size = std::cmp::min(self.sample_size, rows); let mut indices = (0..rows).collect::>(); diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs index 7b40593e887c3..f90458a8c44b6 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs @@ -16,11 +16,9 @@ use std::marker::PhantomData; use std::sync::Arc; use databend_common_exception::Result; -use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::SortColumnDescription; -use databend_common_expression::Value; use 
databend_common_pipeline_transforms::sort::RowConverter; use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::Transform; @@ -63,10 +61,7 @@ where .row_converter .convert(&order_by_cols, data.num_rows())?; let order_col = rows.to_column(); - data.add_column(BlockEntry { - data_type: order_col.data_type(), - value: Value::Column(order_col), - }); + data.add_column(order_col); Ok(data) } } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 0f8439140f66c..ea2c4983ffb42 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -19,14 +19,10 @@ use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::types::ArgType; -use databend_common_expression::types::DataType; -use databend_common_expression::types::NumberDataType; use databend_common_expression::types::UInt64Type; -use databend_common_expression::BlockEntry; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_expression::FromData; -use databend_common_expression::Value; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; @@ -164,10 +160,7 @@ where builder.push(low as u64); } - block.add_column(BlockEntry::new( - DataType::Number(NumberDataType::UInt64), - Value::Column(UInt64Type::from_data(builder)), - )); + block.add_column(UInt64Type::from_data(builder)); self.output_data.push_back(block); log::info!("Recluster range output: {:?}", start.elapsed()); Ok(()) From cb50347e2be7ec37205fa98bd7cf189c74fc29f6 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sun, 15 Jun 2025 12:51:48 +0800 Subject: [PATCH 30/36] for test --- .../pipelines/builders/builder_recluster.rs | 145 ++++++++---------- .../processors/transforms/recluster/mod.rs | 1 + .../recluster/range_bound_sampler.rs | 33 +--- .../recluster/recluster_partition_strategy.rs | 88 +++++++++++ .../recluster/recluster_sample_state.rs | 73 +++------ .../recluster/transform_add_order_column.rs | 28 +--- .../transform_range_partition_indexer.rs | 78 ++++------ .../recluster/transform_recluster_collect.rs | 26 +--- src/query/settings/src/settings_default.rs | 7 + .../settings/src/settings_getter_setter.rs | 4 + .../src/io/write/stream/cluster_statistics.rs | 56 +++---- .../storages/fuse/src/operations/append.rs | 36 +++-- .../fuse/src/statistics/cluster_statistics.rs | 2 +- 13 files changed, 287 insertions(+), 290 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index 853ccbc2b2fef..5af820ccda385 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -20,16 +20,6 @@ use databend_common_catalog::plan::DataSourceInfo; use databend_common_catalog::plan::DataSourcePlan; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::row::RowConverter as CommonConverter; -use databend_common_expression::types::AccessType; -use databend_common_expression::types::ArgType; -use 
databend_common_expression::types::DataType; -use databend_common_expression::types::DateType; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::TimestampType; -use databend_common_expression::with_number_mapped_type; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_expression::SortColumnDescription; @@ -41,28 +31,27 @@ use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_sources::EmptySource; use databend_common_pipeline_transforms::processors::build_compact_block_no_split_pipeline; use databend_common_pipeline_transforms::processors::TransformPipelineHelper; -use databend_common_pipeline_transforms::sort::CommonRows; -use databend_common_pipeline_transforms::sort::RowConverter; -use databend_common_pipeline_transforms::sort::Rows; -use databend_common_pipeline_transforms::sort::SimpleRowConverter; -use databend_common_pipeline_transforms::sort::SimpleRowsAsc; +use databend_common_pipeline_transforms::sort::utils::add_order_field; +use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::evaluator::CompoundBlockOperator; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::StreamContext; use databend_common_storages_factory::Table; use databend_common_storages_fuse::io::StreamBlockProperties; +use databend_common_storages_fuse::operations::TransformBlockBuilder; use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::TableContext; -use match_template::match_template; use crate::pipelines::builders::SortPipelineBuilder; use crate::pipelines::processors::transforms::ReclusterPartitionExchange; +use crate::pipelines::processors::transforms::ReclusterPartitionStrategys; use crate::pipelines::processors::transforms::SampleState; use crate::pipelines::processors::transforms::TransformAddOrderColumn; use crate::pipelines::processors::transforms::TransformAddStreamColumns; +use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::processors::transforms::TransformRangePartitionIndexer; use crate::pipelines::processors::transforms::TransformReclusterCollect; use crate::pipelines::processors::transforms::TransformReclusterPartition; @@ -172,9 +161,7 @@ impl PipelineBuilder { }); } - let fields_with_cluster_key = properties.fields_with_cluster_key(); - let schema = DataSchemaRefExt::create(fields_with_cluster_key); - let sort_descs: Vec<_> = properties + let sort_desc: Vec<_> = properties .cluster_key_index() .iter() .map(|&offset| SortColumnDescription { @@ -183,6 +170,10 @@ impl PipelineBuilder { nulls_first: false, }) .collect(); + let fields_with_cluster_key = properties.fields_with_cluster_key(); + let schema = DataSchemaRefExt::create(fields_with_cluster_key); + let schema = add_order_field(schema, &sort_desc); + let order_offset = schema.fields.len() - 1; let num_processors = self.main_pipeline.output_len(); let sample_size = self @@ -196,9 +187,12 @@ impl PipelineBuilder { task.total_compressed, ); let state = SampleState::new(num_processors, partitions); - let recluster_pipeline_builder = - ReclusterPipelineBuilder::create(schema, 
sort_descs.into(), sample_size) - .with_state(state); + let recluster_pipeline_builder = ReclusterPipelineBuilder::create( + schema.clone(), + sort_desc.clone(), + sample_size, + ) + .with_state(state); recluster_pipeline_builder .build_recluster_sample_pipeline(&mut self.main_pipeline)?; @@ -207,16 +201,46 @@ impl PipelineBuilder { ReclusterPartitionExchange::create(0, partitions), ); let processor_id = AtomicUsize::new(0); - self.main_pipeline.add_transform(|input, output| { - TransformReclusterPartition::try_create( - input, - output, - properties.clone(), - processor_id.fetch_add(1, atomic::Ordering::AcqRel), - num_processors, - partitions, - ) - })?; + + let settings = self.ctx.get_settings(); + let enable_writings = settings.get_enable_block_stream_writes()?; + if enable_writings { + let memory_settings = MemorySettings::disable_spill(); + self.main_pipeline.add_transform(|input, output| { + let strategy = + ReclusterPartitionStrategys::new(properties.clone(), order_offset); + + Ok(ProcessorPtr::create(Box::new( + TransformPartitionCollect::new( + self.ctx.clone(), + input, + output, + &settings, + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partitions, + memory_settings.clone(), + None, + strategy, + )?, + ))) + })?; + + self.main_pipeline.add_transform(|input, output| { + TransformBlockBuilder::try_create(input, output, properties.clone()) + })?; + } else { + self.main_pipeline.add_transform(|input, output| { + TransformReclusterPartition::try_create( + input, + output, + properties.clone(), + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partitions, + ) + })?; + } self.main_pipeline.add_async_accumulating_transformer(|| { TransformBlockWriter::create( @@ -249,7 +273,7 @@ impl PipelineBuilder { // construct output fields let output_fields = cluster_stats_gen.out_fields.clone(); let schema = DataSchemaRefExt::create(output_fields); - let sort_descs: Vec<_> = cluster_stats_gen + let sort_desc: Vec<_> = cluster_stats_gen .cluster_key_index .iter() .map(|offset| SortColumnDescription { @@ -267,10 +291,9 @@ impl PipelineBuilder { ); let sort_pipeline_builder = - SortPipelineBuilder::create(self.ctx.clone(), schema, sort_descs.into())? + SortPipelineBuilder::create(self.ctx.clone(), schema, sort_desc.into())? .with_block_size_hit(sort_block_size) .remove_order_col_at_last(); - // Todo(zhyass): Recluster will no longer perform sort in the near future. sort_pipeline_builder.build_full_sort_pipeline(&mut self.main_pipeline)?; // Compact after merge sort. @@ -306,7 +329,7 @@ impl PipelineBuilder { struct ReclusterPipelineBuilder { schema: DataSchemaRef, - sort_desc: Arc<[SortColumnDescription]>, + sort_desc: Vec, state: Option>, sample_size: usize, seed: u64, @@ -315,7 +338,7 @@ struct ReclusterPipelineBuilder { impl ReclusterPipelineBuilder { fn create( schema: DataSchemaRef, - sort_desc: Arc<[SortColumnDescription]>, + sort_desc: Vec, sample_size: usize, ) -> Self { Self { @@ -339,53 +362,17 @@ impl ReclusterPipelineBuilder { } fn build_recluster_sample_pipeline(&self, pipeline: &mut Pipeline) -> Result<()> { - match self.sort_desc.as_ref() { - [desc] => { - let schema = self.schema.clone(); - let sort_type = schema.field(desc.offset).data_type(); - assert!(desc.asc); - - match_template! 
{ - T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], - match sort_type { - DataType::T => { - self.visit_type::, SimpleRowConverter>(pipeline) - }, - DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { - NumberDataType::NUM_TYPE => { - self.visit_type::>, SimpleRowConverter>>(pipeline) - } - }), - _ => self.visit_type::(pipeline) - } - } - } - _ => self.visit_type::(pipeline), - } - } - - fn visit_type(&self, pipeline: &mut Pipeline) -> Result<()> - where - R: Rows + 'static, - C: RowConverter + Send + 'static, - R::Type: ArgType + Send + Sync, - ::Scalar: Ord + Send + Sync, - { pipeline.try_add_transformer(|| { - TransformAddOrderColumn::::try_new(self.sort_desc.clone(), self.schema.clone()) + TransformAddOrderColumn::try_new(self.sort_desc.clone(), self.schema.clone()) })?; - let offset = self.schema.num_fields(); + let offset = self.schema.num_fields() - 1; pipeline.add_accumulating_transformer(|| { - TransformReclusterCollect::::new(offset, self.sample_size, self.seed) + TransformReclusterCollect::new(offset, self.sample_size, self.seed) }); pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create(TransformRangePartitionIndexer::< - R::Type, - >::create( - input, - output, - self.state.clone().unwrap(), - ))) + Ok(ProcessorPtr::create( + TransformRangePartitionIndexer::create(input, output, self.state.clone().unwrap()), + )) }) } } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index a024e330be25b..b87be1f1e4d51 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -25,6 +25,7 @@ pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; pub use recluster_partition_strategy::CompactPartitionStrategy; pub use recluster_partition_strategy::ReclusterPartitionStrategy; +pub use recluster_partition_strategy::ReclusterPartitionStrategys; pub use recluster_sample_state::SampleState; pub use transform_add_order_column::TransformAddOrderColumn; pub use transform_range_partition_indexer::TransformRangePartitionIndexer; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs index 8e0afd6e647c3..b3fe9a77a4660 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs @@ -12,30 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::marker::PhantomData; - -use databend_common_expression::types::ArgType; -use databend_common_expression::types::ValueType; use databend_common_expression::DataBlock; -use databend_common_expression::Scalar; use rand::prelude::SliceRandom; use rand::prelude::SmallRng; use rand::SeedableRng; -pub struct RangeBoundSampler -where T: ValueType -{ +pub struct RangeBoundSampler { offset: usize, sample_size: usize, rng: SmallRng, - values: Vec<(u64, Vec)>, - _t: PhantomData, + values: Vec<(u64, Vec>)>, } -impl RangeBoundSampler -where T: ValueType -{ +impl RangeBoundSampler { pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self { let rng = SmallRng::seed_from_u64(seed); Self { @@ -43,16 +33,11 @@ where T: ValueType sample_size, rng, values: vec![], - _t: PhantomData, } } } -impl RangeBoundSampler -where - T: ArgType, - T::Scalar: Ord + Send, -{ +impl RangeBoundSampler { pub fn add_block(&mut self, data: &DataBlock) { let rows = data.num_rows(); assert!(rows > 0); @@ -63,19 +48,15 @@ where indices.shuffle(&mut self.rng); let sampled_indices = &indices[..sample_size]; - let column = T::try_downcast_column(&column).unwrap(); + let column = column.as_binary().unwrap(); let sample_values = sampled_indices .iter() - .map(|i| { - T::upcast_scalar(T::to_owned_scalar(unsafe { - T::index_column_unchecked(&column, *i) - })) - }) + .map(|i| unsafe { column.index_unchecked(*i) }.to_vec()) .collect::>(); self.values.push((rows as u64, sample_values)); } - pub fn sample_values(&mut self) -> Vec<(u64, Vec)> { + pub fn sample_values(&mut self) -> Vec<(u64, Vec>)> { std::mem::take(&mut self.values) } } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs index 269aae2e3abea..fb10cb0caec08 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -16,6 +16,8 @@ use std::sync::Arc; use databend_common_exception::Result; use databend_common_expression::DataBlock; +use databend_common_expression::LimitType; +use databend_common_expression::SortColumnDescription; use databend_common_storages_fuse::io::StreamBlockBuilder; use databend_common_storages_fuse::io::StreamBlockProperties; @@ -175,3 +177,89 @@ impl PartitionProcessStrategy for CompactPartitionStrategy { Ok(result) } } + +pub struct ReclusterPartitionStrategys { + properties: Arc, + sort_desc: Vec, +} + +impl ReclusterPartitionStrategys { + pub fn new(properties: Arc, offset: usize) -> Self { + Self { + properties, + sort_desc: vec![SortColumnDescription { + offset, + asc: true, + nulls_first: false, + }], + } + } + + fn concat_blocks(blocks: Vec) -> Result { + DataBlock::concat(&blocks) + } +} + +impl PartitionProcessStrategy for ReclusterPartitionStrategys { + const NAME: &'static str = "Recluster"; + + fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec { + (0..num_partitions) + .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) + .collect() + } + + /// Stream write each block, and flush it conditionally based on builder status + /// and input size estimation. 
+ fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + let blocks_num = data_blocks.len(); + let mut accumulated_rows = 0; + let mut accumulated_bytes = 0; + let mut pending_blocks = Vec::with_capacity(blocks_num); + let mut staged_blocks = Vec::with_capacity(blocks_num); + let mut compacted = Vec::with_capacity(blocks_num); + for block in data_blocks { + accumulated_rows += block.num_rows(); + accumulated_bytes += block.estimate_block_size(); + pending_blocks.push(block); + if !self + .properties + .check_large_enough(accumulated_rows, accumulated_bytes) + { + continue; + } + if !staged_blocks.is_empty() { + compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + std::mem::swap(&mut staged_blocks, &mut pending_blocks); + accumulated_rows = 0; + accumulated_bytes = 0; + } + staged_blocks.append(&mut pending_blocks); + if !staged_blocks.is_empty() { + compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + + let mut result = Vec::new(); + let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; + for block in compacted { + let block = DataBlock::sort_with_type(&block, &self.sort_desc, LimitType::None)?; + builder.write(block)?; + if builder.need_flush() { + let serialized = builder.finish()?; + result.push(DataBlock::empty_with_meta(Box::new(serialized))); + builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; + } + } + if !builder.is_empty() { + let serialized = builder.finish()?; + result.push(DataBlock::empty_with_meta(Box::new(serialized))); + } + Ok(result) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs index 12d50653b8b68..35e22321339d2 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::intrinsics::unlikely; use std::sync::Arc; use std::sync::RwLock; use databend_common_base::base::WatchNotify; use databend_common_exception::Result; use databend_common_expression::compare_columns; -use databend_common_expression::types::ArgType; -use databend_common_expression::Scalar; +use databend_common_expression::types::BinaryType; +use databend_common_expression::FromData; pub struct SampleState { pub inner: RwLock, @@ -41,38 +42,21 @@ impl SampleState { }) } - pub fn merge_sample(&self, values: Vec<(u64, Vec)>) -> Result<()> - where - T: ArgType, - T::Scalar: Ord, - { + pub fn merge_sample(&self, values: Vec<(u64, Vec>)>) -> Result<()> { let mut inner = self.inner.write().unwrap(); inner.completed_inputs += 1; inner.values.extend_from_slice(&values); if inner.completed_inputs >= inner.total_inputs { - inner.determine_bounds::()?; + inner.determine_bounds()?; self.done.notify_waiters(); } Ok(()) } - pub fn get_bounds(&self) -> (Vec, Option) - where - T: ArgType, - T::Scalar: Ord, - { + pub fn get_bounds(&self) -> (Vec>, Option>) { let inner = self.inner.read().unwrap(); - let bounds = inner - .bounds - .iter() - .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())) - .collect(); - let max_value = inner - .max_value - .as_ref() - .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())); - (bounds, max_value) + (inner.bounds.clone(), inner.max_value.clone()) } } @@ -81,18 +65,14 @@ pub struct SampleStateInner { total_inputs: usize, completed_inputs: usize, - bounds: Vec, - max_value: Option, + bounds: Vec>, + max_value: Option>, - values: Vec<(u64, Vec)>, + values: Vec<(u64, Vec>)>, } impl SampleStateInner { - fn determine_bounds(&mut self) -> Result<()> - where - T: ArgType, - T::Scalar: Ord, - { + fn determine_bounds(&mut self) -> Result<()> { if self.partitions < 2 { return Ok(()); } @@ -111,16 +91,15 @@ impl SampleStateInner { for (num, values) in values.into_iter() { let weight = num as f64 / values.len() as f64; values.into_iter().for_each(|v| { - let val = T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap()); - data.push(val); + data.push(v); weights.push(weight); }); } - let col = T::upcast_column(T::column_from_vec(data.clone(), &[])); + let col = BinaryType::from_data(data.clone()); let indices = compare_columns(vec![col], total_samples)?; let max_index = indices[total_samples - 1] as usize; - let max_val = data[max_index].clone(); + let max_val = &data[max_index]; let mut cum_weight = 0.0; let mut target = step; @@ -131,22 +110,20 @@ impl SampleStateInner { let mut j = 0; while i < total_samples && j < self.partitions - 1 { let idx = indices[i] as usize; + let value = &data[idx]; let weight = weights[idx]; cum_weight += weight; - if cum_weight >= target { - let data = &data[idx]; - if previous_bound.as_ref().is_none_or(|prev| data > prev) { - if data == &max_val { - self.max_value = Some(T::upcast_scalar(max_val)); - break; - } - - let bound = T::upcast_scalar(data.clone()); - bounds.push(bound); - target += step; - j += 1; - previous_bound = Some(data.clone()); + + if cum_weight >= target && previous_bound.map_or(true, |prev| value > prev) { + if unlikely(value == max_val) { + self.max_value = Some(max_val.clone()); + break; } + + bounds.push(value.clone()); + previous_bound = Some(value); + target += step; + j += 1; } i += 1; } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs 
b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs index f90458a8c44b6..b3fa11ba7ddda 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs @@ -12,10 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::marker::PhantomData; -use std::sync::Arc; - use databend_common_exception::Result; +use databend_common_expression::row::RowConverter as CommonConverter; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::SortColumnDescription; @@ -23,32 +21,22 @@ use databend_common_pipeline_transforms::sort::RowConverter; use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::Transform; -pub struct TransformAddOrderColumn { - row_converter: C, - sort_desc: Arc<[SortColumnDescription]>, - _r: PhantomData, +pub struct TransformAddOrderColumn { + row_converter: CommonConverter, + sort_desc: Vec, } -impl TransformAddOrderColumn -where - R: Rows, - C: RowConverter, -{ - pub fn try_new(sort_desc: Arc<[SortColumnDescription]>, schema: DataSchemaRef) -> Result { - let row_converter = C::create(&sort_desc, schema.clone())?; +impl TransformAddOrderColumn { + pub fn try_new(sort_desc: Vec, schema: DataSchemaRef) -> Result { + let row_converter = CommonConverter::create(&sort_desc, schema.clone())?; Ok(Self { row_converter, sort_desc, - _r: PhantomData, }) } } -impl Transform for TransformAddOrderColumn -where - R: Rows + 'static, - C: RowConverter + Send + 'static, -{ +impl Transform for TransformAddOrderColumn { const NAME: &'static str = "TransformAddOrderColumn"; fn transform(&mut self, mut data: DataBlock) -> Result { diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index ea2c4983ffb42..3888b130d4227 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -14,11 +14,11 @@ use std::any::Any; use std::collections::VecDeque; +use std::intrinsics::unlikely; use std::sync::Arc; use std::time::Instant; use databend_common_exception::Result; -use databend_common_expression::types::ArgType; use databend_common_expression::types::UInt64Type; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; @@ -31,26 +31,20 @@ use databend_common_pipeline_core::processors::Processor; use crate::pipelines::processors::transforms::ReclusterSampleMeta; use crate::pipelines::processors::transforms::SampleState; -pub struct TransformRangePartitionIndexer -where T: ArgType -{ +pub struct TransformRangePartitionIndexer { input: Arc, output: Arc, state: Arc, input_data: Vec, output_data: VecDeque, - bounds: Vec, - max_value: Option, + bounds: Vec>, + max_value: Option>, start: Instant, } -impl TransformRangePartitionIndexer -where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send + Sync, -{ +impl TransformRangePartitionIndexer { pub fn create( input: Arc, output: Arc, @@ -70,11 +64,7 @@ where } #[async_trait::async_trait] -impl Processor for TransformRangePartitionIndexer -where - T: ArgType + 
Send + Sync, - T::Scalar: Ord + Send + Sync, -{ +impl Processor for TransformRangePartitionIndexer { fn name(&self) -> String { "TransformRangePartitionIndexer".to_owned() } @@ -120,48 +110,36 @@ where .and_then(ReclusterSampleMeta::downcast_from) .expect("require a ReclusterSampleMeta"); self.input_data = meta.blocks; - self.state.merge_sample::(meta.sample_values)?; + self.state.merge_sample(meta.sample_values)?; log::info!("Recluster range partition: {:?}", self.start.elapsed()); Ok(Event::Async) } fn process(&mut self) -> Result<()> { let start = Instant::now(); - let mut block = { - let blocks = std::mem::take(&mut self.input_data); - DataBlock::concat(&blocks)? - }; - - let bound_len = self.bounds.len(); - let num_rows = block.num_rows(); - let last = block.get_last_column().clone(); - block.pop_columns(1); - let mut builder = Vec::with_capacity(num_rows); - let last_col = T::try_downcast_column(&last.remove_nullable()).unwrap(); - for index in 0..num_rows { - let val = T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); - if self.max_value.as_ref().is_some_and(|v| val >= *v) { - let range_id = bound_len + 1; - builder.push(range_id as u64); - continue; - } - - let mut low = 0; - let mut high = bound_len; - while low < high { - let mid = low + ((high - low) / 2); - let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); - if val > bound { - low = mid + 1; - } else { - high = mid; + if let Some(mut block) = self.input_data.pop() { + let bound_len = self.bounds.len(); + let num_rows = block.num_rows(); + let mut builder = Vec::with_capacity(num_rows); + let last_col = block.get_last_column().as_binary().unwrap(); + for index in 0..num_rows { + let val = unsafe { last_col.index_unchecked(index) }; + if unlikely(self.max_value.as_ref().is_some_and(|v| val >= v.as_slice())) { + let range_id = bound_len + 1; + builder.push(range_id as u64); + continue; } + + let idx = self + .bounds + .binary_search_by(|b| b.as_slice().cmp(val)) + .unwrap_or_else(|i| i); + builder.push(idx as u64); } - builder.push(low as u64); - } - block.add_column(UInt64Type::from_data(builder)); - self.output_data.push_back(block); + block.add_column(UInt64Type::from_data(builder)); + self.output_data.push_back(block); + } log::info!("Recluster range output: {:?}", start.elapsed()); Ok(()) } @@ -169,7 +147,7 @@ where #[async_backtrace::framed] async fn async_process(&mut self) -> Result<()> { self.state.done.notified().await; - (self.bounds, self.max_value) = self.state.get_bounds::(); + (self.bounds, self.max_value) = self.state.get_bounds(); Ok(()) } } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs index 46684b42b31e3..28d1c0aed8b54 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs @@ -14,41 +14,27 @@ use databend_common_exception::Result; use databend_common_expression::local_block_meta_serde; -use databend_common_expression::types::ArgType; use databend_common_expression::BlockMetaInfo; use databend_common_expression::DataBlock; -use databend_common_expression::Scalar; use databend_common_pipeline_transforms::AccumulatingTransform; use crate::pipelines::processors::transforms::RangeBoundSampler; -pub struct TransformReclusterCollect -where - T: ArgType + Send + Sync, - 
T::Scalar: Ord + Send, -{ +pub struct TransformReclusterCollect { input_data: Vec, - sampler: RangeBoundSampler, + sampler: RangeBoundSampler, } -impl TransformReclusterCollect -where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send, -{ +impl TransformReclusterCollect { pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self { Self { input_data: vec![], - sampler: RangeBoundSampler::::new(offset, sample_size, seed), + sampler: RangeBoundSampler::new(offset, sample_size, seed), } } } -impl AccumulatingTransform for TransformReclusterCollect -where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send, -{ +impl AccumulatingTransform for TransformReclusterCollect { const NAME: &'static str = "TransformReclusterCollect"; fn transform(&mut self, data: DataBlock) -> Result> { @@ -72,7 +58,7 @@ where #[derive(Debug)] pub struct ReclusterSampleMeta { pub blocks: Vec, - pub sample_values: Vec<(u64, Vec)>, + pub sample_values: Vec<(u64, Vec>)>, } local_block_meta_serde!(ReclusterSampleMeta); diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index a686891024a90..f85d03b3115ef 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1312,6 +1312,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), + ("enable_block_stream_writes", DefaultSettingValue { + value: UserSettingValue::UInt64(0), + desc: "Enables block stream write", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=1)), + }), ("trace_sample_rate", DefaultSettingValue { value: UserSettingValue::UInt64(1), desc: "Setting the trace sample rate. The value should be between '0' and '100'", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 89061370021da..9289d4badbcd6 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -964,6 +964,10 @@ impl Settings { Ok(self.try_get_u64("enable_block_stream_write")? == 1) } + pub fn get_enable_block_stream_writes(&self) -> Result { + Ok(self.try_get_u64("enable_block_stream_writes")? 
== 1) + } + pub fn get_statement_queue_ttl_in_seconds(&self) -> Result { self.try_get_u64("statement_queue_ttl_in_seconds") } diff --git a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs index a0bd91888995e..c7cc352f7ecc6 100644 --- a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs @@ -17,7 +17,6 @@ use std::sync::Arc; use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; -use databend_common_expression::Column; use databend_common_expression::ColumnRef; use databend_common_expression::DataBlock; use databend_common_expression::DataField; @@ -25,7 +24,6 @@ use databend_common_expression::DataSchema; use databend_common_expression::Expr; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; -use databend_common_functions::aggregates::eval_aggr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::evaluator::BlockOperator; use databend_storages_common_table_meta::meta::ClusterStatistics; @@ -115,8 +113,8 @@ impl ClusterStatisticsBuilder { } pub struct ClusterStatisticsState { - mins: Vec, - maxs: Vec, + min_values: Vec>, + max_values: Vec>, builder: Arc, } @@ -124,8 +122,8 @@ pub struct ClusterStatisticsState { impl ClusterStatisticsState { pub fn new(builder: Arc) -> Self { Self { - mins: vec![], - maxs: vec![], + min_values: vec![], + max_values: vec![], builder, } } @@ -135,20 +133,20 @@ impl ClusterStatisticsState { return Ok(input); } - let num_rows = input.num_rows(); - let cols = self - .builder - .cluster_key_index - .iter() - .map(|&i| input.get_by_offset(i).to_column()) - .collect(); - let tuple = Column::Tuple(cols); - let (min, _) = eval_aggr("min", vec![], &[tuple.clone()], num_rows, vec![])?; - let (max, _) = eval_aggr("max", vec![], &[tuple.clone()], num_rows, vec![])?; - assert_eq!(min.len(), 1); - assert_eq!(max.len(), 1); - self.mins.push(min.index(0).unwrap().to_owned()); - self.maxs.push(max.index(0).unwrap().to_owned()); + let mut min = Vec::with_capacity(self.builder.cluster_key_index.len()); + let mut max = Vec::with_capacity(self.builder.cluster_key_index.len()); + for key in self.builder.cluster_key_index.iter() { + let val = input.get_by_offset(*key); + let left = unsafe { val.index_unchecked(0) }.to_owned(); + min.push(left); + + // The maximum in cluster statistics needn't larger than the non-trimmed one. + // So we use trim_min directly. 
+ let right = unsafe { val.index_unchecked(val.value().len() - 1) }.to_owned(); + max.push(right); + } + self.min_values.push(min); + self.max_values.push(max); input.pop_columns(self.builder.extra_key_num); Ok(input) } @@ -158,22 +156,8 @@ impl ClusterStatisticsState { return Ok(None); } - let min = self - .mins - .into_iter() - .min_by(|x, y| x.as_ref().cmp(&y.as_ref())) - .unwrap() - .as_tuple() - .unwrap() - .clone(); - let max = self - .maxs - .into_iter() - .max_by(|x, y| x.as_ref().cmp(&y.as_ref())) - .unwrap() - .as_tuple() - .unwrap() - .clone(); + let min = self.min_values.into_iter().min().unwrap(); + let max = self.max_values.into_iter().max().unwrap(); let level = if min == max && perfect { -1 diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index fc14e9589071e..8a2349f8ff359 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -65,16 +65,32 @@ impl FuseTable { table_meta_timestamps, )?; - let cluster_operators = properties.cluster_operators(); - if !cluster_operators.is_empty() { - let num_input_columns = self.table_info.schema().num_fields(); - let func_ctx = ctx.get_function_context()?; - pipeline.add_transformer(move || { - CompoundBlockOperator::new( - cluster_operators.clone(), - func_ctx.clone(), - num_input_columns, - ) + let cluster_key_index = properties.cluster_key_index(); + if !cluster_key_index.is_empty() { + let cluster_operators = properties.cluster_operators(); + if !cluster_operators.is_empty() { + let num_input_columns = self.table_info.schema().num_fields(); + let func_ctx = ctx.get_function_context()?; + pipeline.add_transformer(move || { + CompoundBlockOperator::new( + cluster_operators.clone(), + func_ctx.clone(), + num_input_columns, + ) + }); + } + + let sort_desc: Vec = cluster_key_index + .iter() + .map(|index| SortColumnDescription { + offset: *index, + asc: true, + nulls_first: false, + }) + .collect(); + let sort_desc: Arc<[_]> = sort_desc.into(); + pipeline.add_transformer(|| { + TransformSortPartial::new(LimitType::None, sort_desc.clone()) }); } diff --git a/src/query/storages/fuse/src/statistics/cluster_statistics.rs b/src/query/storages/fuse/src/statistics/cluster_statistics.rs index f452938fd4c25..904446690d93f 100644 --- a/src/query/storages/fuse/src/statistics/cluster_statistics.rs +++ b/src/query/storages/fuse/src/statistics/cluster_statistics.rs @@ -122,7 +122,7 @@ impl ClusterStatsGenerator { let left = unsafe { val.index_unchecked(0) }.to_owned(); min.push(left); - // The maximum in cluster statistics neednot larger than the non-trimmed one. + // The maximum in cluster statistics needn't larger than the non-trimmed one. // So we use trim_min directly. 
let right = unsafe { val.index_unchecked(val.value().len() - 1) }.to_owned(); max.push(right); From 35bfee36631836ea66beb0dad45aeca5d33f09d0 Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 16 Jun 2025 04:12:11 +0800 Subject: [PATCH 31/36] fix --- .../pipelines/builders/builder_recluster.rs | 51 +++------ .../processors/transforms/recluster/mod.rs | 1 - .../recluster/recluster_partition_strategy.rs | 102 ++---------------- .../recluster/recluster_sample_state.rs | 2 +- .../transform_range_partition_indexer.rs | 40 ++++--- src/query/settings/src/settings_default.rs | 7 -- .../settings/src/settings_getter_setter.rs | 4 - .../src/io/write/stream/cluster_statistics.rs | 56 ++++++---- 8 files changed, 83 insertions(+), 180 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index 5af820ccda385..28b5feaacfd39 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -39,7 +39,6 @@ use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::StreamContext; use databend_common_storages_factory::Table; use databend_common_storages_fuse::io::StreamBlockProperties; -use databend_common_storages_fuse::operations::TransformBlockBuilder; use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::FuseTable; @@ -47,14 +46,13 @@ use databend_common_storages_fuse::TableContext; use crate::pipelines::builders::SortPipelineBuilder; use crate::pipelines::processors::transforms::ReclusterPartitionExchange; -use crate::pipelines::processors::transforms::ReclusterPartitionStrategys; +use crate::pipelines::processors::transforms::ReclusterPartitionStrategy; use crate::pipelines::processors::transforms::SampleState; use crate::pipelines::processors::transforms::TransformAddOrderColumn; use crate::pipelines::processors::transforms::TransformAddStreamColumns; use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::processors::transforms::TransformRangePartitionIndexer; use crate::pipelines::processors::transforms::TransformReclusterCollect; -use crate::pipelines::processors::transforms::TransformReclusterPartition; use crate::pipelines::PipelineBuilder; impl PipelineBuilder { @@ -173,7 +171,6 @@ impl PipelineBuilder { let fields_with_cluster_key = properties.fields_with_cluster_key(); let schema = DataSchemaRefExt::create(fields_with_cluster_key); let schema = add_order_field(schema, &sort_desc); - let order_offset = schema.fields.len() - 1; let num_processors = self.main_pipeline.output_len(); let sample_size = self @@ -203,44 +200,24 @@ impl PipelineBuilder { let processor_id = AtomicUsize::new(0); let settings = self.ctx.get_settings(); - let enable_writings = settings.get_enable_block_stream_writes()?; - if enable_writings { - let memory_settings = MemorySettings::disable_spill(); - self.main_pipeline.add_transform(|input, output| { - let strategy = - ReclusterPartitionStrategys::new(properties.clone(), order_offset); - - Ok(ProcessorPtr::create(Box::new( - TransformPartitionCollect::new( - self.ctx.clone(), - input, - output, - &settings, - processor_id.fetch_add(1, atomic::Ordering::AcqRel), - num_processors, - partitions, - memory_settings.clone(), - None, - strategy, - )?, - ))) - })?; - - self.main_pipeline.add_transform(|input, output| { - 
TransformBlockBuilder::try_create(input, output, properties.clone()) - })?; - } else { - self.main_pipeline.add_transform(|input, output| { - TransformReclusterPartition::try_create( + let memory_settings = MemorySettings::disable_spill(); + self.main_pipeline.add_transform(|input, output| { + let strategy = ReclusterPartitionStrategy::new(properties.clone()); + Ok(ProcessorPtr::create(Box::new( + TransformPartitionCollect::new( + self.ctx.clone(), input, output, - properties.clone(), + &settings, processor_id.fetch_add(1, atomic::Ordering::AcqRel), num_processors, partitions, - ) - })?; - } + memory_settings.clone(), + None, + strategy, + )?, + ))) + })?; self.main_pipeline.add_async_accumulating_transformer(|| { TransformBlockWriter::create( diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index b87be1f1e4d51..a024e330be25b 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -25,7 +25,6 @@ pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; pub use recluster_partition_strategy::CompactPartitionStrategy; pub use recluster_partition_strategy::ReclusterPartitionStrategy; -pub use recluster_partition_strategy::ReclusterPartitionStrategys; pub use recluster_sample_state::SampleState; pub use transform_add_order_column::TransformAddOrderColumn; pub use transform_range_partition_indexer::TransformRangePartitionIndexer; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs index fb10cb0caec08..9c8a4573171a1 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -16,8 +16,6 @@ use std::sync::Arc; use databend_common_exception::Result; use databend_common_expression::DataBlock; -use databend_common_expression::LimitType; -use databend_common_expression::SortColumnDescription; use databend_common_storages_fuse::io::StreamBlockBuilder; use databend_common_storages_fuse::io::StreamBlockProperties; @@ -34,10 +32,6 @@ impl ReclusterPartitionStrategy { pub fn new(properties: Arc) -> Self { Self { properties } } - - fn concat_blocks(blocks: Vec) -> Result { - DataBlock::concat(&blocks) - } } impl PartitionProcessStrategy for ReclusterPartitionStrategy { @@ -74,7 +68,7 @@ impl PartitionProcessStrategy for ReclusterPartitionStrategy { continue; } if !staged_blocks.is_empty() { - compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + compacted.push(std::mem::take(&mut staged_blocks)); } std::mem::swap(&mut staged_blocks, &mut pending_blocks); accumulated_rows = 0; @@ -82,13 +76,15 @@ impl PartitionProcessStrategy for ReclusterPartitionStrategy { } staged_blocks.append(&mut pending_blocks); if !staged_blocks.is_empty() { - compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + compacted.push(std::mem::take(&mut staged_blocks)); } let mut result = Vec::new(); let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; - for block in compacted { - builder.write(block)?; + for blocks in compacted { + for block in blocks { + builder.write(block)?; + } if 
builder.need_flush() { let serialized = builder.finish()?; result.push(DataBlock::empty_with_meta(Box::new(serialized))); @@ -177,89 +173,3 @@ impl PartitionProcessStrategy for CompactPartitionStrategy { Ok(result) } } - -pub struct ReclusterPartitionStrategys { - properties: Arc, - sort_desc: Vec, -} - -impl ReclusterPartitionStrategys { - pub fn new(properties: Arc, offset: usize) -> Self { - Self { - properties, - sort_desc: vec![SortColumnDescription { - offset, - asc: true, - nulls_first: false, - }], - } - } - - fn concat_blocks(blocks: Vec) -> Result { - DataBlock::concat(&blocks) - } -} - -impl PartitionProcessStrategy for ReclusterPartitionStrategys { - const NAME: &'static str = "Recluster"; - - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec { - (0..num_partitions) - .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) - .collect() - } - - /// Stream write each block, and flush it conditionally based on builder status - /// and input size estimation. - fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - let blocks_num = data_blocks.len(); - let mut accumulated_rows = 0; - let mut accumulated_bytes = 0; - let mut pending_blocks = Vec::with_capacity(blocks_num); - let mut staged_blocks = Vec::with_capacity(blocks_num); - let mut compacted = Vec::with_capacity(blocks_num); - for block in data_blocks { - accumulated_rows += block.num_rows(); - accumulated_bytes += block.estimate_block_size(); - pending_blocks.push(block); - if !self - .properties - .check_large_enough(accumulated_rows, accumulated_bytes) - { - continue; - } - if !staged_blocks.is_empty() { - compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - std::mem::swap(&mut staged_blocks, &mut pending_blocks); - accumulated_rows = 0; - accumulated_bytes = 0; - } - staged_blocks.append(&mut pending_blocks); - if !staged_blocks.is_empty() { - compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - - let mut result = Vec::new(); - let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; - for block in compacted { - let block = DataBlock::sort_with_type(&block, &self.sort_desc, LimitType::None)?; - builder.write(block)?; - if builder.need_flush() { - let serialized = builder.finish()?; - result.push(DataBlock::empty_with_meta(Box::new(serialized))); - builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; - } - } - if !builder.is_empty() { - let serialized = builder.finish()?; - result.push(DataBlock::empty_with_meta(Box::new(serialized))); - } - Ok(result) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs index 35e22321339d2..07960939ee538 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs @@ -114,7 +114,7 @@ impl SampleStateInner { let weight = weights[idx]; cum_weight += weight; - if cum_weight >= target && previous_bound.map_or(true, |prev| value > prev) { + if cum_weight >= target && previous_bound.is_none_or(|prev| value > prev) { if unlikely(value == max_val) { self.max_value = Some(max_val.clone()); break; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs 
b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 3888b130d4227..39efbec0b20a7 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -118,28 +118,40 @@ impl Processor for TransformRangePartitionIndexer { fn process(&mut self) -> Result<()> { let start = Instant::now(); if let Some(mut block) = self.input_data.pop() { - let bound_len = self.bounds.len(); let num_rows = block.num_rows(); let mut builder = Vec::with_capacity(num_rows); let last_col = block.get_last_column().as_binary().unwrap(); - for index in 0..num_rows { - let val = unsafe { last_col.index_unchecked(index) }; - if unlikely(self.max_value.as_ref().is_some_and(|v| val >= v.as_slice())) { - let range_id = bound_len + 1; - builder.push(range_id as u64); - continue; + if let Some(max_value) = self.max_value.as_ref() { + let bound_len = self.bounds.len(); + for index in 0..num_rows { + let val = unsafe { last_col.index_unchecked(index) }; + if unlikely(val >= max_value.as_slice()) { + let range_id = bound_len + 1; + builder.push(range_id as u64); + continue; + } + + let idx = self + .bounds + .binary_search_by(|b| b.as_slice().cmp(val)) + .unwrap_or_else(|i| i); + builder.push(idx as u64); + } + } else { + for index in 0..num_rows { + let val = unsafe { last_col.index_unchecked(index) }; + let idx = self + .bounds + .binary_search_by(|b| b.as_slice().cmp(val)) + .unwrap_or_else(|i| i); + builder.push(idx as u64); } - - let idx = self - .bounds - .binary_search_by(|b| b.as_slice().cmp(val)) - .unwrap_or_else(|i| i); - builder.push(idx as u64); } - + block.pop_columns(1); block.add_column(UInt64Type::from_data(builder)); self.output_data.push_back(block); } + log::info!("Recluster range output: {:?}", start.elapsed()); Ok(()) } diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index f85d03b3115ef..a686891024a90 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1312,13 +1312,6 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), - ("enable_block_stream_writes", DefaultSettingValue { - value: UserSettingValue::UInt64(0), - desc: "Enables block stream write", - mode: SettingMode::Both, - scope: SettingScope::Both, - range: Some(SettingRange::Numeric(0..=1)), - }), ("trace_sample_rate", DefaultSettingValue { value: UserSettingValue::UInt64(1), desc: "Setting the trace sample rate. The value should be between '0' and '100'", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 9289d4badbcd6..89061370021da 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -964,10 +964,6 @@ impl Settings { Ok(self.try_get_u64("enable_block_stream_write")? == 1) } - pub fn get_enable_block_stream_writes(&self) -> Result { - Ok(self.try_get_u64("enable_block_stream_writes")? 
== 1) - } - pub fn get_statement_queue_ttl_in_seconds(&self) -> Result { self.try_get_u64("statement_queue_ttl_in_seconds") } diff --git a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs index c7cc352f7ecc6..a0bd91888995e 100644 --- a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; +use databend_common_expression::Column; use databend_common_expression::ColumnRef; use databend_common_expression::DataBlock; use databend_common_expression::DataField; @@ -24,6 +25,7 @@ use databend_common_expression::DataSchema; use databend_common_expression::Expr; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; +use databend_common_functions::aggregates::eval_aggr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::evaluator::BlockOperator; use databend_storages_common_table_meta::meta::ClusterStatistics; @@ -113,8 +115,8 @@ impl ClusterStatisticsBuilder { } pub struct ClusterStatisticsState { - min_values: Vec>, - max_values: Vec>, + mins: Vec, + maxs: Vec, builder: Arc, } @@ -122,8 +124,8 @@ pub struct ClusterStatisticsState { impl ClusterStatisticsState { pub fn new(builder: Arc) -> Self { Self { - min_values: vec![], - max_values: vec![], + mins: vec![], + maxs: vec![], builder, } } @@ -133,20 +135,20 @@ impl ClusterStatisticsState { return Ok(input); } - let mut min = Vec::with_capacity(self.builder.cluster_key_index.len()); - let mut max = Vec::with_capacity(self.builder.cluster_key_index.len()); - for key in self.builder.cluster_key_index.iter() { - let val = input.get_by_offset(*key); - let left = unsafe { val.index_unchecked(0) }.to_owned(); - min.push(left); - - // The maximum in cluster statistics needn't larger than the non-trimmed one. - // So we use trim_min directly. 
- let right = unsafe { val.index_unchecked(val.value().len() - 1) }.to_owned(); - max.push(right); - } - self.min_values.push(min); - self.max_values.push(max); + let num_rows = input.num_rows(); + let cols = self + .builder + .cluster_key_index + .iter() + .map(|&i| input.get_by_offset(i).to_column()) + .collect(); + let tuple = Column::Tuple(cols); + let (min, _) = eval_aggr("min", vec![], &[tuple.clone()], num_rows, vec![])?; + let (max, _) = eval_aggr("max", vec![], &[tuple.clone()], num_rows, vec![])?; + assert_eq!(min.len(), 1); + assert_eq!(max.len(), 1); + self.mins.push(min.index(0).unwrap().to_owned()); + self.maxs.push(max.index(0).unwrap().to_owned()); input.pop_columns(self.builder.extra_key_num); Ok(input) } @@ -156,8 +158,22 @@ impl ClusterStatisticsState { return Ok(None); } - let min = self.min_values.into_iter().min().unwrap(); - let max = self.max_values.into_iter().max().unwrap(); + let min = self + .mins + .into_iter() + .min_by(|x, y| x.as_ref().cmp(&y.as_ref())) + .unwrap() + .as_tuple() + .unwrap() + .clone(); + let max = self + .maxs + .into_iter() + .max_by(|x, y| x.as_ref().cmp(&y.as_ref())) + .unwrap() + .as_tuple() + .unwrap() + .clone(); let level = if min == max && perfect { -1 From 0eb6279a6897fd3e83a580d8259f66a3c0f3b692 Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 17 Jun 2025 12:48:07 +0800 Subject: [PATCH 32/36] add column ndv estimator --- .../fuse/src/io/write/stream/block_builder.rs | 8 +- .../src/io/write/stream/column_statistics.rs | 177 +++++++++--------- 2 files changed, 97 insertions(+), 88 deletions(-) diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 30d3b2b45543d..2bff1283e2fdd 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -359,7 +359,7 @@ pub struct StreamBlockProperties { cluster_stats_builder: Arc, stats_columns: Vec, - distinct_columns: Vec, + distinct_columns: Vec<(ColumnId, DataType)>, bloom_columns_map: BTreeMap, ngram_args: Vec, inverted_index_builders: Vec, @@ -417,12 +417,12 @@ impl StreamBlockProperties { let leaf_fields = source_schema.leaf_fields(); for field in leaf_fields.iter() { let column_id = field.column_id(); - if RangeIndex::supported_type(&DataType::from(field.data_type())) - && column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID + let data_type = DataType::from(field.data_type()); + if RangeIndex::supported_type(&data_type) && column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID { stats_columns.push(column_id); if !bloom_column_ids.contains(&column_id) { - distinct_columns.push(column_id); + distinct_columns.push((column_id, data_type)); } } } diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs index 8df60aa61f03f..1e11c56af4c1c 100644 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs @@ -13,17 +13,20 @@ // limitations under the License. 
use std::collections::HashMap; +use std::hash::Hash; +use std::marker::PhantomData; use databend_common_exception::Result; -use databend_common_expression::types::AccessType; +use databend_common_expression::types::boolean::TrueIdxIter; use databend_common_expression::types::DataType; use databend_common_expression::types::DateType; -use databend_common_expression::types::DecimalColumn; -use databend_common_expression::types::DecimalScalar; +use databend_common_expression::types::Decimal128Type; +use databend_common_expression::types::Decimal256Type; use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberType; use databend_common_expression::types::StringType; use databend_common_expression::types::TimestampType; +use databend_common_expression::types::ValueType; use databend_common_expression::with_number_mapped_type; use databend_common_expression::Column; use databend_common_expression::ColumnId; @@ -32,6 +35,7 @@ use databend_common_expression::Scalar; use databend_common_expression::ScalarRef; use databend_common_expression::TableSchemaRef; use databend_common_expression::Value; +use databend_common_expression::SELECTIVITY_THRESHOLD; use databend_common_functions::aggregates::eval_aggr; use databend_storages_common_table_meta::meta::ColumnDistinctHLL; use databend_storages_common_table_meta::meta::ColumnStatistics; @@ -43,11 +47,11 @@ use crate::statistics::Trim; pub struct ColumnStatisticsState { col_stats: HashMap>, - distinct_columns: HashMap, + distinct_columns: HashMap>, } impl ColumnStatisticsState { - pub fn new(stats_columns: &[ColumnId], distinct_columns: &[ColumnId]) -> Self { + pub fn new(stats_columns: &[ColumnId], distinct_columns: &[(ColumnId, DataType)]) -> Self { let col_stats = stats_columns .iter() .map(|&col_id| (col_id, Vec::new())) @@ -55,7 +59,7 @@ impl ColumnStatisticsState { let distinct_columns = distinct_columns .iter() - .map(|&col_id| (col_id, ColumnDistinctHLL::default())) + .map(|(col_id, data_type)| (*col_id, create_estimator(data_type))) .collect(); Self { @@ -80,8 +84,8 @@ impl ColumnStatisticsState { in_memory_size as u64, None, ); - if let Some(hll) = self.distinct_columns.get_mut(&column_id) { - scalar_update_hll_cardinality(&s.as_ref(), &data_type, hll); + if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { + estimator.update_scalar(&s.as_ref()); } self.col_stats.get_mut(&column_id).unwrap().push(col_stats); } @@ -128,8 +132,8 @@ impl ColumnStatisticsState { self.col_stats.get_mut(&column_id).unwrap().push(col_stats); // use distinct count calculated by the xor hash function to avoid repetitive operation. 
- if let Some(hll) = self.distinct_columns.get_mut(&column_id) { - column_update_hll_cardinality(&col, &data_type, hll); + if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { + estimator.update_column(&col); } } } @@ -146,8 +150,8 @@ impl ColumnStatisticsState { let mut col_stats = reduce_column_statistics(stats); if let Some(count) = column_distinct_count.get(id) { col_stats.distinct_of_values = Some(*count as u64); - } else if let Some(hll) = self.distinct_columns.get(id) { - col_stats.distinct_of_values = Some(hll.count() as u64); + } else if let Some(estimator) = self.distinct_columns.get(id) { + col_stats.distinct_of_values = Some(estimator.finalize()); } statistics.insert(*id, col_stats); } @@ -155,93 +159,98 @@ impl ColumnStatisticsState { } } -fn column_update_hll_cardinality(col: &Column, ty: &DataType, hll: &mut ColumnDistinctHLL) { - if let DataType::Nullable(inner) = ty { - let col = col.as_nullable().unwrap(); - for (i, v) in col.validity.iter().enumerate() { - if v { - let scalar = unsafe { col.column.index_unchecked(i) }; - scalar_update_hll_cardinality(&scalar, inner, hll); - } - } - return; - } +pub trait ColumnNDVEstimator: Send + Sync { + fn update_column(&mut self, column: &Column); + fn update_scalar(&mut self, scalar: &ScalarRef); + fn finalize(&self) -> u64; +} - with_number_mapped_type!(|NUM_TYPE| match ty { +pub fn create_estimator(data_type: &DataType) -> Box { + let inner_type = data_type.remove_nullable(); + with_number_mapped_type!(|NUM_TYPE| match inner_type { DataType::Number(NumberDataType::NUM_TYPE) => { - let col = NumberType::::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } + ColumnNDVEstimatorImpl::>::create() } DataType::String => { - let col = StringType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(&v); - } + ColumnNDVEstimatorImpl::::create() } DataType::Date => { - let col = DateType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } + ColumnNDVEstimatorImpl::::create() } DataType::Timestamp => { - let col = TimestampType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } + ColumnNDVEstimatorImpl::::create() + } + DataType::Decimal(s) if s.can_carried_by_128() => { + ColumnNDVEstimatorImpl::::create() } DataType::Decimal(_) => { - match col { - Column::Decimal(DecimalColumn::Decimal128(col, _)) => { - for v in col.iter() { - hll.add_object(v); - } - } - Column::Decimal(DecimalColumn::Decimal256(col, _)) => { - for v in col.iter() { - hll.add_object(v); - } - } - _ => unreachable!(), - }; + ColumnNDVEstimatorImpl::::create() } - _ => unreachable!("Unsupported data type: {:?}", ty), - }); + _ => unreachable!("Unsupported data type: {:?}", data_type), + }) } -fn scalar_update_hll_cardinality(scalar: &ScalarRef, ty: &DataType, hll: &mut ColumnDistinctHLL) { - if matches!(scalar, ScalarRef::Null) { - return; - } +pub struct ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + T::Scalar: Hash, +{ + hll: ColumnDistinctHLL, + _phantom: PhantomData, +} - let ty = ty.remove_nullable(); +impl ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + T::Scalar: Hash, +{ + pub fn create() -> Box { + Box::new(Self { + hll: ColumnDistinctHLL::new(), + _phantom: Default::default(), + }) + } +} - with_number_mapped_type!(|NUM_TYPE| match ty { - DataType::Number(NumberDataType::NUM_TYPE) => { - let val = NumberType::::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::String => 
{ - let val = StringType::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::Date => { - let val = DateType::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::Timestamp => { - let val = TimestampType::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::Decimal(_) => { - match scalar { - ScalarRef::Decimal(DecimalScalar::Decimal128(v, _)) => hll.add_object(&v), - ScalarRef::Decimal(DecimalScalar::Decimal256(v, _)) => hll.add_object(&v), - _ => unreachable!(), +impl ColumnNDVEstimator for ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + T::Scalar: Hash, +{ + fn update_column(&mut self, column: &Column) { + if let Column::Nullable(box inner) = column { + let validity_len = inner.validity.len(); + let column = T::try_downcast_column(&inner.column).unwrap(); + if inner.validity.true_count() as f64 / validity_len as f64 >= SELECTIVITY_THRESHOLD { + for (data, valid) in T::iter_column(&column).zip(inner.validity.iter()) { + if valid { + self.hll.add_object(&T::to_owned_scalar(data)); + } + } + } else { + TrueIdxIter::new(validity_len, Some(&inner.validity)).for_each(|idx| { + let val = unsafe { T::index_column_unchecked(&column, idx) }; + self.hll.add_object(&T::to_owned_scalar(val)); + }) + } + } else { + let column = T::try_downcast_column(column).unwrap(); + for value in T::iter_column(&column) { + self.hll.add_object(&T::to_owned_scalar(value)); } } - _ => unreachable!("Unsupported data type: {:?}", ty), - }); + } + + fn update_scalar(&mut self, scalar: &ScalarRef) { + if matches!(scalar, ScalarRef::Null) { + return; + } + + let val = T::try_downcast_scalar(scalar).unwrap(); + self.hll.add_object(&T::to_owned_scalar(val)); + } + + fn finalize(&self) -> u64 { + self.hll.count() as u64 + } } From f0b0d930b2add9290e9f2bf59b906db39f0343a5 Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 19 Jun 2025 04:05:04 +0800 Subject: [PATCH 33/36] add column min max state --- .../fuse/src/io/write/stream/block_builder.rs | 4 +- .../src/io/write/stream/column_statistics.rs | 362 ++++++++++++++---- 2 files changed, 289 insertions(+), 77 deletions(-) diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 2bff1283e2fdd..c125ae026d49f 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -358,7 +358,7 @@ pub struct StreamBlockProperties { source_schema: TableSchemaRef, cluster_stats_builder: Arc, - stats_columns: Vec, + stats_columns: Vec<(ColumnId, DataType)>, distinct_columns: Vec<(ColumnId, DataType)>, bloom_columns_map: BTreeMap, ngram_args: Vec, @@ -420,7 +420,7 @@ impl StreamBlockProperties { let data_type = DataType::from(field.data_type()); if RangeIndex::supported_type(&data_type) && column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID { - stats_columns.push(column_id); + stats_columns.push((column_id, data_type.clone())); if !bloom_column_ids.contains(&column_id) { distinct_columns.push((column_id, data_type)); } diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs index 1e11c56af4c1c..161245c9f89b5 100644 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under 
the License. +use std::cmp::Ordering; use std::collections::HashMap; use std::hash::Hash; use std::marker::PhantomData; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::boolean::TrueIdxIter; use databend_common_expression::types::DataType; use databend_common_expression::types::DateType; +use databend_common_expression::types::Decimal; use databend_common_expression::types::Decimal128Type; use databend_common_expression::types::Decimal256Type; use databend_common_expression::types::NumberDataType; @@ -36,25 +39,26 @@ use databend_common_expression::ScalarRef; use databend_common_expression::TableSchemaRef; use databend_common_expression::Value; use databend_common_expression::SELECTIVITY_THRESHOLD; -use databend_common_functions::aggregates::eval_aggr; use databend_storages_common_table_meta::meta::ColumnDistinctHLL; use databend_storages_common_table_meta::meta::ColumnStatistics; use databend_storages_common_table_meta::meta::StatisticsOfColumns; -use crate::statistics::reducers::reduce_column_statistics; use crate::statistics::traverse_values_dfs; use crate::statistics::Trim; pub struct ColumnStatisticsState { - col_stats: HashMap>, + col_stats: HashMap>, distinct_columns: HashMap>, } impl ColumnStatisticsState { - pub fn new(stats_columns: &[ColumnId], distinct_columns: &[(ColumnId, DataType)]) -> Self { + pub fn new( + stats_columns: &[(ColumnId, DataType)], + distinct_columns: &[(ColumnId, DataType)], + ) -> Self { let col_stats = stats_columns .iter() - .map(|&col_id| (col_id, Vec::new())) + .map(|(col_id, data_type)| (*col_id, create_column_minmax_state(data_type))) .collect(); let distinct_columns = distinct_columns @@ -74,63 +78,20 @@ impl ColumnStatisticsState { for (column_id, col, data_type) in leaves { match col { Value::Scalar(s) => { - let unset_bits = if s == Scalar::Null { rows } else { 0 }; - // when we read it back from parquet, it is a Column instead of Scalar - let in_memory_size = s.as_ref().estimated_scalar_repeat_size(rows, &data_type); - let col_stats = ColumnStatistics::new( - s.clone(), - s.clone(), - unset_bits as u64, - in_memory_size as u64, - None, + self.col_stats.get_mut(&column_id).unwrap().update_scalar( + &s.as_ref(), + rows, + &data_type, ); if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { estimator.update_scalar(&s.as_ref()); } - self.col_stats.get_mut(&column_id).unwrap().push(col_stats); } Value::Column(col) => { - // later, during the evaluation of expressions, name of field does not matter - let mut min = Scalar::Null; - let mut max = Scalar::Null; - - let (mins, _) = eval_aggr("min", vec![], &[col.clone()], rows, vec![])?; - if mins.len() > 0 { - min = if let Some(v) = mins.index(0) { - // safe upwrap. 
- v.to_owned().trim_min().unwrap() - } else { - self.col_stats.remove(&column_id); - continue; - } - } - - let (maxs, _) = eval_aggr("max", vec![], &[col.clone()], rows, vec![])?; - if maxs.len() > 0 { - max = if let Some(v) = maxs.index(0) { - if let Some(v) = v.to_owned().trim_max() { - v - } else { - self.col_stats.remove(&column_id); - continue; - } - } else { - self.col_stats.remove(&column_id); - continue; - } - } - - let (is_all_null, bitmap) = col.validity(); - let unset_bits = match (is_all_null, bitmap) { - (true, _) => rows, - (false, Some(bitmap)) => bitmap.null_count(), - (false, None) => 0, - }; - let in_memory_size = col.memory_size() as u64; - let col_stats = - ColumnStatistics::new(min, max, unset_bits as u64, in_memory_size, None); - self.col_stats.get_mut(&column_id).unwrap().push(col_stats); - + self.col_stats + .get_mut(&column_id) + .unwrap() + .update_column(&col); // use distinct count calculated by the xor hash function to avoid repetitive operation. if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { estimator.update_column(&col); @@ -146,14 +107,14 @@ impl ColumnStatisticsState { column_distinct_count: HashMap, ) -> Result { let mut statistics = StatisticsOfColumns::with_capacity(self.col_stats.len()); - for (id, stats) in &self.col_stats { - let mut col_stats = reduce_column_statistics(stats); - if let Some(count) = column_distinct_count.get(id) { + for (id, stats) in self.col_stats { + let mut col_stats = stats.finalize()?; + if let Some(count) = column_distinct_count.get(&id) { col_stats.distinct_of_values = Some(*count as u64); - } else if let Some(estimator) = self.distinct_columns.get(id) { + } else if let Some(estimator) = self.distinct_columns.get(&id) { col_stats.distinct_of_values = Some(estimator.finalize()); } - statistics.insert(*id, col_stats); + statistics.insert(id, col_stats); } Ok(statistics) } @@ -193,7 +154,7 @@ pub fn create_estimator(data_type: &DataType) -> Box { pub struct ColumnNDVEstimatorImpl where T: ValueType + Send + Sync, - T::Scalar: Hash, + for<'a> T::ScalarRef<'a>: Hash, { hll: ColumnDistinctHLL, _phantom: PhantomData, @@ -202,7 +163,7 @@ where impl ColumnNDVEstimatorImpl where T: ValueType + Send + Sync, - T::Scalar: Hash, + for<'a> T::ScalarRef<'a>: Hash, { pub fn create() -> Box { Box::new(Self { @@ -215,28 +176,37 @@ where impl ColumnNDVEstimator for ColumnNDVEstimatorImpl where T: ValueType + Send + Sync, - T::Scalar: Hash, + for<'a> T::ScalarRef<'a>: Hash, { fn update_column(&mut self, column: &Column) { - if let Column::Nullable(box inner) = column { - let validity_len = inner.validity.len(); - let column = T::try_downcast_column(&inner.column).unwrap(); - if inner.validity.true_count() as f64 / validity_len as f64 >= SELECTIVITY_THRESHOLD { - for (data, valid) in T::iter_column(&column).zip(inner.validity.iter()) { + let (column, validity) = if let Column::Nullable(box inner) = column { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } else { + (column, None) + }; + + let column = T::try_downcast_column(column).unwrap(); + if let Some(v) = validity { + if v.true_count() as f64 / v.len() as f64 >= SELECTIVITY_THRESHOLD { + for (data, valid) in T::iter_column(&column).zip(v.iter()) { if valid { - self.hll.add_object(&T::to_owned_scalar(data)); + self.hll.add_object(&data); } } } else { - TrueIdxIter::new(validity_len, Some(&inner.validity)).for_each(|idx| { + TrueIdxIter::new(v.len(), Some(v)).for_each(|idx| { let val = 
unsafe { T::index_column_unchecked(&column, idx) }; - self.hll.add_object(&T::to_owned_scalar(val)); + self.hll.add_object(&val); }) } } else { - let column = T::try_downcast_column(column).unwrap(); for value in T::iter_column(&column) { - self.hll.add_object(&T::to_owned_scalar(value)); + self.hll.add_object(&value); } } } @@ -247,10 +217,252 @@ where } let val = T::try_downcast_scalar(scalar).unwrap(); - self.hll.add_object(&T::to_owned_scalar(val)); + self.hll.add_object(&val); } fn finalize(&self) -> u64 { self.hll.count() as u64 } } + +pub trait ColumnMinMaxState: Send + Sync { + fn update_column(&mut self, column: &Column); + + fn update_scalar(&mut self, scalar: &ScalarRef, num_rows: usize, data_type: &DataType); + + fn finalize(self: Box) -> Result; +} + +pub trait MinMaxAdapter: Send + Sync { + type Value: Clone + Send + Sync; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value; + + fn value_to_scalar(val: Self::Value) -> T::Scalar; + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering); +} + +pub struct CommonAdapter; + +impl MinMaxAdapter for CommonAdapter +where + T: ValueType, + T::Scalar: Send + Sync, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + type Value = T::Scalar; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value { + T::to_owned_scalar(val) + } + + fn value_to_scalar(val: Self::Value) -> T::Scalar { + val + } + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering) { + if scalar.partial_cmp(&T::to_scalar_ref(value)) == Some(ordering) { + *value = T::to_owned_scalar(scalar); + } + } +} + +pub struct DecimalAdapter; + +impl MinMaxAdapter for DecimalAdapter +where + T: ValueType, + T::Scalar: Decimal + Send + Sync, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + type Value = ::U64Array; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value { + T::Scalar::to_u64_array(T::to_owned_scalar(val)) + } + + fn value_to_scalar(val: Self::Value) -> T::Scalar { + T::Scalar::from_u64_array(val) + } + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering) { + let val = T::Scalar::from_u64_array(*value); + if scalar.partial_cmp(&T::to_scalar_ref(&val)) == Some(ordering) { + *value = T::Scalar::to_u64_array(T::to_owned_scalar(scalar)); + } + } +} + +pub fn create_column_minmax_state(data_type: &DataType) -> Box { + let inner_type = data_type.remove_nullable(); + with_number_mapped_type!(|NUM_TYPE| match inner_type { + DataType::Number(NumberDataType::NUM_TYPE) => { + GenericColumnMinMaxState::, CommonAdapter>::create(inner_type) + } + DataType::String => { + GenericColumnMinMaxState::::create(inner_type) + } + DataType::Date => { + GenericColumnMinMaxState::::create(inner_type) + } + DataType::Timestamp => { + GenericColumnMinMaxState::::create(inner_type) + } + DataType::Decimal(s) if s.can_carried_by_128() => { + GenericColumnMinMaxState::::create(inner_type) + } + DataType::Decimal(_) => { + GenericColumnMinMaxState::::create(inner_type) + } + _ => unreachable!("Unsupported data type: {:?}", data_type), + }) +} + +pub struct GenericColumnMinMaxState +where + T: ValueType, + A: MinMaxAdapter, +{ + min: Option, + max: Option, + null_count: usize, + in_memory_size: usize, + data_type: DataType, + + _phantom: PhantomData<(T, A)>, +} + +impl GenericColumnMinMaxState +where + T: ValueType + Send + Sync, + T::Scalar: Send + Sync, + A: MinMaxAdapter + 'static, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + pub fn create(data_type: DataType) -> Box { + 
Box::new(Self { + min: None, + max: None, + null_count: 0, + in_memory_size: 0, + data_type, + _phantom: PhantomData, + }) + } + + fn add_batch<'a, I>(&mut self, mut iter: I) + where I: Iterator> { + let first = iter.next().unwrap(); + let mut min = first.clone(); + let mut max = first; + for v in iter { + if matches!(min.partial_cmp(&v), Some(Ordering::Greater)) { + min = v; + } else if matches!(max.partial_cmp(&v), Some(Ordering::Less)) { + max = v; + } + } + + self.add(min, max); + } + + fn add(&mut self, min: T::ScalarRef<'_>, max: T::ScalarRef<'_>) { + if let Some(val) = self.min.as_mut() { + A::update_value(val, min, Ordering::Less); + } else { + self.min = Some(A::scalar_to_value(min)); + } + + if let Some(val) = self.max.as_mut() { + A::update_value(val, max, Ordering::Greater); + } else { + self.max = Some(A::scalar_to_value(max)); + } + } +} + +impl ColumnMinMaxState for GenericColumnMinMaxState +where + T: ValueType + Send + Sync, + T::Scalar: Send + Sync, + A: MinMaxAdapter + 'static, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + fn update_column(&mut self, column: &Column) { + self.in_memory_size += column.memory_size(); + let (column, validity) = if let Column::Nullable(box inner) = column { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } else { + (column, None) + }; + self.null_count += validity.map_or(0, |v| v.null_count()); + + let column = T::try_downcast_column(column).unwrap(); + if let Some(v) = validity { + if v.true_count() as f64 / v.len() as f64 >= SELECTIVITY_THRESHOLD { + let column_iter = T::iter_column(&column); + let value_iter = column_iter + .zip(v.iter()) + .filter(|(_, v)| *v) + .map(|(v, _)| v); + self.add_batch(value_iter); + } else { + for idx in TrueIdxIter::new(v.len(), Some(v)) { + let v = unsafe { T::index_column_unchecked(&column, idx) }; + self.add(v.clone(), v); + } + } + } else { + let column_iter = T::iter_column(&column); + self.add_batch(column_iter); + } + } + + fn update_scalar(&mut self, scalar: &ScalarRef, num_rows: usize, data_type: &DataType) { + // when we read it back from parquet, it is a Column instead of Scalar + self.in_memory_size += scalar.estimated_scalar_repeat_size(num_rows, data_type); + if scalar.is_null() { + self.null_count += num_rows; + return; + } + + let val = T::try_downcast_scalar(scalar).unwrap(); + self.add(val.clone(), val); + } + + fn finalize(self: Box) -> Result { + let min = if let Some(v) = self.min { + let v = A::value_to_scalar(v); + // safe upwrap. 
+ T::upcast_scalar_with_type(v, &self.data_type) + .trim_min() + .unwrap() + } else { + Scalar::Null + }; + let max = if let Some(v) = self.max { + let v = A::value_to_scalar(v); + if let Some(v) = T::upcast_scalar_with_type(v, &self.data_type).trim_max() { + v + } else { + return Err(ErrorCode::Internal("Unable to trim string")); + } + } else { + Scalar::Null + }; + + Ok(ColumnStatistics::new( + min, + max, + self.null_count as u64, + self.in_memory_size as u64, + None, + )) + } +} From ca55812c4a46f47060ccf7cd2d87ccc6ca69258b Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 19 Jun 2025 16:52:02 +0800 Subject: [PATCH 34/36] fix --- .../storages/common/index/src/bloom_index.rs | 92 +++++++++++++++---- .../src/io/write/stream/column_statistics.rs | 5 +- 2 files changed, 78 insertions(+), 19 deletions(-) diff --git a/src/query/storages/common/index/src/bloom_index.rs b/src/query/storages/common/index/src/bloom_index.rs index bf5ec35035e64..59d06f6298a49 100644 --- a/src/query/storages/common/index/src/bloom_index.rs +++ b/src/query/storages/common/index/src/bloom_index.rs @@ -14,6 +14,7 @@ use std::collections::BTreeMap; use std::collections::HashMap; +use std::hash::DefaultHasher; use std::hash::Hasher; use std::ops::ControlFlow; use std::ops::Deref; @@ -35,12 +36,18 @@ use databend_common_expression::types::BinaryType; use databend_common_expression::types::Bitmap; use databend_common_expression::types::Buffer; use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; use databend_common_expression::types::MapType; use databend_common_expression::types::NullableType; use databend_common_expression::types::Number; use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; use databend_common_expression::types::UInt64Type; +use databend_common_expression::types::ValueType; use databend_common_expression::visit_expr; +use databend_common_expression::with_number_mapped_type; use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::ColumnBuilder; @@ -349,6 +356,71 @@ impl BloomIndex { Ok(column) } + pub fn calculate_digest_by_type(data_type: &DataType, column: &Column) -> Result> { + let inner_type = data_type.remove_nullable(); + with_number_mapped_type!(|NUM_TYPE| match inner_type { + DataType::Number(NumberDataType::NUM_TYPE) => { + Self::calculate_nullable_column_digests::>(column) + } + DataType::String => { + Self::calculate_nullable_column_digests::(column) + } + DataType::Date => { + Self::calculate_nullable_column_digests::(column) + } + DataType::Timestamp => { + Self::calculate_nullable_column_digests::(column) + } + _ => Err(ErrorCode::Internal(format!( + "Unsupported data type: {:?}", + data_type + ))), + }) + } + + #[inline(always)] + fn hash_one(v: &T) -> u64 { + let mut hasher = DefaultHasher::default(); + DFHash::hash(v, &mut hasher); + hasher.finish() + } + + fn calculate_nullable_column_digests(column: &Column) -> Result> + where for<'a> T::ScalarRef<'a>: DFHash { + let (column, validity) = if let Column::Nullable(box inner) = column { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } else { + (column, None) + }; + + let capacity = validity.map_or(column.len(), |v| v.true_count() + 1); + let mut result = Vec::with_capacity(capacity); + if 
validity.is_some() { + result.push(0); + } + let column = T::try_downcast_column(column).unwrap(); + if let Some(validity) = validity { + let column_iter = T::iter_column(&column); + let value_iter = column_iter + .zip(validity.iter()) + .filter(|(_, v)| *v) + .map(|(v, _)| v); + for value in value_iter { + result.push(Self::hash_one(&value)); + } + } else { + for value in T::iter_column(&column) { + result.push(Self::hash_one(&value)); + } + } + Ok(result) + } + /// calculate digest for column that may have null values /// /// returns (column, validity) where column is the digest of the column @@ -734,24 +806,8 @@ impl BloomIndexBuilder { } }; - let (column, validity) = - BloomIndex::calculate_nullable_column_digest(&self.func_ctx, &column, &data_type)?; - // create filter per column - if validity.as_ref().map(|v| v.null_count()).unwrap_or(0) > 0 { - let validity = validity.unwrap(); - let it = column.deref().iter().zip(validity.iter()).map( - |(v, b)| { - if !b { - &0 - } else { - v - } - }, - ); - index_column.builder.add_digests(it); - } else { - index_column.builder.add_digests(column.deref()); - } + let column = BloomIndex::calculate_digest_by_type(&data_type, &column)?; + index_column.builder.add_digests(column.deref()); } for index_column in self.ngram_columns.iter_mut() { let field_type = &block.data_type(index_column.index); diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs index 161245c9f89b5..ee520eb5007b3 100644 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs @@ -359,7 +359,10 @@ where for v in iter { if matches!(min.partial_cmp(&v), Some(Ordering::Greater)) { min = v; - } else if matches!(max.partial_cmp(&v), Some(Ordering::Less)) { + continue; + } + + if matches!(max.partial_cmp(&v), Some(Ordering::Less)) { max = v; } } From e7483b5ed981297d1c4763f403325488a43bb03a Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 19 Jun 2025 18:44:02 +0800 Subject: [PATCH 35/36] remove unused codes --- .../interpreter_table_recluster.rs | 44 ++------- .../builders/builder_hilbert_partition.rs | 6 +- .../pipelines/builders/builder_recluster.rs | 2 +- .../aggregator/aggregate_exchange_injector.rs | 1 - .../recluster/recluster_partition_exchange.rs | 23 ++--- .../recluster/recluster_partition_strategy.rs | 22 ----- .../partition/partition_process_strategy.rs | 19 ---- .../transform_window_partition_collect.rs | 4 +- .../src/schedulers/fragments/fragmenter.rs | 23 ----- .../src/schedulers/fragments/plan_fragment.rs | 56 ----------- .../query_fragment_actions_display.rs | 1 - .../flight/v1/exchange/data_exchange.rs | 17 ---- .../flight/v1/exchange/exchange_injector.rs | 6 -- .../flight/v1/exchange/exchange_manager.rs | 12 --- .../src/servers/flight/v1/exchange/mod.rs | 1 - .../flight/v1/scatter/flight_scatter_mod.rs | 92 ------------------- .../src/servers/flight/v1/scatter/mod.rs | 2 - src/query/sql/src/executor/format.rs | 8 -- .../sql/src/executor/physical_plans/common.rs | 2 - .../physical_plans/physical_exchange.rs | 8 -- .../physical_plans/physical_recluster.rs | 3 +- .../planner/format/display_rel_operator.rs | 1 - .../sql/src/planner/optimizer/ir/format.rs | 1 - .../planner/optimizer/ir/property/enforcer.rs | 1 - .../planner/optimizer/ir/property/property.rs | 8 +- .../optimizers/cascades/cost/model.rs | 2 +- src/query/sql/src/planner/plans/exchange.rs | 4 +- 
.../fuse/src/statistics/cluster_statistics.rs | 2 +- 28 files changed, 33 insertions(+), 338 deletions(-) delete mode 100644 src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 602e58df1670a..2468c6b2cd159 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -28,7 +28,6 @@ use databend_common_catalog::table::TableExt; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::type_check::check_function; -use databend_common_expression::types::NumberScalar; use databend_common_expression::DataBlock; use databend_common_expression::Scalar; use databend_common_functions::BUILTIN_FUNCTIONS; @@ -53,8 +52,6 @@ use databend_common_sql::plans::plan_hilbert_sql; use databend_common_sql::plans::replace_with_constant; use databend_common_sql::plans::set_update_stream_columns; use databend_common_sql::plans::BoundColumnRef; -use databend_common_sql::plans::ConstantExpr; -use databend_common_sql::plans::FunctionCall; use databend_common_sql::plans::Plan; use databend_common_sql::plans::ReclusterPlan; use databend_common_sql::IdentifierNormalizer; @@ -433,44 +430,22 @@ impl ReclusterTableInterpreter { // For distributed execution, add an exchange operator to distribute work if is_distributed { - let nodes_num = cluster.nodes.len() as u64; - let scalar_expr = ScalarExpr::FunctionCall(FunctionCall { - span: None, - func_name: "div".to_string(), - params: vec![], - arguments: vec![ - ScalarExpr::FunctionCall(FunctionCall { - span: None, - func_name: "multiply".to_string(), - params: vec![], - arguments: vec![ - ScalarExpr::BoundColumnRef(BoundColumnRef { - span: None, - column: bind_context.columns.last().unwrap().clone(), - }), - ScalarExpr::ConstantExpr(ConstantExpr { - span: None, - value: Scalar::Number(NumberScalar::UInt64(nodes_num)), - }), - ], - }), - ScalarExpr::ConstantExpr(ConstantExpr { - span: None, - value: Scalar::Number(NumberScalar::UInt64(total_partitions as u64)), - }), - ], - }); - // Create an expression for the partition column, // i.e.`range_partition_id(hilbert_range_index({hilbert_keys_str}), [...]) AS _predicate` - let expr = scalar_expr_to_remote_expr(&scalar_expr, plan.output_schema()?.as_ref())?; + let expr = scalar_expr_to_remote_expr( + &ScalarExpr::BoundColumnRef(BoundColumnRef { + span: None, + column: bind_context.columns.last().unwrap().clone(), + }), + plan.output_schema()?.as_ref(), + )?; // Add exchange operator for data distribution, // shuffling data based on the hash of range partition IDs derived from the Hilbert index. 
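
As a rough, self-contained illustration of the change that follows — the modulo routing removed by this patch versus the plain hash shuffle that replaces it — the sketch below compares the two routings for a handful of partition ids. `DefaultHasher` and the node count are stand-ins, not the exchange's actual hash function or cluster topology.

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Modulo exchange (removed): a partition id maps to a node directly.
fn route_by_modulo(partition_id: u64, node_count: u64) -> u64 {
    partition_id % node_count
}

// Normal exchange (kept): the partition id is hashed before picking a node.
fn route_by_hash(partition_id: u64, node_count: u64) -> u64 {
    let mut hasher = DefaultHasher::new();
    partition_id.hash(&mut hasher);
    hasher.finish() % node_count
}

fn main() {
    let node_count = 3;
    for id in 0..6u64 {
        println!(
            "partition {id}: modulo -> node {}, hash -> node {}",
            route_by_modulo(id, node_count),
            route_by_hash(id, node_count)
        );
    }
}

Both schemes spread the Hilbert range partition ids across all executors; the hash variant simply reuses the standard shuffle path instead of a dedicated Modulo exchange.
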
plan = Box::new(PhysicalPlan::Exchange(Exchange { plan_id: 0, input: plan, - kind: FragmentKind::Modulo, + kind: FragmentKind::Normal, keys: vec![expr], allow_adjust_parallelism: true, ignore_exchange: false, @@ -487,8 +462,7 @@ impl ReclusterTableInterpreter { plan_id: 0, input: plan, table_info: table_info.clone(), - range_start: 0, - range_width: total_partitions, + num_partitions: total_partitions, table_meta_timestamps, bytes_per_block, rows_per_block, diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index aebafaa53566d..870db16444a42 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -51,7 +51,7 @@ impl PipelineBuilder { self.main_pipeline.exchange( num_processors, - ReclusterPartitionExchange::create(partition.range_start, partition.range_width), + ReclusterPartitionExchange::create(partition.num_partitions), ); let settings = self.ctx.get_settings(); @@ -85,7 +85,7 @@ impl PipelineBuilder { &settings, processor_id.fetch_add(1, atomic::Ordering::AcqRel), num_processors, - partition.range_width, + partition.num_partitions, window_spill_settings.clone(), disk_spill.clone(), ReclusterPartitionStrategy::new(properties.clone()), @@ -112,7 +112,7 @@ impl PipelineBuilder { &settings, processor_id.fetch_add(1, atomic::Ordering::AcqRel), num_processors, - partition.range_width, + partition.num_partitions, window_spill_settings.clone(), disk_spill.clone(), CompactPartitionStrategy::new( diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index 28b5feaacfd39..b4c9a396c5fcb 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -195,7 +195,7 @@ impl PipelineBuilder { self.main_pipeline.exchange( num_processors, - ReclusterPartitionExchange::create(0, partitions), + ReclusterPartitionExchange::create(partitions), ); let processor_id = AtomicUsize::new(0); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs index 40904ea2c8e16..55688a4347259 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs @@ -233,7 +233,6 @@ impl ExchangeInjector for AggregateInjector { match exchange { DataExchange::Merge(_) => unreachable!(), DataExchange::Broadcast(_) => unreachable!(), - DataExchange::Modulo(_) => unreachable!(), DataExchange::ShuffleDataExchange(exchange) => { Ok(Arc::new(Box::new(HashTableHashScatter { buckets: exchange.destination_ids.len(), diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index 444c81296de26..7fc006d3afad2 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -22,16 +22,14 @@ use databend_common_pipeline_core::processors::Exchange; use 
crate::pipelines::processors::transforms::WindowPartitionMeta; pub struct ReclusterPartitionExchange { - start: u64, - width: usize, + num_partitions: usize, start_time: Instant, } impl ReclusterPartitionExchange { - pub fn create(start: u64, width: usize) -> Arc { + pub fn create(num_partitions: usize) -> Arc { Arc::new(ReclusterPartitionExchange { - start, - width, + num_partitions, start_time: Instant::now(), }) } @@ -50,18 +48,21 @@ impl Exchange for ReclusterPartitionExchange { // Scatter the data block to different partitions. let indices = range_ids .iter() - .map(|&id| (id - self.start) as u16) + .map(|&id| (id % self.num_partitions as u64) as u16) .collect::>(); data_block.pop_columns(1); - - let scatter_indices = DataBlock::divide_indices_by_scatter_size(&indices, self.width); + let scatter_indices = + DataBlock::divide_indices_by_scatter_size(&indices, self.num_partitions); // Partition the data blocks to different processors. let mut output_data_blocks = vec![vec![]; n]; - for (partition_id, indices) in scatter_indices.into_iter().take(self.width).enumerate() { + for (partition_id, indices) in scatter_indices + .into_iter() + .take(self.num_partitions) + .enumerate() + { if !indices.is_empty() { - let target = (partition_id * n) / self.width; let block = data_block.take_with_optimize_size(&indices)?; - output_data_blocks[target].push((partition_id, block)); + output_data_blocks[partition_id % n].push((partition_id, block)); } } log::info!("Recluster range exchange: {:?}", self.start_time.elapsed()); diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs index 9c8a4573171a1..d8f3443c4c6e0 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -37,17 +37,6 @@ impl ReclusterPartitionStrategy { impl PartitionProcessStrategy for ReclusterPartitionStrategy { const NAME: &'static str = "Recluster"; - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec { - (0..num_partitions) - .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) - .collect() - } - /// Stream write each block, and flush it conditionally based on builder status /// and input size estimation. fn process_data_blocks(&self, data_blocks: Vec) -> Result> { @@ -127,17 +116,6 @@ impl CompactPartitionStrategy { impl PartitionProcessStrategy for CompactPartitionStrategy { const NAME: &'static str = "Compact"; - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec { - (0..num_partitions) - .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) - .collect() - } - /// Collects blocks into batches and merges them via `concat` when size threshold is reached. 
fn process_data_blocks(&self, data_blocks: Vec) -> Result> { let blocks_num = data_blocks.len(); diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs index bec3f8a84e91f..cffa542136623 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs @@ -22,14 +22,6 @@ use databend_common_settings::Settings; pub trait PartitionProcessStrategy: Send + Sync + 'static { const NAME: &'static str; - /// Partition assignment: map partition index to processor via proportional mapping. - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec; - fn process_data_blocks(&self, data_blocks: Vec) -> Result>; } @@ -66,17 +58,6 @@ impl WindowPartitionStrategy { impl PartitionProcessStrategy for WindowPartitionStrategy { const NAME: &'static str = "Window"; - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec { - (0..num_partitions) - .filter(|&partition| partition % num_processors == processor_id) - .collect() - } - fn process_data_blocks(&self, data_blocks: Vec) -> Result> { let data_blocks = data_blocks .into_iter() diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index d1f011404223b..0171af6053d7c 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -92,7 +92,9 @@ impl TransformPartitionCollect { strategy: S, ) -> Result { // Calculate the partition ids collected by the processor. - let partitions = strategy.calc_partitions(processor_id, num_processors, num_partitions); + let partitions: Vec = (0..num_partitions) + .filter(|&partition| partition % num_processors == processor_id) + .collect(); // Map each partition id to new partition id. 
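
The assignment just above replaces the per-strategy `calc_partitions` methods deleted earlier in this patch: each processor keeps the partitions whose id is congruent to its own id modulo the processor count. A stand-alone sketch of what that yields (the processor and partition counts here are made up):

// Round-robin partition-to-processor assignment, mirroring the filter above.
fn partitions_for(processor_id: usize, num_processors: usize, num_partitions: usize) -> Vec<usize> {
    (0..num_partitions)
        .filter(|&partition| partition % num_processors == processor_id)
        .collect()
}

fn main() {
    let (num_processors, num_partitions) = (4, 10);
    for processor_id in 0..num_processors {
        println!(
            "processor {processor_id}: {:?}",
            partitions_for(processor_id, num_processors, num_partitions)
        );
    }
}
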
let mut partition_id = vec![0; num_partitions]; diff --git a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index dc267c896dc17..5b83c20a670d8 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -15,7 +15,6 @@ use std::sync::Arc; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_types::NodeInfo; use databend_common_sql::executor::physical_plans::CompactSource; @@ -27,7 +26,6 @@ use databend_common_sql::executor::physical_plans::ExchangeSink; use databend_common_sql::executor::physical_plans::ExchangeSource; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::HashJoin; -use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationSource; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::executor::physical_plans::ReplaceInto; @@ -41,7 +39,6 @@ use crate::schedulers::PlanFragment; use crate::servers::flight::v1::exchange::BroadcastExchange; use crate::servers::flight::v1::exchange::DataExchange; use crate::servers::flight::v1::exchange::MergeExchange; -use crate::servers::flight::v1::exchange::ModuloExchange; use crate::servers::flight::v1::exchange::ShuffleDataExchange; use crate::sessions::QueryContext; use crate::sql::executor::PhysicalPlan; @@ -69,7 +66,6 @@ enum State { Compact, Recluster, Other, - HilbertRecluster, } impl Fragmenter { @@ -118,15 +114,6 @@ impl Fragmenter { FragmentKind::Expansive => { Ok(Some(BroadcastExchange::create(Self::get_executors(ctx)))) } - FragmentKind::Modulo => { - if plan.keys.len() != 1 { - return Err(ErrorCode::Internal("Modulo exchange require one key")); - } - Ok(Some(ModuloExchange::create( - Self::get_executors(ctx), - plan.keys[0].clone(), - ))) - } _ => Ok(None), }, _ => Ok(None), @@ -213,15 +200,6 @@ impl PhysicalPlanReplacer for Fragmenter { Ok(PhysicalPlan::Recluster(Box::new(plan.clone()))) } - fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { - let input = self.replace(&plan.input)?; - self.state = State::HilbertRecluster; - Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { - input: Box::new(input), - ..plan.clone() - }))) - } - fn replace_compact_source(&mut self, plan: &CompactSource) -> Result { self.state = State::Compact; Ok(PhysicalPlan::CompactSource(Box::new(plan.clone()))) @@ -323,7 +301,6 @@ impl PhysicalPlanReplacer for Fragmenter { State::ReplaceInto => FragmentType::ReplaceInto, State::Compact => FragmentType::Compact, State::Recluster => FragmentType::Recluster, - State::HilbertRecluster => FragmentType::HilbertRecluster, }; self.state = State::Other; let exchange = Self::get_exchange(self.ctx.clone(), &plan)?; diff --git a/src/query/service/src/schedulers/fragments/plan_fragment.rs b/src/query/service/src/schedulers/fragments/plan_fragment.rs index fab77a79d29f5..2f52da8b04ce3 100644 --- a/src/query/service/src/schedulers/fragments/plan_fragment.rs +++ b/src/query/service/src/schedulers/fragments/plan_fragment.rs @@ -28,7 +28,6 @@ use databend_common_sql::executor::physical_plans::CompactSource; use databend_common_sql::executor::physical_plans::ConstantTableScan; use databend_common_sql::executor::physical_plans::CopyIntoTable; use 
databend_common_sql::executor::physical_plans::CopyIntoTableSource; -use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationSource; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::executor::physical_plans::ReplaceDeduplicate; @@ -65,7 +64,6 @@ pub enum FragmentType { Compact, Recluster, MutationSource, - HilbertRecluster, } #[derive(Clone)] @@ -138,9 +136,6 @@ impl PlanFragment { FragmentType::Recluster => { self.redistribute_recluster(ctx, &mut fragment_actions)?; } - FragmentType::HilbertRecluster => { - self.redistribute_hilbert(ctx, &mut fragment_actions)?; - } } if let Some(ref exchange) = self.exchange { @@ -381,40 +376,6 @@ impl PlanFragment { Ok(()) } - fn redistribute_hilbert( - &self, - ctx: Arc, - fragment_actions: &mut QueryFragmentActions, - ) -> Result<()> { - let exchange_sink = match &self.plan { - PhysicalPlan::ExchangeSink(plan) => plan, - _ => unreachable!("logic error"), - }; - let hilbert = match exchange_sink.input.as_ref() { - PhysicalPlan::HilbertPartition(plan) => plan, - _ => unreachable!("logic error"), - }; - - let total_ranges = hilbert.range_width; - let executors = Fragmenter::get_executors(ctx); - let num_executors = executors.len(); - let base_width = total_ranges / num_executors; - let remainder = total_ranges % num_executors; - for (executor_idx, executor) in executors.into_iter().enumerate() { - let width = base_width + if executor_idx < remainder { 1 } else { 0 }; - let min = executor_idx * base_width + std::cmp::min(executor_idx, remainder); - let mut plan = self.plan.clone(); - let mut replace_hilbert = ReplaceHilbert { - range_width: width, - range_start: min as u64, - }; - plan = replace_hilbert.replace(&plan)?; - fragment_actions.add_action(QueryFragmentAction::create(executor, plan)); - } - - Ok(()) - } - fn reshuffle( executors: Vec, partitions: Vec, @@ -590,23 +551,6 @@ impl PhysicalPlanReplacer for ReplaceReadSource { } } -struct ReplaceHilbert { - range_width: usize, - range_start: u64, -} - -impl PhysicalPlanReplacer for ReplaceHilbert { - fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { - let input = self.replace(&plan.input)?; - Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { - input: Box::new(input), - range_width: self.range_width, - range_start: self.range_start, - ..plan.clone() - }))) - } -} - struct ReplaceRecluster { tasks: Vec, } diff --git a/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs b/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs index 36d8f0c257eb1..adb0b6c3bcd18 100644 --- a/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs +++ b/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs @@ -72,7 +72,6 @@ impl Display for QueryFragmentActionsWrap<'_> { DataExchange::Merge(_) => writeln!(f, " DataExchange: Merge")?, DataExchange::Broadcast(_) => writeln!(f, " DataExchange: Broadcast")?, DataExchange::ShuffleDataExchange(_) => writeln!(f, " DataExchange: Shuffle")?, - DataExchange::Modulo(_) => writeln!(f, " DataExchange: Modulo")?, } } diff --git a/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs b/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs index 0fba30c72ec7b..f23c7582559a7 100644 --- a/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs +++ b/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs @@ -19,7 
+19,6 @@ pub enum DataExchange { Merge(MergeExchange), Broadcast(BroadcastExchange), ShuffleDataExchange(ShuffleDataExchange), - Modulo(ModuloExchange), } impl DataExchange { @@ -28,7 +27,6 @@ impl DataExchange { DataExchange::Merge(exchange) => vec![exchange.destination_id.clone()], DataExchange::Broadcast(exchange) => exchange.destination_ids.clone(), DataExchange::ShuffleDataExchange(exchange) => exchange.destination_ids.clone(), - DataExchange::Modulo(exchange) => exchange.destination_ids.clone(), } } } @@ -79,18 +77,3 @@ impl BroadcastExchange { DataExchange::Broadcast(BroadcastExchange { destination_ids }) } } - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct ModuloExchange { - pub destination_ids: Vec, - pub shuffle_key: RemoteExpr, -} - -impl ModuloExchange { - pub fn create(destination_ids: Vec, shuffle_key: RemoteExpr) -> DataExchange { - DataExchange::Modulo(ModuloExchange { - destination_ids, - shuffle_key, - }) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs index 5b10b4f346960..4aa65ba175a83 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs @@ -29,7 +29,6 @@ use crate::servers::flight::v1::exchange::ShuffleExchangeParams; use crate::servers::flight::v1::scatter::BroadcastFlightScatter; use crate::servers::flight::v1::scatter::FlightScatter; use crate::servers::flight::v1::scatter::HashFlightScatter; -use crate::servers::flight::v1::scatter::ModFlightScatter; use crate::sessions::QueryContext; pub trait ExchangeInjector: Send + Sync + 'static { @@ -101,11 +100,6 @@ impl ExchangeInjector for DefaultExchangeInjector { local_pos, )? 
} - DataExchange::Modulo(exchange) => ModFlightScatter::try_create( - ctx.get_function_context()?, - &exchange.shuffle_key, - exchange.destination_ids.len(), - )?, })) } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs index 8d96b11c3488d..13a6a57742127 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs @@ -1011,18 +1011,6 @@ impl FragmentCoordinator { .flight_scatter(&info.query_ctx, data_exchange)?, }), )), - DataExchange::Modulo(exchange) => Ok(Some(ExchangeParams::ShuffleExchange( - ShuffleExchangeParams { - exchange_injector: exchange_injector.clone(), - schema: self.physical_plan.output_schema()?, - fragment_id: self.fragment_id, - query_id: info.query_id.to_string(), - executor_id: info.current_executor.to_string(), - destination_ids: exchange.destination_ids.to_owned(), - shuffle_scatter: exchange_injector - .flight_scatter(&info.query_ctx, data_exchange)?, - }, - ))), } } diff --git a/src/query/service/src/servers/flight/v1/exchange/mod.rs b/src/query/service/src/servers/flight/v1/exchange/mod.rs index ada27909df959..194f2cbe1e3e5 100644 --- a/src/query/service/src/servers/flight/v1/exchange/mod.rs +++ b/src/query/service/src/servers/flight/v1/exchange/mod.rs @@ -32,7 +32,6 @@ pub mod serde; pub use data_exchange::BroadcastExchange; pub use data_exchange::DataExchange; pub use data_exchange::MergeExchange; -pub use data_exchange::ModuloExchange; pub use data_exchange::ShuffleDataExchange; pub use exchange_injector::DefaultExchangeInjector; pub use exchange_injector::ExchangeInjector; diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs deleted file mode 100644 index f83fea3f574c2..0000000000000 --- a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::type_check::check_function; -use databend_common_expression::types::DataType; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberScalar; -use databend_common_expression::DataBlock; -use databend_common_expression::Evaluator; -use databend_common_expression::Expr; -use databend_common_expression::FunctionContext; -use databend_common_expression::RemoteExpr; -use databend_common_expression::Scalar; -use databend_common_functions::BUILTIN_FUNCTIONS; - -use crate::servers::flight::v1::scatter::FlightScatter; - -#[derive(Clone)] -pub struct ModFlightScatter { - scatter_size: usize, - func_ctx: FunctionContext, - expr: Expr, -} - -impl ModFlightScatter { - pub fn try_create( - func_ctx: FunctionContext, - expr: &RemoteExpr, - scatter_size: usize, - ) -> Result> { - let expr = check_function( - None, - "modulo", - &[], - &[ - expr.as_expr(&BUILTIN_FUNCTIONS), - Expr::constant( - Scalar::Number(NumberScalar::UInt64(scatter_size as u64)), - Some(DataType::Number(NumberDataType::UInt64)), - ), - ], - &BUILTIN_FUNCTIONS, - )?; - let return_type = expr.data_type(); - if !matches!(return_type, DataType::Number(NumberDataType::UInt64)) { - return Err(ErrorCode::Internal(format!( - "ModFlightScatter expects modulo expression to return UInt64, but got {:?}", - return_type - ))); - } - - Ok(Box::new(ModFlightScatter { - scatter_size, - func_ctx, - expr, - })) - } -} - -impl FlightScatter for ModFlightScatter { - fn execute(&self, data_block: DataBlock) -> Result> { - let evaluator = Evaluator::new(&data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); - let num = data_block.num_rows(); - - let column = evaluator - .run(&self.expr)? 
- .into_full_column(&DataType::Number(NumberDataType::UInt64), num); - let indices = column.as_number().unwrap().as_u_int64().unwrap(); - let data_blocks = DataBlock::scatter(&data_block, indices, self.scatter_size)?; - - let block_meta = data_block.get_meta(); - let mut res = Vec::with_capacity(data_blocks.len()); - for data_block in data_blocks { - res.push(data_block.add_meta(block_meta.cloned())?); - } - - Ok(res) - } -} diff --git a/src/query/service/src/servers/flight/v1/scatter/mod.rs b/src/query/service/src/servers/flight/v1/scatter/mod.rs index 2904ed87684ca..b5f5f900dab71 100644 --- a/src/query/service/src/servers/flight/v1/scatter/mod.rs +++ b/src/query/service/src/servers/flight/v1/scatter/mod.rs @@ -15,9 +15,7 @@ mod flight_scatter; mod flight_scatter_broadcast; mod flight_scatter_hash; -mod flight_scatter_mod; pub use flight_scatter::FlightScatter; pub use flight_scatter_broadcast::BroadcastFlightScatter; pub use flight_scatter_hash::HashFlightScatter; -pub use flight_scatter_mod::ModFlightScatter; diff --git a/src/query/sql/src/executor/format.rs b/src/query/sql/src/executor/format.rs index 29f5bc2529dad..1e3f8879339f3 100644 --- a/src/query/sql/src/executor/format.rs +++ b/src/query/sql/src/executor/format.rs @@ -1639,14 +1639,6 @@ fn exchange_to_format_tree( ), FragmentKind::Expansive => "Broadcast".to_string(), FragmentKind::Merge => "Merge".to_string(), - FragmentKind::Modulo => format!( - "Modulo({})", - plan.keys - .iter() - .map(|key| { key.as_expr(&BUILTIN_FUNCTIONS).sql_display() }) - .collect::>() - .join(", ") - ), })), to_format_tree(&plan.input, metadata, profs, context)?, ])) diff --git a/src/query/sql/src/executor/physical_plans/common.rs b/src/query/sql/src/executor/physical_plans/common.rs index 10859f8391da1..545179b4af4d6 100644 --- a/src/query/sql/src/executor/physical_plans/common.rs +++ b/src/query/sql/src/executor/physical_plans/common.rs @@ -67,8 +67,6 @@ pub enum FragmentKind { // Broadcast Expansive, Merge, - // Partitioned by a specified expression % node_nums - Modulo, } #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Copy)] diff --git a/src/query/sql/src/executor/physical_plans/physical_exchange.rs b/src/query/sql/src/executor/physical_plans/physical_exchange.rs index b4507942dc8ba..1e831519c415b 100644 --- a/src/query/sql/src/executor/physical_plans/physical_exchange.rs +++ b/src/query/sql/src/executor/physical_plans/physical_exchange.rs @@ -81,14 +81,6 @@ impl PhysicalPlanBuilder { allow_adjust_parallelism = false; FragmentKind::Merge } - crate::plans::Exchange::Modulo(scalar) => { - let expr = scalar - .type_check(input_schema.as_ref())? 
- .project_column_ref(|index| input_schema.index_of(&index.to_string()).unwrap()); - let (expr, _) = ConstantFolder::fold(&expr, &self.func_ctx, &BUILTIN_FUNCTIONS); - keys.push(expr.as_remote_expr()); - FragmentKind::Modulo - } }; Ok(PhysicalPlan::Exchange(Exchange { plan_id: 0, diff --git a/src/query/sql/src/executor/physical_plans/physical_recluster.rs b/src/query/sql/src/executor/physical_plans/physical_recluster.rs index 0a5520ccb5ba2..a90df3a9fdef5 100644 --- a/src/query/sql/src/executor/physical_plans/physical_recluster.rs +++ b/src/query/sql/src/executor/physical_plans/physical_recluster.rs @@ -31,9 +31,8 @@ pub struct HilbertPartition { pub plan_id: u32, pub input: Box, pub table_info: TableInfo, + pub num_partitions: usize, pub table_meta_timestamps: TableMetaTimestamps, pub rows_per_block: usize, pub bytes_per_block: usize, - pub range_start: u64, - pub range_width: usize, } diff --git a/src/query/sql/src/planner/format/display_rel_operator.rs b/src/query/sql/src/planner/format/display_rel_operator.rs index a82e84ddc59ac..9835bbedf0cdd 100644 --- a/src/query/sql/src/planner/format/display_rel_operator.rs +++ b/src/query/sql/src/planner/format/display_rel_operator.rs @@ -397,7 +397,6 @@ fn exchange_to_format_tree(id_humanizer: &I, op: &Exchange) -> F Exchange::Broadcast => "Exchange(Broadcast)", Exchange::Merge => "Exchange(Merge)", Exchange::MergeSort => "Exchange(MergeSort)", - Exchange::Modulo(_) => "Exchange(Modulo)", }; match op { diff --git a/src/query/sql/src/planner/optimizer/ir/format.rs b/src/query/sql/src/planner/optimizer/ir/format.rs index 017c9bec97203..f9613af6b35ef 100644 --- a/src/query/sql/src/planner/optimizer/ir/format.rs +++ b/src/query/sql/src/planner/optimizer/ir/format.rs @@ -66,7 +66,6 @@ fn display_rel_op(rel_op: &RelOperator) -> String { Exchange::Broadcast => "Broadcast".to_string(), Exchange::Merge => "Merge".to_string(), Exchange::MergeSort => "MergeSort".to_string(), - Exchange::Modulo(scalar) => format!("Modulo({})", scalar.as_raw_expr()), }) } RelOperator::DummyTableScan(_) => "DummyTableScan".to_string(), diff --git a/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs b/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs index 69abb144e11a4..1229898f5bef4 100644 --- a/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs +++ b/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs @@ -73,7 +73,6 @@ impl Enforcer for DistributionEnforcer { Distribution::Random | Distribution::Any => Err(ErrorCode::Internal( "Cannot enforce random or any distribution", )), - Distribution::Modulo(key) => Ok(Exchange::Modulo(key.clone()).into()), } } } diff --git a/src/query/sql/src/planner/optimizer/ir/property/property.rs b/src/query/sql/src/planner/optimizer/ir/property/property.rs index 3eff4f594a2d2..9ae23730ca7e7 100644 --- a/src/query/sql/src/planner/optimizer/ir/property/property.rs +++ b/src/query/sql/src/planner/optimizer/ir/property/property.rs @@ -92,7 +92,6 @@ pub enum Distribution { Serial, Broadcast, Hash(Vec), - Modulo(Box), } impl Default for Distribution { @@ -111,15 +110,11 @@ impl Distribution { | (Distribution::Random, _) | (Distribution::Serial, Distribution::Serial) | (Distribution::Broadcast, Distribution::Broadcast) - | (Distribution::Hash(_), Distribution::Broadcast) - | (Distribution::Modulo(_), Distribution::Broadcast) => true, + | (Distribution::Hash(_), Distribution::Broadcast) => true, (Distribution::Hash(ref keys), Distribution::Hash(ref other_keys)) => { keys == other_keys } - (Distribution::Modulo(ref 
key), Distribution::Modulo(ref other_key)) => { - key == other_key - } _ => false, } } @@ -140,7 +135,6 @@ impl Display for Distribution { .collect::>() .join(", ") ), - Distribution::Modulo(ref key) => write!(f, "Modulo({})", key.as_raw_expr()), } } } diff --git a/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs b/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs index 5bd737365a76b..6d39e793a7231 100644 --- a/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs +++ b/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs @@ -158,7 +158,7 @@ impl DefaultCostModel { let exchange: Exchange = (*m_expr.plan.clone()).clone().try_into()?; let group = memo.group(m_expr.group_index)?; let cost = match exchange { - Exchange::Hash(_) | Exchange::Modulo(_) => { + Exchange::Hash(_) => { group.stat_info.cardinality * self.network_per_row + group.stat_info.cardinality * self.compute_per_row } diff --git a/src/query/sql/src/planner/plans/exchange.rs b/src/query/sql/src/planner/plans/exchange.rs index db8dffd95d8cf..a7aca885b2ed1 100644 --- a/src/query/sql/src/planner/plans/exchange.rs +++ b/src/query/sql/src/planner/plans/exchange.rs @@ -30,8 +30,7 @@ pub enum Exchange { Hash(Vec), Broadcast, Merge, - MergeSort, // For distributed sort - Modulo(Box), // For recluster + MergeSort, // For distributed sort } impl Operator for Exchange { @@ -50,7 +49,6 @@ impl Operator for Exchange { Exchange::Broadcast => Distribution::Broadcast, Exchange::Merge => Distribution::Serial, Exchange::MergeSort => Distribution::Serial, - Exchange::Modulo(key) => Distribution::Modulo(key.clone()), }, }) } diff --git a/src/query/storages/fuse/src/statistics/cluster_statistics.rs b/src/query/storages/fuse/src/statistics/cluster_statistics.rs index 904446690d93f..f452938fd4c25 100644 --- a/src/query/storages/fuse/src/statistics/cluster_statistics.rs +++ b/src/query/storages/fuse/src/statistics/cluster_statistics.rs @@ -122,7 +122,7 @@ impl ClusterStatsGenerator { let left = unsafe { val.index_unchecked(0) }.to_owned(); min.push(left); - // The maximum in cluster statistics needn't larger than the non-trimmed one. + // The maximum in cluster statistics neednot larger than the non-trimmed one. // So we use trim_min directly. 
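
In other words, trimming only ever shortens a value towards the lexicographically smaller side: that is always sound for a minimum, and it is an acceptable under-estimate for this maximum because cluster statistics tolerate it. A tiny stand-alone illustration (the 5-character limit is invented, not the real trim length):

// A prefix is never greater than the string it was cut from, so a trimmed
// minimum stays a valid lower bound and a trimmed maximum merely shrinks.
fn trim_prefix(s: &str, limit: usize) -> String {
    s.chars().take(limit).collect()
}

fn main() {
    let min = "abcdefgh";
    let max = "abcdezzz";
    let trimmed_min = trim_prefix(min, 5);
    let trimmed_max = trim_prefix(max, 5);
    assert!(trimmed_min.as_str() <= min);
    assert!(trimmed_max.as_str() <= max);
    println!("min: {trimmed_min}, max: {trimmed_max}");
}
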
         let right = unsafe { val.index_unchecked(val.value().len() - 1) }.to_owned();
         max.push(right);

From 7d140eb7b081db31a436107292ae9288c6300451 Mon Sep 17 00:00:00 2001
From: zhyass
Date: Tue, 24 Jun 2025 11:59:30 +0800
Subject: [PATCH 36/36] add cluster sample

---
 src/query/expression/src/block.rs             |   4 +-
 .../src/sampler/fixed_size_sampler.rs         |   2 +-
 src/query/expression/src/sampler/mod.rs       |   1 +
 .../pipelines/builders/builder_recluster.rs   |   9 +-
 src/query/storages/fuse/src/constants.rs      |   1 +
 .../src/io/write/stream/cluster_statistics.rs |   6 +-
 .../src/io/write/stream/column_statistics.rs  | 104 +++++++++++++++++-
 .../storages/fuse/src/operations/append.rs    |  13 ---
 8 files changed, 111 insertions(+), 29 deletions(-)

diff --git a/src/query/expression/src/block.rs b/src/query/expression/src/block.rs
index 03c12fc8f6962..0ee73b994beae 100644
--- a/src/query/expression/src/block.rs
+++ b/src/query/expression/src/block.rs
@@ -504,8 +504,8 @@ impl DataBlock {
     }
 
     #[inline]
-    pub fn remove_column(&mut self, index: usize) {
-        self.entries.remove(index);
+    pub fn remove_column(&mut self, index: usize) -> BlockEntry {
+        self.entries.remove(index)
     }
 
     #[inline]
diff --git a/src/query/expression/src/sampler/fixed_size_sampler.rs b/src/query/expression/src/sampler/fixed_size_sampler.rs
index dd7500d40759b..b1317c38ba693 100644
--- a/src/query/expression/src/sampler/fixed_size_sampler.rs
+++ b/src/query/expression/src/sampler/fixed_size_sampler.rs
@@ -162,7 +162,7 @@ fn compact_indices(indices: &mut Vec<BlockRowIndex>, blocks: &mut Vec<DataBlock>
         .collect();
 }
 
-mod reservoir_sampling {
+pub mod reservoir_sampling {
     use std::num::NonZeroUsize;
 
     use rand::Rng;
diff --git a/src/query/expression/src/sampler/mod.rs b/src/query/expression/src/sampler/mod.rs
index c34b36905bd0f..558770c854f7b 100644
--- a/src/query/expression/src/sampler/mod.rs
+++ b/src/query/expression/src/sampler/mod.rs
@@ -16,4 +16,5 @@ mod fixed_rate_sampler;
 mod fixed_size_sampler;
 
 pub use fixed_rate_sampler::FixedRateSampler;
+pub use fixed_size_sampler::reservoir_sampling::AlgoL;
 pub use fixed_size_sampler::FixedSizeSampler;
diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs
index b4c9a396c5fcb..8364e45d726fa 100644
--- a/src/query/service/src/pipelines/builders/builder_recluster.rs
+++ b/src/query/service/src/pipelines/builders/builder_recluster.rs
@@ -184,12 +184,9 @@ impl PipelineBuilder {
                     task.total_compressed,
                 );
                 let state = SampleState::new(num_processors, partitions);
-                let recluster_pipeline_builder = ReclusterPipelineBuilder::create(
-                    schema.clone(),
-                    sort_desc.clone(),
-                    sample_size,
-                )
-                .with_state(state);
+                let recluster_pipeline_builder =
+                    ReclusterPipelineBuilder::create(schema, sort_desc.clone(), sample_size)
+                        .with_state(state);
                 recluster_pipeline_builder
                     .build_recluster_sample_pipeline(&mut self.main_pipeline)?;
 
diff --git a/src/query/storages/fuse/src/constants.rs b/src/query/storages/fuse/src/constants.rs
index 890153cb5f15b..9f56c913e5ba1 100644
--- a/src/query/storages/fuse/src/constants.rs
+++ b/src/query/storages/fuse/src/constants.rs
@@ -31,6 +31,7 @@ pub const FUSE_TBL_XOR_BLOOM_INDEX_PREFIX: &str = "_i_b_v2";
 pub const FUSE_TBL_SEGMENT_PREFIX: &str = "_sg";
 pub const FUSE_TBL_SNAPSHOT_PREFIX: &str = "_ss";
 pub const FUSE_TBL_SNAPSHOT_STATISTICS_PREFIX: &str = "_ts";
+pub const FUSE_TBL_BLOCK_STATS_PREFIX: &str = "_bs";
 pub const FUSE_TBL_LAST_SNAPSHOT_HINT: &str = "last_snapshot_location_hint";
 pub const FUSE_TBL_LAST_SNAPSHOT_HINT_V2: &str = "last_snapshot_location_hint_v2";
 pub const FUSE_TBL_VIRTUAL_BLOCK_PREFIX: &str = "_vb";
diff --git a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs
index a0bd91888995e..4c33ff80566db 100644
--- a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs
+++ b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs
@@ -37,7 +37,7 @@ use crate::FuseTable;
 
 pub struct ClusterStatisticsBuilder {
     out_fields: Vec,
     level: i32,
-    cluster_key_id: u32,
+    cluster_key_id: Option<u32>,
     cluster_key_index: Vec<usize>,
     extra_key_num: usize,
@@ -92,7 +92,7 @@ impl ClusterStatisticsBuilder {
             }]
         };
         Ok(Arc::new(Self {
-            cluster_key_id: table.cluster_key_meta.as_ref().unwrap().0,
+            cluster_key_id: table.cluster_key_id(),
             cluster_key_index,
             extra_key_num,
             operators,
@@ -185,7 +185,7 @@ impl ClusterStatisticsState {
             max,
             min,
             level,
-            cluster_key_id: self.builder.cluster_key_id,
+            cluster_key_id: self.builder.cluster_key_id.unwrap(),
             pages: None,
         }))
     }
diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs
index ee520eb5007b3..b0da462368914 100644
--- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs
+++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs
@@ -13,13 +13,18 @@
 // limitations under the License.
 
 use std::cmp::Ordering;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::hash::Hash;
 use std::marker::PhantomData;
 
 use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
+use databend_common_expression::sampler::AlgoL;
 use databend_common_expression::types::boolean::TrueIdxIter;
+use databend_common_expression::types::AccessType;
+use databend_common_expression::types::ArrayType;
+use databend_common_expression::types::BinaryColumn;
+use databend_common_expression::types::BinaryType;
 use databend_common_expression::types::DataType;
 use databend_common_expression::types::DateType;
 use databend_common_expression::types::Decimal;
@@ -30,7 +35,8 @@ use databend_common_expression::types::NumberType;
 use databend_common_expression::types::StringType;
 use databend_common_expression::types::TimestampType;
 use databend_common_expression::types::ValueType;
-use databend_common_expression::with_number_mapped_type;
+use databend_common_expression::{with_number_mapped_type, BlockRowIndex};
+use databend_common_expression::BlockEntry;
 use databend_common_expression::Column;
 use databend_common_expression::ColumnId;
 use databend_common_expression::DataBlock;
@@ -39,16 +45,18 @@ use databend_common_expression::ScalarRef;
 use databend_common_expression::TableSchemaRef;
 use databend_common_expression::Value;
 use databend_common_expression::SELECTIVITY_THRESHOLD;
-use databend_storages_common_table_meta::meta::ColumnDistinctHLL;
+use databend_storages_common_table_meta::meta::{ColumnDistinctHLL, Location};
 use databend_storages_common_table_meta::meta::ColumnStatistics;
 use databend_storages_common_table_meta::meta::StatisticsOfColumns;
-
+use rand::rngs::SmallRng;
+use databend_common_expression::types::binary::BinaryColumnBuilder;
 use crate::statistics::traverse_values_dfs;
 use crate::statistics::Trim;
 
 pub struct ColumnStatisticsState {
     col_stats: HashMap<ColumnId, Vec<ColumnStatistics>>,
     distinct_columns: HashMap<ColumnId, Box<dyn ColumnNDVEstimator>>,
+    // cluster_key: Option,
 }
 
 impl ColumnStatisticsState {
@@ -120,6 +128,94 @@ impl ColumnStatisticsState {
     }
 }
 
+#[derive(Debug)]
+pub struct BlockStatisticsState {
+    pub(crate) data: Vec<u8>,
+    pub(crate) size: u64,
+    pub(crate) location: Location,
+}
+
+pub struct ClusterStateSampler {
+    k: usize,
+    origins: Vec<BinaryColumn>,
+    indices: Vec<BlockRowIndex>,
+    core: AlgoL<SmallRng>,
+
+    s: usize,
+}
+
+impl ClusterStateSampler {
+    pub fn new(k: usize, rng: SmallRng) -> Self {
+        let core = AlgoL::new(k.try_into().unwrap(), rng);
+        Self {
+            origins: Vec::new(),
+            indices: Vec::with_capacity(k),
+            k,
+            core,
+            s: usize::MAX,
+        }
+    }
+
+    pub fn add_column(&mut self, data: BinaryColumn) {
+        let rows = data.len();
+        assert!(rows > 0);
+        let block_idx = self.origins.len() as u32;
+        let change = self.add_indices(rows, block_idx);
+        if change {
+            self.origins.push(data);
+        }
+    }
+
+    fn add_indices(&mut self, rows: usize, block_idx: u32) -> bool {
+        let mut change = false;
+        let mut cur = 0;
+
+        // Fill initial reservoir
+        if self.indices.len() < self.k {
+            let remain = self.k - self.indices.len();
+
+            if rows <= remain {
+                self.indices.extend((0..rows).map(|i| (block_idx, i as u32, 1)));
+                if self.indices.len() == self.k {
+                    self.s = self.core.search();
+                }
+                return true;
+            }
+
+            self.indices.extend((0..remain).map(|i| (block_idx, i as u32, 1)));
+            cur += remain;
+            self.s = self.core.search();
+            change = true;
+        }
+
+        // Apply AlgoL
+        while rows - cur > self.s {
+            cur += self.s;
+            let pos = self.core.pos();
+            self.indices[pos] = (block_idx, cur as u32, 1);
+            self.core.update_w();
+            self.s = self.core.search();
+            change = true;
+        }
+
+        self.s -= rows - cur;
+        change
+    }
+
+    pub fn finalize(self) -> BlockEntry {
+        let columns = self.origins;
+        let mut builder = BinaryColumnBuilder::with_capacity(self.k, 0);
+        for (block_index, row, times) in self.indices {
+            let val =
+                unsafe { BinaryType::index_column_unchecked(&columns[block_index as usize], row as usize) };
+            for _ in 0..times {
+                BinaryType::push_item(&mut builder, val.clone())
+            }
+        }
+        BlockEntry::new_const_column_arg::<ArrayType<BinaryType>>(builder.build(), 1)
+    }
+}
+
 pub trait ColumnNDVEstimator: Send + Sync {
     fn update_column(&mut self, column: &Column);
     fn update_scalar(&mut self, scalar: &ScalarRef);
diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs
index 8a2349f8ff359..1d02e58b52f0b 100644
--- a/src/query/storages/fuse/src/operations/append.rs
+++ b/src/query/storages/fuse/src/operations/append.rs
@@ -79,19 +79,6 @@ impl FuseTable {
                 )
             });
         }
-
-        let sort_desc: Vec<SortColumnDescription> = cluster_key_index
-            .iter()
-            .map(|index| SortColumnDescription {
-                offset: *index,
-                asc: true,
-                nulls_first: false,
-            })
-            .collect();
-        let sort_desc: Arc<[_]> = sort_desc.into();
-        pipeline.add_transformer(|| {
-            TransformSortPartial::new(LimitType::None, sort_desc.clone())
-        });
 
         pipeline.add_transform(|input, output| {