Commit c822dcc

add transform hilbert collect

1 parent a116b11 commit c822dcc

12 files changed: +458 -114 lines changed

src/query/expression/src/utils/block_thresholds.rs

Lines changed: 49 additions & 0 deletions

```diff
@@ -166,4 +166,53 @@ impl BlockThresholds {
         };
         total_rows.div_ceil(block_nums.max(1)).max(1)
     }
+
+    /// Calculates the optimal number of partitions (blocks) based on total data size and row count.
+    ///
+    /// # Parameters
+    /// - `total_rows`: The total number of rows in the data.
+    /// - `total_bytes`: The total uncompressed size of the data in bytes.
+    /// - `total_compressed`: The total compressed size of the data in bytes.
+    ///
+    /// # Returns
+    /// - The calculated number of partitions (blocks) needed.
+    #[inline]
+    pub fn calc_partitions_for_recluster(
+        &self,
+        total_rows: usize,
+        total_bytes: usize,
+        total_compressed: usize,
+    ) -> usize {
+        // If the data is already compact enough, return a single partition.
+        if self.check_for_compact(total_rows, total_bytes)
+            && total_compressed < 2 * self.min_compressed_per_block
+        {
+            return 1;
+        }
+
+        // Estimate the number of blocks based on row count and compressed size.
+        let by_rows = std::cmp::max(total_rows / self.max_rows_per_block, 1);
+        let by_compressed = total_compressed / self.max_compressed_per_block;
+        // If row-based block count is greater, use max rows per block as limit.
+        if by_rows >= by_compressed {
+            return by_rows;
+        }
+
+        // Adjust block count based on byte size thresholds.
+        let bytes_per_block = total_bytes.div_ceil(by_compressed);
+        let max_bytes = self.max_bytes_per_block.min(400 * 1024 * 1024);
+        let min_bytes = max_bytes / 2;
+        let total_partitions = if bytes_per_block > max_bytes {
+            // Block size is too large.
+            total_bytes / max_bytes
+        } else if bytes_per_block < min_bytes {
+            // Block size is too small.
+            total_bytes / min_bytes
+        } else {
+            // Block size is acceptable.
+            by_compressed
+        };
+
+        std::cmp::max(total_partitions, 1)
+    }
 }
```
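To make the sizing logic above concrete, here is a standalone sketch with hypothetical threshold constants (`MAX_ROWS`, `MAX_COMPRESSED`, and `MAX_BYTES` are illustrative values, not Databend's actual defaults, and the `check_for_compact` early return is omitted):

```rust
const MAX_ROWS: usize = 1_000_000; // assumed max rows per block
const MAX_COMPRESSED: usize = 16 * 1024 * 1024; // assumed 16 MiB compressed per block
const MAX_BYTES: usize = 100 * 1024 * 1024; // assumed 100 MiB uncompressed per block

fn calc_partitions(total_rows: usize, total_bytes: usize, total_compressed: usize) -> usize {
    let by_rows = (total_rows / MAX_ROWS).max(1);
    let by_compressed = total_compressed / MAX_COMPRESSED;
    // Row count is the binding constraint.
    if by_rows >= by_compressed {
        return by_rows;
    }
    // Check what the compressed-size estimate implies per uncompressed block.
    let bytes_per_block = total_bytes.div_ceil(by_compressed);
    let min_bytes = MAX_BYTES / 2;
    let partitions = if bytes_per_block > MAX_BYTES {
        total_bytes / MAX_BYTES // estimate yields oversized blocks; cap by bytes
    } else if bytes_per_block < min_bytes {
        total_bytes / min_bytes // estimate yields undersized blocks; floor by bytes
    } else {
        by_compressed // estimate falls in the acceptable band
    };
    partitions.max(1)
}

fn main() {
    // 20M rows, 4 GiB raw, 640 MiB compressed:
    // by_rows = 20, by_compressed = 40, bytes_per_block ≈ 102.4 MiB > 100 MiB,
    // so the byte cap wins: 4 GiB / 100 MiB = 40 partitions.
    assert_eq!(calc_partitions(20_000_000, 4 << 30, 640 << 20), 40);
}
```

The two byte-based divisions keep the result anchored to the uncompressed byte budget whenever the compressed-size estimate would produce blocks outside the [max/2, max] band.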

src/query/service/src/interpreters/interpreter_table_recluster.rs

Lines changed: 9 additions & 5 deletions

```diff
@@ -323,12 +323,15 @@ impl ReclusterTableInterpreter
         let total_rows = recluster_info.removed_statistics.row_count as usize;
         let total_compressed = recluster_info.removed_statistics.compressed_byte_size as usize;
 
-        // Determine rows per block based on data size and compression ratio
-        let rows_per_block =
-            block_thresholds.calc_rows_for_recluster(total_rows, total_bytes, total_compressed);
-
+        // Determine rows per block based on data size and compression ratio,
         // Calculate initial partition count based on data volume and block size
-        let total_partitions = std::cmp::max(total_rows / rows_per_block, 1);
+        let total_partitions = block_thresholds.calc_partitions_for_recluster(
+            total_rows,
+            total_bytes,
+            total_compressed,
+        );
+        let bytes_per_block = (total_bytes / total_partitions).max(1);
+        let rows_per_block = (total_rows / total_partitions).max(1);
 
         warn!(
             "Do hilbert recluster, total_bytes: {}, total_rows: {}, total_partitions: {}",
@@ -487,6 +490,7 @@ impl ReclusterTableInterpreter
             range_start: 0,
             range_width: total_partitions,
             table_meta_timestamps,
+            bytes_per_block,
             rows_per_block,
         }));
```
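With the partition count fixed, the per-block targets fall out as simple quotients. A quick worked example (all numbers assumed for illustration): 8 GiB and 80M rows spread over 64 partitions gives 128 MiB and 1.25M rows per block:

```rust
fn main() {
    // Assumed inputs for illustration only.
    let (total_bytes, total_rows, total_partitions) = (8usize << 30, 80_000_000usize, 64);
    let bytes_per_block = (total_bytes / total_partitions).max(1);
    let rows_per_block = (total_rows / total_partitions).max(1);
    assert_eq!(bytes_per_block, 128 << 20); // 128 MiB
    assert_eq!(rows_per_block, 1_250_000);
}
```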

src/query/service/src/pipelines/builders/builder_hilbert_partition.rs

Lines changed: 36 additions & 29 deletions

```diff
@@ -18,7 +18,6 @@ use std::sync::atomic::AtomicUsize;
 use databend_common_catalog::table::Table;
 use databend_common_catalog::table_context::TableContext;
 use databend_common_exception::Result;
-use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE;
 use databend_common_pipeline_core::processors::ProcessorPtr;
 use databend_common_pipeline_transforms::MemorySettings;
 use databend_common_sql::executor::physical_plans::HilbertPartition;
@@ -27,12 +26,12 @@ use databend_common_storages_fuse::operations::TransformBlockWriter;
 use databend_common_storages_fuse::operations::TransformSerializeBlock;
 use databend_common_storages_fuse::statistics::ClusterStatsGenerator;
 use databend_common_storages_fuse::FuseTable;
-use databend_common_storages_fuse::FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD;
 use databend_storages_common_cache::TempDirManager;
 
 use crate::pipelines::memory_settings::MemorySettingsExt;
 use crate::pipelines::processors::transforms::CompactStrategy;
 use crate::pipelines::processors::transforms::HilbertPartitionExchange;
+use crate::pipelines::processors::transforms::TransformHilbertCollect;
 use crate::pipelines::processors::transforms::TransformWindowPartitionCollect;
 use crate::pipelines::PipelineBuilder;
 use crate::spillers::SpillerDiskConfig;
@@ -65,35 +64,25 @@ impl PipelineBuilder {
 
         let window_spill_settings = MemorySettings::from_window_settings(&self.ctx)?;
         let processor_id = AtomicUsize::new(0);
-        let max_bytes_per_block = std::cmp::min(
-            4 * table.get_option(
-                FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD,
-                DEFAULT_BLOCK_BUFFER_SIZE,
-            ),
-            400 * 1024 * 1024,
-        );
-        self.main_pipeline.add_transform(|input, output| {
-            Ok(ProcessorPtr::create(Box::new(
-                TransformWindowPartitionCollect::new(
-                    self.ctx.clone(),
-                    input,
-                    output,
-                    &settings,
-                    processor_id.fetch_add(1, atomic::Ordering::AcqRel),
-                    num_processors,
-                    partition.range_width,
-                    window_spill_settings.clone(),
-                    disk_spill.clone(),
-                    CompactStrategy::new(
-                        partition.rows_per_block,
-                        max_bytes_per_block,
-                        enable_stream_writer,
-                    ),
-                )?,
-            )))
-        })?;
 
         if enable_stream_writer {
+            self.main_pipeline.add_transform(|input, output| {
+                Ok(ProcessorPtr::create(Box::new(
+                    TransformHilbertCollect::new(
+                        self.ctx.clone(),
+                        input,
+                        output,
+                        &settings,
+                        processor_id.fetch_add(1, atomic::Ordering::AcqRel),
+                        num_processors,
+                        partition.range_width,
+                        window_spill_settings.clone(),
+                        disk_spill.clone(),
+                        partition.bytes_per_block,
+                    )?,
+                )))
+            })?;
+
             self.main_pipeline.add_transform(|input, output| {
                 TransformBlockWriter::try_create(
                     self.ctx.clone(),
@@ -103,9 +92,27 @@ impl PipelineBuilder {
                     table,
                     partition.table_meta_timestamps,
                     false,
+                    Some(partition.bytes_per_block),
                 )
             })
         } else {
+            self.main_pipeline.add_transform(|input, output| {
+                Ok(ProcessorPtr::create(Box::new(
+                    TransformWindowPartitionCollect::new(
+                        self.ctx.clone(),
+                        input,
+                        output,
+                        &settings,
+                        processor_id.fetch_add(1, atomic::Ordering::AcqRel),
+                        num_processors,
+                        partition.range_width,
+                        window_spill_settings.clone(),
+                        disk_spill.clone(),
+                        CompactStrategy::new(partition.rows_per_block, partition.bytes_per_block),
+                    )?,
+                )))
+            })?;
+
             self.main_pipeline
                 .add_transform(|transform_input_port, transform_output_port| {
                     let proc = TransformSerializeBlock::try_create(
```
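The builder now wires two distinct pipelines depending on `enable_stream_writer`: the stream path collects whole partitions with `TransformHilbertCollect` and lets `TransformBlockWriter` cut blocks at `bytes_per_block`, while the fallback path keeps `TransformWindowPartitionCollect` with a `CompactStrategy` followed by `TransformSerializeBlock`. A toy sketch of that dispatch (the `Stage` enum and `build_stages` are stand-ins for illustration; the real transforms take many more arguments):

```rust
// Stand-in stage descriptions; not the real pipeline API.
#[derive(Debug)]
enum Stage {
    HilbertCollect { bytes_per_block: usize },
    BlockWriter { max_block_bytes: usize },
    WindowCollect { rows_per_block: usize, bytes_per_block: usize },
    SerializeBlock,
}

fn build_stages(enable_stream_writer: bool, rows_per_block: usize, bytes_per_block: usize) -> Vec<Stage> {
    if enable_stream_writer {
        // Stream path: collect whole partitions, let the writer size the blocks.
        vec![
            Stage::HilbertCollect { bytes_per_block },
            Stage::BlockWriter { max_block_bytes: bytes_per_block },
        ]
    } else {
        // Fallback path: compact blocks per partition, then serialize.
        vec![
            Stage::WindowCollect { rows_per_block, bytes_per_block },
            Stage::SerializeBlock,
        ]
    }
}

fn main() {
    println!("{:?}", build_stages(true, 1_000_000, 128 << 20));
    println!("{:?}", build_stages(false, 1_000_000, 128 << 20));
}
```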

src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs

Lines changed: 1 addition & 11 deletions

```diff
@@ -27,19 +27,13 @@ pub trait DataProcessorStrategy: Send + Sync + 'static {
 pub struct CompactStrategy {
     max_bytes_per_block: usize,
     max_rows_per_block: usize,
-    enable_stream_writer: bool,
 }
 
 impl CompactStrategy {
-    pub fn new(
-        max_rows_per_block: usize,
-        max_bytes_per_block: usize,
-        enable_stream_writer: bool,
-    ) -> Self {
+    pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self {
         Self {
             max_bytes_per_block,
             max_rows_per_block,
-            enable_stream_writer,
         }
     }
 
@@ -56,10 +50,6 @@ impl DataProcessorStrategy for CompactStrategy {
     const NAME: &'static str = "Compact";
 
     fn process_data_blocks(&self, data_blocks: Vec<DataBlock>) -> Result<Vec<DataBlock>> {
-        if self.enable_stream_writer {
-            return Ok(data_blocks);
-        }
-
         let blocks_num = data_blocks.len();
         if blocks_num < 2 {
             return Ok(data_blocks);
```
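With the stream-writer bypass gone, `process_data_blocks` always attempts to merge undersized blocks up to the row/byte caps. A toy stand-in for that merge loop (operating on plain byte buffers rather than `DataBlock`s, and greatly simplified relative to the real method):

```rust
// Toy compaction: greedily concatenate buffers until a cap would be exceeded.
// Plain Vec<u8> stands in for DataBlock; only the byte cap is modeled here.
fn compact(blocks: Vec<Vec<u8>>, max_bytes_per_block: usize) -> Vec<Vec<u8>> {
    if blocks.len() < 2 {
        return blocks; // nothing to merge, mirroring the blocks_num < 2 early return
    }
    let mut out: Vec<Vec<u8>> = Vec::new();
    for block in blocks {
        match out.last_mut() {
            Some(last) if last.len() + block.len() <= max_bytes_per_block => {
                last.extend_from_slice(&block)
            }
            _ => out.push(block),
        }
    }
    out
}

fn main() {
    let blocks = vec![vec![0u8; 3], vec![0u8; 4], vec![0u8; 5]];
    let merged = compact(blocks, 8);
    // 3 + 4 = 7 fits under the cap of 8; adding 5 would not, so it starts a new block.
    assert_eq!(merged.iter().map(Vec::len).collect::<Vec<_>>(), vec![7, 5]);
}
```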

src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs

Lines changed: 2 additions & 0 deletions

```diff
@@ -14,6 +14,7 @@
 
 mod data_processor_strategy;
 mod hilbert_partition_exchange;
+mod transform_hilbert_collect;
 mod transform_window_partition_collect;
 mod window_partition_buffer;
 mod window_partition_exchange;
@@ -22,6 +23,7 @@ mod window_partition_partial_top_n_exchange;
 
 pub use data_processor_strategy::*;
 pub use hilbert_partition_exchange::*;
+pub use transform_hilbert_collect::*;
 pub use transform_window_partition_collect::*;
 pub use window_partition_buffer::*;
 pub use window_partition_exchange::*;
```
