From 58690aa31e6601807ec1c3bf2d358c3e3ae3d1ca Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 30 Apr 2025 13:11:51 +0800 Subject: [PATCH 01/36] hilbert recluster support block stream write --- .../expression/src/utils/block_thresholds.rs | 4 +- .../interpreter_table_recluster.rs | 56 ++++++++++------ .../builders/builder_hilbert_partition.rs | 45 +++++++++---- .../partition/data_processor_strategy.rs | 12 +++- .../partition/hilbert_partition_exchange.rs | 30 +++++---- .../src/schedulers/fragments/fragmenter.rs | 17 +++-- .../src/schedulers/fragments/plan_fragment.rs | 66 +++++++++++++++++-- .../physical_plans/physical_recluster.rs | 3 +- .../storages/fuse/src/operations/append.rs | 5 +- 9 files changed, 171 insertions(+), 67 deletions(-) diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index f19a26f6dedee..4fd35638cb863 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -152,8 +152,8 @@ impl BlockThresholds { let bytes_per_block = total_bytes.div_ceil(block_num_by_compressed); // Adjust the number of blocks based on block size thresholds. - let max_bytes_per_block = (4 * self.min_bytes_per_block).min(400 * 1024 * 1024); - let min_bytes_per_block = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); + let max_bytes_per_block = self.max_bytes_per_block.min(400 * 1024 * 1024); + let min_bytes_per_block = self.min_bytes_per_block.min(100 * 1024 * 1024); let block_nums = if bytes_per_block > max_bytes_per_block { // Case 1: If the block size is too bigger. total_bytes.div_ceil(max_bytes_per_block) diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index f3c53597b06d7..558be0d8bdbba 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -28,6 +28,7 @@ use databend_common_catalog::table::TableExt; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::type_check::check_function; +use databend_common_expression::types::NumberScalar; use databend_common_expression::DataBlock; use databend_common_expression::Scalar; use databend_common_functions::BUILTIN_FUNCTIONS; @@ -52,6 +53,8 @@ use databend_common_sql::plans::plan_hilbert_sql; use databend_common_sql::plans::replace_with_constant; use databend_common_sql::plans::set_update_stream_columns; use databend_common_sql::plans::BoundColumnRef; +use databend_common_sql::plans::ConstantExpr; +use databend_common_sql::plans::FunctionCall; use databend_common_sql::plans::Plan; use databend_common_sql::plans::ReclusterPlan; use databend_common_sql::IdentifierNormalizer; @@ -325,19 +328,7 @@ impl ReclusterTableInterpreter { block_thresholds.calc_rows_for_recluster(total_rows, total_bytes, total_compressed); // Calculate initial partition count based on data volume and block size - let mut total_partitions = std::cmp::max(total_rows / rows_per_block, 1); - - // Adjust number of partitions according to the block size thresholds - if total_partitions < block_thresholds.block_per_segment - && block_thresholds.check_perfect_segment( - block_thresholds.block_per_segment, // this effectively by-pass the total_blocks criteria - total_rows, - total_bytes, - total_compressed, - ) - { - total_partitions = block_thresholds.block_per_segment; - } + let total_partitions = 
std::cmp::max(total_rows / rows_per_block, 1); warn!( "Do hilbert recluster, total_bytes: {}, total_rows: {}, total_partitions: {}", @@ -439,15 +430,37 @@ impl ReclusterTableInterpreter { // For distributed execution, add an exchange operator to distribute work if is_distributed { + let nodes_num = cluster.nodes.len() as u64; + let scalar_expr = ScalarExpr::FunctionCall(FunctionCall { + span: None, + func_name: "div".to_string(), + params: vec![], + arguments: vec![ + ScalarExpr::FunctionCall(FunctionCall { + span: None, + func_name: "multiply".to_string(), + params: vec![], + arguments: vec![ + ScalarExpr::BoundColumnRef(BoundColumnRef { + span: None, + column: bind_context.columns.last().unwrap().clone(), + }), + ScalarExpr::ConstantExpr(ConstantExpr { + span: None, + value: Scalar::Number(NumberScalar::UInt64(nodes_num)), + }), + ], + }), + ScalarExpr::ConstantExpr(ConstantExpr { + span: None, + value: Scalar::Number(NumberScalar::UInt64(total_partitions as u64)), + }), + ], + }); + // Create an expression for the partition column, // i.e.`range_partition_id(hilbert_range_index({hilbert_keys_str}), [...]) AS _predicate` - let expr = scalar_expr_to_remote_expr( - &ScalarExpr::BoundColumnRef(BoundColumnRef { - span: None, - column: bind_context.columns.last().unwrap().clone(), - }), - plan.output_schema()?.as_ref(), - )?; + let expr = scalar_expr_to_remote_expr(&scalar_expr, plan.output_schema()?.as_ref())?; // Add exchange operator for data distribution, // shuffling data based on the hash of range partition IDs derived from the Hilbert index. @@ -471,7 +484,8 @@ impl ReclusterTableInterpreter { plan_id: 0, input: plan, table_info: table_info.clone(), - num_partitions: total_partitions, + range_start: 0, + range_width: total_partitions, table_meta_timestamps, rows_per_block, })); diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 86b23bfca9238..2d38a1e3b8281 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -15,6 +15,7 @@ use std::sync::atomic; use std::sync::atomic::AtomicUsize; +use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; @@ -22,6 +23,7 @@ use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationKind; +use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::statistics::ClusterStatsGenerator; use databend_common_storages_fuse::FuseTable; @@ -43,10 +45,12 @@ impl PipelineBuilder { .ctx .build_table_by_table_info(&partition.table_info, None)?; let table = FuseTable::try_from_table(table.as_ref())?; + let enable_stream_writer = self.ctx.get_settings().get_enable_block_stream_write()? 
+ && table.storage_format_as_parquet(); self.main_pipeline.exchange( num_processors, - HilbertPartitionExchange::create(partition.num_partitions), + HilbertPartitionExchange::create(partition.range_start, partition.range_width), ); let settings = self.ctx.get_settings(); @@ -77,26 +81,43 @@ impl PipelineBuilder { &settings, processor_id.fetch_add(1, atomic::Ordering::AcqRel), num_processors, - partition.num_partitions, + partition.range_width, window_spill_settings.clone(), disk_spill.clone(), - CompactStrategy::new(partition.rows_per_block, max_bytes_per_block), + CompactStrategy::new( + partition.rows_per_block, + max_bytes_per_block, + enable_stream_writer, + ), )?, ))) })?; - self.main_pipeline - .add_transform(|transform_input_port, transform_output_port| { - let proc = TransformSerializeBlock::try_create( + if enable_stream_writer { + self.main_pipeline.add_transform(|input, output| { + TransformBlockWriter::try_create( self.ctx.clone(), - transform_input_port, - transform_output_port, + input, + output, table, - ClusterStatsGenerator::default(), - MutationKind::Recluster, partition.table_meta_timestamps, - )?; - proc.into_processor() + false, + ) }) + } else { + self.main_pipeline + .add_transform(|transform_input_port, transform_output_port| { + let proc = TransformSerializeBlock::try_create( + self.ctx.clone(), + transform_input_port, + transform_output_port, + table, + ClusterStatsGenerator::default(), + MutationKind::Recluster, + partition.table_meta_timestamps, + )?; + proc.into_processor() + }) + } } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs index 75793aa415e08..3515858340e89 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs @@ -27,13 +27,19 @@ pub trait DataProcessorStrategy: Send + Sync + 'static { pub struct CompactStrategy { max_bytes_per_block: usize, max_rows_per_block: usize, + enable_stream_writer: bool, } impl CompactStrategy { - pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { + pub fn new( + max_rows_per_block: usize, + max_bytes_per_block: usize, + enable_stream_writer: bool, + ) -> Self { Self { max_bytes_per_block, max_rows_per_block, + enable_stream_writer, } } @@ -50,6 +56,10 @@ impl DataProcessorStrategy for CompactStrategy { const NAME: &'static str = "Compact"; fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + if self.enable_stream_writer { + return Ok(data_blocks); + } + let blocks_num = data_blocks.len(); if blocks_num < 2 { return Ok(data_blocks); diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs index 93a6ce2aa4b6e..16215dded2b15 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs @@ -25,12 +25,13 @@ use databend_common_pipeline_core::processors::Exchange; use crate::pipelines::processors::transforms::WindowPartitionMeta; pub struct HilbertPartitionExchange { - num_partitions: usize, + start: u64, + width: usize, } impl HilbertPartitionExchange { - pub fn 
create(num_partitions: usize) -> Arc { - Arc::new(HilbertPartitionExchange { num_partitions }) + pub fn create(start: u64, width: usize) -> Arc { + Arc::new(HilbertPartitionExchange { start, width }) } } @@ -48,20 +49,25 @@ impl Exchange for HilbertPartitionExchange { // Scatter the data block to different partitions. let indices = range_ids .iter() - .map(|&id| (id % self.num_partitions as u64) as u16) + .map(|&id| (id - self.start) as u16) .collect::>(); data_block.pop_columns(1); - let scatter_indices = - DataBlock::divide_indices_by_scatter_size(&indices, self.num_partitions); + + let scatter_indices = DataBlock::divide_indices_by_scatter_size(&indices, self.width); // Partition the data blocks to different processors. + let base = self.width / n; + let remainder = self.width % n; let mut output_data_blocks = vec![vec![]; n]; - for (partition_id, indices) in scatter_indices.iter().take(self.num_partitions).enumerate() - { - if indices.is_empty() { - continue; + for (partition_id, indices) in scatter_indices.into_iter().take(self.width).enumerate() { + if !indices.is_empty() { + let target = if partition_id < remainder * (base + 1) { + partition_id / (base + 1) + } else { + (partition_id - remainder) / base + }; + let block = data_block.take_with_optimize_size(&indices)?; + output_data_blocks[target].push((partition_id, block)); } - let block = data_block.take_with_optimize_size(indices)?; - output_data_blocks[partition_id % n].push((partition_id, block)); } // Union data blocks for each processor. diff --git a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index 2e7a6e878b819..dbdda532daca8 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -26,6 +26,7 @@ use databend_common_sql::executor::physical_plans::ExchangeSink; use databend_common_sql::executor::physical_plans::ExchangeSource; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::HashJoin; +use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationSource; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::executor::physical_plans::ReplaceInto; @@ -41,7 +42,6 @@ use crate::servers::flight::v1::exchange::DataExchange; use crate::servers::flight::v1::exchange::MergeExchange; use crate::servers::flight::v1::exchange::ShuffleDataExchange; use crate::sessions::QueryContext; -use crate::sql::executor::physical_plans::Mutation; use crate::sql::executor::PhysicalPlan; /// Visitor to split a `PhysicalPlan` into fragments. 
@@ -67,6 +67,7 @@ enum State { Compact, Recluster, Other, + HilbertRecluster, } impl Fragmenter { @@ -170,14 +171,6 @@ impl PhysicalPlanReplacer for Fragmenter { Ok(PhysicalPlan::MutationSource(plan.clone())) } - fn replace_mutation(&mut self, plan: &Mutation) -> Result { - let input = self.replace(&plan.input)?; - Ok(PhysicalPlan::Mutation(Box::new(Mutation { - input: Box::new(input), - ..plan.clone() - }))) - } - fn replace_replace_into(&mut self, plan: &ReplaceInto) -> Result { let input = self.replace(&plan.input)?; self.state = State::ReplaceInto; @@ -209,6 +202,11 @@ impl PhysicalPlanReplacer for Fragmenter { Ok(PhysicalPlan::Recluster(Box::new(plan.clone()))) } + fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { + self.state = State::HilbertRecluster; + Ok(PhysicalPlan::HilbertPartition(Box::new(plan.clone()))) + } + fn replace_compact_source(&mut self, plan: &CompactSource) -> Result { self.state = State::Compact; Ok(PhysicalPlan::CompactSource(Box::new(plan.clone()))) @@ -310,6 +308,7 @@ impl PhysicalPlanReplacer for Fragmenter { State::ReplaceInto => FragmentType::ReplaceInto, State::Compact => FragmentType::Compact, State::Recluster => FragmentType::Recluster, + State::HilbertRecluster => FragmentType::HilbertRecluster, }; self.state = State::Other; let exchange = Self::get_exchange(self.ctx.clone(), &plan)?; diff --git a/src/query/service/src/schedulers/fragments/plan_fragment.rs b/src/query/service/src/schedulers/fragments/plan_fragment.rs index 18f2b35267eb4..e8306854a981f 100644 --- a/src/query/service/src/schedulers/fragments/plan_fragment.rs +++ b/src/query/service/src/schedulers/fragments/plan_fragment.rs @@ -28,6 +28,7 @@ use databend_common_sql::executor::physical_plans::CompactSource; use databend_common_sql::executor::physical_plans::ConstantTableScan; use databend_common_sql::executor::physical_plans::CopyIntoTable; use databend_common_sql::executor::physical_plans::CopyIntoTableSource; +use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationSource; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::executor::physical_plans::ReplaceDeduplicate; @@ -64,6 +65,7 @@ pub enum FragmentType { Compact, Recluster, MutationSource, + HilbertRecluster, } #[derive(Clone)] @@ -136,6 +138,9 @@ impl PlanFragment { FragmentType::Recluster => { self.redistribute_recluster(ctx, &mut fragment_actions)?; } + FragmentType::HilbertRecluster => { + self.redistribute_hilbert(ctx, &mut fragment_actions)?; + } } if let Some(ref exchange) = self.exchange { @@ -376,6 +381,40 @@ impl PlanFragment { Ok(()) } + fn redistribute_hilbert( + &self, + ctx: Arc, + fragment_actions: &mut QueryFragmentActions, + ) -> Result<()> { + let exchange_sink = match &self.plan { + PhysicalPlan::ExchangeSink(plan) => plan, + _ => unreachable!("logic error"), + }; + let hilbert = match exchange_sink.input.as_ref() { + PhysicalPlan::HilbertPartition(plan) => plan, + _ => unreachable!("logic error"), + }; + + let total_ranges = hilbert.range_width; + let executors = Fragmenter::get_executors(ctx); + let num_executors = executors.len(); + let base_width = total_ranges / num_executors; + let remainder = total_ranges % num_executors; + for (executor_idx, executor) in executors.into_iter().enumerate() { + let width = base_width + if executor_idx < remainder { 1 } else { 0 }; + let min = executor_idx * base_width + std::cmp::min(executor_idx, remainder); + let mut plan = self.plan.clone(); 
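+            // Rewrite this executor's copy of the plan so its HilbertPartition covers only the contiguous range [min, min + width) of Hilbert range ids.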
+ let mut replace_hilbert = ReplaceHilbert { + range_width: width, + range_start: min as u64, + }; + plan = replace_hilbert.replace(&plan)?; + fragment_actions.add_action(QueryFragmentAction::create(executor, plan)); + } + + Ok(()) + } + fn reshuffle( executors: Vec, partitions: Vec, @@ -551,8 +590,23 @@ impl PhysicalPlanReplacer for ReplaceReadSource { } } +struct ReplaceHilbert { + range_width: usize, + range_start: u64, +} + +impl PhysicalPlanReplacer for ReplaceHilbert { + fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { + Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { + range_width: self.range_width, + range_start: self.range_start, + ..plan.clone() + }))) + } +} + struct ReplaceRecluster { - pub tasks: Vec, + tasks: Vec, } impl PhysicalPlanReplacer for ReplaceRecluster { @@ -565,7 +619,7 @@ impl PhysicalPlanReplacer for ReplaceRecluster { } struct ReplaceMutationSource { - pub partitions: Partitions, + partitions: Partitions, } impl PhysicalPlanReplacer for ReplaceMutationSource { @@ -578,7 +632,7 @@ impl PhysicalPlanReplacer for ReplaceMutationSource { } struct ReplaceCompactBlock { - pub partitions: Partitions, + partitions: Partitions, } impl PhysicalPlanReplacer for ReplaceCompactBlock { @@ -591,10 +645,10 @@ impl PhysicalPlanReplacer for ReplaceCompactBlock { } struct ReplaceReplaceInto { - pub partitions: Vec<(usize, Location)>, + partitions: Vec<(usize, Location)>, // for standalone mode, slot is None - pub slot: Option, - pub need_insert: bool, + slot: Option, + need_insert: bool, } impl PhysicalPlanReplacer for ReplaceReplaceInto { diff --git a/src/query/sql/src/executor/physical_plans/physical_recluster.rs b/src/query/sql/src/executor/physical_plans/physical_recluster.rs index 9227c86b64199..43236e53766a5 100644 --- a/src/query/sql/src/executor/physical_plans/physical_recluster.rs +++ b/src/query/sql/src/executor/physical_plans/physical_recluster.rs @@ -31,7 +31,8 @@ pub struct HilbertPartition { pub plan_id: u32, pub input: Box, pub table_info: TableInfo, - pub num_partitions: usize, pub table_meta_timestamps: TableMetaTimestamps, pub rows_per_block: usize, + pub range_start: u64, + pub range_width: usize, } diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 9316374128528..e6460a7c247a0 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -40,7 +40,6 @@ use databend_storages_common_table_meta::table::ClusterType; use crate::operations::TransformBlockWriter; use crate::operations::TransformSerializeBlock; use crate::statistics::ClusterStatsGenerator; -use crate::FuseStorageFormat; use crate::FuseTable; impl FuseTable { @@ -50,8 +49,8 @@ impl FuseTable { pipeline: &mut Pipeline, table_meta_timestamps: TableMetaTimestamps, ) -> Result<()> { - let enable_stream_block_write = ctx.get_settings().get_enable_block_stream_write()? - && matches!(self.storage_format, FuseStorageFormat::Parquet); + let enable_stream_block_write = + ctx.get_settings().get_enable_block_stream_write()? 
&& self.storage_format_as_parquet(); if enable_stream_block_write { pipeline.add_transform(|input, output| { TransformBlockWriter::try_create( From 8c77f3592f56704021cbbc2258021c60a8d30e0b Mon Sep 17 00:00:00 2001 From: zhyass Date: Sun, 4 May 2025 02:19:01 +0800 Subject: [PATCH 02/36] fix exchange --- .../interpreter_table_recluster.rs | 2 +- .../builders/builder_hilbert_partition.rs | 1 + .../aggregator/aggregate_exchange_injector.rs | 1 + .../src/schedulers/fragments/fragmenter.rs | 17 +++- .../src/schedulers/fragments/plan_fragment.rs | 2 + .../query_fragment_actions_display.rs | 1 + .../flight/v1/exchange/data_exchange.rs | 17 ++++ .../flight/v1/exchange/exchange_injector.rs | 6 ++ .../flight/v1/exchange/exchange_manager.rs | 15 ++- .../src/servers/flight/v1/exchange/mod.rs | 1 + .../flight/v1/scatter/flight_scatter_mod.rs | 92 +++++++++++++++++++ .../src/servers/flight/v1/scatter/mod.rs | 2 + src/query/sql/src/executor/format.rs | 8 ++ .../sql/src/executor/physical_plans/common.rs | 2 + .../physical_plans/physical_exchange.rs | 8 ++ .../planner/format/display_rel_operator.rs | 1 + .../sql/src/planner/optimizer/ir/format.rs | 1 + .../planner/optimizer/ir/property/enforcer.rs | 1 + .../planner/optimizer/ir/property/property.rs | 8 +- .../optimizers/cascades/cost/model.rs | 2 +- src/query/sql/src/planner/plans/exchange.rs | 4 +- .../storages/fuse/src/operations/append.rs | 1 + .../processors/transform_block_writer.rs | 22 ++++- 23 files changed, 208 insertions(+), 7 deletions(-) create mode 100644 src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 558be0d8bdbba..2f8e77f29738f 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -467,7 +467,7 @@ impl ReclusterTableInterpreter { plan = Box::new(PhysicalPlan::Exchange(Exchange { plan_id: 0, input: plan, - kind: FragmentKind::Normal, + kind: FragmentKind::Modulo, keys: vec![expr], allow_adjust_parallelism: true, ignore_exchange: false, diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 2d38a1e3b8281..6104fefbbb93e 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -99,6 +99,7 @@ impl PipelineBuilder { self.ctx.clone(), input, output, + MutationKind::Recluster, table, partition.table_meta_timestamps, false, diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs index 55688a4347259..40904ea2c8e16 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs @@ -233,6 +233,7 @@ impl ExchangeInjector for AggregateInjector { match exchange { DataExchange::Merge(_) => unreachable!(), DataExchange::Broadcast(_) => unreachable!(), + DataExchange::Modulo(_) => unreachable!(), DataExchange::ShuffleDataExchange(exchange) => { Ok(Arc::new(Box::new(HashTableHashScatter { buckets: exchange.destination_ids.len(), diff --git 
a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index dbdda532daca8..dc267c896dc17 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -15,6 +15,7 @@ use std::sync::Arc; use databend_common_catalog::table_context::TableContext; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_types::NodeInfo; use databend_common_sql::executor::physical_plans::CompactSource; @@ -40,6 +41,7 @@ use crate::schedulers::PlanFragment; use crate::servers::flight::v1::exchange::BroadcastExchange; use crate::servers::flight::v1::exchange::DataExchange; use crate::servers::flight::v1::exchange::MergeExchange; +use crate::servers::flight::v1::exchange::ModuloExchange; use crate::servers::flight::v1::exchange::ShuffleDataExchange; use crate::sessions::QueryContext; use crate::sql::executor::PhysicalPlan; @@ -116,6 +118,15 @@ impl Fragmenter { FragmentKind::Expansive => { Ok(Some(BroadcastExchange::create(Self::get_executors(ctx)))) } + FragmentKind::Modulo => { + if plan.keys.len() != 1 { + return Err(ErrorCode::Internal("Modulo exchange require one key")); + } + Ok(Some(ModuloExchange::create( + Self::get_executors(ctx), + plan.keys[0].clone(), + ))) + } _ => Ok(None), }, _ => Ok(None), @@ -203,8 +214,12 @@ impl PhysicalPlanReplacer for Fragmenter { } fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { + let input = self.replace(&plan.input)?; self.state = State::HilbertRecluster; - Ok(PhysicalPlan::HilbertPartition(Box::new(plan.clone()))) + Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { + input: Box::new(input), + ..plan.clone() + }))) } fn replace_compact_source(&mut self, plan: &CompactSource) -> Result { diff --git a/src/query/service/src/schedulers/fragments/plan_fragment.rs b/src/query/service/src/schedulers/fragments/plan_fragment.rs index e8306854a981f..fab77a79d29f5 100644 --- a/src/query/service/src/schedulers/fragments/plan_fragment.rs +++ b/src/query/service/src/schedulers/fragments/plan_fragment.rs @@ -597,7 +597,9 @@ struct ReplaceHilbert { impl PhysicalPlanReplacer for ReplaceHilbert { fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { + let input = self.replace(&plan.input)?; Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { + input: Box::new(input), range_width: self.range_width, range_start: self.range_start, ..plan.clone() diff --git a/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs b/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs index adb0b6c3bcd18..36d8f0c257eb1 100644 --- a/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs +++ b/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs @@ -72,6 +72,7 @@ impl Display for QueryFragmentActionsWrap<'_> { DataExchange::Merge(_) => writeln!(f, " DataExchange: Merge")?, DataExchange::Broadcast(_) => writeln!(f, " DataExchange: Broadcast")?, DataExchange::ShuffleDataExchange(_) => writeln!(f, " DataExchange: Shuffle")?, + DataExchange::Modulo(_) => writeln!(f, " DataExchange: Modulo")?, } } diff --git a/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs b/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs index f23c7582559a7..0fba30c72ec7b 100644 --- a/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs +++ 
b/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs @@ -19,6 +19,7 @@ pub enum DataExchange { Merge(MergeExchange), Broadcast(BroadcastExchange), ShuffleDataExchange(ShuffleDataExchange), + Modulo(ModuloExchange), } impl DataExchange { @@ -27,6 +28,7 @@ impl DataExchange { DataExchange::Merge(exchange) => vec![exchange.destination_id.clone()], DataExchange::Broadcast(exchange) => exchange.destination_ids.clone(), DataExchange::ShuffleDataExchange(exchange) => exchange.destination_ids.clone(), + DataExchange::Modulo(exchange) => exchange.destination_ids.clone(), } } } @@ -77,3 +79,18 @@ impl BroadcastExchange { DataExchange::Broadcast(BroadcastExchange { destination_ids }) } } + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct ModuloExchange { + pub destination_ids: Vec, + pub shuffle_key: RemoteExpr, +} + +impl ModuloExchange { + pub fn create(destination_ids: Vec, shuffle_key: RemoteExpr) -> DataExchange { + DataExchange::Modulo(ModuloExchange { + destination_ids, + shuffle_key, + }) + } +} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs index 4aa65ba175a83..5b10b4f346960 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs @@ -29,6 +29,7 @@ use crate::servers::flight::v1::exchange::ShuffleExchangeParams; use crate::servers::flight::v1::scatter::BroadcastFlightScatter; use crate::servers::flight::v1::scatter::FlightScatter; use crate::servers::flight::v1::scatter::HashFlightScatter; +use crate::servers::flight::v1::scatter::ModFlightScatter; use crate::sessions::QueryContext; pub trait ExchangeInjector: Send + Sync + 'static { @@ -100,6 +101,11 @@ impl ExchangeInjector for DefaultExchangeInjector { local_pos, )? 
} + DataExchange::Modulo(exchange) => ModFlightScatter::try_create( + ctx.get_function_context()?, + &exchange.shuffle_key, + exchange.destination_ids.len(), + )?, })) } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs index 13a6a57742127..13a65e33ebf08 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs @@ -303,7 +303,7 @@ impl DataExchangeManager { None, Some(config.query.to_rpc_client_tls_config()), ) - .await?, + .await?, ))), false => Ok(FlightClient::new(FlightServiceClient::new( ConnectionFactory::create_rpc_channel(address.to_owned(), None, None).await?, @@ -1011,6 +1011,19 @@ impl FragmentCoordinator { .flight_scatter(&info.query_ctx, data_exchange)?, }), )), + DataExchange::Modulo(exchange) => { + Ok(Some(ExchangeParams::ShuffleExchange(ShuffleExchangeParams { + exchange_injector: exchange_injector.clone(), + schema: self.physical_plan.output_schema()?, + fragment_id: self.fragment_id, + query_id: info.query_id.to_string(), + executor_id: info.current_executor.to_string(), + destination_ids: exchange.destination_ids.to_owned(), + shuffle_scatter: exchange_injector + .flight_scatter(&info.query_ctx, data_exchange)?, + }) + )) + } } } diff --git a/src/query/service/src/servers/flight/v1/exchange/mod.rs b/src/query/service/src/servers/flight/v1/exchange/mod.rs index 194f2cbe1e3e5..ada27909df959 100644 --- a/src/query/service/src/servers/flight/v1/exchange/mod.rs +++ b/src/query/service/src/servers/flight/v1/exchange/mod.rs @@ -32,6 +32,7 @@ pub mod serde; pub use data_exchange::BroadcastExchange; pub use data_exchange::DataExchange; pub use data_exchange::MergeExchange; +pub use data_exchange::ModuloExchange; pub use data_exchange::ShuffleDataExchange; pub use exchange_injector::DefaultExchangeInjector; pub use exchange_injector::ExchangeInjector; diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs new file mode 100644 index 0000000000000..f83fea3f574c2 --- /dev/null +++ b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs @@ -0,0 +1,92 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
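+//! Scatters rows across executors by evaluating `shuffle_key % destination_count` per row; used by the Modulo data exchange introduced for Hilbert recluster.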
+ +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::type_check::check_function; +use databend_common_expression::types::DataType; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberScalar; +use databend_common_expression::DataBlock; +use databend_common_expression::Evaluator; +use databend_common_expression::Expr; +use databend_common_expression::FunctionContext; +use databend_common_expression::RemoteExpr; +use databend_common_expression::Scalar; +use databend_common_functions::BUILTIN_FUNCTIONS; + +use crate::servers::flight::v1::scatter::FlightScatter; + +#[derive(Clone)] +pub struct ModFlightScatter { + scatter_size: usize, + func_ctx: FunctionContext, + expr: Expr, +} + +impl ModFlightScatter { + pub fn try_create( + func_ctx: FunctionContext, + expr: &RemoteExpr, + scatter_size: usize, + ) -> Result> { + let expr = check_function( + None, + "modulo", + &[], + &[ + expr.as_expr(&BUILTIN_FUNCTIONS), + Expr::constant( + Scalar::Number(NumberScalar::UInt64(scatter_size as u64)), + Some(DataType::Number(NumberDataType::UInt64)), + ), + ], + &BUILTIN_FUNCTIONS, + )?; + let return_type = expr.data_type(); + if !matches!(return_type, DataType::Number(NumberDataType::UInt64)) { + return Err(ErrorCode::Internal(format!( + "ModFlightScatter expects modulo expression to return UInt64, but got {:?}", + return_type + ))); + } + + Ok(Box::new(ModFlightScatter { + scatter_size, + func_ctx, + expr, + })) + } +} + +impl FlightScatter for ModFlightScatter { + fn execute(&self, data_block: DataBlock) -> Result> { + let evaluator = Evaluator::new(&data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); + let num = data_block.num_rows(); + + let column = evaluator + .run(&self.expr)? 
+ .into_full_column(&DataType::Number(NumberDataType::UInt64), num); + let indices = column.as_number().unwrap().as_u_int64().unwrap(); + let data_blocks = DataBlock::scatter(&data_block, indices, self.scatter_size)?; + + let block_meta = data_block.get_meta(); + let mut res = Vec::with_capacity(data_blocks.len()); + for data_block in data_blocks { + res.push(data_block.add_meta(block_meta.cloned())?); + } + + Ok(res) + } +} diff --git a/src/query/service/src/servers/flight/v1/scatter/mod.rs b/src/query/service/src/servers/flight/v1/scatter/mod.rs index b5f5f900dab71..2904ed87684ca 100644 --- a/src/query/service/src/servers/flight/v1/scatter/mod.rs +++ b/src/query/service/src/servers/flight/v1/scatter/mod.rs @@ -15,7 +15,9 @@ mod flight_scatter; mod flight_scatter_broadcast; mod flight_scatter_hash; +mod flight_scatter_mod; pub use flight_scatter::FlightScatter; pub use flight_scatter_broadcast::BroadcastFlightScatter; pub use flight_scatter_hash::HashFlightScatter; +pub use flight_scatter_mod::ModFlightScatter; diff --git a/src/query/sql/src/executor/format.rs b/src/query/sql/src/executor/format.rs index 1e3f8879339f3..29f5bc2529dad 100644 --- a/src/query/sql/src/executor/format.rs +++ b/src/query/sql/src/executor/format.rs @@ -1639,6 +1639,14 @@ fn exchange_to_format_tree( ), FragmentKind::Expansive => "Broadcast".to_string(), FragmentKind::Merge => "Merge".to_string(), + FragmentKind::Modulo => format!( + "Modulo({})", + plan.keys + .iter() + .map(|key| { key.as_expr(&BUILTIN_FUNCTIONS).sql_display() }) + .collect::>() + .join(", ") + ), })), to_format_tree(&plan.input, metadata, profs, context)?, ])) diff --git a/src/query/sql/src/executor/physical_plans/common.rs b/src/query/sql/src/executor/physical_plans/common.rs index 545179b4af4d6..10859f8391da1 100644 --- a/src/query/sql/src/executor/physical_plans/common.rs +++ b/src/query/sql/src/executor/physical_plans/common.rs @@ -67,6 +67,8 @@ pub enum FragmentKind { // Broadcast Expansive, Merge, + // Partitioned by a specified expression % node_nums + Modulo, } #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Copy)] diff --git a/src/query/sql/src/executor/physical_plans/physical_exchange.rs b/src/query/sql/src/executor/physical_plans/physical_exchange.rs index 1e831519c415b..b4507942dc8ba 100644 --- a/src/query/sql/src/executor/physical_plans/physical_exchange.rs +++ b/src/query/sql/src/executor/physical_plans/physical_exchange.rs @@ -81,6 +81,14 @@ impl PhysicalPlanBuilder { allow_adjust_parallelism = false; FragmentKind::Merge } + crate::plans::Exchange::Modulo(scalar) => { + let expr = scalar + .type_check(input_schema.as_ref())? 
+ .project_column_ref(|index| input_schema.index_of(&index.to_string()).unwrap()); + let (expr, _) = ConstantFolder::fold(&expr, &self.func_ctx, &BUILTIN_FUNCTIONS); + keys.push(expr.as_remote_expr()); + FragmentKind::Modulo + } }; Ok(PhysicalPlan::Exchange(Exchange { plan_id: 0, diff --git a/src/query/sql/src/planner/format/display_rel_operator.rs b/src/query/sql/src/planner/format/display_rel_operator.rs index 9835bbedf0cdd..a82e84ddc59ac 100644 --- a/src/query/sql/src/planner/format/display_rel_operator.rs +++ b/src/query/sql/src/planner/format/display_rel_operator.rs @@ -397,6 +397,7 @@ fn exchange_to_format_tree(id_humanizer: &I, op: &Exchange) -> F Exchange::Broadcast => "Exchange(Broadcast)", Exchange::Merge => "Exchange(Merge)", Exchange::MergeSort => "Exchange(MergeSort)", + Exchange::Modulo(_) => "Exchange(Modulo)", }; match op { diff --git a/src/query/sql/src/planner/optimizer/ir/format.rs b/src/query/sql/src/planner/optimizer/ir/format.rs index f9613af6b35ef..017c9bec97203 100644 --- a/src/query/sql/src/planner/optimizer/ir/format.rs +++ b/src/query/sql/src/planner/optimizer/ir/format.rs @@ -66,6 +66,7 @@ fn display_rel_op(rel_op: &RelOperator) -> String { Exchange::Broadcast => "Broadcast".to_string(), Exchange::Merge => "Merge".to_string(), Exchange::MergeSort => "MergeSort".to_string(), + Exchange::Modulo(scalar) => format!("Modulo({})", scalar.as_raw_expr()), }) } RelOperator::DummyTableScan(_) => "DummyTableScan".to_string(), diff --git a/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs b/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs index 1229898f5bef4..69abb144e11a4 100644 --- a/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs +++ b/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs @@ -73,6 +73,7 @@ impl Enforcer for DistributionEnforcer { Distribution::Random | Distribution::Any => Err(ErrorCode::Internal( "Cannot enforce random or any distribution", )), + Distribution::Modulo(key) => Ok(Exchange::Modulo(key.clone()).into()), } } } diff --git a/src/query/sql/src/planner/optimizer/ir/property/property.rs b/src/query/sql/src/planner/optimizer/ir/property/property.rs index 9ae23730ca7e7..3eff4f594a2d2 100644 --- a/src/query/sql/src/planner/optimizer/ir/property/property.rs +++ b/src/query/sql/src/planner/optimizer/ir/property/property.rs @@ -92,6 +92,7 @@ pub enum Distribution { Serial, Broadcast, Hash(Vec), + Modulo(Box), } impl Default for Distribution { @@ -110,11 +111,15 @@ impl Distribution { | (Distribution::Random, _) | (Distribution::Serial, Distribution::Serial) | (Distribution::Broadcast, Distribution::Broadcast) - | (Distribution::Hash(_), Distribution::Broadcast) => true, + | (Distribution::Hash(_), Distribution::Broadcast) + | (Distribution::Modulo(_), Distribution::Broadcast) => true, (Distribution::Hash(ref keys), Distribution::Hash(ref other_keys)) => { keys == other_keys } + (Distribution::Modulo(ref key), Distribution::Modulo(ref other_key)) => { + key == other_key + } _ => false, } } @@ -135,6 +140,7 @@ impl Display for Distribution { .collect::>() .join(", ") ), + Distribution::Modulo(ref key) => write!(f, "Modulo({})", key.as_raw_expr()), } } } diff --git a/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs b/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs index 6d39e793a7231..5bd737365a76b 100644 --- a/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs +++ b/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs @@ -158,7 +158,7 @@ 
impl DefaultCostModel { let exchange: Exchange = (*m_expr.plan.clone()).clone().try_into()?; let group = memo.group(m_expr.group_index)?; let cost = match exchange { - Exchange::Hash(_) => { + Exchange::Hash(_) | Exchange::Modulo(_) => { group.stat_info.cardinality * self.network_per_row + group.stat_info.cardinality * self.compute_per_row } diff --git a/src/query/sql/src/planner/plans/exchange.rs b/src/query/sql/src/planner/plans/exchange.rs index a7aca885b2ed1..db8dffd95d8cf 100644 --- a/src/query/sql/src/planner/plans/exchange.rs +++ b/src/query/sql/src/planner/plans/exchange.rs @@ -30,7 +30,8 @@ pub enum Exchange { Hash(Vec), Broadcast, Merge, - MergeSort, // For distributed sort + MergeSort, // For distributed sort + Modulo(Box), // For recluster } impl Operator for Exchange { @@ -49,6 +50,7 @@ impl Operator for Exchange { Exchange::Broadcast => Distribution::Broadcast, Exchange::Merge => Distribution::Serial, Exchange::MergeSort => Distribution::Serial, + Exchange::Modulo(key) => Distribution::Modulo(key.clone()), }, }) } diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index e6460a7c247a0..84bdd8ca77609 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -57,6 +57,7 @@ impl FuseTable { ctx.clone(), input, output, + MutationKind::Insert, self, table_meta_timestamps, false, diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index 7a3615233b8ce..b3d1fd7bb416e 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -24,11 +24,13 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_io::constants::DEFAULT_BLOCK_ROW_COUNT; +use databend_common_metrics::storage::metrics_inc_recluster_write_block_nums; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storage::MutationStatus; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use opendal::Operator; @@ -37,6 +39,8 @@ use crate::io::BlockSerialization; use crate::io::BlockWriter; use crate::io::StreamBlockBuilder; use crate::io::StreamBlockProperties; +use crate::operations::MutationLogEntry; +use crate::operations::MutationLogs; use crate::FuseTable; use crate::FUSE_OPT_KEY_ROW_PER_BLOCK; @@ -54,6 +58,7 @@ pub struct TransformBlockWriter { state: State, input: Arc, output: Arc, + kind: MutationKind, properties: Arc, @@ -76,6 +81,7 @@ impl TransformBlockWriter { ctx: Arc, input: Arc, output: Arc, + kind: MutationKind, table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, with_tid: bool, @@ -89,6 +95,7 @@ impl TransformBlockWriter { state: State::Consume, input, output, + kind, properties, builder: None, dal: table.get_operator(), @@ -273,7 +280,20 @@ impl Processor for TransformBlockWriter { }); } - self.output_data = Some(DataBlock::empty_with_meta(Box::new(extended_block_meta))); + let output = if matches!(self.kind, 
MutationKind::Insert) { + DataBlock::empty_with_meta(Box::new(extended_block_meta)) + } else { + if matches!(self.kind, MutationKind::Recluster) { + metrics_inc_recluster_write_block_nums(); + } + + DataBlock::empty_with_meta(Box::new(MutationLogs { + entries: vec![MutationLogEntry::AppendBlock { + block_meta: Arc::new(extended_block_meta), + }], + })) + }; + self.output_data = Some(output); } _ => return Err(ErrorCode::Internal("It's a bug.")), } From 67bd532d06fca17f6b5095e4fd908617d3100dfb Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 02:16:56 +0800 Subject: [PATCH 03/36] add transform hilbert collect --- .../expression/src/utils/block_thresholds.rs | 49 ++++ .../interpreter_table_recluster.rs | 14 +- .../builders/builder_hilbert_partition.rs | 65 +++-- .../partition/data_processor_strategy.rs | 12 +- .../transforms/window/partition/mod.rs | 2 + .../partition/transform_hilbert_collect.rs | 253 ++++++++++++++++++ .../partition/window_partition_buffer.rs | 121 +++++---- src/query/service/src/spillers/spiller.rs | 18 ++ .../physical_plans/physical_recluster.rs | 1 + .../fuse/src/io/write/stream/block_builder.rs | 10 +- .../storages/fuse/src/operations/append.rs | 1 + .../processors/transform_block_writer.rs | 26 +- 12 files changed, 458 insertions(+), 114 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index 4fd35638cb863..fe72302382d8a 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -166,4 +166,53 @@ impl BlockThresholds { }; total_rows.div_ceil(block_nums.max(1)).max(1) } + + /// Calculates the optimal number of partitions (blocks) based on total data size and row count. + /// + /// # Parameters + /// - `total_rows`: The total number of rows in the data. + /// - `total_bytes`: The total uncompressed size of the data in bytes. + /// - `total_compressed`: The total compressed size of the data in bytes. + /// + /// # Returns + /// - The calculated number of partitions (blocks) needed. + #[inline] + pub fn calc_partitions_for_recluster( + &self, + total_rows: usize, + total_bytes: usize, + total_compressed: usize, + ) -> usize { + // If the data is already compact enough, return a single partition. + if self.check_for_compact(total_rows, total_bytes) + && total_compressed < 2 * self.min_compressed_per_block + { + return 1; + } + + // Estimate the number of blocks based on row count and compressed size. + let by_rows = std::cmp::max(total_rows / self.max_rows_per_block, 1); + let by_compressed = total_compressed / self.max_compressed_per_block; + // If row-based block count is greater, use max rows per block as limit. + if by_rows >= by_compressed { + return by_rows; + } + + // Adjust block count based on byte size thresholds. + let bytes_per_block = total_bytes.div_ceil(by_compressed); + let max_bytes = self.max_bytes_per_block.min(400 * 1024 * 1024); + let min_bytes = max_bytes / 2; + let total_partitions = if bytes_per_block > max_bytes { + // Block size is too large. + total_bytes / max_bytes + } else if bytes_per_block < min_bytes { + // Block size is too small. + total_bytes / min_bytes + } else { + // Block size is acceptable. 
+ by_compressed + }; + + std::cmp::max(total_partitions, 1) + } } diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 2f8e77f29738f..411452a48d5a0 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -323,12 +323,15 @@ impl ReclusterTableInterpreter { let total_rows = recluster_info.removed_statistics.row_count as usize; let total_compressed = recluster_info.removed_statistics.compressed_byte_size as usize; - // Determine rows per block based on data size and compression ratio - let rows_per_block = - block_thresholds.calc_rows_for_recluster(total_rows, total_bytes, total_compressed); - + // Determine rows per block based on data size and compression ratio, // Calculate initial partition count based on data volume and block size - let total_partitions = std::cmp::max(total_rows / rows_per_block, 1); + let total_partitions = block_thresholds.calc_partitions_for_recluster( + total_rows, + total_bytes, + total_compressed, + ); + let bytes_per_block = (total_bytes / total_partitions).max(1); + let rows_per_block = (total_rows / total_partitions).max(1); warn!( "Do hilbert recluster, total_bytes: {}, total_rows: {}, total_partitions: {}", @@ -487,6 +490,7 @@ impl ReclusterTableInterpreter { range_start: 0, range_width: total_partitions, table_meta_timestamps, + bytes_per_block, rows_per_block, })); diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 6104fefbbb93e..fd351cd1e5fb7 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -18,7 +18,6 @@ use std::sync::atomic::AtomicUsize; use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; -use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::executor::physical_plans::HilbertPartition; @@ -27,12 +26,12 @@ use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::statistics::ClusterStatsGenerator; use databend_common_storages_fuse::FuseTable; -use databend_common_storages_fuse::FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD; use databend_storages_common_cache::TempDirManager; use crate::pipelines::memory_settings::MemorySettingsExt; use crate::pipelines::processors::transforms::CompactStrategy; use crate::pipelines::processors::transforms::HilbertPartitionExchange; +use crate::pipelines::processors::transforms::TransformHilbertCollect; use crate::pipelines::processors::transforms::TransformWindowPartitionCollect; use crate::pipelines::PipelineBuilder; use crate::spillers::SpillerDiskConfig; @@ -65,35 +64,25 @@ impl PipelineBuilder { let window_spill_settings = MemorySettings::from_window_settings(&self.ctx)?; let processor_id = AtomicUsize::new(0); - let max_bytes_per_block = std::cmp::min( - 4 * table.get_option( - FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD, - DEFAULT_BLOCK_BUFFER_SIZE, - ), - 400 * 1024 * 1024, - ); - self.main_pipeline.add_transform(|input, output| { - 
Ok(ProcessorPtr::create(Box::new( - TransformWindowPartitionCollect::new( - self.ctx.clone(), - input, - output, - &settings, - processor_id.fetch_add(1, atomic::Ordering::AcqRel), - num_processors, - partition.range_width, - window_spill_settings.clone(), - disk_spill.clone(), - CompactStrategy::new( - partition.rows_per_block, - max_bytes_per_block, - enable_stream_writer, - ), - )?, - ))) - })?; if enable_stream_writer { + self.main_pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformHilbertCollect::new( + self.ctx.clone(), + input, + output, + &settings, + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partition.range_width, + window_spill_settings.clone(), + disk_spill.clone(), + partition.bytes_per_block, + )?, + ))) + })?; + self.main_pipeline.add_transform(|input, output| { TransformBlockWriter::try_create( self.ctx.clone(), @@ -103,9 +92,27 @@ impl PipelineBuilder { table, partition.table_meta_timestamps, false, + Some(partition.bytes_per_block), ) }) } else { + self.main_pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformWindowPartitionCollect::new( + self.ctx.clone(), + input, + output, + &settings, + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partition.range_width, + window_spill_settings.clone(), + disk_spill.clone(), + CompactStrategy::new(partition.rows_per_block, partition.bytes_per_block), + )?, + ))) + })?; + self.main_pipeline .add_transform(|transform_input_port, transform_output_port| { let proc = TransformSerializeBlock::try_create( diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs index 3515858340e89..75793aa415e08 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs @@ -27,19 +27,13 @@ pub trait DataProcessorStrategy: Send + Sync + 'static { pub struct CompactStrategy { max_bytes_per_block: usize, max_rows_per_block: usize, - enable_stream_writer: bool, } impl CompactStrategy { - pub fn new( - max_rows_per_block: usize, - max_bytes_per_block: usize, - enable_stream_writer: bool, - ) -> Self { + pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { Self { max_bytes_per_block, max_rows_per_block, - enable_stream_writer, } } @@ -56,10 +50,6 @@ impl DataProcessorStrategy for CompactStrategy { const NAME: &'static str = "Compact"; fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - if self.enable_stream_writer { - return Ok(data_blocks); - } - let blocks_num = data_blocks.len(); if blocks_num < 2 { return Ok(data_blocks); diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index 5aa4562c98865..96edfcc986434 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ -14,6 +14,7 @@ mod data_processor_strategy; mod hilbert_partition_exchange; +mod transform_hilbert_collect; mod transform_window_partition_collect; mod window_partition_buffer; mod window_partition_exchange; @@ -22,6 +23,7 @@ mod window_partition_partial_top_n_exchange; pub use 
data_processor_strategy::*; pub use hilbert_partition_exchange::*; +pub use transform_hilbert_collect::*; pub use transform_window_partition_collect::*; pub use window_partition_buffer::*; pub use window_partition_exchange::*; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs new file mode 100644 index 0000000000000..17f6fc17eeb65 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs @@ -0,0 +1,253 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_transforms::MemorySettings; +use databend_common_settings::Settings; +use databend_common_storage::DataOperator; + +use super::WindowPartitionBuffer; +use super::WindowPartitionMeta; +use crate::sessions::QueryContext; +use crate::spillers::Spiller; +use crate::spillers::SpillerConfig; +use crate::spillers::SpillerDiskConfig; +use crate::spillers::SpillerType; + +enum State { + Collect, + Flush, + Spill, + Restore, + Concat(Vec), +} + +pub struct TransformHilbertCollect { + input: Arc, + output: Arc, + + immediate_output_blocks: Vec<(usize, DataBlock)>, + output_data_blocks: VecDeque, + + // The partition id is used to map the partition id to the new partition id. + partition_id: Vec, + partition_sizes: Vec, + // The buffer is used to control the memory usage of the window operator. + buffer: WindowPartitionBuffer, + + max_block_size: usize, + // Event variables. + state: State, +} + +impl TransformHilbertCollect { + #[allow(clippy::too_many_arguments)] + pub fn new( + ctx: Arc, + input: Arc, + output: Arc, + settings: &Settings, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + memory_settings: MemorySettings, + disk_spill: Option, + max_block_size: usize, + ) -> Result { + // Calculate the partition ids collected by the processor. + let partitions: Vec = (0..num_partitions) + .filter(|&partition| partition % num_processors == processor_id) + .collect(); + + // Map each partition id to new partition id. 
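+        // e.g. with num_partitions = 8 and num_processors = 4, processor 1 owns global partitions {1, 5}, remapped to local ids {0, 1}.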
+ let mut partition_id = vec![0; num_partitions]; + for (new_partition_id, partition) in partitions.iter().enumerate() { + partition_id[*partition] = new_partition_id; + } + + let location_prefix = ctx.query_id_spill_prefix(); + let spill_config = SpillerConfig { + spiller_type: SpillerType::Window, + location_prefix, + disk_spill, + use_parquet: settings.get_spilling_file_format()?.is_parquet(), + }; + + // Create an inner `Spiller` to spill data. + let operator = DataOperator::instance().spill_operator(); + let spiller = Spiller::create(ctx, operator, spill_config)?; + + // Create the window partition buffer. + let sort_block_size = settings.get_window_partition_sort_block_size()? as usize; + let buffer = WindowPartitionBuffer::new( + spiller, + partitions.len(), + sort_block_size, + memory_settings, + )?; + + Ok(Self { + input, + output, + partition_id, + buffer, + immediate_output_blocks: vec![], + partition_sizes: vec![0; num_partitions], + max_block_size, + output_data_blocks: VecDeque::new(), + state: State::Collect, + }) + } +} + +#[async_trait::async_trait] +impl Processor for TransformHilbertCollect { + fn name(&self) -> String { + "TransformHilbertCollect".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if matches!(self.state, State::Concat(_)) { + return Ok(Event::Sync); + } + + if matches!(self.state, State::Flush | State::Spill | State::Restore) { + return Ok(Event::Async); + } + + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data_blocks.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.need_spill() { + self.state = State::Spill; + return Ok(Event::Async); + } + + if !self.immediate_output_blocks.is_empty() { + self.state = State::Flush; + return Ok(Event::Async); + } + + if self.input.is_finished() { + if !self.buffer.is_empty() { + self.state = State::Restore; + return Ok(Event::Async); + } + + self.output.finish(); + return Ok(Event::Finished); + } + + if self.input.has_data() { + self.collect_data_block()?; + + if self.need_spill() { + self.state = State::Spill; + return Ok(Event::Async); + } + + if !self.immediate_output_blocks.is_empty() { + self.state = State::Flush; + return Ok(Event::Async); + } + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.state, State::Collect) { + State::Concat(blocks) => { + let output = DataBlock::concat(&blocks)?; + self.output_data_blocks.push_back(output); + } + _ => unreachable!(), + } + Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.state, State::Collect) { + State::Spill => { + self.buffer.spill().await?; + } + State::Flush => { + if let Some((partition_id, data_block)) = self.immediate_output_blocks.pop() { + let mut restored_data_blocks = self.buffer.restore_by_id(partition_id).await?; + restored_data_blocks.push(data_block); + self.state = State::Concat(restored_data_blocks); + } + } + State::Restore => { + let restored_data_blocks = self.buffer.restore().await?; + self.output_data_blocks.extend(restored_data_blocks); + } + _ => unreachable!(), + } + Ok(()) + } +} + +impl TransformHilbertCollect { + fn collect_data_block(&mut self) -> Result<()> { + let data_block = self.input.pull_data().unwrap()?; 
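+        // Track the estimated size accumulated per partition: once a partition reaches max_block_size its block is routed to immediate_output_blocks for an immediate flush; otherwise it is buffered (and may be spilled under memory pressure).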
+ if let Some(meta) = data_block + .get_owned_meta() + .and_then(WindowPartitionMeta::downcast_from) + { + for (partition_id, data_block) in meta.partitioned_data.into_iter() { + let new_id = self.partition_id[partition_id]; + self.partition_sizes[new_id] += data_block.estimate_block_size(); + if self.partition_sizes[new_id] >= self.max_block_size { + self.immediate_output_blocks.push((new_id, data_block)); + self.partition_sizes[new_id] = 0; + continue; + } + self.buffer.add_data_block(new_id, data_block); + } + } + Ok(()) + } + + fn need_spill(&mut self) -> bool { + self.buffer.need_spill() + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index bf01acedc586c..0a14b73bc26b5 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -145,75 +145,80 @@ impl WindowPartitionBuffer { while self.next_to_restore_partition_id + 1 < self.num_partitions as isize { self.next_to_restore_partition_id += 1; let partition_id = self.next_to_restore_partition_id as usize; - // Restore large partitions from spilled files. - let mut result = self.spiller.read_spilled_partition(&partition_id).await?; - - // Restore small merged partitions from spilled files. - let spilled_small_partitions = - std::mem::take(&mut self.spilled_small_partitions[partition_id]); - for index in spilled_small_partitions { - let out_of_memory_limit = self.out_of_memory_limit(); - let (merged_partitions, restored, partial_restored) = - &mut self.spilled_merged_partitions[index]; - if *restored { - continue; - } - let MergedPartition { - location, - partitions, - } = merged_partitions; - if out_of_memory_limit || *partial_restored { - if let Some(pos) = partitions.iter().position(|(id, _)| *id == partition_id) { - let data_block = self - .spiller - .read_chunk(location, &partitions[pos].1) - .await?; - self.restored_partition_buffer - .add_data_block(partition_id, data_block); - partitions.remove(pos); - *partial_restored = true; - } - } else { - let partitioned_data = self + let result = self.restore_by_id(partition_id).await?; + if !result.is_empty() { + return Ok(result); + } + } + Ok(vec![]) + } + + pub async fn restore_by_id(&mut self, partition_id: usize) -> Result> { + // Restore large partitions from spilled files. + let mut result = self.spiller.take_spilled_partition(&partition_id).await?; + + // Restore small merged partitions from spilled files. 
+ let spilled_small_partitions = + std::mem::take(&mut self.spilled_small_partitions[partition_id]); + for index in spilled_small_partitions { + let out_of_memory_limit = self.out_of_memory_limit(); + let (merged_partitions, restored, partial_restored) = + &mut self.spilled_merged_partitions[index]; + if *restored { + continue; + } + let MergedPartition { + location, + partitions, + } = merged_partitions; + if out_of_memory_limit || *partial_restored { + if let Some(pos) = partitions.iter().position(|(id, _)| *id == partition_id) { + let data_block = self .spiller - .read_merged_partitions(merged_partitions) + .read_chunk(location, &partitions[pos].1) .await?; - for (partition_id, data_block) in partitioned_data.into_iter() { - self.restored_partition_buffer - .add_data_block(partition_id, data_block); - } - *restored = true; + self.restored_partition_buffer + .add_data_block(partition_id, data_block); + partitions.remove(pos); + *partial_restored = true; } - } - - if !self.partition_buffer.is_partition_empty(partition_id) { - let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); - if let Some(data_blocks) = self - .partition_buffer - .fetch_data_blocks(partition_id, &option)? - { - result.extend(self.concat_data_blocks(data_blocks)?); + } else { + let partitioned_data = self + .spiller + .read_merged_partitions(merged_partitions) + .await?; + for (partition_id, data_block) in partitioned_data.into_iter() { + self.restored_partition_buffer + .add_data_block(partition_id, data_block); } + *restored = true; } + } - if !self - .restored_partition_buffer - .is_partition_empty(partition_id) + if !self.partition_buffer.is_partition_empty(partition_id) { + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + if let Some(data_blocks) = self + .partition_buffer + .fetch_data_blocks(partition_id, &option)? { - let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); - if let Some(data_blocks) = self - .restored_partition_buffer - .fetch_data_blocks(partition_id, &option)? - { - result.extend(self.concat_data_blocks(data_blocks)?); - } + result.extend(self.concat_data_blocks(data_blocks)?); } + } - if !result.is_empty() { - return Ok(result); + if !self + .restored_partition_buffer + .is_partition_empty(partition_id) + { + let option = PartitionBufferFetchOption::PickPartitionWithThreshold(0); + if let Some(data_blocks) = self + .restored_partition_buffer + .fetch_data_blocks(partition_id, &option)? 
+ { + result.extend(self.concat_data_blocks(data_blocks)?); } } - Ok(vec![]) + + Ok(result) } fn concat_data_blocks(&self, data_blocks: Vec) -> Result> { diff --git a/src/query/service/src/spillers/spiller.rs b/src/query/service/src/spillers/spiller.rs index 6c454ab89638b..3627f1b83a117 100644 --- a/src/query/service/src/spillers/spiller.rs +++ b/src/query/service/src/spillers/spiller.rs @@ -386,6 +386,24 @@ impl Spiller { } } + #[async_backtrace::framed] + /// Read and remove spilled data with partition id + pub async fn take_spilled_partition(&mut self, p_id: &usize) -> Result> { + if let Some(locs) = self.partition_location.remove(p_id) { + let mut spilled_data = Vec::with_capacity(locs.len()); + for loc in locs { + let block = self.read_spilled_file(&loc).await?; + + if block.num_rows() != 0 { + spilled_data.push(block); + } + } + Ok(spilled_data) + } else { + Ok(vec![]) + } + } + pub async fn read_merged_partitions( &self, MergedPartition { diff --git a/src/query/sql/src/executor/physical_plans/physical_recluster.rs b/src/query/sql/src/executor/physical_plans/physical_recluster.rs index 43236e53766a5..0a5520ccb5ba2 100644 --- a/src/query/sql/src/executor/physical_plans/physical_recluster.rs +++ b/src/query/sql/src/executor/physical_plans/physical_recluster.rs @@ -33,6 +33,7 @@ pub struct HilbertPartition { pub table_info: TableInfo, pub table_meta_timestamps: TableMetaTimestamps, pub rows_per_block: usize, + pub bytes_per_block: usize, pub range_start: u64, pub range_width: usize, } diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 27eaf89c6616d..d0b8ca8d5d288 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -233,11 +233,16 @@ impl StreamBlockBuilder { } pub fn need_flush(&self) -> bool { + if let Some(max_block_bytes) = self.properties.max_block_bytes { + if self.block_size >= max_block_bytes { + return true; + } + }; let file_size = self.block_writer.compressed_size(); self.row_count >= self.properties.block_thresholds.min_rows_per_block || self.block_size >= self.properties.block_thresholds.max_bytes_per_block || (file_size >= self.properties.block_thresholds.min_compressed_per_block - && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) + && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) } pub fn write(&mut self, block: DataBlock) -> Result<()> { @@ -350,6 +355,7 @@ pub struct StreamBlockProperties { pub(crate) ctx: Arc, pub(crate) write_settings: WriteSettings, pub(crate) block_thresholds: BlockThresholds, + pub(crate) max_block_bytes: Option, meta_locations: TableMetaLocationGenerator, source_schema: TableSchemaRef, @@ -368,6 +374,7 @@ impl StreamBlockProperties { ctx: Arc, table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, + max_block_bytes: Option, ) -> Result> { // remove virtual computed fields. 
let fields = table @@ -430,6 +437,7 @@ impl StreamBlockProperties { ngram_args, inverted_index_builders, table_meta_timestamps, + max_block_bytes, })) } } diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 84bdd8ca77609..57cc6c8af06ea 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -61,6 +61,7 @@ impl FuseTable { self, table_meta_timestamps, false, + None, ) })?; } else { diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index b3d1fd7bb416e..73a85bf4f52c5 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -71,7 +71,7 @@ pub struct TransformBlockWriter { // Only used in multi table insert table_id: Option, - max_block_size: usize, + max_block_rows: usize, input_data: VecDeque, output_data: Option, } @@ -85,12 +85,14 @@ impl TransformBlockWriter { table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, with_tid: bool, + max_block_bytes: Option, ) -> Result { - let max_block_size = std::cmp::min( + let max_block_rows = std::cmp::min( ctx.get_settings().get_max_block_size()? as usize, table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), ); - let properties = StreamBlockProperties::try_create(ctx, table, table_meta_timestamps)?; + let properties = + StreamBlockProperties::try_create(ctx, table, table_meta_timestamps, max_block_bytes)?; Ok(ProcessorPtr::create(Box::new(TransformBlockWriter { state: State::Consume, input, @@ -105,7 +107,7 @@ impl TransformBlockWriter { input_data_size: 0, input_num_rows: 0, output_data: None, - max_block_size, + max_block_rows, }))) } @@ -118,16 +120,16 @@ impl TransformBlockWriter { Ok(self.builder.as_mut().unwrap()) } - fn calc_max_block_size(&self, block: &DataBlock) -> usize { + fn calc_max_block_rows(&self, block: &DataBlock) -> usize { let min_bytes_per_block = self.properties.block_thresholds.min_bytes_per_block; let block_size = block.estimate_block_size(); if block_size < min_bytes_per_block { - return self.max_block_size; + return self.max_block_rows; } let num_rows = block.num_rows(); let average_row_size = block_size.div_ceil(num_rows); let max_rows = min_bytes_per_block.div_ceil(average_row_size); - self.max_block_size.min(max_rows) + self.max_block_rows.min(max_rows) } } @@ -205,9 +207,13 @@ impl Processor for TransformBlockWriter { block.check_valid()?; self.input_data_size += block.estimate_block_size(); self.input_num_rows += block.num_rows(); - let max_rows_per_block = self.calc_max_block_size(&block); - let blocks = block.split_by_rows_no_tail(max_rows_per_block); - self.input_data.extend(blocks); + if self.properties.max_block_bytes.is_some() { + self.input_data.push_back(block); + } else { + let max_rows_per_block = self.calc_max_block_rows(&block); + let blocks = block.split_by_rows_no_tail(max_rows_per_block); + self.input_data.extend(blocks); + } } State::Serialize => { while let Some(b) = self.input_data.pop_front() { From 3cf0b6fdb6871dc939bfe79a3ba560f92e7f9cf3 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 02:45:14 +0800 Subject: [PATCH 04/36] partial restore --- .../window/partition/transform_hilbert_collect.rs | 3 ++- .../window/partition/window_partition_buffer.rs | 10 +++++++--- 
.../storages/fuse/src/io/write/stream/block_builder.rs | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs index 17f6fc17eeb65..cba5ec06cf0d8 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs @@ -211,7 +211,8 @@ impl Processor for TransformHilbertCollect { } State::Flush => { if let Some((partition_id, data_block)) = self.immediate_output_blocks.pop() { - let mut restored_data_blocks = self.buffer.restore_by_id(partition_id).await?; + let mut restored_data_blocks = + self.buffer.restore_by_id(partition_id, true).await?; restored_data_blocks.push(data_block); self.state = State::Concat(restored_data_blocks); } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs index 0a14b73bc26b5..b58bafca0ee9a 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/window_partition_buffer.rs @@ -145,7 +145,7 @@ impl WindowPartitionBuffer { while self.next_to_restore_partition_id + 1 < self.num_partitions as isize { self.next_to_restore_partition_id += 1; let partition_id = self.next_to_restore_partition_id as usize; - let result = self.restore_by_id(partition_id).await?; + let result = self.restore_by_id(partition_id, false).await?; if !result.is_empty() { return Ok(result); } @@ -153,7 +153,11 @@ impl WindowPartitionBuffer { Ok(vec![]) } - pub async fn restore_by_id(&mut self, partition_id: usize) -> Result> { + pub async fn restore_by_id( + &mut self, + partition_id: usize, + partial_restore: bool, + ) -> Result> { // Restore large partitions from spilled files. 
let mut result = self.spiller.take_spilled_partition(&partition_id).await?; @@ -171,7 +175,7 @@ impl WindowPartitionBuffer { location, partitions, } = merged_partitions; - if out_of_memory_limit || *partial_restored { + if out_of_memory_limit || *partial_restored || partial_restore { if let Some(pos) = partitions.iter().position(|(id, _)| *id == partition_id) { let data_block = self .spiller diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index d0b8ca8d5d288..69e81f8dec714 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -242,7 +242,7 @@ impl StreamBlockBuilder { self.row_count >= self.properties.block_thresholds.min_rows_per_block || self.block_size >= self.properties.block_thresholds.max_bytes_per_block || (file_size >= self.properties.block_thresholds.min_compressed_per_block - && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) + && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) } pub fn write(&mut self, block: DataBlock) -> Result<()> { From 82a5457f08532a413107e8d281be084841d5cde1 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 02:52:32 +0800 Subject: [PATCH 05/36] format --- .../servers/flight/v1/exchange/exchange_manager.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs index 13a65e33ebf08..8d96b11c3488d 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs @@ -303,7 +303,7 @@ impl DataExchangeManager { None, Some(config.query.to_rpc_client_tls_config()), ) - .await?, + .await?, ))), false => Ok(FlightClient::new(FlightServiceClient::new( ConnectionFactory::create_rpc_channel(address.to_owned(), None, None).await?, @@ -1011,8 +1011,8 @@ impl FragmentCoordinator { .flight_scatter(&info.query_ctx, data_exchange)?, }), )), - DataExchange::Modulo(exchange) => { - Ok(Some(ExchangeParams::ShuffleExchange(ShuffleExchangeParams { + DataExchange::Modulo(exchange) => Ok(Some(ExchangeParams::ShuffleExchange( + ShuffleExchangeParams { exchange_injector: exchange_injector.clone(), schema: self.physical_plan.output_schema()?, fragment_id: self.fragment_id, @@ -1021,9 +1021,8 @@ impl FragmentCoordinator { destination_ids: exchange.destination_ids.to_owned(), shuffle_scatter: exchange_injector .flight_scatter(&info.query_ctx, data_exchange)?, - }) - )) - } + }, + ))), } } From ac6ca412287eda38293d06d3e50ab2c51d1c47a8 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 18:43:11 +0800 Subject: [PATCH 06/36] add test --- .../ee/src/hilbert_clustering/handler.rs | 22 +++++++++-- .../expression/src/utils/block_thresholds.rs | 2 +- .../expression/tests/it/block_thresholds.rs | 37 ++++++++++++++++--- .../builders/builder_hilbert_partition.rs | 1 + .../partition/transform_hilbert_collect.rs | 12 +++--- 5 files changed, 58 insertions(+), 16 deletions(-) diff --git a/src/query/ee/src/hilbert_clustering/handler.rs b/src/query/ee/src/hilbert_clustering/handler.rs index cebfbadc5947e..c7ee957a77ed9 100644 --- a/src/query/ee/src/hilbert_clustering/handler.rs +++ b/src/query/ee/src/hilbert_clustering/handler.rs @@ -63,7 +63,7 @@ impl HilbertClusteringHandler for RealHilbertClusteringHandler { 
let max_bytes_per_block = fuse_table.get_option( FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD, DEFAULT_BLOCK_BUFFER_SIZE, - ); + ) * 2; let hilbert_min_bytes = std::cmp::max( hilbert_clustering_min_bytes, max_bytes_per_block * block_per_seg, @@ -76,6 +76,7 @@ impl HilbertClusteringHandler for RealHilbertClusteringHandler { let mut checker = ReclusterChecker::new( cluster_key_id, hilbert_min_bytes, + block_per_seg, push_downs.as_ref().is_none_or(|v| v.filters.is_none()), ); 'FOR: for chunk in segment_locations.chunks(chunk_size) { @@ -139,19 +140,29 @@ struct ReclusterChecker { hilbert_min_bytes: usize, total_bytes: usize, + hilbert_min_blocks: usize, + total_blocks: usize, + finished: bool, // Whether the target segments is at the head of snapshot. head_of_snapshot: bool, } impl ReclusterChecker { - fn new(default_cluster_id: u32, hilbert_min_bytes: usize, head_of_snapshot: bool) -> Self { + fn new( + default_cluster_id: u32, + hilbert_min_bytes: usize, + hilbert_min_blocks: usize, + head_of_snapshot: bool, + ) -> Self { Self { segments: vec![], last_segment: None, default_cluster_id, + hilbert_min_blocks, hilbert_min_bytes, total_bytes: 0, + total_blocks: 0, finished: false, head_of_snapshot, } @@ -164,10 +175,14 @@ impl ReclusterChecker { if segment_should_recluster || !self.head_of_snapshot { self.total_bytes += segment.summary.uncompressed_byte_size as usize; + self.total_blocks += segment.summary.block_count as usize; self.segments.push((location.clone(), segment.clone())); } - if !segment_should_recluster || self.total_bytes >= self.hilbert_min_bytes { + if !segment_should_recluster + || (self.total_bytes >= self.hilbert_min_bytes + && self.total_blocks >= self.hilbert_min_blocks) + { if self.check_for_recluster() { self.finished = true; return true; @@ -208,6 +223,7 @@ impl ReclusterChecker { fn reset(&mut self) { self.total_bytes = 0; + self.total_blocks = 0; self.head_of_snapshot = false; self.segments.clear(); } diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index fe72302382d8a..742bfdf489261 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -153,7 +153,7 @@ impl BlockThresholds { let bytes_per_block = total_bytes.div_ceil(block_num_by_compressed); // Adjust the number of blocks based on block size thresholds. let max_bytes_per_block = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes_per_block = self.min_bytes_per_block.min(100 * 1024 * 1024); + let min_bytes_per_block = max_bytes_per_block / 2; let block_nums = if bytes_per_block > max_bytes_per_block { // Case 1: If the block size is too bigger. total_bytes.div_ceil(max_bytes_per_block) diff --git a/src/query/expression/tests/it/block_thresholds.rs b/src/query/expression/tests/it/block_thresholds.rs index 08793eb2a78e4..371a8194f552c 100644 --- a/src/query/expression/tests/it/block_thresholds.rs +++ b/src/query/expression/tests/it/block_thresholds.rs @@ -15,7 +15,7 @@ use databend_common_expression::BlockThresholds; fn default_thresholds() -> BlockThresholds { - BlockThresholds::new(1000, 1_000_000, 100_000, 4) + BlockThresholds::new(1_000, 1_000_000, 100_000, 4) } #[test] @@ -101,14 +101,41 @@ fn test_calc_rows_for_recluster() { ); // Case 1: If the block size is too bigger. 
- let result = t.calc_rows_for_recluster(4_000, 30_000_000, 600_000); - assert_eq!(result, 400); + let result = t.calc_rows_for_recluster(4_500, 30_000_000, 600_000); + assert_eq!(result, 300); // Case 2: If the block size is too smaller. - let result = t.calc_rows_for_recluster(4_000, 2_000_000, 600_000); - assert_eq!(result, 800); + let result = t.calc_rows_for_recluster(4_000, 4_000_000, 600_000); + assert_eq!(result, 1000); // Case 3: use the compressed-based block count. let result = t.calc_rows_for_recluster(4_000, 10_000_000, 600_000); assert_eq!(result, 667); } + +#[test] +fn test_calc_partitions_for_recluster() { + let t = default_thresholds(); + + // compact enough to skip further calculations + assert_eq!(t.calc_partitions_for_recluster(1000, 500_000, 100_000), 1); + + // row-based block count exceeds compressed-based block count, use max rows per block. + assert_eq!( + t.calc_partitions_for_recluster(10_000, 2_000_000, 100_000), + 10 + ); + + // Case 1: If the block size is too bigger. + let result = t.calc_partitions_for_recluster(4_500, 30_000_000, 600_000); + assert_eq!(result, 15); + + // Case 2: If the block size is too smaller. + let result = t.calc_partitions_for_recluster(4_000, 4_000_000, 600_000); + assert_eq!(result, 4); + + // Case 3: use the compressed-based block count. + let result = t.calc_partitions_for_recluster(4_000, 10_000_000, 600_000); + assert_eq!(result, 6); +} + diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index fd351cd1e5fb7..1ffe3e5e2c69b 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -78,6 +78,7 @@ impl PipelineBuilder { partition.range_width, window_spill_settings.clone(), disk_spill.clone(), + partition.rows_per_block, partition.bytes_per_block, )?, ))) diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs index cba5ec06cf0d8..9b6928fb58a9b 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs @@ -73,6 +73,7 @@ impl TransformHilbertCollect { num_partitions: usize, memory_settings: MemorySettings, disk_spill: Option, + max_block_rows: usize, max_block_size: usize, ) -> Result { // Calculate the partition ids collected by the processor. @@ -99,13 +100,10 @@ impl TransformHilbertCollect { let spiller = Spiller::create(ctx, operator, spill_config)?; // Create the window partition buffer. - let sort_block_size = settings.get_window_partition_sort_block_size()? as usize; - let buffer = WindowPartitionBuffer::new( - spiller, - partitions.len(), - sort_block_size, - memory_settings, - )?; + let max_block_rows = + max_block_rows.min(settings.get_window_partition_sort_block_size()? 
as usize); + let buffer = + WindowPartitionBuffer::new(spiller, partitions.len(), max_block_rows, memory_settings)?; Ok(Self { input, From f3dbc57d2249a8da8b667d43c0eba1d85d3f75eb Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 19:01:53 +0800 Subject: [PATCH 07/36] fix test --- .../ee/07_hilbert_clustering/07_0000_recluster_final.test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test index b3354e66740f6..6fc4bfbf44841 100644 --- a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test +++ b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test @@ -48,12 +48,12 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); statement ok -alter table t recluster final; +alter table t recluster; query I select count() from fuse_snapshot('test_hilbert','t'); ---- -6 +5 query II select count(a), sum(a) from t; @@ -77,7 +77,7 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); query I select count() from fuse_snapshot('test_hilbert','t'); ---- -9 +8 query II select block_count,row_count from fuse_segment('test_hilbert','t'); @@ -109,7 +109,7 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); query T select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t', 'a,b'); ---- -(a, b) linear {"average_depth":1.4,"average_overlaps":0.4,"block_depth_histogram":{"00001":3,"00002":2},"constant_block_count":0,"total_block_count":5} +(a, b) linear {"average_depth":1.0,"average_overlaps":0.0,"block_depth_histogram":{"00001":5},"constant_block_count":0,"total_block_count":5} # column specified not exist statement error 1065 From 22f2d3a1bc078c557423e1c60e81d4cf9093f3b4 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 9 May 2025 19:12:15 +0800 Subject: [PATCH 08/36] format --- src/query/expression/tests/it/block_thresholds.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/query/expression/tests/it/block_thresholds.rs b/src/query/expression/tests/it/block_thresholds.rs index 371a8194f552c..e7ad1304ae3cc 100644 --- a/src/query/expression/tests/it/block_thresholds.rs +++ b/src/query/expression/tests/it/block_thresholds.rs @@ -138,4 +138,3 @@ fn test_calc_partitions_for_recluster() { let result = t.calc_partitions_for_recluster(4_000, 10_000_000, 600_000); assert_eq!(result, 6); } - From f5e0491fd5dee7d3c29e3b5e89216e52daaa494c Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 14 May 2025 02:05:11 +0800 Subject: [PATCH 09/36] add compact strategy --- .../pipelines/processors/transforms/mod.rs | 2 + .../transforms/recluster/compact_strategy.rs | 78 +++++++++++++++++++ .../hilbert_partition_exchange.rs | 4 - .../processors/transforms/recluster/mod.rs | 21 +++++ .../transform_hilbert_collect.rs | 24 +++--- .../partition/data_processor_strategy.rs | 60 -------------- .../transforms/window/partition/mod.rs | 4 - 7 files changed, 114 insertions(+), 79 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs rename src/query/service/src/pipelines/processors/transforms/{window/partition => recluster}/hilbert_partition_exchange.rs (90%) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/mod.rs rename src/query/service/src/pipelines/processors/transforms/{window/partition => 
recluster}/transform_hilbert_collect.rs (90%) diff --git a/src/query/service/src/pipelines/processors/transforms/mod.rs b/src/query/service/src/pipelines/processors/transforms/mod.rs index 80966daa5fa8d..5c7c2264f7a4f 100644 --- a/src/query/service/src/pipelines/processors/transforms/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/mod.rs @@ -17,6 +17,7 @@ pub mod aggregator; mod broadcast; mod hash_join; pub(crate) mod range_join; +mod recluster; mod runtime_pool; mod transform_add_computed_columns; mod transform_add_const_columns; @@ -46,6 +47,7 @@ mod window; pub use broadcast::BroadcastSinkProcessor; pub use broadcast::BroadcastSourceProcessor; pub use hash_join::*; +pub use recluster::*; pub use transform_add_computed_columns::TransformAddComputedColumns; pub use transform_add_const_columns::TransformAddConstColumns; pub use transform_add_internal_columns::TransformAddInternalColumns; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs new file mode 100644 index 0000000000000..bd02855159648 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs @@ -0,0 +1,78 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
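+
+//! The compact strategy buffers the data blocks restored for a partition and
+//! concatenates them once the accumulated row count or estimated byte size
+//! crosses the configured per-block thresholds, so downstream serialization
+//! receives fewer, larger blocks.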
+ +use databend_common_exception::Result; +use databend_common_expression::DataBlock; + +use crate::pipelines::processors::transforms::DataProcessorStrategy; + +pub struct CompactStrategy { + max_bytes_per_block: usize, + max_rows_per_block: usize, +} + +impl CompactStrategy { + pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { + Self { + max_bytes_per_block, + max_rows_per_block, + } + } + + fn concat_blocks(blocks: Vec) -> Result { + DataBlock::concat(&blocks) + } + + fn check_large_enough(&self, rows: usize, bytes: usize) -> bool { + rows >= self.max_rows_per_block || bytes >= self.max_bytes_per_block + } +} + +impl DataProcessorStrategy for CompactStrategy { + const NAME: &'static str = "Compact"; + + fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + let blocks_num = data_blocks.len(); + if blocks_num < 2 { + return Ok(data_blocks); + } + + let mut accumulated_rows = 0; + let mut accumulated_bytes = 0; + let mut pending_blocks = Vec::with_capacity(blocks_num); + let mut staged_blocks = Vec::with_capacity(blocks_num); + let mut result = Vec::with_capacity(blocks_num); + for block in data_blocks { + accumulated_rows += block.num_rows(); + accumulated_bytes += block.estimate_block_size(); + pending_blocks.push(block); + if !self.check_large_enough(accumulated_rows, accumulated_bytes) { + continue; + } + if !staged_blocks.is_empty() { + result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + std::mem::swap(&mut staged_blocks, &mut pending_blocks); + accumulated_rows = 0; + accumulated_bytes = 0; + } + + staged_blocks.append(&mut pending_blocks); + if !staged_blocks.is_empty() { + result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + + Ok(result) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs rename to src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs index 16215dded2b15..c6a79277af909 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/hilbert_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs @@ -12,10 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Some variables and functions are named and designed with reference to ClickHouse. -// - https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/Transforms/WindowTransform.h -// - https://github.com/ClickHouse/ClickHouse/blob/master/src/Processors/Transforms/WindowTransform.cpp - use std::sync::Arc; use databend_common_exception::Result; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs new file mode 100644 index 0000000000000..aba21e76d3cd2 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -0,0 +1,21 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod compact_strategy; +mod hilbert_partition_exchange; +mod transform_hilbert_collect; + +pub use compact_strategy::CompactStrategy; +pub use hilbert_partition_exchange::HilbertPartitionExchange; +pub use transform_hilbert_collect::TransformHilbertCollect; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs similarity index 90% rename from src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs rename to src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs index 9b6928fb58a9b..07740e7b56377 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_hilbert_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs @@ -27,8 +27,10 @@ use databend_common_pipeline_transforms::MemorySettings; use databend_common_settings::Settings; use databend_common_storage::DataOperator; -use super::WindowPartitionBuffer; -use super::WindowPartitionMeta; +use crate::pipelines::processors::transforms::CompactStrategy; +use crate::pipelines::processors::transforms::DataProcessorStrategy; +use crate::pipelines::processors::transforms::WindowPartitionBuffer; +use crate::pipelines::processors::transforms::WindowPartitionMeta; use crate::sessions::QueryContext; use crate::spillers::Spiller; use crate::spillers::SpillerConfig; @@ -40,7 +42,7 @@ enum State { Flush, Spill, Restore, - Concat(Vec), + Compact(Vec), } pub struct TransformHilbertCollect { @@ -56,6 +58,7 @@ pub struct TransformHilbertCollect { // The buffer is used to control the memory usage of the window operator. buffer: WindowPartitionBuffer, + compact_strategy: CompactStrategy, max_block_size: usize, // Event variables. state: State, @@ -100,8 +103,6 @@ impl TransformHilbertCollect { let spiller = Spiller::create(ctx, operator, spill_config)?; // Create the window partition buffer. - let max_block_rows = - max_block_rows.min(settings.get_window_partition_sort_block_size()? 
as usize); let buffer = WindowPartitionBuffer::new(spiller, partitions.len(), max_block_rows, memory_settings)?; @@ -113,6 +114,7 @@ impl TransformHilbertCollect { immediate_output_blocks: vec![], partition_sizes: vec![0; num_partitions], max_block_size, + compact_strategy: CompactStrategy::new(max_block_rows, max_block_size), output_data_blocks: VecDeque::new(), state: State::Collect, }) @@ -130,7 +132,7 @@ impl Processor for TransformHilbertCollect { } fn event(&mut self) -> Result { - if matches!(self.state, State::Concat(_)) { + if matches!(self.state, State::Compact(_)) { return Ok(Event::Sync); } @@ -192,9 +194,9 @@ impl Processor for TransformHilbertCollect { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.state, State::Collect) { - State::Concat(blocks) => { - let output = DataBlock::concat(&blocks)?; - self.output_data_blocks.push_back(output); + State::Compact(blocks) => { + let output = self.compact_strategy.process_data_blocks(blocks)?; + self.output_data_blocks.extend(output); } _ => unreachable!(), } @@ -212,12 +214,12 @@ impl Processor for TransformHilbertCollect { let mut restored_data_blocks = self.buffer.restore_by_id(partition_id, true).await?; restored_data_blocks.push(data_block); - self.state = State::Concat(restored_data_blocks); + self.state = State::Compact(restored_data_blocks); } } State::Restore => { let restored_data_blocks = self.buffer.restore().await?; - self.output_data_blocks.extend(restored_data_blocks); + self.state = State::Compact(restored_data_blocks); } _ => unreachable!(), } diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs index 75793aa415e08..d0808f1d423ef 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs @@ -24,66 +24,6 @@ pub trait DataProcessorStrategy: Send + Sync + 'static { fn process_data_blocks(&self, data_blocks: Vec) -> Result>; } -pub struct CompactStrategy { - max_bytes_per_block: usize, - max_rows_per_block: usize, -} - -impl CompactStrategy { - pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { - Self { - max_bytes_per_block, - max_rows_per_block, - } - } - - fn concat_blocks(blocks: Vec) -> Result { - DataBlock::concat(&blocks) - } - - fn check_large_enough(&self, rows: usize, bytes: usize) -> bool { - rows >= self.max_rows_per_block || bytes >= self.max_bytes_per_block - } -} - -impl DataProcessorStrategy for CompactStrategy { - const NAME: &'static str = "Compact"; - - fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - let blocks_num = data_blocks.len(); - if blocks_num < 2 { - return Ok(data_blocks); - } - - let mut accumulated_rows = 0; - let mut accumulated_bytes = 0; - let mut pending_blocks = Vec::with_capacity(blocks_num); - let mut staged_blocks = Vec::with_capacity(blocks_num); - let mut result = Vec::with_capacity(blocks_num); - for block in data_blocks { - accumulated_rows += block.num_rows(); - accumulated_bytes += block.estimate_block_size(); - pending_blocks.push(block); - if !self.check_large_enough(accumulated_rows, accumulated_bytes) { - continue; - } - if !staged_blocks.is_empty() { - result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - std::mem::swap(&mut staged_blocks, &mut pending_blocks); - accumulated_rows 
= 0; - accumulated_bytes = 0; - } - - staged_blocks.append(&mut pending_blocks); - if !staged_blocks.is_empty() { - result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - - Ok(result) - } -} - pub struct SortStrategy { sort_desc: Vec, schema: DataSchemaRef, diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index 96edfcc986434..aaa93a459f8b6 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ -13,8 +13,6 @@ // limitations under the License. mod data_processor_strategy; -mod hilbert_partition_exchange; -mod transform_hilbert_collect; mod transform_window_partition_collect; mod window_partition_buffer; mod window_partition_exchange; @@ -22,8 +20,6 @@ mod window_partition_meta; mod window_partition_partial_top_n_exchange; pub use data_processor_strategy::*; -pub use hilbert_partition_exchange::*; -pub use transform_hilbert_collect::*; pub use transform_window_partition_collect::*; pub use window_partition_buffer::*; pub use window_partition_exchange::*; From 504e60f3b7b0890d0fe7e8b15b788f729068d4a9 Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 14 May 2025 09:39:34 +0800 Subject: [PATCH 10/36] fix --- src/query/ee/src/hilbert_clustering/handler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/query/ee/src/hilbert_clustering/handler.rs b/src/query/ee/src/hilbert_clustering/handler.rs index c7ee957a77ed9..dc116c57ab13b 100644 --- a/src/query/ee/src/hilbert_clustering/handler.rs +++ b/src/query/ee/src/hilbert_clustering/handler.rs @@ -63,7 +63,7 @@ impl HilbertClusteringHandler for RealHilbertClusteringHandler { let max_bytes_per_block = fuse_table.get_option( FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD, DEFAULT_BLOCK_BUFFER_SIZE, - ) * 2; + ); let hilbert_min_bytes = std::cmp::max( hilbert_clustering_min_bytes, max_bytes_per_block * block_per_seg, From 0798c0f5e60b5ca9d658ab9795e4b7c3e8d3b814 Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 15 May 2025 02:12:02 +0800 Subject: [PATCH 11/36] fix test --- .../ee/07_hilbert_clustering/07_0000_recluster_final.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test index 6fc4bfbf44841..c1f3c647936ab 100644 --- a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test +++ b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test @@ -77,7 +77,7 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); query I select count() from fuse_snapshot('test_hilbert','t'); ---- -8 +9 query II select block_count,row_count from fuse_segment('test_hilbert','t'); @@ -109,7 +109,7 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); query T select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t', 'a,b'); ---- -(a, b) linear {"average_depth":1.0,"average_overlaps":0.0,"block_depth_histogram":{"00001":5},"constant_block_count":0,"total_block_count":5} +(a, b) linear {"average_depth":1.4,"average_overlaps":0.4,"block_depth_histogram":{"00001":3,"00002":2},"constant_block_count":0,"total_block_count":5} # column specified not exist statement error 1065 From 
98279ec6bbb39fe7b130eb683332563f25a37302 Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 15 May 2025 10:32:29 +0800 Subject: [PATCH 12/36] fix test --- .../07_hilbert_clustering/07_0000_recluster_final.test | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test index c1f3c647936ab..2416bb509bea1 100644 --- a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test +++ b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test @@ -21,6 +21,9 @@ USE test_hilbert statement ok create or replace table t(a int, b int) cluster by hilbert(a, b) row_per_block=2 block_per_segment=2 block_size_threshold = 18; +statement ok +set enable_block_stream_write = 0 + statement ok set hilbert_clustering_min_bytes = 35; @@ -97,10 +100,10 @@ select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); statement ok alter table t recluster final; -query T -select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t'); +query II +select info:partial_segment_count, info:unclustered_segment_count from clustering_information('test_hilbert','t'); ---- -(b, a) hilbert {"partial_block_count":0,"partial_segment_count":0,"stable_block_count":5,"stable_segment_count":2,"total_block_count":5,"total_segment_count":2,"unclustered_block_count":0,"unclustered_segment_count":0} +0 0 ######################################################## # force eval as linear clustering by specify columns # From d150dc03ea646af7461b4d0f5f297f454c302c09 Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 15 May 2025 16:47:48 +0800 Subject: [PATCH 13/36] fix test --- .../ee/07_hilbert_clustering/07_0000_recluster_final.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test index 2416bb509bea1..4e0822f4589db 100644 --- a/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test +++ b/tests/sqllogictests/suites/ee/07_hilbert_clustering/07_0000_recluster_final.test @@ -110,9 +110,9 @@ select info:partial_segment_count, info:unclustered_segment_count from clusterin ######################################################## query T -select * EXCLUDE(timestamp) from clustering_information('test_hilbert','t', 'a,b'); +select cluster_key, type, info:constant_block_count from clustering_information('test_hilbert','t', 'a,b'); ---- -(a, b) linear {"average_depth":1.4,"average_overlaps":0.4,"block_depth_histogram":{"00001":3,"00002":2},"constant_block_count":0,"total_block_count":5} +(a, b) linear 0 # column specified not exist statement error 1065 From 797177f67f65af894922678263ce3eabe9065828 Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 19 May 2025 12:40:41 +0800 Subject: [PATCH 14/36] spill block build and block write --- .../expression/src/utils/block_thresholds.rs | 4 +- .../builders/builder_hilbert_partition.rs | 66 ++--- .../src/pipelines/builders/builder_window.rs | 8 +- .../transforms/recluster/compact_strategy.rs | 78 ------ .../processors/transforms/recluster/mod.rs | 11 +- ...nge.rs => recluster_partition_exchange.rs} | 20 +- .../recluster/recluster_partition_strategy.rs | 151 +++++++++++ .../recluster/transform_hilbert_collect.rs | 254 ------------------ 
.../transforms/window/partition/mod.rs | 4 +- ...ategy.rs => partition_process_strategy.rs} | 30 ++- .../transform_window_partition_collect.rs | 34 +-- src/query/storages/fuse/src/io/mod.rs | 4 +- .../fuse/src/io/write/block_writer.rs | 8 + .../fuse/src/io/write/bloom_index_writer.rs | 1 + .../src/io/write/inverted_index_writer.rs | 1 + src/query/storages/fuse/src/io/write/mod.rs | 4 +- .../fuse/src/io/write/stream/block_builder.rs | 15 +- .../storages/fuse/src/io/write/stream/mod.rs | 4 +- .../storages/fuse/src/operations/append.rs | 10 +- .../src/operations/common/processors/mod.rs | 1 + .../processors/transform_block_writer.rs | 116 ++++---- 21 files changed, 338 insertions(+), 486 deletions(-) delete mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs rename src/query/service/src/pipelines/processors/transforms/recluster/{hilbert_partition_exchange.rs => recluster_partition_exchange.rs} (78%) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs delete mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs rename src/query/service/src/pipelines/processors/transforms/window/partition/{data_processor_strategy.rs => partition_process_strategy.rs} (75%) diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index 742bfdf489261..01c0631abe124 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -153,7 +153,7 @@ impl BlockThresholds { let bytes_per_block = total_bytes.div_ceil(block_num_by_compressed); // Adjust the number of blocks based on block size thresholds. let max_bytes_per_block = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes_per_block = max_bytes_per_block / 2; + let min_bytes_per_block = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); let block_nums = if bytes_per_block > max_bytes_per_block { // Case 1: If the block size is too bigger. total_bytes.div_ceil(max_bytes_per_block) @@ -201,7 +201,7 @@ impl BlockThresholds { // Adjust block count based on byte size thresholds. let bytes_per_block = total_bytes.div_ceil(by_compressed); let max_bytes = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes = max_bytes / 2; + let min_bytes = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); let total_partitions = if bytes_per_block > max_bytes { // Block size is too large. 
total_bytes / max_bytes diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 1ffe3e5e2c69b..9ae7941e01e9e 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -20,8 +20,10 @@ use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_transforms::MemorySettings; +use databend_common_pipeline_transforms::TransformPipelineHelper; use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationKind; +use databend_common_storages_fuse::io::StreamBlockProperties; use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::statistics::ClusterStatsGenerator; @@ -29,10 +31,10 @@ use databend_common_storages_fuse::FuseTable; use databend_storages_common_cache::TempDirManager; use crate::pipelines::memory_settings::MemorySettingsExt; -use crate::pipelines::processors::transforms::CompactStrategy; -use crate::pipelines::processors::transforms::HilbertPartitionExchange; -use crate::pipelines::processors::transforms::TransformHilbertCollect; -use crate::pipelines::processors::transforms::TransformWindowPartitionCollect; +use crate::pipelines::processors::transforms::CompactPartitionStrategy; +use crate::pipelines::processors::transforms::ReclusterPartitionExchange; +use crate::pipelines::processors::transforms::ReclusterPartitionStrategy; +use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::PipelineBuilder; use crate::spillers::SpillerDiskConfig; @@ -49,7 +51,7 @@ impl PipelineBuilder { self.main_pipeline.exchange( num_processors, - HilbertPartitionExchange::create(partition.range_start, partition.range_width), + ReclusterPartitionExchange::create(partition.range_start, partition.range_width), ); let settings = self.ctx.get_settings(); @@ -66,9 +68,15 @@ impl PipelineBuilder { let processor_id = AtomicUsize::new(0); if enable_stream_writer { + let properties = StreamBlockProperties::try_create( + self.ctx.clone(), + table, + partition.table_meta_timestamps, + )?; + self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(Box::new( - TransformHilbertCollect::new( + TransformPartitionCollect::new( self.ctx.clone(), input, output, @@ -78,28 +86,24 @@ impl PipelineBuilder { partition.range_width, window_spill_settings.clone(), disk_spill.clone(), - partition.rows_per_block, - partition.bytes_per_block, + ReclusterPartitionStrategy::new(properties.clone()), )?, ))) })?; - self.main_pipeline.add_transform(|input, output| { - TransformBlockWriter::try_create( + self.main_pipeline.add_async_accumulating_transformer(|| { + TransformBlockWriter::create( self.ctx.clone(), - input, - output, MutationKind::Recluster, table, - partition.table_meta_timestamps, false, - Some(partition.bytes_per_block), ) - }) + }); + Ok(()) } else { self.main_pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(Box::new( - TransformWindowPartitionCollect::new( + TransformPartitionCollect::new( self.ctx.clone(), input, output, @@ -109,24 +113,26 @@ impl PipelineBuilder { partition.range_width, window_spill_settings.clone(), disk_spill.clone(), - 
CompactStrategy::new(partition.rows_per_block, partition.bytes_per_block), + CompactPartitionStrategy::new( + partition.rows_per_block, + partition.bytes_per_block, + ), )?, ))) })?; - self.main_pipeline - .add_transform(|transform_input_port, transform_output_port| { - let proc = TransformSerializeBlock::try_create( - self.ctx.clone(), - transform_input_port, - transform_output_port, - table, - ClusterStatsGenerator::default(), - MutationKind::Recluster, - partition.table_meta_timestamps, - )?; - proc.into_processor() - }) + self.main_pipeline.add_transform(|input, output| { + let proc = TransformSerializeBlock::try_create( + self.ctx.clone(), + input, + output, + table, + ClusterStatsGenerator::default(), + MutationKind::Recluster, + partition.table_meta_timestamps, + )?; + proc.into_processor() + }) } } } diff --git a/src/query/service/src/pipelines/builders/builder_window.rs b/src/query/service/src/pipelines/builders/builder_window.rs index 187bb25d7dd77..64dbbe0e41e18 100644 --- a/src/query/service/src/pipelines/builders/builder_window.rs +++ b/src/query/service/src/pipelines/builders/builder_window.rs @@ -30,11 +30,11 @@ use databend_storages_common_cache::TempDirManager; use crate::pipelines::memory_settings::MemorySettingsExt; use crate::pipelines::processors::transforms::FrameBound; -use crate::pipelines::processors::transforms::SortStrategy; +use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::processors::transforms::TransformWindow; -use crate::pipelines::processors::transforms::TransformWindowPartitionCollect; use crate::pipelines::processors::transforms::WindowFunctionInfo; use crate::pipelines::processors::transforms::WindowPartitionExchange; +use crate::pipelines::processors::transforms::WindowPartitionStrategy; use crate::pipelines::processors::transforms::WindowPartitionTopNExchange; use crate::pipelines::processors::transforms::WindowSortDesc; use crate::pipelines::PipelineBuilder; @@ -203,14 +203,14 @@ impl PipelineBuilder { let processor_id = AtomicUsize::new(0); self.main_pipeline.add_transform(|input, output| { - let strategy = SortStrategy::try_create( + let strategy = WindowPartitionStrategy::try_create( &settings, sort_desc.clone(), plan_schema.clone(), have_order_col, )?; Ok(ProcessorPtr::create(Box::new( - TransformWindowPartitionCollect::new( + TransformPartitionCollect::new( self.ctx.clone(), input, output, diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs deleted file mode 100644 index bd02855159648..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/recluster/compact_strategy.rs +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use databend_common_exception::Result; -use databend_common_expression::DataBlock; - -use crate::pipelines::processors::transforms::DataProcessorStrategy; - -pub struct CompactStrategy { - max_bytes_per_block: usize, - max_rows_per_block: usize, -} - -impl CompactStrategy { - pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { - Self { - max_bytes_per_block, - max_rows_per_block, - } - } - - fn concat_blocks(blocks: Vec) -> Result { - DataBlock::concat(&blocks) - } - - fn check_large_enough(&self, rows: usize, bytes: usize) -> bool { - rows >= self.max_rows_per_block || bytes >= self.max_bytes_per_block - } -} - -impl DataProcessorStrategy for CompactStrategy { - const NAME: &'static str = "Compact"; - - fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - let blocks_num = data_blocks.len(); - if blocks_num < 2 { - return Ok(data_blocks); - } - - let mut accumulated_rows = 0; - let mut accumulated_bytes = 0; - let mut pending_blocks = Vec::with_capacity(blocks_num); - let mut staged_blocks = Vec::with_capacity(blocks_num); - let mut result = Vec::with_capacity(blocks_num); - for block in data_blocks { - accumulated_rows += block.num_rows(); - accumulated_bytes += block.estimate_block_size(); - pending_blocks.push(block); - if !self.check_large_enough(accumulated_rows, accumulated_bytes) { - continue; - } - if !staged_blocks.is_empty() { - result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - std::mem::swap(&mut staged_blocks, &mut pending_blocks); - accumulated_rows = 0; - accumulated_bytes = 0; - } - - staged_blocks.append(&mut pending_blocks); - if !staged_blocks.is_empty() { - result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - - Ok(result) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index aba21e76d3cd2..a3c680958f00b 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod compact_strategy; -mod hilbert_partition_exchange; -mod transform_hilbert_collect; +mod recluster_partition_exchange; +mod recluster_partition_strategy; -pub use compact_strategy::CompactStrategy; -pub use hilbert_partition_exchange::HilbertPartitionExchange; -pub use transform_hilbert_collect::TransformHilbertCollect; +pub use recluster_partition_exchange::ReclusterPartitionExchange; +pub use recluster_partition_strategy::CompactPartitionStrategy; +pub use recluster_partition_strategy::ReclusterPartitionStrategy; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs similarity index 78% rename from src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs rename to src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index c6a79277af909..221d4328ef67a 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/hilbert_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -20,19 +20,19 @@ use databend_common_pipeline_core::processors::Exchange; use crate::pipelines::processors::transforms::WindowPartitionMeta; -pub struct HilbertPartitionExchange { +pub struct ReclusterPartitionExchange { start: u64, width: usize, } -impl HilbertPartitionExchange { - pub fn create(start: u64, width: usize) -> Arc { - Arc::new(HilbertPartitionExchange { start, width }) +impl ReclusterPartitionExchange { + pub fn create(start: u64, width: usize) -> Arc { + Arc::new(ReclusterPartitionExchange { start, width }) } } -impl Exchange for HilbertPartitionExchange { - const NAME: &'static str = "Hilbert"; +impl Exchange for ReclusterPartitionExchange { + const NAME: &'static str = "Recluster"; fn partition(&self, data_block: DataBlock, n: usize) -> Result> { let mut data_block = data_block; let range_ids = data_block @@ -51,16 +51,10 @@ impl Exchange for HilbertPartitionExchange { let scatter_indices = DataBlock::divide_indices_by_scatter_size(&indices, self.width); // Partition the data blocks to different processors. - let base = self.width / n; - let remainder = self.width % n; let mut output_data_blocks = vec![vec![]; n]; for (partition_id, indices) in scatter_indices.into_iter().take(self.width).enumerate() { if !indices.is_empty() { - let target = if partition_id < remainder * (base + 1) { - partition_id / (base + 1) - } else { - (partition_id - remainder) / base - }; + let target = (partition_id * n) / self.width; let block = data_block.take_with_optimize_size(&indices)?; output_data_blocks[target].push((partition_id, block)); } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs new file mode 100644 index 0000000000000..7f478c94b8d43 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -0,0 +1,151 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::DataBlock; +use databend_common_storages_fuse::io::StreamBlockBuilder; +use databend_common_storages_fuse::io::StreamBlockProperties; + +use crate::pipelines::processors::transforms::PartitionProcessStrategy; + +/// `ReclusterPartitionStrategy` is used when block stream writing is enabled. +/// It incrementally writes blocks using `StreamBlockBuilder`, which allows +/// partial serialization and flush during reclustering (e.g., Hilbert clustering). +pub struct ReclusterPartitionStrategy { + properties: Arc, +} + +impl ReclusterPartitionStrategy { + pub fn new(properties: Arc) -> Self { + Self { properties } + } +} + +impl PartitionProcessStrategy for ReclusterPartitionStrategy { + const NAME: &'static str = "Recluster"; + + fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec { + (0..num_partitions) + .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) + .collect() + } + + /// Stream write each block, and flush it conditionally based on builder status + /// and input size estimation. + fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + let mut input_sizes: usize = data_blocks.iter().map(|b| b.estimate_block_size()).sum(); + let mut input_rows: usize = data_blocks.iter().map(|b| b.num_rows()).sum(); + + let mut result = Vec::new(); + let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; + for block in data_blocks { + input_sizes -= block.estimate_block_size(); + input_rows -= block.num_rows(); + builder.write(block)?; + if builder.need_flush() && self.properties.check_large_enough(input_rows, input_sizes) { + let serialized = builder.finish()?; + result.push(DataBlock::empty_with_meta(Box::new(serialized))); + builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; + } + } + + if !builder.is_empty() { + let serialized = builder.finish()?; + result.push(DataBlock::empty_with_meta(Box::new(serialized))); + } + Ok(result) + } +} + +/// `CompactPartitionStrategy` is used when stream write is NOT enabled. +/// It uses a traditional "accumulate and concat" strategy to build large blocks +/// once input thresholds (row count or size) are exceeded. 
+pub struct CompactPartitionStrategy { + max_bytes_per_block: usize, + max_rows_per_block: usize, +} + +impl CompactPartitionStrategy { + pub fn new(max_rows_per_block: usize, max_bytes_per_block: usize) -> Self { + Self { + max_bytes_per_block, + max_rows_per_block, + } + } + + fn concat_blocks(blocks: Vec) -> Result { + DataBlock::concat(&blocks) + } + + fn check_large_enough(&self, rows: usize, bytes: usize) -> bool { + rows >= self.max_rows_per_block || bytes >= self.max_bytes_per_block + } +} + +impl PartitionProcessStrategy for CompactPartitionStrategy { + const NAME: &'static str = "Compact"; + + fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec { + (0..num_partitions) + .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) + .collect() + } + + /// Collects blocks into batches and merges them via `concat` when size threshold is reached. + fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + let blocks_num = data_blocks.len(); + if blocks_num < 2 { + return Ok(data_blocks); + } + + let mut accumulated_rows = 0; + let mut accumulated_bytes = 0; + let mut pending_blocks = Vec::with_capacity(blocks_num); + let mut staged_blocks = Vec::with_capacity(blocks_num); + let mut result = Vec::with_capacity(blocks_num); + for block in data_blocks { + accumulated_rows += block.num_rows(); + accumulated_bytes += block.estimate_block_size(); + pending_blocks.push(block); + if !self.check_large_enough(accumulated_rows, accumulated_bytes) { + continue; + } + if !staged_blocks.is_empty() { + result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + std::mem::swap(&mut staged_blocks, &mut pending_blocks); + accumulated_rows = 0; + accumulated_bytes = 0; + } + + staged_blocks.append(&mut pending_blocks); + if !staged_blocks.is_empty() { + result.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + + Ok(result) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs deleted file mode 100644 index 07740e7b56377..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_hilbert_collect.rs +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; -use std::collections::VecDeque; -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::BlockMetaInfoDowncast; -use databend_common_expression::DataBlock; -use databend_common_pipeline_core::processors::Event; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::MemorySettings; -use databend_common_settings::Settings; -use databend_common_storage::DataOperator; - -use crate::pipelines::processors::transforms::CompactStrategy; -use crate::pipelines::processors::transforms::DataProcessorStrategy; -use crate::pipelines::processors::transforms::WindowPartitionBuffer; -use crate::pipelines::processors::transforms::WindowPartitionMeta; -use crate::sessions::QueryContext; -use crate::spillers::Spiller; -use crate::spillers::SpillerConfig; -use crate::spillers::SpillerDiskConfig; -use crate::spillers::SpillerType; - -enum State { - Collect, - Flush, - Spill, - Restore, - Compact(Vec), -} - -pub struct TransformHilbertCollect { - input: Arc, - output: Arc, - - immediate_output_blocks: Vec<(usize, DataBlock)>, - output_data_blocks: VecDeque, - - // The partition id is used to map the partition id to the new partition id. - partition_id: Vec, - partition_sizes: Vec, - // The buffer is used to control the memory usage of the window operator. - buffer: WindowPartitionBuffer, - - compact_strategy: CompactStrategy, - max_block_size: usize, - // Event variables. - state: State, -} - -impl TransformHilbertCollect { - #[allow(clippy::too_many_arguments)] - pub fn new( - ctx: Arc, - input: Arc, - output: Arc, - settings: &Settings, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - memory_settings: MemorySettings, - disk_spill: Option, - max_block_rows: usize, - max_block_size: usize, - ) -> Result { - // Calculate the partition ids collected by the processor. - let partitions: Vec = (0..num_partitions) - .filter(|&partition| partition % num_processors == processor_id) - .collect(); - - // Map each partition id to new partition id. - let mut partition_id = vec![0; num_partitions]; - for (new_partition_id, partition) in partitions.iter().enumerate() { - partition_id[*partition] = new_partition_id; - } - - let location_prefix = ctx.query_id_spill_prefix(); - let spill_config = SpillerConfig { - spiller_type: SpillerType::Window, - location_prefix, - disk_spill, - use_parquet: settings.get_spilling_file_format()?.is_parquet(), - }; - - // Create an inner `Spiller` to spill data. - let operator = DataOperator::instance().spill_operator(); - let spiller = Spiller::create(ctx, operator, spill_config)?; - - // Create the window partition buffer. 
- let buffer = - WindowPartitionBuffer::new(spiller, partitions.len(), max_block_rows, memory_settings)?; - - Ok(Self { - input, - output, - partition_id, - buffer, - immediate_output_blocks: vec![], - partition_sizes: vec![0; num_partitions], - max_block_size, - compact_strategy: CompactStrategy::new(max_block_rows, max_block_size), - output_data_blocks: VecDeque::new(), - state: State::Collect, - }) - } -} - -#[async_trait::async_trait] -impl Processor for TransformHilbertCollect { - fn name(&self) -> String { - "TransformHilbertCollect".to_string() - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if matches!(self.state, State::Compact(_)) { - return Ok(Event::Sync); - } - - if matches!(self.state, State::Flush | State::Spill | State::Restore) { - return Ok(Event::Async); - } - - if self.output.is_finished() { - self.input.finish(); - return Ok(Event::Finished); - } - - if !self.output.can_push() { - return Ok(Event::NeedConsume); - } - - if let Some(data_block) = self.output_data_blocks.pop_front() { - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - - if self.need_spill() { - self.state = State::Spill; - return Ok(Event::Async); - } - - if !self.immediate_output_blocks.is_empty() { - self.state = State::Flush; - return Ok(Event::Async); - } - - if self.input.is_finished() { - if !self.buffer.is_empty() { - self.state = State::Restore; - return Ok(Event::Async); - } - - self.output.finish(); - return Ok(Event::Finished); - } - - if self.input.has_data() { - self.collect_data_block()?; - - if self.need_spill() { - self.state = State::Spill; - return Ok(Event::Async); - } - - if !self.immediate_output_blocks.is_empty() { - self.state = State::Flush; - return Ok(Event::Async); - } - } - - self.input.set_need_data(); - Ok(Event::NeedData) - } - - fn process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Collect) { - State::Compact(blocks) => { - let output = self.compact_strategy.process_data_blocks(blocks)?; - self.output_data_blocks.extend(output); - } - _ => unreachable!(), - } - Ok(()) - } - - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Collect) { - State::Spill => { - self.buffer.spill().await?; - } - State::Flush => { - if let Some((partition_id, data_block)) = self.immediate_output_blocks.pop() { - let mut restored_data_blocks = - self.buffer.restore_by_id(partition_id, true).await?; - restored_data_blocks.push(data_block); - self.state = State::Compact(restored_data_blocks); - } - } - State::Restore => { - let restored_data_blocks = self.buffer.restore().await?; - self.state = State::Compact(restored_data_blocks); - } - _ => unreachable!(), - } - Ok(()) - } -} - -impl TransformHilbertCollect { - fn collect_data_block(&mut self) -> Result<()> { - let data_block = self.input.pull_data().unwrap()?; - if let Some(meta) = data_block - .get_owned_meta() - .and_then(WindowPartitionMeta::downcast_from) - { - for (partition_id, data_block) in meta.partitioned_data.into_iter() { - let new_id = self.partition_id[partition_id]; - self.partition_sizes[new_id] += data_block.estimate_block_size(); - if self.partition_sizes[new_id] >= self.max_block_size { - self.immediate_output_blocks.push((new_id, data_block)); - self.partition_sizes[new_id] = 0; - continue; - } - self.buffer.add_data_block(new_id, data_block); - } - } - Ok(()) - } - - fn need_spill(&mut self) -> bool { - self.buffer.need_spill() - } -} diff 
--git a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs index aaa93a459f8b6..1418388cf2553 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/mod.rs @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod data_processor_strategy; +mod partition_process_strategy; mod transform_window_partition_collect; mod window_partition_buffer; mod window_partition_exchange; mod window_partition_meta; mod window_partition_partial_top_n_exchange; -pub use data_processor_strategy::*; +pub use partition_process_strategy::*; pub use transform_window_partition_collect::*; pub use window_partition_buffer::*; pub use window_partition_exchange::*; diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs similarity index 75% rename from src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs rename to src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs index d0808f1d423ef..bec3f8a84e91f 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/data_processor_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs @@ -19,12 +19,21 @@ use databend_common_expression::SortColumnDescription; use databend_common_pipeline_transforms::sort_merge; use databend_common_settings::Settings; -pub trait DataProcessorStrategy: Send + Sync + 'static { +pub trait PartitionProcessStrategy: Send + Sync + 'static { const NAME: &'static str; + + /// Partition assignment: map partition index to processor via proportional mapping. 
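+    /// For instance (illustrative numbers, not taken from this patch), with
+    /// num_partitions = 8 and num_processors = 3, partition `p` maps to processor
+    /// `(p * 3) / 8`: processor 0 owns {0, 1, 2}, processor 1 owns {3, 4, 5}, and
+    /// processor 2 owns {6, 7}, so each processor receives a contiguous range of
+    /// partition ids of nearly equal size.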
+ fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec; + fn process_data_blocks(&self, data_blocks: Vec) -> Result>; } -pub struct SortStrategy { +pub struct WindowPartitionStrategy { sort_desc: Vec, schema: DataSchemaRef, max_block_size: usize, @@ -33,7 +42,7 @@ pub struct SortStrategy { have_order_col: bool, } -impl SortStrategy { +impl WindowPartitionStrategy { pub fn try_create( settings: &Settings, sort_desc: Vec, @@ -54,8 +63,19 @@ impl SortStrategy { } } -impl DataProcessorStrategy for SortStrategy { - const NAME: &'static str = "Sort"; +impl PartitionProcessStrategy for WindowPartitionStrategy { + const NAME: &'static str = "Window"; + + fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec { + (0..num_partitions) + .filter(|&partition| partition % num_processors == processor_id) + .collect() + } fn process_data_blocks(&self, data_blocks: Vec) -> Result> { let data_blocks = data_blocks diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index 3051a2f0f018c..d1f011404223b 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -33,7 +33,7 @@ use databend_common_storage::DataOperator; use super::WindowPartitionBuffer; use super::WindowPartitionMeta; -use crate::pipelines::processors::transforms::DataProcessorStrategy; +use crate::pipelines::processors::transforms::PartitionProcessStrategy; use crate::sessions::QueryContext; use crate::spillers::Spiller; use crate::spillers::SpillerConfig; @@ -59,7 +59,7 @@ pub enum AsyncStep { Restore, } -pub struct TransformWindowPartitionCollect { +pub struct TransformPartitionCollect { input: Arc, output: Arc, @@ -78,7 +78,7 @@ pub struct TransformWindowPartitionCollect { is_collect_finished: bool, } -impl TransformWindowPartitionCollect { +impl TransformPartitionCollect { pub fn new( ctx: Arc, input: Arc, @@ -92,9 +92,7 @@ impl TransformWindowPartitionCollect { strategy: S, ) -> Result { // Calculate the partition ids collected by the processor. - let partitions: Vec = (0..num_partitions) - .filter(|&partition| partition % num_processors == processor_id) - .collect(); + let partitions = strategy.calc_partitions(processor_id, num_processors, num_partitions); // Map each partition id to new partition id. let mut partition_id = vec![0; num_partitions]; @@ -162,11 +160,7 @@ impl TransformWindowPartitionCollect { } if self.input.has_data() { - Self::collect_data_block( - self.input.pull_data().unwrap()?, - &self.partition_id, - &mut self.buffer, - ); + self.collect_data_block()?; } // Check again. 
flush memory data to external storage if need @@ -209,9 +203,9 @@ impl TransformWindowPartitionCollect { } #[async_trait::async_trait] -impl Processor for TransformWindowPartitionCollect { +impl Processor for TransformPartitionCollect { fn name(&self) -> String { - format!("TransformWindowPartitionCollect({})", S::NAME) + format!("TransformPartitionCollect({})", S::NAME) } fn as_any(&mut self) -> &mut dyn Any { @@ -271,21 +265,19 @@ impl Processor for TransformWindowPartitionCollect } } -impl TransformWindowPartitionCollect { - fn collect_data_block( - data_block: DataBlock, - partition_ids: &[usize], - buffer: &mut WindowPartitionBuffer, - ) { +impl TransformPartitionCollect { + fn collect_data_block(&mut self) -> Result<()> { + let data_block = self.input.pull_data().unwrap()?; if let Some(meta) = data_block .get_owned_meta() .and_then(WindowPartitionMeta::downcast_from) { for (partition_id, data_block) in meta.partitioned_data.into_iter() { - let partition_id = partition_ids[partition_id]; - buffer.add_data_block(partition_id, data_block); + let new_id = self.partition_id[partition_id]; + self.buffer.add_data_block(new_id, data_block); } } + Ok(()) } fn need_spill(&mut self) -> bool { diff --git a/src/query/storages/fuse/src/io/mod.rs b/src/query/storages/fuse/src/io/mod.rs index 63b43a9ff785f..93c695bd06f5c 100644 --- a/src/query/storages/fuse/src/io/mod.rs +++ b/src/query/storages/fuse/src/io/mod.rs @@ -50,8 +50,8 @@ pub use write::CachedMetaWriter; pub use write::InvertedIndexBuilder; pub use write::InvertedIndexWriter; pub use write::MetaWriter; -pub(crate) use write::StreamBlockBuilder; -pub(crate) use write::StreamBlockProperties; +pub use write::StreamBlockBuilder; +pub use write::StreamBlockProperties; pub use write::VirtualColumnBuilder; pub use write::WriteSettings; pub use write::MAX_BLOCK_UNCOMPRESSED_SIZE; diff --git a/src/query/storages/fuse/src/io/write/block_writer.rs b/src/query/storages/fuse/src/io/write/block_writer.rs index 8b9b269327b5f..1264a757e94f8 100644 --- a/src/query/storages/fuse/src/io/write/block_writer.rs +++ b/src/query/storages/fuse/src/io/write/block_writer.rs @@ -20,6 +20,8 @@ use std::time::Instant; use chrono::Utc; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; +use databend_common_expression::BlockMetaInfo; use databend_common_expression::Column; use databend_common_expression::ColumnId; use databend_common_expression::DataBlock; @@ -124,6 +126,7 @@ pub async fn write_data(data: Vec, data_accessor: &Operator, location: &str) Ok(()) } +#[derive(Debug)] pub struct BlockSerialization { pub block_raw_data: Vec, pub block_meta: BlockMeta, @@ -132,6 +135,11 @@ pub struct BlockSerialization { pub virtual_column_state: Option, } +local_block_meta_serde!(BlockSerialization); + +#[typetag::serde(name = "block_serialization_meta")] +impl BlockMetaInfo for BlockSerialization {} + #[derive(Clone)] pub struct BlockBuilder { pub ctx: Arc, diff --git a/src/query/storages/fuse/src/io/write/bloom_index_writer.rs b/src/query/storages/fuse/src/io/write/bloom_index_writer.rs index ec49070a6f08f..738c33ac2f2c3 100644 --- a/src/query/storages/fuse/src/io/write/bloom_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/bloom_index_writer.rs @@ -40,6 +40,7 @@ use opendal::Operator; use crate::io::BlockReader; use crate::FuseStorageFormat; +#[derive(Debug)] pub struct BloomIndexState { pub(crate) data: Vec, pub(crate) size: u64, diff --git 
a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs index 74377a86108cb..8cf0b5f2355f0 100644 --- a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs @@ -121,6 +121,7 @@ pub fn create_inverted_index_builders(table_meta: &TableMeta) -> Vec, pub(crate) size: u64, diff --git a/src/query/storages/fuse/src/io/write/mod.rs b/src/query/storages/fuse/src/io/write/mod.rs index b0af3633055dc..24bf6fd52c042 100644 --- a/src/query/storages/fuse/src/io/write/mod.rs +++ b/src/query/storages/fuse/src/io/write/mod.rs @@ -35,8 +35,8 @@ pub(crate) use inverted_index_writer::InvertedIndexState; pub use inverted_index_writer::InvertedIndexWriter; pub use meta_writer::CachedMetaWriter; pub use meta_writer::MetaWriter; -pub(crate) use stream::StreamBlockBuilder; -pub(crate) use stream::StreamBlockProperties; +pub use stream::StreamBlockBuilder; +pub use stream::StreamBlockProperties; pub use virtual_column_builder::VirtualColumnBuilder; pub use write_settings::WriteSettings; pub use write_settings::MAX_BLOCK_UNCOMPRESSED_SIZE; diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 69e81f8dec714..49473f1fd7032 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -233,16 +233,11 @@ impl StreamBlockBuilder { } pub fn need_flush(&self) -> bool { - if let Some(max_block_bytes) = self.properties.max_block_bytes { - if self.block_size >= max_block_bytes { - return true; - } - }; let file_size = self.block_writer.compressed_size(); self.row_count >= self.properties.block_thresholds.min_rows_per_block || self.block_size >= self.properties.block_thresholds.max_bytes_per_block || (file_size >= self.properties.block_thresholds.min_compressed_per_block - && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) + && self.block_size >= self.properties.block_thresholds.min_bytes_per_block / 2) } pub fn write(&mut self, block: DataBlock) -> Result<()> { @@ -355,7 +350,6 @@ pub struct StreamBlockProperties { pub(crate) ctx: Arc, pub(crate) write_settings: WriteSettings, pub(crate) block_thresholds: BlockThresholds, - pub(crate) max_block_bytes: Option, meta_locations: TableMetaLocationGenerator, source_schema: TableSchemaRef, @@ -374,7 +368,6 @@ impl StreamBlockProperties { ctx: Arc, table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, - max_block_bytes: Option, ) -> Result> { // remove virtual computed fields. 
let fields = table @@ -437,7 +430,11 @@ impl StreamBlockProperties { ngram_args, inverted_index_builders, table_meta_timestamps, - max_block_bytes, })) } + + pub fn check_large_enough(&self, num_rows: usize, data_size: usize) -> bool { + self.block_thresholds + .check_large_enough(num_rows, data_size) + } } diff --git a/src/query/storages/fuse/src/io/write/stream/mod.rs b/src/query/storages/fuse/src/io/write/stream/mod.rs index 26d32ee679582..0c99368220ed4 100644 --- a/src/query/storages/fuse/src/io/write/stream/mod.rs +++ b/src/query/storages/fuse/src/io/write/stream/mod.rs @@ -16,5 +16,5 @@ mod block_builder; mod cluster_statistics; mod column_statistics; -pub(crate) use block_builder::StreamBlockBuilder; -pub(crate) use block_builder::StreamBlockProperties; +pub use block_builder::StreamBlockBuilder; +pub use block_builder::StreamBlockProperties; diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 57cc6c8af06ea..6c9269bd13620 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -37,6 +37,7 @@ use databend_common_sql::executor::physical_plans::MutationKind; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use databend_storages_common_table_meta::table::ClusterType; +use crate::operations::TransformBlockBuilder; use crate::operations::TransformBlockWriter; use crate::operations::TransformSerializeBlock; use crate::statistics::ClusterStatsGenerator; @@ -53,17 +54,18 @@ impl FuseTable { ctx.get_settings().get_enable_block_stream_write()? && self.storage_format_as_parquet(); if enable_stream_block_write { pipeline.add_transform(|input, output| { - TransformBlockWriter::try_create( + TransformBlockBuilder::try_create( ctx.clone(), input, output, - MutationKind::Insert, self, table_meta_timestamps, - false, - None, ) })?; + + pipeline.add_async_accumulating_transformer(|| { + TransformBlockWriter::create(ctx.clone(), MutationKind::Insert, self, false) + }); } else { let block_thresholds = self.get_block_thresholds(); build_compact_block_pipeline(pipeline, block_thresholds)?; diff --git a/src/query/storages/fuse/src/operations/common/processors/mod.rs b/src/query/storages/fuse/src/operations/common/processors/mod.rs index e0e3d3b25f25a..d43c569c14016 100644 --- a/src/query/storages/fuse/src/operations/common/processors/mod.rs +++ b/src/query/storages/fuse/src/operations/common/processors/mod.rs @@ -22,6 +22,7 @@ mod transform_serialize_segment; pub use multi_table_insert_commit::CommitMultiTableInsert; pub use sink_commit::CommitSink; +pub use transform_block_writer::TransformBlockBuilder; pub use transform_block_writer::TransformBlockWriter; pub use transform_merge_commit_meta::TransformMergeCommitMeta; pub use transform_mutation_aggregator::TableMutationAggregator; diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index 73a85bf4f52c5..5af36b0a1f522 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -22,6 +22,7 @@ use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; use 
databend_common_expression::DataBlock; use databend_common_io::constants::DEFAULT_BLOCK_ROW_COUNT; use databend_common_metrics::storage::metrics_inc_recluster_write_block_nums; @@ -30,6 +31,7 @@ use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_transforms::AsyncAccumulatingTransform; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storage::MutationStatus; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -44,65 +46,51 @@ use crate::operations::MutationLogs; use crate::FuseTable; use crate::FUSE_OPT_KEY_ROW_PER_BLOCK; -#[allow(clippy::large_enum_variant)] enum State { Consume, Collect(DataBlock), Serialize, Finalize, Flush, - Write(BlockSerialization), } -pub struct TransformBlockWriter { +pub struct TransformBlockBuilder { state: State, input: Arc, output: Arc, - kind: MutationKind, properties: Arc, + max_block_rows: usize, builder: Option, need_flush: bool, input_data_size: usize, input_num_rows: usize, - dal: Operator, - // Only used in multi table insert - table_id: Option, - - max_block_rows: usize, input_data: VecDeque, output_data: Option, } -impl TransformBlockWriter { +impl TransformBlockBuilder { pub fn try_create( ctx: Arc, input: Arc, output: Arc, - kind: MutationKind, table: &FuseTable, table_meta_timestamps: TableMetaTimestamps, - with_tid: bool, - max_block_bytes: Option, ) -> Result { let max_block_rows = std::cmp::min( ctx.get_settings().get_max_block_size()? as usize, table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), ); - let properties = - StreamBlockProperties::try_create(ctx, table, table_meta_timestamps, max_block_bytes)?; - Ok(ProcessorPtr::create(Box::new(TransformBlockWriter { + let properties = StreamBlockProperties::try_create(ctx, table, table_meta_timestamps)?; + Ok(ProcessorPtr::create(Box::new(TransformBlockBuilder { state: State::Consume, input, output, - kind, properties, builder: None, - dal: table.get_operator(), need_flush: false, - table_id: if with_tid { Some(table.get_id()) } else { None }, input_data: VecDeque::new(), input_data_size: 0, input_num_rows: 0, @@ -134,9 +122,9 @@ impl TransformBlockWriter { } #[async_trait] -impl Processor for TransformBlockWriter { +impl Processor for TransformBlockBuilder { fn name(&self) -> String { - "TransformBlockWriter".to_string() + "TransformBlockBuilder".to_string() } fn as_any(&mut self) -> &mut dyn Any { @@ -144,12 +132,11 @@ impl Processor for TransformBlockWriter { } fn event(&mut self) -> Result { - match &self.state { - State::Collect(_) | State::Serialize | State::Flush | State::Finalize => { - return Ok(Event::Sync) - } - State::Write(_) => return Ok(Event::Async), - _ => {} + if matches!( + self.state, + State::Collect(_) | State::Serialize | State::Flush | State::Finalize + ) { + return Ok(Event::Sync); } if self.output.is_finished() { @@ -169,7 +156,6 @@ impl Processor for TransformBlockWriter { if self.need_flush && self .properties - .block_thresholds .check_large_enough(self.input_num_rows, self.input_data_size) { self.state = State::Flush; @@ -207,13 +193,9 @@ impl Processor for TransformBlockWriter { block.check_valid()?; self.input_data_size += block.estimate_block_size(); self.input_num_rows += block.num_rows(); - if self.properties.max_block_bytes.is_some() { - self.input_data.push_back(block); - } else 
{ - let max_rows_per_block = self.calc_max_block_rows(&block); - let blocks = block.split_by_rows_no_tail(max_rows_per_block); - self.input_data.extend(blocks); - } + let max_rows_per_block = self.calc_max_block_rows(&block); + let blocks = block.split_by_rows_no_tail(max_rows_per_block); + self.input_data.extend(blocks); } State::Serialize => { while let Some(b) = self.input_data.pop_front() { @@ -240,7 +222,7 @@ impl Processor for TransformBlockWriter { let builder = self.builder.take().unwrap(); if !builder.is_empty() { let serialized = builder.finish()?; - self.state = State::Write(serialized); + self.output_data = Some(DataBlock::empty_with_meta(Box::new(serialized))); } self.need_flush = false; } @@ -248,11 +230,41 @@ impl Processor for TransformBlockWriter { } Ok(()) } +} - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Consume) { - State::Write(serialized) => { +pub struct TransformBlockWriter { + kind: MutationKind, + dal: Operator, + ctx: Arc, + // Only used in multi table insert + table_id: Option, +} + +impl TransformBlockWriter { + pub fn create( + ctx: Arc, + kind: MutationKind, + table: &FuseTable, + with_tid: bool, + ) -> Self { + Self { + ctx, + dal: table.get_operator(), + table_id: if with_tid { Some(table.get_id()) } else { None }, + kind, + } + } +} + +#[async_trait::async_trait] +impl AsyncAccumulatingTransform for TransformBlockWriter { + const NAME: &'static str = "TransformBlockWriter"; + + async fn transform(&mut self, data: DataBlock) -> Result> { + debug_assert!(data.is_empty()); + + if let Some(ptr) = data.get_owned_meta() { + if let Some(serialized) = BlockSerialization::downcast_from(ptr) { let extended_block_meta = BlockWriter::write_down(&self.dal, serialized).await?; let bytes = if let Some(draft_virtual_block_meta) = @@ -264,22 +276,19 @@ impl Processor for TransformBlockWriter { extended_block_meta.block_meta.block_size as usize }; - self.properties - .ctx - .get_write_progress() - .incr(&ProgressValues { - rows: extended_block_meta.block_meta.row_count as usize, - bytes, - }); + self.ctx.get_write_progress().incr(&ProgressValues { + rows: extended_block_meta.block_meta.row_count as usize, + bytes, + }); // appending new data block if let Some(tid) = self.table_id { - self.properties.ctx.update_multi_table_insert_status( + self.ctx.update_multi_table_insert_status( tid, extended_block_meta.block_meta.row_count, ); } else { - self.properties.ctx.add_mutation_status(MutationStatus { + self.ctx.add_mutation_status(MutationStatus { insert_rows: extended_block_meta.block_meta.row_count, update_rows: 0, deleted_rows: 0, @@ -299,10 +308,13 @@ impl Processor for TransformBlockWriter { }], })) }; - self.output_data = Some(output); + + return Ok(Some(output)); } - _ => return Err(ErrorCode::Internal("It's a bug.")), } - Ok(()) + + Err(ErrorCode::Internal( + "Cannot downcast meta to BlockSerialization", + )) } } From 62f2093350108405de3c9f748fb9477e2a235e95 Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 20 May 2025 18:42:01 +0800 Subject: [PATCH 15/36] fix test --- src/query/expression/tests/it/block_thresholds.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/query/expression/tests/it/block_thresholds.rs b/src/query/expression/tests/it/block_thresholds.rs index e7ad1304ae3cc..33b1d1b0ff394 100644 --- a/src/query/expression/tests/it/block_thresholds.rs +++ b/src/query/expression/tests/it/block_thresholds.rs @@ -105,8 +105,8 @@ fn 
test_calc_rows_for_recluster() { assert_eq!(result, 300); // Case 2: If the block size is too smaller. - let result = t.calc_rows_for_recluster(4_000, 4_000_000, 600_000); - assert_eq!(result, 1000); + let result = t.calc_rows_for_recluster(4_000, 2_000_000, 600_000); + assert_eq!(result, 800); // Case 3: use the compressed-based block count. let result = t.calc_rows_for_recluster(4_000, 10_000_000, 600_000); @@ -131,8 +131,8 @@ fn test_calc_partitions_for_recluster() { assert_eq!(result, 15); // Case 2: If the block size is too smaller. - let result = t.calc_partitions_for_recluster(4_000, 4_000_000, 600_000); - assert_eq!(result, 4); + let result = t.calc_partitions_for_recluster(4_000, 800_000, 800_000); + assert_eq!(result, 2); // Case 3: use the compressed-based block count. let result = t.calc_partitions_for_recluster(4_000, 10_000_000, 600_000); From e2b02f73c4b320da7054c700cce6f0e74d408116 Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 21 May 2025 02:25:18 +0800 Subject: [PATCH 16/36] add noise for hilbert recluster --- src/query/functions/src/scalars/hilbert.rs | 95 ++++++++++++++++++- .../interpreter_table_recluster.rs | 2 +- src/query/sql/src/planner/binder/ddl/table.rs | 11 ++- 3 files changed, 104 insertions(+), 4 deletions(-) diff --git a/src/query/functions/src/scalars/hilbert.rs b/src/query/functions/src/scalars/hilbert.rs index 060fe5ab97abe..2ba386450b0e0 100644 --- a/src/query/functions/src/scalars/hilbert.rs +++ b/src/query/functions/src/scalars/hilbert.rs @@ -21,24 +21,33 @@ use databend_common_expression::types::BinaryType; use databend_common_expression::types::DataType; use databend_common_expression::types::GenericType; use databend_common_expression::types::NullableType; +use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberType; use databend_common_expression::types::ReturnType; +use databend_common_expression::types::StringType; use databend_common_expression::types::ValueType; +use databend_common_expression::types::ALL_NUMERICS_TYPES; +use databend_common_expression::vectorize_with_builder_1_arg; use databend_common_expression::vectorize_with_builder_2_arg; +use databend_common_expression::with_number_mapped_type; use databend_common_expression::Column; use databend_common_expression::FixedLengthEncoding; use databend_common_expression::Function; use databend_common_expression::FunctionDomain; use databend_common_expression::FunctionEval; use databend_common_expression::FunctionFactory; +use databend_common_expression::FunctionProperty; use databend_common_expression::FunctionRegistry; use databend_common_expression::FunctionSignature; use databend_common_expression::ScalarRef; use databend_common_expression::Value; +use rand::rngs::SmallRng; +use rand::Rng; +use rand::SeedableRng; /// Registers Hilbert curve related functions with the function registry. 
pub fn register(registry: &mut FunctionRegistry) { - // Register the hilbert_range_index function that calculates Hilbert indices for multi-dimensional data + // Register the hilbert_range_index function that calculates Hilbert indices for multidimensional data let factory = FunctionFactory::Closure(Box::new(|_, args_type: &[DataType]| { let args_num = args_type.len(); // The function supports 2, 3, 4, or 5 dimensions (each dimension requires 2 arguments) @@ -97,7 +106,7 @@ pub fn register(registry: &mut FunctionRegistry) { points.push(key); } - // Convert the multi-dimensional point to a Hilbert index + // Convert the multidimensional point to a Hilbert index // This maps the n-dimensional point to a 1-dimensional value let points = points .iter() @@ -153,6 +162,88 @@ pub fn register(registry: &mut FunctionRegistry) { builder.push(id); }), ); + + // We use true randomness by appending a random u8 value at the end of the binary key. + // This introduces noise to break tie cases in clustering keys that are not uniformly distributed. + // Although this may slightly affect the accuracy of range_bound estimation, + // it ensures that Hilbert index + scatter will no longer suffer from data skew. + // Moreover, since the noise is added at the tail, the original order of the keys is preserved. + registry.properties.insert( + "add_noise".to_string(), + FunctionProperty::default().non_deterministic(), + ); + + registry.register_passthrough_nullable_1_arg::( + "add_noise", + |_, _| FunctionDomain::Full, + vectorize_with_builder_1_arg::(|val, builder, _| { + let mut bytes = val.as_bytes().to_vec(); + let mut rng = SmallRng::from_entropy(); + bytes.push(rng.gen::()); + builder.put_slice(&bytes); + builder.commit_row(); + }), + ); + + for ty in ALL_NUMERICS_TYPES { + with_number_mapped_type!(|NUM_TYPE| match ty { + NumberDataType::NUM_TYPE => { + registry + .register_passthrough_nullable_1_arg::, BinaryType, _, _>( + "add_noise", + |_, _| FunctionDomain::Full, + vectorize_with_builder_1_arg::, BinaryType>( + |val, builder, _| { + let mut encoded = val.encode().to_vec(); + let mut rng = SmallRng::from_entropy(); + encoded.push(rng.gen::()); + builder.put_slice(&encoded); + builder.commit_row(); + }, + ), + ); + } + }) + } + + registry.register_passthrough_nullable_2_arg::, BinaryType, _, _>( + "add_noise", + |_, _, _| FunctionDomain::Full, + vectorize_with_builder_2_arg::, BinaryType>( + |val, level, builder, _| { + let mut bytes = val.as_bytes().to_vec(); + let mut rng = SmallRng::from_entropy(); + for _ in 0..level { + bytes.push(rng.gen::()); + } + builder.put_slice(&bytes); + builder.commit_row(); + }, + ), + ); + + for ty in ALL_NUMERICS_TYPES { + with_number_mapped_type!(|NUM_TYPE| match ty { + NumberDataType::NUM_TYPE => { + registry + .register_passthrough_nullable_2_arg::, NumberType, BinaryType, _, _>( + "add_noise", + |_, _, _| FunctionDomain::Full, + vectorize_with_builder_2_arg::, NumberType, BinaryType>( + |val, level, builder, _| { + let mut encoded = val.encode().to_vec(); + let mut rng = SmallRng::from_entropy(); + for _ in 0..level { + encoded.push(rng.gen::()); + } + builder.put_slice(&encoded); + builder.commit_row(); + }, + ), + ); + } + }) + } } /// Calculates the partition ID for a value based on range boundaries. 
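Note: the `add_noise` functions registered above append random bytes to the tail of an order-preserving key encoding so that ties in skewed clustering keys can still be split by range bounds. The following is a minimal standalone sketch of that idea, not the Databend implementation itself; it only assumes the `rand` crate and uses a fixed-length big-endian encoding as the key:

use rand::rngs::SmallRng;
use rand::Rng;
use rand::SeedableRng;

// Append `level` random bytes to the tail of an already order-preserving key.
fn add_noise(key: &[u8], level: usize) -> Vec<u8> {
    let mut rng = SmallRng::from_entropy();
    let mut noisy = key.to_vec();
    for _ in 0..level {
        noisy.push(rng.gen::<u8>());
    }
    noisy
}

fn main() {
    // Big-endian encoding keeps numeric order under byte-wise comparison.
    let low = add_noise(&10u32.to_be_bytes(), 1);
    let high = add_noise(&20u32.to_be_bytes(), 1);
    assert!(low < high, "distinct keys keep their relative order");

    // Equal keys almost always become distinct after the noise byte, so range
    // bounds can split a tie-dominated clustering key across partitions.
    let t1 = add_noise(&7u32.to_be_bytes(), 1);
    let t2 = add_noise(&7u32.to_be_bytes(), 1);
    println!("{t1:?} vs {t2:?}");
}

Because every key has the same length before the noise is appended, distinct keys keep their relative order, while equal keys almost always become distinct and can therefore be spread across range partitions, which is the skew-breaking behaviour the comment in scalars/hilbert.rs describes.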
diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 411452a48d5a0..237a2252ea067 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -652,7 +652,7 @@ impl ReclusterTableInterpreter { "range_bound(1000, {sample_size})({cluster_key_str})" )); - hilbert_keys.push(format!("{table}.{cluster_key_str}, []")); + hilbert_keys.push(format!("{cluster_key_str}, []")); } let hilbert_keys_str = hilbert_keys.join(", "); diff --git a/src/query/sql/src/planner/binder/ddl/table.rs b/src/query/sql/src/planner/binder/ddl/table.rs index 8f95e1d17958f..7d3263a8cdceb 100644 --- a/src/query/sql/src/planner/binder/ddl/table.rs +++ b/src/query/sql/src/planner/binder/ddl/table.rs @@ -149,6 +149,7 @@ use crate::plans::VacuumTemporaryFilesPlan; use crate::BindContext; use crate::DefaultExprBinder; use crate::Planner; +use crate::ScalarExpr; use crate::SelectBuilder; pub(in crate::planner::binder) struct AnalyzeCreateTableResult { @@ -1767,7 +1768,7 @@ impl Binder { let mut cluster_keys = Vec::with_capacity(expr_len); for cluster_expr in cluster_exprs.iter() { - let (cluster_key, _) = scalar_binder.bind(cluster_expr)?; + let (mut cluster_key, _) = scalar_binder.bind(cluster_expr)?; if cluster_key.used_columns().len() != 1 || !cluster_key.evaluable() { return Err(ErrorCode::InvalidClusterKeys(format!( "Cluster by expression `{:#}` is invalid", @@ -1775,6 +1776,14 @@ impl Binder { ))); } + if let ScalarExpr::FunctionCall(func) = &cluster_key { + if func.func_name == "add_noise" && matches!(cluster_type, AstClusterType::Hilbert) + { + debug_assert!(func.arguments.len() == 1); + cluster_key = func.arguments[0].clone(); + } + } + let expr = cluster_key.as_expr()?; if !expr.is_deterministic(&BUILTIN_FUNCTIONS) { return Err(ErrorCode::InvalidClusterKeys(format!( From 81e8dba0254c92d34251f23b3a73a7913ee76d5f Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 5 Jun 2025 03:39:18 +0800 Subject: [PATCH 17/36] update --- src/common/base/src/base/watch_notify.rs | 4 + .../src/aggregates/aggregate_range_bound.rs | 4 +- .../transforms/recluster/builder.rs | 198 ++++++++++++++++++ .../processors/transforms/recluster/mod.rs | 11 + .../recluster/range_bound_sampler.rs | 86 ++++++++ .../recluster/recluster_partition_exchange.rs | 3 +- .../recluster/recluster_sample_state.rs | 140 +++++++++++++ .../recluster/transform_add_order_column.rs | 72 +++++++ .../transform_range_partition_indexer.rs | 164 +++++++++++++++ .../recluster/transform_recluster_collect.rs | 82 ++++++++ 10 files changed, 759 insertions(+), 5 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/builder.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs diff --git a/src/common/base/src/base/watch_notify.rs b/src/common/base/src/base/watch_notify.rs index be05dfc9028c0..c7a677474b63f 100644 --- 
a/src/common/base/src/base/watch_notify.rs +++ b/src/common/base/src/base/watch_notify.rs @@ -44,6 +44,10 @@ impl WatchNotify { pub fn notify_waiters(&self) { let _ = self.tx.send_replace(true); } + + pub fn is_notified(&self) -> bool { + *self.rx.borrow() + } } #[cfg(test)] diff --git a/src/query/functions/src/aggregates/aggregate_range_bound.rs b/src/query/functions/src/aggregates/aggregate_range_bound.rs index 9776caac786c8..2572429300182 100644 --- a/src/query/functions/src/aggregates/aggregate_range_bound.rs +++ b/src/query/functions/src/aggregates/aggregate_range_bound.rs @@ -326,9 +326,7 @@ pub fn try_create_aggregate_range_bound_function( /// For a column with values `(0, 1, 3, 6, 8)` and `partition_num = 3`, the function calculates the /// partition boundaries based on the distribution of the data. The boundaries might be `[1, 6]`. pub fn aggregate_range_bound_function_desc() -> AggregateFunctionDescription { - AggregateFunctionDescription::creator(Box::new( - crate::aggregates::try_create_aggregate_range_bound_function, - )) + AggregateFunctionDescription::creator(Box::new(try_create_aggregate_range_bound_function)) } fn get_partitions( diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs b/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs new file mode 100644 index 0000000000000..0a58f27556f91 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs @@ -0,0 +1,198 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::row::RowConverter as CommonConverter; +use databend_common_expression::types::AccessType; +use databend_common_expression::types::ArgType; +use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; +use databend_common_expression::with_number_mapped_type; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_transforms::sort::CommonRows; +use databend_common_pipeline_transforms::sort::RowConverter; +use databend_common_pipeline_transforms::sort::Rows; +use databend_common_pipeline_transforms::sort::SimpleRowConverter; +use databend_common_pipeline_transforms::sort::SimpleRowsAsc; +use databend_common_pipeline_transforms::AccumulatingTransformer; +use databend_common_pipeline_transforms::Transformer; +use match_template::match_template; + +use crate::pipelines::processors::transforms::recluster::transform_add_order_column::TransformAddOrderColumn; +use crate::pipelines::processors::transforms::recluster::TransformRangePartitionIndexer; +use crate::pipelines::processors::transforms::SampleState; +use crate::pipelines::processors::transforms::TransformReclusterCollect; + +pub struct TransformReclusterBuilder { + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + sample_rate: f64, + seed: u64, +} + +impl TransformReclusterBuilder { + pub fn build_recluster_sample( + &self, + input: Arc, + output: Arc, + ) -> Result> { + self.build_inner(BuilderType::ReclusterSample, input, output, None) + } + + pub fn build_range_partition_indexer( + &self, + input: Arc, + output: Arc, + state: Arc, + ) -> Result> { + self.build_inner( + BuilderType::RangePartitionIndexer, + input, + output, + Some(state), + ) + } + + pub fn build_add_order_column( + &self, + input: Arc, + output: Arc, + ) -> Result> { + self.build_inner(BuilderType::AddOrderColumn, input, output, None) + } + + fn build_inner( + &self, + typ: BuilderType, + input: Arc, + output: Arc, + state: Option>, + ) -> Result> { + let mut build = BuilderInner { + input, + output, + typ, + base: self, + state, + }; + build.select_row_type() + } +} + +enum BuilderType { + AddOrderColumn, + ReclusterSample, + RangePartitionIndexer, +} + +struct BuilderInner<'a> { + input: Arc, + output: Arc, + typ: BuilderType, + base: &'a TransformReclusterBuilder, + state: Option>, +} + +impl BuilderInner<'_> { + pub fn select_row_type(&mut self) -> Result> { + match self.base.sort_desc.as_ref() { + [desc] => { + let schema = self.base.schema.clone(); + let sort_type = schema.field(desc.offset).data_type(); + assert!(desc.asc); + + match_template! 
{ + T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], + match sort_type { + DataType::T => { + self.visit_type::, SimpleRowConverter>() + }, + DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { + NumberDataType::NUM_TYPE => { + self.visit_type::>, SimpleRowConverter>>() + } + }), + _ => self.visit_type::() + } + } + } + _ => self.visit_type::(), + } + } + + fn visit_type(&mut self) -> Result> + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + R::Type: ArgType + Send + Sync, + ::Scalar: Ord + Send + Sync, + { + match self.typ { + BuilderType::AddOrderColumn => self.build_add_order_column::(), + BuilderType::ReclusterSample => self.build_recluster_sample::(), + BuilderType::RangePartitionIndexer => self.build_range_partition_indexer::(), + } + } + + fn build_add_order_column(&mut self) -> Result> + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + { + let inner = TransformAddOrderColumn::::try_new( + self.base.sort_desc.clone(), + self.base.schema.clone(), + )?; + Ok(Transformer::create( + self.input.clone(), + self.output.clone(), + inner, + )) + } + + fn build_range_partition_indexer(&mut self) -> Result> + where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send + Sync, + { + Ok(TransformRangePartitionIndexer::::create( + self.input.clone(), + self.output.clone(), + self.state.clone().unwrap(), + )) + } + + fn build_recluster_sample(&mut self) -> Result> + where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send + Sync, + { + let offset = self.base.schema.fields().len(); + Ok(AccumulatingTransformer::create( + self.input.clone(), + self.output.clone(), + TransformReclusterCollect::::new(offset, self.base.sample_rate, self.base.seed), + )) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index a3c680958f00b..fd1db0c11426c 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -12,9 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod builder; +mod range_bound_sampler; mod recluster_partition_exchange; mod recluster_partition_strategy; +mod recluster_sample_state; +mod transform_add_order_column; +mod transform_range_partition_indexer; +mod transform_recluster_collect; +pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; pub use recluster_partition_strategy::CompactPartitionStrategy; pub use recluster_partition_strategy::ReclusterPartitionStrategy; +pub(crate) use recluster_sample_state::SampleState; +pub use transform_range_partition_indexer::TransformRangePartitionIndexer; +pub(crate) use transform_recluster_collect::ReclusterSampleMeta; +pub use transform_recluster_collect::TransformReclusterCollect; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs new file mode 100644 index 0000000000000..0dfee36475b36 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs @@ -0,0 +1,86 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::marker::PhantomData; + +use databend_common_expression::types::ArgType; +use databend_common_expression::types::ValueType; +use databend_common_expression::DataBlock; +use databend_common_expression::Scalar; +use rand::prelude::SliceRandom; +use rand::prelude::SmallRng; +use rand::SeedableRng; + +pub struct RangeBoundSampler +where T: ValueType +{ + offset: usize, + sample_rate: f64, + rng: SmallRng, + + values: Vec<(u64, Vec)>, + _t: PhantomData, +} + +impl RangeBoundSampler +where T: ValueType +{ + pub fn new(offset: usize, sample_rate: f64, seed: u64) -> Self { + let rng = SmallRng::seed_from_u64(seed); + Self { + offset, + sample_rate, + rng, + values: vec![], + _t: PhantomData, + } + } +} + +impl RangeBoundSampler +where + T: ArgType, + T::Scalar: Ord + Send, +{ + pub fn add_block(&mut self, data: &DataBlock) { + let rows = data.num_rows(); + assert!(rows > 0); + let column = data.get_by_offset(self.offset).to_column(rows); + + let sample_size = std::cmp::max((self.sample_rate * rows as f64).ceil() as usize, 100); + let mut indices = (0..rows).collect::>(); + + let sampled_indices = if rows > sample_size { + indices.shuffle(&mut self.rng); + &indices[..sample_size] + } else { + &indices + }; + + let column = T::try_downcast_column(&column).unwrap(); + let sample_values = sampled_indices + .iter() + .map(|i| { + T::upcast_scalar(T::to_owned_scalar(unsafe { + T::index_column_unchecked(&column, *i) + })) + }) + .collect::>(); + self.values.push((rows as u64, sample_values)); + } + + pub fn sample_values(&mut self) -> Vec<(u64, Vec)> { + std::mem::take(&mut self.values) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index 221d4328ef67a..dd5257850ac9f 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -33,8 +33,7 @@ impl ReclusterPartitionExchange { impl Exchange for ReclusterPartitionExchange { const NAME: &'static str = "Recluster"; - fn partition(&self, data_block: DataBlock, n: usize) -> Result> { - let mut data_block = data_block; + fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { let range_ids = data_block .get_last_column() .as_number() diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs new file mode 100644 index 0000000000000..77c0fa0a9483f --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs @@ -0,0 +1,140 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::sync::RwLock; + +use databend_common_base::base::WatchNotify; +use databend_common_exception::Result; +use databend_common_expression::compare_columns; +use databend_common_expression::types::ArgType; +use databend_common_expression::Scalar; + +pub struct SampleState { + pub inner: RwLock, + pub done: Arc, +} + +impl SampleState { + pub fn new(total_inputs: usize, partitions: usize) -> Arc { + Arc::new(SampleState { + inner: RwLock::new(SampleStateInner { + partitions, + total_inputs, + completed_inputs: 0, + values: vec![], + bounds: vec![], + }), + done: Arc::new(WatchNotify::new()), + }) + } + + pub fn merge_sample(&self, values: Vec<(u64, Vec)>) -> Result<()> + where + T: ArgType, + T::Scalar: Ord, + { + let mut inner = self.inner.write().unwrap(); + inner.completed_inputs += 1; + inner.values.extend_from_slice(&values); + + if inner.completed_inputs >= inner.total_inputs { + inner.determine_bounds::()?; + self.done.notify_waiters(); + } + Ok(()) + } + + pub fn get_bounds(&self) -> Vec + where + T: ArgType, + T::Scalar: Ord, + { + let inner = self.inner.read().unwrap(); + inner + .bounds + .iter() + .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())) + .collect() + } +} + +pub struct SampleStateInner { + partitions: usize, + total_inputs: usize, + + completed_inputs: usize, + bounds: Vec, + + values: Vec<(u64, Vec)>, +} + +impl SampleStateInner { + fn determine_bounds(&mut self) -> Result<()> + where + T: ArgType, + T::Scalar: Ord, + { + if self.partitions < 2 { + return Ok(()); + } + + let (total_samples, total_rows) = self + .values + .iter() + .fold((0, 0), |(acc_samples, acc_rows), (rows, vals)| { + (acc_samples + vals.len(), acc_rows + *rows) + }); + let step = total_rows as f64 / self.partitions as f64; + let values = std::mem::take(&mut self.values); + let mut data = Vec::with_capacity(total_samples); + let mut weights = Vec::with_capacity(total_samples); + + for (num, values) in values.into_iter() { + let weight = num as f64 / values.len() as f64; + values.into_iter().for_each(|v| { + let val = T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap()); + data.push(val); + weights.push(weight); + }); + } + let col = T::upcast_column(T::column_from_vec(data.clone(), &[])); + let indices = compare_columns(vec![col], total_samples)?; + + let mut cum_weight = 0.0; + let mut target = step; + let mut bounds = Vec::with_capacity(self.partitions - 1); + let mut previous_bound = None; + + let mut i = 0; + let mut j = 0; + while i < total_samples && j < self.partitions - 1 { + let idx = indices[i] as usize; + let weight = weights[idx]; + cum_weight += weight; + if cum_weight >= target { + let data = &data[idx]; + if previous_bound.as_ref().is_none_or(|prev| data > prev) { + bounds.push(T::upcast_scalar(data.clone())); + target += step; + j += 1; + previous_bound = Some(data.clone()); + } + } + i += 1; + } + self.bounds = bounds; + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs 
b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs new file mode 100644 index 0000000000000..7b40593e887c3 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs @@ -0,0 +1,72 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::marker::PhantomData; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::DataBlock; +use databend_common_expression::DataSchemaRef; +use databend_common_expression::SortColumnDescription; +use databend_common_expression::Value; +use databend_common_pipeline_transforms::sort::RowConverter; +use databend_common_pipeline_transforms::sort::Rows; +use databend_common_pipeline_transforms::Transform; + +pub struct TransformAddOrderColumn { + row_converter: C, + sort_desc: Arc<[SortColumnDescription]>, + _r: PhantomData, +} + +impl TransformAddOrderColumn +where + R: Rows, + C: RowConverter, +{ + pub fn try_new(sort_desc: Arc<[SortColumnDescription]>, schema: DataSchemaRef) -> Result { + let row_converter = C::create(&sort_desc, schema.clone())?; + Ok(Self { + row_converter, + sort_desc, + _r: PhantomData, + }) + } +} + +impl Transform for TransformAddOrderColumn +where + R: Rows + 'static, + C: RowConverter + Send + 'static, +{ + const NAME: &'static str = "TransformAddOrderColumn"; + + fn transform(&mut self, mut data: DataBlock) -> Result { + let order_by_cols = self + .sort_desc + .iter() + .map(|desc| data.get_by_offset(desc.offset).clone()) + .collect::>(); + let rows = self + .row_converter + .convert(&order_by_cols, data.num_rows())?; + let order_col = rows.to_column(); + data.add_column(BlockEntry { + data_type: order_col.data_type(), + value: Value::Column(order_col), + }); + Ok(data) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs new file mode 100644 index 0000000000000..6d10600366eda --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -0,0 +1,164 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::types::ArgType; +use databend_common_expression::types::DataType; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::UInt64Type; +use databend_common_expression::BlockEntry; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_expression::FromData; +use databend_common_expression::Value; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; + +use crate::pipelines::processors::transforms::ReclusterSampleMeta; +use crate::pipelines::processors::transforms::SampleState; + +pub struct TransformRangePartitionIndexer +where T: ArgType +{ + input: Arc, + output: Arc, + + state: Arc, + input_data: Vec, + output_data: VecDeque, + bounds: Vec, +} + +impl TransformRangePartitionIndexer +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send + Sync, +{ + pub fn create( + input: Arc, + output: Arc, + state: Arc, + ) -> Box { + Box::new(Self { + input, + output, + state, + input_data: vec![], + output_data: VecDeque::new(), + bounds: vec![], + }) + } +} + +#[async_trait::async_trait] +impl Processor for TransformRangePartitionIndexer +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send + Sync, +{ + fn name(&self) -> String { + "TransformRangePartitionIndexer".to_owned() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + self.input.set_not_need_data(); + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if !self.input_data.is_empty() { + return Ok(Event::Sync); + } + + if self.input.is_finished() { + assert!(self.state.done.is_notified()); + self.output.finish(); + return Ok(Event::Finished); + } + + if !self.input.has_data() { + self.input.set_need_data(); + return Ok(Event::NeedData); + } + + let mut input_data = self.input.pull_data().unwrap()?; + let meta = input_data + .take_meta() + .and_then(ReclusterSampleMeta::downcast_from) + .expect("require a ReclusterSampleMeta"); + self.input_data = meta.blocks; + self.state.merge_sample::(meta.sample_values)?; + Ok(Event::Async) + } + + fn process(&mut self) -> Result<()> { + if let Some(mut block) = self.input_data.pop() { + let num_rows = block.num_rows(); + let last = block.get_last_column().clone(); + block.pop_columns(1); + let mut builder = Vec::with_capacity(num_rows); + let last_col = T::try_downcast_column(&last.remove_nullable()).unwrap(); + for index in 0..num_rows { + let val = + T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); + let mut low = 0; + let mut high = self.bounds.len(); + while low < high { + let mid = low + ((high - low) / 2); + let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); + if val > bound { + low = mid + 1; + } else { + high = mid; + } + } + builder.push(low as u64); + } + + block.add_column(BlockEntry::new( + DataType::Number(NumberDataType::UInt64), + Value::Column(UInt64Type::from_data(builder)), + )); + self.output_data.push_back(block); + } + 
Ok(()) + } + + #[async_backtrace::framed] + async fn async_process(&mut self) -> Result<()> { + self.state.done.notified().await; + self.bounds = self.state.get_bounds::(); + Ok(()) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs new file mode 100644 index 0000000000000..3e9fe42a8dee4 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs @@ -0,0 +1,82 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; +use databend_common_expression::types::ArgType; +use databend_common_expression::types::ValueType; +use databend_common_expression::BlockMetaInfo; +use databend_common_expression::DataBlock; +use databend_common_expression::Scalar; +use databend_common_pipeline_transforms::AccumulatingTransform; + +use crate::pipelines::processors::transforms::RangeBoundSampler; + +pub struct TransformReclusterCollect +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send, +{ + input_data: Vec, + sampler: RangeBoundSampler, +} + +impl TransformReclusterCollect +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send, +{ + pub fn new(offset: usize, sample_rate: f64, seed: u64) -> Self { + Self { + input_data: vec![], + sampler: RangeBoundSampler::::new(offset, sample_rate, seed), + } + } +} + +impl AccumulatingTransform for TransformReclusterCollect +where + T: ArgType + Send + Sync, + T::Scalar: Ord + Send, +{ + const NAME: &'static str = "TransformReclusterCollect"; + + fn transform(&mut self, data: DataBlock) -> Result> { + self.sampler.add_block(&data); + self.input_data.push(data); + Ok(vec![]) + } + + fn on_finish(&mut self, _output: bool) -> Result> { + let sample_values = self.sampler.sample_values(); + let blocks = std::mem::take(&mut self.input_data); + let meta = ReclusterSampleMeta { + blocks, + sample_values, + }; + + Ok(vec![DataBlock::empty_with_meta(Box::new(meta))]) + } +} + +#[derive(Debug)] +pub struct ReclusterSampleMeta { + pub blocks: Vec, + pub sample_values: Vec<(u64, Vec)>, +} + +local_block_meta_serde!(ReclusterSampleMeta); + +#[typetag::serde(name = "recluster_sample")] +impl BlockMetaInfo for ReclusterSampleMeta {} From e59fe4daeda21e06dcc86f7fdbc12bdb4df5106a Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 6 Jun 2025 19:08:55 +0800 Subject: [PATCH 18/36] linear recluster support block stream writer --- src/query/functions/src/scalars/hilbert.rs | 2 +- src/query/service/src/local/display.rs | 2 +- .../builders/builder_hilbert_partition.rs | 2 + .../pipelines/builders/builder_recluster.rs | 333 ++++++++++++++---- .../transforms/recluster/builder.rs | 198 ----------- .../processors/transforms/recluster/mod.rs | 6 +- .../transform_range_partition_indexer.rs | 2 +- .../recluster/transform_recluster_collect.rs | 1 - 
src/query/settings/src/settings_default.rs | 2 +- .../fuse/src/io/write/stream/block_builder.rs | 28 +- .../src/io/write/stream/cluster_statistics.rs | 47 ++- .../storages/fuse/src/operations/append.rs | 35 +- .../processors/transform_block_writer.rs | 4 +- 13 files changed, 362 insertions(+), 300 deletions(-) delete mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/builder.rs diff --git a/src/query/functions/src/scalars/hilbert.rs b/src/query/functions/src/scalars/hilbert.rs index 2ba386450b0e0..d570b83740c1e 100644 --- a/src/query/functions/src/scalars/hilbert.rs +++ b/src/query/functions/src/scalars/hilbert.rs @@ -267,7 +267,7 @@ fn calc_range_partition_id(val: ScalarRef, arr: &Column) -> u64 { while low < high { let mid = low + ((high - low) / 2); let bound = unsafe { arr.index_unchecked(mid) }; - if val > bound { + if val >= bound { low = mid + 1; } else { high = mid; diff --git a/src/query/service/src/local/display.rs b/src/query/service/src/local/display.rs index 32546082de770..b782accabdd00 100644 --- a/src/query/service/src/local/display.rs +++ b/src/query/service/src/local/display.rs @@ -259,7 +259,7 @@ impl FormatDisplay<'_> { rows_str, self.start.elapsed().as_secs_f64(), humanize_count(stats.total_rows as f64), - HumanBytes(stats.total_rows as u64), + HumanBytes(stats.total_bytes as u64), humanize_count(stats.total_rows as f64 / self.start.elapsed().as_secs_f64()), HumanBytes((stats.total_bytes as f64 / self.start.elapsed().as_secs_f64()) as u64), ); diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index 9ae7941e01e9e..aebafaa53566d 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -71,6 +71,8 @@ impl PipelineBuilder { let properties = StreamBlockProperties::try_create( self.ctx.clone(), table, + MutationKind::Recluster, + None, partition.table_meta_timestamps, )?; diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index 05d2d63dd55d7..a5f39011080d8 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -12,29 +12,62 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::atomic; +use std::sync::atomic::AtomicUsize; +use std::sync::Arc; + use databend_common_catalog::plan::DataSourceInfo; use databend_common_catalog::plan::DataSourcePlan; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::row::RowConverter as CommonConverter; +use databend_common_expression::types::AccessType; +use databend_common_expression::types::ArgType; +use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; +use databend_common_expression::with_number_mapped_type; +use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_expression::SortColumnDescription; use databend_common_metrics::storage::metrics_inc_recluster_block_bytes_to_read; use databend_common_metrics::storage::metrics_inc_recluster_block_nums_to_read; use databend_common_metrics::storage::metrics_inc_recluster_row_nums_to_read; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_sources::EmptySource; use databend_common_pipeline_transforms::processors::build_compact_block_no_split_pipeline; use databend_common_pipeline_transforms::processors::TransformPipelineHelper; +use databend_common_pipeline_transforms::sort::CommonRows; +use databend_common_pipeline_transforms::sort::RowConverter; +use databend_common_pipeline_transforms::sort::Rows; +use databend_common_pipeline_transforms::sort::SimpleRowConverter; +use databend_common_pipeline_transforms::sort::SimpleRowsAsc; +use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::evaluator::CompoundBlockOperator; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::StreamContext; use databend_common_storages_factory::Table; +use databend_common_storages_fuse::io::StreamBlockProperties; +use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::TableContext; +use match_template::match_template; use crate::pipelines::builders::SortPipelineBuilder; +use crate::pipelines::processors::transforms::ReclusterPartitionExchange; +use crate::pipelines::processors::transforms::ReclusterPartitionStrategy; +use crate::pipelines::processors::transforms::SampleState; +use crate::pipelines::processors::transforms::TransformAddOrderColumn; use crate::pipelines::processors::transforms::TransformAddStreamColumns; +use crate::pipelines::processors::transforms::TransformPartitionCollect; +use crate::pipelines::processors::transforms::TransformRangePartitionIndexer; +use crate::pipelines::processors::transforms::TransformReclusterCollect; use crate::pipelines::PipelineBuilder; impl PipelineBuilder { @@ -71,7 +104,7 @@ impl PipelineBuilder { let recluster_block_nums = task.parts.len(); let block_thresholds = table.get_block_thresholds(); let table_info = table.get_table_info(); - let schema = table.schema_with_stream(); + let schema = Arc::new(table.schema_with_stream().remove_virtual_computed_fields()); let description = 
task.stats.get_description(&table_info.desc); let plan = DataSourcePlan { source_info: DataSourceInfo::TableSource(table_info.clone()), @@ -117,72 +150,158 @@ impl PipelineBuilder { .add_transformer(|| TransformAddStreamColumns::new(stream_ctx.clone())); } - let cluster_stats_gen = table.get_cluster_stats_gen( - self.ctx.clone(), - task.level + 1, - block_thresholds, - None, - )?; - let operators = cluster_stats_gen.operators.clone(); - if !operators.is_empty() { - let func_ctx2 = cluster_stats_gen.func_ctx.clone(); - self.main_pipeline.add_transformer(move || { - CompoundBlockOperator::new( - operators.clone(), - func_ctx2.clone(), - num_input_columns, - ) - }); - } + let level = task.level + 1; + let enable_stream_writer = + self.ctx.get_settings().get_enable_block_stream_write()? + && table.storage_format_as_parquet(); + if enable_stream_writer { + let properties = StreamBlockProperties::try_create( + self.ctx.clone(), + table, + MutationKind::Recluster, + Some(level), + recluster.table_meta_timestamps, + )?; + let operators = properties.cluster_operators(); + if !operators.is_empty() { + let func_ctx = self.ctx.get_function_context()?; + self.main_pipeline.add_transformer(move || { + CompoundBlockOperator::new( + operators.clone(), + func_ctx.clone(), + num_input_columns, + ) + }); + } + + let fields_with_cluster_key = properties.fields_with_cluster_key(); + let schema = DataSchemaRefExt::create(fields_with_cluster_key); + let sort_descs: Vec<_> = properties + .cluster_key_index() + .iter() + .map(|&offset| SortColumnDescription { + offset, + asc: true, + nulls_first: false, + }) + .collect(); - // construct output fields - let output_fields = cluster_stats_gen.out_fields.clone(); - let schema = DataSchemaRefExt::create(output_fields); - let sort_descs: Vec<_> = cluster_stats_gen - .cluster_key_index - .iter() - .map(|offset| SortColumnDescription { - offset: *offset, - asc: true, - nulls_first: false, - }) - .collect(); - - // merge sort - let sort_block_size = block_thresholds.calc_rows_for_recluster( - task.total_rows, - task.total_bytes, - task.total_compressed, - ); - - let sort_pipeline_builder = - SortPipelineBuilder::create(self.ctx.clone(), schema, sort_descs.into())? - .with_block_size_hit(sort_block_size) - .remove_order_col_at_last(); - // Todo(zhyass): Recluster will no longer perform sort in the near future. - sort_pipeline_builder.build_full_sort_pipeline(&mut self.main_pipeline)?; - - // Compact after merge sort. - let max_threads = self.ctx.get_settings().get_max_threads()? 
as usize; - build_compact_block_no_split_pipeline( - &mut self.main_pipeline, - block_thresholds, - max_threads, - )?; - - self.main_pipeline - .add_transform(|transform_input_port, transform_output_port| { - let proc = TransformSerializeBlock::try_create( + let num_processors = self.main_pipeline.output_len(); + let sample_rate = 0.01; + let partitions = block_thresholds.calc_partitions_for_recluster( + task.total_rows, + task.total_bytes, + task.total_compressed, + ); + let state = SampleState::new(num_processors, partitions); + let recluster_pipeline_builder = + ReclusterPipelineBuilder::create(schema, sort_descs.into(), sample_rate) + .with_state(state); + recluster_pipeline_builder + .build_recluster_sample_pipeline(&mut self.main_pipeline)?; + + self.main_pipeline.exchange( + num_processors, + ReclusterPartitionExchange::create(0, partitions), + ); + let processor_id = AtomicUsize::new(0); + let settings = self.ctx.get_settings(); + let memory_settings = MemorySettings::disable_spill(); + self.main_pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(Box::new( + TransformPartitionCollect::new( + self.ctx.clone(), + input, + output, + &settings, + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partitions, + memory_settings.clone(), + None, + ReclusterPartitionStrategy::new(properties.clone()), + )?, + ))) + })?; + + self.main_pipeline.add_async_accumulating_transformer(|| { + TransformBlockWriter::create( self.ctx.clone(), - transform_input_port, - transform_output_port, - table, - cluster_stats_gen.clone(), MutationKind::Recluster, - recluster.table_meta_timestamps, - )?; - proc.into_processor() - }) + table, + false, + ) + }); + Ok(()) + } else { + let cluster_stats_gen = table.get_cluster_stats_gen( + self.ctx.clone(), + level, + block_thresholds, + None, + )?; + let operators = cluster_stats_gen.operators.clone(); + if !operators.is_empty() { + let func_ctx2 = cluster_stats_gen.func_ctx.clone(); + self.main_pipeline.add_transformer(move || { + CompoundBlockOperator::new( + operators.clone(), + func_ctx2.clone(), + num_input_columns, + ) + }); + } + + // construct output fields + let output_fields = cluster_stats_gen.out_fields.clone(); + let schema = DataSchemaRefExt::create(output_fields); + let sort_descs: Vec<_> = cluster_stats_gen + .cluster_key_index + .iter() + .map(|offset| SortColumnDescription { + offset: *offset, + asc: true, + nulls_first: false, + }) + .collect(); + + // merge sort + let sort_block_size = block_thresholds.calc_rows_for_recluster( + task.total_rows, + task.total_bytes, + task.total_compressed, + ); + + let sort_pipeline_builder = + SortPipelineBuilder::create(self.ctx.clone(), schema, sort_descs.into())? + .with_block_size_hit(sort_block_size) + .remove_order_col_at_last(); + // Todo(zhyass): Recluster will no longer perform sort in the near future. + sort_pipeline_builder.build_full_sort_pipeline(&mut self.main_pipeline)?; + + // Compact after merge sort. + let max_threads = self.ctx.get_settings().get_max_threads()? 
as usize; + build_compact_block_no_split_pipeline( + &mut self.main_pipeline, + block_thresholds, + max_threads, + )?; + + self.main_pipeline.add_transform( + |transform_input_port, transform_output_port| { + let proc = TransformSerializeBlock::try_create( + self.ctx.clone(), + transform_input_port, + transform_output_port, + table, + cluster_stats_gen.clone(), + MutationKind::Recluster, + recluster.table_meta_timestamps, + )?; + proc.into_processor() + }, + ) + } } _ => Err(ErrorCode::Internal( "A node can only execute one recluster task".to_string(), @@ -190,3 +309,89 @@ impl PipelineBuilder { } } } + +struct ReclusterPipelineBuilder { + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + state: Option>, + sample_rate: f64, + seed: u64, +} + +impl ReclusterPipelineBuilder { + fn create( + schema: DataSchemaRef, + sort_desc: Arc<[SortColumnDescription]>, + sample_rate: f64, + ) -> Self { + Self { + schema, + sort_desc, + state: None, + sample_rate, + seed: rand::random(), + } + } + + #[allow(unused)] + fn with_seed(mut self, seed: u64) -> Self { + self.seed = seed; + self + } + + fn with_state(mut self, state: Arc) -> Self { + self.state = Some(state); + self + } + + fn build_recluster_sample_pipeline(&self, pipeline: &mut Pipeline) -> Result<()> { + match self.sort_desc.as_ref() { + [desc] => { + let schema = self.schema.clone(); + let sort_type = schema.field(desc.offset).data_type(); + assert!(desc.asc); + + match_template! { + T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], + match sort_type { + DataType::T => { + self.visit_type::, SimpleRowConverter>(pipeline) + }, + DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { + NumberDataType::NUM_TYPE => { + self.visit_type::>, SimpleRowConverter>>(pipeline) + } + }), + _ => self.visit_type::(pipeline) + } + } + } + _ => self.visit_type::(pipeline), + } + } + + fn visit_type(&self, pipeline: &mut Pipeline) -> Result<()> + where + R: Rows + 'static, + C: RowConverter + Send + 'static, + R::Type: ArgType + Send + Sync, + ::Scalar: Ord + Send + Sync, + { + pipeline.try_add_transformer(|| { + TransformAddOrderColumn::::try_new(self.sort_desc.clone(), self.schema.clone()) + })?; + let offset = self.schema.num_fields(); + pipeline.add_accumulating_transformer(|| { + TransformReclusterCollect::::new(offset, self.sample_rate, self.seed) + }); + pipeline.add_transform(|input, output| { + Ok(ProcessorPtr::create(TransformRangePartitionIndexer::< + R::Type, + >::create( + input, + output, + self.state.clone().unwrap(), + ))) + }) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs b/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs deleted file mode 100644 index 0a58f27556f91..0000000000000 --- a/src/query/service/src/pipelines/processors/transforms/recluster/builder.rs +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::sync::Arc; - -use databend_common_exception::Result; -use databend_common_expression::row::RowConverter as CommonConverter; -use databend_common_expression::types::AccessType; -use databend_common_expression::types::ArgType; -use databend_common_expression::types::DataType; -use databend_common_expression::types::DateType; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::TimestampType; -use databend_common_expression::with_number_mapped_type; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::SortColumnDescription; -use databend_common_pipeline_core::processors::InputPort; -use databend_common_pipeline_core::processors::OutputPort; -use databend_common_pipeline_core::processors::Processor; -use databend_common_pipeline_transforms::sort::CommonRows; -use databend_common_pipeline_transforms::sort::RowConverter; -use databend_common_pipeline_transforms::sort::Rows; -use databend_common_pipeline_transforms::sort::SimpleRowConverter; -use databend_common_pipeline_transforms::sort::SimpleRowsAsc; -use databend_common_pipeline_transforms::AccumulatingTransformer; -use databend_common_pipeline_transforms::Transformer; -use match_template::match_template; - -use crate::pipelines::processors::transforms::recluster::transform_add_order_column::TransformAddOrderColumn; -use crate::pipelines::processors::transforms::recluster::TransformRangePartitionIndexer; -use crate::pipelines::processors::transforms::SampleState; -use crate::pipelines::processors::transforms::TransformReclusterCollect; - -pub struct TransformReclusterBuilder { - schema: DataSchemaRef, - sort_desc: Arc<[SortColumnDescription]>, - sample_rate: f64, - seed: u64, -} - -impl TransformReclusterBuilder { - pub fn build_recluster_sample( - &self, - input: Arc, - output: Arc, - ) -> Result> { - self.build_inner(BuilderType::ReclusterSample, input, output, None) - } - - pub fn build_range_partition_indexer( - &self, - input: Arc, - output: Arc, - state: Arc, - ) -> Result> { - self.build_inner( - BuilderType::RangePartitionIndexer, - input, - output, - Some(state), - ) - } - - pub fn build_add_order_column( - &self, - input: Arc, - output: Arc, - ) -> Result> { - self.build_inner(BuilderType::AddOrderColumn, input, output, None) - } - - fn build_inner( - &self, - typ: BuilderType, - input: Arc, - output: Arc, - state: Option>, - ) -> Result> { - let mut build = BuilderInner { - input, - output, - typ, - base: self, - state, - }; - build.select_row_type() - } -} - -enum BuilderType { - AddOrderColumn, - ReclusterSample, - RangePartitionIndexer, -} - -struct BuilderInner<'a> { - input: Arc, - output: Arc, - typ: BuilderType, - base: &'a TransformReclusterBuilder, - state: Option>, -} - -impl BuilderInner<'_> { - pub fn select_row_type(&mut self) -> Result> { - match self.base.sort_desc.as_ref() { - [desc] => { - let schema = self.base.schema.clone(); - let sort_type = schema.field(desc.offset).data_type(); - assert!(desc.asc); - - match_template! 
{ - T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], - match sort_type { - DataType::T => { - self.visit_type::, SimpleRowConverter>() - }, - DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { - NumberDataType::NUM_TYPE => { - self.visit_type::>, SimpleRowConverter>>() - } - }), - _ => self.visit_type::() - } - } - } - _ => self.visit_type::(), - } - } - - fn visit_type(&mut self) -> Result> - where - R: Rows + 'static, - C: RowConverter + Send + 'static, - R::Type: ArgType + Send + Sync, - ::Scalar: Ord + Send + Sync, - { - match self.typ { - BuilderType::AddOrderColumn => self.build_add_order_column::(), - BuilderType::ReclusterSample => self.build_recluster_sample::(), - BuilderType::RangePartitionIndexer => self.build_range_partition_indexer::(), - } - } - - fn build_add_order_column(&mut self) -> Result> - where - R: Rows + 'static, - C: RowConverter + Send + 'static, - { - let inner = TransformAddOrderColumn::::try_new( - self.base.sort_desc.clone(), - self.base.schema.clone(), - )?; - Ok(Transformer::create( - self.input.clone(), - self.output.clone(), - inner, - )) - } - - fn build_range_partition_indexer(&mut self) -> Result> - where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send + Sync, - { - Ok(TransformRangePartitionIndexer::::create( - self.input.clone(), - self.output.clone(), - self.state.clone().unwrap(), - )) - } - - fn build_recluster_sample(&mut self) -> Result> - where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send + Sync, - { - let offset = self.base.schema.fields().len(); - Ok(AccumulatingTransformer::create( - self.input.clone(), - self.output.clone(), - TransformReclusterCollect::::new(offset, self.base.sample_rate, self.base.seed), - )) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index fd1db0c11426c..0f3612043c7d9 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-mod builder; mod range_bound_sampler; mod recluster_partition_exchange; mod recluster_partition_strategy; @@ -25,7 +24,8 @@ pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; pub use recluster_partition_strategy::CompactPartitionStrategy; pub use recluster_partition_strategy::ReclusterPartitionStrategy; -pub(crate) use recluster_sample_state::SampleState; +pub use recluster_sample_state::SampleState; +pub use transform_add_order_column::TransformAddOrderColumn; pub use transform_range_partition_indexer::TransformRangePartitionIndexer; -pub(crate) use transform_recluster_collect::ReclusterSampleMeta; +pub use transform_recluster_collect::ReclusterSampleMeta; pub use transform_recluster_collect::TransformReclusterCollect; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 6d10600366eda..6b944a487664b 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -137,7 +137,7 @@ where while low < high { let mid = low + ((high - low) / 2); let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); - if val > bound { + if val >= bound { low = mid + 1; } else { high = mid; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs index 3e9fe42a8dee4..3900fd81db6d7 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs @@ -15,7 +15,6 @@ use databend_common_exception::Result; use databend_common_expression::local_block_meta_serde; use databend_common_expression::types::ArgType; -use databend_common_expression::types::ValueType; use databend_common_expression::BlockMetaInfo; use databend_common_expression::DataBlock; use databend_common_expression::Scalar; diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index f5c42c4e05053..4dbc98b2b1d8a 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -860,7 +860,7 @@ impl DefaultSettings { desc: "Sets the maximum byte size of blocks for recluster", mode: SettingMode::Both, scope: SettingScope::Both, - range: Some(SettingRange::Numeric(0..=u64::MAX)), + range: Some(SettingRange::Numeric(0..=80 * 1024 * 1024 * 1024)), }), ("compact_max_block_selection", DefaultSettingValue { value: UserSettingValue::UInt64(10000), diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 49473f1fd7032..50a701c31be93 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -28,6 +28,7 @@ use databend_common_expression::Column; use databend_common_expression::ColumnId; use databend_common_expression::ComputedExpr; use databend_common_expression::DataBlock; +use databend_common_expression::DataField; use databend_common_expression::FieldIndex; use databend_common_expression::TableField; use 
databend_common_expression::TableSchema; @@ -35,6 +36,8 @@ use databend_common_expression::TableSchemaRef; use databend_common_expression::ORIGIN_BLOCK_ROW_NUM_COLUMN_ID; use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; use databend_common_native::write::NativeWriter; +use databend_common_sql::evaluator::BlockOperator; +use databend_common_sql::executor::physical_plans::MutationKind; use databend_storages_common_index::BloomIndex; use databend_storages_common_index::BloomIndexBuilder; use databend_storages_common_index::Index; @@ -367,17 +370,24 @@ impl StreamBlockProperties { pub fn try_create( ctx: Arc, table: &FuseTable, + kind: MutationKind, + level: Option, table_meta_timestamps: TableMetaTimestamps, ) -> Result> { // remove virtual computed fields. - let fields = table + let mut fields = table .schema() .fields() .iter() .filter(|f| !matches!(f.computed_expr(), Some(ComputedExpr::Virtual(_)))) .cloned() .collect::>(); - + if !matches!(kind, MutationKind::Insert | MutationKind::Replace) { + // add stream fields. + for stream_column in table.stream_columns().iter() { + fields.push(stream_column.table_field()); + } + } let source_schema = Arc::new(TableSchema { fields, ..table.schema().as_ref().clone() @@ -400,7 +410,7 @@ impl StreamBlockProperties { let inverted_index_builders = create_inverted_index_builders(&table.table_info.meta); let cluster_stats_builder = - ClusterStatisticsBuilder::try_create(table, ctx.clone(), &source_schema)?; + ClusterStatisticsBuilder::try_create(table, ctx.clone(), &source_schema, level)?; let mut stats_columns = vec![]; let mut distinct_columns = vec![]; @@ -437,4 +447,16 @@ impl StreamBlockProperties { self.block_thresholds .check_large_enough(num_rows, data_size) } + + pub fn cluster_operators(&self) -> Vec { + self.cluster_stats_builder.operators() + } + + pub fn fields_with_cluster_key(&self) -> Vec { + self.cluster_stats_builder.out_fields() + } + + pub fn cluster_key_index(&self) -> &Vec { + self.cluster_stats_builder.cluster_key_index() + } } diff --git a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs index 84bba6b663db1..a0bd91888995e 100644 --- a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs @@ -20,9 +20,9 @@ use databend_common_exception::Result; use databend_common_expression::Column; use databend_common_expression::ColumnRef; use databend_common_expression::DataBlock; +use databend_common_expression::DataField; use databend_common_expression::DataSchema; use databend_common_expression::Expr; -use databend_common_expression::FunctionContext; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; use databend_common_functions::aggregates::eval_aggr; @@ -35,12 +35,13 @@ use crate::FuseTable; #[derive(Default, Clone)] pub struct ClusterStatisticsBuilder { + out_fields: Vec, + level: i32, cluster_key_id: u32, cluster_key_index: Vec, extra_key_num: usize, operators: Vec, - func_ctx: FunctionContext, } impl ClusterStatisticsBuilder { @@ -48,6 +49,7 @@ impl ClusterStatisticsBuilder { table: &FuseTable, ctx: Arc, source_schema: &TableSchemaRef, + level: Option, ) -> Result> { let cluster_type = table.cluster_type(); if cluster_type.is_none_or(|v| v == ClusterType::Hilbert) { @@ -55,9 +57,9 @@ impl ClusterStatisticsBuilder { } let input_schema: Arc = DataSchema::from(source_schema).into(); - let input_filed_len = 
input_schema.fields.len(); + let mut out_fields = input_schema.fields().clone(); - let cluster_keys = table.linear_cluster_keys(ctx.clone()); + let cluster_keys = table.linear_cluster_keys(ctx); let mut cluster_key_index = Vec::with_capacity(cluster_keys.len()); let mut extra_key_num = 0; @@ -69,8 +71,11 @@ impl ClusterStatisticsBuilder { let index = match &expr { Expr::ColumnRef(ColumnRef { id, .. }) => *id, _ => { + let cname = format!("{}", expr); + out_fields.push(DataField::new(cname.as_str(), expr.data_type().clone())); exprs.push(expr); - let offset = input_filed_len + extra_key_num; + + let offset = out_fields.len() - 1; extra_key_num += 1; offset } @@ -90,14 +95,26 @@ impl ClusterStatisticsBuilder { cluster_key_id: table.cluster_key_meta.as_ref().unwrap().0, cluster_key_index, extra_key_num, - func_ctx: ctx.get_function_context()?, operators, + out_fields, + level: level.unwrap_or(0), })) } + + pub fn operators(&self) -> Vec { + self.operators.clone() + } + + pub fn out_fields(&self) -> Vec { + self.out_fields.clone() + } + + pub fn cluster_key_index(&self) -> &Vec { + &self.cluster_key_index + } } pub struct ClusterStatisticsState { - level: i32, mins: Vec, maxs: Vec, @@ -107,29 +124,23 @@ pub struct ClusterStatisticsState { impl ClusterStatisticsState { pub fn new(builder: Arc) -> Self { Self { - level: 0, mins: vec![], maxs: vec![], builder, } } - pub fn add_block(&mut self, input: DataBlock) -> Result { + pub fn add_block(&mut self, mut input: DataBlock) -> Result { if self.builder.cluster_key_index.is_empty() { return Ok(input); } let num_rows = input.num_rows(); - let mut block = self - .builder - .operators - .iter() - .try_fold(input, |input, op| op.execute(&self.builder.func_ctx, input))?; let cols = self .builder .cluster_key_index .iter() - .map(|&i| block.get_by_offset(i).to_column()) + .map(|&i| input.get_by_offset(i).to_column()) .collect(); let tuple = Column::Tuple(cols); let (min, _) = eval_aggr("min", vec![], &[tuple.clone()], num_rows, vec![])?; @@ -138,8 +149,8 @@ impl ClusterStatisticsState { assert_eq!(max.len(), 1); self.mins.push(min.index(0).unwrap().to_owned()); self.maxs.push(max.index(0).unwrap().to_owned()); - block.pop_columns(self.builder.extra_key_num); - Ok(block) + input.pop_columns(self.builder.extra_key_num); + Ok(input) } pub fn finalize(self, perfect: bool) -> Result> { @@ -167,7 +178,7 @@ impl ClusterStatisticsState { let level = if min == max && perfect { -1 } else { - self.level + self.builder.level }; Ok(Some(ClusterStatistics { diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 6c9269bd13620..f938768066ce1 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -37,6 +37,7 @@ use databend_common_sql::executor::physical_plans::MutationKind; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use databend_storages_common_table_meta::table::ClusterType; +use crate::io::StreamBlockProperties; use crate::operations::TransformBlockBuilder; use crate::operations::TransformBlockWriter; use crate::operations::TransformSerializeBlock; @@ -53,13 +54,34 @@ impl FuseTable { let enable_stream_block_write = ctx.get_settings().get_enable_block_stream_write()? 
&& self.storage_format_as_parquet(); if enable_stream_block_write { + let properties = StreamBlockProperties::try_create( + ctx.clone(), + self, + MutationKind::Insert, + None, + table_meta_timestamps, + )?; + + let cluster_operators = properties.cluster_operators(); + if !cluster_operators.is_empty() { + let num_input_columns = self.table_info.schema().num_fields(); + let func_ctx = ctx.get_function_context()?; + pipeline.add_transformer(move || { + CompoundBlockOperator::new( + cluster_operators.clone(), + func_ctx.clone(), + num_input_columns, + ) + }); + } + pipeline.add_transform(|input, output| { TransformBlockBuilder::try_create( ctx.clone(), input, output, self, - table_meta_timestamps, + properties.clone(), ) })?; @@ -70,7 +92,7 @@ impl FuseTable { let block_thresholds = self.get_block_thresholds(); build_compact_block_pipeline(pipeline, block_thresholds)?; - let schema = DataSchema::from(self.schema()).into(); + let schema = DataSchema::from(&self.schema().remove_virtual_computed_fields()).into(); let cluster_stats_gen = self.cluster_gen_for_append(ctx.clone(), pipeline, block_thresholds, Some(schema))?; pipeline.add_transform(|input, output| { @@ -103,7 +125,7 @@ impl FuseTable { let operators = cluster_stats_gen.operators.clone(); if !operators.is_empty() { - let num_input_columns = self.table_info.schema().fields().len(); + let num_input_columns = self.table_info.schema().num_fields(); let func_ctx2 = cluster_stats_gen.func_ctx.clone(); let mut builder = pipeline.try_create_transform_pipeline_builder_with_len( move || { @@ -162,7 +184,7 @@ impl FuseTable { let operators = cluster_stats_gen.operators.clone(); if !operators.is_empty() { - let num_input_columns = self.table_info.schema().fields().len(); + let num_input_columns = self.table_info.schema().num_fields(); let func_ctx2 = cluster_stats_gen.func_ctx.clone(); pipeline.add_transformer(move || { @@ -199,8 +221,9 @@ impl FuseTable { return Ok(ClusterStatsGenerator::default()); } - let input_schema = - modified_schema.unwrap_or(DataSchema::from(self.schema_with_stream()).into()); + let input_schema = modified_schema.unwrap_or( + DataSchema::from(&self.schema_with_stream().remove_virtual_computed_fields()).into(), + ); let mut merged = input_schema.fields().clone(); let cluster_keys = self.linear_cluster_keys(ctx.clone()); diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index 5af36b0a1f522..f5f61b8001c4d 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -34,7 +34,6 @@ use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_transforms::AsyncAccumulatingTransform; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storage::MutationStatus; -use databend_storages_common_table_meta::meta::TableMetaTimestamps; use opendal::Operator; use crate::io::BlockSerialization; @@ -77,13 +76,12 @@ impl TransformBlockBuilder { input: Arc, output: Arc, table: &FuseTable, - table_meta_timestamps: TableMetaTimestamps, + properties: Arc, ) -> Result { let max_block_rows = std::cmp::min( ctx.get_settings().get_max_block_size()? 
as usize, table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), ); - let properties = StreamBlockProperties::try_create(ctx, table, table_meta_timestamps)?; Ok(ProcessorPtr::create(Box::new(TransformBlockBuilder { state: State::Consume, input, From 8811b22e4a56681b5bcebdac77ded59a639bc056 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 7 Jun 2025 13:58:21 +0800 Subject: [PATCH 19/36] fix --- src/query/functions/src/scalars/hilbert.rs | 10 ++++---- .../recluster/recluster_sample_state.rs | 24 +++++++++++++++---- .../transform_range_partition_indexer.rs | 15 +++++++++--- 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/query/functions/src/scalars/hilbert.rs b/src/query/functions/src/scalars/hilbert.rs index d570b83740c1e..b57c6aa77a17a 100644 --- a/src/query/functions/src/scalars/hilbert.rs +++ b/src/query/functions/src/scalars/hilbert.rs @@ -257,17 +257,17 @@ pub fn register(registry: &mut FunctionRegistry) { /// /// # Example /// For boundaries [10, 20, 30]: -/// - Values < 10 get partition ID 0 -/// - Values >= 10 and < 20 get partition ID 1 -/// - Values >= 20 and < 30 get partition ID 2 -/// - Values >= 30 get partition ID 3 +/// - Values <= 10 get partition ID 0 +/// - Values > 10 and <= 20 get partition ID 1 +/// - Values > 20 and <= 30 get partition ID 2 +/// - Values > 30 get partition ID 3 fn calc_range_partition_id(val: ScalarRef, arr: &Column) -> u64 { let mut low = 0; let mut high = arr.len(); while low < high { let mid = low + ((high - low) / 2); let bound = unsafe { arr.index_unchecked(mid) }; - if val >= bound { + if val > bound { low = mid + 1; } else { high = mid; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs index 77c0fa0a9483f..12d50653b8b68 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs @@ -35,6 +35,7 @@ impl SampleState { completed_inputs: 0, values: vec![], bounds: vec![], + max_value: None, }), done: Arc::new(WatchNotify::new()), }) @@ -56,17 +57,22 @@ impl SampleState { Ok(()) } - pub fn get_bounds(&self) -> Vec + pub fn get_bounds(&self) -> (Vec, Option) where T: ArgType, T::Scalar: Ord, { let inner = self.inner.read().unwrap(); - inner + let bounds = inner .bounds .iter() .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())) - .collect() + .collect(); + let max_value = inner + .max_value + .as_ref() + .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())); + (bounds, max_value) } } @@ -76,6 +82,7 @@ pub struct SampleStateInner { completed_inputs: usize, bounds: Vec, + max_value: Option, values: Vec<(u64, Vec)>, } @@ -112,6 +119,9 @@ impl SampleStateInner { let col = T::upcast_column(T::column_from_vec(data.clone(), &[])); let indices = compare_columns(vec![col], total_samples)?; + let max_index = indices[total_samples - 1] as usize; + let max_val = data[max_index].clone(); + let mut cum_weight = 0.0; let mut target = step; let mut bounds = Vec::with_capacity(self.partitions - 1); @@ -126,7 +136,13 @@ impl SampleStateInner { if cum_weight >= target { let data = &data[idx]; if previous_bound.as_ref().is_none_or(|prev| data > prev) { - bounds.push(T::upcast_scalar(data.clone())); + if data == &max_val { + self.max_value = Some(T::upcast_scalar(max_val)); + break; + } + + let bound = 
T::upcast_scalar(data.clone()); + bounds.push(bound); target += step; j += 1; previous_bound = Some(data.clone()); diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 6b944a487664b..215e0c977eb8f 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -44,6 +44,7 @@ where T: ArgType input_data: Vec, output_data: VecDeque, bounds: Vec, + max_value: Option, } impl TransformRangePartitionIndexer @@ -63,6 +64,7 @@ where input_data: vec![], output_data: VecDeque::new(), bounds: vec![], + max_value: None, }) } } @@ -124,6 +126,7 @@ where fn process(&mut self) -> Result<()> { if let Some(mut block) = self.input_data.pop() { + let bound_len = self.bounds.len(); let num_rows = block.num_rows(); let last = block.get_last_column().clone(); block.pop_columns(1); @@ -132,12 +135,18 @@ where for index in 0..num_rows { let val = T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); + if self.max_value.as_ref().is_some_and(|v| val >= *v) { + let range_id = bound_len + 1; + builder.push(range_id as u64); + continue; + } + let mut low = 0; - let mut high = self.bounds.len(); + let mut high = bound_len; while low < high { let mid = low + ((high - low) / 2); let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); - if val >= bound { + if val > bound { low = mid + 1; } else { high = mid; @@ -158,7 +167,7 @@ where #[async_backtrace::framed] async fn async_process(&mut self) -> Result<()> { self.state.done.notified().await; - self.bounds = self.state.get_bounds::(); + (self.bounds, self.max_value) = self.state.get_bounds::(); Ok(()) } } From e395e1095289a424700f2289eb9feea32983fe35 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sun, 8 Jun 2025 09:38:38 +0800 Subject: [PATCH 20/36] fix --- .../fuse/operations/mutation/recluster_mutator.rs | 2 ++ .../mutation/mutator/recluster_mutator.rs | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs index b3e8cf59c5a65..bd091a35ef5f2 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs @@ -156,6 +156,7 @@ async fn test_recluster_mutator_block_select() -> Result<()> { cluster_key_id, 1, column_ids, + 1, ); let (_, parts) = mutator .target_select(compact_segments, ReclusterMode::Recluster) @@ -280,6 +281,7 @@ async fn test_safety_for_recluster() -> Result<()> { cluster_key_id, max_tasks, column_ids, + 500, )); let (mode, selected_segs) = mutator.select_segments(&compact_segments, 8)?; // select the blocks with the highest depth. 
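Note on the boundary semantics settled in the hunks above: the bounds act as inclusive upper limits, so `val > bound` advances the lower end of the binary search and a value equal to a bound falls into the partition that ends at it. A minimal standalone sketch of that search (illustrative only, not part of the patch; the free function name is hypothetical):

    fn range_partition_id(val: i64, bounds: &[i64]) -> usize {
        // Inclusive upper bounds: values <= bounds[0] map to 0,
        // values > the last bound map to bounds.len().
        let (mut low, mut high) = (0, bounds.len());
        while low < high {
            let mid = low + (high - low) / 2;
            if val > bounds[mid] {
                low = mid + 1;
            } else {
                high = mid;
            }
        }
        low
    }

    fn main() {
        let bounds = [10, 20, 30];
        // Matches the example in the `calc_range_partition_id` doc comment:
        // <= 10 -> 0, 11..=20 -> 1, 21..=30 -> 2, > 30 -> 3.
        assert_eq!(range_partition_id(10, &bounds), 0);
        assert_eq!(range_partition_id(15, &bounds), 1);
        assert_eq!(range_partition_id(30, &bounds), 2);
        assert_eq!(range_partition_id(31, &bounds), 3);
    }
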
diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index 3c50cd9f83153..0debc7375566c 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -72,6 +72,8 @@ pub struct ReclusterMutator { pub(crate) max_tasks: usize, pub(crate) cluster_key_types: Vec, pub(crate) column_ids: HashSet, + + average_size: usize, } impl ReclusterMutator { @@ -102,6 +104,13 @@ impl ReclusterMutator { // NOTE: The snapshot schema does not contain the stream column. let column_ids = snapshot.schema.to_leaf_column_id_set(); + let average_size = cmp::max( + snapshot + .summary + .uncompressed_byte_size + .div_ceil(snapshot.summary.block_count) as usize, + block_thresholds.max_bytes_per_block / 2, + ); Ok(Self { ctx, schema, @@ -111,6 +120,7 @@ impl ReclusterMutator { max_tasks, cluster_key_types, column_ids, + average_size, }) } @@ -125,6 +135,7 @@ impl ReclusterMutator { cluster_key_id: u32, max_tasks: usize, column_ids: HashSet, + average_size: usize, ) -> Self { Self { ctx, @@ -135,6 +146,7 @@ impl ReclusterMutator { max_tasks, cluster_key_types, column_ids, + average_size, } } @@ -196,8 +208,7 @@ impl ReclusterMutator { .get_recluster_block_size()? .min(avail_memory_usage * 30 / 100) as usize; // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. - let max_blocks_num = - (memory_threshold / self.block_thresholds.max_bytes_per_block).max(2) * self.max_tasks; + let max_blocks_num = (memory_threshold / self.average_size).max(2) * self.max_tasks; let block_per_seg = self.block_thresholds.block_per_segment; // Prepare task generation parameters From e6a2a253d4421d76f923b12c5e126290a779db70 Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 9 Jun 2025 11:21:11 +0800 Subject: [PATCH 21/36] fix test --- .../it/scalars/testdata/function_list.txt | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/query/functions/tests/it/scalars/testdata/function_list.txt b/src/query/functions/tests/it/scalars/testdata/function_list.txt index 06d4012daf92b..1119a04aa7373 100644 --- a/src/query/functions/tests/it/scalars/testdata/function_list.txt +++ b/src/query/functions/tests/it/scalars/testdata/function_list.txt @@ -115,6 +115,50 @@ Functions overloads: 1 add_months(Date NULL, Int64 NULL) :: Date NULL 2 add_months(Timestamp, Int64) :: Timestamp 3 add_months(Timestamp NULL, Int64 NULL) :: Timestamp NULL +0 add_noise(String) :: Binary +1 add_noise(String NULL) :: Binary NULL +2 add_noise(UInt8) :: Binary +3 add_noise(UInt8 NULL) :: Binary NULL +4 add_noise(UInt16) :: Binary +5 add_noise(UInt16 NULL) :: Binary NULL +6 add_noise(UInt32) :: Binary +7 add_noise(UInt32 NULL) :: Binary NULL +8 add_noise(UInt64) :: Binary +9 add_noise(UInt64 NULL) :: Binary NULL +10 add_noise(Int8) :: Binary +11 add_noise(Int8 NULL) :: Binary NULL +12 add_noise(Int16) :: Binary +13 add_noise(Int16 NULL) :: Binary NULL +14 add_noise(Int32) :: Binary +15 add_noise(Int32 NULL) :: Binary NULL +16 add_noise(Int64) :: Binary +17 add_noise(Int64 NULL) :: Binary NULL +18 add_noise(Float32) :: Binary +19 add_noise(Float32 NULL) :: Binary NULL +20 add_noise(Float64) :: Binary +21 add_noise(Float64 NULL) :: Binary NULL +22 add_noise(String, UInt64) :: Binary +23 add_noise(String NULL, UInt64 NULL) :: Binary NULL +24 add_noise(UInt8, UInt64) :: Binary +25 add_noise(UInt8 NULL, 
UInt64 NULL) :: Binary NULL +26 add_noise(UInt16, UInt64) :: Binary +27 add_noise(UInt16 NULL, UInt64 NULL) :: Binary NULL +28 add_noise(UInt32, UInt64) :: Binary +29 add_noise(UInt32 NULL, UInt64 NULL) :: Binary NULL +30 add_noise(UInt64, UInt64) :: Binary +31 add_noise(UInt64 NULL, UInt64 NULL) :: Binary NULL +32 add_noise(Int8, UInt64) :: Binary +33 add_noise(Int8 NULL, UInt64 NULL) :: Binary NULL +34 add_noise(Int16, UInt64) :: Binary +35 add_noise(Int16 NULL, UInt64 NULL) :: Binary NULL +36 add_noise(Int32, UInt64) :: Binary +37 add_noise(Int32 NULL, UInt64 NULL) :: Binary NULL +38 add_noise(Int64, UInt64) :: Binary +39 add_noise(Int64 NULL, UInt64 NULL) :: Binary NULL +40 add_noise(Float32, UInt64) :: Binary +41 add_noise(Float32 NULL, UInt64 NULL) :: Binary NULL +42 add_noise(Float64, UInt64) :: Binary +43 add_noise(Float64 NULL, UInt64 NULL) :: Binary NULL 0 add_quarters(Date, Int64) :: Date 1 add_quarters(Date NULL, Int64 NULL) :: Date NULL 2 add_quarters(Timestamp, Int64) :: Timestamp From fa6f023490fadea28534cacdc8245157430c4d78 Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 9 Jun 2025 20:17:55 +0800 Subject: [PATCH 22/36] fix --- .../fuse/src/operations/mutation/mutator/recluster_mutator.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index 0debc7375566c..85d275bcabef3 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -109,7 +109,7 @@ impl ReclusterMutator { .summary .uncompressed_byte_size .div_ceil(snapshot.summary.block_count) as usize, - block_thresholds.max_bytes_per_block / 2, + block_thresholds.min_bytes_per_block, ); Ok(Self { ctx, @@ -206,7 +206,7 @@ impl ReclusterMutator { settings.get_max_memory_usage()? - GLOBAL_MEM_STAT.get_memory_usage() as u64; let memory_threshold = settings .get_recluster_block_size()? - .min(avail_memory_usage * 30 / 100) as usize; + .min(avail_memory_usage * 50 / 100) as usize; // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. 
let max_blocks_num = (memory_threshold / self.average_size).max(2) * self.max_tasks; let block_per_seg = self.block_thresholds.block_per_segment; From 42ebabafcee4fcf60f2e192eae77256a1c3ea79b Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 9 Jun 2025 22:13:37 +0800 Subject: [PATCH 23/36] fix test --- src/query/storages/fuse/src/operations/recluster.rs | 4 ++++ .../base/09_fuse_engine/09_0008_fuse_optimize_table.test | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/query/storages/fuse/src/operations/recluster.rs b/src/query/storages/fuse/src/operations/recluster.rs index 4ce55ee9b9052..3cbf4cd1f922b 100644 --- a/src/query/storages/fuse/src/operations/recluster.rs +++ b/src/query/storages/fuse/src/operations/recluster.rs @@ -71,6 +71,10 @@ impl FuseTable { return Ok(None); }; + if snapshot.summary.block_count == 0 { + return Ok(None); + } + let mutator = Arc::new(ReclusterMutator::try_create( self, ctx.clone(), diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test index 43a2b262ca2f9..929e042c13122 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test @@ -510,7 +510,7 @@ select segment_count, block_count from fuse_snapshot('db_09_0008', 't9') limit 2 2 2 query I -select a from t9 +select a from t9 order by a ---- 1 2 From bd996f2d1f6dff0cc864a1d7c53aaf90cfd6c38e Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 10 Jun 2025 01:40:19 +0800 Subject: [PATCH 24/36] fix test --- .../expression/src/utils/block_thresholds.rs | 8 ++-- .../expression/tests/it/block_thresholds.rs | 6 +-- .../interpreter_table_recluster.rs | 2 +- .../pipelines/builders/builder_recluster.rs | 16 +++++--- .../recluster/range_bound_sampler.rs | 17 +++----- .../recluster/recluster_partition_strategy.rs | 40 +++++++++++++++---- .../recluster/transform_recluster_collect.rs | 4 +- src/query/settings/src/settings_default.rs | 8 ++-- .../settings/src/settings_getter_setter.rs | 4 +- .../fuse/src/io/write/stream/block_builder.rs | 4 +- .../src/io/write/stream/column_statistics.rs | 2 +- .../storages/fuse/src/operations/append.rs | 14 ++----- .../processors/transform_block_writer.rs | 26 +----------- .../mutation/mutator/recluster_mutator.rs | 19 ++++++--- .../09_0008_fuse_optimize_table.test | 2 +- 15 files changed, 88 insertions(+), 84 deletions(-) diff --git a/src/query/expression/src/utils/block_thresholds.rs b/src/query/expression/src/utils/block_thresholds.rs index 01c0631abe124..66cb1fbabc9e1 100644 --- a/src/query/expression/src/utils/block_thresholds.rs +++ b/src/query/expression/src/utils/block_thresholds.rs @@ -39,7 +39,7 @@ impl Default for BlockThresholds { max_bytes_per_block: DEFAULT_BLOCK_BUFFER_SIZE * 2, min_bytes_per_block: (DEFAULT_BLOCK_BUFFER_SIZE * 4).div_ceil(5), max_compressed_per_block: DEFAULT_BLOCK_COMPRESSED_SIZE, - min_compressed_per_block: (DEFAULT_BLOCK_COMPRESSED_SIZE * 4).div_ceil(5), + min_compressed_per_block: (DEFAULT_BLOCK_COMPRESSED_SIZE * 3).div_ceil(5), block_per_segment: DEFAULT_BLOCK_PER_SEGMENT, } } @@ -58,7 +58,7 @@ impl BlockThresholds { max_bytes_per_block: bytes_per_block * 2, min_bytes_per_block: (bytes_per_block * 4).div_ceil(5), max_compressed_per_block, - min_compressed_per_block: (max_compressed_per_block * 4).div_ceil(5), + min_compressed_per_block: (max_compressed_per_block * 3).div_ceil(5), block_per_segment, } } 
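A condensed sketch of the clamping rule these hunks tune, as free functions for illustration only (not the BlockThresholds API; the oversized case matches the hunk below):

// Sketch: cap the per-block byte limits before deriving a block count.
fn capped_limits(min_bytes_per_block: usize, max_bytes_per_block: usize) -> (usize, usize) {
    (
        min_bytes_per_block.min(100 * 1024 * 1024),
        max_bytes_per_block.min(400 * 1024 * 1024),
    )
}

// Case 1: blocks would be too large, so split the total by the capped block size.
fn blocks_when_oversized(total_bytes: usize, max_bytes_per_block: usize) -> usize {
    total_bytes.div_ceil(max_bytes_per_block)
}
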
@@ -153,7 +153,7 @@ impl BlockThresholds { let bytes_per_block = total_bytes.div_ceil(block_num_by_compressed); // Adjust the number of blocks based on block size thresholds. let max_bytes_per_block = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes_per_block = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); + let min_bytes_per_block = self.min_bytes_per_block.min(100 * 1024 * 1024); let block_nums = if bytes_per_block > max_bytes_per_block { // Case 1: If the block size is too bigger. total_bytes.div_ceil(max_bytes_per_block) @@ -201,7 +201,7 @@ impl BlockThresholds { // Adjust block count based on byte size thresholds. let bytes_per_block = total_bytes.div_ceil(by_compressed); let max_bytes = self.max_bytes_per_block.min(400 * 1024 * 1024); - let min_bytes = (self.min_bytes_per_block / 2).min(50 * 1024 * 1024); + let min_bytes = self.min_bytes_per_block.min(100 * 1024 * 1024); let total_partitions = if bytes_per_block > max_bytes { // Block size is too large. total_bytes / max_bytes diff --git a/src/query/expression/tests/it/block_thresholds.rs b/src/query/expression/tests/it/block_thresholds.rs index 33b1d1b0ff394..b7409208386aa 100644 --- a/src/query/expression/tests/it/block_thresholds.rs +++ b/src/query/expression/tests/it/block_thresholds.rs @@ -105,8 +105,8 @@ fn test_calc_rows_for_recluster() { assert_eq!(result, 300); // Case 2: If the block size is too smaller. - let result = t.calc_rows_for_recluster(4_000, 2_000_000, 600_000); - assert_eq!(result, 800); + let result = t.calc_rows_for_recluster(4_000, 1_600_000, 600_000); + assert_eq!(result, 2000); // Case 3: use the compressed-based block count. let result = t.calc_rows_for_recluster(4_000, 10_000_000, 600_000); @@ -131,7 +131,7 @@ fn test_calc_partitions_for_recluster() { assert_eq!(result, 15); // Case 2: If the block size is too smaller. - let result = t.calc_partitions_for_recluster(4_000, 800_000, 800_000); + let result = t.calc_partitions_for_recluster(4_000, 1_600_000, 800_000); assert_eq!(result, 2); // Case 3: use the compressed-based block count. diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 237a2252ea067..602e58df1670a 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -631,7 +631,7 @@ impl ReclusterTableInterpreter { let database = &self.plan.database; let table = &self.plan.table; let settings = self.ctx.get_settings(); - let sample_size = settings.get_hilbert_sample_size_per_block()?; + let sample_size = settings.get_recluster_sample_size_per_block()?; let name_resolution_ctx = NameResolutionContext::try_from(settings.as_ref())?; let ast_exprs = tbl.resolve_cluster_keys(self.ctx.clone()).unwrap(); diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index a5f39011080d8..cbcde2069c441 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -187,7 +187,11 @@ impl PipelineBuilder { .collect(); let num_processors = self.main_pipeline.output_len(); - let sample_rate = 0.01; + let sample_size = self + .ctx + .get_settings() + .get_recluster_sample_size_per_block()? 
+ as usize; let partitions = block_thresholds.calc_partitions_for_recluster( task.total_rows, task.total_bytes, @@ -195,7 +199,7 @@ impl PipelineBuilder { ); let state = SampleState::new(num_processors, partitions); let recluster_pipeline_builder = - ReclusterPipelineBuilder::create(schema, sort_descs.into(), sample_rate) + ReclusterPipelineBuilder::create(schema, sort_descs.into(), sample_size) .with_state(state); recluster_pipeline_builder .build_recluster_sample_pipeline(&mut self.main_pipeline)?; @@ -314,7 +318,7 @@ struct ReclusterPipelineBuilder { schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, state: Option>, - sample_rate: f64, + sample_size: usize, seed: u64, } @@ -322,13 +326,13 @@ impl ReclusterPipelineBuilder { fn create( schema: DataSchemaRef, sort_desc: Arc<[SortColumnDescription]>, - sample_rate: f64, + sample_size: usize, ) -> Self { Self { schema, sort_desc, state: None, - sample_rate, + sample_size, seed: rand::random(), } } @@ -382,7 +386,7 @@ impl ReclusterPipelineBuilder { })?; let offset = self.schema.num_fields(); pipeline.add_accumulating_transformer(|| { - TransformReclusterCollect::::new(offset, self.sample_rate, self.seed) + TransformReclusterCollect::::new(offset, self.sample_size, self.seed) }); pipeline.add_transform(|input, output| { Ok(ProcessorPtr::create(TransformRangePartitionIndexer::< diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs index 0dfee36475b36..a76417256416f 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs @@ -26,7 +26,7 @@ pub struct RangeBoundSampler where T: ValueType { offset: usize, - sample_rate: f64, + sample_size: usize, rng: SmallRng, values: Vec<(u64, Vec)>, @@ -36,11 +36,11 @@ where T: ValueType impl RangeBoundSampler where T: ValueType { - pub fn new(offset: usize, sample_rate: f64, seed: u64) -> Self { + pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self { let rng = SmallRng::seed_from_u64(seed); Self { offset, - sample_rate, + sample_size, rng, values: vec![], _t: PhantomData, @@ -58,15 +58,10 @@ where assert!(rows > 0); let column = data.get_by_offset(self.offset).to_column(rows); - let sample_size = std::cmp::max((self.sample_rate * rows as f64).ceil() as usize, 100); + let sample_size = std::cmp::min(self.sample_size, rows); let mut indices = (0..rows).collect::>(); - - let sampled_indices = if rows > sample_size { - indices.shuffle(&mut self.rng); - &indices[..sample_size] - } else { - &indices - }; + indices.shuffle(&mut self.rng); + let sampled_indices = &indices[..sample_size]; let column = T::try_downcast_column(&column).unwrap(); let sample_values = sampled_indices diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs index 7f478c94b8d43..269aae2e3abea 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -32,6 +32,10 @@ impl ReclusterPartitionStrategy { pub fn new(properties: Arc) -> Self { Self { properties } } + + fn concat_blocks(blocks: Vec) -> Result { + DataBlock::concat(&blocks) + } } impl 
PartitionProcessStrategy for ReclusterPartitionStrategy { @@ -51,22 +55,44 @@ impl PartitionProcessStrategy for ReclusterPartitionStrategy { /// Stream write each block, and flush it conditionally based on builder status /// and input size estimation. fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - let mut input_sizes: usize = data_blocks.iter().map(|b| b.estimate_block_size()).sum(); - let mut input_rows: usize = data_blocks.iter().map(|b| b.num_rows()).sum(); + let blocks_num = data_blocks.len(); + let mut accumulated_rows = 0; + let mut accumulated_bytes = 0; + let mut pending_blocks = Vec::with_capacity(blocks_num); + let mut staged_blocks = Vec::with_capacity(blocks_num); + let mut compacted = Vec::with_capacity(blocks_num); + for block in data_blocks { + accumulated_rows += block.num_rows(); + accumulated_bytes += block.estimate_block_size(); + pending_blocks.push(block); + if !self + .properties + .check_large_enough(accumulated_rows, accumulated_bytes) + { + continue; + } + if !staged_blocks.is_empty() { + compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + std::mem::swap(&mut staged_blocks, &mut pending_blocks); + accumulated_rows = 0; + accumulated_bytes = 0; + } + staged_blocks.append(&mut pending_blocks); + if !staged_blocks.is_empty() { + compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } let mut result = Vec::new(); let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; - for block in data_blocks { - input_sizes -= block.estimate_block_size(); - input_rows -= block.num_rows(); + for block in compacted { builder.write(block)?; - if builder.need_flush() && self.properties.check_large_enough(input_rows, input_sizes) { + if builder.need_flush() { let serialized = builder.finish()?; result.push(DataBlock::empty_with_meta(Box::new(serialized))); builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; } } - if !builder.is_empty() { let serialized = builder.finish()?; result.push(DataBlock::empty_with_meta(Box::new(serialized))); diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs index 3900fd81db6d7..46684b42b31e3 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs @@ -36,10 +36,10 @@ where T: ArgType + Send + Sync, T::Scalar: Ord + Send, { - pub fn new(offset: usize, sample_rate: f64, seed: u64) -> Self { + pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self { Self { input_data: vec![], - sampler: RangeBoundSampler::::new(offset, sample_rate, seed), + sampler: RangeBoundSampler::::new(offset, sample_size, seed), } } } diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 4dbc98b2b1d8a..a686891024a90 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -755,7 +755,7 @@ impl DefaultSettings { range: Some(SettingRange::Numeric(0..=1)), }), ("enable_distributed_compact", DefaultSettingValue { - value: UserSettingValue::UInt64(0), + value: UserSettingValue::UInt64(1), desc: "Enables distributed execution of table compaction.", mode: SettingMode::Both, scope: SettingScope::Both, @@ -870,7 +870,7 @@ impl DefaultSettings { range: 
Some(SettingRange::Numeric(2..=u64::MAX)), }), ("enable_distributed_recluster", DefaultSettingValue { - value: UserSettingValue::UInt64(0), + value: UserSettingValue::UInt64(1), desc: "Enable distributed execution of table recluster.", mode: SettingMode::Both, scope: SettingScope::Both, @@ -1220,9 +1220,9 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(1..=65535)), }), - ("hilbert_sample_size_per_block", DefaultSettingValue { + ("recluster_sample_size_per_block", DefaultSettingValue { value: UserSettingValue::UInt64(1000), - desc: "Specifies the number of sample points per block used in Hilbert clustering.", + desc: "Specifies the number of sample points per block used in clustering.", mode: SettingMode::Both, scope: SettingScope::Both, range: Some(SettingRange::Numeric(1..=u64::MAX)), diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index a3ef08cc9264b..89061370021da 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -889,8 +889,8 @@ impl Settings { self.try_get_u64("hilbert_num_range_ids") } - pub fn get_hilbert_sample_size_per_block(&self) -> Result { - self.try_get_u64("hilbert_sample_size_per_block") + pub fn get_recluster_sample_size_per_block(&self) -> Result { + self.try_get_u64("recluster_sample_size_per_block") } pub fn get_hilbert_clustering_min_bytes(&self) -> Result { diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 50a701c31be93..30d3b2b45543d 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -238,9 +238,9 @@ impl StreamBlockBuilder { pub fn need_flush(&self) -> bool { let file_size = self.block_writer.compressed_size(); self.row_count >= self.properties.block_thresholds.min_rows_per_block - || self.block_size >= self.properties.block_thresholds.max_bytes_per_block + || self.block_size >= self.properties.block_thresholds.min_bytes_per_block * 2 || (file_size >= self.properties.block_thresholds.min_compressed_per_block - && self.block_size >= self.properties.block_thresholds.min_bytes_per_block / 2) + && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) } pub fn write(&mut self, block: DataBlock) -> Result<()> { diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs index 4c0e3cd715227..8df60aa61f03f 100644 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs @@ -160,7 +160,7 @@ fn column_update_hll_cardinality(col: &Column, ty: &DataType, hll: &mut ColumnDi let col = col.as_nullable().unwrap(); for (i, v) in col.validity.iter().enumerate() { if v { - let scalar = col.column.index(i).unwrap(); + let scalar = unsafe { col.column.index_unchecked(i) }; scalar_update_hll_cardinality(&scalar, inner, hll); } } diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index f938768066ce1..fc14e9589071e 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -51,6 +51,9 @@ impl FuseTable { pipeline: &mut Pipeline, table_meta_timestamps: TableMetaTimestamps, ) -> Result<()> { + let block_thresholds = 
self.get_block_thresholds(); + build_compact_block_pipeline(pipeline, block_thresholds)?; + let enable_stream_block_write = ctx.get_settings().get_enable_block_stream_write()? && self.storage_format_as_parquet(); if enable_stream_block_write { @@ -76,22 +79,13 @@ impl FuseTable { } pipeline.add_transform(|input, output| { - TransformBlockBuilder::try_create( - ctx.clone(), - input, - output, - self, - properties.clone(), - ) + TransformBlockBuilder::try_create(input, output, properties.clone()) })?; pipeline.add_async_accumulating_transformer(|| { TransformBlockWriter::create(ctx.clone(), MutationKind::Insert, self, false) }); } else { - let block_thresholds = self.get_block_thresholds(); - build_compact_block_pipeline(pipeline, block_thresholds)?; - let schema = DataSchema::from(&self.schema().remove_virtual_computed_fields()).into(); let cluster_stats_gen = self.cluster_gen_for_append(ctx.clone(), pipeline, block_thresholds, Some(schema))?; diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index f5f61b8001c4d..52a4e309560c8 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -24,7 +24,6 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; -use databend_common_io::constants::DEFAULT_BLOCK_ROW_COUNT; use databend_common_metrics::storage::metrics_inc_recluster_write_block_nums; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::InputPort; @@ -43,7 +42,6 @@ use crate::io::StreamBlockProperties; use crate::operations::MutationLogEntry; use crate::operations::MutationLogs; use crate::FuseTable; -use crate::FUSE_OPT_KEY_ROW_PER_BLOCK; enum State { Consume, @@ -59,7 +57,6 @@ pub struct TransformBlockBuilder { output: Arc, properties: Arc, - max_block_rows: usize, builder: Option, need_flush: bool, @@ -72,16 +69,10 @@ pub struct TransformBlockBuilder { impl TransformBlockBuilder { pub fn try_create( - ctx: Arc, input: Arc, output: Arc, - table: &FuseTable, properties: Arc, ) -> Result { - let max_block_rows = std::cmp::min( - ctx.get_settings().get_max_block_size()? 
as usize, - table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), - ); Ok(ProcessorPtr::create(Box::new(TransformBlockBuilder { state: State::Consume, input, @@ -93,7 +84,6 @@ impl TransformBlockBuilder { input_data_size: 0, input_num_rows: 0, output_data: None, - max_block_rows, }))) } @@ -105,18 +95,6 @@ impl TransformBlockBuilder { } Ok(self.builder.as_mut().unwrap()) } - - fn calc_max_block_rows(&self, block: &DataBlock) -> usize { - let min_bytes_per_block = self.properties.block_thresholds.min_bytes_per_block; - let block_size = block.estimate_block_size(); - if block_size < min_bytes_per_block { - return self.max_block_rows; - } - let num_rows = block.num_rows(); - let average_row_size = block_size.div_ceil(num_rows); - let max_rows = min_bytes_per_block.div_ceil(average_row_size); - self.max_block_rows.min(max_rows) - } } #[async_trait] @@ -191,9 +169,7 @@ impl Processor for TransformBlockBuilder { block.check_valid()?; self.input_data_size += block.estimate_block_size(); self.input_num_rows += block.num_rows(); - let max_rows_per_block = self.calc_max_block_rows(&block); - let blocks = block.split_by_rows_no_tail(max_rows_per_block); - self.input_data.extend(blocks); + self.input_data.push_back(block); } State::Serialize => { while let Some(b) = self.input_data.pop_front() { diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index 85d275bcabef3..792d4f0c1e85d 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -206,9 +206,9 @@ impl ReclusterMutator { settings.get_max_memory_usage()? - GLOBAL_MEM_STAT.get_memory_usage() as u64; let memory_threshold = settings .get_recluster_block_size()? - .min(avail_memory_usage * 50 / 100) as usize; + .min(avail_memory_usage * 40 / 100) as usize; // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. 
- let max_blocks_num = (memory_threshold / self.average_size).max(2) * self.max_tasks; + let mut max_blocks_per_task = (memory_threshold / self.average_size).max(2); let block_per_seg = self.block_thresholds.block_per_segment; // Prepare task generation parameters @@ -276,8 +276,11 @@ impl ReclusterMutator { } // Select blocks for reclustering based on depth threshold and max block size - let mut selected_idx = - self.fetch_max_depth(points_map, self.depth_threshold, max_blocks_num)?; + let mut selected_idx = self.fetch_max_depth( + points_map, + self.depth_threshold, + max_blocks_per_task * self.max_tasks, + )?; if selected_idx.is_empty() { if level != 0 || small_blocks.len() < 2 { continue; @@ -291,13 +294,19 @@ impl ReclusterMutator { let mut task_compressed = 0; let mut task_indices = Vec::new(); let mut selected_blocks = Vec::new(); + if selected_idx.len() > max_blocks_per_task { + max_blocks_per_task = selected_idx.len().div_ceil(self.max_tasks).max(10); + } for idx in selected_idx { let block = blocks[idx].clone(); let block_size = block.block_size as usize; let row_count = block.row_count as usize; + let selected_len = selected_blocks.len(); // If memory threshold exceeded, generate a new task and reset accumulators - if task_bytes + block_size > memory_threshold && selected_blocks.len() > 1 { + if selected_len > max_blocks_per_task + || (task_bytes + block_size > memory_threshold && selected_len > 1) + { selected_blocks_idx.extend(std::mem::take(&mut task_indices)); tasks.push(self.generate_task( diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test index 929e042c13122..176d17124c5d5 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test @@ -512,10 +512,10 @@ select segment_count, block_count from fuse_snapshot('db_09_0008', 't9') limit 2 query I select a from t9 order by a ---- +-5 1 2 4 --5 statement ok insert into t9 values(-3) From ad818da956e16289d0effd37d6be4b34f5e06eff Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 12 Jun 2025 21:27:22 +0800 Subject: [PATCH 25/36] improve recluster partition --- .../pipelines/builders/builder_recluster.rs | 28 +-- .../processors/transforms/recluster/mod.rs | 2 + .../transform_recluster_partition.rs | 230 ++++++++++++++++++ .../processors/transform_block_writer.rs | 1 + .../mutation/mutator/recluster_mutator.rs | 2 +- 5 files changed, 243 insertions(+), 20 deletions(-) create mode 100644 src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index cbcde2069c441..853ccbc2b2fef 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -46,7 +46,6 @@ use databend_common_pipeline_transforms::sort::RowConverter; use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::sort::SimpleRowConverter; use databend_common_pipeline_transforms::sort::SimpleRowsAsc; -use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::evaluator::CompoundBlockOperator; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::Recluster; @@ -61,13 
+60,12 @@ use match_template::match_template; use crate::pipelines::builders::SortPipelineBuilder; use crate::pipelines::processors::transforms::ReclusterPartitionExchange; -use crate::pipelines::processors::transforms::ReclusterPartitionStrategy; use crate::pipelines::processors::transforms::SampleState; use crate::pipelines::processors::transforms::TransformAddOrderColumn; use crate::pipelines::processors::transforms::TransformAddStreamColumns; -use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::processors::transforms::TransformRangePartitionIndexer; use crate::pipelines::processors::transforms::TransformReclusterCollect; +use crate::pipelines::processors::transforms::TransformReclusterPartition; use crate::pipelines::PipelineBuilder; impl PipelineBuilder { @@ -209,23 +207,15 @@ impl PipelineBuilder { ReclusterPartitionExchange::create(0, partitions), ); let processor_id = AtomicUsize::new(0); - let settings = self.ctx.get_settings(); - let memory_settings = MemorySettings::disable_spill(); self.main_pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create(Box::new( - TransformPartitionCollect::new( - self.ctx.clone(), - input, - output, - &settings, - processor_id.fetch_add(1, atomic::Ordering::AcqRel), - num_processors, - partitions, - memory_settings.clone(), - None, - ReclusterPartitionStrategy::new(properties.clone()), - )?, - ))) + TransformReclusterPartition::try_create( + input, + output, + properties.clone(), + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partitions, + ) })?; self.main_pipeline.add_async_accumulating_transformer(|| { diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index 0f3612043c7d9..a024e330be25b 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -19,6 +19,7 @@ mod recluster_sample_state; mod transform_add_order_column; mod transform_range_partition_indexer; mod transform_recluster_collect; +mod transform_recluster_partition; pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; @@ -29,3 +30,4 @@ pub use transform_add_order_column::TransformAddOrderColumn; pub use transform_range_partition_indexer::TransformRangePartitionIndexer; pub use transform_recluster_collect::ReclusterSampleMeta; pub use transform_recluster_collect::TransformReclusterCollect; +pub use transform_recluster_partition::TransformReclusterPartition; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs new file mode 100644 index 0000000000000..9bec274cb90f6 --- /dev/null +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs @@ -0,0 +1,230 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::collections::VecDeque; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; +use databend_common_expression::DataBlock; +use databend_common_pipeline_core::processors::Event; +use databend_common_pipeline_core::processors::InputPort; +use databend_common_pipeline_core::processors::OutputPort; +use databend_common_pipeline_core::processors::Processor; +use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_storages_fuse::io::StreamBlockBuilder; +use databend_common_storages_fuse::io::StreamBlockProperties; + +use crate::pipelines::processors::transforms::WindowPartitionMeta; + +enum Step { + Consume, + Collect, + Flush, +} + +struct PartitionData { + builder: Option, + data_blocks: Vec, + block_size: usize, + block_rows: usize, +} + +impl PartitionData { + fn new() -> Self { + Self { + builder: None, + data_blocks: vec![], + block_size: 0, + block_rows: 0, + } + } + + fn is_empty(&self) -> bool { + self.builder.as_ref().is_none_or(|v| v.is_empty()) && self.data_blocks.is_empty() + } +} + +pub struct TransformReclusterPartition { + input: Arc, + output: Arc, + + properties: Arc, + + // The partition id is used to map the partition id to the new partition id. + partition_id: Vec, + partition_data: Vec, + output_data: VecDeque, + + step: Step, +} + +impl TransformReclusterPartition { + pub fn try_create( + input: Arc, + output: Arc, + properties: Arc, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Result { + let partitions = (0..num_partitions) + .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) + .collect::>(); + let mut partition_id = vec![0; num_partitions]; + let mut partition_data = Vec::with_capacity(num_partitions); + for (new_partition_id, partition) in partitions.iter().enumerate() { + partition_id[*partition] = new_partition_id; + partition_data.push(PartitionData::new()); + } + Ok(ProcessorPtr::create(Box::new( + TransformReclusterPartition { + input, + output, + properties, + partition_id, + partition_data, + output_data: VecDeque::new(), + step: Step::Consume, + }, + ))) + } +} + +impl Processor for TransformReclusterPartition { + fn name(&self) -> String { + "TransformReclusterPartition".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if matches!(self.step, Step::Collect | Step::Flush) { + return Ok(Event::Sync); + } + + if self.output.is_finished() { + self.input.finish(); + return Ok(Event::Finished); + } + + if !self.output.can_push() { + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.pop_front() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.is_finished() { + if !self.partition_data.is_empty() { + self.step = Step::Flush; + return Ok(Event::Sync); + } + self.output.finish(); + return Ok(Event::Finished); + } + + if self.input.has_data() { + self.step = Step::Collect; + return Ok(Event::Sync); + } + + self.input.set_need_data(); + Ok(Event::NeedData) + } + + fn process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.step, Step::Consume) { + Step::Collect => { + let data_block = self.input.pull_data().unwrap()?; + if let Some(meta) = data_block + .get_owned_meta() + .and_then(WindowPartitionMeta::downcast_from) + { + for 
(partition_id, data_block) in meta.partitioned_data.into_iter() { + if data_block.is_empty() { + continue; + } + + let new_id = self.partition_id[partition_id]; + let partition_data = + unsafe { self.partition_data.get_unchecked_mut(new_id) }; + if partition_data.builder.is_none() { + partition_data.builder = Some(StreamBlockBuilder::try_new_with_config( + self.properties.clone(), + )?); + } + let builder = partition_data.builder.as_mut().unwrap(); + if !builder.need_flush() { + builder.write(data_block)?; + } else { + partition_data.block_size += data_block.estimate_block_size(); + partition_data.block_rows += data_block.num_rows(); + partition_data.data_blocks.push(data_block); + + if self.properties.check_large_enough( + partition_data.block_rows, + partition_data.block_size, + ) { + let builder = partition_data.builder.take().unwrap(); + let serialized = builder.finish()?; + self.output_data + .push_back(DataBlock::empty_with_meta(Box::new(serialized))); + + let mut builder = StreamBlockBuilder::try_new_with_config( + self.properties.clone(), + )?; + for block in + std::mem::take(&mut partition_data.data_blocks).into_iter() + { + builder.write(block)?; + } + partition_data.builder = Some(builder); + partition_data.block_rows = 0; + partition_data.block_size = 0; + } + } + } + } + } + Step::Flush => { + while let Some(mut partition_data) = self.partition_data.pop() { + if partition_data.is_empty() { + continue; + } + + let mut builder = if partition_data.builder.is_none() { + StreamBlockBuilder::try_new_with_config(self.properties.clone())? + } else { + partition_data.builder.take().unwrap() + }; + for block in partition_data.data_blocks { + builder.write(block)?; + } + let serialized = builder.finish()?; + self.output_data + .push_back(DataBlock::empty_with_meta(Box::new(serialized))); + break; + } + } + _ => unreachable!(), + } + Ok(()) + } +} diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index 52a4e309560c8..ea5b4b9dc5c0c 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -116,6 +116,7 @@ impl Processor for TransformBlockBuilder { } if self.output.is_finished() { + self.input.finish(); return Ok(Event::Finished); } diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index 792d4f0c1e85d..be706f9d123b9 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -206,7 +206,7 @@ impl ReclusterMutator { settings.get_max_memory_usage()? - GLOBAL_MEM_STAT.get_memory_usage() as u64; let memory_threshold = settings .get_recluster_block_size()? - .min(avail_memory_usage * 40 / 100) as usize; + .min(avail_memory_usage * 30 / 100) as usize; // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. 
let mut max_blocks_per_task = (memory_threshold / self.average_size).max(2); let block_per_seg = self.block_thresholds.block_per_segment; From 22c37b30acd14ea99aa069aa93b236786282d0a1 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 13 Jun 2025 11:36:08 +0800 Subject: [PATCH 26/36] for test --- .../recluster/recluster_partition_exchange.rs | 3 +++ .../recluster/transform_range_partition_indexer.rs | 5 +++++ .../recluster/transform_recluster_partition.rs | 10 ++++++++++ 3 files changed, 18 insertions(+) diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index dd5257850ac9f..9e25119f2d15a 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::sync::Arc; +use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::DataBlock; @@ -34,6 +35,7 @@ impl ReclusterPartitionExchange { impl Exchange for ReclusterPartitionExchange { const NAME: &'static str = "Recluster"; fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { + let start = Instant::now(); let range_ids = data_block .get_last_column() .as_number() @@ -58,6 +60,7 @@ impl Exchange for ReclusterPartitionExchange { output_data_blocks[target].push((partition_id, block)); } } + log::info!("Recluster range exchange: {:?}", start.elapsed()); // Union data blocks for each processor. Ok(output_data_blocks diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 215e0c977eb8f..96027933631cb 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -15,6 +15,7 @@ use std::any::Any; use std::collections::VecDeque; use std::sync::Arc; +use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::types::ArgType; @@ -45,6 +46,8 @@ where T: ArgType output_data: VecDeque, bounds: Vec, max_value: Option, + + start: Instant, } impl TransformRangePartitionIndexer @@ -65,6 +68,7 @@ where output_data: VecDeque::new(), bounds: vec![], max_value: None, + start: Instant::now(), }) } } @@ -121,6 +125,7 @@ where .expect("require a ReclusterSampleMeta"); self.input_data = meta.blocks; self.state.merge_sample::(meta.sample_values)?; + log::info!("Recluster range partition: {:?}", self.start.elapsed()); Ok(Event::Async) } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs index 9bec274cb90f6..1f8749dd4e647 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs @@ -15,6 +15,7 @@ use std::any::Any; use std::collections::VecDeque; use std::sync::Arc; +use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::BlockMetaInfoDowncast; @@ -68,6 +69,9 @@ 
pub struct TransformReclusterPartition { partition_data: Vec, output_data: VecDeque, + start: Instant, + cnt: usize, + step: Step, } @@ -98,6 +102,8 @@ impl TransformReclusterPartition { partition_data, output_data: VecDeque::new(), step: Step::Consume, + start: Instant::now(), + cnt: 0, }, ))) } @@ -133,6 +139,10 @@ impl Processor for TransformReclusterPartition { if self.input.is_finished() { if !self.partition_data.is_empty() { + if self.cnt == 0 { + log::info!("Recluster: start flush: {:?}", self.start.elapsed()); + } + self.cnt += 1; self.step = Step::Flush; return Ok(Event::Sync); } From 27fa7db6685d3aef628f918318eb548151170388 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 13 Jun 2025 13:20:16 +0800 Subject: [PATCH 27/36] for test --- .../recluster/recluster_partition_exchange.rs | 10 +++++--- .../transform_recluster_partition.rs | 6 +++++ .../mode/standalone/explain/window.test | 24 +++++++++---------- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index 9e25119f2d15a..444c81296de26 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -24,18 +24,22 @@ use crate::pipelines::processors::transforms::WindowPartitionMeta; pub struct ReclusterPartitionExchange { start: u64, width: usize, + start_time: Instant, } impl ReclusterPartitionExchange { pub fn create(start: u64, width: usize) -> Arc { - Arc::new(ReclusterPartitionExchange { start, width }) + Arc::new(ReclusterPartitionExchange { + start, + width, + start_time: Instant::now(), + }) } } impl Exchange for ReclusterPartitionExchange { const NAME: &'static str = "Recluster"; fn partition(&self, mut data_block: DataBlock, n: usize) -> Result> { - let start = Instant::now(); let range_ids = data_block .get_last_column() .as_number() @@ -60,7 +64,7 @@ impl Exchange for ReclusterPartitionExchange { output_data_blocks[target].push((partition_id, block)); } } - log::info!("Recluster range exchange: {:?}", start.elapsed()); + log::info!("Recluster range exchange: {:?}", self.start_time.elapsed()); // Union data blocks for each processor. 
Ok(output_data_blocks diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs index 1f8749dd4e647..5a3e19d2b0e3e 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_partition.rs @@ -162,6 +162,7 @@ impl Processor for TransformReclusterPartition { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.step, Step::Consume) { Step::Collect => { + let start_cost = self.start.elapsed(); let data_block = self.input.pull_data().unwrap()?; if let Some(meta) = data_block .get_owned_meta() @@ -212,6 +213,11 @@ impl Processor for TransformReclusterPartition { } } } + log::info!( + "Recluster: start collect: {:?}, end: {:?}", + start_cost, + self.start.elapsed() + ); } Step::Flush => { while let Some(mut partition_data) = self.partition_data.pop() { diff --git a/tests/sqllogictests/suites/mode/standalone/explain/window.test b/tests/sqllogictests/suites/mode/standalone/explain/window.test index 9846febbd9819..11943d90e42eb 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/window.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/window.test @@ -59,7 +59,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "Resize" ] 8 [ label = "SortPartialTransform" ] @@ -108,7 +108,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "Resize" ] 8 [ label = "SortPartialTransform" ] @@ -429,7 +429,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -457,7 +457,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -486,7 +486,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -510,7 +510,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = 
"LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -534,7 +534,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "LimitTransform" ] 8 [ label = "CompoundBlockOperator(Project)" ] @@ -559,7 +559,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(Window)" ] 4 [ label = "ShuffleMergePartition(Window)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "Resize" ] 8 [ label = "SortPartialTransform" ] @@ -610,7 +610,7 @@ digraph { 4 [ label = "TransformFilter" ] 5 [ label = "ShufflePartition(Window)" ] 6 [ label = "ShuffleMergePartition(Window)" ] - 7 [ label = "TransformWindowPartitionCollect(Sort)" ] + 7 [ label = "TransformPartitionCollect(Window)" ] 8 [ label = "Transform Window" ] 9 [ label = "Resize" ] 10 [ label = "SortPartialTransform" ] @@ -708,7 +708,7 @@ digraph { 2 [ label = "DeserializeDataTransform" ] 3 [ label = "ShufflePartition(WindowTopN)" ] 4 [ label = "ShuffleMergePartition(WindowTopN)" ] - 5 [ label = "TransformWindowPartitionCollect(Sort)" ] + 5 [ label = "TransformPartitionCollect(Window)" ] 6 [ label = "Transform Window" ] 7 [ label = "TransformFilter" ] 8 [ label = "LimitTransform" ] @@ -786,12 +786,12 @@ digraph { 1 [ label = "CompoundBlockOperator(Map)" ] 2 [ label = "ShufflePartition(Window)" ] 3 [ label = "ShuffleMergePartition(Window)" ] - 4 [ label = "TransformWindowPartitionCollect(Sort)" ] + 4 [ label = "TransformPartitionCollect(Window)" ] 5 [ label = "Transform Window" ] 6 [ label = "CompoundBlockOperator(Map)" ] 7 [ label = "ShufflePartition(Window)" ] 8 [ label = "ShuffleMergePartition(Window)" ] - 9 [ label = "TransformWindowPartitionCollect(Sort)" ] + 9 [ label = "TransformPartitionCollect(Window)" ] 10 [ label = "Transform Window" ] 11 [ label = "CompoundBlockOperator(Project)" ] 0 -> 1 [ label = "" ] From 8875f5372b05a66290fdd08d770beb6370a51200 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 14 Jun 2025 00:49:24 +0800 Subject: [PATCH 28/36] for test --- .../transform_range_partition_indexer.rs | 68 ++++++++++--------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 96027933631cb..0f8439140f66c 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -130,42 +130,46 @@ where } fn process(&mut self) -> Result<()> { - if let Some(mut block) = self.input_data.pop() { - let bound_len = self.bounds.len(); - let num_rows = block.num_rows(); - let last = block.get_last_column().clone(); - block.pop_columns(1); - let mut builder = Vec::with_capacity(num_rows); - let last_col = T::try_downcast_column(&last.remove_nullable()).unwrap(); - for index in 0..num_rows { - let val = - T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); - if self.max_value.as_ref().is_some_and(|v| val >= *v) { - let range_id = bound_len + 1; - builder.push(range_id as u64); - continue; 
- } + let start = Instant::now(); + let mut block = { + let blocks = std::mem::take(&mut self.input_data); + DataBlock::concat(&blocks)? + }; + + let bound_len = self.bounds.len(); + let num_rows = block.num_rows(); + let last = block.get_last_column().clone(); + block.pop_columns(1); + let mut builder = Vec::with_capacity(num_rows); + let last_col = T::try_downcast_column(&last.remove_nullable()).unwrap(); + for index in 0..num_rows { + let val = T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); + if self.max_value.as_ref().is_some_and(|v| val >= *v) { + let range_id = bound_len + 1; + builder.push(range_id as u64); + continue; + } - let mut low = 0; - let mut high = bound_len; - while low < high { - let mid = low + ((high - low) / 2); - let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); - if val > bound { - low = mid + 1; - } else { - high = mid; - } + let mut low = 0; + let mut high = bound_len; + while low < high { + let mid = low + ((high - low) / 2); + let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); + if val > bound { + low = mid + 1; + } else { + high = mid; } - builder.push(low as u64); } - - block.add_column(BlockEntry::new( - DataType::Number(NumberDataType::UInt64), - Value::Column(UInt64Type::from_data(builder)), - )); - self.output_data.push_back(block); + builder.push(low as u64); } + + block.add_column(BlockEntry::new( + DataType::Number(NumberDataType::UInt64), + Value::Column(UInt64Type::from_data(builder)), + )); + self.output_data.push_back(block); + log::info!("Recluster range output: {:?}", start.elapsed()); Ok(()) } From c807783badd52faff80da0e1261e27f5e61baa80 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 14 Jun 2025 02:36:55 +0800 Subject: [PATCH 29/36] fix --- .../transforms/recluster/range_bound_sampler.rs | 2 +- .../transforms/recluster/transform_add_order_column.rs | 7 +------ .../recluster/transform_range_partition_indexer.rs | 9 +-------- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs index a76417256416f..8e0afd6e647c3 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs @@ -56,7 +56,7 @@ where pub fn add_block(&mut self, data: &DataBlock) { let rows = data.num_rows(); assert!(rows > 0); - let column = data.get_by_offset(self.offset).to_column(rows); + let column = data.get_by_offset(self.offset).to_column(); let sample_size = std::cmp::min(self.sample_size, rows); let mut indices = (0..rows).collect::>(); diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs index 7b40593e887c3..f90458a8c44b6 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs @@ -16,11 +16,9 @@ use std::marker::PhantomData; use std::sync::Arc; use databend_common_exception::Result; -use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::SortColumnDescription; -use databend_common_expression::Value; use 
databend_common_pipeline_transforms::sort::RowConverter; use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::Transform; @@ -63,10 +61,7 @@ where .row_converter .convert(&order_by_cols, data.num_rows())?; let order_col = rows.to_column(); - data.add_column(BlockEntry { - data_type: order_col.data_type(), - value: Value::Column(order_col), - }); + data.add_column(order_col); Ok(data) } } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 0f8439140f66c..ea2c4983ffb42 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -19,14 +19,10 @@ use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::types::ArgType; -use databend_common_expression::types::DataType; -use databend_common_expression::types::NumberDataType; use databend_common_expression::types::UInt64Type; -use databend_common_expression::BlockEntry; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; use databend_common_expression::FromData; -use databend_common_expression::Value; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; @@ -164,10 +160,7 @@ where builder.push(low as u64); } - block.add_column(BlockEntry::new( - DataType::Number(NumberDataType::UInt64), - Value::Column(UInt64Type::from_data(builder)), - )); + block.add_column(UInt64Type::from_data(builder)); self.output_data.push_back(block); log::info!("Recluster range output: {:?}", start.elapsed()); Ok(()) From cb50347e2be7ec37205fa98bd7cf189c74fc29f6 Mon Sep 17 00:00:00 2001 From: zhyass Date: Sun, 15 Jun 2025 12:51:48 +0800 Subject: [PATCH 30/36] for test --- .../pipelines/builders/builder_recluster.rs | 145 ++++++++---------- .../processors/transforms/recluster/mod.rs | 1 + .../recluster/range_bound_sampler.rs | 33 +--- .../recluster/recluster_partition_strategy.rs | 88 +++++++++++ .../recluster/recluster_sample_state.rs | 73 +++------ .../recluster/transform_add_order_column.rs | 28 +--- .../transform_range_partition_indexer.rs | 78 ++++------ .../recluster/transform_recluster_collect.rs | 26 +--- src/query/settings/src/settings_default.rs | 7 + .../settings/src/settings_getter_setter.rs | 4 + .../src/io/write/stream/cluster_statistics.rs | 56 +++---- .../storages/fuse/src/operations/append.rs | 36 +++-- .../fuse/src/statistics/cluster_statistics.rs | 2 +- 13 files changed, 287 insertions(+), 290 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index 853ccbc2b2fef..5af820ccda385 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -20,16 +20,6 @@ use databend_common_catalog::plan::DataSourceInfo; use databend_common_catalog::plan::DataSourcePlan; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::row::RowConverter as CommonConverter; -use databend_common_expression::types::AccessType; -use databend_common_expression::types::ArgType; -use 
databend_common_expression::types::DataType; -use databend_common_expression::types::DateType; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::TimestampType; -use databend_common_expression::with_number_mapped_type; use databend_common_expression::DataSchemaRef; use databend_common_expression::DataSchemaRefExt; use databend_common_expression::SortColumnDescription; @@ -41,28 +31,27 @@ use databend_common_pipeline_core::Pipeline; use databend_common_pipeline_sources::EmptySource; use databend_common_pipeline_transforms::processors::build_compact_block_no_split_pipeline; use databend_common_pipeline_transforms::processors::TransformPipelineHelper; -use databend_common_pipeline_transforms::sort::CommonRows; -use databend_common_pipeline_transforms::sort::RowConverter; -use databend_common_pipeline_transforms::sort::Rows; -use databend_common_pipeline_transforms::sort::SimpleRowConverter; -use databend_common_pipeline_transforms::sort::SimpleRowsAsc; +use databend_common_pipeline_transforms::sort::utils::add_order_field; +use databend_common_pipeline_transforms::MemorySettings; use databend_common_sql::evaluator::CompoundBlockOperator; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::StreamContext; use databend_common_storages_factory::Table; use databend_common_storages_fuse::io::StreamBlockProperties; +use databend_common_storages_fuse::operations::TransformBlockBuilder; use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::TableContext; -use match_template::match_template; use crate::pipelines::builders::SortPipelineBuilder; use crate::pipelines::processors::transforms::ReclusterPartitionExchange; +use crate::pipelines::processors::transforms::ReclusterPartitionStrategys; use crate::pipelines::processors::transforms::SampleState; use crate::pipelines::processors::transforms::TransformAddOrderColumn; use crate::pipelines::processors::transforms::TransformAddStreamColumns; +use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::processors::transforms::TransformRangePartitionIndexer; use crate::pipelines::processors::transforms::TransformReclusterCollect; use crate::pipelines::processors::transforms::TransformReclusterPartition; @@ -172,9 +161,7 @@ impl PipelineBuilder { }); } - let fields_with_cluster_key = properties.fields_with_cluster_key(); - let schema = DataSchemaRefExt::create(fields_with_cluster_key); - let sort_descs: Vec<_> = properties + let sort_desc: Vec<_> = properties .cluster_key_index() .iter() .map(|&offset| SortColumnDescription { @@ -183,6 +170,10 @@ impl PipelineBuilder { nulls_first: false, }) .collect(); + let fields_with_cluster_key = properties.fields_with_cluster_key(); + let schema = DataSchemaRefExt::create(fields_with_cluster_key); + let schema = add_order_field(schema, &sort_desc); + let order_offset = schema.fields.len() - 1; let num_processors = self.main_pipeline.output_len(); let sample_size = self @@ -196,9 +187,12 @@ impl PipelineBuilder { task.total_compressed, ); let state = SampleState::new(num_processors, partitions); - let recluster_pipeline_builder = - ReclusterPipelineBuilder::create(schema, 
sort_descs.into(), sample_size) - .with_state(state); + let recluster_pipeline_builder = ReclusterPipelineBuilder::create( + schema.clone(), + sort_desc.clone(), + sample_size, + ) + .with_state(state); recluster_pipeline_builder .build_recluster_sample_pipeline(&mut self.main_pipeline)?; @@ -207,16 +201,46 @@ impl PipelineBuilder { ReclusterPartitionExchange::create(0, partitions), ); let processor_id = AtomicUsize::new(0); - self.main_pipeline.add_transform(|input, output| { - TransformReclusterPartition::try_create( - input, - output, - properties.clone(), - processor_id.fetch_add(1, atomic::Ordering::AcqRel), - num_processors, - partitions, - ) - })?; + + let settings = self.ctx.get_settings(); + let enable_writings = settings.get_enable_block_stream_writes()?; + if enable_writings { + let memory_settings = MemorySettings::disable_spill(); + self.main_pipeline.add_transform(|input, output| { + let strategy = + ReclusterPartitionStrategys::new(properties.clone(), order_offset); + + Ok(ProcessorPtr::create(Box::new( + TransformPartitionCollect::new( + self.ctx.clone(), + input, + output, + &settings, + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partitions, + memory_settings.clone(), + None, + strategy, + )?, + ))) + })?; + + self.main_pipeline.add_transform(|input, output| { + TransformBlockBuilder::try_create(input, output, properties.clone()) + })?; + } else { + self.main_pipeline.add_transform(|input, output| { + TransformReclusterPartition::try_create( + input, + output, + properties.clone(), + processor_id.fetch_add(1, atomic::Ordering::AcqRel), + num_processors, + partitions, + ) + })?; + } self.main_pipeline.add_async_accumulating_transformer(|| { TransformBlockWriter::create( @@ -249,7 +273,7 @@ impl PipelineBuilder { // construct output fields let output_fields = cluster_stats_gen.out_fields.clone(); let schema = DataSchemaRefExt::create(output_fields); - let sort_descs: Vec<_> = cluster_stats_gen + let sort_desc: Vec<_> = cluster_stats_gen .cluster_key_index .iter() .map(|offset| SortColumnDescription { @@ -267,10 +291,9 @@ impl PipelineBuilder { ); let sort_pipeline_builder = - SortPipelineBuilder::create(self.ctx.clone(), schema, sort_descs.into())? + SortPipelineBuilder::create(self.ctx.clone(), schema, sort_desc.into())? .with_block_size_hit(sort_block_size) .remove_order_col_at_last(); - // Todo(zhyass): Recluster will no longer perform sort in the near future. sort_pipeline_builder.build_full_sort_pipeline(&mut self.main_pipeline)?; // Compact after merge sort. @@ -306,7 +329,7 @@ impl PipelineBuilder { struct ReclusterPipelineBuilder { schema: DataSchemaRef, - sort_desc: Arc<[SortColumnDescription]>, + sort_desc: Vec, state: Option>, sample_size: usize, seed: u64, @@ -315,7 +338,7 @@ struct ReclusterPipelineBuilder { impl ReclusterPipelineBuilder { fn create( schema: DataSchemaRef, - sort_desc: Arc<[SortColumnDescription]>, + sort_desc: Vec, sample_size: usize, ) -> Self { Self { @@ -339,53 +362,17 @@ impl ReclusterPipelineBuilder { } fn build_recluster_sample_pipeline(&self, pipeline: &mut Pipeline) -> Result<()> { - match self.sort_desc.as_ref() { - [desc] => { - let schema = self.schema.clone(); - let sort_type = schema.field(desc.offset).data_type(); - assert!(desc.asc); - - match_template! 
{ - T = [ Date => DateType, Timestamp => TimestampType, String => StringType ], - match sort_type { - DataType::T => { - self.visit_type::, SimpleRowConverter>(pipeline) - }, - DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty { - NumberDataType::NUM_TYPE => { - self.visit_type::>, SimpleRowConverter>>(pipeline) - } - }), - _ => self.visit_type::(pipeline) - } - } - } - _ => self.visit_type::(pipeline), - } - } - - fn visit_type(&self, pipeline: &mut Pipeline) -> Result<()> - where - R: Rows + 'static, - C: RowConverter + Send + 'static, - R::Type: ArgType + Send + Sync, - ::Scalar: Ord + Send + Sync, - { pipeline.try_add_transformer(|| { - TransformAddOrderColumn::::try_new(self.sort_desc.clone(), self.schema.clone()) + TransformAddOrderColumn::try_new(self.sort_desc.clone(), self.schema.clone()) })?; - let offset = self.schema.num_fields(); + let offset = self.schema.num_fields() - 1; pipeline.add_accumulating_transformer(|| { - TransformReclusterCollect::::new(offset, self.sample_size, self.seed) + TransformReclusterCollect::new(offset, self.sample_size, self.seed) }); pipeline.add_transform(|input, output| { - Ok(ProcessorPtr::create(TransformRangePartitionIndexer::< - R::Type, - >::create( - input, - output, - self.state.clone().unwrap(), - ))) + Ok(ProcessorPtr::create( + TransformRangePartitionIndexer::create(input, output, self.state.clone().unwrap()), + )) }) } } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index a024e330be25b..b87be1f1e4d51 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -25,6 +25,7 @@ pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; pub use recluster_partition_strategy::CompactPartitionStrategy; pub use recluster_partition_strategy::ReclusterPartitionStrategy; +pub use recluster_partition_strategy::ReclusterPartitionStrategys; pub use recluster_sample_state::SampleState; pub use transform_add_order_column::TransformAddOrderColumn; pub use transform_range_partition_indexer::TransformRangePartitionIndexer; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs index 8e0afd6e647c3..b3fe9a77a4660 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs @@ -12,30 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::marker::PhantomData; - -use databend_common_expression::types::ArgType; -use databend_common_expression::types::ValueType; use databend_common_expression::DataBlock; -use databend_common_expression::Scalar; use rand::prelude::SliceRandom; use rand::prelude::SmallRng; use rand::SeedableRng; -pub struct RangeBoundSampler -where T: ValueType -{ +pub struct RangeBoundSampler { offset: usize, sample_size: usize, rng: SmallRng, - values: Vec<(u64, Vec)>, - _t: PhantomData, + values: Vec<(u64, Vec>)>, } -impl RangeBoundSampler -where T: ValueType -{ +impl RangeBoundSampler { pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self { let rng = SmallRng::seed_from_u64(seed); Self { @@ -43,16 +33,11 @@ where T: ValueType sample_size, rng, values: vec![], - _t: PhantomData, } } } -impl RangeBoundSampler -where - T: ArgType, - T::Scalar: Ord + Send, -{ +impl RangeBoundSampler { pub fn add_block(&mut self, data: &DataBlock) { let rows = data.num_rows(); assert!(rows > 0); @@ -63,19 +48,15 @@ where indices.shuffle(&mut self.rng); let sampled_indices = &indices[..sample_size]; - let column = T::try_downcast_column(&column).unwrap(); + let column = column.as_binary().unwrap(); let sample_values = sampled_indices .iter() - .map(|i| { - T::upcast_scalar(T::to_owned_scalar(unsafe { - T::index_column_unchecked(&column, *i) - })) - }) + .map(|i| unsafe { column.index_unchecked(*i) }.to_vec()) .collect::>(); self.values.push((rows as u64, sample_values)); } - pub fn sample_values(&mut self) -> Vec<(u64, Vec)> { + pub fn sample_values(&mut self) -> Vec<(u64, Vec>)> { std::mem::take(&mut self.values) } } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs index 269aae2e3abea..fb10cb0caec08 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -16,6 +16,8 @@ use std::sync::Arc; use databend_common_exception::Result; use databend_common_expression::DataBlock; +use databend_common_expression::LimitType; +use databend_common_expression::SortColumnDescription; use databend_common_storages_fuse::io::StreamBlockBuilder; use databend_common_storages_fuse::io::StreamBlockProperties; @@ -175,3 +177,89 @@ impl PartitionProcessStrategy for CompactPartitionStrategy { Ok(result) } } + +pub struct ReclusterPartitionStrategys { + properties: Arc, + sort_desc: Vec, +} + +impl ReclusterPartitionStrategys { + pub fn new(properties: Arc, offset: usize) -> Self { + Self { + properties, + sort_desc: vec![SortColumnDescription { + offset, + asc: true, + nulls_first: false, + }], + } + } + + fn concat_blocks(blocks: Vec) -> Result { + DataBlock::concat(&blocks) + } +} + +impl PartitionProcessStrategy for ReclusterPartitionStrategys { + const NAME: &'static str = "Recluster"; + + fn calc_partitions( + &self, + processor_id: usize, + num_processors: usize, + num_partitions: usize, + ) -> Vec { + (0..num_partitions) + .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) + .collect() + } + + /// Stream write each block, and flush it conditionally based on builder status + /// and input size estimation. 
+ fn process_data_blocks(&self, data_blocks: Vec) -> Result> { + let blocks_num = data_blocks.len(); + let mut accumulated_rows = 0; + let mut accumulated_bytes = 0; + let mut pending_blocks = Vec::with_capacity(blocks_num); + let mut staged_blocks = Vec::with_capacity(blocks_num); + let mut compacted = Vec::with_capacity(blocks_num); + for block in data_blocks { + accumulated_rows += block.num_rows(); + accumulated_bytes += block.estimate_block_size(); + pending_blocks.push(block); + if !self + .properties + .check_large_enough(accumulated_rows, accumulated_bytes) + { + continue; + } + if !staged_blocks.is_empty() { + compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + std::mem::swap(&mut staged_blocks, &mut pending_blocks); + accumulated_rows = 0; + accumulated_bytes = 0; + } + staged_blocks.append(&mut pending_blocks); + if !staged_blocks.is_empty() { + compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + } + + let mut result = Vec::new(); + let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; + for block in compacted { + let block = DataBlock::sort_with_type(&block, &self.sort_desc, LimitType::None)?; + builder.write(block)?; + if builder.need_flush() { + let serialized = builder.finish()?; + result.push(DataBlock::empty_with_meta(Box::new(serialized))); + builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; + } + } + if !builder.is_empty() { + let serialized = builder.finish()?; + result.push(DataBlock::empty_with_meta(Box::new(serialized))); + } + Ok(result) + } +} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs index 12d50653b8b68..35e22321339d2 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::intrinsics::unlikely; use std::sync::Arc; use std::sync::RwLock; use databend_common_base::base::WatchNotify; use databend_common_exception::Result; use databend_common_expression::compare_columns; -use databend_common_expression::types::ArgType; -use databend_common_expression::Scalar; +use databend_common_expression::types::BinaryType; +use databend_common_expression::FromData; pub struct SampleState { pub inner: RwLock, @@ -41,38 +42,21 @@ impl SampleState { }) } - pub fn merge_sample(&self, values: Vec<(u64, Vec)>) -> Result<()> - where - T: ArgType, - T::Scalar: Ord, - { + pub fn merge_sample(&self, values: Vec<(u64, Vec>)>) -> Result<()> { let mut inner = self.inner.write().unwrap(); inner.completed_inputs += 1; inner.values.extend_from_slice(&values); if inner.completed_inputs >= inner.total_inputs { - inner.determine_bounds::()?; + inner.determine_bounds()?; self.done.notify_waiters(); } Ok(()) } - pub fn get_bounds(&self) -> (Vec, Option) - where - T: ArgType, - T::Scalar: Ord, - { + pub fn get_bounds(&self) -> (Vec>, Option>) { let inner = self.inner.read().unwrap(); - let bounds = inner - .bounds - .iter() - .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())) - .collect(); - let max_value = inner - .max_value - .as_ref() - .map(|v| T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap())); - (bounds, max_value) + (inner.bounds.clone(), inner.max_value.clone()) } } @@ -81,18 +65,14 @@ pub struct SampleStateInner { total_inputs: usize, completed_inputs: usize, - bounds: Vec, - max_value: Option, + bounds: Vec>, + max_value: Option>, - values: Vec<(u64, Vec)>, + values: Vec<(u64, Vec>)>, } impl SampleStateInner { - fn determine_bounds(&mut self) -> Result<()> - where - T: ArgType, - T::Scalar: Ord, - { + fn determine_bounds(&mut self) -> Result<()> { if self.partitions < 2 { return Ok(()); } @@ -111,16 +91,15 @@ impl SampleStateInner { for (num, values) in values.into_iter() { let weight = num as f64 / values.len() as f64; values.into_iter().for_each(|v| { - let val = T::to_owned_scalar(T::try_downcast_scalar(&v.as_ref()).unwrap()); - data.push(val); + data.push(v); weights.push(weight); }); } - let col = T::upcast_column(T::column_from_vec(data.clone(), &[])); + let col = BinaryType::from_data(data.clone()); let indices = compare_columns(vec![col], total_samples)?; let max_index = indices[total_samples - 1] as usize; - let max_val = data[max_index].clone(); + let max_val = &data[max_index]; let mut cum_weight = 0.0; let mut target = step; @@ -131,22 +110,20 @@ impl SampleStateInner { let mut j = 0; while i < total_samples && j < self.partitions - 1 { let idx = indices[i] as usize; + let value = &data[idx]; let weight = weights[idx]; cum_weight += weight; - if cum_weight >= target { - let data = &data[idx]; - if previous_bound.as_ref().is_none_or(|prev| data > prev) { - if data == &max_val { - self.max_value = Some(T::upcast_scalar(max_val)); - break; - } - - let bound = T::upcast_scalar(data.clone()); - bounds.push(bound); - target += step; - j += 1; - previous_bound = Some(data.clone()); + + if cum_weight >= target && previous_bound.map_or(true, |prev| value > prev) { + if unlikely(value == max_val) { + self.max_value = Some(max_val.clone()); + break; } + + bounds.push(value.clone()); + previous_bound = Some(value); + target += step; + j += 1; } i += 1; } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs 
b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs index f90458a8c44b6..b3fa11ba7ddda 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_add_order_column.rs @@ -12,10 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::marker::PhantomData; -use std::sync::Arc; - use databend_common_exception::Result; +use databend_common_expression::row::RowConverter as CommonConverter; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::SortColumnDescription; @@ -23,32 +21,22 @@ use databend_common_pipeline_transforms::sort::RowConverter; use databend_common_pipeline_transforms::sort::Rows; use databend_common_pipeline_transforms::Transform; -pub struct TransformAddOrderColumn { - row_converter: C, - sort_desc: Arc<[SortColumnDescription]>, - _r: PhantomData, +pub struct TransformAddOrderColumn { + row_converter: CommonConverter, + sort_desc: Vec, } -impl TransformAddOrderColumn -where - R: Rows, - C: RowConverter, -{ - pub fn try_new(sort_desc: Arc<[SortColumnDescription]>, schema: DataSchemaRef) -> Result { - let row_converter = C::create(&sort_desc, schema.clone())?; +impl TransformAddOrderColumn { + pub fn try_new(sort_desc: Vec, schema: DataSchemaRef) -> Result { + let row_converter = CommonConverter::create(&sort_desc, schema.clone())?; Ok(Self { row_converter, sort_desc, - _r: PhantomData, }) } } -impl Transform for TransformAddOrderColumn -where - R: Rows + 'static, - C: RowConverter + Send + 'static, -{ +impl Transform for TransformAddOrderColumn { const NAME: &'static str = "TransformAddOrderColumn"; fn transform(&mut self, mut data: DataBlock) -> Result { diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index ea2c4983ffb42..3888b130d4227 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -14,11 +14,11 @@ use std::any::Any; use std::collections::VecDeque; +use std::intrinsics::unlikely; use std::sync::Arc; use std::time::Instant; use databend_common_exception::Result; -use databend_common_expression::types::ArgType; use databend_common_expression::types::UInt64Type; use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; @@ -31,26 +31,20 @@ use databend_common_pipeline_core::processors::Processor; use crate::pipelines::processors::transforms::ReclusterSampleMeta; use crate::pipelines::processors::transforms::SampleState; -pub struct TransformRangePartitionIndexer -where T: ArgType -{ +pub struct TransformRangePartitionIndexer { input: Arc, output: Arc, state: Arc, input_data: Vec, output_data: VecDeque, - bounds: Vec, - max_value: Option, + bounds: Vec>, + max_value: Option>, start: Instant, } -impl TransformRangePartitionIndexer -where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send + Sync, -{ +impl TransformRangePartitionIndexer { pub fn create( input: Arc, output: Arc, @@ -70,11 +64,7 @@ where } #[async_trait::async_trait] -impl Processor for TransformRangePartitionIndexer -where - T: ArgType + 
Send + Sync, - T::Scalar: Ord + Send + Sync, -{ +impl Processor for TransformRangePartitionIndexer { fn name(&self) -> String { "TransformRangePartitionIndexer".to_owned() } @@ -120,48 +110,36 @@ where .and_then(ReclusterSampleMeta::downcast_from) .expect("require a ReclusterSampleMeta"); self.input_data = meta.blocks; - self.state.merge_sample::(meta.sample_values)?; + self.state.merge_sample(meta.sample_values)?; log::info!("Recluster range partition: {:?}", self.start.elapsed()); Ok(Event::Async) } fn process(&mut self) -> Result<()> { let start = Instant::now(); - let mut block = { - let blocks = std::mem::take(&mut self.input_data); - DataBlock::concat(&blocks)? - }; - - let bound_len = self.bounds.len(); - let num_rows = block.num_rows(); - let last = block.get_last_column().clone(); - block.pop_columns(1); - let mut builder = Vec::with_capacity(num_rows); - let last_col = T::try_downcast_column(&last.remove_nullable()).unwrap(); - for index in 0..num_rows { - let val = T::to_owned_scalar(unsafe { T::index_column_unchecked(&last_col, index) }); - if self.max_value.as_ref().is_some_and(|v| val >= *v) { - let range_id = bound_len + 1; - builder.push(range_id as u64); - continue; - } - - let mut low = 0; - let mut high = bound_len; - while low < high { - let mid = low + ((high - low) / 2); - let bound = unsafe { self.bounds.get_unchecked(mid) }.clone(); - if val > bound { - low = mid + 1; - } else { - high = mid; + if let Some(mut block) = self.input_data.pop() { + let bound_len = self.bounds.len(); + let num_rows = block.num_rows(); + let mut builder = Vec::with_capacity(num_rows); + let last_col = block.get_last_column().as_binary().unwrap(); + for index in 0..num_rows { + let val = unsafe { last_col.index_unchecked(index) }; + if unlikely(self.max_value.as_ref().is_some_and(|v| val >= v.as_slice())) { + let range_id = bound_len + 1; + builder.push(range_id as u64); + continue; } + + let idx = self + .bounds + .binary_search_by(|b| b.as_slice().cmp(val)) + .unwrap_or_else(|i| i); + builder.push(idx as u64); } - builder.push(low as u64); - } - block.add_column(UInt64Type::from_data(builder)); - self.output_data.push_back(block); + block.add_column(UInt64Type::from_data(builder)); + self.output_data.push_back(block); + } log::info!("Recluster range output: {:?}", start.elapsed()); Ok(()) } @@ -169,7 +147,7 @@ where #[async_backtrace::framed] async fn async_process(&mut self) -> Result<()> { self.state.done.notified().await; - (self.bounds, self.max_value) = self.state.get_bounds::(); + (self.bounds, self.max_value) = self.state.get_bounds(); Ok(()) } } diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs index 46684b42b31e3..28d1c0aed8b54 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_recluster_collect.rs @@ -14,41 +14,27 @@ use databend_common_exception::Result; use databend_common_expression::local_block_meta_serde; -use databend_common_expression::types::ArgType; use databend_common_expression::BlockMetaInfo; use databend_common_expression::DataBlock; -use databend_common_expression::Scalar; use databend_common_pipeline_transforms::AccumulatingTransform; use crate::pipelines::processors::transforms::RangeBoundSampler; -pub struct TransformReclusterCollect -where - T: ArgType + Send + Sync, - 
T::Scalar: Ord + Send, -{ +pub struct TransformReclusterCollect { input_data: Vec, - sampler: RangeBoundSampler, + sampler: RangeBoundSampler, } -impl TransformReclusterCollect -where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send, -{ +impl TransformReclusterCollect { pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self { Self { input_data: vec![], - sampler: RangeBoundSampler::::new(offset, sample_size, seed), + sampler: RangeBoundSampler::new(offset, sample_size, seed), } } } -impl AccumulatingTransform for TransformReclusterCollect -where - T: ArgType + Send + Sync, - T::Scalar: Ord + Send, -{ +impl AccumulatingTransform for TransformReclusterCollect { const NAME: &'static str = "TransformReclusterCollect"; fn transform(&mut self, data: DataBlock) -> Result> { @@ -72,7 +58,7 @@ where #[derive(Debug)] pub struct ReclusterSampleMeta { pub blocks: Vec, - pub sample_values: Vec<(u64, Vec)>, + pub sample_values: Vec<(u64, Vec>)>, } local_block_meta_serde!(ReclusterSampleMeta); diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index a686891024a90..f85d03b3115ef 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1312,6 +1312,13 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), + ("enable_block_stream_writes", DefaultSettingValue { + value: UserSettingValue::UInt64(0), + desc: "Enables block stream write", + mode: SettingMode::Both, + scope: SettingScope::Both, + range: Some(SettingRange::Numeric(0..=1)), + }), ("trace_sample_rate", DefaultSettingValue { value: UserSettingValue::UInt64(1), desc: "Setting the trace sample rate. The value should be between '0' and '100'", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 89061370021da..9289d4badbcd6 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -964,6 +964,10 @@ impl Settings { Ok(self.try_get_u64("enable_block_stream_write")? == 1) } + pub fn get_enable_block_stream_writes(&self) -> Result { + Ok(self.try_get_u64("enable_block_stream_writes")? 
== 1) + } + pub fn get_statement_queue_ttl_in_seconds(&self) -> Result { self.try_get_u64("statement_queue_ttl_in_seconds") } diff --git a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs index a0bd91888995e..c7cc352f7ecc6 100644 --- a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs @@ -17,7 +17,6 @@ use std::sync::Arc; use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; -use databend_common_expression::Column; use databend_common_expression::ColumnRef; use databend_common_expression::DataBlock; use databend_common_expression::DataField; @@ -25,7 +24,6 @@ use databend_common_expression::DataSchema; use databend_common_expression::Expr; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; -use databend_common_functions::aggregates::eval_aggr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::evaluator::BlockOperator; use databend_storages_common_table_meta::meta::ClusterStatistics; @@ -115,8 +113,8 @@ impl ClusterStatisticsBuilder { } pub struct ClusterStatisticsState { - mins: Vec, - maxs: Vec, + min_values: Vec>, + max_values: Vec>, builder: Arc, } @@ -124,8 +122,8 @@ pub struct ClusterStatisticsState { impl ClusterStatisticsState { pub fn new(builder: Arc) -> Self { Self { - mins: vec![], - maxs: vec![], + min_values: vec![], + max_values: vec![], builder, } } @@ -135,20 +133,20 @@ impl ClusterStatisticsState { return Ok(input); } - let num_rows = input.num_rows(); - let cols = self - .builder - .cluster_key_index - .iter() - .map(|&i| input.get_by_offset(i).to_column()) - .collect(); - let tuple = Column::Tuple(cols); - let (min, _) = eval_aggr("min", vec![], &[tuple.clone()], num_rows, vec![])?; - let (max, _) = eval_aggr("max", vec![], &[tuple.clone()], num_rows, vec![])?; - assert_eq!(min.len(), 1); - assert_eq!(max.len(), 1); - self.mins.push(min.index(0).unwrap().to_owned()); - self.maxs.push(max.index(0).unwrap().to_owned()); + let mut min = Vec::with_capacity(self.builder.cluster_key_index.len()); + let mut max = Vec::with_capacity(self.builder.cluster_key_index.len()); + for key in self.builder.cluster_key_index.iter() { + let val = input.get_by_offset(*key); + let left = unsafe { val.index_unchecked(0) }.to_owned(); + min.push(left); + + // The maximum in cluster statistics needn't larger than the non-trimmed one. + // So we use trim_min directly. 
+ let right = unsafe { val.index_unchecked(val.value().len() - 1) }.to_owned(); + max.push(right); + } + self.min_values.push(min); + self.max_values.push(max); input.pop_columns(self.builder.extra_key_num); Ok(input) } @@ -158,22 +156,8 @@ impl ClusterStatisticsState { return Ok(None); } - let min = self - .mins - .into_iter() - .min_by(|x, y| x.as_ref().cmp(&y.as_ref())) - .unwrap() - .as_tuple() - .unwrap() - .clone(); - let max = self - .maxs - .into_iter() - .max_by(|x, y| x.as_ref().cmp(&y.as_ref())) - .unwrap() - .as_tuple() - .unwrap() - .clone(); + let min = self.min_values.into_iter().min().unwrap(); + let max = self.max_values.into_iter().max().unwrap(); let level = if min == max && perfect { -1 diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index fc14e9589071e..8a2349f8ff359 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -65,16 +65,32 @@ impl FuseTable { table_meta_timestamps, )?; - let cluster_operators = properties.cluster_operators(); - if !cluster_operators.is_empty() { - let num_input_columns = self.table_info.schema().num_fields(); - let func_ctx = ctx.get_function_context()?; - pipeline.add_transformer(move || { - CompoundBlockOperator::new( - cluster_operators.clone(), - func_ctx.clone(), - num_input_columns, - ) + let cluster_key_index = properties.cluster_key_index(); + if !cluster_key_index.is_empty() { + let cluster_operators = properties.cluster_operators(); + if !cluster_operators.is_empty() { + let num_input_columns = self.table_info.schema().num_fields(); + let func_ctx = ctx.get_function_context()?; + pipeline.add_transformer(move || { + CompoundBlockOperator::new( + cluster_operators.clone(), + func_ctx.clone(), + num_input_columns, + ) + }); + } + + let sort_desc: Vec = cluster_key_index + .iter() + .map(|index| SortColumnDescription { + offset: *index, + asc: true, + nulls_first: false, + }) + .collect(); + let sort_desc: Arc<[_]> = sort_desc.into(); + pipeline.add_transformer(|| { + TransformSortPartial::new(LimitType::None, sort_desc.clone()) }); } diff --git a/src/query/storages/fuse/src/statistics/cluster_statistics.rs b/src/query/storages/fuse/src/statistics/cluster_statistics.rs index f452938fd4c25..904446690d93f 100644 --- a/src/query/storages/fuse/src/statistics/cluster_statistics.rs +++ b/src/query/storages/fuse/src/statistics/cluster_statistics.rs @@ -122,7 +122,7 @@ impl ClusterStatsGenerator { let left = unsafe { val.index_unchecked(0) }.to_owned(); min.push(left); - // The maximum in cluster statistics neednot larger than the non-trimmed one. + // The maximum in cluster statistics needn't larger than the non-trimmed one. // So we use trim_min directly. 
let right = unsafe { val.index_unchecked(val.value().len() - 1) }.to_owned(); max.push(right); From 35bfee36631836ea66beb0dad45aeca5d33f09d0 Mon Sep 17 00:00:00 2001 From: zhyass Date: Mon, 16 Jun 2025 04:12:11 +0800 Subject: [PATCH 31/36] fix --- .../pipelines/builders/builder_recluster.rs | 51 +++------ .../processors/transforms/recluster/mod.rs | 1 - .../recluster/recluster_partition_strategy.rs | 102 ++---------------- .../recluster/recluster_sample_state.rs | 2 +- .../transform_range_partition_indexer.rs | 40 ++++--- src/query/settings/src/settings_default.rs | 7 -- .../settings/src/settings_getter_setter.rs | 4 - .../src/io/write/stream/cluster_statistics.rs | 56 ++++++---- 8 files changed, 83 insertions(+), 180 deletions(-) diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index 5af820ccda385..28b5feaacfd39 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -39,7 +39,6 @@ use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::StreamContext; use databend_common_storages_factory::Table; use databend_common_storages_fuse::io::StreamBlockProperties; -use databend_common_storages_fuse::operations::TransformBlockBuilder; use databend_common_storages_fuse::operations::TransformBlockWriter; use databend_common_storages_fuse::operations::TransformSerializeBlock; use databend_common_storages_fuse::FuseTable; @@ -47,14 +46,13 @@ use databend_common_storages_fuse::TableContext; use crate::pipelines::builders::SortPipelineBuilder; use crate::pipelines::processors::transforms::ReclusterPartitionExchange; -use crate::pipelines::processors::transforms::ReclusterPartitionStrategys; +use crate::pipelines::processors::transforms::ReclusterPartitionStrategy; use crate::pipelines::processors::transforms::SampleState; use crate::pipelines::processors::transforms::TransformAddOrderColumn; use crate::pipelines::processors::transforms::TransformAddStreamColumns; use crate::pipelines::processors::transforms::TransformPartitionCollect; use crate::pipelines::processors::transforms::TransformRangePartitionIndexer; use crate::pipelines::processors::transforms::TransformReclusterCollect; -use crate::pipelines::processors::transforms::TransformReclusterPartition; use crate::pipelines::PipelineBuilder; impl PipelineBuilder { @@ -173,7 +171,6 @@ impl PipelineBuilder { let fields_with_cluster_key = properties.fields_with_cluster_key(); let schema = DataSchemaRefExt::create(fields_with_cluster_key); let schema = add_order_field(schema, &sort_desc); - let order_offset = schema.fields.len() - 1; let num_processors = self.main_pipeline.output_len(); let sample_size = self @@ -203,44 +200,24 @@ impl PipelineBuilder { let processor_id = AtomicUsize::new(0); let settings = self.ctx.get_settings(); - let enable_writings = settings.get_enable_block_stream_writes()?; - if enable_writings { - let memory_settings = MemorySettings::disable_spill(); - self.main_pipeline.add_transform(|input, output| { - let strategy = - ReclusterPartitionStrategys::new(properties.clone(), order_offset); - - Ok(ProcessorPtr::create(Box::new( - TransformPartitionCollect::new( - self.ctx.clone(), - input, - output, - &settings, - processor_id.fetch_add(1, atomic::Ordering::AcqRel), - num_processors, - partitions, - memory_settings.clone(), - None, - strategy, - )?, - ))) - })?; - - self.main_pipeline.add_transform(|input, output| { - 
TransformBlockBuilder::try_create(input, output, properties.clone()) - })?; - } else { - self.main_pipeline.add_transform(|input, output| { - TransformReclusterPartition::try_create( + let memory_settings = MemorySettings::disable_spill(); + self.main_pipeline.add_transform(|input, output| { + let strategy = ReclusterPartitionStrategy::new(properties.clone()); + Ok(ProcessorPtr::create(Box::new( + TransformPartitionCollect::new( + self.ctx.clone(), input, output, - properties.clone(), + &settings, processor_id.fetch_add(1, atomic::Ordering::AcqRel), num_processors, partitions, - ) - })?; - } + memory_settings.clone(), + None, + strategy, + )?, + ))) + })?; self.main_pipeline.add_async_accumulating_transformer(|| { TransformBlockWriter::create( diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs index b87be1f1e4d51..a024e330be25b 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/mod.rs @@ -25,7 +25,6 @@ pub use range_bound_sampler::RangeBoundSampler; pub use recluster_partition_exchange::ReclusterPartitionExchange; pub use recluster_partition_strategy::CompactPartitionStrategy; pub use recluster_partition_strategy::ReclusterPartitionStrategy; -pub use recluster_partition_strategy::ReclusterPartitionStrategys; pub use recluster_sample_state::SampleState; pub use transform_add_order_column::TransformAddOrderColumn; pub use transform_range_partition_indexer::TransformRangePartitionIndexer; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs index fb10cb0caec08..9c8a4573171a1 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -16,8 +16,6 @@ use std::sync::Arc; use databend_common_exception::Result; use databend_common_expression::DataBlock; -use databend_common_expression::LimitType; -use databend_common_expression::SortColumnDescription; use databend_common_storages_fuse::io::StreamBlockBuilder; use databend_common_storages_fuse::io::StreamBlockProperties; @@ -34,10 +32,6 @@ impl ReclusterPartitionStrategy { pub fn new(properties: Arc) -> Self { Self { properties } } - - fn concat_blocks(blocks: Vec) -> Result { - DataBlock::concat(&blocks) - } } impl PartitionProcessStrategy for ReclusterPartitionStrategy { @@ -74,7 +68,7 @@ impl PartitionProcessStrategy for ReclusterPartitionStrategy { continue; } if !staged_blocks.is_empty() { - compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + compacted.push(std::mem::take(&mut staged_blocks)); } std::mem::swap(&mut staged_blocks, &mut pending_blocks); accumulated_rows = 0; @@ -82,13 +76,15 @@ impl PartitionProcessStrategy for ReclusterPartitionStrategy { } staged_blocks.append(&mut pending_blocks); if !staged_blocks.is_empty() { - compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); + compacted.push(std::mem::take(&mut staged_blocks)); } let mut result = Vec::new(); let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; - for block in compacted { - builder.write(block)?; + for blocks in compacted { + for block in blocks { + builder.write(block)?; + } if 
builder.need_flush() { let serialized = builder.finish()?; result.push(DataBlock::empty_with_meta(Box::new(serialized))); @@ -177,89 +173,3 @@ impl PartitionProcessStrategy for CompactPartitionStrategy { Ok(result) } } - -pub struct ReclusterPartitionStrategys { - properties: Arc, - sort_desc: Vec, -} - -impl ReclusterPartitionStrategys { - pub fn new(properties: Arc, offset: usize) -> Self { - Self { - properties, - sort_desc: vec![SortColumnDescription { - offset, - asc: true, - nulls_first: false, - }], - } - } - - fn concat_blocks(blocks: Vec) -> Result { - DataBlock::concat(&blocks) - } -} - -impl PartitionProcessStrategy for ReclusterPartitionStrategys { - const NAME: &'static str = "Recluster"; - - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec { - (0..num_partitions) - .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) - .collect() - } - - /// Stream write each block, and flush it conditionally based on builder status - /// and input size estimation. - fn process_data_blocks(&self, data_blocks: Vec) -> Result> { - let blocks_num = data_blocks.len(); - let mut accumulated_rows = 0; - let mut accumulated_bytes = 0; - let mut pending_blocks = Vec::with_capacity(blocks_num); - let mut staged_blocks = Vec::with_capacity(blocks_num); - let mut compacted = Vec::with_capacity(blocks_num); - for block in data_blocks { - accumulated_rows += block.num_rows(); - accumulated_bytes += block.estimate_block_size(); - pending_blocks.push(block); - if !self - .properties - .check_large_enough(accumulated_rows, accumulated_bytes) - { - continue; - } - if !staged_blocks.is_empty() { - compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - std::mem::swap(&mut staged_blocks, &mut pending_blocks); - accumulated_rows = 0; - accumulated_bytes = 0; - } - staged_blocks.append(&mut pending_blocks); - if !staged_blocks.is_empty() { - compacted.push(Self::concat_blocks(std::mem::take(&mut staged_blocks))?); - } - - let mut result = Vec::new(); - let mut builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; - for block in compacted { - let block = DataBlock::sort_with_type(&block, &self.sort_desc, LimitType::None)?; - builder.write(block)?; - if builder.need_flush() { - let serialized = builder.finish()?; - result.push(DataBlock::empty_with_meta(Box::new(serialized))); - builder = StreamBlockBuilder::try_new_with_config(self.properties.clone())?; - } - } - if !builder.is_empty() { - let serialized = builder.finish()?; - result.push(DataBlock::empty_with_meta(Box::new(serialized))); - } - Ok(result) - } -} diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs index 35e22321339d2..07960939ee538 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_sample_state.rs @@ -114,7 +114,7 @@ impl SampleStateInner { let weight = weights[idx]; cum_weight += weight; - if cum_weight >= target && previous_bound.map_or(true, |prev| value > prev) { + if cum_weight >= target && previous_bound.is_none_or(|prev| value > prev) { if unlikely(value == max_val) { self.max_value = Some(max_val.clone()); break; diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs 
b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs index 3888b130d4227..39efbec0b20a7 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/transform_range_partition_indexer.rs @@ -118,28 +118,40 @@ impl Processor for TransformRangePartitionIndexer { fn process(&mut self) -> Result<()> { let start = Instant::now(); if let Some(mut block) = self.input_data.pop() { - let bound_len = self.bounds.len(); let num_rows = block.num_rows(); let mut builder = Vec::with_capacity(num_rows); let last_col = block.get_last_column().as_binary().unwrap(); - for index in 0..num_rows { - let val = unsafe { last_col.index_unchecked(index) }; - if unlikely(self.max_value.as_ref().is_some_and(|v| val >= v.as_slice())) { - let range_id = bound_len + 1; - builder.push(range_id as u64); - continue; + if let Some(max_value) = self.max_value.as_ref() { + let bound_len = self.bounds.len(); + for index in 0..num_rows { + let val = unsafe { last_col.index_unchecked(index) }; + if unlikely(val >= max_value.as_slice()) { + let range_id = bound_len + 1; + builder.push(range_id as u64); + continue; + } + + let idx = self + .bounds + .binary_search_by(|b| b.as_slice().cmp(val)) + .unwrap_or_else(|i| i); + builder.push(idx as u64); + } + } else { + for index in 0..num_rows { + let val = unsafe { last_col.index_unchecked(index) }; + let idx = self + .bounds + .binary_search_by(|b| b.as_slice().cmp(val)) + .unwrap_or_else(|i| i); + builder.push(idx as u64); } - - let idx = self - .bounds - .binary_search_by(|b| b.as_slice().cmp(val)) - .unwrap_or_else(|i| i); - builder.push(idx as u64); } - + block.pop_columns(1); block.add_column(UInt64Type::from_data(builder)); self.output_data.push_back(block); } + log::info!("Recluster range output: {:?}", start.elapsed()); Ok(()) } diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index f85d03b3115ef..a686891024a90 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1312,13 +1312,6 @@ impl DefaultSettings { scope: SettingScope::Both, range: Some(SettingRange::Numeric(0..=1)), }), - ("enable_block_stream_writes", DefaultSettingValue { - value: UserSettingValue::UInt64(0), - desc: "Enables block stream write", - mode: SettingMode::Both, - scope: SettingScope::Both, - range: Some(SettingRange::Numeric(0..=1)), - }), ("trace_sample_rate", DefaultSettingValue { value: UserSettingValue::UInt64(1), desc: "Setting the trace sample rate. The value should be between '0' and '100'", diff --git a/src/query/settings/src/settings_getter_setter.rs b/src/query/settings/src/settings_getter_setter.rs index 9289d4badbcd6..89061370021da 100644 --- a/src/query/settings/src/settings_getter_setter.rs +++ b/src/query/settings/src/settings_getter_setter.rs @@ -964,10 +964,6 @@ impl Settings { Ok(self.try_get_u64("enable_block_stream_write")? == 1) } - pub fn get_enable_block_stream_writes(&self) -> Result { - Ok(self.try_get_u64("enable_block_stream_writes")? 
== 1) - } - pub fn get_statement_queue_ttl_in_seconds(&self) -> Result { self.try_get_u64("statement_queue_ttl_in_seconds") } diff --git a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs index c7cc352f7ecc6..a0bd91888995e 100644 --- a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; +use databend_common_expression::Column; use databend_common_expression::ColumnRef; use databend_common_expression::DataBlock; use databend_common_expression::DataField; @@ -24,6 +25,7 @@ use databend_common_expression::DataSchema; use databend_common_expression::Expr; use databend_common_expression::Scalar; use databend_common_expression::TableSchemaRef; +use databend_common_functions::aggregates::eval_aggr; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_sql::evaluator::BlockOperator; use databend_storages_common_table_meta::meta::ClusterStatistics; @@ -113,8 +115,8 @@ impl ClusterStatisticsBuilder { } pub struct ClusterStatisticsState { - min_values: Vec>, - max_values: Vec>, + mins: Vec, + maxs: Vec, builder: Arc, } @@ -122,8 +124,8 @@ pub struct ClusterStatisticsState { impl ClusterStatisticsState { pub fn new(builder: Arc) -> Self { Self { - min_values: vec![], - max_values: vec![], + mins: vec![], + maxs: vec![], builder, } } @@ -133,20 +135,20 @@ impl ClusterStatisticsState { return Ok(input); } - let mut min = Vec::with_capacity(self.builder.cluster_key_index.len()); - let mut max = Vec::with_capacity(self.builder.cluster_key_index.len()); - for key in self.builder.cluster_key_index.iter() { - let val = input.get_by_offset(*key); - let left = unsafe { val.index_unchecked(0) }.to_owned(); - min.push(left); - - // The maximum in cluster statistics needn't larger than the non-trimmed one. - // So we use trim_min directly. 
- let right = unsafe { val.index_unchecked(val.value().len() - 1) }.to_owned(); - max.push(right); - } - self.min_values.push(min); - self.max_values.push(max); + let num_rows = input.num_rows(); + let cols = self + .builder + .cluster_key_index + .iter() + .map(|&i| input.get_by_offset(i).to_column()) + .collect(); + let tuple = Column::Tuple(cols); + let (min, _) = eval_aggr("min", vec![], &[tuple.clone()], num_rows, vec![])?; + let (max, _) = eval_aggr("max", vec![], &[tuple.clone()], num_rows, vec![])?; + assert_eq!(min.len(), 1); + assert_eq!(max.len(), 1); + self.mins.push(min.index(0).unwrap().to_owned()); + self.maxs.push(max.index(0).unwrap().to_owned()); input.pop_columns(self.builder.extra_key_num); Ok(input) } @@ -156,8 +158,22 @@ impl ClusterStatisticsState { return Ok(None); } - let min = self.min_values.into_iter().min().unwrap(); - let max = self.max_values.into_iter().max().unwrap(); + let min = self + .mins + .into_iter() + .min_by(|x, y| x.as_ref().cmp(&y.as_ref())) + .unwrap() + .as_tuple() + .unwrap() + .clone(); + let max = self + .maxs + .into_iter() + .max_by(|x, y| x.as_ref().cmp(&y.as_ref())) + .unwrap() + .as_tuple() + .unwrap() + .clone(); let level = if min == max && perfect { -1 From 0eb6279a6897fd3e83a580d8259f66a3c0f3b692 Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 17 Jun 2025 12:48:07 +0800 Subject: [PATCH 32/36] add column ndv estimator --- .../fuse/src/io/write/stream/block_builder.rs | 8 +- .../src/io/write/stream/column_statistics.rs | 177 +++++++++--------- 2 files changed, 97 insertions(+), 88 deletions(-) diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 30d3b2b45543d..2bff1283e2fdd 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -359,7 +359,7 @@ pub struct StreamBlockProperties { cluster_stats_builder: Arc, stats_columns: Vec, - distinct_columns: Vec, + distinct_columns: Vec<(ColumnId, DataType)>, bloom_columns_map: BTreeMap, ngram_args: Vec, inverted_index_builders: Vec, @@ -417,12 +417,12 @@ impl StreamBlockProperties { let leaf_fields = source_schema.leaf_fields(); for field in leaf_fields.iter() { let column_id = field.column_id(); - if RangeIndex::supported_type(&DataType::from(field.data_type())) - && column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID + let data_type = DataType::from(field.data_type()); + if RangeIndex::supported_type(&data_type) && column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID { stats_columns.push(column_id); if !bloom_column_ids.contains(&column_id) { - distinct_columns.push(column_id); + distinct_columns.push((column_id, data_type)); } } } diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs index 8df60aa61f03f..1e11c56af4c1c 100644 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs @@ -13,17 +13,20 @@ // limitations under the License. 
use std::collections::HashMap; +use std::hash::Hash; +use std::marker::PhantomData; use databend_common_exception::Result; -use databend_common_expression::types::AccessType; +use databend_common_expression::types::boolean::TrueIdxIter; use databend_common_expression::types::DataType; use databend_common_expression::types::DateType; -use databend_common_expression::types::DecimalColumn; -use databend_common_expression::types::DecimalScalar; +use databend_common_expression::types::Decimal128Type; +use databend_common_expression::types::Decimal256Type; use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberType; use databend_common_expression::types::StringType; use databend_common_expression::types::TimestampType; +use databend_common_expression::types::ValueType; use databend_common_expression::with_number_mapped_type; use databend_common_expression::Column; use databend_common_expression::ColumnId; @@ -32,6 +35,7 @@ use databend_common_expression::Scalar; use databend_common_expression::ScalarRef; use databend_common_expression::TableSchemaRef; use databend_common_expression::Value; +use databend_common_expression::SELECTIVITY_THRESHOLD; use databend_common_functions::aggregates::eval_aggr; use databend_storages_common_table_meta::meta::ColumnDistinctHLL; use databend_storages_common_table_meta::meta::ColumnStatistics; @@ -43,11 +47,11 @@ use crate::statistics::Trim; pub struct ColumnStatisticsState { col_stats: HashMap>, - distinct_columns: HashMap, + distinct_columns: HashMap>, } impl ColumnStatisticsState { - pub fn new(stats_columns: &[ColumnId], distinct_columns: &[ColumnId]) -> Self { + pub fn new(stats_columns: &[ColumnId], distinct_columns: &[(ColumnId, DataType)]) -> Self { let col_stats = stats_columns .iter() .map(|&col_id| (col_id, Vec::new())) @@ -55,7 +59,7 @@ impl ColumnStatisticsState { let distinct_columns = distinct_columns .iter() - .map(|&col_id| (col_id, ColumnDistinctHLL::default())) + .map(|(col_id, data_type)| (*col_id, create_estimator(data_type))) .collect(); Self { @@ -80,8 +84,8 @@ impl ColumnStatisticsState { in_memory_size as u64, None, ); - if let Some(hll) = self.distinct_columns.get_mut(&column_id) { - scalar_update_hll_cardinality(&s.as_ref(), &data_type, hll); + if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { + estimator.update_scalar(&s.as_ref()); } self.col_stats.get_mut(&column_id).unwrap().push(col_stats); } @@ -128,8 +132,8 @@ impl ColumnStatisticsState { self.col_stats.get_mut(&column_id).unwrap().push(col_stats); // use distinct count calculated by the xor hash function to avoid repetitive operation. 
- if let Some(hll) = self.distinct_columns.get_mut(&column_id) { - column_update_hll_cardinality(&col, &data_type, hll); + if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { + estimator.update_column(&col); } } } @@ -146,8 +150,8 @@ impl ColumnStatisticsState { let mut col_stats = reduce_column_statistics(stats); if let Some(count) = column_distinct_count.get(id) { col_stats.distinct_of_values = Some(*count as u64); - } else if let Some(hll) = self.distinct_columns.get(id) { - col_stats.distinct_of_values = Some(hll.count() as u64); + } else if let Some(estimator) = self.distinct_columns.get(id) { + col_stats.distinct_of_values = Some(estimator.finalize()); } statistics.insert(*id, col_stats); } @@ -155,93 +159,98 @@ impl ColumnStatisticsState { } } -fn column_update_hll_cardinality(col: &Column, ty: &DataType, hll: &mut ColumnDistinctHLL) { - if let DataType::Nullable(inner) = ty { - let col = col.as_nullable().unwrap(); - for (i, v) in col.validity.iter().enumerate() { - if v { - let scalar = unsafe { col.column.index_unchecked(i) }; - scalar_update_hll_cardinality(&scalar, inner, hll); - } - } - return; - } +pub trait ColumnNDVEstimator: Send + Sync { + fn update_column(&mut self, column: &Column); + fn update_scalar(&mut self, scalar: &ScalarRef); + fn finalize(&self) -> u64; +} - with_number_mapped_type!(|NUM_TYPE| match ty { +pub fn create_estimator(data_type: &DataType) -> Box { + let inner_type = data_type.remove_nullable(); + with_number_mapped_type!(|NUM_TYPE| match inner_type { DataType::Number(NumberDataType::NUM_TYPE) => { - let col = NumberType::::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } + ColumnNDVEstimatorImpl::>::create() } DataType::String => { - let col = StringType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(&v); - } + ColumnNDVEstimatorImpl::::create() } DataType::Date => { - let col = DateType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } + ColumnNDVEstimatorImpl::::create() } DataType::Timestamp => { - let col = TimestampType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } + ColumnNDVEstimatorImpl::::create() + } + DataType::Decimal(s) if s.can_carried_by_128() => { + ColumnNDVEstimatorImpl::::create() } DataType::Decimal(_) => { - match col { - Column::Decimal(DecimalColumn::Decimal128(col, _)) => { - for v in col.iter() { - hll.add_object(v); - } - } - Column::Decimal(DecimalColumn::Decimal256(col, _)) => { - for v in col.iter() { - hll.add_object(v); - } - } - _ => unreachable!(), - }; + ColumnNDVEstimatorImpl::::create() } - _ => unreachable!("Unsupported data type: {:?}", ty), - }); + _ => unreachable!("Unsupported data type: {:?}", data_type), + }) } -fn scalar_update_hll_cardinality(scalar: &ScalarRef, ty: &DataType, hll: &mut ColumnDistinctHLL) { - if matches!(scalar, ScalarRef::Null) { - return; - } +pub struct ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + T::Scalar: Hash, +{ + hll: ColumnDistinctHLL, + _phantom: PhantomData, +} - let ty = ty.remove_nullable(); +impl ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + T::Scalar: Hash, +{ + pub fn create() -> Box { + Box::new(Self { + hll: ColumnDistinctHLL::new(), + _phantom: Default::default(), + }) + } +} - with_number_mapped_type!(|NUM_TYPE| match ty { - DataType::Number(NumberDataType::NUM_TYPE) => { - let val = NumberType::::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::String => 
{ - let val = StringType::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::Date => { - let val = DateType::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::Timestamp => { - let val = TimestampType::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::Decimal(_) => { - match scalar { - ScalarRef::Decimal(DecimalScalar::Decimal128(v, _)) => hll.add_object(&v), - ScalarRef::Decimal(DecimalScalar::Decimal256(v, _)) => hll.add_object(&v), - _ => unreachable!(), +impl ColumnNDVEstimator for ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + T::Scalar: Hash, +{ + fn update_column(&mut self, column: &Column) { + if let Column::Nullable(box inner) = column { + let validity_len = inner.validity.len(); + let column = T::try_downcast_column(&inner.column).unwrap(); + if inner.validity.true_count() as f64 / validity_len as f64 >= SELECTIVITY_THRESHOLD { + for (data, valid) in T::iter_column(&column).zip(inner.validity.iter()) { + if valid { + self.hll.add_object(&T::to_owned_scalar(data)); + } + } + } else { + TrueIdxIter::new(validity_len, Some(&inner.validity)).for_each(|idx| { + let val = unsafe { T::index_column_unchecked(&column, idx) }; + self.hll.add_object(&T::to_owned_scalar(val)); + }) + } + } else { + let column = T::try_downcast_column(column).unwrap(); + for value in T::iter_column(&column) { + self.hll.add_object(&T::to_owned_scalar(value)); } } - _ => unreachable!("Unsupported data type: {:?}", ty), - }); + } + + fn update_scalar(&mut self, scalar: &ScalarRef) { + if matches!(scalar, ScalarRef::Null) { + return; + } + + let val = T::try_downcast_scalar(scalar).unwrap(); + self.hll.add_object(&T::to_owned_scalar(val)); + } + + fn finalize(&self) -> u64 { + self.hll.count() as u64 + } } From f0b0d930b2add9290e9f2bf59b906db39f0343a5 Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 19 Jun 2025 04:05:04 +0800 Subject: [PATCH 33/36] add column min max state --- .../fuse/src/io/write/stream/block_builder.rs | 4 +- .../src/io/write/stream/column_statistics.rs | 362 ++++++++++++++---- 2 files changed, 289 insertions(+), 77 deletions(-) diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 2bff1283e2fdd..c125ae026d49f 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -358,7 +358,7 @@ pub struct StreamBlockProperties { source_schema: TableSchemaRef, cluster_stats_builder: Arc, - stats_columns: Vec, + stats_columns: Vec<(ColumnId, DataType)>, distinct_columns: Vec<(ColumnId, DataType)>, bloom_columns_map: BTreeMap, ngram_args: Vec, @@ -420,7 +420,7 @@ impl StreamBlockProperties { let data_type = DataType::from(field.data_type()); if RangeIndex::supported_type(&data_type) && column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID { - stats_columns.push(column_id); + stats_columns.push((column_id, data_type.clone())); if !bloom_column_ids.contains(&column_id) { distinct_columns.push((column_id, data_type)); } diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs index 1e11c56af4c1c..161245c9f89b5 100644 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under 
the License. +use std::cmp::Ordering; use std::collections::HashMap; use std::hash::Hash; use std::marker::PhantomData; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::boolean::TrueIdxIter; use databend_common_expression::types::DataType; use databend_common_expression::types::DateType; +use databend_common_expression::types::Decimal; use databend_common_expression::types::Decimal128Type; use databend_common_expression::types::Decimal256Type; use databend_common_expression::types::NumberDataType; @@ -36,25 +39,26 @@ use databend_common_expression::ScalarRef; use databend_common_expression::TableSchemaRef; use databend_common_expression::Value; use databend_common_expression::SELECTIVITY_THRESHOLD; -use databend_common_functions::aggregates::eval_aggr; use databend_storages_common_table_meta::meta::ColumnDistinctHLL; use databend_storages_common_table_meta::meta::ColumnStatistics; use databend_storages_common_table_meta::meta::StatisticsOfColumns; -use crate::statistics::reducers::reduce_column_statistics; use crate::statistics::traverse_values_dfs; use crate::statistics::Trim; pub struct ColumnStatisticsState { - col_stats: HashMap>, + col_stats: HashMap>, distinct_columns: HashMap>, } impl ColumnStatisticsState { - pub fn new(stats_columns: &[ColumnId], distinct_columns: &[(ColumnId, DataType)]) -> Self { + pub fn new( + stats_columns: &[(ColumnId, DataType)], + distinct_columns: &[(ColumnId, DataType)], + ) -> Self { let col_stats = stats_columns .iter() - .map(|&col_id| (col_id, Vec::new())) + .map(|(col_id, data_type)| (*col_id, create_column_minmax_state(data_type))) .collect(); let distinct_columns = distinct_columns @@ -74,63 +78,20 @@ impl ColumnStatisticsState { for (column_id, col, data_type) in leaves { match col { Value::Scalar(s) => { - let unset_bits = if s == Scalar::Null { rows } else { 0 }; - // when we read it back from parquet, it is a Column instead of Scalar - let in_memory_size = s.as_ref().estimated_scalar_repeat_size(rows, &data_type); - let col_stats = ColumnStatistics::new( - s.clone(), - s.clone(), - unset_bits as u64, - in_memory_size as u64, - None, + self.col_stats.get_mut(&column_id).unwrap().update_scalar( + &s.as_ref(), + rows, + &data_type, ); if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { estimator.update_scalar(&s.as_ref()); } - self.col_stats.get_mut(&column_id).unwrap().push(col_stats); } Value::Column(col) => { - // later, during the evaluation of expressions, name of field does not matter - let mut min = Scalar::Null; - let mut max = Scalar::Null; - - let (mins, _) = eval_aggr("min", vec![], &[col.clone()], rows, vec![])?; - if mins.len() > 0 { - min = if let Some(v) = mins.index(0) { - // safe upwrap. 
- v.to_owned().trim_min().unwrap() - } else { - self.col_stats.remove(&column_id); - continue; - } - } - - let (maxs, _) = eval_aggr("max", vec![], &[col.clone()], rows, vec![])?; - if maxs.len() > 0 { - max = if let Some(v) = maxs.index(0) { - if let Some(v) = v.to_owned().trim_max() { - v - } else { - self.col_stats.remove(&column_id); - continue; - } - } else { - self.col_stats.remove(&column_id); - continue; - } - } - - let (is_all_null, bitmap) = col.validity(); - let unset_bits = match (is_all_null, bitmap) { - (true, _) => rows, - (false, Some(bitmap)) => bitmap.null_count(), - (false, None) => 0, - }; - let in_memory_size = col.memory_size() as u64; - let col_stats = - ColumnStatistics::new(min, max, unset_bits as u64, in_memory_size, None); - self.col_stats.get_mut(&column_id).unwrap().push(col_stats); - + self.col_stats + .get_mut(&column_id) + .unwrap() + .update_column(&col); // use distinct count calculated by the xor hash function to avoid repetitive operation. if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { estimator.update_column(&col); @@ -146,14 +107,14 @@ impl ColumnStatisticsState { column_distinct_count: HashMap, ) -> Result { let mut statistics = StatisticsOfColumns::with_capacity(self.col_stats.len()); - for (id, stats) in &self.col_stats { - let mut col_stats = reduce_column_statistics(stats); - if let Some(count) = column_distinct_count.get(id) { + for (id, stats) in self.col_stats { + let mut col_stats = stats.finalize()?; + if let Some(count) = column_distinct_count.get(&id) { col_stats.distinct_of_values = Some(*count as u64); - } else if let Some(estimator) = self.distinct_columns.get(id) { + } else if let Some(estimator) = self.distinct_columns.get(&id) { col_stats.distinct_of_values = Some(estimator.finalize()); } - statistics.insert(*id, col_stats); + statistics.insert(id, col_stats); } Ok(statistics) } @@ -193,7 +154,7 @@ pub fn create_estimator(data_type: &DataType) -> Box { pub struct ColumnNDVEstimatorImpl where T: ValueType + Send + Sync, - T::Scalar: Hash, + for<'a> T::ScalarRef<'a>: Hash, { hll: ColumnDistinctHLL, _phantom: PhantomData, @@ -202,7 +163,7 @@ where impl ColumnNDVEstimatorImpl where T: ValueType + Send + Sync, - T::Scalar: Hash, + for<'a> T::ScalarRef<'a>: Hash, { pub fn create() -> Box { Box::new(Self { @@ -215,28 +176,37 @@ where impl ColumnNDVEstimator for ColumnNDVEstimatorImpl where T: ValueType + Send + Sync, - T::Scalar: Hash, + for<'a> T::ScalarRef<'a>: Hash, { fn update_column(&mut self, column: &Column) { - if let Column::Nullable(box inner) = column { - let validity_len = inner.validity.len(); - let column = T::try_downcast_column(&inner.column).unwrap(); - if inner.validity.true_count() as f64 / validity_len as f64 >= SELECTIVITY_THRESHOLD { - for (data, valid) in T::iter_column(&column).zip(inner.validity.iter()) { + let (column, validity) = if let Column::Nullable(box inner) = column { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } else { + (column, None) + }; + + let column = T::try_downcast_column(column).unwrap(); + if let Some(v) = validity { + if v.true_count() as f64 / v.len() as f64 >= SELECTIVITY_THRESHOLD { + for (data, valid) in T::iter_column(&column).zip(v.iter()) { if valid { - self.hll.add_object(&T::to_owned_scalar(data)); + self.hll.add_object(&data); } } } else { - TrueIdxIter::new(validity_len, Some(&inner.validity)).for_each(|idx| { + TrueIdxIter::new(v.len(), Some(v)).for_each(|idx| { let val = 
unsafe { T::index_column_unchecked(&column, idx) }; - self.hll.add_object(&T::to_owned_scalar(val)); + self.hll.add_object(&val); }) } } else { - let column = T::try_downcast_column(column).unwrap(); for value in T::iter_column(&column) { - self.hll.add_object(&T::to_owned_scalar(value)); + self.hll.add_object(&value); } } } @@ -247,10 +217,252 @@ where } let val = T::try_downcast_scalar(scalar).unwrap(); - self.hll.add_object(&T::to_owned_scalar(val)); + self.hll.add_object(&val); } fn finalize(&self) -> u64 { self.hll.count() as u64 } } + +pub trait ColumnMinMaxState: Send + Sync { + fn update_column(&mut self, column: &Column); + + fn update_scalar(&mut self, scalar: &ScalarRef, num_rows: usize, data_type: &DataType); + + fn finalize(self: Box) -> Result; +} + +pub trait MinMaxAdapter: Send + Sync { + type Value: Clone + Send + Sync; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value; + + fn value_to_scalar(val: Self::Value) -> T::Scalar; + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering); +} + +pub struct CommonAdapter; + +impl MinMaxAdapter for CommonAdapter +where + T: ValueType, + T::Scalar: Send + Sync, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + type Value = T::Scalar; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value { + T::to_owned_scalar(val) + } + + fn value_to_scalar(val: Self::Value) -> T::Scalar { + val + } + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering) { + if scalar.partial_cmp(&T::to_scalar_ref(value)) == Some(ordering) { + *value = T::to_owned_scalar(scalar); + } + } +} + +pub struct DecimalAdapter; + +impl MinMaxAdapter for DecimalAdapter +where + T: ValueType, + T::Scalar: Decimal + Send + Sync, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + type Value = ::U64Array; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value { + T::Scalar::to_u64_array(T::to_owned_scalar(val)) + } + + fn value_to_scalar(val: Self::Value) -> T::Scalar { + T::Scalar::from_u64_array(val) + } + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering) { + let val = T::Scalar::from_u64_array(*value); + if scalar.partial_cmp(&T::to_scalar_ref(&val)) == Some(ordering) { + *value = T::Scalar::to_u64_array(T::to_owned_scalar(scalar)); + } + } +} + +pub fn create_column_minmax_state(data_type: &DataType) -> Box { + let inner_type = data_type.remove_nullable(); + with_number_mapped_type!(|NUM_TYPE| match inner_type { + DataType::Number(NumberDataType::NUM_TYPE) => { + GenericColumnMinMaxState::, CommonAdapter>::create(inner_type) + } + DataType::String => { + GenericColumnMinMaxState::::create(inner_type) + } + DataType::Date => { + GenericColumnMinMaxState::::create(inner_type) + } + DataType::Timestamp => { + GenericColumnMinMaxState::::create(inner_type) + } + DataType::Decimal(s) if s.can_carried_by_128() => { + GenericColumnMinMaxState::::create(inner_type) + } + DataType::Decimal(_) => { + GenericColumnMinMaxState::::create(inner_type) + } + _ => unreachable!("Unsupported data type: {:?}", data_type), + }) +} + +pub struct GenericColumnMinMaxState +where + T: ValueType, + A: MinMaxAdapter, +{ + min: Option, + max: Option, + null_count: usize, + in_memory_size: usize, + data_type: DataType, + + _phantom: PhantomData<(T, A)>, +} + +impl GenericColumnMinMaxState +where + T: ValueType + Send + Sync, + T::Scalar: Send + Sync, + A: MinMaxAdapter + 'static, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + pub fn create(data_type: DataType) -> Box { + 
Box::new(Self { + min: None, + max: None, + null_count: 0, + in_memory_size: 0, + data_type, + _phantom: PhantomData, + }) + } + + fn add_batch<'a, I>(&mut self, mut iter: I) + where I: Iterator> { + let first = iter.next().unwrap(); + let mut min = first.clone(); + let mut max = first; + for v in iter { + if matches!(min.partial_cmp(&v), Some(Ordering::Greater)) { + min = v; + } else if matches!(max.partial_cmp(&v), Some(Ordering::Less)) { + max = v; + } + } + + self.add(min, max); + } + + fn add(&mut self, min: T::ScalarRef<'_>, max: T::ScalarRef<'_>) { + if let Some(val) = self.min.as_mut() { + A::update_value(val, min, Ordering::Less); + } else { + self.min = Some(A::scalar_to_value(min)); + } + + if let Some(val) = self.max.as_mut() { + A::update_value(val, max, Ordering::Greater); + } else { + self.max = Some(A::scalar_to_value(max)); + } + } +} + +impl ColumnMinMaxState for GenericColumnMinMaxState +where + T: ValueType + Send + Sync, + T::Scalar: Send + Sync, + A: MinMaxAdapter + 'static, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + fn update_column(&mut self, column: &Column) { + self.in_memory_size += column.memory_size(); + let (column, validity) = if let Column::Nullable(box inner) = column { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } else { + (column, None) + }; + self.null_count += validity.map_or(0, |v| v.null_count()); + + let column = T::try_downcast_column(column).unwrap(); + if let Some(v) = validity { + if v.true_count() as f64 / v.len() as f64 >= SELECTIVITY_THRESHOLD { + let column_iter = T::iter_column(&column); + let value_iter = column_iter + .zip(v.iter()) + .filter(|(_, v)| *v) + .map(|(v, _)| v); + self.add_batch(value_iter); + } else { + for idx in TrueIdxIter::new(v.len(), Some(v)) { + let v = unsafe { T::index_column_unchecked(&column, idx) }; + self.add(v.clone(), v); + } + } + } else { + let column_iter = T::iter_column(&column); + self.add_batch(column_iter); + } + } + + fn update_scalar(&mut self, scalar: &ScalarRef, num_rows: usize, data_type: &DataType) { + // when we read it back from parquet, it is a Column instead of Scalar + self.in_memory_size += scalar.estimated_scalar_repeat_size(num_rows, data_type); + if scalar.is_null() { + self.null_count += num_rows; + return; + } + + let val = T::try_downcast_scalar(scalar).unwrap(); + self.add(val.clone(), val); + } + + fn finalize(self: Box) -> Result { + let min = if let Some(v) = self.min { + let v = A::value_to_scalar(v); + // safe upwrap. 
+ T::upcast_scalar_with_type(v, &self.data_type) + .trim_min() + .unwrap() + } else { + Scalar::Null + }; + let max = if let Some(v) = self.max { + let v = A::value_to_scalar(v); + if let Some(v) = T::upcast_scalar_with_type(v, &self.data_type).trim_max() { + v + } else { + return Err(ErrorCode::Internal("Unable to trim string")); + } + } else { + Scalar::Null + }; + + Ok(ColumnStatistics::new( + min, + max, + self.null_count as u64, + self.in_memory_size as u64, + None, + )) + } +} From ca55812c4a46f47060ccf7cd2d87ccc6ca69258b Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 19 Jun 2025 16:52:02 +0800 Subject: [PATCH 34/36] fix --- .../storages/common/index/src/bloom_index.rs | 92 +++++++++++++++---- .../src/io/write/stream/column_statistics.rs | 5 +- 2 files changed, 78 insertions(+), 19 deletions(-) diff --git a/src/query/storages/common/index/src/bloom_index.rs b/src/query/storages/common/index/src/bloom_index.rs index bf5ec35035e64..59d06f6298a49 100644 --- a/src/query/storages/common/index/src/bloom_index.rs +++ b/src/query/storages/common/index/src/bloom_index.rs @@ -14,6 +14,7 @@ use std::collections::BTreeMap; use std::collections::HashMap; +use std::hash::DefaultHasher; use std::hash::Hasher; use std::ops::ControlFlow; use std::ops::Deref; @@ -35,12 +36,18 @@ use databend_common_expression::types::BinaryType; use databend_common_expression::types::Bitmap; use databend_common_expression::types::Buffer; use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; use databend_common_expression::types::MapType; use databend_common_expression::types::NullableType; use databend_common_expression::types::Number; use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; use databend_common_expression::types::UInt64Type; +use databend_common_expression::types::ValueType; use databend_common_expression::visit_expr; +use databend_common_expression::with_number_mapped_type; use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::ColumnBuilder; @@ -349,6 +356,71 @@ impl BloomIndex { Ok(column) } + pub fn calculate_digest_by_type(data_type: &DataType, column: &Column) -> Result> { + let inner_type = data_type.remove_nullable(); + with_number_mapped_type!(|NUM_TYPE| match inner_type { + DataType::Number(NumberDataType::NUM_TYPE) => { + Self::calculate_nullable_column_digests::>(column) + } + DataType::String => { + Self::calculate_nullable_column_digests::(column) + } + DataType::Date => { + Self::calculate_nullable_column_digests::(column) + } + DataType::Timestamp => { + Self::calculate_nullable_column_digests::(column) + } + _ => Err(ErrorCode::Internal(format!( + "Unsupported data type: {:?}", + data_type + ))), + }) + } + + #[inline(always)] + fn hash_one(v: &T) -> u64 { + let mut hasher = DefaultHasher::default(); + DFHash::hash(v, &mut hasher); + hasher.finish() + } + + fn calculate_nullable_column_digests(column: &Column) -> Result> + where for<'a> T::ScalarRef<'a>: DFHash { + let (column, validity) = if let Column::Nullable(box inner) = column { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } else { + (column, None) + }; + + let capacity = validity.map_or(column.len(), |v| v.true_count() + 1); + let mut result = Vec::with_capacity(capacity); + if 
validity.is_some() { + result.push(0); + } + let column = T::try_downcast_column(column).unwrap(); + if let Some(validity) = validity { + let column_iter = T::iter_column(&column); + let value_iter = column_iter + .zip(validity.iter()) + .filter(|(_, v)| *v) + .map(|(v, _)| v); + for value in value_iter { + result.push(Self::hash_one(&value)); + } + } else { + for value in T::iter_column(&column) { + result.push(Self::hash_one(&value)); + } + } + Ok(result) + } + /// calculate digest for column that may have null values /// /// returns (column, validity) where column is the digest of the column @@ -734,24 +806,8 @@ impl BloomIndexBuilder { } }; - let (column, validity) = - BloomIndex::calculate_nullable_column_digest(&self.func_ctx, &column, &data_type)?; - // create filter per column - if validity.as_ref().map(|v| v.null_count()).unwrap_or(0) > 0 { - let validity = validity.unwrap(); - let it = column.deref().iter().zip(validity.iter()).map( - |(v, b)| { - if !b { - &0 - } else { - v - } - }, - ); - index_column.builder.add_digests(it); - } else { - index_column.builder.add_digests(column.deref()); - } + let column = BloomIndex::calculate_digest_by_type(&data_type, &column)?; + index_column.builder.add_digests(column.deref()); } for index_column in self.ngram_columns.iter_mut() { let field_type = &block.data_type(index_column.index); diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs index 161245c9f89b5..ee520eb5007b3 100644 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs @@ -359,7 +359,10 @@ where for v in iter { if matches!(min.partial_cmp(&v), Some(Ordering::Greater)) { min = v; - } else if matches!(max.partial_cmp(&v), Some(Ordering::Less)) { + continue; + } + + if matches!(max.partial_cmp(&v), Some(Ordering::Less)) { max = v; } } From e7483b5ed981297d1c4763f403325488a43bb03a Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 19 Jun 2025 18:44:02 +0800 Subject: [PATCH 35/36] remove unused codes --- .../interpreter_table_recluster.rs | 44 ++------- .../builders/builder_hilbert_partition.rs | 6 +- .../pipelines/builders/builder_recluster.rs | 2 +- .../aggregator/aggregate_exchange_injector.rs | 1 - .../recluster/recluster_partition_exchange.rs | 23 ++--- .../recluster/recluster_partition_strategy.rs | 22 ----- .../partition/partition_process_strategy.rs | 19 ---- .../transform_window_partition_collect.rs | 4 +- .../src/schedulers/fragments/fragmenter.rs | 23 ----- .../src/schedulers/fragments/plan_fragment.rs | 56 ----------- .../query_fragment_actions_display.rs | 1 - .../flight/v1/exchange/data_exchange.rs | 17 ---- .../flight/v1/exchange/exchange_injector.rs | 6 -- .../flight/v1/exchange/exchange_manager.rs | 12 --- .../src/servers/flight/v1/exchange/mod.rs | 1 - .../flight/v1/scatter/flight_scatter_mod.rs | 92 ------------------- .../src/servers/flight/v1/scatter/mod.rs | 2 - src/query/sql/src/executor/format.rs | 8 -- .../sql/src/executor/physical_plans/common.rs | 2 - .../physical_plans/physical_exchange.rs | 8 -- .../physical_plans/physical_recluster.rs | 3 +- .../planner/format/display_rel_operator.rs | 1 - .../sql/src/planner/optimizer/ir/format.rs | 1 - .../planner/optimizer/ir/property/enforcer.rs | 1 - .../planner/optimizer/ir/property/property.rs | 8 +- .../optimizers/cascades/cost/model.rs | 2 +- src/query/sql/src/planner/plans/exchange.rs | 4 +- 
.../fuse/src/statistics/cluster_statistics.rs | 2 +- 28 files changed, 33 insertions(+), 338 deletions(-) delete mode 100644 src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs diff --git a/src/query/service/src/interpreters/interpreter_table_recluster.rs b/src/query/service/src/interpreters/interpreter_table_recluster.rs index 602e58df1670a..2468c6b2cd159 100644 --- a/src/query/service/src/interpreters/interpreter_table_recluster.rs +++ b/src/query/service/src/interpreters/interpreter_table_recluster.rs @@ -28,7 +28,6 @@ use databend_common_catalog::table::TableExt; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::type_check::check_function; -use databend_common_expression::types::NumberScalar; use databend_common_expression::DataBlock; use databend_common_expression::Scalar; use databend_common_functions::BUILTIN_FUNCTIONS; @@ -53,8 +52,6 @@ use databend_common_sql::plans::plan_hilbert_sql; use databend_common_sql::plans::replace_with_constant; use databend_common_sql::plans::set_update_stream_columns; use databend_common_sql::plans::BoundColumnRef; -use databend_common_sql::plans::ConstantExpr; -use databend_common_sql::plans::FunctionCall; use databend_common_sql::plans::Plan; use databend_common_sql::plans::ReclusterPlan; use databend_common_sql::IdentifierNormalizer; @@ -433,44 +430,22 @@ impl ReclusterTableInterpreter { // For distributed execution, add an exchange operator to distribute work if is_distributed { - let nodes_num = cluster.nodes.len() as u64; - let scalar_expr = ScalarExpr::FunctionCall(FunctionCall { - span: None, - func_name: "div".to_string(), - params: vec![], - arguments: vec![ - ScalarExpr::FunctionCall(FunctionCall { - span: None, - func_name: "multiply".to_string(), - params: vec![], - arguments: vec![ - ScalarExpr::BoundColumnRef(BoundColumnRef { - span: None, - column: bind_context.columns.last().unwrap().clone(), - }), - ScalarExpr::ConstantExpr(ConstantExpr { - span: None, - value: Scalar::Number(NumberScalar::UInt64(nodes_num)), - }), - ], - }), - ScalarExpr::ConstantExpr(ConstantExpr { - span: None, - value: Scalar::Number(NumberScalar::UInt64(total_partitions as u64)), - }), - ], - }); - // Create an expression for the partition column, // i.e.`range_partition_id(hilbert_range_index({hilbert_keys_str}), [...]) AS _predicate` - let expr = scalar_expr_to_remote_expr(&scalar_expr, plan.output_schema()?.as_ref())?; + let expr = scalar_expr_to_remote_expr( + &ScalarExpr::BoundColumnRef(BoundColumnRef { + span: None, + column: bind_context.columns.last().unwrap().clone(), + }), + plan.output_schema()?.as_ref(), + )?; // Add exchange operator for data distribution, // shuffling data based on the hash of range partition IDs derived from the Hilbert index. 
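
As a rough, self-contained illustration of the change that follows — the modulo routing removed by this patch versus the plain hash shuffle that replaces it — the sketch below compares the two routings for a handful of partition ids. `DefaultHasher` and the node count are stand-ins, not the exchange's actual hash function or cluster topology.

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Modulo exchange (removed): a partition id maps to a node directly.
fn route_by_modulo(partition_id: u64, node_count: u64) -> u64 {
    partition_id % node_count
}

// Normal exchange (kept): the partition id is hashed before picking a node.
fn route_by_hash(partition_id: u64, node_count: u64) -> u64 {
    let mut hasher = DefaultHasher::new();
    partition_id.hash(&mut hasher);
    hasher.finish() % node_count
}

fn main() {
    let node_count = 3;
    for id in 0..6u64 {
        println!(
            "partition {id}: modulo -> node {}, hash -> node {}",
            route_by_modulo(id, node_count),
            route_by_hash(id, node_count)
        );
    }
}

Both schemes spread the Hilbert range partition ids across all executors; the hash variant simply reuses the standard shuffle path instead of a dedicated Modulo exchange.
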
plan = Box::new(PhysicalPlan::Exchange(Exchange { plan_id: 0, input: plan, - kind: FragmentKind::Modulo, + kind: FragmentKind::Normal, keys: vec![expr], allow_adjust_parallelism: true, ignore_exchange: false, @@ -487,8 +462,7 @@ impl ReclusterTableInterpreter { plan_id: 0, input: plan, table_info: table_info.clone(), - range_start: 0, - range_width: total_partitions, + num_partitions: total_partitions, table_meta_timestamps, bytes_per_block, rows_per_block, diff --git a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs index aebafaa53566d..870db16444a42 100644 --- a/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs +++ b/src/query/service/src/pipelines/builders/builder_hilbert_partition.rs @@ -51,7 +51,7 @@ impl PipelineBuilder { self.main_pipeline.exchange( num_processors, - ReclusterPartitionExchange::create(partition.range_start, partition.range_width), + ReclusterPartitionExchange::create(partition.num_partitions), ); let settings = self.ctx.get_settings(); @@ -85,7 +85,7 @@ impl PipelineBuilder { &settings, processor_id.fetch_add(1, atomic::Ordering::AcqRel), num_processors, - partition.range_width, + partition.num_partitions, window_spill_settings.clone(), disk_spill.clone(), ReclusterPartitionStrategy::new(properties.clone()), @@ -112,7 +112,7 @@ impl PipelineBuilder { &settings, processor_id.fetch_add(1, atomic::Ordering::AcqRel), num_processors, - partition.range_width, + partition.num_partitions, window_spill_settings.clone(), disk_spill.clone(), CompactPartitionStrategy::new( diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs index 28b5feaacfd39..b4c9a396c5fcb 100644 --- a/src/query/service/src/pipelines/builders/builder_recluster.rs +++ b/src/query/service/src/pipelines/builders/builder_recluster.rs @@ -195,7 +195,7 @@ impl PipelineBuilder { self.main_pipeline.exchange( num_processors, - ReclusterPartitionExchange::create(0, partitions), + ReclusterPartitionExchange::create(partitions), ); let processor_id = AtomicUsize::new(0); diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs index 40904ea2c8e16..55688a4347259 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/aggregate_exchange_injector.rs @@ -233,7 +233,6 @@ impl ExchangeInjector for AggregateInjector { match exchange { DataExchange::Merge(_) => unreachable!(), DataExchange::Broadcast(_) => unreachable!(), - DataExchange::Modulo(_) => unreachable!(), DataExchange::ShuffleDataExchange(exchange) => { Ok(Arc::new(Box::new(HashTableHashScatter { buckets: exchange.destination_ids.len(), diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs index 444c81296de26..7fc006d3afad2 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_exchange.rs @@ -22,16 +22,14 @@ use databend_common_pipeline_core::processors::Exchange; use 
crate::pipelines::processors::transforms::WindowPartitionMeta; pub struct ReclusterPartitionExchange { - start: u64, - width: usize, + num_partitions: usize, start_time: Instant, } impl ReclusterPartitionExchange { - pub fn create(start: u64, width: usize) -> Arc { + pub fn create(num_partitions: usize) -> Arc { Arc::new(ReclusterPartitionExchange { - start, - width, + num_partitions, start_time: Instant::now(), }) } @@ -50,18 +48,21 @@ impl Exchange for ReclusterPartitionExchange { // Scatter the data block to different partitions. let indices = range_ids .iter() - .map(|&id| (id - self.start) as u16) + .map(|&id| (id % self.num_partitions as u64) as u16) .collect::>(); data_block.pop_columns(1); - - let scatter_indices = DataBlock::divide_indices_by_scatter_size(&indices, self.width); + let scatter_indices = + DataBlock::divide_indices_by_scatter_size(&indices, self.num_partitions); // Partition the data blocks to different processors. let mut output_data_blocks = vec![vec![]; n]; - for (partition_id, indices) in scatter_indices.into_iter().take(self.width).enumerate() { + for (partition_id, indices) in scatter_indices + .into_iter() + .take(self.num_partitions) + .enumerate() + { if !indices.is_empty() { - let target = (partition_id * n) / self.width; let block = data_block.take_with_optimize_size(&indices)?; - output_data_blocks[target].push((partition_id, block)); + output_data_blocks[partition_id % n].push((partition_id, block)); } } log::info!("Recluster range exchange: {:?}", self.start_time.elapsed()); diff --git a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs index 9c8a4573171a1..d8f3443c4c6e0 100644 --- a/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/recluster/recluster_partition_strategy.rs @@ -37,17 +37,6 @@ impl ReclusterPartitionStrategy { impl PartitionProcessStrategy for ReclusterPartitionStrategy { const NAME: &'static str = "Recluster"; - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec { - (0..num_partitions) - .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) - .collect() - } - /// Stream write each block, and flush it conditionally based on builder status /// and input size estimation. fn process_data_blocks(&self, data_blocks: Vec) -> Result> { @@ -127,17 +116,6 @@ impl CompactPartitionStrategy { impl PartitionProcessStrategy for CompactPartitionStrategy { const NAME: &'static str = "Compact"; - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec { - (0..num_partitions) - .filter(|&partition| (partition * num_processors) / num_partitions == processor_id) - .collect() - } - /// Collects blocks into batches and merges them via `concat` when size threshold is reached. 
fn process_data_blocks(&self, data_blocks: Vec) -> Result> { let blocks_num = data_blocks.len(); diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs index bec3f8a84e91f..cffa542136623 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/partition_process_strategy.rs @@ -22,14 +22,6 @@ use databend_common_settings::Settings; pub trait PartitionProcessStrategy: Send + Sync + 'static { const NAME: &'static str; - /// Partition assignment: map partition index to processor via proportional mapping. - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec; - fn process_data_blocks(&self, data_blocks: Vec) -> Result>; } @@ -66,17 +58,6 @@ impl WindowPartitionStrategy { impl PartitionProcessStrategy for WindowPartitionStrategy { const NAME: &'static str = "Window"; - fn calc_partitions( - &self, - processor_id: usize, - num_processors: usize, - num_partitions: usize, - ) -> Vec { - (0..num_partitions) - .filter(|&partition| partition % num_processors == processor_id) - .collect() - } - fn process_data_blocks(&self, data_blocks: Vec) -> Result> { let data_blocks = data_blocks .into_iter() diff --git a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs index d1f011404223b..0171af6053d7c 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/partition/transform_window_partition_collect.rs @@ -92,7 +92,9 @@ impl TransformPartitionCollect { strategy: S, ) -> Result { // Calculate the partition ids collected by the processor. - let partitions = strategy.calc_partitions(processor_id, num_processors, num_partitions); + let partitions: Vec = (0..num_partitions) + .filter(|&partition| partition % num_processors == processor_id) + .collect(); // Map each partition id to new partition id. 
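
The assignment just above replaces the per-strategy `calc_partitions` methods deleted earlier in this patch: each processor keeps the partitions whose id is congruent to its own id modulo the processor count. A stand-alone sketch of what that yields (the processor and partition counts here are made up):

// Round-robin partition-to-processor assignment, mirroring the filter above.
fn partitions_for(processor_id: usize, num_processors: usize, num_partitions: usize) -> Vec<usize> {
    (0..num_partitions)
        .filter(|&partition| partition % num_processors == processor_id)
        .collect()
}

fn main() {
    let (num_processors, num_partitions) = (4, 10);
    for processor_id in 0..num_processors {
        println!(
            "processor {processor_id}: {:?}",
            partitions_for(processor_id, num_processors, num_partitions)
        );
    }
}
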
let mut partition_id = vec![0; num_partitions]; diff --git a/src/query/service/src/schedulers/fragments/fragmenter.rs b/src/query/service/src/schedulers/fragments/fragmenter.rs index dc267c896dc17..5b83c20a670d8 100644 --- a/src/query/service/src/schedulers/fragments/fragmenter.rs +++ b/src/query/service/src/schedulers/fragments/fragmenter.rs @@ -15,7 +15,6 @@ use std::sync::Arc; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_meta_types::NodeInfo; use databend_common_sql::executor::physical_plans::CompactSource; @@ -27,7 +26,6 @@ use databend_common_sql::executor::physical_plans::ExchangeSink; use databend_common_sql::executor::physical_plans::ExchangeSource; use databend_common_sql::executor::physical_plans::FragmentKind; use databend_common_sql::executor::physical_plans::HashJoin; -use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationSource; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::executor::physical_plans::ReplaceInto; @@ -41,7 +39,6 @@ use crate::schedulers::PlanFragment; use crate::servers::flight::v1::exchange::BroadcastExchange; use crate::servers::flight::v1::exchange::DataExchange; use crate::servers::flight::v1::exchange::MergeExchange; -use crate::servers::flight::v1::exchange::ModuloExchange; use crate::servers::flight::v1::exchange::ShuffleDataExchange; use crate::sessions::QueryContext; use crate::sql::executor::PhysicalPlan; @@ -69,7 +66,6 @@ enum State { Compact, Recluster, Other, - HilbertRecluster, } impl Fragmenter { @@ -118,15 +114,6 @@ impl Fragmenter { FragmentKind::Expansive => { Ok(Some(BroadcastExchange::create(Self::get_executors(ctx)))) } - FragmentKind::Modulo => { - if plan.keys.len() != 1 { - return Err(ErrorCode::Internal("Modulo exchange require one key")); - } - Ok(Some(ModuloExchange::create( - Self::get_executors(ctx), - plan.keys[0].clone(), - ))) - } _ => Ok(None), }, _ => Ok(None), @@ -213,15 +200,6 @@ impl PhysicalPlanReplacer for Fragmenter { Ok(PhysicalPlan::Recluster(Box::new(plan.clone()))) } - fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { - let input = self.replace(&plan.input)?; - self.state = State::HilbertRecluster; - Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { - input: Box::new(input), - ..plan.clone() - }))) - } - fn replace_compact_source(&mut self, plan: &CompactSource) -> Result { self.state = State::Compact; Ok(PhysicalPlan::CompactSource(Box::new(plan.clone()))) @@ -323,7 +301,6 @@ impl PhysicalPlanReplacer for Fragmenter { State::ReplaceInto => FragmentType::ReplaceInto, State::Compact => FragmentType::Compact, State::Recluster => FragmentType::Recluster, - State::HilbertRecluster => FragmentType::HilbertRecluster, }; self.state = State::Other; let exchange = Self::get_exchange(self.ctx.clone(), &plan)?; diff --git a/src/query/service/src/schedulers/fragments/plan_fragment.rs b/src/query/service/src/schedulers/fragments/plan_fragment.rs index fab77a79d29f5..2f52da8b04ce3 100644 --- a/src/query/service/src/schedulers/fragments/plan_fragment.rs +++ b/src/query/service/src/schedulers/fragments/plan_fragment.rs @@ -28,7 +28,6 @@ use databend_common_sql::executor::physical_plans::CompactSource; use databend_common_sql::executor::physical_plans::ConstantTableScan; use databend_common_sql::executor::physical_plans::CopyIntoTable; use 
databend_common_sql::executor::physical_plans::CopyIntoTableSource; -use databend_common_sql::executor::physical_plans::HilbertPartition; use databend_common_sql::executor::physical_plans::MutationSource; use databend_common_sql::executor::physical_plans::Recluster; use databend_common_sql::executor::physical_plans::ReplaceDeduplicate; @@ -65,7 +64,6 @@ pub enum FragmentType { Compact, Recluster, MutationSource, - HilbertRecluster, } #[derive(Clone)] @@ -138,9 +136,6 @@ impl PlanFragment { FragmentType::Recluster => { self.redistribute_recluster(ctx, &mut fragment_actions)?; } - FragmentType::HilbertRecluster => { - self.redistribute_hilbert(ctx, &mut fragment_actions)?; - } } if let Some(ref exchange) = self.exchange { @@ -381,40 +376,6 @@ impl PlanFragment { Ok(()) } - fn redistribute_hilbert( - &self, - ctx: Arc, - fragment_actions: &mut QueryFragmentActions, - ) -> Result<()> { - let exchange_sink = match &self.plan { - PhysicalPlan::ExchangeSink(plan) => plan, - _ => unreachable!("logic error"), - }; - let hilbert = match exchange_sink.input.as_ref() { - PhysicalPlan::HilbertPartition(plan) => plan, - _ => unreachable!("logic error"), - }; - - let total_ranges = hilbert.range_width; - let executors = Fragmenter::get_executors(ctx); - let num_executors = executors.len(); - let base_width = total_ranges / num_executors; - let remainder = total_ranges % num_executors; - for (executor_idx, executor) in executors.into_iter().enumerate() { - let width = base_width + if executor_idx < remainder { 1 } else { 0 }; - let min = executor_idx * base_width + std::cmp::min(executor_idx, remainder); - let mut plan = self.plan.clone(); - let mut replace_hilbert = ReplaceHilbert { - range_width: width, - range_start: min as u64, - }; - plan = replace_hilbert.replace(&plan)?; - fragment_actions.add_action(QueryFragmentAction::create(executor, plan)); - } - - Ok(()) - } - fn reshuffle( executors: Vec, partitions: Vec, @@ -590,23 +551,6 @@ impl PhysicalPlanReplacer for ReplaceReadSource { } } -struct ReplaceHilbert { - range_width: usize, - range_start: u64, -} - -impl PhysicalPlanReplacer for ReplaceHilbert { - fn replace_hilbert_serialize(&mut self, plan: &HilbertPartition) -> Result { - let input = self.replace(&plan.input)?; - Ok(PhysicalPlan::HilbertPartition(Box::new(HilbertPartition { - input: Box::new(input), - range_width: self.range_width, - range_start: self.range_start, - ..plan.clone() - }))) - } -} - struct ReplaceRecluster { tasks: Vec, } diff --git a/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs b/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs index 36d8f0c257eb1..adb0b6c3bcd18 100644 --- a/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs +++ b/src/query/service/src/schedulers/fragments/query_fragment_actions_display.rs @@ -72,7 +72,6 @@ impl Display for QueryFragmentActionsWrap<'_> { DataExchange::Merge(_) => writeln!(f, " DataExchange: Merge")?, DataExchange::Broadcast(_) => writeln!(f, " DataExchange: Broadcast")?, DataExchange::ShuffleDataExchange(_) => writeln!(f, " DataExchange: Shuffle")?, - DataExchange::Modulo(_) => writeln!(f, " DataExchange: Modulo")?, } } diff --git a/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs b/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs index 0fba30c72ec7b..f23c7582559a7 100644 --- a/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs +++ b/src/query/service/src/servers/flight/v1/exchange/data_exchange.rs @@ -19,7 
+19,6 @@ pub enum DataExchange { Merge(MergeExchange), Broadcast(BroadcastExchange), ShuffleDataExchange(ShuffleDataExchange), - Modulo(ModuloExchange), } impl DataExchange { @@ -28,7 +27,6 @@ impl DataExchange { DataExchange::Merge(exchange) => vec![exchange.destination_id.clone()], DataExchange::Broadcast(exchange) => exchange.destination_ids.clone(), DataExchange::ShuffleDataExchange(exchange) => exchange.destination_ids.clone(), - DataExchange::Modulo(exchange) => exchange.destination_ids.clone(), } } } @@ -79,18 +77,3 @@ impl BroadcastExchange { DataExchange::Broadcast(BroadcastExchange { destination_ids }) } } - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct ModuloExchange { - pub destination_ids: Vec, - pub shuffle_key: RemoteExpr, -} - -impl ModuloExchange { - pub fn create(destination_ids: Vec, shuffle_key: RemoteExpr) -> DataExchange { - DataExchange::Modulo(ModuloExchange { - destination_ids, - shuffle_key, - }) - } -} diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs index 5b10b4f346960..4aa65ba175a83 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_injector.rs @@ -29,7 +29,6 @@ use crate::servers::flight::v1::exchange::ShuffleExchangeParams; use crate::servers::flight::v1::scatter::BroadcastFlightScatter; use crate::servers::flight::v1::scatter::FlightScatter; use crate::servers::flight::v1::scatter::HashFlightScatter; -use crate::servers::flight::v1::scatter::ModFlightScatter; use crate::sessions::QueryContext; pub trait ExchangeInjector: Send + Sync + 'static { @@ -101,11 +100,6 @@ impl ExchangeInjector for DefaultExchangeInjector { local_pos, )? 
} - DataExchange::Modulo(exchange) => ModFlightScatter::try_create( - ctx.get_function_context()?, - &exchange.shuffle_key, - exchange.destination_ids.len(), - )?, })) } diff --git a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs index 8d96b11c3488d..13a6a57742127 100644 --- a/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs +++ b/src/query/service/src/servers/flight/v1/exchange/exchange_manager.rs @@ -1011,18 +1011,6 @@ impl FragmentCoordinator { .flight_scatter(&info.query_ctx, data_exchange)?, }), )), - DataExchange::Modulo(exchange) => Ok(Some(ExchangeParams::ShuffleExchange( - ShuffleExchangeParams { - exchange_injector: exchange_injector.clone(), - schema: self.physical_plan.output_schema()?, - fragment_id: self.fragment_id, - query_id: info.query_id.to_string(), - executor_id: info.current_executor.to_string(), - destination_ids: exchange.destination_ids.to_owned(), - shuffle_scatter: exchange_injector - .flight_scatter(&info.query_ctx, data_exchange)?, - }, - ))), } } diff --git a/src/query/service/src/servers/flight/v1/exchange/mod.rs b/src/query/service/src/servers/flight/v1/exchange/mod.rs index ada27909df959..194f2cbe1e3e5 100644 --- a/src/query/service/src/servers/flight/v1/exchange/mod.rs +++ b/src/query/service/src/servers/flight/v1/exchange/mod.rs @@ -32,7 +32,6 @@ pub mod serde; pub use data_exchange::BroadcastExchange; pub use data_exchange::DataExchange; pub use data_exchange::MergeExchange; -pub use data_exchange::ModuloExchange; pub use data_exchange::ShuffleDataExchange; pub use exchange_injector::DefaultExchangeInjector; pub use exchange_injector::ExchangeInjector; diff --git a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs b/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs deleted file mode 100644 index f83fea3f574c2..0000000000000 --- a/src/query/service/src/servers/flight/v1/scatter/flight_scatter_mod.rs +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::type_check::check_function; -use databend_common_expression::types::DataType; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberScalar; -use databend_common_expression::DataBlock; -use databend_common_expression::Evaluator; -use databend_common_expression::Expr; -use databend_common_expression::FunctionContext; -use databend_common_expression::RemoteExpr; -use databend_common_expression::Scalar; -use databend_common_functions::BUILTIN_FUNCTIONS; - -use crate::servers::flight::v1::scatter::FlightScatter; - -#[derive(Clone)] -pub struct ModFlightScatter { - scatter_size: usize, - func_ctx: FunctionContext, - expr: Expr, -} - -impl ModFlightScatter { - pub fn try_create( - func_ctx: FunctionContext, - expr: &RemoteExpr, - scatter_size: usize, - ) -> Result> { - let expr = check_function( - None, - "modulo", - &[], - &[ - expr.as_expr(&BUILTIN_FUNCTIONS), - Expr::constant( - Scalar::Number(NumberScalar::UInt64(scatter_size as u64)), - Some(DataType::Number(NumberDataType::UInt64)), - ), - ], - &BUILTIN_FUNCTIONS, - )?; - let return_type = expr.data_type(); - if !matches!(return_type, DataType::Number(NumberDataType::UInt64)) { - return Err(ErrorCode::Internal(format!( - "ModFlightScatter expects modulo expression to return UInt64, but got {:?}", - return_type - ))); - } - - Ok(Box::new(ModFlightScatter { - scatter_size, - func_ctx, - expr, - })) - } -} - -impl FlightScatter for ModFlightScatter { - fn execute(&self, data_block: DataBlock) -> Result> { - let evaluator = Evaluator::new(&data_block, &self.func_ctx, &BUILTIN_FUNCTIONS); - let num = data_block.num_rows(); - - let column = evaluator - .run(&self.expr)? 
- .into_full_column(&DataType::Number(NumberDataType::UInt64), num); - let indices = column.as_number().unwrap().as_u_int64().unwrap(); - let data_blocks = DataBlock::scatter(&data_block, indices, self.scatter_size)?; - - let block_meta = data_block.get_meta(); - let mut res = Vec::with_capacity(data_blocks.len()); - for data_block in data_blocks { - res.push(data_block.add_meta(block_meta.cloned())?); - } - - Ok(res) - } -} diff --git a/src/query/service/src/servers/flight/v1/scatter/mod.rs b/src/query/service/src/servers/flight/v1/scatter/mod.rs index 2904ed87684ca..b5f5f900dab71 100644 --- a/src/query/service/src/servers/flight/v1/scatter/mod.rs +++ b/src/query/service/src/servers/flight/v1/scatter/mod.rs @@ -15,9 +15,7 @@ mod flight_scatter; mod flight_scatter_broadcast; mod flight_scatter_hash; -mod flight_scatter_mod; pub use flight_scatter::FlightScatter; pub use flight_scatter_broadcast::BroadcastFlightScatter; pub use flight_scatter_hash::HashFlightScatter; -pub use flight_scatter_mod::ModFlightScatter; diff --git a/src/query/sql/src/executor/format.rs b/src/query/sql/src/executor/format.rs index 29f5bc2529dad..1e3f8879339f3 100644 --- a/src/query/sql/src/executor/format.rs +++ b/src/query/sql/src/executor/format.rs @@ -1639,14 +1639,6 @@ fn exchange_to_format_tree( ), FragmentKind::Expansive => "Broadcast".to_string(), FragmentKind::Merge => "Merge".to_string(), - FragmentKind::Modulo => format!( - "Modulo({})", - plan.keys - .iter() - .map(|key| { key.as_expr(&BUILTIN_FUNCTIONS).sql_display() }) - .collect::>() - .join(", ") - ), })), to_format_tree(&plan.input, metadata, profs, context)?, ])) diff --git a/src/query/sql/src/executor/physical_plans/common.rs b/src/query/sql/src/executor/physical_plans/common.rs index 10859f8391da1..545179b4af4d6 100644 --- a/src/query/sql/src/executor/physical_plans/common.rs +++ b/src/query/sql/src/executor/physical_plans/common.rs @@ -67,8 +67,6 @@ pub enum FragmentKind { // Broadcast Expansive, Merge, - // Partitioned by a specified expression % node_nums - Modulo, } #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Copy)] diff --git a/src/query/sql/src/executor/physical_plans/physical_exchange.rs b/src/query/sql/src/executor/physical_plans/physical_exchange.rs index b4507942dc8ba..1e831519c415b 100644 --- a/src/query/sql/src/executor/physical_plans/physical_exchange.rs +++ b/src/query/sql/src/executor/physical_plans/physical_exchange.rs @@ -81,14 +81,6 @@ impl PhysicalPlanBuilder { allow_adjust_parallelism = false; FragmentKind::Merge } - crate::plans::Exchange::Modulo(scalar) => { - let expr = scalar - .type_check(input_schema.as_ref())? 
- .project_column_ref(|index| input_schema.index_of(&index.to_string()).unwrap()); - let (expr, _) = ConstantFolder::fold(&expr, &self.func_ctx, &BUILTIN_FUNCTIONS); - keys.push(expr.as_remote_expr()); - FragmentKind::Modulo - } }; Ok(PhysicalPlan::Exchange(Exchange { plan_id: 0, diff --git a/src/query/sql/src/executor/physical_plans/physical_recluster.rs b/src/query/sql/src/executor/physical_plans/physical_recluster.rs index 0a5520ccb5ba2..a90df3a9fdef5 100644 --- a/src/query/sql/src/executor/physical_plans/physical_recluster.rs +++ b/src/query/sql/src/executor/physical_plans/physical_recluster.rs @@ -31,9 +31,8 @@ pub struct HilbertPartition { pub plan_id: u32, pub input: Box, pub table_info: TableInfo, + pub num_partitions: usize, pub table_meta_timestamps: TableMetaTimestamps, pub rows_per_block: usize, pub bytes_per_block: usize, - pub range_start: u64, - pub range_width: usize, } diff --git a/src/query/sql/src/planner/format/display_rel_operator.rs b/src/query/sql/src/planner/format/display_rel_operator.rs index a82e84ddc59ac..9835bbedf0cdd 100644 --- a/src/query/sql/src/planner/format/display_rel_operator.rs +++ b/src/query/sql/src/planner/format/display_rel_operator.rs @@ -397,7 +397,6 @@ fn exchange_to_format_tree(id_humanizer: &I, op: &Exchange) -> F Exchange::Broadcast => "Exchange(Broadcast)", Exchange::Merge => "Exchange(Merge)", Exchange::MergeSort => "Exchange(MergeSort)", - Exchange::Modulo(_) => "Exchange(Modulo)", }; match op { diff --git a/src/query/sql/src/planner/optimizer/ir/format.rs b/src/query/sql/src/planner/optimizer/ir/format.rs index 017c9bec97203..f9613af6b35ef 100644 --- a/src/query/sql/src/planner/optimizer/ir/format.rs +++ b/src/query/sql/src/planner/optimizer/ir/format.rs @@ -66,7 +66,6 @@ fn display_rel_op(rel_op: &RelOperator) -> String { Exchange::Broadcast => "Broadcast".to_string(), Exchange::Merge => "Merge".to_string(), Exchange::MergeSort => "MergeSort".to_string(), - Exchange::Modulo(scalar) => format!("Modulo({})", scalar.as_raw_expr()), }) } RelOperator::DummyTableScan(_) => "DummyTableScan".to_string(), diff --git a/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs b/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs index 69abb144e11a4..1229898f5bef4 100644 --- a/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs +++ b/src/query/sql/src/planner/optimizer/ir/property/enforcer.rs @@ -73,7 +73,6 @@ impl Enforcer for DistributionEnforcer { Distribution::Random | Distribution::Any => Err(ErrorCode::Internal( "Cannot enforce random or any distribution", )), - Distribution::Modulo(key) => Ok(Exchange::Modulo(key.clone()).into()), } } } diff --git a/src/query/sql/src/planner/optimizer/ir/property/property.rs b/src/query/sql/src/planner/optimizer/ir/property/property.rs index 3eff4f594a2d2..9ae23730ca7e7 100644 --- a/src/query/sql/src/planner/optimizer/ir/property/property.rs +++ b/src/query/sql/src/planner/optimizer/ir/property/property.rs @@ -92,7 +92,6 @@ pub enum Distribution { Serial, Broadcast, Hash(Vec), - Modulo(Box), } impl Default for Distribution { @@ -111,15 +110,11 @@ impl Distribution { | (Distribution::Random, _) | (Distribution::Serial, Distribution::Serial) | (Distribution::Broadcast, Distribution::Broadcast) - | (Distribution::Hash(_), Distribution::Broadcast) - | (Distribution::Modulo(_), Distribution::Broadcast) => true, + | (Distribution::Hash(_), Distribution::Broadcast) => true, (Distribution::Hash(ref keys), Distribution::Hash(ref other_keys)) => { keys == other_keys } - (Distribution::Modulo(ref 
key), Distribution::Modulo(ref other_key)) => { - key == other_key - } _ => false, } } @@ -140,7 +135,6 @@ impl Display for Distribution { .collect::>() .join(", ") ), - Distribution::Modulo(ref key) => write!(f, "Modulo({})", key.as_raw_expr()), } } } diff --git a/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs b/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs index 5bd737365a76b..6d39e793a7231 100644 --- a/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs +++ b/src/query/sql/src/planner/optimizer/optimizers/cascades/cost/model.rs @@ -158,7 +158,7 @@ impl DefaultCostModel { let exchange: Exchange = (*m_expr.plan.clone()).clone().try_into()?; let group = memo.group(m_expr.group_index)?; let cost = match exchange { - Exchange::Hash(_) | Exchange::Modulo(_) => { + Exchange::Hash(_) => { group.stat_info.cardinality * self.network_per_row + group.stat_info.cardinality * self.compute_per_row } diff --git a/src/query/sql/src/planner/plans/exchange.rs b/src/query/sql/src/planner/plans/exchange.rs index db8dffd95d8cf..a7aca885b2ed1 100644 --- a/src/query/sql/src/planner/plans/exchange.rs +++ b/src/query/sql/src/planner/plans/exchange.rs @@ -30,8 +30,7 @@ pub enum Exchange { Hash(Vec), Broadcast, Merge, - MergeSort, // For distributed sort - Modulo(Box), // For recluster + MergeSort, // For distributed sort } impl Operator for Exchange { @@ -50,7 +49,6 @@ impl Operator for Exchange { Exchange::Broadcast => Distribution::Broadcast, Exchange::Merge => Distribution::Serial, Exchange::MergeSort => Distribution::Serial, - Exchange::Modulo(key) => Distribution::Modulo(key.clone()), }, }) } diff --git a/src/query/storages/fuse/src/statistics/cluster_statistics.rs b/src/query/storages/fuse/src/statistics/cluster_statistics.rs index 904446690d93f..f452938fd4c25 100644 --- a/src/query/storages/fuse/src/statistics/cluster_statistics.rs +++ b/src/query/storages/fuse/src/statistics/cluster_statistics.rs @@ -122,7 +122,7 @@ impl ClusterStatsGenerator { let left = unsafe { val.index_unchecked(0) }.to_owned(); min.push(left); - // The maximum in cluster statistics needn't larger than the non-trimmed one. + // The maximum in cluster statistics neednot larger than the non-trimmed one. // So we use trim_min directly. 
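
In other words, trimming only ever shortens a value towards the lexicographically smaller side: that is always sound for a minimum, and it is an acceptable under-estimate for this maximum because cluster statistics tolerate it. A tiny stand-alone illustration (the 5-character limit is invented, not the real trim length):

// A prefix is never greater than the string it was cut from, so a trimmed
// minimum stays a valid lower bound and a trimmed maximum merely shrinks.
fn trim_prefix(s: &str, limit: usize) -> String {
    s.chars().take(limit).collect()
}

fn main() {
    let min = "abcdefgh";
    let max = "abcdezzz";
    let trimmed_min = trim_prefix(min, 5);
    let trimmed_max = trim_prefix(max, 5);
    assert!(trimmed_min.as_str() <= min);
    assert!(trimmed_max.as_str() <= max);
    println!("min: {trimmed_min}, max: {trimmed_max}");
}
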
         let right = unsafe { val.index_unchecked(val.value().len() - 1) }.to_owned();
         max.push(right);

From 7d140eb7b081db31a436107292ae9288c6300451 Mon Sep 17 00:00:00 2001
From: zhyass
Date: Tue, 24 Jun 2025 11:59:30 +0800
Subject: [PATCH 36/36] add cluster sample

---
 src/query/expression/src/block.rs             |   4 +-
 .../src/sampler/fixed_size_sampler.rs         |   2 +-
 src/query/expression/src/sampler/mod.rs       |   1 +
 .../pipelines/builders/builder_recluster.rs   |   9 +-
 src/query/storages/fuse/src/constants.rs      |   1 +
 .../src/io/write/stream/cluster_statistics.rs |   6 +-
 .../src/io/write/stream/column_statistics.rs  | 104 +++++++++++++++++-
 .../storages/fuse/src/operations/append.rs    |  13 ---
 8 files changed, 111 insertions(+), 29 deletions(-)

diff --git a/src/query/expression/src/block.rs b/src/query/expression/src/block.rs
index 03c12fc8f6962..0ee73b994beae 100644
--- a/src/query/expression/src/block.rs
+++ b/src/query/expression/src/block.rs
@@ -504,8 +504,8 @@ impl DataBlock {
     }
 
     #[inline]
-    pub fn remove_column(&mut self, index: usize) {
-        self.entries.remove(index);
+    pub fn remove_column(&mut self, index: usize) -> BlockEntry {
+        self.entries.remove(index)
     }
 
     #[inline]
diff --git a/src/query/expression/src/sampler/fixed_size_sampler.rs b/src/query/expression/src/sampler/fixed_size_sampler.rs
index dd7500d40759b..b1317c38ba693 100644
--- a/src/query/expression/src/sampler/fixed_size_sampler.rs
+++ b/src/query/expression/src/sampler/fixed_size_sampler.rs
@@ -162,7 +162,7 @@ fn compact_indices(indices: &mut Vec<BlockRowIndex>, blocks: &mut Vec<DataBlock>
         .collect();
 }
 
-mod reservoir_sampling {
+pub mod reservoir_sampling {
     use std::num::NonZeroUsize;
 
     use rand::Rng;
diff --git a/src/query/expression/src/sampler/mod.rs b/src/query/expression/src/sampler/mod.rs
index c34b36905bd0f..558770c854f7b 100644
--- a/src/query/expression/src/sampler/mod.rs
+++ b/src/query/expression/src/sampler/mod.rs
@@ -16,4 +16,5 @@ mod fixed_rate_sampler;
 mod fixed_size_sampler;
 
 pub use fixed_rate_sampler::FixedRateSampler;
+pub use fixed_size_sampler::reservoir_sampling::AlgoL;
 pub use fixed_size_sampler::FixedSizeSampler;
diff --git a/src/query/service/src/pipelines/builders/builder_recluster.rs b/src/query/service/src/pipelines/builders/builder_recluster.rs
index b4c9a396c5fcb..8364e45d726fa 100644
--- a/src/query/service/src/pipelines/builders/builder_recluster.rs
+++ b/src/query/service/src/pipelines/builders/builder_recluster.rs
@@ -184,12 +184,9 @@ impl PipelineBuilder {
                     task.total_compressed,
                 );
                 let state = SampleState::new(num_processors, partitions);
-                let recluster_pipeline_builder = ReclusterPipelineBuilder::create(
-                    schema.clone(),
-                    sort_desc.clone(),
-                    sample_size,
-                )
-                .with_state(state);
+                let recluster_pipeline_builder =
+                    ReclusterPipelineBuilder::create(schema, sort_desc.clone(), sample_size)
+                        .with_state(state);
                 recluster_pipeline_builder
                     .build_recluster_sample_pipeline(&mut self.main_pipeline)?;
 
diff --git a/src/query/storages/fuse/src/constants.rs b/src/query/storages/fuse/src/constants.rs
index 890153cb5f15b..9f56c913e5ba1 100644
--- a/src/query/storages/fuse/src/constants.rs
+++ b/src/query/storages/fuse/src/constants.rs
@@ -31,6 +31,7 @@ pub const FUSE_TBL_XOR_BLOOM_INDEX_PREFIX: &str = "_i_b_v2";
 pub const FUSE_TBL_SEGMENT_PREFIX: &str = "_sg";
 pub const FUSE_TBL_SNAPSHOT_PREFIX: &str = "_ss";
 pub const FUSE_TBL_SNAPSHOT_STATISTICS_PREFIX: &str = "_ts";
+pub const FUSE_TBL_BLOCK_STATS_PREFIX: &str = "_bs";
 pub const FUSE_TBL_LAST_SNAPSHOT_HINT: &str = "last_snapshot_location_hint";
 pub const FUSE_TBL_LAST_SNAPSHOT_HINT_V2: &str = "last_snapshot_location_hint_v2";
 pub const FUSE_TBL_VIRTUAL_BLOCK_PREFIX: &str = "_vb";
diff --git a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs
index a0bd91888995e..4c33ff80566db 100644
--- a/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs
+++ b/src/query/storages/fuse/src/io/write/stream/cluster_statistics.rs
@@ -37,7 +37,7 @@ use crate::FuseTable;
 
 pub struct ClusterStatisticsBuilder {
     out_fields: Vec,
     level: i32,
-    cluster_key_id: u32,
+    cluster_key_id: Option<u32>,
     cluster_key_index: Vec<usize>,
     extra_key_num: usize,
@@ -92,7 +92,7 @@ impl ClusterStatisticsBuilder {
             }]
         };
         Ok(Arc::new(Self {
-            cluster_key_id: table.cluster_key_meta.as_ref().unwrap().0,
+            cluster_key_id: table.cluster_key_id(),
             cluster_key_index,
             extra_key_num,
             operators,
@@ -185,7 +185,7 @@ impl ClusterStatisticsState {
             max,
             min,
             level,
-            cluster_key_id: self.builder.cluster_key_id,
+            cluster_key_id: self.builder.cluster_key_id.unwrap(),
             pages: None,
         }))
     }
diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs
index ee520eb5007b3..b0da462368914 100644
--- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs
+++ b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs
@@ -13,13 +13,18 @@
 // limitations under the License.
 
 use std::cmp::Ordering;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::hash::Hash;
 use std::marker::PhantomData;
 
 use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
+use databend_common_expression::sampler::AlgoL;
 use databend_common_expression::types::boolean::TrueIdxIter;
+use databend_common_expression::types::AccessType;
+use databend_common_expression::types::ArrayType;
+use databend_common_expression::types::BinaryColumn;
+use databend_common_expression::types::BinaryType;
 use databend_common_expression::types::DataType;
 use databend_common_expression::types::DateType;
 use databend_common_expression::types::Decimal;
@@ -30,7 +35,8 @@ use databend_common_expression::types::NumberType;
 use databend_common_expression::types::StringType;
 use databend_common_expression::types::TimestampType;
 use databend_common_expression::types::ValueType;
-use databend_common_expression::with_number_mapped_type;
+use databend_common_expression::{with_number_mapped_type, BlockRowIndex};
+use databend_common_expression::BlockEntry;
 use databend_common_expression::Column;
 use databend_common_expression::ColumnId;
 use databend_common_expression::DataBlock;
@@ -39,16 +45,18 @@ use databend_common_expression::ScalarRef;
 use databend_common_expression::TableSchemaRef;
 use databend_common_expression::Value;
 use databend_common_expression::SELECTIVITY_THRESHOLD;
-use databend_storages_common_table_meta::meta::ColumnDistinctHLL;
+use databend_storages_common_table_meta::meta::{ColumnDistinctHLL, Location};
 use databend_storages_common_table_meta::meta::ColumnStatistics;
 use databend_storages_common_table_meta::meta::StatisticsOfColumns;
-
+use rand::rngs::SmallRng;
+use databend_common_expression::types::binary::BinaryColumnBuilder;
 use crate::statistics::traverse_values_dfs;
 use crate::statistics::Trim;
 
 pub struct ColumnStatisticsState {
     col_stats: HashMap<ColumnId, Vec<ColumnStatistics>>,
     distinct_columns: HashMap<ColumnId, Box<dyn ColumnNDVEstimator>>,
+    // cluster_key: Option,
 }
 
 impl ColumnStatisticsState {
@@ -120,6 +128,94 @@ impl ColumnStatisticsState {
     }
 }
 
+#[derive(Debug)]
+pub struct BlockStatisticsState {
+    pub(crate) data: Vec<u8>,
+    pub(crate) size: u64,
+    pub(crate) location: Location,
+}
+
+pub struct ClusterStateSampler {
+    k: usize,
+    origins: Vec<BinaryColumn>,
+    indices: Vec<BlockRowIndex>,
+    core: AlgoL<SmallRng>,
+
+    s: usize,
+}
+
+impl ClusterStateSampler {
+    pub fn new(k: usize, rng: SmallRng) -> Self {
+        let core = AlgoL::new(k.try_into().unwrap(), rng);
+        Self {
+            origins: Vec::new(),
+            indices: Vec::with_capacity(k),
+            k,
+            core,
+            s: usize::MAX,
+        }
+    }
+
+    pub fn add_column(&mut self, data: BinaryColumn) {
+        let rows = data.len();
+        assert!(rows > 0);
+        let block_idx = self.origins.len() as u32;
+        let change = self.add_indices(rows, block_idx);
+        if change {
+            self.origins.push(data);
+        }
+    }
+
+    fn add_indices(&mut self, rows: usize, block_idx: u32) -> bool {
+        let mut change = false;
+        let mut cur = 0;
+
+        // Fill initial reservoir
+        if self.indices.len() < self.k {
+            let remain = self.k - self.indices.len();
+
+            if rows <= remain {
+                self.indices.extend((0..rows).map(|i| (block_idx, i as u32, 1)));
+                if self.indices.len() == self.k {
+                    self.s = self.core.search();
+                }
+                return true;
+            }
+
+            self.indices.extend((0..remain).map(|i| (block_idx, i as u32, 1)));
+            cur += remain;
+            self.s = self.core.search();
+            change = true;
+        }
+
+        // Apply AlgoL
+        while rows - cur > self.s {
+            cur += self.s;
+            let pos = self.core.pos();
+            self.indices[pos] = (block_idx, cur as u32, 1);
+            self.core.update_w();
+            self.s = self.core.search();
+            change = true;
+        }
+
+        self.s -= rows - cur;
+        change
+    }
+
+    pub fn finalize(self) -> BlockEntry {
+        let columns = self.origins;
+        let mut builder = BinaryColumnBuilder::with_capacity(self.k, 0);
+        for (block_index, row, times) in self.indices {
+            let val =
+                unsafe { BinaryType::index_column_unchecked(&columns[block_index as usize], row as usize) };
+            for _ in 0..times {
+                BinaryType::push_item(&mut builder, val.clone())
+            }
+        }
+        BlockEntry::new_const_column_arg::<ArrayType<BinaryType>>(builder.build(), 1)
+    }
+}
+
 pub trait ColumnNDVEstimator: Send + Sync {
     fn update_column(&mut self, column: &Column);
     fn update_scalar(&mut self, scalar: &ScalarRef);
diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs
index 8a2349f8ff359..1d02e58b52f0b 100644
--- a/src/query/storages/fuse/src/operations/append.rs
+++ b/src/query/storages/fuse/src/operations/append.rs
@@ -79,19 +79,6 @@ impl FuseTable {
                 )
             });
         }
-
-        let sort_desc: Vec<SortColumnDescription> = cluster_key_index
-            .iter()
-            .map(|index| SortColumnDescription {
-                offset: *index,
-                asc: true,
-                nulls_first: false,
-            })
-            .collect();
-        let sort_desc: Arc<[_]> = sort_desc.into();
-        pipeline.add_transformer(|| {
-            TransformSortPartial::new(LimitType::None, sort_desc.clone())
-        });
 
         pipeline.add_transform(|input, output| {