Commit cb50347 ("for test")
1 parent c807783

File tree: 13 files changed (+287, -290 lines)

src/query/service/src/pipelines/builders/builder_recluster.rs
Lines changed: 66 additions & 79 deletions
@@ -20,16 +20,6 @@ use databend_common_catalog::plan::DataSourceInfo;
 use databend_common_catalog::plan::DataSourcePlan;
 use databend_common_exception::ErrorCode;
 use databend_common_exception::Result;
-use databend_common_expression::row::RowConverter as CommonConverter;
-use databend_common_expression::types::AccessType;
-use databend_common_expression::types::ArgType;
-use databend_common_expression::types::DataType;
-use databend_common_expression::types::DateType;
-use databend_common_expression::types::NumberDataType;
-use databend_common_expression::types::NumberType;
-use databend_common_expression::types::StringType;
-use databend_common_expression::types::TimestampType;
-use databend_common_expression::with_number_mapped_type;
 use databend_common_expression::DataSchemaRef;
 use databend_common_expression::DataSchemaRefExt;
 use databend_common_expression::SortColumnDescription;
@@ -41,28 +31,27 @@ use databend_common_pipeline_core::Pipeline;
 use databend_common_pipeline_sources::EmptySource;
 use databend_common_pipeline_transforms::processors::build_compact_block_no_split_pipeline;
 use databend_common_pipeline_transforms::processors::TransformPipelineHelper;
-use databend_common_pipeline_transforms::sort::CommonRows;
-use databend_common_pipeline_transforms::sort::RowConverter;
-use databend_common_pipeline_transforms::sort::Rows;
-use databend_common_pipeline_transforms::sort::SimpleRowConverter;
-use databend_common_pipeline_transforms::sort::SimpleRowsAsc;
+use databend_common_pipeline_transforms::sort::utils::add_order_field;
+use databend_common_pipeline_transforms::MemorySettings;
 use databend_common_sql::evaluator::CompoundBlockOperator;
 use databend_common_sql::executor::physical_plans::MutationKind;
 use databend_common_sql::executor::physical_plans::Recluster;
 use databend_common_sql::StreamContext;
 use databend_common_storages_factory::Table;
 use databend_common_storages_fuse::io::StreamBlockProperties;
+use databend_common_storages_fuse::operations::TransformBlockBuilder;
 use databend_common_storages_fuse::operations::TransformBlockWriter;
 use databend_common_storages_fuse::operations::TransformSerializeBlock;
 use databend_common_storages_fuse::FuseTable;
 use databend_common_storages_fuse::TableContext;
-use match_template::match_template;

 use crate::pipelines::builders::SortPipelineBuilder;
 use crate::pipelines::processors::transforms::ReclusterPartitionExchange;
+use crate::pipelines::processors::transforms::ReclusterPartitionStrategys;
 use crate::pipelines::processors::transforms::SampleState;
 use crate::pipelines::processors::transforms::TransformAddOrderColumn;
 use crate::pipelines::processors::transforms::TransformAddStreamColumns;
+use crate::pipelines::processors::transforms::TransformPartitionCollect;
 use crate::pipelines::processors::transforms::TransformRangePartitionIndexer;
 use crate::pipelines::processors::transforms::TransformReclusterCollect;
 use crate::pipelines::processors::transforms::TransformReclusterPartition;
@@ -172,9 +161,7 @@ impl PipelineBuilder {
             });
         }

-        let fields_with_cluster_key = properties.fields_with_cluster_key();
-        let schema = DataSchemaRefExt::create(fields_with_cluster_key);
-        let sort_descs: Vec<_> = properties
+        let sort_desc: Vec<_> = properties
             .cluster_key_index()
             .iter()
             .map(|&offset| SortColumnDescription {
@@ -183,6 +170,10 @@
                 nulls_first: false,
             })
             .collect();
+        let fields_with_cluster_key = properties.fields_with_cluster_key();
+        let schema = DataSchemaRefExt::create(fields_with_cluster_key);
+        let schema = add_order_field(schema, &sort_desc);
+        let order_offset = schema.fields.len() - 1;

         let num_processors = self.main_pipeline.output_len();
         let sample_size = self
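
A note on the hunk above: add_order_field appends one extra field carrying the row-encoded sort key, so the key always sits at the schema's last index and order_offset points at it. A minimal sketch of that bookkeeping in plain Rust (the "_order" field name and four-column schema are illustrative stand-ins, not Databend's types):

// Illustrative bookkeeping: cluster-key offsets become sort descriptions,
// and the appended order field is always the last one.
struct SortColumnDescription {
    offset: usize,
    asc: bool,
    nulls_first: bool,
}

fn main() {
    let cluster_key_index = vec![1usize, 3];
    let sort_desc: Vec<_> = cluster_key_index
        .iter()
        .map(|&offset| SortColumnDescription { offset, asc: true, nulls_first: false })
        .collect();

    // Stand-in for fields_with_cluster_key(); "_order" models the field
    // that add_order_field appends.
    let mut fields = vec!["c0", "c1", "c2", "c3"];
    fields.push("_order");
    let order_offset = fields.len() - 1;

    assert_eq!(order_offset, 4);
    assert!(sort_desc.iter().all(|d| d.asc && !d.nulls_first));
    assert!(sort_desc.iter().all(|d| d.offset < order_offset));
}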
@@ -196,9 +187,12 @@
                     task.total_compressed,
                 );
                 let state = SampleState::new(num_processors, partitions);
-                let recluster_pipeline_builder =
-                    ReclusterPipelineBuilder::create(schema, sort_descs.into(), sample_size)
-                        .with_state(state);
+                let recluster_pipeline_builder = ReclusterPipelineBuilder::create(
+                    schema.clone(),
+                    sort_desc.clone(),
+                    sample_size,
+                )
+                .with_state(state);
                 recluster_pipeline_builder
                     .build_recluster_sample_pipeline(&mut self.main_pipeline)?;

@@ -207,16 +201,46 @@
                     ReclusterPartitionExchange::create(0, partitions),
                 );
                 let processor_id = AtomicUsize::new(0);
-                self.main_pipeline.add_transform(|input, output| {
-                    TransformReclusterPartition::try_create(
-                        input,
-                        output,
-                        properties.clone(),
-                        processor_id.fetch_add(1, atomic::Ordering::AcqRel),
-                        num_processors,
-                        partitions,
-                    )
-                })?;
+
+                let settings = self.ctx.get_settings();
+                let enable_writings = settings.get_enable_block_stream_writes()?;
+                if enable_writings {
+                    let memory_settings = MemorySettings::disable_spill();
+                    self.main_pipeline.add_transform(|input, output| {
+                        let strategy =
+                            ReclusterPartitionStrategys::new(properties.clone(), order_offset);
+
+                        Ok(ProcessorPtr::create(Box::new(
+                            TransformPartitionCollect::new(
+                                self.ctx.clone(),
+                                input,
+                                output,
+                                &settings,
+                                processor_id.fetch_add(1, atomic::Ordering::AcqRel),
+                                num_processors,
+                                partitions,
+                                memory_settings.clone(),
+                                None,
+                                strategy,
+                            )?,
+                        )))
+                    })?;
+
+                    self.main_pipeline.add_transform(|input, output| {
+                        TransformBlockBuilder::try_create(input, output, properties.clone())
+                    })?;
+                } else {
+                    self.main_pipeline.add_transform(|input, output| {
+                        TransformReclusterPartition::try_create(
+                            input,
+                            output,
+                            properties.clone(),
+                            processor_id.fetch_add(1, atomic::Ordering::AcqRel),
+                            num_processors,
+                            partitions,
+                        )
+                    })?;
+                }

                 self.main_pipeline.add_async_accumulating_transformer(|| {
                     TransformBlockWriter::create(
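
The hunk above adds a settings gate: when enable_block_stream_writes is on, partition collection (with spilling disabled) feeds a streaming block builder; otherwise the previous TransformReclusterPartition path is kept. The branching shape, reduced to a stand-alone sketch (the enum models the real processors and is not a Databend API):

#[derive(Debug, PartialEq)]
enum Stage {
    PartitionCollect,   // models TransformPartitionCollect
    BlockBuilder,       // models TransformBlockBuilder
    ReclusterPartition, // models TransformReclusterPartition (legacy path)
}

fn plan_stages(enable_block_stream_writes: bool) -> Vec<Stage> {
    if enable_block_stream_writes {
        // streaming path: collect partitions first, then build blocks
        vec![Stage::PartitionCollect, Stage::BlockBuilder]
    } else {
        // legacy path: a single transform does the partitioning
        vec![Stage::ReclusterPartition]
    }
}

fn main() {
    assert_eq!(plan_stages(true), vec![Stage::PartitionCollect, Stage::BlockBuilder]);
    assert_eq!(plan_stages(false), vec![Stage::ReclusterPartition]);
}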
@@ -249,7 +273,7 @@
         // construct output fields
         let output_fields = cluster_stats_gen.out_fields.clone();
         let schema = DataSchemaRefExt::create(output_fields);
-        let sort_descs: Vec<_> = cluster_stats_gen
+        let sort_desc: Vec<_> = cluster_stats_gen
             .cluster_key_index
             .iter()
             .map(|offset| SortColumnDescription {
@@ -267,10 +291,9 @@
         );

         let sort_pipeline_builder =
-            SortPipelineBuilder::create(self.ctx.clone(), schema, sort_descs.into())?
+            SortPipelineBuilder::create(self.ctx.clone(), schema, sort_desc.into())?
                 .with_block_size_hit(sort_block_size)
                 .remove_order_col_at_last();
-        // Todo(zhyass): Recluster will no longer perform sort in the near future.
         sort_pipeline_builder.build_full_sort_pipeline(&mut self.main_pipeline)?;

         // Compact after merge sort.
@@ -306,7 +329,7 @@

 struct ReclusterPipelineBuilder {
     schema: DataSchemaRef,
-    sort_desc: Arc<[SortColumnDescription]>,
+    sort_desc: Vec<SortColumnDescription>,
     state: Option<Arc<SampleState>>,
     sample_size: usize,
     seed: u64,
@@ -315,7 +338,7 @@
 impl ReclusterPipelineBuilder {
     fn create(
         schema: DataSchemaRef,
-        sort_desc: Arc<[SortColumnDescription]>,
+        sort_desc: Vec<SortColumnDescription>,
         sample_size: usize,
     ) -> Self {
         Self {
@@ -339,53 +362,17 @@
     }

     fn build_recluster_sample_pipeline(&self, pipeline: &mut Pipeline) -> Result<()> {
-        match self.sort_desc.as_ref() {
-            [desc] => {
-                let schema = self.schema.clone();
-                let sort_type = schema.field(desc.offset).data_type();
-                assert!(desc.asc);
-
-                match_template! {
-                T = [ Date => DateType, Timestamp => TimestampType, String => StringType ],
-                match sort_type {
-                    DataType::T => {
-                        self.visit_type::<SimpleRowsAsc<T>, SimpleRowConverter<T>>(pipeline)
-                    },
-                    DataType::Number(num_ty) => with_number_mapped_type!(|NUM_TYPE| match num_ty {
-                        NumberDataType::NUM_TYPE => {
-                            self.visit_type::<SimpleRowsAsc<NumberType<NUM_TYPE>>, SimpleRowConverter<NumberType<NUM_TYPE>>>(pipeline)
-                        }
-                    }),
-                    _ => self.visit_type::<CommonRows, CommonConverter>(pipeline)
-                }
-                }
-            }
-            _ => self.visit_type::<CommonRows, CommonConverter>(pipeline),
-        }
-    }
-
-    fn visit_type<R, C>(&self, pipeline: &mut Pipeline) -> Result<()>
-    where
-        R: Rows + 'static,
-        C: RowConverter<R> + Send + 'static,
-        R::Type: ArgType + Send + Sync,
-        <R::Type as AccessType>::Scalar: Ord + Send + Sync,
-    {
         pipeline.try_add_transformer(|| {
-            TransformAddOrderColumn::<R, C>::try_new(self.sort_desc.clone(), self.schema.clone())
+            TransformAddOrderColumn::try_new(self.sort_desc.clone(), self.schema.clone())
         })?;
-        let offset = self.schema.num_fields();
+        let offset = self.schema.num_fields() - 1;
         pipeline.add_accumulating_transformer(|| {
-            TransformReclusterCollect::<R::Type>::new(offset, self.sample_size, self.seed)
+            TransformReclusterCollect::new(offset, self.sample_size, self.seed)
         });
         pipeline.add_transform(|input, output| {
-            Ok(ProcessorPtr::create(TransformRangePartitionIndexer::<
-                R::Type,
-            >::create(
-                input,
-                output,
-                self.state.clone().unwrap(),
-            )))
+            Ok(ProcessorPtr::create(
+                TransformRangePartitionIndexer::create(input, output, self.state.clone().unwrap()),
+            ))
         })
     }
 }
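
The net effect in this file: build_recluster_sample_pipeline no longer monomorphizes per key type through match_template and visit_type::<R, C>; the sort key is materialized once as a binary order column, and everything downstream compares raw bytes. That is sound only if the row encoding is order-preserving, i.e. byte-wise comparison of encoded keys agrees with the logical order. A self-contained illustration of the property (std-only Rust; big-endian unsigned integers are the simplest order-preserving encoding, standing in for the real row format):

// Encode so that lexicographic byte comparison matches numeric order.
fn encode_u32_ordered(v: u32) -> Vec<u8> {
    v.to_be_bytes().to_vec()
}

fn main() {
    let mut keys = vec![300u32, 7, 1_000_000, 42];
    let mut encoded: Vec<Vec<u8>> = keys.iter().copied().map(encode_u32_ordered).collect();

    keys.sort();    // numeric order
    encoded.sort(); // byte-wise order

    let decoded: Vec<u32> = encoded
        .iter()
        .map(|b| u32::from_be_bytes([b[0], b[1], b[2], b[3]]))
        .collect();
    assert_eq!(keys, decoded); // same ordering either way
}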

src/query/service/src/pipelines/processors/transforms/recluster/mod.rs
Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ pub use range_bound_sampler::RangeBoundSampler;
 pub use recluster_partition_exchange::ReclusterPartitionExchange;
 pub use recluster_partition_strategy::CompactPartitionStrategy;
 pub use recluster_partition_strategy::ReclusterPartitionStrategy;
+pub use recluster_partition_strategy::ReclusterPartitionStrategys;
 pub use recluster_sample_state::SampleState;
 pub use transform_add_order_column::TransformAddOrderColumn;
 pub use transform_range_partition_indexer::TransformRangePartitionIndexer;

src/query/service/src/pipelines/processors/transforms/recluster/range_bound_sampler.rs
Lines changed: 7 additions & 26 deletions
@@ -12,47 +12,32 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use std::marker::PhantomData;
-
-use databend_common_expression::types::ArgType;
-use databend_common_expression::types::ValueType;
 use databend_common_expression::DataBlock;
-use databend_common_expression::Scalar;
 use rand::prelude::SliceRandom;
 use rand::prelude::SmallRng;
 use rand::SeedableRng;

-pub struct RangeBoundSampler<T>
-where T: ValueType
-{
+pub struct RangeBoundSampler {
     offset: usize,
     sample_size: usize,
     rng: SmallRng,

-    values: Vec<(u64, Vec<Scalar>)>,
-    _t: PhantomData<T>,
+    values: Vec<(u64, Vec<Vec<u8>>)>,
 }

-impl<T> RangeBoundSampler<T>
-where T: ValueType
-{
+impl RangeBoundSampler {
     pub fn new(offset: usize, sample_size: usize, seed: u64) -> Self {
         let rng = SmallRng::seed_from_u64(seed);
         Self {
             offset,
             sample_size,
             rng,
             values: vec![],
-            _t: PhantomData,
         }
     }
 }

-impl<T> RangeBoundSampler<T>
-where
-    T: ArgType,
-    T::Scalar: Ord + Send,
-{
+impl RangeBoundSampler {
     pub fn add_block(&mut self, data: &DataBlock) {
         let rows = data.num_rows();
         assert!(rows > 0);
@@ -63,19 +48,15 @@ where
         indices.shuffle(&mut self.rng);
         let sampled_indices = &indices[..sample_size];

-        let column = T::try_downcast_column(&column).unwrap();
+        let column = column.as_binary().unwrap();
         let sample_values = sampled_indices
             .iter()
-            .map(|i| {
-                T::upcast_scalar(T::to_owned_scalar(unsafe {
-                    T::index_column_unchecked(&column, *i)
-                }))
-            })
+            .map(|i| unsafe { column.index_unchecked(*i) }.to_vec())
             .collect::<Vec<_>>();
         self.values.push((rows as u64, sample_values));
     }

-    pub fn sample_values(&mut self) -> Vec<(u64, Vec<Scalar>)> {
+    pub fn sample_values(&mut self) -> Vec<(u64, Vec<Vec<u8>>)> {
         std::mem::take(&mut self.values)
     }
 }
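
With the generics gone, the sampler's job reads directly off the code: shuffle the block's row indices with a seeded SmallRng, keep the first sample_size of them, and copy the sampled key bytes out of the binary column. A stand-alone sketch under that reading (the rand calls mirror the diff; DataBlock and the binary column are simplified here to a slice of byte vectors, and the offset field is omitted):

use rand::prelude::SliceRandom;
use rand::prelude::SmallRng;
use rand::SeedableRng;

// Simplified stand-in for the de-generified RangeBoundSampler: keys are raw bytes.
struct ByteSampler {
    sample_size: usize,
    rng: SmallRng,
    values: Vec<(u64, Vec<Vec<u8>>)>,
}

impl ByteSampler {
    fn new(sample_size: usize, seed: u64) -> Self {
        Self {
            sample_size,
            rng: SmallRng::seed_from_u64(seed),
            values: vec![],
        }
    }

    // `keys` plays the role of the block's binary order column.
    fn add_block(&mut self, keys: &[Vec<u8>]) {
        let rows = keys.len();
        assert!(rows > 0);
        let sample_size = self.sample_size.min(rows);

        let mut indices: Vec<usize> = (0..rows).collect();
        indices.shuffle(&mut self.rng); // same shuffle-then-take scheme as the diff
        let sampled = indices[..sample_size]
            .iter()
            .map(|&i| keys[i].clone())
            .collect::<Vec<_>>();
        self.values.push((rows as u64, sampled));
    }
}

fn main() {
    let mut sampler = ByteSampler::new(2, 42);
    let keys: Vec<Vec<u8>> = (0u32..10).map(|v| v.to_be_bytes().to_vec()).collect();
    sampler.add_block(&keys);

    let (rows, sample) = &sampler.values[0];
    assert_eq!(*rows, 10); // row count is recorded alongside the sample
    assert_eq!(sample.len(), 2);
}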
