From 3c23d9131b2cef4f624c958681414aef8c8cccd2 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 8 Jun 2025 13:39:14 -0500 Subject: [PATCH 01/25] implement dynamic filter pushdown for topk operator --- datafusion/common/src/config.rs | 7 + datafusion/core/tests/fuzz_cases/mod.rs | 1 + .../tests/fuzz_cases/topk_filter_pushdown.rs | 354 ++++++++++++++++++ .../physical_optimizer/filter_pushdown/mod.rs | 94 ++++- .../filter_pushdown/util.rs | 14 + .../physical-expr/src/expressions/mod.rs | 1 + .../physical-optimizer/src/optimizer.rs | 8 +- datafusion/physical-plan/src/sorts/sort.rs | 43 ++- datafusion/physical-plan/src/topk/mod.rs | 169 ++++++++- .../test_files/information_schema.slt | 2 + .../test_files/parquet_filter_pushdown.slt | 20 +- .../test_files/push_down_filter.slt | 31 +- datafusion/sqllogictest/test_files/topk.slt | 14 +- docs/source/user-guide/configs.md | 1 + 14 files changed, 724 insertions(+), 35 deletions(-) create mode 100644 datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 4efb67a37c99..8ab7f06dc7b5 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -614,6 +614,13 @@ config_namespace! { /// during aggregations, if possible pub enable_topk_aggregation: bool, default = true + /// When set to true attempts to push down dynamic filters generated by operators into the file scan phase. + /// For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer + /// will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. + /// This means that if we already have 10 timestamps in the year 2025 + /// any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. + pub enable_dynamic_filter_pushdown: bool, default = true + /// When set to true, the optimizer will insert filters before a join between /// a nullable and non-nullable column to filter out nulls on the nullable side. This /// filter can add additional overhead when the file format does not fully support diff --git a/datafusion/core/tests/fuzz_cases/mod.rs b/datafusion/core/tests/fuzz_cases/mod.rs index 8ccc2a5bc131..9e01621c0257 100644 --- a/datafusion/core/tests/fuzz_cases/mod.rs +++ b/datafusion/core/tests/fuzz_cases/mod.rs @@ -21,6 +21,7 @@ mod join_fuzz; mod merge_fuzz; mod sort_fuzz; mod sort_query_fuzz; +mod topk_filter_pushdown; mod aggregation_fuzzer; mod equivalence; diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs new file mode 100644 index 000000000000..2b1b905d9390 --- /dev/null +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -0,0 +1,354 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; + +use arrow::array::{Int32Array, StringArray, StringDictionaryBuilder}; +use arrow::datatypes::Int32Type; +use arrow::record_batch::RecordBatch; +use arrow::util::pretty::pretty_format_batches; +use arrow_schema::{DataType, Field, Schema}; +use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig}; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_datasource::ListingTableUrl; +use datafusion_datasource_parquet::ParquetFormat; +use datafusion_execution::object_store::ObjectStoreUrl; +use itertools::Itertools; +use object_store::memory::InMemory; +use object_store::path::Path; +use object_store::{ObjectStore, PutPayload}; +use parquet::arrow::ArrowWriter; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use tokio::sync::Mutex; +use tokio::task::JoinSet; + +#[derive(Clone)] +struct TestDataSet { + store: Arc, + schema: Arc, +} + +/// List of in memory parquet files with UTF8 data +// Use a mutex rather than LazyLock to allow for async initialization +static TESTFILES: LazyLock>> = + LazyLock::new(|| Mutex::new(vec![])); + +async fn test_files() -> Vec { + let files_mutex = &TESTFILES; + let mut files = files_mutex.lock().await; + if !files.is_empty() { + return (*files).clone(); + } + + let mut rng = StdRng::seed_from_u64(0); + + for nulls_in_ids in [false, true] { + for nulls_in_names in [false, true] { + for nulls_in_departments in [false, true] { + let store = Arc::new(InMemory::new()); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, nulls_in_ids), + Field::new("name", DataType::Utf8, nulls_in_names), + Field::new( + "department", + DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + ), + nulls_in_departments, + ), + ])); + + let name_choices = if nulls_in_names { + [Some("Alice"), Some("Bob"), None, Some("David"), None] + } else { + [ + Some("Alice"), + Some("Bob"), + Some("Charlie"), + Some("David"), + Some("Eve"), + ] + }; + + let department_choices = if nulls_in_departments { + [ + Some("Theater"), + Some("Engineering"), + None, + Some("Arts"), + None, + ] + } else { + [ + Some("Theater"), + Some("Engineering"), + Some("Healthcare"), + Some("Arts"), + Some("Music"), + ] + }; + + // Generate 5 files, some with overlapping or repeated ids some without + for i in 0..5 { + let num_batches = rng.gen_range(1..3); + let mut batches = Vec::with_capacity(num_batches); + for _ in 0..num_batches { + let num_rows = 25; + let ids = Int32Array::from_iter((0..num_rows).map(|file| { + if nulls_in_ids { + if rng.gen_bool(1.0 / 10.0) { + None + } else { + Some(rng.gen_range(file..file + 5)) + } + } else { + Some(rng.gen_range(file..file + 5)) + } + })); + let names = StringArray::from_iter((0..num_rows).map(|_| { + // randomly select a name + let idx = rng.gen_range(0..name_choices.len()); + name_choices[idx].map(|s| s.to_string()) + })); + let mut departments = StringDictionaryBuilder::::new(); + for _ in 0..num_rows { + // randomly select a department + let idx = 
rng.gen_range(0..department_choices.len()); + departments.append_option(department_choices[idx].as_ref()); + } + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(ids), + Arc::new(names), + Arc::new(departments.finish()), + ], + ) + .unwrap(); + batches.push(batch); + } + let mut buf = vec![]; + { + let mut writer = + ArrowWriter::try_new(&mut buf, schema.clone(), None).unwrap(); + for batch in batches { + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + } + writer.flush().unwrap(); + writer.finish().unwrap(); + } + let payload = PutPayload::from(buf); + let path = Path::from(format!("file_{i}.parquet")); + store.put(&path, payload).await.unwrap(); + } + files.push(TestDataSet { store, schema }); + } + } + } + (*files).clone() +} + +async fn run_query_with_config( + query: &str, + config: SessionConfig, + dataset: TestDataSet, +) -> Vec { + let store = dataset.store; + let schema = dataset.schema; + let ctx = SessionContext::new_with_config(config); + let url = ObjectStoreUrl::parse("memory://").unwrap(); + ctx.register_object_store(url.as_ref(), store.clone()); + + let format = Arc::new( + ParquetFormat::default() + .with_options(ctx.state().table_options().parquet.clone()), + ); + let options = ListingOptions::new(format); + let table_path = ListingTableUrl::parse("memory:///").unwrap(); + let config = ListingTableConfig::new(table_path) + .with_listing_options(options) + .with_schema(schema); + let table = Arc::new(ListingTable::try_new(config).unwrap()); + + ctx.register_table("test_table", table).unwrap(); + + ctx.sql(query).await.unwrap().collect().await.unwrap() +} + +#[derive(Debug)] +struct RunQueryResult { + query: String, + result: Vec, + expected: Vec, +} + +impl RunQueryResult { + fn expected_formated(&self) -> String { + format!("{}", pretty_format_batches(&self.expected).unwrap()) + } + + fn result_formated(&self) -> String { + format!("{}", pretty_format_batches(&self.result).unwrap()) + } + + fn is_ok(&self) -> bool { + self.expected_formated() == self.result_formated() + } +} + +async fn run_query( + query: String, + cfg: SessionConfig, + dataset: TestDataSet, +) -> RunQueryResult { + let cfg_with_dynamic_filters = cfg + .clone() + .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", true); + let cfg_without_dynamic_filters = cfg + .clone() + .set_bool("datafusion.optimizer.enable_dynamic_filter_pushdown", false); + + let expected_result = + run_query_with_config(&query, cfg_without_dynamic_filters, dataset.clone()).await; + let result = + run_query_with_config(&query, cfg_with_dynamic_filters, dataset.clone()).await; + + RunQueryResult { + query: query.to_string(), + result, + expected: expected_result, + } +} + +struct TestCase { + query: String, + cfg: SessionConfig, + dataset: TestDataSet, +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_fuzz_topk_filter_pushdown() { + let order_columns = ["id", "name", "department"]; + let order_directions = ["ASC", "DESC"]; + let null_orders = ["NULLS FIRST", "NULLS LAST"]; + + let start = datafusion_common::instant::Instant::now(); + let mut orders: HashMap> = HashMap::new(); + for order_column in &order_columns { + for order_direction in &order_directions { + for null_order in &null_orders { + // if there is a vec for this column insert the order, otherwise create a new vec + let ordering = + format!("{} {} {}", order_column, order_direction, null_order); + match orders.get_mut(*order_column) { + Some(order_vec) => { + order_vec.push(ordering); + } + None => { + 
orders.insert(order_column.to_string(), vec![ordering]); + } + } + } + } + } + + let mut queries = vec![]; + + for limit in [1, 10] { + for num_order_by_columns in [1, 2, 3] { + for order_columns in ["id", "name", "department"] + .iter() + .combinations(num_order_by_columns) + { + for orderings in order_columns + .iter() + .map(|col| orders.get(**col).unwrap()) + .multi_cartesian_product() + { + let query = format!( + "SELECT * FROM test_table ORDER BY {} LIMIT {}", + orderings.into_iter().join(", "), + limit + ); + queries.push(query); + } + } + } + } + + queries.sort_unstable(); + println!( + "Generated {} queries in {:?}", + queries.len(), + start.elapsed() + ); + + let start = datafusion_common::instant::Instant::now(); + let datasets = test_files().await; + println!("Generated test files in {:?}", start.elapsed()); + + let mut test_cases = vec![]; + for enable_filter_pushdown in [true, false] { + for query in &queries { + for dataset in &datasets { + let mut cfg = SessionConfig::new(); + cfg = cfg.set_bool( + "datafusion.optimizer.enable_dynamic_filter_pushdown", + enable_filter_pushdown, + ); + test_cases.push(TestCase { + query: query.to_string(), + cfg, + dataset: dataset.clone(), + }); + } + } + } + + let start = datafusion_common::instant::Instant::now(); + let mut join_set = JoinSet::new(); + for tc in test_cases { + join_set.spawn(run_query(tc.query, tc.cfg, tc.dataset)); + } + let mut results = join_set.join_all().await; + results.sort_unstable_by(|a, b| a.query.cmp(&b.query)); + println!("Ran {} test cases in {:?}", results.len(), start.elapsed()); + + let failures = results + .iter() + .filter(|result| !result.is_ok()) + .collect::>(); + + for failure in &failures { + println!("Failure:"); + println!("Query:\n{}", failure.query); + println!("\nExpected:\n{}", failure.expected_formated()); + println!("\nResult:\n{}", failure.result_formated()); + println!("\n\n"); + } + + if !failures.is_empty() { + panic!("Some test cases failed"); + } else { + println!("All test cases passed"); + } +} diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index 16dba7e3f205..c9c8530b9b81 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -17,28 +17,40 @@ use std::sync::{Arc, LazyLock}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::{ + array::record_batch, + datatypes::{DataType, Field, Schema, SchemaRef}, +}; +use arrow_schema::SortOptions; use datafusion::{ logical_expr::Operator, physical_plan::{ expressions::{BinaryExpr, Column, Literal}, PhysicalExpr, }, + prelude::{SessionConfig, SessionContext}, scalar::ScalarValue, }; use datafusion_common::config::ConfigOptions; +use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_functions_aggregate::count::count_udaf; -use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::{aggregate::AggregateExprBuilder, Partitioning}; -use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; +use datafusion_physical_expr::{expressions::col, LexOrdering, PhysicalSortExpr}; +use datafusion_physical_optimizer::{ + filter_pushdown::FilterPushdown, PhysicalOptimizerRule, +}; use datafusion_physical_plan::{ aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}, coalesce_batches::CoalesceBatchesExec, filter::FilterExec, repartition::RepartitionExec, + sorts::sort::SortExec, + ExecutionPlan, }; -use 
util::{OptimizationTest, TestNode, TestScanBuilder}; +use futures::StreamExt; +use object_store::memory::InMemory; +use util::{format_plan_for_test, OptimizationTest, TestNode, TestScanBuilder}; mod util; @@ -346,6 +358,80 @@ fn test_node_handles_child_pushdown_result() { ); } +#[tokio::test] +async fn test_topk_dynamic_filter_pushdown() { + // This test is a bit of a hack, but it shows that we can push down dynamic filters + // into the DataSourceExec. The test is not perfect because we don't have a real + // implementation of the dynamic filter yet, so we just use a static filter. + let batches = vec![ + record_batch!( + ("a", Utf8, ["aa", "ab"]), + ("b", Utf8, ["bd", "bc"]), + ("c", Float64, [1.0, 2.0]) + ) + .unwrap(), + record_batch!( + ("a", Utf8, ["ac", "ad"]), + ("b", Utf8, ["bb", "ba"]), + ("c", Float64, [2.0, 1.0]) + ) + .unwrap(), + ]; + let scan = TestScanBuilder::new(schema()) + .with_support(true) + .with_batches(batches) + .build(); + let plan = Arc::new( + SortExec::new( + LexOrdering::new(vec![PhysicalSortExpr::new( + col("b", &schema()).unwrap(), + SortOptions::new(true, false), // descending, nulls_first + )]), + Arc::clone(&scan), + ) + .with_fetch(Some(1)), + ) as Arc; + + // expect the predicate to be pushed down into the DataSource + insta::assert_snapshot!( + OptimizationTest::new(Arc::clone(&plan), FilterPushdown{}, true), + @r" + OptimizationTest: + input: + - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + output: + Ok: + - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] + - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] + " + ); + + // Actually apply the optimization to the plan + let mut config = ConfigOptions::default(); + config.execution.parquet.pushdown_filters = true; + let plan = FilterPushdown {}.optimize(plan, &config).unwrap(); + let config = SessionConfig::new().with_batch_size(2); + let session_ctx = SessionContext::new_with_config(config); + session_ctx.register_object_store( + ObjectStoreUrl::parse("test://").unwrap().as_ref(), + Arc::new(InMemory::new()), + ); + let state = session_ctx.state(); + let task_ctx = state.task_ctx(); + let mut stream = plan.execute(0, Arc::clone(&task_ctx)).unwrap(); + // Iterate one batch + stream.next().await.unwrap().unwrap(); + // Now check what our filter looks like + insta::assert_snapshot!( + format!("{}", format_plan_for_test(&plan)), + @r" + - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false], filter=[b@1 > bd] + - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ b@1 > bd ] + " + ); +} + /// Schema: /// a: String /// b: String diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index c60d2b4d8187..f99f0fa54a14 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -271,6 +271,11 @@ impl TestScanBuilder { self } + pub fn with_batches(mut self, batches: Vec) -> Self { + self.batches = batches; + self + } + pub fn build(self) -> Arc { let source = 
Arc::new(TestSource::new(self.support, self.batches)); let base_config = FileScanConfigBuilder::new( @@ -426,6 +431,15 @@ fn format_lines(s: &str) -> Vec { s.trim().split('\n').map(|s| s.to_string()).collect() } +pub fn format_plan_for_test(plan: &Arc) -> String { + let mut out = String::new(); + for line in format_execution_plan(plan) { + out.push_str(&format!(" - {line}\n")); + } + out.push('\n'); + out +} + #[derive(Debug)] pub(crate) struct TestNode { inject_filter: bool, diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index d77207fbbcd7..8f46133ed0bb 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -43,6 +43,7 @@ pub use case::{case, CaseExpr}; pub use cast::{cast, CastExpr}; pub use column::{col, with_new_schema, Column}; pub use datafusion_expr::utils::format_state_name; +pub use dynamic_filters::DynamicFilterPhysicalExpr; pub use in_list::{in_list, InListExpr}; pub use is_not_null::{is_not_null, IsNotNullExpr}; pub use is_null::{is_null, IsNullExpr}; diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index d5129cea9d4e..d5af8a4b2d54 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -96,10 +96,6 @@ impl PhysicalOptimizer { // as that rule may inject other operations in between the different AggregateExecs. // Applying the rule early means only directly-connected AggregateExecs must be examined. Arc::new(LimitedDistinctAggregation::new()), - // The FilterPushdown rule tries to push down filters as far as it can. - // For example, it will push down filtering from a `FilterExec` to - // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`. - Arc::new(FilterPushdown::new()), // The EnforceDistribution rule is for adding essential repartitioning to satisfy distribution // requirements. Please make sure that the whole plan tree is determined before this rule. // This rule increases parallelism if doing so is beneficial to the physical plan; i.e. at @@ -139,6 +135,10 @@ impl PhysicalOptimizer { // reduced by narrowing their input tables. Arc::new(ProjectionPushdown::new()), Arc::new(InsertYieldExec::new()), + // The FilterPushdown rule tries to push down filters as far as it can. + // For example, it will push down filtering from a `FilterExec` to + // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`. + Arc::new(FilterPushdown::new()), // The SanityCheckPlan rule checks whether the order and // distribution requirements of each node in the plan // is satisfied. 
It will also reject non-runnable query diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index c31f17291e19..59fdaabe78ec 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -27,6 +27,7 @@ use std::sync::Arc; use crate::common::spawn_buffered; use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType}; use crate::expressions::PhysicalSortExpr; +use crate::filter_pushdown::FilterDescription; use crate::limit::LimitStream; use crate::metrics::{ BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, SpillMetrics, @@ -53,6 +54,8 @@ use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_physical_expr::LexOrdering; +use datafusion_physical_expr::expressions::{lit, DynamicFilterPhysicalExpr}; +use datafusion_physical_expr::PhysicalExpr; use futures::{StreamExt, TryStreamExt}; use log::{debug, trace}; @@ -843,6 +846,8 @@ pub struct SortExec { common_sort_prefix: Vec, /// Cache holding plan properties like equivalences, output partitioning etc. cache: PlanProperties, + /// Filter matching the state of the sort for dynamic filter pushdown + filter: Option>, } impl SortExec { @@ -861,6 +866,7 @@ impl SortExec { fetch: None, common_sort_prefix: sort_prefix, cache, + filter: None, } } @@ -906,6 +912,14 @@ impl SortExec { if fetch.is_some() && is_pipeline_friendly { cache = cache.with_boundedness(Boundedness::Bounded); } + let filter = fetch.is_some().then(|| { + let children = self + .expr + .iter() + .map(|sort_expr| Arc::clone(&sort_expr.expr)) + .collect::>(); + Arc::new(DynamicFilterPhysicalExpr::new(children, lit(true))) + }); SortExec { input: Arc::clone(&self.input), expr: self.expr.clone(), @@ -914,6 +928,7 @@ impl SortExec { common_sort_prefix: self.common_sort_prefix.clone(), fetch, cache, + filter, } } @@ -1009,6 +1024,13 @@ impl DisplayAs for SortExec { match self.fetch { Some(fetch) => { write!(f, "SortExec: TopK(fetch={fetch}), expr=[{}], preserve_partitioning=[{preserve_partitioning}]", self.expr)?; + if let Some(filter) = &self.filter { + if let Ok(current) = filter.current() { + if !current.eq(&lit(true)) { + write!(f, ", filter=[{}]", current)?; + } + } + } if !self.common_sort_prefix.is_empty() { write!(f, ", sort_prefix=[")?; let mut first = true; @@ -1079,9 +1101,10 @@ impl ExecutionPlan for SortExec { self: Arc, children: Vec>, ) -> Result> { - let new_sort = SortExec::new(self.expr.clone(), Arc::clone(&children[0])) + let mut new_sort = SortExec::new(self.expr.clone(), Arc::clone(&children[0])) .with_fetch(self.fetch) .with_preserve_partitioning(self.preserve_partitioning); + new_sort.filter = self.filter.clone(); Ok(Arc::new(new_sort)) } @@ -1122,6 +1145,7 @@ impl ExecutionPlan for SortExec { context.session_config().batch_size(), context.runtime_env(), &self.metrics_set, + self.filter.clone(), )?; Ok(Box::pin(RecordBatchStreamAdapter::new( self.schema(), @@ -1228,6 +1252,23 @@ impl ExecutionPlan for SortExec { .with_preserve_partitioning(self.preserve_partitioning()), ))) } + + fn gather_filters_for_pushdown( + &self, + parent_filters: Vec>, + config: &datafusion_common::config::ConfigOptions, + ) -> Result { + if let Some(filter) = &self.filter { + if config.optimizer.enable_dynamic_filter_pushdown { + let filter = Arc::clone(filter) as Arc; + return Ok(FilterDescription::new_with_child_count(1) + 
.all_parent_filters_supported(parent_filters) + .with_self_filter(filter)); + } + } + Ok(FilterDescription::new_with_child_count(1) + .all_parent_filters_supported(parent_filters)) + } } #[cfg(test)] diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index d8b6f0e400b8..307e8f1eab84 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -17,6 +17,12 @@ //! TopK: Combination of Sort / LIMIT +use arrow::{ + array::Array, + compute::interleave_record_batch, + row::{RowConverter, Rows, SortField}, +}; +use datafusion_expr::{ColumnarValue, Operator}; use std::mem::size_of; use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc}; @@ -25,15 +31,17 @@ use crate::spill::get_record_batch_memory_size; use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}; use arrow::array::{ArrayRef, RecordBatch}; -use arrow::compute::interleave_record_batch; use arrow::datatypes::SchemaRef; -use arrow::row::{RowConverter, Rows, SortField}; -use datafusion_common::{internal_datafusion_err, HashMap, Result}; +use datafusion_common::{internal_datafusion_err, internal_err, HashMap, Result, ScalarValue}; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, runtime_env::RuntimeEnv, }; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; +use datafusion_physical_expr::{ + expressions::{is_not_null, is_null, lit, BinaryExpr, DynamicFilterPhysicalExpr}, + PhysicalExpr, +}; /// Global TopK /// @@ -110,6 +118,8 @@ pub struct TopK { common_sort_prefix_converter: Option, /// Common sort prefix between the input and the sort expressions to allow early exit optimization common_sort_prefix: Arc<[PhysicalSortExpr]>, + /// Filter matching the state of the `TopK` heap used for dynamic filter pushdown + filter: Option>, /// If true, indicates that all rows of subsequent batches are guaranteed /// to be greater (by byte order, after row conversion) than the top K, /// which means the top K won't change and the computation can be finished early. @@ -148,6 +158,7 @@ impl TopK { batch_size: usize, runtime: Arc, metrics: &ExecutionPlanMetricsSet, + filter: Option>, ) -> Result { let reservation = MemoryConsumer::new(format!("TopK[{partition_id}]")) .register(&runtime.memory_pool); @@ -179,6 +190,7 @@ impl TopK { common_sort_prefix_converter: prefix_row_converter, common_sort_prefix: Arc::from(common_sort_prefix), finished: false, + filter, }) } @@ -207,6 +219,7 @@ impl TopK { // Idea: filter out rows >= self.heap.max() early (before passing to `RowConverter`) // this avoids some work and also might be better vectorizable. let mut batch_entry = self.heap.register_batch(batch.clone()); + let mut updated = false; for (index, row) in rows.iter().enumerate() { match self.heap.max() { // heap has k items, and the new row is greater than the @@ -216,6 +229,7 @@ impl TopK { None | Some(_) => { self.heap.add(&mut batch_entry, row, index); self.metrics.row_replacements.add(1); + updated = true; } } } @@ -227,10 +241,108 @@ impl TopK { // update memory reservation self.reservation.try_resize(self.size())?; - // flag the topK as finished if we know that all - // subsequent batches are guaranteed to be greater (by byte order, after row conversion) than the top K, - // which means the top K won't change and the computation can be finished early. 
- self.attempt_early_completion(&batch)?; + if updated { + // flag the topK as finished if we know that all + // subsequent batches are guaranteed to be greater (by byte order, after row conversion) than the top K, + // which means the top K won't change and the computation can be finished early. + self.attempt_early_completion(&batch)?; + // update the filter representation of our TopK heap + self.update_filter()?; + } + + Ok(()) + } + + /// Update the filter representation of our TopK heap + fn update_filter(&mut self) -> Result<()> { + let Some(filter) = &self.filter else { + return Ok(()); + }; + if let Some(thresholds) = self.heap.get_threshold_values(&self.expr)? { + // Create filter expressions for each threshold + let mut filters: Vec> = + Vec::with_capacity(thresholds.len()); + + let mut prev_sort_expr: Option> = None; + for (sort_expr, value) in self.expr.iter().zip(thresholds.iter()) { + // Create the appropriate operator based on sort order + let op = if sort_expr.options.descending { + // For descending sort, we want col > threshold (exclude smaller values) + Operator::Gt + } else { + // For ascending sort, we want col < threshold (exclude larger values) + Operator::Lt + }; + + let value_null = value.is_null(); + + let comparison = Arc::new(BinaryExpr::new( + Arc::clone(&sort_expr.expr), + op, + lit(value.clone()), + )); + + let comparison_with_null = + match (sort_expr.options.nulls_first, value_null) { + // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison) + (true, true) => lit(false), + (true, false) => Arc::new(BinaryExpr::new( + is_null(Arc::clone(&sort_expr.expr))?, + Operator::Or, + comparison, + )), + // For nulls last, transform to (threshold.value is null and threshold.expr is not null) + // or (threshold.value is not null and comparison) + (false, true) => is_not_null(Arc::clone(&sort_expr.expr))?, + (false, false) => comparison, + }; + + let mut eq_expr = Arc::new(BinaryExpr::new( + Arc::clone(&sort_expr.expr), + Operator::Eq, + lit(value.clone()), + )); + + if value_null { + eq_expr = Arc::new(BinaryExpr::new( + is_null(Arc::clone(&sort_expr.expr))?, + Operator::Or, + eq_expr, + )); + } + + // For a query like order by a, b, the filter for column `b` is only applied if + // the condition a = threshold.value (considering null equality) is met. + // Therefore, we add equality predicates for all preceding fields to the filter logic of the current field, + // and include the current field's equality predicate in `prev_sort_expr` for use with subsequent fields. 
+ match prev_sort_expr.take() { + None => { + prev_sort_expr = Some(eq_expr); + filters.push(comparison_with_null); + } + Some(p) => { + filters.push(Arc::new(BinaryExpr::new( + Arc::clone(&p), + Operator::And, + comparison_with_null, + ))); + + prev_sort_expr = + Some(Arc::new(BinaryExpr::new(p, Operator::And, eq_expr))); + } + } + } + + let dynamic_predicate = filters + .into_iter() + .reduce(|a, b| Arc::new(BinaryExpr::new(a, Operator::Or, b))); + + if let Some(predicate) = dynamic_predicate { + if !predicate.eq(&lit(true)) { + filter.update(predicate)?; + } + } + } Ok(()) } @@ -324,6 +436,7 @@ impl TopK { common_sort_prefix_converter: _, common_sort_prefix: _, finished: _, + filter: _, } = self; let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop @@ -566,6 +679,47 @@ impl TopKHeap { + self.store.size() + self.owned_bytes } + + fn get_threshold_values( + &self, + sort_exprs: &[PhysicalSortExpr], + ) -> Result>> { + // If the heap doesn't have k elements yet, we can't create thresholds + let max_row = match self.max() { + Some(row) => row, + None => return Ok(None), + }; + + // Get the batch that contains the max row + let batch_entry = match self.store.get(max_row.batch_id) { + Some(entry) => entry, + None => return internal_err!("Invalid batch ID in TopKRow"), + }; + + // Extract threshold values for each sort expression + let mut scalar_values = Vec::with_capacity(sort_exprs.len()); + for sort_expr in sort_exprs { + // Extract the value for this column from the max row + let expr = Arc::clone(&sort_expr.expr); + let value = expr.evaluate(&batch_entry.batch.slice(max_row.index, 1))?; + + // Convert to scalar value - should be a single value since we're evaluating on a single row batch + let scalar = match value { + ColumnarValue::Scalar(scalar) => scalar, + ColumnarValue::Array(array) if array.len() == 1 => { + // Extract the first (and only) value from the array + ScalarValue::try_from_array(&array, 0)? + } + array => { + return internal_err!("Expected a scalar value, got {:?}", array) + } + }; + + scalar_values.push(scalar); + } + + Ok(Some(scalar_values)) + } } /// Represents one of the top K rows held in this heap. 
Orders @@ -834,6 +988,7 @@ mod tests { 2, runtime, &metrics, + None, )?; // Create the first batch with two columns: diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index dc8b7680d83e..42d64c5ee155 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -285,6 +285,7 @@ datafusion.format.types_info false datafusion.optimizer.allow_symmetric_joins_without_pruning true datafusion.optimizer.default_filter_selectivity 20 datafusion.optimizer.enable_distinct_aggregation_soft_limit true +datafusion.optimizer.enable_dynamic_filter_pushdown true datafusion.optimizer.enable_round_robin_repartition true datafusion.optimizer.enable_topk_aggregation true datafusion.optimizer.expand_views_at_output false @@ -396,6 +397,7 @@ datafusion.format.types_info false Show types in visual representation batches datafusion.optimizer.allow_symmetric_joins_without_pruning true Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. datafusion.optimizer.default_filter_selectivity 20 The default filter selectivity used by Filter Statistics when an exact selectivity cannot be determined. Valid values are between 0 (no selectivity) and 100 (all rows are selected). datafusion.optimizer.enable_distinct_aggregation_soft_limit true When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. +datafusion.optimizer.enable_dynamic_filter_pushdown true When set to true attempts to push down dynamic filters generated by operators into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. datafusion.optimizer.enable_round_robin_repartition true When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores datafusion.optimizer.enable_topk_aggregation true When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible datafusion.optimizer.expand_views_at_output false When set to true, if the returned type is a view type then the output will be coerced to a non-view. Coerces `Utf8View` to `LargeUtf8`, and `BinaryView` to `LargeBinary`. 
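To make the mechanics behind `datafusion.optimizer.enable_dynamic_filter_pushdown` concrete, the sketch below shows roughly how the pieces introduced in this patch fit together for a single descending sort column (as in `SELECT * FROM t ORDER BY ts DESC LIMIT k`): `SortExec` seeds a shared `DynamicFilterPhysicalExpr` with `lit(true)`, `TopK::update_filter` tightens it once its heap is full, and a scan that received the filter during pushdown re-reads the latest predicate with `current()`. This is an illustration only, not code from the patch; `push_topk_threshold`, `sort_col`, and `kth_largest` are made-up names, and the real implementation also handles NULL ordering and multi-column sort keys as shown in `topk/mod.rs` above.

use std::sync::Arc;

use datafusion_common::{Result, ScalarValue};
use datafusion_expr::Operator;
use datafusion_physical_expr::expressions::{lit, BinaryExpr, DynamicFilterPhysicalExpr};
use datafusion_physical_expr::PhysicalExpr;

/// Illustrative sketch of the TopK -> scan handshake for one descending
/// sort column; names and signature are placeholders, not part of the patch.
fn push_topk_threshold(
    sort_col: Arc<dyn PhysicalExpr>,
    kth_largest: ScalarValue,
) -> Result<Arc<dyn PhysicalExpr>> {
    // SortExec creates the shared filter over its sort expressions,
    // initialized to the no-op predicate `true`.
    let filter = Arc::new(DynamicFilterPhysicalExpr::new(
        vec![Arc::clone(&sort_col)],
        lit(true),
    ));

    // Once the TopK heap holds k rows, only values strictly greater than the
    // current k-th largest can still change the result, so tighten the filter.
    filter.update(Arc::new(BinaryExpr::new(
        sort_col,
        Operator::Gt,
        lit(kth_largest),
    )) as Arc<dyn PhysicalExpr>)?;

    // A DataSourceExec that received this filter during pushdown evaluates the
    // latest predicate via `current()` and can prune files / row groups with it.
    filter.current()
}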
diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index e5b5f5ac878a..9f7d8e011f97 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -86,7 +86,10 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST] 02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------ProjectionExec: expr=[a@0 as a] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 +06)----------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[] # When filter pushdown *is* enabled, ParquetExec can filter exactly, @@ -134,7 +137,10 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST] 02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 > 2 AND a@0 IS NOT NULL, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2 AND a_null_count@3 != row_count@2, required_guarantees=[] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------ProjectionExec: expr=[a@0 as a] +05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 +06)----------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 > 2 AND a@0 IS NOT NULL, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2 AND a_null_count@3 != row_count@2, required_guarantees=[] query I @@ -153,7 +159,10 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [b@0 ASC NULLS LAST] 02)--SortExec: expr=[b@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[b], file_type=parquet, predicate=a@0 = bar, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= bar AND bar <= a_max@1, required_guarantees=[a in (bar)] +03)----CoalesceBatchesExec: target_batch_size=8192 +04)------ProjectionExec: expr=[b@1 as b] 
+05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 +06)----------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=a@0 = bar, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= bar AND bar <= a_max@1, required_guarantees=[a in (bar)] ## cleanup statement ok @@ -229,7 +238,10 @@ EXPLAIN select * from t_pushdown where val != 'c'; logical_plan 01)Filter: t_pushdown.val != Utf8("c") 02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != Utf8("c")] -physical_plan DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c)] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c)] # If we have a mix of filters: # - The partition filters get evaluated during planning diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index ed948dd11439..b4c74610f064 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -192,7 +192,8 @@ explain select * from test_filter_with_limit where value = 2 limit 1; ---- physical_plan 01)CoalescePartitionsExec: fetch=1 -02)--DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-2.parquet]]}, projection=[part_key, value], limit=1, file_type=parquet, predicate=value@1 = 2, pruning_predicate=value_null_count@2 != row_count@3 AND value_min@0 <= 2 AND 2 <= value_max@1, required_guarantees=[value in (2)] +02)--CoalesceBatchesExec: target_batch_size=8192, fetch=1 +03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-0.parquet], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-2.parquet]]}, projection=[part_key, value], file_type=parquet, predicate=value@1 = 2, pruning_predicate=value_null_count@2 != row_count@3 AND value_min@0 <= 2 AND 2 <= value_max@1, required_guarantees=[value in (2)] query II select * from test_filter_with_limit where value = 2 limit 1; @@ -229,43 +230,57 @@ LOCATION 'test_files/scratch/push_down_filter/t.parquet'; query TT explain select a from t where a = '100'; ---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)] # The predicate should not have a column cast when the value is a valid i32 query TT explain select a from t where a != '100'; ---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 != 100, pruning_predicate=a_null_count@2 != row_count@3 AND (a_min@0 != 100 OR 100 != a_max@1), required_guarantees=[a not in (100)] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 != 100, pruning_predicate=a_null_count@2 != row_count@3 AND (a_min@0 != 100 OR 100 != a_max@1), required_guarantees=[a not in (100)] # The predicate should still have the column cast when the value is a NOT valid i32 query TT explain select a from t where a = '99999999999'; ---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99999999999 +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99999999999 # The predicate should still have the column cast when the value is a NOT valid i32 query TT explain select a from t where a = '99.99'; ---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99.99 +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99.99 # The predicate should still have the column cast when the value is a NOT valid i32 query TT explain select a from t where a = ''; 
---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = # The predicate should not have a column cast when the operator is = or != and the literal can be round-trip casted without losing information. query TT explain select a from t where cast(a as string) = '100'; ---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)] # The predicate should still have the column cast when the literal alters its string representation after round-trip casting (leading zero lost). query TT explain select a from t where CAST(a AS string) = '0123'; ---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 0123 +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 0123 statement ok diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index 26fef6d666b1..9ff382d32af9 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -316,7 +316,7 @@ explain select number, letter, age from partial_sorted order by number desc, let ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC NULLS LAST, age@2 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC, letter@1 ASC NULLS LAST] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] # Explain variations of the above query with different orderings, and different sort prefixes. 
@@ -326,28 +326,28 @@ explain select number, letter, age from partial_sorted order by age desc limit 3 ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[age@2 DESC], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] query TT explain select number, letter, age from partial_sorted order by number desc, letter desc limit 3; ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] query TT explain select number, letter, age from partial_sorted order by number asc limit 3; ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 ASC NULLS LAST], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] query TT explain select number, letter, age from partial_sorted order by letter asc, number desc limit 3; ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[letter@1 ASC NULLS LAST, number@0 DESC], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] # Explicit NULLS ordering cases (reversing the order of the NULLS on the number and letter orderings) query TT @@ -355,14 +355,14 @@ explain select number, letter, age from partial_sorted order by number desc, let ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC], preserve_partitioning=[false], sort_prefix=[number@0 DESC] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, 
projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] query TT explain select number, letter, age from partial_sorted order by number desc NULLS LAST, letter asc limit 3; ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC NULLS LAST, letter@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] # Verify that the sort prefix is correctly computed on the normalized ordering (removing redundant aliased columns) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 42282e39e41f..fcb2360b87e3 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -101,6 +101,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. | | datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | | datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | +| datafusion.optimizer.enable_dynamic_filter_pushdown | true | When set to true attempts to push down dynamic filters generated by operators into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. | | datafusion.optimizer.filter_null_join_keys | false | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down. | | datafusion.optimizer.repartition_aggregations | true | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level | | datafusion.optimizer.repartition_file_min_size | 10485760 | Minimum total files size in bytes to perform file scan repartitioning. 
| From e5979d636da7f257d52e7eae5c726de794989e4d Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 8 Jun 2025 15:29:43 -0500 Subject: [PATCH 02/25] fix some tests --- .../tests/fuzz_cases/topk_filter_pushdown.rs | 12 +++---- .../physical_optimizer/filter_pushdown/mod.rs | 2 +- .../physical-optimizer/src/optimizer.rs | 6 ++++ datafusion/sqllogictest/test_files/limit.slt | 2 +- .../test_files/parquet_filter_pushdown.slt | 20 +++--------- .../test_files/push_down_filter.slt | 31 +++++-------------- 6 files changed, 26 insertions(+), 47 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs index 2b1b905d9390..2fa3575bb65a 100644 --- a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -108,30 +108,30 @@ async fn test_files() -> Vec { // Generate 5 files, some with overlapping or repeated ids some without for i in 0..5 { - let num_batches = rng.gen_range(1..3); + let num_batches = rng.random_range(1..3); let mut batches = Vec::with_capacity(num_batches); for _ in 0..num_batches { let num_rows = 25; let ids = Int32Array::from_iter((0..num_rows).map(|file| { if nulls_in_ids { - if rng.gen_bool(1.0 / 10.0) { + if rng.random_bool(1.0 / 10.0) { None } else { - Some(rng.gen_range(file..file + 5)) + Some(rng.random_range(file..file + 5)) } } else { - Some(rng.gen_range(file..file + 5)) + Some(rng.random_range(file..file + 5)) } })); let names = StringArray::from_iter((0..num_rows).map(|_| { // randomly select a name - let idx = rng.gen_range(0..name_choices.len()); + let idx = rng.random_range(0..name_choices.len()); name_choices[idx].map(|s| s.to_string()) })); let mut departments = StringDictionaryBuilder::::new(); for _ in 0..num_rows { // randomly select a department - let idx = rng.gen_range(0..department_choices.len()); + let idx = rng.random_range(0..department_choices.len()); departments.append_option(department_choices[idx].as_ref()); } let batch = RecordBatch::try_new( diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index c9c8530b9b81..3aefb4f7997c 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -386,7 +386,7 @@ async fn test_topk_dynamic_filter_pushdown() { LexOrdering::new(vec![PhysicalSortExpr::new( col("b", &schema()).unwrap(), SortOptions::new(true, false), // descending, nulls_first - )]), + )]).unwrap(), Arc::clone(&scan), ) .with_fetch(Some(1)), diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index d5af8a4b2d54..49245551645e 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -96,6 +96,12 @@ impl PhysicalOptimizer { // as that rule may inject other operations in between the different AggregateExecs. // Applying the rule early means only directly-connected AggregateExecs must be examined. Arc::new(LimitedDistinctAggregation::new()), + // The FilterPushdown rule tries to push down filters as far as it can. + // For example, it will push down filtering from a `FilterExec` to + // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`. 
+ // This must run before EnforceSorting to ensure dynamic filters from TopK operators + // are established before sorts are potentially recreated. + Arc::new(FilterPushdown::new()), // The EnforceDistribution rule is for adding essential repartitioning to satisfy distribution // requirements. Please make sure that the whole plan tree is determined before this rule. // This rule increases parallelism if doing so is beneficial to the physical plan; i.e. at diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index 9d5106bf2caf..3398fa29018b 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -854,7 +854,7 @@ physical_plan 01)ProjectionExec: expr=[1 as foo] 02)--SortPreservingMergeExec: [part_key@0 ASC NULLS LAST], fetch=1 03)----SortExec: TopK(fetch=1), expr=[part_key@0 ASC NULLS LAST], preserve_partitioning=[true] -04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..794]]}, projection=[part_key], file_type=parquet +04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..794]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ] query I with selection as ( diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index 9f7d8e011f97..e5b5f5ac878a 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -86,10 +86,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [a@0 ASC NULLS LAST] 02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----CoalesceBatchesExec: target_batch_size=8192 -04)------ProjectionExec: expr=[a@0 as a] -05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 -06)----------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[] +03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 > 2, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2, required_guarantees=[] # When filter pushdown *is* enabled, ParquetExec can filter exactly, @@ -137,10 +134,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [a@0 ASC NULLS 
LAST] 02)--SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----CoalesceBatchesExec: target_batch_size=8192 -04)------ProjectionExec: expr=[a@0 as a] -05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 -06)----------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=b@1 > 2 AND a@0 IS NOT NULL, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2 AND a_null_count@3 != row_count@2, required_guarantees=[] +03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a], file_type=parquet, predicate=b@1 > 2 AND a@0 IS NOT NULL, pruning_predicate=b_null_count@1 != row_count@2 AND b_max@0 > 2 AND a_null_count@3 != row_count@2, required_guarantees=[] query I @@ -159,10 +153,7 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [b@0 ASC NULLS LAST] 02)--SortExec: expr=[b@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----CoalesceBatchesExec: target_batch_size=8192 -04)------ProjectionExec: expr=[b@1 as b] -05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 -06)----------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[a, b], file_type=parquet, predicate=a@0 = bar, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= bar AND bar <= a_max@1, required_guarantees=[a in (bar)] +03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_table/2.parquet]]}, projection=[b], file_type=parquet, predicate=a@0 = bar, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= bar AND bar <= a_max@1, required_guarantees=[a in (bar)] ## cleanup statement ok @@ -238,10 +229,7 @@ EXPLAIN select * from t_pushdown where val != 'c'; logical_plan 01)Filter: t_pushdown.val != Utf8("c") 02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != Utf8("c")] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 -03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c)] +physical_plan DataSourceExec: file_groups={3 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c)] # If we have a mix of filters: # - The partition filters get evaluated during planning diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index b4c74610f064..ed948dd11439 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -192,8 +192,7 @@ explain select * from test_filter_with_limit where value = 2 limit 1; ---- physical_plan 01)CoalescePartitionsExec: fetch=1 -02)--CoalesceBatchesExec: target_batch_size=8192, fetch=1 -03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-2.parquet]]}, projection=[part_key, value], file_type=parquet, predicate=value@1 = 2, pruning_predicate=value_null_count@2 != row_count@3 AND value_min@0 <= 2 AND 2 <= value_max@1, required_guarantees=[value in (2)] +02)--DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_filter_with_limit/part-2.parquet]]}, projection=[part_key, value], limit=1, file_type=parquet, predicate=value@1 = 2, pruning_predicate=value_null_count@2 != row_count@3 AND value_min@0 <= 2 AND 2 <= value_max@1, required_guarantees=[value in (2)] query II select * from test_filter_with_limit where value = 2 limit 1; @@ -230,57 +229,43 @@ LOCATION 'test_files/scratch/push_down_filter/t.parquet'; query TT explain select a from t where a = '100'; ---- -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)] # The predicate should not have a column cast when the value is a valid i32 query TT explain select a from t where a != '100'; ---- -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], 
file_type=parquet, predicate=a@0 != 100, pruning_predicate=a_null_count@2 != row_count@3 AND (a_min@0 != 100 OR 100 != a_max@1), required_guarantees=[a not in (100)] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 != 100, pruning_predicate=a_null_count@2 != row_count@3 AND (a_min@0 != 100 OR 100 != a_max@1), required_guarantees=[a not in (100)] # The predicate should still have the column cast when the value is a NOT valid i32 query TT explain select a from t where a = '99999999999'; ---- -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99999999999 +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99999999999 # The predicate should still have the column cast when the value is a NOT valid i32 query TT explain select a from t where a = '99.99'; ---- -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99.99 +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99.99 # The predicate should still have the column cast when the value is a NOT valid i32 query TT explain select a from t where a = ''; ---- -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = # The predicate should not have a column cast when the operator is = or != and the literal can be round-trip casted without losing information. query TT explain select a from t where cast(a as string) = '100'; ---- -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)] +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 = 100, pruning_predicate=a_null_count@2 != row_count@3 AND a_min@0 <= 100 AND 100 <= a_max@1, required_guarantees=[a in (100)] # The predicate should still have the column cast when the literal alters its string representation after round-trip casting (leading zero lost). 
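# Worked illustration of the round-trip rule exercised by these tests: '100' casts to the
# i32 value 100 and back to the string '100', so the predicate can be rewritten onto the
# native column as a@0 = 100 (see the plan above); '0123' casts to 123, but 123 casts back
# to '123' != '0123', so rewriting would change results and the cast must stay on the
# column side, as the plan below shows (predicate=CAST(a@0 AS Utf8) = 0123).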
query TT explain select a from t where CAST(a AS string) = '0123'; ---- -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 0123 +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 0123 statement ok From d8670de7c1a5211449d836f38f0df659e3f25759 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sun, 8 Jun 2025 16:04:25 -0500 Subject: [PATCH 03/25] lint --- datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs | 2 +- .../core/tests/physical_optimizer/filter_pushdown/mod.rs | 8 +++----- datafusion/physical-plan/src/sorts/sort.rs | 4 ++-- datafusion/physical-plan/src/topk/mod.rs | 6 ++++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs index 2fa3575bb65a..572222b7b9da 100644 --- a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -258,7 +258,7 @@ async fn test_fuzz_topk_filter_pushdown() { for null_order in &null_orders { // if there is a vec for this column insert the order, otherwise create a new vec let ordering = - format!("{} {} {}", order_column, order_direction, null_order); + format!("{order_column} {order_direction} {null_order}"); match orders.get_mut(*order_column) { Some(order_vec) => { order_vec.push(ordering); diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index 3aefb4f7997c..30db366cd549 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -36,9 +36,7 @@ use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_functions_aggregate::count::count_udaf; use datafusion_physical_expr::{aggregate::AggregateExprBuilder, Partitioning}; use datafusion_physical_expr::{expressions::col, LexOrdering, PhysicalSortExpr}; -use datafusion_physical_optimizer::{ - filter_pushdown::FilterPushdown, PhysicalOptimizerRule, -}; +use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; use datafusion_physical_plan::{ aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}, coalesce_batches::CoalesceBatchesExec, @@ -386,7 +384,8 @@ async fn test_topk_dynamic_filter_pushdown() { LexOrdering::new(vec![PhysicalSortExpr::new( col("b", &schema()).unwrap(), SortOptions::new(true, false), // descending, nulls_first - )]).unwrap(), + )]) + .unwrap(), Arc::clone(&scan), ) .with_fetch(Some(1)), @@ -410,7 +409,6 @@ async fn test_topk_dynamic_filter_pushdown() { // Actually apply the optimization to the plan let mut config = ConfigOptions::default(); config.execution.parquet.pushdown_filters = true; - let plan = FilterPushdown {}.optimize(plan, &config).unwrap(); let config = SessionConfig::new().with_batch_size(2); let session_ctx = SessionContext::new_with_config(config); session_ctx.register_object_store( diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 59fdaabe78ec..570d64449040 100644 --- 
a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -53,8 +53,8 @@ use datafusion_execution::disk_manager::RefCountedTempFile; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; -use datafusion_physical_expr::LexOrdering; use datafusion_physical_expr::expressions::{lit, DynamicFilterPhysicalExpr}; +use datafusion_physical_expr::LexOrdering; use datafusion_physical_expr::PhysicalExpr; use futures::{StreamExt, TryStreamExt}; @@ -1027,7 +1027,7 @@ impl DisplayAs for SortExec { if let Some(filter) = &self.filter { if let Ok(current) = filter.current() { if !current.eq(&lit(true)) { - write!(f, ", filter=[{}]", current)?; + write!(f, ", filter=[{current}]")?; } } } diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 307e8f1eab84..a610a47fb90a 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -32,16 +32,18 @@ use crate::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}; use arrow::array::{ArrayRef, RecordBatch}; use arrow::datatypes::SchemaRef; -use datafusion_common::{internal_datafusion_err, internal_err, HashMap, Result, ScalarValue}; +use datafusion_common::{ + internal_datafusion_err, internal_err, HashMap, Result, ScalarValue, +}; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, runtime_env::RuntimeEnv, }; -use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use datafusion_physical_expr::{ expressions::{is_not_null, is_null, lit, BinaryExpr, DynamicFilterPhysicalExpr}, PhysicalExpr, }; +use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; /// Global TopK /// From 2f5ce8593efa49ba630cb1fa6d6ce33aad372dbb Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Jun 2025 06:41:15 -0500 Subject: [PATCH 04/25] pass around the filter in EnforceSorting optimizer rule --- .../tests/fuzz_cases/topk_filter_pushdown.rs | 3 +- .../physical_optimizer/filter_pushdown/mod.rs | 68 +++++++++++++++++-- .../src/expressions/dynamic_filters.rs | 3 +- .../src/enforce_distribution.rs | 1 + .../src/enforce_sorting/mod.rs | 23 ++++--- .../src/enforce_sorting/sort_pushdown.rs | 51 +++++++++++++- .../physical-optimizer/src/optimizer.rs | 6 -- .../src/topk_aggregation.rs | 7 +- datafusion/physical-optimizer/src/utils.rs | 8 ++- datafusion/physical-plan/src/sorts/sort.rs | 25 +++++-- datafusion/physical-plan/src/topk/mod.rs | 9 +-- .../test_files/parquet_filter_pushdown.slt | 35 ---------- 12 files changed, 169 insertions(+), 70 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs index 572222b7b9da..7296502264e6 100644 --- a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -257,8 +257,7 @@ async fn test_fuzz_topk_filter_pushdown() { for order_direction in &order_directions { for null_order in &null_orders { // if there is a vec for this column insert the order, otherwise create a new vec - let ordering = - format!("{order_column} {order_direction} {null_order}"); + let ordering = format!("{order_column} {order_direction} {null_order}"); match orders.get_mut(*order_column) { Some(order_vec) => { order_vec.push(ordering); diff --git 
a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index 30db366cd549..a8357c0466c9 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -20,6 +20,7 @@ use std::sync::{Arc, LazyLock}; use arrow::{ array::record_batch, datatypes::{DataType, Field, Schema, SchemaRef}, + util::pretty::pretty_format_batches, }; use arrow_schema::SortOptions; use datafusion::{ @@ -28,7 +29,7 @@ use datafusion::{ expressions::{BinaryExpr, Column, Literal}, PhysicalExpr, }, - prelude::{SessionConfig, SessionContext}, + prelude::{ParquetReadOptions, SessionConfig, SessionContext}, scalar::ScalarValue, }; use datafusion_common::config::ConfigOptions; @@ -36,7 +37,9 @@ use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_functions_aggregate::count::count_udaf; use datafusion_physical_expr::{aggregate::AggregateExprBuilder, Partitioning}; use datafusion_physical_expr::{expressions::col, LexOrdering, PhysicalSortExpr}; -use datafusion_physical_optimizer::filter_pushdown::FilterPushdown; +use datafusion_physical_optimizer::{ + filter_pushdown::FilterPushdown, PhysicalOptimizerRule, +}; use datafusion_physical_plan::{ aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}, coalesce_batches::CoalesceBatchesExec, @@ -47,7 +50,7 @@ use datafusion_physical_plan::{ }; use futures::StreamExt; -use object_store::memory::InMemory; +use object_store::{memory::InMemory, ObjectStore}; use util::{format_plan_for_test, OptimizationTest, TestNode, TestScanBuilder}; mod util; @@ -406,9 +409,10 @@ async fn test_topk_dynamic_filter_pushdown() { " ); - // Actually apply the optimization to the plan + // Actually apply the optimization to the plan and put some data through it to check that the filter is updated to reflect the TopK state let mut config = ConfigOptions::default(); config.execution.parquet.pushdown_filters = true; + let plan = FilterPushdown::new().optimize(plan, &config).unwrap(); let config = SessionConfig::new().with_batch_size(2); let session_ctx = SessionContext::new_with_config(config); session_ctx.register_object_store( @@ -430,6 +434,62 @@ async fn test_topk_dynamic_filter_pushdown() { ); } +/// Integration test for dynamic filter pushdown with TopK. +/// We use an integration test because there are complex interactions in the optimizer rules +/// that the unit tests applying a single optimizer rule do not cover. 
+#[tokio::test] +async fn test_topk_dynamic_filter_pushdown_integration() { + let store = Arc::new(InMemory::new()) as Arc; + let mut cfg = SessionConfig::new(); + cfg.options_mut().execution.parquet.pushdown_filters = true; + cfg.options_mut().execution.parquet.max_row_group_size = 128; + let ctx = SessionContext::new_with_config(cfg); + ctx.register_object_store( + ObjectStoreUrl::parse("memory://").unwrap().as_ref(), + Arc::clone(&store), + ); + ctx.sql( + r" +COPY ( + SELECT 1372708800 + value AS t + FROM generate_series(0, 99999) + ORDER BY t + ) TO 'memory:///1.parquet' +STORED AS PARQUET; + ", + ) + .await + .unwrap() + .collect() + .await + .unwrap(); + + // Register the file with the context + ctx.register_parquet( + "topk_pushdown", + "memory:///1.parquet", + ParquetReadOptions::default(), + ) + .await + .unwrap(); + + // Create a TopK query that will use dynamic filter pushdown + let df = ctx + .sql(r"EXPLAIN ANALYZE SELECT t FROM topk_pushdown ORDER BY t LIMIT 10;") + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let explain = format!("{}", pretty_format_batches(&batches).unwrap()); + + assert!(explain.contains("output_rows=128")); // Read 1 row group + assert!(explain.contains("t@0 < 1372708809")); // Dynamic filter was applied + assert!( + explain.contains("pushdown_rows_matched=128, pushdown_rows_pruned=99872"), + "{explain}" + ); + // Pushdown pruned most rows +} + /// Schema: /// a: String /// b: String diff --git a/datafusion/physical-expr/src/expressions/dynamic_filters.rs b/datafusion/physical-expr/src/expressions/dynamic_filters.rs index 756fb638af2b..09a8da61cdfa 100644 --- a/datafusion/physical-expr/src/expressions/dynamic_filters.rs +++ b/datafusion/physical-expr/src/expressions/dynamic_filters.rs @@ -288,8 +288,9 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { } fn snapshot(&self) -> Result>> { + let inner = self.current()?; // Return the current expression as a snapshot. 
- Ok(Some(self.current()?)) + Ok(Some(inner)) } } diff --git a/datafusion/physical-optimizer/src/enforce_distribution.rs b/datafusion/physical-optimizer/src/enforce_distribution.rs index 478ce39eecb9..bf08e4a9c243 100644 --- a/datafusion/physical-optimizer/src/enforce_distribution.rs +++ b/datafusion/physical-optimizer/src/enforce_distribution.rs @@ -1308,6 +1308,7 @@ pub fn ensure_distribution( .downcast_ref::() .map(|output| output.fetch()) .unwrap_or(None), + None, )?; } } diff --git a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs index 8a71b28486a2..78360ab30fde 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs @@ -57,6 +57,7 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::plan_err; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::Result; +use datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr; use datafusion_physical_expr::{Distribution, Partitioning}; use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; @@ -403,7 +404,7 @@ pub fn parallelize_sorts( && requirements.plan.output_partitioning().partition_count() <= 1 { // Take the initial sort expressions and requirements - let (sort_exprs, fetch) = get_sort_exprs(&requirements.plan)?; + let (sort_exprs, fetch, filter) = get_sort_exprs(&requirements.plan)?; let sort_reqs = LexRequirement::from(sort_exprs.clone()); let sort_exprs = sort_exprs.clone(); @@ -417,7 +418,7 @@ pub fn parallelize_sorts( // deals with the children and their children and so on. requirements = requirements.children.swap_remove(0); - requirements = add_sort_above_with_check(requirements, sort_reqs, fetch)?; + requirements = add_sort_above_with_check(requirements, sort_reqs, fetch, filter)?; let spm = SortPreservingMergeExec::new(sort_exprs, Arc::clone(&requirements.plan)); @@ -513,6 +514,7 @@ pub fn ensure_sorting( .downcast_ref::() .map(|output| output.fetch()) .unwrap_or(None), + None, ); child = update_sort_ctx_children_data(child, true)?; } @@ -644,7 +646,7 @@ fn adjust_window_sort_removal( // Satisfy the ordering requirement so that the window can run: let mut child_node = window_tree.children.swap_remove(0); if let Some(reqs) = reqs { - child_node = add_sort_above(child_node, reqs.into_single(), None); + child_node = add_sort_above(child_node, reqs.into_single(), None, None); } let child_plan = Arc::clone(&child_node.plan); window_tree.children.push(child_node); @@ -803,15 +805,20 @@ fn remove_corresponding_sort_from_sub_plan( Ok(node) } +/// Return type for get_sort_exprs function +type SortExprsResult<'a> = ( + &'a LexOrdering, + Option, + Option>, +); + /// Converts an [ExecutionPlan] trait object to a [LexOrdering] reference when possible. 
-fn get_sort_exprs( - sort_any: &Arc, -) -> Result<(&LexOrdering, Option)> { +fn get_sort_exprs(sort_any: &Arc) -> Result> { if let Some(sort_exec) = sort_any.as_any().downcast_ref::() { - Ok((sort_exec.expr(), sort_exec.fetch())) + Ok((sort_exec.expr(), sort_exec.fetch(), sort_exec.filter())) } else if let Some(spm) = sort_any.as_any().downcast_ref::() { - Ok((spm.expr(), spm.fetch())) + Ok((spm.expr(), spm.fetch(), None)) } else { plan_err!("Given ExecutionPlan is not a SortExec or a SortPreservingMergeExec") } diff --git a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs index bd7b0060c3be..1fe1ed3e21b1 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs @@ -26,7 +26,7 @@ use arrow::datatypes::SchemaRef; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{internal_err, HashSet, JoinSide, Result}; use datafusion_expr::JoinType; -use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::expressions::{Column, DynamicFilterPhysicalExpr}; use datafusion_physical_expr::utils::collect_columns; use datafusion_physical_expr::{ add_offset_to_physical_sort_exprs, EquivalenceProperties, @@ -57,6 +57,7 @@ use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; pub struct ParentRequirements { ordering_requirement: Option, fetch: Option, + filter: Option>, } pub type SortPushDown = PlanContext; @@ -70,6 +71,8 @@ pub fn assign_initial_requirements(sort_push_down: &mut SortPushDown) { // If the parent has a fetch value, assign it to the children // Or use the fetch value of the child. fetch: child.plan.fetch(), + // If the parent has a filter, assign it to the children + filter: sort_push_down.data.filter.clone(), }; } } @@ -95,6 +98,7 @@ fn pushdown_sorts_helper( ) -> Result> { let plan = sort_push_down.plan; let parent_fetch = sort_push_down.data.fetch; + let parent_filter = sort_push_down.data.filter.clone(); let Some(parent_requirement) = sort_push_down.data.ordering_requirement.clone() else { @@ -114,6 +118,18 @@ fn pushdown_sorts_helper( sort_push_down.data.fetch = fetch; sort_push_down.data.ordering_requirement = Some(OrderingRequirements::from(sort_ordering)); + let filter = plan + .as_any() + .downcast_ref::() + .and_then(|s| s.filter().clone()); + match filter { + Some(filter) => { + sort_push_down.data.filter = Some(filter); + } + None => { + sort_push_down.data.filter = parent_filter.clone(); + } + } // Recursive call to helper, so it doesn't transform_down and miss // the new node (previous child of sort): return pushdown_sorts_helper(sort_push_down); @@ -131,11 +147,20 @@ fn pushdown_sorts_helper( return internal_err!("SortExec should have output ordering"); }; + let filter = plan + .as_any() + .downcast_ref::() + .and_then(|s| s.filter().clone()); + let sort_fetch = plan.fetch(); let parent_is_stricter = eqp.requirements_compatible( parent_requirement.first().clone(), sort_ordering.clone().into(), ); + let sort_filter = plan + .as_any() + .downcast_ref::() + .and_then(|s| s.filter().clone()); // Remove the current sort as we are either going to prove that it is // unnecessary, or replace it with a stricter sort. 
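Each of the `match` blocks added in this file encodes the same propagation rule: a sort node's own dynamic filter takes precedence, and only when the current node has none does the parent's pushed-down filter flow through to the children. A minimal sketch of that rule, reusing the names from the surrounding diff (illustrative only, not code added by the patch):

```rust
use std::sync::Arc;
use datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr;

/// Illustrative helper (not part of the patch): prefer the filter owned by the
/// current SortExec, otherwise keep propagating the one inherited from the parent.
fn propagate_filter(
    own: Option<Arc<DynamicFilterPhysicalExpr>>,
    parent: Option<Arc<DynamicFilterPhysicalExpr>>,
) -> Option<Arc<DynamicFilterPhysicalExpr>> {
    own.or(parent)
}
```

Written this way the rule reads as a single `Option::or`; the expanded `match` form in the diff does the same thing at each call site while keeping the cloning of the parent filter explicit.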
@@ -152,17 +177,27 @@ fn pushdown_sorts_helper( sort_push_down, parent_requirement.into_single(), parent_fetch, + filter.clone(), ); // Update pushdown requirements: sort_push_down.children[0].data = ParentRequirements { ordering_requirement: Some(OrderingRequirements::from(sort_ordering)), fetch: sort_fetch, + filter, }; return Ok(Transformed::yes(sort_push_down)); } else { // Sort was unnecessary, just propagate the stricter fetch and // ordering requirements: sort_push_down.data.fetch = min_fetch(sort_fetch, parent_fetch); + match sort_filter { + Some(filter) => { + sort_push_down.data.filter = Some(filter); + } + None => { + sort_push_down.data.filter = parent_filter.clone(); + } + } let current_is_stricter = eqp.requirements_compatible( sort_ordering.clone().into(), parent_requirement.first().clone(), @@ -194,9 +229,22 @@ fn pushdown_sorts_helper( // For operators that can take a sort pushdown, continue with updated // requirements: let current_fetch = sort_push_down.plan.fetch(); + let current_filter = sort_push_down + .plan + .as_any() + .downcast_ref::() + .and_then(|s| s.filter().clone()); for (child, order) in sort_push_down.children.iter_mut().zip(adjusted) { child.data.ordering_requirement = order; child.data.fetch = min_fetch(current_fetch, parent_fetch); + match current_filter { + Some(ref filter) => { + child.data.filter = Some(Arc::clone(filter)); + } + None => { + child.data.filter = parent_filter.clone(); + } + } } sort_push_down.data.ordering_requirement = None; } else { @@ -205,6 +253,7 @@ fn pushdown_sorts_helper( sort_push_down, parent_requirement.into_single(), parent_fetch, + parent_filter, ); assign_initial_requirements(&mut sort_push_down); } diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index 49245551645e..d5129cea9d4e 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -99,8 +99,6 @@ impl PhysicalOptimizer { // The FilterPushdown rule tries to push down filters as far as it can. // For example, it will push down filtering from a `FilterExec` to // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`. - // This must run before EnforceSorting to ensure dynamic filters from TopK operators - // are established before sorts are potentially recreated. Arc::new(FilterPushdown::new()), // The EnforceDistribution rule is for adding essential repartitioning to satisfy distribution // requirements. Please make sure that the whole plan tree is determined before this rule. @@ -141,10 +139,6 @@ impl PhysicalOptimizer { // reduced by narrowing their input tables. Arc::new(ProjectionPushdown::new()), Arc::new(InsertYieldExec::new()), - // The FilterPushdown rule tries to push down filters as far as it can. - // For example, it will push down filtering from a `FilterExec` to - // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`. - Arc::new(FilterPushdown::new()), // The SanityCheckPlan rule checks whether the order and // distribution requirements of each node in the plan // is satisfied. 
It will also reject non-runnable query diff --git a/datafusion/physical-optimizer/src/topk_aggregation.rs b/datafusion/physical-optimizer/src/topk_aggregation.rs index bff0b1e49684..33166284774b 100644 --- a/datafusion/physical-optimizer/src/topk_aggregation.rs +++ b/datafusion/physical-optimizer/src/topk_aggregation.rs @@ -130,10 +130,13 @@ impl TopKAggregation { Ok(Transformed::no(plan)) }; let child = Arc::clone(child).transform_down(closure).data().ok()?; - let sort = SortExec::new(sort.expr().clone(), child) + let mut new_sort = SortExec::new(sort.expr().clone(), child) .with_fetch(sort.fetch()) .with_preserve_partitioning(sort.preserve_partitioning()); - Some(Arc::new(sort)) + if let Some(filter) = sort.filter() { + new_sort = new_sort.with_filter(Arc::clone(&filter)); + } + Some(Arc::new(new_sort)) } } diff --git a/datafusion/physical-optimizer/src/utils.rs b/datafusion/physical-optimizer/src/utils.rs index 3655e555a744..431925033678 100644 --- a/datafusion/physical-optimizer/src/utils.rs +++ b/datafusion/physical-optimizer/src/utils.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use datafusion_common::Result; +use datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr; use datafusion_physical_expr::{LexOrdering, LexRequirement}; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; @@ -39,6 +40,7 @@ pub fn add_sort_above( node: PlanContext, sort_requirements: LexRequirement, fetch: Option, + filter: Option>, ) -> PlanContext { let mut sort_reqs: Vec<_> = sort_requirements.into(); sort_reqs.retain(|sort_expr| { @@ -55,6 +57,9 @@ pub fn add_sort_above( if node.plan.output_partitioning().partition_count() > 1 { new_sort = new_sort.with_preserve_partitioning(true); } + if let Some(filter) = filter { + new_sort = new_sort.with_filter(filter); + } PlanContext::new(Arc::new(new_sort), T::default(), vec![node]) } @@ -65,13 +70,14 @@ pub fn add_sort_above_with_check( node: PlanContext, sort_requirements: LexRequirement, fetch: Option, + filter: Option>, ) -> Result> { if !node .plan .equivalence_properties() .ordering_satisfy_requirement(sort_requirements.clone())? { - Ok(add_sort_above(node, sort_requirements, fetch)) + Ok(add_sort_above(node, sort_requirements, fetch, filter)) } else { Ok(node) } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 570d64449040..1ffaf7f1641b 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -913,12 +913,15 @@ impl SortExec { cache = cache.with_boundedness(Boundedness::Bounded); } let filter = fetch.is_some().then(|| { - let children = self - .expr - .iter() - .map(|sort_expr| Arc::clone(&sort_expr.expr)) - .collect::>(); - Arc::new(DynamicFilterPhysicalExpr::new(children, lit(true))) + // If we already have a filter, keep it. Otherwise, create a new one. + self.filter.clone().unwrap_or_else(|| { + let children = self + .expr + .iter() + .map(|sort_expr| Arc::clone(&sort_expr.expr)) + .collect::>(); + Arc::new(DynamicFilterPhysicalExpr::new(children, lit(true))) + }) }); SortExec { input: Arc::clone(&self.input), @@ -932,6 +935,11 @@ impl SortExec { } } + pub fn with_filter(mut self, filter: Arc) -> Self { + self.filter = Some(filter); + self + } + /// Input schema pub fn input(&self) -> &Arc { &self.input @@ -947,6 +955,11 @@ impl SortExec { self.fetch } + /// If `Some(filter)`, returns the filter expression that matches the state of the sort. 
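Together with the `with_fetch` change above, this accessor gives optimizer rules a way to carry a TopK's dynamic filter across plan rewrites. A rough usage sketch under those assumptions (`sort_exprs` and `input` are placeholders, not names from the patch):

```rust
// Sketch only: a fetch creates a dynamic filter (initially `lit(true)`), and a rule
// that rebuilds the sort should re-attach the same filter so any DataSourceExec
// already referencing it keeps seeing the TopK's updates.
let topk = SortExec::new(sort_exprs.clone(), Arc::clone(&input)).with_fetch(Some(10));
if let Some(filter) = topk.filter() {
    let _rebuilt = SortExec::new(sort_exprs, input)
        .with_fetch(Some(10))
        .with_filter(filter);
}
```

This mirrors what the `TopKAggregation` and `add_sort_above` changes in this patch do when they recreate a `SortExec`.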
+ pub fn filter(&self) -> Option> { + self.filter.clone() + } + fn output_partitioning_helper( input: &Arc, preserve_partitioning: bool, diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index a610a47fb90a..929f2873fe93 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -243,11 +243,12 @@ impl TopK { // update memory reservation self.reservation.try_resize(self.size())?; + // flag the topK as finished if we know that all + // subsequent batches are guaranteed to be greater (by byte order, after row conversion) than the top K, + // which means the top K won't change and the computation can be finished early. + self.attempt_early_completion(&batch)?; + if updated { - // flag the topK as finished if we know that all - // subsequent batches are guaranteed to be greater (by byte order, after row conversion) than the top K, - // which means the top K won't change and the computation can be finished early. - self.attempt_early_completion(&batch)?; // update the filter representation of our TopK heap self.update_filter()?; } diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index e5b5f5ac878a..1b6ae13fbe77 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -246,38 +246,3 @@ physical_plan 02)--FilterExec: val@0 != part@1 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != d AND val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != d OR d != val_max@1) AND val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c, d)] - -# The order of filters should not matter -query TT -EXPLAIN select val, part from t_pushdown where part = 'a' AND part = val; ----- -logical_plan -01)Filter: t_pushdown.val = t_pushdown.part -02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val = t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 = part@1 -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet - -query TT -select val, part from t_pushdown where part = 'a' AND part = val; ----- -a a - -query TT -EXPLAIN select val, part from t_pushdown where part = val AND part = 'a'; ----- -logical_plan -01)Filter: t_pushdown.val = t_pushdown.part -02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val = t_pushdown.part] -physical_plan -01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: val@0 = part@1 -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], 
file_type=parquet - -query TT -select val, part from t_pushdown where part = val AND part = 'a'; ----- -a a \ No newline at end of file From 66f64072e3f896fa58c9050c56a14867331c7353 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Jun 2025 12:56:42 -0500 Subject: [PATCH 05/25] check that filter pushdown happened in fuzz tests --- .../tests/fuzz_cases/topk_filter_pushdown.rs | 37 +++++++++++++++++-- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs index 7296502264e6..2568e10dcd89 100644 --- a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -167,11 +167,17 @@ async fn test_files() -> Vec { (*files).clone() } + +struct RunResult { + results: Vec, + explain_plan: String, +} + async fn run_query_with_config( query: &str, config: SessionConfig, dataset: TestDataSet, -) -> Vec { +) -> RunResult { let store = dataset.store; let schema = dataset.schema; let ctx = SessionContext::new_with_config(config); @@ -191,7 +197,16 @@ async fn run_query_with_config( ctx.register_table("test_table", table).unwrap(); - ctx.sql(query).await.unwrap().collect().await.unwrap() + let results = ctx.sql(query).await.unwrap().collect().await.unwrap(); + let explain_batches = ctx.sql(&format!("EXPLAIN ANALYZE {query}")).await.unwrap(). + collect().await.unwrap(); + let explain_plan = pretty_format_batches(&explain_batches) + .unwrap() + .to_string(); + RunResult { + results, + explain_plan, + } } #[derive(Debug)] @@ -215,6 +230,16 @@ impl RunQueryResult { } } +/// Iterate over each line in the plan and check that one of them has `DataSourceExec` and `DynamicFilterPhysicalExpr` in the same line. 
+fn has_dynamic_filter_expr_pushdown(plan: &str) -> bool { + for line in plan.lines() { + if line.contains("DataSourceExec") && line.contains("DynamicFilterPhysicalExpr") { + return true; + } + } + false +} + async fn run_query( query: String, cfg: SessionConfig, @@ -231,11 +256,15 @@ async fn run_query( run_query_with_config(&query, cfg_without_dynamic_filters, dataset.clone()).await; let result = run_query_with_config(&query, cfg_with_dynamic_filters, dataset.clone()).await; + // Check that dynamic filters were actually pushed down + if !has_dynamic_filter_expr_pushdown(&result.explain_plan) { + panic!("Dynamic filter was not pushed down in query: {query}\n\n{}", result.explain_plan); + } RunQueryResult { query: query.to_string(), - result, - expected: expected_result, + result: result.results, + expected: expected_result.results, } } From 5cb49d2754d8f5a6d106c1362069e1d290853e92 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Jun 2025 13:10:41 -0500 Subject: [PATCH 06/25] fmt --- .../tests/fuzz_cases/topk_filter_pushdown.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs index 2568e10dcd89..a5934882cbcc 100644 --- a/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs +++ b/datafusion/core/tests/fuzz_cases/topk_filter_pushdown.rs @@ -167,7 +167,6 @@ async fn test_files() -> Vec { (*files).clone() } - struct RunResult { results: Vec, explain_plan: String, @@ -198,11 +197,14 @@ async fn run_query_with_config( ctx.register_table("test_table", table).unwrap(); let results = ctx.sql(query).await.unwrap().collect().await.unwrap(); - let explain_batches = ctx.sql(&format!("EXPLAIN ANALYZE {query}")).await.unwrap(). 
- collect().await.unwrap(); - let explain_plan = pretty_format_batches(&explain_batches) + let explain_batches = ctx + .sql(&format!("EXPLAIN ANALYZE {query}")) + .await .unwrap() - .to_string(); + .collect() + .await + .unwrap(); + let explain_plan = pretty_format_batches(&explain_batches).unwrap().to_string(); RunResult { results, explain_plan, @@ -258,7 +260,10 @@ async fn run_query( run_query_with_config(&query, cfg_with_dynamic_filters, dataset.clone()).await; // Check that dynamic filters were actually pushed down if !has_dynamic_filter_expr_pushdown(&result.explain_plan) { - panic!("Dynamic filter was not pushed down in query: {query}\n\n{}", result.explain_plan); + panic!( + "Dynamic filter was not pushed down in query: {query}\n\n{}", + result.explain_plan + ); } RunQueryResult { From 9eea14c2ebbcc13ccfebdaa22ca0421d86418e13 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:15:42 -0500 Subject: [PATCH 07/25] revert accidental change --- .../test_files/parquet_filter_pushdown.slt | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index 1b6ae13fbe77..e5b5f5ac878a 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -246,3 +246,38 @@ physical_plan 02)--FilterExec: val@0 != part@1 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != d AND val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != d OR d != val_max@1) AND val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c, d)] + +# The order of filters should not matter +query TT +EXPLAIN select val, part from t_pushdown where part = 'a' AND part = val; +---- +logical_plan +01)Filter: t_pushdown.val = t_pushdown.part +02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val = t_pushdown.part] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: val@0 = part@1 +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet + +query TT +select val, part from t_pushdown where part = 'a' AND part = val; +---- +a a + +query TT +EXPLAIN select val, part from t_pushdown where part = val AND part = 'a'; +---- +logical_plan +01)Filter: t_pushdown.val = t_pushdown.part +02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val = t_pushdown.part] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: val@0 = part@1 +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], 
file_type=parquet + +query TT +select val, part from t_pushdown where part = val AND part = 'a'; +---- +a a \ No newline at end of file From 907d0a30287c6d799d3c2141d9669a4b34590540 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:17:03 -0500 Subject: [PATCH 08/25] remove indentation --- datafusion/physical-plan/src/topk/mod.rs | 147 ++++++++++++----------- 1 file changed, 74 insertions(+), 73 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 929f2873fe93..a5fff1b84e00 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -261,89 +261,90 @@ impl TopK { let Some(filter) = &self.filter else { return Ok(()); }; - if let Some(thresholds) = self.heap.get_threshold_values(&self.expr)? { - // Create filter expressions for each threshold - let mut filters: Vec> = - Vec::with_capacity(thresholds.len()); - - let mut prev_sort_expr: Option> = None; - for (sort_expr, value) in self.expr.iter().zip(thresholds.iter()) { - // Create the appropriate operator based on sort order - let op = if sort_expr.options.descending { - // For descending sort, we want col > threshold (exclude smaller values) - Operator::Gt - } else { - // For ascending sort, we want col < threshold (exclude larger values) - Operator::Lt - }; + let Some(thresholds) = self.heap.get_threshold_values(&self.expr)? else { + return Ok(()); + }; - let value_null = value.is_null(); + // Create filter expressions for each threshold + let mut filters: Vec> = + Vec::with_capacity(thresholds.len()); - let comparison = Arc::new(BinaryExpr::new( - Arc::clone(&sort_expr.expr), - op, - lit(value.clone()), - )); + let mut prev_sort_expr: Option> = None; + for (sort_expr, value) in self.expr.iter().zip(thresholds.iter()) { + // Create the appropriate operator based on sort order + let op = if sort_expr.options.descending { + // For descending sort, we want col > threshold (exclude smaller values) + Operator::Gt + } else { + // For ascending sort, we want col < threshold (exclude larger values) + Operator::Lt + }; - let comparison_with_null = - match (sort_expr.options.nulls_first, value_null) { - // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison) - (true, true) => lit(false), - (true, false) => Arc::new(BinaryExpr::new( - is_null(Arc::clone(&sort_expr.expr))?, - Operator::Or, - comparison, - )), - // For nulls last, transform to (threshold.value is null and threshold.expr is not null) - // or (threshold.value is not null and comparison) - (false, true) => is_not_null(Arc::clone(&sort_expr.expr))?, - (false, false) => comparison, - }; - - let mut eq_expr = Arc::new(BinaryExpr::new( - Arc::clone(&sort_expr.expr), - Operator::Eq, - lit(value.clone()), + let value_null = value.is_null(); + + let comparison = Arc::new(BinaryExpr::new( + Arc::clone(&sort_expr.expr), + op, + lit(value.clone()), + )); + + let comparison_with_null = match (sort_expr.options.nulls_first, value_null) { + // For nulls first, transform to (threshold.value is not null) and (threshold.expr is null or comparison) + (true, true) => lit(false), + (true, false) => Arc::new(BinaryExpr::new( + is_null(Arc::clone(&sort_expr.expr))?, + Operator::Or, + comparison, + )), + // For nulls last, transform to (threshold.value is null and threshold.expr is not null) + // or (threshold.value is not null and comparison) + (false, true) => 
is_not_null(Arc::clone(&sort_expr.expr))?, + (false, false) => comparison, + }; + + let mut eq_expr = Arc::new(BinaryExpr::new( + Arc::clone(&sort_expr.expr), + Operator::Eq, + lit(value.clone()), + )); + + if value_null { + eq_expr = Arc::new(BinaryExpr::new( + is_null(Arc::clone(&sort_expr.expr))?, + Operator::Or, + eq_expr, )); + } - if value_null { - eq_expr = Arc::new(BinaryExpr::new( - is_null(Arc::clone(&sort_expr.expr))?, - Operator::Or, - eq_expr, - )); + // For a query like order by a, b, the filter for column `b` is only applied if + // the condition a = threshold.value (considering null equality) is met. + // Therefore, we add equality predicates for all preceding fields to the filter logic of the current field, + // and include the current field's equality predicate in `prev_sort_expr` for use with subsequent fields. + match prev_sort_expr.take() { + None => { + prev_sort_expr = Some(eq_expr); + filters.push(comparison_with_null); } - - // For a query like order by a, b, the filter for column `b` is only applied if - // the condition a = threshold.value (considering null equality) is met. - // Therefore, we add equality predicates for all preceding fields to the filter logic of the current field, - // and include the current field's equality predicate in `prev_sort_expr` for use with subsequent fields. - match prev_sort_expr.take() { - None => { - prev_sort_expr = Some(eq_expr); - filters.push(comparison_with_null); - } - Some(p) => { - filters.push(Arc::new(BinaryExpr::new( - Arc::clone(&p), - Operator::And, - comparison_with_null, - ))); - - prev_sort_expr = - Some(Arc::new(BinaryExpr::new(p, Operator::And, eq_expr))); - } + Some(p) => { + filters.push(Arc::new(BinaryExpr::new( + Arc::clone(&p), + Operator::And, + comparison_with_null, + ))); + + prev_sort_expr = + Some(Arc::new(BinaryExpr::new(p, Operator::And, eq_expr))); } } + } - let dynamic_predicate = filters - .into_iter() - .reduce(|a, b| Arc::new(BinaryExpr::new(a, Operator::Or, b))); + let dynamic_predicate = filters + .into_iter() + .reduce(|a, b| Arc::new(BinaryExpr::new(a, Operator::Or, b))); - if let Some(predicate) = dynamic_predicate { - if !predicate.eq(&lit(true)) { - filter.update(predicate)?; - } + if let Some(predicate) = dynamic_predicate { + if !predicate.eq(&lit(true)) { + filter.update(predicate)?; } } From a5f300cbb63e72b609060ac34e4a69eee6d5a069 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:18:19 -0500 Subject: [PATCH 09/25] remove outdated comment --- .../core/tests/physical_optimizer/filter_pushdown/mod.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index a8357c0466c9..f6282c3a1b57 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -361,9 +361,6 @@ fn test_node_handles_child_pushdown_result() { #[tokio::test] async fn test_topk_dynamic_filter_pushdown() { - // This test is a bit of a hack, but it shows that we can push down dynamic filters - // into the DataSourceExec. The test is not perfect because we don't have a real - // implementation of the dynamic filter yet, so we just use a static filter. 
let batches = vec![ record_batch!( ("a", Utf8, ["aa", "ab"]), From 7887f3f80b66f23c803071b4ae49c5f9e12bb39a Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:26:34 -0500 Subject: [PATCH 10/25] add example to docstring --- datafusion/physical-plan/src/topk/mod.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index a5fff1b84e00..2da2ea870761 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -256,7 +256,16 @@ impl TopK { Ok(()) } - /// Update the filter representation of our TopK heap + /// Update the filter representation of our TopK heap. + /// For example, given the sort expression `ORDER BY a DESC, b ASC LIMIT 3`, + /// and the current heap values `[(1, 5), (1, 4), (2, 3)]`, + /// the filter will be updated to: + /// + /// ``` + /// (a > 1 OR (a = 1 AND b < 5)) AND + /// (a > 1 OR (a = 1 AND b < 4)) AND + /// (a > 2 OR (a = 2 AND b < 3)) + /// ``` fn update_filter(&mut self) -> Result<()> { let Some(filter) = &self.filter else { return Ok(()); From 18a2ba8a779402dcece4a97ed043d0b08277d17d Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 13 Jun 2025 16:43:39 -0500 Subject: [PATCH 11/25] fix doctest --- datafusion/physical-plan/src/topk/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 2da2ea870761..2accd6726c26 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -261,7 +261,7 @@ impl TopK { /// and the current heap values `[(1, 5), (1, 4), (2, 3)]`, /// the filter will be updated to: /// - /// ``` + /// ```sql /// (a > 1 OR (a = 1 AND b < 5)) AND /// (a > 1 OR (a = 1 AND b < 4)) AND /// (a > 2 OR (a = 2 AND b < 3)) From a90781fde9c3a51bf179966ee4c3791150037c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 14 Jun 2025 11:36:30 +0200 Subject: [PATCH 12/25] Filter TopK inputs based on dynamic topk filter --- datafusion/physical-plan/src/topk/mod.rs | 102 ++++++++++++++++------- 1 file changed, 73 insertions(+), 29 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 2accd6726c26..8aadb334b5e8 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -18,8 +18,8 @@ //! 
TopK: Combination of Sort / LIMIT use arrow::{ - array::Array, - compute::interleave_record_batch, + array::{Array, AsArray}, + compute::{interleave_record_batch, FilterBuilder}, row::{RowConverter, Rows, SortField}, }; use datafusion_expr::{ColumnarValue, Operator}; @@ -203,7 +203,7 @@ impl TopK { let baseline = self.metrics.baseline.clone(); let _timer = baseline.elapsed_compute().timer(); - let sort_keys: Vec = self + let mut sort_keys: Vec = self .expr .iter() .map(|expr| { @@ -212,43 +212,65 @@ impl TopK { }) .collect::>>()?; + let mut selected_rows = None; + + match self.filter.as_ref() { + // If a filter is provided, update it with the new rows + Some(filter) => { + let filter = filter.current()?; + let filtered = filter.evaluate(&batch)?; + let num_rows = batch.num_rows(); + let array = filtered.into_array(num_rows)?; + let filter = array.as_boolean().clone(); + if filter.true_count() == 0 { + // nothing to filter, so no need to update + return Ok(()); + } + let filter_predicate = FilterBuilder::new(&filter); + let filter_predicate = if sort_keys.len() > 1 { + // Optimize filter when it has multiple sort keys + filter_predicate.optimize().build() + } else { + filter_predicate.build() + }; + selected_rows = Some(filter); + sort_keys = sort_keys + .iter() + .map(|key| filter_predicate.filter(key).map_err(|x| x.into())) + .collect::>>()?; + } + None => {} + } // reuse existing `Rows` to avoid reallocations let rows = &mut self.scratch_rows; rows.clear(); self.row_converter.append(rows, &sort_keys)?; - // TODO make this algorithmically better?: - // Idea: filter out rows >= self.heap.max() early (before passing to `RowConverter`) - // this avoids some work and also might be better vectorizable. let mut batch_entry = self.heap.register_batch(batch.clone()); - let mut updated = false; - for (index, row) in rows.iter().enumerate() { - match self.heap.max() { - // heap has k items, and the new row is greater than the - // current max in the heap ==> it is not a new topk - Some(max_row) if row.as_ref() >= max_row.row() => {} - // don't yet have k items or new item is lower than the currently k low values - None | Some(_) => { - self.heap.add(&mut batch_entry, row, index); - self.metrics.row_replacements.add(1); - updated = true; - } + + let replacements = match selected_rows { + Some(filter) => { + self.find_new_topk_items(filter.values().set_indices(), &mut batch_entry) } - } - self.heap.insert_batch_entry(batch_entry); + None => self.find_new_topk_items(0..sort_keys[0].len(), &mut batch_entry), + }; + + if replacements > 0 { + self.metrics.row_replacements.add(replacements); - // conserve memory - self.heap.maybe_compact()?; + self.heap.insert_batch_entry(batch_entry); - // update memory reservation - self.reservation.try_resize(self.size())?; + // conserve memory + self.heap.maybe_compact()?; - // flag the topK as finished if we know that all - // subsequent batches are guaranteed to be greater (by byte order, after row conversion) than the top K, - // which means the top K won't change and the computation can be finished early. - self.attempt_early_completion(&batch)?; + // update memory reservation + self.reservation.try_resize(self.size())?; + + // flag the topK as finished if we know that all + // subsequent batches are guaranteed to be greater (by byte order, after row conversion) than the top K, + // which means the top K won't change and the computation can be finished early. 
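The new fast path evaluates the current dynamic filter against each incoming batch and, when it is selective, filters the sort-key columns before row conversion. Below is a standalone sketch of just the arrow kernels this relies on (`FilterBuilder` / `FilterPredicate`), with a hard-coded mask and made-up columns standing in for the evaluated filter and the batch:

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, BooleanArray, Int32Array, StringArray};
use arrow::compute::FilterBuilder;
use arrow::error::ArrowError;

fn main() -> Result<(), ArrowError> {
    // In the operator this mask comes from evaluating the dynamic filter on the batch.
    let mask = BooleanArray::from(vec![true, false, true, false]);
    if mask.true_count() == 0 {
        // Nothing in this batch can beat the current top K: skip it entirely.
        return Ok(());
    }

    // Stand-ins for the evaluated sort-key columns of the batch.
    let sort_keys: Vec<ArrayRef> = vec![
        Arc::new(Int32Array::from(vec![3, 7, 1, 9])),
        Arc::new(StringArray::from(vec!["c", "g", "a", "i"])),
    ];

    // `optimize()` pays off when the same predicate is applied to several arrays,
    // which is the multiple-sort-key case handled above.
    let predicate = FilterBuilder::new(&mask).optimize().build();
    let filtered: Vec<ArrayRef> = sort_keys
        .iter()
        .map(|key| predicate.filter(key))
        .collect::<Result<_, _>>()?;

    assert_eq!(filtered[0].len(), 2); // only the rows selected by the mask remain
    Ok(())
}
```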
+ self.attempt_early_completion(&batch)?; - if updated { // update the filter representation of our TopK heap self.update_filter()?; } @@ -256,6 +278,28 @@ impl TopK { Ok(()) } + fn find_new_topk_items( + &mut self, + items: impl Iterator, + batch_entry: &mut RecordBatchEntry, + ) -> usize { + let mut replacements = 0; + let rows = &mut self.scratch_rows; + for (index, row) in items.zip(rows.iter()) { + match self.heap.max() { + // heap has k items, and the new row is greater than the + // current max in the heap ==> it is not a new topk + Some(max_row) if row.as_ref() >= max_row.row() => {} + // don't yet have k items or new item is lower than the currently k low values + None | Some(_) => { + self.heap.add(batch_entry, row, index); + replacements += 1; + } + } + } + replacements + } + /// Update the filter representation of our TopK heap. /// For example, given the sort expression `ORDER BY a DESC, b ASC LIMIT 3`, /// and the current heap values `[(1, 5), (1, 4), (2, 3)]`, From 53ab9ab50b831263cc1314871d5ba099c358d6e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 14 Jun 2025 11:54:34 +0200 Subject: [PATCH 13/25] Clippy --- datafusion/physical-plan/src/topk/mod.rs | 47 +++++++++++------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 8aadb334b5e8..d5c1ee950142 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -214,33 +214,30 @@ impl TopK { let mut selected_rows = None; - match self.filter.as_ref() { + if let Some(filter) = self.filter.as_ref() { // If a filter is provided, update it with the new rows - Some(filter) => { - let filter = filter.current()?; - let filtered = filter.evaluate(&batch)?; - let num_rows = batch.num_rows(); - let array = filtered.into_array(num_rows)?; - let filter = array.as_boolean().clone(); - if filter.true_count() == 0 { - // nothing to filter, so no need to update - return Ok(()); - } - let filter_predicate = FilterBuilder::new(&filter); - let filter_predicate = if sort_keys.len() > 1 { - // Optimize filter when it has multiple sort keys - filter_predicate.optimize().build() - } else { - filter_predicate.build() - }; - selected_rows = Some(filter); - sort_keys = sort_keys - .iter() - .map(|key| filter_predicate.filter(key).map_err(|x| x.into())) - .collect::>>()?; + let filter = filter.current()?; + let filtered = filter.evaluate(&batch)?; + let num_rows = batch.num_rows(); + let array = filtered.into_array(num_rows)?; + let filter = array.as_boolean().clone(); + if filter.true_count() == 0 { + // nothing to filter, so no need to update + return Ok(()); } - None => {} - } + let filter_predicate = FilterBuilder::new(&filter); + let filter_predicate = if sort_keys.len() > 1 { + // Optimize filter when it has multiple sort keys + filter_predicate.optimize().build() + } else { + filter_predicate.build() + }; + selected_rows = Some(filter); + sort_keys = sort_keys + .iter() + .map(|key| filter_predicate.filter(key).map_err(|x| x.into())) + .collect::>>()?; + }; // reuse existing `Rows` to avoid reallocations let rows = &mut self.scratch_rows; rows.clear(); From 746cc2faaaa70227e39f82e468d876df7670975a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 14 Jun 2025 12:48:17 +0200 Subject: [PATCH 14/25] Fast path --- datafusion/physical-plan/src/topk/mod.rs | 30 ++++++++++++++---------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git 
a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index d5c1ee950142..0d97c11b2b7c 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -221,22 +221,26 @@ impl TopK { let num_rows = batch.num_rows(); let array = filtered.into_array(num_rows)?; let filter = array.as_boolean().clone(); - if filter.true_count() == 0 { + let true_count = filter.true_count(); + if true_count == 0 { // nothing to filter, so no need to update return Ok(()); } - let filter_predicate = FilterBuilder::new(&filter); - let filter_predicate = if sort_keys.len() > 1 { - // Optimize filter when it has multiple sort keys - filter_predicate.optimize().build() - } else { - filter_predicate.build() - }; - selected_rows = Some(filter); - sort_keys = sort_keys - .iter() - .map(|key| filter_predicate.filter(key).map_err(|x| x.into())) - .collect::>>()?; + // only update the keys / rows if the filter does not match all rows + if true_count < num_rows { + let filter_predicate = FilterBuilder::new(&filter); + let filter_predicate = if sort_keys.len() > 1 { + // Optimize filter when it has multiple sort keys + filter_predicate.optimize().build() + } else { + filter_predicate.build() + }; + selected_rows = Some(filter); + sort_keys = sort_keys + .iter() + .map(|key| filter_predicate.filter(key).map_err(|x| x.into())) + .collect::>>()?; + } }; // reuse existing `Rows` to avoid reallocations let rows = &mut self.scratch_rows; From 14a996dcc99584d7b671fa188d986a138150d419 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 14 Jun 2025 22:16:49 +0200 Subject: [PATCH 15/25] Fix test --- .../core/tests/physical_optimizer/filter_pushdown/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index f6282c3a1b57..d17fdcd0516d 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -398,11 +398,11 @@ async fn test_topk_dynamic_filter_pushdown() { OptimizationTest: input: - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true output: Ok: - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false] - - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ true ] " ); @@ -426,7 +426,7 @@ async fn test_topk_dynamic_filter_pushdown() { format!("{}", format_plan_for_test(&plan)), @r" - SortExec: TopK(fetch=1), expr=[b@1 DESC NULLS LAST], preserve_partitioning=[false], filter=[b@1 > bd] - - DataSourceExec: file_groups={1 group: [[test.paqruet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ b@1 > bd ] + - DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilterPhysicalExpr [ b@1 > bd ] " ); } From 
ebe5b80ae52c52c55c7d3f918797d89285784f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sun, 15 Jun 2025 00:26:10 +0200 Subject: [PATCH 16/25] Fix --- datafusion/physical-plan/src/topk/mod.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 0d97c11b2b7c..470133c5458b 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -19,7 +19,7 @@ use arrow::{ array::{Array, AsArray}, - compute::{interleave_record_batch, FilterBuilder}, + compute::{interleave_record_batch, prep_null_mask_filter, FilterBuilder}, row::{RowConverter, Rows, SortField}, }; use datafusion_expr::{ColumnarValue, Operator}; @@ -220,7 +220,7 @@ impl TopK { let filtered = filter.evaluate(&batch)?; let num_rows = batch.num_rows(); let array = filtered.into_array(num_rows)?; - let filter = array.as_boolean().clone(); + let mut filter = array.as_boolean().clone(); let true_count = filter.true_count(); if true_count == 0 { // nothing to filter, so no need to update @@ -228,6 +228,11 @@ impl TopK { } // only update the keys / rows if the filter does not match all rows if true_count < num_rows { + // Indices should be correct w.r.t. null indices + if filter.nulls().is_some() { + filter = prep_null_mask_filter(&filter); + } + let filter_predicate = FilterBuilder::new(&filter); let filter_predicate = if sort_keys.len() > 1 { // Optimize filter when it has multiple sort keys From 6832526341be574e7f267f162647f8473827c60b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sun, 15 Jun 2025 00:34:52 +0200 Subject: [PATCH 17/25] Adapt comment --- datafusion/physical-plan/src/topk/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 470133c5458b..8d06fa73ce8e 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -228,7 +228,9 @@ impl TopK { } // only update the keys / rows if the filter does not match all rows if true_count < num_rows { - // Indices should be correct w.r.t. null indices + // Indices in `set_indices` should be correct if filter contains nulls + // So we prepare the filter here. Note this is also done in the `FilterBuilder` + // so there is no overhead to do this here. 
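The null handling added here leans on arrow's `prep_null_mask_filter`, which folds NULL filter results into `false` so that iterating the raw values buffer (as `set_indices()` does later) cannot pick up rows whose filter result was NULL rather than true. A minimal demonstration:

```rust
use arrow::array::{Array, BooleanArray};
use arrow::compute::prep_null_mask_filter;

fn main() {
    // A mask as it might come back from evaluating a predicate: row 1 is NULL.
    let mask = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]);
    assert!(mask.nulls().is_some());

    // Same guard as in the patch: only pay for the rewrite when nulls are present.
    let prepared = if mask.nulls().is_some() {
        prep_null_mask_filter(&mask)
    } else {
        mask.clone()
    };

    // After preparation there is no null mask left to consult; the set bits of the
    // values buffer are exactly the selected row indices.
    assert!(prepared.nulls().is_none());
    let selected: Vec<usize> = prepared.values().set_indices().collect();
    assert_eq!(selected, vec![0, 3]);
}
```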
if filter.nulls().is_some() { filter = prep_null_mask_filter(&filter); } From 5ad57fe97a87787ccdba0dbfa49cea82d1af2664 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 16 Jun 2025 06:32:39 -0500 Subject: [PATCH 18/25] Split filter pushdown into two phases --- datafusion/core/benches/push_down_filter.rs | 2 +- .../core/tests/parquet/file_statistics.rs | 2 +- .../physical_optimizer/filter_pushdown/mod.rs | 28 +++++----- .../filter_pushdown/util.rs | 3 ++ datafusion/datasource/src/source.rs | 3 +- .../src/enforce_distribution.rs | 1 - .../src/enforce_sorting/mod.rs | 23 +++------ .../src/enforce_sorting/sort_pushdown.rs | 51 +------------------ .../physical-optimizer/src/filter_pushdown.rs | 45 ++++++++++------ .../physical-optimizer/src/optimizer.rs | 6 ++- .../src/topk_aggregation.rs | 7 +-- datafusion/physical-optimizer/src/utils.rs | 8 +-- .../physical-plan/src/coalesce_batches.rs | 5 +- .../physical-plan/src/execution_plan.rs | 8 ++- datafusion/physical-plan/src/filter.rs | 14 ++++- .../physical-plan/src/filter_pushdown.rs | 33 ++++++++++++ .../physical-plan/src/repartition/mod.rs | 5 +- datafusion/physical-plan/src/sorts/sort.rs | 7 ++- .../sqllogictest/test_files/explain.slt | 9 ++-- 19 files changed, 141 insertions(+), 119 deletions(-) diff --git a/datafusion/core/benches/push_down_filter.rs b/datafusion/core/benches/push_down_filter.rs index 139fb12c3094..f781b9ae46bd 100644 --- a/datafusion/core/benches/push_down_filter.rs +++ b/datafusion/core/benches/push_down_filter.rs @@ -105,7 +105,7 @@ fn bench_push_down_filter(c: &mut Criterion) { let mut config = ConfigOptions::default(); config.execution.parquet.pushdown_filters = true; let plan = BenchmarkPlan { plan, config }; - let optimizer = FilterPushdown::new(); + let optimizer = FilterPushdown::new_pre_optimization(); c.bench_function("push_down_filter", |b| { b.iter(|| { diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index a60beaf665e5..14ca17d1f8d5 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -83,7 +83,7 @@ async fn check_stats_precision_with_filter_pushdown() { Arc::new(FilterExec::try_new(physical_filter, exec_with_filter).unwrap()) as Arc; - let optimized_exec = FilterPushdown::new() + let optimized_exec = FilterPushdown::new_pre_optimization() .optimize(filtered_exec, &options) .unwrap(); diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index d17fdcd0516d..6c92c7c52d37 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -63,7 +63,7 @@ fn test_pushdown_into_scan() { // expect the predicate to be pushed down into the DataSource insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}, true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), @r" OptimizationTest: input: @@ -87,7 +87,7 @@ fn test_pushdown_into_scan_with_config_options() { insta::assert_snapshot!( OptimizationTest::new( Arc::clone(&plan), - FilterPushdown {}, + FilterPushdown::new_pre_optimization(), false ), @r" @@ -106,7 +106,7 @@ fn test_pushdown_into_scan_with_config_options() { insta::assert_snapshot!( OptimizationTest::new( plan, - FilterPushdown {}, + FilterPushdown::new_pre_optimization(), true ), @r" @@ -131,7 +131,7 @@ fn 
test_filter_collapse() { let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap()); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}, true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), @r" OptimizationTest: input: @@ -159,7 +159,7 @@ fn test_filter_with_projection() { // expect the predicate to be pushed down into the DataSource but the FilterExec to be converted to ProjectionExec insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}, true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), @r" OptimizationTest: input: @@ -182,7 +182,7 @@ fn test_filter_with_projection() { .unwrap(), ); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{},true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(),true), @r" OptimizationTest: input: @@ -211,7 +211,7 @@ fn test_push_down_through_transparent_nodes() { // expect the predicate to be pushed down into the DataSource insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{},true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(),true), @r" OptimizationTest: input: @@ -275,7 +275,7 @@ fn test_no_pushdown_through_aggregates() { // expect the predicate to be pushed down into the DataSource insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}, true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), @r" OptimizationTest: input: @@ -306,7 +306,7 @@ fn test_node_handles_child_pushdown_result() { let predicate = col_lit_predicate("a", "foo", &schema()); let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate)); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}, true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), @r" OptimizationTest: input: @@ -325,7 +325,7 @@ fn test_node_handles_child_pushdown_result() { let predicate = col_lit_predicate("a", "foo", &schema()); let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate)); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}, true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), @r" OptimizationTest: input: @@ -345,7 +345,7 @@ fn test_node_handles_child_pushdown_result() { let predicate = col_lit_predicate("a", "foo", &schema()); let plan = Arc::new(TestNode::new(false, Arc::clone(&scan), predicate)); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown{}, true), + OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), @r" OptimizationTest: input: @@ -393,7 +393,7 @@ async fn test_topk_dynamic_filter_pushdown() { // expect the predicate to be pushed down into the DataSource insta::assert_snapshot!( - OptimizationTest::new(Arc::clone(&plan), FilterPushdown{}, true), + OptimizationTest::new(Arc::clone(&plan), FilterPushdown::new_post_optimization(), true), @r" OptimizationTest: input: @@ -409,7 +409,9 @@ async fn test_topk_dynamic_filter_pushdown() { // Actually apply the optimization to the plan and put some data through it to check that the filter is updated to reflect the TopK state let mut config = ConfigOptions::default(); config.execution.parquet.pushdown_filters = true; - let plan = FilterPushdown::new().optimize(plan, &config).unwrap(); + let plan = FilterPushdown::new_post_optimization() + .optimize(plan, &config) + .unwrap(); let config = SessionConfig::new().with_batch_size(2); let session_ctx = 
SessionContext::new_with_config(config); session_ctx.register_object_store( diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index f99f0fa54a14..e793af8ed4b0 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -29,6 +29,7 @@ use datafusion_datasource::{ use datafusion_physical_expr::conjunction; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_plan::filter_pushdown::FilterPushdownPhase; use datafusion_physical_plan::{ displayable, filter::FilterExec, @@ -510,6 +511,7 @@ impl ExecutionPlan for TestNode { fn gather_filters_for_pushdown( &self, + _phase: FilterPushdownPhase, parent_filters: Vec>, _config: &ConfigOptions, ) -> Result { @@ -520,6 +522,7 @@ impl ExecutionPlan for TestNode { fn handle_child_pushdown_result( &self, + _phase: FilterPushdownPhase, child_pushdown_result: ChildPushdownResult, _config: &ConfigOptions, ) -> Result>> { diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs index 29c3c9c3d7ff..a0f49ad7b16c 100644 --- a/datafusion/datasource/src/source.rs +++ b/datafusion/datasource/src/source.rs @@ -36,7 +36,7 @@ use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::filter_pushdown::{ - ChildPushdownResult, FilterPushdownPropagation, + ChildPushdownResult, FilterPushdownPhase, FilterPushdownPropagation, }; use datafusion_physical_plan::yield_stream::wrap_yield_stream; @@ -318,6 +318,7 @@ impl ExecutionPlan for DataSourceExec { fn handle_child_pushdown_result( &self, + _phase: FilterPushdownPhase, child_pushdown_result: ChildPushdownResult, config: &ConfigOptions, ) -> Result>> { diff --git a/datafusion/physical-optimizer/src/enforce_distribution.rs b/datafusion/physical-optimizer/src/enforce_distribution.rs index bf08e4a9c243..478ce39eecb9 100644 --- a/datafusion/physical-optimizer/src/enforce_distribution.rs +++ b/datafusion/physical-optimizer/src/enforce_distribution.rs @@ -1308,7 +1308,6 @@ pub fn ensure_distribution( .downcast_ref::() .map(|output| output.fetch()) .unwrap_or(None), - None, )?; } } diff --git a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs index 78360ab30fde..8a71b28486a2 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/mod.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/mod.rs @@ -57,7 +57,6 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::plan_err; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::Result; -use datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr; use datafusion_physical_expr::{Distribution, Partitioning}; use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; @@ -404,7 +403,7 @@ pub fn parallelize_sorts( && requirements.plan.output_partitioning().partition_count() <= 1 { // Take the initial sort expressions and requirements - let (sort_exprs, fetch, filter) = get_sort_exprs(&requirements.plan)?; + let (sort_exprs, fetch) = 
get_sort_exprs(&requirements.plan)?; let sort_reqs = LexRequirement::from(sort_exprs.clone()); let sort_exprs = sort_exprs.clone(); @@ -418,7 +417,7 @@ pub fn parallelize_sorts( // deals with the children and their children and so on. requirements = requirements.children.swap_remove(0); - requirements = add_sort_above_with_check(requirements, sort_reqs, fetch, filter)?; + requirements = add_sort_above_with_check(requirements, sort_reqs, fetch)?; let spm = SortPreservingMergeExec::new(sort_exprs, Arc::clone(&requirements.plan)); @@ -514,7 +513,6 @@ pub fn ensure_sorting( .downcast_ref::() .map(|output| output.fetch()) .unwrap_or(None), - None, ); child = update_sort_ctx_children_data(child, true)?; } @@ -646,7 +644,7 @@ fn adjust_window_sort_removal( // Satisfy the ordering requirement so that the window can run: let mut child_node = window_tree.children.swap_remove(0); if let Some(reqs) = reqs { - child_node = add_sort_above(child_node, reqs.into_single(), None, None); + child_node = add_sort_above(child_node, reqs.into_single(), None); } let child_plan = Arc::clone(&child_node.plan); window_tree.children.push(child_node); @@ -805,20 +803,15 @@ fn remove_corresponding_sort_from_sub_plan( Ok(node) } -/// Return type for get_sort_exprs function -type SortExprsResult<'a> = ( - &'a LexOrdering, - Option, - Option>, -); - /// Converts an [ExecutionPlan] trait object to a [LexOrdering] reference when possible. -fn get_sort_exprs(sort_any: &Arc) -> Result> { +fn get_sort_exprs( + sort_any: &Arc, +) -> Result<(&LexOrdering, Option)> { if let Some(sort_exec) = sort_any.as_any().downcast_ref::() { - Ok((sort_exec.expr(), sort_exec.fetch(), sort_exec.filter())) + Ok((sort_exec.expr(), sort_exec.fetch())) } else if let Some(spm) = sort_any.as_any().downcast_ref::() { - Ok((spm.expr(), spm.fetch(), None)) + Ok((spm.expr(), spm.fetch())) } else { plan_err!("Given ExecutionPlan is not a SortExec or a SortPreservingMergeExec") } diff --git a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs index 1fe1ed3e21b1..bd7b0060c3be 100644 --- a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs +++ b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs @@ -26,7 +26,7 @@ use arrow::datatypes::SchemaRef; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{internal_err, HashSet, JoinSide, Result}; use datafusion_expr::JoinType; -use datafusion_physical_expr::expressions::{Column, DynamicFilterPhysicalExpr}; +use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::utils::collect_columns; use datafusion_physical_expr::{ add_offset_to_physical_sort_exprs, EquivalenceProperties, @@ -57,7 +57,6 @@ use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; pub struct ParentRequirements { ordering_requirement: Option, fetch: Option, - filter: Option>, } pub type SortPushDown = PlanContext; @@ -71,8 +70,6 @@ pub fn assign_initial_requirements(sort_push_down: &mut SortPushDown) { // If the parent has a fetch value, assign it to the children // Or use the fetch value of the child. 
fetch: child.plan.fetch(), - // If the parent has a filter, assign it to the children - filter: sort_push_down.data.filter.clone(), }; } } @@ -98,7 +95,6 @@ fn pushdown_sorts_helper( ) -> Result> { let plan = sort_push_down.plan; let parent_fetch = sort_push_down.data.fetch; - let parent_filter = sort_push_down.data.filter.clone(); let Some(parent_requirement) = sort_push_down.data.ordering_requirement.clone() else { @@ -118,18 +114,6 @@ fn pushdown_sorts_helper( sort_push_down.data.fetch = fetch; sort_push_down.data.ordering_requirement = Some(OrderingRequirements::from(sort_ordering)); - let filter = plan - .as_any() - .downcast_ref::() - .and_then(|s| s.filter().clone()); - match filter { - Some(filter) => { - sort_push_down.data.filter = Some(filter); - } - None => { - sort_push_down.data.filter = parent_filter.clone(); - } - } // Recursive call to helper, so it doesn't transform_down and miss // the new node (previous child of sort): return pushdown_sorts_helper(sort_push_down); @@ -147,20 +131,11 @@ fn pushdown_sorts_helper( return internal_err!("SortExec should have output ordering"); }; - let filter = plan - .as_any() - .downcast_ref::() - .and_then(|s| s.filter().clone()); - let sort_fetch = plan.fetch(); let parent_is_stricter = eqp.requirements_compatible( parent_requirement.first().clone(), sort_ordering.clone().into(), ); - let sort_filter = plan - .as_any() - .downcast_ref::() - .and_then(|s| s.filter().clone()); // Remove the current sort as we are either going to prove that it is // unnecessary, or replace it with a stricter sort. @@ -177,27 +152,17 @@ fn pushdown_sorts_helper( sort_push_down, parent_requirement.into_single(), parent_fetch, - filter.clone(), ); // Update pushdown requirements: sort_push_down.children[0].data = ParentRequirements { ordering_requirement: Some(OrderingRequirements::from(sort_ordering)), fetch: sort_fetch, - filter, }; return Ok(Transformed::yes(sort_push_down)); } else { // Sort was unnecessary, just propagate the stricter fetch and // ordering requirements: sort_push_down.data.fetch = min_fetch(sort_fetch, parent_fetch); - match sort_filter { - Some(filter) => { - sort_push_down.data.filter = Some(filter); - } - None => { - sort_push_down.data.filter = parent_filter.clone(); - } - } let current_is_stricter = eqp.requirements_compatible( sort_ordering.clone().into(), parent_requirement.first().clone(), @@ -229,22 +194,9 @@ fn pushdown_sorts_helper( // For operators that can take a sort pushdown, continue with updated // requirements: let current_fetch = sort_push_down.plan.fetch(); - let current_filter = sort_push_down - .plan - .as_any() - .downcast_ref::() - .and_then(|s| s.filter().clone()); for (child, order) in sort_push_down.children.iter_mut().zip(adjusted) { child.data.ordering_requirement = order; child.data.fetch = min_fetch(current_fetch, parent_fetch); - match current_filter { - Some(ref filter) => { - child.data.filter = Some(Arc::clone(filter)); - } - None => { - child.data.filter = parent_filter.clone(); - } - } } sort_push_down.data.ordering_requirement = None; } else { @@ -253,7 +205,6 @@ fn pushdown_sorts_helper( sort_push_down, parent_requirement.into_single(), parent_fetch, - parent_filter, ); assign_initial_requirements(&mut sort_push_down); } diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 5b2d47106b8d..4f5155eec235 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ 
-22,7 +22,8 @@ use crate::PhysicalOptimizerRule; use datafusion_common::{config::ConfigOptions, Result}; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::filter_pushdown::{ - ChildPushdownResult, FilterPushdownPropagation, PredicateSupport, PredicateSupports, + ChildPushdownResult, FilterPushdownPhase, FilterPushdownPropagation, + PredicateSupport, PredicateSupports, }; use datafusion_physical_plan::{with_new_children_if_necessary, ExecutionPlan}; @@ -362,17 +363,25 @@ use itertools::izip; /// [`ProjectionExec`]: datafusion_physical_plan::projection::ProjectionExec /// [`AggregateExec`]: datafusion_physical_plan::aggregates::AggregateExec #[derive(Debug)] -pub struct FilterPushdown {} +pub struct FilterPushdown { + phase: FilterPushdownPhase, + name: String, +} impl FilterPushdown { - pub fn new() -> Self { - Self {} + fn new(phase: FilterPushdownPhase) -> Self { + Self { + phase, + name: format!("FilterPushdown({phase})"), + } + } + + pub fn new_pre_optimization() -> Self { + Self::new(FilterPushdownPhase::Pre) } -} -impl Default for FilterPushdown { - fn default() -> Self { - Self::new() + pub fn new_post_optimization() -> Self { + Self::new(FilterPushdownPhase::Post) } } @@ -382,13 +391,15 @@ impl PhysicalOptimizerRule for FilterPushdown { plan: Arc, config: &ConfigOptions, ) -> Result> { - Ok(push_down_filters(Arc::clone(&plan), vec![], config)? - .updated_node - .unwrap_or(plan)) + Ok( + push_down_filters(Arc::clone(&plan), vec![], config, self.phase)? + .updated_node + .unwrap_or(plan), + ) } fn name(&self) -> &str { - "FilterPushdown" + &self.name } fn schema_check(&self) -> bool { @@ -409,6 +420,7 @@ fn push_down_filters( node: Arc, parent_predicates: Vec>, config: &ConfigOptions, + phase: FilterPushdownPhase, ) -> Result>> { // If the node has any child, these will be rewritten as supported or unsupported let mut parent_predicates_pushdown_states = @@ -418,7 +430,7 @@ fn push_down_filters( let children = node.children(); let filter_description = - node.gather_filters_for_pushdown(parent_predicates.clone(), config)?; + node.gather_filters_for_pushdown(phase, parent_predicates.clone(), config)?; for (child, parent_filters, self_filters) in izip!( children, @@ -460,7 +472,7 @@ fn push_down_filters( } // Any filters that could not be pushed down to a child are marked as not-supported to our parents - let result = push_down_filters(Arc::clone(child), all_predicates, config)?; + let result = push_down_filters(Arc::clone(child), all_predicates, config, phase)?; if let Some(new_child) = result.updated_node { // If we have a filter pushdown result, we need to update our children @@ -524,8 +536,11 @@ fn push_down_filters( }) .collect(), ); - // Check what the current node wants to do given the result of pushdown to it's children + // TODO: by calling `handle_child_pushdown_result` we are assuming that the + // `ExecutionPlan` implementation will not change the plan itself. + // Should we have a separate method for dynamic pushdown that does not allow modifying the plan? 
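With the rule split in two, the intent is to run one instance per phase: the pre-optimization pass moves static predicates early, and the post-optimization pass moves plan-referencing (dynamic) filters once the tree has settled. A sketch of how the two passes slot together, mirroring the bench and test usage in this patch; the import paths are my assumption, and `new_pre_optimization` is later renamed to `FilterPushdown::new()` in this series:

```rust
use std::sync::Arc;

use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;
use datafusion_physical_optimizer::filter_pushdown::FilterPushdown;
use datafusion_physical_optimizer::PhysicalOptimizerRule;
use datafusion_physical_plan::ExecutionPlan;

fn push_down_in_two_phases(
    plan: Arc<dyn ExecutionPlan>,
    config: &ConfigOptions,
) -> Result<Arc<dyn ExecutionPlan>> {
    // Early: static predicates, e.g. a FilterExec pushed into DataSourceExec.
    let plan = FilterPushdown::new_pre_optimization().optimize(plan, config)?;

    // ...the other physical optimizer rules (EnforceDistribution, EnforceSorting,
    // LimitPushdown, ...) run in between and may rewrite the plan freely...

    // Late: filters that hold references into the plan, e.g. the TopK dynamic filter.
    FilterPushdown::new_post_optimization().optimize(plan, config)
}
```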
let mut res = updated_node.handle_child_pushdown_result( + phase, ChildPushdownResult { parent_filters: parent_pushdown_result, self_filters: self_filters_pushdown_supports, diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index d5129cea9d4e..716e566644a4 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -98,8 +98,8 @@ impl PhysicalOptimizer { Arc::new(LimitedDistinctAggregation::new()), // The FilterPushdown rule tries to push down filters as far as it can. // For example, it will push down filtering from a `FilterExec` to - // a `DataSourceExec`, or from a `TopK`'s current state to a `DataSourceExec`. - Arc::new(FilterPushdown::new()), + // a `DataSourceExec`. + Arc::new(FilterPushdown::new_pre_optimization()), // The EnforceDistribution rule is for adding essential repartitioning to satisfy distribution // requirements. Please make sure that the whole plan tree is determined before this rule. // This rule increases parallelism if doing so is beneficial to the physical plan; i.e. at @@ -131,6 +131,8 @@ impl PhysicalOptimizer { // replacing operators with fetching variants, or adding limits // past operators that support limit pushdown. Arc::new(LimitPushdown::new()), + // This FilterPushdown handles dynamic filters that may have references to the source ExecutionPlan + Arc::new(FilterPushdown::new_post_optimization()), // The ProjectionPushdown rule tries to push projections towards // the sources in the execution plan. As a result of this process, // a projection can disappear if it reaches the source providers, and diff --git a/datafusion/physical-optimizer/src/topk_aggregation.rs b/datafusion/physical-optimizer/src/topk_aggregation.rs index 33166284774b..bff0b1e49684 100644 --- a/datafusion/physical-optimizer/src/topk_aggregation.rs +++ b/datafusion/physical-optimizer/src/topk_aggregation.rs @@ -130,13 +130,10 @@ impl TopKAggregation { Ok(Transformed::no(plan)) }; let child = Arc::clone(child).transform_down(closure).data().ok()?; - let mut new_sort = SortExec::new(sort.expr().clone(), child) + let sort = SortExec::new(sort.expr().clone(), child) .with_fetch(sort.fetch()) .with_preserve_partitioning(sort.preserve_partitioning()); - if let Some(filter) = sort.filter() { - new_sort = new_sort.with_filter(Arc::clone(&filter)); - } - Some(Arc::new(new_sort)) + Some(Arc::new(sort)) } } diff --git a/datafusion/physical-optimizer/src/utils.rs b/datafusion/physical-optimizer/src/utils.rs index 431925033678..3655e555a744 100644 --- a/datafusion/physical-optimizer/src/utils.rs +++ b/datafusion/physical-optimizer/src/utils.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use datafusion_common::Result; -use datafusion_physical_expr::expressions::DynamicFilterPhysicalExpr; use datafusion_physical_expr::{LexOrdering, LexRequirement}; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; @@ -40,7 +39,6 @@ pub fn add_sort_above( node: PlanContext, sort_requirements: LexRequirement, fetch: Option, - filter: Option>, ) -> PlanContext { let mut sort_reqs: Vec<_> = sort_requirements.into(); sort_reqs.retain(|sort_expr| { @@ -57,9 +55,6 @@ pub fn add_sort_above( if node.plan.output_partitioning().partition_count() > 1 { new_sort = new_sort.with_preserve_partitioning(true); } - if let Some(filter) = filter { - new_sort = new_sort.with_filter(filter); - } PlanContext::new(Arc::new(new_sort), 
T::default(), vec![node]) } @@ -70,14 +65,13 @@ pub fn add_sort_above_with_check( node: PlanContext, sort_requirements: LexRequirement, fetch: Option, - filter: Option>, ) -> Result> { if !node .plan .equivalence_properties() .ordering_satisfy_requirement(sort_requirements.clone())? { - Ok(add_sort_above(node, sort_requirements, fetch, filter)) + Ok(add_sort_above(node, sort_requirements, fetch)) } else { Ok(node) } diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index f35231fb6a99..78bd4b4fc3a0 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -37,7 +37,8 @@ use datafusion_physical_expr::PhysicalExpr; use crate::coalesce::{BatchCoalescer, CoalescerState}; use crate::execution_plan::CardinalityEffect; use crate::filter_pushdown::{ - ChildPushdownResult, FilterDescription, FilterPushdownPropagation, + ChildPushdownResult, FilterDescription, FilterPushdownPhase, + FilterPushdownPropagation, }; use datafusion_common::config::ConfigOptions; use futures::ready; @@ -229,6 +230,7 @@ impl ExecutionPlan for CoalesceBatchesExec { fn gather_filters_for_pushdown( &self, + _phase: FilterPushdownPhase, parent_filters: Vec>, _config: &ConfigOptions, ) -> Result { @@ -238,6 +240,7 @@ impl ExecutionPlan for CoalesceBatchesExec { fn handle_child_pushdown_result( &self, + _phase: FilterPushdownPhase, child_pushdown_result: ChildPushdownResult, _config: &ConfigOptions, ) -> Result>> { diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index 5dd090c12c27..a2f9319c3a5e 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -17,7 +17,8 @@ pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay}; use crate::filter_pushdown::{ - ChildPushdownResult, FilterDescription, FilterPushdownPropagation, + ChildPushdownResult, FilterDescription, FilterPushdownPhase, + FilterPushdownPropagation, }; pub use crate::metrics::Metric; pub use crate::ordering::InputOrderMode; @@ -509,8 +510,12 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// /// The default implementation bars all parent filters from being pushed down and adds no new filters. /// This is the safest option, making filter pushdown opt-in on a per-node pasis. + /// + /// Since this may perform deep modifications to the plan tree it is called early in the optimization phase + /// and is not expected to be called multiple times on the same plan. 
fn gather_filters_for_pushdown( &self, + _phase: FilterPushdownPhase, parent_filters: Vec>, _config: &ConfigOptions, ) -> Result { @@ -552,6 +557,7 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// [`PredicateSupports::new_with_supported_check`]: crate::filter_pushdown::PredicateSupports::new_with_supported_check fn handle_child_pushdown_result( &self, + _phase: FilterPushdownPhase, child_pushdown_result: ChildPushdownResult, _config: &ConfigOptions, ) -> Result>> { diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 25eb98a61b2e..252af9ebcd49 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -28,7 +28,8 @@ use super::{ use crate::common::can_project; use crate::execution_plan::CardinalityEffect; use crate::filter_pushdown::{ - ChildPushdownResult, FilterDescription, FilterPushdownPropagation, + ChildPushdownResult, FilterDescription, FilterPushdownPhase, + FilterPushdownPropagation, }; use crate::projection::{ make_with_child, try_embed_projection, update_expr, EmbeddedProjection, @@ -449,9 +450,14 @@ impl ExecutionPlan for FilterExec { fn gather_filters_for_pushdown( &self, + phase: FilterPushdownPhase, parent_filters: Vec>, _config: &ConfigOptions, ) -> Result { + if !matches!(phase, FilterPushdownPhase::Pre) { + return Ok(FilterDescription::new_with_child_count(1) + .all_parent_filters_supported(parent_filters)); + } let self_filter = split_conjunction(&self.predicate) .into_iter() .cloned() @@ -503,9 +509,15 @@ impl ExecutionPlan for FilterExec { fn handle_child_pushdown_result( &self, + phase: FilterPushdownPhase, child_pushdown_result: ChildPushdownResult, _config: &ConfigOptions, ) -> Result>> { + if !matches!(phase, FilterPushdownPhase::Pre) { + return Ok(FilterPushdownPropagation::transparent( + child_pushdown_result, + )); + } // We absorb any parent filters that were not handled by our children let mut unhandled_filters = child_pushdown_result.parent_filters.collect_unsupported(); diff --git a/datafusion/physical-plan/src/filter_pushdown.rs b/datafusion/physical-plan/src/filter_pushdown.rs index 5222a98e3dd5..236abe9d2353 100644 --- a/datafusion/physical-plan/src/filter_pushdown.rs +++ b/datafusion/physical-plan/src/filter_pushdown.rs @@ -20,6 +20,39 @@ use std::vec::IntoIter; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +#[derive(Debug, Clone, Copy)] +pub enum FilterPushdownPhase { + /// Pushdown that happens before most other optimizations. + /// This pushdown allows static filters that do not reference any [`ExecutionPlan`]s to be pushed down. + /// Filters that reference an [`ExecutionPlan`] cannot be pushed down at this stage since the whole plan tree may be rewritten + /// by other optimizations. + /// Implemneters are however allowed to modify the execution plan themselves during this phase, for example by returning a completely + /// different [`ExecutionPlan`] from [`ExecutionPlan::handle_child_pushdown_result`]. + /// + /// [`ExecutionPlan`]: crate::ExecutionPlan + /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result + Pre, + /// Pushdown that happens after most other optimizations. + /// This pushdown allows filters that reference an [`ExecutionPlan`] to be pushed down. + /// It is guaranteed that subsequent optimizations will not make large changes to the plan tree, + /// but implementers are likewise not allowed to modify the plan tree themselves. 
+ /// [`ExecutionPlan::handle_child_pushdown_result`] may still return a different [`ExecutionPlan`] (e.g. with internal state replaced) but + /// larger changes to the plan tree are likely to conflict with other optimizations or break execution outright. + /// + /// [`ExecutionPlan`]: crate::ExecutionPlan + /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result + Post, +} + +impl std::fmt::Display for FilterPushdownPhase { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + FilterPushdownPhase::Pre => write!(f, "Pre"), + FilterPushdownPhase::Post => write!(f, "Post"), + } + } +} + /// The result of a plan for pushing down a filter into a child node. /// This contains references to filters so that nodes can mutate a filter /// before pushing it down to a child node (e.g. to adjust a projection) diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 8b0f7e9784af..d872a84d7285 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -55,7 +55,8 @@ use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr}; use datafusion_physical_expr_common::sort_expr::LexOrdering; use crate::filter_pushdown::{ - ChildPushdownResult, FilterDescription, FilterPushdownPropagation, + ChildPushdownResult, FilterDescription, FilterPushdownPhase, + FilterPushdownPropagation, }; use futures::stream::Stream; use futures::{FutureExt, StreamExt, TryStreamExt}; @@ -807,6 +808,7 @@ impl ExecutionPlan for RepartitionExec { fn gather_filters_for_pushdown( &self, + _phase: FilterPushdownPhase, parent_filters: Vec>, _config: &ConfigOptions, ) -> Result { @@ -816,6 +818,7 @@ impl ExecutionPlan for RepartitionExec { fn handle_child_pushdown_result( &self, + _phase: FilterPushdownPhase, child_pushdown_result: ChildPushdownResult, _config: &ConfigOptions, ) -> Result>> { diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 1ffaf7f1641b..0aa284297b22 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -27,7 +27,7 @@ use std::sync::Arc; use crate::common::spawn_buffered; use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType}; use crate::expressions::PhysicalSortExpr; -use crate::filter_pushdown::FilterDescription; +use crate::filter_pushdown::{FilterDescription, FilterPushdownPhase}; use crate::limit::LimitStream; use crate::metrics::{ BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, SpillMetrics, @@ -1268,9 +1268,14 @@ impl ExecutionPlan for SortExec { fn gather_filters_for_pushdown( &self, + phase: FilterPushdownPhase, parent_filters: Vec>, config: &datafusion_common::config::ConfigOptions, ) -> Result { + if !matches!(phase, FilterPushdownPhase::Post) { + return Ok(FilterDescription::new_with_child_count(1) + .all_parent_filters_supported(parent_filters)); + } if let Some(filter) = &self.filter { if config.optimizer.enable_dynamic_filter_pushdown { let filter = Arc::clone(filter) as Arc; diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 560648271070..3aff0b443752 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -231,7 +231,7 @@ physical_plan after OutputRequirements physical_plan after aggregate_statistics SAME TEXT AS ABOVE physical_plan after 
join_selection SAME TEXT AS ABOVE physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE -physical_plan after FilterPushdown SAME TEXT AS ABOVE +physical_plan after FilterPushdown(Pre) SAME TEXT AS ABOVE physical_plan after EnforceDistribution SAME TEXT AS ABOVE physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE physical_plan after EnforceSorting SAME TEXT AS ABOVE @@ -241,6 +241,7 @@ physical_plan after coalesce_batches SAME TEXT AS ABOVE physical_plan after OutputRequirements DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true physical_plan after LimitAggregation SAME TEXT AS ABOVE physical_plan after LimitPushdown SAME TEXT AS ABOVE +physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after insert_yield_exec SAME TEXT AS ABOVE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE @@ -307,7 +308,7 @@ physical_plan after OutputRequirements physical_plan after aggregate_statistics SAME TEXT AS ABOVE physical_plan after join_selection SAME TEXT AS ABOVE physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE -physical_plan after FilterPushdown SAME TEXT AS ABOVE +physical_plan after FilterPushdown(Pre) SAME TEXT AS ABOVE physical_plan after EnforceDistribution SAME TEXT AS ABOVE physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE physical_plan after EnforceSorting SAME TEXT AS ABOVE @@ -319,6 +320,7 @@ physical_plan after OutputRequirements 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] physical_plan after LimitAggregation SAME TEXT AS ABOVE physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] +physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after insert_yield_exec SAME TEXT AS ABOVE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE @@ -349,7 +351,7 @@ physical_plan after OutputRequirements physical_plan after aggregate_statistics SAME TEXT AS ABOVE physical_plan after join_selection SAME TEXT AS ABOVE physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE -physical_plan after FilterPushdown SAME TEXT AS ABOVE +physical_plan after FilterPushdown(Pre) SAME TEXT AS ABOVE physical_plan after EnforceDistribution SAME TEXT AS ABOVE physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE physical_plan after EnforceSorting SAME TEXT AS ABOVE @@ -361,6 +363,7 @@ physical_plan after OutputRequirements 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, 
bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet physical_plan after LimitAggregation SAME TEXT AS ABOVE physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet +physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after insert_yield_exec SAME TEXT AS ABOVE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE From 938c12975def65818f268fa67e212b39a3d6e075 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 16 Jun 2025 08:37:09 -0500 Subject: [PATCH 19/25] more docs --- .../physical-plan/src/execution_plan.rs | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index a2f9319c3a5e..002a299f952d 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -513,6 +513,16 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// /// Since this may perform deep modifications to the plan tree it is called early in the optimization phase /// and is not expected to be called multiple times on the same plan. + /// + /// A quick summary of the phases is below, see [`FilterPushdownPhase`] for more details: + /// - [`FilterPushdownPhase::Pre`]: Filters get pushded down before most other optimizations are applied. + /// At this stage the plan can be modified (e.g. when [`ExecutionPlan::handle_child_pushdown_result`] is called the plan may choose to return an entirely new plan tree) + /// but subsequent optimizations may also rewrite the plan tree drastically, thus it is *not guaranteed* that a [`PhysicalExpr`] can hold on to a reference to the plan tree. + /// During this phase static filters (such as `col = 1`) are pushed down. + /// - [`FilterPushdownPhase::Post`]: Filters get pushed down after most other optimizations are applied. + /// At this stage the plan tree is expected to be stable and not change drastically, and operators that do filter pushdown during this phase should also not change the plan tree. + /// They may still return a new plan tree, but it should be equivalent to the old one to avoid breaking invariants from other optimizations. + /// During this phase dynamic filters (such as a reference to a TopK heap) are pushed down. fn gather_filters_for_pushdown( &self, _phase: FilterPushdownPhase, @@ -552,6 +562,17 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// - [`PredicateSupports::new_with_supported_check`]: takes a callback that returns true / false for each filter to indicate pushdown support. /// This can be used alongside [`FilterPushdownPropagation::with_filters`] and [`FilterPushdownPropagation::with_updated_node`] /// to dynamically build a result with a mix of supported and unsupported filters. + /// + /// There are two different phases in filter pushdown, which some operators may handle the same and some differently. + /// A quick summary of the phases is below, see [`FilterPushdownPhase`] for more details: + /// - [`FilterPushdownPhase::Pre`]: Filters get pushded down before most other optimizations are applied. 
+ /// At this stage the plan can be modified (e.g. when [`ExecutionPlan::handle_child_pushdown_result`] is called the plan may choose to return an entirely new plan tree) + /// but subsequent optimizations may also rewrite the plan tree drastically, thus it is *not guaranteed* that a [`PhysicalExpr`] can hold on to a reference to the plan tree. + /// During this phase static filters (such as `col = 1`) are pushed down. + /// - [`FilterPushdownPhase::Post`]: Filters get pushed down after most other optimizations are applied. + /// At this stage the plan tree is expected to be stable and not change drastically, and operators that do filter pushdown during this phase should also not change the plan tree. + /// They may still return a new plan tree, but it should be equivalent to the old one to avoid breaking invariants from other optimizations. + /// During this phase dynamic filters (such as a reference to a TopK heap) are pushed down. /// /// [`PredicateSupport::Supported`]: crate::filter_pushdown::PredicateSupport::Supported /// [`PredicateSupports::new_with_supported_check`]: crate::filter_pushdown::PredicateSupports::new_with_supported_check From 3463c4495b12dc5fa213289e95c24faedd05ce42 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 16 Jun 2025 08:45:54 -0500 Subject: [PATCH 20/25] fmt --- datafusion/physical-plan/src/execution_plan.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index 002a299f952d..c1b84725c639 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -513,7 +513,7 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// /// Since this may perform deep modifications to the plan tree it is called early in the optimization phase /// and is not expected to be called multiple times on the same plan. - /// + /// /// A quick summary of the phases is below, see [`FilterPushdownPhase`] for more details: /// - [`FilterPushdownPhase::Pre`]: Filters get pushded down before most other optimizations are applied. /// At this stage the plan can be modified (e.g. when [`ExecutionPlan::handle_child_pushdown_result`] is called the plan may choose to return an entirely new plan tree) @@ -562,7 +562,7 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// - [`PredicateSupports::new_with_supported_check`]: takes a callback that returns true / false for each filter to indicate pushdown support. /// This can be used alongside [`FilterPushdownPropagation::with_filters`] and [`FilterPushdownPropagation::with_updated_node`] /// to dynamically build a result with a mix of supported and unsupported filters. - /// + /// /// There are two different phases in filter pushdown, which some operators may handle the same and some differently. /// A quick summary of the phases is below, see [`FilterPushdownPhase`] for more details: /// - [`FilterPushdownPhase::Pre`]: Filters get pushded down before most other optimizations are applied. 
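The reason dynamic filters have to wait for the Post phase is that they are shared, updatable expressions: the operator that maintains them (here the TopK heap inside `SortExec`) and the scan they were pushed into both hold a reference to the same object, so the plan tree must not be rewritten afterwards. The snapshot test earlier in this series shows exactly that, with the scan's predicate moving from `DynamicFilterPhysicalExpr [ true ]` to `[ b@1 > bd ]` as batches flow. A std-only toy of that sharing (not DataFusion's `DynamicFilterPhysicalExpr` API):

```rust
use std::sync::{Arc, RwLock};

/// `None` plays the role of the initial `true` predicate: no bound known yet.
#[derive(Clone, Default)]
struct SharedThreshold(Arc<RwLock<Option<i64>>>);

impl SharedThreshold {
    /// Called by the "TopK" side whenever the worst value kept in its heap tightens.
    fn update(&self, heap_max: i64) {
        *self.0.write().unwrap() = Some(heap_max);
    }

    /// Called by the "scan" side per file or batch; loosely mirrors `filter.current()`.
    fn passes(&self, value: i64) -> bool {
        match *self.0.read().unwrap() {
            None => true,
            // For ORDER BY x ASC LIMIT k only values below the current max can still win.
            Some(max) => value < max,
        }
    }
}

fn main() {
    let topk_side = SharedThreshold::default();
    let scan_side = topk_side.clone(); // this clone is what gets "pushed down"

    assert!(scan_side.passes(1_000)); // nothing known yet: everything passes
    topk_side.update(42); // heap is full; the worst kept value is 42
    assert!(!scan_side.passes(1_000)); // later data above the bound can be skipped
    assert!(scan_side.passes(7));
}
```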
From c719676b7b6ac7cafe269cd0eaf998fb23585e95 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 16 Jun 2025 17:02:32 -0500 Subject: [PATCH 21/25] revert unecessary change --- datafusion/physical-expr/src/expressions/dynamic_filters.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/dynamic_filters.rs b/datafusion/physical-expr/src/expressions/dynamic_filters.rs index 09a8da61cdfa..756fb638af2b 100644 --- a/datafusion/physical-expr/src/expressions/dynamic_filters.rs +++ b/datafusion/physical-expr/src/expressions/dynamic_filters.rs @@ -288,9 +288,8 @@ impl PhysicalExpr for DynamicFilterPhysicalExpr { } fn snapshot(&self) -> Result>> { - let inner = self.current()?; // Return the current expression as a snapshot. - Ok(Some(inner)) + Ok(Some(self.current()?)) } } From 4a3e3c3e75f01867c667cbfb766594bed60c19b1 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 16 Jun 2025 17:13:25 -0500 Subject: [PATCH 22/25] docs, naming --- datafusion/core/benches/push_down_filter.rs | 2 +- .../core/tests/parquet/file_statistics.rs | 2 +- .../physical_optimizer/filter_pushdown/mod.rs | 22 ++++++++-------- .../physical-optimizer/src/filter_pushdown.rs | 20 +++++++++----- .../physical-optimizer/src/optimizer.rs | 14 ++++++---- .../physical-plan/src/execution_plan.rs | 26 ++++--------------- .../physical-plan/src/filter_pushdown.rs | 15 ++++++----- 7 files changed, 49 insertions(+), 52 deletions(-) diff --git a/datafusion/core/benches/push_down_filter.rs b/datafusion/core/benches/push_down_filter.rs index f781b9ae46bd..139fb12c3094 100644 --- a/datafusion/core/benches/push_down_filter.rs +++ b/datafusion/core/benches/push_down_filter.rs @@ -105,7 +105,7 @@ fn bench_push_down_filter(c: &mut Criterion) { let mut config = ConfigOptions::default(); config.execution.parquet.pushdown_filters = true; let plan = BenchmarkPlan { plan, config }; - let optimizer = FilterPushdown::new_pre_optimization(); + let optimizer = FilterPushdown::new(); c.bench_function("push_down_filter", |b| { b.iter(|| { diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 14ca17d1f8d5..a60beaf665e5 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -83,7 +83,7 @@ async fn check_stats_precision_with_filter_pushdown() { Arc::new(FilterExec::try_new(physical_filter, exec_with_filter).unwrap()) as Arc; - let optimized_exec = FilterPushdown::new_pre_optimization() + let optimized_exec = FilterPushdown::new() .optimize(filtered_exec, &options) .unwrap(); diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs index 6c92c7c52d37..f1ef365c9220 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs @@ -63,7 +63,7 @@ fn test_pushdown_into_scan() { // expect the predicate to be pushed down into the DataSource insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), + OptimizationTest::new(plan, FilterPushdown::new(), true), @r" OptimizationTest: input: @@ -87,7 +87,7 @@ fn test_pushdown_into_scan_with_config_options() { insta::assert_snapshot!( OptimizationTest::new( Arc::clone(&plan), - FilterPushdown::new_pre_optimization(), + 
FilterPushdown::new(), false ), @r" @@ -106,7 +106,7 @@ fn test_pushdown_into_scan_with_config_options() { insta::assert_snapshot!( OptimizationTest::new( plan, - FilterPushdown::new_pre_optimization(), + FilterPushdown::new(), true ), @r" @@ -131,7 +131,7 @@ fn test_filter_collapse() { let plan = Arc::new(FilterExec::try_new(predicate2, filter1).unwrap()); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), + OptimizationTest::new(plan, FilterPushdown::new(), true), @r" OptimizationTest: input: @@ -159,7 +159,7 @@ fn test_filter_with_projection() { // expect the predicate to be pushed down into the DataSource but the FilterExec to be converted to ProjectionExec insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), + OptimizationTest::new(plan, FilterPushdown::new(), true), @r" OptimizationTest: input: @@ -182,7 +182,7 @@ fn test_filter_with_projection() { .unwrap(), ); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(),true), + OptimizationTest::new(plan, FilterPushdown::new(),true), @r" OptimizationTest: input: @@ -211,7 +211,7 @@ fn test_push_down_through_transparent_nodes() { // expect the predicate to be pushed down into the DataSource insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(),true), + OptimizationTest::new(plan, FilterPushdown::new(),true), @r" OptimizationTest: input: @@ -275,7 +275,7 @@ fn test_no_pushdown_through_aggregates() { // expect the predicate to be pushed down into the DataSource insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), + OptimizationTest::new(plan, FilterPushdown::new(), true), @r" OptimizationTest: input: @@ -306,7 +306,7 @@ fn test_node_handles_child_pushdown_result() { let predicate = col_lit_predicate("a", "foo", &schema()); let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate)); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), + OptimizationTest::new(plan, FilterPushdown::new(), true), @r" OptimizationTest: input: @@ -325,7 +325,7 @@ fn test_node_handles_child_pushdown_result() { let predicate = col_lit_predicate("a", "foo", &schema()); let plan = Arc::new(TestNode::new(true, Arc::clone(&scan), predicate)); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), + OptimizationTest::new(plan, FilterPushdown::new(), true), @r" OptimizationTest: input: @@ -345,7 +345,7 @@ fn test_node_handles_child_pushdown_result() { let predicate = col_lit_predicate("a", "foo", &schema()); let plan = Arc::new(TestNode::new(false, Arc::clone(&scan), predicate)); insta::assert_snapshot!( - OptimizationTest::new(plan, FilterPushdown::new_pre_optimization(), true), + OptimizationTest::new(plan, FilterPushdown::new(), true), @r" OptimizationTest: input: diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 4f5155eec235..52e4e075ff6b 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -369,19 +369,25 @@ pub struct FilterPushdown { } impl FilterPushdown { - fn new(phase: FilterPushdownPhase) -> Self { - Self { - phase, - name: format!("FilterPushdown({phase})"), + fn new_with_phase(phase: FilterPushdownPhase) -> Self { + let name = match phase { + FilterPushdownPhase::Pre => 
"FilterPushdown", + FilterPushdownPhase::Post => "FilterPushdown(Post)", } + .to_string(); + Self { phase, name } } - pub fn new_pre_optimization() -> Self { - Self::new(FilterPushdownPhase::Pre) + /// Create a new [`FilterPushdown`] optimizer rule that runs in the pre-optimization phase. + /// See [`FilterPushdownPhase`] for more details. + pub fn new() -> Self { + Self::new_with_phase(FilterPushdownPhase::Pre) } + /// Create a new [`FilterPushdown`] optimizer rule that runs in the post-optimization phase. + /// See [`FilterPushdownPhase`] for more details. pub fn new_post_optimization() -> Self { - Self::new(FilterPushdownPhase::Post) + Self::new_with_phase(FilterPushdownPhase::Post) } } diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index 716e566644a4..aed81606919e 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -97,9 +97,11 @@ impl PhysicalOptimizer { // Applying the rule early means only directly-connected AggregateExecs must be examined. Arc::new(LimitedDistinctAggregation::new()), // The FilterPushdown rule tries to push down filters as far as it can. - // For example, it will push down filtering from a `FilterExec` to - // a `DataSourceExec`. - Arc::new(FilterPushdown::new_pre_optimization()), + // For example, it will push down filtering from a `FilterExec` to `DataSourceExec`. + // Note that this does not push down dynamic filters (such as those created by a `SortExec` operator in TopK mode), + // those are handled by the later `FilterPushdown` rule. + // See `FilterPushdownPhase` for more details. + Arc::new(FilterPushdown::new()), // The EnforceDistribution rule is for adding essential repartitioning to satisfy distribution // requirements. Please make sure that the whole plan tree is determined before this rule. // This rule increases parallelism if doing so is beneficial to the physical plan; i.e. at @@ -131,8 +133,6 @@ impl PhysicalOptimizer { // replacing operators with fetching variants, or adding limits // past operators that support limit pushdown. Arc::new(LimitPushdown::new()), - // This FilterPushdown handles dynamic filters that may have references to the source ExecutionPlan - Arc::new(FilterPushdown::new_post_optimization()), // The ProjectionPushdown rule tries to push projections towards // the sources in the execution plan. As a result of this process, // a projection can disappear if it reaches the source providers, and @@ -141,6 +141,10 @@ impl PhysicalOptimizer { // reduced by narrowing their input tables. Arc::new(ProjectionPushdown::new()), Arc::new(InsertYieldExec::new()), + // This FilterPushdown handles dynamic filters that may have references to the source ExecutionPlan. + // Therefore it should be run at the end of the optimization process since any changes to the plan may break the dynamic filter's references. + // See `FilterPushdownPhase` for more details. + Arc::new(FilterPushdown::new_post_optimization()), // The SanityCheckPlan rule checks whether the order and // distribution requirements of each node in the plan // is satisfied. 
It will also reject non-runnable query diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index c1b84725c639..75f1c90820e1 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -511,18 +511,9 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// The default implementation bars all parent filters from being pushed down and adds no new filters. /// This is the safest option, making filter pushdown opt-in on a per-node pasis. /// - /// Since this may perform deep modifications to the plan tree it is called early in the optimization phase - /// and is not expected to be called multiple times on the same plan. - /// - /// A quick summary of the phases is below, see [`FilterPushdownPhase`] for more details: - /// - [`FilterPushdownPhase::Pre`]: Filters get pushded down before most other optimizations are applied. - /// At this stage the plan can be modified (e.g. when [`ExecutionPlan::handle_child_pushdown_result`] is called the plan may choose to return an entirely new plan tree) - /// but subsequent optimizations may also rewrite the plan tree drastically, thus it is *not guaranteed* that a [`PhysicalExpr`] can hold on to a reference to the plan tree. - /// During this phase static filters (such as `col = 1`) are pushed down. - /// - [`FilterPushdownPhase::Post`]: Filters get pushed down after most other optimizations are applied. - /// At this stage the plan tree is expected to be stable and not change drastically, and operators that do filter pushdown during this phase should also not change the plan tree. - /// They may still return a new plan tree, but it should be equivalent to the old one to avoid breaking invariants from other optimizations. - /// During this phase dynamic filters (such as a reference to a TopK heap) are pushed down. + /// There are two different phases in filter pushdown, which some operators may handle the same and some differently. + /// Depending on the phase the operator may or may not be allowed to modify the plan. + /// See [`FilterPushdownPhase`] for more details. fn gather_filters_for_pushdown( &self, _phase: FilterPushdownPhase, @@ -564,15 +555,8 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// to dynamically build a result with a mix of supported and unsupported filters. /// /// There are two different phases in filter pushdown, which some operators may handle the same and some differently. - /// A quick summary of the phases is below, see [`FilterPushdownPhase`] for more details: - /// - [`FilterPushdownPhase::Pre`]: Filters get pushded down before most other optimizations are applied. - /// At this stage the plan can be modified (e.g. when [`ExecutionPlan::handle_child_pushdown_result`] is called the plan may choose to return an entirely new plan tree) - /// but subsequent optimizations may also rewrite the plan tree drastically, thus it is *not guaranteed* that a [`PhysicalExpr`] can hold on to a reference to the plan tree. - /// During this phase static filters (such as `col = 1`) are pushed down. - /// - [`FilterPushdownPhase::Post`]: Filters get pushed down after most other optimizations are applied. - /// At this stage the plan tree is expected to be stable and not change drastically, and operators that do filter pushdown during this phase should also not change the plan tree. 
- /// They may still return a new plan tree, but it should be equivalent to the old one to avoid breaking invariants from other optimizations. - /// During this phase dynamic filters (such as a reference to a TopK heap) are pushed down. + /// Depending on the phase the operator may or may not be allowed to modify the plan. + /// See [`FilterPushdownPhase`] for more details. /// /// [`PredicateSupport::Supported`]: crate::filter_pushdown::PredicateSupport::Supported /// [`PredicateSupports::new_with_supported_check`]: crate::filter_pushdown::PredicateSupports::new_with_supported_check diff --git a/datafusion/physical-plan/src/filter_pushdown.rs b/datafusion/physical-plan/src/filter_pushdown.rs index 236abe9d2353..16a8feec9833 100644 --- a/datafusion/physical-plan/src/filter_pushdown.rs +++ b/datafusion/physical-plan/src/filter_pushdown.rs @@ -26,20 +26,23 @@ pub enum FilterPushdownPhase { /// This pushdown allows static filters that do not reference any [`ExecutionPlan`]s to be pushed down. /// Filters that reference an [`ExecutionPlan`] cannot be pushed down at this stage since the whole plan tree may be rewritten /// by other optimizations. - /// Implemneters are however allowed to modify the execution plan themselves during this phase, for example by returning a completely + /// Implementers are however allowed to modify the execution plan themselves during this phase, for example by returning a completely /// different [`ExecutionPlan`] from [`ExecutionPlan::handle_child_pushdown_result`]. /// + /// Pushdown of `FilterExec` into `DataSourceExec` is an example of a pre-pushdown. + /// /// [`ExecutionPlan`]: crate::ExecutionPlan /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result Pre, /// Pushdown that happens after most other optimizations. - /// This pushdown allows filters that reference an [`ExecutionPlan`] to be pushed down. - /// It is guaranteed that subsequent optimizations will not make large changes to the plan tree, - /// but implementers are likewise not allowed to modify the plan tree themselves. - /// [`ExecutionPlan::handle_child_pushdown_result`] may still return a different [`ExecutionPlan`] (e.g. with internal state replaced) but - /// larger changes to the plan tree are likely to conflict with other optimizations or break execution outright. + /// This stage of filter pushdown allows filters that reference an [`ExecutionPlan`] to be pushed down. + /// Since subsequent optimizations should not change the structure of the plan tree except for calling [`ExecutionPlan::with_new_children`] + /// (which generally preserves internal references) it is safe for references between [`ExecutionPlan`]s to be established at this stage. + /// + /// This phase is used to link a `SortExec` (with a TopK operator) or a `HashJoinExec` to a `DataSourceExec`. 
/// /// [`ExecutionPlan`]: crate::ExecutionPlan + /// [`ExecutionPlan::with_new_children`]: crate::ExecutionPlan::with_new_children /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result Post, } From cd56084f922cccd27a7c847fc18eb9cd213578c1 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 16 Jun 2025 17:16:26 -0500 Subject: [PATCH 23/25] add links --- datafusion/physical-plan/src/filter_pushdown.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-plan/src/filter_pushdown.rs b/datafusion/physical-plan/src/filter_pushdown.rs index 16a8feec9833..3bbe3997fdfc 100644 --- a/datafusion/physical-plan/src/filter_pushdown.rs +++ b/datafusion/physical-plan/src/filter_pushdown.rs @@ -29,9 +29,10 @@ pub enum FilterPushdownPhase { /// Implementers are however allowed to modify the execution plan themselves during this phase, for example by returning a completely /// different [`ExecutionPlan`] from [`ExecutionPlan::handle_child_pushdown_result`]. /// - /// Pushdown of `FilterExec` into `DataSourceExec` is an example of a pre-pushdown. + /// Pushdown of [`FilterExec`] into `DataSourceExec` is an example of a pre-pushdown. /// /// [`ExecutionPlan`]: crate::ExecutionPlan + /// [`FilterExec`]: crate::filter::FilterExec /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result Pre, /// Pushdown that happens after most other optimizations. @@ -39,10 +40,12 @@ pub enum FilterPushdownPhase { /// Since subsequent optimizations should not change the structure of the plan tree except for calling [`ExecutionPlan::with_new_children`] /// (which generally preserves internal references) it is safe for references between [`ExecutionPlan`]s to be established at this stage. /// - /// This phase is used to link a `SortExec` (with a TopK operator) or a `HashJoinExec` to a `DataSourceExec`. + /// This phase is used to link a [`SortExec`] (with a TopK operator) or a [`HashJoinExec`] to a `DataSourceExec`. 
/// /// [`ExecutionPlan`]: crate::ExecutionPlan /// [`ExecutionPlan::with_new_children`]: crate::ExecutionPlan::with_new_children + /// [`SortExec`]: crate::sorts::sort::SortExec + /// [`HashJoinExec`]: crate::joins::HashJoinExec /// [`ExecutionPlan::handle_child_pushdown_result`]: crate::ExecutionPlan::handle_child_pushdown_result Post, } From 7badc1efe470792a80e574ed4ae2dd92829e2a38 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 16 Jun 2025 20:45:14 -0500 Subject: [PATCH 24/25] update plans --- datafusion/sqllogictest/test_files/explain.slt | 12 ++++++------ .../test_files/parquet_filter_pushdown.slt | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 3aff0b443752..e65225084548 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -231,7 +231,7 @@ physical_plan after OutputRequirements physical_plan after aggregate_statistics SAME TEXT AS ABOVE physical_plan after join_selection SAME TEXT AS ABOVE physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE -physical_plan after FilterPushdown(Pre) SAME TEXT AS ABOVE +physical_plan after FilterPushdown SAME TEXT AS ABOVE physical_plan after EnforceDistribution SAME TEXT AS ABOVE physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE physical_plan after EnforceSorting SAME TEXT AS ABOVE @@ -241,9 +241,9 @@ physical_plan after coalesce_batches SAME TEXT AS ABOVE physical_plan after OutputRequirements DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true physical_plan after LimitAggregation SAME TEXT AS ABOVE physical_plan after LimitPushdown SAME TEXT AS ABOVE -physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after insert_yield_exec SAME TEXT AS ABOVE +physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]] @@ -308,7 +308,7 @@ physical_plan after OutputRequirements physical_plan after aggregate_statistics SAME TEXT AS ABOVE physical_plan after join_selection SAME TEXT AS ABOVE physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE -physical_plan after FilterPushdown(Pre) SAME TEXT AS ABOVE +physical_plan after FilterPushdown SAME TEXT AS ABOVE physical_plan after EnforceDistribution SAME TEXT AS ABOVE physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE physical_plan after EnforceSorting SAME TEXT AS ABOVE @@ -320,9 +320,9 @@ physical_plan after OutputRequirements 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), 
[(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] physical_plan after LimitAggregation SAME TEXT AS ABOVE physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] -physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after insert_yield_exec SAME TEXT AS ABOVE +physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] physical_plan_with_schema DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N] @@ -351,7 +351,7 @@ physical_plan after OutputRequirements physical_plan after aggregate_statistics SAME TEXT AS ABOVE physical_plan after join_selection SAME TEXT AS ABOVE physical_plan after LimitedDistinctAggregation SAME TEXT AS ABOVE -physical_plan after FilterPushdown(Pre) SAME TEXT AS ABOVE +physical_plan after FilterPushdown SAME TEXT AS ABOVE physical_plan after EnforceDistribution SAME TEXT AS ABOVE physical_plan after CombinePartialFinalAggregate SAME TEXT AS ABOVE physical_plan after EnforceSorting SAME TEXT AS ABOVE @@ -363,9 +363,9 @@ physical_plan after OutputRequirements 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet physical_plan after LimitAggregation SAME TEXT AS ABOVE physical_plan after LimitPushdown DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet -physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after ProjectionPushdown SAME TEXT AS ABOVE physical_plan after insert_yield_exec SAME TEXT AS ABOVE +physical_plan after FilterPushdown(Post) SAME TEXT AS ABOVE physical_plan after SanityCheckPlan SAME TEXT AS ABOVE physical_plan DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet physical_plan_with_stats DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, file_type=parquet, statistics=[Rows=Exact(8), Bytes=Exact(671), [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]] diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index e5b5f5ac878a..f7082c1daaf5 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -280,4 +280,4 @@ physical_plan query TT select val, part from t_pushdown where part = val AND part = 'a'; ---- -a a \ No newline at end of file +a a From 8e88bd9913f0e8b3d04c1a61e8e928d09f271dba Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 16 Jun 2025 20:45:53 -0500 Subject: [PATCH 25/25] clippy --- datafusion/physical-optimizer/src/filter_pushdown.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/physical-optimizer/src/filter_pushdown.rs b/datafusion/physical-optimizer/src/filter_pushdown.rs index 52e4e075ff6b..885280576b4b 100644 --- a/datafusion/physical-optimizer/src/filter_pushdown.rs +++ b/datafusion/physical-optimizer/src/filter_pushdown.rs @@ -391,6 +391,12 @@ impl FilterPushdown { } } +impl Default for FilterPushdown { + fn default() -> Self { + Self::new() + } +} + impl PhysicalOptimizerRule for FilterPushdown { fn optimize( &self,