Set the default value of datafusion.execution.collect_statistics to true #16447

Merged · 3 commits · Jun 19, 2025

4 changes: 2 additions & 2 deletions datafusion/common/src/config.rs
@@ -294,8 +294,8 @@ config_namespace! {

/// Should DataFusion collect statistics when first creating a table.
/// Has no effect after the table is created. Applies to the default
/// `ListingTableProvider` in DataFusion. Defaults to false.
pub collect_statistics: bool, default = false
/// `ListingTableProvider` in DataFusion. Defaults to true.
pub collect_statistics: bool, default = true

/// Number of partitions for query execution. Increasing partitions can increase
/// concurrency.
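
For readers upgrading, here is a minimal sketch (not part of this diff) of opting back out of the new default from Rust. The `options_mut()` route is the same one the tests in this PR use; the function name is illustrative only.

```rust
use datafusion::prelude::*;

/// Build a `SessionConfig` that restores the pre-change behavior.
fn config_without_statistics() -> SessionConfig {
    let mut config = SessionConfig::new();
    // Equivalent to `SET datafusion.execution.collect_statistics = false;`.
    // Apply it before registering tables, since the option has no effect
    // after a table is created.
    config.options_mut().execution.collect_statistics = false;
    config
}
```
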
27 changes: 13 additions & 14 deletions datafusion/core/src/execution/context/parquet.rs
@@ -34,13 +34,12 @@ impl SessionContext {
///
/// # Note: Statistics
///
/// NOTE: by default, statistics are not collected when reading the Parquet
/// files as this can slow down the initial DataFrame creation. However,
/// collecting statistics can greatly accelerate queries with certain
/// filters.
/// NOTE: by default, statistics are collected when reading the Parquet
/// files. This can slow down the initial DataFrame creation while
/// greatly accelerating queries with certain filters.
///
/// To enable collect statistics, set the [config option]
/// `datafusion.execution.collect_statistics` to `true`. See
/// To disable statistics collection, set the [config option]
/// `datafusion.execution.collect_statistics` to `false`. See
/// [`ConfigOptions`] and [`ExecutionOptions::collect_statistics`] for more
/// details.
///
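
To make the updated doc comment concrete, a hedged sketch of reading Parquet with statistics collection turned off; the snippet is not taken from the PR and the file path is a placeholder.

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

/// Read a Parquet file without collecting statistics up front.
async fn read_without_statistics() -> Result<DataFrame> {
    let mut config = SessionConfig::new();
    config.options_mut().execution.collect_statistics = false;
    let ctx = SessionContext::new_with_config(config);
    // DataFrame creation is faster without statistics, at the cost of
    // weaker pruning for filter-heavy queries.
    ctx.read_parquet("data.parquet", ParquetReadOptions::default())
        .await
}
```
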
@@ -171,28 +170,28 @@ mod tests {

#[tokio::test]
async fn register_parquet_respects_collect_statistics_config() -> Result<()> {
// The default is false
// The default is true
let mut config = SessionConfig::new();
config.options_mut().explain.physical_plan_only = true;
config.options_mut().explain.show_statistics = true;
let content = explain_query_all_with_config(config).await?;
assert_contains!(content, "statistics=[Rows=Absent,");
assert_contains!(content, "statistics=[Rows=Exact(");

// Explicitly set to false
// Explicitly set to true
let mut config = SessionConfig::new();
config.options_mut().explain.physical_plan_only = true;
config.options_mut().explain.show_statistics = true;
config.options_mut().execution.collect_statistics = false;
config.options_mut().execution.collect_statistics = true;
let content = explain_query_all_with_config(config).await?;
assert_contains!(content, "statistics=[Rows=Absent,");
assert_contains!(content, "statistics=[Rows=Exact(");

// Explicitly set to true
// Explicitly set to false
let mut config = SessionConfig::new();
config.options_mut().explain.physical_plan_only = true;
config.options_mut().explain.show_statistics = true;
config.options_mut().execution.collect_statistics = true;
config.options_mut().execution.collect_statistics = false;
let content = explain_query_all_with_config(config).await?;
assert_contains!(content, "statistics=[Rows=Exact(10),");
assert_contains!(content, "statistics=[Rows=Absent,");

Ok(())
}
4 changes: 2 additions & 2 deletions datafusion/core/tests/parquet/row_group_pruning.rs
@@ -421,7 +421,7 @@ macro_rules! int_tests {
.with_query(&format!("SELECT * FROM t where i{} in (100)", $bits))
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(0))
.with_pruned_by_stats(Some(4))
.with_pruned_by_stats(Some(0))
.with_matched_by_bloom_filter(Some(0))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(0)
@@ -1316,7 +1316,7 @@ async fn test_row_group_with_null_values() {
.with_query("SELECT * FROM t WHERE \"i32\" > 7")
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(0))
.with_pruned_by_stats(Some(3))
.with_pruned_by_stats(Some(0))
.with_expected_rows(0)
.with_matched_by_bloom_filter(Some(0))
.with_pruned_by_bloom_filter(Some(0))
383 changes: 156 additions & 227 deletions datafusion/sqllogictest/test_files/explain_tree.slt

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/information_schema.slt
@@ -216,7 +216,7 @@ datafusion.catalog.location NULL
datafusion.catalog.newlines_in_values false
datafusion.execution.batch_size 8192
datafusion.execution.coalesce_batches true
datafusion.execution.collect_statistics false
datafusion.execution.collect_statistics true
datafusion.execution.enable_recursive_ctes true
datafusion.execution.enforce_batch_size_in_joins false
datafusion.execution.keep_partition_by_columns false
@@ -328,7 +328,7 @@ datafusion.catalog.location NULL Location scanned to load tables for `default` s
datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance.
datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption
datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting
datafusion.execution.collect_statistics false Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to false.
datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true.
datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs
datafusion.execution.enforce_batch_size_in_joins false Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower.
datafusion.execution.keep_partition_by_columns false Should DataFusion keep the columns used for partition_by in the output RecordBatches
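
As a usage note, the new default can be confirmed at runtime by querying the same `information_schema.df_settings` view this test file exercises; the Rust wrapper below is an illustrative sketch, not code from this PR.

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

/// Print the current value of `datafusion.execution.collect_statistics`.
async fn show_collect_statistics_setting() -> Result<()> {
    // The information schema must be enabled to expose `df_settings`.
    let config = SessionConfig::new().with_information_schema(true);
    let ctx = SessionContext::new_with_config(config);
    ctx.sql(
        "SELECT name, value FROM information_schema.df_settings \
         WHERE name = 'datafusion.execution.collect_statistics'",
    )
    .await?
    .show() // expected to print `true` after this change
    .await
}
```
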
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/limit.slt
@@ -854,7 +854,7 @@ physical_plan
01)ProjectionExec: expr=[1 as foo]
02)--SortPreservingMergeExec: [part_key@0 ASC NULLS LAST], fetch=1
03)----SortExec: TopK(fetch=1), expr=[part_key@0 ASC NULLS LAST], preserve_partitioning=[true]
04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet:0..794], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..794]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ]
04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet]]}, projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr [ true ]

query I
with selection as (
11 changes: 4 additions & 7 deletions datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt
@@ -212,10 +212,9 @@ logical_plan
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: val@0 != part@1
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3
04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet
03)----DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet

# If we reference only a partition column it gets evaluted during the listing phase
# If we reference only a partition column it gets evaluated during the listing phase
query TT
EXPLAIN select * from t_pushdown where part != 'a';
----
@@ -257,8 +256,7 @@ logical_plan
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: val@0 = part@1
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet

query TT
select val, part from t_pushdown where part = 'a' AND part = val;
@@ -274,8 +272,7 @@ logical_plan
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: val@0 = part@1
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]}, projection=[val, part], file_type=parquet

query TT
select val, part from t_pushdown where part = val AND part = 'a';
12 changes: 6 additions & 6 deletions datafusion/sqllogictest/test_files/parquet_statistics.slt
@@ -46,7 +46,7 @@ statement ok
set datafusion.explain.show_statistics = true;

######
# By default, the statistics are not gathered
# By default, the statistics are gathered
######

# Recreate the table to pick up the current setting
@@ -59,18 +59,18 @@ query TT
EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
----
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
02)--FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]]
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]]
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
05), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
05), statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]

# cleanup
statement ok
DROP TABLE test_table;

######
# When the setting is true, the statistics are gathered
# When the setting is true, statistics are gathered
Contributor:

Note to other reviewers, the negative is still tested below:

When the setting is false, the statistics are NOT gathered

######

statement ok
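
For completeness, a hedged Rust analogue of the `SET` plus recreate-the-table flow this test file follows; the table name and location are placeholders, and the helper is illustrative rather than part of the PR.

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

/// Flip the setting and re-register the table so it takes effect,
/// mirroring the "recreate the table to pick up the current setting"
/// pattern used in this test file.
async fn recreate_table_without_statistics(ctx: &SessionContext) -> Result<()> {
    ctx.sql("SET datafusion.execution.collect_statistics = false")
        .await?
        .collect()
        .await?;
    ctx.sql("DROP TABLE IF EXISTS test_table").await?.collect().await?;
    // `collect_statistics` only applies at creation time, so the CREATE
    // must run after the SET. The LOCATION is a placeholder path.
    ctx.sql(
        "CREATE EXTERNAL TABLE test_table \
         STORED AS PARQUET LOCATION 'test_table/'",
    )
    .await?
    .collect()
    .await?;
    Ok(())
}
```
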
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/repartition.slt
@@ -46,8 +46,8 @@ physical_plan
01)AggregateExec: mode=FinalPartitioned, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
02)--CoalesceBatchesExec: target_batch_size=8192
03)----RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=4
04)------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
05)--------AggregateExec: mode=Partial, gby=[column1@0 as column1], aggr=[sum(parquet_table.column2)]
06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]}, projection=[column1, column2], file_type=parquet

# disable round robin repartitioning