apache · jayzhan211 · Aug 22, 2024 · Aug 18, 2024 · Aug 18, 2024 · Aug 18, 2024
diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
@@ -385,6 +385,14 @@ impl LogicalPlanBuilder {
             .map(Self::from)
     }
 
+    /// Apply a `HAVING` predicate: normalize it against the current plan, then wrap
+    /// the plan in a `Filter` node created by `Filter::try_new_with_having`.
+    pub fn having(self, expr: impl Into<Expr>) -> Result<Self> {
+        let predicate = normalize_col(expr.into(), &self.plan)?;
+        let filter = Filter::try_new_with_having(predicate, Arc::new(self.plan))?;
+        Ok(Self::from(LogicalPlan::Filter(filter)))
+    }
+
     /// Make a builder for a prepare logical plan from the builder's plan
     pub fn prepare(self, name: String, data_types: Vec<DataType>) -> Result<Self> {
         Ok(Self::from(LogicalPlan::Prepare(Prepare {

diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs
@@ -643,9 +643,12 @@ impl LogicalPlan {
                 // todo it isn't clear why the schema is not recomputed here
                 Ok(LogicalPlan::Values(Values { schema, values }))
             }
-            LogicalPlan::Filter(Filter { predicate, input }) => {
-                Filter::try_new(predicate, input).map(LogicalPlan::Filter)
-            }
+            LogicalPlan::Filter(Filter {
+                predicate,
+                input,
+                having,
+            }) => Filter::try_new_internal(predicate, input, having)
+                .map(LogicalPlan::Filter),
             LogicalPlan::Repartition(_) => Ok(self),
             LogicalPlan::Window(Window {
                 input,
@@ -2081,6 +2084,8 @@ pub struct Filter {
     pub predicate: Expr,
     /// The incoming logical plan
     pub input: Arc<LogicalPlan>,
+    /// The flag to indicate if the filter is a having clause
+    pub having: bool,
 }
 
 impl Filter {
@@ -2089,6 +2094,20 @@ impl Filter {
     /// Notes: as Aliases have no effect on the output of a filter operator,
     /// they are removed from the predicate expression.
     pub fn try_new(predicate: Expr, input: Arc<LogicalPlan>) -> Result<Self> {
+        Self::try_new_internal(predicate, input, false) // ordinary, non-HAVING filter
+    }
+
+    /// Create a new filter operator for a `HAVING` clause.
+    /// Built exactly like [`Filter::try_new`], except that `having` is set to `true`.
+    pub fn try_new_with_having(predicate: Expr, input: Arc<LogicalPlan>) -> Result<Self> {
+        Self::try_new_internal(predicate, input, true)
+    }
+
+    fn try_new_internal(
+        predicate: Expr,
+        input: Arc<LogicalPlan>,
+        having: bool,
+    ) -> Result<Self> {
         // Filter predicates must return a boolean value so we try and validate that here.
         // Note that it is not always possible to resolve the predicate expression during plan
         // construction (such as with correlated subqueries) so we make a best effort here and
@@ -2105,6 +2124,7 @@ impl Filter {
         Ok(Self {
             predicate: predicate.unalias_nested().data,
             input,
+            having,
         })
     }
 

diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs
@@ -87,8 +87,17 @@ impl TreeNode for LogicalPlan {
                     schema,
                 })
             }),
-            LogicalPlan::Filter(Filter { predicate, input }) => rewrite_arc(input, f)?
-                .update_data(|input| LogicalPlan::Filter(Filter { predicate, input })),
+            LogicalPlan::Filter(Filter {
+                predicate,
+                input,
+                having,
+            }) => rewrite_arc(input, f)?.update_data(|input| {
+                LogicalPlan::Filter(Filter {
+                    predicate,
+                    input,
+                    having,
+                })
+            }),
             LogicalPlan::Repartition(Repartition {
                 input,
                 partitioning_scheme,
@@ -561,10 +570,17 @@ impl LogicalPlan {
                     value.into_iter().map_until_stop_and_collect(&mut f)
                 })?
                 .update_data(|values| LogicalPlan::Values(Values { schema, values })),
-            LogicalPlan::Filter(Filter { predicate, input }) => f(predicate)?
-                .update_data(|predicate| {
-                    LogicalPlan::Filter(Filter { predicate, input })
-                }),
+            LogicalPlan::Filter(Filter {
+                predicate,
+                input,
+                having,
+            }) => f(predicate)?.update_data(|predicate| {
+                LogicalPlan::Filter(Filter {
+                    predicate,
+                    input,
+                    having,
+                })
+            }),
             LogicalPlan::Repartition(Repartition {
                 input,
                 partitioning_scheme,

diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
@@ -804,6 +804,15 @@ pub fn find_base_plan(input: &LogicalPlan) -> &LogicalPlan {
     match input {
         LogicalPlan::Window(window) => find_base_plan(&window.input),
         LogicalPlan::Aggregate(agg) => find_base_plan(&agg.input),
+        LogicalPlan::Filter(filter) => {
+            if filter.having {
+                // If a filter is used for a having clause, its input plan is an aggregation.
+                // We should expand the wildcard expression based on the aggregation's input plan.
+                find_base_plan(&filter.input)
+            } else {
+                input
+            }
+        }
         _ => input,
     }
 }

diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs
@@ -160,14 +160,13 @@ fn replace_columns(
 mod tests {
     use arrow::datatypes::{DataType, Field, Schema};
 
+    use crate::test::{assert_analyzed_plan_eq_display_indent, test_table_scan};
+    use crate::Analyzer;
     use datafusion_common::{JoinType, TableReference};
     use datafusion_expr::{
         col, in_subquery, qualified_wildcard, table_scan, wildcard, LogicalPlanBuilder,
     };
 
-    use crate::test::{assert_analyzed_plan_eq_display_indent, test_table_scan};
-    use crate::Analyzer;
-
     use super::*;
 
     fn assert_plan_eq(plan: LogicalPlan, expected: &str) -> Result<()> {

diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs
@@ -16,6 +16,7 @@
 // under the License.
 
 use std::collections::HashSet;
+use std::ops::Deref;
 use std::sync::Arc;
 
 use crate::planner::{
@@ -34,9 +35,7 @@ use datafusion_expr::expr_rewriter::{
     normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_cols,
 };
 use datafusion_expr::logical_plan::tree_node::unwrap_arc;
-use datafusion_expr::utils::{
-    expr_as_column_expr, expr_to_columns, find_aggregate_exprs, find_window_exprs,
-};
+use datafusion_expr::utils::{expr_as_column_expr, expr_to_columns, exprlist_to_fields, find_aggregate_exprs, find_window_exprs};
 use datafusion_expr::{
     qualified_wildcard_with_options, wildcard_with_options, Aggregate, Expr, Filter,
     GroupingSet, LogicalPlan, LogicalPlanBuilder, Partitioning,
@@ -214,7 +213,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
 
         let plan = if let Some(having_expr_post_aggr) = having_expr_post_aggr {
             LogicalPlanBuilder::from(plan)
-                .filter(having_expr_post_aggr)?
+                .having(having_expr_post_aggr)?
                 .build()?
         } else {
             plan
@@ -749,11 +748,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             .map(|expr| rebase_expr(expr, &aggr_projection_exprs, input))
             .collect::<Result<Vec<Expr>>>()?;
 
+        let wildcard_exprs = select_exprs_post_aggr.iter().filter(|expr| matches!(expr, Expr::Wildcard { .. })).collect::<Vec<_>>();
+        let wildcard_fields = exprlist_to_fields(wildcard_exprs, input)?;
+
         // finally, we have some validation that the re-written projection can be resolved
         // from the aggregate output columns
         check_columns_satisfy_exprs(
             &column_exprs_post_aggr,
             &select_exprs_post_aggr,
+            &wildcard_fields,
             "Projection references non-aggregate values",
         )?;
 
@@ -766,9 +769,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             check_columns_satisfy_exprs(
                 &column_exprs_post_aggr,
                 &[having_expr_post_aggr.clone()],
+                &wildcard_fields,
                 "HAVING clause references non-aggregate values",
             )?;
-
             Some(having_expr_post_aggr)
         } else {
             None
@@ -778,6 +781,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
     }
 }
 
+/// True if every expression, ignoring aliases, is a scalar function call or a literal.
+fn check_contain_scalar_only(exprs: &[Expr]) -> bool {
+    exprs.iter().all(|expr| match expr {
+        Expr::ScalarFunction(_) | Expr::Literal(_) => true,
+        Expr::Alias(a) => check_contain_scalar_only(std::slice::from_ref(a.expr.deref())),
+        _ => false,
+    })
+}
+
 // If there are any multiple-defined windows, we raise an error.
 fn check_conflicting_windows(window_defs: &[NamedWindowDefinition]) -> Result<()> {
     for (i, window_def_i) in window_defs.iter().enumerate() {

diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs
@@ -18,19 +18,20 @@
 //! SQL Utility Functions
 
 use std::collections::HashMap;
-
-use arrow_schema::{
-    DataType, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE,
-};
+use std::sync::Arc;
+use arrow_schema::{DataType, Field, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE};
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
 };
 use datafusion_common::{
     exec_err, internal_err, plan_err, Column, DataFusionError, Result, ScalarValue,
+    TableReference,
 };
 use datafusion_expr::builder::get_unnested_columns;
 use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction};
-use datafusion_expr::utils::{expr_as_column_expr, find_column_exprs};
+use datafusion_expr::utils::{
+    expr_as_column_expr, find_column_exprs,
+};
 use datafusion_expr::{expr_vec_fmt, Expr, ExprSchemable, LogicalPlan};
 use sqlparser::ast::{Ident, Value};
 
@@ -90,6 +91,7 @@ pub(crate) fn rebase_expr(
 pub(crate) fn check_columns_satisfy_exprs(
     columns: &[Expr],
     exprs: &[Expr],
+    wildcard_fields: &[(Option<TableReference>, Arc<Field>)],
     message_prefix: &str,
 ) -> Result<()> {
     columns.iter().try_for_each(|c| match c {
@@ -119,9 +121,34 @@ pub(crate) fn check_columns_satisfy_exprs(
             _ => check_column_satisfies_expr(columns, e, message_prefix)?,
         }
     }
+    // Columns hidden behind a wildcard are checked by name: each wildcard field,
+    // rendered by `qualified_name`, must equal the schema name of one of `columns`.
+    let column_names = columns
+        .iter()
+        .map(|c| c.schema_name().to_string())
+        .collect::<Vec<_>>();
+
+    for (table, field) in wildcard_fields {
+        let column_name = qualified_name(table, field.name());
+        if !column_names.contains(&column_name) {
+            return plan_err!(
+                "{}: Wildcard column {} could not be resolved from available columns: {}",
+                message_prefix,
+                column_name,
+                expr_vec_fmt!(columns)
+            );
+        }
+    }
     Ok(())
 }
 
+/// Render a column name as `qualifier.name`, or as bare `name` when unqualified.
+fn qualified_name(qualifier: &Option<TableReference>, name: &str) -> String {
+    qualifier
+        .as_ref()
+        .map_or_else(|| name.to_string(), |q| format!("{}.{}", q, name))
+}
+
 fn check_column_satisfies_expr(
     columns: &[Expr],
     expr: &Expr,

diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -5643,6 +5643,92 @@ select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(N
 ----
 0 NULL NULL NULL NULL NULL NULL NULL
 
+statement ok
+create table having_test(v1 int, v2 int)
+
+statement ok
+create table join_table(v1 int, v2 int)
+
+statement ok
+insert into having_test values (1, 2), (2, 3), (3, 4)
+
+statement ok
+insert into join_table values (1, 2), (2, 3), (3, 4)
+
+
+query II
+select * from having_test group by v1, v2 having max(v1) = 3
+----
+3 4
+
+query TT
+EXPLAIN select * from having_test group by v1, v2 having max(v1) = 3
+----
+logical_plan
+01)Projection: having_test.v1, having_test.v2
+02)--Filter: max(having_test.v1) = Int32(3)
+03)----Aggregate: groupBy=[[having_test.v1, having_test.v2]], aggr=[[max(having_test.v1)]]
+04)------TableScan: having_test projection=[v1, v2]
+physical_plan
+01)ProjectionExec: expr=[v1@0 as v1, v2@1 as v2]
+02)--CoalesceBatchesExec: target_batch_size=8192
+03)----FilterExec: max(having_test.v1)@2 = 3
+04)------AggregateExec: mode=FinalPartitioned, gby=[v1@0 as v1, v2@1 as v2], aggr=[max(having_test.v1)]
+05)--------CoalesceBatchesExec: target_batch_size=8192
+06)----------RepartitionExec: partitioning=Hash([v1@0, v2@1], 4), input_partitions=4
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------AggregateExec: mode=Partial, gby=[v1@0 as v1, v2@1 as v2], aggr=[max(having_test.v1)]
+09)----------------MemoryExec: partitions=1, partition_sizes=[1]
+
+
+query error DataFusion error: Error during planning: Projection references non-aggregate values: Wildcard column having_test\.v1 could not be resolved from available columns: max\(having_test\.v1\)
+select * from having_test having max(v1) = 3
+
+query I
+select max(v1) from having_test having max(v1) = 3
+----
+3
+
+query I
+select max(v1), * exclude (v1, v2) from having_test having max(v1) = 3
+----
+3
+
+# v1 and v2 are both in the group by clause, so the wildcard can be resolved
+query III
+select max(v1), * replace ('v1' as v3) from having_test group by v1, v2 having max(v1) = 3
+----
+3 3 4
+
+query III
+select max(v1), t.* from having_test t group by v1, v2 having max(v1) = 3
+----
+3 3 4
+
+query error DataFusion error: Error during planning: Projection references non-aggregate values: Wildcard column j\.v1 could not be resolved from available columns: t\.v1, t\.v2, max\(t\.v1\)
+select max(t.v1), j.* from having_test t join join_table j on t.v1 = j.v1 group by t.v1, t.v2 having max(t.v1) = 3
+
+query III
+select max(t.v1), j.* from having_test t join join_table j on t.v1 = j.v1 group by j.v1, j.v2 having max(t.v1) = 3
+----
+3 3 4
+
+# If the select items only contain scalar expressions, the having clause is valid.
+query P
+select now() from having_test having max(v1) = 4
+----
+
+# If the select items only contain scalar expressions, the having clause is valid.
+query I
+select 0 from having_test having max(v1) = 4
+----
+
+query error DataFusion error: Error during planning: Projection references non-aggregate values: Wildcard column having_test\.v2 could not be resolved from available columns: having_test\.v1, max\(having_test\.v1\)
+select * from having_test group by v1 having max(v1) = 3
+
+statement ok
+drop table having_test
+
 # test min/max Float16 without group expression
 query RRTT
 WITH data AS (