apache · jayzhan211 · Aug 22, 2024 · Aug 18, 2024 · Aug 18, 2024 · Aug 18, 2024
diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
@@ -380,6 +380,14 @@ impl LogicalPlanBuilder {
             .map(Self::from)
     }
 
+    /// Apply a filter for a HAVING clause: runs `expr` through `normalize_col` against the current plan, then wraps the plan in a `Filter` built by `Filter::try_new_having`
+    pub fn having(self, expr: impl Into<Expr>) -> Result<Self> {
+        let expr = normalize_col(expr.into(), &self.plan)?;
+        Filter::try_new_having(expr, Arc::new(self.plan))
+            .map(LogicalPlan::Filter)
+            .map(Self::from)
+    }
+
     /// Make a builder for a prepare logical plan from the builder's plan
     pub fn prepare(self, name: String, data_types: Vec<DataType>) -> Result<Self> {
         Ok(Self::from(LogicalPlan::Prepare(Prepare {

diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs
@@ -643,9 +643,11 @@ impl LogicalPlan {
                 // todo it isn't clear why the schema is not recomputed here
                 Ok(LogicalPlan::Values(Values { schema, values }))
             }
-            LogicalPlan::Filter(Filter { predicate, input }) => {
-                Filter::try_new(predicate, input).map(LogicalPlan::Filter)
-            }
+            LogicalPlan::Filter(Filter {
+                predicate, input, having,
+            }) => {
+                Filter::try_new_internal(predicate, input, having).map(LogicalPlan::Filter)
+            },
             LogicalPlan::Repartition(_) => Ok(self),
             LogicalPlan::Window(Window {
                 input,
@@ -2081,6 +2083,8 @@ pub struct Filter {
     pub predicate: Expr,
     /// The incoming logical plan
     pub input: Arc<LogicalPlan>,
+    /// The flag to indicate if the filter is a having clause
+    pub having: bool,
 }
 
 impl Filter {
@@ -2089,6 +2093,20 @@ impl Filter {
     /// Notes: as Aliases have no effect on the output of a filter operator,
     /// they are removed from the predicate expression.
     pub fn try_new(predicate: Expr, input: Arc<LogicalPlan>) -> Result<Self> {
+        Self::try_new_internal(predicate, input, false)
+    }
+
+    /// Create a new filter operator for a HAVING clause.
+    /// Identical to [`Self::try_new`] except that the `having` flag is set to `true`.
+    pub fn try_new_having(predicate: Expr, input: Arc<LogicalPlan>) -> Result<Self> {
+        Self::try_new_internal(predicate, input, true)
+    }
+
+    fn try_new_internal(
+        predicate: Expr,
+        input: Arc<LogicalPlan>,
+        having: bool,
+    ) -> Result<Self> {
         // Filter predicates must return a boolean value so we try and validate that here.
         // Note that it is not always possible to resolve the predicate expression during plan
         // construction (such as with correlated subqueries) so we make a best effort here and
@@ -2105,6 +2123,7 @@ impl Filter {
         Ok(Self {
             predicate: predicate.unalias_nested().data,
             input,
+            having,
         })
     }
 

diff --git a/datafusion/expr/src/logical_plan/tree_node.rs b/datafusion/expr/src/logical_plan/tree_node.rs
@@ -87,8 +87,17 @@ impl TreeNode for LogicalPlan {
                     schema,
                 })
             }),
-            LogicalPlan::Filter(Filter { predicate, input }) => rewrite_arc(input, f)?
-                .update_data(|input| LogicalPlan::Filter(Filter { predicate, input })),
+            LogicalPlan::Filter(Filter {
+                predicate,
+                input,
+                having,
+            }) => rewrite_arc(input, f)?.update_data(|input| {
+                LogicalPlan::Filter(Filter {
+                    predicate,
+                    input,
+                    having,
+                })
+            }),
             LogicalPlan::Repartition(Repartition {
                 input,
                 partitioning_scheme,
@@ -561,10 +570,17 @@ impl LogicalPlan {
                     value.into_iter().map_until_stop_and_collect(&mut f)
                 })?
                 .update_data(|values| LogicalPlan::Values(Values { schema, values })),
-            LogicalPlan::Filter(Filter { predicate, input }) => f(predicate)?
-                .update_data(|predicate| {
-                    LogicalPlan::Filter(Filter { predicate, input })
-                }),
+            LogicalPlan::Filter(Filter {
+                predicate,
+                input,
+                having,
+            }) => f(predicate)?.update_data(|predicate| {
+                LogicalPlan::Filter(Filter {
+                    predicate,
+                    input,
+                    having,
+                })
+            }),
             LogicalPlan::Repartition(Repartition {
                 input,
                 partitioning_scheme,

diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
@@ -804,6 +804,15 @@ pub fn find_base_plan(input: &LogicalPlan) -> &LogicalPlan {
     match input {
         LogicalPlan::Window(window) => find_base_plan(&window.input),
         LogicalPlan::Aggregate(agg) => find_base_plan(&agg.input),
+        LogicalPlan::Filter(filter) => {
+            if filter.having {
+                // If a filter is used for a having clause, its input plan is an aggregation.
+                // We should expand the wildcard expression based on the aggregation's input plan.
+                find_base_plan(&filter.input)
+            } else {
+                input
+            }
+        }
         _ => input,
     }
 }

diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs
@@ -160,13 +160,14 @@ fn replace_columns(
 mod tests {
     use arrow::datatypes::{DataType, Field, Schema};
 
+    use crate::test::{assert_analyzed_plan_eq_display_indent, test_table_scan};
+    use crate::Analyzer;
     use datafusion_common::{JoinType, TableReference};
     use datafusion_expr::{
-        col, in_subquery, qualified_wildcard, table_scan, wildcard, LogicalPlanBuilder,
+        col, ident, in_subquery, lit, qualified_wildcard, table_scan, wildcard,
+        LogicalPlanBuilder,
     };
-
-    use crate::test::{assert_analyzed_plan_eq_display_indent, test_table_scan};
-    use crate::Analyzer;
+    use datafusion_functions_aggregate::expr_fn::max;
 
     use super::*;
 
@@ -301,4 +302,32 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn plan_having_wildcard_projection() -> Result<()> {
+        let aggregate =
+            table_scan(Some("t1"), &employee_schema(), Some(vec![0, 1, 2, 3, 4]))?
+                .aggregate(
+                    vec![
+                        col("t1.id"),
+                        col("t1.first_name"),
+                        col("t1.last_name"),
+                        col("t1.state"),
+                        col("t1.salary"),
+                    ],
+                    vec![max(col("t1.salary"))],
+                )?
+                .build()?;
+        let plan = LogicalPlanBuilder::from(aggregate)
+            .having(ident("max(t1.salary)").gt(lit(100)))?
+            .project(vec![wildcard()])?
+            .build()?;
+
+        let expected = "Projection: t1.id, t1.first_name, t1.last_name, t1.state, t1.salary [id:Int32, first_name:Utf8, last_name:Utf8, state:Utf8, salary:Int32]\
+        \n  Filter: max(t1.salary) > Int32(100) [id:Int32, first_name:Utf8, last_name:Utf8, state:Utf8, salary:Int32, max(t1.salary):Int32;N]\
+        \n    Aggregate: groupBy=[[t1.id, t1.first_name, t1.last_name, t1.state, t1.salary]], aggr=[[max(t1.salary)]] [id:Int32, first_name:Utf8, last_name:Utf8, state:Utf8, salary:Int32, max(t1.salary):Int32;N]\
+        \n      TableScan: t1 projection=[id, first_name, last_name, state, salary] [id:Int32, first_name:Utf8, last_name:Utf8, state:Utf8, salary:Int32]";
+
+        assert_plan_eq(plan, expected)
+    }
 }
diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs
@@ -35,7 +35,8 @@ use datafusion_expr::expr_rewriter::{
 };
 use datafusion_expr::logical_plan::tree_node::unwrap_arc;
 use datafusion_expr::utils::{
-    expr_as_column_expr, expr_to_columns, find_aggregate_exprs, find_window_exprs,
+    expand_qualified_wildcard, expand_wildcard, expr_as_column_expr, expr_to_columns,
+    find_aggregate_exprs, find_window_exprs,
 };
 use datafusion_expr::{
     qualified_wildcard_with_options, wildcard_with_options, Aggregate, Expr, Filter,
@@ -214,7 +215,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
 
         let plan = if let Some(having_expr_post_aggr) = having_expr_post_aggr {
             LogicalPlanBuilder::from(plan)
-                .filter(having_expr_post_aggr)?
+                .having(having_expr_post_aggr)?
                 .build()?
         } else {
             plan
@@ -749,6 +750,37 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             .map(|expr| rebase_expr(expr, &aggr_projection_exprs, input))
             .collect::<Result<Vec<Expr>>>()?;
 
+        // If a HAVING expression is present but there are no GROUP BY expressions, a
+        // wildcard in the projection makes the query invalid. Expand the wildcard here
+        // so the validation below names the concrete column in its error message.
+        let select_exprs_post_aggr = if having_expr_opt.is_some()
+            && group_by_exprs.is_empty()
+        {
+            select_exprs_post_aggr
+                .into_iter()
+                .map(|expr| {
+                    if let Expr::Wildcard { qualifier, options } = expr {
+                        if let Some(qualifier) = qualifier {
+                            Ok::<_, DataFusionError>(expand_qualified_wildcard(
+                                &qualifier,
+                                input.schema(),
+                                Some(&options),
+                            )?)
+                        } else {
+                            Ok(expand_wildcard(input.schema(), input, Some(&options))?)
+                        }
+                    } else {
+                        Ok(vec![expr])
+                    }
+                })
+                .collect::<Result<Vec<_>>>()?
+                .into_iter()
+                .flatten()
+                .collect()
+        } else {
+            select_exprs_post_aggr
+        };
+
         // finally, we have some validation that the re-written projection can be resolved
         // from the aggregate output columns
         check_columns_satisfy_exprs(

diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -5643,3 +5643,24 @@ query I??III?T
 select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ',');
 ----
 0 NULL NULL NULL NULL NULL NULL NULL
+
+statement ok
+create table having_test(v1 int, v2 int)
+
+statement ok
+insert into having_test values (1, 2), (2, 3), (3, 4)
+
+query II
+select * from having_test group by v1, v2 having max(v1) = 3
+----
+3 4
+
+query error DataFusion error: Error during planning: Projection references non-aggregate values: Expression having_test\.v1 could not be resolved from available columns: max\(having_test\.v1\)
+select * from having_test having max(v1) = 3
+
+# Invalid SQL: the wildcard expands to v2, which is not in the GROUP BY clause
+query error
+select * from having_test group by v1 having max(v1) = 3
+
+statement ok
+drop table having_test