Skip to content

Commit eb5af45

Browse files
mihailotim-dbcloud-fan
authored andcommitted
[SPARK-52672][SQL] Don't replace Sort/Having expressions with aliases if expression exists in Aggregate
### What changes were proposed in this pull request? Don't replace Sort/Having expressions with aliases that have semantically equal child if expression already exists in Aggregate ### Why are the changes needed? Current implementation sometimes replaces the Sort/Having expressions with aliases even though the replacement is not actually needed. For example for a query like: ``` SELECT col1 AS a, col1 FROM VALUES(1) GROUP BY col1 ORDER BY col1 ASC; ``` Previous implementation will replace `col1` in `SortOrder` with `a`, even though `col1` is present in output ``` Sort [a#25579 ASC NULLS FIRST], true +- Aggregate [col1#25578], [col1#25578 AS a#25579, col1#25578] +- LocalRelation [col1#25578] ``` Instead, we can just preserve `col1`. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added new golden file tests. ### Was this patch authored or co-authored using generative AI tooling? No Closes #51359 from mihailotim-db/mihailotim-db/fix_realiasing_missing_expressions. Authored-by: Mihailo Timotic <mihailo.timotic@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent ff93cbb commit eb5af45

File tree

4 files changed

+423
-6
lines changed

4 files changed

+423
-6
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2939,12 +2939,14 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
29392939
// Avoid adding an extra aggregate expression if it's already present in
29402940
// `agg.aggregateExpressions`. Trim inner aliases from aggregate expressions because of
29412941
// expressions like `spark_grouping_id` that can have inner aliases.
2942-
val index = agg.aggregateExpressions.indexWhere {
2943-
case Alias(child, _) => trimAliases(child) semanticEquals expr
2944-
case other => other semanticEquals expr
2945-
}
2946-
if (index >= 0) {
2947-
agg.aggregateExpressions(index).toAttribute
2942+
val replacement: Option[NamedExpression] =
2943+
agg.aggregateExpressions.foldLeft(Option.empty[NamedExpression]) {
2944+
case (None, alias: Alias) if expr.semanticEquals(trimAliases(alias.child)) => Some(alias)
2945+
case (None | Some(_: Alias), aggExpr) if expr.semanticEquals(aggExpr) => Some(aggExpr)
2946+
case (current, _) => current
2947+
}
2948+
if (replacement.isDefined) {
2949+
replacement.get.toAttribute
29482950
} else {
29492951
expr match {
29502952
case ae: AggregateExpression =>
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
-- Automatically generated by SQLQueryTestSuite
2+
-- !query
3+
SELECT col1 + 1 AS a FROM VALUES(1) GROUP BY a ORDER BY col1 + 1
4+
-- !query analysis
5+
Sort [a#x ASC NULLS FIRST], true
6+
+- Aggregate [(col1#x + 1)], [(col1#x + 1) AS a#x]
7+
+- LocalRelation [col1#x]
8+
9+
10+
-- !query
11+
SELECT col1 + 1 AS a, a AS b FROM VALUES(1) GROUP BY a ORDER BY col1 + 1
12+
-- !query analysis
13+
Sort [a#x ASC NULLS FIRST], true
14+
+- Project [a#x, a#x AS b#x]
15+
+- Project [(col1 + 1)#x, (col1 + 1)#x AS a#x]
16+
+- Aggregate [(col1#x + 1)], [(col1#x + 1) AS (col1 + 1)#x]
17+
+- LocalRelation [col1#x]
18+
19+
20+
-- !query
21+
SELECT col1 + 1 AS a FROM VALUES(1) GROUP BY a HAVING col1 + 1 > 0
22+
-- !query analysis
23+
Filter (a#x > 0)
24+
+- Aggregate [(col1#x + 1)], [(col1#x + 1) AS a#x]
25+
+- LocalRelation [col1#x]
26+
27+
28+
-- !query
29+
SELECT col1 + 1 AS a, a AS b FROM VALUES(1) GROUP BY a HAVING col1 + 1 > 0
30+
-- !query analysis
31+
Filter (a#x > 0)
32+
+- Project [a#x, a#x AS b#x]
33+
+- Project [(col1 + 1)#x, (col1 + 1)#x AS a#x]
34+
+- Aggregate [(col1#x + 1)], [(col1#x + 1) AS (col1 + 1)#x]
35+
+- LocalRelation [col1#x]
36+
37+
38+
-- !query
39+
SELECT col1, col2, GROUPING(col1) FROM VALUES("abc", 1) GROUP BY CUBE(col1, col2) ORDER BY GROUPING(col1)
40+
-- !query analysis
41+
Sort [grouping(col1)#x ASC NULLS FIRST], true
42+
+- Aggregate [col1#x, col2#x, spark_grouping_id#xL], [col1#x, col2#x, cast((shiftright(spark_grouping_id#xL, 1) & 1) as tinyint) AS grouping(col1)#x]
43+
+- Expand [[col1#x, col2#x, col1#x, col2#x, 0], [col1#x, col2#x, col1#x, null, 1], [col1#x, col2#x, null, col2#x, 2], [col1#x, col2#x, null, null, 3]], [col1#x, col2#x, col1#x, col2#x, spark_grouping_id#xL]
44+
+- Project [col1#x, col2#x, col1#x AS col1#x, col2#x AS col2#x]
45+
+- LocalRelation [col1#x, col2#x]
46+
47+
48+
-- !query
49+
SELECT col1, col2, GROUPING(col1) FROM VALUES("abc", 1) GROUP BY CUBE(col1, col2) HAVING GROUPING(col1) != NULL
50+
-- !query analysis
51+
Filter NOT (grouping(col1)#x = cast(null as tinyint))
52+
+- Aggregate [col1#x, col2#x, spark_grouping_id#xL], [col1#x, col2#x, cast((shiftright(spark_grouping_id#xL, 1) & 1) as tinyint) AS grouping(col1)#x]
53+
+- Expand [[col1#x, col2#x, col1#x, col2#x, 0], [col1#x, col2#x, col1#x, null, 1], [col1#x, col2#x, null, col2#x, 2], [col1#x, col2#x, null, null, 3]], [col1#x, col2#x, col1#x, col2#x, spark_grouping_id#xL]
54+
+- Project [col1#x, col2#x, col1#x AS col1#x, col2#x AS col2#x]
55+
+- LocalRelation [col1#x, col2#x]
56+
57+
58+
-- !query
59+
SELECT make_date(col1, col2, col3) AS a FROM VALUES(1,2,3) GROUP BY make_date(col1, col2, col3) ORDER BY make_date(col1, col2, col3)
60+
-- !query analysis
61+
Sort [a#x ASC NULLS FIRST], true
62+
+- Aggregate [make_date(col1#x, col2#x, col3#x, true)], [make_date(col1#x, col2#x, col3#x, true) AS a#x]
63+
+- LocalRelation [col1#x, col2#x, col3#x]
64+
65+
66+
-- !query
67+
SELECT make_date(col1, col2, col3) AS a, a AS b FROM VALUES(1,2,3) GROUP BY make_date(col1, col2, col3) ORDER BY make_date(col1, col2, col3)
68+
-- !query analysis
69+
Sort [a#x ASC NULLS FIRST], true
70+
+- Project [a#x, a#x AS b#x]
71+
+- Project [make_date(col1, col2, col3)#x, make_date(col1, col2, col3)#x AS a#x]
72+
+- Aggregate [make_date(col1#x, col2#x, col3#x, true)], [make_date(col1#x, col2#x, col3#x, true) AS make_date(col1, col2, col3)#x]
73+
+- LocalRelation [col1#x, col2#x, col3#x]
74+
75+
76+
-- !query
77+
SELECT make_date(col1, col2, col3) AS a FROM VALUES(1,2,3) GROUP BY make_date(col1, col2, col3) HAVING make_date(col1, col2, col3) > '2025-01-01'
78+
-- !query analysis
79+
Filter (a#x > cast(2025-01-01 as date))
80+
+- Aggregate [make_date(col1#x, col2#x, col3#x, true)], [make_date(col1#x, col2#x, col3#x, true) AS a#x]
81+
+- LocalRelation [col1#x, col2#x, col3#x]
82+
83+
84+
-- !query
85+
SELECT make_date(col1, col2, col3) AS a, a AS b FROM VALUES(1,2,3) GROUP BY make_date(col1, col2, col3) HAVING make_date(col1, col2, col3) > '2025-01-01'
86+
-- !query analysis
87+
Filter (a#x > cast(2025-01-01 as date))
88+
+- Project [a#x, a#x AS b#x]
89+
+- Project [make_date(col1, col2, col3)#x, make_date(col1, col2, col3)#x AS a#x]
90+
+- Aggregate [make_date(col1#x, col2#x, col3#x, true)], [make_date(col1#x, col2#x, col3#x, true) AS make_date(col1, col2, col3)#x]
91+
+- LocalRelation [col1#x, col2#x, col3#x]
92+
93+
94+
-- !query
95+
SELECT make_date(col1, col2, col3) AS a FROM VALUES(1,2,3) ORDER BY make_date(col1, col2, col3)
96+
-- !query analysis
97+
Project [a#x]
98+
+- Sort [make_date(col1#x, col2#x, col3#x, true) ASC NULLS FIRST], true
99+
+- Project [make_date(col1#x, col2#x, col3#x, true) AS a#x, col1#x, col2#x, col3#x]
100+
+- LocalRelation [col1#x, col2#x, col3#x]
101+
102+
103+
-- !query
104+
SELECT make_date(col1, col2, col3) AS a, a AS b FROM VALUES(1,2,3) ORDER BY make_date(col1, col2, col3)
105+
-- !query analysis
106+
Project [a#x, b#x]
107+
+- Sort [make_date(col1#x, col2#x, col3#x, true) ASC NULLS FIRST], true
108+
+- Project [a#x, a#x AS b#x, col1#x, col2#x, col3#x]
109+
+- Project [col1#x, col2#x, col3#x, make_date(col1#x, col2#x, col3#x, true) AS a#x]
110+
+- LocalRelation [col1#x, col2#x, col3#x]
111+
112+
113+
-- !query
114+
SELECT col1, col1 AS a FROM VALUES(1) GROUP BY col1 ORDER BY col1 ASC
115+
-- !query analysis
116+
Sort [col1#x ASC NULLS FIRST], true
117+
+- Aggregate [col1#x], [col1#x, col1#x AS a#x]
118+
+- LocalRelation [col1#x]
119+
120+
121+
-- !query
122+
SELECT col1 AS a, col1 FROM VALUES(1) GROUP BY col1 ORDER BY col1 ASC
123+
-- !query analysis
124+
Sort [col1#x ASC NULLS FIRST], true
125+
+- Aggregate [col1#x], [col1#x AS a#x, col1#x]
126+
+- LocalRelation [col1#x]
127+
128+
129+
-- !query
130+
SELECT col1, col1 AS a FROM VALUES(1) GROUP BY col1 HAVING col1 > 0
131+
-- !query analysis
132+
Filter (col1#x > 0)
133+
+- Aggregate [col1#x], [col1#x, col1#x AS a#x]
134+
+- LocalRelation [col1#x]
135+
136+
137+
-- !query
138+
SELECT col1 AS a, col1 FROM VALUES(1) GROUP BY col1 HAVING col1 > 0
139+
-- !query analysis
140+
Filter (col1#x > 0)
141+
+- Aggregate [col1#x], [col1#x AS a#x, col1#x]
142+
+- LocalRelation [col1#x]
143+
144+
145+
-- !query
146+
SELECT col2 AS b, col2 FROM VALUES(1,2) GROUP BY 1,2 ORDER BY ALL
147+
-- !query analysis
148+
Sort [b#x ASC NULLS FIRST, col2#x ASC NULLS FIRST], true
149+
+- Aggregate [col2#x, col2#x], [col2#x AS b#x, col2#x]
150+
+- LocalRelation [col1#x, col2#x]
151+
152+
153+
-- !query
154+
SELECT col2 AS b, col2 FROM VALUES(1,2) GROUP BY 1,2 HAVING col2 > 0 ORDER BY ALL
155+
-- !query analysis
156+
Sort [b#x ASC NULLS FIRST, col2#x ASC NULLS FIRST], true
157+
+- Filter (col2#x > 0)
158+
+- Aggregate [col2#x, col2#x], [col2#x AS b#x, col2#x]
159+
+- LocalRelation [col1#x, col2#x]
160+
161+
162+
-- !query
163+
SELECT col2 AS b, col2, b as c FROM VALUES(1,2) GROUP BY 1,2 ORDER BY ALL
164+
-- !query analysis
165+
Sort [b#x ASC NULLS FIRST, col2#x ASC NULLS FIRST, c#x ASC NULLS FIRST], true
166+
+- Project [b#x, col2#x, b#x AS c#x]
167+
+- Project [col2#x, col2#x AS b#x]
168+
+- Aggregate [col2#x, col2#x], [col2#x]
169+
+- LocalRelation [col1#x, col2#x]
170+
171+
172+
-- !query
173+
SELECT col2 AS b, col2, b as c FROM VALUES(1,2) GROUP BY 1,2 HAVING col2 > 0 ORDER BY ALL
174+
-- !query analysis
175+
Sort [b#x ASC NULLS FIRST, col2#x ASC NULLS FIRST, c#x ASC NULLS FIRST], true
176+
+- Filter (col2#x > 0)
177+
+- Project [b#x, col2#x, b#x AS c#x]
178+
+- Project [col2#x, col2#x AS b#x]
179+
+- Aggregate [col2#x, col2#x], [col2#x]
180+
+- LocalRelation [col1#x, col2#x]
181+
182+
183+
-- !query
184+
SELECT col1 AS a FROM VALUES(1,2) GROUP BY col1, col2 HAVING col2 > 1 ORDER BY col1
185+
-- !query analysis
186+
Project [a#x]
187+
+- Sort [col1#x ASC NULLS FIRST], true
188+
+- Project [a#x, col1#x]
189+
+- Filter (col2#x > 1)
190+
+- Aggregate [col1#x, col2#x], [col1#x AS a#x, col2#x, col1#x]
191+
+- LocalRelation [col1#x, col2#x]
192+
193+
194+
-- !query
195+
SELECT col1 AS a, a AS b FROM VALUES(1,2) GROUP BY col1, col2 HAVING col2 > 1 ORDER BY col1
196+
-- !query analysis
197+
Project [a#x, b#x]
198+
+- Sort [col1#x ASC NULLS FIRST], true
199+
+- Project [a#x, b#x, col1#x]
200+
+- Filter (col2#x > 1)
201+
+- Project [a#x, a#x AS b#x, col2#x, col1#x]
202+
+- Project [col1#x, col2#x, col1#x AS a#x]
203+
+- Aggregate [col1#x, col2#x], [col1#x, col2#x]
204+
+- LocalRelation [col1#x, col2#x]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
-- Replace expression with alias that has semantically equal child if expression is not in output
2+
SELECT col1 + 1 AS a FROM VALUES(1) GROUP BY a ORDER BY col1 + 1;
3+
SELECT col1 + 1 AS a, a AS b FROM VALUES(1) GROUP BY a ORDER BY col1 + 1;
4+
SELECT col1 + 1 AS a FROM VALUES(1) GROUP BY a HAVING col1 + 1 > 0;
5+
SELECT col1 + 1 AS a, a AS b FROM VALUES(1) GROUP BY a HAVING col1 + 1 > 0;
6+
7+
SELECT col1, col2, GROUPING(col1) FROM VALUES("abc", 1) GROUP BY CUBE(col1, col2) ORDER BY GROUPING(col1);
8+
SELECT col1, col2, GROUPING(col1) FROM VALUES("abc", 1) GROUP BY CUBE(col1, col2) HAVING GROUPING(col1) != NULL;
9+
10+
11+
SELECT make_date(col1, col2, col3) AS a FROM VALUES(1,2,3) GROUP BY make_date(col1, col2, col3) ORDER BY make_date(col1, col2, col3);
12+
SELECT make_date(col1, col2, col3) AS a, a AS b FROM VALUES(1,2,3) GROUP BY make_date(col1, col2, col3) ORDER BY make_date(col1, col2, col3);
13+
SELECT make_date(col1, col2, col3) AS a FROM VALUES(1,2,3) GROUP BY make_date(col1, col2, col3) HAVING make_date(col1, col2, col3) > '2025-01-01';
14+
SELECT make_date(col1, col2, col3) AS a, a AS b FROM VALUES(1,2,3) GROUP BY make_date(col1, col2, col3) HAVING make_date(col1, col2, col3) > '2025-01-01';
15+
16+
-- Don't replace expression with alias that has semantically equal child if expression is not grouping
17+
SELECT make_date(col1, col2, col3) AS a FROM VALUES(1,2,3) ORDER BY make_date(col1, col2, col3);
18+
SELECT make_date(col1, col2, col3) AS a, a AS b FROM VALUES(1,2,3) ORDER BY make_date(col1, col2, col3);
19+
20+
-- Don't replace expression with alias that has semantically equal child if expression is in output
21+
SELECT col1, col1 AS a FROM VALUES(1) GROUP BY col1 ORDER BY col1 ASC;
22+
SELECT col1 AS a, col1 FROM VALUES(1) GROUP BY col1 ORDER BY col1 ASC;
23+
SELECT col1, col1 AS a FROM VALUES(1) GROUP BY col1 HAVING col1 > 0;
24+
SELECT col1 AS a, col1 FROM VALUES(1) GROUP BY col1 HAVING col1 > 0;
25+
26+
SELECT col2 AS b, col2 FROM VALUES(1,2) GROUP BY 1,2 ORDER BY ALL;
27+
SELECT col2 AS b, col2 FROM VALUES(1,2) GROUP BY 1,2 HAVING col2 > 0 ORDER BY ALL;
28+
SELECT col2 AS b, col2, b as c FROM VALUES(1,2) GROUP BY 1,2 ORDER BY ALL;
29+
SELECT col2 AS b, col2, b as c FROM VALUES(1,2) GROUP BY 1,2 HAVING col2 > 0 ORDER BY ALL;
30+
31+
-- Fixed point doesn't know to correctly replace `col1` with `a` because HAVING adds an artificial Project node
32+
SELECT col1 AS a FROM VALUES(1,2) GROUP BY col1, col2 HAVING col2 > 1 ORDER BY col1;
33+
SELECT col1 AS a, a AS b FROM VALUES(1,2) GROUP BY col1, col2 HAVING col2 > 1 ORDER BY col1;

0 commit comments

Comments
 (0)