
Commit 255249f

pavle-martinovic_data authored and yhuang-db committed
[SPARK-52353][SQL] Fix bug with wrong constraints in LogicalRDDs referencing previous iterations in UnionLoop
### What changes were proposed in this pull request?

Modify the way that we write statistics and constraints in LogicalRDDs that refer to previous iterations in UnionLoopExec.

### Why are the changes needed?

LogicalRDD constraints are currently written incorrectly when the recursion has multiple columns using the same name. This leads to incorrectly pruning out filters, which can lead to infinite recursion.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New golden file test.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#51070 from Pajaraja/pavle-martinovic_data/ConstraintsFixII.

Authored-by: pavle-martinovic_data <pavle.martinovic@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 1ea7f11 commit 255249f
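To make the failure mode concrete, here is a minimal sketch in plain Scala, assuming a hypothetical `Attr` stand-in for Catalyst's `Attribute` (with `exprId` modeling its unique expression id); it is not Spark's API, just an illustration of why a name-keyed association breaks once two output columns share a name:

```scala
// Hypothetical stand-in for Catalyst's Attribute; exprId models its unique id.
case class Attr(name: String, exprId: Int)

object NameCollisionSketch extends App {
  // The previous iteration exposes two columns both named "x" (e.g. SELECT x, x ...);
  // the next iteration refers to them as x and y.
  val prevOutput = Seq(Attr("x", 1), Attr("x", 2))
  val newOutput  = Seq(Attr("x", 3), Attr("y", 4))

  // Keying by name collapses both "x" columns into one entry, so a constraint
  // recorded on one of them can be silently reattached to the wrong column.
  val byName = prevOutput.map(a => a.name -> a).toMap
  assert(byName.size == 1) // one column's identity is lost

  // Pairing by position keeps the columns distinct, which is the direction of this fix.
  val byPosition = prevOutput.zip(newOutput).toMap
  assert(byPosition.size == 2)
}
```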

File tree: 5 files changed, +125 −5 lines


sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala

Lines changed: 28 additions & 2 deletions
@@ -229,10 +229,36 @@ object LogicalRDD extends Logging {
     }
   }
 
+  // A version of buildOutputAssocForRewrite which doesn't assume that the names are the same,
+  // because the new output can have different names. Used when copying the LogicalRDD with a new
+  // output.
+  private[sql] def buildOutputAssocForRewriteWithNewOutput(
+      source: Seq[Attribute],
+      destination: Seq[Attribute]): Option[Map[Attribute, Attribute]] = {
+    val rewrite = source.zip(destination).flatMap { case (attr1, attr2) =>
+      if (attr1.dataType == attr2.dataType) {
+        Some(attr1 -> attr2)
+      } else {
+        None
+      }
+    }.toMap
+
+    if (rewrite.size == source.size) {
+      Some(rewrite)
+    } else {
+      None
+    }
+  }
+
   private[sql] def rewriteStatsAndConstraints(
       logicalPlan: LogicalPlan,
-      optimizedPlan: LogicalPlan): (Option[Statistics], Option[ExpressionSet]) = {
-    val rewrite = buildOutputAssocForRewrite(optimizedPlan.output, logicalPlan.output)
+      optimizedPlan: LogicalPlan,
+      sameOutput: Boolean = true): (Option[Statistics], Option[ExpressionSet]) = {
+    val rewrite = if (sameOutput) {
+      buildOutputAssocForRewrite(optimizedPlan.output, logicalPlan.output)
+    } else {
+      buildOutputAssocForRewriteWithNewOutput(optimizedPlan.output, logicalPlan.output)
+    }
 
     rewrite.map { rw =>
       val rewrittenStatistics = rewriteStatistics(optimizedPlan.stats, rw)
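For readers skimming the diff, here is a minimal, self-contained sketch of the positional association the new helper performs. `Attribute`, `assocByPosition`, and the sample data below are illustrative stand-ins, not the Catalyst API; the all-or-nothing size check mirrors the `rewrite.size == source.size` guard in the real code:

```scala
// Simplified stand-in for Catalyst's Attribute.
case class Attribute(name: String, exprId: Int, dataType: String)

object PositionalAssocSketch extends App {
  def assocByPosition(
      source: Seq[Attribute],
      destination: Seq[Attribute]): Option[Map[Attribute, Attribute]] = {
    // Pair columns by position and keep only type-compatible pairs.
    val rewrite = source.zip(destination).collect {
      case (a, b) if a.dataType == b.dataType => a -> b
    }.toMap
    // All-or-nothing: a partial map would rewrite some constraints and drop others.
    // (zip also truncates on a length mismatch, which this check catches too.)
    if (rewrite.size == source.size) Some(rewrite) else None
  }

  // Duplicate names no longer collide because pairing ignores names entirely.
  val src = Seq(Attribute("x", 1, "int"), Attribute("x", 2, "int"))
  val dst = Seq(Attribute("x", 3, "int"), Attribute("y", 4, "int"))
  assert(assocByPosition(src, dst).exists(_.size == 2))
}
```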

sql/core/src/main/scala/org/apache/spark/sql/execution/UnionLoopExec.scala

Lines changed: 3 additions & 3 deletions
@@ -96,7 +96,7 @@ case class UnionLoopExec(
     "numIterations" -> SQLMetrics.createMetric(sparkContext, "number of recursive iterations"),
     "numAnchorOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of anchor output rows"))
 
-  val localRelationLimit =
+  private val localRelationLimit =
     conf.getConf(SQLConf.CTE_RECURSION_ANCHOR_ROWS_LIMIT_TO_CONVERT_TO_LOCAL_RELATION)
 
   /**
@@ -220,9 +220,9 @@ case class UnionLoopExec(
        val logicalRDD = LogicalRDD.fromDataset(prevDF.queryExecution.toRdd, prevDF,
          prevDF.isStreaming).newInstance()
        prevPlan = logicalRDD
-       val logicalPlan = prevDF.logicalPlan
        val optimizedPlan = prevDF.queryExecution.optimizedPlan
-       val (stats, constraints) = rewriteStatsAndConstraints(logicalPlan, optimizedPlan)
+       val (stats, constraints) = rewriteStatsAndConstraints(r, optimizedPlan,
+         sameOutput = false)
        logicalRDD.copy(output = r.output)(prevDF.sparkSession, stats, constraints)
      }
    }
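The call site now rewrites constraints against the `UnionLoopRef`'s output (`r`) with `sameOutput = false`, since the new output's names can differ from the optimized plan's. To see why the old path was dangerous for termination, here is a loose illustrative model of constraint-based filter pruning; `Constraint` and `prune` are hypothetical names, not Spark's optimizer API:

```scala
// Hypothetical model of constraint-based filter pruning, not Spark's implementation.
case class Constraint(column: String, op: String, bound: Int)

object PruneSketch extends App {
  // A filter already implied by the known constraints is dropped as redundant.
  def prune(filters: Seq[Constraint], known: Set[Constraint]): Seq[Constraint] =
    filters.filterNot(known.contains)

  // Bug scenario: the previous iteration's constraint really concerned a different
  // column that happened to share the name, but got recorded against `x`.
  val wronglyKnown = Set(Constraint("x", "<", 5))

  // The recursion's termination guard WHERE x < 5 now looks redundant and vanishes,
  // so each iteration keeps producing rows and the loop never stops.
  assert(prune(Seq(Constraint("x", "<", 5)), wronglyKnown).isEmpty)
}
```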

sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out

Lines changed: 45 additions & 0 deletions
@@ -1664,6 +1664,51 @@ WithCTE
       +- CTERelationRef xxxx, true, [x#x, y#x], false, false
 
 
+-- !query
+SET spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation=0
+-- !query analysis
+SetCommand (spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation,Some(0))
+
+
+-- !query
+WITH RECURSIVE tmp(x) AS (
+  values (1), (2), (3), (4), (5)
+), rcte(x, y) AS (
+  SELECT x, x FROM tmp WHERE x = 1
+  UNION ALL
+  SELECT x + 1, x FROM rcte WHERE x < 5
+)
+SELECT * FROM rcte
+-- !query analysis
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias tmp
+:     +- Project [col1#x AS x#x]
+:        +- LocalRelation [col1#x]
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias rcte
+:     +- Project [x#x AS x#x, x#x AS y#x]
+:        +- UnionLoop xxxx
+:           :- Project [x#x, x#x]
+:           :  +- Filter (x#x = 1)
+:           :     +- SubqueryAlias tmp
+:           :        +- CTERelationRef xxxx, true, [x#x], false, false, 5
+:           +- Project [(x#x + 1) AS (x + 1)#x, x#x]
+:              +- Filter (x#x < 5)
+:                 +- SubqueryAlias rcte
+:                    +- Project [x#x AS x#x, x#x AS y#x]
+:                       +- UnionLoopRef xxxx, [x#x, x#x], false
++- Project [x#x, y#x]
+   +- SubqueryAlias rcte
+      +- CTERelationRef xxxx, true, [x#x, y#x], false, false
+
+
+-- !query
+SET spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation=100
+-- !query analysis
+SetCommand (spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation,Some(100))
+
+
 -- !query
 WITH RECURSIVE tmp(x) AS (
   values (1), (2), (3), (4), (5)

sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql

Lines changed: 14 additions & 0 deletions
@@ -598,6 +598,20 @@ WITH RECURSIVE tmp(x) AS (
 )
 SELECT * FROM rcte;
 
+-- Previous query without converting to local relations
+SET spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation=0;
+
+WITH RECURSIVE tmp(x) AS (
+  values (1), (2), (3), (4), (5)
+), rcte(x, y) AS (
+  SELECT x, x FROM tmp WHERE x = 1
+  UNION ALL
+  SELECT x + 1, x FROM rcte WHERE x < 5
+)
+SELECT * FROM rcte;
+
+SET spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation=100;
+
 -- Recursive CTE with multiple of the same reference in the anchor, which get referenced as different variables in subsequent iterations.
 WITH RECURSIVE tmp(x) AS (
   values (1), (2), (3), (4), (5)

sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out

Lines changed: 35 additions & 0 deletions
@@ -1496,6 +1496,41 @@ struct<x:int,y:int>
 5	4
 
 
+-- !query
+SET spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation=0
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation	0
+
+
+-- !query
+WITH RECURSIVE tmp(x) AS (
+  values (1), (2), (3), (4), (5)
+), rcte(x, y) AS (
+  SELECT x, x FROM tmp WHERE x = 1
+  UNION ALL
+  SELECT x + 1, x FROM rcte WHERE x < 5
+)
+SELECT * FROM rcte
+-- !query schema
+struct<x:int,y:int>
+-- !query output
+1	1
+2	1
+3	2
+4	3
+5	4
+
+
+-- !query
+SET spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation=100
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.cteRecursionAnchorRowsLimitToConvertToLocalRelation	100
+
+
 -- !query
 WITH RECURSIVE tmp(x) AS (
   values (1), (2), (3), (4), (5)