Skip to content

Commit 9fdb4a8

Browse files
davidm-db and cloud-fan
authored and committed
[SPARK-48356][FOLLOW UP][SQL] Improve FOR statement's column schema inference
### What changes were proposed in this pull request? This pull request changes `FOR` statement to infer column schemas from the query DataFrame, and no longer implicitly infer column schema in SetVariable. This is necessary due to type mismatch errors with complex nested types, e.g. `ARRAY<STRUCT<..>>`. ### Why are the changes needed? Bug fix for FOR statement. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit test that specifically targets problematic case. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#51053 from davidm-db/for_schema_inference. Lead-authored-by: David Milicevic <david.milicevic@databricks.com> Co-authored-by: David Milicevic <163021185+davidm-db@users.noreply.github.com> Co-authored-by: Wenchen Fan <cloud0fan@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 4953f89 commit 9fdb4a8

File tree

3 files changed

+38
-6
lines changed

3 files changed

+38
-6
lines changed

sql/core/src/main/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionNode.scala

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical.ExceptionHandlerType.Exceptio
3030
import org.apache.spark.sql.catalyst.trees.{Origin, WithOrigin}
3131
import org.apache.spark.sql.classic.{DataFrame, Dataset, SparkSession}
3232
import org.apache.spark.sql.errors.SqlScriptingErrors
33-
import org.apache.spark.sql.types.BooleanType
33+
import org.apache.spark.sql.types.{BooleanType, DataType}
3434

3535
/**
3636
* Trait for all SQL scripting execution nodes used during interpretation phase.
@@ -997,10 +997,14 @@ class ForStatementExec(
997997
private var state = ForState.VariableAssignment
998998

999999
private var queryResult: util.Iterator[Row] = _
1000+
private var queryColumnNameToDataType: Map[String, DataType] = _
10001001
private var isResultCacheValid = false
10011002
private def cachedQueryResult(): util.Iterator[Row] = {
10021003
if (!isResultCacheValid) {
1003-
queryResult = query.buildDataFrame(session).toLocalIterator()
1004+
val df = query.buildDataFrame(session)
1005+
queryResult = df.toLocalIterator()
1006+
queryColumnNameToDataType = df.schema.fields.map(f => f.name -> f.dataType).toMap
1007+
10041008
query.isExecuted = true
10051009
isResultCacheValid = true
10061010
}
@@ -1063,7 +1067,7 @@ class ForStatementExec(
10631067
val variableInitStatements = row.schema.names.toSeq
10641068
.map { colName => (colName, createExpressionFromValue(row.getAs(colName))) }
10651069
.flatMap { case (colName, expr) => Seq(
1066-
createDeclareVarExec(colName, expr),
1070+
createDeclareVarExec(colName),
10671071
createSetVarExec(colName, expr)
10681072
) }
10691073

@@ -1166,8 +1170,9 @@ class ForStatementExec(
11661170
case _ => Literal(value)
11671171
}
11681172

1169-
private def createDeclareVarExec(varName: String, variable: Expression): SingleStatementExec = {
1170-
val defaultExpression = DefaultValueExpression(Literal(null, variable.dataType), "null")
1173+
private def createDeclareVarExec(varName: String): SingleStatementExec = {
1174+
val defaultExpression = DefaultValueExpression(
1175+
Literal(null, queryColumnNameToDataType(varName)), "null")
11711176
val declareVariable = CreateVariable(
11721177
UnresolvedIdentifier(Seq(varName)),
11731178
defaultExpression,

sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingExecutionSuite.scala

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2720,7 +2720,6 @@ class SqlScriptingExecutionSuite extends QueryTest with SharedSparkSession {
27202720
| SELECT varL3;
27212721
| SELECT 1/0;
27222722
| END;
2723-
27242723
| SELECT 5;
27252724
| SELECT 1/0;
27262725
| SELECT 6;

sql/core/src/test/scala/org/apache/spark/sql/scripting/SqlScriptingInterpreterSuite.scala

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3450,4 +3450,32 @@ class SqlScriptingInterpreterSuite extends QueryTest with SharedSparkSession {
34503450
verifySqlScriptResult(commands, expected)
34513451
}
34523452
}
3453+
3454+
test("for statement - structs in array have different values") {
3455+
withTable("t") {
3456+
val sqlScript =
3457+
"""
3458+
|BEGIN
3459+
| CREATE TABLE t(
3460+
| array_column ARRAY<STRUCT<id: INT, strCol: STRING, intArrayCol: ARRAY<INT>>>
3461+
| );
3462+
| INSERT INTO t VALUES
3463+
| Array(Struct(1, null, Array(10)),
3464+
| Struct(2, "name", Array()));
3465+
| FOR SELECT * FROM t DO
3466+
| SELECT array_column;
3467+
| END FOR;
3468+
|END
3469+
|""".stripMargin
3470+
3471+
val expected = Seq(
3472+
Seq.empty[Row], // create table
3473+
Seq.empty[Row], // insert
3474+
Seq.empty[Row], // declare array_column
3475+
Seq.empty[Row], // set array_column
3476+
Seq(Row(Seq(Row(1, null, Seq(10)), Row(2, "name", Seq.empty))))
3477+
)
3478+
verifySqlScriptResult(sqlScript, expected)
3479+
}
3480+
}
34533481
}

0 commit comments

Comments
 (0)