
Commit 5e6e8f1

eejbyfeldt authored and hvanhovell committed
[SPARK-52023][SQL] Fix data corruption/segfault returning Option[Product] from udaf
### What changes were proposed in this pull request?

This fixes udafs defined with an `Option[Product]` return type so that they produce correct results. Previously such a udaf would throw an exception, segfault, or produce incorrect results.

### Why are the changes needed?

Fixes a correctness issue.

### Does this PR introduce _any_ user-facing change?

Yes, it fixes a correctness issue.

### How was this patch tested?

Existing and new unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #50827 from eejbyfeldt/SPARK-52023.

Authored-by: Emil Ejbyfeldt <emil.ejbyfeldt@choreograph.com>
Signed-off-by: Herman van Hovell <herman@databricks.com>
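For context, a minimal sketch of the kind of aggregator that hits this code path: one whose output type is an `Option` of a `Product`. Everything here (`FirstPair`, the table `t`, the column names) is illustrative rather than part of the patch; the actual regression test added below uses a generic `Reduce` aggregator.

```scala
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions.udaf

// Illustrative aggregator whose OUT type is Option[Product]: it keeps the
// first (Int, Int) value it sees in each group, if any.
object FirstPair extends Aggregator[(Int, Int), Option[(Int, Int)], Option[(Int, Int)]] {
  def zero: Option[(Int, Int)] = None
  def reduce(b: Option[(Int, Int)], a: (Int, Int)): Option[(Int, Int)] = b.orElse(Some(a))
  def merge(b1: Option[(Int, Int)], b2: Option[(Int, Int)]): Option[(Int, Int)] = b1.orElse(b2)
  def finish(r: Option[(Int, Int)]): Option[(Int, Int)] = r
  def bufferEncoder: Encoder[Option[(Int, Int)]] = ExpressionEncoder()
  def outputEncoder: Encoder[Option[(Int, Int)]] = ExpressionEncoder()
}

// Before this fix, evaluating the registered function could throw, segfault,
// or silently return corrupted data; with it, the Option surfaces as a
// nullable struct column.
// spark.udf.register("firstPair", udaf(FirstPair))
// spark.sql("SELECT firstPair(value) FROM t GROUP BY key")
```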
1 parent 646d96e · commit 5e6e8f1

File tree

2 files changed: +28 −1 lines changed
  • sql


sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -530,7 +530,7 @@ case class ScalaAggregator[IN, BUF, OUT](
 
   def eval(buffer: BUF): Any = {
     val row = outputSerializer(agg.finish(buffer))
-    if (outputEncoder.isSerializedAsStruct) row else row.get(0, dataType)
+    if (outputEncoder.isSerializedAsStructForTopLevel) row else row.get(0, dataType)
   }
 
   @transient private[this] lazy val bufferRow = new UnsafeRow(bufferEncoder.namedExpressions.length)
```
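The one-word change is the whole fix. `isSerializedAsStruct` is true whenever the output's serialized form is a struct, which includes `Option[Product]`; but a top-level `Option[Product]` is encoded as a single nullable struct column, so `eval` must unwrap it with `row.get(0, dataType)` rather than return the whole row. For reference, the two predicates are defined roughly along these lines in `ExpressionEncoder` (paraphrased from memory; the exact code may differ across Spark versions):

```scala
// Inside ExpressionEncoder[T]:
def isSerializedAsStruct: Boolean = objSerializer.dataType.isInstanceOf[StructType]

// False for a top-level Option[_], even when the element is a Product: the
// value stays a single nullable struct column instead of being flattened
// into the row's top-level fields.
def isSerializedAsStructForTopLevel: Boolean =
  isSerializedAsStruct && !classOf[Option[_]].isAssignableFrom(clsTag.runtimeClass)
```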

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala

Lines changed: 27 additions & 0 deletions
```diff
@@ -60,6 +60,22 @@ object LongProductSumAgg extends Aggregator[(jlLong, jlLong), Long, jlLong] {
   def outputEncoder: Encoder[jlLong] = Encoders.LONG
 }
 
+final case class Reduce[T: Encoder](r: (T, T) => T)(implicit i: Encoder[Option[T]])
+    extends Aggregator[T, Option[T], T] {
+  def zero: Option[T] = None
+  def reduce(b: Option[T], a: T): Option[T] = Some(b.fold(a)(r(_, a)))
+  def merge(b1: Option[T], b2: Option[T]): Option[T] =
+    (b1, b2) match {
+      case (Some(a), Some(b)) => Some(r(a, b))
+      case (Some(a), None) => Some(a)
+      case (None, Some(b)) => Some(b)
+      case (None, None) => None
+    }
+  def finish(reduction: Option[T]): T = reduction.get
+  def bufferEncoder: Encoder[Option[T]] = implicitly
+  def outputEncoder: Encoder[T] = implicitly
+}
+
 @SQLUserDefinedType(udt = classOf[CountSerDeUDT])
 case class CountSerDeSQL(nSer: Int, nDeSer: Int, sum: Int)
 
@@ -180,6 +196,9 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi
     val data4 = Seq[Boolean](true, false, true).toDF("boolvalues")
     data4.write.saveAsTable("agg4")
 
+    val data5 = Seq[(Int, (Int, Int))]((1, (2, 3))).toDF("key", "value")
+    data5.write.saveAsTable("agg5")
+
     val emptyDF = spark.createDataFrame(
       sparkContext.emptyRDD[Row],
       StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil))
@@ -190,6 +209,8 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi
     spark.udf.register("mydoubleavg", udaf(MyDoubleAvgAgg))
     spark.udf.register("longProductSum", udaf(LongProductSumAgg))
     spark.udf.register("arraysum", udaf(ArrayDataAgg))
+    spark.udf.register("reduceOptionPair", udaf(Reduce[Option[(Int, Int)]](
+      (opt1, opt2) => opt1.zip(opt2).map { case ((a1, b1), (a2, b2)) => (a1 + a2, b1 + b2) })))
   }
 
   override def afterAll(): Unit = {
@@ -371,6 +392,12 @@ abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSi
       Row(Seq(12.0, 15.0, 18.0)) :: Nil)
   }
 
+  test("SPARK-52023: Returning Option[Product] from udaf") {
+    checkAnswer(
+      spark.sql("SELECT reduceOptionPair(value) FROM agg5 GROUP BY key"),
+      Row(Row(2, 3)) :: Nil)
+  }
+
   test("verify aggregator ser/de behavior") {
     val data = sparkContext.parallelize((1 to 100).toSeq, 3).toDF("value1")
     val agg = udaf(CountSerDeAgg)
```
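A note on the expected answer in the new test: the outer `Row` is the query's result row, and the inner `Row(2, 3)` is the struct produced for the `Some((2, 3))` that the aggregator returns. A rough way to inspect the result shape interactively (a sketch; the printed column name and nullability flags are approximate):

```scala
// Assumes a session where the agg5 table and reduceOptionPair udaf from the
// test above have been set up.
val df = spark.sql("SELECT reduceOptionPair(value) FROM agg5 GROUP BY key")
df.printSchema()
// root
//  |-- reduceoptionpair(value): struct (nullable = true)
//  |    |-- _1: integer
//  |    |-- _2: integer
df.collect()  // Array([[2,3]]): one result row wrapping one struct value
```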
