Skip to content

Commit cda84dd

Browse files
committed
[SPARK-52695][SQL] User Defined Type write support for xml file format
### What changes were proposed in this pull request?

This PR adds UDT (User Defined Type) write support for the XML file format.

### Why are the changes needed?

An IllegalArgumentException is being thrown while writing UDT values.

### Does this PR introduce _any_ user-facing change?

Yes. If the UDT's sqlType is compatible with the XML file format, it becomes writable.

### How was this patch tested?

New test.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51388 from yaooqinn/SPARK-52695.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: Kent Yao <yao@apache.org>
1 parent 00cf5da commit cda84dd

File tree

2 files changed

+28
-2
lines changed

2 files changed

+28
-2
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -228,6 +228,8 @@ class StaxXmlGenerator(
228228
writeChild(field.name, field.dataType, value)
229229
}
230230

231+
case (u: UserDefinedType[_], v) => writeElement(u.sqlType, v, options)
232+
231233
case (_, _) =>
232234
throw new SparkIllegalArgumentException(
233235
errorClass = "_LEGACY_ERROR_TEMP_3238",

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala

Lines changed: 26 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ import java.io.{EOFException, File, FileOutputStream, StringWriter}
2020
import java.nio.charset.{StandardCharsets, UnsupportedCharsetException}
2121
import java.nio.file.{Files, Path, Paths}
2222
import java.sql.{Date, Timestamp}
23-
import java.time.{Instant, LocalDateTime}
23+
import java.time.{Instant, LocalDateTime, Year}
2424
import java.util.TimeZone
2525
import java.util.concurrent.ConcurrentHashMap
2626
import javax.xml.stream.{XMLOutputFactory, XMLStreamException}
@@ -38,7 +38,8 @@ import org.apache.hadoop.io.compress.{CompressionCodecFactory, GzipCodec}
3838

3939
import org.apache.spark.{DebugFilesystem, SparkException}
4040
import org.apache.spark.io.ZStdCompressionCodec
41-
import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, Encoders, QueryTest, Row, SaveMode}
41+
import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, Encoders, QueryTest, Row, SaveMode, YearUDT}
42+
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.UDTEncoder
4243
import org.apache.spark.sql.catalyst.util._
4344
import org.apache.spark.sql.catalyst.util.TypeUtils.ordinalNumber
4445
import org.apache.spark.sql.catalyst.xml.{IndentingXMLStreamWriter, XmlOptions}
@@ -3490,6 +3491,29 @@ class XmlSuite
34903491
}
34913492
}
34923493
}
3494+
3495+
test("SPARK-52695: UDT write support for xml file format") {
3496+
val udt = new YearUDT()
3497+
val encoder = UDTEncoder(udt, classOf[YearUDT])
3498+
withTempDir { dir =>
3499+
val path = dir.getCanonicalPath
3500+
// Write a dataset of Year objects
3501+
val df1 = spark.range(2018, 2025).map(y => Year.of(y.toInt))(encoder)
3502+
3503+
df1
3504+
.write
3505+
.mode(SaveMode.Overwrite)
3506+
.option("rowTag", "ROW")
3507+
.xml(path)
3508+
3509+
val df = spark.read
3510+
.option("rowTag", "ROW")
3511+
.xml(path)
3512+
3513+
assert(df.schema === StructType(Seq(StructField("value", LongType))))
3514+
checkAnswer(df, spark.range(2018, 2025).toDF("value"))
3515+
}
3516+
}
34933517
}
34943518

34953519
// Mock file system that checks the number of open files

0 commit comments

Comments (0)