
Commit 197a129

[SPARK-52638][SQL] Allow preserving Hive-style column order to be configurable
1 parent: 5e6e8f1

3 files changed: +55 -9 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 10 additions & 0 deletions
@@ -5989,6 +5989,16 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
+  val PRESERVE_HIVE_COLUMN_ORDER =
+    buildConf("spark.sql.hive.preserveColumnOrder.enabled")
+      .internal()
+      .doc("When true, tables returned from HiveExternalCatalog preserve Hive-style column order " +
+        "where the partition columns are at the end. Otherwise, the user-specified column order " +
+        "is returned.")
+      .version("4.1.0")
+      .booleanConf
+      .createWithDefault(true)
+
   /**
    * Holds information about keys that have been deprecated.
    *
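For illustration, the new flag can be flipped like any other SQL conf. A minimal sketch (not part of the commit), assuming a Hive-enabled SparkSession named `spark`:

  // Revert to the user-specified column order when tables are read back
  // from HiveExternalCatalog. The default "true" keeps the existing
  // Hive-style order, with partition columns at the end.
  spark.conf.set("spark.sql.hive.preserveColumnOrder.enabled", "false")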

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala

Lines changed: 13 additions & 8 deletions
@@ -48,6 +48,7 @@ import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions}
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.HiveSerDe
+import org.apache.spark.sql.internal.SQLConf.PRESERVE_HIVE_COLUMN_ORDER
 import org.apache.spark.sql.internal.StaticSQLConf._
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util.SchemaUtils
@@ -818,16 +819,20 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
   // columns are not put at the end of schema. We need to reorder it when reading the schema
   // from the table properties.
   private def reorderSchema(schema: StructType, partColumnNames: Seq[String]): StructType = {
-    val partitionFields = partColumnNames.map { partCol =>
-      schema.find(_.name == partCol).getOrElse {
-        throw new AnalysisException(
-          errorClass = "_LEGACY_ERROR_TEMP_3088",
-          messageParameters = Map(
-            "schema" -> schema.catalogString,
-            "partColumnNames" -> partColumnNames.mkString("[", ", ", "]")))
+    if (conf.get(PRESERVE_HIVE_COLUMN_ORDER)) {
+      schema
+    } else {
+      val partitionFields = partColumnNames.map { partCol =>
+        schema.find(_.name == partCol).getOrElse {
+          throw new AnalysisException(
+            errorClass = "_LEGACY_ERROR_TEMP_3088",
+            messageParameters = Map(
+              "schema" -> schema.catalogString,
+              "partColumnNames" -> partColumnNames.mkString("[", ", ", "]")))
+        }
       }
+      StructType(schema.filterNot(partitionFields.contains) ++ partitionFields)
     }
-    StructType(schema.filterNot(partitionFields.contains) ++ partitionFields)
   }
 
   private def restoreHiveSerdeTable(table: CatalogTable): CatalogTable = {
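To make the non-preserving branch concrete, here is a self-contained sketch (not part of the commit) of the same reordering applied to a plain StructType; the schema and partition column names are made up for illustration:

  import org.apache.spark.sql.types.{IntegerType, StructType}

  // Table declared as (a, b, c, d), partitioned by "a".
  val schema = new StructType()
    .add("a", IntegerType)
    .add("b", IntegerType)
    .add("c", IntegerType)
    .add("d", IntegerType)
  val partColumnNames = Seq("a")

  // The reordering branch moves partition columns to the end (Hive-style).
  val partitionFields = partColumnNames.flatMap(p => schema.find(_.name == p))
  val reordered = StructType(schema.filterNot(partitionFields.contains) ++ partitionFields)
  // reordered.fieldNames: Array(b, c, d, a)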

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala

Lines changed: 32 additions & 1 deletion
@@ -36,9 +36,10 @@ import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
-import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogV2Util, Identifier, TableChange, TableInfo}
+import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogV2Util, Identifier, TableCatalog, TableChange, TableInfo}
 import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
 import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER
+import org.apache.spark.sql.connector.expressions.Expressions
 import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
 import org.apache.spark.sql.execution.datasources.orc.OrcCompressionCodec
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetCompressionCodec, ParquetFooterReader}
@@ -3432,4 +3433,34 @@ class HiveDDLSuite
       any[String], any[String], any[StructType])
     }
   }
+
+  test("SPARK-52638: Allow preserving Hive-style column order to be configurable") {
+    val catalog = spark.sessionState.catalogManager.currentCatalog.asInstanceOf[TableCatalog]
+    withSQLConf(
+      SQLConf.PRESERVE_HIVE_COLUMN_ORDER.key -> "false"
+    ) {
+      withTable("t1") {
+        val identifier = Identifier.of(Array("default"), "t1")
+        val outputSchema = new StructType()
+          .add("a", IntegerType, true, "comment1")
+          .add("b", IntegerType, true, "comment2")
+          .add("c", IntegerType, true, "comment3")
+          .add("d", IntegerType, true, "comment4")
+        catalog.createTable(
+          identifier,
+          new TableInfo.Builder()
+            .withProperties(Map.empty.asJava)
+            .withColumns(CatalogV2Util.structTypeToV2Columns(outputSchema))
+            .withPartitions(Array(Expressions.identity("a")))
+            .build()
+        )
+        val cols = catalog.loadTable(identifier).columns()
+        assert(cols.length == 4)
+        assert(cols(0).name() == "a")
+        assert(cols(1).name() == "b")
+        assert(cols(2).name() == "c")
+        assert(cols(3).name() == "d")
+      }
+    }
+  }
 }
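As a usage note (not part of the commit): with the flag left at its default of "true", the same table would come back in Hive-style order, with the partition column last. A hedged spark-shell sketch of the contrast, assuming a Hive-enabled session and a hypothetical table t2:

  spark.sql("CREATE TABLE t2 (a INT, b INT, c INT, d INT) USING hive PARTITIONED BY (a)")
  // Default (spark.sql.hive.preserveColumnOrder.enabled = true):
  // the partition column is reported last when the table is read back.
  spark.table("t2").schema.fieldNames  // expected: Array(b, c, d, a)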
