Skip to content

Commit cfd695b

Browse files
ilicmarkodbcloud-fan
authored andcommitted
[SPARK-52281][SQL] Change ALTER TABLE ALTER COLUMN TYPE STRING not to apply default collation if original data type was instance of StringType
### What changes were proposed in this pull request? Changed `ALTER TABLE ALTER COLUMN TYPE STRING` not to apply default collation if original data type was instance of `StringType`. ``` CREATE TABLE T (C1 CHAR/VARCHAR); ALTER TABLE T DEFAULT COLLATION UTF8_LCASE; ALTER TABLE T ALTER COLUMN C1 TYPE STRING COLLATE UTF8_LCASE; ----------------------------------------------------------------------------------- CREATE TABLE T (C1 STRING [COLLATE XYZ]) ALTER TABLE T DEFAULT COLLATION UTF8_LCASE ALTER TABLE T ALTER COLUMN C1 TYPE STRING // C1 -> STRING [COLLATE XYZ] ``` ### Why are the changes needed? Bug fix. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Tests added to `DefaultCollationTestSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51001 from ilicmarkodb/fix_alter_colum_with_collation. Authored-by: ilicmarkodb <marko.ilic@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 3e7fd7e commit cfd695b

File tree

2 files changed

+144
-15
lines changed

2 files changed

+144
-15
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ApplyDefaultCollationToStringType.scala

Lines changed: 78 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,13 @@ import scala.util.control.NonFatal
2222
import org.apache.spark.sql.catalyst.expressions.{Cast, DefaultStringProducingExpression, Expression, Literal, SubqueryExpression}
2323
import org.apache.spark.sql.catalyst.plans.logical.{AddColumns, AlterColumns, AlterColumnSpec, AlterViewAs, ColumnDefinition, CreateTable, CreateTempView, CreateView, LogicalPlan, QualifiedColType, ReplaceColumns, ReplaceTable, TableSpec, V2CreateTablePlan}
2424
import org.apache.spark.sql.catalyst.rules.Rule
25-
import org.apache.spark.sql.connector.catalog.{SupportsNamespaces, TableCatalog}
25+
import org.apache.spark.sql.catalyst.trees.CurrentOrigin
26+
import org.apache.spark.sql.catalyst.util.CharVarcharUtils.CHAR_VARCHAR_TYPE_STRING_METADATA_KEY
27+
import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces, Table, TableCatalog}
2628
import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_COLLATION
27-
import org.apache.spark.sql.types.{DataType, StringType}
29+
import org.apache.spark.sql.errors.DataTypeErrors.toSQLId
30+
import org.apache.spark.sql.errors.QueryCompilationErrors
31+
import org.apache.spark.sql.types.{CharType, DataType, StringType, StructField, VarcharType}
2832

2933
/**
3034
* Resolves string types in logical plans by assigning them the appropriate collation. The
@@ -35,12 +39,12 @@ import org.apache.spark.sql.types.{DataType, StringType}
3539
*/
3640
object ApplyDefaultCollationToStringType extends Rule[LogicalPlan] {
3741
def apply(plan: LogicalPlan): LogicalPlan = {
38-
val planWithResolvedDefaultCollation = resolveDefaultCollation(plan)
42+
val preprocessedPlan = resolveDefaultCollation(pruneRedundantAlterColumnTypes(plan))
3943

40-
fetchDefaultCollation(planWithResolvedDefaultCollation) match {
44+
fetchDefaultCollation(preprocessedPlan) match {
4145
case Some(collation) =>
42-
transform(planWithResolvedDefaultCollation, StringType(collation))
43-
case None => planWithResolvedDefaultCollation
46+
transform(preprocessedPlan, StringType(collation))
47+
case None => preprocessedPlan
4448
}
4549
}
4650

@@ -170,22 +174,86 @@ object ApplyDefaultCollationToStringType extends Rule[LogicalPlan] {
170174
case p if isCreateOrAlterPlan(p) || AnalysisContext.get.collation.isDefined =>
171175
transformPlan(p, newType)
172176

173-
case addCols: AddColumns =>
177+
case addCols@AddColumns(_: ResolvedTable, _) =>
174178
addCols.copy(columnsToAdd = replaceColumnTypes(addCols.columnsToAdd, newType))
175179

176-
case replaceCols: ReplaceColumns =>
180+
case replaceCols@ReplaceColumns(_: ResolvedTable, _) =>
177181
replaceCols.copy(columnsToAdd = replaceColumnTypes(replaceCols.columnsToAdd, newType))
178182

179-
case a @ AlterColumns(_, specs: Seq[AlterColumnSpec]) =>
183+
case a @ AlterColumns(ResolvedTable(_, _, table: Table, _), specs: Seq[AlterColumnSpec]) =>
180184
val newSpecs = specs.map {
181-
case spec if spec.newDataType.isDefined && hasDefaultStringType(spec.newDataType.get) =>
185+
case spec if shouldApplyDefaultCollationToAlterColumn(spec, table) =>
182186
spec.copy(newDataType = Some(replaceDefaultStringType(spec.newDataType.get, newType)))
183187
case col => col
184188
}
185189
a.copy(specs = newSpecs)
186190
}
187191
}
188192

193+
/**
194+
* The column type should not be changed if the original column type is [[StringType]] and the new
195+
* type is the default [[StringType]] (i.e., [[StringType]] without an explicit collation).
196+
*
197+
* Query Example:
198+
* {{{
199+
* CREATE TABLE t (c1 STRING COLLATE UNICODE)
200+
* ALTER TABLE t ALTER COLUMN c1 TYPE STRING -- c1 will remain STRING COLLATE UNICODE
201+
* }}}
202+
*/
203+
private def pruneRedundantAlterColumnTypes(plan: LogicalPlan): LogicalPlan = {
204+
plan match {
205+
case alterColumns@AlterColumns(
206+
ResolvedTable(_, _, table: Table, _), specs: Seq[AlterColumnSpec]) =>
207+
val resolvedSpecs = specs.map { spec =>
208+
if (spec.newDataType.isDefined && isStringTypeColumn(spec.column, table) &&
209+
isDefaultStringType(spec.newDataType.get)) {
210+
spec.copy(newDataType = None)
211+
} else {
212+
spec
213+
}
214+
}
215+
val newAlterColumns = CurrentOrigin.withOrigin(alterColumns.origin) {
216+
alterColumns.copy(specs = resolvedSpecs)
217+
}
218+
newAlterColumns.copyTagsFrom(alterColumns)
219+
newAlterColumns
220+
case _ =>
221+
plan
222+
}
223+
}
224+
225+
private def shouldApplyDefaultCollationToAlterColumn(
226+
alterColumnSpec: AlterColumnSpec, table: Table): Boolean = {
227+
alterColumnSpec.newDataType.isDefined &&
228+
// Applies the default collation only if the original column's type is not StringType.
229+
!isStringTypeColumn(alterColumnSpec.column, table) &&
230+
hasDefaultStringType(alterColumnSpec.newDataType.get)
231+
}
232+
233+
/**
234+
* Checks whether the column's [[DataType]] is [[StringType]] in the given table. Throws an error
235+
* if the column is not found.
236+
*/
237+
private def isStringTypeColumn(fieldName: FieldName, table: Table): Boolean = {
238+
CatalogV2Util.v2ColumnsToStructType(table.columns())
239+
.findNestedField(fieldName.name, includeCollections = true, resolver = conf.resolver)
240+
.map {
241+
case (_, StructField(_, _: CharType, _, _)) =>
242+
false
243+
case (_, StructField(_, _: VarcharType, _, _)) =>
244+
false
245+
case (_, StructField(_, _: StringType, _, metadata))
246+
if !metadata.contains(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY) =>
247+
true
248+
case (_, _) =>
249+
false
250+
}
251+
.getOrElse {
252+
throw QueryCompilationErrors.unresolvedColumnError(
253+
toSQLId(fieldName.name), table.columns().map(_.name))
254+
}
255+
}
256+
189257
/**
190258
* Transforms the given plan, by transforming all expressions in its operators to use the given
191259
* new type instead of the default string type.

sql/core/src/test/scala/org/apache/spark/sql/collation/DefaultCollationTestSuite.scala

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,12 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
100100
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING COLLATE UNICODE")
101101
assertTableColumnCollation(testTable, "c2", "UNICODE")
102102

103+
// When using ALTER TABLE ALTER COLUMN TYPE, the column should inherit the table's collation
104+
// only if it wasn't a string column before. If the column was already a string, and we're
105+
// just changing its type to string (without explicit collation) again, keep the original
106+
// collation.
103107
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING")
104-
assertTableColumnCollation(testTable, "c2", "UTF8_BINARY")
108+
assertTableColumnCollation(testTable, "c2", "UNICODE")
105109
}
106110
}
107111

@@ -148,9 +152,26 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
148152
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING COLLATE UNICODE_CI")
149153
assertTableColumnCollation(testTable, "c1", "UNICODE_CI")
150154

155+
// When using ALTER TABLE ALTER COLUMN TYPE, the column should inherit the table's collation
156+
// only if it wasn't a string column before. If the column was already a string, and we're
157+
// just changing its type to string (without explicit collation) again, keep the original
158+
// collation.
159+
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING")
160+
assertTableColumnCollation(testTable, "c1", "UNICODE_CI")
161+
151162
// alter table add columns with explicit collation, check collation for each column
152163
sql(s"ALTER TABLE $testTable ADD COLUMN c7 STRING COLLATE SR_CI_AI")
153164
sql(s"ALTER TABLE $testTable ADD COLUMN c8 STRING COLLATE UTF8_BINARY")
165+
assertTableColumnCollation(testTable, "c7", "SR_CI_AI")
166+
assertTableColumnCollation(testTable, "c8", "UTF8_BINARY")
167+
168+
// When using ALTER TABLE ALTER COLUMN TYPE, the column should inherit the table's collation
169+
// only if it wasn't a string column before. If the column was already a string, and we're
170+
// just changing its type to string (without explicit collation) again, keep the original
171+
// collation.
172+
sql(s"ALTER TABLE $testTable ALTER COLUMN c8 TYPE STRING")
173+
assertTableColumnCollation(testTable, "c8", "UTF8_BINARY")
174+
154175
assertTableColumnCollation(testTable, "c1", "UNICODE_CI")
155176
assertTableColumnCollation(testTable, "c2", "SR")
156177
assertTableColumnCollation(testTable, "c3", "UTF8_BINARY")
@@ -162,6 +183,24 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
162183
}
163184
}
164185

186+
test("Alter table alter column type with default collation") {
187+
// When using ALTER TABLE ALTER COLUMN TYPE, the column should inherit the table's collation
188+
// only if it wasn't a string column before. If the column was already a string, and we're
189+
// just changing its type to string (without explicit collation) again, keep the original
190+
// collation.
191+
withTable(testTable) {
192+
sql(s"CREATE TABLE $testTable (c1 STRING, c2 STRING COLLATE UTF8_LCASE, c3 STRING)" +
193+
s" DEFAULT COLLATION UTF8_LCASE")
194+
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING")
195+
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING")
196+
sql(s"ALTER TABLE $testTable ALTER COLUMN c3 TYPE STRING COLLATE UNICODE")
197+
198+
assertTableColumnCollation(testTable, "c1", "UTF8_LCASE")
199+
assertTableColumnCollation(testTable, "c2", "UTF8_LCASE")
200+
assertTableColumnCollation(testTable, "c3", "UNICODE")
201+
}
202+
}
203+
165204
schemaAndObjectCollationPairs.foreach {
166205
case (schemaDefaultCollation, tableDefaultCollation) =>
167206
test(
@@ -221,7 +260,7 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
221260
sql(s"ALTER SCHEMA $testSchema DEFAULT COLLATION $schemaNewCollation")
222261

223262
// Altering schema default collation should not affect existing objects.
224-
addAndAlterColumns(tableDefaultCollation = tableDefaultCollation)
263+
addAndAlterColumns(c2Collation = "SR_AI", tableDefaultCollation = tableDefaultCollation)
225264
}
226265

227266
withTable(testTable) {
@@ -413,12 +452,12 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
413452
sql(s"CREATE TABLE $testTable (c1 STRING, c2 STRING COLLATE SR_AI) " +
414453
s"$tableDefaultCollationClause")
415454

416-
addAndAlterColumns(tableDefaultCollation = resolvedDefaultCollation)
455+
addAndAlterColumns(c2Collation = "SR_AI", tableDefaultCollation = resolvedDefaultCollation)
417456
}
418457
}
419458
}
420459

421-
private def addAndAlterColumns(tableDefaultCollation: String): Unit = {
460+
private def addAndAlterColumns(c2Collation: String, tableDefaultCollation: String): Unit = {
422461
// ADD COLUMN
423462
sql(s"ALTER TABLE $testTable ADD COLUMN c3 STRING")
424463
sql(s"ALTER TABLE $testTable ADD COLUMN c4 STRING COLLATE SR_AI")
@@ -432,7 +471,7 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
432471
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING")
433472
sql(s"ALTER TABLE $testTable ALTER COLUMN c3 TYPE STRING COLLATE UTF8_BINARY")
434473
assertTableColumnCollation(testTable, "c1", "UNICODE")
435-
assertTableColumnCollation(testTable, "c2", tableDefaultCollation)
474+
assertTableColumnCollation(testTable, "c2", c2Collation)
436475
assertTableColumnCollation(testTable, "c3", "UTF8_BINARY")
437476
}
438477
}
@@ -806,6 +845,28 @@ class DefaultCollationTestSuiteV2 extends DefaultCollationTestSuite with Datasou
806845
}
807846
}
808847

848+
test("alter char/varchar column to string type") {
849+
withTable(testTable) {
850+
sql(s"CREATE TABLE $testTable (c1 VARCHAR(10), c2 CHAR(10)) " +
851+
s"DEFAULT COLLATION UTF8_LCASE")
852+
853+
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING")
854+
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING")
855+
assertTableColumnCollation(testTable, "c1", "UTF8_LCASE")
856+
assertTableColumnCollation(testTable, "c2", "UTF8_LCASE")
857+
}
858+
859+
withTable(testTable) {
860+
sql(s"CREATE TABLE $testTable (c1 VARCHAR(10), c2 CHAR(10)) " +
861+
s"DEFAULT COLLATION UTF8_LCASE")
862+
863+
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING COLLATE UNICODE")
864+
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING COLLATE UNICODE")
865+
assertTableColumnCollation(testTable, "c1", "UNICODE")
866+
assertTableColumnCollation(testTable, "c2", "UNICODE")
867+
}
868+
}
869+
809870
private def testReplaceColumns(
810871
schemaDefaultCollation: String, tableDefaultCollation: Option[String] = None): Unit = {
811872
val (tableDefaultCollationClause, resolvedDefaultCollation) =

0 commit comments

Comments
 (0)