Skip to content

Commit c3dbcd0

Browse files
committed
temp
1 parent 56dcdd8 commit c3dbcd0

File tree

2 files changed

+153
-28
lines changed

2 files changed

+153
-28
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ApplyDefaultCollationToStringType.scala

Lines changed: 87 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,13 @@ package org.apache.spark.sql.catalyst.analysis
2020
import org.apache.spark.sql.catalyst.expressions.{Cast, DefaultStringProducingExpression, Expression, Literal, SubqueryExpression}
2121
import org.apache.spark.sql.catalyst.plans.logical.{AddColumns, AlterColumns, AlterColumnSpec, AlterViewAs, ColumnDefinition, CreateTable, CreateTempView, CreateView, LogicalPlan, QualifiedColType, ReplaceColumns, ReplaceTable, TableSpec, V2CreateTablePlan}
2222
import org.apache.spark.sql.catalyst.rules.Rule
23-
import org.apache.spark.sql.connector.catalog.{SupportsNamespaces, TableCatalog}
23+
import org.apache.spark.sql.catalyst.trees.CurrentOrigin
24+
import org.apache.spark.sql.catalyst.util.CharVarcharUtils.CHAR_VARCHAR_TYPE_STRING_METADATA_KEY
25+
import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces, Table, TableCatalog}
2426
import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_COLLATION
25-
import org.apache.spark.sql.types.{DataType, StringType}
27+
import org.apache.spark.sql.errors.DataTypeErrors.toSQLId
28+
import org.apache.spark.sql.errors.QueryCompilationErrors
29+
import org.apache.spark.sql.types.{CharType, DataType, StringType, StructField, VarcharType}
2630

2731
/**
2832
* Resolves string types in logical plans by assigning them the appropriate collation. The
@@ -33,12 +37,13 @@ import org.apache.spark.sql.types.{DataType, StringType}
3337
*/
3438
object ApplyDefaultCollationToStringType extends Rule[LogicalPlan] {
3539
def apply(plan: LogicalPlan): LogicalPlan = {
36-
val planWithResolvedDefaultCollation = resolveDefaultCollation(plan)
40+
val preprocessedPlan = Seq(resolveDefaultCollation _, resolveAlterColumnsDataType _)
41+
.foldLeft(plan) { case (currentPlan, resolver) => resolver(currentPlan) }
3742

38-
fetchDefaultCollation(planWithResolvedDefaultCollation) match {
43+
fetchDefaultCollation(preprocessedPlan) match {
3944
case Some(collation) =>
40-
transform(planWithResolvedDefaultCollation, StringType(collation))
41-
case None => planWithResolvedDefaultCollation
45+
transform(preprocessedPlan, StringType(collation))
46+
case None => preprocessedPlan
4247
}
4348
}
4449

@@ -63,10 +68,14 @@ object ApplyDefaultCollationToStringType extends Rule[LogicalPlan] {
6368
case ReplaceTable(_: ResolvedIdentifier, _, _, tableSpec: TableSpec, _) =>
6469
tableSpec.collation
6570

66-
// In `transform` we handle these 3 ALTER TABLE commands.
67-
case cmd: AddColumns => getCollationFromTableProps(cmd.table)
68-
case cmd: ReplaceColumns => getCollationFromTableProps(cmd.table)
69-
case cmd: AlterColumns => getCollationFromTableProps(cmd.table)
71+
case AddColumns(resolvedTable: ResolvedTable, _) =>
72+
Option(resolvedTable.table.properties.get(TableCatalog.PROP_COLLATION))
73+
74+
case ReplaceColumns(resolvedTable: ResolvedTable, _) =>
75+
Option(resolvedTable.table.properties.get(TableCatalog.PROP_COLLATION))
76+
77+
case AlterColumns(resolvedTable: ResolvedTable, _) =>
78+
Option(resolvedTable.table.properties.get(TableCatalog.PROP_COLLATION))
7079

7180
case alterViewAs: AlterViewAs =>
7281
alterViewAs.child match {
@@ -85,15 +94,6 @@ object ApplyDefaultCollationToStringType extends Rule[LogicalPlan] {
8594
}
8695
}
8796

88-
private def getCollationFromTableProps(t: LogicalPlan): Option[String] = {
89-
t match {
90-
case resolvedTbl: ResolvedTable
91-
if resolvedTbl.table.properties.containsKey(TableCatalog.PROP_COLLATION) =>
92-
Some(resolvedTbl.table.properties.get(TableCatalog.PROP_COLLATION))
93-
case _ => None
94-
}
95-
}
96-
9797
/**
9898
* Determines the default collation for an object in the following order:
9999
* 1. Use the object's explicitly defined default collation, if available.
@@ -151,22 +151,86 @@ object ApplyDefaultCollationToStringType extends Rule[LogicalPlan] {
151151
case p if isCreateOrAlterPlan(p) || AnalysisContext.get.collation.isDefined =>
152152
transformPlan(p, newType)
153153

154-
case addCols: AddColumns =>
154+
case addCols@AddColumns(_: ResolvedTable, _) =>
155155
addCols.copy(columnsToAdd = replaceColumnTypes(addCols.columnsToAdd, newType))
156156

157-
case replaceCols: ReplaceColumns =>
157+
case replaceCols@ReplaceColumns(_: ResolvedTable, _) =>
158158
replaceCols.copy(columnsToAdd = replaceColumnTypes(replaceCols.columnsToAdd, newType))
159159

160-
case a @ AlterColumns(_, specs: Seq[AlterColumnSpec]) =>
160+
case a @ AlterColumns(ResolvedTable(_, _, table: Table, _), specs: Seq[AlterColumnSpec]) =>
161161
val newSpecs = specs.map {
162-
case spec if spec.newDataType.isDefined && hasDefaultStringType(spec.newDataType.get) =>
162+
case spec if shouldApplyDefaultCollationToAlterColumn(spec, table) =>
163163
spec.copy(newDataType = Some(replaceDefaultStringType(spec.newDataType.get, newType)))
164164
case col => col
165165
}
166166
a.copy(specs = newSpecs)
167167
}
168168
}
169169

170+
/**
171+
* The column type should not be changed if the original column type is [[StringType]] and the new
172+
* type is the default [[StringType]] (i.e., [[StringType]] without an explicit collation).
173+
*
174+
* Query Example:
175+
* {{{
176+
* CREATE TABLE t (c1 STRING COLLATE UNICODE)
177+
* ALTER TABLE t ALTER COLUMN c1 TYPE STRING -- c1 will remain STRING COLLATE UNICODE
178+
* }}}
179+
*/
180+
private def resolveAlterColumnsDataType(plan: LogicalPlan): LogicalPlan = {
181+
plan match {
182+
case alterColumns@AlterColumns(
183+
ResolvedTable(_, _, table: Table, _), specs: Seq[AlterColumnSpec]) =>
184+
val resolvedSpecs = specs.map { spec =>
185+
if (spec.newDataType.isDefined && isStringTypeColumn(spec.column, table) &&
186+
isDefaultStringType(spec.newDataType.get)) {
187+
spec.copy(newDataType = None)
188+
} else {
189+
spec
190+
}
191+
}
192+
val newAlterColumns = CurrentOrigin.withOrigin(alterColumns.origin) {
193+
alterColumns.copy(specs = resolvedSpecs)
194+
}
195+
newAlterColumns.copyTagsFrom(alterColumns)
196+
newAlterColumns
197+
case _ =>
198+
plan
199+
}
200+
}
201+
202+
private def shouldApplyDefaultCollationToAlterColumn(
203+
alterColumnSpec: AlterColumnSpec, table: Table): Boolean = {
204+
alterColumnSpec.newDataType.isDefined &&
205+
// Applies the default collation only if the original column's type is not StringType.
206+
!isStringTypeColumn(alterColumnSpec.column, table) &&
207+
hasDefaultStringType(alterColumnSpec.newDataType.get)
208+
}
209+
210+
/**
211+
* Checks whether the column's [[DataType]] is [[StringType]] in the given table. Throws an error
212+
* if the column is not found.
213+
*/
214+
private def isStringTypeColumn(fieldName: FieldName, table: Table): Boolean = {
215+
CatalogV2Util.v2ColumnsToStructType(table.columns())
216+
.findNestedField(fieldName.name, includeCollections = true, resolver = conf.resolver)
217+
.map {
218+
case (_, StructField(_, _: CharType, _, _)) =>
219+
false
220+
case (_, StructField(_, _: VarcharType, _, _)) =>
221+
false
222+
case (_, StructField(_, _: StringType, _, metadata))
223+
if !metadata.contains(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY) =>
224+
true
225+
case (_, _) =>
226+
false
227+
}
228+
.getOrElse {
229+
throw QueryCompilationErrors.unresolvedColumnError(
230+
toSQLId(fieldName.name), table.columns().map(_.name))
231+
}
232+
}
233+
170234
/**
171235
* Transforms the given plan, by transforming all expressions in its operators to use the given
172236
* new type instead of the default string type.

sql/core/src/test/scala/org/apache/spark/sql/collation/DefaultCollationTestSuite.scala

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,12 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
8080
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING COLLATE UNICODE")
8181
assertTableColumnCollation(testTable, "c2", "UNICODE")
8282

83+
// When using ALTER TABLE ALTER COLUMN TYPE, the column should inherit the table's collation
84+
// only if it wasn't a string column before. If the column was already a string, and we're
85+
// just changing its type to string (without explicit collation) again, keep the original
86+
// collation.
8387
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING")
84-
assertTableColumnCollation(testTable, "c2", "UTF8_BINARY")
88+
assertTableColumnCollation(testTable, "c2", "UNICODE")
8589
}
8690
}
8791

@@ -128,9 +132,26 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
128132
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING COLLATE UNICODE_CI")
129133
assertTableColumnCollation(testTable, "c1", "UNICODE_CI")
130134

135+
// When using ALTER TABLE ALTER COLUMN TYPE, the column should inherit the table's collation
136+
// only if it wasn't a string column before. If the column was already a string, and we're
137+
// just changing its type to string (without explicit collation) again, keep the original
138+
// collation.
139+
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING")
140+
assertTableColumnCollation(testTable, "c1", "UNICODE_CI")
141+
131142
// alter table add columns with explicit collation, check collation for each column
132143
sql(s"ALTER TABLE $testTable ADD COLUMN c7 STRING COLLATE SR_CI_AI")
133144
sql(s"ALTER TABLE $testTable ADD COLUMN c8 STRING COLLATE UTF8_BINARY")
145+
assertTableColumnCollation(testTable, "c7", "SR_CI_AI")
146+
assertTableColumnCollation(testTable, "c8", "UTF8_BINARY")
147+
148+
// When using ALTER TABLE ALTER COLUMN TYPE, the column should inherit the table's collation
149+
// only if it wasn't a string column before. If the column was already a string, and we're
150+
// just changing its type to string (without explicit collation) again, keep the original
151+
// collation.
152+
sql(s"ALTER TABLE $testTable ALTER COLUMN c8 TYPE STRING")
153+
assertTableColumnCollation(testTable, "c8", "UTF8_BINARY")
154+
134155
assertTableColumnCollation(testTable, "c1", "UNICODE_CI")
135156
assertTableColumnCollation(testTable, "c2", "SR")
136157
assertTableColumnCollation(testTable, "c3", "UTF8_BINARY")
@@ -142,6 +163,24 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
142163
}
143164
}
144165

166+
test("Alter table alter column type with default collation") {
167+
// When using ALTER TABLE ALTER COLUMN TYPE, the column should inherit the table's collation
168+
// only if it wasn't a string column before. If the column was already a string, and we're
169+
// just changing its type to string (without explicit collation) again, keep the original
170+
// collation.
171+
withTable(testTable) {
172+
sql(s"CREATE TABLE $testTable (c1 STRING, c2 STRING COLLATE UTF8_LCASE, c3 STRING)" +
173+
s" DEFAULT COLLATION UTF8_LCASE")
174+
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING")
175+
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING")
176+
sql(s"ALTER TABLE $testTable ALTER COLUMN c3 TYPE STRING COLLATE UNICODE")
177+
178+
assertTableColumnCollation(testTable, "c1", "UTF8_LCASE")
179+
assertTableColumnCollation(testTable, "c2", "UTF8_LCASE")
180+
assertTableColumnCollation(testTable, "c3", "UNICODE")
181+
}
182+
}
183+
145184
Seq(
146185
// (schemaDefaultCollation, tableDefaultCollation)
147186
("UTF8_BINARY", None),
@@ -213,7 +252,7 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
213252
sql(s"ALTER SCHEMA $testSchema DEFAULT COLLATION $schemaNewCollation")
214253

215254
// Altering schema default collation should not affect existing objects.
216-
addAndAlterColumns(tableDefaultCollation = tableDefaultCollation)
255+
addAndAlterColumns(c2Collation = "SR_AI", tableDefaultCollation = tableDefaultCollation)
217256
}
218257

219258
withTable(testTable) {
@@ -405,12 +444,12 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
405444
sql(s"CREATE TABLE $testTable (c1 STRING, c2 STRING COLLATE SR_AI) " +
406445
s"$tableDefaultCollationClause")
407446

408-
addAndAlterColumns(tableDefaultCollation = resolvedDefaultCollation)
447+
addAndAlterColumns(c2Collation = "SR_AI", tableDefaultCollation = resolvedDefaultCollation)
409448
}
410449
}
411450
}
412451

413-
private def addAndAlterColumns(tableDefaultCollation: String): Unit = {
452+
private def addAndAlterColumns(c2Collation: String, tableDefaultCollation: String): Unit = {
414453
// ADD COLUMN
415454
sql(s"ALTER TABLE $testTable ADD COLUMN c3 STRING")
416455
sql(s"ALTER TABLE $testTable ADD COLUMN c4 STRING COLLATE SR_AI")
@@ -424,7 +463,7 @@ abstract class DefaultCollationTestSuite extends QueryTest with SharedSparkSessi
424463
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING")
425464
sql(s"ALTER TABLE $testTable ALTER COLUMN c3 TYPE STRING COLLATE UTF8_BINARY")
426465
assertTableColumnCollation(testTable, "c1", "UNICODE")
427-
assertTableColumnCollation(testTable, "c2", tableDefaultCollation)
466+
assertTableColumnCollation(testTable, "c2", c2Collation)
428467
assertTableColumnCollation(testTable, "c3", "UTF8_BINARY")
429468
}
430469
}
@@ -629,6 +668,28 @@ class DefaultCollationTestSuiteV2 extends DefaultCollationTestSuite with Datasou
629668
}
630669
}
631670

671+
test("alter char/varchar column to string type") {
672+
withTable(testTable) {
673+
sql(s"CREATE TABLE $testTable (c1 VARCHAR(10), c2 CHAR(10)) " +
674+
s"DEFAULT COLLATION UTF8_LCASE")
675+
676+
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING")
677+
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING")
678+
assertTableColumnCollation(testTable, "c1", "UTF8_LCASE")
679+
assertTableColumnCollation(testTable, "c2", "UTF8_LCASE")
680+
}
681+
682+
withTable(testTable) {
683+
sql(s"CREATE TABLE $testTable (c1 VARCHAR(10), c2 CHAR(10)) " +
684+
s"DEFAULT COLLATION UTF8_LCASE")
685+
686+
sql(s"ALTER TABLE $testTable ALTER COLUMN c1 TYPE STRING COLLATE UNICODE")
687+
sql(s"ALTER TABLE $testTable ALTER COLUMN c2 TYPE STRING COLLATE UNICODE")
688+
assertTableColumnCollation(testTable, "c1", "UNICODE")
689+
assertTableColumnCollation(testTable, "c2", "UNICODE")
690+
}
691+
}
692+
632693
private def testReplaceColumns(
633694
schemaDefaultCollation: String, tableDefaultCollation: Option[String] = None): Unit = {
634695
val (tableDefaultCollationClause, resolvedDefaultCollation) =

0 commit comments

Comments
 (0)