
Commit d0131bb

harshmotw-db authored and yhuang-db committed
[SPARK-50815][PYTHON] Fix Variant Local Data to Arrow Conversion
### What changes were proposed in this pull request?

This PR removes unnecessary code for converting Variants in PySpark from the local to the Arrow representation. This allows `createDataFrame` and Python Data Sources to work seamlessly with Variants.

### Why are the changes needed?

[This PR](apache#45826) introduced code to convert Variants from the internal representation to the Arrow representation (LocalDataToArrowConversion). However, the internal representation was assumed to be `dict` and the Arrow representation was assumed to be `VariantVal`, even though it should be the other way around. The code written in that PR does not appear to be exercised by any tests. This caused `createDataFrame` to not work with Variants, and the [attempted fix](apache#49487) added a special case (`variants_as_dicts`) to this code, even though the special case was actually the only use case. This PR removes the old unnecessary code and keeps only the "special case" code as the main path for converting Variants from local (`VariantVal`) to Arrow (`dict`).

### Does this PR introduce _any_ user-facing change?

This will allow users to use Python Data Sources with Variants.

### How was this patch tested?

Existing tests should pass, and a new unit test for Python Data Sources was added.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#51082 from harshmotw-db/harsh-motwani_data/experimental_variant_fix.

Authored-by: Harsh Motwani <harsh.motwani@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
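For illustration only (not part of the patch): a minimal sketch of the `createDataFrame` path this change is described as unblocking. The column names, schema string, and JSON literal below are assumptions.

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import VariantVal

spark = SparkSession.builder.getOrCreate()

# Local rows carry VariantVal values; the fixed conversion lowers them to the
# Arrow representation when the DataFrame is created.
df = spark.createDataFrame(
    [(1, VariantVal.parseJson('{"a": 1}'))],
    "id INT, v VARIANT",
)
df.selectExpr("id", "to_json(v) AS v_json").show()
```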
1 parent ad8a991 commit d0131bb

File tree

2 files changed: +34 −23 lines changed


python/pyspark/sql/conversion.py

Lines changed: 5 additions & 17 deletions
@@ -95,7 +95,6 @@ def _need_converter(
     def _create_converter(
         dataType: DataType,
         nullable: bool = True,
-        variants_as_dicts: bool = False,  # some code paths may require python internal types
     ) -> Callable:
         assert dataType is not None and isinstance(dataType, DataType)
         assert isinstance(nullable, bool)
@@ -117,9 +116,7 @@ def convert_null(value: Any) -> Any:
             dedup_field_names = _dedup_names(dataType.names)
 
             field_convs = [
-                LocalDataToArrowConversion._create_converter(
-                    field.dataType, field.nullable, variants_as_dicts
-                )
+                LocalDataToArrowConversion._create_converter(field.dataType, field.nullable)
                 for field in dataType.fields
             ]
 
@@ -161,7 +158,7 @@ def convert_struct(value: Any) -> Any:
 
         elif isinstance(dataType, ArrayType):
             element_conv = LocalDataToArrowConversion._create_converter(
-                dataType.elementType, dataType.containsNull, variants_as_dicts
+                dataType.elementType, dataType.containsNull
             )
 
             def convert_array(value: Any) -> Any:
@@ -178,7 +175,7 @@ def convert_array(value: Any) -> Any:
         elif isinstance(dataType, MapType):
             key_conv = LocalDataToArrowConversion._create_converter(dataType.keyType)
             value_conv = LocalDataToArrowConversion._create_converter(
-                dataType.valueType, dataType.valueContainsNull, variants_as_dicts
+                dataType.valueType, dataType.valueContainsNull
             )
 
             def convert_map(value: Any) -> Any:
@@ -288,14 +285,7 @@ def convert_variant(value: Any) -> Any:
                 if not nullable:
                     raise PySparkValueError(f"input for {dataType} must not be None")
                 return None
-                elif (
-                    isinstance(value, dict)
-                    and all(key in value for key in ["value", "metadata"])
-                    and all(isinstance(value[key], bytes) for key in ["value", "metadata"])
-                    and not variants_as_dicts
-                ):
-                    return VariantVal(value["value"], value["metadata"])
-                elif isinstance(value, VariantVal) and variants_as_dicts:
+                elif isinstance(value, VariantVal):
                     return VariantType().toInternal(value)
                 else:
                     raise PySparkValueError(errorClass="MALFORMED_VARIANT")
@@ -325,9 +315,7 @@ def convert(data: Sequence[Any], schema: StructType, use_large_var_types: bool)
         column_names = schema.fieldNames()
 
         column_convs = [
-            LocalDataToArrowConversion._create_converter(
-                field.dataType, field.nullable, variants_as_dicts=True
-            )
+            LocalDataToArrowConversion._create_converter(field.dataType, field.nullable)
            for field in schema.fields
        ]

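For context, a minimal sketch of what the local-to-Arrow Variant conversion now produces for a single value. The dict layout (`"value"`/`"metadata"` keys holding bytes) is an assumption inferred from the removed branch above and the commit message, not taken verbatim from the patch.

```python
from pyspark.sql.types import VariantType, VariantVal

v = VariantVal.parseJson('{"c": 1}')

# The converter now calls VariantType().toInternal(value); per the commit
# message the Arrow-side representation is a dict, and the removed branch
# expected "value"/"metadata" keys holding bytes.
internal = VariantType().toInternal(v)
assert all(k in internal for k in ("value", "metadata"))
assert all(isinstance(internal[k], bytes) for k in ("value", "metadata"))

# Reconstructing a VariantVal from that dict, mirroring the removed branch:
roundtrip = VariantVal(internal["value"], internal["metadata"])
print(roundtrip.toJson())  # expected to print '{"c":1}'
```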
python/pyspark/sql/tests/test_python_datasource.py

Lines changed: 29 additions & 6 deletions
@@ -48,7 +48,7 @@
 )
 from pyspark.sql.functions import spark_partition_id
 from pyspark.sql.session import SparkSession
-from pyspark.sql.types import Row, StructType
+from pyspark.sql.types import Row, StructType, VariantVal
 from pyspark.testing import assertDataFrameEqual
 from pyspark.testing.sqlutils import (
     SPARK_HOME,
@@ -88,32 +88,55 @@ def read(self, partition):
     def test_data_source_register(self):
         class TestReader(DataSourceReader):
             def read(self, partition):
-                yield (0, 1)
+                yield (
+                    0,
+                    1,
+                    VariantVal.parseJson('{"c":1}'),
+                    {"v": VariantVal.parseJson('{"d":2}')},
+                    [VariantVal.parseJson('{"e":3}')],
+                    {"v1": VariantVal.parseJson('{"f":4}'), "v2": VariantVal.parseJson('{"g":5}')},
+                )
 
         class TestDataSource(DataSource):
             def schema(self):
-                return "a INT, b INT"
+                return (
+                    "a INT, b INT, c VARIANT, d STRUCT<v VARIANT>, e ARRAY<VARIANT>,"
+                    "f MAP<STRING, VARIANT>"
+                )
 
             def reader(self, schema):
                 return TestReader()
 
         self.spark.dataSource.register(TestDataSource)
         df = self.spark.read.format("TestDataSource").load()
-        assertDataFrameEqual(df, [Row(a=0, b=1)])
+        assertDataFrameEqual(
+            df.selectExpr(
+                "a", "b", "to_json(c) c", "to_json(d.v) d", "to_json(e[0]) e", "to_json(f['v2']) f"
+            ),
+            [Row(a=0, b=1, c='{"c":1}', d='{"d":2}', e='{"e":3}', f='{"g":5}')],
+        )
 
         class MyDataSource(TestDataSource):
             @classmethod
             def name(cls):
                 return "TestDataSource"
 
             def schema(self):
-                return "c INT, d INT"
+                return (
+                    "c INT, d INT, e VARIANT, f STRUCT<v VARIANT>, g ARRAY<VARIANT>,"
+                    "h MAP<STRING, VARIANT>"
+                )
 
         # Should be able to register the data source with the same name.
         self.spark.dataSource.register(MyDataSource)
 
         df = self.spark.read.format("TestDataSource").load()
-        assertDataFrameEqual(df, [Row(c=0, d=1)])
+        assertDataFrameEqual(
+            df.selectExpr(
+                "c", "d", "to_json(e) e", "to_json(f.v) f", "to_json(g[0]) g", "to_json(h['v2']) h"
+            ),
+            [Row(c=0, d=1, e='{"c":1}', f='{"d":2}', g='{"e":3}', h='{"g":5}')],
        )
 
     def register_data_source(
         self,
