Commit cec674a

allisonwang-db authored and asl3 committed
[SPARK-52698][PYTHON] Improve type hints for datasource module
### What changes were proposed in this pull request?

This PR improves the type annotations in python/pyspark/sql/datasource.py to use Python 3.10 typing syntax and built-in types instead of their typing module equivalents.

### Why are the changes needed?

Follows current Python typing recommendations and best practices.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Existing tests

### Was this patch authored or co-authored using generative AI tooling?

No

Closes apache#51392 from allisonwang-db/spark-52698-type-hint.

Authored-by: Allison Wang <allison.wang@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
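For context, the modernized style replaces `typing` aliases with built-in generics (PEP 585) and the `|` union operator (PEP 604). A minimal before/after sketch — the function is hypothetical, not taken from the patch:

```python
from typing import Dict, Tuple, Union


# Before: typing-module aliases (pre-3.10 style)
def load_old(options: Dict[str, str]) -> Union[Tuple[str, ...], None]:
    ...


# After: built-in generics and the | union operator (Python 3.10+)
def load_new(options: dict[str, str]) -> tuple[str, ...] | None:
    ...
```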
1 parent 38c97b0 commit cec674a

File tree

1 file changed (+10 −14 lines)

python/pyspark/sql/datasource.py

Lines changed: 10 additions & 14 deletions
@@ -19,15 +19,12 @@
 from dataclasses import dataclass
 from typing import (
     Any,
-    Dict,
     Iterable,
     Iterator,
     List,
     Optional,
     Sequence,
-    Tuple,
     Type,
-    Union,
     TYPE_CHECKING,
 )

@@ -49,7 +46,6 @@
     "DataSourceStreamWriter",
     "DataSourceRegistration",
     "InputPartition",
-    "SimpleDataSourceStreamReader",
     "WriterCommitMessage",
     "Filter",
     "EqualTo",
@@ -84,7 +80,7 @@ class DataSource(ABC):
     .. versionadded: 4.0.0
     """
 
-    def __init__(self, options: Dict[str, str]) -> None:
+    def __init__(self, options: dict[str, str]) -> None:
         """
         Initializes the data source with user-provided options.
@@ -114,7 +110,7 @@ def name(cls) -> str:
         """
         return cls.__name__
 
-    def schema(self) -> Union[StructType, str]:
+    def schema(self) -> StructType | str:
         """
         Returns the schema of the data source.
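As a usage illustration of the updated `__init__` and `schema` hints, a custom source might look like this minimal sketch (the class name and its `n` option are hypothetical):

```python
from pyspark.sql.datasource import DataSource
from pyspark.sql.types import StructType


class RangeDataSource(DataSource):
    """Hypothetical data source using the modernized annotations."""

    def __init__(self, options: dict[str, str]) -> None:  # built-in dict
        super().__init__(options)
        self.n = int(options.get("n", "10"))  # hypothetical option

    @classmethod
    def name(cls) -> str:
        return "range_example"

    def schema(self) -> StructType | str:  # PEP 604 union
        return "id INT"  # a DDL string is accepted as well as a StructType
```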
@@ -261,7 +257,7 @@ def streamReader(self, schema: StructType) -> "DataSourceStreamReader":
         )
 
 
-ColumnPath = Tuple[str, ...]
+ColumnPath = tuple[str, ...]
 """
 A tuple of strings representing a column reference.
@@ -407,7 +403,7 @@ class In(Filter):
     """
 
     attribute: ColumnPath
-    value: Tuple[Any, ...]
+    value: tuple[Any, ...]
 
 
 @dataclass(frozen=True)
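To make these annotations concrete: a `ColumnPath` names a (possibly nested) column part by part, and `In.value` is a tuple of candidate values. A small sketch with made-up data:

```python
from pyspark.sql.datasource import EqualTo, In

# A top-level column is a 1-tuple; ("a", "b") is field b nested in struct a.
eq = EqualTo(attribute=("id",), value=1)
member = In(attribute=("a", "b"), value=(1, 2, 3))  # value: tuple[Any, ...]
```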
@@ -631,7 +627,7 @@ def partitions(self) -> Sequence[InputPartition]:
         )
 
     @abstractmethod
-    def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator["RecordBatch"]]:
+    def read(self, partition: InputPartition) -> Iterator[tuple] | Iterator["RecordBatch"]:
         """
         Generates data for a given partition and returns an iterator of tuples or rows.
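A reader against the updated signature can simply yield plain tuples, as in this minimal sketch (the class and its two fixed partitions are hypothetical):

```python
from collections.abc import Iterator, Sequence

from pyspark.sql.datasource import DataSourceReader, InputPartition


class RangeReader(DataSourceReader):
    """Hypothetical reader that yields one tuple per row."""

    def __init__(self, n: int) -> None:
        self.n = n

    def partitions(self) -> Sequence[InputPartition]:
        return [InputPartition(0), InputPartition(1)]  # two splits

    def read(self, partition: InputPartition) -> Iterator[tuple]:
        # Yielding pyarrow.RecordBatch objects is the other accepted shape.
        start = partition.value * self.n
        for i in range(start, start + self.n):
            yield (i,)
```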
@@ -760,7 +756,7 @@ def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]:
         )
 
     @abstractmethod
-    def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator["RecordBatch"]]:
+    def read(self, partition: InputPartition) -> Iterator[tuple] | Iterator["RecordBatch"]:
         """
         Generates data for a given partition and returns an iterator of tuples or rows.
@@ -852,7 +848,7 @@ def initialOffset(self) -> dict:
             messageParameters={"feature": "initialOffset"},
         )
 
-    def read(self, start: dict) -> Tuple[Iterator[Tuple], dict]:
+    def read(self, start: dict) -> tuple[Iterator[tuple], dict]:
         """
         Read all available data from start offset and return the offset that next read attempt
         starts from.
@@ -864,7 +860,7 @@ def read(self, start: dict) -> Tuple[Iterator[Tuple], dict]:
 
         Returns
         -------
-        A :class:`Tuple` of an iterator of :class:`Tuple` and a dict\\s
+        A :class:`tuple` of an iterator of :class:`tuple` and a dict\\s
         The iterator contains all the available records after start offset.
         The dict is the end offset of this read attempt and the start of next read attempt.
         """
@@ -873,7 +869,7 @@ def read(self, start: dict) -> Tuple[Iterator[Tuple], dict]:
             messageParameters={"feature": "read"},
         )
 
-    def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[Tuple]:
+    def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[tuple]:
         """
         Read all available data from specific start offset and end offset.
         This is invoked during failure recovery to re-read a batch deterministically.
@@ -888,7 +884,7 @@ def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[Tuple]:
 
         Returns
         -------
-        iterator of :class:`Tuple`\\s
+        iterator of :class:`tuple`\\s
         All the records between start offset and end offset.
         """
         raise PySparkNotImplementedError(
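Continuing the hypothetical `CounterStreamReader` sketched above, a deterministic replay between two offsets could be as simple as this method fragment:

```python
    def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[tuple]:
        # Re-derive exactly the rows in [start, end) for failure recovery.
        return iter([(i,) for i in range(start["offset"], end["offset"])])
```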
