Commit cec674a

allisonwang-db authored and asl3 committed
[SPARK-52698][PYTHON] Improve type hints for datasource module
### What changes were proposed in this pull request?

This PR improves the type annotations in python/pyspark/sql/datasource.py to use Python 3.10 typing syntax and built-in types instead of their typing module equivalents.

### Why are the changes needed?

Follows current Python typing recommendations and best practices.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Existing tests

### Was this patch authored or co-authored using generative AI tooling?

No

Closes apache#51392 from allisonwang-db/spark-52698-type-hint.

Authored-by: Allison Wang <allison.wang@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
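For context, the modernized style replaces `typing` aliases with built-in generics (PEP 585) and the `|` union operator (PEP 604). A minimal before/after sketch — the function is hypothetical, not taken from the patch:

```python
from typing import Dict, Tuple, Union


# Before: typing-module aliases (pre-3.10 style)
def load_old(options: Dict[str, str]) -> Union[Tuple[str, ...], None]:
    ...


# After: built-in generics and the | union operator (Python 3.10+)
def load_new(options: dict[str, str]) -> tuple[str, ...] | None:
    ...
```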
1 parent 38c97b0 commit cec674a

File tree

1 file changed (+10 −14 lines)

python/pyspark/sql/datasource.py

Lines changed: 10 additions & 14 deletions
@@ -19,15 +19,12 @@
 from dataclasses import dataclass
 from typing import (
     Any,
-    Dict,
     Iterable,
     Iterator,
     List,
     Optional,
     Sequence,
-    Tuple,
     Type,
-    Union,
     TYPE_CHECKING,
 )

@@ -49,7 +46,6 @@
     "DataSourceStreamWriter",
     "DataSourceRegistration",
     "InputPartition",
-    "SimpleDataSourceStreamReader",
     "WriterCommitMessage",
     "Filter",
     "EqualTo",
@@ -84,7 +80,7 @@ class DataSource(ABC):
     .. versionadded: 4.0.0
     """
 
-    def __init__(self, options: Dict[str, str]) -> None:
+    def __init__(self, options: dict[str, str]) -> None:
         """
         Initializes the data source with user-provided options.
@@ -114,7 +110,7 @@ def name(cls) -> str:
         """
         return cls.__name__
 
-    def schema(self) -> Union[StructType, str]:
+    def schema(self) -> StructType | str:
         """
         Returns the schema of the data source.
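As a usage illustration of the updated `__init__` and `schema` hints, a custom source might look like this minimal sketch (the class name and its `n` option are hypothetical):

```python
from pyspark.sql.datasource import DataSource
from pyspark.sql.types import StructType


class RangeDataSource(DataSource):
    """Hypothetical data source using the modernized annotations."""

    def __init__(self, options: dict[str, str]) -> None:  # built-in dict
        super().__init__(options)
        self.n = int(options.get("n", "10"))  # hypothetical option

    @classmethod
    def name(cls) -> str:
        return "range_example"

    def schema(self) -> StructType | str:  # PEP 604 union
        return "id INT"  # a DDL string is accepted as well as a StructType
```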
@@ -261,7 +257,7 @@ def streamReader(self, schema: StructType) -> "DataSourceStreamReader":
         )
 
 
-ColumnPath = Tuple[str, ...]
+ColumnPath = tuple[str, ...]
 """
 A tuple of strings representing a column reference.
@@ -407,7 +403,7 @@ class In(Filter):
     """
 
     attribute: ColumnPath
-    value: Tuple[Any, ...]
+    value: tuple[Any, ...]
 
 
 @dataclass(frozen=True)
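To make these annotations concrete: a `ColumnPath` names a (possibly nested) column part by part, and `In.value` is a tuple of candidate values. A small sketch with made-up data:

```python
from pyspark.sql.datasource import EqualTo, In

# A top-level column is a 1-tuple; ("a", "b") is field b nested in struct a.
eq = EqualTo(attribute=("id",), value=1)
member = In(attribute=("a", "b"), value=(1, 2, 3))  # value: tuple[Any, ...]
```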
@@ -631,7 +627,7 @@ def partitions(self) -> Sequence[InputPartition]:
         )
 
     @abstractmethod
-    def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator["RecordBatch"]]:
+    def read(self, partition: InputPartition) -> Iterator[tuple] | Iterator["RecordBatch"]:
         """
         Generates data for a given partition and returns an iterator of tuples or rows.
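A reader against the updated signature can simply yield plain tuples, as in this minimal sketch (the class and its two fixed partitions are hypothetical):

```python
from collections.abc import Iterator, Sequence

from pyspark.sql.datasource import DataSourceReader, InputPartition


class RangeReader(DataSourceReader):
    """Hypothetical reader that yields one tuple per row."""

    def __init__(self, n: int) -> None:
        self.n = n

    def partitions(self) -> Sequence[InputPartition]:
        return [InputPartition(0), InputPartition(1)]  # two splits

    def read(self, partition: InputPartition) -> Iterator[tuple]:
        # Yielding pyarrow.RecordBatch objects is the other accepted shape.
        start = partition.value * self.n
        for i in range(start, start + self.n):
            yield (i,)
```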
@@ -760,7 +756,7 @@ def partitions(self, start: dict, end: dict) -> Sequence[InputPartition]:
         )
 
     @abstractmethod
-    def read(self, partition: InputPartition) -> Union[Iterator[Tuple], Iterator["RecordBatch"]]:
+    def read(self, partition: InputPartition) -> Iterator[tuple] | Iterator["RecordBatch"]:
         """
         Generates data for a given partition and returns an iterator of tuples or rows.
@@ -852,7 +848,7 @@ def initialOffset(self) -> dict:
             messageParameters={"feature": "initialOffset"},
         )
 
-    def read(self, start: dict) -> Tuple[Iterator[Tuple], dict]:
+    def read(self, start: dict) -> tuple[Iterator[tuple], dict]:
         """
         Read all available data from start offset and return the offset that next read attempt
         starts from.
@@ -864,7 +860,7 @@ def read(self, start: dict) -> Tuple[Iterator[Tuple], dict]:
 
         Returns
         -------
-        A :class:`Tuple` of an iterator of :class:`Tuple` and a dict\\s
+        A :class:`tuple` of an iterator of :class:`tuple` and a dict\\s
         The iterator contains all the available records after start offset.
         The dict is the end offset of this read attempt and the start of next read attempt.
         """
@@ -873,7 +869,7 @@ def read(self, start: dict) -> Tuple[Iterator[Tuple], dict]:
             messageParameters={"feature": "read"},
         )
 
-    def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[Tuple]:
+    def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[tuple]:
         """
         Read all available data from specific start offset and end offset.
         This is invoked during failure recovery to re-read a batch deterministically.
@@ -888,7 +884,7 @@ def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[Tuple]:
 
         Returns
         -------
-        iterator of :class:`Tuple`\\s
+        iterator of :class:`tuple`\\s
         All the records between start offset and end offset.
         """
         raise PySparkNotImplementedError(
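Continuing the hypothetical `CounterStreamReader` sketched above, a deterministic replay between two offsets could be as simple as this method fragment:

```python
    def readBetweenOffsets(self, start: dict, end: dict) -> Iterator[tuple]:
        # Re-derive exactly the rows in [start, end) for failure recovery.
        return iter([(i,) for i in range(start["offset"], end["offset"])])
```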
