From f33778c9030dda546584d1f7c287a5d91383ca38 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 12 Jun 2025 16:23:30 +0200 Subject: [PATCH 1/4] New third-party IO engines --- pandas/io/common.py | 149 +++++++++++++++++++++++++++++++++++++++++++ pandas/io/iceberg.py | 8 +++ 2 files changed, 157 insertions(+) diff --git a/pandas/io/common.py b/pandas/io/common.py index 1a9e6b472463d..cd1de6fe875d6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -51,6 +51,8 @@ import warnings import zipfile +import pkg_resources + from pandas._typing import ( BaseBuffer, ReadCsvBuffer, @@ -90,6 +92,10 @@ from pandas import MultiIndex +# registry of I/O engines. It is populated the first time a non-core +# pandas engine is used +_io_engines = None + @dataclasses.dataclass class IOArgs: @@ -1282,3 +1288,146 @@ def dedup_names( counts[col] = cur_count + 1 return names + + +def _engine_func(format_name: str, engine_name: str, is_writer: bool): + """ + Return the engine function for a given format and operation. + + pandas I/O engines can be registered via entry points. The first time this + function is called it will register all the entry points of the "pandas.io_engine" + group and cache them in the global `_io_engines` variable. + + Engines are implemented as classes with the `read_` and `to_` + methods (classmethods) for the formats they wish to provide. This function will + return the method from the engine and format being requested. + + Parameters + ---------- + format_name : str + The format such as 'csv', 'parquet', 'json', 'html', etc. + engine_name : str + The engine name provided by the user in `engine=`. + is_writer : bool + `True` to return the `to_` function, `False` to return the + `read_` one. + + Examples + -------- + An engine is implemented with a class like: + + >>> class DummyEngine: + ... @classmethod + ... def read_csv(cls, filepath_or_buffer, **kwargs): + ... # the engine signature must match the pandas method signature + ... 
return pd.DataFrame() + + It must be registered as an entry point with the engine name: + + ``` + [project.entry-points."pandas.io_engine"] + dummy = "pandas:io.dummy.DummyEngine" + + ``` + + Then the `read_csv` method of the engine can be retrieved with: + + >>> func = _engine_func(format_name="csv", engine_name="dummy", is_writer=False) + + This is used internally to dispatch the next pandas call to the engine caller: + + >>> df = read_csv("myfile.csv", engine="dummy") + """ + global _io_engines + + if _io_engines is None: + _io_engines = {} + for entry_point in pkg_resources.iter_entry_points(group="pandas.io_engine"): + _io_engines[entry_point.name] = entry_point.load() + + try: + engine_class = _io_engines[engine_name] + except KeyError as err: + raise ValueError( + f"'{engine_name}' is not a known engine. Some engines are only available " + "after installing the package that provides them." + ) from err + + func_name = f"to_{format_name}" if is_writer else f"read_{format_name}" + try: + engine_method = getattr(engine_class, func_name) + except AttributeError as err: + raise ValueError( + f"The engine '{engine_name}' does not provide a '{func_name}' function" + ) from err + else: + return engine_method + + +def _extract_io_function_info(func_name): + """ + Return the format and if it's a reader or writer from a function name like read_csv. + """ + op_type, format_name = func_name.split("_", maxsplit=1) + if op_type == "read": + is_writer = False + elif op_type == "to": + is_writer = True + else: + raise ValueError( + "Unable to extract info from the function name '{func_name}'. " + "The expected format is `read_ or `to_`." + ) + + return format_name, is_writer + + +def allow_third_party_engines(skip_engines: list[str] | None = None): + """ + Decorator to avoid boilerplate code when allowing readers and writers to use + third-party engines. 
+ + The decorator will introspect the function to know which format should be obtained, + and to know if it's a reader or a writer. Then it will check if the engine has been + registered, and if it has, it will dispatch the execution to the engine with the + arguments provided by the user. + + Parameters + ---------- + skip_engines : list of str, optional + For engines that are implemented in pandas, we want to skip them for this engine + dispatching system. They should be specified in this parameter. + + Examples + -------- + The decorator works both with the `skip_engines` parameter, or without: + + >>> class DataFrame: + ... @allow_third_party_engines(["python", "c", "pyarrow"]) + ... def read_csv(filepath_or_buffer, **kwargs): + ... pass + ... + ... @allow_third_party_engines + ... def read_sas(filepath_or_buffer, **kwargs): + ... pass + """ + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if "engine" in kwargs and kwargs["engine"] not in skip_engines: + format_name, is_writer = _extract_io_function_info(func.__name__) + engine_func = _engine_func( + format_name=format_name, + engine_name=kwargs.pop("engine"), + is_writer=is_writer, + ) + return engine_func(*args, **kwargs) + else: + return func(*args, **kwargs) + + return wrapper + + if callable(skip_engines): + return decorator(skip_engines) + return decorator diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py index dcb675271031e..393817a400a43 100644 --- a/pandas/io/iceberg.py +++ b/pandas/io/iceberg.py @@ -6,7 +6,10 @@ from pandas import DataFrame +from pandas.io.common import allow_third_party_engines + +@allow_third_party_engines() def read_iceberg( table_identifier: str, catalog_name: str | None = None, @@ -18,6 +21,7 @@ def read_iceberg( snapshot_id: int | None = None, limit: int | None = None, scan_properties: dict[str, Any] | None = None, + engine: str | None = None, ) -> DataFrame: """ Read an Apache Iceberg table into a pandas DataFrame. 
@@ -52,6 +56,10 @@ def read_iceberg( scan_properties : dict of {str: obj}, optional Additional Table properties as a dictionary of string key value pairs to use for this scan. + engine : str, optional + The engine to use. Engines can be installed via third-party packages. For an + updated list of existing pandas I/O engines check the I/O engines section of + our Ecosystem page. Returns ------- From 555459b598bd4bb92a4b8efc154d619265096781 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 12 Jun 2025 22:04:01 +0200 Subject: [PATCH 2/4] Add tests and fix bugs --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 11 +++- pandas/io/common.py | 102 ++++++++++++++--------------- pandas/io/iceberg.py | 2 +- pandas/tests/io/test_io_engines.py | 48 ++++++++++++++ 5 files changed, 111 insertions(+), 53 deletions(-) create mode 100644 pandas/tests/io/test_io_engines.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 03a386708323d..1130480d3c7b5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -88,6 +88,7 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) +- Third-party packages can now register engines that can be used in pandas I/O operations :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` (:issue:`61584`) .. --------------------------------------------------------------------------- .. 
_whatsnew_300.notable_bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8053c17437c5e..e9a088aa99399 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -188,7 +188,10 @@ nargsort, ) -from pandas.io.common import get_handle +from pandas.io.common import ( + allow_third_party_engines, + get_handle, +) from pandas.io.formats import ( console, format as fmt, @@ -3547,6 +3550,7 @@ def to_xml( return xml_formatter.write_output() + @allow_third_party_engines def to_iceberg( self, table_identifier: str, @@ -3556,6 +3560,7 @@ def to_iceberg( location: str | None = None, append: bool = False, snapshot_properties: dict[str, str] | None = None, + engine: str | None = None, ) -> None: """ Write a DataFrame to an Apache Iceberg table. @@ -3580,6 +3585,10 @@ def to_iceberg( If ``True``, append data to the table, instead of replacing the content. snapshot_properties : dict of {str: str}, optional Custom properties to be added to the snapshot summary + engine : str, optional + The engine to use. Engines can be installed via third-party packages. For an + updated list of existing pandas I/O engines check the I/O engines section of + our Ecosystem page. See Also -------- diff --git a/pandas/io/common.py b/pandas/io/common.py index cd1de6fe875d6..d1f872ed88b7d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,6 +9,7 @@ import codecs from collections import defaultdict from collections.abc import ( + Callable, Hashable, Mapping, Sequence, @@ -16,6 +17,7 @@ import dataclasses import functools import gzip +from importlib.metadata import entry_points from io import ( BufferedIOBase, BytesIO, @@ -51,8 +53,6 @@ import warnings import zipfile -import pkg_resources - from pandas._typing import ( BaseBuffer, ReadCsvBuffer, @@ -1290,9 +1290,9 @@ def dedup_names( return names -def _engine_func(format_name: str, engine_name: str, is_writer: bool): +def _get_io_engine(name: str): """ - Return the engine function for a given format and operation. 
+ Return an I/O engine by its name. pandas I/O engines can be registered via entry points. The first time this function is called it will register all the entry points of the "pandas.io_engine" @@ -1304,13 +1304,8 @@ def _engine_func(format_name: str, engine_name: str, is_writer: bool): Parameters ---------- - format_name : str - The format such as 'csv', 'parquet', 'json', 'html', etc. - engine_name : str + name : str The engine name provided by the user in `engine=`. - is_writer : bool - `True` to return the `to_` function, `False` to return the - `read_` one. Examples -------- @@ -1330,59 +1325,57 @@ def _engine_func(format_name: str, engine_name: str, is_writer: bool): ``` - Then the `read_csv` method of the engine can be retrieved with: + Then the `read_csv` method of the engine can be used with: - >>> func = _engine_func(format_name="csv", engine_name="dummy", is_writer=False) + >>> _get_io_engine(engine_name="dummy").read_csv("myfile.csv") # doctest: +SKIP This is used internally to dispatch the next pandas call to the engine caller: - >>> df = read_csv("myfile.csv", engine="dummy") + >>> df = read_csv("myfile.csv", engine="dummy") # doctest: +SKIP """ global _io_engines if _io_engines is None: _io_engines = {} - for entry_point in pkg_resources.iter_entry_points(group="pandas.io_engine"): - _io_engines[entry_point.name] = entry_point.load() + for entry_point in entry_points().select(group="pandas.io_engine"): + package_name = entry_point.dist.metadata["Name"] + if entry_point.name in _io_engines: + _io_engines[entry_point.name]._other_providers.append(package_name) + else: + _io_engines[entry_point.name] = entry_point.load() + _io_engines[entry_point.name]._provider_name = package_name + _io_engines[entry_point.name]._other_providers = [] try: - engine_class = _io_engines[engine_name] + engine = _io_engines[name] except KeyError as err: raise ValueError( - f"'{engine_name}' is not a known engine. 
Some engines are only available " + f"'{name}' is not a known engine. Some engines are only available " "after installing the package that provides them." ) from err - func_name = f"to_{format_name}" if is_writer else f"read_{format_name}" - try: - engine_method = getattr(engine_class, func_name) - except AttributeError as err: - raise ValueError( - f"The engine '{engine_name}' does not provide a '{func_name}' function" - ) from err - else: - return engine_method - - -def _extract_io_function_info(func_name): - """ - Return the format and if it's a reader or writer from a function name like read_csv. - """ - op_type, format_name = func_name.split("_", maxsplit=1) - if op_type == "read": - is_writer = False - elif op_type == "to": - is_writer = True - else: - raise ValueError( - "Unable to extract info from the function name '{func_name}'. " - "The expected format is `read_ or `to_`." + if engine._other_providers: + msg = ( + f"The engine '{name}' has been registered by the package " + f"'{engine._provider_name}' and will be used. " ) + if len(engine._other_providers): + msg += ( + "The package '{engine._other_providers}' also tried to register " + "the engine, but it couldn't because it was already registered." + ) + else: + msg += ( + "Other packages that tried to register the engine, but they couldn't " + "because it was already registered are: " + f"{str(engine._other_providers)[1:-1]}." + ) + warnings.warn(RuntimeWarning, msg, stacklevel=find_stack_level()) - return format_name, is_writer + return engine -def allow_third_party_engines(skip_engines: list[str] | None = None): +def allow_third_party_engines(skip_engines: list[str] | Callable | None = None): """ Decorator to avoid boilerplate code when allowing readers and writers to use third-party engines. 
@@ -1415,14 +1408,21 @@ def allow_third_party_engines(skip_engines: list[str] | None = None): def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): - if "engine" in kwargs and kwargs["engine"] not in skip_engines: - format_name, is_writer = _extract_io_function_info(func.__name__) - engine_func = _engine_func( - format_name=format_name, - engine_name=kwargs.pop("engine"), - is_writer=is_writer, - ) - return engine_func(*args, **kwargs) + if callable(skip_engines): + skip_engine = False + else: + skip_engine = kwargs["engine"] in skip_engines + + if "engine" in kwargs and not skip_engine: + engine_name = kwargs.pop("engine") + engine = _get_io_engine(engine_name) + try: + return getattr(engine, func.__name__)(*args, **kwargs) + except AttributeError as err: + raise ValueError( + f"The engine '{engine_name}' does not provide a " + f"'{func.__name__}' function" + ) from err else: return func(*args, **kwargs) diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py index 393817a400a43..c778a95809f97 100644 --- a/pandas/io/iceberg.py +++ b/pandas/io/iceberg.py @@ -9,7 +9,7 @@ from pandas.io.common import allow_third_party_engines -@allow_third_party_engines() +@allow_third_party_engines def read_iceberg( table_identifier: str, catalog_name: str | None = None, diff --git a/pandas/tests/io/test_io_engines.py b/pandas/tests/io/test_io_engines.py new file mode 100644 index 0000000000000..136b46a3fd972 --- /dev/null +++ b/pandas/tests/io/test_io_engines.py @@ -0,0 +1,48 @@ +import pytest + +from pandas.io import common + + +@pytest.fixture +def patch_engine(monkeypatch): + class MockIoEngine: + @classmethod + def read_foo(cls, fname): + return "third-party" + + monkeypatch.setattr(common, "_get_io_engine", lambda name: MockIoEngine) + + +class TestIoEngines: + def test_decorator_with_no_engine(self, patch_engine): + @common.allow_third_party_engines + def read_foo(fname, engine=None): + return "default" + + result = read_foo("myfile.foo") + assert result 
== "default" + + def test_decorator_with_skipped_engine(self, patch_engine): + @common.allow_third_party_engines(skip_engines=["c"]) + def read_foo(fname, engine=None): + return "default" + + result = read_foo("myfile.foo", engine="c") + assert result == "default" + + def test_decorator_with_third_party_engine(self, patch_engine): + @common.allow_third_party_engines + def read_foo(fname, engine=None): + return "default" + + result = read_foo("myfile.foo", engine="third-party") + assert result == "third-party" + + def test_decorator_with_third_party_engine_but_no_method(self, patch_engine): + @common.allow_third_party_engines + def read_bar(fname, engine=None): + return "default" + + msg = "'third-party' does not provide a 'read_bar'" + with pytest.raises(ValueError, match=msg): + read_bar("myfile.foo", engine="third-party") From 1ca77c1cce8e67364f5b9f43b3321cd66723354f Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 12 Jun 2025 23:54:54 +0200 Subject: [PATCH 3/4] Finishing docs and tests --- doc/source/development/extending.rst | 63 ++++++++++++++++++++++++++ pandas/io/common.py | 20 ++++----- pandas/tests/io/test_io_engines.py | 67 +++++++++++++++++++++++++--- web/pandas/community/ecosystem.md | 12 +++++ 4 files changed, 146 insertions(+), 16 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index e67829b8805eb..a1e8e520bacae 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -489,6 +489,69 @@ registers the default "matplotlib" backend as follows. More information on how to implement a third-party plotting backend can be found at https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1. +.. _extending.plotting-backends: + +IO engines +----------- + +pandas provides several IO connectors such as :func:`read_csv` or :meth:`to_parquet`, and many +of those support multiple engines. 
For example, :func:`read_csv` supports the ``python``, ``c`` +and ``pyarrow`` engines, each with its advantages and disadvantages, making each more appropriate +for certain use cases. + +Third-party package developers can implement engines for any of the pandas readers and writers. +When a ``pandas.read_*`` function or ``DataFrame.to_*`` method is called with an ``engine=""`` +that is not known to pandas, pandas will look into the entry points registered in the group +``pandas.io_engine`` by the packages in the environment, and will call the corresponding method. + +An engine is a simple Python class which implements one or more of the pandas readers and writers +as class methods: + +.. code-block:: python + + class EmptyDataEngine: + @classmethod + def read_json(cls, path_or_buf=None, **kwargs): + return pd.DataFrame() + + @classmethod + def to_json(cls, path_or_buf=None, **kwargs): + with open(path_or_buf, "w") as f: + f.write("") + + @classmethod + def read_clipboard(cls, sep='\\s+', dtype_backend=None, **kwargs): + return pd.DataFrame() + +A single engine can support multiple readers and writers. When possible, it is a good practice for +an engine to provide both a reader and a writer for the supported formats. But it is possible to +provide just one of them. + +The package implementing the engine needs to create an entry point for pandas to be able to discover +it. This is done in ``pyproject.toml``: + +.. code-block:: toml + + [project.entry-points."pandas.io_engine"] + empty = "empty_data:EmptyDataEngine" + +The first line should always be the same, creating the entry point in the ``pandas.io_engine`` group. +In the second line, ``empty`` is the name of the engine, and ``empty_data:EmptyDataEngine`` is where +to find the engine class in the package (``empty_data`` is the module name in this case). + +If a user has the example package installed, then it would be possible to use: + +..
code-block:: python + + pd.read_json("myfile.json", engine="empty") + +When pandas detects that no ``empty`` engine exists for the ``read_json`` reader in pandas, it will +look at the entry points, find the ``EmptyDataEngine`` engine, and call the ``read_json`` +method on it with the arguments provided by the user (except the ``engine`` parameter). + +To avoid conflicts in the names of engines, we keep an "IO engines" section in our +`Ecosystem page <https://pandas.pydata.org/community/ecosystem.html#io-engines>`_. + .. _extending.pandas_priority: Arithmetic with 3rd party types diff --git a/pandas/io/common.py b/pandas/io/common.py index d1f872ed88b7d..e3338307c25e2 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1340,11 +1340,10 @@ def _get_io_engine(name: str): for entry_point in entry_points().select(group="pandas.io_engine"): package_name = entry_point.dist.metadata["Name"] if entry_point.name in _io_engines: - _io_engines[entry_point.name]._other_providers.append(package_name) + _io_engines[entry_point.name]._packages.append(package_name) else: _io_engines[entry_point.name] = entry_point.load() - _io_engines[entry_point.name]._provider_name = package_name - _io_engines[entry_point.name]._other_providers = [] + _io_engines[entry_point.name]._packages = [package_name] try: engine = _io_engines[name] @@ -1354,23 +1353,22 @@ def _get_io_engine(name: str): "after installing the package that provides them." ) from err - if engine._other_providers: + if len(engine._packages) > 1: msg = ( f"The engine '{name}' has been registered by the package " - f"'{engine._provider_name}' and will be used. " + f"'{engine._packages[0]}' and will be used. " ) - if len(engine._other_providers): + if len(engine._packages) == 2: msg += ( - "The package '{engine._other_providers}' also tried to register " + f"The package '{engine._packages[1]}' also tried to register " "the engine, but it couldn't because it was already registered."
) else: msg += ( - "Other packages that tried to register the engine, but they couldn't " - "because it was already registered are: " - f"{str(engine._other_providers)[1:-1]}." + f"Packages {str(engine._packages[1:])[1:-1]} also tried to register " + "the engine, but they couldn't because it was already registered." ) - warnings.warn(RuntimeWarning, msg, stacklevel=find_stack_level()) + warnings.warn(msg, RuntimeWarning, stacklevel=find_stack_level()) return engine diff --git a/pandas/tests/io/test_io_engines.py b/pandas/tests/io/test_io_engines.py index 136b46a3fd972..ccf4bff6533e8 100644 --- a/pandas/tests/io/test_io_engines.py +++ b/pandas/tests/io/test_io_engines.py @@ -1,16 +1,57 @@ +from types import SimpleNamespace + import pytest +import pandas._testing as tm + from pandas.io import common +class _MockIoEngine: + @classmethod + def read_foo(cls, fname): + return "third-party" + + @pytest.fixture def patch_engine(monkeypatch): - class MockIoEngine: - @classmethod - def read_foo(cls, fname): - return "third-party" + monkeypatch.setattr(common, "_get_io_engine", lambda name: _MockIoEngine) + + +@pytest.fixture +def patch_entry_points(monkeypatch): + class MockEntryPoint: + name = "myengine" + dist = SimpleNamespace(metadata={"Name": "mypackage"}) + + @staticmethod + def load(): + return _MockIoEngine - monkeypatch.setattr(common, "_get_io_engine", lambda name: MockIoEngine) + class MockDuplicate1: + name = "duplicate" + dist = SimpleNamespace(metadata={"Name": "package1"}) + + @staticmethod + def load(): + return SimpleNamespace(read_foo=lambda fname: "dup1") + + class MockDuplicate2: + name = "duplicate" + dist = SimpleNamespace(metadata={"Name": "package2"}) + + @staticmethod + def load(): + return SimpleNamespace(read_foo=lambda fname: "dup1") + + monkeypatch.setattr(common, "_io_engines", None) + monkeypatch.setattr( + common, + "entry_points", + lambda: SimpleNamespace( + select=lambda group: [MockEntryPoint, MockDuplicate1, MockDuplicate2] + ), + )
class TestIoEngines: @@ -46,3 +87,19 @@ def read_bar(fname, engine=None): msg = "'third-party' does not provide a 'read_bar'" with pytest.raises(ValueError, match=msg): read_bar("myfile.foo", engine="third-party") + + def test_correct_io_engine(self, patch_entry_points): + result = common._get_io_engine("myengine") + assert result is _MockIoEngine + + def test_unknown_io_engine(self, patch_entry_points): + with pytest.raises(ValueError, match="'unknown' is not a known engine"): + common._get_io_engine("unknown") + + def test_duplicate_engine(self, patch_entry_points): + with tm.assert_produces_warning( + RuntimeWarning, + match="'duplicate' has been registered by the package 'package1'", + ): + result = common._get_io_engine("duplicate") + assert hasattr(result, "read_foo") diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 1ebd4f3d3f1dc..341c668cc60df 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -712,6 +712,18 @@ authors to coordinate on the namespace. | [staircase](https://www.staircase.dev/) | `sc` | `Series`, `DataFrame` | | [woodwork](https://github.com/alteryx/woodwork) | `slice` | `Series`, `DataFrame` | +## IO engines + +Table with the third-party [IO engines](https://pandas.pydata.org/docs/development/extending.html#io-engines) +available to `read_*` functions and `DataFrame.to_*` methods. + + | Engine name | Library | Supported formats | + | ----------------|------------------------------------------------------ | ------------------------------- | + | | | | + +IO engines can be used by specifying the engine when calling a reader or writer +(e.g. `pd.read_csv("myfile.csv", engine="myengine")`). 
+ ## Development tools ### [pandas-stubs](https://github.com/VirtusLab/pandas-stubs) From d38810138b41748fc639634962ab4f2b22bddfe6 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 13 Jun 2025 00:31:42 +0200 Subject: [PATCH 4/4] typo in doc label and typing issues --- doc/source/development/extending.rst | 2 +- pandas/io/common.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index a1e8e520bacae..cab0428d650b6 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -489,7 +489,7 @@ registers the default "matplotlib" backend as follows. More information on how to implement a third-party plotting backend can be found at https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1. -.. _extending.plotting-backends: +.. _extending.io-engines: IO engines ----------- diff --git a/pandas/io/common.py b/pandas/io/common.py index e3338307c25e2..3ec9e094fb118 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -94,7 +94,7 @@ # registry of I/O engines. It is populated the first time a non-core # pandas engine is used -_io_engines = None +_io_engines: dict[str, Any] | None = None @dataclasses.dataclass @@ -1290,7 +1290,7 @@ def dedup_names( return names -def _get_io_engine(name: str): +def _get_io_engine(name: str) -> Any: """ Return an I/O engine by its name. 
@@ -1338,7 +1338,10 @@ def _get_io_engine(name: str): if _io_engines is None: _io_engines = {} for entry_point in entry_points().select(group="pandas.io_engine"): - package_name = entry_point.dist.metadata["Name"] + if entry_point.dist: + package_name = entry_point.dist.metadata["Name"] + else: + package_name = None if entry_point.name in _io_engines: _io_engines[entry_point.name]._packages.append(package_name) else: @@ -1373,7 +1376,9 @@ def _get_io_engine(name: str): return engine -def allow_third_party_engines(skip_engines: list[str] | Callable | None = None): +def allow_third_party_engines( + skip_engines: list[str] | Callable | None = None, +) -> Callable: """ Decorator to avoid boilerplate code when allowing readers and writers to use third-party engines. @@ -1403,10 +1408,10 @@ def allow_third_party_engines(skip_engines: list[str] | Callable | None = None): ... pass """ - def decorator(func): + def decorator(func: Callable) -> Callable: @functools.wraps(func) - def wrapper(*args, **kwargs): - if callable(skip_engines): + def wrapper(*args: Any, **kwargs: Any) -> Any: + if callable(skip_engines) or skip_engines is None: skip_engine = False else: skip_engine = kwargs["engine"] in skip_engines