Generalize handling of chunked array types #7019
Changes from 158 commits
@@ -39,6 +39,7 @@ module = [
     "cf_units.*",
     "cfgrib.*",
     "cftime.*",
+    "cubed.*",
     "cupy.*",
     "fsspec.*",
     "h5netcdf.*",
@@ -6,7 +6,16 @@
 from glob import glob
 from io import BytesIO
 from numbers import Number
-from typing import TYPE_CHECKING, Any, Callable, Final, Literal, Union, cast, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Final,
+    Literal,
+    Union,
+    cast,
+    overload,
+)

 import numpy as np

@@ -20,9 +29,11 @@
     _nested_combine,
     combine_by_coords,
 )
+from xarray.core.daskmanager import DaskManager
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset, _get_chunk, _maybe_chunk
 from xarray.core.indexes import Index
+from xarray.core.parallelcompat import guess_chunkmanager
 from xarray.core.utils import is_remote_uri

 if TYPE_CHECKING:
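The two new imports are the heart of this change: `guess_chunkmanager` turns a user-supplied name (or `None`) into a registered chunk manager, and `DaskManager` is the dask-backed implementation of that interface. A minimal sketch of the expected resolution behaviour on this branch, assuming dask is installed (the registry itself lives in `xarray.core.parallelcompat` and is not part of this hunk):

```python
from xarray.core.daskmanager import DaskManager
from xarray.core.parallelcompat import guess_chunkmanager

# With no name given, the default chunk manager is dask when it is installed.
default_manager = guess_chunkmanager(None)
assert isinstance(default_manager, DaskManager)

# An explicit name selects the matching registered manager; "cubed" would
# resolve the same way if a cubed chunk manager has been registered.
assert isinstance(guess_chunkmanager("dask"), DaskManager)
```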
@@ -38,6 +49,7 @@
     CompatOptions,
     JoinOptions,
     NestedSequence,
+    T_Chunks,
 )

 T_NetcdfEngine = Literal["netcdf4", "scipy", "h5netcdf"]
@@ -48,7 +60,6 @@
     str,  # no nice typing support for custom backends
     None,
 ]
-T_Chunks = Union[int, dict[Any, Any], Literal["auto"], None]
 T_NetcdfTypes = Literal[
     "NETCDF4", "NETCDF4_CLASSIC", "NETCDF3_64BIT", "NETCDF3_CLASSIC"
 ]
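The `T_Chunks` alias deleted here is not gone; it is now imported under `TYPE_CHECKING` (previous hunk) alongside the other aliases from `xarray.core.types`, so the accepted `chunks` values are unchanged. For illustration only (the file name is hypothetical):

```python
import xarray as xr

# Everything T_Chunks = Union[int, dict[Any, Any], Literal["auto"], None] allows:
xr.open_dataset("data.nc", chunks=100)           # same chunk length on every dimension
xr.open_dataset("data.nc", chunks={"time": 10})  # per-dimension chunk sizes
xr.open_dataset("data.nc", chunks="auto")        # let the chunk manager pick sizes
xr.open_dataset("data.nc", chunks=None)          # no chunking; load as plain numpy
```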
@@ -297,17 +308,27 @@ def _chunk_ds(
     chunks,
     overwrite_encoded_chunks,
     inline_array,
+    chunked_array_type,
+    from_array_kwargs,
     **extra_tokens,
 ):
-    from dask.base import tokenize
+    chunkmanager = guess_chunkmanager(chunked_array_type)
+
+    # TODO refactor to move this dask-specific logic inside the DaskManager class
+    if isinstance(chunkmanager, DaskManager):
+        from dask.base import tokenize

-    mtime = _get_mtime(filename_or_obj)
-    token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens)
-    name_prefix = f"open_dataset-{token}"
+        mtime = _get_mtime(filename_or_obj)
+        token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens)
+        name_prefix = "open_dataset-"
+    else:
+        # not used
+        token = (None,)
+        name_prefix = None

     variables = {}
     for name, var in backend_ds.variables.items():
-        var_chunks = _get_chunk(var, chunks)
+        var_chunks = _get_chunk(var, chunks, chunkmanager)
         variables[name] = _maybe_chunk(
             name,
             var,
@@ -316,6 +337,8 @@ def _chunk_ds(
             name_prefix=name_prefix,
             token=token,
             inline_array=inline_array,
+            chunked_array_type=chunkmanager,
+            from_array_kwargs=from_array_kwargs.copy(),
         )
     return backend_ds._replace(variables)

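Conceptually, the loop above now hands the actual array wrapping to whichever chunk manager was resolved instead of calling dask directly. A deliberately simplified, hypothetical rendering of that flow (the real code also threads encoding, dask tokens and the `_get_chunk`/`_maybe_chunk` helpers through):

```python
def chunk_all_variables(backend_ds, chunks, chunkmanager, from_array_kwargs):
    """Hypothetical simplification of _chunk_ds, for illustration only."""
    variables = {}
    for name, var in backend_ds.variables.items():
        # ChunkManagerEntrypoint.from_array is the generic stand-in for
        # dask.array.from_array; user-supplied kwargs are forwarded untouched.
        data = chunkmanager.from_array(var.data, chunks, **from_array_kwargs)
        variables[name] = var.copy(data=data)
    return backend_ds._replace(variables)
```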
@@ -328,6 +351,8 @@ def _dataset_from_backend_dataset(
     cache,
     overwrite_encoded_chunks,
     inline_array,
+    chunked_array_type,
+    from_array_kwargs,
     **extra_tokens,
 ):
     if not isinstance(chunks, (int, dict)) and chunks not in {None, "auto"}:
@@ -346,6 +371,8 @@ def _dataset_from_backend_dataset(
            chunks,
            overwrite_encoded_chunks,
            inline_array,
+           chunked_array_type,
+           from_array_kwargs,
            **extra_tokens,
        )

@@ -373,6 +400,8 @@ def open_dataset(
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
     inline_array: bool = False,
+    chunked_array_type: str | None = None,
+    from_array_kwargs: dict[str, Any] | None = None,
     backend_kwargs: dict[str, Any] | None = None,
     **kwargs,
 ) -> Dataset:
@@ -465,6 +494,15 @@ def open_dataset(
         itself, and each chunk refers to that task by its key. With
         ``inline_array=True``, Dask will instead inline the array directly
         in the values of the task graph. See :py:func:`dask.array.from_array`.
+    chunked_array_type: str, optional
+        Which chunked array type to coerce this dataset's arrays to.
+        Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntrypoint` system.
+        Experimental API that should not be relied upon.
+    from_array_kwargs: dict
+        Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create
+        chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
+        For example, if :py:class:`dask.array.Array` objects are used for chunking, additional kwargs will be passed
+        to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
     backend_kwargs: dict
         Additional keyword arguments passed on to the engine open function,
         equivalent to `**kwargs`.
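Taken together, the new keywords are used roughly like this. A sketch only: the file name is hypothetical, `"cubed"` assumes the cubed package has registered a chunk manager under that name, and `cubed_spec` stands in for whatever configuration object that manager accepts:

```python
import xarray as xr

# Default behaviour is unchanged: chunk with dask.
ds = xr.open_dataset("data.nc", chunks={"time": 10})

# Opt into a different chunked array type and forward manager-specific
# options to its from_array method via from_array_kwargs.
ds = xr.open_dataset(
    "data.nc",
    chunks={"time": 10},
    chunked_array_type="cubed",
    from_array_kwargs={"spec": cubed_spec},  # illustrative option, built elsewhere
)
```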
@@ -508,6 +546,9 @@
     if engine is None:
         engine = plugins.guess_engine(filename_or_obj)

+    if from_array_kwargs is None:
+        from_array_kwargs = {}
+
     backend = plugins.get_backend(engine)

     decoders = _resolve_decoders_kwargs(
@@ -536,6 +577,8 @@
         cache,
         overwrite_encoded_chunks,
         inline_array,
+        chunked_array_type,
+        from_array_kwargs,
         drop_variables=drop_variables,
         **decoders,
         **kwargs,
@@ -546,8 +589,8 @@
 def open_dataarray(
     filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
     *,
-    engine: T_Engine = None,
-    chunks: T_Chunks = None,
+    engine: T_Engine | None = None,
+    chunks: T_Chunks | None = None,
     cache: bool | None = None,
     decode_cf: bool | None = None,
     mask_and_scale: bool | None = None,
@@ -558,6 +601,8 @@
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
     inline_array: bool = False,
+    chunked_array_type: str | None = None,
+    from_array_kwargs: dict[str, Any] | None = None,
     backend_kwargs: dict[str, Any] | None = None,
     **kwargs,
 ) -> DataArray:
@@ -652,6 +697,15 @@
         itself, and each chunk refers to that task by its key. With
         ``inline_array=True``, Dask will instead inline the array directly
         in the values of the task graph. See :py:func:`dask.array.from_array`.
+    chunked_array_type: str, optional
+        Which chunked array type to coerce the underlying data array to.
+        Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntrypoint` system.
+        Experimental API that should not be relied upon.
+    from_array_kwargs: dict
+        Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create
+        chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg.
+        For example, if :py:class:`dask.array.Array` objects are used for chunking, additional kwargs will be passed
+        to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
     backend_kwargs: dict
         Additional keyword arguments passed on to the engine open function,
         equivalent to `**kwargs`.
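The `DataArray` path behaves the same way; a small sketch with a hypothetical file name, relying on the default (dask) manager:

```python
import xarray as xr

# Leaving chunked_array_type unset keeps the existing dask-backed behaviour.
da = xr.open_dataarray("single_variable.nc", chunks={"time": 10})
print(da.chunks)  # chunk sizes chosen from the mapping above
```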
@@ -695,6 +749,8 @@
         cache=cache,
         drop_variables=drop_variables,
         inline_array=inline_array,
+        chunked_array_type=chunked_array_type,
+        from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
         use_cftime=use_cftime,
         decode_timedelta=decode_timedelta,
@@ -726,7 +782,7 @@

 def open_mfdataset(
     paths: str | NestedSequence[str | os.PathLike],
-    chunks: T_Chunks = None,
+    chunks: T_Chunks | None = None,
     concat_dim: str
     | DataArray
     | Index
@@ -736,7 +792,7 @@
     | None = None,
     compat: CompatOptions = "no_conflicts",
     preprocess: Callable[[Dataset], Dataset] | None = None,
-    engine: T_Engine = None,
+    engine: T_Engine | None = None,
     data_vars: Literal["all", "minimal", "different"] | list[str] = "all",
     coords="different",
     combine: Literal["by_coords", "nested"] = "by_coords",

Inline review comments on the `engine` line above (truncated in the capture): "Add …" / "I actually don't think we need to - This should actually just work, except in the case of …"
@@ -1490,6 +1546,7 @@
     safe_chunks: bool = True,
     storage_options: dict[str, str] | None = None,
     zarr_version: int | None = None,
+    chunkmanager_store_kwargs: dict[str, Any] | None = None,
 ) -> backends.ZarrStore:
     ...

@@ -1512,6 +1569,7 @@
     safe_chunks: bool = True,
     storage_options: dict[str, str] | None = None,
     zarr_version: int | None = None,
+    chunkmanager_store_kwargs: dict[str, Any] | None = None,
 ) -> Delayed:
     ...

@@ -1531,6 +1589,7 @@
     safe_chunks: bool = True,
     storage_options: dict[str, str] | None = None,
     zarr_version: int | None = None,
+    chunkmanager_store_kwargs: dict[str, Any] | None = None,
 ) -> backends.ZarrStore | Delayed:
     """This function creates an appropriate datastore for writing a dataset to
     a zarr ztore

@@ -1652,7 +1711,9 @@
     writer = ArrayWriter()
     # TODO: figure out how to properly handle unlimited_dims
     dump_to_store(dataset, zstore, writer, encoding=encoding)
-    writes = writer.sync(compute=compute)
+    writes = writer.sync(
+        compute=compute, chunkmanager_store_kwargs=chunkmanager_store_kwargs
+    )

     if compute:
         _finalize_store(writes, zstore)
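The new `chunkmanager_store_kwargs` travels through `ArrayWriter.sync` to the store step of whichever chunk manager holds the data; for the dask manager those kwargs end up in `dask.array.store`. A hedged sketch (paths hypothetical; the option is shown at its default value purely to illustrate the plumbing, and it assumes `Dataset.to_zarr` forwards the keyword to the `to_zarr` function changed here, as elsewhere in this PR):

```python
import xarray as xr

ds = xr.open_dataset("data.nc", chunks={"time": 10})

# return_stored is a dask.array.store keyword; other chunk managers receive
# whatever options their own store implementation understands.
ds.to_zarr(
    "out.zarr",
    mode="w",
    chunkmanager_store_kwargs={"return_stored": False},
)
```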