diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 9a6037cf3c4..5241d060e9c 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -680,6 +680,8 @@ backends.BackendArray backends.BackendEntrypoint.guess_can_open backends.BackendEntrypoint.open_dataset + backends.CoderOptions + backends.CoderOptions.to_kwargs core.indexing.IndexingSupport core.indexing.explicit_indexing_adapter diff --git a/doc/api.rst b/doc/api.rst index df6e87c0cf8..3290fcbff94 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1693,6 +1693,7 @@ Advanced API Dataset.set_close backends.BackendArray backends.BackendEntrypoint + backends.CoderOptions backends.list_engines backends.refresh_engines diff --git a/doc/internals/how-to-add-new-backend.rst b/doc/internals/how-to-add-new-backend.rst index d3b5c3a9267..576ce00e7f9 100644 --- a/doc/internals/how-to-add-new-backend.rst +++ b/doc/internals/how-to-add-new-backend.rst @@ -38,21 +38,23 @@ This is what a ``BackendEntrypoint`` subclass should look like: .. code-block:: python - from xarray.backends import BackendEntrypoint + from xarray.backends import BackendEntrypoint, CoderOptions class MyBackendEntrypoint(BackendEntrypoint): + coder_class = CoderOptions + def open_dataset( self, filename_or_obj, *, - drop_variables=None, + coder_options=None, # other backend specific keyword arguments # `chunks` and `cache` DO NOT go here, they are handled by xarray ): - return my_open_dataset(filename_or_obj, drop_variables=drop_variables) + return my_open_dataset(filename_or_obj, coder_options=coder_options) - open_dataset_parameters = ["filename_or_obj", "drop_variables"] + open_dataset_parameters = ["filename_or_obj", "coder_options"] def guess_can_open(self, filename_or_obj): try: @@ -83,19 +85,15 @@ The following is an example of the high level processing steps: self, filename_or_obj, *, - drop_variables=None, - decode_times=True, - decode_timedelta=True, - decode_coords=True, + coder_options=None, my_backend_option=None, ): vars, attrs, coords = my_reader( filename_or_obj, - drop_variables=drop_variables, my_backend_option=my_backend_option, ) vars, attrs, coords = my_decode_variables( - vars, attrs, decode_times, decode_timedelta, decode_coords + vars, attrs, **coder_options.to_kwargs() ) # see also conventions.decode_cf_variables ds = xr.Dataset(vars, attrs=attrs, coords=coords) @@ -110,16 +108,13 @@ method shall be set by using :py:meth:`~xarray.Dataset.set_close`. The input of ``open_dataset`` method are one argument -(``filename_or_obj``) and one keyword argument (``drop_variables``): +(``filename_or_obj``) and one keyword argument (``coder_options``): - ``filename_or_obj``: can be any object but usually it is a string containing a path or an instance of :py:class:`pathlib.Path`. -- ``drop_variables``: can be ``None`` or an iterable containing the variable - names to be dropped when reading the data. 
+- ``coder_options``: can be ``None`` or a :py:class:`~xarray.backends.CoderOptions` instance

-If it makes sense for your backend, your ``open_dataset`` method
-should implement in its interface the following boolean keyword arguments, called
-**decoders**, which default to ``None``:
+If it makes sense for your backend, you can override the ``CoderOptions`` fields, which default to ``None``:

 - ``mask_and_scale``
 - ``decode_times``
@@ -127,12 +122,12 @@ should implement in its interface the following boolean keyword arguments, called
 - ``use_cftime``
 - ``concat_characters``
 - ``decode_coords``
+- ``drop_variables``

-Note: all the supported decoders shall be declared explicitly
-in backend ``open_dataset`` signature and adding a ``**kwargs`` is not allowed.
+Note: if ``coder_options`` is ``None``, any decoder keyword arguments given are validated against the fields of the backend's ``coder_class``.

 These keyword arguments are explicitly defined in Xarray
-:py:func:`~xarray.open_dataset` signature. Xarray will pass them to the
+:py:class:`~xarray.backends.CoderOptions` or its subclasses. Xarray will pass them to the
 backend only if the User explicitly sets a value different from ``None``.
 For more details on decoders see :ref:`RST decoders`.

@@ -141,7 +136,6 @@ arguments. All these keyword arguments can be passed to
 :py:func:`~xarray.open_dataset` grouped either via the ``backend_kwargs``
 parameter or explicitly using the syntax ``**kwargs``.

-
 If you don't want to support the lazy loading, then the
 :py:class:`~xarray.Dataset` shall contain values as a :py:class:`numpy.ndarray`
 and your work is almost done.
@@ -260,14 +254,16 @@ time is stored in two attributes dataDate and dataTime as strings. Therefore,
 it is not possible to reuse the Xarray time decoder, and implementing a new
 one is mandatory.

-Decoders can be activated or deactivated using the boolean keywords of
-Xarray :py:meth:`~xarray.open_dataset` signature: ``mask_and_scale``,
+Decoders can be activated or deactivated using the ``coder_options`` kwarg
+(:py:class:`~xarray.backends.CoderOptions`) or its equivalent boolean keywords in
+Xarray :py:meth:`~xarray.open_dataset` (``mask_and_scale``,
 ``decode_times``, ``decode_timedelta``, ``use_cftime``,
-``concat_characters``, ``decode_coords``.
+``concat_characters``, ``decode_coords``, ``drop_variables``).

 Such keywords are passed to the backend only if the User sets a value
 different from ``None``. Note that the backend does not necessarily have to
-implement all the decoders, but it shall declare in its ``open_dataset``
-interface only the boolean keywords related to the supported decoders.
+implement all the decoders, but it shall declare a ``coder_class`` in its
+``BackendEntrypoint`` interface with only the boolean keywords related to
+the supported decoders.

 .. _RST backend_registration:

diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst
index 4f1748ce3c2..0bbf56a242b 100644
--- a/doc/user-guide/io.rst
+++ b/doc/user-guide/io.rst
@@ -1127,6 +1127,7 @@ If the file were instead stored remotely (e.g. ``s3://saved_on_disk.h5``) you can
 that are used to `configure fsspec `_:

 .. jupyter-execute::
+    :stderr:

     ds_kerchunked = xr.open_dataset(
         "./combined.json",
diff --git a/xarray/backends/__init__.py b/xarray/backends/__init__.py
index e5df179716f..a68d56a3f61 100644
--- a/xarray/backends/__init__.py
+++ b/xarray/backends/__init__.py
@@ -4,7 +4,12 @@
 formats. They should not be used directly, but rather through Dataset objects.
""" -from xarray.backends.common import AbstractDataStore, BackendArray, BackendEntrypoint +from xarray.backends.common import ( + AbstractDataStore, + BackendArray, + BackendEntrypoint, + CoderOptions, +) from xarray.backends.file_manager import ( CachingFileManager, DummyFileManager, @@ -24,6 +29,7 @@ "BackendArray", "BackendEntrypoint", "CachingFileManager", + "CoderOptions", "DummyFileManager", "FileManager", "H5NetCDFStore", diff --git a/xarray/backends/api.py b/xarray/backends/api.py index b80ec927b1e..e18bc86415d 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -9,6 +9,7 @@ MutableMapping, Sequence, ) +from dataclasses import fields from functools import partial from io import BytesIO from itertools import starmap @@ -30,11 +31,13 @@ from xarray.backends.common import ( AbstractDataStore, ArrayWriter, + BaseCoderOptions, + CoderOptions, _find_absolute_paths, _normalize_path, + _reset_dataclass_to_false, ) from xarray.backends.locks import _get_scheduler -from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder from xarray.core import indexing from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset @@ -42,7 +45,7 @@ from xarray.core.indexes import Index from xarray.core.treenode import group_subtrees from xarray.core.types import NetcdfWriteModes, ZarrWriteModes -from xarray.core.utils import is_remote_uri +from xarray.core.utils import emit_user_level_warning, is_remote_uri from xarray.namedarray.daskmanager import DaskManager from xarray.namedarray.parallelcompat import guess_chunkmanager from xarray.structure.chunks import _get_chunk, _maybe_chunk @@ -389,6 +392,7 @@ def _dataset_from_backend_dataset( inline_array, chunked_array_type, from_array_kwargs, + coder_options, **extra_tokens, ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: @@ -396,6 +400,8 @@ def _dataset_from_backend_dataset( f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." ) + extra_tokens.update(**coder_options.to_kwargs()) + _protect_dataset_variables_inplace(backend_ds, cache) if chunks is None: ds = backend_ds @@ -434,6 +440,7 @@ def _datatree_from_backend_datatree( inline_array, chunked_array_type, from_array_kwargs, + coder_options, **extra_tokens, ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: @@ -441,6 +448,8 @@ def _datatree_from_backend_datatree( f"chunks must be an int, dict, 'auto', or None. Instead found {chunks}." ) + extra_tokens.update(**coder_options.to_kwargs()) + _protect_datatree_variables_inplace(backend_tree, cache) if chunks is None: tree = backend_tree @@ -477,30 +486,53 @@ def _datatree_from_backend_datatree( return tree +def _resolve_decoders_options( + coder_options, backend, decoders +) -> tuple[CoderOptions, bool]: + # initialize CoderOptions with decoders if not given + # Deprecation Fallback + deprecated = False + if coder_options is None: + decode_cf = decoders.pop("decode_cf", None) + + # deprecation fallback + _coder_options = backend.coder_class() + if type(_coder_options) is BaseCoderOptions: + coder_options = CoderOptions() + coder_class = CoderOptions + emit_user_level_warning( + "'coder_options' keyword argument introduced, grouping together " + f"all decoder keyword arguments. 
Please update {backend} accordingly.", + FutureWarning, + ) + deprecated = True + else: + coder_options = backend.coder_class() + coder_class = backend.coder_class + if decode_cf is False: + coder_options = _reset_dataclass_to_false(coder_options) + else: + field_names = {f.name for f in fields(coder_class)} + coders = {} + for d in list(decoders): + if d in field_names: + coders[d] = decoders.pop(d) + coder_options = coder_class(**coders) + + return coder_options, deprecated + + def open_dataset( filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, cache: bool | None = None, - decode_cf: bool | None = None, - mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool - | CFDatetimeCoder - | Mapping[str, bool | CFDatetimeCoder] - | None = None, - decode_timedelta: bool - | CFTimedeltaCoder - | Mapping[str, bool | CFTimedeltaCoder] - | None = None, - use_cftime: bool | Mapping[str, bool] | None = None, - concat_characters: bool | Mapping[str, bool] | None = None, - decode_coords: Literal["coordinates", "all"] | bool | None = None, - drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, + coder_options: CoderOptions | None = None, **kwargs, ) -> Dataset: """Open and decode a dataset from a file or file-like object. @@ -540,76 +572,91 @@ def open_dataset( argument to use dask, in which case it defaults to False. Does not change the behavior of coordinates corresponding to dimensions, which always load their data from disk into a ``pandas.Index``. - decode_cf : bool, optional - Whether to decode these variables, assuming they were saved according - to CF conventions. - mask_and_scale : bool or dict-like, optional - If True, replace array values equal to `_FillValue` with NA and scale - values according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_times : bool, CFDatetimeCoder or dict-like, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them - encoded as numbers. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_timedelta : bool, CFTimedeltaCoder, or dict-like, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of ``decode_times``; if - ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this - takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a - matching ``time_unit``. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. 
- This keyword may not be supported by all the backends. - use_cftime: bool or dict-like, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - - .. deprecated:: 2025.01.1 - Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. - - concat_characters : bool or dict-like, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_coords : bool or {"coordinates", "all"}, optional - Controls which variables are set as coordinate variables: - - - "coordinates" or True: Set variables referred to in the - ``'coordinates'`` attribute of the datasets or individual variables - as coordinate variables. - - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and - other attributes as coordinate variables. - - Only existing variables can be set as coordinates. Missing variables - will be silently ignored. - drop_variables: str or iterable of str, optional - A variable or list of variables to exclude from being parsed from the - dataset. This may be useful to drop variables with problems or - inconsistent values. + coder_options : CoderOptions, optional + Dataclass containing below keyword arguments to pass to cf decoding. If set, + overrides any given keyword arguments: + + - 'decode_cf' : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + + - 'mask_and_scale' : bool or dict-like, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_times' : bool, CFDatetimeCoder or dict-like, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them + encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. 
+ + - 'decode_timedelta' : bool, CFTimedeltaCoder, or dict-like, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of ``decode_times``; if + ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this + takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a + matching ``time_unit``. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'use_cftime' : bool or dict-like, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + .. deprecated:: 2025.01.1 + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. + + - 'concat_characters' : bool or dict-like, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_coords' : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. + + Only existing variables can be set as coordinates. Missing variables + will be silently ignored. + + - 'drop_variables' : str or iterable of str, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + + .. versionadded:: 2025.06.2 + The new keyword argument 'coder_options' was added. For backwards + compatibility coder_options can be given as keyword arguments, too. + inline_array: bool, default: False How to include the array in the dask task graph. By default(``inline_array=False``) the array is included in a task by @@ -618,13 +665,16 @@ def open_dataset( in the values of the task graph. See :py:func:`dask.array.from_array`. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. - Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Defaults to 'dask' if installed, else whatever is registered via the + `ChunkManagerEntryPoint` system. 
Experimental API that should not be relied upon. from_array_kwargs: dict - Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create - chunked arrays, via whichever chunk manager is specified through the `chunked_array_type` kwarg. - For example if :py:func:`dask.array.Array` objects are used for chunking, additional kwargs will be passed - to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + Additional keyword arguments passed on to the + `ChunkManagerEntrypoint.from_array` method used to create chunked arrays, + via whichever chunk manager is specified through the `chunked_array_type` kwarg. + For example if :py:func:`dask.array.Array` objects are used for chunking, + additional kwargs will be passed to :py:func:`dask.array.from_array`. + Experimental API that should not be relied upon. backend_kwargs: dict Additional keyword arguments passed on to the engine open function, equivalent to `**kwargs`. @@ -673,23 +723,22 @@ def open_dataset( backend = plugins.get_backend(engine) - decoders = _resolve_decoders_kwargs( - decode_cf, - open_backend_dataset_parameters=backend.open_dataset_parameters, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - decode_timedelta=decode_timedelta, - concat_characters=concat_characters, - use_cftime=use_cftime, - decode_coords=decode_coords, + # initialize coder_options per kwargs if not given + coder_options, deprecated = _resolve_decoders_options( + coder_options, backend, kwargs ) overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) + + # deprecation fallback + nkwargs = kwargs.copy() + if deprecated: + nkwargs.update(**coder_options.to_kwargs()) + else: + nkwargs.update(coder_options=coder_options) backend_ds = backend.open_dataset( filename_or_obj, - drop_variables=drop_variables, - **decoders, - **kwargs, + **nkwargs, ) ds = _dataset_from_backend_dataset( backend_ds, @@ -701,8 +750,7 @@ def open_dataset( inline_array, chunked_array_type, from_array_kwargs, - drop_variables=drop_variables, - **decoders, + coder_options=coder_options, **kwargs, ) return ds @@ -714,21 +762,11 @@ def open_dataarray( engine: T_Engine = None, chunks: T_Chunks = None, cache: bool | None = None, - decode_cf: bool | None = None, - mask_and_scale: bool | None = None, - decode_times: bool - | CFDatetimeCoder - | Mapping[str, bool | CFDatetimeCoder] - | None = None, - decode_timedelta: bool | CFTimedeltaCoder | None = None, - use_cftime: bool | None = None, - concat_characters: bool | None = None, - decode_coords: Literal["coordinates", "all"] | bool | None = None, - drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, + coder_options: CoderOptions | None = None, **kwargs, ) -> DataArray: """Open an DataArray from a file or file-like object containing a single @@ -771,68 +809,91 @@ def open_dataarray( argument to use dask, in which case it defaults to False. Does not change the behavior of coordinates corresponding to dimensions, which always load their data from disk into a ``pandas.Index``. - decode_cf : bool, optional - Whether to decode these variables, assuming they were saved according - to CF conventions. 
- mask_and_scale : bool, optional - If True, replace array values equal to `_FillValue` with NA and scale - values according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. This keyword may not be supported by all the backends. - decode_times : bool, CFDatetimeCoder or dict-like, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or - leave them encoded as numbers. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of ``decode_times``; if - ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this - takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a - matching ``time_unit``. - This keyword may not be supported by all the backends. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. This keyword may not be supported by all the backends. - - .. deprecated:: 2025.01.1 - Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. - - concat_characters : bool, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - This keyword may not be supported by all the backends. - decode_coords : bool or {"coordinates", "all"}, optional - Controls which variables are set as coordinate variables: - - - "coordinates" or True: Set variables referred to in the - ``'coordinates'`` attribute of the datasets or individual variables - as coordinate variables. - - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and - other attributes as coordinate variables. - - Only existing variables can be set as coordinates. Missing variables - will be silently ignored. - drop_variables: str or iterable of str, optional - A variable or list of variables to exclude from being parsed from the - dataset. This may be useful to drop variables with problems or - inconsistent values. + coder_options : CoderOptions, optional + Dataclass containing below keyword arguments to pass to cf decoding. 
If set, + overrides any given keyword arguments: + + - 'decode_cf' : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + + - 'mask_and_scale' : bool or dict-like, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_times' : bool, CFDatetimeCoder or dict-like, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them + encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_timedelta' : bool, CFTimedeltaCoder, or dict-like, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of ``decode_times``; if + ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this + takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a + matching ``time_unit``. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'use_cftime' : bool or dict-like, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + .. deprecated:: 2025.01.1 + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. + + - 'concat_characters' : bool or dict-like, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. 
+ + - 'decode_coords' : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. + + Only existing variables can be set as coordinates. Missing variables + will be silently ignored. + + - 'drop_variables' : str or iterable of str, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + + .. versionadded:: 2025.06.2 + The new keyword argument 'coder_options' was added. For backwards + compatibility coder_options can be given as keyword arguments, too. + inline_array: bool, default: False How to include the array in the dask task graph. By default(``inline_array=False``) the array is included in a task by @@ -881,21 +942,14 @@ def open_dataarray( dataset = open_dataset( filename_or_obj, - decode_cf=decode_cf, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, engine=engine, chunks=chunks, cache=cache, - drop_variables=drop_variables, inline_array=inline_array, chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, - use_cftime=use_cftime, - decode_timedelta=decode_timedelta, + coder_options=coder_options, **kwargs, ) @@ -932,24 +986,11 @@ def open_datatree( engine: T_Engine = None, chunks: T_Chunks = None, cache: bool | None = None, - decode_cf: bool | None = None, - mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool - | CFDatetimeCoder - | Mapping[str, bool | CFDatetimeCoder] - | None = None, - decode_timedelta: bool - | CFTimedeltaCoder - | Mapping[str, bool | CFTimedeltaCoder] - | None = None, - use_cftime: bool | Mapping[str, bool] | None = None, - concat_characters: bool | Mapping[str, bool] | None = None, - decode_coords: Literal["coordinates", "all"] | bool | None = None, - drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, + coder_options: CoderOptions | None = None, **kwargs, ) -> DataTree: """ @@ -985,85 +1026,100 @@ def open_datatree( argument to use dask, in which case it defaults to False. Does not change the behavior of coordinates corresponding to dimensions, which always load their data from disk into a ``pandas.Index``. - decode_cf : bool, optional - Whether to decode these variables, assuming they were saved according - to CF conventions. - mask_and_scale : bool or dict-like, optional - If True, replace array values equal to `_FillValue` with NA and scale - values according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. 
- decode_times : bool, CFDatetimeCoder or dict-like, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or - leave them encoded as numbers. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_timedelta : bool or dict-like, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of ``decode_times``; if - ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this - takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a - matching ``time_unit``. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - use_cftime: bool or dict-like, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - - .. deprecated:: 2025.01.1 - Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. - - concat_characters : bool or dict-like, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_coords : bool or {"coordinates", "all"}, optional - Controls which variables are set as coordinate variables: - - - "coordinates" or True: Set variables referred to in the - ``'coordinates'`` attribute of the datasets or individual variables - as coordinate variables. - - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and - other attributes as coordinate variables. - - Only existing variables can be set as coordinates. Missing variables - will be silently ignored. - drop_variables: str or iterable of str, optional - A variable or list of variables to exclude from being parsed from the - dataset. This may be useful to drop variables with problems or - inconsistent values. + coder_options : CoderOptions, optional + Dataclass containing below keyword arguments to pass to cf decoding. If set, + overrides any given keyword arguments: + + - 'decode_cf' : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. 
+ + - 'mask_and_scale' : bool or dict-like, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_times' : bool, CFDatetimeCoder or dict-like, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them + encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_timedelta' : bool, CFTimedeltaCoder, or dict-like, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of ``decode_times``; if + ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this + takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a + matching ``time_unit``. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'use_cftime' : bool or dict-like, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + .. deprecated:: 2025.01.1 + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. + + - 'concat_characters' : bool or dict-like, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_coords' : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. 
+ + Only existing variables can be set as coordinates. Missing variables + will be silently ignored. + + - 'drop_variables' : str or iterable of str, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + + .. versionadded:: 2025.06.2 + The new keyword argument ``coder_options`` was added. For backwards + compatibility coder_options can be given as keyword arguments, too. + inline_array: bool, default: False How to include the array in the dask task graph. - By default(``inline_array=False``) the array is included in a task by + By default (``inline_array=False``) the array is included in a task by itself, and each chunk refers to that task by its key. With ``inline_array=True``, Dask will instead inline the array directly in the values of the task graph. See :py:func:`dask.array.from_array`. chunked_array_type: str, optional Which chunked array type to coerce this datasets' arrays to. - Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEnetryPoint` system. + Defaults to 'dask' if installed, else whatever is registered via the `ChunkManagerEntryPoint` system. Experimental API that should not be relied upon. from_array_kwargs: dict Additional keyword arguments passed on to the `ChunkManagerEntrypoint.from_array` method used to create @@ -1118,23 +1174,21 @@ def open_datatree( backend = plugins.get_backend(engine) - decoders = _resolve_decoders_kwargs( - decode_cf, - open_backend_dataset_parameters=backend.open_dataset_parameters, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - decode_timedelta=decode_timedelta, - concat_characters=concat_characters, - use_cftime=use_cftime, - decode_coords=decode_coords, + coder_options, deprecated = _resolve_decoders_options( + coder_options, backend, kwargs ) overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) + # deprecation fallback + nkwargs = kwargs.copy() + if deprecated: + nkwargs.update(**coder_options.to_kwargs()) + else: + nkwargs.update(coder_options=coder_options) + backend_tree = backend.open_datatree( filename_or_obj, - drop_variables=drop_variables, - **decoders, - **kwargs, + **nkwargs, ) tree = _datatree_from_backend_datatree( @@ -1147,8 +1201,7 @@ def open_datatree( inline_array, chunked_array_type, from_array_kwargs, - drop_variables=drop_variables, - **decoders, + coder_options, **kwargs, ) @@ -1161,24 +1214,11 @@ def open_groups( engine: T_Engine = None, chunks: T_Chunks = None, cache: bool | None = None, - decode_cf: bool | None = None, - mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool - | CFDatetimeCoder - | Mapping[str, bool | CFDatetimeCoder] - | None = None, - decode_timedelta: bool - | CFTimedeltaCoder - | Mapping[str, bool | CFTimedeltaCoder] - | None = None, - use_cftime: bool | Mapping[str, bool] | None = None, - concat_characters: bool | Mapping[str, bool] | None = None, - decode_coords: Literal["coordinates", "all"] | bool | None = None, - drop_variables: str | Iterable[str] | None = None, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, backend_kwargs: dict[str, Any] | None = None, + coder_options: CoderOptions | None = None, **kwargs, ) -> dict[str, Dataset]: """ @@ -1218,74 +1258,91 @@ def open_groups( argument to use dask, in which case it defaults to False. 
Does not change the behavior of coordinates corresponding to dimensions, which always load their data from disk into a ``pandas.Index``. - decode_cf : bool, optional - Whether to decode these variables, assuming they were saved according - to CF conventions. - mask_and_scale : bool or dict-like, optional - If True, replace array values equal to `_FillValue` with NA and scale - values according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_times : bool, CFDatetimeCoder or dict-like, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or - leave them encoded as numbers. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_timedelta : bool or dict-like, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of ``decode_times``; if - ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this - takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a - matching ``time_unit``. - This keyword may not be supported by all the backends. - use_cftime: bool or dict-like, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - - .. deprecated:: 2025.01.1 - Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. - - concat_characters : bool or dict-like, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - Pass a mapping, e.g. ``{"my_variable": False}``, - to toggle this feature per-variable individually. - This keyword may not be supported by all the backends. - decode_coords : bool or {"coordinates", "all"}, optional - Controls which variables are set as coordinate variables: - - - "coordinates" or True: Set variables referred to in the - ``'coordinates'`` attribute of the datasets or individual variables - as coordinate variables. 
- - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and - other attributes as coordinate variables. - - Only existing variables can be set as coordinates. Missing variables - will be silently ignored. - drop_variables: str or iterable of str, optional - A variable or list of variables to exclude from being parsed from the - dataset. This may be useful to drop variables with problems or - inconsistent values. + coder_options : CoderOptions, optional + Dataclass containing below keyword arguments to pass to cf decoding. If set, + overrides any given keyword arguments: + + - 'decode_cf' : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + + - 'mask_and_scale' : bool or dict-like, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_times' : bool, CFDatetimeCoder or dict-like, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them + encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_timedelta' : bool, CFTimedeltaCoder, or dict-like, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of ``decode_times``; if + ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this + takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a + matching ``time_unit``. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'use_cftime' : bool or dict-like, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + .. deprecated:: 2025.01.1 + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. + + - 'concat_characters' : bool or dict-like, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. 
Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + - 'decode_coords' : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. + + Only existing variables can be set as coordinates. Missing variables + will be silently ignored. + + - 'drop_variables' : str or iterable of str, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + + .. versionadded:: 2025.06.2 + The new keyword argument 'coder_options' was added. For backwards + compatibility coder_options can be given as keyword arguments, too. + inline_array: bool, default: False How to include the array in the dask task graph. By default(``inline_array=False``) the array is included in a task by @@ -1350,23 +1407,21 @@ def open_groups( backend = plugins.get_backend(engine) - decoders = _resolve_decoders_kwargs( - decode_cf, - open_backend_dataset_parameters=(), - mask_and_scale=mask_and_scale, - decode_times=decode_times, - decode_timedelta=decode_timedelta, - concat_characters=concat_characters, - use_cftime=use_cftime, - decode_coords=decode_coords, + coder_options, deprecated = _resolve_decoders_options( + coder_options, backend, kwargs ) + overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) + # deprecation fallback + nkwargs = kwargs.copy() + if deprecated: + nkwargs.update(**coder_options.to_kwargs()) + else: + nkwargs.update(coder_options=coder_options) backend_groups = backend.open_groups_as_dict( filename_or_obj, - drop_variables=drop_variables, - **decoders, - **kwargs, + **nkwargs, ) groups = { @@ -1380,8 +1435,7 @@ def open_groups( inline_array, chunked_array_type, from_array_kwargs, - drop_variables=drop_variables, - **decoders, + coder_options, **kwargs, ) for name, backend_ds in backend_groups.items() diff --git a/xarray/backends/common.py b/xarray/backends/common.py index f478c2b882c..c8c5a6a235c 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -5,13 +5,23 @@ import time import traceback from collections.abc import Hashable, Iterable, Mapping, Sequence +from dataclasses import dataclass, fields, replace from glob import glob -from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, Union, overload +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Literal, + TypeVar, + Union, + overload, +) import numpy as np import pandas as pd from xarray.coding import strings, variables +from xarray.coding.times import CFDatetimeCoder, CFTimedeltaCoder from xarray.coding.variables import SerializationWarning from xarray.conventions import cf_encoder from xarray.core import indexing @@ -658,6 +668,124 @@ def encode(self, variables, attributes): return super().encode(variables, attributes) +def _reset_dataclass_to_false(instance): + # Returns instance with all elements set to False + field_names = [f.name for f in fields(instance)] + false_values = dict.fromkeys(field_names, 
False) + return replace(instance, **false_values) + + +def _validate_kwargs_for_dataclass(cls, kwargs): + valid_fields = {f.name for f in fields(cls)} + invalid = {k: v for k, v in kwargs.items() if k not in valid_fields} + return invalid + + +@dataclass(frozen=True, kw_only=True) +class BaseCoderOptions: + pass + + +@dataclass(frozen=True, kw_only=True) +class CoderOptions(BaseCoderOptions): + """ + CF Coding Options. + + Parameters + ---------- + mask_and_scale : bool or dict-like, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + decode_times : bool, CFDatetimeCoder or dict-like, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them + encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + decode_timedelta : bool, CFTimedeltaCoder, or dict-like, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of ``decode_times``; if + ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this + takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a + matching ``time_unit``. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + use_cftime : bool or dict-like, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. + This keyword may not be supported by all the backends. + + .. deprecated:: 2025.01.1 + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. + + concat_characters : bool or dict-like, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. 
@@ -695,6 +823,7 @@ class BackendEntrypoint:
     open_dataset_parameters: ClassVar[tuple | None] = None
     description: ClassVar[str] = ""
     url: ClassVar[str] = ""
+    coder_class = BaseCoderOptions

     def __repr__(self) -> str:
         txt = f"<{type(self).__name__}>"
@@ -708,7 +837,7 @@ def open_dataset(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        drop_variables: str | Iterable[str] | None = None,
+        coder_options: CoderOptions | None = None,
     ) -> Dataset:
         """
         Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`.
@@ -730,7 +859,7 @@ def open_datatree(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        drop_variables: str | Iterable[str] | None = None,
+        coder_options: CoderOptions | None = None,
     ) -> DataTree:
         """
         Backend open_datatree method used by Xarray in :py:func:`~xarray.open_datatree`.
@@ -742,7 +871,7 @@ def open_groups_as_dict(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        drop_variables: str | Iterable[str] | None = None,
+        coder_options: CoderOptions | None = None,
     ) -> dict[str, Dataset]:
         """
         Opens a dictionary mapping from group names to Datasets.
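The pattern the concrete backends below use to honour ``coder_class``, shown as a sketch with a hypothetical entrypoint (not part of the patch):

.. code-block:: python

    from xarray.backends import BackendEntrypoint, CoderOptions


    class SketchBackendEntrypoint(BackendEntrypoint):
        """Hypothetical backend illustrating the coder_class contract."""

        coder_class = CoderOptions

        def open_dataset(self, filename_or_obj, *, coder_options=None, **kwargs):
            # Fall back to this backend's own defaults when the user passed nothing.
            coder_options = (
                coder_options if coder_options is not None else self.coder_class()
            )
            decode_kwargs = coder_options.to_kwargs()  # plain kwargs for decoding
            ...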
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index f3e434c6e5e..86e63e60eb7 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -3,7 +3,6 @@
 import functools
 import io
 import os
-from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any

 import numpy as np
@@ -11,6 +10,7 @@
 from xarray.backends.common import (
     BACKEND_ENTRYPOINTS,
     BackendEntrypoint,
+    CoderOptions,
     WritableCFDataStore,
     _normalize_path,
     _open_remote_file,
@@ -28,6 +28,7 @@
     _get_datatype,
     _nc4_require_group,
 )
+from xarray.backends.netCDF4_ import NetCDF4CoderOptions as H5netcdfCoderOptions
 from xarray.backends.store import StoreBackendEntrypoint
 from xarray.core import indexing
 from xarray.core.utils import (
@@ -371,23 +372,6 @@ def close(self, **kwargs):
         self._manager.close(**kwargs)


-def _check_phony_dims(phony_dims):
-    emit_phony_dims_warning = False
-    if phony_dims is None:
-        emit_phony_dims_warning = True
-        phony_dims = "access"
-    return emit_phony_dims_warning, phony_dims
-
-
-def _emit_phony_dims_warning():
-    emit_user_level_warning(
-        "The 'phony_dims' kwarg now defaults to 'access'. "
-        "Previously 'phony_dims=None' would raise an error. "
-        "For full netcdf equivalence please use phony_dims='sort'.",
-        UserWarning,
-    )
-
-
 class H5netcdfBackendEntrypoint(BackendEntrypoint):
     """
     Backend for netCDF files based on the h5netcdf package.
@@ -410,6 +394,8 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint):
     backends.ScipyBackendEntrypoint
     """

+    coder_class = H5netcdfCoderOptions
+
     description = (
         "Open netCDF (.nc, .nc4 and .cdf) and most HDF5 files using h5netcdf in Xarray"
     )
@@ -433,27 +419,21 @@ def open_dataset(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         format=None,
         group=None,
         lock=None,
         invalid_netcdf=None,
-        phony_dims=None,
+        phony_dims="access",
         decode_vlen_strings=True,
         driver=None,
         driver_kwds=None,
         storage_options: dict[str, Any] | None = None,
+        coder_options: CoderOptions | None = None,
+        **kwargs,
     ) -> Dataset:
-        # Keep this message for some versions
-        # remove and set phony_dims="access" above
-        emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)
-
+        coder_options = (
+            coder_options if coder_options is not None else self.coder_class()
+        )
         filename_or_obj = _normalize_path(filename_or_obj)
         store = H5NetCDFStore.open(
             filename_or_obj,
@@ -472,53 +452,28 @@ def open_dataset(

         ds = store_entrypoint.open_dataset(
             store,
-            mask_and_scale=mask_and_scale,
-            decode_times=decode_times,
-            concat_characters=concat_characters,
-            decode_coords=decode_coords,
-            drop_variables=drop_variables,
-            use_cftime=use_cftime,
-            decode_timedelta=decode_timedelta,
+            coder_options=coder_options,
         )
-        # only warn if phony_dims exist in file
-        # remove together with the above check
-        # after some versions
-        if store.ds._root._phony_dim_count > 0 and emit_phony_dims_warning:
-            _emit_phony_dims_warning()
-
         return ds

     def open_datatree(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         format=None,
         group: str | None = None,
         lock=None,
         invalid_netcdf=None,
-        phony_dims=None,
+        phony_dims="access",
         decode_vlen_strings=True,
         driver=None,
         driver_kwds=None,
+        coder_options: CoderOptions | None = None,
         **kwargs,
     ) -> DataTree:
         groups_dict = self.open_groups_as_dict(
             filename_or_obj,
-            mask_and_scale=mask_and_scale,
-            decode_times=decode_times,
-            concat_characters=concat_characters,
-            decode_coords=decode_coords,
-            drop_variables=drop_variables,
-            use_cftime=use_cftime,
-            decode_timedelta=decode_timedelta,
             format=format,
             group=group,
             lock=lock,
@@ -527,6 +482,7 @@ def open_datatree(
             decode_vlen_strings=decode_vlen_strings,
             driver=driver,
             driver_kwds=driver_kwds,
+            coder_options=coder_options,
             **kwargs,
         )

@@ -536,13 +492,6 @@ def open_groups_as_dict(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         format=None,
         group: str | None = None,
         lock=None,
@@ -551,16 +500,13 @@ def open_groups_as_dict(
         decode_vlen_strings=True,
         driver=None,
         driver_kwds=None,
+        coder_options: CoderOptions | None = None,
         **kwargs,
     ) -> dict[str, Dataset]:
         from xarray.backends.common import _iter_nc_groups
         from xarray.core.treenode import NodePath
         from xarray.core.utils import close_on_error

-        # Keep this message for some versions
-        # remove and set phony_dims="access" above
-        emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)
-
         filename_or_obj = _normalize_path(filename_or_obj)
         store = H5NetCDFStore.open(
             filename_or_obj,
@@ -588,13 +534,7 @@ def open_groups_as_dict(
             with close_on_error(group_store):
                 group_ds = store_entrypoint.open_dataset(
                     group_store,
-                    mask_and_scale=mask_and_scale,
-                    decode_times=decode_times,
-                    concat_characters=concat_characters,
-                    decode_coords=decode_coords,
-                    drop_variables=drop_variables,
-                    use_cftime=use_cftime,
-                    decode_timedelta=decode_timedelta,
+                    coder_options=coder_options,
                 )

                 if group:
@@ -603,12 +543,6 @@ def open_groups_as_dict(
                 group_name = str(NodePath(path_group))
             groups_dict[group_name] = group_ds

-        # only warn if phony_dims exist in file
-        # remove together with the above check
-        # after some versions
-        if store.ds._phony_dim_count > 0 and emit_phony_dims_warning:
-            _emit_phony_dims_warning()
-
         return groups_dict
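With ``phony_dims`` now defaulting to ``"access"`` and the netCDF4 defaults reused via the ``H5netcdfCoderOptions`` alias, an h5netcdf open looks like this. A sketch with a hypothetical file, assuming the patch is applied:

.. code-block:: python

    import xarray as xr
    from xarray.backends.netCDF4_ import NetCDF4CoderOptions

    ds = xr.open_dataset(
        "example.h5",  # hypothetical file
        engine="h5netcdf",
        phony_dims="sort",  # default is now "access"; no warning is emitted
        coder_options=NetCDF4CoderOptions(decode_times=False),
    )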
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index 8c3a01eba66..39d15d542a1 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -3,9 +3,10 @@
 import functools
 import operator
 import os
-from collections.abc import Iterable
+from collections.abc import Mapping
 from contextlib import suppress
-from typing import TYPE_CHECKING, Any
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Literal

 import numpy as np

@@ -13,6 +14,7 @@
     BACKEND_ENTRYPOINTS,
     BackendArray,
     BackendEntrypoint,
+    CoderOptions,
     WritableCFDataStore,
     _normalize_path,
     datatree_from_dict_with_io_cleanup,
@@ -35,6 +37,7 @@
     create_vlen_dtype,
     is_unicode_dtype,
 )
+from xarray.coding.times import CFDatetimeCoder
 from xarray.coding.variables import pop_to
 from xarray.core import indexing
 from xarray.core.utils import (
@@ -602,6 +605,17 @@ def close(self, **kwargs):
         self._manager.close(**kwargs)


+@dataclass(frozen=True, kw_only=True)
+class NetCDF4CoderOptions(CoderOptions):
+    # defaults for netcdf4 based backends
+    mask_and_scale: bool | Mapping[str, bool] | None = True
+    decode_times: (
+        bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] | None
+    ) = True
+    concat_characters: bool | Mapping[str, bool] | None = True
+    decode_coords: Literal["coordinates", "all"] | bool | None = True
+
+
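Backends can override the field defaults the same way ``NetCDF4CoderOptions`` does above. A sketch with a hypothetical subclass:

.. code-block:: python

    from collections.abc import Mapping
    from dataclasses import dataclass

    from xarray.backends import CoderOptions


    # Hypothetical: decode coordinates by default, leave times encoded.
    @dataclass(frozen=True, kw_only=True)
    class MyCoderOptions(CoderOptions):
        decode_times: bool | Mapping[str, bool] | None = False
        decode_coords: bool | str | None = True


    assert MyCoderOptions().to_kwargs() == {
        "decode_times": False,
        "decode_coords": True,
    }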
 class NetCDF4BackendEntrypoint(BackendEntrypoint):
     """
     Backend for netCDF files based on the netCDF4 package.
@@ -624,6 +638,8 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint):
     backends.ScipyBackendEntrypoint
     """

+    coder_class = NetCDF4CoderOptions
+
     description = (
         "Open netCDF (.nc, .nc4 and .cdf) and most HDF5 files using netCDF4 in Xarray"
     )
@@ -650,15 +666,7 @@ def open_dataset(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group=None,
-        mode="r",
         format="NETCDF4",
         clobber=True,
         diskless=False,
@@ -666,11 +674,14 @@ def open_dataset(
         auto_complex=None,
         lock=None,
         autoclose=False,
+        coder_options: CoderOptions | None = None,
     ) -> Dataset:
+        coder_options = (
+            coder_options if coder_options is not None else self.coder_class()
+        )
         filename_or_obj = _normalize_path(filename_or_obj)
         store = NetCDF4DataStore.open(
             filename_or_obj,
-            mode=mode,
             format=format,
             group=group,
             clobber=clobber,
@@ -685,13 +696,7 @@ def open_dataset(
         with close_on_error(store):
             ds = store_entrypoint.open_dataset(
                 store,
-                mask_and_scale=mask_and_scale,
-                decode_times=decode_times,
-                concat_characters=concat_characters,
-                decode_coords=decode_coords,
-                drop_variables=drop_variables,
-                use_cftime=use_cftime,
-                decode_timedelta=decode_timedelta,
+                coder_options=coder_options,
             )
         return ds

@@ -699,13 +704,6 @@ def open_datatree(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group: str | None = None,
         format="NETCDF4",
         clobber=True,
@@ -714,17 +712,11 @@ def open_datatree(
         auto_complex=None,
         lock=None,
         autoclose=False,
+        coder_options: CoderOptions | None = None,
         **kwargs,
     ) -> DataTree:
         groups_dict = self.open_groups_as_dict(
             filename_or_obj,
-            mask_and_scale=mask_and_scale,
-            decode_times=decode_times,
-            concat_characters=concat_characters,
-            decode_coords=decode_coords,
-            drop_variables=drop_variables,
-            use_cftime=use_cftime,
-            decode_timedelta=decode_timedelta,
             group=group,
             format=format,
             clobber=clobber,
@@ -732,6 +724,7 @@ def open_datatree(
             persist=persist,
             lock=lock,
             autoclose=autoclose,
+            coder_options=coder_options,
             **kwargs,
         )

@@ -741,13 +734,6 @@ def open_groups_as_dict(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group: str | None = None,
         format="NETCDF4",
         clobber=True,
@@ -756,6 +742,7 @@ def open_groups_as_dict(
         auto_complex=None,
         lock=None,
         autoclose=False,
+        coder_options: CoderOptions | None = None,
         **kwargs,
     ) -> dict[str, Dataset]:
         from xarray.backends.common import _iter_nc_groups
@@ -772,7 +759,6 @@ def open_groups_as_dict(
             lock=lock,
             autoclose=autoclose,
         )
-
         # Check for a group and make it a parent if it exists
         if group:
             parent = NodePath("/") / NodePath(group)
@@ -787,13 +773,7 @@ def open_groups_as_dict(
             with close_on_error(group_store):
                 group_ds = store_entrypoint.open_dataset(
                     group_store,
-                    mask_and_scale=mask_and_scale,
-                    decode_times=decode_times,
-                    concat_characters=concat_characters,
-                    decode_coords=decode_coords,
-                    drop_variables=drop_variables,
-                    use_cftime=use_cftime,
-                    decode_timedelta=decode_timedelta,
+                    coder_options=coder_options,
                 )
                 if group:
                     group_name = str(NodePath(path_group).relative_to(parent))
diff --git a/xarray/backends/plugins.py b/xarray/backends/plugins.py
index 555538c2562..841f5a7181c 100644
--- a/xarray/backends/plugins.py
+++ b/xarray/backends/plugins.py
@@ -51,7 +51,6 @@ def detect_parameters(open_dataset: Callable) -> tuple[str, ...]:
     parameters_list = []
     for name, param in parameters.items():
         if param.kind in (
-            inspect.Parameter.VAR_KEYWORD,
             inspect.Parameter.VAR_POSITIONAL,
         ):
             raise TypeError(
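Dropping ``VAR_KEYWORD`` from the rejected kinds means a backend signature may now carry ``**kwargs`` (used throughout this patch to funnel deprecated decoder kwargs through); only ``*args`` still raises. A sketch, assuming the patched ``detect_parameters``:

.. code-block:: python

    import inspect


    def open_dataset(filename_or_obj, *, coder_options=None, **kwargs):
        ...


    kinds = {p.kind for p in inspect.signature(open_dataset).parameters.values()}
    assert inspect.Parameter.VAR_KEYWORD in kinds  # no longer a TypeError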
diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py
index 73b719f8260..1bf18771b79 100644
--- a/xarray/backends/pydap_.py
+++ b/xarray/backends/pydap_.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any

 import numpy as np
@@ -10,6 +9,7 @@
     AbstractDataStore,
     BackendArray,
     BackendEntrypoint,
+    CoderOptions,
     _normalize_path,
     datatree_from_dict_with_io_cleanup,
     robust_getitem,
@@ -204,6 +204,7 @@ class PydapBackendEntrypoint(BackendEntrypoint):
     backends.PydapDataStore
     """

+    coder_class = CoderOptions
     description = "Open remote datasets via OPeNDAP using pydap in Xarray"
     url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html"

@@ -217,13 +218,6 @@ def open_dataset(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group=None,
         application=None,
         session=None,
@@ -231,6 +225,8 @@ def open_dataset(
         timeout=None,
         verify=None,
         user_charset=None,
+        coder_options: CoderOptions | None = None,
+        **kwargs,
     ) -> Dataset:
         store = PydapDataStore.open(
             url=filename_or_obj,
@@ -246,13 +242,7 @@ def open_dataset(
         with close_on_error(store):
             ds = store_entrypoint.open_dataset(
                 store,
-                mask_and_scale=mask_and_scale,
-                decode_times=decode_times,
-                concat_characters=concat_characters,
-                decode_coords=decode_coords,
-                drop_variables=drop_variables,
-                use_cftime=use_cftime,
-                decode_timedelta=decode_timedelta,
+                coder_options=coder_options,
             )
             return ds

@@ -260,35 +250,25 @@ def open_datatree(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group: str | None = None,
         application=None,
         session=None,
         timeout=None,
         verify=None,
         user_charset=None,
+        coder_options: CoderOptions | None = None,
+        **kwargs,
     ) -> DataTree:
         groups_dict = self.open_groups_as_dict(
-            filename_or_obj,
-            mask_and_scale=mask_and_scale,
-            decode_times=decode_times,
-            concat_characters=concat_characters,
-            decode_coords=decode_coords,
-            drop_variables=drop_variables,
-            use_cftime=use_cftime,
-            decode_timedelta=decode_timedelta,
+            filename_or_obj=filename_or_obj,
             group=group,
             application=None,
             session=None,
             timeout=None,
             verify=None,
             user_charset=None,
+            coder_options=coder_options,
+            **kwargs,
         )

         return datatree_from_dict_with_io_cleanup(groups_dict)
@@ -297,19 +277,14 @@ def open_groups_as_dict(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group: str | None = None,
         application=None,
         session=None,
         timeout=None,
         verify=None,
         user_charset=None,
+        coder_options: CoderOptions | None = None,
+        **kwargs,
     ) -> dict[str, Dataset]:
         from xarray.core.treenode import NodePath

@@ -375,13 +350,7 @@ def group_fqn(store, path=None, g_fqn=None) -> dict[str, str]:
         with close_on_error(store):
             group_ds = store_entrypoint.open_dataset(
                 store,
-                mask_and_scale=mask_and_scale,
-                decode_times=decode_times,
-                concat_characters=concat_characters,
-                decode_coords=decode_coords,
-                drop_variables=drop_variables,
-                use_cftime=use_cftime,
-                decode_timedelta=decode_timedelta,
+                coder_options=coder_options,
             )
             if group:
                 group_name = str(NodePath(path_group).relative_to(parent))
diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py
index b98d226cac6..4c31c322d38 100644
--- a/xarray/backends/scipy_.py
+++ b/xarray/backends/scipy_.py
@@ -3,7 +3,6 @@
 import gzip
 import io
 import os
-from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any

 import numpy as np
@@ -12,6 +11,7 @@
     BACKEND_ENTRYPOINTS,
     BackendArray,
     BackendEntrypoint,
+    CoderOptions,
     WritableCFDataStore,
     _normalize_path,
 )
@@ -286,6 +286,8 @@ class ScipyBackendEntrypoint(BackendEntrypoint):
     backends.H5netcdfBackendEntrypoint
     """

+    coder_class = CoderOptions
+
     description = "Open netCDF files (.nc, .nc4, .cdf and .gz) using scipy in Xarray"
     url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ScipyBackendEntrypoint.html"

@@ -310,35 +312,26 @@ def open_dataset(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
-        mode="r",
         format=None,
         group=None,
         mmap=None,
         lock=None,
+        coder_options: CoderOptions | None = None,
+        **kwargs,
     ) -> Dataset:
+        coder_options = (
+            coder_options if coder_options is not None else self.coder_class()
+        )
         filename_or_obj = _normalize_path(filename_or_obj)
         store = ScipyDataStore(
-            filename_or_obj, mode=mode, format=format, group=group, mmap=mmap, lock=lock
+            filename_or_obj, format=format, group=group, mmap=mmap, lock=lock
         )

         store_entrypoint = StoreBackendEntrypoint()
         with close_on_error(store):
             ds = store_entrypoint.open_dataset(
                 store,
-                mask_and_scale=mask_and_scale,
-                decode_times=decode_times,
-                concat_characters=concat_characters,
-                decode_coords=decode_coords,
-                drop_variables=drop_variables,
-                use_cftime=use_cftime,
-                decode_timedelta=decode_timedelta,
+                coder_options=coder_options,
             )
         return ds
diff --git a/xarray/backends/store.py b/xarray/backends/store.py
index b1b3956ca8e..e9db592d223 100644
--- a/xarray/backends/store.py
+++ b/xarray/backends/store.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-from collections.abc import Iterable
 from typing import TYPE_CHECKING, Any

 from xarray import conventions
@@ -8,6 +7,7 @@
     BACKEND_ENTRYPOINTS,
     AbstractDataStore,
     BackendEntrypoint,
+    CoderOptions,
 )
 from xarray.core.dataset import Dataset

@@ -21,6 +21,8 @@ class StoreBackendEntrypoint(BackendEntrypoint):
     description = "Open AbstractDataStore instances in Xarray"
     url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.StoreBackendEntrypoint.html"

+    coder_class = CoderOptions
+
     def guess_can_open(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
@@ -31,29 +33,18 @@ def open_dataset(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
+        coder_options: CoderOptions | None = None,
     ) -> Dataset:
         assert isinstance(filename_or_obj, AbstractDataStore)

         vars, attrs = filename_or_obj.load()
         encoding = filename_or_obj.get_encoding()

+        coder_options = (
+            coder_options if coder_options is not None else self.coder_class()
+        )
         vars, attrs, coord_names = conventions.decode_cf_variables(
-            vars,
-            attrs,
-            mask_and_scale=mask_and_scale,
-            decode_times=decode_times,
-            concat_characters=concat_characters,
-            decode_coords=decode_coords,
-            drop_variables=drop_variables,
-            use_cftime=use_cftime,
-            decode_timedelta=decode_timedelta,
+            vars, attrs, **coder_options.to_kwargs()
         )

         ds = Dataset(vars, attrs=attrs)
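The ``to_kwargs()`` hand-off above is equivalent to spelling the decoder kwargs out by hand. A sketch against ``conventions.decode_cf_variables``:

.. code-block:: python

    from xarray import conventions
    from xarray.backends import CoderOptions

    opts = CoderOptions(decode_times=False, drop_variables="junk")


    def decode(variables, attributes):
        # Same call as decode_cf_variables(variables, attributes,
        # decode_times=False, drop_variables="junk").
        return conventions.decode_cf_variables(
            variables, attributes, **opts.to_kwargs()
        )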
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 48405b906cd..9875833789b 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -17,8 +17,10 @@
     AbstractWritableDataStore,
     BackendArray,
     BackendEntrypoint,
+    CoderOptions,
     _encode_variable_name,
     _normalize_path,
+    _validate_kwargs_for_dataclass,
     datatree_from_dict_with_io_cleanup,
     ensure_dtype_not_object,
 )
@@ -1330,23 +1332,16 @@ def open_zarr(
     group=None,
     synchronizer=None,
     chunks="auto",
-    decode_cf=True,
-    mask_and_scale=True,
-    decode_times=True,
-    concat_characters=True,
-    decode_coords=True,
-    drop_variables=None,
     consolidated=None,
     overwrite_encoded_chunks=False,
     chunk_store=None,
    storage_options=None,
-    decode_timedelta=None,
-    use_cftime=None,
     zarr_version=None,
     zarr_format=None,
     use_zarr_fill_value_as_mask=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
+    coder_options: CoderOptions | None = None,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -1379,32 +1382,93 @@ def open_zarr(
     overwrite_encoded_chunks : bool, optional
         Whether to drop the zarr chunks encoded for each variable when a
         dataset is loaded with specified chunk sizes (default: False)
-    decode_cf : bool, optional
-        Whether to decode these variables, assuming they were saved according
-        to CF conventions.
-    mask_and_scale : bool, optional
-        If True, replace array values equal to `_FillValue` with NA and scale
-        values according to the formula `original_values * scale_factor +
-        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
-        taken from variable attributes (if they exist). If the `_FillValue` or
-        `missing_value` attribute contains multiple values a warning will be
-        issued and all array values matching one of the multiple values will
-        be replaced by NA.
-    decode_times : bool, optional
-        If True, decode times encoded in the standard NetCDF datetime format
-        into datetime objects. Otherwise, leave them encoded as numbers.
-    concat_characters : bool, optional
-        If True, concatenate along the last dimension of character arrays to
-        form string arrays. Dimensions will only be concatenated over (and
-        removed) if they have no corresponding variable and if they are only
-        used as the last dimension of character arrays.
-    decode_coords : bool, optional
-        If True, decode the 'coordinates' attribute to identify coordinates in
-        the resulting dataset.
-    drop_variables : str or iterable, optional
-        A variable or list of variables to exclude from being parsed from the
-        dataset. This may be useful to drop variables with problems or
-        inconsistent values.
+    coder_options : CoderOptions, optional
+        Dataclass collecting the CF-decoding keyword arguments listed below.
+        If set, it takes precedence over the equivalent keyword arguments
+        given individually:
+
+        - 'decode_cf' : bool, optional
+          Whether to decode these variables, assuming they were saved according
+          to CF conventions.
+
+        - 'mask_and_scale' : bool or dict-like, optional
+          If True, replace array values equal to `_FillValue` with NA and scale
+          values according to the formula `original_values * scale_factor +
+          add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
+          taken from variable attributes (if they exist). If the `_FillValue` or
+          `missing_value` attribute contains multiple values a warning will be
+          issued and all array values matching one of the multiple values will
+          be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``,
+          to toggle this feature per-variable individually.
+          This keyword may not be supported by all the backends.
+
+        - 'decode_times' : bool, CFDatetimeCoder or dict-like, optional
+          If True, decode times encoded in the standard NetCDF datetime format
+          into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them
+          encoded as numbers.
+          Pass a mapping, e.g. ``{"my_variable": False}``,
+          to toggle this feature per-variable individually.
+          This keyword may not be supported by all the backends.
+
+        - 'decode_timedelta' : bool, CFTimedeltaCoder, or dict-like, optional
+          If True, decode variables and coordinates with time units in
+          {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
+          into timedelta objects. If False, leave them encoded as numbers.
+          If None (default), assume the same value of ``decode_times``; if
+          ``decode_times`` is a :py:class:`coders.CFDatetimeCoder` instance, this
+          takes the form of a :py:class:`coders.CFTimedeltaCoder` instance with a
+          matching ``time_unit``.
+          Pass a mapping, e.g. ``{"my_variable": False}``,
+          to toggle this feature per-variable individually.
+          This keyword may not be supported by all the backends.
+
+        - 'use_cftime' : bool or dict-like, optional
+          Only relevant if encoded dates come from a standard calendar
+          (e.g. "gregorian", "proleptic_gregorian", "standard", or not
+          specified). If None (default), attempt to decode times to
+          ``np.datetime64[ns]`` objects; if this is not possible, decode times to
+          ``cftime.datetime`` objects. If True, always decode times to
+          ``cftime.datetime`` objects, regardless of whether or not they can be
+          represented using ``np.datetime64[ns]`` objects. If False, always
+          decode times to ``np.datetime64[ns]`` objects; if this is not possible
+          raise an error. Pass a mapping, e.g. ``{"my_variable": False}``,
+          to toggle this feature per-variable individually.
+          This keyword may not be supported by all the backends.
+
+          .. deprecated:: 2025.01.1
+             Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead.
+
+        - 'concat_characters' : bool or dict-like, optional
+          If True, concatenate along the last dimension of character arrays to
+          form string arrays. Dimensions will only be concatenated over (and
+          removed) if they have no corresponding variable and if they are only
+          used as the last dimension of character arrays.
+          Pass a mapping, e.g. ``{"my_variable": False}``,
+          to toggle this feature per-variable individually.
+          This keyword may not be supported by all the backends.
+
+        - 'decode_coords' : bool or {"coordinates", "all"}, optional
+          Controls which variables are set as coordinate variables:
+
+          - "coordinates" or True: Set variables referred to in the
+            ``'coordinates'`` attribute of the datasets or individual variables
+            as coordinate variables.
+          - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and
+            other attributes as coordinate variables.
+
+          Only existing variables can be set as coordinates. Missing variables
+          will be silently ignored.
+
+        - 'drop_variables' : str or iterable of str, optional
+          A variable or list of variables to exclude from being parsed from the
+          dataset. This may be useful to drop variables with problems or
+          inconsistent values.
+
+        .. versionadded:: 2025.06.2
+           The new keyword argument 'coder_options' was added. For backwards
+           compatibility the coder options can still be given as individual
+           keyword arguments.
+
     consolidated : bool, optional
         Whether to open the store using zarr's consolidated metadata
         capability. Only works for stores that have already been consolidated.
@@ -1418,21 +1480,6 @@ def open_zarr(
     storage_options : dict, optional
         Any additional parameters for the storage backend (ignored for local
         paths).
-    decode_timedelta : bool, optional
-        If True, decode variables and coordinates with time units in
-        {'days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds'}
-        into timedelta objects. If False, leave them encoded as numbers.
-        If None (default), assume the same value of decode_time.
-    use_cftime : bool, optional
-        Only relevant if encoded dates come from a standard calendar
-        (e.g. "gregorian", "proleptic_gregorian", "standard", or not
-        specified). If None (default), attempt to decode times to
-        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
-        ``cftime.datetime`` objects. If True, always decode times to
-        ``cftime.datetime`` objects, regardless of whether or not they can be
-        represented using ``np.datetime64[ns]`` objects. If False, always
-        decode times to ``np.datetime64[ns]`` objects; if this is not possible
-        raise an error.
     zarr_version : int or None, optional
        .. deprecated:: 2024.9.1
@@ -1488,9 +1535,12 @@ def open_zarr(
         chunks = None

     if kwargs:
-        raise TypeError(
-            "open_zarr() got unexpected keyword arguments " + ",".join(kwargs.keys())
-        )
+        invalid_kwargs = _validate_kwargs_for_dataclass(CoderOptions, kwargs)
+        if invalid_kwargs:
+            raise TypeError(
+                "open_zarr() got unexpected keyword arguments "
+                + ",".join(invalid_kwargs.keys())
+            )

     backend_kwargs = {
         "synchronizer": synchronizer,
@@ -1505,21 +1555,15 @@ def open_zarr(
     ds = open_dataset(
         filename_or_obj=store,
         group=group,
-        decode_cf=decode_cf,
-        mask_and_scale=mask_and_scale,
-        decode_times=decode_times,
-        concat_characters=concat_characters,
-        decode_coords=decode_coords,
         engine="zarr",
         chunks=chunks,
-        drop_variables=drop_variables,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
-        decode_timedelta=decode_timedelta,
-        use_cftime=use_cftime,
         zarr_version=zarr_version,
         use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask,
+        coder_options=coder_options,
+        **kwargs,
     )
     return ds
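In user code the change above amounts to the following. A sketch with a hypothetical store path, assuming the patch is applied:

.. code-block:: python

    import xarray as xr
    from xarray.backends import CoderOptions

    # Both spellings decode identically.
    ds = xr.open_zarr("example.zarr", coder_options=CoderOptions(mask_and_scale=False))
    ds = xr.open_zarr("example.zarr", mask_and_scale=False)  # validated, then wrapped

    # Anything that is not a CoderOptions field still fails fast:
    # xr.open_zarr("example.zarr", not_a_decoder=True)  -> TypeError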
@@ -1536,6 +1580,7 @@ class ZarrBackendEntrypoint(BackendEntrypoint):
     backends.ZarrStore
     """

+    coder_class = CoderOptions
     description = "Open zarr files (.zarr) using zarr in Xarray"
     url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ZarrBackendEntrypoint.html"

@@ -1553,15 +1598,7 @@ def open_dataset(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group=None,
-        mode="r",
         synchronizer=None,
         consolidated=None,
         chunk_store=None,
@@ -1572,13 +1609,16 @@ def open_dataset(
         engine=None,
         use_zarr_fill_value_as_mask=None,
         cache_members: bool = True,
+        coder_options: CoderOptions | None = None,
     ) -> Dataset:
+        coder_options = (
+            coder_options if coder_options is not None else self.coder_class()
+        )
         filename_or_obj = _normalize_path(filename_or_obj)
         if not store:
             store = ZarrStore.open_group(
                 filename_or_obj,
                 group=group,
-                mode=mode,
                 synchronizer=synchronizer,
                 consolidated=consolidated,
                 consolidate_on_close=False,
@@ -1594,13 +1634,7 @@ def open_dataset(
         with close_on_error(store):
             ds = store_entrypoint.open_dataset(
                 store,
-                mask_and_scale=mask_and_scale,
-                decode_times=decode_times,
-                concat_characters=concat_characters,
-                decode_coords=decode_coords,
-                drop_variables=drop_variables,
-                use_cftime=use_cftime,
-                decode_timedelta=decode_timedelta,
+                coder_options=coder_options,
             )
         return ds

@@ -1608,40 +1642,26 @@ def open_datatree(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group: str | None = None,
-        mode="r",
         synchronizer=None,
         consolidated=None,
         chunk_store=None,
         storage_options=None,
         zarr_version=None,
         zarr_format=None,
+        coder_options: CoderOptions | None = None,
     ) -> DataTree:
         filename_or_obj = _normalize_path(filename_or_obj)
         groups_dict = self.open_groups_as_dict(
             filename_or_obj=filename_or_obj,
-            mask_and_scale=mask_and_scale,
-            decode_times=decode_times,
-            concat_characters=concat_characters,
-            decode_coords=decode_coords,
-            drop_variables=drop_variables,
-            use_cftime=use_cftime,
-            decode_timedelta=decode_timedelta,
             group=group,
-            mode=mode,
             synchronizer=synchronizer,
             consolidated=consolidated,
             chunk_store=chunk_store,
             storage_options=storage_options,
             zarr_version=zarr_version,
             zarr_format=zarr_format,
+            coder_options=coder_options,
         )

         return datatree_from_dict_with_io_cleanup(groups_dict)
@@ -1650,21 +1670,14 @@ def open_groups_as_dict(
         self,
         filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
         *,
-        mask_and_scale=True,
-        decode_times=True,
-        concat_characters=True,
-        decode_coords=True,
-        drop_variables: str | Iterable[str] | None = None,
-        use_cftime=None,
-        decode_timedelta=None,
         group: str | None = None,
-        mode="r",
         synchronizer=None,
         consolidated=None,
         chunk_store=None,
         storage_options=None,
         zarr_version=None,
         zarr_format=None,
+        coder_options: CoderOptions | None = None,
     ) -> dict[str, Dataset]:
         filename_or_obj = _normalize_path(filename_or_obj)

@@ -1677,7 +1690,6 @@ def open_groups_as_dict(
         stores = ZarrStore.open_store(
             filename_or_obj,
             group=parent,
-            mode=mode,
             synchronizer=synchronizer,
             consolidated=consolidated,
             consolidate_on_close=False,
@@ -1694,13 +1706,7 @@ def open_groups_as_dict(
             with close_on_error(store):
                 group_ds = store_entrypoint.open_dataset(
                     store,
-                    mask_and_scale=mask_and_scale,
-                    decode_times=decode_times,
-                    concat_characters=concat_characters,
-                    decode_coords=decode_coords,
-                    drop_variables=drop_variables,
-                    use_cftime=use_cftime,
-                    decode_timedelta=decode_timedelta,
+                    coder_options=coder_options,
                 )
                 if group:
                     group_name = str(NodePath(path_group).relative_to(parent))
diff --git a/xarray/conventions.py b/xarray/conventions.py
index 17f1e0666b6..1559ef633dd 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -390,7 +390,7 @@ def stackable(dim: Hashable) -> bool:

     if isinstance(drop_variables, str):
         drop_variables = [drop_variables]
-    elif drop_variables is None:
+    elif drop_variables is None or drop_variables is False:
         drop_variables = []
     drop_variables = set(drop_variables)
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 785b06a26fd..0d55db3f01f 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -2368,7 +2368,7 @@ def save(self, dataset, store_target, **kwargs):  # type: ignore[override]
     @contextlib.contextmanager
     def open(self, path, **kwargs):
         with xr.open_dataset(
-            path, engine="zarr", mode="r", **kwargs, **self.version_kwargs
+            path, engine="zarr", **kwargs, **self.version_kwargs
         ) as ds:
             yield ds

@@ -4427,14 +4427,13 @@ def test_phony_dims_warning(self) -> None:
                 fx = f.create_group(grp)
                 for k, v in var.items():
                     fx.create_dataset(k, data=v)
-            with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
-                with xr.open_dataset(tmp_file, engine="h5netcdf", group="bar") as ds:
-                    assert ds.sizes == {
-                        "phony_dim_0": 5,
-                        "phony_dim_1": 5,
-                        "phony_dim_2": 5,
-                        "phony_dim_3": 25,
-                    }
+            with xr.open_dataset(tmp_file, engine="h5netcdf", group="bar") as ds:
+                assert ds.sizes == {
+                    "phony_dim_0": 5,
+                    "phony_dim_1": 5,
+                    "phony_dim_2": 5,
+                    "phony_dim_3": 25,
+                }


 @requires_h5netcdf
diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py
index 9342423b727..322f0f18599 100644
--- a/xarray/tests/test_backends_api.py
+++ b/xarray/tests/test_backends_api.py
@@ -35,10 +35,12 @@ def test_custom_engine() -> None:
     )

     class CustomBackend(xr.backends.BackendEntrypoint):
+        coder_class = xr.backends.CoderOptions
+
         def open_dataset(
             self,
             filename_or_obj,
-            drop_variables=None,
+            coder_options=None,
             **kwargs,
         ) -> xr.Dataset:
             return expected.copy(deep=True)
@@ -54,10 +56,12 @@ def test_multiindex() -> None:
     dataset = dataset.stack(z=["coord1", "coord2"])

     class MultiindexBackend(xr.backends.BackendEntrypoint):
+        coder_class = xr.backends.CoderOptions
+
         def open_dataset(
             self,
             filename_or_obj,
-            drop_variables=None,
+            coder_options=None,
             **kwargs,
         ) -> xr.Dataset:
             return dataset.copy(deep=True)
@@ -69,7 +73,9 @@ def open_dataset(
 class PassThroughBackendEntrypoint(xr.backends.BackendEntrypoint):
     """Access an object passed to the `open_dataset` method."""

-    def open_dataset(self, dataset, *, drop_variables=None):
+    coder_class = xr.backends.CoderOptions
+
+    def open_dataset(self, dataset, *, coder_options=None):
         """Return the first argument."""
         return dataset
diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 518758a0efb..7a1c929d266 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -530,14 +530,13 @@ def test_phony_dims_warning(self, tmpdir) -> None:
                 for k, v in var.items():
                     fx.create_dataset(k, data=v)

-        with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
-            with open_datatree(filepath, engine=self.engine) as tree:
-                assert tree.bar.dims == {
-                    "phony_dim_0": 5,
-                    "phony_dim_1": 5,
-                    "phony_dim_2": 5,
-                    "phony_dim_3": 25,
-                }
+        with open_datatree(filepath, engine=self.engine) as tree:
+            assert tree.bar.dims == {
+                "phony_dim_0": 5,
+                "phony_dim_1": 5,
+                "phony_dim_2": 5,
+                "phony_dim_3": 25,
+            }


 @requires_zarr
diff --git a/xarray/tests/test_plugins.py b/xarray/tests/test_plugins.py
index e2129229c2c..a77a0933956 100644
--- a/xarray/tests/test_plugins.py
+++ b/xarray/tests/test_plugins.py
@@ -127,9 +127,5 @@ def test_set_missing_parameters() -> None:


 def test_set_missing_parameters_raise_error() -> None:
-    backend = DummyBackendEntrypointKwargs
-    with pytest.raises(TypeError):
-        plugins.set_missing_parameters({"engine": backend})
-
     backend_args = DummyBackendEntrypointArgs
     with pytest.raises(TypeError):
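A regression-style sketch (not part of the patch) pinning down the equivalence the deprecation fallback promises:

.. code-block:: python

    import xarray as xr
    from xarray.backends import CoderOptions


    def test_coder_options_equivalence(tmp_path) -> None:
        path = tmp_path / "example.nc"
        xr.Dataset({"a": ("x", [1, 2, 3])}).to_netcdf(path)

        via_kwargs = xr.open_dataset(path, decode_times=False)
        via_options = xr.open_dataset(
            path, coder_options=CoderOptions(decode_times=False)
        )
        assert via_kwargs.identical(via_options)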