
Commit ba989f6

aurghs, TheRed86 and alexamici authored
Update signature open_dataset for API v2 (#4547)
* add in api.open_dataset dispatching to stub apiv2
* remove in apiv2 check for input AbstractDataStore
* bugfix typo
* add kwarg engines in _get_backend_cls needed by apiv2
* add alpha support for h5netcdf
* style: clean not used code, modify some variable/function name
* Add ENGINES entry for cfgrib.
* Define function open_backend_dataset_cfgrib() to be used in apiv2.py. Add necessary imports for this function.
* Apply black to check formatting.
* Apply black to check formatting.
* add dummy zarr apiv2 backend
* align apiv2.open_dataset to api.open_dataset
* remove unused extra_coords in open_backend_dataset_*
* remove extra_coords in open_backend_dataset_cfgrib
* transform zarr maybe_chunk and get_chunks in classmethod - to be used in apiv2 without instantiate the object
* make alpha zarr apiv2 working
* refactor apiv2.open_dataset: - modify signature - move default setting inside backends
* move dataset_from_backend_dataset out of apiv2.open_dataset
* remove blank lines
* remove blank lines
* style
* Re-write error messages
* Fix code style
* Fix code style
* remove unused import
* replace warning with ValueError for not supported kwargs in backends
* change zarr.ZarStore.get_chunks into a static method
* group `backend_kwargs` and `kwargs` in `extra_tokes` argument in apiv2.dataset_from_backend_dataset`
* remove in open_backend_dayaset_${engine} signature kwarags and the related error message
* black
* Change signature of open_dataset function in apiv2 to include explicit decodings.
* Set an alias for chunks='auto'.
* Allign empty rows with previous version.
* reverse changes in chunks management
* move check on decoders from backends to open_dataset (apiv2)
* update documentation
* Change signature of open_dataset function in apiv2 to include explicit decodings.
* Set an alias for chunks='auto'.
* Allign empty rows with previous version.
* reverse changes in chunks management
* move check on decoders from backends to open_dataset (apiv2)
* update documentation
* change defaut value for decode_cf in open_dataset. The function bahaviour is unchanged.
* Review docstring of open_dataset function.
* bugfix typo
* - add check on backends signatures - add plugins.py cotaining backneds info
* - black isort
* - add type declaration in plugins.py
* Fix the type hint for ENGINES
* Drop special case and simplify resolve_decoders_kwargs
* isort

Co-authored-by: TheRed86 <m.rossetti@bopen.eu>
Co-authored-by: Alessandro Amici <a.amici@bopen.eu>
1 parent 235b2e5 commit ba989f6
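For orientation, a minimal usage sketch of the opt-in code path this commit extends (the data file name is hypothetical and not part of the commit; only the "h5netcdf", "zarr" and "cfgrib" engines are dispatched to apiv2 at this stage):

    import os

    # Opt in to the experimental backend API before calling open_dataset.
    os.environ["XARRAY_BACKEND_API"] = "v2"

    import xarray as xr

    # Decoding options such as decode_times are now explicit keyword arguments
    # of the apiv2 open_dataset and are forwarded to the selected backend.
    ds = xr.open_dataset("example.nc", engine="h5netcdf", decode_times=False)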

6 files changed: +133, -91 lines


xarray/backends/api.py

Lines changed: 2 additions & 2 deletions
@@ -435,9 +435,9 @@ def open_dataset(
     """
     if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2":
         kwargs = locals().copy()
-        from . import apiv2
+        from . import apiv2, plugins

-        if engine in apiv2.ENGINES:
+        if engine in plugins.ENGINES:
             return apiv2.open_dataset(**kwargs)

     if autoclose is not None:
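Only engines registered in plugins.ENGINES take the new path; every other engine falls through to the unchanged v1 implementation. A quick way to check which engines are currently dispatched (assuming the optional backend modules import cleanly):

    from xarray.backends import plugins

    print(sorted(plugins.ENGINES))       # ['cfgrib', 'h5netcdf', 'zarr']
    print("netcdf4" in plugins.ENGINES)  # False: still handled by the v1 code path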

xarray/backends/apiv2.py

Lines changed: 100 additions & 65 deletions
@@ -1,20 +1,14 @@
 import os

 from ..core.utils import is_remote_uri
-from . import cfgrib_, h5netcdf_, zarr
+from . import plugins, zarr
 from .api import (
     _autodetect_engine,
     _get_backend_cls,
     _normalize_path,
     _protect_dataset_variables_inplace,
 )

-ENGINES = {
-    "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf,
-    "zarr": zarr.open_backend_dataset_zarr,
-    "cfgrib": cfgrib_.open_backend_dataset_cfgrib,
-}
-

 def dataset_from_backend_dataset(
     ds,
@@ -23,7 +17,7 @@ def dataset_from_backend_dataset(
     chunks,
     cache,
     overwrite_encoded_chunks,
-    extra_tokens,
+    **extra_tokens,
 ):
     if not (isinstance(chunks, (int, dict)) or chunks is None):
         if chunks != "auto":
@@ -73,17 +67,34 @@ def dataset_from_backend_dataset(
     # Ensure source filename always stored in dataset object (GH issue #2550)
     if "source" not in ds.encoding:
         if isinstance(filename_or_obj, str):
-            ds.encoding["source"] = filename_or_obj
+            ds2.encoding["source"] = filename_or_obj

     return ds2


+def resolve_decoders_kwargs(decode_cf, engine, **decoders):
+    signature = plugins.ENGINES[engine]["signature"]
+    if decode_cf is False:
+        for d in decoders:
+            if d in signature:
+                decoders[d] = False
+    return {k: v for k, v in decoders.items() if v is not None}
+
+
 def open_dataset(
     filename_or_obj,
     *,
     engine=None,
     chunks=None,
     cache=None,
+    decode_cf=None,
+    mask_and_scale=None,
+    decode_times=None,
+    decode_timedelta=None,
+    use_cftime=None,
+    concat_characters=None,
+    decode_coords=None,
+    drop_variables=None,
     backend_kwargs=None,
     **kwargs,
 ):
@@ -94,70 +105,50 @@ def open_dataset(
     filename_or_obj : str, Path, file-like or DataStore
         Strings and Path objects are interpreted as a path to a netCDF file
         or an OpenDAP URL and opened with python-netCDF4, unless the filename
-        ends with .gz, in which case the file is gunzipped and opened with
+        ends with .gz, in which case the file is unzipped and opened with
         scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
         objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
-    group : str, optional
-        Path to the netCDF4 group in the given file to open (only works for
-        netCDF4 files).
-    decode_cf : bool, optional
-        Whether to decode these variables, assuming they were saved according
-        to CF conventions.
-    mask_and_scale : bool, optional
-        If True, replace array values equal to `_FillValue` with NA and scale
-        values according to the formula `original_values * scale_factor +
-        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
-        taken from variable attributes (if they exist). If the `_FillValue` or
-        `missing_value` attribute contains multiple values a warning will be
-        issued and all array values matching one of the multiple values will
-        be replaced by NA. mask_and_scale defaults to True except for the
-        pseudonetcdf backend.
-    decode_times : bool, optional
-        If True, decode times encoded in the standard NetCDF datetime format
-        into datetime objects. Otherwise, leave them encoded as numbers.
-    autoclose : bool, optional
-        If True, automatically close files to avoid OS Error of too many files
-        being open. However, this option doesn't work with streams, e.g.,
-        BytesIO.
-    concat_characters : bool, optional
-        If True, concatenate along the last dimension of character arrays to
-        form string arrays. Dimensions will only be concatenated over (and
-        removed) if they have no corresponding variable and if they are only
-        used as the last dimension of character arrays.
-    decode_coords : bool, optional
-        If True, decode the 'coordinates' attribute to identify coordinates in
-        the resulting dataset.
-    engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \
-        "pseudonetcdf", "zarr"}, optional
+    engine : str, optional
         Engine to use when reading files. If not provided, the default engine
         is chosen based on available dependencies, with a preference for
-        "netcdf4".
+        "netcdf4". Options are: {"netcdf4", "scipy", "pydap", "h5netcdf",\
+        "pynio", "cfgrib", "pseudonetcdf", "zarr"}.
     chunks : int or dict, optional
         If chunks is provided, it is used to load the new dataset into dask
         arrays. ``chunks={}`` loads the dataset with dask using a single
         chunk for all arrays. When using ``engine="zarr"``, setting
         ``chunks='auto'`` will create dask chunks based on the variable's zarr
         chunks.
-    lock : False or lock-like, optional
-        Resource lock to use when reading data from disk. Only relevant when
-        using dask or another form of parallelism. By default, appropriate
-        locks are chosen to safely read and write files with the currently
-        active dask scheduler.
     cache : bool, optional
-        If True, cache data loaded from the underlying datastore in memory as
+        If True, cache data is loaded from the underlying datastore in memory as
         NumPy arrays when accessed to avoid reading from the underlying data-
         store multiple times. Defaults to True unless you specify the `chunks`
         argument to use dask, in which case it defaults to False. Does not
         change the behavior of coordinates corresponding to dimensions, which
         always load their data from disk into a ``pandas.Index``.
-    drop_variables: str or iterable, optional
-        A variable or list of variables to exclude from being parsed from the
-        dataset. This may be useful to drop variables with problems or
-        inconsistent values.
-    backend_kwargs: dict, optional
-        A dictionary of keyword arguments to pass on to the backend. This
-        may be useful when backend options would improve performance or
-        allow user control of dataset processing.
+    decode_cf : bool, optional
+        Setting ``decode_cf=False`` will disable ``mask_and_scale``,
+        ``decode_times``, ``decode_timedelta``, ``concat_characters``,
+        ``decode_coords``.
+    mask_and_scale : bool, optional
+        If True, array values equal to `_FillValue` are replaced with NA and other
+        values are scaled according to the formula `original_values * scale_factor +
+        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
+        taken from variable attributes (if they exist). If the `_FillValue` or
+        `missing_value` attribute contains multiple values, a warning will be
+        issued and all array values matching one of the multiple values will
+        be replaced by NA. mask_and_scale defaults to True except for the
+        pseudonetcdf backend. This keyword may not be supported by all the backends.
+    decode_times : bool, optional
+        If True, decode times encoded in the standard NetCDF datetime format
+        into datetime objects. Otherwise, leave them encoded as numbers.
+        This keyword may not be supported by all the backends.
+    decode_timedelta : bool, optional
+        If True, decode variables and coordinates with time units in
+        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
+        into timedelta objects. If False, they remain encoded as numbers.
+        If None (default), assume the same value of decode_time.
+        This keyword may not be supported by all the backends.
     use_cftime: bool, optional
         Only relevant if encoded dates come from a standard calendar
         (e.g. "gregorian", "proleptic_gregorian", "standard", or not
@@ -167,12 +158,38 @@ def open_dataset(
         ``cftime.datetime`` objects, regardless of whether or not they can be
         represented using ``np.datetime64[ns]`` objects. If False, always
         decode times to ``np.datetime64[ns]`` objects; if this is not possible
-        raise an error.
-    decode_timedelta : bool, optional
-        If True, decode variables and coordinates with time units in
-        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
-        into timedelta objects. If False, leave them encoded as numbers.
-        If None (default), assume the same value of decode_time.
+        raise an error. This keyword may not be supported by all the backends.
+    concat_characters : bool, optional
+        If True, concatenate along the last dimension of character arrays to
+        form string arrays. Dimensions will only be concatenated over (and
+        removed) if they have no corresponding variable and if they are only
+        used as the last dimension of character arrays.
+        This keyword may not be supported by all the backends.
+    decode_coords : bool, optional
+        If True, decode the 'coordinates' attribute to identify coordinates in
+        the resulting dataset. This keyword may not be supported by all the
+        backends.
+    drop_variables: str or iterable, optional
+        A variable or list of variables to exclude from the dataset parsing.
+        This may be useful to drop variables with problems or
+        inconsistent values.
+    backend_kwargs:
+        Additional keyword arguments passed on to the engine open function.
+    **kwargs: dict
+        Additional keyword arguments passed on to the engine open function.
+        For example:
+
+        - 'group': path to the netCDF4 group in the given file to open given as
+          a str,supported by "netcdf4", "h5netcdf", "zarr".
+
+        - 'lock': resource lock to use when reading data from disk. Only
+          relevant when using dask or another form of parallelism. By default,
+          appropriate locks are chosen to safely read and write files with the
+          currently active dask scheduler. Supported by "netcdf4", "h5netcdf",
+          "pynio", "pseudonetcdf", "cfgrib".
+
+        See engine open function for kwargs accepted by each specific engine.
+

     Returns
     -------
@@ -202,12 +219,27 @@ def open_dataset(
     if engine is None:
         engine = _autodetect_engine(filename_or_obj)

+    decoders = resolve_decoders_kwargs(
+        decode_cf,
+        engine=engine,
+        mask_and_scale=mask_and_scale,
+        decode_times=decode_times,
+        decode_timedelta=decode_timedelta,
+        concat_characters=concat_characters,
+        use_cftime=use_cftime,
+        decode_coords=decode_coords,
+    )
+
     backend_kwargs = backend_kwargs.copy()
     overwrite_encoded_chunks = backend_kwargs.pop("overwrite_encoded_chunks", None)

-    open_backend_dataset = _get_backend_cls(engine, engines=ENGINES)
+    open_backend_dataset = _get_backend_cls(engine, engines=plugins.ENGINES)[
+        "open_dataset"
+    ]
     backend_ds = open_backend_dataset(
         filename_or_obj,
+        drop_variables=drop_variables,
+        **decoders,
         **backend_kwargs,
         **{k: v for k, v in kwargs.items() if v is not None},
     )
@@ -218,7 +250,10 @@ def open_dataset(
         chunks,
         cache,
         overwrite_encoded_chunks,
-        {**backend_kwargs, **kwargs},
+        drop_variables=drop_variables,
+        **decoders,
+        **backend_kwargs,
+        **kwargs,
     )

     return ds
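The decoder handling that each backend previously implemented with an `if not decode_cf:` block now lives in `resolve_decoders_kwargs`. A standalone sketch of that logic, with the signature set passed in directly instead of being looked up in `plugins.ENGINES[engine]["signature"]`:

    def resolve_decoders_kwargs(decode_cf, signature, **decoders):
        # decode_cf=False switches off every decoder the backend actually accepts;
        # decoders left as None are dropped so the backend defaults apply.
        if decode_cf is False:
            for d in decoders:
                if d in signature:
                    decoders[d] = False
        return {k: v for k, v in decoders.items() if v is not None}

    # Hypothetical backend that only understands two of the decoders.
    signature = {"filename_or_obj", "mask_and_scale", "decode_times"}
    print(resolve_decoders_kwargs(False, signature, mask_and_scale=None,
                                  decode_times=True, decode_coords=None))
    # -> {'mask_and_scale': False, 'decode_times': False}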

xarray/backends/cfgrib_.py

Lines changed: 0 additions & 8 deletions
@@ -76,7 +76,6 @@ def get_encoding(self):
 def open_backend_dataset_cfgrib(
     filename_or_obj,
     *,
-    decode_cf=True,
     mask_and_scale=True,
     decode_times=None,
     concat_characters=None,
@@ -93,13 +92,6 @@ def open_backend_dataset_cfgrib(
     time_dims=("time", "step"),
 ):

-    if not decode_cf:
-        mask_and_scale = False
-        decode_times = False
-        concat_characters = False
-        decode_coords = False
-        decode_timedelta = False
-
     store = CfGribDataStore(
         filename_or_obj,
         indexpath=indexpath,

xarray/backends/h5netcdf_.py

Lines changed: 0 additions & 8 deletions
@@ -328,7 +328,6 @@ def close(self, **kwargs):
 def open_backend_dataset_h5necdf(
     filename_or_obj,
     *,
-    decode_cf=True,
     mask_and_scale=True,
     decode_times=None,
     concat_characters=None,
@@ -343,13 +342,6 @@ def open_backend_dataset_h5necdf(
     phony_dims=None,
 ):

-    if not decode_cf:
-        mask_and_scale = False
-        decode_times = False
-        concat_characters = False
-        decode_coords = False
-        decode_timedelta = False
-
     store = H5NetCDFStore.open(
         filename_or_obj,
         format=format,

xarray/backends/plugins.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import inspect
+import typing as T
+
+from . import cfgrib_, h5netcdf_, zarr
+
+ENGINES: T.Dict[str, T.Dict[str, T.Any]] = {
+    "h5netcdf": {
+        "open_dataset": h5netcdf_.open_backend_dataset_h5necdf,
+    },
+    "zarr": {
+        "open_dataset": zarr.open_backend_dataset_zarr,
+    },
+    "cfgrib": {
+        "open_dataset": cfgrib_.open_backend_dataset_cfgrib,
+    },
+}
+
+
+for engine in ENGINES.values():
+    if "signature" not in engine:
+        parameters = inspect.signature(engine["open_dataset"]).parameters
+        for name, param in parameters.items():
+            if param.kind in (
+                inspect.Parameter.VAR_KEYWORD,
+                inspect.Parameter.VAR_POSITIONAL,
+            ):
+                raise TypeError(
+                    f'All the parameters in {engine["open_dataset"]!r} signature should be explicit. '
+                    "*args and **kwargs is not supported"
+                )
+        engine["signature"] = set(parameters)
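The module-level loop records each backend's accepted keyword names, which is what `resolve_decoders_kwargs` consults. A small sketch of the same introspection applied to a hypothetical backend function:

    import inspect

    # Hypothetical backend entry point with a fully explicit signature
    # (a *args/**kwargs parameter would be rejected with TypeError above).
    def open_backend_dataset_example(filename_or_obj, *, mask_and_scale=True,
                                      decode_times=None, lock=None):
        ...

    parameters = inspect.signature(open_backend_dataset_example).parameters
    signature = set(parameters)
    print(sorted(signature))
    # -> ['decode_times', 'filename_or_obj', 'lock', 'mask_and_scale']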

xarray/backends/zarr.py

Lines changed: 0 additions & 8 deletions
@@ -693,7 +693,6 @@ def open_zarr(

 def open_backend_dataset_zarr(
     filename_or_obj,
-    decode_cf=True,
     mask_and_scale=True,
     decode_times=None,
     concat_characters=None,
@@ -709,13 +708,6 @@ def open_backend_dataset_zarr(
     chunk_store=None,
 ):

-    if not decode_cf:
-        mask_and_scale = False
-        decode_times = False
-        concat_characters = False
-        decode_coords = False
-        decode_timedelta = False
-
     store = ZarrStore.open_group(
         filename_or_obj,
         group=group,
