
Commit ba989f6

aurghs, TheRed86 and alexamici authored
Update signature open_dataset for API v2 (#4547)
* add in api.open_dataset dispatching to stub apiv2
* remove in apiv2 check for input AbstractDataStore
* bugfix typo
* add kwarg engines in _get_backend_cls needed by apiv2
* add alpha support for h5netcdf
* style: clean not used code, modify some variable/function name
* Add ENGINES entry for cfgrib.
* Define function open_backend_dataset_cfgrib() to be used in apiv2.py. Add necessary imports for this function.
* Apply black to check formatting.
* Apply black to check formatting.
* add dummy zarr apiv2 backend
* align apiv2.open_dataset to api.open_dataset
* remove unused extra_coords in open_backend_dataset_*
* remove extra_coords in open_backend_dataset_cfgrib
* transform zarr maybe_chunk and get_chunks in classmethod - to be used in apiv2 without instantiate the object
* make alpha zarr apiv2 working
* refactor apiv2.open_dataset: - modify signature - move default setting inside backends
* move dataset_from_backend_dataset out of apiv2.open_dataset
* remove blank lines
* remove blank lines
* style
* Re-write error messages
* Fix code style
* Fix code style
* remove unused import
* replace warning with ValueError for not supported kwargs in backends
* change zarr.ZarStore.get_chunks into a static method
* group `backend_kwargs` and `kwargs` in `extra_tokes` argument in apiv2.dataset_from_backend_dataset`
* remove in open_backend_dayaset_${engine} signature kwarags and the related error message
* black
* Change signature of open_dataset function in apiv2 to include explicit decodings.
* Set an alias for chunks='auto'.
* Allign empty rows with previous version.
* reverse changes in chunks management
* move check on decoders from backends to open_dataset (apiv2)
* update documentation
* Change signature of open_dataset function in apiv2 to include explicit decodings.
* Set an alias for chunks='auto'.
* Allign empty rows with previous version.
* reverse changes in chunks management
* move check on decoders from backends to open_dataset (apiv2)
* update documentation
* change defaut value for decode_cf in open_dataset. The function bahaviour is unchanged.
* Review docstring of open_dataset function.
* bugfix typo
* - add check on backends signatures - add plugins.py cotaining backneds info
* - black isort
* - add type declaration in plugins.py
* Fix the type hint for ENGINES
* Drop special case and simplify resolve_decoders_kwargs
* isort

Co-authored-by: TheRed86 <m.rossetti@bopen.eu>
Co-authored-by: Alessandro Amici <a.amici@bopen.eu>
1 parent 235b2e5 commit ba989f6
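For orientation, a minimal usage sketch of the opt-in code path this commit extends (the data file name is hypothetical and not part of the commit; only the "h5netcdf", "zarr" and "cfgrib" engines are dispatched to apiv2 at this stage):

    import os

    # Opt in to the experimental backend API before calling open_dataset.
    os.environ["XARRAY_BACKEND_API"] = "v2"

    import xarray as xr

    # Decoding options such as decode_times are now explicit keyword arguments
    # of the apiv2 open_dataset and are forwarded to the selected backend.
    ds = xr.open_dataset("example.nc", engine="h5netcdf", decode_times=False)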

6 files changed: +133, -91 lines


xarray/backends/api.py

Lines changed: 2 additions & 2 deletions
@@ -435,9 +435,9 @@ def open_dataset(
     """
     if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2":
         kwargs = locals().copy()
-        from . import apiv2
+        from . import apiv2, plugins

-        if engine in apiv2.ENGINES:
+        if engine in plugins.ENGINES:
             return apiv2.open_dataset(**kwargs)

     if autoclose is not None:
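Only engines registered in plugins.ENGINES take the new path; every other engine falls through to the unchanged v1 implementation. A quick way to check which engines are currently dispatched (assuming the optional backend modules import cleanly):

    from xarray.backends import plugins

    print(sorted(plugins.ENGINES))       # ['cfgrib', 'h5netcdf', 'zarr']
    print("netcdf4" in plugins.ENGINES)  # False: still handled by the v1 code path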

xarray/backends/apiv2.py

Lines changed: 100 additions & 65 deletions
@@ -1,20 +1,14 @@
 import os

 from ..core.utils import is_remote_uri
-from . import cfgrib_, h5netcdf_, zarr
+from . import plugins, zarr
 from .api import (
     _autodetect_engine,
     _get_backend_cls,
     _normalize_path,
     _protect_dataset_variables_inplace,
 )

-ENGINES = {
-    "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf,
-    "zarr": zarr.open_backend_dataset_zarr,
-    "cfgrib": cfgrib_.open_backend_dataset_cfgrib,
-}
-

 def dataset_from_backend_dataset(
     ds,
@@ -23,7 +17,7 @@ def dataset_from_backend_dataset(
     chunks,
     cache,
     overwrite_encoded_chunks,
-    extra_tokens,
+    **extra_tokens,
 ):
     if not (isinstance(chunks, (int, dict)) or chunks is None):
         if chunks != "auto":
@@ -73,17 +67,34 @@ def dataset_from_backend_dataset(
     # Ensure source filename always stored in dataset object (GH issue #2550)
     if "source" not in ds.encoding:
         if isinstance(filename_or_obj, str):
-            ds.encoding["source"] = filename_or_obj
+            ds2.encoding["source"] = filename_or_obj

     return ds2


+def resolve_decoders_kwargs(decode_cf, engine, **decoders):
+    signature = plugins.ENGINES[engine]["signature"]
+    if decode_cf is False:
+        for d in decoders:
+            if d in signature:
+                decoders[d] = False
+    return {k: v for k, v in decoders.items() if v is not None}
+
+
 def open_dataset(
     filename_or_obj,
     *,
     engine=None,
     chunks=None,
     cache=None,
+    decode_cf=None,
+    mask_and_scale=None,
+    decode_times=None,
+    decode_timedelta=None,
+    use_cftime=None,
+    concat_characters=None,
+    decode_coords=None,
+    drop_variables=None,
     backend_kwargs=None,
     **kwargs,
 ):
@@ -94,70 +105,50 @@ def open_dataset(
     filename_or_obj : str, Path, file-like or DataStore
         Strings and Path objects are interpreted as a path to a netCDF file
         or an OpenDAP URL and opened with python-netCDF4, unless the filename
-        ends with .gz, in which case the file is gunzipped and opened with
+        ends with .gz, in which case the file is unzipped and opened with
         scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
         objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
-    group : str, optional
-        Path to the netCDF4 group in the given file to open (only works for
-        netCDF4 files).
-    decode_cf : bool, optional
-        Whether to decode these variables, assuming they were saved according
-        to CF conventions.
-    mask_and_scale : bool, optional
-        If True, replace array values equal to `_FillValue` with NA and scale
-        values according to the formula `original_values * scale_factor +
-        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
-        taken from variable attributes (if they exist). If the `_FillValue` or
-        `missing_value` attribute contains multiple values a warning will be
-        issued and all array values matching one of the multiple values will
-        be replaced by NA. mask_and_scale defaults to True except for the
-        pseudonetcdf backend.
-    decode_times : bool, optional
-        If True, decode times encoded in the standard NetCDF datetime format
-        into datetime objects. Otherwise, leave them encoded as numbers.
-    autoclose : bool, optional
-        If True, automatically close files to avoid OS Error of too many files
-        being open. However, this option doesn't work with streams, e.g.,
-        BytesIO.
-    concat_characters : bool, optional
-        If True, concatenate along the last dimension of character arrays to
-        form string arrays. Dimensions will only be concatenated over (and
-        removed) if they have no corresponding variable and if they are only
-        used as the last dimension of character arrays.
-    decode_coords : bool, optional
-        If True, decode the 'coordinates' attribute to identify coordinates in
-        the resulting dataset.
-    engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \
-        "pseudonetcdf", "zarr"}, optional
+    engine : str, optional
         Engine to use when reading files. If not provided, the default engine
         is chosen based on available dependencies, with a preference for
-        "netcdf4".
+        "netcdf4". Options are: {"netcdf4", "scipy", "pydap", "h5netcdf",\
+        "pynio", "cfgrib", "pseudonetcdf", "zarr"}.
     chunks : int or dict, optional
         If chunks is provided, it is used to load the new dataset into dask
         arrays. ``chunks={}`` loads the dataset with dask using a single
         chunk for all arrays. When using ``engine="zarr"``, setting
         ``chunks='auto'`` will create dask chunks based on the variable's zarr
         chunks.
-    lock : False or lock-like, optional
-        Resource lock to use when reading data from disk. Only relevant when
-        using dask or another form of parallelism. By default, appropriate
-        locks are chosen to safely read and write files with the currently
-        active dask scheduler.
     cache : bool, optional
-        If True, cache data loaded from the underlying datastore in memory as
+        If True, cache data is loaded from the underlying datastore in memory as
         NumPy arrays when accessed to avoid reading from the underlying data-
         store multiple times. Defaults to True unless you specify the `chunks`
         argument to use dask, in which case it defaults to False. Does not
         change the behavior of coordinates corresponding to dimensions, which
         always load their data from disk into a ``pandas.Index``.
-    drop_variables: str or iterable, optional
-        A variable or list of variables to exclude from being parsed from the
-        dataset. This may be useful to drop variables with problems or
-        inconsistent values.
-    backend_kwargs: dict, optional
-        A dictionary of keyword arguments to pass on to the backend. This
-        may be useful when backend options would improve performance or
-        allow user control of dataset processing.
+    decode_cf : bool, optional
+        Setting ``decode_cf=False`` will disable ``mask_and_scale``,
+        ``decode_times``, ``decode_timedelta``, ``concat_characters``,
+        ``decode_coords``.
+    mask_and_scale : bool, optional
+        If True, array values equal to `_FillValue` are replaced with NA and other
+        values are scaled according to the formula `original_values * scale_factor +
+        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
+        taken from variable attributes (if they exist). If the `_FillValue` or
+        `missing_value` attribute contains multiple values, a warning will be
+        issued and all array values matching one of the multiple values will
+        be replaced by NA. mask_and_scale defaults to True except for the
+        pseudonetcdf backend. This keyword may not be supported by all the backends.
+    decode_times : bool, optional
+        If True, decode times encoded in the standard NetCDF datetime format
+        into datetime objects. Otherwise, leave them encoded as numbers.
+        This keyword may not be supported by all the backends.
+    decode_timedelta : bool, optional
+        If True, decode variables and coordinates with time units in
+        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
+        into timedelta objects. If False, they remain encoded as numbers.
+        If None (default), assume the same value of decode_time.
+        This keyword may not be supported by all the backends.
     use_cftime: bool, optional
         Only relevant if encoded dates come from a standard calendar
         (e.g. "gregorian", "proleptic_gregorian", "standard", or not
@@ -167,12 +158,38 @@ def open_dataset(
         ``cftime.datetime`` objects, regardless of whether or not they can be
         represented using ``np.datetime64[ns]`` objects. If False, always
         decode times to ``np.datetime64[ns]`` objects; if this is not possible
-        raise an error.
-    decode_timedelta : bool, optional
-        If True, decode variables and coordinates with time units in
-        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
-        into timedelta objects. If False, leave them encoded as numbers.
-        If None (default), assume the same value of decode_time.
+        raise an error. This keyword may not be supported by all the backends.
+    concat_characters : bool, optional
+        If True, concatenate along the last dimension of character arrays to
+        form string arrays. Dimensions will only be concatenated over (and
+        removed) if they have no corresponding variable and if they are only
+        used as the last dimension of character arrays.
+        This keyword may not be supported by all the backends.
+    decode_coords : bool, optional
+        If True, decode the 'coordinates' attribute to identify coordinates in
+        the resulting dataset. This keyword may not be supported by all the
+        backends.
+    drop_variables: str or iterable, optional
+        A variable or list of variables to exclude from the dataset parsing.
+        This may be useful to drop variables with problems or
+        inconsistent values.
+    backend_kwargs:
+        Additional keyword arguments passed on to the engine open function.
+    **kwargs: dict
+        Additional keyword arguments passed on to the engine open function.
+        For example:
+
+        - 'group': path to the netCDF4 group in the given file to open given as
+          a str,supported by "netcdf4", "h5netcdf", "zarr".
+
+        - 'lock': resource lock to use when reading data from disk. Only
+          relevant when using dask or another form of parallelism. By default,
+          appropriate locks are chosen to safely read and write files with the
+          currently active dask scheduler. Supported by "netcdf4", "h5netcdf",
+          "pynio", "pseudonetcdf", "cfgrib".
+
+        See engine open function for kwargs accepted by each specific engine.
+

     Returns
     -------
@@ -202,12 +219,27 @@ def open_dataset(
     if engine is None:
         engine = _autodetect_engine(filename_or_obj)

+    decoders = resolve_decoders_kwargs(
+        decode_cf,
+        engine=engine,
+        mask_and_scale=mask_and_scale,
+        decode_times=decode_times,
+        decode_timedelta=decode_timedelta,
+        concat_characters=concat_characters,
+        use_cftime=use_cftime,
+        decode_coords=decode_coords,
+    )
+
     backend_kwargs = backend_kwargs.copy()
     overwrite_encoded_chunks = backend_kwargs.pop("overwrite_encoded_chunks", None)

-    open_backend_dataset = _get_backend_cls(engine, engines=ENGINES)
+    open_backend_dataset = _get_backend_cls(engine, engines=plugins.ENGINES)[
+        "open_dataset"
+    ]
     backend_ds = open_backend_dataset(
         filename_or_obj,
+        drop_variables=drop_variables,
+        **decoders,
         **backend_kwargs,
         **{k: v for k, v in kwargs.items() if v is not None},
     )
@@ -218,7 +250,10 @@ def open_dataset(
         chunks,
         cache,
         overwrite_encoded_chunks,
-        {**backend_kwargs, **kwargs},
+        drop_variables=drop_variables,
+        **decoders,
+        **backend_kwargs,
+        **kwargs,
     )

     return ds
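The decoder handling that each backend previously implemented with an `if not decode_cf:` block now lives in `resolve_decoders_kwargs`. A standalone sketch of that logic, with the signature set passed in directly instead of being looked up in `plugins.ENGINES[engine]["signature"]`:

    def resolve_decoders_kwargs(decode_cf, signature, **decoders):
        # decode_cf=False switches off every decoder the backend actually accepts;
        # decoders left as None are dropped so the backend defaults apply.
        if decode_cf is False:
            for d in decoders:
                if d in signature:
                    decoders[d] = False
        return {k: v for k, v in decoders.items() if v is not None}

    # Hypothetical backend that only understands two of the decoders.
    signature = {"filename_or_obj", "mask_and_scale", "decode_times"}
    print(resolve_decoders_kwargs(False, signature, mask_and_scale=None,
                                  decode_times=True, decode_coords=None))
    # -> {'mask_and_scale': False, 'decode_times': False}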

xarray/backends/cfgrib_.py

Lines changed: 0 additions & 8 deletions
@@ -76,7 +76,6 @@ def get_encoding(self):
 def open_backend_dataset_cfgrib(
     filename_or_obj,
     *,
-    decode_cf=True,
     mask_and_scale=True,
     decode_times=None,
     concat_characters=None,
@@ -93,13 +92,6 @@ def open_backend_dataset_cfgrib(
     time_dims=("time", "step"),
 ):

-    if not decode_cf:
-        mask_and_scale = False
-        decode_times = False
-        concat_characters = False
-        decode_coords = False
-        decode_timedelta = False
-
     store = CfGribDataStore(
         filename_or_obj,
         indexpath=indexpath,

xarray/backends/h5netcdf_.py

Lines changed: 0 additions & 8 deletions
@@ -328,7 +328,6 @@ def close(self, **kwargs):
 def open_backend_dataset_h5necdf(
     filename_or_obj,
     *,
-    decode_cf=True,
     mask_and_scale=True,
     decode_times=None,
     concat_characters=None,
@@ -343,13 +342,6 @@ def open_backend_dataset_h5necdf(
     phony_dims=None,
 ):

-    if not decode_cf:
-        mask_and_scale = False
-        decode_times = False
-        concat_characters = False
-        decode_coords = False
-        decode_timedelta = False
-
     store = H5NetCDFStore.open(
         filename_or_obj,
         format=format,

xarray/backends/plugins.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import inspect
+import typing as T
+
+from . import cfgrib_, h5netcdf_, zarr
+
+ENGINES: T.Dict[str, T.Dict[str, T.Any]] = {
+    "h5netcdf": {
+        "open_dataset": h5netcdf_.open_backend_dataset_h5necdf,
+    },
+    "zarr": {
+        "open_dataset": zarr.open_backend_dataset_zarr,
+    },
+    "cfgrib": {
+        "open_dataset": cfgrib_.open_backend_dataset_cfgrib,
+    },
+}
+
+
+for engine in ENGINES.values():
+    if "signature" not in engine:
+        parameters = inspect.signature(engine["open_dataset"]).parameters
+        for name, param in parameters.items():
+            if param.kind in (
+                inspect.Parameter.VAR_KEYWORD,
+                inspect.Parameter.VAR_POSITIONAL,
+            ):
+                raise TypeError(
+                    f'All the parameters in {engine["open_dataset"]!r} signature should be explicit. '
+                    "*args and **kwargs is not supported"
+                )
+        engine["signature"] = set(parameters)
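The module-level loop records each backend's accepted keyword names, which is what `resolve_decoders_kwargs` consults. A small sketch of the same introspection applied to a hypothetical backend function:

    import inspect

    # Hypothetical backend entry point with a fully explicit signature
    # (a *args/**kwargs parameter would be rejected with TypeError above).
    def open_backend_dataset_example(filename_or_obj, *, mask_and_scale=True,
                                      decode_times=None, lock=None):
        ...

    parameters = inspect.signature(open_backend_dataset_example).parameters
    signature = set(parameters)
    print(sorted(signature))
    # -> ['decode_times', 'filename_or_obj', 'lock', 'mask_and_scale']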

xarray/backends/zarr.py

Lines changed: 0 additions & 8 deletions
@@ -693,7 +693,6 @@ def open_zarr(

 def open_backend_dataset_zarr(
     filename_or_obj,
-    decode_cf=True,
     mask_and_scale=True,
     decode_times=None,
     concat_characters=None,
@@ -709,13 +708,6 @@ def open_backend_dataset_zarr(
     chunk_store=None,
 ):

-    if not decode_cf:
-        mask_and_scale = False
-        decode_times = False
-        concat_characters = False
-        decode_coords = False
-        decode_timedelta = False
-
     store = ZarrStore.open_group(
         filename_or_obj,
         group=group,
