Skip to content

Commit 58de38f

Browse files
authored
Merge branch 'main' into grouper-public-api
2 parents 64f78cd + 04b38a0 commit 58de38f

26 files changed

+873
-341
lines changed

asv_bench/benchmarks/dataset_io.py

Lines changed: 112 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import pandas as pd
88

99
import xarray as xr
10+
from xarray.backends.api import open_datatree
11+
from xarray.core.datatree import DataTree
1012

1113
from . import _skip_slow, parameterized, randint, randn, requires_dask
1214

@@ -16,7 +18,6 @@
1618
except ImportError:
1719
pass
1820

19-
2021
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
2122

2223
_ENGINES = tuple(xr.backends.list_engines().keys() - {"store"})
@@ -469,6 +470,116 @@ def create_delayed_write():
469470
return ds.to_netcdf("file.nc", engine="netcdf4", compute=False)
470471

471472

473+
class IONestedDataTree:
474+
"""
475+
A few examples that benchmark reading/writing a heavily nested netCDF datatree with
476+
xarray
477+
"""
478+
479+
timeout = 300.0
480+
repeat = 1
481+
number = 5
482+
483+
def make_datatree(self, nchildren=10):
484+
# multiple Dataset
485+
self.ds = xr.Dataset()
486+
self.nt = 1000
487+
self.nx = 90
488+
self.ny = 45
489+
self.nchildren = nchildren
490+
491+
self.block_chunks = {
492+
"time": self.nt / 4,
493+
"lon": self.nx / 3,
494+
"lat": self.ny / 3,
495+
}
496+
497+
self.time_chunks = {"time": int(self.nt / 36)}
498+
499+
times = pd.date_range("1970-01-01", periods=self.nt, freq="D")
500+
lons = xr.DataArray(
501+
np.linspace(0, 360, self.nx),
502+
dims=("lon",),
503+
attrs={"units": "degrees east", "long_name": "longitude"},
504+
)
505+
lats = xr.DataArray(
506+
np.linspace(-90, 90, self.ny),
507+
dims=("lat",),
508+
attrs={"units": "degrees north", "long_name": "latitude"},
509+
)
510+
self.ds["foo"] = xr.DataArray(
511+
randn((self.nt, self.nx, self.ny), frac_nan=0.2),
512+
coords={"lon": lons, "lat": lats, "time": times},
513+
dims=("time", "lon", "lat"),
514+
name="foo",
515+
attrs={"units": "foo units", "description": "a description"},
516+
)
517+
self.ds["bar"] = xr.DataArray(
518+
randn((self.nt, self.nx, self.ny), frac_nan=0.2),
519+
coords={"lon": lons, "lat": lats, "time": times},
520+
dims=("time", "lon", "lat"),
521+
name="bar",
522+
attrs={"units": "bar units", "description": "a description"},
523+
)
524+
self.ds["baz"] = xr.DataArray(
525+
randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
526+
coords={"lon": lons, "lat": lats},
527+
dims=("lon", "lat"),
528+
name="baz",
529+
attrs={"units": "baz units", "description": "a description"},
530+
)
531+
532+
self.ds.attrs = {"history": "created for xarray benchmarking"}
533+
534+
self.oinds = {
535+
"time": randint(0, self.nt, 120),
536+
"lon": randint(0, self.nx, 20),
537+
"lat": randint(0, self.ny, 10),
538+
}
539+
self.vinds = {
540+
"time": xr.DataArray(randint(0, self.nt, 120), dims="x"),
541+
"lon": xr.DataArray(randint(0, self.nx, 120), dims="x"),
542+
"lat": slice(3, 20),
543+
}
544+
root = {f"group_{group}": self.ds for group in range(self.nchildren)}
545+
nested_tree1 = {
546+
f"group_{group}/subgroup_1": xr.Dataset() for group in range(self.nchildren)
547+
}
548+
nested_tree2 = {
549+
f"group_{group}/subgroup_2": xr.DataArray(np.arange(1, 10)).to_dataset(
550+
name="a"
551+
)
552+
for group in range(self.nchildren)
553+
}
554+
nested_tree3 = {
555+
f"group_{group}/subgroup_2/sub-subgroup_1": self.ds
556+
for group in range(self.nchildren)
557+
}
558+
dtree = root | nested_tree1 | nested_tree2 | nested_tree3
559+
self.dtree = DataTree.from_dict(dtree)
560+
561+
562+
class IOReadDataTreeNetCDF4(IONestedDataTree):
563+
def setup(self):
564+
# TODO: Lazily skipped in CI as it is very demanding and slow.
565+
# Improve times and remove errors.
566+
_skip_slow()
567+
568+
requires_dask()
569+
570+
self.make_datatree()
571+
self.format = "NETCDF4"
572+
self.filepath = "datatree.nc4.nc"
573+
dtree = self.dtree
574+
dtree.to_netcdf(filepath=self.filepath)
575+
576+
def time_load_datatree_netcdf4(self):
577+
open_datatree(self.filepath, engine="netcdf4").load()
578+
579+
def time_open_datatree_netcdf4(self):
580+
open_datatree(self.filepath, engine="netcdf4")
581+
582+
472583
class IOWriteNetCDFDask:
473584
timeout = 60
474585
repeat = 1

doc/whats-new.rst

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ New Features
2424
~~~~~~~~~~~~
2525
- Allow chunking for arrays with duplicated dimension names (:issue:`8759`, :pull:`9099`).
2626
By `Martin Raspaud <https://github.com/mraspaud>`_.
27+
- Extract the source url from fsspec objects (:issue:`9142`, :pull:`8923`).
28+
By `Justus Magin <https://github.com/keewis>`_.
2729

2830
Breaking changes
2931
~~~~~~~~~~~~~~~~
@@ -35,11 +37,21 @@ Deprecations
3537

3638
Bug fixes
3739
~~~~~~~~~
40+
- Don't convert custom indexes to ``pandas`` indexes when computing a diff (:pull:`9157`)
41+
By `Justus Magin <https://github.com/keewis>`_.
3842
- Make :py:func:`testing.assert_allclose` work with numpy 2.0 (:issue:`9165`, :pull:`9166`).
3943
By `Pontus Lurcock <https://github.com/pont-us>`_.
44+
- Allow diffing objects with array attributes on variables (:issue:`9153`, :pull:`9169`).
45+
By `Justus Magin <https://github.com/keewis>`_.
4046
- Promote floating-point numeric datetimes before decoding (:issue:`9179`, :pull:`9182`).
4147
By `Justus Magin <https://github.com/keewis>`_.
42-
48+
- Fix static typing of tolerance arguments by allowing `str` type (:issue:`8892`, :pull:`9194`).
49+
By `Michael Niklas <https://github.com/headtr1ck>`_.
50+
- Dark themes are now properly detected for ``html[data-theme=dark]``-tags (:pull:`9200`).
51+
By `Dieter Werthmüller <https://github.com/prisae>`_.
52+
- Reductions no longer fail for ``np.complex_`` dtype arrays when numbagg is
53+
installed.
54+
By `Maximilian Roos <https://github.com/max-sixty>`_.
4355

4456
Documentation
4557
~~~~~~~~~~~~~

properties/test_pandas_roundtrip.py

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import pytest
1010

1111
import xarray as xr
12-
from xarray.tests import has_pandas_3
1312

1413
pytest.importorskip("hypothesis")
1514
import hypothesis.extra.numpy as npst # isort:skip
@@ -25,22 +24,34 @@
2524

2625
numeric_series = numeric_dtypes.flatmap(lambda dt: pdst.series(dtype=dt))
2726

27+
28+
@st.composite
29+
def dataframe_strategy(draw):
30+
tz = draw(st.timezones())
31+
dtype = pd.DatetimeTZDtype(unit="ns", tz=tz)
32+
33+
datetimes = st.datetimes(
34+
min_value=pd.Timestamp("1677-09-21T00:12:43.145224193"),
35+
max_value=pd.Timestamp("2262-04-11T23:47:16.854775807"),
36+
timezones=st.just(tz),
37+
)
38+
39+
df = pdst.data_frames(
40+
[
41+
pdst.column("datetime_col", elements=datetimes),
42+
pdst.column("other_col", elements=st.integers()),
43+
],
44+
index=pdst.range_indexes(min_size=1, max_size=10),
45+
)
46+
return draw(df).astype({"datetime_col": dtype})
47+
48+
2849
an_array = npst.arrays(
2950
dtype=numeric_dtypes,
3051
shape=npst.array_shapes(max_dims=2), # can only convert 1D/2D to pandas
3152
)
3253

3354

34-
datetime_with_tz_strategy = st.datetimes(timezones=st.timezones())
35-
dataframe_strategy = pdst.data_frames(
36-
[
37-
pdst.column("datetime_col", elements=datetime_with_tz_strategy),
38-
pdst.column("other_col", elements=st.integers()),
39-
],
40-
index=pdst.range_indexes(min_size=1, max_size=10),
41-
)
42-
43-
4455
@st.composite
4556
def datasets_1d_vars(draw) -> xr.Dataset:
4657
"""Generate datasets with only 1D variables
@@ -111,11 +122,7 @@ def test_roundtrip_pandas_dataframe(df) -> None:
111122
xr.testing.assert_identical(arr, roundtripped.to_xarray())
112123

113124

114-
@pytest.mark.skipif(
115-
has_pandas_3,
116-
reason="fails to roundtrip on pandas 3 (see https://github.com/pydata/xarray/issues/9098)",
117-
)
118-
@given(df=dataframe_strategy)
125+
@given(df=dataframe_strategy())
119126
def test_roundtrip_pandas_dataframe_datetime(df) -> None:
120127
# Need to name the indexes, otherwise Xarray names them 'dim_0', 'dim_1'.
121128
df.index.name = "rows"

properties/test_properties.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import pytest
2+
3+
pytest.importorskip("hypothesis")
4+
5+
from hypothesis import given
6+
7+
import xarray as xr
8+
import xarray.testing.strategies as xrst
9+
10+
11+
@given(attrs=xrst.simple_attrs)
12+
def test_assert_identical(attrs):
13+
v = xr.Variable(dims=(), data=0, attrs=attrs)
14+
xr.testing.assert_identical(v, v.copy(deep=True))
15+
16+
ds = xr.Dataset(attrs=attrs)
17+
xr.testing.assert_identical(ds, ds.copy(deep=True))

xarray/backends/api.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -382,8 +382,11 @@ def _dataset_from_backend_dataset(
382382
ds.set_close(backend_ds._close)
383383

384384
# Ensure source filename always stored in dataset object
385-
if "source" not in ds.encoding and isinstance(filename_or_obj, (str, os.PathLike)):
386-
ds.encoding["source"] = _normalize_path(filename_or_obj)
385+
if "source" not in ds.encoding:
386+
path = getattr(filename_or_obj, "path", filename_or_obj)
387+
388+
if isinstance(path, (str, os.PathLike)):
389+
ds.encoding["source"] = _normalize_path(path)
387390

388391
return ds
389392

xarray/backends/zarr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ def open_store(
446446
stacklevel=stacklevel,
447447
zarr_version=zarr_version,
448448
)
449-
group_paths = [str(group / node[1:]) for node in _iter_zarr_groups(zarr_group)]
449+
group_paths = [node for node in _iter_zarr_groups(zarr_group, parent=group)]
450450
return {
451451
group: cls(
452452
zarr_group.get(group),

xarray/core/alignment.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def __init__(
137137
exclude_dims: str | Iterable[Hashable] = frozenset(),
138138
exclude_vars: Iterable[Hashable] = frozenset(),
139139
method: str | None = None,
140-
tolerance: int | float | Iterable[int | float] | None = None,
140+
tolerance: float | Iterable[float] | str | None = None,
141141
copy: bool = True,
142142
fill_value: Any = dtypes.NA,
143143
sparse: bool = False,
@@ -965,7 +965,7 @@ def reindex(
965965
obj: T_Alignable,
966966
indexers: Mapping[Any, Any],
967967
method: str | None = None,
968-
tolerance: int | float | Iterable[int | float] | None = None,
968+
tolerance: float | Iterable[float] | str | None = None,
969969
copy: bool = True,
970970
fill_value: Any = dtypes.NA,
971971
sparse: bool = False,
@@ -1004,7 +1004,7 @@ def reindex_like(
10041004
obj: T_Alignable,
10051005
other: Dataset | DataArray,
10061006
method: str | None = None,
1007-
tolerance: int | float | Iterable[int | float] | None = None,
1007+
tolerance: float | Iterable[float] | str | None = None,
10081008
copy: bool = True,
10091009
fill_value: Any = dtypes.NA,
10101010
) -> T_Alignable:

xarray/core/dataarray.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1910,7 +1910,7 @@ def reindex_like(
19101910
other: T_DataArrayOrSet,
19111911
*,
19121912
method: ReindexMethodOptions = None,
1913-
tolerance: int | float | Iterable[int | float] | None = None,
1913+
tolerance: float | Iterable[float] | str | None = None,
19141914
copy: bool = True,
19151915
fill_value=dtypes.NA,
19161916
) -> Self:
@@ -1937,7 +1937,7 @@ def reindex_like(
19371937
- backfill / bfill: propagate next valid index value backward
19381938
- nearest: use nearest valid index value
19391939
1940-
tolerance : optional
1940+
tolerance : float | Iterable[float] | str | None, default: None
19411941
Maximum distance between original and new labels for inexact
19421942
matches. The values of the index at the matching locations must
19431943
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
@@ -2097,7 +2097,7 @@ def reindex(
20972097
indexers: Mapping[Any, Any] | None = None,
20982098
*,
20992099
method: ReindexMethodOptions = None,
2100-
tolerance: float | Iterable[float] | None = None,
2100+
tolerance: float | Iterable[float] | str | None = None,
21012101
copy: bool = True,
21022102
fill_value=dtypes.NA,
21032103
**indexers_kwargs: Any,
@@ -2127,7 +2127,7 @@ def reindex(
21272127
- backfill / bfill: propagate next valid index value backward
21282128
- nearest: use nearest valid index value
21292129
2130-
tolerance : float | Iterable[float] | None, default: None
2130+
tolerance : float | Iterable[float] | str | None, default: None
21312131
Maximum distance between original and new labels for inexact
21322132
matches. The values of the index at the matching locations must
21332133
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

xarray/core/dataset.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3500,7 +3500,7 @@ def reindex_like(
35003500
self,
35013501
other: T_Xarray,
35023502
method: ReindexMethodOptions = None,
3503-
tolerance: int | float | Iterable[int | float] | None = None,
3503+
tolerance: float | Iterable[float] | str | None = None,
35043504
copy: bool = True,
35053505
fill_value: Any = xrdtypes.NA,
35063506
) -> Self:
@@ -3527,7 +3527,7 @@ def reindex_like(
35273527
- "backfill" / "bfill": propagate next valid index value backward
35283528
- "nearest": use nearest valid index value
35293529
3530-
tolerance : optional
3530+
tolerance : float | Iterable[float] | str | None, default: None
35313531
Maximum distance between original and new labels for inexact
35323532
matches. The values of the index at the matching locations must
35333533
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
@@ -3570,7 +3570,7 @@ def reindex(
35703570
self,
35713571
indexers: Mapping[Any, Any] | None = None,
35723572
method: ReindexMethodOptions = None,
3573-
tolerance: int | float | Iterable[int | float] | None = None,
3573+
tolerance: float | Iterable[float] | str | None = None,
35743574
copy: bool = True,
35753575
fill_value: Any = xrdtypes.NA,
35763576
**indexers_kwargs: Any,
@@ -3595,7 +3595,7 @@ def reindex(
35953595
- "backfill" / "bfill": propagate next valid index value backward
35963596
- "nearest": use nearest valid index value
35973597
3598-
tolerance : optional
3598+
tolerance : float | Iterable[float] | str | None, default: None
35993599
Maximum distance between original and new labels for inexact
36003600
matches. The values of the index at the matching locations must
36013601
satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

0 commit comments

Comments
 (0)