Commit 7d55345

Merge branch 'main' into backend-indexing
* main:
  Add whatsnew entry for #8974 (#9022)
  Zarr: Optimize appending (#8998)

2 parents: b46c320 + b9c124b

5 files changed: +243, -71 lines changed

.gitignore

Lines changed: 3 additions & 2 deletions
@@ -50,7 +50,8 @@ nosetests.xml
 dask-worker-space/

 # asv environments
-.asv
+asv_bench/.asv
+asv_bench/pkgs

 # Translations
 *.mo
@@ -68,7 +69,7 @@ dask-worker-space/

 # xarray specific
 doc/_build
-generated/
+doc/generated/
 xarray/tests/data/*.grib.*.idx

 # Sync tools

doc/whats-new.rst

Lines changed: 6 additions & 3 deletions
@@ -73,6 +73,8 @@ Bug fixes

 Internal Changes
 ~~~~~~~~~~~~~~~~
+- Enforces failures on CI when tests raise warnings from within xarray (:pull:`8974`)
+  By `Maximilian Roos <https://github.com/max-sixty>`_
 - Migrates ``formatting_html`` functionality for ``DataTree`` into ``xarray/core`` (:pull:`8930`)
   By `Eni Awowale <https://github.com/eni-awowale>`_, `Julia Signell <https://github.com/jsignell>`_
   and `Tom Nicholas <https://github.com/TomNicholas>`_.
@@ -87,9 +89,10 @@ Internal Changes
 - Migrates ``ops.py`` functionality into ``xarray/core/datatree_ops.py`` (:pull:`8976`)
   By `Matt Savoie <https://github.com/flamingbear>`_ and `Tom Nicholas <https://github.com/TomNicholas>`_.
 - ``transpose``, ``set_dims``, ``stack`` & ``unstack`` now use a ``dim`` kwarg
-  rather than ``dims`` or ``dimensions``. This is the final change to make xarray methods
-  consistent with their use of ``dim``. Using the existing kwarg will raise a
-  warning. By `Maximilian Roos <https://github.com/max-sixty>`_
+  rather than ``dims`` or ``dimensions``. This is the final change to unify
+  xarray functions to use ``dim``. Using the existing kwarg will raise a
+  warning.
+  By `Maximilian Roos <https://github.com/max-sixty>`_


 .. _whats-new.2024.03.0:
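
To make the ``dim`` unification above concrete, here is a minimal sketch of the renamed keyword on ``stack``/``unstack`` (the dataset is illustrative, not taken from the diff):

import numpy as np
import xarray as xr

ds = xr.Dataset({"t2m": (("x", "y"), np.zeros((2, 3)))})

# After this change the explicit keyword is `dim` across these methods:
stacked = ds.stack(dim={"z": ("x", "y")})
unstacked = stacked.unstack(dim="z")

# The old spellings (`dimensions` for stack/unstack, `dims` for
# Variable.set_dims) still work but now emit a deprecation warning.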

xarray/backends/api.py

Lines changed: 3 additions & 55 deletions
@@ -1521,42 +1521,6 @@ def save_mfdataset(
     )


-def _validate_datatypes_for_zarr_append(zstore, dataset):
-    """If variable exists in the store, confirm dtype of the data to append is compatible with
-    existing dtype.
-    """
-
-    existing_vars = zstore.get_variables()
-
-    def check_dtype(vname, var):
-        if (
-            vname not in existing_vars
-            or np.issubdtype(var.dtype, np.number)
-            or np.issubdtype(var.dtype, np.datetime64)
-            or np.issubdtype(var.dtype, np.bool_)
-            or var.dtype == object
-        ):
-            # We can skip dtype equality checks under two conditions: (1) if the var to append is
-            # new to the dataset, because in this case there is no existing var to compare it to;
-            # or (2) if var to append's dtype is known to be easy-to-append, because in this case
-            # we can be confident appending won't cause problems. Examples of dtypes which are not
-            # easy-to-append include length-specified strings of type `|S*` or `<U*` (where * is a
-            # positive integer character length). For these dtypes, appending dissimilar lengths
-            # can result in truncation of appended data. Therefore, variables which already exist
-            # in the dataset, and with dtypes which are not known to be easy-to-append, necessitate
-            # exact dtype equality, as checked below.
-            pass
-        elif not var.dtype == existing_vars[vname].dtype:
-            raise ValueError(
-                f"Mismatched dtypes for variable {vname} between Zarr store on disk "
-                f"and dataset to append. Store has dtype {existing_vars[vname].dtype} but "
-                f"dataset to append has dtype {var.dtype}."
-            )
-
-    for vname, var in dataset.data_vars.items():
-        check_dtype(vname, var)
-
-
 # compute=True returns ZarrStore
 @overload
 def to_zarr(
@@ -1712,37 +1676,21 @@ def to_zarr(

     if region is not None:
         zstore._validate_and_autodetect_region(dataset)
-        # can't modify indexed with region writes
+        # can't modify indexes with region writes
         dataset = dataset.drop_vars(dataset.indexes)
         if append_dim is not None and append_dim in region:
             raise ValueError(
                 f"cannot list the same dimension in both ``append_dim`` and "
                 f"``region`` with to_zarr(), got {append_dim} in both"
             )

-    if mode in ["a", "a-", "r+"]:
-        _validate_datatypes_for_zarr_append(zstore, dataset)
-        if append_dim is not None:
-            existing_dims = zstore.get_dimensions()
-            if append_dim not in existing_dims:
-                raise ValueError(
-                    f"append_dim={append_dim!r} does not match any existing "
-                    f"dataset dimensions {existing_dims}"
-                )
+    if encoding and mode in ["a", "a-", "r+"]:
         existing_var_names = set(zstore.zarr_group.array_keys())
         for var_name in existing_var_names:
-            if var_name in encoding.keys():
+            if var_name in encoding:
                 raise ValueError(
                     f"variable {var_name!r} already exists, but encoding was provided"
                 )
-        if mode == "r+":
-            new_names = [k for k in dataset.variables if k not in existing_var_names]
-            if new_names:
-                raise ValueError(
-                    f"dataset contains non-pre-existing variables {new_names}, "
-                    "which is not allowed in ``xarray.Dataset.to_zarr()`` with "
-                    "mode='r+'. To allow writing new variables, set mode='a'."
-                )

     writer = ArrayWriter()
     # TODO: figure out how to properly handle unlimited_dims
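
This hunk deletes ``_validate_datatypes_for_zarr_append`` and the append-dim/``r+`` checks from ``to_zarr``; they reappear inside ``ZarrStore.store`` in ``xarray/backends/zarr.py`` below, where the store state is already in hand. The encoding check that remains behaves as before; a minimal sketch of that behavior (the store path and variable name are illustrative):

import numpy as np
import xarray as xr

ds = xr.Dataset({"a": ("x", np.arange(3))})
ds.to_zarr("demo.zarr", mode="w")

# Providing encoding for a variable that already exists still raises:
try:
    ds.to_zarr("demo.zarr", mode="a", append_dim="x",
               encoding={"a": {"dtype": "int32"}})
except ValueError as err:
    print(err)  # variable 'a' already exists, but encoding was provided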

xarray/backends/zarr.py

Lines changed: 70 additions & 11 deletions
@@ -337,6 +337,34 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
     return var


+def _validate_datatypes_for_zarr_append(vname, existing_var, new_var):
+    """If variable exists in the store, confirm dtype of the data to append is compatible with
+    existing dtype.
+    """
+    if (
+        np.issubdtype(new_var.dtype, np.number)
+        or np.issubdtype(new_var.dtype, np.datetime64)
+        or np.issubdtype(new_var.dtype, np.bool_)
+        or new_var.dtype == object
+    ):
+        # We can skip dtype equality checks under two conditions: (1) if the var to append is
+        # new to the dataset, because in this case there is no existing var to compare it to;
+        # or (2) if var to append's dtype is known to be easy-to-append, because in this case
+        # we can be confident appending won't cause problems. Examples of dtypes which are not
+        # easy-to-append include length-specified strings of type `|S*` or `<U*` (where * is a
+        # positive integer character length). For these dtypes, appending dissimilar lengths
+        # can result in truncation of appended data. Therefore, variables which already exist
+        # in the dataset, and with dtypes which are not known to be easy-to-append, necessitate
+        # exact dtype equality, as checked below.
+        pass
+    elif not new_var.dtype == existing_var.dtype:
+        raise ValueError(
+            f"Mismatched dtypes for variable {vname} between Zarr store on disk "
+            f"and dataset to append. Store has dtype {existing_var.dtype} but "
+            f"dataset to append has dtype {new_var.dtype}."
+        )
+
+
 def _validate_and_transpose_existing_dims(
     var_name, new_var, existing_var, region, append_dim
 ):
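
The function body is carried over from ``api.py`` unchanged; only the signature changes, from ``(zstore, dataset)`` to a per-variable ``(vname, existing_var, new_var)`` triple so it can be applied inside ``ZarrStore.store``. The truncation hazard the comment describes is ordinary NumPy fixed-width-string behavior; a small sketch (the values are made up):

import numpy as np

existing = np.array(["short"], dtype="<U5")   # store written with <U5
new = np.array(["much longer string"])        # naturally <U18

# Writing a longer string into a fixed-width <U5 array silently truncates,
# which is what appending mismatched string dtypes to a store would do:
out = np.empty(2, dtype=existing.dtype)
out[0], out[1] = existing[0], new[0]
print(out[1])  # -> 'much ' (cut to 5 characters)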
@@ -625,26 +653,58 @@ def store(
         import zarr

         existing_keys = tuple(self.zarr_group.array_keys())
+
+        if self._mode == "r+":
+            new_names = [k for k in variables if k not in existing_keys]
+            if new_names:
+                raise ValueError(
+                    f"dataset contains non-pre-existing variables {new_names}, "
+                    "which is not allowed in ``xarray.Dataset.to_zarr()`` with "
+                    "``mode='r+'``. To allow writing new variables, set ``mode='a'``."
+                )
+
+        if self._append_dim is not None and self._append_dim not in existing_keys:
+            # For dimensions without coordinate values, we must parse
+            # the _ARRAY_DIMENSIONS attribute on *all* arrays to check if it
+            # is a valid existing dimension name.
+            # TODO: This `get_dimensions` method also does shape checking
+            # which isn't strictly necessary for our check.
+            existing_dims = self.get_dimensions()
+            if self._append_dim not in existing_dims:
+                raise ValueError(
+                    f"append_dim={self._append_dim!r} does not match any existing "
+                    f"dataset dimensions {existing_dims}"
+                )
+
         existing_variable_names = {
             vn for vn in variables if _encode_variable_name(vn) in existing_keys
         }
-        new_variables = set(variables) - existing_variable_names
-        variables_without_encoding = {vn: variables[vn] for vn in new_variables}
+        new_variable_names = set(variables) - existing_variable_names
         variables_encoded, attributes = self.encode(
-            variables_without_encoding, attributes
+            {vn: variables[vn] for vn in new_variable_names}, attributes
         )

         if existing_variable_names:
-            # Decode variables directly, without going via xarray.Dataset to
-            # avoid needing to load index variables into memory.
-            # TODO: consider making loading indexes lazy again?
+            # We make sure that values to be appended are encoded *exactly*
+            # as the current values in the store.
+            # To do so, we decode variables directly to access the proper encoding,
+            # without going via xarray.Dataset to avoid needing to load
+            # index variables into memory.
             existing_vars, _, _ = conventions.decode_cf_variables(
-                {k: self.open_store_variable(name=k) for k in existing_variable_names},
-                self.get_attrs(),
+                variables={
+                    k: self.open_store_variable(name=k) for k in existing_variable_names
+                },
+                # attributes = {} since we don't care about parsing the global
+                # "coordinates" attribute
+                attributes={},
             )
             # Modified variables must use the same encoding as the store.
             vars_with_encoding = {}
             for vn in existing_variable_names:
+                if self._mode in ["a", "a-", "r+"]:
+                    _validate_datatypes_for_zarr_append(
+                        vn, existing_vars[vn], variables[vn]
+                    )
                 vars_with_encoding[vn] = variables[vn].copy(deep=False)
                 vars_with_encoding[vn].encoding = existing_vars[vn].encoding
             vars_with_encoding, _ = self.encode(vars_with_encoding, {})
@@ -709,7 +769,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):

         for vn, v in variables.items():
             name = _encode_variable_name(vn)
-            check = vn in check_encoding_set
             attrs = v.attrs.copy()
             dims = v.dims
             dtype = v.dtype
@@ -725,7 +784,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
             # https://github.com/pydata/xarray/issues/8371 for details.
             encoding = extract_zarr_variable_encoding(
                 v,
-                raise_on_invalid=check,
+                raise_on_invalid=vn in check_encoding_set,
                 name=vn,
                 safe_chunks=self._safe_chunks,
             )
@@ -828,7 +887,7 @@ def _auto_detect_regions(self, ds, region):
             assert variable.dims == (dim,)
             index = pd.Index(variable.data)
             idxs = index.get_indexer(ds[dim].data)
-            if any(idxs == -1):
+            if (idxs == -1).any():
                 raise KeyError(
                     f"Not all values of coordinate '{dim}' in the new array were"
                     " found in the original store. Writing to a zarr region slice"
