From 49c9ea4ba924a04bf705c8bd32f848c8688a7092 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Tue, 8 Jul 2025 13:06:53 +0200 Subject: [PATCH 1/6] The align_chunks parameter was not being sent on the to_zarr method of the datasets --- xarray/core/dataset.py | 1 + xarray/tests/test_backends.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6de626a159b..3399d116c6f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2302,6 +2302,7 @@ def to_zarr( append_dim=append_dim, region=region, safe_chunks=safe_chunks, + align_chunks=align_chunks, zarr_version=zarr_version, zarr_format=zarr_format, write_empty_chunks=write_empty_chunks, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 785b06a26fd..4c828e19227 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6868,6 +6868,21 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]): chunk = chunk.chunk() self.save(store, chunk.chunk(), region=region) + @requires_dask + def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: + skip_if_zarr_format_3(tmp_store) + dataset = DataArray( + np.arange(4), dims=["a"], coords={"a": np.arange(4)} + ).chunk(a=(2, 1, 1)).to_dataset(name="foo") + + dataset.to_zarr( + tmp_store, + align_chunks=True, + encoding={"foo": {"chunks": (3,)}}, + ) + with open_dataset(tmp_store, engine="zarr") as loaded_ds: + assert_identical(dataset, loaded_ds) + @requires_h5netcdf @requires_fsspec From fa00c95755684e2204eefcf27a4f62845c03356a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Jul 2025 12:54:49 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4c828e19227..6da34c23a6e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6871,9 +6871,11 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]): @requires_dask def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: skip_if_zarr_format_3(tmp_store) - dataset = DataArray( - np.arange(4), dims=["a"], coords={"a": np.arange(4)} - ).chunk(a=(2, 1, 1)).to_dataset(name="foo") + dataset = ( + DataArray(np.arange(4), dims=["a"], coords={"a": np.arange(4)}) + .chunk(a=(2, 1, 1)) + .to_dataset(name="foo") + ) dataset.to_zarr( tmp_store, From 6d3ff30a9f99983a54bb00ee2e4ad5a75b65fdf5 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Tue, 8 Jul 2025 15:05:03 +0200 Subject: [PATCH 3/6] Add a note on the whats-new.rst about the error of the align_chunks for datasets --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e8b602e9dc9..cf245750df6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- Fix the ``align_chunks`` parameter of :py:meth:`~xarray.Dataset.to_zarr`, which was not being + passed through to the underlying ``xarray.backends.api`` function (:issue:`10501`, :pull:`10516`). 
+ Documentation ~~~~~~~~~~~~~ From 62e3ddbc180acd879b8ad651c725af9dbcbf70d0 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Tue, 8 Jul 2025 15:25:19 +0200 Subject: [PATCH 4/6] Fix a ValueError on the test_dataset_to_zarr_align_chunks_true --- xarray/tests/test_backends.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6da34c23a6e..3fff0f78cf2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6870,20 +6870,22 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]): @requires_dask def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: - skip_if_zarr_format_3(tmp_store) - dataset = ( - DataArray(np.arange(4), dims=["a"], coords={"a": np.arange(4)}) - .chunk(a=(2, 1, 1)) - .to_dataset(name="foo") - ) + # This test is a replica of the one in `test_dataarray_to_zarr_align_chunks_true` + # but for datasets + with self.create_zarr_target() as store: + ds = ( + DataArray(np.arange(4), dims=["a"], coords={"a": np.arange(4)}) + .chunk(a=(2, 1, 1)) + .to_dataset(name="foo") + ) - dataset.to_zarr( - tmp_store, - align_chunks=True, - encoding={"foo": {"chunks": (3,)}}, - ) - with open_dataset(tmp_store, engine="zarr") as loaded_ds: - assert_identical(dataset, loaded_ds) + self.save( + store, + ds, + align_chunks=True, + encoding={"foo": {"chunks": (3,)}}, + mode="w", + ) @requires_h5netcdf From a2789f6da1fc77238c36fb732bead1d52727b3b6 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Wed, 9 Jul 2025 17:28:10 +0200 Subject: [PATCH 5/6] Fix the case when enc_chunks are bigger than the dask chunks --- xarray/backends/chunks.py | 29 +++++++++++++++++++---------- xarray/tests/test_backends.py | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 13 deletions(-) diff --git a/xarray/backends/chunks.py b/xarray/backends/chunks.py index f17f5375976..11744eff194 100644 --- 
a/xarray/backends/chunks.py +++ b/xarray/backends/chunks.py @@ -138,9 +138,6 @@ def build_grid_chunks( chunk_size: int, region: slice | None = None, ) -> tuple[int, ...]: - if region is None: - region = slice(0, size) - region_start = region.start or 0 # Generate the zarr chunks inside the region of this dim chunks_on_region = [chunk_size - (region_start % chunk_size)] @@ -159,6 +156,15 @@ def grid_rechunk( if not nd_var_chunks: return v + # This is useful for the scenarios where the enc_chunks are bigger than the + # variable chunks, which happens when the user specifies the enc_chunks manually. + enc_chunks = tuple( + min(enc_chunk, sum(v_chunk)) + for enc_chunk, v_chunk in zip( + enc_chunks, v.chunks, strict=True + ) + ) + nd_grid_chunks = tuple( build_grid_chunks( sum(var_chunks), @@ -191,9 +197,9 @@ def validate_grid_chunks_alignment( base_error = ( "Specified Zarr chunks encoding['chunks']={enc_chunks!r} for " "variable named {name!r} would overlap multiple Dask chunks. " - "Check the chunk at position {var_chunk_pos}, which has a size of " - "{var_chunk_size} on dimension {dim_i}. It is unaligned with " - "backend chunks of size {chunk_size} in region {region}. " + "Please check the Dask chunks at position {var_chunk_pos} and " + "{var_chunk_pos_next}, on axis {axis}, they are overlapped " + "on the same Zarr chunk in the region {region}. " "Writing this array in parallel with Dask could lead to corrupted data. " "To resolve this issue, consider one of the following options: " "- Rechunk the array using `chunk()`. " @@ -202,7 +208,7 @@ def validate_grid_chunks_alignment( "- Enable automatic chunks alignment with `align_chunks=True`." 
) - for dim_i, chunk_size, var_chunks, interval, size in zip( + for axis, chunk_size, var_chunks, interval, size in zip( range(len(enc_chunks)), enc_chunks, nd_var_chunks, @@ -215,9 +221,10 @@ def validate_grid_chunks_alignment( raise ValueError( base_error.format( var_chunk_pos=i + 1, + var_chunk_pos_next=i + 2, var_chunk_size=chunk, + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, @@ -237,9 +244,10 @@ def validate_grid_chunks_alignment( raise ValueError( base_error.format( var_chunk_pos=0, + var_chunk_pos_next=0, var_chunk_size=var_chunks[0], + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, @@ -251,9 +259,10 @@ def validate_grid_chunks_alignment( error_on_last_chunk = base_error.format( var_chunk_pos=len(var_chunks) - 1, + var_chunk_pos_next=len(var_chunks) - 1, var_chunk_size=var_chunks[-1], + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fff0f78cf2..017b239e3af 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6874,8 +6874,15 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: # but for datasets with self.create_zarr_target() as store: ds = ( - DataArray(np.arange(4), dims=["a"], coords={"a": np.arange(4)}) - .chunk(a=(2, 1, 1)) + DataArray( + np.arange(4).reshape((2, 2)), + dims=["a", "b"], + coords={ + "a": np.arange(2), + "b": np.arange(2), + } + ) + .chunk(a=(1, 1), b=(1, 1)) .to_dataset(name="foo") ) @@ -6883,9 +6890,31 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: store, ds, align_chunks=True, - encoding={"foo": {"chunks": (3,)}}, + encoding={"foo": {"chunks": (3,3)}}, mode="w", ) + assert_identical(ds, xr.open_zarr(store)) + + ds = ( + DataArray( + np.arange(4, 8).reshape((2, 2)), + dims=["a", "b"], + coords={ + "a": np.arange(2), + "b": 
np.arange(2), + } + ) + .chunk(a=(1, 1), b=(1, 1)) + .to_dataset(name="foo") + ) + + self.save( + store, + ds, + align_chunks=True, + region="auto", + ) + assert_identical(ds, xr.open_zarr(store)) @requires_h5netcdf From 60c6c75f902075ae883c06c4991ea6069c6ea965 Mon Sep 17 00:00:00 2001 From: Joseph Gonzalez Date: Wed, 9 Jul 2025 17:28:50 +0200 Subject: [PATCH 6/6] Linter --- xarray/backends/chunks.py | 4 +--- xarray/tests/test_backends.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/backends/chunks.py b/xarray/backends/chunks.py index 11744eff194..fbf085165c1 100644 --- a/xarray/backends/chunks.py +++ b/xarray/backends/chunks.py @@ -160,9 +160,7 @@ def grid_rechunk( # variable chunks, which happens when the user specifies the enc_chunks manually. enc_chunks = tuple( min(enc_chunk, sum(v_chunk)) - for enc_chunk, v_chunk in zip( - enc_chunks, v.chunks, strict=True - ) + for enc_chunk, v_chunk in zip(enc_chunks, v.chunks, strict=True) ) nd_grid_chunks = tuple( diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 017b239e3af..072b81121f5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6880,7 +6880,7 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: coords={ "a": np.arange(2), "b": np.arange(2), - } + }, ) .chunk(a=(1, 1), b=(1, 1)) .to_dataset(name="foo") @@ -6890,7 +6890,7 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: store, ds, align_chunks=True, - encoding={"foo": {"chunks": (3,3)}}, + encoding={"foo": {"chunks": (3, 3)}}, mode="w", ) assert_identical(ds, xr.open_zarr(store)) @@ -6902,7 +6902,7 @@ def test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: coords={ "a": np.arange(2), "b": np.arange(2), - } + }, ) .chunk(a=(1, 1), b=(1, 1)) .to_dataset(name="foo")