diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e8b602e9dc9..cf245750df6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method; it was not being + passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`). + Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/chunks.py b/xarray/backends/chunks.py index f17f5375976..fbf085165c1 100644 --- a/xarray/backends/chunks.py +++ b/xarray/backends/chunks.py @@ -138,9 +138,6 @@ def build_grid_chunks( chunk_size: int, region: slice | None = None, ) -> tuple[int, ...]: - if region is None: - region = slice(0, size) - region_start = region.start or 0 # Generate the zarr chunks inside the region of this dim chunks_on_region = [chunk_size - (region_start % chunk_size)] @@ -159,6 +156,13 @@ def grid_rechunk( if not nd_var_chunks: return v + # This is useful for the scenarios where the enc_chunks are bigger than the + # variable chunks, which happens when the user specifies the enc_chunks manually. + enc_chunks = tuple( + min(enc_chunk, sum(v_chunk)) + for enc_chunk, v_chunk in zip(enc_chunks, v.chunks, strict=True) + ) + nd_grid_chunks = tuple( build_grid_chunks( sum(var_chunks), @@ -191,9 +195,9 @@ def validate_grid_chunks_alignment( base_error = ( "Specified Zarr chunks encoding['chunks']={enc_chunks!r} for " "variable named {name!r} would overlap multiple Dask chunks. " - "Check the chunk at position {var_chunk_pos}, which has a size of " - "{var_chunk_size} on dimension {dim_i}. It is unaligned with " - "backend chunks of size {chunk_size} in region {region}. " + "Please check the Dask chunks at position {var_chunk_pos} and " + "{var_chunk_pos_next}, on axis {axis}, they are overlapped " + "on the same Zarr chunk in the region {region}. " "Writing this array in parallel with Dask could lead to corrupted data. 
" "To resolve this issue, consider one of the following options: " "- Rechunk the array using `chunk()`. " @@ -202,7 +206,7 @@ def validate_grid_chunks_alignment( "- Enable automatic chunks alignment with `align_chunks=True`." ) - for dim_i, chunk_size, var_chunks, interval, size in zip( + for axis, chunk_size, var_chunks, interval, size in zip( range(len(enc_chunks)), enc_chunks, nd_var_chunks, @@ -215,9 +219,10 @@ def validate_grid_chunks_alignment( raise ValueError( base_error.format( var_chunk_pos=i + 1, + var_chunk_pos_next=i + 2, var_chunk_size=chunk, + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, @@ -237,9 +242,10 @@ def validate_grid_chunks_alignment( raise ValueError( base_error.format( var_chunk_pos=0, + var_chunk_pos_next=0, var_chunk_size=var_chunks[0], + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, @@ -251,9 +257,10 @@ def validate_grid_chunks_alignment( error_on_last_chunk = base_error.format( var_chunk_pos=len(var_chunks) - 1, + var_chunk_pos_next=len(var_chunks) - 1, var_chunk_size=var_chunks[-1], + axis=axis, name=name, - dim_i=dim_i, chunk_size=chunk_size, region=interval, enc_chunks=enc_chunks, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6de626a159b..3399d116c6f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2302,6 +2302,7 @@ def to_zarr( append_dim=append_dim, region=region, safe_chunks=safe_chunks, + align_chunks=align_chunks, zarr_version=zarr_version, zarr_format=zarr_format, write_empty_chunks=write_empty_chunks, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 785b06a26fd..072b81121f5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -6868,6 +6868,54 @@ def test_zarr_safe_chunk_region(self, mode: Literal["r+", "a"]): chunk = chunk.chunk() self.save(store, chunk.chunk(), region=region) + @requires_dask + def 
test_dataset_to_zarr_align_chunks_true(self, tmp_store) -> None: + # This test is a replica of the one in `test_dataarray_to_zarr_align_chunks_true` + # but for datasets + with self.create_zarr_target() as store: + ds = ( + DataArray( + np.arange(4).reshape((2, 2)), + dims=["a", "b"], + coords={ + "a": np.arange(2), + "b": np.arange(2), + }, + ) + .chunk(a=(1, 1), b=(1, 1)) + .to_dataset(name="foo") + ) + + self.save( + store, + ds, + align_chunks=True, + encoding={"foo": {"chunks": (3, 3)}}, + mode="w", + ) + assert_identical(ds, xr.open_zarr(store)) + + ds = ( + DataArray( + np.arange(4, 8).reshape((2, 2)), + dims=["a", "b"], + coords={ + "a": np.arange(2), + "b": np.arange(2), + }, + ) + .chunk(a=(1, 1), b=(1, 1)) + .to_dataset(name="foo") + ) + + self.save( + store, + ds, + align_chunks=True, + region="auto", + ) + assert_identical(ds, xr.open_zarr(store)) + @requires_h5netcdf @requires_fsspec