From 47359bdfc01ff42dfa4170c26c5300728f652280 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 28 Mar 2025 12:23:39 -0400 Subject: [PATCH 01/22] test: use davis branch run plz run for real use true dev xarray get versions right for wacky dev version something --- .github/workflows/upstream-dev-ci.yaml | 10 +-- ci/install-upstream-wheels.sh | 88 +++++++++++++------------- xarray/backends/zarr.py | 9 ++- xarray/tests/__init__.py | 2 +- xarray/tests/test_backends.py | 10 +++ 5 files changed, 64 insertions(+), 55 deletions(-) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 5e74c85e319..1b4ff75144c 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -39,16 +39,8 @@ jobs: upstream-dev: name: upstream-dev runs-on: ubuntu-latest - needs: detect-ci-trigger env: ZARR_V3_EXPERIMENTAL_API: 1 - if: | - always() - && ( - (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - || needs.detect-ci-trigger.outputs.triggered == 'true' - || contains( github.event.pull_request.labels.*.name, 'run-upstream') - ) defaults: run: shell: bash -l {0} @@ -73,7 +65,7 @@ jobs: bash ci/install-upstream-wheels.sh - name: Install xarray run: | - python -m pip install --no-deps -e . + python -m pip install git+https://github.com/pydata/xarray - name: Version info run: | python xarray/util/print_versions.py diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index 02d36259f82..93bd124315b 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash if which micromamba >/dev/null; then - conda=micromamba + conda=micromamba elif which mamba >/dev/null; then - conda=mamba + conda=mamba else - conda=conda + conda=conda fi # temporarily (?) remove numbagg and numba @@ -14,53 +14,53 @@ $conda remove -y numba numbagg sparse $conda remove -y numexpr # forcibly remove packages to avoid artifacts $conda remove -y --force \ - numpy \ - scipy \ - pandas \ - distributed \ - fsspec \ - zarr \ - cftime \ - packaging \ - bottleneck \ - flox - # pint + numpy \ + scipy \ + pandas \ + distributed \ + fsspec \ + zarr \ + cftime \ + packaging \ + bottleneck \ + flox +# pint # to limit the runtime of Upstream CI python -m pip install \ - -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ - --no-deps \ - --pre \ - --upgrade \ - numpy \ - scipy \ - matplotlib \ - pandas + -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ + --no-deps \ + --pre \ + --upgrade \ + numpy \ + scipy \ + matplotlib \ + pandas # for some reason pandas depends on pyarrow already. # Remove once a `pyarrow` version compiled with `numpy>=2.0` is on `conda-forge` python -m pip install \ - -i https://pypi.fury.io/arrow-nightlies/ \ - --prefer-binary \ - --no-deps \ - --pre \ - --upgrade \ - pyarrow + -i https://pypi.fury.io/arrow-nightlies/ \ + --prefer-binary \ + --no-deps \ + --pre \ + --upgrade \ + pyarrow # manually install `pint`, `donfig`, and `crc32c` to pull in new dependencies python -m pip install --upgrade pint donfig crc32c python -m pip install \ - --no-deps \ - --upgrade \ - git+https://github.com/dask/dask \ - git+https://github.com/dask/dask-expr \ - git+https://github.com/dask/distributed \ - git+https://github.com/zarr-developers/zarr-python \ - git+https://github.com/Unidata/cftime \ - git+https://github.com/pypa/packaging \ - git+https://github.com/hgrecco/pint \ - git+https://github.com/pydata/bottleneck \ - git+https://github.com/intake/filesystem_spec \ - git+https://github.com/SciTools/nc-time-axis \ - git+https://github.com/xarray-contrib/flox \ - git+https://github.com/h5netcdf/h5netcdf \ - git+https://github.com/dgasmith/opt_einsum - # git+https://github.com/pydata/sparse + --no-deps \ + --upgrade \ + git+https://github.com/dask/dask \ + git+https://github.com/dask/dask-expr \ + git+https://github.com/dask/distributed \ + git+https://github.com/d-v-b/zarr-python.git@feat/fixed-length-strings \ + git+https://github.com/Unidata/cftime \ + git+https://github.com/pypa/packaging \ + git+https://github.com/hgrecco/pint \ + git+https://github.com/pydata/bottleneck \ + git+https://github.com/intake/filesystem_spec \ + git+https://github.com/SciTools/nc-time-axis \ + git+https://github.com/xarray-contrib/flox \ + git+https://github.com/h5netcdf/h5netcdf \ + git+https://github.com/dgasmith/opt_einsum +# git+https://github.com/pydata/sparse diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 48405b906cd..7b4fb3a965a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -106,7 +106,10 @@ def _choose_default_mode( def _zarr_v3() -> bool: - return module_available("zarr", minversion="3") + # hack for this test only + # version being pickd up as 3.0.0b which is apparently not greater than 3 + # return True + return module_available("zarr", minversion="3.0.0a0") # need some special secret attributes to tell us the dimensions @@ -1067,6 +1070,10 @@ def _create_new_array( if c in encoding: encoding["config"][c] = encoding.pop(c) + print(fill_value) + print(encoding) + print(dtype) + print(shape) zarr_array = self.zarr_group.create( name, shape=shape, diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index fe76df75fa0..fd607353364 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -129,7 +129,7 @@ def _importorskip( has_bottleneck, requires_bottleneck = _importorskip("bottleneck") has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") -has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") +has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0a01") has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 785b06a26fd..da062f746c8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -369,6 +369,8 @@ def create_store(self): def roundtrip( self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): + print(save_kwargs) + print(open_kwargs) if save_kwargs is None: save_kwargs = {} if open_kwargs is None: @@ -926,6 +928,9 @@ def test_roundtrip_bytes_with_fill_value(self) -> None: encoding = {"_FillValue": b"X", "dtype": "S1"} original = Dataset({"x": ("t", values, {}, encoding)}) expected = original.copy(deep=True) + zarr_format_3 = has_zarr_v3 and zarr.config.config["default_zarr_format"] == 3 + print(zarr_format_3) + print(self.version_kwargs) with self.roundtrip(original) as actual: assert_identical(expected, actual) @@ -2363,6 +2368,7 @@ def create_store(self, cache_members: bool = False): ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] + print("ver kwargs", self.version_kwargs) return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @contextlib.contextmanager @@ -3832,6 +3838,10 @@ def roundtrip_dir( if open_kwargs is None: open_kwargs = {} + print("here?") + print(save_kwargs) + print(self.version_kwargs) + print(open_kwargs) data.to_zarr(store, **save_kwargs, **self.version_kwargs) with xr.open_dataset( store, engine="zarr", **open_kwargs, **self.version_kwargs From 218098b46b14ca6fa41c510c66a15ba6c0739269 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Sat, 17 May 2025 15:43:10 -0400 Subject: [PATCH 02/22] read fill value with new datatypes --- xarray/backends/zarr.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7b4fb3a965a..01f37bcb568 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd +from packaging.version import Version from xarray import coding, conventions from xarray.backends.chunks import grid_rechunk, validate_grid_chunks_alignment @@ -819,10 +820,24 @@ def open_store_variable(self, name): if zarr_array.fill_value is not None: attributes["_FillValue"] = zarr_array.fill_value elif "_FillValue" in attributes: - attributes["_FillValue"] = FillValueCoder.decode( - attributes["_FillValue"], zarr_array.dtype - ) + # TODO update version check for the released version with dtypes + # probably be 3.1 + import zarr + if Version(zarr.__version__) >= Version("3.0.6"): + attributes["_FillValue"] = ( + # Use the new dtype infrastructure instead of doing xarray + # specific fill value decoding + zarr_array.metadata.data_type.from_json_value( + attributes["_FillValue"], + zarr_format=zarr_array.metadata.zarr_format, + ) + ) + else: + original_zarr_dtype = zarr_array.metadata.data_type + attributes["_FillValue"] = FillValueCoder.decode( + attributes["_FillValue"], original_zarr_dtype.value + ) return Variable(dimensions, data, attributes, encoding) def get_variables(self): @@ -1070,10 +1085,6 @@ def _create_new_array( if c in encoding: encoding["config"][c] = encoding.pop(c) - print(fill_value) - print(encoding) - print(dtype) - print(shape) zarr_array = self.zarr_group.create( name, shape=shape, From 841ed36dc9a601d2701865cb9a82c117ea93828f Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 25 Jun 2025 14:44:21 -0400 Subject: [PATCH 03/22] update reading fill values to new dtypes --- xarray/backends/zarr.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 01f37bcb568..60b5e1f7eab 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -824,19 +824,19 @@ def open_store_variable(self, name): # probably be 3.1 import zarr + # if Version(zarr.__version__) >= Version("3.0.6"): attributes["_FillValue"] = ( # Use the new dtype infrastructure instead of doing xarray # specific fill value decoding - zarr_array.metadata.data_type.from_json_value( + FillValueCoder.decode( attributes["_FillValue"], - zarr_format=zarr_array.metadata.zarr_format, + zarr_array.metadata.data_type.to_native_dtype(), ) ) else: - original_zarr_dtype = zarr_array.metadata.data_type attributes["_FillValue"] = FillValueCoder.decode( - attributes["_FillValue"], original_zarr_dtype.value + attributes["_FillValue"], zarr_array.metadata.data_type.value ) return Variable(dimensions, data, attributes, encoding) @@ -953,6 +953,7 @@ def store( variables_encoded, attributes = self.encode( {vn: variables[vn] for vn in new_variable_names}, attributes ) + print(f"{variables_encoded=}") if existing_variable_names: # We make sure that values to be appended are encoded *exactly* @@ -1005,6 +1006,7 @@ def store( else: variables_to_set = variables_encoded + print(f"{variables_to_set=}") self.set_variables( variables_to_set, check_encoding_set, writer, unlimited_dims=unlimited_dims ) @@ -1013,6 +1015,7 @@ def store( if _zarr_v3(): kwargs["zarr_format"] = self.zarr_group.metadata.zarr_format zarr.consolidate_metadata(self.zarr_group.store, **kwargs) + print("DONE STORE.STORE") def sync(self): pass @@ -1062,6 +1065,7 @@ def _open_existing_array(self, *, name) -> ZarrArray: def _create_new_array( self, *, name, shape, dtype, fill_value, encoding, attrs ) -> ZarrArray: + # STRING ERROR FOR OBJECT HERE if coding.strings.check_vlen_dtype(dtype) is str: dtype = str @@ -1085,6 +1089,8 @@ def _create_new_array( if c in encoding: encoding["config"][c] = encoding.pop(c) + print("create") + print(dtype) zarr_array = self.zarr_group.create( name, shape=shape, @@ -1223,6 +1229,8 @@ def set_variables( encoding["overwrite"] = self._mode == "w" + print(dtype) + print(";sdf") zarr_array = self._create_new_array( name=name, dtype=dtype, @@ -1231,6 +1239,8 @@ def set_variables( encoding=encoding, attrs=encoded_attrs, ) + print(zarr_array) + print(type(zarr_array)) writer.add(v.data, zarr_array, region) From 99cfd7808a93d187aef20d03379a6ac735276726 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 27 Jun 2025 15:57:58 -0400 Subject: [PATCH 04/22] dtype updates --- ci/install-upstream-wheels.sh | 2 +- xarray/tests/test_backends.py | 31 +++++++------------------------ 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index 93bd124315b..439d96ab905 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -53,7 +53,7 @@ python -m pip install \ git+https://github.com/dask/dask \ git+https://github.com/dask/dask-expr \ git+https://github.com/dask/distributed \ - git+https://github.com/d-v-b/zarr-python.git@feat/fixed-length-strings \ + git+https://github.com/zarr-developers/zarr-python.git@main \ git+https://github.com/Unidata/cftime \ git+https://github.com/pypa/packaging \ git+https://github.com/hgrecco/pint \ diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index da062f746c8..6e7325576e5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -369,8 +369,6 @@ def create_store(self): def roundtrip( self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): - print(save_kwargs) - print(open_kwargs) if save_kwargs is None: save_kwargs = {} if open_kwargs is None: @@ -928,9 +926,6 @@ def test_roundtrip_bytes_with_fill_value(self) -> None: encoding = {"_FillValue": b"X", "dtype": "S1"} original = Dataset({"x": ("t", values, {}, encoding)}) expected = original.copy(deep=True) - zarr_format_3 = has_zarr_v3 and zarr.config.config["default_zarr_format"] == 3 - print(zarr_format_3) - print(self.version_kwargs) with self.roundtrip(original) as actual: assert_identical(expected, actual) @@ -2368,7 +2363,6 @@ def create_store(self, cache_members: bool = False): ) def save(self, dataset, store_target, **kwargs): # type: ignore[override] - print("ver kwargs", self.version_kwargs) return dataset.to_zarr(store=store_target, **kwargs, **self.version_kwargs) @contextlib.contextmanager @@ -2422,7 +2416,7 @@ def test_read_non_consolidated_warning(self) -> None: def test_non_existent_store(self) -> None: with pytest.raises( FileNotFoundError, - match="(No such file or directory|Unable to find group|No group found)", + match="(No such file or directory|Unable to find group|No group found in store)", ): xr.open_zarr(f"{uuid.uuid4()}") @@ -2504,6 +2498,7 @@ def test_manual_chunk(self) -> None: assert_identical(actual.load(), auto.load()) @requires_dask + @pytest.mark.filterwarnings("ignore:.*does not have a Zarr V3 specification.*") def test_warning_on_bad_chunks(self) -> None: original = create_test_data().chunk({"dim1": 4, "dim2": 3, "dim3": 3}) @@ -2912,7 +2907,6 @@ def test_append_with_existing_encoding_raises(self) -> None: @pytest.mark.parametrize("dtype", ["U", "S"]) def test_append_string_length_mismatch_raises(self, dtype) -> None: - skip_if_zarr_format_3("This actually works fine with Zarr format 3") ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) @@ -2921,18 +2915,6 @@ def test_append_string_length_mismatch_raises(self, dtype) -> None: store_target, append_dim="time", **self.version_kwargs ) - @pytest.mark.parametrize("dtype", ["U", "S"]) - def test_append_string_length_mismatch_works(self, dtype) -> None: - skip_if_zarr_format_2("This doesn't work with Zarr format 2") - # ...but it probably would if we used object dtype - ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) - expected = xr.concat([ds, ds_to_append], dim="time") - with self.create_zarr_target() as store_target: - ds.to_zarr(store_target, mode="w", **self.version_kwargs) - ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) - actual = xr.open_dataset(store_target, engine="zarr") - xr.testing.assert_identical(expected, actual) - def test_check_encoding_is_consistent_after_append(self) -> None: ds, ds_to_append, _ = create_append_test_data() @@ -3014,6 +2996,11 @@ def test_append_with_append_dim_no_overwrite(self) -> None: original2["lon"] = lon assert_identical(original2, actual) + def test_minimal_round(self) -> None: + ds = create_test_data() + with self.roundtrip(ds) as ds_reload: + assert_identical(ds, ds_reload) + @requires_dask def test_to_zarr_compute_false_roundtrip(self) -> None: from dask.delayed import Delayed @@ -3838,10 +3825,6 @@ def roundtrip_dir( if open_kwargs is None: open_kwargs = {} - print("here?") - print(save_kwargs) - print(self.version_kwargs) - print(open_kwargs) data.to_zarr(store, **save_kwargs, **self.version_kwargs) with xr.open_dataset( store, engine="zarr", **open_kwargs, **self.version_kwargs From 5da11247eed5b0c3395dad5b5f2714502101ce47 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Fri, 27 Jun 2025 16:25:36 -0400 Subject: [PATCH 05/22] more debugging --- xarray/backends/zarr.py | 72 ++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 60b5e1f7eab..44251b19c47 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -783,13 +783,18 @@ def ds(self): return self.zarr_group def open_store_variable(self, name): + print(f"DEBUG OPEN: Reading variable '{name}'") zarr_array = self.members[name] + print( + f"DEBUG OPEN: zarr_array.dtype={zarr_array.dtype}, fill_value={zarr_array.fill_value}" + ) data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array)) try_nczarr = self._mode == "r" dimensions, attributes = _get_zarr_dims_and_attrs( zarr_array, DIMENSION_KEY, try_nczarr ) attributes = dict(attributes) + print(f"DEBUG OPEN: Original attributes: {attributes}") encoding = { "chunks": zarr_array.chunks, @@ -814,31 +819,60 @@ def open_store_variable(self, name): } ) + print( + f"DEBUG OPEN: _use_zarr_fill_value_as_mask={self._use_zarr_fill_value_as_mask}" + ) + print(f"DEBUG OPEN: '_FillValue' in attributes: {'_FillValue' in attributes}") + if self._use_zarr_fill_value_as_mask: # Setting this attribute triggers CF decoding for missing values # by interpreting Zarr's fill_value to mean the same as netCDF's _FillValue + print(f"DEBUG OPEN: Using zarr fill_value as mask for '{name}'") + print("===========") + print(f"DEBUG DTYPE: zarr array fill value: {zarr_array.fill_value.dtype}") if zarr_array.fill_value is not None: attributes["_FillValue"] = zarr_array.fill_value + print( + f"\t DEBUG OPEN: Set _FillValue to {zarr_array.fill_value} for '{name}'" + ) + print("===========") elif "_FillValue" in attributes: # TODO update version check for the released version with dtypes # probably be 3.1 import zarr - # + print(f"DEBUG: Processing _FillValue for {name}") + print(f"DEBUG: Original _FillValue: {attributes['_FillValue']}") + print( + f"DEBUG: zarr_array.metadata.data_type: {zarr_array.metadata.data_type}" + ) + if Version(zarr.__version__) >= Version("3.0.6"): + native_dtype = zarr_array.metadata.data_type.to_native_dtype() + print(f"DEBUG: native_dtype: {native_dtype}") attributes["_FillValue"] = ( # Use the new dtype infrastructure instead of doing xarray # specific fill value decoding FillValueCoder.decode( attributes["_FillValue"], - zarr_array.metadata.data_type.to_native_dtype(), + native_dtype, ) ) else: + dtype_value = zarr_array.metadata.data_type.value + print(f"DEBUG: dtype_value: {dtype_value}") attributes["_FillValue"] = FillValueCoder.decode( - attributes["_FillValue"], zarr_array.metadata.data_type.value + attributes["_FillValue"], dtype_value ) - return Variable(dimensions, data, attributes, encoding) + + print(f"DEBUG: Decoded _FillValue: {attributes['_FillValue']}") + print(f"DEBUG: Decoded _FillValue type: {type(attributes['_FillValue'])}") + + print(f"DEBUG OPEN: Final attributes for '{name}': {attributes}") + variable = Variable(dimensions, data, attributes, encoding) + print(f"DEBUG OPEN: Created variable '{name}' with dtype: {variable.dtype}") + + return variable def get_variables(self): return FrozenDict((k, self.open_store_variable(k)) for k in self.array_keys()) @@ -953,7 +987,7 @@ def store( variables_encoded, attributes = self.encode( {vn: variables[vn] for vn in new_variable_names}, attributes ) - print(f"{variables_encoded=}") + print("HERE") if existing_variable_names: # We make sure that values to be appended are encoded *exactly* @@ -961,14 +995,28 @@ def store( # To do so, we decode variables directly to access the proper encoding, # without going via xarray.Dataset to avoid needing to load # index variables into memory. + print( + f"DEBUG APPEND: Processing existing variables: {existing_variable_names}" + ) + store_vars = { + k: self.open_store_variable(name=k) for k in existing_variable_names + } + print( + f"DEBUG APPEND: Store vars dtypes: {[(name, var.dtype) for name, var in store_vars.items()]}" + ) + existing_vars, _, _ = conventions.decode_cf_variables( - variables={ - k: self.open_store_variable(name=k) for k in existing_variable_names - }, + variables=store_vars, # attributes = {} since we don't care about parsing the global # "coordinates" attribute attributes={}, ) + print( + f"DEBUG APPEND: After CF decode dtypes: {[(name, var.dtype) for name, var in existing_vars.items()]}" + ) + print( + f"DEBUG APPEND: Variables to append dtypes: {[(name, var.dtype) for name, var in variables_encoded.items() if name in existing_variable_names]}" + ) # Modified variables must use the same encoding as the store. vars_with_encoding = {} for vn in existing_variable_names: @@ -1006,7 +1054,6 @@ def store( else: variables_to_set = variables_encoded - print(f"{variables_to_set=}") self.set_variables( variables_to_set, check_encoding_set, writer, unlimited_dims=unlimited_dims ) @@ -1015,7 +1062,6 @@ def store( if _zarr_v3(): kwargs["zarr_format"] = self.zarr_group.metadata.zarr_format zarr.consolidate_metadata(self.zarr_group.store, **kwargs) - print("DONE STORE.STORE") def sync(self): pass @@ -1089,8 +1135,6 @@ def _create_new_array( if c in encoding: encoding["config"][c] = encoding.pop(c) - print("create") - print(dtype) zarr_array = self.zarr_group.create( name, shape=shape, @@ -1229,8 +1273,6 @@ def set_variables( encoding["overwrite"] = self._mode == "w" - print(dtype) - print(";sdf") zarr_array = self._create_new_array( name=name, dtype=dtype, @@ -1239,8 +1281,6 @@ def set_variables( encoding=encoding, attrs=encoded_attrs, ) - print(zarr_array) - print(type(zarr_array)) writer.add(v.data, zarr_array, region) From cb18495d67e81ce3661c5c5932cfff6ca65d2d6f Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 11:29:01 -0400 Subject: [PATCH 06/22] remove debug statements --- pyproject.toml | 1 + xarray/backends/zarr.py | 43 ----------------------------------------- 2 files changed, 1 insertion(+), 43 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8cfbb6851b3..3571db540a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ io = [ "cftime", "pooch", ] +# "zarr @ git+https://github.com/zarr-developers/zarr-python.git", etc = ["sparse>=0.15"] parallel = ["dask[complete]"] viz = ["cartopy>=0.23", "matplotlib", "nc-time-axis", "seaborn"] diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 44251b19c47..77b1d7e7ad7 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -783,18 +783,13 @@ def ds(self): return self.zarr_group def open_store_variable(self, name): - print(f"DEBUG OPEN: Reading variable '{name}'") zarr_array = self.members[name] - print( - f"DEBUG OPEN: zarr_array.dtype={zarr_array.dtype}, fill_value={zarr_array.fill_value}" - ) data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array)) try_nczarr = self._mode == "r" dimensions, attributes = _get_zarr_dims_and_attrs( zarr_array, DIMENSION_KEY, try_nczarr ) attributes = dict(attributes) - print(f"DEBUG OPEN: Original attributes: {attributes}") encoding = { "chunks": zarr_array.chunks, @@ -819,37 +814,18 @@ def open_store_variable(self, name): } ) - print( - f"DEBUG OPEN: _use_zarr_fill_value_as_mask={self._use_zarr_fill_value_as_mask}" - ) - print(f"DEBUG OPEN: '_FillValue' in attributes: {'_FillValue' in attributes}") - if self._use_zarr_fill_value_as_mask: # Setting this attribute triggers CF decoding for missing values # by interpreting Zarr's fill_value to mean the same as netCDF's _FillValue - print(f"DEBUG OPEN: Using zarr fill_value as mask for '{name}'") - print("===========") - print(f"DEBUG DTYPE: zarr array fill value: {zarr_array.fill_value.dtype}") if zarr_array.fill_value is not None: attributes["_FillValue"] = zarr_array.fill_value - print( - f"\t DEBUG OPEN: Set _FillValue to {zarr_array.fill_value} for '{name}'" - ) - print("===========") elif "_FillValue" in attributes: # TODO update version check for the released version with dtypes # probably be 3.1 import zarr - print(f"DEBUG: Processing _FillValue for {name}") - print(f"DEBUG: Original _FillValue: {attributes['_FillValue']}") - print( - f"DEBUG: zarr_array.metadata.data_type: {zarr_array.metadata.data_type}" - ) - if Version(zarr.__version__) >= Version("3.0.6"): native_dtype = zarr_array.metadata.data_type.to_native_dtype() - print(f"DEBUG: native_dtype: {native_dtype}") attributes["_FillValue"] = ( # Use the new dtype infrastructure instead of doing xarray # specific fill value decoding @@ -860,17 +836,11 @@ def open_store_variable(self, name): ) else: dtype_value = zarr_array.metadata.data_type.value - print(f"DEBUG: dtype_value: {dtype_value}") attributes["_FillValue"] = FillValueCoder.decode( attributes["_FillValue"], dtype_value ) - print(f"DEBUG: Decoded _FillValue: {attributes['_FillValue']}") - print(f"DEBUG: Decoded _FillValue type: {type(attributes['_FillValue'])}") - - print(f"DEBUG OPEN: Final attributes for '{name}': {attributes}") variable = Variable(dimensions, data, attributes, encoding) - print(f"DEBUG OPEN: Created variable '{name}' with dtype: {variable.dtype}") return variable @@ -987,7 +957,6 @@ def store( variables_encoded, attributes = self.encode( {vn: variables[vn] for vn in new_variable_names}, attributes ) - print("HERE") if existing_variable_names: # We make sure that values to be appended are encoded *exactly* @@ -995,15 +964,9 @@ def store( # To do so, we decode variables directly to access the proper encoding, # without going via xarray.Dataset to avoid needing to load # index variables into memory. - print( - f"DEBUG APPEND: Processing existing variables: {existing_variable_names}" - ) store_vars = { k: self.open_store_variable(name=k) for k in existing_variable_names } - print( - f"DEBUG APPEND: Store vars dtypes: {[(name, var.dtype) for name, var in store_vars.items()]}" - ) existing_vars, _, _ = conventions.decode_cf_variables( variables=store_vars, @@ -1011,12 +974,6 @@ def store( # "coordinates" attribute attributes={}, ) - print( - f"DEBUG APPEND: After CF decode dtypes: {[(name, var.dtype) for name, var in existing_vars.items()]}" - ) - print( - f"DEBUG APPEND: Variables to append dtypes: {[(name, var.dtype) for name, var in variables_encoded.items() if name in existing_variable_names]}" - ) # Modified variables must use the same encoding as the store. vars_with_encoding = {} for vn in existing_variable_names: From 3e035a615b981b8691138f5b63eadd0515ae9774 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 11:38:39 -0400 Subject: [PATCH 07/22] ci: correct installation of xarray --- .github/workflows/upstream-dev-ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 1b4ff75144c..1c8aed82427 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -65,7 +65,7 @@ jobs: bash ci/install-upstream-wheels.sh - name: Install xarray run: | - python -m pip install git+https://github.com/pydata/xarray + python -m pip install --no-deps -e . - name: Version info run: | python xarray/util/print_versions.py From 7a037f1c3cf980c2aae67d2294cd75633f3233b1 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 11:42:31 -0400 Subject: [PATCH 08/22] cleanup --- pyproject.toml | 1 - xarray/backends/zarr.py | 6 +----- xarray/tests/__init__.py | 2 +- xarray/tests/test_backends.py | 16 +++++++++++----- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3571db540a0..8cfbb6851b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ io = [ "cftime", "pooch", ] -# "zarr @ git+https://github.com/zarr-developers/zarr-python.git", etc = ["sparse>=0.15"] parallel = ["dask[complete]"] viz = ["cartopy>=0.23", "matplotlib", "nc-time-axis", "seaborn"] diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 77b1d7e7ad7..5407185b4fc 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -107,10 +107,7 @@ def _choose_default_mode( def _zarr_v3() -> bool: - # hack for this test only - # version being pickd up as 3.0.0b which is apparently not greater than 3 - # return True - return module_available("zarr", minversion="3.0.0a0") + return module_available("zarr", minversion="3") # need some special secret attributes to tell us the dimensions @@ -1068,7 +1065,6 @@ def _open_existing_array(self, *, name) -> ZarrArray: def _create_new_array( self, *, name, shape, dtype, fill_value, encoding, attrs ) -> ZarrArray: - # STRING ERROR FOR OBJECT HERE if coding.strings.check_vlen_dtype(dtype) is str: dtype = str diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index fd607353364..fe76df75fa0 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -129,7 +129,7 @@ def _importorskip( has_bottleneck, requires_bottleneck = _importorskip("bottleneck") has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") -has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0a01") +has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6e7325576e5..71ab2cff503 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -369,11 +369,14 @@ def create_store(self): def roundtrip( self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): + print("INT ROUNDTRIP") if save_kwargs is None: save_kwargs = {} if open_kwargs is None: open_kwargs = {} with create_tmp_file(allow_cleanup_failure=allow_cleanup_failure) as path: + print(path) + print(save_kwargs) self.save(data, path, **save_kwargs) with self.open(path, **open_kwargs) as ds: yield ds @@ -848,8 +851,16 @@ def test_outer_indexing_reversed(self) -> None: {"z": (("t", "p", "y", "x"), np.ones((1, 1, 31, 40)))}, ) + print("jsda;lkjasdlfk") + print(f"Test class: {self.__class__.__name__}") + print(f"Dataset: {ds}") with self.roundtrip(ds) as on_disk: + print(f"on_disk type: {type(on_disk)}") + print(f"on_disk.z type: {type(on_disk.z)}") subset = on_disk.isel(t=[0], p=0).z[:, ::10, ::10][:, ::-1, :] + print(f"subset sizes: {subset.sizes}") + loaded = subset.load() + print(f"loaded sizes: {loaded.sizes}") assert subset.sizes == subset.load().sizes def test_isel_dataarray(self) -> None: @@ -2996,11 +3007,6 @@ def test_append_with_append_dim_no_overwrite(self) -> None: original2["lon"] = lon assert_identical(original2, actual) - def test_minimal_round(self) -> None: - ds = create_test_data() - with self.roundtrip(ds) as ds_reload: - assert_identical(ds, ds_reload) - @requires_dask def test_to_zarr_compute_false_roundtrip(self) -> None: from dask.delayed import Delayed From c5977821e9e86db63f53e28be9219fd43251d736 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 11:45:18 -0400 Subject: [PATCH 09/22] reset upstream wheel script --- ci/install-upstream-wheels.sh | 88 +++++++++++++++++------------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index 439d96ab905..02d36259f82 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash if which micromamba >/dev/null; then - conda=micromamba + conda=micromamba elif which mamba >/dev/null; then - conda=mamba + conda=mamba else - conda=conda + conda=conda fi # temporarily (?) remove numbagg and numba @@ -14,53 +14,53 @@ $conda remove -y numba numbagg sparse $conda remove -y numexpr # forcibly remove packages to avoid artifacts $conda remove -y --force \ - numpy \ - scipy \ - pandas \ - distributed \ - fsspec \ - zarr \ - cftime \ - packaging \ - bottleneck \ - flox -# pint + numpy \ + scipy \ + pandas \ + distributed \ + fsspec \ + zarr \ + cftime \ + packaging \ + bottleneck \ + flox + # pint # to limit the runtime of Upstream CI python -m pip install \ - -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ - --no-deps \ - --pre \ - --upgrade \ - numpy \ - scipy \ - matplotlib \ - pandas + -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ + --no-deps \ + --pre \ + --upgrade \ + numpy \ + scipy \ + matplotlib \ + pandas # for some reason pandas depends on pyarrow already. # Remove once a `pyarrow` version compiled with `numpy>=2.0` is on `conda-forge` python -m pip install \ - -i https://pypi.fury.io/arrow-nightlies/ \ - --prefer-binary \ - --no-deps \ - --pre \ - --upgrade \ - pyarrow + -i https://pypi.fury.io/arrow-nightlies/ \ + --prefer-binary \ + --no-deps \ + --pre \ + --upgrade \ + pyarrow # manually install `pint`, `donfig`, and `crc32c` to pull in new dependencies python -m pip install --upgrade pint donfig crc32c python -m pip install \ - --no-deps \ - --upgrade \ - git+https://github.com/dask/dask \ - git+https://github.com/dask/dask-expr \ - git+https://github.com/dask/distributed \ - git+https://github.com/zarr-developers/zarr-python.git@main \ - git+https://github.com/Unidata/cftime \ - git+https://github.com/pypa/packaging \ - git+https://github.com/hgrecco/pint \ - git+https://github.com/pydata/bottleneck \ - git+https://github.com/intake/filesystem_spec \ - git+https://github.com/SciTools/nc-time-axis \ - git+https://github.com/xarray-contrib/flox \ - git+https://github.com/h5netcdf/h5netcdf \ - git+https://github.com/dgasmith/opt_einsum -# git+https://github.com/pydata/sparse + --no-deps \ + --upgrade \ + git+https://github.com/dask/dask \ + git+https://github.com/dask/dask-expr \ + git+https://github.com/dask/distributed \ + git+https://github.com/zarr-developers/zarr-python \ + git+https://github.com/Unidata/cftime \ + git+https://github.com/pypa/packaging \ + git+https://github.com/hgrecco/pint \ + git+https://github.com/pydata/bottleneck \ + git+https://github.com/intake/filesystem_spec \ + git+https://github.com/SciTools/nc-time-axis \ + git+https://github.com/xarray-contrib/flox \ + git+https://github.com/h5netcdf/h5netcdf \ + git+https://github.com/dgasmith/opt_einsum + # git+https://github.com/pydata/sparse From 3877091388f398c919f3c5925bf8fe0193d127cd Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 11:46:28 -0400 Subject: [PATCH 10/22] ci: more cleanup --- .github/workflows/upstream-dev-ci.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 1c8aed82427..5e74c85e319 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -39,8 +39,16 @@ jobs: upstream-dev: name: upstream-dev runs-on: ubuntu-latest + needs: detect-ci-trigger env: ZARR_V3_EXPERIMENTAL_API: 1 + if: | + always() + && ( + (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + || needs.detect-ci-trigger.outputs.triggered == 'true' + || contains( github.event.pull_request.labels.*.name, 'run-upstream') + ) defaults: run: shell: bash -l {0} From 6d3e9cd3505e0399f046c9d8a43628b009c95801 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 11:58:16 -0400 Subject: [PATCH 11/22] better version check --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 5407185b4fc..b2e074c5849 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -821,7 +821,7 @@ def open_store_variable(self, name): # probably be 3.1 import zarr - if Version(zarr.__version__) >= Version("3.0.6"): + if Version(zarr.__version__) > Version("3.0.9"): native_dtype = zarr_array.metadata.data_type.to_native_dtype() attributes["_FillValue"] = ( # Use the new dtype infrastructure instead of doing xarray From fde770df24c090a58fded01cb921cb360db7b6b6 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 12:07:25 -0400 Subject: [PATCH 12/22] allow upstream tests on PRs from forks --- .github/workflows/upstream-dev-ci.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 5e74c85e319..5c28ef0d851 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -23,8 +23,7 @@ jobs: name: detect upstream-dev ci trigger runs-on: ubuntu-latest if: | - github.repository == 'pydata/xarray' - && (github.event_name == 'push' || github.event_name == 'pull_request') + github.event_name == 'push' || github.event_name == 'pull_request' outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: From 5da6ccafae9055fa011b144afb34c0ee88cd263d Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 12:26:02 -0400 Subject: [PATCH 13/22] test: proper skip behavior for zarr versions --- xarray/tests/__init__.py | 14 ++++++++++++++ xarray/tests/test_backends.py | 15 ++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index fe76df75fa0..5ff0ce846d0 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -130,6 +130,20 @@ def _importorskip( has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") +has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.0.9") + +# Additional check for zarr dtype support (dev versions > 3.0.9) +if has_zarr: + import zarr + + # Dev versions like "3.0.9.dev47+g9da38f75" are > "3.0.9" + has_zarr_v3_dtypes = has_zarr_v3_dtypes or ( + Version(zarr.__version__) > Version("3.0.9") + ) + requires_zarr_v3_dtypes = pytest.mark.skipif( + not has_zarr_v3_dtypes, reason="requires zarr>3.0.9 (including dev versions)" + ) + has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 71ab2cff503..443ccab51eb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -72,6 +72,7 @@ has_scipy, has_zarr, has_zarr_v3, + has_zarr_v3_dtypes, mock, network, requires_cftime, @@ -369,14 +370,11 @@ def create_store(self): def roundtrip( self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False ): - print("INT ROUNDTRIP") if save_kwargs is None: save_kwargs = {} if open_kwargs is None: open_kwargs = {} with create_tmp_file(allow_cleanup_failure=allow_cleanup_failure) as path: - print(path) - print(save_kwargs) self.save(data, path, **save_kwargs) with self.open(path, **open_kwargs) as ds: yield ds @@ -851,16 +849,8 @@ def test_outer_indexing_reversed(self) -> None: {"z": (("t", "p", "y", "x"), np.ones((1, 1, 31, 40)))}, ) - print("jsda;lkjasdlfk") - print(f"Test class: {self.__class__.__name__}") - print(f"Dataset: {ds}") with self.roundtrip(ds) as on_disk: - print(f"on_disk type: {type(on_disk)}") - print(f"on_disk.z type: {type(on_disk.z)}") subset = on_disk.isel(t=[0], p=0).z[:, ::10, ::10][:, ::-1, :] - print(f"subset sizes: {subset.sizes}") - loaded = subset.load() - print(f"loaded sizes: {loaded.sizes}") assert subset.sizes == subset.load().sizes def test_isel_dataarray(self) -> None: @@ -2918,6 +2908,9 @@ def test_append_with_existing_encoding_raises(self) -> None: @pytest.mark.parametrize("dtype", ["U", "S"]) def test_append_string_length_mismatch_raises(self, dtype) -> None: + if has_zarr_v3() and not has_zarr_v3_dtypes: + pytest.skip("This works on pre dtype updated zarr python") + ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) From f8a45bab335830ea98704b2616cc3e89658fe3d9 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 14:11:17 -0400 Subject: [PATCH 14/22] More robust dtype check main branch of zarr is not a higher version right now. --- xarray/backends/zarr.py | 12 +++++------- xarray/tests/__init__.py | 24 +++++++++++++++--------- xarray/tests/test_backends.py | 23 +++++++++++++++++++++-- 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b2e074c5849..0d4bb87a71c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -9,7 +9,6 @@ import numpy as np import pandas as pd -from packaging.version import Version from xarray import coding, conventions from xarray.backends.chunks import grid_rechunk, validate_grid_chunks_alignment @@ -819,9 +818,9 @@ def open_store_variable(self, name): elif "_FillValue" in attributes: # TODO update version check for the released version with dtypes # probably be 3.1 - import zarr - if Version(zarr.__version__) > Version("3.0.9"): + # if Version(zarr.__version__) > Version("3.0.10"): + if hasattr(zarr_array.metadata.data_type, "to_native_dtype"): native_dtype = zarr_array.metadata.data_type.to_native_dtype() attributes["_FillValue"] = ( # Use the new dtype infrastructure instead of doing xarray @@ -961,12 +960,11 @@ def store( # To do so, we decode variables directly to access the proper encoding, # without going via xarray.Dataset to avoid needing to load # index variables into memory. - store_vars = { - k: self.open_store_variable(name=k) for k in existing_variable_names - } existing_vars, _, _ = conventions.decode_cf_variables( - variables=store_vars, + variables={ + k: self.open_store_variable(name=k) for k in existing_variable_names + }, # attributes = {} since we don't care about parsing the global # "coordinates" attribute attributes={}, diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 5ff0ce846d0..8d1b2d556f4 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -130,20 +130,26 @@ def _importorskip( has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") -has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.0.9") - -# Additional check for zarr dtype support (dev versions > 3.0.9) -if has_zarr: +has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.0.10") +if has_zarr_v3: import zarr - # Dev versions like "3.0.9.dev47+g9da38f75" are > "3.0.9" - has_zarr_v3_dtypes = has_zarr_v3_dtypes or ( - Version(zarr.__version__) > Version("3.0.9") - ) + has_zarr_v3_dtypes = hasattr(zarr.core, "dtype") + + # has_zarr_v3_dtypes = hasattr(zarr.Array.metadata.data_type, "to_native_dtype") requires_zarr_v3_dtypes = pytest.mark.skipif( - not has_zarr_v3_dtypes, reason="requires zarr>3.0.9 (including dev versions)" + not has_zarr_v3_dtypes, reason="requires zarr>3.0.10 (including dev versions)" ) +# Additional check for zarr dtype support (dev versions > 3.0.9) +# if has_zarr: +# import zarr +# +# # Dev versions like "3.0.10.dev47+g9da38f75" are > "3.0.10" +# has_zarr_v3_dtypes = has_zarr_v3_dtypes or ( +# Version(zarr.__version__) > Version("3.0.10") +# ) + has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 443ccab51eb..8c3ab96e75c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2908,8 +2908,11 @@ def test_append_with_existing_encoding_raises(self) -> None: @pytest.mark.parametrize("dtype", ["U", "S"]) def test_append_string_length_mismatch_raises(self, dtype) -> None: - if has_zarr_v3() and not has_zarr_v3_dtypes: - pytest.skip("This works on pre dtype updated zarr python") + print("start") + print(has_zarr_v3) + print(has_zarr_v3_dtypes) + if has_zarr_v3 and not has_zarr_v3_dtypes: + skip_if_zarr_format_3("This actually works fine with Zarr format 3") ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) with self.create_zarr_target() as store_target: @@ -2919,6 +2922,22 @@ def test_append_string_length_mismatch_raises(self, dtype) -> None: store_target, append_dim="time", **self.version_kwargs ) + @pytest.mark.parametrize("dtype", ["U", "S"]) + def test_append_string_length_mismatch_works(self, dtype) -> None: + skip_if_zarr_format_2("This doesn't work with Zarr format 2") + # ...but it probably would if we used object dtype + if has_zarr_v3_dtypes: + pytest.skip("This works on pre dtype updated zarr python") + + ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) + expected = xr.concat([ds, ds_to_append], dim="time") + + with self.create_zarr_target() as store_target: + ds.to_zarr(store_target, mode="w", **self.version_kwargs) + ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) + actual = xr.open_dataset(store_target, engine="zarr") + xr.testing.assert_identical(expected, actual) + def test_check_encoding_is_consistent_after_append(self) -> None: ds, ds_to_append, _ = create_append_test_data() From 12c4943a6d80409b271fa25c85a9e32fa74b2644 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 14:12:52 -0400 Subject: [PATCH 15/22] cleanup --- xarray/backends/zarr.py | 1 - xarray/tests/test_backends.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0d4bb87a71c..3d796d21838 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -960,7 +960,6 @@ def store( # To do so, we decode variables directly to access the proper encoding, # without going via xarray.Dataset to avoid needing to load # index variables into memory. - existing_vars, _, _ = conventions.decode_cf_variables( variables={ k: self.open_store_variable(name=k) for k in existing_variable_names diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8c3ab96e75c..3b9d905e5da 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2927,7 +2927,7 @@ def test_append_string_length_mismatch_works(self, dtype) -> None: skip_if_zarr_format_2("This doesn't work with Zarr format 2") # ...but it probably would if we used object dtype if has_zarr_v3_dtypes: - pytest.skip("This works on pre dtype updated zarr python") + pytest.skip("This works on pre ZDtype Zarr-Python, but fails after.") ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) expected = xr.concat([ds, ds_to_append], dim="time") From f2e917a5e765d1e078c1ee5f00f7f484857b59ad Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 14:14:35 -0400 Subject: [PATCH 16/22] clarify --- xarray/tests/__init__.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 8d1b2d556f4..019a5ccca54 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -134,22 +134,16 @@ def _importorskip( if has_zarr_v3: import zarr + # manual update by checking attrs for now + # TODO: use version specifier + # installing from git main is giving me a lower version than the + # most recently released zarr has_zarr_v3_dtypes = hasattr(zarr.core, "dtype") - # has_zarr_v3_dtypes = hasattr(zarr.Array.metadata.data_type, "to_native_dtype") requires_zarr_v3_dtypes = pytest.mark.skipif( not has_zarr_v3_dtypes, reason="requires zarr>3.0.10 (including dev versions)" ) -# Additional check for zarr dtype support (dev versions > 3.0.9) -# if has_zarr: -# import zarr -# -# # Dev versions like "3.0.10.dev47+g9da38f75" are > "3.0.10" -# has_zarr_v3_dtypes = has_zarr_v3_dtypes or ( -# Version(zarr.__version__) > Version("3.0.10") -# ) - has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg") From b1c6809ba527f280914b240e13f95b174e305a38 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Thu, 3 Jul 2025 14:26:57 -0400 Subject: [PATCH 17/22] cleanup --- xarray/tests/test_backends.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3b9d905e5da..3030244d010 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2908,9 +2908,6 @@ def test_append_with_existing_encoding_raises(self) -> None: @pytest.mark.parametrize("dtype", ["U", "S"]) def test_append_string_length_mismatch_raises(self, dtype) -> None: - print("start") - print(has_zarr_v3) - print(has_zarr_v3_dtypes) if has_zarr_v3 and not has_zarr_v3_dtypes: skip_if_zarr_format_3("This actually works fine with Zarr format 3") From 8ee46b1e40b50ea790b3037775912d1a023d7ebb Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 9 Jul 2025 16:56:20 -0400 Subject: [PATCH 18/22] revert old fix --- xarray/backends/zarr.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 3d796d21838..903a6987e11 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -816,25 +816,9 @@ def open_store_variable(self, name): if zarr_array.fill_value is not None: attributes["_FillValue"] = zarr_array.fill_value elif "_FillValue" in attributes: - # TODO update version check for the released version with dtypes - # probably be 3.1 - - # if Version(zarr.__version__) > Version("3.0.10"): - if hasattr(zarr_array.metadata.data_type, "to_native_dtype"): - native_dtype = zarr_array.metadata.data_type.to_native_dtype() - attributes["_FillValue"] = ( - # Use the new dtype infrastructure instead of doing xarray - # specific fill value decoding - FillValueCoder.decode( - attributes["_FillValue"], - native_dtype, - ) - ) - else: - dtype_value = zarr_array.metadata.data_type.value - attributes["_FillValue"] = FillValueCoder.decode( - attributes["_FillValue"], dtype_value - ) + attributes["_FillValue"] = FillValueCoder.decode( + attributes["_FillValue"], zarr_array.dtype + ) variable = Variable(dimensions, data, attributes, encoding) From 05c8aa60f2b64e98f33fdb361d97bf16a507e062 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 9 Jul 2025 17:07:50 -0400 Subject: [PATCH 19/22] update: zarr dtypes test --- xarray/tests/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 019a5ccca54..4de9e422761 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -130,7 +130,7 @@ def _importorskip( has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") -has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.0.10") +has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.1.0") if has_zarr_v3: import zarr @@ -141,7 +141,7 @@ def _importorskip( has_zarr_v3_dtypes = hasattr(zarr.core, "dtype") requires_zarr_v3_dtypes = pytest.mark.skipif( - not has_zarr_v3_dtypes, reason="requires zarr>3.0.10 (including dev versions)" + not has_zarr_v3_dtypes, reason="requires zarr>3.1.0" ) has_fsspec, requires_fsspec = _importorskip("fsspec") From 656802342e85252b981ab46945515fca15e0e0c8 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 9 Jul 2025 19:40:25 -0400 Subject: [PATCH 20/22] Apply suggestion from @ianhi --- .github/workflows/upstream-dev-ci.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 5c28ef0d851..bdc9a0b9cb9 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -23,7 +23,6 @@ jobs: name: detect upstream-dev ci trigger runs-on: ubuntu-latest if: | - github.event_name == 'push' || github.event_name == 'pull_request' outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: From 8d61d17872fd0ab6cae47dd3e40748dfa3160d39 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 9 Jul 2025 19:41:30 -0400 Subject: [PATCH 21/22] Update .github/workflows/upstream-dev-ci.yaml --- .github/workflows/upstream-dev-ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index bdc9a0b9cb9..5e74c85e319 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -23,6 +23,8 @@ jobs: name: detect upstream-dev ci trigger runs-on: ubuntu-latest if: | + github.repository == 'pydata/xarray' + && (github.event_name == 'push' || github.event_name == 'pull_request') outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: From d9e04ebbc0a718669691193f0c34e346ebd17d4a Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 9 Jul 2025 17:42:55 -0600 Subject: [PATCH 22/22] Update xarray/backends/zarr.py --- xarray/backends/zarr.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 743d72a19bc..8b26a6b40ec 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -820,9 +820,7 @@ def open_store_variable(self, name): attributes["_FillValue"], zarr_array.dtype ) - variable = Variable(dimensions, data, attributes, encoding) - - return variable + return Variable(dimensions, data, attributes, encoding) def get_variables(self): return FrozenDict((k, self.open_store_variable(k)) for k in self.array_keys())