Skip to content

Commit 5876365

Browse files
olimccmax-sixtyjhammanpre-commit-ci[bot]
authored
Avoid redundant metadata reads in ZarrArrayWrapper (#8297)
* Avoid redundant metadata reads in `ZarrArrayWrapper` Modify ZarrArrayWrapper to avoid re-reading metadata each time data is read from the array. * Improve test documentation * Update xarray/tests/test_backends.py Co-authored-by: Joe Hamman <jhamman1@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add what's new entry --------- Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Co-authored-by: Joe Hamman <jhamman1@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 47eec7f commit 5876365

File tree

3 files changed

+58
-7
lines changed

3 files changed

+58
-7
lines changed

doc/whats-new.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ Deprecations
5454

5555
Bug fixes
5656
~~~~~~~~~
57+
5758
- :py:meth:`DataArray.rename` & :py:meth:`Dataset.rename` would emit a warning
5859
when the operation was a no-op. (:issue:`8266`)
5960
By `Simon Hansen <https://github.com/hoxbro>`_.
@@ -64,6 +65,12 @@ Bug fixes
6465
(:issue:`8271`, :pull:`8272`). By `Spencer Clark
6566
<https://github.com/spencerkclark>`_.
6667

68+
- Fix excess metadata requests when using a Zarr store. Prior to this, metadata
69+
was re-read every time data was retrieved from the array; now metadata is retrieved only once
70+
when the array is initialized.
71+
(:issue:`8290`, :pull:`8297`).
72+
By `Oliver McCormack <https://github.com/olimcc>`_.
73+
6774

6875
Documentation
6976
~~~~~~~~~~~~~

xarray/backends/zarr.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,27 +61,29 @@ def encode_zarr_attr_value(value):
6161

6262

6363
class ZarrArrayWrapper(BackendArray):
64-
__slots__ = ("datastore", "dtype", "shape", "variable_name")
64+
__slots__ = ("datastore", "dtype", "shape", "variable_name", "_array")
6565

6666
def __init__(self, variable_name, datastore):
6767
self.datastore = datastore
6868
self.variable_name = variable_name
6969

70-
array = self.get_array()
71-
self.shape = array.shape
70+
# some callers attempt to evaluate an array if an `array` property exists on the object.
71+
# we prefix with _ to avoid this inference.
72+
self._array = self.datastore.zarr_group[self.variable_name]
73+
self.shape = self._array.shape
7274

7375
# preserve vlen string object dtype (GH 7328)
74-
if array.filters is not None and any(
75-
[filt.codec_id == "vlen-utf8" for filt in array.filters]
76+
if self._array.filters is not None and any(
77+
[filt.codec_id == "vlen-utf8" for filt in self._array.filters]
7678
):
7779
dtype = coding.strings.create_vlen_dtype(str)
7880
else:
79-
dtype = array.dtype
81+
dtype = self._array.dtype
8082

8183
self.dtype = dtype
8284

8385
def get_array(self):
84-
return self.datastore.zarr_group[self.variable_name]
86+
return self._array
8587

8688
def _oindex(self, key):
8789
return self.get_array().oindex[key]

xarray/tests/test_backends.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from os import listdir
2020
from pathlib import Path
2121
from typing import TYPE_CHECKING, Any, Final, cast
22+
from unittest.mock import patch
2223

2324
import numpy as np
2425
import pandas as pd
@@ -2862,6 +2863,47 @@ def create_zarr_target(self):
28622863
yield tmp
28632864

28642865

2866+
@requires_zarr
2867+
class TestZarrArrayWrapperCalls(TestZarrKVStoreV3):
2868+
def test_avoid_excess_metadata_calls(self) -> None:
2869+
"""Test that chunk requests do not trigger redundant metadata requests.
2870+
2871+
This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls
2872+
to retrieve chunk data after initialization do not trigger additional
2873+
metadata requests.
2874+
2875+
https://github.com/pydata/xarray/issues/8290
2876+
"""
2877+
2878+
import zarr
2879+
2880+
ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))})
2881+
2882+
# The call to retrieve metadata performs a group lookup. We patch Group.__getitem__
2883+
# so that we can inspect calls to this method - specifically count of calls.
2884+
# Use of side_effect means that calls are passed through to the original method
2885+
# rather than a mocked method.
2886+
Group = zarr.hierarchy.Group
2887+
with (
2888+
self.create_zarr_target() as store,
2889+
patch.object(
2890+
Group, "__getitem__", side_effect=Group.__getitem__, autospec=True
2891+
) as mock,
2892+
):
2893+
ds.to_zarr(store, mode="w")
2894+
2895+
# We expect this to request array metadata information, so call_count should be >= 1,
2896+
# At time of writing, 2 calls are made
2897+
xrds = xr.open_zarr(store)
2898+
call_count = mock.call_count
2899+
assert call_count > 0
2900+
2901+
# compute() requests array data, which should not trigger additional metadata requests
2902+
# we assert that the number of calls has not increased after fetching the array
2903+
xrds.test.compute(scheduler="sync")
2904+
assert mock.call_count == call_count
2905+
2906+
28652907
@requires_zarr
28662908
@requires_fsspec
28672909
def test_zarr_storage_options() -> None:

0 commit comments

Comments
 (0)