Skip to content

Commit 1715ed3

Browse files
dcherian and andersy005 authored
Avoid duplicate Zarr array read (#8472)
Co-authored-by: Anderson Banihirwe <13301940+andersy005@users.noreply.github.com>
1 parent b313ffc commit 1715ed3

File tree

2 files changed

+43
-50
lines changed

2 files changed

+43
-50
lines changed

xarray/backends/zarr.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,12 @@ def encode_zarr_attr_value(value):
6161

6262

6363
class ZarrArrayWrapper(BackendArray):
64-
__slots__ = ("datastore", "dtype", "shape", "variable_name", "_array")
65-
66-
def __init__(self, variable_name, datastore):
67-
self.datastore = datastore
68-
self.variable_name = variable_name
64+
__slots__ = ("dtype", "shape", "_array")
6965

66+
def __init__(self, zarr_array):
7067
# some callers attempt to evaluate an array if an `array` property exists on the object.
7168
# we prefix with _ to avoid this inference.
72-
self._array = self.datastore.zarr_group[self.variable_name]
69+
self._array = zarr_array
7370
self.shape = self._array.shape
7471

7572
# preserve vlen string object dtype (GH 7328)
@@ -86,10 +83,10 @@ def get_array(self):
8683
return self._array
8784

8885
def _oindex(self, key):
89-
return self.get_array().oindex[key]
86+
return self._array.oindex[key]
9087

9188
def __getitem__(self, key):
92-
array = self.get_array()
89+
array = self._array
9390
if isinstance(key, indexing.BasicIndexer):
9491
return array[key.tuple]
9592
elif isinstance(key, indexing.VectorizedIndexer):
@@ -506,7 +503,7 @@ def ds(self):
506503
return self.zarr_group
507504

508505
def open_store_variable(self, name, zarr_array):
509-
data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self))
506+
data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array))
510507
try_nczarr = self._mode == "r"
511508
dimensions, attributes = _get_zarr_dims_and_attrs(
512509
zarr_array, DIMENSION_KEY, try_nczarr

xarray/tests/test_backends.py

Lines changed: 37 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2836,6 +2836,43 @@ def test_write_empty(
28362836
ls = listdir(os.path.join(store, "test"))
28372837
assert set(expected) == set([file for file in ls if file[0] != "."])
28382838

2839+
def test_avoid_excess_metadata_calls(self) -> None:
2840+
"""Test that chunk requests do not trigger redundant metadata requests.
2841+
2842+
This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls
2843+
to retrieve chunk data after initialization do not trigger additional
2844+
metadata requests.
2845+
2846+
https://github.com/pydata/xarray/issues/8290
2847+
"""
2848+
2849+
import zarr
2850+
2851+
ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))})
2852+
2853+
# The call to retrieve metadata performs a group lookup. We patch Group.__getitem__
2854+
# so that we can inspect calls to this method - specifically count of calls.
2855+
# Use of side_effect means that calls are passed through to the original method
2856+
# rather than a mocked method.
2857+
Group = zarr.hierarchy.Group
2858+
with (
2859+
self.create_zarr_target() as store,
2860+
patch.object(
2861+
Group, "__getitem__", side_effect=Group.__getitem__, autospec=True
2862+
) as mock,
2863+
):
2864+
ds.to_zarr(store, mode="w")
2865+
2866+
# We expect this to request array metadata information, so call_count should be == 1.
2867+
xrds = xr.open_zarr(store)
2868+
call_count = mock.call_count
2869+
assert call_count == 1
2870+
2871+
# compute() requests array data, which should not trigger additional metadata requests
2872+
# we assert that the number of calls has not increased after fetching the array
2873+
xrds.test.compute(scheduler="sync")
2874+
assert mock.call_count == call_count
2875+
28392876

28402877
class ZarrBaseV3(ZarrBase):
28412878
zarr_version = 3
@@ -2876,47 +2913,6 @@ def create_zarr_target(self):
28762913
yield tmp
28772914

28782915

2879-
@requires_zarr
2880-
class TestZarrArrayWrapperCalls(TestZarrKVStoreV3):
2881-
def test_avoid_excess_metadata_calls(self) -> None:
2882-
"""Test that chunk requests do not trigger redundant metadata requests.
2883-
2884-
This test targets logic in backends.zarr.ZarrArrayWrapper, asserting that calls
2885-
to retrieve chunk data after initialization do not trigger additional
2886-
metadata requests.
2887-
2888-
https://github.com/pydata/xarray/issues/8290
2889-
"""
2890-
2891-
import zarr
2892-
2893-
ds = xr.Dataset(data_vars={"test": (("Z",), np.array([123]).reshape(1))})
2894-
2895-
# The call to retrieve metadata performs a group lookup. We patch Group.__getitem__
2896-
# so that we can inspect calls to this method - specifically count of calls.
2897-
# Use of side_effect means that calls are passed through to the original method
2898-
# rather than a mocked method.
2899-
Group = zarr.hierarchy.Group
2900-
with (
2901-
self.create_zarr_target() as store,
2902-
patch.object(
2903-
Group, "__getitem__", side_effect=Group.__getitem__, autospec=True
2904-
) as mock,
2905-
):
2906-
ds.to_zarr(store, mode="w")
2907-
2908-
# We expect this to request array metadata information, so call_count should be >= 1,
2909-
# At time of writing, 2 calls are made
2910-
xrds = xr.open_zarr(store)
2911-
call_count = mock.call_count
2912-
assert call_count > 0
2913-
2914-
# compute() requests array data, which should not trigger additional metadata requests
2915-
# we assert that the number of calls has not increased after fetching the array
2916-
xrds.test.compute(scheduler="sync")
2917-
assert mock.call_count == call_count
2918-
2919-
29202916
@requires_zarr
29212917
@requires_fsspec
29222918
def test_zarr_storage_options() -> None:

0 commit comments

Comments
 (0)