Skip to content

Commit e014bb7

Browse files
authored
Write manifests to zarr store (#45)
* basic structure for writing a zarr store containing manifests * write in nicely indented form * use pathlib for everything * documentation * docstrings * vendor zarr.utils.json_dumps * remove consolidated metadata, as v3 doesn't have this yet * license for vendoring part of zarr-python * change to write v3 * implement reading from v3-compliant stores * roundtripping test * forgot to add the file with the test * test dataset-level attributes * debugging print * try explicitly separating files from directories
1 parent 08774f7 commit e014bb7

File tree

9 files changed

+378
-56
lines changed

9 files changed

+378
-56
lines changed

docs/usage.md

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,25 @@ Currently you can only serialize virtual variables backed by `ManifestArray` obj
338338

339339
### Writing as Zarr
340340

341-
TODO: Write out references as a Zarr v3 store following the [Chunk Manifest ZEP](https://github.com/zarr-developers/zarr-specs/issues/287), see [PR #45](https://github.com/TomNicholas/VirtualiZarr/pull/45)
341+
Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr <virtualizarr.xarray.VirtualiZarrDatasetAccessor.to_zarr>` accessor method.
342342

343-
TODO: Explanation of how this requires changes in zarr upstream to be able to read it
343+
```python
344+
combined_vds.virtualize.to_zarr('combined.zarr')
345+
```
346+
347+
The result is a zarr v3 store on disk which contains the chunk manifest information written out as `manifest.json` files, so the store looks like this:
348+
349+
```
350+
combined/zarr.json <- group metadata
351+
combined/air/zarr.json <- array metadata
352+
combined/air/manifest.json <- array manifest
353+
...
354+
```
355+
356+
The advantage of this format is that any zarr v3 reader that understands the chunk manifest ZEP could read from this store, no matter what language it is written in (e.g. via `zarr-python`, `zarr-js`, or rust). This reading would also not require `fsspec`.
357+
358+
```{note}
359+
Currently there are not yet any zarr v3 readers which understand the chunk manifest ZEP, so until then this feature cannot be used for data processing.
360+
361+
This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`.
362+
```

virtualizarr/manifests/manifest.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import itertools
2+
import json
23
import re
34
from typing import Any, Iterable, Iterator, List, Mapping, Tuple, Union, cast
45

@@ -110,14 +111,19 @@ def dict(self) -> dict[str, dict[str, Union[str, int]]]:
110111
"""Converts the entire manifest to a nested dictionary."""
111112
return {k: dict(entry) for k, entry in self.entries.items()}
112113

113-
@staticmethod
114-
def from_zarr_json(filepath: str) -> "ChunkManifest":
114+
@classmethod
115+
def from_zarr_json(cls, filepath: str) -> "ChunkManifest":
115116
"""Create a ChunkManifest from a Zarr manifest.json file."""
116-
raise NotImplementedError()
117+
with open(filepath, "r") as manifest_file:
118+
entries_dict = json.load(manifest_file)
119+
120+
entries = {cast(ChunkKey, k): ChunkEntry(**entry) for k, entry in entries_dict.items()}
121+
return cls(entries=entries)
117122

118123
def to_zarr_json(self, filepath: str) -> None:
119124
"""Write a ChunkManifest to a Zarr manifest.json file."""
120-
raise NotImplementedError()
125+
with open(filepath, "w") as json_file:
126+
json.dump(self.dict(), json_file, indent=4, separators=(", ", ": "))
121127

122128
@classmethod
123129
def _from_kerchunk_chunk_dict(cls, kerchunk_chunk_dict) -> "ChunkManifest":

virtualizarr/tests/test_zarr.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import xarray as xr
2+
import numpy as np
3+
import xarray.testing as xrt
4+
from virtualizarr import open_virtual_dataset, ManifestArray
5+
from virtualizarr.manifests.manifest import ChunkEntry
6+
7+
8+
def test_zarr_v3_roundtrip(tmpdir):
9+
arr = ManifestArray(
10+
chunkmanifest={"0.0": ChunkEntry(path="test.nc", offset=6144, length=48)},
11+
zarray=dict(
12+
shape=(2, 3),
13+
dtype=np.dtype("<i8"),
14+
chunks=(2, 3),
15+
compressor=None,
16+
filters=None,
17+
fill_value=None,
18+
order="C",
19+
zarr_format=3,
20+
),
21+
)
22+
original = xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})
23+
24+
original.virtualize.to_zarr(tmpdir / "store.zarr")
25+
roundtrip = open_virtual_dataset(tmpdir / "store.zarr", filetype="zarr_v3", indexes={})
26+
27+
xrt.assert_identical(roundtrip, original)

virtualizarr/vendor/__init__.py

Whitespace-only changes.

virtualizarr/vendor/zarr/LICENSE.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
The MIT License (MIT)
2+
3+
Copyright (c) 2015-2024 Zarr Developers <https://github.com/zarr-developers>
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

virtualizarr/vendor/zarr/__init__.py

Whitespace-only changes.

virtualizarr/vendor/zarr/utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import json
2+
import numbers
3+
4+
from typing import Any
5+
6+
7+
class NumberEncoder(json.JSONEncoder):
8+
def default(self, o):
9+
# See json.JSONEncoder.default docstring for explanation
10+
# This is necessary to encode numpy dtype
11+
if isinstance(o, numbers.Integral):
12+
return int(o)
13+
if isinstance(o, numbers.Real):
14+
return float(o)
15+
return json.JSONEncoder.default(self, o)
16+
17+
18+
def json_dumps(o: Any) -> bytes:
19+
"""Write JSON in a consistent, human-readable way."""
20+
return json.dumps(
21+
o, indent=4, sort_keys=True, ensure_ascii=True, separators=(",", ": "), cls=NumberEncoder
22+
).encode("ascii")

virtualizarr/xarray.py

Lines changed: 111 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pathlib import Path
12
from typing import List, Literal, Mapping, Optional, Union, overload, MutableMapping, Iterable
23

34
import ujson # type: ignore
@@ -10,6 +11,7 @@
1011
import virtualizarr.kerchunk as kerchunk
1112
from virtualizarr.kerchunk import KerchunkStoreRefs, FileType
1213
from virtualizarr.manifests import ChunkManifest, ManifestArray
14+
from virtualizarr.zarr import dataset_to_zarr, attrs_from_zarr_group_json, metadata_from_zarr_json
1315

1416

1517
class ManifestBackendArray(ManifestArray, BackendArray):
@@ -39,7 +41,7 @@ def open_virtual_dataset(
3941
File path to open as a set of virtualized zarr arrays.
4042
filetype : FileType, default None
4143
Type of file to be opened. Used to determine which kerchunk file format backend to use.
42-
Can be one of {'netCDF3', 'netCDF4'}.
44+
Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}.
4345
If not provided will attempt to automatically infer the correct filetype from the the filepath's extension.
4446
drop_variables: list[str], default is None
4547
Variables in the file to drop before returning.
@@ -76,57 +78,116 @@ def open_virtual_dataset(
7678
if common:
7779
raise ValueError(f"Cannot both load and drop variables {common}")
7880

79-
# this is the only place we actually always need to use kerchunk directly
80-
# TODO avoid even reading byte ranges for variables that will be dropped later anyway?
81-
vds_refs = kerchunk.read_kerchunk_references_from_file(
82-
filepath=filepath,
83-
filetype=filetype,
84-
)
85-
virtual_vars = virtual_vars_from_kerchunk_refs(
86-
vds_refs,
87-
drop_variables=drop_variables + loadable_variables,
88-
virtual_array_class=virtual_array_class,
89-
)
90-
ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
91-
92-
if indexes is None or len(loadable_variables) > 0:
93-
# TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
94-
# TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
95-
# TODO really we probably want a dedicated xarray backend that iterates over all variables only once
96-
ds = xr.open_dataset(filepath, drop_variables=drop_variables)
97-
98-
if indexes is None:
99-
# add default indexes by reading data from file
100-
indexes = {name: index for name, index in ds.xindexes.items()}
101-
elif indexes != {}:
102-
# TODO allow manual specification of index objects
103-
raise NotImplementedError()
104-
else:
105-
indexes = dict(**indexes) # for type hinting: to allow mutation
10681

107-
loadable_vars = {name: var for name, var in ds.variables.items() if name in loadable_variables}
82+
if virtual_array_class is not ManifestArray:
83+
raise NotImplementedError()
10884

109-
# if we only read the indexes we can just close the file right away as nothing is lazy
110-
if loadable_vars == {}:
111-
ds.close()
85+
if filetype == "zarr_v3":
86+
# TODO is there a neat way of auto-detecting this?
87+
return open_virtual_dataset_from_v3_store(storepath=filepath, drop_variables=drop_variables, indexes=indexes)
11288
else:
113-
loadable_vars = {}
114-
indexes = {}
89+
# this is the only place we actually always need to use kerchunk directly
90+
# TODO avoid even reading byte ranges for variables that will be dropped later anyway?
91+
vds_refs = kerchunk.read_kerchunk_references_from_file(
92+
filepath=filepath,
93+
filetype=filetype,
94+
)
95+
virtual_vars = virtual_vars_from_kerchunk_refs(
96+
vds_refs,
97+
drop_variables=drop_variables + loadable_variables,
98+
virtual_array_class=virtual_array_class,
99+
)
100+
ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
101+
102+
if indexes is None or len(loadable_variables) > 0:
103+
# TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
104+
# TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
105+
# TODO really we probably want a dedicated xarray backend that iterates over all variables only once
106+
ds = xr.open_dataset(filepath, drop_variables=drop_variables)
107+
108+
if indexes is None:
109+
# add default indexes by reading data from file
110+
indexes = {name: index for name, index in ds.xindexes.items()}
111+
elif indexes != {}:
112+
# TODO allow manual specification of index objects
113+
raise NotImplementedError()
114+
else:
115+
indexes = dict(**indexes) # for type hinting: to allow mutation
116+
117+
loadable_vars = {name: var for name, var in ds.variables.items() if name in loadable_variables}
118+
119+
# if we only read the indexes we can just close the file right away as nothing is lazy
120+
if loadable_vars == {}:
121+
ds.close()
122+
else:
123+
loadable_vars = {}
124+
indexes = {}
125+
126+
vars = {**virtual_vars, **loadable_vars}
127+
128+
data_vars, coords = separate_coords(vars, indexes)
129+
130+
vds = xr.Dataset(
131+
data_vars,
132+
coords=coords,
133+
# indexes={}, # TODO should be added in a later version of xarray
134+
attrs=ds_attrs,
135+
)
136+
137+
# TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
115138

116-
vars = {**virtual_vars, **loadable_vars}
139+
return vds
140+
141+
142+
def open_virtual_dataset_from_v3_store(
143+
storepath: str,
144+
drop_variables: List[str],
145+
indexes: Optional[Mapping[str, Index]],
146+
) -> xr.Dataset:
147+
"""
148+
Read a Zarr v3 store and return an xarray Dataset containing virtualized arrays.
149+
"""
150+
_storepath = Path(storepath)
151+
152+
ds_attrs = attrs_from_zarr_group_json(_storepath / "zarr.json")
153+
154+
# TODO recursive glob to create a datatree
155+
# Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it
156+
# see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166
157+
all_paths = _storepath.glob("*/")
158+
directory_paths = [p for p in all_paths if not p.is_file()]
159+
160+
vars = {}
161+
for array_dir in directory_paths:
162+
var_name = array_dir.name
163+
if var_name in drop_variables:
164+
break
165+
166+
zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json")
167+
manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json"))
168+
169+
marr = ManifestArray(chunkmanifest=manifest, zarray=zarray)
170+
var = xr.Variable(data=marr, dims=dim_names, attrs=attrs)
171+
vars[var_name] = var
172+
173+
if indexes is None:
174+
raise NotImplementedError()
175+
elif indexes != {}:
176+
# TODO allow manual specification of index objects
177+
raise NotImplementedError()
178+
else:
179+
indexes = dict(**indexes) # for type hinting: to allow mutation
117180

118181
data_vars, coords = separate_coords(vars, indexes)
119182

120-
vds = xr.Dataset(
183+
ds = xr.Dataset(
121184
data_vars,
122185
coords=coords,
123186
# indexes={}, # TODO should be added in a later version of xarray
124187
attrs=ds_attrs,
125188
)
126189

127-
# TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
128-
129-
return vds
190+
return ds
130191

131192

132193
def virtual_vars_from_kerchunk_refs(
@@ -161,9 +222,9 @@ def virtual_vars_from_kerchunk_refs(
161222

162223
def dataset_from_kerchunk_refs(
163224
refs: KerchunkStoreRefs,
164-
drop_variables: Optional[List[str]] = None,
165-
virtual_array_class=ManifestArray,
166-
indexes={},
225+
drop_variables: List[str] = [],
226+
virtual_array_class: type = ManifestArray,
227+
indexes: Optional[MutableMapping[str, Index]] = None,
167228
) -> xr.Dataset:
168229
"""
169230
Translate a store-level kerchunk reference dict into an xarray Dataset containing virtualized arrays.
@@ -177,6 +238,8 @@ def dataset_from_kerchunk_refs(
177238

178239
vars = virtual_vars_from_kerchunk_refs(refs, drop_variables, virtual_array_class)
179240

241+
if indexes is None:
242+
indexes = {}
180243
data_vars, coords = separate_coords(vars, indexes)
181244

182245
ds_attrs = kerchunk.fully_decode_arr_refs(refs["refs"]).get(".zattrs", {})
@@ -261,13 +324,16 @@ def to_zarr(self, storepath: str) -> None:
261324
"""
262325
Serialize all virtualized arrays in this xarray dataset as a Zarr store.
263326
327+
Currently requires all variables to be backed by ManifestArray objects.
328+
329+
Not very useful until some implementation of a Zarr reader can actually read these manifest.json files.
330+
See https://github.com/zarr-developers/zarr-specs/issues/287
331+
264332
Parameters
265333
----------
266334
storepath : str
267335
"""
268-
raise NotImplementedError(
269-
"No point in writing out these virtual arrays to Zarr until at least one Zarr reader can actually read them."
270-
)
336+
dataset_to_zarr(self.ds, storepath)
271337

272338
@overload
273339
def to_kerchunk(self, filepath: None, format: Literal["dict"]) -> KerchunkStoreRefs:

0 commit comments

Comments
 (0)