Skip to content

Add .to_icechunk() method to ManifestGroup #591

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 72 additions & 1 deletion virtualizarr/manifests/group.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from __future__ import annotations

import textwrap
from typing import Iterator, Mapping
from time import datetime
from typing import TYPE_CHECKING, Iterator, Mapping, Optional

import xarray as xr
from zarr.core.group import GroupMetadata

from virtualizarr.manifests import ManifestArray

if TYPE_CHECKING:
from icechunk import IcechunkStore # type: ignore[import-not-found]


class ManifestGroup(
Mapping[str, "ManifestArray | ManifestGroup"],
Expand Down Expand Up @@ -131,3 +135,70 @@ def to_virtual_dataset(self) -> xr.Dataset:
coord_names=coord_names,
attrs=attributes,
)

@classmethod
def from_virtual_dataset(
cls,
vds: xr.Dataset,
) -> "ManifestGroup":
"""
Create a new ManifestGroup from a virtual dataset object.

The virtual dataset should contain only virtual variables, i.e. those backed by ManifestArrays.

Parameters
----------
vds: xr.Dataset
Virtual dataset, containing only virtual variables.
"""

for name, var in vds.variables.items():
if not isinstance(var.data, ManifestArray):
raise TypeError(
f"Cannot convert a dataset containing a loadable variable directly to a ManifestGroup, but found variable {name} has type {type(var.data)}"
)

manifestarrays = {name: var.data for name, var in vds.variables.items()}

attributes = vds.attrs
# TODO test this is correct
attributes["dimension_names"] = " ".join(list(vds.dims))
# TODO test this constructor round-trips coordinates
attributes["coordinates"] = " ".join(list(vds.coords))

return cls(arrays=manifestarrays, attributes=attributes)

def to_icechunk(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The point of adding this on ManifestGroup instead of ManifestStore initially is to make it easier to compose to deal with virtual datatrees later.

store: "IcechunkStore",
group: Optional[str] = None,
append_dim: Optional[str] = None,
last_updated_at: Optional[datetime] = None,
) -> None:
"""
Write the contents of this ManifestGroup into an Icechunk Store as virtual chunks.

Parameters
----------
store: IcechunkStore
Store to write dataset into.
group: str, optional
Path of the group to write the dataset into (default: the root group).
append_dim: str, optional
Dimension along which to append the virtual dataset.
last_updated_at: datetime, optional
Datetime to use as a checksum for any virtual chunks written to the store
with this operation. When not provided, no check is performed.

Raises
------
ValueError
If the store is read-only.
"""
from virtualizarr.writers.icechunk import write_manifestgroup_to_icechunk

write_manifestgroup_to_icechunk(
store=store,
group=group,
append_dim=append_dim,
last_updated_at=last_updated_at,
)
53 changes: 37 additions & 16 deletions virtualizarr/writers/icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
from xarray.backends.zarr import encode_zarr_attr_value
from zarr import Array, Group

from virtualizarr.codecs import get_codecs
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.codecs import extract_codecs, get_codecs
from virtualizarr.manifests import ChunkManifest, ManifestArray, ManifestGroup
from virtualizarr.manifests.utils import (
check_compatible_encodings,
check_same_chunk_shapes,
check_same_codecs,
check_same_dtypes,
Expand Down Expand Up @@ -219,20 +218,43 @@ def write_virtual_dataset_to_icechunk_group(
append_dim=append_dim,
)

# TODO make sure coordinates get set
fully_virtual_ds = xr.Dataset(loadable_variables, attrs=vds.attrs)
manifest_group = ManifestGroup.from_virtual_dataset(fully_virtual_ds)

# note: this calls write_manifestgroup_to_icechunk
manifest_group.to_icechunk(
store=store,
group=group.name,
append_dim=append_dim,
last_updated_at=last_updated_at,
)


def write_manifestgroup_to_icechunk(
manifest_group: ManifestGroup,
store: "IcechunkStore",
*,
group: Optional[str] = None,
append_dim: Optional[str] = None,
last_updated_at: Optional[datetime] = None,
) -> None:
"""Write the contents of a single ManifestGroup into an Icechunk Store as virtual chunks."""

# Then write the virtual variables to the same group
for name, var in virtual_variables.items():
write_virtual_variable_to_icechunk(
for name, marr in manifest_group.arrays():
write_manifestarray_to_icechunk(
store=store,
group=group,
name=name, # type: ignore[arg-type]
var=var,
marr=marr,
append_dim=append_dim,
last_updated_at=last_updated_at,
)

# finish by writing group-level attributes
# note: group attributes must be set after writing individual variables else it gets overwritten
update_attributes(group, vds.attrs, coords=vds.coords)
update_attributes(group, group.metadata.attributes)


def update_attributes(
Expand Down Expand Up @@ -295,23 +317,19 @@ def check_compatible_arrays(
check_same_shapes_except_on_concat_axis(arr_shapes, append_axis)


def write_virtual_variable_to_icechunk(
def write_manifestarray_to_icechunk(
store: "IcechunkStore",
group: "Group",
name: str,
var: xr.Variable,
ma: ManifestArray,
append_dim: Optional[str] = None,
last_updated_at: Optional[datetime] = None,
) -> None:
"""Write a single virtual variable into an icechunk store"""
from zarr import Array

from virtualizarr.codecs import extract_codecs

ma = cast(ManifestArray, var.data)
metadata = ma.metadata

dims: list[str] = cast(list[str], list(var.dims))
dims: list[str] = cast(list[str], list(ma.metadata.dimension_names))
existing_num_chunks = 0
if append_dim and append_dim in dims:
# TODO: MRP - zarr, or icechunk zarr, array assignment to a variable doesn't work to point to the same object
Expand All @@ -322,7 +340,9 @@ def write_virtual_variable_to_icechunk(

# check if arrays can be concatenated
check_compatible_arrays(ma, group[name], append_axis) # type: ignore[arg-type]
check_compatible_encodings(var.encoding, group[name].attrs)

# TODO how to check encodings for manifestarrays here?
# check_compatible_encodings(var.encoding, group[name].attrs)

# determine number of existing chunks along the append axis
existing_num_chunks = num_chunks(
Expand All @@ -347,10 +367,11 @@ def write_virtual_variable_to_icechunk(
dtype=metadata.data_type.to_numpy(),
filters=filters,
compressors=compressors,
dimension_names=var.dims,
dimension_names=dims,
fill_value=metadata.fill_value,
)

# TODO
update_attributes(arr, var.attrs, encoding=var.encoding)

write_manifest_virtual_refs(
Expand Down
Loading