From 45bb4e56d24ab5d979eabb92b1db11a5897937be Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 11:08:07 +0100 Subject: [PATCH 01/26] add rough cli converter structure --- src/zarr/core/metadata/converter/__init__.py | 0 src/zarr/core/metadata/converter/cli.py | 0 .../metadata/converter/v2_v3_converter.py | 67 +++++++++++++++++++ src/zarr/core/metadata/v2.py | 2 +- 4 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 src/zarr/core/metadata/converter/__init__.py create mode 100644 src/zarr/core/metadata/converter/cli.py create mode 100644 src/zarr/core/metadata/converter/v2_v3_converter.py diff --git a/src/zarr/core/metadata/converter/__init__.py b/src/zarr/core/metadata/converter/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py new file mode 100644 index 0000000000..7d79b5e2d0 --- /dev/null +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -0,0 +1,67 @@ +from zarr.abc.codec import Codec +from zarr.codecs.blosc import BloscCodec, BloscShuffle +from zarr.core.array import Array +from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata + + +async def convert_v2_to_v3(zarr_v2: Array) -> None: + if not isinstance(zarr_v2.metadata, ArrayV2Metadata): + raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") + + # zarr_format = zarr_v2.metadata.zarr_format + # if zarr_format != 2: + # raise ValueError( + # f"Input zarr array / group is zarr_format {zarr_format} - only 2 is accepted." + # ) + + # accept array or group - if group, iterate into it to do the whole hierarchy + + # how are the metadata files currently written? Which function? + # could add a to_v3() function on to the ArrayV2Metadata / GroupMetadata classes?? + convert_v2_metadata(zarr_v2.metadata) + + # Check for existing zarr json? + # await zarr_v2._async_array._save_metadata(metadata_v3) + + +def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: + chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) + + # Handle C vs F? 
(see gist) + codecs = convert_compressor(metadata_v2) + + return ArrayV3Metadata( + shape=metadata_v2.shape, + data_type=metadata_v2.dtype, + chunk_grid=metadata_v2.chunk_grid, + chunk_key_encoding=chunk_key_encoding, + fill_value=metadata_v2.fill_value, + codecs=codecs, + attributes=metadata_v2.attributes, + dimension_names=None, + storage_transformers=None, + ) + + +def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[Codec]: + compressor_codecs: list[Codec] = [] + + if metadata_v2.compressor is None: + return compressor_codecs + + compressor_name = metadata_v2.compressor.codec_id + + if compressor_name == "blosc": + compressor_codecs.append( + BloscCodec( + typesize=metadata_v2.dtype.to_native_dtype().itemsize, + cname=metadata_v2.compressor.cname, + clevel=metadata_v2.compressor.clevel, + shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), + blocksize=metadata_v2.compressor.blocksize, + ) + ) + + return compressor_codecs diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 3ac75e0418..7bdad204b8 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -68,7 +68,7 @@ class ArrayV2Metadata(Metadata): order: MemoryOrder = "C" filters: tuple[numcodecs.abc.Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: CompressorLikev2 + compressor: numcodecs.abc.Codec | None attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) From 456c9e774061ccb265ca5752854ae8414077b9f9 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:15:09 +0100 Subject: [PATCH 02/26] allow zstd, gzip and numcodecs zarr 3 compression --- .../metadata/converter/v2_v3_converter.py | 50 +++++++++++++++---- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index 7d79b5e2d0..0208ce9c26 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -1,9 +1,12 @@ from zarr.abc.codec import Codec from zarr.codecs.blosc import BloscCodec, BloscShuffle +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.registry import get_codec_class async def convert_v2_to_v3(zarr_v2: Array) -> None: @@ -53,15 +56,44 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[Codec]: compressor_name = metadata_v2.compressor.codec_id - if compressor_name == "blosc": - compressor_codecs.append( - BloscCodec( - typesize=metadata_v2.dtype.to_native_dtype().itemsize, - cname=metadata_v2.compressor.cname, - clevel=metadata_v2.compressor.clevel, - shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), - blocksize=metadata_v2.compressor.blocksize, + match compressor_name: + case "blosc": + compressor_codecs.append( + BloscCodec( + typesize=metadata_v2.dtype.to_native_dtype().itemsize, + cname=metadata_v2.compressor.cname, + clevel=metadata_v2.compressor.clevel, + shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), + blocksize=metadata_v2.compressor.blocksize, + ) ) - ) + + case "zstd": + compressor_codecs.append( + ZstdCodec( + level=metadata_v2.compressor.level, + 
checksum=metadata_v2.compressor.checksum, + ) + ) + + case "gzip": + compressor_codecs.append(GzipCodec(level=metadata_v2.compressor.level)) + + case _: + # If possible, find matching numcodecs.zarr3 codec + numcodec_name = f"numcodecs.{compressor_name}" + numcodec_dict = { + "name": numcodec_name, + "configuration": metadata_v2.compressor.get_config(), + } + + try: + compressor_codec = get_codec_class(numcodec_name) + except KeyError as exc: + raise ValueError( + f"Couldn't find corresponding numcodecs.zarr3 codec for {compressor_name}" + ) from exc + + compressor_codecs.append(compressor_codec.from_dict(numcodec_dict)) return compressor_codecs From 242a338353d50ec7ca8c6008fa22d9443c3737d0 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:22:14 +0100 Subject: [PATCH 03/26] convert filters to v3 --- .../metadata/converter/v2_v3_converter.py | 62 ++++++++++++++----- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index 0208ce9c26..23085addc4 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -1,4 +1,8 @@ -from zarr.abc.codec import Codec +from typing import cast + +import numcodecs.abc + +from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.codecs.blosc import BloscCodec, BloscShuffle from zarr.codecs.gzip import GzipCodec from zarr.codecs.zstd import ZstdCodec @@ -33,7 +37,10 @@ def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) # Handle C vs F? (see gist) - codecs = convert_compressor(metadata_v2) + convert_filters(metadata_v2) + convert_compressor(metadata_v2) + + codecs: list[Codec] = [] return ArrayV3Metadata( shape=metadata_v2.shape, @@ -48,8 +55,20 @@ def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: ) -def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[Codec]: - compressor_codecs: list[Codec] = [] +def convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: + if metadata_v2.filters is None: + return [] + + filters_codecs = [find_numcodecs_zarr3(filter) for filter in metadata_v2.filters] + for codec in filters_codecs: + if not isinstance(codec, ArrayArrayCodec): + raise TypeError(f"Filter {type(codec)} is not an ArrayArrayCodec") + + return cast(list[ArrayArrayCodec], filters_codecs) + + +def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[BytesBytesCodec]: + compressor_codecs: list[BytesBytesCodec] = [] if metadata_v2.compressor is None: return compressor_codecs @@ -81,19 +100,30 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[Codec]: case _: # If possible, find matching numcodecs.zarr3 codec - numcodec_name = f"numcodecs.{compressor_name}" - numcodec_dict = { - "name": numcodec_name, - "configuration": metadata_v2.compressor.get_config(), - } + compressor_codec = find_numcodecs_zarr3(metadata_v2.compressor) - try: - compressor_codec = get_codec_class(numcodec_name) - except KeyError as exc: - raise ValueError( - f"Couldn't find corresponding numcodecs.zarr3 codec for {compressor_name}" - ) from exc + if not isinstance(compressor_codec, BytesBytesCodec): + raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec") - compressor_codecs.append(compressor_codec.from_dict(numcodec_dict)) + 
compressor_codecs.append(compressor_codec) return compressor_codecs + + +def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: + """Find matching numcodecs.zarr3 codec (if it exists)""" + + numcodec_name = f"numcodecs.{numcodecs_codec.codec_id}" + numcodec_dict = { + "name": numcodec_name, + "configuration": numcodecs_codec.get_config(), + } + + try: + codec_v3 = get_codec_class(numcodec_name) + except KeyError as exc: + raise ValueError( + f"Couldn't find corresponding numcodecs.zarr3 codec for {numcodecs_codec.codec_id}" + ) from exc + + return codec_v3.from_dict(numcodec_dict) From 1045c331c59e50e22959c054505d15869f94d33c Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 15:05:33 +0100 Subject: [PATCH 04/26] create BytesCodec with correct endian --- .../metadata/converter/v2_v3_converter.py | 53 ++++++++++--------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index 23085addc4..cbf5603abc 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -4,10 +4,12 @@ from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.codecs.blosc import BloscCodec, BloscShuffle +from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.dtype.common import HasEndianness from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.registry import get_codec_class @@ -37,11 +39,22 @@ def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) # Handle C vs F? 
(see gist) - convert_filters(metadata_v2) - convert_compressor(metadata_v2) - codecs: list[Codec] = [] + # array-array codecs + codecs.extend(convert_filters(metadata_v2)) + + # array-bytes codecs + if not isinstance(metadata_v2.dtype, HasEndianness): + codecs.append(BytesCodec(endian=None)) + else: + codecs.append(BytesCodec(endian=metadata_v2.dtype.endianness)) + + # bytes-bytes codecs + bytes_bytes_codec = convert_compressor(metadata_v2) + if bytes_bytes_codec is not None: + codecs.append(bytes_bytes_codec) + return ArrayV3Metadata( shape=metadata_v2.shape, data_type=metadata_v2.dtype, @@ -67,36 +80,30 @@ def convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: return cast(list[ArrayArrayCodec], filters_codecs) -def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[BytesBytesCodec]: - compressor_codecs: list[BytesBytesCodec] = [] - +def convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: if metadata_v2.compressor is None: - return compressor_codecs + return None compressor_name = metadata_v2.compressor.codec_id match compressor_name: case "blosc": - compressor_codecs.append( - BloscCodec( - typesize=metadata_v2.dtype.to_native_dtype().itemsize, - cname=metadata_v2.compressor.cname, - clevel=metadata_v2.compressor.clevel, - shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), - blocksize=metadata_v2.compressor.blocksize, - ) + return BloscCodec( + typesize=metadata_v2.dtype.to_native_dtype().itemsize, + cname=metadata_v2.compressor.cname, + clevel=metadata_v2.compressor.clevel, + shuffle=BloscShuffle.from_int(metadata_v2.compressor.shuffle), + blocksize=metadata_v2.compressor.blocksize, ) case "zstd": - compressor_codecs.append( - ZstdCodec( - level=metadata_v2.compressor.level, - checksum=metadata_v2.compressor.checksum, - ) + return ZstdCodec( + level=metadata_v2.compressor.level, + checksum=metadata_v2.compressor.checksum, ) case "gzip": - compressor_codecs.append(GzipCodec(level=metadata_v2.compressor.level)) + return GzipCodec(level=metadata_v2.compressor.level) case _: # If possible, find matching numcodecs.zarr3 codec @@ -105,9 +112,7 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> list[BytesBytesCodec]: if not isinstance(compressor_codec, BytesBytesCodec): raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec") - compressor_codecs.append(compressor_codec) - - return compressor_codecs + return compressor_codec def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: From 4e2442fb8494f4dc264956856f7d94dbcf34133a Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 1 Jul 2025 15:59:42 +0100 Subject: [PATCH 05/26] handle C vs F order in v2 metadata --- src/zarr/core/metadata/converter/v2_v3_converter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index cbf5603abc..5571f14487 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -6,6 +6,7 @@ from zarr.codecs.blosc import BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec +from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding @@ -38,10 +39,12 @@ async def convert_v2_to_v3(zarr_v2: Array) -> 
None: def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) - # Handle C vs F? (see gist) codecs: list[Codec] = [] # array-array codecs + if metadata_v2.order == "F": + # F is equivalent to order: n-1, ... 1, 0 + codecs.append(TransposeCodec(order=list(range(len(metadata_v2.shape) - 1, -1, -1)))) codecs.extend(convert_filters(metadata_v2)) # array-bytes codecs From c63f0b87bacfd10a7252f815140f3264a74d041e Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:27:13 +0100 Subject: [PATCH 06/26] save group and array metadata to file --- .../metadata/converter/v2_v3_converter.py | 51 +++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index 5571f14487..fce2ad4654 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -1,3 +1,4 @@ +import asyncio from typing import cast import numcodecs.abc @@ -9,34 +10,37 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.common import ZARR_JSON from zarr.core.dtype.common import HasEndianness +from zarr.core.group import Group, GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.registry import get_codec_class +from zarr.storage._utils import _join_paths -async def convert_v2_to_v3(zarr_v2: Array) -> None: - if not isinstance(zarr_v2.metadata, ArrayV2Metadata): +async def convert_v2_to_v3(zarr_v2: Array | Group) -> None: + if not zarr_v2.metadata.zarr_format == 2: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") - # zarr_format = zarr_v2.metadata.zarr_format - # if zarr_format != 2: - # raise ValueError( - # f"Input zarr array / group is zarr_format {zarr_format} - only 2 is accepted." - # ) + if isinstance(zarr_v2.metadata, GroupMetadata): + group_metadata_v3 = GroupMetadata( + attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None + ) + await save_v3_metadata(zarr_v2, group_metadata_v3) - # accept array or group - if group, iterate into it to do the whole hierarchy + # process members of the group + for key in zarr_v2: + await convert_v2_to_v3(zarr_v2[key]) - # how are the metadata files currently written? Which function? - # could add a to_v3() function on to the ArrayV2Metadata / GroupMetadata classes?? - convert_v2_metadata(zarr_v2.metadata) - - # Check for existing zarr json? 
- # await zarr_v2._async_array._save_metadata(metadata_v3) + else: + array_metadata_v3 = convert_array_v2_metadata(zarr_v2.metadata) + await save_v3_metadata(zarr_v2, array_metadata_v3) -def convert_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: +def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) codecs: list[Codec] = [] @@ -135,3 +139,20 @@ def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: ) from exc return codec_v3.from_dict(numcodec_dict) + + +async def save_v3_metadata( + zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata +) -> None: + zarr_json_path = _join_paths([zarr_v2.path, ZARR_JSON]) + + if await zarr_v2.store.exists(zarr_json_path): + raise ValueError(f"{ZARR_JSON} already exists at {zarr_v2.store_path}") + + to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) + awaitables = [ + zarr_v2.store.set_if_not_exists(_join_paths([zarr_v2.path, key]), value) + for key, value in to_save.items() + ] + + await asyncio.gather(*awaitables) From 2947ce4b2fc4953252a3715a7ec941323a79b95f Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:50:50 +0100 Subject: [PATCH 07/26] create overall conversion functions for store, array or group --- .../metadata/converter/v2_v3_converter.py | 55 ++++++++++++++----- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/v2_v3_converter.py index fce2ad4654..fa9b61715f 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/v2_v3_converter.py @@ -3,6 +3,7 @@ import numcodecs.abc +import zarr from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec from zarr.codecs.blosc import BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec @@ -18,10 +19,36 @@ from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.registry import get_codec_class +from zarr.storage import StoreLike from zarr.storage._utils import _join_paths -async def convert_v2_to_v3(zarr_v2: Array | Group) -> None: +async def convert_v2_to_v3(store: StoreLike, path: str | None = None) -> None: + """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level + (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. + + Parameters + ---------- + store : StoreLike + Store or path to directory in file system or name of zip file. + path : str | None, optional + The path within the store to open, by default None + """ + + zarr_v2 = zarr.open(store=store, mode="r+", path=path) + await convert_array_or_group(zarr_v2) + + +async def convert_array_or_group(zarr_v2: Array | Group) -> None: + """Convert all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then + all arrays / groups within this group will also be converted. A zarr.json file will be created + at each level, with any V2 files (.zarray, .zattrs etc.) left as-is. 
+ + Parameters + ---------- + zarr_v2 : Array | Group + An array or group with zarr_format = 2 + """ if not zarr_v2.metadata.zarr_format == 2: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") @@ -29,18 +56,18 @@ async def convert_v2_to_v3(zarr_v2: Array | Group) -> None: group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) - await save_v3_metadata(zarr_v2, group_metadata_v3) + await _save_v3_metadata(zarr_v2, group_metadata_v3) # process members of the group for key in zarr_v2: - await convert_v2_to_v3(zarr_v2[key]) + await convert_array_or_group(zarr_v2[key]) else: - array_metadata_v3 = convert_array_v2_metadata(zarr_v2.metadata) - await save_v3_metadata(zarr_v2, array_metadata_v3) + array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) + await _save_v3_metadata(zarr_v2, array_metadata_v3) -def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: +def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) codecs: list[Codec] = [] @@ -49,7 +76,7 @@ def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: if metadata_v2.order == "F": # F is equivalent to order: n-1, ... 1, 0 codecs.append(TransposeCodec(order=list(range(len(metadata_v2.shape) - 1, -1, -1)))) - codecs.extend(convert_filters(metadata_v2)) + codecs.extend(_convert_filters(metadata_v2)) # array-bytes codecs if not isinstance(metadata_v2.dtype, HasEndianness): @@ -58,7 +85,7 @@ def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: codecs.append(BytesCodec(endian=metadata_v2.dtype.endianness)) # bytes-bytes codecs - bytes_bytes_codec = convert_compressor(metadata_v2) + bytes_bytes_codec = _convert_compressor(metadata_v2) if bytes_bytes_codec is not None: codecs.append(bytes_bytes_codec) @@ -75,11 +102,11 @@ def convert_array_v2_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: ) -def convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: +def _convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: if metadata_v2.filters is None: return [] - filters_codecs = [find_numcodecs_zarr3(filter) for filter in metadata_v2.filters] + filters_codecs = [_find_numcodecs_zarr3(filter) for filter in metadata_v2.filters] for codec in filters_codecs: if not isinstance(codec, ArrayArrayCodec): raise TypeError(f"Filter {type(codec)} is not an ArrayArrayCodec") @@ -87,7 +114,7 @@ def convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]: return cast(list[ArrayArrayCodec], filters_codecs) -def convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: +def _convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: if metadata_v2.compressor is None: return None @@ -114,7 +141,7 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: case _: # If possible, find matching numcodecs.zarr3 codec - compressor_codec = find_numcodecs_zarr3(metadata_v2.compressor) + compressor_codec = _find_numcodecs_zarr3(metadata_v2.compressor) if not isinstance(compressor_codec, BytesBytesCodec): raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec") @@ -122,7 +149,7 @@ def convert_compressor(metadata_v2: ArrayV2Metadata) -> BytesBytesCodec | None: return compressor_codec -def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: +def 
_find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: """Find matching numcodecs.zarr3 codec (if it exists)""" numcodec_name = f"numcodecs.{numcodecs_codec.codec_id}" @@ -141,7 +168,7 @@ def find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: return codec_v3.from_dict(numcodec_dict) -async def save_v3_metadata( +async def _save_v3_metadata( zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata ) -> None: zarr_json_path = _join_paths([zarr_v2.path, ZARR_JSON]) From ba8175502017d169b5d3aa300f11aea5069a48ab Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 11:56:42 +0100 Subject: [PATCH 08/26] add minimal typer cli --- pyproject.toml | 1 + src/zarr/core/metadata/converter/cli.py | 32 +++++++++++++++++++ ...{v2_v3_converter.py => converter_v2_v3.py} | 24 +++++++++----- 3 files changed, 49 insertions(+), 8 deletions(-) rename src/zarr/core/metadata/converter/{v2_v3_converter.py => converter_v2_v3.py} (88%) diff --git a/pyproject.toml b/pyproject.toml index 6c18563a1f..eb95c5dada 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,7 @@ remote = [ gpu = [ "cupy-cuda12x", ] +cli = ["typer"] # Development extras test = [ "coverage", diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index e69de29bb2..d5c693e07f 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -0,0 +1,32 @@ +from typing import Annotated + +import typer + +from zarr.core.metadata.converter.converter_v2_v3 import convert_v2_to_v3 + +app = typer.Typer() + + +@app.command() # type: ignore[misc] +def convert( + store: Annotated[ + str, + typer.Argument( + help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." + ), + ], + path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, +) -> None: + """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level + (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. + """ + convert_v2_to_v3(store=store, path=path) + + +@app.command() # type: ignore[misc] +def clear() -> None: + print("Clearing...") + + +if __name__ == "__main__": + app() diff --git a/src/zarr/core/metadata/converter/v2_v3_converter.py b/src/zarr/core/metadata/converter/converter_v2_v3.py similarity index 88% rename from src/zarr/core/metadata/converter/v2_v3_converter.py rename to src/zarr/core/metadata/converter/converter_v2_v3.py index fa9b61715f..33845a53a8 100644 --- a/src/zarr/core/metadata/converter/v2_v3_converter.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -1,5 +1,5 @@ import asyncio -from typing import cast +from typing import Any, cast import numcodecs.abc @@ -18,12 +18,15 @@ from zarr.core.group import Group, GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.core.sync import sync from zarr.registry import get_codec_class from zarr.storage import StoreLike from zarr.storage._utils import _join_paths -async def convert_v2_to_v3(store: StoreLike, path: str | None = None) -> None: +def convert_v2_to_v3( + store: StoreLike, path: str | None = None, storage_options: dict[str, Any] | None = None +) -> None: """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level (for every group / array). 
V2 files (.zarray, .zattrs etc.) will be left as-is. @@ -33,13 +36,18 @@ async def convert_v2_to_v3(store: StoreLike, path: str | None = None) -> None: Store or path to directory in file system or name of zip file. path : str | None, optional The path within the store to open, by default None + storage_options : dict | None, optional + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. """ - zarr_v2 = zarr.open(store=store, mode="r+", path=path) - await convert_array_or_group(zarr_v2) + zarr_v2 = zarr.open( + store=store, mode="r+", zarr_format=2, path=path, storage_options=storage_options + ) + convert_array_or_group(zarr_v2) -async def convert_array_or_group(zarr_v2: Array | Group) -> None: +def convert_array_or_group(zarr_v2: Array | Group) -> None: """Convert all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then all arrays / groups within this group will also be converted. A zarr.json file will be created at each level, with any V2 files (.zarray, .zattrs etc.) left as-is. @@ -56,15 +64,15 @@ async def convert_array_or_group(zarr_v2: Array | Group) -> None: group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) - await _save_v3_metadata(zarr_v2, group_metadata_v3) + sync(_save_v3_metadata(zarr_v2, group_metadata_v3)) # process members of the group for key in zarr_v2: - await convert_array_or_group(zarr_v2[key]) + convert_array_or_group(zarr_v2[key]) else: array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) - await _save_v3_metadata(zarr_v2, array_metadata_v3) + sync(_save_v3_metadata(zarr_v2, array_metadata_v3)) def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: From 67f958030f7ab57965dc03061c291dfa95d8de60 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 14:23:36 +0100 Subject: [PATCH 09/26] add initial tests for converter --- .../metadata/converter/converter_v2_v3.py | 4 +- tests/test_metadata/test_converter_v2_v3.py | 72 +++++++++++++++++++ 2 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 tests/test_metadata/test_converter_v2_v3.py diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 33845a53a8..8d84fac0d6 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -41,9 +41,7 @@ def convert_v2_to_v3( the Store constructor for that implementation. Ignored otherwise. 
""" - zarr_v2 = zarr.open( - store=store, mode="r+", zarr_format=2, path=path, storage_options=storage_options - ) + zarr_v2 = zarr.open(store=store, mode="r+", path=path, storage_options=storage_options) convert_array_or_group(zarr_v2) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py new file mode 100644 index 0000000000..de6e389dad --- /dev/null +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -0,0 +1,72 @@ +import numcodecs +import pytest +from typer.testing import CliRunner + +import zarr +from zarr.abc.store import Store +from zarr.codecs.blosc import BloscCodec +from zarr.codecs.bytes import BytesCodec +from zarr.core.chunk_grids import RegularChunkGrid +from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.dtype.npy.int import UInt16 +from zarr.core.metadata.converter.cli import app + +runner = CliRunner() + + +def test_convert_array(local_store: Store) -> None: + shape = (10, 10) + chunks = (10, 10) + dtype = "uint16" + compressors = numcodecs.Blosc(cname="zstd", clevel=3, shuffle=1) + fill_value = 2 + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + + zarr.create_array( + store=local_store, + shape=shape, + chunks=chunks, + dtype=dtype, + compressors=compressors, + zarr_format=2, + fill_value=fill_value, + attributes=attributes, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.node_type == "array" + assert metadata.shape == shape + assert metadata.chunk_grid == RegularChunkGrid(chunk_shape=chunks) + assert metadata.chunk_key_encoding == DefaultChunkKeyEncoding(separator=".") + assert metadata.data_type == UInt16("little") + assert metadata.codecs == ( + BytesCodec(endian="little"), + BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0), + ) + assert metadata.fill_value == fill_value + assert metadata.attributes == attributes + assert metadata.dimension_names is None + assert metadata.storage_transformers == () + + +@pytest.mark.parametrize("node_type", ["array", "group"]) +def test_convert_v3(local_store: Store, node_type: str) -> None: + """Attempting to convert a v3 array/group should always fail""" + + if node_type == "array": + zarr.create_array( + store=local_store, shape=(10, 10), chunks=(10, 10), zarr_format=3, dtype="uint16" + ) + else: + zarr.create_group(store=local_store, zarr_format=3) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 1 + assert isinstance(result.exception, TypeError) + assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" From 0d7c2c894901425f87163e7c54aacb174c797b2e Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 15:10:13 +0100 Subject: [PATCH 10/26] add tests for conversion of groups and nested groups and arrays --- .../metadata/converter/converter_v2_v3.py | 10 ++-- tests/test_metadata/test_converter_v2_v3.py | 56 +++++++++++++++++++ 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 8d84fac0d6..2b9cce91c4 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ 
b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -59,16 +59,18 @@ def convert_array_or_group(zarr_v2: Array | Group) -> None: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") if isinstance(zarr_v2.metadata, GroupMetadata): + # process members of the group + for key in zarr_v2: + convert_array_or_group(zarr_v2[key]) + + # write group's converted metadata group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) sync(_save_v3_metadata(zarr_v2, group_metadata_v3)) - # process members of the group - for key in zarr_v2: - convert_array_or_group(zarr_v2[key]) - else: + # write array's converted metadata array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) sync(_save_v3_metadata(zarr_v2, array_metadata_v3)) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index de6e389dad..f8ec11ffcf 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -55,6 +55,62 @@ def test_convert_array(local_store: Store) -> None: assert metadata.storage_transformers == () +def test_convert_group(local_store: Store) -> None: + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.node_type == "group" + assert metadata.attributes == attributes + assert metadata.consolidated_metadata is None + + +def test_convert_nested_groups_and_arrays(local_store: Store) -> None: + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + + # 3 levels of nested groups + group_1 = zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) + group_2 = group_1.create_group(name="group_2", attributes=attributes) + group_3 = group_2.create_group(name="group_3", attributes=attributes) + + # 1 array per group + array_1 = group_1.create_array( + name="array_1", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes + ) + array_2 = group_2.create_array( + name="array_2", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes + ) + array_3 = group_3.create_array( + name="array_3", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes + ) + + paths = [group_1.path, group_2.path, group_3.path, array_1.path, array_2.path, array_3.path] + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + + # check zarr.json were created for every group and array + total_zarr_jsons = 0 + for _, _, filenames in local_store.root.walk(): + assert "zarr.json" in filenames + total_zarr_jsons += 1 + assert total_zarr_jsons == 6 + + # Check converted zarr can be opened + metadata accessed at all levels + zarr_array = zarr.open(local_store.root, zarr_format=3) + for path in paths: + zarr_v3 = zarr_array[path] + metadata = zarr_v3.metadata + assert metadata.zarr_format == 3 + assert metadata.attributes == attributes + + @pytest.mark.parametrize("node_type", ["array", "group"]) def test_convert_v3(local_store: Store, node_type: str) -> None: """Attempting to convert a v3 array/group should always fail""" From cf395802e3351a371411471aaf3af528775aca6c Mon Sep 17 00:00:00 2001 From: Kimberly Meechan 
<24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 15:47:00 +0100 Subject: [PATCH 11/26] add tests for conversion of compressors and filters --- tests/test_metadata/test_converter_v2_v3.py | 79 +++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index f8ec11ffcf..5cb2f7cd86 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -1,11 +1,18 @@ +import lzma + import numcodecs +import numcodecs.abc import pytest +from numcodecs.zarr3 import LZMA, Delta from typer.testing import CliRunner import zarr +from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs.blosc import BloscCodec from zarr.codecs.bytes import BytesCodec +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.zstd import ZstdCodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.dtype.npy.int import UInt16 @@ -111,6 +118,78 @@ def test_convert_nested_groups_and_arrays(local_store: Store) -> None: assert metadata.attributes == attributes +@pytest.mark.parametrize( + ("compressor_v2", "compressor_v3"), + [ + ( + numcodecs.Blosc(cname="zstd", clevel=3, shuffle=1), + BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0), + ), + (numcodecs.Zstd(level=3), ZstdCodec(level=3)), + (numcodecs.GZip(level=3), GzipCodec(level=3)), + ( + numcodecs.LZMA( + format=1, check=-1, preset=None, filters=[{"id": lzma.FILTER_DELTA, "dist": 4}] + ), + LZMA(format=1, check=-1, preset=None, filters=[{"id": lzma.FILTER_DELTA, "dist": 4}]), + ), + ], + ids=["blosc", "zstd", "gzip", "numcodecs-compressor"], +) +def test_convert_compressor( + local_store: Store, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec +) -> None: + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + compressors=compressor_v2, + zarr_format=2, + fill_value=0, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.codecs == ( + BytesCodec(endian="little"), + compressor_v3, + ) + + +def test_convert_filter(local_store: Store) -> None: + filter_v2 = numcodecs.Delta(dtype=" None: """Attempting to convert a v3 array/group should always fail""" From 11499e798754b85da4ccd9fe7c6d6c831f56c528 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 16:07:41 +0100 Subject: [PATCH 12/26] test conversion of order and endianness --- tests/test_metadata/test_converter_v2_v3.py | 67 ++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 5cb2f7cd86..300a0f4a67 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -12,10 +12,11 @@ from zarr.codecs.blosc import BloscCodec from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec +from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding -from 
zarr.core.dtype.npy.int import UInt16 +from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 from zarr.core.metadata.converter.cli import app runner = CliRunner() @@ -190,6 +191,70 @@ def test_convert_filter(local_store: Store) -> None: ) +@pytest.mark.parametrize( + ("order", "expected_codecs"), + [ + ("C", (BytesCodec(endian="little"),)), + ("F", (TransposeCodec(order=(1, 0)), BytesCodec(endian="little"))), + ], +) +def test_convert_C_vs_F_order( + local_store: Store, order: str, expected_codecs: tuple[Codec] +) -> None: + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + compressors=None, + zarr_format=2, + fill_value=0, + order=order, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + + assert metadata.codecs == expected_codecs + + +@pytest.mark.parametrize( + ("dtype", "expected_data_type", "expected_codecs"), + [ + ("uint8", UInt8(), (BytesCodec(endian=None),)), + ("uint16", UInt16(), (BytesCodec(endian="little"),)), + ], + ids=["single_byte", "multi_byte"], +) +def test_convert_endian( + local_store: Store, dtype: str, expected_data_type: BaseInt, expected_codecs: tuple[Codec] +) -> None: + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype=dtype, + compressors=None, + zarr_format=2, + fill_value=0, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + assert (local_store.root / "zarr.json").exists() + + zarr_array = zarr.open(local_store.root, zarr_format=3) + metadata = zarr_array.metadata + assert metadata.zarr_format == 3 + assert metadata.data_type == expected_data_type + assert metadata.codecs == expected_codecs + + @pytest.mark.parametrize("node_type", ["array", "group"]) def test_convert_v3(local_store: Store, node_type: str) -> None: """Attempting to convert a v3 array/group should always fail""" From 90b0996bd95a1446ef762d7802f166d4b52290d8 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Thu, 3 Jul 2025 16:26:52 +0100 Subject: [PATCH 13/26] add tests for edge cases of incorrect codecs --- tests/test_metadata/test_converter_v2_v3.py | 64 +++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 300a0f4a67..5300fc6656 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -270,3 +270,67 @@ def test_convert_v3(local_store: Store, node_type: str) -> None: assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" + + +def test_convert_unknown_codec(local_store: Store) -> None: + """Attempting to convert a codec without a v3 equivalent should always fail""" + + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + filters=[numcodecs.Categorize(labels=["a", "b"], dtype=object)], + zarr_format=2, + fill_value=0, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 1 + assert isinstance(result.exception, ValueError) + assert ( + str(result.exception) == "Couldn't find corresponding numcodecs.zarr3 codec for 
categorize" + ) + + +def test_convert_incorrect_filter(local_store: Store) -> None: + """Attempting to convert a filter (which is the wrong type of codec) should always fail""" + + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + filters=[numcodecs.Zstd(level=3)], + zarr_format=2, + fill_value=0, + ) + + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 1 + assert isinstance(result.exception, TypeError) + assert ( + str(result.exception) == "Filter is not an ArrayArrayCodec" + ) + + +def test_convert_incorrect_compressor(local_store: Store) -> None: + """Attempting to convert a compressor (which is the wrong type of codec) should always fail""" + + zarr.create_array( + store=local_store, + shape=(10, 10), + chunks=(10, 10), + dtype="uint16", + compressors=numcodecs.Delta(dtype=" is not a BytesBytesCodec" + ) From 85159bb788fa8f2fcd01d68b286f3e666d6dbf39 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Fri, 4 Jul 2025 10:45:07 +0100 Subject: [PATCH 14/26] add tests for / separator --- tests/test_metadata/test_converter_v2_v3.py | 50 ++++++++++++++------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 5300fc6656..b7884fe4d4 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -1,4 +1,5 @@ import lzma +from typing import Any import numcodecs import numcodecs.abc @@ -79,26 +80,38 @@ def test_convert_group(local_store: Store) -> None: assert metadata.consolidated_metadata is None -def test_convert_nested_groups_and_arrays(local_store: Store) -> None: - attributes = {"baz": 42, "qux": [1, 4, 7, 12]} +def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: + """Create a zarr with nested groups / arrays, returning the paths to all.""" # 3 levels of nested groups - group_1 = zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) + group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) + group_1 = group_0.create_group(name="group_1", attributes=attributes) group_2 = group_1.create_group(name="group_2", attributes=attributes) - group_3 = group_2.create_group(name="group_3", attributes=attributes) + paths = [group_0.path, group_1.path, group_2.path] # 1 array per group - array_1 = group_1.create_array( - name="array_1", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes - ) - array_2 = group_2.create_array( - name="array_2", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes - ) - array_3 = group_3.create_array( - name="array_3", shape=(10, 10), chunks=(10, 10), dtype="uint16", attributes=attributes - ) + for i, group in enumerate([group_0, group_1, group_2]): + array = group.create_array( + name=f"array_{i}", + shape=(10, 10), + chunks=(5, 5), + dtype="uint16", + attributes=attributes, + chunk_key_encoding={"name": "v2", "separator": separator}, + ) + array[:] = 1 + paths.append(array.path) - paths = [group_1.path, group_2.path, group_3.path, array_1.path, array_2.path, array_3.path] + return paths + + +@pytest.mark.parametrize("separator", [".", "/"]) +def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> None: + """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays + (including when there are additional dirs due 
to using a / separator)""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + paths = create_nested_zarr(local_store, attributes, separator) result = runner.invoke(app, ["convert", str(local_store.root)]) assert result.exit_code == 0 @@ -106,8 +119,13 @@ def test_convert_nested_groups_and_arrays(local_store: Store) -> None: # check zarr.json were created for every group and array total_zarr_jsons = 0 for _, _, filenames in local_store.root.walk(): - assert "zarr.json" in filenames - total_zarr_jsons += 1 + # group / array directories + if ".zattrs" in filenames: + assert "zarr.json" in filenames + total_zarr_jsons += 1 + # other directories e.g. for chunks when separator is / + else: + assert "zarr.json" not in filenames assert total_zarr_jsons == 6 # Check converted zarr can be opened + metadata accessed at all levels From 53ba1669323872d9af3711367927ecd8b282c9c9 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 7 Jul 2025 11:36:32 +0100 Subject: [PATCH 15/26] draft of metadata remover and add test for internal paths --- .../metadata/converter/converter_v2_v3.py | 67 ++++++++++++++++--- tests/test_metadata/test_converter_v2_v3.py | 24 +++++++ 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 2b9cce91c4..4c7b1e6048 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -13,7 +13,14 @@ from zarr.core.array import Array from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding -from zarr.core.common import ZARR_JSON +from zarr.core.common import ( + ZARR_JSON, + ZARRAY_JSON, + ZATTRS_JSON, + ZGROUP_JSON, + ZMETADATA_V2_JSON, + ZarrFormat, +) from zarr.core.dtype.common import HasEndianness from zarr.core.group import Group, GroupMetadata from zarr.core.metadata.v2 import ArrayV2Metadata @@ -21,7 +28,7 @@ from zarr.core.sync import sync from zarr.registry import get_codec_class from zarr.storage import StoreLike -from zarr.storage._utils import _join_paths +from zarr.storage._common import make_store_path def convert_v2_to_v3( @@ -75,6 +82,50 @@ def convert_array_or_group(zarr_v2: Array | Group) -> None: sync(_save_v3_metadata(zarr_v2, array_metadata_v3)) +async def remove_metadata( + store: StoreLike, + zarr_format: ZarrFormat, + path: str | None = None, + storage_options: dict[str, Any] | None = None, +) -> None: + """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. + Note - this will remove metadata files at all levels of the hierarchy (every group and array). + + Parameters + ---------- + store : StoreLike + Store or path to directory in file system or name of zip file. + zarr_format : ZarrFormat + Which format's metadata to remove - 2 or 3. + path : str | None, optional + The path within the store to open, by default None + storage_options : dict | None, optional + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. 
+ """ + store_path = await make_store_path(store, mode="r+", storage_options=storage_options) + if not store_path.store.supports_deletes: + raise ValueError("Store must support deletes to remove metadata") + + if path is None: + prefix = "" + else: + prefix = path + + if zarr_format == 2: + metadata_files = [ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON] + else: + metadata_files = [ZARR_JSON] + + awaitables = [ + (store_path / file_path).delete() + async for file_path in store_path.store.list_prefix(prefix) + if file_path.split("/")[-1] in metadata_files + ] + + await asyncio.gather(*awaitables) + + def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) @@ -179,15 +230,9 @@ def _find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: async def _save_v3_metadata( zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata ) -> None: - zarr_json_path = _join_paths([zarr_v2.path, ZARR_JSON]) - - if await zarr_v2.store.exists(zarr_json_path): + zarr_json_path = zarr_v2.store_path / ZARR_JSON + if await zarr_json_path.exists(): raise ValueError(f"{ZARR_JSON} already exists at {zarr_v2.store_path}") to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) - awaitables = [ - zarr_v2.store.set_if_not_exists(_join_paths([zarr_v2.path, key]), value) - for key, value in to_save.items() - ] - - await asyncio.gather(*awaitables) + await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON]) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index b7884fe4d4..e0348be777 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -137,6 +137,30 @@ def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> assert metadata.attributes == attributes +@pytest.mark.parametrize("separator", [".", "/"]) +def test_convert_nested_with_path(local_store: Store, separator: str) -> None: + """Test that only arrays/groups within group_1 are converted (+ no other files in store)""" + + create_nested_zarr(local_store, {}, separator) + + result = runner.invoke(app, ["convert", str(local_store.root), "--path", "group_1"]) + assert result.exit_code == 0 + + group_path = local_store.root / "group_1" + + total_zarr_jsons = 0 + for dirpath, _, filenames in local_store.root.walk(): + inside_group = (dirpath == group_path) or (group_path in dirpath.parents) + if (".zattrs" in filenames) and inside_group: + # group / array directories inside the group + assert "zarr.json" in filenames + total_zarr_jsons += 1 + else: + assert "zarr.json" not in filenames + + assert total_zarr_jsons == 4 + + @pytest.mark.parametrize( ("compressor_v2", "compressor_v3"), [ From d4cdc045a6634840fcf8944054b964d96c139094 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 7 Jul 2025 15:24:29 +0100 Subject: [PATCH 16/26] add clear command to cli with tests --- src/zarr/core/metadata/converter/cli.py | 28 +++- tests/test_metadata/test_converter_v2_v3.py | 153 ++++++++++++++++---- 2 files changed, 152 insertions(+), 29 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index d5c693e07f..bfa2c6cf04 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -1,8 +1,9 @@ -from typing import Annotated +from typing import Annotated, Literal, 
cast import typer -from zarr.core.metadata.converter.converter_v2_v3 import convert_v2_to_v3 +from zarr.core.metadata.converter.converter_v2_v3 import convert_v2_to_v3, remove_metadata +from zarr.core.sync import sync app = typer.Typer() @@ -24,8 +25,27 @@ def convert( @app.command() # type: ignore[misc] -def clear() -> None: - print("Clearing...") +def clear( + store: Annotated[ + str, + typer.Argument( + help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..." + ), + ], + zarr_format: Annotated[ + int, + typer.Argument( + help="Which format's metadata to remove - 2 or 3.", + min=2, + max=3, + ), + ], + path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, +) -> None: + """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. + Note - this will remove metadata files at all levels of the hierarchy (every group and array). + """ + sync(remove_metadata(store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path)) if __name__ == "__main__": diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index e0348be777..21b3e94c17 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -1,4 +1,5 @@ import lzma +from pathlib import Path from typing import Any import numcodecs @@ -23,6 +24,93 @@ runner = CliRunner() +def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: + """Create a zarr with nested groups / arrays, returning the paths to all.""" + + # 3 levels of nested groups + group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) + group_1 = group_0.create_group(name="group_1", attributes=attributes) + group_2 = group_1.create_group(name="group_2", attributes=attributes) + paths = [group_0.path, group_1.path, group_2.path] + + # 1 array per group + for i, group in enumerate([group_0, group_1, group_2]): + array = group.create_array( + name=f"array_{i}", + shape=(10, 10), + chunks=(5, 5), + dtype="uint16", + attributes=attributes, + chunk_key_encoding={"name": "v2", "separator": separator}, + ) + array[:] = 1 + paths.append(array.path) + + return paths + + +@pytest.fixture +def expected_paths_no_metadata() -> list[Path]: + """Expected paths from create_nested_zarr, with no metadata files""" + return [ + Path("array_0"), + Path("array_0/0.0"), + Path("array_0/0.1"), + Path("array_0/1.0"), + Path("array_0/1.1"), + Path("group_1"), + Path("group_1/array_1"), + Path("group_1/array_1/0.0"), + Path("group_1/array_1/0.1"), + Path("group_1/array_1/1.0"), + Path("group_1/array_1/1.1"), + Path("group_1/group_2"), + Path("group_1/group_2/array_2"), + Path("group_1/group_2/array_2/0.0"), + Path("group_1/group_2/array_2/0.1"), + Path("group_1/group_2/array_2/1.0"), + Path("group_1/group_2/array_2/1.1"), + ] + + +@pytest.fixture +def expected_paths_v3_metadata(expected_paths_no_metadata: list[Path]) -> list[Path]: + """Expected paths from create_nested_zarr, with v3 metadata files""" + v3_paths = [ + Path("array_0/zarr.json"), + Path("group_1/array_1/zarr.json"), + Path("group_1/group_2/array_2/zarr.json"), + Path("zarr.json"), + Path("group_1/zarr.json"), + Path("group_1/group_2/zarr.json"), + ] + expected_paths_no_metadata.extend(v3_paths) + + return sorted(expected_paths_no_metadata) + + +@pytest.fixture +def expected_paths_v2_metadata(expected_paths_no_metadata: list[Path]) 
-> list[Path]: + """Expected paths from create_nested_zarr, with v2 metadata files""" + v2_paths = [ + Path("array_0/.zarray"), + Path("array_0/.zattrs"), + Path("group_1/array_1/.zarray"), + Path("group_1/array_1/.zattrs"), + Path("group_1/group_2/array_2/.zarray"), + Path("group_1/group_2/array_2/.zattrs"), + Path(".zgroup"), + Path(".zattrs"), + Path("group_1/.zgroup"), + Path("group_1/.zattrs"), + Path("group_1/group_2/.zgroup"), + Path("group_1/group_2/.zattrs"), + ] + expected_paths_no_metadata.extend(v2_paths) + + return sorted(expected_paths_no_metadata) + + def test_convert_array(local_store: Store) -> None: shape = (10, 10) chunks = (10, 10) @@ -80,31 +168,6 @@ def test_convert_group(local_store: Store) -> None: assert metadata.consolidated_metadata is None -def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: - """Create a zarr with nested groups / arrays, returning the paths to all.""" - - # 3 levels of nested groups - group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) - group_1 = group_0.create_group(name="group_1", attributes=attributes) - group_2 = group_1.create_group(name="group_2", attributes=attributes) - paths = [group_0.path, group_1.path, group_2.path] - - # 1 array per group - for i, group in enumerate([group_0, group_1, group_2]): - array = group.create_array( - name=f"array_{i}", - shape=(10, 10), - chunks=(5, 5), - dtype="uint16", - attributes=attributes, - chunk_key_encoding={"name": "v2", "separator": separator}, - ) - array[:] = 1 - paths.append(array.path) - - return paths - - @pytest.mark.parametrize("separator", [".", "/"]) def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> None: """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays @@ -376,3 +439,43 @@ def test_convert_incorrect_compressor(local_store: Store) -> None: str(result.exception) == "Compressor is not a BytesBytesCodec" ) + + +def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list[Path]) -> None: + """Test all v2 metadata can be removed (leaving all groups / arrays as-is)""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".") + + result = runner.invoke(app, ["clear", str(local_store.root), "2"]) + assert result.exit_code == 0 + + # check metadata files removed, but all groups / arrays still remain + paths = sorted(local_store.root.rglob("*")) + + expected_paths = [local_store.root / p for p in expected_paths_no_metadata] + assert paths == expected_paths + + +@pytest.mark.parametrize( + ("zarr_format", "expected_paths"), + [("2", "expected_paths_v3_metadata"), ("3", "expected_paths_v2_metadata")], +) +def test_remove_metadata_after_conversion( + local_store: Store, request: pytest.FixtureRequest, zarr_format: str, expected_paths: list[Path] +) -> None: + """Test all v2/v3 metadata can be removed after metadata conversion (all groups / arrays / + metadata of other versions should remain as-is)""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".") + + # convert v2 metadata to v3 (so now both v2 and v3 metadata present!), then remove either the v2 or v3 metadata + result = runner.invoke(app, ["convert", str(local_store.root)]) + assert result.exit_code == 0 + result = runner.invoke(app, ["clear", str(local_store.root), zarr_format]) + assert result.exit_code == 0 + + paths = sorted(local_store.root.rglob("*")) + expected_paths = 
[local_store.root / p for p in request.getfixturevalue(expected_paths)] + assert paths == expected_paths From dfdc729fa25bd81a762cd48ce8e5348ea020527e Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 7 Jul 2025 15:59:40 +0100 Subject: [PATCH 17/26] add test for metadata removal with path# --- tests/test_metadata/test_converter_v2_v3.py | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 21b3e94c17..05592b0f55 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -457,6 +457,29 @@ def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list assert paths == expected_paths +def test_remove_metadata_v2_with_path( + local_store: Store, expected_paths_no_metadata: list[Path] +) -> None: + """Test only v2 metadata within the given path (group_1) is removed""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".") + + result = runner.invoke(app, ["clear", str(local_store.root), "2", "--path", "group_1"]) + assert result.exit_code == 0 + + # check all metadata files inside group_1 are removed (.zattrs / .zgroup / .zarray should remain only inside the top + # group) + paths = sorted(local_store.root.rglob("*")) + + expected_paths = [local_store.root / p for p in expected_paths_no_metadata] + expected_paths.append(local_store.root / ".zattrs") + expected_paths.append(local_store.root / ".zgroup") + expected_paths.append(local_store.root / "array_0" / ".zarray") + expected_paths.append(local_store.root / "array_0" / ".zattrs") + assert paths == sorted(expected_paths) + + @pytest.mark.parametrize( ("zarr_format", "expected_paths"), [("2", "expected_paths_v3_metadata"), ("3", "expected_paths_v2_metadata")], From ad6099183466fc932674acb02fd6992dedcb8839 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 7 Jul 2025 17:37:46 +0100 Subject: [PATCH 18/26] add verbose logging option --- src/zarr/core/metadata/converter/cli.py | 20 +++++++++++++++++++ .../metadata/converter/converter_v2_v3.py | 14 ++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index bfa2c6cf04..8339e502c2 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -1,3 +1,4 @@ +import logging from typing import Annotated, Literal, cast import typer @@ -48,5 +49,24 @@ def clear( sync(remove_metadata(store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path)) +@app.callback() # type: ignore[misc] +def main( + verbose: Annotated[ + bool, + typer.Option( + help="enable verbose logging - will print info about metadata files being deleted / saved." + ), + ] = False, +) -> None: + """ + Convert metadata from v2 to v3. See available commands below - access help for individual commands with + cli.py COMMAND --help. 
+ """ + if verbose: + lvl = logging.INFO + fmt = "%(message)s" + logging.basicConfig(level=lvl, format=fmt) + + if __name__ == "__main__": app() diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 4c7b1e6048..03793c7c34 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -1,4 +1,5 @@ import asyncio +import logging from typing import Any, cast import numcodecs.abc @@ -30,6 +31,8 @@ from zarr.storage import StoreLike from zarr.storage._common import make_store_path +logger = logging.getLogger(__name__) + def convert_v2_to_v3( store: StoreLike, path: str | None = None, storage_options: dict[str, Any] | None = None @@ -117,11 +120,11 @@ async def remove_metadata( else: metadata_files = [ZARR_JSON] - awaitables = [ - (store_path / file_path).delete() - async for file_path in store_path.store.list_prefix(prefix) - if file_path.split("/")[-1] in metadata_files - ] + awaitables = [] + async for file_path in store_path.store.list_prefix(prefix): + if file_path.split("/")[-1] in metadata_files: + logger.info("Deleting metadata at %s", store_path / file_path) + awaitables.append((store_path / file_path).delete()) await asyncio.gather(*awaitables) @@ -234,5 +237,6 @@ async def _save_v3_metadata( if await zarr_json_path.exists(): raise ValueError(f"{ZARR_JSON} already exists at {zarr_v2.store_path}") + logger.info("Saving metadata to %s", zarr_json_path) to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON]) From 66bae0d02e9080cba34b9a4af6ac4cd475e4746c Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 8 Jul 2025 11:43:40 +0100 Subject: [PATCH 19/26] add dry run option to cli --- src/zarr/core/metadata/converter/cli.py | 52 ++++++++++++++++--- .../metadata/converter/converter_v2_v3.py | 32 ++++++++---- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 8339e502c2..24a3de7937 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -8,6 +8,21 @@ app = typer.Typer() +logger = logging.getLogger(__name__) + + +def _set_logging_config(verbose: bool) -> None: + if verbose: + lvl = logging.INFO + else: + lvl = logging.WARNING + fmt = "%(message)s" + logging.basicConfig(level=lvl, format=fmt) + + +def _set_verbose_level() -> None: + logging.getLogger().setLevel(logging.INFO) + @app.command() # type: ignore[misc] def convert( @@ -18,11 +33,23 @@ def convert( ), ], path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, + dry_run: Annotated[ + bool, + typer.Option( + help="Enable a dry-run: files that would be converted are logged, but no new files are actually created." + ), + ] = False, ) -> None: """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. """ - convert_v2_to_v3(store=store, path=path) + if dry_run: + _set_verbose_level() + logger.info( + "Dry run enabled - no new files will be created. 
Log of files that would be created on a real run:" + ) + + convert_v2_to_v3(store=store, path=path, dry_run=dry_run) @app.command() # type: ignore[misc] @@ -42,11 +69,27 @@ def clear( ), ], path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None, + dry_run: Annotated[ + bool, + typer.Option( + help="Enable a dry-run: files that would be deleted are logged, but no files are actually removed." + ), + ] = False, ) -> None: """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. Note - this will remove metadata files at all levels of the hierarchy (every group and array). """ - sync(remove_metadata(store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path)) + if dry_run: + _set_verbose_level() + logger.info( + "Dry run enabled - no files will be deleted. Log of files that would be deleted on a real run:" + ) + + sync( + remove_metadata( + store=store, zarr_format=cast(Literal[2, 3], zarr_format), path=path, dry_run=dry_run + ) + ) @app.callback() # type: ignore[misc] @@ -62,10 +105,7 @@ def main( Convert metadata from v2 to v3. See available commands below - access help for individual commands with cli.py COMMAND --help. """ - if verbose: - lvl = logging.INFO - fmt = "%(message)s" - logging.basicConfig(level=lvl, format=fmt) + _set_logging_config(verbose) if __name__ == "__main__": diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index 03793c7c34..f5d0e8f010 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -35,7 +35,10 @@ def convert_v2_to_v3( - store: StoreLike, path: str | None = None, storage_options: dict[str, Any] | None = None + store: StoreLike, + path: str | None = None, + storage_options: dict[str, Any] | None = None, + dry_run: bool = False, ) -> None: """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is. @@ -49,13 +52,15 @@ def convert_v2_to_v3( storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to the Store constructor for that implementation. Ignored otherwise. + dry_run : bool, optional + Enable a 'dry run' - files that would be created are logged, but no files are actually created. """ zarr_v2 = zarr.open(store=store, mode="r+", path=path, storage_options=storage_options) - convert_array_or_group(zarr_v2) + convert_array_or_group(zarr_v2, dry_run=dry_run) -def convert_array_or_group(zarr_v2: Array | Group) -> None: +def convert_array_or_group(zarr_v2: Array | Group, dry_run: bool = False) -> None: """Convert all v2 metadata in a zarr array/group to v3. Note - if a group is provided, then all arrays / groups within this group will also be converted. A zarr.json file will be created at each level, with any V2 files (.zarray, .zattrs etc.) left as-is. @@ -64,6 +69,8 @@ def convert_array_or_group(zarr_v2: Array | Group) -> None: ---------- zarr_v2 : Array | Group An array or group with zarr_format = 2 + dry_run : bool, optional + Enable a 'dry run' - files that would be created are logged, but no files are actually created. 
""" if not zarr_v2.metadata.zarr_format == 2: raise TypeError("Only arrays / groups with zarr v2 metadata can be converted") @@ -71,18 +78,18 @@ def convert_array_or_group(zarr_v2: Array | Group) -> None: if isinstance(zarr_v2.metadata, GroupMetadata): # process members of the group for key in zarr_v2: - convert_array_or_group(zarr_v2[key]) + convert_array_or_group(zarr_v2[key], dry_run=dry_run) # write group's converted metadata group_metadata_v3 = GroupMetadata( attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None ) - sync(_save_v3_metadata(zarr_v2, group_metadata_v3)) + sync(_save_v3_metadata(zarr_v2, group_metadata_v3, dry_run=dry_run)) else: # write array's converted metadata array_metadata_v3 = _convert_array_metadata(zarr_v2.metadata) - sync(_save_v3_metadata(zarr_v2, array_metadata_v3)) + sync(_save_v3_metadata(zarr_v2, array_metadata_v3, dry_run=dry_run)) async def remove_metadata( @@ -90,6 +97,7 @@ async def remove_metadata( zarr_format: ZarrFormat, path: str | None = None, storage_options: dict[str, Any] | None = None, + dry_run: bool = False, ) -> None: """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr. Note - this will remove metadata files at all levels of the hierarchy (every group and array). @@ -105,6 +113,8 @@ async def remove_metadata( storage_options : dict | None, optional If the store is backed by an fsspec-based implementation, then this dict will be passed to the Store constructor for that implementation. Ignored otherwise. + dry_run : bool, optional + Enable a 'dry run' - files that would be deleted are logged, but no files are actually removed. """ store_path = await make_store_path(store, mode="r+", storage_options=storage_options) if not store_path.store.supports_deletes: @@ -124,7 +134,9 @@ async def remove_metadata( async for file_path in store_path.store.list_prefix(prefix): if file_path.split("/")[-1] in metadata_files: logger.info("Deleting metadata at %s", store_path / file_path) - awaitables.append((store_path / file_path).delete()) + + if not dry_run: + awaitables.append((store_path / file_path).delete()) await asyncio.gather(*awaitables) @@ -231,7 +243,7 @@ def _find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec: async def _save_v3_metadata( - zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata + zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata, dry_run: bool = False ) -> None: zarr_json_path = zarr_v2.store_path / ZARR_JSON if await zarr_json_path.exists(): @@ -239,4 +251,6 @@ async def _save_v3_metadata( logger.info("Saving metadata to %s", zarr_json_path) to_save = metadata_v3.to_buffer_dict(default_buffer_prototype()) - await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON]) + + if not dry_run: + await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON]) From 97df9bf0e29999ddfac06bbe11391cb13f4da023 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Tue, 8 Jul 2025 11:58:12 +0100 Subject: [PATCH 20/26] add test for dry-run --- tests/test_metadata/test_converter_v2_v3.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 05592b0f55..029453cfeb 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -502,3 +502,24 @@ def test_remove_metadata_after_conversion( paths = 
sorted(local_store.root.rglob("*")) expected_paths = [local_store.root / p for p in request.getfixturevalue(expected_paths)] assert paths == expected_paths + + +@pytest.mark.parametrize("cli_command", ["convert", "clear"]) +def test_dry_run( + local_store: Store, cli_command: str, expected_paths_v2_metadata: list[Path] +) -> None: + """Test that all files are un-changed after a dry run""" + + attributes = {"baz": 42, "qux": [1, 4, 7, 12]} + create_nested_zarr(local_store, attributes, ".") + + if cli_command == "convert": + result = runner.invoke(app, ["convert", str(local_store.root), "--dry-run"]) + else: + result = runner.invoke(app, ["clear", str(local_store.root), "2", "--dry-run"]) + + assert result.exit_code == 0 + + paths = sorted(local_store.root.rglob("*")) + expected_paths = [local_store.root / p for p in expected_paths_v2_metadata] + assert paths == expected_paths From 42e0435ad7c4f3321fd7797b060160ea447b79e6 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 9 Jul 2025 12:39:58 +0100 Subject: [PATCH 21/26] add zarr-converter script and enable cli dep in tests --- pyproject.toml | 5 ++- src/zarr/core/metadata/converter/cli.py | 2 +- tests/conftest.py | 23 ++++++++++ tests/test_codec_entrypoints.py | 18 -------- tests/test_dtype_registry.py | 22 ---------- tests/test_metadata/test_converter_v2_v3.py | 47 ++++++++++++--------- 6 files changed, 54 insertions(+), 63 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index eb95c5dada..6273b3ca58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,6 +110,9 @@ docs = [ 'astroid<4' ] +[project.scripts] +zarr-converter = "zarr.core.metadata.converter.cli:app" + [project.urls] "Bug Tracker" = "https://github.com/zarr-developers/zarr-python/issues" @@ -156,7 +159,7 @@ deps = ["minimal", "optional"] [tool.hatch.envs.test.overrides] matrix.deps.dependencies = [ - {value = "zarr[remote, remote_tests, test, optional]", if = ["optional"]} + {value = "zarr[remote, remote_tests, test, optional, cli]", if = ["optional"]} ] [tool.hatch.envs.test.scripts] diff --git a/src/zarr/core/metadata/converter/cli.py b/src/zarr/core/metadata/converter/cli.py index 24a3de7937..21ffaef20f 100644 --- a/src/zarr/core/metadata/converter/cli.py +++ b/src/zarr/core/metadata/converter/cli.py @@ -103,7 +103,7 @@ def main( ) -> None: """ Convert metadata from v2 to v3. See available commands below - access help for individual commands with - cli.py COMMAND --help. + zarr-converter COMMAND --help. 
""" _set_logging_config(verbose) diff --git a/tests/conftest.py b/tests/conftest.py index 4d300a1fd4..d8262ab086 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ import os import pathlib +import sys from dataclasses import dataclass, field from typing import TYPE_CHECKING @@ -10,6 +11,7 @@ import pytest from hypothesis import HealthCheck, Verbosity, settings +import zarr.registry from zarr import AsyncGroup, config from zarr.abc.store import Store from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation @@ -175,6 +177,27 @@ def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat: raise ValueError(msg) +def _clear_registries() -> None: + registries = zarr.registry._collect_entrypoints() + for registry in registries: + registry.lazy_load_list.clear() + + +@pytest.fixture +def set_path() -> Generator[None, None, None]: + tests_dir = str(pathlib.Path(__file__).parent.absolute()) + sys.path.append(tests_dir) + _clear_registries() + zarr.registry._collect_entrypoints() + + yield + + sys.path.remove(tests_dir) + _clear_registries() + zarr.registry._collect_entrypoints() + config.reset() + + def pytest_addoption(parser: Any) -> None: parser.addoption( "--run-slow-hypothesis", diff --git a/tests/test_codec_entrypoints.py b/tests/test_codec_entrypoints.py index e1ef027dd4..fc7b79fe54 100644 --- a/tests/test_codec_entrypoints.py +++ b/tests/test_codec_entrypoints.py @@ -1,26 +1,8 @@ -import os.path -import sys -from collections.abc import Generator - import pytest import zarr.registry from zarr import config -here = os.path.abspath(os.path.dirname(__file__)) - - -@pytest.fixture -def set_path() -> Generator[None, None, None]: - sys.path.append(here) - zarr.registry._collect_entrypoints() - yield - sys.path.remove(here) - registries = zarr.registry._collect_entrypoints() - for registry in registries: - registry.lazy_load_list.clear() - config.reset() - @pytest.mark.usefixtures("set_path") @pytest.mark.parametrize("codec_name", ["TestEntrypointCodec", "TestEntrypointGroup.Codec"]) diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py index d4e37440a7..c40d4d98b1 100644 --- a/tests/test_dtype_registry.py +++ b/tests/test_dtype_registry.py @@ -1,16 +1,12 @@ from __future__ import annotations import re -import sys -from pathlib import Path from typing import TYPE_CHECKING, Any, get_args import numpy as np import pytest -import zarr from tests.conftest import skip_object_dtype -from zarr.core.config import config from zarr.core.dtype import ( AnyDType, Bool, @@ -29,8 +25,6 @@ ) if TYPE_CHECKING: - from collections.abc import Generator - from zarr.core.common import ZarrFormat from .test_dtype.conftest import zdtype_examples @@ -147,22 +141,6 @@ def test_match_dtype_unique( data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format) -# this is copied from the registry tests -- we should deduplicate -here = str(Path(__file__).parent.absolute()) - - -@pytest.fixture -def set_path() -> Generator[None, None, None]: - sys.path.append(here) - zarr.registry._collect_entrypoints() - yield - sys.path.remove(here) - registries = zarr.registry._collect_entrypoints() - for registry in registries: - registry.lazy_load_list.clear() - config.reset() - - @pytest.mark.usefixtures("set_path") def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None: from package_with_entrypoint import TestDataType diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 029453cfeb..efc831ff28 100644 --- 
a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -6,7 +6,6 @@ import numcodecs.abc import pytest from numcodecs.zarr3 import LZMA, Delta -from typer.testing import CliRunner import zarr from zarr.abc.codec import Codec @@ -19,9 +18,15 @@ from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 -from zarr.core.metadata.converter.cli import app -runner = CliRunner() +typer_testing = pytest.importorskip( + "typer.testing", reason="optional cli dependencies aren't installed" +) +cli = pytest.importorskip( + "zarr.core.metadata.converter.cli", reason="optional cli dependencies aren't installed" +) + +runner = typer_testing.CliRunner() def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: @@ -130,7 +135,7 @@ def test_convert_array(local_store: Store) -> None: attributes=attributes, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -156,7 +161,7 @@ def test_convert_group(local_store: Store) -> None: attributes = {"baz": 42, "qux": [1, 4, 7, 12]} zarr.create_group(store=local_store, zarr_format=2, attributes=attributes) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -176,7 +181,7 @@ def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> attributes = {"baz": 42, "qux": [1, 4, 7, 12]} paths = create_nested_zarr(local_store, attributes, separator) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 # check zarr.json were created for every group and array @@ -206,7 +211,7 @@ def test_convert_nested_with_path(local_store: Store, separator: str) -> None: create_nested_zarr(local_store, {}, separator) - result = runner.invoke(app, ["convert", str(local_store.root), "--path", "group_1"]) + result = runner.invoke(cli.app, ["convert", str(local_store.root), "--path", "group_1"]) assert result.exit_code == 0 group_path = local_store.root / "group_1" @@ -255,7 +260,7 @@ def test_convert_compressor( fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -283,7 +288,7 @@ def test_convert_filter(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -317,7 +322,7 @@ def test_convert_C_vs_F_order( order=order, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert (local_store.root / "zarr.json").exists() @@ -349,7 +354,7 @@ def test_convert_endian( fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 assert 
(local_store.root / "zarr.json").exists() @@ -371,7 +376,7 @@ def test_convert_v3(local_store: Store, node_type: str) -> None: else: zarr.create_group(store=local_store, zarr_format=3) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted" @@ -390,7 +395,7 @@ def test_convert_unknown_codec(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, ValueError) assert ( @@ -411,7 +416,7 @@ def test_convert_incorrect_filter(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert ( @@ -432,7 +437,7 @@ def test_convert_incorrect_compressor(local_store: Store) -> None: fill_value=0, ) - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 1 assert isinstance(result.exception, TypeError) assert ( @@ -447,7 +452,7 @@ def test_remove_metadata_v2(local_store: Store, expected_paths_no_metadata: list attributes = {"baz": 42, "qux": [1, 4, 7, 12]} create_nested_zarr(local_store, attributes, ".") - result = runner.invoke(app, ["clear", str(local_store.root), "2"]) + result = runner.invoke(cli.app, ["clear", str(local_store.root), "2"]) assert result.exit_code == 0 # check metadata files removed, but all groups / arrays still remain @@ -465,7 +470,7 @@ def test_remove_metadata_v2_with_path( attributes = {"baz": 42, "qux": [1, 4, 7, 12]} create_nested_zarr(local_store, attributes, ".") - result = runner.invoke(app, ["clear", str(local_store.root), "2", "--path", "group_1"]) + result = runner.invoke(cli.app, ["clear", str(local_store.root), "2", "--path", "group_1"]) assert result.exit_code == 0 # check all metadata files inside group_1 are removed (.zattrs / .zgroup / .zarray should remain only inside the top @@ -494,9 +499,9 @@ def test_remove_metadata_after_conversion( create_nested_zarr(local_store, attributes, ".") # convert v2 metadata to v3 (so now both v2 and v3 metadata present!), then remove either the v2 or v3 metadata - result = runner.invoke(app, ["convert", str(local_store.root)]) + result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 - result = runner.invoke(app, ["clear", str(local_store.root), zarr_format]) + result = runner.invoke(cli.app, ["clear", str(local_store.root), zarr_format]) assert result.exit_code == 0 paths = sorted(local_store.root.rglob("*")) @@ -514,9 +519,9 @@ def test_dry_run( create_nested_zarr(local_store, attributes, ".") if cli_command == "convert": - result = runner.invoke(app, ["convert", str(local_store.root), "--dry-run"]) + result = runner.invoke(cli.app, ["convert", str(local_store.root), "--dry-run"]) else: - result = runner.invoke(app, ["clear", str(local_store.root), "2", "--dry-run"]) + result = runner.invoke(cli.app, ["clear", str(local_store.root), "2", "--dry-run"]) assert result.exit_code == 0 From 9e20b3901325f6c21e3615de1a80f144581d7355 Mon Sep 17 00:00:00 2001 From: 
Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 9 Jul 2025 17:40:31 +0100 Subject: [PATCH 22/26] use v2 chunk key encoding type --- src/zarr/core/metadata/converter/converter_v2_v3.py | 4 ++-- tests/test_metadata/test_converter_v2_v3.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/metadata/converter/converter_v2_v3.py b/src/zarr/core/metadata/converter/converter_v2_v3.py index f5d0e8f010..e2895c757b 100644 --- a/src/zarr/core/metadata/converter/converter_v2_v3.py +++ b/src/zarr/core/metadata/converter/converter_v2_v3.py @@ -13,7 +13,7 @@ from zarr.codecs.zstd import ZstdCodec from zarr.core.array import Array from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.common import ( ZARR_JSON, ZARRAY_JSON, @@ -142,7 +142,7 @@ async def remove_metadata( def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata: - chunk_key_encoding = DefaultChunkKeyEncoding(separator=metadata_v2.dimension_separator) + chunk_key_encoding = V2ChunkKeyEncoding(separator=metadata_v2.dimension_separator) codecs: list[Codec] = [] diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index efc831ff28..11083570c1 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -16,7 +16,7 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.chunk_grids import RegularChunkGrid -from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding +from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding from zarr.core.dtype.npy.int import BaseInt, UInt8, UInt16 typer_testing = pytest.importorskip( @@ -145,7 +145,7 @@ def test_convert_array(local_store: Store) -> None: assert metadata.node_type == "array" assert metadata.shape == shape assert metadata.chunk_grid == RegularChunkGrid(chunk_shape=chunks) - assert metadata.chunk_key_encoding == DefaultChunkKeyEncoding(separator=".") + assert metadata.chunk_key_encoding == V2ChunkKeyEncoding(separator=".") assert metadata.data_type == UInt16("little") assert metadata.codecs == ( BytesCodec(endian="little"), From ce409a38fbc5c6aa1c2b51e5c6167abc367f0d3d Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Mon, 14 Jul 2025 10:30:28 +0100 Subject: [PATCH 23/26] update endianness of test data type --- tests/test_metadata/test_converter_v2_v3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 11083570c1..e47ea985bf 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -30,7 +30,7 @@ def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]: - """Create a zarr with nested groups / arrays, returning the paths to all.""" + """Create a zarr with nested groups / arrays for testing, returning the paths to all.""" # 3 levels of nested groups group_0 = zarr.create_group(store=store, zarr_format=2, attributes=attributes) @@ -146,7 +146,7 @@ def test_convert_array(local_store: Store) -> None: assert metadata.shape == shape assert metadata.chunk_grid == RegularChunkGrid(chunk_shape=chunks) assert metadata.chunk_key_encoding == V2ChunkKeyEncoding(separator=".") - assert 
metadata.data_type == UInt16("little") + assert metadata.data_type == UInt16(endianness="little") assert metadata.codecs == ( BytesCodec(endian="little"), BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0), From 6585f246f5115512a4f47f4ca4627b56ed078dba Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 16 Jul 2025 12:45:25 +0100 Subject: [PATCH 24/26] check converted arrays can be accessed --- tests/test_metadata/test_converter_v2_v3.py | 31 +++++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index e47ea985bf..3fe32100f0 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -240,9 +240,23 @@ def test_convert_nested_with_path(local_store: Store, separator: str) -> None: (numcodecs.GZip(level=3), GzipCodec(level=3)), ( numcodecs.LZMA( - format=1, check=-1, preset=None, filters=[{"id": lzma.FILTER_DELTA, "dist": 4}] + format=lzma.FORMAT_RAW, + check=-1, + preset=None, + filters=[ + {"id": lzma.FILTER_DELTA, "dist": 4}, + {"id": lzma.FILTER_LZMA2, "preset": 1}, + ], + ), + LZMA( + format=lzma.FORMAT_RAW, + check=-1, + preset=None, + filters=[ + {"id": lzma.FILTER_DELTA, "dist": 4}, + {"id": lzma.FILTER_LZMA2, "preset": 1}, + ], ), - LZMA(format=1, check=-1, preset=None, filters=[{"id": lzma.FILTER_DELTA, "dist": 4}]), ), ], ids=["blosc", "zstd", "gzip", "numcodecs-compressor"], @@ -250,7 +264,7 @@ def test_convert_nested_with_path(local_store: Store, separator: str) -> None: def test_convert_compressor( local_store: Store, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec ) -> None: - zarr.create_array( + zarr_array = zarr.create_array( store=local_store, shape=(10, 10), chunks=(10, 10), @@ -259,6 +273,7 @@ def test_convert_compressor( zarr_format=2, fill_value=0, ) + zarr_array[:] = 1 result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 @@ -271,6 +286,7 @@ def test_convert_compressor( BytesCodec(endian="little"), compressor_v3, ) + assert (zarr_array[:] == 1).all() def test_convert_filter(local_store: Store) -> None: @@ -311,7 +327,7 @@ def test_convert_filter(local_store: Store) -> None: def test_convert_C_vs_F_order( local_store: Store, order: str, expected_codecs: tuple[Codec] ) -> None: - zarr.create_array( + zarr_array = zarr.create_array( store=local_store, shape=(10, 10), chunks=(10, 10), @@ -321,6 +337,7 @@ def test_convert_C_vs_F_order( fill_value=0, order=order, ) + zarr_array[:] = 1 result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 @@ -329,8 +346,8 @@ def test_convert_C_vs_F_order( zarr_array = zarr.open(local_store.root, zarr_format=3) metadata = zarr_array.metadata assert metadata.zarr_format == 3 - assert metadata.codecs == expected_codecs + assert (zarr_array[:] == 1).all() @pytest.mark.parametrize( @@ -344,7 +361,7 @@ def test_convert_C_vs_F_order( def test_convert_endian( local_store: Store, dtype: str, expected_data_type: BaseInt, expected_codecs: tuple[Codec] ) -> None: - zarr.create_array( + zarr_array = zarr.create_array( store=local_store, shape=(10, 10), chunks=(10, 10), @@ -353,6 +370,7 @@ def test_convert_endian( zarr_format=2, fill_value=0, ) + zarr_array[:] = 1 result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 @@ -363,6 +381,7 @@ def test_convert_endian( assert 
metadata.zarr_format == 3 assert metadata.data_type == expected_data_type assert metadata.codecs == expected_codecs + assert (zarr_array[:] == 1).all() @pytest.mark.parametrize("node_type", ["array", "group"]) From 08fc138d4a2ac5c18042c2526e608c6642cd0f5b Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 16 Jul 2025 16:58:41 +0100 Subject: [PATCH 25/26] remove uses of pathlib walk, as it didn't exist in python 3.11 --- tests/test_metadata/test_converter_v2_v3.py | 114 +++++++++++--------- 1 file changed, 62 insertions(+), 52 deletions(-) diff --git a/tests/test_metadata/test_converter_v2_v3.py b/tests/test_metadata/test_converter_v2_v3.py index 3fe32100f0..275bb0a4ed 100644 --- a/tests/test_metadata/test_converter_v2_v3.py +++ b/tests/test_metadata/test_converter_v2_v3.py @@ -79,39 +79,57 @@ def expected_paths_no_metadata() -> list[Path]: @pytest.fixture -def expected_paths_v3_metadata(expected_paths_no_metadata: list[Path]) -> list[Path]: - """Expected paths from create_nested_zarr, with v3 metadata files""" - v3_paths = [ - Path("array_0/zarr.json"), - Path("group_1/array_1/zarr.json"), - Path("group_1/group_2/array_2/zarr.json"), - Path("zarr.json"), - Path("group_1/zarr.json"), - Path("group_1/group_2/zarr.json"), - ] - expected_paths_no_metadata.extend(v3_paths) +def expected_v3_metadata() -> list[Path]: + """Expected v3 metadata for create_nested_zarr""" + return sorted( + [ + Path("array_0/zarr.json"), + Path("group_1/array_1/zarr.json"), + Path("group_1/group_2/array_2/zarr.json"), + Path("zarr.json"), + Path("group_1/zarr.json"), + Path("group_1/group_2/zarr.json"), + ] + ) + + +@pytest.fixture +def expected_v2_metadata() -> list[Path]: + """Expected v2 metadata for create_nested_zarr""" + return sorted( + [ + Path("array_0/.zarray"), + Path("array_0/.zattrs"), + Path("group_1/array_1/.zarray"), + Path("group_1/array_1/.zattrs"), + Path("group_1/group_2/array_2/.zarray"), + Path("group_1/group_2/array_2/.zattrs"), + Path(".zgroup"), + Path(".zattrs"), + Path("group_1/.zgroup"), + Path("group_1/.zattrs"), + Path("group_1/group_2/.zgroup"), + Path("group_1/group_2/.zattrs"), + ] + ) + + +@pytest.fixture +def expected_paths_v3_metadata( + expected_paths_no_metadata: list[Path], expected_v3_metadata: list[Path] +) -> list[Path]: + """Expected paths from create_nested_zarr + v3 metadata files""" + expected_paths_no_metadata.extend(expected_v3_metadata) return sorted(expected_paths_no_metadata) @pytest.fixture -def expected_paths_v2_metadata(expected_paths_no_metadata: list[Path]) -> list[Path]: - """Expected paths from create_nested_zarr, with v2 metadata files""" - v2_paths = [ - Path("array_0/.zarray"), - Path("array_0/.zattrs"), - Path("group_1/array_1/.zarray"), - Path("group_1/array_1/.zattrs"), - Path("group_1/group_2/array_2/.zarray"), - Path("group_1/group_2/array_2/.zattrs"), - Path(".zgroup"), - Path(".zattrs"), - Path("group_1/.zgroup"), - Path("group_1/.zattrs"), - Path("group_1/group_2/.zgroup"), - Path("group_1/group_2/.zattrs"), - ] - expected_paths_no_metadata.extend(v2_paths) +def expected_paths_v2_metadata( + expected_paths_no_metadata: list[Path], expected_v2_metadata: list[Path] +) -> list[Path]: + """Expected paths from create_nested_zarr + v2 metadata files""" + expected_paths_no_metadata.extend(expected_v2_metadata) return sorted(expected_paths_no_metadata) @@ -174,7 +192,9 @@ def test_convert_group(local_store: Store) -> None: @pytest.mark.parametrize("separator", [".", "/"]) -def 
test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> None: +def test_convert_nested_groups_and_arrays( + local_store: Store, separator: str, expected_v3_metadata: list[Path] +) -> None: """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays (including when there are additional dirs due to using a / separator)""" @@ -184,17 +204,9 @@ def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> result = runner.invoke(cli.app, ["convert", str(local_store.root)]) assert result.exit_code == 0 - # check zarr.json were created for every group and array - total_zarr_jsons = 0 - for _, _, filenames in local_store.root.walk(): - # group / array directories - if ".zattrs" in filenames: - assert "zarr.json" in filenames - total_zarr_jsons += 1 - # other directories e.g. for chunks when separator is / - else: - assert "zarr.json" not in filenames - assert total_zarr_jsons == 6 + zarr_json_paths = sorted(local_store.root.rglob("zarr.json")) + expected_zarr_json_paths = [local_store.root / p for p in expected_v3_metadata] + assert zarr_json_paths == expected_zarr_json_paths # Check converted zarr can be opened + metadata accessed at all levels zarr_array = zarr.open(local_store.root, zarr_format=3) @@ -206,7 +218,9 @@ def test_convert_nested_groups_and_arrays(local_store: Store, separator: str) -> @pytest.mark.parametrize("separator", [".", "/"]) -def test_convert_nested_with_path(local_store: Store, separator: str) -> None: +def test_convert_nested_with_path( + local_store: Store, separator: str, expected_v3_metadata: list[Path] +) -> None: """Test that only arrays/groups within group_1 are converted (+ no other files in store)""" create_nested_zarr(local_store, {}, separator) @@ -216,17 +230,13 @@ def test_convert_nested_with_path(local_store: Store, separator: str) -> None: group_path = local_store.root / "group_1" - total_zarr_jsons = 0 - for dirpath, _, filenames in local_store.root.walk(): - inside_group = (dirpath == group_path) or (group_path in dirpath.parents) - if (".zattrs" in filenames) and inside_group: - # group / array directories inside the group - assert "zarr.json" in filenames - total_zarr_jsons += 1 - else: - assert "zarr.json" not in filenames - - assert total_zarr_jsons == 4 + zarr_json_paths = sorted(local_store.root.rglob("zarr.json")) + expected_zarr_json_paths = [ + local_store.root / p + for p in expected_v3_metadata + if group_path in (local_store.root / p).parents + ] + assert zarr_json_paths == expected_zarr_json_paths @pytest.mark.parametrize( From 3540434162daf08832cb366bbfbe5de16babadba Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <24316371+K-Meech@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:11:04 +0100 Subject: [PATCH 26/26] include tags in checkout for gpu test, to avoid numcodecs.zarr3 requesting a zarr version greater than 3 --- .github/workflows/gpu_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml index 752440719b..fdd0d27463 100644 --- a/.github/workflows/gpu_test.yml +++ b/.github/workflows/gpu_test.yml @@ -30,6 +30,8 @@ jobs: steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # grab all branches and tags # - name: cuda-toolkit # uses: Jimver/cuda-toolkit@v0.2.16 # id: cuda-toolkit
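
The patches above expose the converter both as the `zarr-converter` entry point and as plain functions. Below is a rough sketch of driving it directly from Python, assuming the module layout introduced in these patches; the store path and the toy array are illustrative only, not part of the patch series:

    import zarr
    from zarr.core.metadata.converter.converter_v2_v3 import convert_v2_to_v3, remove_metadata
    from zarr.core.sync import sync

    # Write a small zarr v2 array to a local directory store (illustrative path).
    store = "data/example-v2.zarr"
    array = zarr.create_array(
        store=store,
        shape=(10, 10),
        chunks=(5, 5),
        dtype="uint16",
        zarr_format=2,
        fill_value=0,
    )
    array[:] = 1

    # Add v3 metadata (zarr.json) alongside the existing v2 files; passing
    # dry_run=True would only log the files that would be written.
    convert_v2_to_v3(store=store, dry_run=False)

    # Once the converted hierarchy looks right, drop the old v2 metadata files
    # (.zarray, .zattrs, .zgroup, .zmetadata). remove_metadata is a coroutine,
    # so wrap it in sync() as the CLI does.
    sync(remove_metadata(store=store, zarr_format=2))

    # Equivalent CLI calls (via the zarr-converter script added in pyproject.toml):
    #   zarr-converter convert data/example-v2.zarr --dry-run
    #   zarr-converter clear data/example-v2.zarr 2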