def _set_logging_config(verbose: bool) -> None:
    """Configure the root logger for the command line tool.

    Messages are emitted bare (no level / logger-name prefix). INFO-level
    messages are shown only when ``verbose`` is true; otherwise the threshold
    is WARNING.

    Parameters
    ----------
    verbose : bool
        Whether INFO-level messages should be shown.
    """
    level = logging.INFO if verbose else logging.WARNING
    logging.basicConfig(format="%(message)s", level=level)
@app.command()  # type: ignore[misc]
def convert(
    store: Annotated[
        str,
        typer.Argument(
            help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..."
        ),
    ],
    path: Annotated[str | None, typer.Option(help="The path within the store to open")] = None,
    dry_run: Annotated[
        bool,
        typer.Option(
            help="Enable a dry-run: files that would be converted are logged, but no new files are actually created."
        ),
    ] = False,
) -> None:
    """Convert all v2 metadata in a zarr hierarchy to v3. This will create a zarr.json file at each level
    (for every group / array). V2 files (.zarray, .zattrs etc.) will be left as-is.
    """
    # NOTE: typer shows the docstring above as this command's help text, so it
    # is user-facing output rather than plain developer documentation.
    if dry_run:
        # A dry run is only useful if the "would be created" messages are
        # visible, so raise verbosity to INFO even without --verbose.
        _set_verbose_level()
        dry_run_notice = "Dry run enabled - no new files will be created. Log of files that would be created on a real run:"
        logger.info(dry_run_notice)

    convert_v2_to_v3(store=store, path=path, dry_run=dry_run)
def convert_v2_to_v3(
    store: StoreLike,
    path: str | None = None,
    storage_options: dict[str, Any] | None = None,
    dry_run: bool = False,
) -> None:
    """Write v3 metadata next to the v2 metadata of every node in a hierarchy.

    The node found at ``path`` inside ``store`` is opened and handed to
    ``convert_array_or_group``, which creates a zarr.json document for it (and,
    for a group, for everything below it). Existing v2 files (.zarray, .zattrs
    etc.) are left as-is.

    Parameters
    ----------
    store : StoreLike
        Store or path to directory in file system or name of zip file.
    path : str | None, optional
        The path within the store to open, by default None
    storage_options : dict | None, optional
        If the store is backed by an fsspec-based implementation, then this dict will be passed to
        the Store constructor for that implementation. Ignored otherwise.
    dry_run : bool, optional
        Enable a 'dry run' - files that would be created are logged, but no files are actually created.
    """
    # The node is opened read/write ("r+") regardless of dry_run; no writes
    # are issued when dry_run is set.
    root_node = zarr.open(store=store, mode="r+", path=path, storage_options=storage_options)
    convert_array_or_group(root_node, dry_run=dry_run)
async def remove_metadata(
    store: StoreLike,
    zarr_format: ZarrFormat,
    path: str | None = None,
    storage_options: dict[str, Any] | None = None,
    dry_run: bool = False,
) -> None:
    """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr.
    Note - this will remove metadata files at all levels of the hierarchy (every group and array).

    Parameters
    ----------
    store : StoreLike
        Store or path to directory in file system or name of zip file.
    zarr_format : ZarrFormat
        Which format's metadata to remove - 2 or 3.
    path : str | None, optional
        The path within the store to open, by default None. Only metadata files located
        underneath this path are removed.
    storage_options : dict | None, optional
        If the store is backed by an fsspec-based implementation, then this dict will be passed to
        the Store constructor for that implementation. Ignored otherwise.
    dry_run : bool, optional
        Enable a 'dry run' - files that would be deleted are logged, but no files are actually removed.

    Raises
    ------
    ValueError
        If ``zarr_format`` is neither 2 nor 3, or if the store does not support deletes.
    """
    # Validate the format explicitly: this is a destructive operation, so an
    # unexpected value must not silently fall through to "delete v3 metadata".
    if zarr_format == 2:
        metadata_files = frozenset({ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON})
    elif zarr_format == 3:
        metadata_files = frozenset({ZARR_JSON})
    else:
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format!r}")

    store_path = await make_store_path(store, mode="r+", storage_options=storage_options)
    if not store_path.store.supports_deletes:
        raise ValueError("Store must support deletes to remove metadata")

    # ``list_prefix`` matches keys by string prefix. Terminate a non-empty path
    # with "/" so that e.g. path="group_1" cannot also match "group_10/.zgroup".
    normalized_path = path.strip("/") if path is not None else ""
    prefix = f"{normalized_path}/" if normalized_path else ""

    awaitables = []
    async for file_path in store_path.store.list_prefix(prefix):
        # Only the final key component decides whether this is a metadata file.
        if file_path.rsplit("/", 1)[-1] not in metadata_files:
            continue

        target = store_path / file_path
        logger.info("Deleting metadata at %s", target)

        if not dry_run:
            awaitables.append(target.delete())

    # Deletions are collected during listing and only run afterwards, so the
    # listing is never mutated while it is being iterated.
    await asyncio.gather(*awaitables)
def _convert_filters(metadata_v2: ArrayV2Metadata) -> list[ArrayArrayCodec]:
    """Translate the v2 ``filters`` of an array into v3 array-to-array codecs.

    Every filter is first mapped to a codec via ``_find_numcodecs_zarr3``;
    only after all of them have been mapped is the result checked.

    Returns
    -------
    list[ArrayArrayCodec]
        The converted codecs, in the original filter order. Empty when the
        array has no filters.

    Raises
    ------
    TypeError
        If a converted filter is not an ``ArrayArrayCodec``.
    """
    v2_filters = metadata_v2.filters
    if v2_filters is None:
        return []

    converted = list(map(_find_numcodecs_zarr3, v2_filters))

    # Report the first codec (in filter order) of the wrong kind.
    offender = next(
        (candidate for candidate in converted if not isinstance(candidate, ArrayArrayCodec)),
        None,
    )
    if offender is not None:
        raise TypeError(f"Filter {type(offender)} is not an ArrayArrayCodec")

    return cast(list[ArrayArrayCodec], converted)
async def _save_v3_metadata(
    zarr_v2: Array | Group, metadata_v3: ArrayV3Metadata | GroupMetadata, dry_run: bool = False
) -> None:
    """Write ``metadata_v3`` as the zarr.json document of the node ``zarr_v2``.

    Parameters
    ----------
    zarr_v2 : Array | Group
        The node whose store path receives the new metadata document.
    metadata_v3 : ArrayV3Metadata | GroupMetadata
        The metadata to serialise.
    dry_run : bool, optional
        When true, the metadata is still serialised and the target is logged,
        but nothing is written.

    Raises
    ------
    ValueError
        If a zarr.json document already exists at the node's store path.
    """
    target = zarr_v2.store_path / ZARR_JSON

    # Refuse to touch a node that already has v3 metadata.
    already_present = await target.exists()
    if already_present:
        raise ValueError(f"{ZARR_JSON} already exists at {zarr_v2.store_path}")

    logger.info("Saving metadata to %s", target)
    serialized = metadata_v3.to_buffer_dict(default_buffer_prototype())

    if dry_run:
        return

    await target.set_if_not_exists(serialized[ZARR_JSON])
@pytest.fixture
def set_path() -> Generator[None, None, None]:
    """Put the tests directory on ``sys.path`` for the duration of one test.

    Setup appends the directory containing this file to ``sys.path``, clears
    the registries' lazy-load lists and collects entry points again. Teardown
    removes the directory, repeats the clear + collect, and resets the zarr
    config.
    """
    tests_dir = str(pathlib.Path(__file__).parent.absolute())
    sys.path.append(tests_dir)
    # Clear before collecting so the lazy-load lists are rebuilt from scratch
    # with the tests directory importable.
    # NOTE(review): presumably this is what makes the test entry point package
    # next to this file discoverable - confirm.
    _clear_registries()
    zarr.registry._collect_entrypoints()

    yield

    # Teardown: undo the path change, then rebuild the lazy-load lists without it.
    sys.path.remove(tests_dir)
    _clear_registries()
    zarr.registry._collect_entrypoints()
    config.reset()
def create_nested_zarr(store: Store, attributes: dict[str, Any], separator: str) -> list[str]:
    """Build a three-level v2 hierarchy for testing and return the path of every node.

    The layout is a root group containing ``group_1`` containing ``group_2``,
    with one 10x10 ``uint16`` array (``array_0`` / ``array_1`` / ``array_2``)
    in each group. Every node gets ``attributes``; every array is filled with 1
    and uses the v2 chunk key encoding with the given ``separator``.

    The returned list holds the three group paths first, followed by the three
    array paths.
    """
    # three groups, each nested inside the previous one
    root = zarr.create_group(store=store, zarr_format=2, attributes=attributes)
    child = root.create_group(name="group_1", attributes=attributes)
    grandchild = child.create_group(name="group_2", attributes=attributes)
    groups = (root, child, grandchild)

    group_paths = [node.path for node in groups]

    # one array in every group
    array_paths = []
    for depth, parent in enumerate(groups):
        arr = parent.create_array(
            name=f"array_{depth}",
            shape=(10, 10),
            chunks=(5, 5),
            dtype="uint16",
            attributes=attributes,
            chunk_key_encoding={"name": "v2", "separator": separator},
        )
        arr[:] = 1
        array_paths.append(arr.path)

    return group_paths + array_paths
@pytest.fixture
def expected_paths_v3_metadata(
    expected_paths_no_metadata: list[Path], expected_v3_metadata: list[Path]
) -> list[Path]:
    """Expected paths from create_nested_zarr + v3 metadata files"""
    # Build a new list instead of extending ``expected_paths_no_metadata`` in
    # place: pytest hands the same fixture object to every requester within a
    # test, so mutating it would leak the v3 paths into any other fixture or
    # test that also uses it.
    return sorted([*expected_paths_no_metadata, *expected_v3_metadata])
def test_convert_group(local_store: Store) -> None:
    """A single v2 group converts to a v3 group with the same attributes."""
    attributes = {"baz": 42, "qux": [1, 4, 7, 12]}
    zarr.create_group(store=local_store, zarr_format=2, attributes=attributes)

    store_root = local_store.root
    outcome = runner.invoke(cli.app, ["convert", str(store_root)])
    assert outcome.exit_code == 0
    assert (store_root / "zarr.json").exists()

    # Re-open through the v3 metadata only and inspect what was written.
    converted = zarr.open(store_root, zarr_format=3)
    metadata = converted.metadata
    assert metadata.zarr_format == 3
    assert metadata.node_type == "group"
    assert metadata.attributes == attributes
    assert metadata.consolidated_metadata is None
# Each case pairs a numcodecs (v2) compressor with the v3 codec the converter
# is expected to produce for it. The first three are checked against zarr's
# own codecs; the last one against the LZMA class from numcodecs.zarr3.
@pytest.mark.parametrize(
    ("compressor_v2", "compressor_v3"),
    [
        (
            numcodecs.Blosc(cname="zstd", clevel=3, shuffle=1),
            BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0),
        ),
        (numcodecs.Zstd(level=3), ZstdCodec(level=3)),
        (numcodecs.GZip(level=3), GzipCodec(level=3)),
        (
            numcodecs.LZMA(
                format=lzma.FORMAT_RAW,
                check=-1,
                preset=None,
                filters=[
                    {"id": lzma.FILTER_DELTA, "dist": 4},
                    {"id": lzma.FILTER_LZMA2, "preset": 1},
                ],
            ),
            LZMA(
                format=lzma.FORMAT_RAW,
                check=-1,
                preset=None,
                filters=[
                    {"id": lzma.FILTER_DELTA, "dist": 4},
                    {"id": lzma.FILTER_LZMA2, "preset": 1},
                ],
            ),
        ),
    ],
    ids=["blosc", "zstd", "gzip", "numcodecs-compressor"],
)
def test_convert_compressor(
    local_store: Store, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec
) -> None:
    """Check that a v2 compressor is converted to the expected v3 codec and the data stays readable."""
    zarr_array = zarr.create_array(
        store=local_store,
        shape=(10, 10),
        chunks=(10, 10),
        dtype="uint16",
        compressors=compressor_v2,
        zarr_format=2,
        fill_value=0,
    )
    # Write real chunk data so the round trip below exercises decoding too.
    zarr_array[:] = 1

    result = runner.invoke(cli.app, ["convert", str(local_store.root)])
    assert result.exit_code == 0
    assert (local_store.root / "zarr.json").exists()

    # Re-open via the newly written v3 metadata.
    zarr_array = zarr.open(local_store.root, zarr_format=3)
    metadata = zarr_array.metadata
    assert metadata.zarr_format == 3
    # uint16 is a multi-byte dtype, so the bytes codec carries an explicit endianness.
    assert metadata.codecs == (
        BytesCodec(endian="little"),
        compressor_v3,
    )
    # The chunks written under v2 must still decode through the v3 codec pipeline.
    assert (zarr_array[:] == 1).all()
@pytest.mark.parametrize(
    ("dtype", "expected_data_type", "expected_codecs"),
    [
        ("uint8", UInt8(), (BytesCodec(endian=None),)),
        ("uint16", UInt16(), (BytesCodec(endian="little"),)),
    ],
    ids=["single_byte", "multi_byte"],
)
def test_convert_endian(
    local_store: Store, dtype: str, expected_data_type: BaseInt, expected_codecs: tuple[Codec]
) -> None:
    """The bytes codec gets an endianness only for the multi-byte dtype case."""
    store_root = local_store.root

    source_array = zarr.create_array(
        store=local_store,
        shape=(10, 10),
        chunks=(10, 10),
        dtype=dtype,
        compressors=None,
        zarr_format=2,
        fill_value=0,
    )
    source_array[:] = 1

    outcome = runner.invoke(cli.app, ["convert", str(store_root)])
    assert outcome.exit_code == 0
    assert (store_root / "zarr.json").exists()

    # Re-open through the v3 metadata and compare against the expectations.
    reopened = zarr.open(store_root, zarr_format=3)
    v3_metadata = reopened.metadata
    assert v3_metadata.zarr_format == 3
    assert v3_metadata.data_type == expected_data_type
    assert v3_metadata.codecs == expected_codecs
    assert (reopened[:] == 1).all()
def test_convert_unknown_codec(local_store: Store) -> None:
    """Converting an array whose filter has no numcodecs.zarr3 counterpart must fail."""

    unsupported_filter = numcodecs.Categorize(labels=["a", "b"], dtype=object)
    zarr.create_array(
        store=local_store,
        shape=(10, 10),
        chunks=(10, 10),
        dtype="uint16",
        filters=[unsupported_filter],
        zarr_format=2,
        fill_value=0,
    )

    outcome = runner.invoke(cli.app, ["convert", str(local_store.root)])
    assert outcome.exit_code == 1

    # The CLI runner captures the exception instead of propagating it.
    error = outcome.exception
    assert isinstance(error, ValueError)
    expected_message = "Couldn't find corresponding numcodecs.zarr3 codec for categorize"
    assert str(error) == expected_message
def test_remove_metadata_v2_with_path(
    local_store: Store, expected_paths_no_metadata: list[Path]
) -> None:
    """Clearing v2 metadata with --path group_1 only touches files under group_1."""

    attributes = {"baz": 42, "qux": [1, 4, 7, 12]}
    create_nested_zarr(local_store, attributes, ".")

    store_root = local_store.root
    outcome = runner.invoke(cli.app, ["clear", str(store_root), "2", "--path", "group_1"])
    assert outcome.exit_code == 0

    remaining = sorted(store_root.rglob("*"))

    # Everything outside group_1 keeps its v2 metadata: the root group's files
    # and those of array_0, which sits directly in the root group.
    untouched_metadata = [
        store_root / ".zattrs",
        store_root / ".zgroup",
        store_root / "array_0" / ".zarray",
        store_root / "array_0" / ".zattrs",
    ]
    data_paths = [store_root / relative for relative in expected_paths_no_metadata]
    assert remaining == sorted(data_paths + untouched_metadata)
@pytest.mark.parametrize("cli_command", ["convert", "clear"])
def test_dry_run(
    local_store: Store, cli_command: str, expected_paths_v2_metadata: list[Path]
) -> None:
    """A dry run of either command must leave the store's files exactly as they were."""

    attributes = {"baz": 42, "qux": [1, 4, 7, 12]}
    create_nested_zarr(local_store, attributes, ".")

    store_root = local_store.root
    # "clear" additionally needs the metadata format to remove (v2 here).
    if cli_command == "convert":
        cli_args = ["convert", str(store_root)]
    else:
        cli_args = ["clear", str(store_root), "2"]
    cli_args.append("--dry-run")

    outcome = runner.invoke(cli.app, cli_args)
    assert outcome.exit_code == 0

    # Only the original data and v2 metadata files may be present afterwards.
    found = sorted(store_root.rglob("*"))
    expected = [store_root / relative for relative in expected_paths_v2_metadata]
    assert found == expected