Skip to content

valid identifiers for the bytes codec #352

@d-v-b

Description

@d-v-b

My understanding is that there are five possible JSON declarations for the bytes codec:

  1. "bytes"
  2. {"name": "bytes"}
  3. {"name": "bytes", "configuration": {}}
  4. {"name": "bytes", "configuration": {"endian": "little"}}
  5. {"name": "bytes", "configuration": {"endian": "big"}}

First, am I correct in this census or is there a detail from the spec I am missing?

Second, unless I'm misreading the spec, we have defined 3 different ways of saying the exact same thing (items 1-3). Putting aside the merits of this design, I'm curious what the different zarr v3 implementations support. cc @LDeakin @jbms

Right now, zarr-python only supports options 2-5. See the script and results below.

demo script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "zarr @ git+https://github.com/zarr-developers/zarr-python.git@27615fd",
#   "pytest"
# ]
# ///
from typing import Any
import zarr
import pytest 

from zarr.core.metadata.v3 import ArrayV3Metadata

# Candidate JSON declarations of the bytes codec, one entry per form:
# a bare string, a name-only object, an object with an empty configuration,
# and one object per explicit "endian" value.
bytes_codec_specs = [
    "bytes",
    dict(name="bytes"),
    dict(name="bytes", configuration={}),
    *(
        {"name": "bytes", "configuration": {"endian": byte_order}}
        for byte_order in ("little", "big")
    ),
]

@pytest.mark.parametrize(
    'bytes_codec_spec',
    bytes_codec_specs,
    ids=list(map(str, bytes_codec_specs)),
)
def test(bytes_codec_spec: str | dict[str, Any]) -> None:
    """Build v3 array metadata whose only codec is ``bytes_codec_spec``.

    The case passes when ``ArrayV3Metadata.from_dict`` returns without raising.
    """
    chunk_grid = {"name": "regular", "configuration": {"chunk_shape": [10, 10, 3]}}
    chunk_key_encoding = {"name": "default", "configuration": {"separator": "/"}}
    array_metadata_doc = {
        "node_type": "array",
        "fill_value": 0,
        "zarr_format": 3,
        "shape": [100, 100, 3],
        "data_type": "uint8",
        "chunk_grid": chunk_grid,
        "chunk_key_encoding": chunk_key_encoding,
        # The codec declaration under test is the sole entry in the pipeline.
        "codecs": [bytes_codec_spec],
    }
    ArrayV3Metadata.from_dict(array_metadata_doc)

if __name__ == "__main__":
    # Run pytest on this very file when it is executed as a script.
    # "-c" and its value are passed as separate argv entries: a single
    # "-c <path>" string makes the option value start with a space, which
    # ends up in the reported rootdir and test node ids.
    pytest.main([__file__, "-c", __file__])
test.py3.11-1.25-minimalbennettd@dvb-desktop-0 ➜  zarr-python git:(feat/numcodecs-compat) ✗ uv run test.py
Reading inline script metadata from `test.py`
 Updated https://github.com/zarr-developers/zarr-python.git (27615fd0)
Cannot read termcap database;
using dumb terminal settings.
============================================================================================================================================= test session starts =============================================================================================================================================
platform linux -- Python 3.11.5, pytest-8.4.1, pluggy-1.6.0
rootdir: /home/bennettd/dev/zarr-python/ /home/bennettd/dev/zarr-python
configfile: test.py
collected 5 items                                                                                                                                                                                                                                                                                             

 /home/bennettd/dev/zarr-python F....                                                                                                                                                                                                                                                                   [100%]

================================================================================================================================================== FAILURES ===================================================================================================================================================
_________________________________________________________________________________________________________________________________________________ test[bytes] _________________________________________________________________________________________________________________________________________________

bytes_codec_spec = 'bytes'

    @pytest.mark.parametrize('bytes_codec_spec', bytes_codec_specs, ids=[str(x) for x in bytes_codec_specs])
    def test(bytes_codec_spec: str | dict[str, Any]) -> None:
        data = {
            "node_type": "array",
            "fill_value": 0,
            "zarr_format": 3,
            "shape": [100, 100, 3],
            "data_type": "uint8",
            "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [10, 10, 3]}},
            "chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}},
            "codecs": [bytes_codec_spec]
            }
>       ArrayV3Metadata.from_dict(data)

test.py:34: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../.cache/uv/archive-v0/DL3T_HZF_GHlo1WkQHjiQ/lib/python3.11/site-packages/zarr/core/metadata/v3.py:323: in from_dict
    return cls(**_data, fill_value=fill_value_parsed, data_type=data_type)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
../../.cache/uv/archive-v0/DL3T_HZF_GHlo1WkQHjiQ/lib/python3.11/site-packages/zarr/core/metadata/v3.py:177: in __init__
    codecs_parsed_partial = parse_codecs(codecs)
                            ^^^^^^^^^^^^^^^^^^^^
../../.cache/uv/archive-v0/DL3T_HZF_GHlo1WkQHjiQ/lib/python3.11/site-packages/zarr/core/metadata/v3.py:66: in parse_codecs
    name_parsed, _ = parse_named_configuration(c, require_configuration=False)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

data = 'bytes', expected_name = None

    def parse_named_configuration(
        data: JSON, expected_name: str | None = None, *, require_configuration: bool = True
    ) -> tuple[str, JSON | None]:
        if not isinstance(data, dict):
>           raise TypeError(f"Expected dict, got {type(data)}")
E           TypeError: Expected dict, got <class 'str'>

../../.cache/uv/archive-v0/DL3T_HZF_GHlo1WkQHjiQ/lib/python3.11/site-packages/zarr/core/common.py:130: TypeError
=========================================================================================================================================== short test summary info ===========================================================================================================================================
FAILED  /home/bennettd/dev/zarr-python::test[bytes] - TypeError: Expected dict, got <class 'str'>
========================================================================================================================================= 1 failed, 4 passed in 0.08s =========================================================================================================================================

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions