diff --git a/changes/3264.fix.rst b/changes/3264.fix.rst
new file mode 100644
index 0000000000..efcbab514e
--- /dev/null
+++ b/changes/3264.fix.rst
@@ -0,0 +1,4 @@
+- Expand the range of types accepted by ``parse_data_type`` to include strings and Sequences.
+- Move the functionality of ``parse_data_type`` to a new function called ``parse_dtype``. This change
+  ensures that nomenclature is consistent across the codebase. ``parse_data_type`` remains, so this
+  change is not breaking.
\ No newline at end of file
diff --git a/docs/user-guide/data_types.rst b/docs/user-guide/data_types.rst
index d4b49ca43f..a968cc4c86 100644
--- a/docs/user-guide/data_types.rst
+++ b/docs/user-guide/data_types.rst
@@ -412,7 +412,7 @@ attempt data type resolution against *every* data type class, and if, for some r
 type matches multiple Zarr data types, we treat this as an error and raise an exception.
 
 If you have a NumPy data type and you want to get the corresponding ``ZDType`` instance, you can use
-the ``parse_data_type`` function, which will use the dynamic resolution described above. ``parse_data_type``
+the ``parse_dtype`` function, which will use the dynamic resolution described above. ``parse_dtype``
 handles a range of input types:
 
 - NumPy data types:
@@ -420,9 +420,9 @@ handles a range of input types:
 
   .. code-block:: python
 
     >>> import numpy as np
-    >>> from zarr.dtype import parse_data_type
+    >>> from zarr.dtype import parse_dtype
     >>> my_dtype = np.dtype('>M8[10s]')
-    >>> parse_data_type(my_dtype, zarr_format=2)
+    >>> parse_dtype(my_dtype, zarr_format=2)
     DateTime64(endianness='big', scale_factor=10, unit='s')
 
@@ -431,7 +431,7 @@ handles a range of input types:
   .. code-block:: python
 
     >>> dtype_str = '>M8[10s]'
-    >>> parse_data_type(dtype_str, zarr_format=2)
+    >>> parse_dtype(dtype_str, zarr_format=2)
     DateTime64(endianness='big', scale_factor=10, unit='s')
 
 - ``ZDType`` instances:
@@ -440,7 +440,7 @@ handles a range of input types:
 
     >>> from zarr.dtype import DateTime64
     >>> zdt = DateTime64(endianness='big', scale_factor=10, unit='s')
-    >>> parse_data_type(zdt, zarr_format=2) # Use a ZDType (this is a no-op)
+    >>> parse_dtype(zdt, zarr_format=2) # Use a ZDType (this is a no-op)
     DateTime64(endianness='big', scale_factor=10, unit='s')
 
 - Python dictionaries (requires ``zarr_format=3``). These dictionaries must be consistent with the
@@ -449,7 +449,7 @@ handles a range of input types:
   .. code-block:: python
 
     >>> dt_dict = {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}
-    >>> parse_data_type(dt_dict, zarr_format=3)
+    >>> parse_dtype(dt_dict, zarr_format=3)
     DateTime64(endianness='little', scale_factor=10, unit='s')
-    >>> parse_data_type(dt_dict, zarr_format=3).to_json(zarr_format=3)
+    >>> parse_dtype(dt_dict, zarr_format=3).to_json(zarr_format=3)
     {'name': 'numpy.datetime64', 'configuration': {'unit': 's', 'scale_factor': 10}}
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index b85c5aba4b..78dddf3669 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -72,7 +72,7 @@
     VariableLengthUTF8,
     ZDType,
     ZDTypeLike,
-    parse_data_type,
+    parse_dtype,
 )
 from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec
 from zarr.core.indexing import (
@@ -617,7 +617,7 @@ async def _create(
 
         Deprecated in favor of :func:`zarr.api.asynchronous.create_array`.
""" - dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format) + dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) store_path = await make_store_path(store) shape = parse_shapelike(shape) @@ -4238,7 +4238,7 @@ async def init_array( from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation - zdtype = parse_data_type(dtype, zarr_format=zarr_format) + zdtype = parse_dtype(dtype, zarr_format=zarr_format) shape_parsed = parse_shapelike(shape) chunk_key_encoding_parsed = _parse_chunk_key_encoding( chunk_key_encoding, zarr_format=zarr_format diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index aadf127c9b..bf09a7501e 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Sequence from typing import TYPE_CHECKING, Final, TypeAlias from zarr.core.dtype.common import ( @@ -94,6 +95,7 @@ "ZDType", "data_type_registry", "parse_data_type", + "parse_dtype", ] data_type_registry = DataTypeRegistry() @@ -188,22 +190,26 @@ def parse_data_type( zarr_format: ZarrFormat, ) -> ZDType[TBaseDType, TBaseScalar]: """ - Interpret the input as a ZDType instance. + Interpret the input as a ZDType. + + This function wraps ``parse_dtype``. The only difference is the function name. This function may + be deprecated in a future version of Zarr Python in favor of ``parse_dtype``. Parameters ---------- dtype_spec : ZDTypeLike - The input to be interpreted as a ZDType instance. This could be a native data type - (e.g., a NumPy data type), a Python object that can be converted into a native data type, - a ZDType instance (in which case the input is returned unchanged), or a JSON object - representation of a data type. + The input to be interpreted as a ZDType. This could be a ZDType, which will be returned + directly, or a JSON representation of a ZDType, or a native dtype, or a python object that + can be converted into a native dtype. zarr_format : ZarrFormat - The zarr format version. + The Zarr format version. This parameter is required because this function will attempt to + parse the JSON representation of a data type, and the JSON representation of data types + varies between Zarr 2 and Zarr 3. Returns ------- ZDType[TBaseDType, TBaseScalar] - The ZDType instance corresponding to the input. + The ZDType corresponding to the input. Examples -------- @@ -216,15 +222,57 @@ def parse_data_type( >>> parse_data_type({"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, zarr_format=3) DateTime64(endianness='little', scale_factor=10, unit='s') """ + return parse_dtype(dtype_spec, zarr_format=zarr_format) + + +def parse_dtype( + dtype_spec: ZDTypeLike, + *, + zarr_format: ZarrFormat, +) -> ZDType[TBaseDType, TBaseScalar]: + """ + Convert the input as a ZDType. + + Parameters + ---------- + dtype_spec : ZDTypeLike + The input to be converted to a ZDType. This could be a ZDType, which will be returned + directly, or a JSON representation of a ZDType, or a numpy dtype, or a python object that + can be converted into a native dtype. + zarr_format : ZarrFormat + The Zarr format version. This parameter is required because this function will attempt to + parse the JSON representation of a data type, and the JSON representation of data types + varies between Zarr 2 and Zarr 3. + + Returns + ------- + ZDType[TBaseDType, TBaseScalar] + The ZDType corresponding to the input. 
+
+    Examples
+    --------
+    >>> from zarr.dtype import parse_dtype
+    >>> import numpy as np
+    >>> parse_dtype("int32", zarr_format=2)
+    Int32(endianness='little')
+    >>> parse_dtype(np.dtype('S10'), zarr_format=2)
+    NullTerminatedBytes(length=10)
+    >>> parse_dtype({"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, zarr_format=3)
+    DateTime64(endianness='little', scale_factor=10, unit='s')
+    """
     if isinstance(dtype_spec, ZDType):
         return dtype_spec
-    # dict and zarr_format 3 means that we have a JSON object representation of the dtype
-    if zarr_format == 3 and isinstance(dtype_spec, Mapping):
-        return get_data_type_from_json(dtype_spec, zarr_format=3)
+    # First attempt to interpret the input as JSON
+    if isinstance(dtype_spec, Mapping | str | Sequence):
+        try:
+            return get_data_type_from_json(dtype_spec, zarr_format=zarr_format)  # type: ignore[arg-type]
+        except ValueError:
+            # no data type matched this JSON-like input
+            pass
     if dtype_spec in VLEN_UTF8_ALIAS:
         # If the dtype request is one of the aliases for variable-length UTF-8 strings,
         # return that dtype.
         return VariableLengthUTF8()  # type: ignore[return-value]
     # otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
-    # we can create a numpy dtype from it, and do the dtype inference from that
+    # we can create a native dtype from it, and do the dtype inference from that
     return get_data_type_from_native_dtype(dtype_spec)  # type: ignore[arg-type]
diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py
index 79f3aa3a0f..80505ba2f7 100644
--- a/src/zarr/dtype.py
+++ b/src/zarr/dtype.py
@@ -38,7 +38,7 @@
     VariableLengthUTF8JSON_V2,
     ZDType,
     data_type_registry,
-    parse_data_type,
+    parse_dtype,
 )
 
 __all__ = [
@@ -83,5 +83,5 @@
     "ZDType",
     "data_type_registry",
     "data_type_registry",
-    "parse_data_type",
+    "parse_dtype",
 ]
diff --git a/tests/test_array.py b/tests/test_array.py
index 3f8e61a2e3..42f4a1cbdd 100644
--- a/tests/test_array.py
+++ b/tests/test_array.py
@@ -53,7 +53,7 @@
     VariableLengthBytes,
     VariableLengthUTF8,
     ZDType,
-    parse_data_type,
+    parse_dtype,
 )
 from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
 from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
@@ -1308,7 +1308,7 @@ async def test_v2_chunk_encoding(
         filters=filters,
     )
     filters_expected, compressor_expected = _parse_chunk_encoding_v2(
-        filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2)
+        filters=filters, compressor=compressors, dtype=parse_dtype(dtype, zarr_format=2)
     )
     assert arr.metadata.zarr_format == 2  # guard for mypy
     assert arr.metadata.compressor == compressor_expected
diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py
index 95ede9e1d7..2716665ff0 100644
--- a/tests/test_dtype_registry.py
+++ b/tests/test_dtype_registry.py
@@ -3,7 +3,7 @@
 import re
 import sys
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, get_args
+from typing import TYPE_CHECKING, Any, Literal, get_args
 
 import numpy as np
 import pytest
@@ -15,18 +15,16 @@
     AnyDType,
     Bool,
     DataTypeRegistry,
-    DateTime64,
     FixedLengthUTF32,
-    Int8,
-    Int16,
     TBaseDType,
     TBaseScalar,
-    VariableLengthUTF8,
     ZDType,
     data_type_registry,
     get_data_type_from_json,
     parse_data_type,
+    parse_dtype,
 )
+from zarr.core.dtype.common import unpack_dtype_json
 
 if TYPE_CHECKING:
     from collections.abc import Generator
@@ -174,28 +172,56 @@ def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None:
     data_type_registry.unregister(TestDataType._zarr_v3_name)
+@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")
+@pytest.mark.parametrize("data_type", zdtype_examples, ids=str)
+@pytest.mark.parametrize("json_style", [(2, "internal"), (2, "metadata"), (3, None)], ids=str)
 @pytest.mark.parametrize(
-    ("dtype_params", "expected", "zarr_format"),
-    [
-        ("str", VariableLengthUTF8(), 2),
-        ("str", VariableLengthUTF8(), 3),
-        ("int8", Int8(), 3),
-        (Int8(), Int8(), 3),
-        (">i2", Int16(endianness="big"), 2),
-        ("datetime64[10s]", DateTime64(unit="s", scale_factor=10), 2),
-        (
-            {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}},
-            DateTime64(unit="s", scale_factor=10),
-            3,
-        ),
-    ],
+    "dtype_parser_func", [parse_dtype, parse_data_type], ids=["parse_dtype", "parse_data_type"]
 )
 def test_parse_data_type(
-    dtype_params: Any, expected: ZDType[Any, Any], zarr_format: ZarrFormat
+    data_type: ZDType[Any, Any],
+    json_style: tuple[ZarrFormat, None | Literal["internal", "metadata"]],
+    dtype_parser_func: Any,
 ) -> None:
     """
-    Test that parse_data_type accepts alternative representations of ZDType instances, and resolves
-    those inputs to the expected ZDType instance.
+    Test the parsing of data types into ZDType instances.
+
+    This function tests the ability of `dtype_parser_func` to correctly
+    interpret and parse data type specifications into `ZDType` instances
+    according to the specified Zarr format and JSON style.
+
+    Parameters
+    ----------
+    data_type : ZDType[Any, Any]
+        The data type to be tested for parsing.
+    json_style : tuple[ZarrFormat, None or Literal["internal", "metadata"]]
+        A tuple specifying the Zarr format version and the JSON style for Zarr V2.
+        For Zarr V2 there are two JSON styles: "internal" and "metadata". The internal style
+        takes the form {"name": <data type identifier>, "object_codec_id": <object codec ID>},
+        while the metadata style is just <data type identifier>.
+    dtype_parser_func : Any
+        The function to be tested for parsing the data type. This is necessary for compatibility
+        reasons, as we support multiple functions that perform the same data type parsing operation.
     """
-    observed = parse_data_type(dtype_params, zarr_format=zarr_format)
-    assert observed == expected
+    zarr_format, style = json_style
+    dtype_spec: Any
+
+    if zarr_format == 2:
+        dtype_spec = data_type.to_json(zarr_format=zarr_format)
+        if style == "internal":
+            pass
+        elif style == "metadata":
+            dtype_spec = unpack_dtype_json(dtype_spec)
+        else:
+            raise ValueError(f"Invalid zarr v2 json style: {style}")
+    else:
+        dtype_spec = data_type.to_json(zarr_format=zarr_format)
+
+    if dtype_spec == "|O":
+        # The object data type on its own is ambiguous and should fail to resolve.
+        msg = "Zarr data type resolution from object failed."
+        with pytest.raises(ValueError, match=msg):
+            dtype_parser_func(dtype_spec, zarr_format=zarr_format)
+    else:
+        observed = dtype_parser_func(dtype_spec, zarr_format=zarr_format)
+        assert observed == data_type
diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py
index 395e036db2..ea2f834bb6 100644
--- a/tests/test_metadata/test_consolidated.py
+++ b/tests/test_metadata/test_consolidated.py
@@ -18,7 +18,7 @@
     open_consolidated,
 )
 from zarr.core.buffer import cpu, default_buffer_prototype
-from zarr.core.dtype import parse_data_type
+from zarr.core.dtype import parse_dtype
 from zarr.core.group import ConsolidatedMetadata, GroupMetadata
 from zarr.core.metadata import ArrayV3Metadata
 from zarr.core.metadata.v2 import ArrayV2Metadata
@@ -504,7 +504,7 @@ async def test_consolidated_metadata_backwards_compatibility(
     async def test_consolidated_metadata_v2(self):
         store = zarr.storage.MemoryStore()
         g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2)
-        dtype = parse_data_type("uint8", zarr_format=2)
+        dtype = parse_dtype("uint8", zarr_format=2)
         await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype)
         g1 = await g.create_group(name="g1", attributes={"key": "g1"})
         await g1.create_group(name="g2", attributes={"key": "g2"})
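The docstrings above describe ``parse_dtype`` accepting ZDType instances, JSON representations, dtype strings, and native NumPy dtypes, with ``parse_data_type`` kept as a thin wrapper. Below is a minimal usage sketch of that behaviour, mirroring the doctest examples added in this change; it is not part of any file in the diff.

# Sketch only: exercises the behaviour documented by this change.
import numpy as np

from zarr.core.dtype import parse_data_type, parse_dtype

# A NumPy dtype and the equivalent dtype string resolve to the same ZDType.
assert parse_dtype(np.dtype(">M8[10s]"), zarr_format=2) == parse_dtype(">M8[10s]", zarr_format=2)

# A Zarr V3 JSON dict round-trips through parse_dtype and to_json.
dt_dict = {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}
assert parse_dtype(dt_dict, zarr_format=3).to_json(zarr_format=3) == dt_dict

# parse_data_type wraps parse_dtype, so both spellings give the same result.
assert parse_data_type(">M8[10s]", zarr_format=2) == parse_dtype(">M8[10s]", zarr_format=2)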