Skip to content

Commit a27d4d6

Browse files
d-v-b and dstansby authored
improvements to parse_dtype (#3264)
* add parse_dtype as ergonomic replacement for parse_data_type, handle more JSON-like inputs, and test for round-trips
* update docs
* changelog
* remove type: ignore
* add test to check that parse_dtype is parse_data_type
* Update src/zarr/dtype.py
  Co-authored-by: David Stansby <dstansby@gmail.com>
* fix docstring
* support the output of to_json(zarr_format=2) as input to parse_dtype
* lint
* remove infinite recursion
* Update src/zarr/core/dtype/__init__.py
  Co-authored-by: David Stansby <dstansby@gmail.com>
* Update src/zarr/core/dtype/__init__.py
  Co-authored-by: David Stansby <dstansby@gmail.com>
* Update src/zarr/core/dtype/__init__.py
  Co-authored-by: David Stansby <dstansby@gmail.com>
* Update src/zarr/core/dtype/__init__.py
  Co-authored-by: David Stansby <dstansby@gmail.com>
* Update src/zarr/core/dtype/__init__.py
  Co-authored-by: David Stansby <dstansby@gmail.com>
* Update src/zarr/core/dtype/__init__.py
  Co-authored-by: David Stansby <dstansby@gmail.com>
---------
Co-authored-by: David Stansby <dstansby@gmail.com>
1 parent fd5425b commit a27d4d6

File tree

8 files changed

+129
-51
lines changed

8 files changed

+129
-51
lines changed

changes/3264.fix.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- Expand the range of types accepted by ``parse_data_type`` to include strings and Sequences.
2+
- Move the functionality of ``parse_data_type`` to a new function called ``parse_dtype``. This change
3+
ensures that nomenclature is consistent across the codebase. ``parse_data_type`` remains, so this
4+
change is not breaking.

docs/user-guide/data_types.rst

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -412,17 +412,17 @@ attempt data type resolution against *every* data type class, and if, for some r
412412
type matches multiple Zarr data types, we treat this as an error and raise an exception.
413413

414414
If you have a NumPy data type and you want to get the corresponding ``ZDType`` instance, you can use
415-
the ``parse_data_type`` function, which will use the dynamic resolution described above. ``parse_data_type``
415+
the ``parse_dtype`` function, which will use the dynamic resolution described above. ``parse_dtype``
416416
handles a range of input types:
417417

418418
- NumPy data types:
419419

420420
.. code-block:: python
421421
422422
>>> import numpy as np
423-
>>> from zarr.dtype import parse_data_type
423+
>>> from zarr.dtype import parse_dtype
424424
>>> my_dtype = np.dtype('>M8[10s]')
425-
>>> parse_data_type(my_dtype, zarr_format=2)
425+
>>> parse_dtype(my_dtype, zarr_format=2)
426426
DateTime64(endianness='big', scale_factor=10, unit='s')
427427
428428
@@ -431,7 +431,7 @@ handles a range of input types:
431431
.. code-block:: python
432432
433433
>>> dtype_str = '>M8[10s]'
434-
>>> parse_data_type(dtype_str, zarr_format=2)
434+
>>> parse_dtype(dtype_str, zarr_format=2)
435435
DateTime64(endianness='big', scale_factor=10, unit='s')
436436
437437
- ``ZDType`` instances:
@@ -440,7 +440,7 @@ handles a range of input types:
440440
441441
>>> from zarr.dtype import DateTime64
442442
>>> zdt = DateTime64(endianness='big', scale_factor=10, unit='s')
443-
>>> parse_data_type(zdt, zarr_format=2) # Use a ZDType (this is a no-op)
443+
>>> parse_dtype(zdt, zarr_format=2) # Use a ZDType (this is a no-op)
444444
DateTime64(endianness='big', scale_factor=10, unit='s')
445445
446446
- Python dictionaries (requires ``zarr_format=3``). These dictionaries must be consistent with the
@@ -449,7 +449,7 @@ handles a range of input types:
449449
.. code-block:: python
450450
451451
>>> dt_dict = {"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}
452-
>>> parse_data_type(dt_dict, zarr_format=3)
452+
>>> parse_dtype(dt_dict, zarr_format=3)
453453
DateTime64(endianness='little', scale_factor=10, unit='s')
454-
>>> parse_data_type(dt_dict, zarr_format=3).to_json(zarr_format=3)
454+
>>> parse_dtype(dt_dict, zarr_format=3).to_json(zarr_format=3)
455455
{'name': 'numpy.datetime64', 'configuration': {'unit': 's', 'scale_factor': 10}}

src/zarr/core/array.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272
VariableLengthUTF8,
7373
ZDType,
7474
ZDTypeLike,
75-
parse_data_type,
75+
parse_dtype,
7676
)
7777
from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec
7878
from zarr.core.indexing import (
@@ -617,7 +617,7 @@ async def _create(
617617
Deprecated in favor of :func:`zarr.api.asynchronous.create_array`.
618618
"""
619619

620-
dtype_parsed = parse_data_type(dtype, zarr_format=zarr_format)
620+
dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format)
621621
store_path = await make_store_path(store)
622622

623623
shape = parse_shapelike(shape)
@@ -4238,7 +4238,7 @@ async def init_array(
42384238

42394239
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
42404240

4241-
zdtype = parse_data_type(dtype, zarr_format=zarr_format)
4241+
zdtype = parse_dtype(dtype, zarr_format=zarr_format)
42424242
shape_parsed = parse_shapelike(shape)
42434243
chunk_key_encoding_parsed = _parse_chunk_key_encoding(
42444244
chunk_key_encoding, zarr_format=zarr_format

src/zarr/core/dtype/__init__.py

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from collections.abc import Sequence
34
from typing import TYPE_CHECKING, Final, TypeAlias
45

56
from zarr.core.dtype.common import (
@@ -94,6 +95,7 @@
9495
"ZDType",
9596
"data_type_registry",
9697
"parse_data_type",
98+
"parse_dtype",
9799
]
98100

99101
data_type_registry = DataTypeRegistry()
@@ -188,22 +190,26 @@ def parse_data_type(
188190
zarr_format: ZarrFormat,
189191
) -> ZDType[TBaseDType, TBaseScalar]:
190192
"""
191-
Interpret the input as a ZDType instance.
193+
Interpret the input as a ZDType.
194+
195+
This function wraps ``parse_dtype``. The only difference is the function name. This function may
196+
be deprecated in a future version of Zarr Python in favor of ``parse_dtype``.
192197
193198
Parameters
194199
----------
195200
dtype_spec : ZDTypeLike
196-
The input to be interpreted as a ZDType instance. This could be a native data type
197-
(e.g., a NumPy data type), a Python object that can be converted into a native data type,
198-
a ZDType instance (in which case the input is returned unchanged), or a JSON object
199-
representation of a data type.
201+
The input to be interpreted as a ZDType. This could be a ZDType, which will be returned
202+
directly, or a JSON representation of a ZDType, or a native dtype, or a python object that
203+
can be converted into a native dtype.
200204
zarr_format : ZarrFormat
201-
The zarr format version.
205+
The Zarr format version. This parameter is required because this function will attempt to
206+
parse the JSON representation of a data type, and the JSON representation of data types
207+
varies between Zarr 2 and Zarr 3.
202208
203209
Returns
204210
-------
205211
ZDType[TBaseDType, TBaseScalar]
206-
The ZDType instance corresponding to the input.
212+
The ZDType corresponding to the input.
207213
208214
Examples
209215
--------
@@ -216,15 +222,57 @@ def parse_data_type(
216222
>>> parse_data_type({"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, zarr_format=3)
217223
DateTime64(endianness='little', scale_factor=10, unit='s')
218224
"""
225+
return parse_dtype(dtype_spec, zarr_format=zarr_format)
226+
227+
228+
def parse_dtype(
229+
dtype_spec: ZDTypeLike,
230+
*,
231+
zarr_format: ZarrFormat,
232+
) -> ZDType[TBaseDType, TBaseScalar]:
233+
"""
234+
Convert the input to a ZDType.
235+
236+
Parameters
237+
----------
238+
dtype_spec : ZDTypeLike
239+
The input to be converted to a ZDType. This could be a ZDType, which will be returned
240+
directly, or a JSON representation of a ZDType, or a numpy dtype, or a python object that
241+
can be converted into a native dtype.
242+
zarr_format : ZarrFormat
243+
The Zarr format version. This parameter is required because this function will attempt to
244+
parse the JSON representation of a data type, and the JSON representation of data types
245+
varies between Zarr 2 and Zarr 3.
246+
247+
Returns
248+
-------
249+
ZDType[TBaseDType, TBaseScalar]
250+
The ZDType corresponding to the input.
251+
252+
Examples
253+
--------
254+
>>> from zarr.dtype import parse_dtype
255+
>>> import numpy as np
256+
>>> parse_dtype("int32", zarr_format=2)
257+
Int32(endianness='little')
258+
>>> parse_dtype(np.dtype('S10'), zarr_format=2)
259+
NullTerminatedBytes(length=10)
260+
>>> parse_dtype({"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}}, zarr_format=3)
261+
DateTime64(endianness='little', scale_factor=10, unit='s')
262+
"""
219263
if isinstance(dtype_spec, ZDType):
220264
return dtype_spec
221-
# dict and zarr_format 3 means that we have a JSON object representation of the dtype
222-
if zarr_format == 3 and isinstance(dtype_spec, Mapping):
223-
return get_data_type_from_json(dtype_spec, zarr_format=3)
265+
# First attempt to interpret the input as JSON
266+
if isinstance(dtype_spec, Mapping | str | Sequence):
267+
try:
268+
return get_data_type_from_json(dtype_spec, zarr_format=zarr_format) # type: ignore[arg-type]
269+
except ValueError:
270+
# no data type matched this JSON-like input
271+
pass
224272
if dtype_spec in VLEN_UTF8_ALIAS:
225273
# If the dtype request is one of the aliases for variable-length UTF-8 strings,
226274
# return that dtype.
227275
return VariableLengthUTF8() # type: ignore[return-value]
228276
# otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
229-
# we can create a numpy dtype from it, and do the dtype inference from that
277+
# we can create a native dtype from it, and do the dtype inference from that
230278
return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type]

src/zarr/dtype.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
VariableLengthUTF8JSON_V2,
3939
ZDType,
4040
data_type_registry,
41-
parse_data_type,
41+
parse_dtype,
4242
)
4343

4444
__all__ = [
@@ -83,5 +83,5 @@
8383
"ZDType",
8484
"data_type_registry",
8585
"data_type_registry",
86-
"parse_data_type",
86+
"parse_dtype",
8787
]

tests/test_array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
VariableLengthBytes,
5454
VariableLengthUTF8,
5555
ZDType,
56-
parse_data_type,
56+
parse_dtype,
5757
)
5858
from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
5959
from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
@@ -1308,7 +1308,7 @@ async def test_v2_chunk_encoding(
13081308
filters=filters,
13091309
)
13101310
filters_expected, compressor_expected = _parse_chunk_encoding_v2(
1311-
filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2)
1311+
filters=filters, compressor=compressors, dtype=parse_dtype(dtype, zarr_format=2)
13121312
)
13131313
assert arr.metadata.zarr_format == 2 # guard for mypy
13141314
assert arr.metadata.compressor == compressor_expected

tests/test_dtype_registry.py

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import re
44
import sys
55
from pathlib import Path
6-
from typing import TYPE_CHECKING, Any, get_args
6+
from typing import TYPE_CHECKING, Any, Literal, get_args
77

88
import numpy as np
99
import pytest
@@ -15,18 +15,16 @@
1515
AnyDType,
1616
Bool,
1717
DataTypeRegistry,
18-
DateTime64,
1918
FixedLengthUTF32,
20-
Int8,
21-
Int16,
2219
TBaseDType,
2320
TBaseScalar,
24-
VariableLengthUTF8,
2521
ZDType,
2622
data_type_registry,
2723
get_data_type_from_json,
2824
parse_data_type,
25+
parse_dtype,
2926
)
27+
from zarr.core.dtype.common import unpack_dtype_json
3028

3129
if TYPE_CHECKING:
3230
from collections.abc import Generator
@@ -174,28 +172,56 @@ def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None:
174172
data_type_registry.unregister(TestDataType._zarr_v3_name)
175173

176174

175+
@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")
176+
@pytest.mark.parametrize("data_type", zdtype_examples, ids=str)
177+
@pytest.mark.parametrize("json_style", [(2, "internal"), (2, "metadata"), (3, None)], ids=str)
177178
@pytest.mark.parametrize(
178-
("dtype_params", "expected", "zarr_format"),
179-
[
180-
("str", VariableLengthUTF8(), 2),
181-
("str", VariableLengthUTF8(), 3),
182-
("int8", Int8(), 3),
183-
(Int8(), Int8(), 3),
184-
(">i2", Int16(endianness="big"), 2),
185-
("datetime64[10s]", DateTime64(unit="s", scale_factor=10), 2),
186-
(
187-
{"name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 10}},
188-
DateTime64(unit="s", scale_factor=10),
189-
3,
190-
),
191-
],
179+
"dtype_parser_func", [parse_dtype, parse_data_type], ids=["parse_dtype", "parse_data_type"]
192180
)
193181
def test_parse_data_type(
194-
dtype_params: Any, expected: ZDType[Any, Any], zarr_format: ZarrFormat
182+
data_type: ZDType[Any, Any],
183+
json_style: tuple[ZarrFormat, None | Literal["internal", "metadata"]],
184+
dtype_parser_func: Any,
195185
) -> None:
196186
"""
197-
Test that parse_data_type accepts alternative representations of ZDType instances, and resolves
198-
those inputs to the expected ZDType instance.
187+
Test the parsing of data types into ZDType instances.
188+
189+
This function tests the ability of `dtype_parser_func` to correctly
190+
interpret and parse data type specifications into `ZDType` instances
191+
according to the specified Zarr format and JSON style.
192+
193+
Parameters
194+
----------
195+
data_type : ZDType[Any, Any]
196+
The data type to be tested for parsing.
197+
json_style : tuple[ZarrFormat, None or Literal["internal", "metadata"]]
198+
A tuple specifying the Zarr format version and the JSON style
199+
for Zarr V2. For Zarr V2 there are 2 JSON styles: "internal" and
200+
"metadata". The internal style takes the form {"name": <data type identifier>, "object_codec_id": <object codec id>},
201+
while the metadata style is just <data type identifier>.
202+
dtype_parser_func : Any
203+
The function to be tested for parsing the data type. This is necessary for compatibility
204+
reasons, as we support multiple functions that perform the same data type parsing operation.
199205
"""
200-
observed = parse_data_type(dtype_params, zarr_format=zarr_format)
201-
assert observed == expected
206+
zarr_format, style = json_style
207+
dtype_spec: Any
208+
209+
if zarr_format == 2:
210+
dtype_spec = data_type.to_json(zarr_format=zarr_format)
211+
if style == "internal":
212+
pass
213+
elif style == "metadata":
214+
dtype_spec = unpack_dtype_json(dtype_spec)
215+
else:
216+
raise ValueError(f"Invalid zarr v2 json style: {style}")
217+
else:
218+
dtype_spec = data_type.to_json(zarr_format=zarr_format)
219+
220+
if dtype_spec == "|O":
221+
# The object data type on its own is ambiguous and should fail to resolve.
222+
msg = "Zarr data type resolution from object failed."
223+
with pytest.raises(ValueError, match=msg):
224+
dtype_parser_func(dtype_spec, zarr_format=zarr_format)
225+
else:
226+
observed = dtype_parser_func(dtype_spec, zarr_format=zarr_format)
227+
assert observed == data_type

tests/test_metadata/test_consolidated.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
open_consolidated,
1919
)
2020
from zarr.core.buffer import cpu, default_buffer_prototype
21-
from zarr.core.dtype import parse_data_type
21+
from zarr.core.dtype import parse_dtype
2222
from zarr.core.group import ConsolidatedMetadata, GroupMetadata
2323
from zarr.core.metadata import ArrayV3Metadata
2424
from zarr.core.metadata.v2 import ArrayV2Metadata
@@ -504,7 +504,7 @@ async def test_consolidated_metadata_backwards_compatibility(
504504
async def test_consolidated_metadata_v2(self):
505505
store = zarr.storage.MemoryStore()
506506
g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2)
507-
dtype = parse_data_type("uint8", zarr_format=2)
507+
dtype = parse_dtype("uint8", zarr_format=2)
508508
await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype)
509509
g1 = await g.create_group(name="g1", attributes={"key": "g1"})
510510
await g1.create_group(name="g2", attributes={"key": "g2"})

0 commit comments

Comments
 (0)