Skip to content

Commit bbdefac

Browse files
authored
string dtype fixes (#3170)
* wip * change the v3 name of variablelengthstring to string * make v3 regression testing for string work properly * pin 3.0.8 regression tests to python 3.12 * don't check for a off-spec warning in variablelengthutf8 * add 0-length-is-illegal tests * changelog * lint
1 parent 27615fd commit bbdefac

File tree

12 files changed

+275
-38
lines changed

12 files changed

+275
-38
lines changed

changes/3170.bugfix.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Fixes a variety of issues related to string data types.
2+
3+
- Brings the ``VariableLengthUTF8`` data type Zarr V3 identifier in alignment with Zarr Python 3.0.8
4+
- Disallows creation of 0-length fixed-length data types
5+
- Adds a regression test for the ``VariableLengthUTF8`` data type that checks against version 3.0.8
6+
- Allows users to request the ``VariableLengthUTF8`` data type with ``str``, ``"str"``, or ``"string"``.

src/zarr/core/dtype/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,12 @@
109109
VariableLengthBytes,
110110
)
111111

112+
# These are aliases for variable-length UTF-8 strings
113+
# We handle them when a user requests a data type instead of using NumPy's dtype inferece because
114+
# the default NumPy behavior -- to inspect the user-provided array data and choose
115+
# an appropriately sized U dtype -- is unworkable for Zarr.
116+
VLEN_UTF8_ALIAS: Final = ("str", str, "string")
117+
112118
# This type models inputs that can be coerced to a ZDType
113119
ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str
114120

@@ -157,6 +163,10 @@ def parse_data_type(
157163
# dict and zarr_format 3 means that we have a JSON object representation of the dtype
158164
if zarr_format == 3 and isinstance(dtype_spec, Mapping):
159165
return get_data_type_from_json(dtype_spec, zarr_format=3)
166+
if dtype_spec in VLEN_UTF8_ALIAS:
167+
# If the dtype request is one of the aliases for variable-length UTF-8 strings,
168+
# return that dtype.
169+
return VariableLengthUTF8() # type: ignore[return-value]
160170
# otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
161171
# we can create a numpy dtype from it, and do the dtype inference from that
162172
return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type]

src/zarr/core/dtype/npy/bytes.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLengt
3737
dtype_cls = np.dtypes.BytesDType
3838
_zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes"
3939

40+
def __post_init__(self) -> None:
41+
"""
42+
We don't allow instances of this class with length less than 1 because there is no way such
43+
a data type can contain actual data.
44+
"""
45+
if self.length < 1:
46+
raise ValueError(f"length must be >= 1, got {self.length}.")
47+
4048
@classmethod
4149
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
4250
if cls._check_native_dtype(dtype):
@@ -155,6 +163,14 @@ class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize
155163
dtype_cls = np.dtypes.VoidDType # type: ignore[assignment]
156164
_zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes"
157165

166+
def __post_init__(self) -> None:
167+
"""
168+
We don't allow instances of this class with length less than 1 because there is no way such
169+
a data type can contain actual data.
170+
"""
171+
if self.length < 1:
172+
raise ValueError(f"length must be >= 1, got {self.length}.")
173+
158174
@classmethod
159175
def _check_native_dtype(
160176
cls: type[Self], dtype: TBaseDType

src/zarr/core/dtype/npy/string.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,14 @@ class FixedLengthUTF32(
6363
_zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32"
6464
code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point
6565

66+
def __post_init__(self) -> None:
67+
"""
68+
We don't allow instances of this class with length less than 1 because there is no way such
69+
a data type can contain actual data.
70+
"""
71+
if self.length < 1:
72+
raise ValueError(f"length must be >= 1, got {self.length}.")
73+
6674
@classmethod
6775
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
6876
if cls._check_native_dtype(dtype):
@@ -195,7 +203,7 @@ class UTF8Base(ZDType[TDType_co, str], HasObjectCodec):
195203
as data type, but as a base class for other variable length string data types.
196204
"""
197205

198-
_zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
206+
_zarr_v3_name: ClassVar[Literal["string"]] = "string"
199207
object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
200208

201209
@classmethod
@@ -222,7 +230,7 @@ def _check_json_v2(
222230
)
223231

224232
@classmethod
225-
def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]:
233+
def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["string"]]:
226234
return data == cls._zarr_v3_name
227235

228236
@classmethod
@@ -246,15 +254,14 @@ def to_json(
246254
self, zarr_format: Literal[2]
247255
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]: ...
248256
@overload
249-
def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ...
257+
def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ...
250258

251259
def to_json(
252260
self, zarr_format: ZarrFormat
253-
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"]:
261+
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["string"]:
254262
if zarr_format == 2:
255263
return {"name": "|O", "object_codec_id": self.object_codec_id}
256264
elif zarr_format == 3:
257-
v3_unstable_dtype_warning(self)
258265
return self._zarr_v3_name
259266
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
260267

src/zarr/core/dtype/npy/structured.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize):
3737
_zarr_v3_name = "structured"
3838
fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...]
3939

40+
def __post_init__(self) -> None:
41+
if len(self.fields) < 1:
42+
raise ValueError(f"must have at least one field. Got {self.fields!r}")
43+
4044
@classmethod
4145
def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]:
4246
"""

tests/test_array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from zarr.core.chunk_grids import _auto_partition
4242
from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams
4343
from zarr.core.common import JSON, MemoryOrder, ZarrFormat
44-
from zarr.core.dtype import get_data_type_from_native_dtype
44+
from zarr.core.dtype import parse_data_type
4545
from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
4646
from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
4747
from zarr.core.dtype.npy.float import Float32, Float64
@@ -1285,7 +1285,7 @@ async def test_v2_chunk_encoding(
12851285
filters=filters,
12861286
)
12871287
filters_expected, compressor_expected = _parse_chunk_encoding_v2(
1288-
filters=filters, compressor=compressors, dtype=get_data_type_from_native_dtype(dtype)
1288+
filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2)
12891289
)
12901290
assert arr.metadata.zarr_format == 2 # guard for mypy
12911291
assert arr.metadata.compressor == compressor_expected

tests/test_dtype/test_npy/test_bytes.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ class TestNullTerminatedBytes(BaseTestZDType):
1515
np.dtype("|U10"),
1616
)
1717
valid_json_v2 = (
18-
{"name": "|S0", "object_codec_id": None},
18+
{"name": "|S1", "object_codec_id": None},
1919
{"name": "|S2", "object_codec_id": None},
2020
{"name": "|S4", "object_codec_id": None},
2121
)
@@ -31,22 +31,22 @@ class TestNullTerminatedBytes(BaseTestZDType):
3131
)
3232

3333
scalar_v2_params = (
34-
(NullTerminatedBytes(length=0), ""),
34+
(NullTerminatedBytes(length=1), "MA=="),
3535
(NullTerminatedBytes(length=2), "YWI="),
3636
(NullTerminatedBytes(length=4), "YWJjZA=="),
3737
)
3838
scalar_v3_params = (
39-
(NullTerminatedBytes(length=0), ""),
39+
(NullTerminatedBytes(length=1), "MA=="),
4040
(NullTerminatedBytes(length=2), "YWI="),
4141
(NullTerminatedBytes(length=4), "YWJjZA=="),
4242
)
4343
cast_value_params = (
44-
(NullTerminatedBytes(length=0), "", np.bytes_("")),
44+
(NullTerminatedBytes(length=1), "", np.bytes_("")),
4545
(NullTerminatedBytes(length=2), "ab", np.bytes_("ab")),
4646
(NullTerminatedBytes(length=4), "abcdefg", np.bytes_("abcd")),
4747
)
4848
item_size_params = (
49-
NullTerminatedBytes(length=0),
49+
NullTerminatedBytes(length=1),
5050
NullTerminatedBytes(length=4),
5151
NullTerminatedBytes(length=10),
5252
)
@@ -62,7 +62,7 @@ class TestRawBytes(BaseTestZDType):
6262
)
6363
valid_json_v2 = ({"name": "|V10", "object_codec_id": None},)
6464
valid_json_v3 = (
65-
{"name": "raw_bytes", "configuration": {"length_bytes": 0}},
65+
{"name": "raw_bytes", "configuration": {"length_bytes": 1}},
6666
{"name": "raw_bytes", "configuration": {"length_bytes": 8}},
6767
)
6868

@@ -77,22 +77,22 @@ class TestRawBytes(BaseTestZDType):
7777
)
7878

7979
scalar_v2_params = (
80-
(RawBytes(length=0), ""),
80+
(RawBytes(length=1), "AA=="),
8181
(RawBytes(length=2), "YWI="),
8282
(RawBytes(length=4), "YWJjZA=="),
8383
)
8484
scalar_v3_params = (
85-
(RawBytes(length=0), ""),
85+
(RawBytes(length=1), "AA=="),
8686
(RawBytes(length=2), "YWI="),
8787
(RawBytes(length=4), "YWJjZA=="),
8888
)
8989
cast_value_params = (
90-
(RawBytes(length=0), b"", np.void(b"")),
90+
(RawBytes(length=1), b"\x00", np.void(b"\x00")),
9191
(RawBytes(length=2), b"ab", np.void(b"ab")),
9292
(RawBytes(length=4), b"abcd", np.void(b"abcd")),
9393
)
9494
item_size_params = (
95-
RawBytes(length=0),
95+
RawBytes(length=1),
9696
RawBytes(length=4),
9797
RawBytes(length=10),
9898
)
@@ -152,3 +152,14 @@ def test_unstable_dtype_warning(
152152
"""
153153
with pytest.raises(UnstableSpecificationWarning):
154154
zdtype.to_json(zarr_format=3)
155+
156+
157+
@pytest.mark.parametrize("zdtype_cls", [NullTerminatedBytes, RawBytes])
158+
def test_invalid_size(zdtype_cls: type[NullTerminatedBytes] | type[RawBytes]) -> None:
159+
"""
160+
Test that it's impossible to create a data type that has no length
161+
"""
162+
length = 0
163+
msg = f"length must be >= 1, got {length}."
164+
with pytest.raises(ValueError, match=msg):
165+
zdtype_cls(length=length)

tests/test_dtype/test_npy/test_string.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class TestVariableLengthString(BaseTestZDType):
1919
np.dtype("|S10"),
2020
)
2121
valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
22-
valid_json_v3 = ("variable_length_utf8",)
22+
valid_json_v3 = ("string",)
2323
invalid_json_v2 = (
2424
"|S10",
2525
"|f8",
@@ -53,7 +53,7 @@ class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef]
5353
np.dtype("|S10"),
5454
)
5555
valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
56-
valid_json_v3 = ("variable_length_utf8",)
56+
valid_json_v3 = ("string",)
5757
invalid_json_v2 = (
5858
"|S10",
5959
"|f8",
@@ -101,30 +101,45 @@ class TestFixedLengthUTF32(BaseTestZDType):
101101
{"name": "numpy.fixed_length_utf32", "configuration": {"length_bits": "invalid"}},
102102
)
103103

104-
scalar_v2_params = ((FixedLengthUTF32(length=0), ""), (FixedLengthUTF32(length=2), "hi"))
104+
scalar_v2_params = ((FixedLengthUTF32(length=1), ""), (FixedLengthUTF32(length=2), "hi"))
105105
scalar_v3_params = (
106-
(FixedLengthUTF32(length=0), ""),
106+
(FixedLengthUTF32(length=1), ""),
107107
(FixedLengthUTF32(length=2), "hi"),
108108
(FixedLengthUTF32(length=4), "hihi"),
109109
)
110110

111111
cast_value_params = (
112-
(FixedLengthUTF32(length=0), "", np.str_("")),
112+
(FixedLengthUTF32(length=1), "", np.str_("")),
113113
(FixedLengthUTF32(length=2), "hi", np.str_("hi")),
114114
(FixedLengthUTF32(length=4), "hihi", np.str_("hihi")),
115115
)
116116
item_size_params = (
117-
FixedLengthUTF32(length=0),
117+
FixedLengthUTF32(length=1),
118118
FixedLengthUTF32(length=4),
119119
FixedLengthUTF32(length=10),
120120
)
121121

122122

123-
@pytest.mark.parametrize("zdtype", [FixedLengthUTF32(length=10), VariableLengthUTF8()])
123+
@pytest.mark.parametrize(
124+
"zdtype",
125+
[
126+
FixedLengthUTF32(length=10),
127+
],
128+
)
124129
def test_unstable_dtype_warning(zdtype: FixedLengthUTF32 | VariableLengthUTF8) -> None:
125130
"""
126131
Test that we get a warning when serializing a dtype without a zarr v3 spec to json
127132
when zarr_format is 3
128133
"""
129134
with pytest.raises(UnstableSpecificationWarning):
130135
zdtype.to_json(zarr_format=3)
136+
137+
138+
def test_invalid_size() -> None:
139+
"""
140+
Test that it's impossible to create a data type that has no length
141+
"""
142+
length = 0
143+
msg = f"length must be >= 1, got {length}."
144+
with pytest.raises(ValueError, match=msg):
145+
FixedLengthUTF32(length=length)

tests/test_dtype/test_npy/test_structured.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import Any
44

55
import numpy as np
6+
import pytest
67

78
from tests.test_dtype.test_wrapper import BaseTestZDType
89
from zarr.core.dtype import (
@@ -106,3 +107,13 @@ def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool:
106107
Structured(fields=(("field1", Int32()), ("field2", Float64()))),
107108
Structured(fields=(("field1", Int64()), ("field2", Int32()))),
108109
)
110+
111+
112+
def test_invalid_size() -> None:
113+
"""
114+
Test that it's impossible to create a data type that has no fields
115+
"""
116+
fields = ()
117+
msg = f"must have at least one field. Got {fields!r}"
118+
with pytest.raises(ValueError, match=msg):
119+
Structured(fields=fields)

tests/test_dtype_registry.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
Int16,
2222
TBaseDType,
2323
TBaseScalar,
24+
VariableLengthUTF8,
2425
ZDType,
2526
data_type_registry,
2627
get_data_type_from_json,
@@ -176,6 +177,8 @@ def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None:
176177
@pytest.mark.parametrize(
177178
("dtype_params", "expected", "zarr_format"),
178179
[
180+
("str", VariableLengthUTF8(), 2),
181+
("str", VariableLengthUTF8(), 3),
179182
("int8", Int8(), 3),
180183
(Int8(), Int8(), 3),
181184
(">i2", Int16(endianness="big"), 2),

0 commit comments

Comments
 (0)