Skip to content

Commit c0e39af

Browse files
authored
remove chunk encoding from config (#3228)
* refactor default chunk encoding to skip config. add tests for deprecated config keys * remove chunk encoding configuration from docs * don't create invalid string dtype arrays in test * add v2-style error when creating a vlen dtype without the right codec * test for v2-style error when creating an object array without an object codec * lint * changelog * Update 3228.removal.rst * test coverage
1 parent ded59d9 commit c0e39af

File tree

8 files changed

+286
-175
lines changed

8 files changed

+286
-175
lines changed

changes/3228.removal.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
Removes default chunk encoding settings (filters, serializer, compressors) from the global
2+
configuration object.
3+
4+
This removal is justified on the basis that storing chunk encoding settings in the config required
5+
a brittle, confusing, and inaccurate categorization of array data types, which was particularly
6+
unsuitable after the recent addition of new data types that didn't fit naturally into the
7+
pre-existing categories.
8+
9+
The default chunk encoding is the same (Zstandard compression, and the required object codecs for
10+
variable length data types), but the chunk encoding is now generated by functions that cannot be
11+
reconfigured at runtime. Users who relied on setting the default chunk encoding via the global configuration object should
12+
instead specify the desired chunk encoding explicitly when creating an array.
13+
14+
This change also adds an extra validation step to the creation of Zarr V2 arrays, which ensures that
15+
arrays with a ``VariableLengthUTF8`` or ``VariableLengthBytes`` data type cannot be created without the
16+
correct "object codec".

docs/user-guide/arrays.rst

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -246,16 +246,6 @@ built-in delta filter::
246246
>>> z.compressors
247247
(LZMA(codec_name='numcodecs.lzma', codec_config={'filters': [{'id': 3, 'dist': 4}, {'id': 33, 'preset': 1}]}),)
248248

249-
The default compressor can be changed using Zarr's
250-
:ref:`user-guide-config`, e.g.::
251-
252-
>>> with zarr.config.set({'array.v2_default_compressor.default': {'id': 'blosc'}}):
253-
... z = zarr.create_array(store={}, shape=(100000000,), chunks=(1000000,), dtype='int32', zarr_format=2)
254-
>>> z.filters
255-
()
256-
>>> z.compressors
257-
(Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),)
258-
259249
To disable compression, set ``compressors=None`` when creating an array, e.g.::
260250

261251
>>> z = zarr.create_array(store='data/example-8.zarr', shape=(100000000,), chunks=(1000000,), dtype='int32', compressors=None)

docs/user-guide/config.rst

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -43,25 +43,7 @@ This is the current default configuration::
4343

4444
>>> zarr.config.pprint()
4545
{'array': {'order': 'C',
46-
'v2_default_compressor': {'default': {'checksum': False,
47-
'id': 'zstd',
48-
'level': 0},
49-
'variable-length-string': {'checksum': False,
50-
'id': 'zstd',
51-
'level': 0}},
52-
'v2_default_filters': {'default': None,
53-
'variable-length-string': [{'id': 'vlen-utf8'}]},
54-
'v3_default_compressors': {'default': [{'configuration': {'checksum': False,
55-
'level': 0},
56-
'name': 'zstd'}],
57-
'variable-length-string': [{'configuration': {'checksum': False,
58-
'level': 0},
59-
'name': 'zstd'}]},
60-
'v3_default_filters': {'default': [], 'variable-length-string': []},
61-
'v3_default_serializer': {'default': {'configuration': {'endian': 'little'},
62-
'name': 'bytes'},
63-
'variable-length-string': {'name': 'vlen-utf8'}},
64-
'write_empty_chunks': False},
46+
'write_empty_chunks': False},
6547
'async': {'concurrency': 10, 'timeout': None},
6648
'buffer': 'zarr.buffer.cpu.Buffer',
6749
'codec_pipeline': {'batch_size': 1,

src/zarr/core/array.py

Lines changed: 117 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
from zarr.abc.store import Store, set_or_delete
3131
from zarr.codecs._v2 import V2Codec
3232
from zarr.codecs.bytes import BytesCodec
33+
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
34+
from zarr.codecs.zstd import ZstdCodec
3335
from zarr.core._info import ArrayInfo
3436
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config
3537
from zarr.core.attributes import Attributes
@@ -68,11 +70,13 @@
6870
from zarr.core.config import categorize_data_type
6971
from zarr.core.config import config as zarr_config
7072
from zarr.core.dtype import (
73+
VariableLengthBytes,
74+
VariableLengthUTF8,
7175
ZDType,
7276
ZDTypeLike,
7377
parse_data_type,
7478
)
75-
from zarr.core.dtype.common import HasEndianness, HasItemSize
79+
from zarr.core.dtype.common import HasEndianness, HasItemSize, HasObjectCodec
7680
from zarr.core.indexing import (
7781
BasicIndexer,
7882
BasicSelection,
@@ -109,6 +113,7 @@
109113
)
110114
from zarr.core.metadata.v2 import (
111115
CompressorLikev2,
116+
get_object_codec_id,
112117
parse_compressor,
113118
parse_filters,
114119
)
@@ -710,7 +715,10 @@ def _create_metadata_v3(
710715

711716
shape = parse_shapelike(shape)
712717
if codecs is None:
713-
filters, serializer, compressors = _get_default_chunk_encoding_v3(dtype)
718+
filters = default_filters_v3(dtype)
719+
serializer = default_serializer_v3(dtype)
720+
compressors = default_compressors_v3(dtype)
721+
714722
codecs_parsed = (*filters, serializer, *compressors)
715723
else:
716724
codecs_parsed = tuple(codecs)
@@ -850,10 +858,9 @@ async def _create_v2(
850858
else:
851859
await ensure_no_existing_node(store_path, zarr_format=2)
852860

853-
default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
854861
compressor_parsed: CompressorLikev2
855862
if compressor == "auto":
856-
compressor_parsed = default_compressor
863+
compressor_parsed = default_compressor_v2(dtype)
857864
elif isinstance(compressor, BytesBytesCodec):
858865
raise ValueError(
859866
"Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. "
@@ -863,7 +870,7 @@ async def _create_v2(
863870
compressor_parsed = compressor
864871

865872
if filters is None:
866-
filters = default_filters
873+
filters = default_filters_v2(dtype)
867874

868875
metadata = cls._create_metadata_v2(
869876
shape=shape,
@@ -4654,19 +4661,80 @@ def _get_default_chunk_encoding_v3(
46544661
)
46554662

46564663

4657-
def _get_default_chunk_encoding_v2(
4658-
dtype: ZDType[TBaseDType, TBaseScalar],
4659-
) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]:
4664+
def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]:
46604665
"""
4661-
Get the default chunk encoding for Zarr format 2 arrays, given a dtype
4666+
Given a data type, return the default filters for that data type.
4667+
4668+
This is an empty tuple. No data types have default filters.
46624669
"""
4663-
dtype_category = categorize_data_type(dtype)
4664-
filters = zarr_config.get("array.v2_default_filters").get(dtype_category)
4665-
compressor = zarr_config.get("array.v2_default_compressor").get(dtype_category)
4666-
if filters is not None:
4667-
filters = tuple(numcodecs.get_codec(f) for f in filters)
4670+
return ()
4671+
4672+
4673+
def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]:
4674+
"""
4675+
Given a data type, return the default compressors for that data type.
4676+
4677+
This is just a tuple containing ``ZstdCodec``
4678+
"""
4679+
return (ZstdCodec(),)
4680+
4681+
4682+
def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
4683+
"""
4684+
Given a data type, return the default serializer for that data type.
4685+
4686+
The default serializer for most data types is the ``BytesCodec``, which may or may not be
4687+
parameterized with an endianness, depending on whether the data type has endianness. Variable
4688+
length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and
4689+
``VLenBytesCodec``, respectively.
4690+
4691+
"""
4692+
serializer: ArrayBytesCodec = BytesCodec(endian=None)
4693+
4694+
if isinstance(dtype, HasEndianness):
4695+
serializer = BytesCodec(endian="little")
4696+
elif isinstance(dtype, HasObjectCodec):
4697+
if dtype.object_codec_id == "vlen-bytes":
4698+
serializer = VLenBytesCodec()
4699+
elif dtype.object_codec_id == "vlen-utf8":
4700+
serializer = VLenUTF8Codec()
4701+
else:
4702+
msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
4703+
raise ValueError(msg)
4704+
return serializer
4705+
4706+
4707+
def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None:
4708+
"""
4709+
Given a data type, return the default filters for that data type.
4710+
4711+
For data types that require an object codec, namely variable length data types,
4712+
this is a tuple containing the object codec. Otherwise it's ``None``.
4713+
"""
4714+
if isinstance(dtype, HasObjectCodec):
4715+
if dtype.object_codec_id == "vlen-bytes":
4716+
from numcodecs import VLenBytes
46684717

4669-
return filters, numcodecs.get_codec(compressor)
4718+
return (VLenBytes(),)
4719+
elif dtype.object_codec_id == "vlen-utf8":
4720+
from numcodecs import VLenUTF8
4721+
4722+
return (VLenUTF8(),)
4723+
else:
4724+
msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
4725+
raise ValueError(msg)
4726+
return None
4727+
4728+
4729+
def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec:
4730+
"""
4731+
Given a data type, return the default compressor for that data type.
4732+
4733+
This is just the numcodecs ``Zstd`` codec.
4734+
"""
4735+
from numcodecs import Zstd
4736+
4737+
return Zstd(level=0, checksum=False)
46704738

46714739

46724740
def _parse_chunk_encoding_v2(
@@ -4678,14 +4746,13 @@ def _parse_chunk_encoding_v2(
46784746
"""
46794747
Generate chunk encoding classes for Zarr format 2 arrays with optional defaults.
46804748
"""
4681-
default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype)
46824749
_filters: tuple[numcodecs.abc.Codec, ...] | None
46834750
_compressor: numcodecs.abc.Codec | None
46844751

46854752
if compressor is None or compressor == ():
46864753
_compressor = None
46874754
elif compressor == "auto":
4688-
_compressor = default_compressor
4755+
_compressor = default_compressor_v2(dtype)
46894756
elif isinstance(compressor, tuple | list) and len(compressor) == 1:
46904757
_compressor = parse_compressor(compressor[0])
46914758
else:
@@ -4697,7 +4764,7 @@ def _parse_chunk_encoding_v2(
46974764
if filters is None:
46984765
_filters = None
46994766
elif filters == "auto":
4700-
_filters = default_filters
4767+
_filters = default_filters_v2(dtype)
47014768
else:
47024769
if isinstance(filters, Iterable):
47034770
for idx, f in enumerate(filters):
@@ -4708,7 +4775,33 @@ def _parse_chunk_encoding_v2(
47084775
)
47094776
raise TypeError(msg)
47104777
_filters = parse_filters(filters)
4711-
4778+
if isinstance(dtype, HasObjectCodec):
4779+
# check the filters and the compressor for the object codec required for this data type
4780+
if _filters is None:
4781+
if _compressor is None:
4782+
object_codec_id = None
4783+
else:
4784+
object_codec_id = get_object_codec_id((_compressor.get_config(),))
4785+
else:
4786+
object_codec_id = get_object_codec_id(
4787+
(
4788+
*[f.get_config() for f in _filters],
4789+
_compressor.get_config() if _compressor is not None else None,
4790+
)
4791+
)
4792+
if object_codec_id is None:
4793+
if isinstance(dtype, VariableLengthUTF8): # type: ignore[unreachable]
4794+
codec_name = "the numcodecs.VLenUTF8 codec" # type: ignore[unreachable]
4795+
elif isinstance(dtype, VariableLengthBytes): # type: ignore[unreachable]
4796+
codec_name = "the numcodecs.VLenBytes codec" # type: ignore[unreachable]
4797+
else:
4798+
codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}"
4799+
msg = (
4800+
f"Data type {dtype} requires {codec_name}, "
4801+
"but no such codec was specified in the filters or compressor parameters for "
4802+
"this array. "
4803+
)
4804+
raise ValueError(msg)
47124805
return _filters, _compressor
47134806

47144807

@@ -4722,14 +4815,11 @@ def _parse_chunk_encoding_v3(
47224815
"""
47234816
Generate chunk encoding classes for v3 arrays with optional defaults.
47244817
"""
4725-
default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3(
4726-
dtype
4727-
)
47284818

47294819
if filters is None:
47304820
out_array_array: tuple[ArrayArrayCodec, ...] = ()
47314821
elif filters == "auto":
4732-
out_array_array = default_array_array
4822+
out_array_array = default_filters_v3(dtype)
47334823
else:
47344824
maybe_array_array: Iterable[Codec | dict[str, JSON]]
47354825
if isinstance(filters, dict | Codec):
@@ -4739,7 +4829,7 @@ def _parse_chunk_encoding_v3(
47394829
out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array)
47404830

47414831
if serializer == "auto":
4742-
out_array_bytes = default_array_bytes
4832+
out_array_bytes = default_serializer_v3(dtype)
47434833
else:
47444834
# TODO: ensure that the serializer is compatible with the ndarray produced by the
47454835
# array-array codecs. For example, if a sequence of array-array codecs produces an
@@ -4749,7 +4839,7 @@ def _parse_chunk_encoding_v3(
47494839
if compressors is None:
47504840
out_bytes_bytes: tuple[BytesBytesCodec, ...] = ()
47514841
elif compressors == "auto":
4752-
out_bytes_bytes = default_bytes_bytes
4842+
out_bytes_bytes = default_compressors_v3(dtype)
47534843
else:
47544844
maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]]
47554845
if isinstance(compressors, dict | Codec):
@@ -4759,17 +4849,11 @@ def _parse_chunk_encoding_v3(
47594849

47604850
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
47614851

4762-
# specialize codecs as needed given the dtype
4763-
4764-
# TODO: refactor so that the config only contains the name of the codec, and we use the dtype
4765-
# to create the codec instance, instead of storing a dict representation of a full codec.
4766-
47674852
# TODO: ensure that the serializer is compatible with the ndarray produced by the
47684853
# array-array codecs. For example, if a sequence of array-array codecs produces an
47694854
# array with a single-byte data type, then the serializer should not specify endianness.
4770-
if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness):
4771-
# The default endianness in the bytescodec might not be None, so we need to replace it
4772-
out_array_bytes = replace(out_array_bytes, endian=None)
4855+
4856+
# TODO: add checks to ensure that the right serializer is used for vlen data types
47734857
return out_array_array, out_array_bytes, out_bytes_bytes
47744858

47754859

src/zarr/core/config.py

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,25 @@ def enable_gpu(self) -> ConfigSet:
7878
)
7979

8080

81+
# these keys were removed from the config as part of the 3.1.0 release.
82+
# these deprecations should be removed in 3.1.1 or thereabouts.
83+
deprecations = {
84+
"array.v2_default_compressor.numeric": None,
85+
"array.v2_default_compressor.string": None,
86+
"array.v2_default_compressor.bytes": None,
87+
"array.v2_default_filters.string": None,
88+
"array.v2_default_filters.bytes": None,
89+
"array.v3_default_filters.numeric": None,
90+
"array.v3_default_filters.raw": None,
91+
"array.v3_default_filters.bytes": None,
92+
"array.v3_default_serializer.numeric": None,
93+
"array.v3_default_serializer.string": None,
94+
"array.v3_default_serializer.bytes": None,
95+
"array.v3_default_compressors.string": None,
96+
"array.v3_default_compressors.bytes": None,
97+
"array.v3_default_compressors": None,
98+
}
99+
81100
# The default configuration for zarr
82101
config = Config(
83102
"zarr",
@@ -87,27 +106,6 @@ def enable_gpu(self) -> ConfigSet:
87106
"array": {
88107
"order": "C",
89108
"write_empty_chunks": False,
90-
"v2_default_compressor": {
91-
"default": {"id": "zstd", "level": 0, "checksum": False},
92-
"variable-length-string": {"id": "zstd", "level": 0, "checksum": False},
93-
},
94-
"v2_default_filters": {
95-
"default": None,
96-
"variable-length-string": [{"id": "vlen-utf8"}],
97-
},
98-
"v3_default_filters": {"default": [], "variable-length-string": []},
99-
"v3_default_serializer": {
100-
"default": {"name": "bytes", "configuration": {"endian": "little"}},
101-
"variable-length-string": {"name": "vlen-utf8"},
102-
},
103-
"v3_default_compressors": {
104-
"default": [
105-
{"name": "zstd", "configuration": {"level": 0, "checksum": False}},
106-
],
107-
"variable-length-string": [
108-
{"name": "zstd", "configuration": {"level": 0, "checksum": False}}
109-
],
110-
},
111109
},
112110
"async": {"concurrency": 10, "timeout": None},
113111
"threading": {"max_workers": None},
@@ -132,6 +130,7 @@ def enable_gpu(self) -> ConfigSet:
132130
"ndbuffer": "zarr.buffer.cpu.NDBuffer",
133131
}
134132
],
133+
deprecations=deprecations,
135134
)
136135

137136

0 commit comments

Comments
 (0)