30
30
from zarr .abc .store import Store , set_or_delete
31
31
from zarr .codecs ._v2 import V2Codec
32
32
from zarr .codecs .bytes import BytesCodec
33
+ from zarr .codecs .vlen_utf8 import VLenBytesCodec , VLenUTF8Codec
34
+ from zarr .codecs .zstd import ZstdCodec
33
35
from zarr .core ._info import ArrayInfo
34
36
from zarr .core .array_spec import ArrayConfig , ArrayConfigLike , parse_array_config
35
37
from zarr .core .attributes import Attributes
68
70
from zarr .core .config import categorize_data_type
69
71
from zarr .core .config import config as zarr_config
70
72
from zarr .core .dtype import (
73
+ VariableLengthBytes ,
74
+ VariableLengthUTF8 ,
71
75
ZDType ,
72
76
ZDTypeLike ,
73
77
parse_data_type ,
74
78
)
75
- from zarr .core .dtype .common import HasEndianness , HasItemSize
79
+ from zarr .core .dtype .common import HasEndianness , HasItemSize , HasObjectCodec
76
80
from zarr .core .indexing import (
77
81
BasicIndexer ,
78
82
BasicSelection ,
109
113
)
110
114
from zarr .core .metadata .v2 import (
111
115
CompressorLikev2 ,
116
+ get_object_codec_id ,
112
117
parse_compressor ,
113
118
parse_filters ,
114
119
)
@@ -710,7 +715,10 @@ def _create_metadata_v3(
710
715
711
716
shape = parse_shapelike (shape )
712
717
if codecs is None :
713
- filters , serializer , compressors = _get_default_chunk_encoding_v3 (dtype )
718
+ filters = default_filters_v3 (dtype )
719
+ serializer = default_serializer_v3 (dtype )
720
+ compressors = default_compressors_v3 (dtype )
721
+
714
722
codecs_parsed = (* filters , serializer , * compressors )
715
723
else :
716
724
codecs_parsed = tuple (codecs )
@@ -850,10 +858,9 @@ async def _create_v2(
850
858
else :
851
859
await ensure_no_existing_node (store_path , zarr_format = 2 )
852
860
853
- default_filters , default_compressor = _get_default_chunk_encoding_v2 (dtype )
854
861
compressor_parsed : CompressorLikev2
855
862
if compressor == "auto" :
856
- compressor_parsed = default_compressor
863
+ compressor_parsed = default_compressor_v2 ( dtype )
857
864
elif isinstance (compressor , BytesBytesCodec ):
858
865
raise ValueError (
859
866
"Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. "
@@ -863,7 +870,7 @@ async def _create_v2(
863
870
compressor_parsed = compressor
864
871
865
872
if filters is None :
866
- filters = default_filters
873
+ filters = default_filters_v2 ( dtype )
867
874
868
875
metadata = cls ._create_metadata_v2 (
869
876
shape = shape ,
@@ -4654,19 +4661,80 @@ def _get_default_chunk_encoding_v3(
4654
4661
)
4655
4662
4656
4663
4657
- def _get_default_chunk_encoding_v2 (
4658
- dtype : ZDType [TBaseDType , TBaseScalar ],
4659
- ) -> tuple [tuple [numcodecs .abc .Codec , ...] | None , numcodecs .abc .Codec | None ]:
4664
def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]:
    """
    Given a data type, return the default filters for that data type.

    This is an empty tuple. No data types have default filters.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. The result does not depend on it.

    Returns
    -------
    tuple[ArrayArrayCodec, ...]
        Always the empty tuple.
    """
    return ()
4671
+
4672
+
4673
def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]:
    """
    Given a data type, return the default compressors for that data type.

    This is just a tuple containing ``ZstdCodec``.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. The result does not depend on it.

    Returns
    -------
    tuple[BytesBytesCodec, ...]
        A one-element tuple holding a ``ZstdCodec`` constructed with its default arguments.
    """
    return (ZstdCodec(),)
4680
+
4681
+
4682
def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
    """
    Given a data type, return the default serializer for that data type.

    The default serializer for most data types is the ``BytesCodec``, which may or may not be
    parameterized with an endianness, depending on whether the data type has endianness. Variable
    length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and
    ``VLenBytesCodec``, respectively.

    Parameters
    ----------
    dtype : ZDType
        The data type to pick a serializer for.

    Returns
    -------
    ArrayBytesCodec
        ``BytesCodec(endian="little")`` for data types with endianness, ``VLenBytesCodec()`` or
        ``VLenUTF8Codec()`` for data types whose object codec id is ``"vlen-bytes"`` or
        ``"vlen-utf8"``, and ``BytesCodec(endian=None)`` otherwise.

    Raises
    ------
    ValueError
        If the data type requires an object codec whose id is not recognized here.
    """
    # The endianness check takes priority over the object-codec check.
    if isinstance(dtype, HasEndianness):
        return BytesCodec(endian="little")

    if isinstance(dtype, HasObjectCodec):
        if dtype.object_codec_id == "vlen-bytes":
            return VLenBytesCodec()
        if dtype.object_codec_id == "vlen-utf8":
            return VLenUTF8Codec()
        msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
        raise ValueError(msg)

    # Neither endianness nor an object codec: plain bytes with no byte order.
    return BytesCodec(endian=None)
4705
+
4706
+
4707
def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None:
    """
    Given a data type, return the default filters for that data type.

    For data types that require an object codec, namely variable length data types,
    this is a tuple containing the object codec. Otherwise it's ``None``.

    Parameters
    ----------
    dtype : ZDType
        The data type to pick default filters for.

    Returns
    -------
    tuple[numcodecs.abc.Codec] | None
        ``(VLenBytes(),)`` or ``(VLenUTF8(),)`` when the data type's object codec id is
        ``"vlen-bytes"`` or ``"vlen-utf8"``; ``None`` when the data type needs no object codec.

    Raises
    ------
    ValueError
        If the data type requires an object codec whose id is not recognized here.
    """
    if not isinstance(dtype, HasObjectCodec):
        return None

    # The numcodecs classes are imported only in the branch that needs them.
    if dtype.object_codec_id == "vlen-bytes":
        from numcodecs import VLenBytes

        return (VLenBytes(),)
    if dtype.object_codec_id == "vlen-utf8":
        from numcodecs import VLenUTF8

        return (VLenUTF8(),)

    msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}."
    raise ValueError(msg)
4727
+
4728
+
4729
def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec:
    """
    Given a data type, return the default compressor for that data type.

    This is just the numcodecs ``Zstd`` codec.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. The result does not depend on it.

    Returns
    -------
    numcodecs.abc.Codec
        A ``numcodecs.Zstd`` instance created with ``level=0`` and ``checksum=False``.
    """
    from numcodecs import Zstd

    return Zstd(level=0, checksum=False)
4670
4738
4671
4739
4672
4740
def _parse_chunk_encoding_v2 (
@@ -4678,14 +4746,13 @@ def _parse_chunk_encoding_v2(
4678
4746
"""
4679
4747
Generate chunk encoding classes for Zarr format 2 arrays with optional defaults.
4680
4748
"""
4681
- default_filters , default_compressor = _get_default_chunk_encoding_v2 (dtype )
4682
4749
_filters : tuple [numcodecs .abc .Codec , ...] | None
4683
4750
_compressor : numcodecs .abc .Codec | None
4684
4751
4685
4752
if compressor is None or compressor == ():
4686
4753
_compressor = None
4687
4754
elif compressor == "auto" :
4688
- _compressor = default_compressor
4755
+ _compressor = default_compressor_v2 ( dtype )
4689
4756
elif isinstance (compressor , tuple | list ) and len (compressor ) == 1 :
4690
4757
_compressor = parse_compressor (compressor [0 ])
4691
4758
else :
@@ -4697,7 +4764,7 @@ def _parse_chunk_encoding_v2(
4697
4764
if filters is None :
4698
4765
_filters = None
4699
4766
elif filters == "auto" :
4700
- _filters = default_filters
4767
+ _filters = default_filters_v2 ( dtype )
4701
4768
else :
4702
4769
if isinstance (filters , Iterable ):
4703
4770
for idx , f in enumerate (filters ):
@@ -4708,7 +4775,33 @@ def _parse_chunk_encoding_v2(
4708
4775
)
4709
4776
raise TypeError (msg )
4710
4777
_filters = parse_filters (filters )
4711
-
4778
+ if isinstance (dtype , HasObjectCodec ):
4779
+ # check the filters and the compressor for the object codec required for this data type
4780
+ if _filters is None :
4781
+ if _compressor is None :
4782
+ object_codec_id = None
4783
+ else :
4784
+ object_codec_id = get_object_codec_id ((_compressor .get_config (),))
4785
+ else :
4786
+ object_codec_id = get_object_codec_id (
4787
+ (
4788
+ * [f .get_config () for f in _filters ],
4789
+ _compressor .get_config () if _compressor is not None else None ,
4790
+ )
4791
+ )
4792
+ if object_codec_id is None :
4793
+ if isinstance (dtype , VariableLengthUTF8 ): # type: ignore[unreachable]
4794
+ codec_name = "the numcodecs.VLenUTF8 codec" # type: ignore[unreachable]
4795
+ elif isinstance (dtype , VariableLengthBytes ): # type: ignore[unreachable]
4796
+ codec_name = "the numcodecs.VLenBytes codec" # type: ignore[unreachable]
4797
+ else :
4798
+ codec_name = f"an unknown object codec with id { dtype .object_codec_id !r} "
4799
+ msg = (
4800
+ f"Data type { dtype } requires { codec_name } , "
4801
+ "but no such codec was specified in the filters or compressor parameters for "
4802
+ "this array. "
4803
+ )
4804
+ raise ValueError (msg )
4712
4805
return _filters , _compressor
4713
4806
4714
4807
@@ -4722,14 +4815,11 @@ def _parse_chunk_encoding_v3(
4722
4815
"""
4723
4816
Generate chunk encoding classes for v3 arrays with optional defaults.
4724
4817
"""
4725
- default_array_array , default_array_bytes , default_bytes_bytes = _get_default_chunk_encoding_v3 (
4726
- dtype
4727
- )
4728
4818
4729
4819
if filters is None :
4730
4820
out_array_array : tuple [ArrayArrayCodec , ...] = ()
4731
4821
elif filters == "auto" :
4732
- out_array_array = default_array_array
4822
+ out_array_array = default_filters_v3 ( dtype )
4733
4823
else :
4734
4824
maybe_array_array : Iterable [Codec | dict [str , JSON ]]
4735
4825
if isinstance (filters , dict | Codec ):
@@ -4739,7 +4829,7 @@ def _parse_chunk_encoding_v3(
4739
4829
out_array_array = tuple (_parse_array_array_codec (c ) for c in maybe_array_array )
4740
4830
4741
4831
if serializer == "auto" :
4742
- out_array_bytes = default_array_bytes
4832
+ out_array_bytes = default_serializer_v3 ( dtype )
4743
4833
else :
4744
4834
# TODO: ensure that the serializer is compatible with the ndarray produced by the
4745
4835
# array-array codecs. For example, if a sequence of array-array codecs produces an
@@ -4749,7 +4839,7 @@ def _parse_chunk_encoding_v3(
4749
4839
if compressors is None :
4750
4840
out_bytes_bytes : tuple [BytesBytesCodec , ...] = ()
4751
4841
elif compressors == "auto" :
4752
- out_bytes_bytes = default_bytes_bytes
4842
+ out_bytes_bytes = default_compressors_v3 ( dtype )
4753
4843
else :
4754
4844
maybe_bytes_bytes : Iterable [Codec | dict [str , JSON ]]
4755
4845
if isinstance (compressors , dict | Codec ):
@@ -4759,17 +4849,11 @@ def _parse_chunk_encoding_v3(
4759
4849
4760
4850
out_bytes_bytes = tuple (_parse_bytes_bytes_codec (c ) for c in maybe_bytes_bytes )
4761
4851
4762
- # specialize codecs as needed given the dtype
4763
-
4764
- # TODO: refactor so that the config only contains the name of the codec, and we use the dtype
4765
- # to create the codec instance, instead of storing a dict representation of a full codec.
4766
-
4767
4852
# TODO: ensure that the serializer is compatible with the ndarray produced by the
4768
4853
# array-array codecs. For example, if a sequence of array-array codecs produces an
4769
4854
# array with a single-byte data type, then the serializer should not specify endianness.
4770
- if isinstance (out_array_bytes , BytesCodec ) and not isinstance (dtype , HasEndianness ):
4771
- # The default endianness in the bytescodec might not be None, so we need to replace it
4772
- out_array_bytes = replace (out_array_bytes , endian = None )
4855
+
4856
+ # TODO: add checks to ensure that the right serializer is used for vlen data types
4773
4857
return out_array_array , out_array_bytes , out_bytes_bytes
4774
4858
4775
4859
0 commit comments