|
70 | 70 | from zarr.core.config import categorize_data_type |
71 | 71 | from zarr.core.config import config as zarr_config |
72 | 72 | from zarr.core.dtype import ( |
| 73 | + VariableLengthBytes, |
| 74 | + VariableLengthUTF8, |
73 | 75 | ZDType, |
74 | 76 | ZDTypeLike, |
75 | 77 | parse_data_type, |
|
111 | 113 | ) |
112 | 114 | from zarr.core.metadata.v2 import ( |
113 | 115 | CompressorLikev2, |
| 116 | + get_object_codec_id, |
114 | 117 | parse_compressor, |
115 | 118 | parse_filters, |
116 | 119 | ) |
@@ -4686,7 +4689,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: |
4686 | 4689 | ``VLenBytesCodec``, respectively. |
4687 | 4690 |
|
4688 | 4691 | """ |
4689 | | - serializer: ArrayBytesCodec = BytesCodec() |
| 4692 | + serializer: ArrayBytesCodec = BytesCodec(endian=None) |
4690 | 4693 |
|
4691 | 4694 | if isinstance(dtype, HasEndianness): |
4692 | 4695 | serializer = BytesCodec(endian="little") |
@@ -4772,7 +4775,33 @@ def _parse_chunk_encoding_v2( |
4772 | 4775 | ) |
4773 | 4776 | raise TypeError(msg) |
4774 | 4777 | _filters = parse_filters(filters) |
4775 | | - |
| 4778 | + if isinstance(dtype, HasObjectCodec): |
| 4779 | + # check the filters and the compressor for the object codec required for this data type |
| 4780 | + if _filters is None: |
| 4781 | + if _compressor is None: |
| 4782 | + object_codec_id = None |
| 4783 | + else: |
| 4784 | + object_codec_id = get_object_codec_id((_compressor.get_config(),)) |
| 4785 | + else: |
| 4786 | + object_codec_id = get_object_codec_id( |
| 4787 | + ( |
| 4788 | + *[f.get_config() for f in _filters], |
| 4789 | + _compressor.get_config() if _compressor is not None else None, |
| 4790 | + ) |
| 4791 | + ) |
| 4792 | + if object_codec_id is None: |
| 4793 | + if isinstance(dtype, VariableLengthUTF8): |
| 4794 | + codec_name = "the numcodecs.VLenUTF8 codec" |
| 4795 | + elif isinstance(dtype, VariableLengthBytes): |
| 4796 | + codec_name = "the numcodecs.VLenBytes codec" |
| 4797 | + else: |
| 4798 | + codec_name = "an unknown object codec" |
| 4799 | + msg = ( |
| 4800 | + f"Data type {dtype} requires {codec_name}, " |
| 4801 | + "but no such codec was specified in the filters or compressor parameters for " |
| 4802 | + "this array. " |
| 4803 | + ) |
| 4804 | + raise ValueError(msg) |
4776 | 4805 | return _filters, _compressor |
4777 | 4806 |
|
4778 | 4807 |
|
@@ -4820,17 +4849,11 @@ def _parse_chunk_encoding_v3( |
4820 | 4849 |
|
4821 | 4850 | out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) |
4822 | 4851 |
|
4823 | | - # specialize codecs as needed given the dtype |
4824 | | - |
4825 | | - # TODO: refactor so that the config only contains the name of the codec, and we use the dtype |
4826 | | - # to create the codec instance, instead of storing a dict representation of a full codec. |
4827 | | - |
4828 | 4852 | # TODO: ensure that the serializer is compatible with the ndarray produced by the |
4829 | 4853 | # array-array codecs. For example, if a sequence of array-array codecs produces an |
4830 | 4854 | # array with a single-byte data type, then the serializer should not specify endianness. |
4831 | | - if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness): |
4832 | | - # The default endianness in the bytescodec might not be None, so we need to replace it |
4833 | | - out_array_bytes = replace(out_array_bytes, endian=None) |
| 4855 | + |
| 4856 | + # TODO: add checks to ensure that the right serializer is used for vlen data types |
4834 | 4857 | return out_array_array, out_array_bytes, out_bytes_bytes |
4835 | 4858 |
|
4836 | 4859 |
|
|
0 commit comments