Skip to content

Commit 173766d

Browse files
committed
add v2-style error when creating a vlen dtype without the right codec
1 parent 7447805 commit 173766d

1 file changed

Lines changed: 33 additions & 10 deletions

File tree

src/zarr/core/array.py

Lines changed: 33 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,8 @@
7070
from zarr.core.config import categorize_data_type
7171
from zarr.core.config import config as zarr_config
7272
from zarr.core.dtype import (
73+
VariableLengthBytes,
74+
VariableLengthUTF8,
7375
ZDType,
7476
ZDTypeLike,
7577
parse_data_type,
@@ -111,6 +113,7 @@
111113
)
112114
from zarr.core.metadata.v2 import (
113115
CompressorLikev2,
116+
get_object_codec_id,
114117
parse_compressor,
115118
parse_filters,
116119
)
@@ -4686,7 +4689,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
46864689
``VLenBytesCodec``, respectively.
46874690
46884691
"""
4689-
serializer: ArrayBytesCodec = BytesCodec()
4692+
serializer: ArrayBytesCodec = BytesCodec(endian=None)
46904693

46914694
if isinstance(dtype, HasEndianness):
46924695
serializer = BytesCodec(endian="little")
@@ -4772,7 +4775,33 @@ def _parse_chunk_encoding_v2(
47724775
)
47734776
raise TypeError(msg)
47744777
_filters = parse_filters(filters)
4775-
4778+
if isinstance(dtype, HasObjectCodec):
4779+
# check the filters and the compressor for the object codec required for this data type
4780+
if _filters is None:
4781+
if _compressor is None:
4782+
object_codec_id = None
4783+
else:
4784+
object_codec_id = get_object_codec_id((_compressor.get_config(),))
4785+
else:
4786+
object_codec_id = get_object_codec_id(
4787+
(
4788+
*[f.get_config() for f in _filters],
4789+
_compressor.get_config() if _compressor is not None else None,
4790+
)
4791+
)
4792+
if object_codec_id is None:
4793+
if isinstance(dtype, VariableLengthUTF8):
4794+
codec_name = "the numcodecs.VLenUTF8 codec"
4795+
elif isinstance(dtype, VariableLengthBytes):
4796+
codec_name = "the numcodecs.VLenBytes codec"
4797+
else:
4798+
codec_name = "an unknown object codec"
4799+
msg = (
4800+
f"Data type {dtype} requires {codec_name}, "
4801+
"but no such codec was specified in the filters or compressor parameters for "
4802+
"this array. "
4803+
)
4804+
raise ValueError(msg)
47764805
return _filters, _compressor
47774806

47784807

@@ -4820,17 +4849,11 @@ def _parse_chunk_encoding_v3(
48204849

48214850
out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes)
48224851

4823-
# specialize codecs as needed given the dtype
4824-
4825-
# TODO: refactor so that the config only contains the name of the codec, and we use the dtype
4826-
# to create the codec instance, instead of storing a dict representation of a full codec.
4827-
48284852
# TODO: ensure that the serializer is compatible with the ndarray produced by the
48294853
# array-array codecs. For example, if a sequence of array-array codecs produces an
48304854
# array with a single-byte data type, then the serializer should not specify endianness.
4831-
if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness):
4832-
# The default endianness in the bytescodec might not be None, so we need to replace it
4833-
out_array_bytes = replace(out_array_bytes, endian=None)
4855+
4856+
# TODO: add checks to ensure that the right serializer is used for vlen data types
48344857
return out_array_array, out_array_bytes, out_bytes_bytes
48354858

48364859

0 commit comments

Comments
 (0)