Skip to content

Commit a0ffbb3

Browse files
committed
Implement official support for structured and struct dtypes according to new extension.
1 parent 879e1ce commit a0ffbb3

File tree

4 files changed

+239
-41
lines changed

4 files changed

+239
-41
lines changed

src/zarr/codecs/bytes.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import sys
4+
import warnings
45
from dataclasses import dataclass, replace
56
from enum import Enum
67
from typing import TYPE_CHECKING
@@ -9,6 +10,7 @@
910
from zarr.core.buffer import Buffer, NDBuffer
1011
from zarr.core.common import JSON, parse_enum, parse_named_configuration
1112
from zarr.core.dtype.common import HasEndianness
13+
from zarr.core.dtype.npy.structured import Structured
1214

1315
if TYPE_CHECKING:
1416
from typing import Self
@@ -56,7 +58,20 @@ def to_dict(self) -> dict[str, JSON]:
5658
return {"name": "bytes", "configuration": {"endian": self.endian.value}}
5759

5860
def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
59-
if not isinstance(array_spec.dtype, HasEndianness):
61+
if isinstance(array_spec.dtype, Structured):
62+
if array_spec.dtype.has_multi_byte_fields():
63+
if self.endian is None:
64+
warnings.warn(
65+
"Missing 'endian' for structured dtype with multi-byte fields. "
66+
"Assuming little-endian for legacy compatibility.",
67+
UserWarning,
68+
stacklevel=2,
69+
)
70+
return replace(self, endian=Endian.little)
71+
else:
72+
if self.endian is not None:
73+
return replace(self, endian=None)
74+
elif not isinstance(array_spec.dtype, HasEndianness):
6075
if self.endian is not None:
6176
return replace(self, endian=None)
6277
elif self.endian is None:

src/zarr/core/array.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
)
6767
from zarr.core.config import config as zarr_config
6868
from zarr.core.dtype import (
69+
Structured,
6970
VariableLengthBytes,
7071
VariableLengthUTF8,
7172
ZDType,
@@ -5054,10 +5055,13 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
50545055
length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and
50555056
``VLenBytesCodec``, respectively.
50565057
5058+
Structured data types with multi-byte fields use ``BytesCodec`` with little-endian encoding.
50575059
"""
50585060
serializer: ArrayBytesCodec = BytesCodec(endian=None)
50595061

5060-
if isinstance(dtype, HasEndianness):
5062+
if isinstance(dtype, HasEndianness) or (
5063+
isinstance(dtype, Structured) and dtype.has_multi_byte_fields()
5064+
):
50615065
serializer = BytesCodec(endian="little")
50625066
elif isinstance(dtype, HasObjectCodec):
50635067
if dtype.object_codec_id == "vlen-bytes":

src/zarr/core/dtype/npy/structured.py

Lines changed: 75 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -58,28 +58,32 @@ class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]):
5858

5959

6060
class StructuredJSON_V3(
61-
NamedConfig[Literal["structured"], dict[str, Sequence[Sequence[str | DTypeJSON]]]]
61+
NamedConfig[Literal["struct", "structured"], dict[str, Sequence[dict[str, str | DTypeJSON]]]]
6262
):
6363
"""
6464
A JSON representation of a structured data type in Zarr V3.
6565
6666
References
6767
----------
68-
This representation is not currently defined in an external specification.
68+
The Zarr V3 specification for this data type is defined in the zarr-extensions repository:
69+
https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/struct
6970
7071
Examples
7172
--------
7273
```python
7374
{
74-
"name": "structured",
75+
"name": "struct",
7576
"configuration": {
7677
"fields": [
77-
["f0", "int32"],
78-
["f1", "float64"],
78+
{"name": "f0", "data_type": "int32"},
79+
{"name": "f1", "data_type": "float64"},
7980
]
8081
}
8182
}
8283
```
84+
85+
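The legacy representation, which uses the name ``"structured"`` and gives each field as a ``[name, data_type]`` pair
86+
(e.g. ``[["f0", "int32"], ["f1", "float64"]]``), is also accepted when reading for backward compatibility.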
8387
"""
8488

8589

@@ -98,12 +102,14 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize):
98102
99103
References
100104
----------
101-
This data type does not have a Zarr V3 specification.
105+
The Zarr V3 specification for this data type is defined in the zarr-extensions repository:
106+
https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/struct
102107
103108
The Zarr V2 data type specification can be found [here](https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding).
104109
"""
105110

106-
_zarr_v3_name: ClassVar[Literal["structured"]] = "structured"
111+
_zarr_v3_name: ClassVar[Literal["struct"]] = "struct"
112+
_zarr_v3_names: ClassVar[tuple[str, ...]] = ("struct", "structured")
107113
dtype_cls = np.dtypes.VoidDType # type: ignore[assignment]
108114
fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...]
109115

@@ -234,11 +240,10 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructuredJSON_V3]:
234240
True if the input is a valid JSON representation of a structured data type for Zarr V3,
235241
False otherwise.
236242
"""
237-
238243
return (
239244
isinstance(data, dict)
240245
and set(data.keys()) == {"name", "configuration"}
241-
and data["name"] == cls._zarr_v3_name
246+
and data["name"] in cls._zarr_v3_names
242247
and isinstance(data["configuration"], dict)
243248
and set(data["configuration"].keys()) == {"fields"}
244249
)
@@ -274,12 +279,24 @@ def _from_json_v3(cls, data: DTypeJSON) -> Self:
274279
if cls._check_json_v3(data):
275280
config = data["configuration"]
276281
meta_fields = config["fields"]
277-
return cls(
278-
fields=tuple(
282+
dtype_name = data["name"]
283+
parsed_fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = []
284+
for field in meta_fields:
285+
if dtype_name == "struct":
286+
if not isinstance(field, dict):
287+
msg = f"Invalid field format for 'struct' dtype. Expected object with 'name' and 'data_type' keys, got {field!r}"
288+
raise DataTypeValidationError(msg)
289+
f_name = field["name"]
290+
f_dtype = field["data_type"]
291+
else:
292+
if isinstance(field, dict):
293+
msg = f"Invalid field format for 'structured' dtype. Expected [name, dtype] tuple, got {field!r}"
294+
raise DataTypeValidationError(msg)
295+
f_name, f_dtype = field
296+
parsed_fields.append(
279297
(f_name, get_data_type_from_json(f_dtype, zarr_format=3)) # type: ignore[misc]
280-
for f_name, f_dtype in meta_fields
281298
)
282-
)
299+
return cls(fields=tuple(parsed_fields))
283300
msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}"
284301
raise DataTypeValidationError(msg)
285302

@@ -317,7 +334,7 @@ def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON
317334
elif zarr_format == 3:
318335
v3_unstable_dtype_warning(self)
319336
fields = [
320-
[f_name, f_dtype.to_json(zarr_format=zarr_format)] # type: ignore[list-item]
337+
{"name": f_name, "data_type": f_dtype.to_json(zarr_format=zarr_format)}
321338
for f_name, f_dtype in self.fields
322339
]
323340
base_dict = {
@@ -425,7 +442,9 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
425442
Parameters
426443
----------
427444
data : JSON
428-
The JSON-serializable value.
445+
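The JSON-serializable value. Can be either:
446+
- A dict mapping field names to values (primary format for V3). Fields missing from the dict are filled with that field's default scalar; keys that match no field are ignored.
447+
- A base64-encoded string (legacy format, for backward compatibility)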
429448
zarr_format : ZarrFormat
430449
The zarr format version.
431450
@@ -437,17 +456,27 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
437456
Raises
438457
------
439458
TypeError
440-
If the input is not a base64-encoded string.
459+
If the input is not a dict or base64-encoded string.
441460
"""
442-
if check_json_str(data):
461+
if isinstance(data, dict):
462+
field_values = []
463+
for field_name, field_dtype in self.fields:
464+
if field_name in data:
465+
field_values.append(
466+
field_dtype.from_json_scalar(data[field_name], zarr_format=zarr_format)
467+
)
468+
else:
469+
field_values.append(field_dtype.default_scalar())
470+
return self._cast_scalar_unchecked(tuple(field_values))
471+
elif check_json_str(data):
443472
as_bytes = bytes_from_json(data, zarr_format=zarr_format)
444473
dtype = self.to_native_dtype()
445474
return cast("np.void", np.array([as_bytes]).view(dtype)[0])
446-
raise TypeError(f"Invalid type: {data}. Expected a string.")
475+
raise TypeError(f"Invalid type: {data}. Expected a dict or base64-encoded string.")
447476

448-
def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str:
477+
def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str | dict[str, JSON]:
449478
"""
450-
Convert a scalar to a JSON-serializable string representation.
479+
Convert a scalar to a JSON-serializable representation.
451480
452481
Parameters
453482
----------
@@ -458,11 +487,19 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str:
458487
459488
Returns
460489
-------
461-
str
462-
A string representation of the scalar, which is a base64-encoded
463-
string of the bytes that make up the scalar.
490+
str | dict[str, JSON]
491+
For V2: A base64-encoded string of the bytes that make up the scalar.
492+
For V3: A dict mapping field names to their JSON-serialized values.
464493
"""
465-
return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format)
494+
scalar = self.cast_scalar(data)
495+
if zarr_format == 2:
496+
return bytes_to_json(scalar.tobytes(), zarr_format)
497+
result: dict[str, JSON] = {}
498+
for field_name, field_dtype in self.fields:
499+
result[field_name] = field_dtype.to_json_scalar(
500+
scalar[field_name], zarr_format=zarr_format
501+
)
502+
return result
466503

467504
@property
468505
def item_size(self) -> int:
@@ -475,3 +512,17 @@ def item_size(self) -> int:
475512
The size of a single scalar in bytes.
476513
"""
477514
return self.to_native_dtype().itemsize
515+
516+
def has_multi_byte_fields(self) -> bool:
    """
    Report whether any field of this structured dtype is wider than one byte.

    Only the direct entries of ``self.fields`` are examined, and each one is
    judged by its own ``item_size``. A field whose data type is not a
    ``HasItemSize`` instance is never counted.

    Returns
    -------
    bool
        True if at least one field has ``item_size > 1``, False otherwise
        (including when there are no fields at all).
    """
    for _field_name, member_dtype in self.fields:
        # Data types without a fixed item size cannot qualify as multi-byte here.
        if isinstance(member_dtype, HasItemSize) and member_dtype.item_size > 1:
            return True
    return False

0 commit comments

Comments
 (0)