Skip to content

Commit c0e2afa

Browse files
d-v-bclaude
andauthored
fix: handle NaN fill_value in array metadata equality (#3999)
Frozen dataclass __eq__ compared fill_value directly, so two identical metadata objects with a NaN fill value compared unequal (NaN != NaN under IEEE 754). Compare the JSON-serialized form instead, which treats matching NaN and infinite fill values as equal. Fixes #2929 Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 1907ad6 commit c0e2afa

5 files changed

Lines changed: 154 additions & 2 deletions

File tree

changes/2929.bugfix.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix equality comparison of `ArrayV2Metadata` and `ArrayV3Metadata` objects with a
2+
`NaN` fill value. Such objects are now compared by their JSON-serialized form, so two
3+
otherwise-identical metadata objects with a `NaN` (or infinite) fill value compare equal.

src/zarr/core/metadata/v2.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import json
34
import warnings
45
from collections.abc import Iterable, Sequence
56
from functools import cached_property
@@ -25,7 +26,6 @@
2526
ZDType,
2627
)
2728

28-
import json
2929
from dataclasses import dataclass, field, fields, replace
3030

3131
import numpy as np
@@ -239,6 +239,19 @@ def to_dict(self) -> dict[str, JSON]:
239239

240240
return zarray_dict
241241

242+
def __eq__(self, other: object) -> bool:
243+
# The default dataclass __eq__ compares fields directly, which is wrong for a NaN
244+
# fill_value: NaN != NaN under IEEE 754. Comparing the JSON-serialized form instead
245+
# treats matching NaN (and inf) fill values as equal. See issue #2929.
246+
if not isinstance(other, ArrayV2Metadata):
247+
return NotImplemented
248+
return self.to_dict() == other.to_dict()
249+
250+
def __hash__(self) -> int:
251+
# Hash the JSON-serialized form to stay consistent with __eq__: equal metadata
252+
# must hash equally, which a field-based hash violates for a NaN fill_value.
253+
return hash(json.dumps(self.to_dict(), sort_keys=True))
254+
242255
def get_chunk_spec(
243256
self, _chunk_coords: tuple[int, ...], array_config: ArrayConfig, prototype: BufferPrototype
244257
) -> ArraySpec:

src/zarr/core/metadata/v3.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,19 @@ def to_dict(self) -> dict[str, JSON]:
697697
out_dict["data_type"] = dtype_meta.to_json(zarr_format=3) # type: ignore[unreachable]
698698
return out_dict
699699

700+
def __eq__(self, other: object) -> bool:
701+
# The default dataclass __eq__ compares fields directly, which is wrong for a NaN
702+
# fill_value: NaN != NaN under IEEE 754. Comparing the JSON-serialized form instead
703+
# treats matching NaN (and inf) fill values as equal. See issue #2929.
704+
if not isinstance(other, ArrayV3Metadata):
705+
return NotImplemented
706+
return self.to_dict() == other.to_dict()
707+
708+
def __hash__(self) -> int:
709+
# Hash the JSON-serialized form to stay consistent with __eq__: equal metadata
710+
# must hash equally, which a field-based hash violates for a NaN fill_value.
711+
return hash(json.dumps(self.to_dict(), sort_keys=True))
712+
700713
def update_shape(self, shape: tuple[int, ...]) -> Self:
701714
chunk_grid = self.chunk_grid
702715
if isinstance(chunk_grid, RectilinearChunkGridMetadata):

tests/test_metadata/test_v2.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,60 @@ def test_from_dict_extra_fields() -> None:
308308
assert result == expected
309309

310310

311+
def test_eq_nan_fill_value() -> None:
312+
"""Two metadata objects with an identical NaN fill_value compare equal.
313+
314+
NaN is not equal to itself under IEEE 754, so the default dataclass __eq__
315+
reports two otherwise-identical metadata objects as unequal. Metadata
316+
equality must treat matching NaN fill values as equal (see issue #2929).
317+
"""
318+
a = ArrayV2Metadata(
319+
shape=(8,), dtype=Float64(), chunks=(8,), fill_value=np.float64("nan"), order="C"
320+
)
321+
b = ArrayV2Metadata(
322+
shape=(8,), dtype=Float64(), chunks=(8,), fill_value=np.float64("nan"), order="C"
323+
)
324+
assert a == b
325+
326+
327+
def test_eq_distinct_fill_value() -> None:
328+
"""Metadata objects that differ only in fill_value do not compare equal."""
329+
a = ArrayV2Metadata(shape=(8,), dtype=Float64(), chunks=(8,), fill_value=0.0, order="C")
330+
b = ArrayV2Metadata(shape=(8,), dtype=Float64(), chunks=(8,), fill_value=1.0, order="C")
331+
assert a != b
332+
333+
334+
@pytest.mark.parametrize("fill_value", [np.float64("inf"), np.float64("-inf")])
335+
def test_eq_inf_fill_value(fill_value: np.float64) -> None:
336+
"""Two metadata objects with an identical infinite fill_value compare equal."""
337+
a = ArrayV2Metadata(shape=(8,), dtype=Float64(), chunks=(8,), fill_value=fill_value, order="C")
338+
b = ArrayV2Metadata(shape=(8,), dtype=Float64(), chunks=(8,), fill_value=fill_value, order="C")
339+
assert a == b
340+
341+
342+
def test_hash_consistent_with_eq_nan_fill_value() -> None:
343+
"""Equal metadata objects with a NaN fill_value hash equal.
344+
345+
NaN hashes by identity, so a field-based hash would break the
346+
``a == b implies hash(a) == hash(b)`` invariant for objects that compare
347+
equal under the to_dict-based __eq__.
348+
"""
349+
a = ArrayV2Metadata(
350+
shape=(8,), dtype=Float64(), chunks=(8,), fill_value=np.float64("nan"), order="C"
351+
)
352+
b = ArrayV2Metadata(
353+
shape=(8,), dtype=Float64(), chunks=(8,), fill_value=np.float64("nan"), order="C"
354+
)
355+
assert a == b
356+
assert hash(a) == hash(b)
357+
358+
359+
def test_eq_non_metadata() -> None:
360+
"""Comparison against a non-metadata object returns False rather than erroring."""
361+
a = ArrayV2Metadata(shape=(8,), dtype=Float64(), chunks=(8,), fill_value=0.0, order="C")
362+
assert a != object()
363+
364+
311365
def test_zstd_checksum() -> None:
312366
compressor_config: dict[str, JSON] = {"id": "zstd", "level": 5, "checksum": False}
313367
arr = zarr.create_array(

tests/test_metadata/test_v3.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
from zarr.core.buffer import default_buffer_prototype
1414
from zarr.core.chunk_grids import is_regular_1d, is_regular_nd
1515
from zarr.core.config import config
16-
from zarr.core.dtype import UInt8
16+
from zarr.core.dtype import Float64, UInt8
1717
from zarr.core.group import GroupMetadata, parse_node_type
18+
from zarr.core.metadata.v2 import ArrayV2Metadata
1819
from zarr.core.metadata.v3 import (
1920
ARRAY_METADATA_KEYS,
2021
ArrayMetadataJSON_V3,
@@ -335,6 +336,74 @@ def test_init_extra_fields_collision() -> None:
335336
)
336337

337338

339+
# ---------------------------------------------------------------------------
340+
# Equality
341+
# ---------------------------------------------------------------------------
342+
343+
344+
def test_eq_nan_fill_value() -> None:
345+
"""Two metadata objects with an identical NaN fill_value compare equal.
346+
347+
NaN is not equal to itself under IEEE 754, so the default dataclass __eq__
348+
reports two otherwise-identical metadata objects as unequal. Metadata
349+
equality must treat matching NaN fill values as equal (see issue #2929).
350+
"""
351+
a = ArrayV3Metadata.from_dict(minimal_metadata_dict_v3(data_type="float64", fill_value="NaN")) # type: ignore[arg-type]
352+
b = ArrayV3Metadata.from_dict(minimal_metadata_dict_v3(data_type="float64", fill_value="NaN")) # type: ignore[arg-type]
353+
assert a == b
354+
355+
356+
def test_eq_distinct_fill_value() -> None:
357+
"""Metadata objects that differ only in fill_value do not compare equal."""
358+
a = ArrayV3Metadata.from_dict(minimal_metadata_dict_v3(data_type="float64", fill_value=0.0)) # type: ignore[arg-type]
359+
b = ArrayV3Metadata.from_dict(minimal_metadata_dict_v3(data_type="float64", fill_value=1.0)) # type: ignore[arg-type]
360+
assert a != b
361+
362+
363+
@pytest.mark.parametrize("fill_value", ["Infinity", "-Infinity"])
364+
def test_eq_inf_fill_value(fill_value: str) -> None:
365+
"""Two metadata objects with an identical infinite fill_value compare equal."""
366+
a = ArrayV3Metadata.from_dict(
367+
minimal_metadata_dict_v3(data_type="float64", fill_value=fill_value) # type: ignore[arg-type]
368+
)
369+
b = ArrayV3Metadata.from_dict(
370+
minimal_metadata_dict_v3(data_type="float64", fill_value=fill_value) # type: ignore[arg-type]
371+
)
372+
assert a == b
373+
374+
375+
def test_hash_consistent_with_eq_nan_fill_value() -> None:
376+
"""Equal metadata objects with a NaN fill_value hash equal.
377+
378+
NaN hashes by identity, so a field-based hash would break the
379+
``a == b implies hash(a) == hash(b)`` invariant for objects that compare
380+
equal under the to_dict-based __eq__.
381+
"""
382+
a = ArrayV3Metadata.from_dict(minimal_metadata_dict_v3(data_type="float64", fill_value="NaN")) # type: ignore[arg-type]
383+
b = ArrayV3Metadata.from_dict(minimal_metadata_dict_v3(data_type="float64", fill_value="NaN")) # type: ignore[arg-type]
384+
assert a == b
385+
assert hash(a) == hash(b)
386+
387+
388+
def test_eq_non_metadata() -> None:
389+
"""Comparison against a non-metadata object returns False rather than erroring."""
390+
a = ArrayV3Metadata.from_dict(minimal_metadata_dict_v3(data_type="float64", fill_value=0.0)) # type: ignore[arg-type]
391+
assert a != object()
392+
393+
394+
def test_eq_across_zarr_formats() -> None:
395+
"""A v2 and v3 metadata describing the same array do not compare equal.
396+
397+
Each __eq__ guards on its own concrete type and returns NotImplemented
398+
otherwise, so the two versions are never equal even when they describe the
399+
same array.
400+
"""
401+
v3 = ArrayV3Metadata.from_dict(minimal_metadata_dict_v3(data_type="float64", fill_value=0.0)) # type: ignore[arg-type]
402+
v2 = ArrayV2Metadata(shape=(4, 4), dtype=Float64(), chunks=(4, 4), fill_value=0.0, order="C")
403+
assert v2 != v3
404+
assert v3 != v2
405+
406+
338407
# ---------------------------------------------------------------------------
339408
# JSON indent
340409
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)