Skip to content

Commit 1020ca5

Browse files
BUG: Fix empty Group nodes lacking a "consolidated_metadata" field (zarr-developers#3967)
* BUG: Fix empty Group nodes lacking a "consolidated_metadata" field All nodes opened via `open_consolidated` are expected to have a non-null `.metadata.consolidated_metadata`. zarr-developers#3954 showed an example, with a data file written by zarr-python 2.x, where that condition failed to hold. This happened because of a slight difference between how zarr-python 3.x writes consolidated metadata and how zarr-python 2.x wrote it. For leaf `Group` nodes (a `Group` with no `Group` or `Array` children), zarr-python 3.x includes an "empty" `consolidated_metadata` field. In zarr-python 3.x, we include a `consolidated_metadata: metadata: {}` field to indicate that it's empty. zarr-python 2.x *didn't* include that. The fix is to update how we load this type of data. We effectively normalize a node missing consolidated metadata to one that is known not to have consolidated metadata (which is consistent with the idea behind consolidated metadata: just do one metadata read up front).
1 parent c79425e commit 1020ca5

3 files changed

Lines changed: 80 additions & 9 deletions

File tree

changes/3954.bugfix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Handle missing consolidated metadata in leaf Group nodes.

src/zarr/core/group.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,6 @@ def _flat_to_nested(
235235
# array metadata of its immediate children.
236236
# In the example, the group at `/a/b` will have consolidated metadata
237237
# for its children `array-0` and `array-1`.
238-
#
239-
# metadata = dict(metadata)
240238

241239
keys = sorted(metadata, key=lambda k: k.count("/"))
242240
grouped = {
@@ -269,13 +267,17 @@ def _flat_to_nested(
269267
# These are already present, either thanks to being an array in the
270268
# root, or by being collected as a child in the else clause
271269
continue
272-
children_keys = list(children_keys)
273-
# We pop from metadata, since we're *moving* this under group
274-
children = {
275-
child_key.split("/")[-1]: metadata.pop(child_key)
276-
for child_key in children_keys
277-
if child_key != key
278-
}
270+
children: dict[str, ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] = {}
271+
# We pop from metadata, since we're *moving* this under group.
272+
# While doing this, normalize leaf groups to carry empty consolidated metadata.
273+
for child_key in children_keys:
274+
if child_key == key:
275+
continue
276+
child = metadata.pop(child_key)
277+
if isinstance(child, GroupMetadata) and child.consolidated_metadata is None:
278+
child = replace(child, consolidated_metadata=ConsolidatedMetadata(metadata={}))
279+
children[child_key.split("/")[-1]] = child
280+
279281
parent[name] = replace(
280282
node, consolidated_metadata=ConsolidatedMetadata(metadata=children)
281283
)

tests/test_metadata/test_consolidated.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,74 @@ async def memory_store_with_hierarchy(memory_store: Store) -> Store:
5151

5252

5353
class TestConsolidated:
54+
@pytest.mark.filterwarnings("ignore:Consolidated metadata")
55+
async def test_getitem_consolidated_empty_leaf_group(
56+
self, memory_store: zarr.storage.MemoryStore, zarr_format: ZarrFormat
57+
) -> None:
58+
# This test writes the bytes directly, rather than using the zarr API, to mimic
59+
# how older versions of zarr-python wrote the consolidated metadata.
60+
# Notably, zarr-python 2.x does not include a
61+
#
62+
# "consolidated_metadata": {"metadata": {}}
63+
#
64+
# field on the leaf group nodes.
65+
if zarr_format == 2:
66+
zmetadata: dict[str, JSON] = {
67+
"metadata": {
68+
".zattrs": {},
69+
".zgroup": {"zarr_format": 2},
70+
"raw/.zattrs": {},
71+
"raw/.zgroup": {"zarr_format": 2},
72+
"raw/varm/.zattrs": {},
73+
"raw/varm/.zgroup": {"zarr_format": 2},
74+
},
75+
"zarr_consolidated_format": 1,
76+
}
77+
await memory_store.set(
78+
".zgroup", cpu.Buffer.from_bytes(json.dumps({"zarr_format": 2}).encode())
79+
)
80+
await memory_store.set(".zattrs", cpu.Buffer.from_bytes(json.dumps({}).encode()))
81+
await memory_store.set(
82+
".zmetadata", cpu.Buffer.from_bytes(json.dumps(zmetadata).encode())
83+
)
84+
85+
else:
86+
zmetadata = {
87+
"attributes": {},
88+
"zarr_format": 3,
89+
"consolidated_metadata": {
90+
"kind": "inline",
91+
"must_understand": False,
92+
"metadata": {
93+
"raw": {
94+
"attributes": {},
95+
"zarr_format": 3,
96+
"node_type": "group",
97+
},
98+
"raw/varm": {
99+
"attributes": {},
100+
"zarr_format": 3,
101+
"node_type": "group",
102+
},
103+
},
104+
},
105+
"node_type": "group",
106+
}
107+
await memory_store.set(
108+
"zarr.json", cpu.Buffer.from_bytes(json.dumps(zmetadata).encode())
109+
)
110+
111+
group = await zarr.api.asynchronous.open_consolidated(
112+
store=memory_store, zarr_format=zarr_format
113+
)
114+
raw = await group.getitem("raw")
115+
assert isinstance(raw, zarr.AsyncGroup)
116+
assert raw.metadata.consolidated_metadata is not None
117+
118+
varm = await raw.getitem("varm")
119+
assert isinstance(varm, zarr.AsyncGroup)
120+
assert varm.metadata.consolidated_metadata == ConsolidatedMetadata(metadata={})
121+
54122
async def test_open_consolidated_false_raises(self) -> None:
55123
store = zarr.storage.MemoryStore()
56124
with pytest.raises(TypeError, match="use_consolidated"):

0 commit comments

Comments
 (0)