Skip to content

Commit 22002e4

Browse files
author
Leo Ji
committed
fix: deduplicate ZipStore central directory on close
ZIP files are append-only: writing the same key (e.g. zarr.json) a second time adds a new entry without removing the old one. When an array is resized, zarr.json is rewritten, which accumulates duplicates in the central directory and triggers UserWarning: Duplicate name. On close(), _dedup_central_directory() now scans filelist in reverse and keeps only the last (most recent) entry for each filename, so the on-disk central directory is clean. Closes #3580 Made-with: Cursor
1 parent c9b534a commit 22002e4

File tree

3 files changed

+48
-0
lines changed

3 files changed

+48
-0
lines changed

changes/3580.bugfix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix `ZipStore` leaving duplicate entries in the zip central directory when array metadata is written more than once (e.g. after `resize()`). The central directory is now deduplicated on `close()`, keeping only the most recent entry for each filename.

src/zarr/storage/_zip.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
def close(self) -> None:
    # docstring inherited
    #
    # Flush and close the underlying ZipFile. The lock serializes this
    # against concurrent writers so the central directory is rewritten
    # exactly once, after deduplication.
    super().close()
    with self._lock:
        # Drop stale duplicate entries (e.g. repeated zarr.json writes)
        # before ZipFile.close() serializes the central directory.
        self._dedup_central_directory()
        self._zf.close()
def _dedup_central_directory(self) -> None:
    """Remove duplicate entries from the zip central directory.

    ZIP archives are append-only: writing the same key a second time (e.g.
    when array metadata such as ``zarr.json`` is rewritten after a
    ``resize()``) appends a new entry instead of replacing the old one.
    Duplicate filenames in the central directory confuse many zip readers
    and waste space, so before closing we keep only the *last* (most
    recent) entry for every filename.

    Notes
    -----
    This mutates ``ZipFile.filelist`` and ``ZipFile.NameToInfo``, which
    are CPython implementation details rather than documented public API —
    revisit if a future Python release changes ``zipfile`` internals.
    """
    # Only archives opened for writing rewrite their central directory on
    # close; there is nothing to fix for a read-only handle.
    if self._zf.mode not in ("w", "a", "x"):
        return
    seen: set[str] = set()
    kept: list[zipfile.ZipInfo] = []
    # Walk newest-to-oldest so the first occurrence we encounter for each
    # filename is its most recent entry.
    for info in reversed(self._zf.filelist):
        if info.filename not in seen:
            seen.add(info.filename)
            kept.append(info)
    # Restore chronological order and rebuild the name -> info index so
    # ZipFile.close() serializes a clean, deduplicated central directory.
    self._zf.filelist = list(reversed(kept))
    self._zf.NameToInfo = {info.filename: info for info in self._zf.filelist}
async def clear(self) -> None:
128150
# docstring inherited
129151
with self._lock:

tests/test_store/test_zip.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,28 @@ async def test_move(self, tmp_path: Path) -> None:
152152
assert destination.exists()
153153
assert not origin.exists()
154154
assert np.array_equal(array[...], np.arange(10))
155+
156+
def test_no_duplicate_entries_after_resize(self, tmp_path: Path) -> None:
    """Regression test for https://github.com/zarr-developers/zarr-python/issues/3580.

    Resizing an array rewrites ``zarr.json``; this used to leave duplicate
    filenames in the zip central directory.
    """
    archive_path = tmp_path / "data.zip"
    write_store = ZipStore(archive_path, mode="w")
    arr = zarr.create_array(write_store, shape=(5,), chunks=(5,), dtype="i4")
    arr[:] = np.arange(5)

    # The resize rewrites the array metadata, appending a second
    # zarr.json entry to the archive.
    arr.resize((10,))
    arr[5:] = np.arange(5, 10)
    write_store.close()

    # The closed archive's central directory must not contain repeats.
    with zipfile.ZipFile(archive_path, "r") as zf:
        entry_names = zf.namelist()
    assert len(entry_names) == len(set(entry_names)), f"Duplicate entries found: {entry_names}"

    # The data written before and after the resize must both survive.
    read_store = ZipStore(archive_path, mode="r")
    reopened = zarr.open_array(read_store)
    assert reopened.shape == (10,)
    assert np.array_equal(reopened[:], np.arange(10))

0 commit comments

Comments
 (0)