From 22002e45d28c0f93e09dfac33e5cfa58e106c305 Mon Sep 17 00:00:00 2001 From: Leo Ji Date: Tue, 31 Mar 2026 19:55:56 +0000 Subject: [PATCH 1/5] fix: deduplicate ZipStore central directory on close ZIP files are append-only: writing the same key (e.g. zarr.json) a second time adds a new entry without removing the old one. When an array is resized, zarr.json is rewritten, which accumulates duplicates in the central directory and triggers UserWarning: Duplicate name. On close(), _dedup_central_directory() now scans filelist in reverse and keeps only the last (most recent) entry for each filename, so the on-disk central directory is clean. Closes #3580 Made-with: Cursor --- changes/3580.bugfix.md | 1 + src/zarr/storage/_zip.py | 22 ++++++++++++++++++++++ tests/test_store/test_zip.py | 25 +++++++++++++++++++++++++ 3 files changed, 48 insertions(+) create mode 100644 changes/3580.bugfix.md diff --git a/changes/3580.bugfix.md b/changes/3580.bugfix.md new file mode 100644 index 0000000000..48a492bd16 --- /dev/null +++ b/changes/3580.bugfix.md @@ -0,0 +1 @@ +Fix `ZipStore` leaving duplicate entries in the zip central directory when array metadata is written more than once (e.g. after `resize()`). The central directory is now deduplicated on `close()`, keeping only the most recent entry for each filename. diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index 72bf9e335a..b538a54979 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -122,8 +122,30 @@ def close(self) -> None: # docstring inherited super().close() with self._lock: + self._dedup_central_directory() self._zf.close() + def _dedup_central_directory(self) -> None: + """Remove duplicate entries from the zip central directory. + + When an array is resized or metadata is updated multiple times, ZipStore + writes a new entry for the same key each time (ZIP files are append-only). + This leaves duplicate filenames in the central directory, which confuses + many zip readers and wastes space. Before closing, we keep only the + *last* (most recent) entry for every filename so that the on-disk central + directory is clean. + """ + if not self._zf.mode in ("w", "a", "x"): + return + seen: set[str] = set() + deduped: list[zipfile.ZipInfo] = [] + for info in reversed(self._zf.filelist): + if info.filename not in seen: + seen.add(info.filename) + deduped.append(info) + self._zf.filelist = list(reversed(deduped)) + self._zf.NameToInfo = {info.filename: info for info in self._zf.filelist} + async def clear(self) -> None: # docstring inherited with self._lock: diff --git a/tests/test_store/test_zip.py b/tests/test_store/test_zip.py index 744ee82945..08560ed685 100644 --- a/tests/test_store/test_zip.py +++ b/tests/test_store/test_zip.py @@ -152,3 +152,28 @@ async def test_move(self, tmp_path: Path) -> None: assert destination.exists() assert not origin.exists() assert np.array_equal(array[...], np.arange(10)) + + def test_no_duplicate_entries_after_resize(self, tmp_path: Path) -> None: + # Regression test for https://github.com/zarr-developers/zarr-python/issues/3580 + # Resizing an array (which rewrites zarr.json) used to leave duplicate + # filenames in the zip central directory. + zip_path = tmp_path / "data.zip" + store = ZipStore(zip_path, mode="w") + arr = zarr.create_array(store, shape=(5,), chunks=(5,), dtype="i4") + arr[:] = np.arange(5) + + # Resize triggers metadata rewrite, producing a second zarr.json entry + arr.resize((10,)) + arr[5:] = np.arange(5, 10) + store.close() + + # Verify no duplicate filenames in the central directory + with zipfile.ZipFile(zip_path, "r") as zf: + names = [info.filename for info in zf.infolist()] + assert len(names) == len(set(names)), f"Duplicate entries found: {names}" + + # Verify data integrity + store2 = ZipStore(zip_path, mode="r") + arr2 = zarr.open_array(store2) + assert arr2.shape == (10,) + assert np.array_equal(arr2[:], np.arange(10)) From b155426fa9210faf19b280d6421fa4d3c1992083 Mon Sep 17 00:00:00 2001 From: Leo Ji Date: Tue, 31 Mar 2026 20:01:13 +0000 Subject: [PATCH 2/5] fix: use 'not in' for ruff E713 Made-with: Cursor --- src/zarr/storage/_zip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index b538a54979..203e18bab3 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -135,7 +135,7 @@ def _dedup_central_directory(self) -> None: *last* (most recent) entry for every filename so that the on-disk central directory is clean. """ - if not self._zf.mode in ("w", "a", "x"): + if self._zf.mode not in ("w", "a", "x"): return seen: set[str] = set() deduped: list[zipfile.ZipInfo] = [] From 51d34f047338716a72417990d0bfc1f64c7e4db2 Mon Sep 17 00:00:00 2001 From: Leo Ji Date: Tue, 31 Mar 2026 20:17:03 +0000 Subject: [PATCH 3/5] dedup: compute new filelist atomically before swapping in Made-with: Cursor --- src/zarr/storage/_zip.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index 203e18bab3..636867d909 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -134,6 +134,10 @@ def _dedup_central_directory(self) -> None: many zip readers and wastes space. Before closing, we keep only the *last* (most recent) entry for every filename so that the on-disk central directory is clean. + + Both ``filelist`` and ``NameToInfo`` are computed upfront and then + swapped in together so that the ZipFile object is never left in an + inconsistent state if an exception occurs mid-update. """ if self._zf.mode not in ("w", "a", "x"): return @@ -143,8 +147,12 @@ def _dedup_central_directory(self) -> None: if info.filename not in seen: seen.add(info.filename) deduped.append(info) - self._zf.filelist = list(reversed(deduped)) - self._zf.NameToInfo = {info.filename: info for info in self._zf.filelist} + new_filelist = list(reversed(deduped)) + new_name_to_info = {info.filename: info for info in new_filelist} + # Swap both attributes together; if anything above raised, the + # original filelist/NameToInfo are still intact. + self._zf.filelist = new_filelist + self._zf.NameToInfo = new_name_to_info async def clear(self) -> None: # docstring inherited From 9add837cb9ddd1a5a2604f89e2dd792e9aec4c4c Mon Sep 17 00:00:00 2001 From: Leo Ji Date: Tue, 31 Mar 2026 22:31:57 +0000 Subject: [PATCH 4/5] fix: suppress Duplicate name UserWarning in _set() Python's zipfile warns when the same filename is written twice. ZipStore is append-only by design; duplicates are cleaned up by _dedup_central_directory() on close. Suppress the warning at the source so test suites with filterwarnings=error don't treat it as a failure. Made-with: Cursor --- src/zarr/storage/_zip.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index 636867d909..aba0f95766 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -4,6 +4,7 @@ import shutil import threading import time +import warnings import zipfile from pathlib import Path from typing import TYPE_CHECKING, Any, Literal @@ -235,7 +236,12 @@ def _set(self, key: str, value: Buffer) -> None: keyinfo.external_attr |= 0x10 # MS-DOS directory flag else: keyinfo.external_attr = 0o644 << 16 # ?rw-r--r-- - self._zf.writestr(keyinfo, value.to_bytes()) + # ZIP files are append-only; writing an existing key creates a second entry. + # We intentionally allow this and remove duplicates in _dedup_central_directory() + # when the store is closed. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + self._zf.writestr(keyinfo, value.to_bytes()) async def set(self, key: str, value: Buffer) -> None: # docstring inherited From eebe8678f18751ff5b2c1d97dc037b8b30c44624 Mon Sep 17 00:00:00 2001 From: Leo Ji Date: Tue, 7 Apr 2026 07:15:21 +0000 Subject: [PATCH 5/5] refactor: simplify _dedup_central_directory using NameToInfo NameToInfo is already a {filename -> latest ZipInfo} mapping because each writestr/write call does NameToInfo[name] = zinfo, so later writes overwrite earlier ones (confirmed via CPython zipfile source). Replace the manual seen/deduped loop with a two-liner that uses public API: - namelist() to get all filenames (preserving insertion order) - dict.fromkeys() to deduplicate while keeping first-seen order - getinfo() to retrieve the latest ZipInfo for each unique name Only filelist needs to be updated; NameToInfo is already correct and does not need to be rewritten. The new list is computed before assignment so the ZipFile is never left in an inconsistent state on exception. Made-with: Cursor --- src/zarr/storage/_zip.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/src/zarr/storage/_zip.py b/src/zarr/storage/_zip.py index aba0f95766..cc6754a790 100644 --- a/src/zarr/storage/_zip.py +++ b/src/zarr/storage/_zip.py @@ -136,24 +136,22 @@ def _dedup_central_directory(self) -> None: *last* (most recent) entry for every filename so that the on-disk central directory is clean. - Both ``filelist`` and ``NameToInfo`` are computed upfront and then - swapped in together so that the ZipFile object is never left in an - inconsistent state if an exception occurs mid-update. + ``NameToInfo`` is already a ``{filename: latest_ZipInfo}`` mapping because + each ``writestr``/``write`` call does ``NameToInfo[name] = zinfo``, so + later writes overwrite earlier ones. We therefore only need to rebuild + ``filelist`` from the values of ``NameToInfo``; ``NameToInfo`` itself + requires no update. + + The new list is computed before being assigned so that the ZipFile object + is never left in an inconsistent state if an exception is raised. """ if self._zf.mode not in ("w", "a", "x"): return - seen: set[str] = set() - deduped: list[zipfile.ZipInfo] = [] - for info in reversed(self._zf.filelist): - if info.filename not in seen: - seen.add(info.filename) - deduped.append(info) - new_filelist = list(reversed(deduped)) - new_name_to_info = {info.filename: info for info in new_filelist} - # Swap both attributes together; if anything above raised, the - # original filelist/NameToInfo are still intact. - self._zf.filelist = new_filelist - self._zf.NameToInfo = new_name_to_info + # namelist() and getinfo() are public API. + # dict.fromkeys preserves first-seen order while deduplicating names; + # getinfo() returns the latest ZipInfo for each name (NameToInfo[name]). + unique_names = dict.fromkeys(self._zf.namelist()) + self._zf.filelist = [self._zf.getinfo(name) for name in unique_names] async def clear(self) -> None: # docstring inherited