Use multi-part checksums for multi-part zarr uploads

jjnesbitt · jjnesbitt · commit 57b353f8d14a · 2026-04-29T11:26:18.000-04:00
diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py
@@ -1037,9 +1037,15 @@ def register(self, e: LocalZarrEntry, digest: str | None = None) -> None:
     @staticmethod
     def _mkitem(e: LocalZarrEntry) -> UploadItem:
         # Avoid heavy import by importing within function:
-        from dandi.support.digests import md5file_nocache
+        from dandi.support.digests import md5file_nocache, multipart_md5file_nocache
+
+        file_size = e.filepath.stat().st_size  # decides which digest is used
+        digest = (
+            md5file_nocache(e.filepath)  # plain MD5 of the whole file
+            if file_size <= ZARR_LARGE_CHUNK_THRESHOLD
+            else multipart_md5file_nocache(e.filepath)  # "<md5>-<parts>" form
+        )
 
-        digest = md5file_nocache(e.filepath)
         return UploadItem.from_entry(e, digest)
 
     def get_items(self, jobs: int = 5) -> Generator[UploadItem, None, None]:
diff --git a/dandi/support/digests.py b/dandi/support/digests.py
@@ -137,6 +137,30 @@ def md5file_nocache(filepath: str | Path) -> str:
     return Digester(["md5"])(filepath)["md5"]
 
 
+def multipart_md5file_nocache(filepath: str | Path) -> str:
+    """
+    Compute the S3 multipart ETag for a file.
+
+    The file is read in consecutive parts whose size is the initial part
+    size ``DandiETag`` selects for the file's length.  Each part is hashed
+    with MD5 and the result is ``MD5(concat(part_md5s))-{num_parts}``, the
+    ETag format S3 stores for a multipart upload.
+    """
+    filepath = Path(filepath)
+    # NOTE(review): reaches into the private ``_part_gen`` of DandiETag; the
+    # part size must match the one used by the actual upload -- confirm.
+    part_size = DandiETag(filepath.stat().st_size)._part_gen.initial_part_size
+    # Feed each part digest into the outer hash as it is produced instead of
+    # growing a bytes object with ``+=`` on every iteration.
+    combined = hashlib.md5()
+    num_parts = 0
+    with open(filepath, "rb") as f:
+        for chunk in iter(lambda: f.read(part_size), b""):
+            combined.update(hashlib.md5(chunk).digest())
+            num_parts += 1
+    return f"{combined.hexdigest()}-{num_parts}"
+
+
 def checksum_zarr_dir(
     files: dict[str, tuple[str, int]], directories: dict[str, tuple[str, int]]
 ) -> str: