Merge multipart behavior into md5file_nocache func

jjnesbitt · jjnesbitt · commit 6d996707ff02 · 2026-04-29T11:26:18.000-04:00
diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py
@@ -1037,15 +1037,9 @@ def register(self, e: LocalZarrEntry, digest: str | None = None) -> None:
     @staticmethod
     def _mkitem(e: LocalZarrEntry) -> UploadItem:
         # Avoid heavy import by importing within function:
-        from dandi.support.digests import md5file_nocache, multipart_md5file_nocache
-
-        file_size = e.filepath.stat().st_size
-        digest = (
-            md5file_nocache(e.filepath)
-            if file_size <= ZARR_LARGE_CHUNK_THRESHOLD
-            else multipart_md5file_nocache(e.filepath)
-        )
+        from dandi.support.digests import md5file_nocache
 
+        digest = md5file_nocache(e.filepath)
         return UploadItem.from_entry(e, digest)
 
     def get_items(self, jobs: int = 5) -> Generator[UploadItem, None, None]:
diff --git a/dandi/support/digests.py b/dandi/support/digests.py
@@ -26,6 +26,7 @@
 from fscacher import PersistentCache
 from zarr_checksum.checksum import ZarrChecksum, ZarrChecksumManifest
 from zarr_checksum.tree import ZarrChecksumTree
+from dandi.consts import ZARR_LARGE_CHUNK_THRESHOLD
 
 from .threaded_walk import threaded_walk
 from ..utils import Hasher, exclude_from_zarr
@@ -133,32 +134,17 @@ def md5file_nocache(filepath: str | Path) -> str:
     Compute the MD5 digest of a file without caching with fscacher, which has
     been shown to slow things down for the large numbers of files typically
     present in Zarrs
-    """
-    return Digester(["md5"])(filepath)["md5"]
 
-
-def multipart_md5file_nocache(filepath: str | Path) -> str:
-    """
-    Compute the S3 multipart ETag for a file.
-
-    Splits the file into parts of ``part_size`` bytes, hashes each part with
-    MD5, then returns ``MD5(concat(part_md5s))-{num_parts}``, matching what S3
-    stores as the ETag for a multipart upload.
+    If the file is larger than `ZARR_LARGE_CHUNK_THRESHOLD` bytes, the returned string is not
+    a plain MD5 hex digest; it is instead the file's S3 multipart ETag.
     """
     if isinstance(filepath, str):
         filepath = Path(filepath)
 
-    part_size = DandiETag(filepath.stat().st_size)._part_gen.initial_part_size
-    part_md5s = b""
-    num_parts = 0
-    with open(filepath, "rb") as f:
-        while True:
-            chunk = f.read(part_size)
-            if not chunk:
-                break
-            part_md5s += hashlib.md5(chunk).digest()
-            num_parts += 1
-    return f"{hashlib.md5(part_md5s).hexdigest()}-{num_parts}"
+    if filepath.stat().st_size > ZARR_LARGE_CHUNK_THRESHOLD:
+        return get_dandietag(filepath).as_str()
+
+    return Digester(["md5"])(filepath)["md5"]
 
 
 def checksum_zarr_dir(
diff --git a/dandi/tests/test_files.py b/dandi/tests/test_files.py
@@ -554,6 +554,7 @@ def spy_multipart_upload(**kwargs):
     # Set threshold to 0 so every chunk is treated as "large"
     with (
         patch("dandi.files.zarr.ZARR_LARGE_CHUNK_THRESHOLD", 0),
+        patch("dandi.support.digests.ZARR_LARGE_CHUNK_THRESHOLD", 0),
         patch(
             "dandi.files.zarr._multipart_upload",
             spy_multipart_upload,
@@ -590,6 +591,7 @@ def spy_multipart_upload(**kwargs):
     mixed_threshold = 200
     with (
         patch("dandi.files.zarr.ZARR_LARGE_CHUNK_THRESHOLD", mixed_threshold),
+        patch("dandi.support.digests.ZARR_LARGE_CHUNK_THRESHOLD", mixed_threshold),
         patch("dandi.files.zarr._multipart_upload", spy_multipart_upload),
     ):
         asset = zf.upload(new_dandiset.dandiset, {})