Skip to content

Commit 6d99670

Browse files
committed
Merge multipart behavior into md5file_nocache func
1 parent 9bc2976 commit 6d99670

3 files changed

Lines changed: 11 additions & 29 deletions

File tree

dandi/files/zarr.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,15 +1037,9 @@ def register(self, e: LocalZarrEntry, digest: str | None = None) -> None:
10371037
@staticmethod
10381038
def _mkitem(e: LocalZarrEntry) -> UploadItem:
10391039
# Avoid heavy import by importing within function:
1040-
from dandi.support.digests import md5file_nocache, multipart_md5file_nocache
1041-
1042-
file_size = e.filepath.stat().st_size
1043-
digest = (
1044-
md5file_nocache(e.filepath)
1045-
if file_size <= ZARR_LARGE_CHUNK_THRESHOLD
1046-
else multipart_md5file_nocache(e.filepath)
1047-
)
1040+
from dandi.support.digests import md5file_nocache
10481041

1042+
digest = md5file_nocache(e.filepath)
10491043
return UploadItem.from_entry(e, digest)
10501044

10511045
def get_items(self, jobs: int = 5) -> Generator[UploadItem, None, None]:

dandi/support/digests.py

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from fscacher import PersistentCache
2727
from zarr_checksum.checksum import ZarrChecksum, ZarrChecksumManifest
2828
from zarr_checksum.tree import ZarrChecksumTree
29+
from dandi.consts import ZARR_LARGE_CHUNK_THRESHOLD
2930

3031
from .threaded_walk import threaded_walk
3132
from ..utils import Hasher, exclude_from_zarr
@@ -133,32 +134,17 @@ def md5file_nocache(filepath: str | Path) -> str:
133134
Compute the MD5 digest of a file without caching with fscacher, which has
134135
been shown to slow things down for the large numbers of files typically
135136
present in Zarrs
136-
"""
137-
return Digester(["md5"])(filepath)["md5"]
138137
139-
140-
def multipart_md5file_nocache(filepath: str | Path) -> str:
141-
"""
142-
Compute the S3 multipart ETag for a file.
143-
144-
Splits the file into parts of ``part_size`` bytes, hashes each part with
145-
MD5, then returns ``MD5(concat(part_md5s))-{num_parts}``, matching what S3
146-
stores as the ETag for a multipart upload.
138+
If the file is larger than `ZARR_LARGE_CHUNK_THRESHOLD`, the computed checksum is not a
139+
traditional MD5 checksum, but is instead an S3 multipart ETag.
147140
"""
148141
if isinstance(filepath, str):
149142
filepath = Path(filepath)
150143

151-
part_size = DandiETag(filepath.stat().st_size)._part_gen.initial_part_size
152-
part_md5s = b""
153-
num_parts = 0
154-
with open(filepath, "rb") as f:
155-
while True:
156-
chunk = f.read(part_size)
157-
if not chunk:
158-
break
159-
part_md5s += hashlib.md5(chunk).digest()
160-
num_parts += 1
161-
return f"{hashlib.md5(part_md5s).hexdigest()}-{num_parts}"
144+
if filepath.stat().st_size > ZARR_LARGE_CHUNK_THRESHOLD:
145+
return get_dandietag(filepath).as_str()
146+
147+
return Digester(["md5"])(filepath)["md5"]
162148

163149

164150
def checksum_zarr_dir(

dandi/tests/test_files.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,7 @@ def spy_multipart_upload(**kwargs):
554554
# Set threshold to 0 so every chunk is treated as "large"
555555
with (
556556
patch("dandi.files.zarr.ZARR_LARGE_CHUNK_THRESHOLD", 0),
557+
patch("dandi.support.digests.ZARR_LARGE_CHUNK_THRESHOLD", 0),
557558
patch(
558559
"dandi.files.zarr._multipart_upload",
559560
spy_multipart_upload,
@@ -590,6 +591,7 @@ def spy_multipart_upload(**kwargs):
590591
mixed_threshold = 200
591592
with (
592593
patch("dandi.files.zarr.ZARR_LARGE_CHUNK_THRESHOLD", mixed_threshold),
594+
patch("dandi.support.digests.ZARR_LARGE_CHUNK_THRESHOLD", mixed_threshold),
593595
patch("dandi.files.zarr._multipart_upload", spy_multipart_upload),
594596
):
595597
asset = zf.upload(new_dandiset.dandiset, {})

0 commit comments

Comments
 (0)