Skip to content

Commit 57b353f

Browse files
committed
Use multi-part checksums for multi-part zarr uploads
1 parent 6e9064b commit 57b353f

2 files changed

Lines changed: 32 additions & 2 deletions

File tree

dandi/files/zarr.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,9 +1037,15 @@ def register(self, e: LocalZarrEntry, digest: str | None = None) -> None:
10371037
@staticmethod
def _mkitem(e: LocalZarrEntry) -> UploadItem:
    """
    Build the `UploadItem` for a local Zarr entry, computing its digest.

    Files whose on-disk size exceeds ``ZARR_LARGE_CHUNK_THRESHOLD`` are
    digested with `multipart_md5file_nocache`; all other files get a plain
    MD5 via `md5file_nocache`.

    :param e: the local Zarr entry to digest; its ``filepath`` is stat'ed
        and read
    :returns: the item produced by `UploadItem.from_entry` for ``e`` and the
        computed digest
    """
    # Imported here rather than at module level to avoid a heavy import:
    from dandi.support.digests import md5file_nocache, multipart_md5file_nocache

    # NOTE(review): presumably the threshold mirrors the size above which the
    # entry is uploaded in multiple parts -- confirm against the uploader.
    if e.filepath.stat().st_size > ZARR_LARGE_CHUNK_THRESHOLD:
        checksum = multipart_md5file_nocache(e.filepath)
    else:
        checksum = md5file_nocache(e.filepath)
    return UploadItem.from_entry(e, checksum)
10441050

10451051
def get_items(self, jobs: int = 5) -> Generator[UploadItem, None, None]:

dandi/support/digests.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,30 @@ def md5file_nocache(filepath: str | Path) -> str:
137137
return Digester(["md5"])(filepath)["md5"]
138138

139139

140+
def multipart_md5file_nocache(filepath: str | Path) -> str:
141+
"""
142+
Compute the S3 multipart ETag for a file.
143+
144+
Splits the file into parts of ``part_size`` bytes, hashes each part with
145+
MD5, then returns ``MD5(concat(part_md5s))-{num_parts}``, matching what S3
146+
stores as the ETag for a multipart upload.
147+
"""
148+
if isinstance(filepath, str):
149+
filepath = Path(filepath)
150+
151+
part_size = DandiETag(filepath.stat().st_size)._part_gen.initial_part_size
152+
part_md5s = b""
153+
num_parts = 0
154+
with open(filepath, "rb") as f:
155+
while True:
156+
chunk = f.read(part_size)
157+
if not chunk:
158+
break
159+
part_md5s += hashlib.md5(chunk).digest()
160+
num_parts += 1
161+
return f"{hashlib.md5(part_md5s).hexdigest()}-{num_parts}"
162+
163+
140164
def checksum_zarr_dir(
141165
files: dict[str, tuple[str, int]], directories: dict[str, tuple[str, int]]
142166
) -> str:

0 commit comments

Comments
 (0)