|
26 | 26 | from fscacher import PersistentCache |
27 | 27 | from zarr_checksum.checksum import ZarrChecksum, ZarrChecksumManifest |
28 | 28 | from zarr_checksum.tree import ZarrChecksumTree |
| 29 | +from dandi.consts import ZARR_LARGE_CHUNK_THRESHOLD |
29 | 30 |
|
30 | 31 | from .threaded_walk import threaded_walk |
31 | 32 | from ..utils import Hasher, exclude_from_zarr |
@@ -133,32 +134,17 @@ def md5file_nocache(filepath: str | Path) -> str: |
133 | 134 | Compute the MD5 digest of a file without caching with fscacher, which has |
134 | 135 | been shown to slow things down for the large numbers of files typically |
135 | 136 | present in Zarrs |
136 | | - """ |
137 | | - return Digester(["md5"])(filepath)["md5"] |
138 | 137 |
|
139 | | - |
140 | | -def multipart_md5file_nocache(filepath: str | Path) -> str: |
141 | | - """ |
142 | | - Compute the S3 multipart ETag for a file. |
143 | | -
|
144 | | - Splits the file into parts of ``part_size`` bytes, hashes each part with |
145 | | - MD5, then returns ``MD5(concat(part_md5s))-{num_parts}``, matching what S3 |
146 | | - stores as the ETag for a multipart upload. |
| 138 | + If the file is larger than `ZARR_LARGE_CHUNK_THRESHOLD`, the computed checksum is not a |
| 139 | + traditional md5 checksum, but is instead an S3 multipart ETag. |
147 | 140 | """ |
148 | 141 | if isinstance(filepath, str): |
149 | 142 | filepath = Path(filepath) |
150 | 143 |
|
151 | | - part_size = DandiETag(filepath.stat().st_size)._part_gen.initial_part_size |
152 | | - part_md5s = b"" |
153 | | - num_parts = 0 |
154 | | - with open(filepath, "rb") as f: |
155 | | - while True: |
156 | | - chunk = f.read(part_size) |
157 | | - if not chunk: |
158 | | - break |
159 | | - part_md5s += hashlib.md5(chunk).digest() |
160 | | - num_parts += 1 |
161 | | - return f"{hashlib.md5(part_md5s).hexdigest()}-{num_parts}" |
| 144 | + if filepath.stat().st_size > ZARR_LARGE_CHUNK_THRESHOLD: |
| 145 | + return get_dandietag(filepath).as_str() |
| 146 | + |
| 147 | + return Digester(["md5"])(filepath)["md5"] |
162 | 148 |
|
163 | 149 |
|
164 | 150 | def checksum_zarr_dir( |
|
0 commit comments