|
10 | 10 | import json |
11 | 11 | import os |
12 | 12 | import os.path |
| 13 | +import re |
13 | 14 | from pathlib import Path |
| 15 | +from threading import Lock |
14 | 16 | from time import sleep |
15 | 17 | from typing import Any, Optional |
16 | 18 | import urllib.parse |
| 19 | +from xml.etree.ElementTree import fromstring |
17 | 20 |
|
18 | 21 | from dandischema.models import BareAsset, DigestType |
19 | 22 | from pydantic import BaseModel, ConfigDict, ValidationError |
|
25 | 28 | from dandi.consts import ( |
26 | 29 | MAX_ZARR_DEPTH, |
27 | 30 | ZARR_DELETE_BATCH_SIZE, |
| 31 | + ZARR_LARGE_CHUNK_THRESHOLD, |
28 | 32 | ZARR_MIME_TYPE, |
29 | 33 | ZARR_UPLOAD_BATCH_SIZE, |
30 | 34 | ) |
|
46 | 50 | pre_upload_size_check, |
47 | 51 | ) |
48 | 52 |
|
49 | | -from .bases import LocalDirectoryAsset |
| 53 | +from .bases import LocalDirectoryAsset, _upload_blob_part |
50 | 54 | from ..validate._types import ( |
51 | 55 | ORIGIN_VALIDATION_DANDI_ZARR, |
52 | 56 | Origin, |
@@ -742,13 +746,40 @@ def mkzarr() -> str: |
742 | 746 | ): |
743 | 747 | # Items to upload in this batch (may be retried e.g. due to |
744 | 748 | # 403 errors because of timed-out upload URLs) |
745 | | - items_to_upload = list(items) |
| 749 | + all_items = list(items) |
| 750 | + large_items = [ |
| 751 | + it for it in all_items if it.size > ZARR_LARGE_CHUNK_THRESHOLD |
| 752 | + ] |
| 753 | + items_to_upload = [ |
| 754 | + it for it in all_items if it.size <= ZARR_LARGE_CHUNK_THRESHOLD |
| 755 | + ] |
746 | 756 | max_retries = 5 |
747 | 757 | retry_count = 0 |
748 | 758 | # Add all items to checksum tree (only done once) |
749 | | - for it in items_to_upload: |
| 759 | + for it in all_items: |
750 | 760 | zcc.add_leaf(Path(it.entry_path), it.size, it.digest) |
751 | 761 |
|
| 762 | + # Upload chunks above 5GB individually via multipart upload |
| 763 | + for it in large_items: |
| 764 | + for status in upload_zarr_file_multipart( |
| 765 | + item=it, |
| 766 | + zarr_id=zarr_id, |
| 767 | + dandiset=dandiset, |
| 768 | + jobs=jobs, |
| 769 | + ): |
| 770 | + if status.get("status") == "done": |
| 771 | + changed = True |
| 772 | + bytes_uploaded += it.size |
| 773 | + yield { |
| 774 | + "status": "uploading", |
| 775 | + "progress": 100 |
| 776 | + * bytes_uploaded |
| 777 | + / to_upload.total_size, |
| 778 | + "current": bytes_uploaded, |
| 779 | + } |
| 780 | + else: |
| 781 | + yield status |
| 782 | + |
752 | 783 | while items_to_upload and retry_count <= max_retries: |
753 | 784 | # Prepare upload requests for current items |
754 | 785 | uploading = [it.upload_request() for it in items_to_upload] |
@@ -903,6 +934,115 @@ def _handle_failed_items_and_raise( |
903 | 934 | raise failed_items[0][1] |
904 | 935 |
|
905 | 936 |
|
| 937 | +def upload_zarr_file_multipart( |
| 938 | + item: UploadItem, |
| 939 | + zarr_id: str, |
| 940 | + dandiset: RemoteDandiset, |
| 941 | + jobs: int | None = None, |
| 942 | +): |
| 943 | + # Avoid heavy import by importing within function: |
| 944 | + from dandi.support.digests import get_dandietag |
| 945 | + |
| 946 | + client = dandiset.client |
| 947 | + |
| 948 | + yield {"status": "calculating etag"} |
| 949 | + etagger = get_dandietag(item.filepath) |
| 950 | + filetag = etagger.as_str() |
| 951 | + |
| 952 | + yield {"status": "initiating upload"} |
| 953 | + lgr.debug("%s: Beginning upload", item.filepath) |
| 954 | + total_size = pre_upload_size_check(item.filepath) |
| 955 | + |
| 956 | + resp = client.post( |
| 957 | + "/uploads/zarr/initialize/", |
| 958 | + json={ |
| 959 | + "contentSize": total_size, |
| 960 | + "digest": { |
| 961 | + "algorithm": "dandi:dandi-etag", |
| 962 | + "value": filetag, |
| 963 | + }, |
| 964 | + "zarr": { |
| 965 | + "chunk_key": item.entry_path, |
| 966 | + "zarr_id": zarr_id, |
| 967 | + }, |
| 968 | + }, |
| 969 | + ) |
| 970 | + |
| 971 | + try: |
| 972 | + upload_id = resp["upload_id"] |
| 973 | + parts = resp["parts"] |
| 974 | + if len(parts) != etagger.part_qty: |
| 975 | + raise RuntimeError( |
| 976 | + f"Server and client disagree on number of parts for upload;" |
| 977 | + f" server says {len(parts)}, client says {etagger.part_qty}" |
| 978 | + ) |
| 979 | + parts_out = [] |
| 980 | + bytes_uploaded = 0 |
| 981 | + lgr.debug("Uploading %s in %d parts", item.filepath, len(parts)) |
| 982 | + with RESTFullAPIClient("http://nil.nil") as storage: |
| 983 | + with item.filepath.open("rb") as fp: |
| 984 | + with ThreadPoolExecutor(max_workers=jobs or 5) as executor: |
| 985 | + lock = Lock() |
| 986 | + futures = [ |
| 987 | + executor.submit( |
| 988 | + _upload_blob_part, |
| 989 | + storage_session=storage, |
| 990 | + fp=fp, |
| 991 | + lock=lock, |
| 992 | + etagger=etagger, |
| 993 | + asset_path=item.entry_path, |
| 994 | + part=part, |
| 995 | + ) |
| 996 | + for part in parts |
| 997 | + ] |
| 998 | + for fut in as_completed(futures): |
| 999 | + out_part = fut.result() |
| 1000 | + bytes_uploaded += out_part["size"] |
| 1001 | + yield { |
| 1002 | + "status": "uploading", |
| 1003 | + "progress": 100 * bytes_uploaded / total_size, |
| 1004 | + "current": bytes_uploaded, |
| 1005 | + } |
| 1006 | + parts_out.append(out_part) |
| 1007 | + |
| 1008 | + lgr.debug("%s: Completing upload", item.entry_path) |
| 1009 | + resp = client.post( |
| 1010 | + f"/uploads/zarr/{upload_id}/complete/", |
| 1011 | + json={"parts": parts_out}, |
| 1012 | + ) |
| 1013 | + lgr.debug( |
| 1014 | + "%s: Announcing completion to %s", |
| 1015 | + item.entry_path, |
| 1016 | + resp["complete_url"], |
| 1017 | + ) |
| 1018 | + r = storage.post(resp["complete_url"], data=resp["body"], json_resp=False) |
| 1019 | + lgr.debug( |
| 1020 | + "%s: Upload completed. Response content: %s", |
| 1021 | + item.entry_path, |
| 1022 | + r.content, |
| 1023 | + ) |
| 1024 | + rxml = fromstring(r.text) |
| 1025 | + m = re.match(r"\{.+?\}", rxml.tag) |
| 1026 | + ns = m.group(0) if m else "" |
| 1027 | + final_etag = rxml.findtext(f"{ns}ETag") |
| 1028 | + if final_etag is not None: |
| 1029 | + final_etag = final_etag.strip('"') |
| 1030 | + if final_etag != filetag: |
| 1031 | + raise RuntimeError( |
| 1032 | + "Server and client disagree on final ETag of" |
| 1033 | + f" uploaded file; server says {final_etag}," |
| 1034 | + f" client says {filetag}" |
| 1035 | + ) |
| 1036 | + # else: Error? Warning? |
| 1037 | + resp = client.post(f"/uploads/zarr/{upload_id}/validate/") |
| 1038 | + yield {"status": "done"} |
| 1039 | + except Exception: |
| 1040 | + post_upload_size_check(item.filepath, total_size, True) |
| 1041 | + raise |
| 1042 | + else: |
| 1043 | + post_upload_size_check(item.filepath, total_size, False) |
| 1044 | + |
| 1045 | + |
906 | 1046 | def _upload_zarr_file( |
907 | 1047 | storage_session: RESTFullAPIClient, |
908 | 1048 | dandiset: RemoteDandiset, |
|
0 commit comments