Skip to content

Commit 3e86390

Browse files
committed
make configurable
1 parent d5d6617 commit 3e86390

File tree

4 files changed

+27
-35
lines changed

4 files changed

+27
-35
lines changed

lib/galaxy/config/sample/file_sources_conf.yml.sample

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# for accessing passwords stored in a vault:
1616
# password: ${user.user_vault.read_secret('preferences/owncloud/password')}
1717

18-
# By default, the plugin will use temp files to avoid loading entire files into memory.
18+
# By default, the plugin will use temp files to avoid loading entire files into memory.
1919
# You can change the directory here or omit to use the default temp directory.
2020
temp_path: /your/temp/path
2121
# Set writable to true if you have write access to this source
@@ -210,6 +210,10 @@
210210
# token: ${user.preferences['invenio_sandbox|token']} # Alternatively use this for retrieving the token from user preferences instead of the Vault
211211
public_name: ${user.preferences['invenio_sandbox|public_name']}
212212
writable: true
213+
# Enable multipart upload for files whose size is >= this threshold (value in MiB, optional; multipart upload is disabled when unset)
214+
# multipart_threshold: 100
215+
# Part size for multipart uploads (value in MiB, optional; defaults to the 5 MiB minimum part size)
216+
# multipart_chunk_size: 50
213217

214218
- type: zenodo
215219
id: zenodo

lib/galaxy/files/sources/_rdm.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,15 @@
2525
class RDMFileSourceTemplateConfiguration(BaseFileSourceTemplateConfiguration):
2626
token: Optional[Union[str, TemplateExpansion]] = None
2727
public_name: Optional[Union[str, TemplateExpansion]] = None
28-
multipart_threshold: Optional[Union[int, TemplateExpansion]] = None # bytes
29-
multipart_chunk_size: Optional[Union[int, TemplateExpansion]] = None # bytes
28+
multipart_threshold: Optional[Union[int, TemplateExpansion]] = None # MB
29+
multipart_chunk_size: Optional[Union[int, TemplateExpansion]] = None # MB
3030

3131

3232
class RDMFileSourceConfiguration(BaseFileSourceConfiguration):
3333
token: Optional[str] = None
3434
public_name: Optional[str] = None
35-
multipart_threshold: Optional[int] = None # bytes
36-
multipart_chunk_size: Optional[int] = None # bytes
35+
multipart_threshold: Optional[int] = None # MB
36+
multipart_chunk_size: Optional[int] = None # MB
3737

3838

3939
class ContainerAndFileIdentifier(NamedTuple):

lib/galaxy/files/sources/invenio.py

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -113,14 +113,11 @@ class RecordLinks(TypedDict):
113113
reserve_doi: str
114114

115115

116-
# AWS S3 multipart limits (used by Invenio RDM)
117-
MIN_UPLOAD_PART_SIZE = 50 * 1024 * 1024 # 50 MiB
116+
# AWS S3 multipart default limits (used by Invenio RDM)
117+
MIN_UPLOAD_PART_SIZE = 5 * 1024 * 1024 # 5 MiB
118118
MAX_UPLOAD_PART_SIZE = 5 * 1024**3 # 5 GiB
119119
MAX_UPLOAD_PARTS = 10_000
120120

121-
# Default threshold for using multipart upload (100 MiB)
122-
DEFAULT_MULTIPART_THRESHOLD = 100 * 1024 * 1024
123-
124121

125122
def calculate_multipart_params(file_size: int, preferred_part_size: int | None = None) -> tuple[int, int]:
126123
"""Calculate optimal parts count and part size for multipart upload.
@@ -391,16 +388,12 @@ def upload_file_to_draft_container(
391388
context: FilesSourceRuntimeContext[RDMFileSourceConfiguration],
392389
):
393390
file_size = os.path.getsize(file_path)
394-
threshold = context.config.multipart_threshold
395-
396-
# Use default threshold if not configured
397-
if threshold is None or threshold <= 0:
398-
threshold = DEFAULT_MULTIPART_THRESHOLD
399-
400-
use_multipart = file_size >= threshold
401391

392+
threshold_mb = context.config.multipart_threshold
393+
# Convert threshold to bytes (config value is interpreted as MiB, i.e. multiples of 1024 * 1024 bytes)
394+
threshold_bytes = threshold_mb * 1024 * 1024 if threshold_mb else None
395+
use_multipart = file_size >= threshold_bytes if threshold_bytes else False
402396
if use_multipart:
403-
log.info(f"Using multipart upload for file '{filename}' ({file_size} bytes >= threshold {threshold})")
404397
self._upload_file_multipart(record_id, filename, file_path, file_size, context)
405398
else:
406399
self._upload_file_single(record_id, filename, file_path, context, file_size)
@@ -434,11 +427,10 @@ def _upload_file_single(
434427
response = requests.put(upload_file_content_url, data=file, headers=headers)
435428
# Handle 413 (Payload Too Large) - suggest using multipart upload
436429
if response.status_code == 413:
437-
threshold_mb = DEFAULT_MULTIPART_THRESHOLD / (1024 * 1024)
438430
raise Exception(
439431
f"Failed to upload file '{filename}' ({file_size} bytes): HTTP 413 Payload Too Large. "
440432
f"The server rejected the upload because the file is too large for a single request. "
441-
f"Please configure 'multipart_threshold' to {threshold_mb}MB or lower to enable multipart upload for files of this size."
433+
f"Please configure 'multipart_threshold' in the file source configuration to enable multipart upload for files of this size."
442434
)
443435
self._ensure_response_has_expected_status_code(response, 200)
444436

@@ -463,7 +455,9 @@ def _upload_file_multipart(
463455
4. Upload parts (parallel for > 2 parts)
464456
5. POST to commit URL
465457
"""
466-
preferred_part_size = context.config.multipart_chunk_size
458+
preferred_part_size_mb = context.config.multipart_chunk_size
459+
# Convert chunk size to bytes (config value is interpreted as MiB, i.e. multiples of 1024 * 1024 bytes)
460+
preferred_part_size = preferred_part_size_mb * 1024 * 1024 if preferred_part_size_mb else None
467461
num_parts, part_size = calculate_multipart_params(file_size, preferred_part_size)
468462

469463
log.info(f"Multipart upload: {num_parts} parts of {part_size} bytes each for '{filename}'")
@@ -472,7 +466,6 @@ def _upload_file_multipart(
472466
upload_file_url = record["links"]["files"]
473467
headers = self._get_request_headers(context, auth_required=True)
474468

475-
# Initialize multipart upload with transfer metadata
476469
file_metadata = {
477470
"key": filename,
478471
"size": file_size,
@@ -497,13 +490,8 @@ def _upload_file_multipart(
497490
)
498491

499492
# Sort part links by part number to ensure correct ordering
500-
# Invenio uses 'part' key, not 'part_number'
501493
part_links = sorted(part_links, key=lambda p: p.get("part", 0))
502-
503-
# Upload parts
504494
self._upload_parts(file_path, file_size, part_size, part_links, headers)
505-
506-
# Commit multipart upload
507495
response = requests.post(commit_url, json={}, headers=headers)
508496
self._ensure_response_has_expected_status_code(response, 200)
509497
log.info(f"Multipart upload completed for '{filename}'")

test/unit/files/test_invenio_multipart.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,25 +20,25 @@ def test_calculate_multipart_params_zero_byte(self):
2020
assert part_size == 0
2121

2222
def test_calculate_multipart_params_small_file(self):
23-
"""Files under 50 MiB should use minimum part size."""
24-
# 10 MiB file
25-
file_size = 10 * 1024 * 1024
23+
"""Files under 5 MiB should use minimum part size."""
24+
# 2 MiB file
25+
file_size = 2 * 1024 * 1024
2626
parts, part_size = calculate_multipart_params(file_size)
2727
assert parts == 1
2828
assert part_size == MIN_UPLOAD_PART_SIZE
2929

3030
def test_calculate_multipart_params_medium_file(self):
31-
"""Files between 50 MiB and 100 MiB."""
32-
# 75 MiB file
33-
file_size = 75 * 1024 * 1024
31+
"""Files between 5 MiB and 10 MiB."""
32+
# 7.5 MiB file
33+
file_size = 7 * 1024 * 1024 + 512 * 1024
3434
parts, part_size = calculate_multipart_params(file_size)
3535
assert parts == 2
3636
assert part_size == MIN_UPLOAD_PART_SIZE
3737

3838
def test_calculate_multipart_params_large_file(self):
3939
"""Large files requiring multiple parts."""
40-
# 250 MiB file
41-
file_size = 250 * 1024 * 1024
40+
# 25 MiB file
41+
file_size = 25 * 1024 * 1024
4242
parts, part_size = calculate_multipart_params(file_size)
4343
assert parts == 5
4444
assert part_size == MIN_UPLOAD_PART_SIZE

0 commit comments

Comments
 (0)