Skip to content

Commit 3da89bd

Browse files
committed
fix: check for existence of DC resources before starting uploads (close #85)
1 parent bc1b41a commit 3da89bd

7 files changed

Lines changed: 132 additions & 25 deletions

File tree

CHANGELOG

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
0.17.2
2+
- fix: check for existence of DC resources before starting uploads (#85)
23
- enh: sort resources according to internal basin dependencies (#79)
34
- ref: use more f-strings
5+
- ref: add `common.is_dc_file` for consistent resource checks
46
0.17.1
57
- fix: found culprit of regular segmentation faults (#14)
68
- reg: invalid use PyQt6 namespace after migration from PyQt5

dcoraid/common.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pathlib
44
import weakref
55

6+
import dclab
67
import requests
78

89
from .api import s3_api
@@ -59,6 +60,27 @@ def etagsum(path):
5960
return etag
6061

6162

63+
@functools.lru_cache(maxsize=2000)
64+
def is_dc_file(path, test_open=True):
65+
"""Return True when the file is a valid DC dataset
66+
67+
Performs a path suffix check.
68+
If `test_open` is True (default), attempt to open the file with dclab.
69+
"""
70+
is_dc = False
71+
path = pathlib.Path(path)
72+
if path.suffix.lower() in [".rtdc", ".dc"]:
73+
if test_open:
74+
try:
75+
with dclab.new_dataset(path):
76+
is_dc = True
77+
except BaseException:
78+
pass
79+
else:
80+
is_dc = True
81+
return is_dc
82+
83+
6284
@functools.lru_cache(maxsize=2000)
6385
def sha256sum(path):
6486
"""Compute the SHA256 hash of a file"""

dcoraid/gui/upload/dlg_upload.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88

99
from ...api import dataset_create
10+
from ...common import is_dc_file
1011
from ...upload import job
1112

1213
from ..tools import ShowWaitCursor
@@ -297,11 +298,22 @@ def on_proceed(self):
297298
This will first trigger a creation of the draft dataset
298299
on DCOR. Then, the job is enqueued in the parent
299300
"""
300-
# We should only proceed if we have resources
301+
# We should only proceed when we have resources
301302
if self.rvmodel.rowCount() == 0:
302303
QtWidgets.QMessageBox.critical(self, "No resources selected",
303304
"Please add at least one resource.")
304305
return
306+
307+
# In addition, we should only proceed when we have at least one
308+
# DC resource.
309+
paths = self.rvmodel.get_file_list()
310+
if sum([is_dc_file(pp, test_open=False) for pp in paths]) == 0:
311+
QtWidgets.QMessageBox.critical(
312+
self, "No DC resources specified",
313+
"Please add at least one deformability cytometry "
314+
"resource.")
315+
return
316+
305317
# Checking for duplicate resources is the responsibility of
306318
# DCOR-Aid, because we are skipping existing resources in
307319
# dcoraid.upload.job.UploadJob.task_upload_resources.

dcoraid/gui/upload/resources_model.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from PyQt6 import QtCore, QtGui
88

9+
from ...common import is_dc_file
910
from ...upload import job
1011

1112

@@ -61,10 +62,7 @@ def filenames_were_edited(self):
6162
def index_is_dc(self, index):
6263
"""Does the given index instance belong to an RT-DC file?"""
6364
path = pathlib.Path(self.get_file_list()[index.row()])
64-
if path.suffix in [".dc", ".rtdc"]:
65-
return True
66-
else:
67-
return False
65+
return is_dc_file(path, test_open=False)
6866

6967
def index_has_edits(self, index):
7068
"""Is there a modification of the list entry of this index instance?"""

dcoraid/upload/job.py

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
from dclab.cli import compress
1313

1414
from ..api import dataset_activate, resource_add, resource_exists
15-
from ..common import sha256sum
15+
from ..common import is_dc_file, sha256sum
16+
17+
18+
class AtLeastOneDCResourceRequiredPerDatasetError(BaseException):
19+
"""Raised when an upload does not contain at least one DC resource"""
20+
pass
1621

1722

1823
logger = logging.getLogger(__name__)
@@ -76,10 +81,28 @@ def __init__(self, api, dataset_id, resource_paths,
7681
Multiple upload jobs may share the same cache dir,
7782
since each job creates its own subdirectory.
7883
"""
84+
# Ensure resource_paths is a list in case somebody passed an iterator.
85+
resource_paths = list(resource_paths)
86+
7987
self.api = api.copy() # create a copy of the API
8088
self.dataset_id = dataset_id
89+
90+
# Check whether at least one DC resource is present in the list.
91+
# This is a hard DCOR requirement.
92+
for pp in resource_paths:
93+
if is_dc_file(pp):
94+
break
95+
else:
96+
raise AtLeastOneDCResourceRequiredPerDatasetError(
97+
f"DCOR requires at least one valid deformability cytometry "
98+
f"file per dataset upload. Please make sure that this "
99+
f"condition is met and all of these files exist: "
100+
f"{resource_paths}")
101+
81102
# make sure the dataset_id is valid
82103
self.api.get("package_show", id=self.dataset_id, timeout=500)
104+
105+
# resolve paths and set resource names
83106
resolved_paths = [pathlib.Path(pp).resolve() for pp in resource_paths]
84107
if resource_names is None:
85108
resource_names = [pp.name for pp in resolved_paths]
@@ -192,18 +215,19 @@ def sort_resources_according_to_basin_hierarchy(paths) -> list[int]:
192215
paths_basins = {}
193216
for pp in path_strings:
194217
basins = []
195-
try:
196-
with dclab.new_dataset(pp) as ds:
197-
for bn in ds.basins:
198-
if (bn.basin_type == "file"
199-
and bn.basin_format == "hdf5"):
200-
bpath = str(bn.location.resolve())
201-
if bpath in path_strings:
202-
basins.append(bpath)
203-
except BaseException:
204-
logger.error(f"Failed to get basin info for {pp}. Traceback"
205-
f"follows.")
206-
logger.error(traceback.format_exc())
218+
if is_dc_file(pp, test_open=False):
219+
try:
220+
with dclab.new_dataset(pp) as ds:
221+
for bn in ds.basins:
222+
if (bn.basin_type == "file"
223+
and bn.basin_format == "hdf5"):
224+
bpath = str(bn.location.resolve())
225+
if bpath in path_strings:
226+
basins.append(bpath)
227+
except BaseException:
228+
logger.error(f"Failed to get basin info for {pp}. "
229+
f"Traceback follows.")
230+
logger.error(traceback.format_exc())
207231
paths_basins[pp] = basins
208232

209233
# edit path_strings in-place and populate paths_sort_order
@@ -418,14 +442,12 @@ def task_compress_resources(self):
418442
Data are stored in the user's cache directory and
419443
deleted after upload is complete.
420444
"""
421-
# make sure that we have .rtdc or .dc files
422-
dc_files = [pp for pp in self.paths if pp.suffix in [".rtdc", ".dc"]]
423-
if not dc_files:
424-
raise ValueError("There are no RT-DC files in this dataset!")
425445
self.set_state("compress")
426446
ds_dict = self.api.get("package_show", id=self.dataset_id, timeout=500)
427447
for ii, path in enumerate(self.paths):
428-
if path.suffix in [".rtdc", ".dc"]: # do we have a DC file?
448+
# We check EVERY DC file. So `test_open=False`. Integrity checker
449+
# will open it and make sure it is a valid DC instance.
450+
if is_dc_file(path, test_open=False):
429451
if resource_exists(
430452
dataset_id=self.dataset_id,
431453
resource_name=self.resource_names[ii],
@@ -434,7 +456,7 @@ def task_compress_resources(self):
434456
dataset_dict=ds_dict
435457
):
436458
# There is no need to compress resources that have
437-
# already been upladed to DCOR. The same check is done
459+
# already been uploaded to DCOR. The same check is done
438460
# in task_upload_resources, so there is no danger that
439461
# an uncompressed resource would be uploaded.
440462
continue

dcoraid/upload/task.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import uuid
1919

2020
from ..api import dataset_create, APINotFoundError
21+
from ..common import is_dc_file
2122

2223
from .job import UploadJob
2324

@@ -271,7 +272,7 @@ def load_task(path, api, dataset_kwargs=None, map_task_to_dataset_id=None,
271272
raise ValueError("Number of resource paths does not match number "
272273
f"of resource supplements in '{path}'!")
273274
for ii, pp in enumerate(uj_state["resource_paths"]):
274-
if (not str(pp).endswith(".rtdc")
275+
if (not is_dc_file(pp, test_open=False)
275276
and uj_state["resource_supplements"][ii]):
276277
raise ValueError("Resource supplements must be empty for "
277278
f"non-RT-DC datasets in '{path}'!")

tests/test_upload_job.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,56 @@
2424
]
2525

2626

27+
def test_check_existence_for_dc_resource_1(tmp_path):
28+
(tmp_path / "test.csv").write_text("This is not a DC resource")
29+
(tmp_path / "test.txt").write_text("This is not a DC resource as well")
30+
31+
api = common.get_api()
32+
33+
# create some metadata
34+
bare_dict = common.make_dataset_dict(hint="create-with-resource")
35+
# create dataset (to get the "id")
36+
dataset_dict = dataset_create(dataset_dict=bare_dict, api=api)
37+
with pytest.raises(job.AtLeastOneDCResourceRequiredPerDatasetError,
38+
match="test.csv"):
39+
job.UploadJob(api=api,
40+
dataset_id=dataset_dict["id"],
41+
resource_paths=list(tmp_path.glob("*")))
42+
43+
44+
def test_check_existence_for_dc_resource_2(tmp_path):
45+
(tmp_path / "test.rtdc").write_text("This is not a DC resource")
46+
47+
api = common.get_api()
48+
49+
# create some metadata
50+
bare_dict = common.make_dataset_dict(hint="create-with-resource")
51+
# create dataset (to get the "id")
52+
dataset_dict = dataset_create(dataset_dict=bare_dict, api=api)
53+
with pytest.raises(job.AtLeastOneDCResourceRequiredPerDatasetError):
54+
job.UploadJob(api=api,
55+
dataset_id=dataset_dict["id"],
56+
resource_paths=list(tmp_path.glob("*")))
57+
58+
59+
def test_check_existence_for_dc_resource_control(tmp_path):
60+
(tmp_path / "test.rtdc").write_text(
61+
"This is not a DC resource and running the upload job would fail,"
62+
"but we are only checking UploadJob instantiation here.")
63+
shutil.copy2(rtdc_paths[0], tmp_path / "test2.rtdc")
64+
65+
api = common.get_api()
66+
67+
# create some metadata
68+
bare_dict = common.make_dataset_dict(hint="create-with-resource")
69+
# create dataset (to get the "id")
70+
dataset_dict = dataset_create(dataset_dict=bare_dict, api=api)
71+
uj = job.UploadJob(api=api,
72+
dataset_id=dataset_dict["id"],
73+
resource_paths=list(tmp_path.glob("*")))
74+
assert len(uj.paths) == 2
75+
76+
2777
def test_resource_name_characters():
2878
assert re.match(job.VALID_RESOURCE_REGEXP, job.VALID_RESOURCE_CHARS)
2979
assert re.match(job.VALID_RESOURCE_REGEXP, job.VALID_RESOURCE_CHARS)

0 commit comments

Comments
 (0)