Skip to content

Commit a301074

Browse files
committed
enh: allow specifying collections in upload task files and via the API
1 parent 1df814a commit a301074

7 files changed

Lines changed: 117 additions & 5 deletions

File tree

CHANGELOG

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
0.17.3
2+
- enh: allow specifying collections in upload task files and via the API
23
- docs: increase warning threshold for max number of resources to 1000
34
(CKAN 2.11.3 handles datasets with many resources much better)
45
0.17.2

dcoraid/api/ckan_api.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,3 +430,23 @@ def post(self, api_call, data, dump_json=True, headers=None,
430430
timeout=timeout)
431431
resp = self.handle_response(req, api_call)
432432
return resp["result"]
433+
434+
def require_collection(self,
                       name: str,
                       title: str = None):
    """Return the dict of the collection `name`, creating it on demand

    A collection is looked up via the "group_show" API call. When
    the server reports that it does not exist, it is created with
    "group_create" and then fetched again.

    Parameters
    ----------
    name: str
        Identifier of the collection
    title: str
        Title used when the collection has to be created; falls
        back to `name` if empty or not given. For a collection
        that already exists, this argument has no effect.

    Returns
    -------
    col_dict: dict
        Result of the "group_show" API call for `name`
    """
    try:
        # nothing else to do if the group is already there
        return self.get("group_show", id=name)
    except APINotFoundError:
        # the group is missing: create it, then fetch it again
        payload = {"name": name,
                   "title": title if title else name,
                   }
        self.post("group_create", data=payload)
        return self.get("group_show", id=name)

dcoraid/upload/job.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from dclab.rtdc_dataset.check import IntegrityChecker
1212
from dclab.cli import compress
1313

14-
from ..api import dataset_activate, resource_add, resource_exists
14+
from ..api import CKANAPI, dataset_activate, resource_add, resource_exists
1515
from ..common import is_dc_file, sha256sum
1616

1717

@@ -48,9 +48,15 @@ class AtLeastOneDCResourceRequiredPerDatasetError(BaseException):
4848

4949

5050
class UploadJob:
51-
def __init__(self, api, dataset_id, resource_paths,
52-
resource_names=None, resource_supplements=None,
53-
task_id=None, cache_dir=None):
51+
def __init__(self,
52+
api: CKANAPI,
53+
dataset_id: str,
54+
resource_paths: list[str | pathlib.Path],
55+
resource_names: list[str] = None,
56+
resource_supplements: list[dict] = None,
57+
collections: list[str] = None,
58+
task_id: str = None,
59+
cache_dir: str | pathlib.Path = None):
5460
"""Wrapper for resource uploads
5561
5662
This job is meant to be run from a separate thread.
@@ -73,6 +79,9 @@ def __init__(self, api, dataset_id, resource_paths,
7379
on DCOR
7480
resource_supplements: list of dict
7581
Supplementary resource information
82+
collections: list of strings
83+
List of unique identifiers of collections that this
84+
dataset should be appended to
7685
task_id: str
7786
Unique task ID (used for identifying jobs uploaded already)
7887
cache_dir: str or pathlib.Path
@@ -86,6 +95,16 @@ def __init__(self, api, dataset_id, resource_paths,
8695

8796
self.api = api.copy() # create a copy of the API
8897
self.dataset_id = dataset_id
98+
self.collections = []
99+
# add dataset to collections
100+
for col in collections or []:
101+
col_dict = self.api.require_collection(col)
102+
self.collections.append(col_dict)
103+
if self.collections:
104+
revise_dict = {
105+
"match": {"id": dataset_id},
106+
"update": {"groups": self.collections}}
107+
api.post("package_revise", revise_dict)
89108

90109
# Check whether at least one DC resource is present in the list.
91110
# This is a hard DCOR requirement.

tests/common.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def make_upload_task(task_id=True, # tester may pass `None` to disable
8888
resource_paths=None,
8989
resource_names=None,
9090
resource_supplements=None,
91+
collections=None,
9192
):
9293
"""Return path to example task file"""
9394
if resource_paths is None:
@@ -100,6 +101,8 @@ def make_upload_task(task_id=True, # tester may pass `None` to disable
100101
dataset_dict = make_dataset_dict(hint="task_test")
101102
if dataset_dict and dataset_id is None:
102103
dataset_id = dataset_dict.get("id")
104+
if collections is None:
105+
collections = []
103106
td = pathlib.Path(tempfile.mkdtemp(prefix="task_"))
104107
# copy resources there
105108
new_resource_paths = []
@@ -117,6 +120,7 @@ def make_upload_task(task_id=True, # tester may pass `None` to disable
117120
"resource_paths": [str(pp) for pp in new_resource_paths],
118121
"resource_names": resource_names,
119122
"resource_supplements": resource_supplements,
123+
"collections": collections,
120124
}
121125
data = {"upload_job": uj_state}
122126
if dataset_dict:

tests/test_api_base.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import time
2+
import uuid
23

34
import pytest
45

@@ -37,6 +38,23 @@ def test_api_requests_cache_with_parameters():
3738
assert (t1 - t0) > (t2 - t1)
3839

3940

41+
def test_require_collection():
    """Collections are created on first request and keep their title"""
    api = common.get_api()
    name = f"test-collection-{uuid.uuid4()}"
    title = f"Test Collection {name[-5:]}"

    # first request: the collection is new and gets the given title
    created = api.require_collection(name=name, title=title)
    assert created["name"] == name
    assert created["title"] == title

    # second request: the collection exists, so the new title
    # must not show up in the returned dict
    fetched = api.require_collection(name=name, title="RANDOM")
    assert fetched["name"] == name
    assert fetched["title"] == title
56+
57+
4058
@pytest.mark.parametrize("server,api_server", [
4159
("http://localhost:5000", "http://localhost:5000"),
4260
("https://localhost:5000", "https://localhost:5000"),

tests/test_upload_job.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import tempfile
66
import time
77
from unittest import mock
8+
import uuid
89
import warnings
910

1011
import pytest
@@ -74,6 +75,31 @@ def test_check_existence_for_dc_resource_control(tmp_path):
7475
assert len(uj.paths) == 2
7576

7677

78+
def test_collection():
    """An upload job with `collections` puts the dataset in that group"""
    api = common.get_api()
    # unique name, so the collection cannot exist from a previous run
    collection = f"test-collection-{uuid.uuid4()}"
    bare_dict = common.make_dataset_dict(hint="create-with-resource")
    # the dataset is created up front, because the job needs its "id"
    dataset_dict = dataset_create(dataset_dict=bare_dict, api=api)
    uj = job.UploadJob(
        api=api,
        dataset_id=dataset_dict["id"],
        resource_paths=rtdc_paths,
        collections=[collection],
    )
    assert uj.state == "init"

    # run the job steps one by one and check the state after each
    steps = [
        (uj.task_compress_resources, "parcel"),
        (uj.task_upload_resources, "online"),
    ]
    for run_step, expected_state in steps:
        run_step()
        assert uj.state == expected_state

    common.wait_for_job_no_queue(uj)

    # the first group listed for the dataset must be our collection
    groups = api.get("package_show", id=uj.dataset_id)["groups"]
    assert groups[0]["name"] == collection
101+
102+
77103
def test_resource_name_characters():
78104
assert re.match(job.VALID_RESOURCE_REGEXP, job.VALID_RESOURCE_CHARS)
79105
assert re.match(job.VALID_RESOURCE_REGEXP, job.VALID_RESOURCE_CHARS)

tests/test_upload_task.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,30 @@ def test_create_task():
5959
assert ddict["resources"][1]["size"] == 6
6060

6161

62+
def test_collection():
    """A task file with `collections` puts the dataset in that group"""
    api = common.get_api()
    # unique name, so the collection cannot exist from a previous run
    collection = f"test-collection-{uuid.uuid4()}"
    # write a task file that requests the collection
    task_kwargs = {
        "resource_paths": [str(dpath)],
        "resource_names": [dpath.name],
        "collections": [collection],
    }
    task_path = common.make_upload_task(**task_kwargs)
    uj = task.load_task(task_path, api=api)
    # run all job steps in order
    for run_step in (uj.task_compress_resources,
                     uj.task_upload_resources,
                     uj.task_verify_resources):
        run_step()
    common.wait_for_job_no_queue(uj)

    # the first group listed for the dataset must be our collection
    groups = api.get("package_show", id=uj.dataset_id)["groups"]
    assert groups[0]["name"] == collection
    # TODO: There is a bug in CKAN that does not allow listing group datasets
    # https://github.com/ckan/ckan/issues/9052
    # col_dict = api.get("group_show", id=collection, include_datasets=True)
    # assert col_dict["packages"][0]["id"] == ds_dict["id"]
84+
85+
6286
def test_custom_dataset_dict():
6387
api = common.get_api()
6488
# post dataset creation request
@@ -111,7 +135,7 @@ def test_dataset_id_already_exists_active_fails():
111135
uj.task_compress_resources()
112136
assert uj.state == "parcel"
113137
with pytest.raises(dcoraid.api.APIAuthorizationError,
114-
match=""):
138+
match=None):
115139
uj.task_upload_resources()
116140

117141

0 commit comments

Comments
 (0)