-
Notifications
You must be signed in to change notification settings - Fork 57
Expand file tree
/
Copy pathdatasets.py
More file actions
290 lines (233 loc) · 11.5 KB
/
datasets.py
File metadata and controls
290 lines (233 loc) · 11.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
import os
from django.core.files.base import ContentFile
from django.db.models import Q
from django.http import Http404
from django_filters.rest_framework import DjangoFilterBackend
from rest_framework import status
from rest_framework.decorators import api_view, action
from rest_framework.exceptions import PermissionDenied
from rest_framework.filters import SearchFilter
from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet
from rest_framework.permissions import AllowAny
from api.permissions import user_can_create_competition, is_creator_group_missing
from api.pagination import BasicPagination, LargePagination
from api.serializers import datasets as serializers
from datasets.models import Data
from competitions.models import CompetitionCreationTaskStatus
from utils.data import make_url_sassy, pretty_bytes, gb_to_bytes
class DataViewSet(ModelViewSet):
    """REST endpoints for ``Data`` records.

    Besides the standard ``ModelViewSet`` actions, this view set provides:

    * ``create``      - quota-checked creation that returns a signed upload URL.
    * ``destroy``     - deletion guarded by ``check_delete_permissions``.
    * ``delete_many`` - bulk deletion of a list of ids (all-or-nothing).
    * ``public``      - listing of public datasets, open to any caller.
    """

    queryset = Data.objects.all()
    filter_backends = (DjangoFilterBackend, SearchFilter)
    # NOTE(review): newer django-filter releases call this ``filterset_fields``;
    # confirm against the pinned version before renaming.
    filter_fields = ('type', 'name', 'key', 'was_created_by_competition', 'is_public')
    search_fields = ('file_name', 'name', 'description', 'key', 'competition__title',)
    pagination_class = BasicPagination

    def get_queryset(self):
        """Return the ``Data`` rows visible to the requesting user.

        For ``GET`` requests two optional query parameters narrow the result:

        * ``_type``   - ``submission``, ``dataset`` or ``bundle``; restricts the
          rows to the matching ``Data`` type(s). Any other value is ignored.
        * ``_public`` - ``true`` to include public rows next to the user's own.

        When a recognised ``_type`` is given without ``_public=true`` only the
        user's own rows are returned. In every other case (including all
        non-``GET`` requests) the result is "public or owned by the user".
        Rows without a name are always excluded and the result is ordered by
        newest ``created_when`` first.
        """
        user = self.request.user
        own_rows = Q(created_by=user)
        own_or_public = Q(is_public=True) | own_rows

        qs = self.queryset
        if self.request.method == 'GET':
            params = self.request.query_params
            # _public = true if want to show public datasets/submissions
            include_public = params.get('_public', 'false') == 'true'

            # _type selects which tab the request comes from:
            #   submission -> submissions tab
            #   dataset    -> datasets and programs tab
            #   bundle     -> competition bundles
            types_by_tab = {
                'submission': [Data.SUBMISSION],
                'dataset': [
                    Data.INPUT_DATA,
                    Data.PUBLIC_DATA,
                    Data.REFERENCE_DATA,
                    Data.INGESTION_PROGRAM,
                    Data.SCORING_PROGRAM,
                    Data.STARTING_KIT,
                    Data.SOLUTION,
                ],
                'bundle': [Data.COMPETITION_BUNDLE],
            }
            wanted_types = types_by_tab.get(params.get('_type', ''))

            if wanted_types is not None:
                qs = qs.filter(type__in=wanted_types)

            # A type filter without _public=true limits the result to the
            # user's own rows; with no filters at all we fall back to the
            # default "public or own" behaviour.
            if wanted_types is not None and not include_public:
                qs = qs.filter(own_rows)
            else:
                qs = qs.filter(own_or_public)
        else:
            qs = qs.filter(own_or_public)

        qs = qs.exclude(Q(name__isnull=True))
        return qs.select_related('created_by').order_by('-created_when')

    def get_serializer_class(self):
        """Pick the serializer for the current action / HTTP method."""
        if self.action == 'public':
            return serializers.DatasetSerializer
        if self.request.method == 'GET':
            return serializers.DataDetailSerializer
        return serializers.DataSerializer

    def get_permissions(self):
        """Open the ``public`` action to everyone; defer to the defaults otherwise."""
        if self.action == 'public':
            return [AllowAny()]
        return super().get_permissions()

    def create(self, request, *args, **kwargs):
        """Create a ``Data`` record and return a signed URL to upload its file.

        The request must carry a ``file_size`` that is a finite, non-negative
        number. The request is rejected with ``400`` when that is not the case
        or when the file would exceed the user's remaining storage quota.

        Returns:
            ``201`` with ``{"key": ..., "sassy_url": ...}`` on success.
        """
        # Check required field
        if not request.data.get("file_size"):
            return Response({"file_size": "This field is required."}, status=status.HTTP_400_BAD_REQUEST)

        # Check file_size is a usable number
        try:
            file_size = float(request.data.get('file_size', 0))
        except (TypeError, ValueError):
            file_size = None

        # float() happily parses "nan", "inf" and negative values. NaN makes
        # every comparison False and a negative size shrinks the total, so
        # either would slip past the quota check below. The chained comparison
        # is False for NaN, +/-inf and negatives alike.
        if file_size is None or not (0 <= file_size < float('inf')):
            return Response(
                {"file_size": ["A valid number is required."]},
                status=status.HTTP_400_BAD_REQUEST,
            )

        # Check User quota
        storage_used = float(request.user.get_used_storage_space())
        quota = gb_to_bytes(float(request.user.quota))
        if storage_used + file_size > quota:
            available_space = pretty_bytes(quota - storage_used, return_0_for_invalid=True)
            file_size = pretty_bytes(file_size)
            message = f'Insufficient space. Your available space is {available_space}. The file size is {file_size}. Please free up some space and try again. You can manage your files in the Resources page.'
            return Response({'data_file': [message]}, status=status.HTTP_400_BAD_REQUEST)

        # All good, let's proceed
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        new_dataset = serializer.save()  # request_sassy_file_name is temporarily set via this serializer
        headers = self.get_success_headers(serializer.data)

        # Make an empty placeholder so we can sign a URL allowing us to upload to it
        sassy_file_name = os.path.basename(new_dataset.request_sassy_file_name)

        # encode here helps GCS do the upload, complains
        # ```TypeError: ('`data` must be bytes, received', <class 'str'>)``` otherwise
        new_dataset.data_file.save(sassy_file_name, ContentFile(''.encode()))

        context = {
            "key": new_dataset.key,
            "sassy_url": make_url_sassy(new_dataset.data_file.name, 'w'),
        }
        return Response(context, status=status.HTTP_201_CREATED, headers=headers)

    def destroy(self, request, *args, **kwargs):
        """Delete one dataset, or answer ``400`` with the reason it cannot be deleted."""
        dataset = self.get_object()
        error = self.check_delete_permissions(request, dataset)
        if error:
            return Response(
                {'error': error},
                status=status.HTTP_400_BAD_REQUEST
            )
        return super().destroy(request, *args, **kwargs)

    @action(detail=False, methods=('POST',))
    def delete_many(self, request):
        """Delete several datasets at once.

        The request body must be a list of dataset ids. Nothing is deleted
        unless every matched dataset passes ``check_delete_permissions``;
        otherwise ``400`` is returned with a ``{dataset name: reason}`` mapping.
        """
        ids = request.data
        # A non-list body (e.g. a JSON object or form data) would otherwise be
        # iterated by ``id__in`` and filter on its keys instead of on ids.
        if not isinstance(ids, list):
            return Response(
                {'detail': 'Expected a list of dataset ids'},
                status=status.HTTP_400_BAD_REQUEST
            )

        qs = Data.objects.filter(id__in=ids)
        errors = {}

        for dataset in qs:
            error = self.check_delete_permissions(request, dataset)
            if error:
                errors[dataset.name] = error

        if errors:
            return Response(errors, status=status.HTTP_400_BAD_REQUEST)

        qs.delete()
        return Response(
            {'detail': 'Datasets deleted successfully'},
            status=status.HTTP_200_OK
        )

    # This function allows for multiple errors when deleting multiple objects
    def check_delete_permissions(self, request, dataset):
        """Return a message explaining why ``dataset`` may not be deleted.

        Returns:
            A human-readable error string, or ``None`` when deletion is allowed.
        """
        if request.user != dataset.created_by:
            return 'Cannot delete a dataset that is not yours'
        if dataset.in_use.exists():
            return 'Cannot delete dataset: dataset is in use'

        # Fetch the related submission once instead of querying twice.
        submission = dataset.submission.first()
        if submission is not None and submission.phase:
            return 'Cannot delete submission: submission belongs to an existing competition. Please visit the competition and delete your submission from there.'
        return None

    @action(detail=False, methods=('GET',), pagination_class=LargePagination)
    def public(self, request):
        """
        Retrieve a public list of datasets with optional filtering and ordering.

        This endpoint returns a paginated list of datasets that are public
        (``is_public=True`` and of type ``Data.PUBLIC_DATA``).
        It supports several optional query parameters for filtering and sorting the results.

        Query Parameters:
        -----------------
        - search (str, optional): A search term matched case-insensitively against
            the dataset name and description. The view set's filter backends are
            applied to the result as well.
        - ordering (str, optional): Specifies the order of the results. Supported values:
            * "recently_added" - Most recently created datasets.
            * "most_downloaded" - Datasets with the most downloads.
            Defaults to "recently_added" if not provided or invalid.
        - has_license (bool, optional): If "true", filters datasets that have a license.
        - is_verified (bool, optional): If "true", filters datasets that are verified.

        Returns:
        --------
        - 200 OK: A paginated or full list of serialized datasets matching the filter criteria. The response is serialized using `DatasetSerializer`.
        """
        # Receive filters from request query params
        search = request.query_params.get("search")
        ordering = request.query_params.get("ordering")
        has_license = request.query_params.get("has_license", "false").lower() == "true"
        is_verified = request.query_params.get("is_verified", "false").lower() == "true"

        qs = Data.objects.filter(
            is_public=True,
            type=Data.PUBLIC_DATA
        )

        # Filter by name and description (search)
        if search:
            qs = qs.filter(
                Q(name__icontains=search) |
                Q(description__icontains=search)
            )

        # Filter by has_license
        if has_license:
            qs = qs.filter(license__isnull=False)

        # Filter by is_verified
        if is_verified:
            qs = qs.filter(is_verified=True)

        # Apply ordering
        if ordering == "most_downloaded":
            # Tie-break on id so equal download counts keep a stable order
            # from one page to the next.
            qs = qs.order_by("-downloads", "-id")
        else:
            # "recently_added" and the default fallback: most recently created
            qs = qs.order_by("-id")

        queryset = self.filter_queryset(qs)

        page = self.paginate_queryset(queryset)
        if page is not None:
            serializer = self.get_serializer(page, many=True)
            return self.get_paginated_response(serializer.data)

        serializer = self.get_serializer(queryset, many=True)
        return Response(serializer.data)
@api_view(['PUT'])
def upload_completed(request, key):
    """Mark the requesting user's dataset ``key`` as successfully uploaded.

    For a competition bundle this additionally creates a
    ``CompetitionCreationTaskStatus`` record and queues ``unpack_competition``
    for it.

    Returns:
        ``{"status_id": <pk>}`` for competition bundles, ``{"key": <key>}``
        for every other dataset type.

    Raises:
        Http404: no dataset with this key was created by the requesting user.
        PermissionDenied: the dataset is a competition bundle and competition
            creation is disabled or not permitted for this user.
    """
    # TODO: This view is weird. We have competitions, submissions, etc. that may not need to call this?
    # We might need special behavior/metadata for "submission finalization" for example.
    # Competitions are a unique use case where they hold all of the metadata in the bundle itself

    try:
        dataset = Data.objects.get(created_by=request.user, key=key)
    except Data.DoesNotExist:
        raise Http404()

    # The upload is recorded as complete before the bundle permission checks
    # below, so it is persisted even when those checks reject the request.
    dataset.upload_completed_successfully = True
    dataset.save()

    if dataset.type != Data.COMPETITION_BUNDLE:
        return Response({"key": dataset.key})

    if is_creator_group_missing():
        raise PermissionDenied(
            "Competition creation is disabled: configured COMPETITION_CREATOR_GROUP does not exist."
        )
    if not user_can_create_competition(request.user):
        raise PermissionDenied("You do not have permission to create competitions")

    # Doing a local import here to avoid circular imports
    from competitions.tasks import unpack_competition

    # Named ``task_status`` so it does not shadow the module-level
    # ``rest_framework.status`` import inside this function.
    task_status = CompetitionCreationTaskStatus.objects.create(
        created_by=request.user,
        dataset=dataset,
        status=CompetitionCreationTaskStatus.STARTING,
    )
    unpack_competition.apply_async((task_status.pk,))
    return Response({"status_id": task_status.pk})