Skip to content

Commit 01e9611

Browse files
committed
Split format correction into separate method with tests
Only set url_type to api when there's a url if it is not already set to upload (alternatively could check if url looks like a filestore one but I think there are some datasets set up linking to other datasets' resources)
1 parent 0ad9170 commit 01e9611

6 files changed

Lines changed: 101 additions & 57 deletions

File tree

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ rpds-py==0.27.0
260260
# referencing
261261
rsa==4.9.1
262262
# via google-auth
263-
ruamel-yaml==0.18.14
263+
ruamel-yaml==0.18.15
264264
# via hdx-python-utilities
265265
ruamel-yaml-clib==0.2.12
266266
# via ruamel-yaml

src/hdx/api/utilities/filestore_helper.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def check_filestore_resource(
6262
int: Status code
6363
"""
6464
resource_data_to_update.set_types()
65+
resource_data_to_update.correct_format(resource_data_to_update.data)
6566
cls.resource_check_required_fields(resource_data_to_update, **kwargs)
6667
file_to_upload = resource_data_to_update.get_file_to_upload()
6768
if file_to_upload:
@@ -120,19 +121,30 @@ def dataset_update_filestore_resource(
120121
else:
121122
# update file if size or hash has changed
122123
filestore_resources[resource_index] = file_to_upload
124+
resource_data_to_update["resource_type"] = "file.upload"
125+
resource_data_to_update["url_type"] = "upload"
126+
if "tracking_summary" in resource_data_to_update:
127+
del resource_data_to_update["tracking_summary"]
123128
resource_data_to_update["url"] = cls.temporary_url
124129
resource_data_to_update["size"] = size
125130
resource_data_to_update["hash"] = hash
126131
resource_data_to_update._url_backup = None
127132
status = 2
128-
elif resource_data_to_update.is_marked_data_updated():
129-
# Should not output timezone info here
130-
resource_data_to_update["last_modified"] = now_utc_notz().isoformat(
131-
timespec="microseconds"
132-
)
133-
resource_data_to_update._data_updated = False
134-
status = 0
135133
else:
136-
status = 1
137-
resource_data_to_update.set_types()
134+
if (
135+
"url" in resource_data_to_update
136+
and resource_data_to_update.get("url_type") != "upload"
137+
):
138+
resource_data_to_update["resource_type"] = "api"
139+
resource_data_to_update["url_type"] = "api"
140+
if resource_data_to_update.is_marked_data_updated():
141+
# Should not output timezone info here
142+
resource_data_to_update["last_modified"] = now_utc_notz().isoformat(
143+
timespec="microseconds"
144+
)
145+
resource_data_to_update._data_updated = False
146+
status = 0
147+
else:
148+
status = 1
149+
resource_data_to_update.correct_format(resource_data_to_update.data)
138150
return status

src/hdx/data/resource.py

Lines changed: 47 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -332,37 +332,43 @@ def check_neither_url_filetoupload(self) -> None:
332332
if self._file_to_upload is None and "url" not in self.data:
333333
raise HDXError("Either a url or a file to upload must be supplied!")
334334

335-
def set_types(self, data: Optional[Dict] = None) -> None:
336-
"""Add resource_type and url_type if not supplied based on url or file to
337-
upload. Correct the file type.
335+
def correct_format(self, data: Dict = None) -> None:
336+
"""Correct the format of the file
338337
339338
Args:
340-
data (Dict): Resource data. Defaults to None (self.data)
339+
data (Dict): Resource data.
341340
342341
Returns:
343342
None
344343
"""
345-
if data is None:
346-
data = self.data
347-
if self._file_to_upload is None:
348-
if "url" in data:
349-
data["resource_type"] = "api"
350-
data["url_type"] = "api"
351-
else:
352-
data["resource_type"] = "file.upload"
353-
data["url_type"] = "upload"
354-
if "tracking_summary" in data:
355-
del data["tracking_summary"]
356344
file_format = data.get("format")
357-
if file_format is not None:
358-
file_format = self.get_mapped_format(
359-
file_format, configuration=self.configuration
345+
if file_format is None:
346+
return
347+
file_format = self.get_mapped_format(
348+
file_format, configuration=self.configuration
349+
)
350+
if not file_format:
351+
raise HDXError(
352+
f"Supplied file format {file_format} is invalid and could not be mapped to a known type!"
360353
)
361-
if not file_format:
362-
raise HDXError(
363-
f"Supplied file type {file_format} is invalid and could not be mapped to a known type!"
364-
)
365-
data["format"] = file_format
354+
data["format"] = file_format
355+
356+
def set_types(self) -> None:
357+
"""Add resource_type and url_type if not supplied based on url or file to
358+
upload.
359+
360+
Returns:
361+
None
362+
"""
363+
if self._file_to_upload is None:
364+
if "url" in self.data:
365+
self.data["resource_type"] = "api"
366+
self.data["url_type"] = "api"
367+
else:
368+
self.data["resource_type"] = "file.upload"
369+
self.data["url_type"] = "upload"
370+
if "tracking_summary" in self.data:
371+
del self.data["tracking_summary"]
366372

367373
def check_required_fields(self, ignore_fields: ListTuple[str] = tuple()) -> None:
368374
"""Check that metadata for resource is complete. The parameter ignore_fields
@@ -417,21 +423,29 @@ def _resource_merge_hdx_update(
417423
else:
418424
# update file if size or hash has changed
419425
files["upload"] = self._file_to_upload
426+
self._old_data["resource_type"] = "file.upload"
427+
self._old_data["url_type"] = "upload"
428+
if "tracking_summary" in self._old_data:
429+
del self._old_data["tracking_summary"]
420430
self._old_data["size"] = size
421431
self._old_data["hash"] = hash
422432
status = 2
423433
self._url_backup = None
424-
elif data_updated:
425-
# Should not output timezone info here
426-
self._old_data["last_modified"] = now_utc_notz().isoformat(
427-
timespec="microseconds"
428-
)
429-
self._data_updated = False
430-
status = 0
431434
else:
432-
status = 1
435+
if "url" in self._old_data:
436+
self._old_data["resource_type"] = "api"
437+
self._old_data["url_type"] = "api"
438+
if data_updated:
439+
# Should not output timezone info here
440+
self._old_data["last_modified"] = now_utc_notz().isoformat(
441+
timespec="microseconds"
442+
)
443+
self._data_updated = False
444+
status = 0
445+
else:
446+
status = 1
433447

434-
self.set_types(self._old_data)
448+
self.correct_format(self._old_data)
435449
# old_data will be merged into data in the next step
436450
self._merge_hdx_update("resource", "id", files, True, **kwargs)
437451
return status
@@ -498,6 +512,7 @@ def create_in_hdx(self, **kwargs: Any) -> int:
498512
return self._resource_merge_hdx_update(**kwargs)
499513

500514
self.set_types()
515+
self.correct_format(self.data)
501516
if "ignore_check" not in kwargs: # allow ignoring of field checks
502517
self.check_required_fields()
503518
files = {}

tests/hdx/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def json(self):
8484
"id": "de6549d8-268b-4dfe-adaf-a4ae5c8510d5",
8585
"description": "Resource1",
8686
"name": "Resource1",
87-
"url": "http://resource1.xlsx",
87+
"url": "http://data.humdata.org/dataset/resource/resource1.xlsx",
8888
"format": "xlsx",
8989
},
9090
{
@@ -148,9 +148,9 @@ def json(self):
148148
"hash": "",
149149
"package_id": "6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d",
150150
"name": "Resource1",
151-
"url": "http://resource1.xlsx",
152-
"resource_type": "api",
153-
"url_type": "api",
151+
"url": "http://data.humdata.org/dataset/resource/resource1.xlsx",
152+
"resource_type": "file.upload",
153+
"url_type": "upload",
154154
"size": None,
155155
"mimetype_inner": None,
156156
"cache_last_updated": None,

tests/hdx/data/test_dataset_core.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,12 @@ def post(url, data, headers, files, allow_redirects, auth=None):
365365
datadict["resources"][i] = resultdictcopy["resources"][
366366
i
367367
]
368+
if datadict["id"] == "TEST5":
369+
resultdictcopy["resources"][i].update(
370+
datadict["resources"][i]
371+
)
372+
if datadict["id"] == "TEST5":
373+
del datadict["resources"]
368374
merge_two_dictionaries(resultdictcopy, datadict)
369375
for i, resource in enumerate(resultdictcopy["resources"]):
370376
resource["package_id"] = resultdictcopy["id"]

tests/hdx/data/test_resource.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -678,26 +678,37 @@ def test_read_from_hdx(self, configuration, read):
678678
with pytest.raises(HDXError):
679679
Resource.read_from_hdx("ABC")
680680

681-
def test_check_types_url_filetoupload(self, configuration):
682-
resource_data_copy = copy.deepcopy(resource_data)
683-
del resource_data_copy["url"]
684-
resource = Resource(resource_data_copy)
681+
def test_url_filetoupload(self, configuration):
682+
resource = Resource(resource_data)
683+
del resource["url"]
685684
with pytest.raises(HDXError):
686685
resource.check_neither_url_filetoupload()
687-
resource.set_types()
688686
resource.set_file_to_upload("abc")
689-
resource.set_types()
690687
resource["url"] = "lala"
691688
with pytest.raises(HDXError):
692689
resource.check_both_url_filetoupload()
693-
resource = Resource(resource_data_copy)
690+
691+
def test_check_types(self, configuration):
692+
resource = Resource(resource_data)
693+
resource.set_types()
694+
assert resource["resource_type"] == "api"
695+
assert resource["url_type"] == "api"
696+
resource.set_file_to_upload("abc")
697+
resource.set_types()
698+
assert resource["resource_type"] == "file.upload"
699+
assert resource["url_type"] == "upload"
700+
701+
def test_correct_format(self, configuration):
702+
resource = Resource(resource_data)
703+
resource["format"] = "XLSX"
704+
resource.correct_format(resource.data)
705+
assert resource.get_format() == "xlsx"
694706
resource["format"] = "NOTEXIST"
695707
with pytest.raises(HDXError):
696-
resource.set_types()
697-
with pytest.raises(HDXError):
698-
resource.set_format("NOTEXIST")
708+
resource.correct_format(resource.data)
699709
del resource["format"]
700-
resource.set_types()
710+
resource.correct_format(resource.data)
711+
assert resource.get_format() is None
701712

702713
def test_get_set_date_of_resource(self, configuration):
703714
resource = Resource({"daterange_for_data": "[2020-01-07T00:00:00 TO *]"})

0 commit comments

Comments
 (0)