Skip to content

Commit b6eb092

Browse files
committed
Hash files before upload and compare hash and size
1 parent fe9d576 commit b6eb092

15 files changed

Lines changed: 1037 additions & 518 deletions

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ urllib3==2.5.0
313313
# requests
314314
validators==0.35.0
315315
# via frictionless
316-
virtualenv==20.33.1
316+
virtualenv==20.34.0
317317
# via pre-commit
318318
watchdog==6.0.0
319319
# via mkdocs

src/hdx/api/utilities/filestore_helper.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from typing import TYPE_CHECKING, Any, Dict
44

5+
from hdx.api.utilities.size_hash import get_size_and_hash
56
from hdx.utilities.dateparse import now_utc_notz
67

78
if TYPE_CHECKING:
@@ -60,19 +61,25 @@ def check_filestore_resource(
6061
cls.resource_check_required_fields(resource_data_to_update, **kwargs)
6162
file_to_upload = resource_data_to_update.get_file_to_upload()
6263
if file_to_upload:
64+
file_format = resource_data_to_update.get("format", "").lower()
65+
size, hash = get_size_and_hash(file_to_upload, file_format)
6366
filestore_resources[resource_index] = file_to_upload
6467
resource_data_to_update["url"] = cls.temporary_url
68+
resource_data_to_update["size"] = size
69+
resource_data_to_update["hash"] = hash
6570

6671
@classmethod
6772
def dataset_update_filestore_resource(
6873
cls,
74+
original_resource_data: "Resource",
6975
resource_data_to_update: "Resource",
7076
filestore_resources: Dict[int, str],
7177
resource_index: int,
7278
) -> None:
7379
"""Helper method to merge updated resource from dataset into HDX resource read from HDX including filestore.
7480
7581
Args:
82+
original_resource_data (Resource): Original resource from dataset
7683
resource_data_to_update (Resource): Updated resource from dataset
7784
filestore_resources (Dict[int, str]): List of (index of resources, file to upload)
7885
resource_index (int): Index of resource
@@ -82,11 +89,21 @@ def dataset_update_filestore_resource(
8289
"""
8390
file_to_upload = resource_data_to_update.get_file_to_upload()
8491
if file_to_upload:
85-
filestore_resources[resource_index] = file_to_upload
86-
resource_data_to_update["url"] = cls.temporary_url
87-
88-
data_updated = resource_data_to_update.is_marked_data_updated()
89-
if data_updated:
92+
file_format = resource_data_to_update.get("format", "").lower()
93+
size, hash = get_size_and_hash(file_to_upload, file_format)
94+
if size == original_resource_data.get(
95+
"size"
96+
) and hash == original_resource_data.get("hash"):
97+
# ensure last_modified is not updated if file hasn't changed
98+
if "last_modified" in resource_data_to_update:
99+
del resource_data_to_update["last_modified"]
100+
else:
101+
# update file if size or hash has changed
102+
filestore_resources[resource_index] = file_to_upload
103+
resource_data_to_update["url"] = cls.temporary_url
104+
resource_data_to_update["size"] = size
105+
resource_data_to_update["hash"] = hash
106+
elif resource_data_to_update.is_marked_data_updated():
90107
# Should not output timezone info here
91108
resource_data_to_update["last_modified"] = now_utc_notz().isoformat(
92109
timespec="microseconds"

src/hdx/api/utilities/size_hash.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import hashlib
2+
from io import BytesIO
3+
from typing import Tuple
4+
5+
from openpyxl import load_workbook
6+
7+
8+
def get_size_and_hash(file_to_upload: str, file_format: str) -> Tuple[int, str]:
9+
"""Return the size and hash of file to upload
10+
11+
Args:
12+
file_to_upload: File to upload
13+
file_format (str): File format
14+
15+
Returns:
16+
Tuple[int, str]: Tuple (size, hash)
17+
"""
18+
f = open(file_to_upload, "rb")
19+
md5hash = hashlib.md5()
20+
if file_format == "xlsx":
21+
first_chunk = f.read(4096)
22+
size = len(first_chunk)
23+
signature = first_chunk[:4]
24+
if signature == b"PK\x03\x04": # xlsx
25+
xlsxbuffer = bytearray(first_chunk)
26+
while chunk := f.read(4096):
27+
size += len(chunk)
28+
xlsxbuffer.extend(chunk)
29+
workbook = load_workbook(filename=BytesIO(xlsxbuffer), read_only=True)
30+
for sheet_name in workbook.sheetnames:
31+
sheet = workbook[sheet_name]
32+
for cols in sheet.iter_rows(values_only=True):
33+
md5hash.update(bytes(str(cols), "utf-8"))
34+
workbook.close()
35+
return size, md5hash.hexdigest()
36+
else:
37+
md5hash.update(first_chunk)
38+
else:
39+
size = 0
40+
while chunk := f.read(4096):
41+
size += len(chunk)
42+
md5hash.update(chunk)
43+
return size, md5hash.hexdigest()

src/hdx/data/dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -898,6 +898,7 @@ def _dataset_update_resources(
898898
]
899899
logger.warning(f"Resource exists. Updating {resource['name']}")
900900
FilestoreHelper.dataset_update_filestore_resource(
901+
resource,
901902
resource_data_to_update,
902903
filestore_resources,
903904
i,
@@ -930,13 +931,15 @@ def _dataset_update_resources(
930931
):
931932
if len(self.resources) > i:
932933
updated_resource_name = resource_data_to_update["name"]
933-
resource_name = self.resources[i]["name"]
934+
resource = self.resources[i]
935+
resource_name = resource["name"]
934936
logger.warning(f"Resource exists. Updating {resource_name}")
935937
if resource_name != updated_resource_name:
936938
logger.warning(
937939
f"Changing resource name to: {updated_resource_name}"
938940
)
939941
FilestoreHelper.dataset_update_filestore_resource(
942+
resource,
940943
resource_data_to_update,
941944
filestore_resources,
942945
i,

src/hdx/data/resource.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Resource class containing all logic for creating, checking, and updating resources."""
22

3-
import hashlib
43
import logging
54
import warnings
65
from datetime import datetime
@@ -12,6 +11,7 @@
1211
from hdx.api.configuration import Configuration
1312
from hdx.api.utilities.date_helper import DateHelper
1413
from hdx.api.utilities.filestore_helper import FilestoreHelper
14+
from hdx.api.utilities.size_hash import get_size_and_hash
1515
from hdx.data.hdxobject import HDXError, HDXObject
1616
from hdx.data.resource_view import ResourceView
1717
from hdx.utilities.dateparse import now_utc, now_utc_notz, parse_date
@@ -359,18 +359,6 @@ def check_required_fields(self, ignore_fields: ListTuple[str] = tuple()) -> None
359359
self.check_url_filetoupload()
360360
self._check_required_fields("resource", ignore_fields)
361361

362-
def _get_hash(self) -> str:
363-
"""Return the hash of file to upload
364-
365-
Returns:
366-
str: Hash of file to upload
367-
"""
368-
md5 = hashlib.md5()
369-
f = open(self.file_to_upload, "rb")
370-
while chunk := f.read(4096):
371-
md5.update(chunk)
372-
return md5.hexdigest()
373-
374362
def _resource_merge_hdx_update(
375363
self,
376364
**kwargs: Any,
@@ -389,9 +377,16 @@ def _resource_merge_hdx_update(
389377
data_updated = kwargs.pop("data_updated", self.data_updated)
390378
files = {}
391379
if self.file_to_upload:
392-
hash = self._get_hash()
393-
if hash != self.data.get("hash"): # update file if hash has changed
380+
file_format = self.old_data.get("format", "").lower()
381+
size, hash = get_size_and_hash(self.file_to_upload, file_format)
382+
if size == self.data.get("size") and hash == self.data.get("hash"):
383+
# ensure last_modified is not updated if file hasn't changed
384+
if "last_modified" in self.data:
385+
del self.data["last_modified"]
386+
else:
387+
# update file if size or hash has changed
394388
files["upload"] = self.file_to_upload
389+
self.old_data["size"] = size
395390
self.old_data["hash"] = hash
396391
elif data_updated:
397392
# Should not output timezone info here
@@ -403,7 +398,7 @@ def _resource_merge_hdx_update(
403398
# old_data will be merged into data in the next step
404399
self._merge_hdx_update("resource", "id", files, True, **kwargs)
405400

406-
def update_in_hdx(self, **kwargs: Any) -> None:
401+
def update_in_hdx(self, **kwargs: Any) -> int:
407402
"""Check if resource exists in HDX and if so, update it. To indicate
408403
that the data in an external resource (given by a URL) has been
409404
updated, set data_updated to True, which will result in the resource
@@ -418,7 +413,7 @@ def update_in_hdx(self, **kwargs: Any) -> None:
418413
date_data_updated (datetime): Date to use for last_modified. Default to None.
419414
420415
Returns:
421-
None
416+
int: Return status code
422417
"""
423418
self._check_load_existing_object("resource", "id")
424419
if self.file_to_upload and "url" in self.data:
@@ -454,7 +449,9 @@ def create_in_hdx(self, **kwargs: Any) -> None:
454449
files = {}
455450
if self.file_to_upload:
456451
files["upload"] = self.file_to_upload
457-
self.data["hash"] = self._get_hash()
452+
self.data["size"], self.data["hash"] = get_size_and_hash(
453+
self.file_to_upload, self.get_format()
454+
)
458455
self._save_to_hdx("create", "name", files, True)
459456

460457
def delete_from_hdx(self) -> None:

tests/fixtures/datastore/ACLED-All-Africa-File_20170101-to-20170708.xlsx renamed to tests/fixtures/size_hash/ACLED-All-Africa-File_20170101-to-20170708.xlsx

File renamed without changes.

tests/fixtures/update_dataset_resources/expected_resources_to_update.json

Lines changed: 85 additions & 87 deletions
Large diffs are not rendered by default.

tests/hdx/api/test_ckan.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,6 @@ def configuration(self):
4646
def datasetmetadata(self):
4747
return join("tests", "fixtures", "CKAN", "hdx_dataset_static.yaml")
4848

49-
@pytest.fixture(scope="function")
50-
def testdata(self):
51-
return join("tests", "fixtures", "test_data.csv")
52-
5349
@pytest.fixture(scope="class")
5450
def params(self):
5551
return {
@@ -101,7 +97,7 @@ def setup_teardown_folder(self, configuration, gclient, params):
10197
def test_create_dataset(
10298
self,
10399
datasetmetadata,
104-
testdata,
100+
test_data,
105101
setup_teardown_folder,
106102
params,
107103
):
@@ -156,7 +152,7 @@ def create_resource():
156152
filestore = resource_no % 2 == 0
157153
if filestore:
158154
resource.set_format("csv")
159-
resource.set_file_to_upload(testdata)
155+
resource.set_file_to_upload(test_data)
160156
else:
161157
wks, url = create_gsheet(
162158
"resource1",
@@ -217,7 +213,7 @@ def create_resource():
217213
resources.pop()
218214
gsheet_resource = resources[5]
219215
gsheet_resource.set_format("csv")
220-
gsheet_resource.set_file_to_upload(testdata)
216+
gsheet_resource.set_file_to_upload(test_data)
221217
for resource in resources:
222218
del resource["package_id"]
223219
dataset.add_update_resources(resources)
@@ -249,6 +245,8 @@ def create_resource():
249245
assert "humdata" not in updated_resource["url"]
250246
else:
251247
assert "humdata" in updated_resource["url"]
248+
assert updated_resource.get("size") == resource.get("size")
249+
assert updated_resource.get("hash") == resource.get("hash")
252250

253251
# modify dataset again starting with existing dataset
254252
title = "HDX Python API test changed again"
@@ -260,7 +258,7 @@ def create_resource():
260258
countryiso3s.append("YEM")
261259
dataset.add_country_location("YEM")
262260
dataset.delete_resource(updated_resources[5])
263-
updated_resources[0].set_file_to_upload(testdata)
261+
updated_resources[0].set_file_to_upload(test_data)
264262
create_resource()
265263
resources = dataset.get_resources()
266264

@@ -294,6 +292,9 @@ def create_resource():
294292
assert "humdata" not in updated_resource["url"]
295293
else:
296294
assert "humdata" in updated_resource["url"]
295+
if i != 7:
296+
assert updated_resource.get("size") == resource.get("size")
297+
assert updated_resource.get("hash") == resource.get("hash")
297298

298299
# tear down
299300
dataset.delete_from_hdx()

tests/hdx/api/utilities/test_filestore_helper.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616

1717

1818
class TestFilestoreHelper:
19-
def test_dataset_update_filestore_resource(self, configuration):
19+
def test_dataset_update_filestore_resource(self, configuration, test_data):
2020
resource_data_copy = copy.deepcopy(resource_data)
21+
orig_resource = Resource(resource_data_copy)
2122
resource = Resource(resource_data_copy)
2223
filestore_resources = {}
2324
FilestoreHelper.dataset_update_filestore_resource(
24-
resource, filestore_resources, 0
25+
orig_resource, resource, filestore_resources, 0
2526
)
2627
assert resource == {
2728
"api_type": "api",
@@ -34,25 +35,45 @@ def test_dataset_update_filestore_resource(self, configuration):
3435
}
3536
assert filestore_resources == {}
3637

37-
resource.set_file_to_upload("test")
38+
resource.set_file_to_upload(test_data)
3839
FilestoreHelper.dataset_update_filestore_resource(
39-
resource, filestore_resources, 0
40+
orig_resource, resource, filestore_resources, 0
4041
)
4142
assert resource == {
4243
"api_type": "api",
4344
"description": "My Resource",
4445
"format": "xlsx",
4546
"name": "MyResource1",
47+
"hash": "3790da698479326339fa99a074cbc1f7",
48+
"size": 1548,
4649
"package_id": "6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d",
4750
"resource_type": "api",
4851
"url": "updated_by_file_upload_step",
4952
}
50-
assert filestore_resources == {0: "test"}
53+
assert filestore_resources == {0: test_data}
5154

55+
filestore_resources = {}
56+
FilestoreHelper.dataset_update_filestore_resource(
57+
resource, resource, filestore_resources, 0
58+
)
59+
assert resource == {
60+
"api_type": "api",
61+
"description": "My Resource",
62+
"format": "xlsx",
63+
"name": "MyResource1",
64+
"hash": "3790da698479326339fa99a074cbc1f7",
65+
"size": 1548,
66+
"package_id": "6f36a41c-f126-4b18-aaaf-6c2ddfbc5d4d",
67+
"resource_type": "api",
68+
"url": "updated_by_file_upload_step",
69+
}
70+
assert filestore_resources == {}
71+
72+
resource.file_to_upload = None
5273
resource.mark_data_updated()
5374
FilestoreHelper.dataset_update_filestore_resource(
54-
resource, filestore_resources, 0
75+
orig_resource, resource, filestore_resources, 0
5576
)
5677
regex = r"^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d.\d\d\d\d\d\d$"
5778
assert re.match(regex, resource["last_modified"])
58-
assert filestore_resources == {0: "test"}
79+
assert filestore_resources == {}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from os.path import join
2+
3+
import pytest
4+
5+
from hdx.api.utilities.size_hash import get_size_and_hash
6+
7+
8+
class TestSizeHash:
9+
@pytest.fixture(scope="class")
10+
def test_xlsx(self, fixturesfolder):
11+
return join(
12+
fixturesfolder,
13+
"size_hash",
14+
"ACLED-All-Africa-File_20170101-to-20170708.xlsx",
15+
)
16+
17+
def test_get_size_and_hash(self, test_data, test_xlsx):
18+
size, hash = get_size_and_hash(test_data, "csv")
19+
assert size == 1548
20+
assert hash == "3790da698479326339fa99a074cbc1f7"
21+
22+
size, hash = get_size_and_hash(test_xlsx, "xlsx")
23+
assert size == 23724
24+
assert hash == "6b8acf7e28d62685a1e829e7fa220d17"

0 commit comments

Comments (0)