Skip to content

Commit fe9d576

Browse files
committed
Hash files on upload
1 parent c05787a commit fe9d576

2 files changed

Lines changed: 31 additions & 17 deletions

File tree

requirements.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ cfgv==3.4.0
2222
# via pre-commit
2323
chardet==5.2.0
2424
# via frictionless
25-
charset-normalizer==3.4.2
25+
charset-normalizer==3.4.3
2626
# via requests
2727
ckanapi==4.8
2828
# via hdx-python-api (pyproject.toml)
@@ -32,7 +32,7 @@ click==8.2.1
3232
# typer
3333
colorama==0.4.6
3434
# via mkdocs-material
35-
coverage==7.10.2
35+
coverage==7.10.3
3636
# via pytest-cov
3737
defopt==7.0.0
3838
# via hdx-python-api (pyproject.toml)
@@ -72,7 +72,7 @@ hdx-python-utilities==3.9.0
7272
# hdx-python-country
7373
humanize==4.12.3
7474
# via frictionless
75-
identify==2.6.12
75+
identify==2.6.13
7676
# via pre-commit
7777
idna==3.10
7878
# via
@@ -115,9 +115,9 @@ markdown==3.8.2
115115
# mkdocs
116116
# mkdocs-material
117117
# pymdown-extensions
118-
markdown-it-py==3.0.0
118+
markdown-it-py==4.0.0
119119
# via rich
120-
marko==2.1.4
120+
marko==2.2.0
121121
# via frictionless
122122
markupsafe==3.0.2
123123
# via
@@ -175,7 +175,7 @@ ply==3.11
175175
# libhxl
176176
pockets==0.9.1
177177
# via sphinxcontrib-napoleon
178-
pre-commit==4.2.0
178+
pre-commit==4.3.0
179179
# via hdx-python-api (pyproject.toml)
180180
pyasn1==0.6.1
181181
# via
@@ -253,7 +253,7 @@ rfc3986==2.0.0
253253
# via frictionless
254254
rich==14.1.0
255255
# via typer
256-
rpds-py==0.26.0
256+
rpds-py==0.27.0
257257
# via
258258
# jsonschema
259259
# referencing

src/hdx/data/resource.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Resource class containing all logic for creating, checking, and updating resources."""
22

3+
import hashlib
34
import logging
45
import warnings
56
from datetime import datetime
@@ -358,15 +359,17 @@ def check_required_fields(self, ignore_fields: ListTuple[str] = tuple()) -> None
358359
self.check_url_filetoupload()
359360
self._check_required_fields("resource", ignore_fields)
360361

361-
def _get_files(self) -> Dict:
362-
"""Return the files parameter for CKANAPI
362+
def _get_hash(self) -> str:
363+
"""Return the hash of file to upload
363364
364365
Returns:
365-
Dict: files parameter for CKANAPI
366+
str: Hash of file to upload
366367
"""
367-
if self.file_to_upload is None:
368-
return {}
369-
return {"upload": self.file_to_upload}
368+
md5 = hashlib.md5()
369+
f = open(self.file_to_upload, "rb")
370+
while chunk := f.read(4096):
371+
md5.update(chunk)
372+
return md5.hexdigest()
370373

371374
def _resource_merge_hdx_update(
372375
self,
@@ -384,14 +387,21 @@ def _resource_merge_hdx_update(
384387
None
385388
"""
386389
data_updated = kwargs.pop("data_updated", self.data_updated)
387-
if data_updated and not self.file_to_upload:
390+
files = {}
391+
if self.file_to_upload:
392+
hash = self._get_hash()
393+
if hash != self.data.get("hash"): # update file if hash has changed
394+
files["upload"] = self.file_to_upload
395+
self.old_data["hash"] = hash
396+
elif data_updated:
388397
# Should not output timezone info here
389398
self.old_data["last_modified"] = now_utc_notz().isoformat(
390399
timespec="microseconds"
391400
)
392401
self.data_updated = False
393-
# old_data will be merged into data in the next step
394-
self._merge_hdx_update("resource", "id", self._get_files(), True, **kwargs)
402+
403+
# old_data will be merged into data in the next step
404+
self._merge_hdx_update("resource", "id", files, True, **kwargs)
395405

396406
def update_in_hdx(self, **kwargs: Any) -> None:
397407
"""Check if resource exists in HDX and if so, update it. To indicate
@@ -441,7 +451,11 @@ def create_in_hdx(self, **kwargs: Any) -> None:
441451
del self.data["url"]
442452
self._resource_merge_hdx_update(**kwargs)
443453
else:
444-
self._save_to_hdx("create", "name", self._get_files(), True)
454+
files = {}
455+
if self.file_to_upload:
456+
files["upload"] = self.file_to_upload
457+
self.data["hash"] = self._get_hash()
458+
self._save_to_hdx("create", "name", files, True)
445459

446460
def delete_from_hdx(self) -> None:
447461
"""Deletes a resource from HDX

0 commit comments

Comments
 (0)