Skip to content

Commit 3f87a16

Browse files
committed
fix: multiple smaller issues related to hashing of remote data
1 parent d6e7238 commit 3f87a16

7 files changed

Lines changed: 48 additions & 16 deletions

File tree

CHANGELOG

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
0.71.4
2+
- fix: dataset `identifier` is session-unique for `RTDC_HTTP`
3+
- fix: dataset `hash` must be globally unique for `RTDC_HTTP` and `RTDC_S3`
4+
- fix: strip quotes from ETag in `S3File`
25
- enh: add unified `logger` property for all instances of `RTDCBase`
36
0.71.3
47
- enh: disable HDF5 file locking for basic reading operations

dclab/rtdc_dataset/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ def _get_basin_feature_data(
314314
# especially when considering networking issues.
315315
if feat in bn.features:
316316
data = bn.get_feature_data(feat)
317+
self.logger.info(f"Feature '{feat}' found in '{bn}'")
317318
# The data are available, we may abort the search.
318319
break
319320
except feat_basin.BasinIdentifierMismatchError:

dclab/rtdc_dataset/fmt_hdf5/base.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def __init__(self,
6565

6666
self._hash = None
6767

68-
#: Path to the experimental HDF5 (.rtdc) file
68+
#: Path to the measurement HDF5 (.rtdc) file
6969
self.path = h5path
7070

7171
# Increase the read cache (which defaults to 1MiB), since
@@ -208,9 +208,17 @@ def parse_config(h5path):
208208
def hash(self):
209209
"""Hash value based on file name and content"""
210210
if self._hash is None:
211-
tohash = [self.path.name,
212-
# Hash a maximum of ~1MB of the hdf5 file
213-
hashfile(self.path, blocksize=65536, count=20)]
211+
tohash = []
212+
if isinstance(self.path, pathlib.Path):
213+
# actual path on file system
214+
tohash.append(self.path.name)
215+
elif isinstance(self.path, str):
216+
# remote location (when `hash` not defined in subclass)
217+
tohash.append(self.path)
218+
tohash.append(
219+
# Hash a maximum of ~1MB of the hdf5 file
220+
hashfile(self.path, blocksize=65536, count=20)
221+
)
214222
self._hash = hashobj(tohash)
215223
return self._hash
216224

dclab/rtdc_dataset/fmt_http.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,6 @@ def __init__(self,
4040
f"Package `requests` required for loading http data '{url}'!")
4141

4242
self._fhttp = HTTPFile(url)
43-
if kwargs.get("identifier") is None:
44-
if self._fhttp.etag is not None:
45-
# Set the HTTP ETag as the identifier, it doesn't get
46-
# more unique than that!
47-
kwargs["identifier"] = self._fhttp.etag
48-
else:
49-
# Compute a hash of the first data chunk
50-
kwargs["identifier"] = hashlib.md5(
51-
self._fhttp.get_chunk(0)).hexdigest()
5243

5344
# Initialize the HDF5 dataset
5445
super(RTDC_HTTP, self).__init__(
@@ -59,6 +50,16 @@ def __init__(self,
5950
#: URL to the file
6051
self.path = url
6152

53+
@property
54+
def hash(self):
55+
if self._fhttp.etag is not None:
56+
# Use the HTTP ETag as the hash, it doesn't get
57+
# more unique than that!
58+
return self._fhttp.etag
59+
else:
60+
# Compute a hash of the first data chunk
61+
return hashlib.md5(self._fhttp.get_chunk(0)).hexdigest()
62+
6263
def close(self):
6364
super(RTDC_HTTP, self).close()
6465
self._fhttp.close()

dclab/rtdc_dataset/fmt_s3.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import functools
2+
import hashlib
23
# import multiprocessing BaseManager here, because there is some kind
34
# of circular dependency issue with s3transfer.compat and multiprocessing.
45
from multiprocessing.managers import BaseManager # noqa: F401
@@ -130,7 +131,7 @@ def __init__(self,
130131
def _parse_header(self):
131132
if self._len is None:
132133
self._len = self.s3_object.content_length
133-
self._etag = self.s3_object.e_tag
134+
self._etag = self.s3_object.e_tag.strip("'").strip('"')
134135

135136
def close(self):
136137
super(S3File, self).close()
@@ -207,6 +208,16 @@ def __init__(self,
207208
#: URL of the object on S3
208209
self.path = self._s3file.url
209210

211+
@property
212+
def hash(self):
213+
if self._s3file.etag is not None:
214+
# Use the HTTP ETag as the hash, it doesn't get
215+
# more unique than that!
216+
return self._s3file.etag
217+
else:
218+
# Compute a hash of the first data chunk
219+
return hashlib.md5(self._s3file.get_chunk(0)).hexdigest()
220+
210221
def close(self):
211222
super(RTDC_S3, self).close()
212223
self._s3file.close()

tests/test_rtdc_fmt_http.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,12 @@ def test_cache_features():
3434
assert t2 - t1 < t1 - t0
3535

3636

37-
def test_identifier():
37+
def test_hash():
3838
with RTDC_HTTP(s3_url) as ds:
3939
# This is the HTTP ETag (https://en.wikipedia.org/wiki/HTTP_ETag)
4040
# given to this resource by the object store. If the file is
4141
# re-uploaded, the ETag may change and this test will fail.
42-
assert ds.identifier == "6dd392feb1aeda7cfb73b4ec76c1fe7c"
42+
assert ds.hash == "6dd392feb1aeda7cfb73b4ec76c1fe7c"
4343

4444

4545
@pytest.mark.parametrize("netloc", [

tests/test_rtdc_fmt_s3.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,14 @@ def test_cache_features():
3535
assert t2 - t1 < t1 - t0
3636

3737

38+
def test_hash():
39+
with RTDC_S3(s3_url) as ds:
40+
# This is the HTTP ETag (https://en.wikipedia.org/wiki/HTTP_ETag)
41+
# given to this resource by the object store. If the file is
42+
# re-uploaded, the ETag may change and this test will fail.
43+
assert ds.hash == "6dd392feb1aeda7cfb73b4ec76c1fe7c"
44+
45+
3846
@pytest.mark.parametrize("url, avail", [
3947
("https://objectstore.hpccloud.mpcdf.mpg.de/"
4048
"circle-5a7a053d-55fb-4f99-960c-f478d0bd418f/"

0 commit comments

Comments
 (0)