File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1 1 0.71.4
2+ - fix: dataset `identifier` is session-unique for `RTDC_HTTP`
3+ - fix: dataset `hash` must be globally unique for `RTDC_HTTP` and `RTDC_S3`
4+ - fix: strip quotes from ETag in `S3File`
25 - enh: add unified `logger` property for all instances of `RTDCBase`
3 6 0.71.3
47 - enh: disable HDF5 file locking for basic reading operations
Original file line number Diff line number Diff line change @@ -314,6 +314,7 @@ def _get_basin_feature_data(
314314 # especially when considering networking issues.
315315 if feat in bn .features :
316316 data = bn .get_feature_data (feat )
317+ self .logger .info (f"Feature '{ feat } ' found in '{ bn } '" )
317318 # The data are available, we may abort the search.
318319 break
319320 except feat_basin .BasinIdentifierMismatchError :
Original file line number Diff line number Diff line change @@ -65,7 +65,7 @@ def __init__(self,
6565
6666 self ._hash = None
6767
68- #: Path to the experimental HDF5 (.rtdc) file
68+ #: Path to the measurement HDF5 (.rtdc) file
6969 self .path = h5path
7070
7171 # Increase the read cache (which defaults to 1MiB), since
@@ -208,9 +208,17 @@ def parse_config(h5path):
208208 def hash (self ):
209209 """Hash value based on file name and content"""
210210 if self ._hash is None :
211- tohash = [self .path .name ,
212- # Hash a maximum of ~1MB of the hdf5 file
213- hashfile (self .path , blocksize = 65536 , count = 20 )]
211+ tohash = []
212+ if isinstance (self .path , pathlib .Path ):
213+ # actual path on file system
214+ tohash .append (self .path .name )
215+ elif isinstance (self .path , str ):
216+ # remote location (when `hash` not defined in subclass)
217+ tohash .append (self .path )
218+ tohash .append (
219+ # Hash a maximum of ~1MB of the hdf5 file
220+ hashfile (self .path , blocksize = 65536 , count = 20 )
221+ )
214222 self ._hash = hashobj (tohash )
215223 return self ._hash
216224
Original file line number Diff line number Diff line change @@ -40,15 +40,6 @@ def __init__(self,
4040 f"Package `requests` required for loading http data '{ url } '!" )
4141
4242 self ._fhttp = HTTPFile (url )
43- if kwargs .get ("identifier" ) is None :
44- if self ._fhttp .etag is not None :
45- # Set the HTTP ETag as the identifier, it doesn't get
46- # more unique than that!
47- kwargs ["identifier" ] = self ._fhttp .etag
48- else :
49- # Compute a hash of the first data chunk
50- kwargs ["identifier" ] = hashlib .md5 (
51- self ._fhttp .get_chunk (0 )).hexdigest ()
5243
5344 # Initialize the HDF5 dataset
5445 super (RTDC_HTTP , self ).__init__ (
@@ -59,6 +50,16 @@ def __init__(self,
5950 #: URL to the file
6051 self .path = url
6152
53+ @property
54+ def hash (self ):
55+ if self ._fhttp .etag is not None :
56+ # Use the HTTP ETag as the hash, it doesn't get
57+ # more unique than that!
58+ return self ._fhttp .etag
59+ else :
60+ # Compute a hash of the first data chunk
61+ return hashlib .md5 (self ._fhttp .get_chunk (0 )).hexdigest ()
62+
6263 def close (self ):
6364 super (RTDC_HTTP , self ).close ()
6465 self ._fhttp .close ()
Original file line number Diff line number Diff line change 11import functools
2+ import hashlib
23# import multiprocessing BaseManager here, because there is some kind
34# of circular dependency issue with s3transfer.compat and multiprocessing.
45from multiprocessing .managers import BaseManager # noqa: F401
@@ -130,7 +131,7 @@ def __init__(self,
130131 def _parse_header (self ):
131132 if self ._len is None :
132133 self ._len = self .s3_object .content_length
133- self ._etag = self .s3_object .e_tag
134+ self ._etag = self .s3_object .e_tag . strip ( "'" ). strip ( '"' )
134135
135136 def close (self ):
136137 super (S3File , self ).close ()
@@ -207,6 +208,16 @@ def __init__(self,
207208 #: URL of the object on S3
208209 self .path = self ._s3file .url
209210
211+ @property
212+ def hash (self ):
213+ if self ._s3file .etag is not None :
214+ # Use the HTTP ETag as the hash, it doesn't get
215+ # more unique than that!
216+ return self ._s3file .etag
217+ else :
218+ # Compute a hash of the first data chunk
219+ return hashlib .md5 (self ._s3file .get_chunk (0 )).hexdigest ()
220+
210221 def close (self ):
211222 super (RTDC_S3 , self ).close ()
212223 self ._s3file .close ()
Original file line number Diff line number Diff line change @@ -34,12 +34,12 @@ def test_cache_features():
3434 assert t2 - t1 < t1 - t0
3535
3636
37- def test_identifier ():
37+ def test_hash ():
3838 with RTDC_HTTP (s3_url ) as ds :
3939 # This is the HTTP ETag (https://en.wikipedia.org/wiki/HTTP_ETag)
4040 # given to this resource by the object store. If the file is
4141 # re-uploaded, the ETag may change and this test will fail.
42- assert ds .identifier == "6dd392feb1aeda7cfb73b4ec76c1fe7c"
42+ assert ds .hash == "6dd392feb1aeda7cfb73b4ec76c1fe7c"
4343
4444
4545@pytest .mark .parametrize ("netloc" , [
Original file line number Diff line number Diff line change @@ -35,6 +35,14 @@ def test_cache_features():
3535 assert t2 - t1 < t1 - t0
3636
3737
38+ def test_hash ():
39+ with RTDC_S3 (s3_url ) as ds :
40+ # This is the HTTP ETag (https://en.wikipedia.org/wiki/HTTP_ETag)
41+ # given to this resource by the object store. If the file is
42+ # re-uploaded, the ETag may change and this test will fail.
43+ assert ds .hash == "6dd392feb1aeda7cfb73b4ec76c1fe7c"
44+
45+
3846@pytest .mark .parametrize ("url, avail" , [
3947 ("https://objectstore.hpccloud.mpcdf.mpg.de/"
4048 "circle-5a7a053d-55fb-4f99-960c-f478d0bd418f/"
You can’t perform that action at this time.
0 commit comments