66from pathlib import Path
77from typing import Optional
88
9+ import aiohttp
910import fsspec
1011import numpy as np
1112import pyfive
@@ -83,14 +84,40 @@ def load_from_s3(uri, storage_options=None):
8384 return ds
8485
8586
86- def load_from_https (uri ):
def get_endpoint_url(storage_options):
    """
    Return the endpoint_url defined in storage_options, or `None` if not defined.

    A top-level ``'endpoint_url'`` entry takes precedence over an
    ``'endpoint_url'`` entry nested inside ``'client_kwargs'``.
    ``storage_options`` itself may be `None`, in which case `None`
    is returned.
    """
    if storage_options is None:
        return None

    endpoint_url = storage_options.get('endpoint_url')
    if endpoint_url is not None:
        return endpoint_url

    # Fall back to the value nested in client_kwargs, when there is one.
    client_kwargs = storage_options.get('client_kwargs')
    if not client_kwargs:
        return None
    return client_kwargs.get('endpoint_url')
100+
101+
def load_from_https(uri, storage_options=None):
    """
    Load a pyfive.high_level.Dataset from a
    netCDF4 file on an https server (NGINX).
    This works for both http and https endpoints.

    When ``storage_options`` holds both a non-empty ``"username"`` and a
    non-empty ``"password"``, an ``aiohttp.BasicAuth`` object is built
    from them and handed to the fsspec ``http`` filesystem as ``auth``;
    in every other case ``auth`` is `None`.

    Returns the ``pyfive.File`` opened on the remote file.
    """
    options = storage_options or {}
    username = options.get("username")
    password = options.get("password")

    # Credentials are only used when both parts are present.
    auth = None
    if username and password:
        auth = aiohttp.BasicAuth(username, password)

    # ``auth`` is a plain keyword option of the filesystem here; it is
    # not wrapped in a ``client_kwargs`` dictionary.
    fs = fsspec.filesystem('http', auth=auth)
    http_file = fs.open(uri, 'rb')

    ds = pyfive.File(http_file)
    print(f"Dataset loaded from https with Pyfive: {uri}")
    return ds
@@ -272,9 +299,10 @@ def __load_nc_file(self):
272299 elif self .storage_type == "s3" :
273300 nc = load_from_s3 (self .uri , self .storage_options )
274301 elif self .storage_type == "https" :
275- nc = load_from_https (self .uri )
302+ nc = load_from_https (self .uri , self . storage_options )
276303 self .filename = self .uri
277304 self .ds = nc [ncvar ]
305+ print ("Loaded dataset" , self .ds )
278306
279307 def __get_missing_attributes (self ):
280308 if self .ds is None :
@@ -365,19 +393,22 @@ def method(self, value):
365393
366394 self ._method = value
367395
368- @property
369- def mean (self ):
def mean(self, axis=None):
    """
    Select "mean" as the active method and return this object.

    If ``axis`` is given (anything other than `None`) it is stored
    as the reduction axis; otherwise the stored axis is left alone.
    """
    self._method = "mean"
    if axis is None:
        return self
    self._axis = axis
    return self
372401
373- @property
374- def min (self ):
def min(self, axis=None):
    """
    Select "min" as the active method and return this object.

    If ``axis`` is given (anything other than `None`) it is stored
    as the reduction axis; otherwise the stored axis is left alone.
    """
    self._method = "min"
    if axis is None:
        return self
    self._axis = axis
    return self
377407
378- @property
379- def max (self ):
def max(self, axis=None):
    """
    Select "max" as the active method and return this object.

    If ``axis`` is given (anything other than `None`) it is stored
    as the reduction axis; otherwise the stored axis is left alone.
    """
    self._method = "max"
    if axis is None:
        return self
    self._axis = axis
    return self
382413
383414 @property
@@ -484,6 +515,10 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype,
484515 if self .storage_type == "s3" and self ._version == 2 :
485516 if self .storage_options is not None :
486517 key , secret = None , None
518+ if self .storage_options .get ("anon" , None ) is True :
519+ print ("Reductionist session for Anon S3 bucket." )
520+ session = reductionist .get_session (
521+ None , None , S3_ACTIVE_STORAGE_CACERT )
487522 if "key" in self .storage_options :
488523 key = self .storage_options ["key" ]
489524 if "secret" in self .storage_options :
@@ -498,6 +533,15 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype,
498533 session = reductionist .get_session (S3_ACCESS_KEY ,
499534 S3_SECRET_KEY ,
500535 S3_ACTIVE_STORAGE_CACERT )
536+ elif self .storage_type == "https" and self ._version == 2 :
537+ username , password = None , None
538+ if self .storage_options is not None :
539+ username = self .storage_options .get ("username" , None )
540+ password = self .storage_options .get ("password" , None )
541+ if username and password :
542+ session = reductionist .get_session (username , password , None )
543+ else :
544+ session = reductionist .get_session (None , None , None )
501545 else :
502546 session = None
503547
@@ -585,16 +629,9 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype,
585629
def _get_endpoint_url(self):
    """
    Return the endpoint_url of an S3 object store.

    The value found by ``get_endpoint_url`` in ``self.storage_options``
    is returned when there is one. Otherwise the URL is built as
    ``http://`` followed by the network location of ``self.filename``,
    so this method does not itself return `None`.
    """
    endpoint_url = get_endpoint_url(self.storage_options)
    if endpoint_url is None:
        # Nothing configured: derive the endpoint from the file URI.
        netloc = urllib.parse.urlparse(self.filename).netloc
        endpoint_url = f"http://{netloc}"
    return endpoint_url
599636
600637 def _process_chunk (self ,
@@ -624,7 +661,6 @@ def _process_chunk(self,
624661 axis = self ._axis
625662
626663 if self .storage_type == 's3' and self ._version == 1 :
627-
628664 tmp , count = reduce_opens3_chunk (ds ._fh ,
629665 offset ,
630666 size ,
@@ -640,9 +676,7 @@ def _process_chunk(self,
640676
641677 elif self .storage_type == "s3" and self ._version == 2 :
642678 # S3: pass in pre-configured storage options (credentials)
643- # print("S3 rfile is:", self.filename)
644679 parsed_url = urllib .parse .urlparse (self .filename )
645-
646680 bucket = parsed_url .netloc
647681 object = parsed_url .path
648682
@@ -651,17 +685,13 @@ def _process_chunk(self,
651685 if bucket == "" :
652686 bucket = os .path .dirname (object )
653687 object = os .path .basename (object )
654- # print("S3 bucket:", bucket)
655- # print("S3 file:", object)
656688 if self .storage_options is None :
657689
658690 # for the moment we need to force ds.dtype to be a numpy type
659691 # Reductionist returns "count" as a list even for single elements
660692 tmp , count = reductionist .reduce_chunk (session ,
661693 S3_ACTIVE_STORAGE_URL ,
662- S3_URL ,
663- bucket ,
664- object ,
694+ f"{ S3_URL } /{ bucket } /{ object } " ,
665695 offset ,
666696 size ,
667697 compressor ,
@@ -674,22 +704,14 @@ def _process_chunk(self,
674704 axis ,
675705 operation = self ._method )
676706 else :
677- # special case for "anon=True" buckets that work only with e.g.
678- # fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL})
679- # where file uri = bucketX/fileY.mc
680- # print("S3 Storage options to Reductionist:", self.storage_options)
681707 if self .storage_options .get ("anon" , None ) is True :
682- bucket = os .path .dirname (parsed_url .path ) # bucketX
683- object = os .path .basename (parsed_url .path ) # fileY
684- print ("S3 anon=True Bucket and File:" , bucket , object )
685-
708+ bucket = os .path .dirname (parsed_url .path )
709+ object = os .path .basename (parsed_url .path )
686710 # Reductionist returns "count" as a list even for single elements
687711 tmp , count = reductionist .reduce_chunk (
688712 session ,
689713 self .active_storage_url ,
690- self ._get_endpoint_url (),
691- bucket ,
692- object ,
714+ f"{ self ._get_endpoint_url ()} /{ bucket } /{ object } " ,
693715 offset ,
694716 size ,
695717 compressor ,
@@ -701,39 +723,23 @@ def _process_chunk(self,
701723 chunk_selection ,
702724 axis ,
703725 operation = self ._method )
704- # this is for testing ONLY until Reductionist is able to handle https
705- # located files; after that, we can pipe any regular https file through
706- # to Reductionist, provided the https server is "closer" to Reductionist
707726 elif self .storage_type == "https" and self ._version == 2 :
708- # build a simple session
709- session = requests .Session ()
710- session .auth = (None , None )
711- session .verify = False
712- bucket = "https" # really doesn't matter
713-
714- # note the extra "storage_type" kwarg
715- # this currently makes Reductionist throw a wobbly
716- # E activestorage.reductionist.ReductionistError: Reductionist error: HTTP 400: {"error": {"message": "request data is not valid", "caused_by": ["Failed to deserialize the JSON body into the target type", "storage_type: unknown field `storage_type`, expected one of `source`, `bucket`, `object`, `dtype`, `byte_order`, `offset`, `size`, `shape`, `order`, `selection`, `compression`, `filters`, `missing` at line 1 column 550"]}} # noqa
717-
718- # Reductionist returns "count" as a list even for single elements
719- tmp , count = reductionist .reduce_chunk (
720- session ,
721- "https://reductionist.jasmin.ac.uk/" , # Wacasoft
722- self .filename ,
723- bucket ,
724- self .filename ,
725- offset ,
726- size ,
727- compressor ,
728- filters ,
729- self .missing ,
730- np .dtype (ds .dtype ),
731- chunks ,
732- ds ._order ,
733- chunk_selection ,
734- axis ,
735- operation = self ._method ,
736- storage_type = "https" )
727+ tmp , count = reductionist .reduce_chunk (session ,
728+ self .active_storage_url ,
729+ f"{ self .uri } " ,
730+ offset ,
731+ size ,
732+ compressor ,
733+ filters ,
734+ self .missing ,
735+ np .dtype (ds .dtype ),
736+ chunks ,
737+ ds ._order ,
738+ chunk_selection ,
739+ axis ,
740+ operation = self ._method ,
741+ storage_type = "https" )
742+
737743 elif self .storage_type == 'ActivePosix' and self .version == 2 :
738744 # This is where the DDN Fuse and Infinia wrappers go
739745 raise NotImplementedError
0 commit comments