NCAS-CMS
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎activestorage/active.py‎
Lines changed: 98 additions & 92 deletions b/‎activestorage/active.py‎
Lines changed: 98 additions & 92 deletions
@@ -69,7 +69,7 @@ There are some (relatively obsolete) documents from our exploration of zarr inte
 ## Storage types
 
 PyActiveStorage is designed to interact with various storage backends.
-The storage backend is automatically detected, but can still be specified using the `storage_type` argument to the `Active` constructor.
+The storage backend is automatically detected, but can still be specified using the `interface_type` argument to the `Active` constructor.
 There are two main integration points for a storage backend:
 
 #. Load netCDF metadata
@@ -78,7 +78,7 @@ There are two main integration points for a storage backend:
 ### Local file
 
 The default storage backend is a local file.
-To use a local file, use a `storage_type` of `None`, which is its default value.
+To use a local file, use a `interface_type` of `None`, which is its default value.
 netCDF metadata is loaded using the [netCDF4](https://pypi.org/project/netCDF4/) library.
 The chunk reductions are implemented in `activestorage.storage` using NumPy.
 
@@ -87,7 +87,7 @@ The chunk reductions are implemented in `activestorage.storage` using NumPy.
 We now have support for Active runs with netCDF4 files on S3, from [PR 89](https://github.com/NCAS-CMS/PyActiveStorage/pull/89).
 To achieve this we integrate with [Reductionist](https://github.com/stackhpc/reductionist-rs), an S3 Active Storage Server.
 Reductionist is typically deployed "near" to an S3-compatible object store and provides an API to perform numerical reductions on object data.
-To use Reductionist, use a `storage_type` of `s3`.
+To use Reductionist, use a `interface_type` of `s3`.
 
 To load metadata, netCDF files are opened using `s3fs`, with `h5netcdf` used to put the open file (which is nothing more than a memory view of the netCDF file) into an hdf5/netCDF-like object format.
 Chunk reductions are implemented in `activestorage.reductionist`, with each operation resulting in an API request to the Reductionist server.
 
@@ -6,6 +6,7 @@
 from pathlib import Path
 from typing import Optional
 
+import aiohttp
 import fsspec
 import numpy as np
 import pyfive
@@ -19,7 +20,7 @@
 from activestorage.storage import reduce_chunk, reduce_opens3_chunk
 
 
-def return_storage_type(uri):
+def return_interface_type(uri):
     """
     Extract the gateway-protocol to infer what type of storage
     """
@@ -83,14 +84,40 @@ def load_from_s3(uri, storage_options=None):
     return ds
 
 
-def load_from_https(uri):
+def get_endpoint_url(storage_options):
+    """
+    Return the endpoint_url defined in storage_options, or `None` if not defined.
+    """
+    if storage_options is not None:
+        endpoint_url = storage_options.get('endpoint_url')
+        if endpoint_url is not None:
+            return endpoint_url
+        client_kwargs = storage_options.get('client_kwargs')
+        if client_kwargs:
+            endpoint_url = client_kwargs.get('endpoint_url')
+            if endpoint_url is not None:
+                return endpoint_url
+
+
+def load_from_https(uri, storage_options=None):
     """
     Load a pyfive.high_level.Dataset from a
     netCDF4 file on an https server (NGINX).
+    This works for both http and https endpoints.
     """
-    # TODO need to test if NGINX server behind https://
-    fs = fsspec.filesystem('http')
-    http_file = fs.open(uri, 'rb')
+    if storage_options is None:
+        client_kwargs = {'auth': None}
+        fs = fsspec.filesystem('http', **client_kwargs)
+        http_file = fs.open(uri, 'rb')
+    else:
+        username = storage_options.get("username", None)
+        password = storage_options.get("password", None)
+        client_kwargs = {
+            'auth': aiohttp.BasicAuth(username, password) if username and password else None
+        }
+        fs = fsspec.filesystem('http', **client_kwargs)
+        http_file = fs.open(uri, 'rb')
+
     ds = pyfive.File(http_file)
     print(f"Dataset loaded from https with Pyfive: {uri}")
     return ds
@@ -161,7 +188,7 @@ def __init__(self,
                  dataset: Optional[str | Path | object],
                  ncvar: str = None,
                  axis: tuple = None,
-                 storage_type: str = None,
+                 interface_type: str = None,
                  max_threads: int = 100,
                  storage_options: dict = None,
                  active_storage_url: str = None) -> None:
@@ -192,9 +219,9 @@ def __init__(self,
             self.ds = dataset
         self.uri = dataset
 
-        # determine the storage_type
+        # determine the interface_type
         # based on what we have available
-        if not storage_type:
+        if not interface_type:
             if not input_variable:
                 check_uri = self.uri
             else:
@@ -210,29 +237,29 @@ def __init__(self,
                     else:
                         check_uri = os.path.join(base_url,
                                                  self.ds.id._filename)
-            storage_type = return_storage_type(check_uri)
+            interface_type = return_interface_type(check_uri)
 
-        # still allow for a passable storage_type
+        # still allow for a passable interface_type
         # for special cases eg "special-POSIX" ie DDN
-        if not storage_type and storage_options is not None:
-            storage_type = urllib.parse.urlparse(dataset).scheme
-        self.storage_type = storage_type
+        if not interface_type and storage_options is not None:
+            interface_type = urllib.parse.urlparse(dataset).scheme
+        self.interface_type = interface_type
 
         # set correct filename attr
-        if input_variable and not self.storage_type:
+        if input_variable and not self.interface_type:
             self.filename = self.ds
-        elif input_variable and self.storage_type == "s3":
+        elif input_variable and self.interface_type == "s3":
+            self.filename = self.ds.id._filename
+        elif input_variable and self.interface_type == "https":
             self.filename = self.ds.id._filename
-        elif input_variable and self.storage_type == "https":
-            self.filename = self.ds
 
         # get storage_options
         self.storage_options = storage_options
         self.active_storage_url = active_storage_url
 
         # basic check on file
         if not input_variable:
-            if not os.path.isfile(self.uri) and not self.storage_type:
+            if not os.path.isfile(self.uri) and not self.interface_type:
                 raise ValueError(
                     f"Must use existing file for uri. {self.uri} not found")
 
@@ -268,14 +295,15 @@ def __load_nc_file(self):
         and `_filename` attribute.
         """
         ncvar = self.ncvar
-        if self.storage_type is None:
+        if self.interface_type is None:
             nc = pyfive.File(self.uri)
-        elif self.storage_type == "s3":
+        elif self.interface_type == "s3":
             nc = load_from_s3(self.uri, self.storage_options)
-        elif self.storage_type == "https":
-            nc = load_from_https(self.uri)
+        elif self.interface_type == "https":
+            nc = load_from_https(self.uri, self.storage_options)
         self.filename = self.uri
         self.ds = nc[ncvar]
+        print("Loaded dataset", self.ds)
 
     def __get_missing_attributes(self):
         if self.ds is None:
@@ -366,19 +394,22 @@ def method(self, value):
 
         self._method = value
 
-    @property
-    def mean(self):
+    def mean(self, axis=None):
         self._method = "mean"
+        if axis is not None:
+            self._axis = axis
         return self
 
-    @property
-    def min(self):
+    def min(self, axis=None):
         self._method = "min"
+        if axis is not None:
+            self._axis = axis
         return self
 
-    @property
-    def max(self):
+    def max(self, axis=None):
         self._method = "max"
+        if axis is not None:
+            self._axis = axis
         return self
 
     @property
@@ -482,9 +513,13 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype,
             out = np.ma.empty(out_shape, dtype=out_dtype, order=ds._order)
 
         # Create a shared session object.
-        if self.storage_type == "s3" and self._version == 2:
+        if self.interface_type == "s3" and self._version == 2:
             if self.storage_options is not None:
                 key, secret = None, None
+                if self.storage_options.get("anon", None) is True:
+                    print("Reductionist session for Anon S3 bucket.")
+                    session = reductionist.get_session(
+                        None, None, S3_ACTIVE_STORAGE_CACERT)
                 if "key" in self.storage_options:
                     key = self.storage_options["key"]
                 if "secret" in self.storage_options:
@@ -499,6 +534,15 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype,
                 session = reductionist.get_session(S3_ACCESS_KEY,
                                                    S3_SECRET_KEY,
                                                    S3_ACTIVE_STORAGE_CACERT)
+        elif self.interface_type == "https" and self._version == 2:
+            username, password = None, None
+            if self.storage_options is not None:
+                username = self.storage_options.get("username", None)
+                password = self.storage_options.get("password", None)
+            if username and password:
+                session = reductionist.get_session(username, password, None)
+            else:
+                session = reductionist.get_session(None, None, None)
         else:
             session = None
 
@@ -586,16 +630,9 @@ def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype,
 
     def _get_endpoint_url(self):
         """Return the endpoint_url of an S3 object store, or `None`"""
-        endpoint_url = self.storage_options.get('endpoint_url')
+        endpoint_url = get_endpoint_url(self.storage_options)
         if endpoint_url is not None:
             return endpoint_url
-
-        client_kwargs = self.storage_options.get('client_kwargs')
-        if client_kwargs:
-            endpoint_url = client_kwargs.get('endpoint_url')
-            if endpoint_url is not None:
-                return endpoint_url
-
         return f"http://{urllib.parse.urlparse(self.filename).netloc}"
 
     def _process_chunk(self,
@@ -624,8 +661,7 @@ def _process_chunk(self,
         # Axes over which to apply a reduction
         axis = self._axis
 
-        if self.storage_type == 's3' and self._version == 1:
-
+        if self.interface_type == 's3' and self._version == 1:
             tmp, count = reduce_opens3_chunk(ds._fh,
                                              offset,
                                              size,
@@ -639,11 +675,9 @@ def _process_chunk(self,
                                              axis=axis,
                                              method=self.method)
 
-        elif self.storage_type == "s3" and self._version == 2:
+        elif self.interface_type == "s3" and self._version == 2:
             # S3: pass in pre-configured storage options (credentials)
-            # print("S3 rfile is:", self.filename)
             parsed_url = urllib.parse.urlparse(self.filename)
-
             bucket = parsed_url.netloc
             object = parsed_url.path
 
@@ -652,17 +686,13 @@ def _process_chunk(self,
             if bucket == "":
                 bucket = os.path.dirname(object)
                 object = os.path.basename(object)
-            # print("S3 bucket:", bucket)
-            # print("S3 file:", object)
             if self.storage_options is None:
 
                 # for the moment we need to force ds.dtype to be a numpy type
                 # Reductionist returns "count" as a list even for single elements
                 tmp, count = reductionist.reduce_chunk(session,
                                                        S3_ACTIVE_STORAGE_URL,
-                                                       S3_URL,
-                                                       bucket,
-                                                       object,
+                                                       f"{S3_URL}/{bucket}/{object}",
                                                        offset,
                                                        size,
                                                        compressor,
@@ -675,22 +705,14 @@ def _process_chunk(self,
                                                        axis,
                                                        operation=self._method)
             else:
-                # special case for "anon=True" buckets that work only with e.g.
-                # fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL})
-                # where file uri = bucketX/fileY.mc
-                # print("S3 Storage options to Reductionist:", self.storage_options)
                 if self.storage_options.get("anon", None) is True:
-                    bucket = os.path.dirname(parsed_url.path)  # bucketX
-                    object = os.path.basename(parsed_url.path)  # fileY
-                    print("S3 anon=True Bucket and File:", bucket, object)
-
+                    bucket = os.path.dirname(parsed_url.path)
+                    object = os.path.basename(parsed_url.path)
                 # Reductionist returns "count" as a list even for single elements
                 tmp, count = reductionist.reduce_chunk(
                     session,
                     self.active_storage_url,
-                    self._get_endpoint_url(),
-                    bucket,
-                    object,
+                    f"{self._get_endpoint_url()}/{bucket}/{object}",
                     offset,
                     size,
                     compressor,
@@ -702,40 +724,24 @@ def _process_chunk(self,
                     chunk_selection,
                     axis,
                     operation=self._method)
-        # this is for testing ONLY until Reductionist is able to handle https
-        # located files; after that, we can pipe any regular https file through
-        # to Reductionist, provided the https server is "closer" to Reductionist
-        elif self.storage_type == "https" and self._version == 2:
-            # build a simple session
-            session = requests.Session()
-            session.auth = (None, None)
-            session.verify = False
-            bucket = "https"  # really doesn't matter
-
-            # note the extra "storage_type" kwarg
-            # this currently makes Reductionist throw a wobbly
-            # E           activestorage.reductionist.ReductionistError: Reductionist error: HTTP 400: {"error": {"message": "request data is not valid", "caused_by": ["Failed to deserialize the JSON body into the target type", "storage_type: unknown field `storage_type`, expected one of `source`, `bucket`, `object`, `dtype`, `byte_order`, `offset`, `size`, `shape`, `order`, `selection`, `compression`, `filters`, `missing` at line 1 column 550"]}}  # noqa
-
-            # Reductionist returns "count" as a list even for single elements
-            tmp, count = reductionist.reduce_chunk(
-                session,
-                "https://reductionist.jasmin.ac.uk/",  # Wacasoft
-                self.filename,
-                bucket,
-                self.filename,
-                offset,
-                size,
-                compressor,
-                filters,
-                self.missing,
-                np.dtype(ds.dtype),
-                chunks,
-                ds._order,
-                chunk_selection,
-                axis,
-                operation=self._method,
-                storage_type="https")
-        elif self.storage_type == 'ActivePosix' and self.version == 2:
+        elif self.interface_type == "https" and self._version == 2:
+            tmp, count = reductionist.reduce_chunk(session,
+                                                   self.active_storage_url,
+                                                   self.filename,
+                                                   offset,
+                                                   size,
+                                                   compressor,
+                                                   filters,
+                                                   self.missing,
+                                                   np.dtype(ds.dtype),
+                                                   chunks,
+                                                   ds._order,
+                                                   chunk_selection,
+                                                   axis,
+                                                   operation=self._method,
+                                                   interface_type="https")
+
+        elif self.interface_type == 'ActivePosix' and self.version == 2:
             # This is where the DDN Fuse and Infinia wrappers go
             raise NotImplementedError
         else: