Skip to content

Commit 13a95c3

Browse files
Merge pull request #145 from zequihg50/is_cloud_optim
Proposal for is_cloud_optimized
2 parents e434584 + c315877 commit 13a95c3

4 files changed

Lines changed: 108 additions & 26 deletions

File tree

Changelog.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
Version 1.0.0
2+
-------------
3+
4+
**2025-11-12**
5+
6+
* Added consolidated metadata functionality by `Ezequiel Cimadevilla <https://github.com/zequihg50>`_ in https://github.com/NCAS-CMS/pyfive/pull/145
7+
18
Version 0.9.0
29
-------------
310

pyfive/h5d.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,19 +62,19 @@ def __init__(self, dataobject, noindex=False, pseudo_chunking_size_MB=4):
6262
# No file descriptor => Not Posix
6363
self.posix = False
6464
self.__fh = fh
65-
self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024
65+
self.pseudo_chunking_size = pseudo_chunking_size_MB * 1024 * 1024
6666
try:
6767
# maybe this is an S3File instance?
68-
self._filename = getattr(fh,'path')
68+
self._filename = getattr(fh, 'path')
6969
except:
7070
# maybe a remote https file opened as bytes?
7171
# failing that, maybe a memory file, return as None
72-
self._filename = getattr(fh,'full_name','None')
72+
self._filename = getattr(fh, 'full_name', 'None')
7373
else:
7474
# Has a file descriptor => Posix
7575
self.posix = True
7676
self._filename = fh.name
77-
self.pseudo_chunking_size = 0
77+
self.pseudo_chunking_size = 0
7878

7979
self.filter_pipeline = dataobject.filter_pipeline
8080
self.shape = dataobject.shape
@@ -284,7 +284,11 @@ def first_chunk(self):
284284
285285
"""
286286
self.__chunk_init_check()
287-
return self.get_chunk_info(0).byte_offset
287+
min_offset = None
288+
for k in self._index:
289+
if min_offset is None or self._index[k].byte_offset < min_offset:
290+
min_offset = self._index[k].byte_offset
291+
return min_offset
288292

289293
#### The following method can be used to set pseudo chunking size after the
290294
#### file has been closed and before data transactions. This is pyfive specific

pyfive/high_level.py

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from pyfive.h5py import Datatype
1515

1616

17-
1817
class Group(Mapping):
1918
"""
2019
An HDF5 Group which may hold attributes, datasets, or other groups.
@@ -64,7 +63,6 @@ def __getitem__(self, y):
6463
"""
6564
return self.__getitem_lazy_control(y, noindex=False)
6665

67-
6866
def get_lazy_view(self, y):
6967
"""
7068
This instantiates the object y, and if it is a
@@ -81,7 +79,6 @@ def get_lazy_view(self, y):
8179

8280
return self.__getitem_lazy_control(y, noindex=True)
8381

84-
8582
def __getitem_lazy_control(self, y, noindex):
8683
"""
8784
This is the routine which actually does the get item
@@ -130,7 +127,7 @@ def __getitem_lazy_control(self, y, noindex):
130127
if additional_obj != '.':
131128
raise KeyError('%s is a dataset, not a group' % (obj_name))
132129
return Dataset(obj_name, DatasetID(dataobjs, noindex=noindex), self)
133-
130+
134131
try:
135132
# if true, this may well raise a NotImplementedError, if so, we need
136133
# to warn the user, who may be able to use other parts of the data.
@@ -263,14 +260,35 @@ def __init__(self, filename, mode='r'):
263260
self.userblock_size = 0
264261
super(File, self).__init__('/', dataobjects, self)
265262

263+
@property
def consolidated_metadata(self):
    """Return True if the file's metadata is "consolidated" (cloud optimized).

    For every chunked dataset at the root level (layout class 2, i.e. chunked
    storage indexed by a chunk B-tree), check that the last B-tree node ends
    before the first data chunk begins. If all B-tree nodes precede all chunk
    data, a reader can fetch the metadata in one contiguous range request.

    Returns True vacuously when the file contains no chunked datasets.

    NOTE(review): only top-level members are inspected; datasets inside
    sub-groups are not visited — confirm whether recursion is intended.
    """
    max_btree, min_chunk = None, None

    for key in self:
        # Hoist the lookup: each __getitem__ instantiates a fresh object,
        # so the original's repeated self[key] accesses were wasteful.
        member = self[key]
        if not isinstance(member, Dataset):
            continue
        # layout_class 2 == chunked storage (B-tree indexed)
        if member.id.layout_class != 2:
            continue
        btree_end = member.id.btree_range[1]
        if max_btree is None or btree_end > max_btree:
            max_btree = btree_end
        first_chunk = member.id.first_chunk
        if min_chunk is None or first_chunk < min_chunk:
            min_chunk = first_chunk

    if max_btree is None or min_chunk is None:
        # No chunked datasets seen: vacuously consolidated.
        return True
    return max_btree < min_chunk
283+
266284
def __repr__(self):
267285
return '<HDF5 file "%s" (mode r)>' % (os.path.basename(self.filename))
268286

269287
def _get_object_by_address(self, obj_addr):
270288
""" Return the object pointed to by a given address. """
271289
if self._dataobjects.offset == obj_addr:
272290
return self
273-
291+
274292
queue = deque([(self.name.rstrip('/'), self)])
275293
while queue:
276294
base, grp = queue.popleft()
@@ -288,6 +306,7 @@ def close(self):
288306
""" Close the file. """
289307
if self._close:
290308
self._fh.close()
309+
291310
__del__ = close
292311

293312
def __enter__(self):
@@ -340,7 +359,6 @@ class Dataset(object):
340359
Group instance containing this dataset.
341360
342361
"""
343-
344362

345363
def __init__(self, name, datasetid, parent):
346364
""" initalize. """
@@ -349,15 +367,14 @@ def __init__(self, name, datasetid, parent):
349367
self.name = name
350368
self._attrs = None
351369
self._astype = None
352-
353-
self.id=datasetid
370+
371+
self.id = datasetid
354372
""" This is the DatasetID instance which provides the actual data access methods. """
355373

356-
#horrible kludge for now,
357-
#https://github.com/NCAS-CMS/pyfive/issues/13#issuecomment-2557121461
358-
#we hide stuff we need here
374+
# horrible kludge for now,
375+
# https://github.com/NCAS-CMS/pyfive/issues/13#issuecomment-2557121461
376+
# we hide stuff we need here
359377
self._dataobjects = self.id._meta
360-
361378

362379
def __repr__(self):
363380
info = (os.path.basename(self.name), self.shape, self.dtype)
@@ -392,16 +409,15 @@ def astype(self, dtype):
392409
def len(self):
393410
""" Return the size of the first axis. """
394411
return self.shape[0]
395-
412+
396413
def iter_chunks(self, *args):
397414
return self.id.iter_chunks(args)
398-
399415

400416
@property
401417
def shape(self):
402418
""" shape attribute. """
403419
return self.id.shape
404-
420+
405421
@property
406422
def maxshape(self):
407423
""" maxshape attribute. (None for unlimited dimensions) """
@@ -473,15 +489,17 @@ def dims(self):
473489
def attrs(self):
474490
""" attrs attribute. """
475491
return self.id._meta.attributes
476-
492+
493+
477494
class DimensionManager(Sequence):
478495
""" Represents a collection of dimensions associated with a dataset. """
496+
479497
def __init__(self, dset):
480498
ndim = len(dset.shape)
481-
dim_list = [[]]*ndim
499+
dim_list = [[]] * ndim
482500
if 'DIMENSION_LIST' in dset.attrs:
483501
dim_list = dset.attrs['DIMENSION_LIST']
484-
dim_labels = [b'']*ndim
502+
dim_labels = [b''] * ndim
485503
if 'DIMENSION_LABELS' in dset.attrs:
486504
dim_labels = dset.attrs['DIMENSION_LABELS']
487505
self._dims = [
@@ -521,8 +539,9 @@ class AstypeContext(object):
521539
"""
522540
Context manager which allows changing the type read from a dataset.
523541
"""
524-
#FIXME:ENUM should this allow a conversion from enum base types to values using dictionary?
525-
#Probably not, as it would be additional functionality to the h5py interface???
542+
543+
# FIXME:ENUM should this allow a conversion from enum base types to values using dictionary?
544+
# Probably not, as it would be additional functionality to the h5py interface???
526545

527546
def __init__(self, dset, dtype):
528547
self._dset = dset
@@ -533,4 +552,3 @@ def __enter__(self):
533552

534553
def __exit__(self, *args):
535554
self._dset._astype = None
536-
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import numpy as np
2+
import pytest
3+
4+
import pyfive
5+
import h5py
6+
7+
8+
def test_consolidated_metadata(name, name_consolidated, data, vname):
    """Exercise File.consolidated_metadata against both file layouts."""
    # File written normally: the last B-tree node sits after the first
    # physical chunk, so the metadata is NOT consolidated.
    with pyfive.File(name) as hfile:
        dsid = hfile[vname].id
        assert dsid.btree_range[1] > dsid.first_chunk
        assert not hfile.consolidated_metadata

    # File written with a large meta block: every B-tree node precedes
    # the first physical chunk, so the metadata IS consolidated.
    with pyfive.File(name_consolidated) as hfile:
        dsid = hfile[vname].id
        assert dsid.btree_range[1] < dsid.first_chunk
        assert hfile.consolidated_metadata
16+
17+
18+
@pytest.fixture(scope='module')
def data():
    """A 365 x 721 x 1440 float32 array filled with an increasing ramp."""
    shape = (365, 721, 1440)
    count = 365 * 721 * 1440
    return np.arange(count, dtype="f4").reshape(shape)
21+
22+
23+
@pytest.fixture(scope='module')
def vname():
    """Name of the dataset written to the test files."""
    return "a"
26+
27+
28+
@pytest.fixture(scope='module')
def name(data, vname, modular_tmp_path):
    """Write an HDF5 file whose chunk B-tree is NOT consolidated.

    The dataset is written back half first so that the first logical chunk
    (0, 0, 0) is not the first physical chunk (lowest byte offset).
    Returns the path to the written file.
    """
    name = modular_tmp_path / 'non-consolidated-metadata.hdf5'

    with h5py.File(name, 'w') as hfile:
        hfile.create_dataset(vname, dtype="float32", shape=data.shape,
                             chunks=(1, 721, 1440),
                             compression="gzip", shuffle=True)
        # in this way first logical chunk (0,0,0) will not be first
        # physical chunk (byte offset)
        # Fix: use vname consistently (original hardcoded "a" here, which
        # silently breaks if the vname fixture ever changes).
        hfile[vname][250:] = data[250:]
        hfile[vname][:250] = data[:250]

    return name
40+
41+
42+
@pytest.fixture(scope='module')
def name_consolidated(data, vname, modular_tmp_path):
    """Write an HDF5 file with consolidated metadata.

    ``meta_block_size=2**20`` makes h5py reserve a 1 MiB metadata block at
    the start of the file, so all B-tree nodes land before any chunk data.
    The dataset is still written back half first so the first logical chunk
    (0, 0, 0) is not the first physical chunk. Returns the file path.
    """
    name_co = modular_tmp_path / 'consolidated-metadata.hdf5'

    with h5py.File(name_co, 'w', meta_block_size=2 ** 20) as hfile:
        hfile.create_dataset(vname, dtype="float32", shape=data.shape,
                             chunks=(1, 721, 1440),
                             compression="gzip", shuffle=True)
        # in this way first logical chunk (0,0,0) will not be first
        # physical chunk (byte offset)
        # Fix: use vname consistently (original hardcoded "a" here, which
        # silently breaks if the vname fixture ever changes).
        hfile[vname][250:] = data[250:]
        hfile[vname][:250] = data[:250]

    return name_co

0 commit comments

Comments (0)