Skip to content

Commit 13a95c3

Browse files
Merge pull request #145 from zequihg50/is_cloud_optim
Proposal for is_cloud_optimized
2 parents e434584 + c315877 commit 13a95c3

4 files changed

Lines changed: 108 additions & 26 deletions

File tree

Changelog.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
Version 1.0.0
2+
-------------
3+
4+
**2025-11-12**
5+
6+
* Added consolidated metadata functionality by `Ezequiel Cimadevilla <https://github.com/zequihg50>`_ in https://github.com/NCAS-CMS/pyfive/pull/145
7+
18
Version 0.9.0
29
-------------
310

pyfive/h5d.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,19 +62,19 @@ def __init__(self, dataobject, noindex=False, pseudo_chunking_size_MB=4):
6262
# No file descriptor => Not Posix
6363
self.posix = False
6464
self.__fh = fh
65-
self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024
65+
self.pseudo_chunking_size = pseudo_chunking_size_MB * 1024 * 1024
6666
try:
6767
# maybe this is an S3File instance?
68-
self._filename = getattr(fh,'path')
68+
self._filename = getattr(fh, 'path')
6969
except:
7070
# maybe a remote https file opened as bytes?
7171
# failing that, maybe a memory file, return as None
72-
self._filename = getattr(fh,'full_name','None')
72+
self._filename = getattr(fh, 'full_name', 'None')
7373
else:
7474
# Has a file descriptor => Posix
7575
self.posix = True
7676
self._filename = fh.name
77-
self.pseudo_chunking_size = 0
77+
self.pseudo_chunking_size = 0
7878

7979
self.filter_pipeline = dataobject.filter_pipeline
8080
self.shape = dataobject.shape
@@ -284,7 +284,11 @@ def first_chunk(self):
284284
285285
"""
286286
self.__chunk_init_check()
287-
return self.get_chunk_info(0).byte_offset
287+
min_offset = None
288+
for k in self._index:
289+
if min_offset is None or self._index[k].byte_offset < min_offset:
290+
min_offset = self._index[k].byte_offset
291+
return min_offset
288292

289293
#### The following method can be used to set pseudo chunking size after the
290294
#### file has been closed and before data transactions. This is pyfive specific

pyfive/high_level.py

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from pyfive.h5py import Datatype
1515

1616

17-
1817
class Group(Mapping):
1918
"""
2019
An HDF5 Group which may hold attributes, datasets, or other groups.
@@ -64,7 +63,6 @@ def __getitem__(self, y):
6463
"""
6564
return self.__getitem_lazy_control(y, noindex=False)
6665

67-
6866
def get_lazy_view(self, y):
6967
"""
7068
This instantiates the object y, and if it is a
@@ -81,7 +79,6 @@ def get_lazy_view(self, y):
8179

8280
return self.__getitem_lazy_control(y, noindex=True)
8381

84-
8582
def __getitem_lazy_control(self, y, noindex):
8683
"""
8784
This is the routine which actually does the get item
@@ -130,7 +127,7 @@ def __getitem_lazy_control(self, y, noindex):
130127
if additional_obj != '.':
131128
raise KeyError('%s is a dataset, not a group' % (obj_name))
132129
return Dataset(obj_name, DatasetID(dataobjs, noindex=noindex), self)
133-
130+
134131
try:
135132
# if true, this may well raise a NotImplementedError, if so, we need
136133
# to warn the user, who may be able to use other parts of the data.
@@ -263,14 +260,35 @@ def __init__(self, filename, mode='r'):
263260
self.userblock_size = 0
264261
super(File, self).__init__('/', dataobjects, self)
265262

263+
@property
def consolidated_metadata(self):
    """Return True if the file's metadata is "consolidated" (cloud optimized).

    For every chunked dataset at the root level (layout class 2, i.e. chunked
    storage indexed by a chunk B-tree), check that the last B-tree node ends
    before the first data chunk begins. If all B-tree nodes precede all chunk
    data, a reader can fetch the metadata in one contiguous range request.

    Returns True vacuously when the file contains no chunked datasets.

    NOTE(review): only top-level members are inspected; datasets inside
    sub-groups are not visited — confirm whether recursion is intended.
    """
    max_btree, min_chunk = None, None

    for key in self:
        # Hoist the lookup: each __getitem__ instantiates a fresh object,
        # so the original's repeated self[key] accesses were wasteful.
        member = self[key]
        if not isinstance(member, Dataset):
            continue
        # layout_class 2 == chunked storage (B-tree indexed)
        if member.id.layout_class != 2:
            continue
        btree_end = member.id.btree_range[1]
        if max_btree is None or btree_end > max_btree:
            max_btree = btree_end
        first_chunk = member.id.first_chunk
        if min_chunk is None or first_chunk < min_chunk:
            min_chunk = first_chunk

    if max_btree is None or min_chunk is None:
        # No chunked datasets seen: vacuously consolidated.
        return True
    return max_btree < min_chunk
283+
266284
def __repr__(self):
267285
return '<HDF5 file "%s" (mode r)>' % (os.path.basename(self.filename))
268286

269287
def _get_object_by_address(self, obj_addr):
270288
""" Return the object pointed to by a given address. """
271289
if self._dataobjects.offset == obj_addr:
272290
return self
273-
291+
274292
queue = deque([(self.name.rstrip('/'), self)])
275293
while queue:
276294
base, grp = queue.popleft()
@@ -288,6 +306,7 @@ def close(self):
288306
""" Close the file. """
289307
if self._close:
290308
self._fh.close()
309+
291310
__del__ = close
292311

293312
def __enter__(self):
@@ -340,7 +359,6 @@ class Dataset(object):
340359
Group instance containing this dataset.
341360
342361
"""
343-
344362

345363
def __init__(self, name, datasetid, parent):
346364
""" initalize. """
@@ -349,15 +367,14 @@ def __init__(self, name, datasetid, parent):
349367
self.name = name
350368
self._attrs = None
351369
self._astype = None
352-
353-
self.id=datasetid
370+
371+
self.id = datasetid
354372
""" This is the DatasetID instance which provides the actual data access methods. """
355373

356-
#horrible kludge for now,
357-
#https://github.com/NCAS-CMS/pyfive/issues/13#issuecomment-2557121461
358-
#we hide stuff we need here
374+
# horrible kludge for now,
375+
# https://github.com/NCAS-CMS/pyfive/issues/13#issuecomment-2557121461
376+
# we hide stuff we need here
359377
self._dataobjects = self.id._meta
360-
361378

362379
def __repr__(self):
363380
info = (os.path.basename(self.name), self.shape, self.dtype)
@@ -392,16 +409,15 @@ def astype(self, dtype):
392409
def len(self):
393410
""" Return the size of the first axis. """
394411
return self.shape[0]
395-
412+
396413
def iter_chunks(self, *args):
397414
return self.id.iter_chunks(args)
398-
399415

400416
@property
401417
def shape(self):
402418
""" shape attribute. """
403419
return self.id.shape
404-
420+
405421
@property
406422
def maxshape(self):
407423
""" maxshape attribute. (None for unlimited dimensions) """
@@ -473,15 +489,17 @@ def dims(self):
473489
def attrs(self):
474490
""" attrs attribute. """
475491
return self.id._meta.attributes
476-
492+
493+
477494
class DimensionManager(Sequence):
478495
""" Represents a collection of dimensions associated with a dataset. """
496+
479497
def __init__(self, dset):
480498
ndim = len(dset.shape)
481-
dim_list = [[]]*ndim
499+
dim_list = [[]] * ndim
482500
if 'DIMENSION_LIST' in dset.attrs:
483501
dim_list = dset.attrs['DIMENSION_LIST']
484-
dim_labels = [b'']*ndim
502+
dim_labels = [b''] * ndim
485503
if 'DIMENSION_LABELS' in dset.attrs:
486504
dim_labels = dset.attrs['DIMENSION_LABELS']
487505
self._dims = [
@@ -521,8 +539,9 @@ class AstypeContext(object):
521539
"""
522540
Context manager which allows changing the type read from a dataset.
523541
"""
524-
#FIXME:ENUM should this allow a conversion from enum base types to values using dictionary?
525-
#Probably not, as it would be additional functionality to the h5py interface???
542+
543+
# FIXME:ENUM should this allow a conversion from enum base types to values using dictionary?
544+
# Probably not, as it would be additional functionality to the h5py interface???
526545

527546
def __init__(self, dset, dtype):
528547
self._dset = dset
@@ -533,4 +552,3 @@ def __enter__(self):
533552

534553
def __exit__(self, *args):
535554
self._dset._astype = None
536-
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import numpy as np
2+
import pytest
3+
4+
import pyfive
5+
import h5py
6+
7+
8+
def test_consolidated_metadata(name, name_consolidated, data, vname):
    """Exercise File.consolidated_metadata against both file layouts."""
    # File written normally: the last B-tree node sits after the first
    # physical chunk, so the metadata is NOT consolidated.
    with pyfive.File(name) as hfile:
        dsid = hfile[vname].id
        assert dsid.btree_range[1] > dsid.first_chunk
        assert not hfile.consolidated_metadata

    # File written with a large meta block: every B-tree node precedes
    # the first physical chunk, so the metadata IS consolidated.
    with pyfive.File(name_consolidated) as hfile:
        dsid = hfile[vname].id
        assert dsid.btree_range[1] < dsid.first_chunk
        assert hfile.consolidated_metadata
16+
17+
18+
@pytest.fixture(scope='module')
def data():
    """A 365 x 721 x 1440 float32 array filled with an increasing ramp."""
    shape = (365, 721, 1440)
    count = 365 * 721 * 1440
    return np.arange(count, dtype="f4").reshape(shape)
21+
22+
23+
@pytest.fixture(scope='module')
def vname():
    """Name of the dataset written to the test files."""
    return "a"
26+
27+
28+
@pytest.fixture(scope='module')
def name(data, vname, modular_tmp_path):
    """Write an HDF5 file whose chunk B-tree is NOT consolidated.

    The dataset is written back half first so that the first logical chunk
    (0, 0, 0) is not the first physical chunk (lowest byte offset).
    Returns the path to the written file.
    """
    name = modular_tmp_path / 'non-consolidated-metadata.hdf5'

    with h5py.File(name, 'w') as hfile:
        hfile.create_dataset(vname, dtype="float32", shape=data.shape,
                             chunks=(1, 721, 1440),
                             compression="gzip", shuffle=True)
        # in this way first logical chunk (0,0,0) will not be first
        # physical chunk (byte offset)
        # Fix: use vname consistently (original hardcoded "a" here, which
        # silently breaks if the vname fixture ever changes).
        hfile[vname][250:] = data[250:]
        hfile[vname][:250] = data[:250]

    return name
40+
41+
42+
@pytest.fixture(scope='module')
def name_consolidated(data, vname, modular_tmp_path):
    """Write an HDF5 file with consolidated metadata.

    ``meta_block_size=2**20`` makes h5py reserve a 1 MiB metadata block at
    the start of the file, so all B-tree nodes land before any chunk data.
    The dataset is still written back half first so the first logical chunk
    (0, 0, 0) is not the first physical chunk. Returns the file path.
    """
    name_co = modular_tmp_path / 'consolidated-metadata.hdf5'

    with h5py.File(name_co, 'w', meta_block_size=2 ** 20) as hfile:
        hfile.create_dataset(vname, dtype="float32", shape=data.shape,
                             chunks=(1, 721, 1440),
                             compression="gzip", shuffle=True)
        # in this way first logical chunk (0,0,0) will not be first
        # physical chunk (byte offset)
        # Fix: use vname consistently (original hardcoded "a" here, which
        # silently breaks if the vname fixture ever changes).
        hfile[vname][250:] = data[250:]
        hfile[vname][:250] = data[:250]

    return name_co

0 commit comments

Comments (0)