Skip to content

Commit 865ef2e

Browse files
Use ManifestStore to_virtual_variable directly in HDFVirtualBackend.
1 parent 7a97287 commit 865ef2e

2 files changed

Lines changed: 58 additions & 56 deletions

File tree

virtualizarr/common.py

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -59,26 +59,10 @@ def replace_virtual_with_loadable_vars(
5959
group=group,
6060
decode_times=decode_times,
6161
) as loadable_ds:
62-
var_names_to_load: list[Hashable]
63-
64-
if isinstance(loadable_variables, list):
65-
var_names_to_load = list(loadable_variables)
66-
elif loadable_variables is None:
67-
# If `loadable_variables` is None, then we have to explicitly match default
68-
# behaviour of xarray, i.e., load and create indexes only for dimension
69-
# coordinate variables. We already have all the indexes and variables
70-
# we should be keeping - we just need to distinguish them.
71-
var_names_to_load = [
72-
name
73-
for name, var in loadable_ds.variables.items()
74-
if var.dims == (name,)
75-
]
76-
else:
77-
raise ValueError(
78-
"loadable_variables must be an iterable of string variable names,"
79-
f" or None, but got type {type(loadable_variables)}"
80-
)
81-
62+
var_names_to_load = get_loadable_variables(
63+
dataset=loadable_ds,
64+
loadable_variables=loadable_variables
65+
)
8266
# this will automatically keep any IndexVariables needed for loadable 1D coordinates
8367
loadable_var_names_to_drop = set(loadable_ds.variables).difference(
8468
var_names_to_load
@@ -99,6 +83,34 @@ def replace_virtual_with_loadable_vars(
9983
],
10084
)
10185

86+
def get_loadable_variables(
87+
dataset: xr.Dataset,
88+
loadable_variables: Iterable[Hashable] | None = None,
89+
) -> Iterable[Hashable]:
90+
var_names_to_load: list[Hashable]
91+
92+
if isinstance(loadable_variables, list):
93+
var_names_to_load = list(loadable_variables)
94+
elif loadable_variables is None:
95+
# If `loadable_variables` is None, then we have to explicitly match default
96+
# behaviour of xarray, i.e., load and create indexes only for dimension
97+
# coordinate variables. We already have all the indexes and variables
98+
# we should be keeping - we just need to distinguish them.
99+
var_names_to_load = [
100+
name
101+
for name, var in dataset.variables.items()
102+
if var.dims == (name,)
103+
]
104+
else:
105+
raise ValueError(
106+
"loadable_variables must be an iterable of string variable names,"
107+
f" or None, but got type {type(loadable_variables)}"
108+
)
109+
non_loadable_vars = set(dataset.variables).difference(
110+
var_names_to_load
111+
)
112+
return var_names_to_load
113+
102114

103115
# TODO this probably doesn't need to actually support indexes != {}
104116
def separate_coords(

virtualizarr/readers/hdf/hdf.py

Lines changed: 26 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@
2121

2222
from virtualizarr.codecs import numcodec_config_to_configurable
2323
from virtualizarr.common import (
24-
construct_fully_virtual_dataset,
25-
replace_virtual_with_loadable_vars,
24+
get_loadable_variables,
2625
)
2726
from virtualizarr.manifests import (
2827
ChunkEntry,
@@ -139,7 +138,8 @@ def _construct_manifest_group(
139138
group=g
140139
)
141140
drop_variables = list(set(drop_variables + non_coordinate_dimesion_vars))
142-
attrs: dict[str, Any] = {}
141+
# attrs: dict[str, Any] = {}
142+
attrs = HDFVirtualBackend._extract_attrs(g)
143143
for key in g.keys():
144144
if key not in drop_variables:
145145
if isinstance(g[key], h5py.Dataset):
@@ -151,18 +151,19 @@ def _construct_manifest_group(
151151
if variable is not None:
152152
manifest_dict[key] = variable
153153
return ManifestGroup(arrays=manifest_dict, attributes=attrs)
154-
154+
155155
@staticmethod
156156
def _create_manifest_store(
157157
filepath: str,
158158
*,
159159
prefix: str,
160160
store: ObjectStore,
161161
group: str | None = None,
162+
drop_variables: Iterable[str] | None = None,
162163
) -> ManifestStore:
163164
# Create a group containing dataset level metadata and all the manifest arrays
164165
manifest_group = HDFVirtualBackend._construct_manifest_group(
165-
store=store, filepath=filepath, group=group
166+
store=store, filepath=filepath, group=group, drop_variables=drop_variables,
166167
)
167168
# Convert to a manifest store
168169
return ManifestStore(stores={prefix: store}, group=manifest_group)
@@ -185,45 +186,34 @@ def open_virtual_dataset(
185186
"HDF reader does not understand any virtual_backend_kwargs"
186187
)
187188

188-
filepath = validate_and_normalize_path_to_uri(
189-
filepath, fs_root=Path.cwd().as_uri()
190-
)
189+
# filepath = validate_and_normalize_path_to_uri(
190+
# filepath, fs_root=Path.cwd().as_uri()
191+
# )
191192

192193
_drop_vars: list[Hashable] = (
193194
[] if drop_variables is None else list(drop_variables)
194195
)
195-
196-
# TODO provide a way to drop a variable _before_ h5py attempts to inspect it?
197-
virtual_vars = HDFVirtualBackend._virtual_vars_from_hdf(
198-
path=filepath,
196+
from obstore.store import LocalStore
197+
manifest_store = HDFVirtualBackend._create_manifest_store(
198+
filepath=filepath,
199+
store=LocalStore(),
200+
prefix="file://",
201+
drop_variables=_drop_vars,
199202
group=group,
200-
reader_options=reader_options,
201-
)
202-
203-
attrs = HDFVirtualBackend._get_group_attrs(
204-
path=filepath, reader_options=reader_options, group=group
205203
)
206-
coordinates_attr = attrs.pop("coordinates", "")
207-
coord_names = coordinates_attr.split()
208-
209-
fully_virtual_dataset = construct_fully_virtual_dataset(
210-
virtual_vars=virtual_vars,
211-
coord_names=coord_names,
212-
attrs=attrs,
213-
)
214-
215-
vds = replace_virtual_with_loadable_vars(
216-
fully_virtual_dataset,
217-
filepath,
218-
group=group,
204+
ds_virtual = manifest_store.to_virtual_dataset()
205+
_loadable_vars = get_loadable_variables(
206+
dataset=ds_virtual,
219207
loadable_variables=loadable_variables,
220-
reader_options=reader_options,
221-
indexes=indexes,
222-
decode_times=decode_times,
223208
)
224-
225-
return vds.drop_vars(_drop_vars)
226-
209+
non_loadable_vars = set(ds_virtual.variables).difference(_loadable_vars)
210+
ds_loadable = xr.open_zarr(
211+
manifest_store, consolidated=False, zarr_format=3, drop_variables=non_loadable_vars
212+
)
213+
ds_virtual = ds_virtual.drop_vars(_loadable_vars)
214+
ds = xr.merge([ds_virtual, ds_loadable])
215+
return ds
216+
227217
@staticmethod
228218
def _dataset_chunk_manifest(
229219
path: str,

0 commit comments

Comments
 (0)