Skip to content

Commit e836360

Browse files
committed
add obstore storage backend for faster dataset reads in open_mdio and write_mdio
1 parent a4bbb74 commit e836360

3 files changed

Lines changed: 82 additions & 12 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ dependencies = [
3535
]
3636

3737
[project.optional-dependencies]
38-
cloud = ["s3fs>=2025.9.0", "gcsfs>=2025.9.0", "adlfs>=2025.8.0"]
38+
cloud = ["s3fs>=2025.9.0", "gcsfs>=2025.9.0", "adlfs>=2025.8.0", "obstore>=0.8.2"]
3939
distributed = ["distributed>=2025.9.1", "bokeh>=3.8.0"]
4040
lossy = ["zfpy>=1.0.1"]
4141

src/mdio/api/io.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,36 +3,51 @@
33
from __future__ import annotations
44

55
from typing import TYPE_CHECKING
6-
from typing import Any
76
from typing import Literal
87

98
import zarr
9+
from obstore.store import from_url as obstore_from_url
1010
from upath import UPath
1111
from xarray import Dataset as xr_Dataset
1212
from xarray import open_zarr as xr_open_zarr
1313
from xarray.backends.writers import to_zarr as xr_to_zarr
14+
from zarr.storage import FsspecStore
15+
from zarr.storage import ObjectStore
1416

1517
from mdio.constants import ZarrFormat
1618
from mdio.core.zarr_io import zarr_warnings_suppress_unstable_structs_v3
1719

1820
if TYPE_CHECKING:
1921
from collections.abc import Mapping
2022
from pathlib import Path
23+
from typing import Any
2124

2225
from xarray import Dataset
2326
from xarray.core.types import T_Chunks
2427
from xarray.core.types import ZarrWriteModes
28+
from zarr.abc.store import Store
29+
30+
31+
StorageBackendT = Literal["fsspec", "obstore"]
2532

2633

2734
def _normalize_path(path: UPath | Path | str) -> UPath:
2835
return UPath(path)
2936

3037

31-
def _normalize_storage_options(path: UPath) -> dict[str, Any] | None:
32-
return None if len(path.storage_options) == 0 else path.storage_options
38+
def _get_store(upath: UPath, storage_backend: StorageBackendT) -> Store:
39+
if storage_backend == "obstore":
40+
uri = upath.as_posix()
41+
storage_options: Mapping[str, Any] = getattr(upath, "storage_options", {})
42+
return ObjectStore(obstore_from_url(uri, **storage_options))
43+
return FsspecStore.from_upath(upath)
3344

3445

35-
def open_mdio(input_path: UPath | Path | str, chunks: T_Chunks = None) -> xr_Dataset:
46+
def open_mdio(
47+
input_path: UPath | Path | str,
48+
chunks: T_Chunks = None,
49+
storage_backend: StorageBackendT = "fsspec",
50+
) -> xr_Dataset:
3651
"""Open a Zarr dataset from the specified universal file path.
3752
3853
Args:
@@ -45,18 +60,19 @@ def open_mdio(input_path: UPath | Path | str, chunks: T_Chunks = None) -> xr_Dat
4560
- ``chunks={dim: chunk, ...}`` loads the data with dask using the specified chunk size for each dimension.
4661
4762
See dask chunking for more details.
63+
storage_backend: The storage backend to use for reading the dataset. Defaults to "fsspec".
64+
For faster reads use "obstore". However, it is not as broadly compatible as "fsspec".
4865
4966
Returns:
5067
An Xarray dataset opened from the input path.
5168
"""
5269
input_path = _normalize_path(input_path)
53-
storage_options = _normalize_storage_options(input_path)
5470
zarr_format = zarr.config.get("default_zarr_format")
5571

72+
input_store = _get_store(input_path, storage_backend)
5673
return xr_open_zarr(
57-
input_path.as_posix(),
74+
input_store,
5875
chunks=chunks,
59-
storage_options=storage_options,
6076
mask_and_scale=zarr_format == ZarrFormat.V3, # off for v2, on for v3
6177
consolidated=zarr_format == ZarrFormat.V2, # on for v2, off for v3
6278
)
@@ -69,6 +85,7 @@ def to_mdio( # noqa: PLR0913
6985
*,
7086
compute: bool = True,
7187
region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
88+
storage_backend: StorageBackendT = "fsspec",
7289
) -> None:
7390
"""Write dataset contents to an MDIO output_path.
7491
@@ -81,23 +98,24 @@ def to_mdio( # noqa: PLR0913
8198
"a-" means only append those variables that have ``append_dim``.
8299
"r+" means modify existing array *values* only (raise an error if any metadata or shapes would change).
83100
The default mode is "r+" if ``region`` is set and ``w-`` otherwise.
84-
compute: If True write array data immediately; otherwise return a ``dask.delayed.Delayed`` object that
101+
compute: If True writes array data immediately; otherwise return a ``dask.delayed.Delayed`` object that
85102
can be computed to write array data later. Metadata is always updated eagerly.
86103
region: Optional mapping from dimension names to either a) ``"auto"``, or b) integer slices, indicating
87104
the region of existing MDIO array(s) in which to write this dataset's data.
105+
storage_backend: The storage backend to use for reading the dataset. Defaults to "fsspec".
106+
For faster reads use "obstore". However, it is not as broadly compatible as "fsspec".
88107
"""
89108
output_path = _normalize_path(output_path)
90-
storage_options = _normalize_storage_options(output_path)
91109
zarr_format = zarr.config.get("default_zarr_format")
92110

111+
output_store = _get_store(output_path, storage_backend)
93112
with zarr_warnings_suppress_unstable_structs_v3():
94113
xr_to_zarr(
95114
dataset,
96-
store=output_path.as_posix(), # xarray doesn't like URI when file:// is protocol
115+
store=output_store,
97116
mode=mode,
98117
compute=compute,
99118
consolidated=zarr_format == ZarrFormat.V2, # on for v2, off for v3
100119
region=region,
101-
storage_options=storage_options,
102120
write_empty_chunks=False,
103121
)

uv.lock

Lines changed: 52 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)