diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 03ac68dad..834a1a912 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -962,7 +962,12 @@ def isbacked(self) -> bool: is_x_none = ( getattr(self._adata_ref if self._is_view else self, "_X", None) is None ) - return is_filename_none and is_x_none + # TODO: How breaking of a change is it to start return True for other things (i.e., read_lazy with method="backed") in AnnData.isbacked? + is_x_open_dataset = isinstance( + getattr(self._adata_ref if self._is_view else self, "_X", None), + ZarrArray | h5py.Dataset | BaseCompressedSparseDataset, + ) + return (is_filename_none and is_x_none) or is_x_open_dataset @property def is_view(self) -> bool: diff --git a/src/anndata/_core/raw.py b/src/anndata/_core/raw.py index a70c771f7..68c474eed 100644 --- a/src/anndata/_core/raw.py +++ b/src/anndata/_core/raw.py @@ -62,9 +62,6 @@ def __init__( self._X = adata.X.copy() self._var = adata.var.copy() self.varm = adata.varm.copy() - elif adata.isbacked: - msg = "Cannot specify X if adata is backed" - raise ValueError(msg) def _get_X(self, layer=None): if layer is not None: diff --git a/src/anndata/_io/specs/__init__.py b/src/anndata/_io/specs/__init__.py index d97578c1d..3893cb5b8 100644 --- a/src/anndata/_io/specs/__init__.py +++ b/src/anndata/_io/specs/__init__.py @@ -2,7 +2,8 @@ from . import lazy_methods, methods from .registry import ( - _LAZY_REGISTRY, # noqa: F401 + _BACKED_REGISTRY, # noqa: F401 + _DASK_REGISTRY, # noqa: F401 _REGISTRY, # noqa: F401 IOSpec, Reader, diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index c3ac4a651..236135746 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -24,7 +24,7 @@ ZarrGroup, ) -from .registry import _LAZY_REGISTRY, IOSpec, read_elem +from .registry import _BACKED_REGISTRY, _DASK_REGISTRY, IOSpec, get_spec, read_elem if TYPE_CHECKING: from collections.abc import Generator, Mapping, Sequence @@ -33,7 +33,7 @@ from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray from ...compat import CSArray, CSMatrix, H5File - from .registry import LazyDataStructures, LazyReader + from .registry import DaskReader, LazyDataStructures BlockInfo = Mapping[ None, @@ -112,14 +112,14 @@ def get_chunksize(obj) -> tuple[int, ...]: raise ValueError(msg) -@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) -@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) -@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) -@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) +@_DASK_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) +@_DASK_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) +@_DASK_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) +@_DASK_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( elem: H5Group | ZarrGroup, *, - _reader: LazyReader, + _reader: DaskReader, chunks: tuple[int, ...] | None = None, # only tuple[int, int] is supported here ) -> DaskArray: import dask.array as da @@ -172,6 +172,19 @@ def read_sparse_as_dask( return da_mtx +@_BACKED_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) +@_BACKED_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) +@_BACKED_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) +@_BACKED_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) +def read_sparse_as_backed( + elem: H5Group | ZarrGroup, + *, + _reader: DaskReader, +) -> CSRDataset | CSCDataset: + """Return a sparse_dataset (CSRDataset/CSCDataset) without going through dask.""" + return ad.io.sparse_dataset(elem, should_cache_indptr=False) + + def resolve_chunks( elem: H5Array | ZarrArray, chunks_arg: tuple[int, ...] | None, @@ -193,11 +206,11 @@ def resolve_chunks( # TODO: `map_blocks` of a string array in h5py is so insanely slow on benchmarking that in the case someone has # a pure string annotation (not categoricals! or nullables strings!), it's probably better to pay the memory penalty. # In the long run, it might be good to figure out what exactly is going on here but for now, this will do. -@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) +@_DASK_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) def read_h5_string_array( elem: H5Array, *, - _reader: LazyReader, + _reader: DaskReader, chunks: tuple[int] | None = None, ) -> DaskArray: import dask.array as da @@ -206,9 +219,9 @@ def read_h5_string_array( return da.from_array(read_elem(elem), chunks=chunks) -@_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) +@_DASK_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem: H5Array, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None + elem: H5Array, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: import dask.array as da @@ -229,22 +242,49 @@ def read_h5_array( ) -@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) +@_DASK_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) +@_DASK_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem: ZarrArray, *, _reader: LazyReader, chunks: tuple[int, ...] | None = None + elem: ZarrArray, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: import dask.array as da return da.from_zarr(elem, chunks=chunks) +@_BACKED_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) +@_BACKED_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) +def read_h5_array_backed( + elem: H5Array, + *, + _reader: DaskReader, +) -> H5Array: + """Return the h5py.Dataset directly (zero-copy, lazy).""" + return elem + + +@_BACKED_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) +@_BACKED_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) +def read_zarr_array_backed( + elem: ZarrArray, + *, + _reader: DaskReader, +) -> ZarrArray: + """Return the zarr.Array directly (zero-copy, lazy).""" + return elem + + def _gen_xarray_dict_iterator_from_elems( elem_dict: dict[str, LazyDataStructures], dim_name: str, index: np.NDArray, ) -> Generator[tuple[str, XVariable], None, None]: - from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray + from anndata.experimental.backed._lazy_arrays import ( + BackedArray, + BackedStringArray, + CategoricalArray, + MaskedArray, + ) from ...compat import xarray as xr @@ -264,6 +304,11 @@ def _gen_xarray_dict_iterator_from_elems( ), }, ) + elif isinstance(v, BackedArray | BackedStringArray) and k != dim_name: + variable = xr.Variable( + [dim_name], + data=xr.core.indexing.LazilyIndexedArray(v), + ) elif k == dim_name: variable = xr.Variable([dim_name], data=index) else: @@ -275,20 +320,38 @@ def _gen_xarray_dict_iterator_from_elems( DUMMY_RANGE_INDEX_KEY = "_anndata_dummy_range_index" -@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -@_LAZY_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) +def _extract_index_from_elem_dict( + elem_dict: dict, + dim_name: str, +) -> np.ndarray: + """Read the index column into memory, handling all supported column types.""" + from xarray.core.indexing import BasicIndexer + + from ...experimental.backed._lazy_arrays import ( + BackedArray, + BackedStringArray, + MaskedArray, + ) + + v = elem_dict[dim_name] + if isinstance(v, DaskArray): + return v.compute() + if isinstance(v, (BackedArray, BackedStringArray, MaskedArray)): + return v[BasicIndexer((slice(None),))] + msg = f"Cannot extract index from {type(v)}" + raise NotImplementedError(msg) + + +@_DASK_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) +@_DASK_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @requires_xarray def read_dataframe( elem: H5Group | ZarrGroup, *, - _reader: LazyReader, + _reader: DaskReader, use_range_index: bool = False, chunks: tuple[int] | None = None, ) -> Dataset2D: - from xarray.core.indexing import BasicIndexer - - from ...experimental.backed._lazy_arrays import MaskedArray - elem_dict = { k: _reader.read_elem(elem[k], chunks=chunks) for k in [*elem.attrs["column-order"], elem.attrs["_index"]] @@ -297,13 +360,7 @@ def read_dataframe( # which is used below as well. if not use_range_index: dim_name = elem.attrs["_index"] - # no sense in reading this in multiple times since xarray requires an in-memory index - if isinstance(elem_dict[dim_name], DaskArray): - index = elem_dict[dim_name].compute() - elif isinstance(elem_dict[dim_name], MaskedArray): - index = elem_dict[dim_name][BasicIndexer((slice(None),))] - else: - raise NotImplementedError() + index = _extract_index_from_elem_dict(elem_dict, dim_name) else: dim_name = DUMMY_RANGE_INDEX_KEY index = pd.RangeIndex(len(elem_dict[elem.attrs["_index"]])).astype("str") @@ -323,13 +380,73 @@ def read_dataframe( return ds -@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -@_LAZY_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) +@_BACKED_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) +@_BACKED_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) +@requires_xarray +def read_dataframe_backed( + elem: H5Group | ZarrGroup, + *, + _reader: DaskReader, + use_range_index: bool = False, +) -> Dataset2D: + """Read a dataframe lazily without dask, using BackedArray/BackedStringArray wrappers.""" + from ...experimental.backed._lazy_arrays import BackedArray, BackedStringArray + + # Read raw elements via the backed registry so sub-columns come back as + # zarr.Array / h5py.Dataset / CategoricalArray / MaskedArray. + raw_dict: dict[str, object] = { + k: _reader.read_elem(elem[k]) + for k in [*elem.attrs["column-order"], elem.attrs["_index"]] + } + + # Wrap raw arrays into BackedArray / BackedStringArray for xarray compatibility. + def _wrap(k: str, v: object) -> object: + if isinstance(v, H5Array | ZarrArray): + iospec = get_spec(elem[k]) # peek at the on-disk spec + if iospec.encoding_type == "string-array": + return BackedStringArray(v) + return BackedArray(v) + # CategoricalArray / MaskedArray already work as XBackendArray + return v + + elem_dict = {k: _wrap(k, v) for k, v in raw_dict.items()} + + if not use_range_index: + dim_name = elem.attrs["_index"] + index = _extract_index_from_elem_dict(elem_dict, dim_name) + else: + dim_name = DUMMY_RANGE_INDEX_KEY + index = pd.RangeIndex(len(raw_dict[elem.attrs["_index"]])).astype("str") + + elem_xarray_dict = dict( + _gen_xarray_dict_iterator_from_elems(elem_dict, dim_name, index) + ) + if use_range_index: + elem_xarray_dict[DUMMY_RANGE_INDEX_KEY] = XVariable( + [DUMMY_RANGE_INDEX_KEY], + data=index, + ) + ds = Dataset2D(XDataset(elem_xarray_dict)) + ds.is_backed = True + ds.true_index_dim = elem.attrs["_index"] + return ds + + +# --------------------------------------------------------------------------- +# Categorical and nullable: identical for both registry modes — register once +# on both registries. +# --------------------------------------------------------------------------- + + +@_DASK_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) +@_DASK_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) +@_BACKED_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) +@_BACKED_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @requires_xarray def read_categorical( elem: H5Group | ZarrGroup, *, - _reader: LazyReader, + _reader: DaskReader, ) -> CategoricalArray: from anndata.experimental.backed._lazy_arrays import CategoricalArray @@ -353,7 +470,7 @@ def read_nullable( encoding_type: Literal[ "nullable-integer", "nullable-boolean", "nullable-string-array" ], - _reader: LazyReader, + _reader: DaskReader, ) -> MaskedArray: from anndata.experimental.backed._lazy_arrays import MaskedArray @@ -375,8 +492,14 @@ def read_nullable( ) -for dtype in ["integer", "boolean", "string-array"]: - for group_type in [ZarrGroup, H5Group]: - _LAZY_REGISTRY.register_read(group_type, IOSpec(f"nullable-{dtype}", "0.1.0"))( - partial(read_nullable, encoding_type=f"nullable-{dtype}") +for _dtype in ["integer", "boolean", "string-array"]: + for _group_type in [ZarrGroup, H5Group]: + _read_nullable_partial = partial( + read_nullable, encoding_type=f"nullable-{_dtype}" ) + _DASK_REGISTRY.register_read( + _group_type, IOSpec(f"nullable-{_dtype}", "0.1.0") + )(_read_nullable_partial) + _BACKED_REGISTRY.register_read( + _group_type, IOSpec(f"nullable-{_dtype}", "0.1.0") + )(_read_nullable_partial) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 51726e4e2..849f82a53 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -11,16 +11,26 @@ from anndata._io.utils import report_read_key_on_error, report_write_key_on_error from anndata._settings import settings -from anndata._types import Read, ReadLazy, _ReadInternal, _ReadLazyInternal +from anndata._types import ( + Read, + ReadBacked, + ReadDask, + _ReadBackedInternal, + _ReadDaskInternal, + _ReadInternal, +) from anndata.compat import DaskArray, ZarrGroup, _read_attr, has_xp from ...utils import warn if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any + from typing import Any, Literal from anndata._types import ( + BackedDataStructures, + DaskDataStructures, + LazyDataStructures, ReadCallback, StorageType, Write, @@ -28,13 +38,8 @@ _GroupStorageType, _WriteInternal, ) - from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray from anndata.typing import RWAble - from ..._core.xarray import Dataset2D - - type LazyDataStructures = DaskArray | Dataset2D | CategoricalArray | MaskedArray - def to_writeable(x): # Convert non-numpy arrays to dlpack @@ -100,7 +105,10 @@ def wrapper(g: _GroupStorageType, k: str, *args, **kwargs) -> None: return decorator -class IORegistry[RI: (_ReadInternal, _ReadLazyInternal), R: (Read, ReadLazy)]: +class IORegistry[ + RI: (_ReadInternal, _ReadDaskInternal, _ReadBackedInternal), + R: (Read, ReadDask, ReadBacked), +]: read: dict[tuple[type, IOSpec, frozenset[str]], RI] read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] write: dict[tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal] @@ -234,7 +242,12 @@ def get_spec(self, elem: Any) -> IOSpec: _REGISTRY: IORegistry[_ReadInternal, Read] = IORegistry() -_LAZY_REGISTRY: IORegistry[_ReadLazyInternal, ReadLazy] = IORegistry() +_DASK_REGISTRY: IORegistry[_ReadDaskInternal, ReadDask] = IORegistry() +_BACKED_REGISTRY: IORegistry[_ReadBackedInternal, ReadBacked] = IORegistry() + +# Legacy aliases kept for any external code that may import them +_LAZY_REGISTRY = _DASK_REGISTRY +_SIMPLE_REGISTRY = _BACKED_REGISTRY @singledispatch @@ -301,7 +314,9 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) -class LazyReader(Reader): +class DaskReader(Reader): + """Reads store elements as dask-backed lazy objects. Uses :data:`_DASK_REGISTRY`.""" + @report_read_key_on_error def read_elem( self, @@ -309,11 +324,11 @@ def read_elem( modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, **kwargs, - ) -> LazyDataStructures: - """Read a dask element from a store. See exported function for more details.""" + ) -> DaskDataStructures: + """Read a dask-backed element from a store. Returns one of :data:`~anndata._types.DaskDataStructures`.""" iospec = get_spec(elem) - read_func: ReadLazy = self.registry.get_read( + read_func: ReadDask = self.registry.get_read( type(elem), iospec, modifiers, reader=self ) if self.callback is not None: @@ -323,7 +338,7 @@ def read_elem( for kwarg in kwargs: if kwarg not in read_params: msg = ( - f"Keyword argument {kwarg} passed to read_elem_lazy are not supported by the " + f"Keyword argument {kwarg} passed to read_elem_lazy are not in dask mode supported by the " "registered read function." ) raise ValueError(msg) @@ -332,6 +347,38 @@ def read_elem( return read_func(elem, **kwargs) +class BackedReader(Reader): + """Reads store elements as file-backed objects (zarr.Array / h5py.Dataset / sparse_dataset) + instead of dask arrays. Uses :data:`_BACKED_REGISTRY`.""" + + @report_read_key_on_error + def read_elem( + self, + elem: StorageType, + modifiers: frozenset[str] = frozenset(), + chunks: tuple[int, ...] | None = None, + **kwargs, + ) -> BackedDataStructures: + """Read a file-backed element from a store. Returns one of :data:`~anndata._types.BackedDataStructures`.""" + + iospec = get_spec(elem) + read_func: ReadBacked = self.registry.get_read( + type(elem), iospec, modifiers, reader=self + ) + if self.callback is not None: + msg = "Backed reading does not use a callback. Ignoring callback." + warn(msg, UserWarning) + read_params = inspect.signature(read_func).parameters + for kwarg in kwargs: + if kwarg not in read_params: + msg = ( + f"Keyword argument {kwarg} passed to read_elem_lazy are not in backed mode supported by the " + "registered read function." + ) + raise ValueError(msg) + return read_func(elem, **kwargs) + + class Writer: def __init__(self, registry: IORegistry, callback: WriteCallback | None = None): self.registry = registry @@ -429,7 +476,11 @@ def read_elem(elem: StorageType) -> RWAble: def read_elem_lazy( - elem: StorageType, chunks: tuple[int, ...] | None = None, **kwargs + elem: StorageType, + chunks: tuple[int, ...] | None = None, + *, + method: Literal["dask", "backed"] = "dask", + **kwargs, ) -> LazyDataStructures: """ Read an element from a store lazily. @@ -449,10 +500,16 @@ def read_elem_lazy( `(adata.shape[0], 1000)` for CSC sparse, and the on-disk chunking otherwise for dense. Can use `-1` or `None` to indicate use of the size of the corresponding dimension. + Ignored when ``mode="backed"``. + method + If ``"dask"`` (the default), return a :class:`dask.array.Array`-backed object. + If ``"backed"``, return a file-backed object (:class:`zarr.Array`, + :class:`h5py.Dataset`, or :class:`~anndata.abc.CSRDataset` / + :class:`~anndata.abc.CSCDataset` for sparse data) without dask. Returns ------- - A "lazy" elem + A lazy elem. Examples -------- @@ -504,7 +561,9 @@ def read_elem_lazy( >>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, -1)) >>> adata.X = ad.experimental.read_elem_lazy(g["X"], chunks=(500, None)) """ - return LazyReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks, **kwargs) + if method == "backed": + return BackedReader(_BACKED_REGISTRY).read_elem(elem, **kwargs) + return DaskReader(_DASK_REGISTRY).read_elem(elem, chunks=chunks, **kwargs) def write_elem( diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 6006b31c3..d90356c96 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -15,14 +15,30 @@ from typing import Any, TypeAlias from anndata._core.xarray import Dataset2D + from anndata.abc import CSCDataset, CSRDataset - from ._io.specs.registry import ( - IOSpec, - LazyDataStructures, - LazyReader, - Reader, - Writer, + #: Objects returned by :class:`DaskReader` — always include a :class:`~dask.array.Array`. + from anndata.compat import DaskArray + from anndata.experimental.backed._lazy_arrays import CategoricalArray, MaskedArray + + from ._io.specs.registry import BackedReader, DaskReader, IOSpec, Reader, Writer + + type DaskDataStructures = DaskArray | Dataset2D | CategoricalArray | MaskedArray + + #: Objects returned by :class:`BackedReader` — file-backed, never dask. + type BackedDataStructures = ( + ZarrArray + | H5Array + | CSRDataset + | CSCDataset + | Dataset2D + | CategoricalArray + | MaskedArray ) + + # Legacy umbrella alias kept for external code that imported it + type LazyDataStructures = DaskDataStructures | BackedDataStructures + else: # https://github.com/tox-dev/sphinx-autodoc-typehints/issues/580 type S = StorageType type RWAble = typing.RWAble @@ -32,12 +48,14 @@ "StorageType", "_ArrayStorageType", "_GroupStorageType", + "_ReadBackedInternal", + "_ReadDaskInternal", "_ReadInternal", "_ReadLazyInternal", "_WriteInternal", ] -# These two are not public, so we don’t make them `type`s +# These two are not public, so we don't make them `type`s _ArrayStorageType: TypeAlias = ZarrArray | H5Array # noqa: UP040 _GroupStorageType: TypeAlias = ZarrGroup | H5Group # noqa: UP040 @@ -53,14 +71,41 @@ class _ReadInternal[S: StorageType, RWAble: typing.RWAble](Protocol): def __call__(self, elem: S, *, _reader: Reader) -> RWAble: ... -class _ReadLazyInternal[S: StorageType](Protocol): +class _ReadDaskInternal[S: StorageType](Protocol): + """Internal protocol for functions registered on :data:`~anndata._io.specs.registry._DASK_REGISTRY`. + + The ``_reader`` is always a :class:`~anndata._io.specs.registry.DaskReader` and + the optional ``chunks`` kwarg controls dask chunking. + Return type is one of :data:`DaskDataStructures`. + """ + def __call__( self, elem: S, *, - _reader: LazyReader, + _reader: DaskReader, chunks: tuple[int, ...] | None = None, - ) -> LazyDataStructures: ... + ) -> DaskDataStructures: ... + + +class _ReadBackedInternal[S: StorageType](Protocol): + """Internal protocol for functions registered on :data:`~anndata._io.specs.registry._BACKED_REGISTRY`. + + The ``_reader`` is always a :class:`~anndata._io.specs.registry.BackedReader`. + There is no ``chunks`` kwarg — backed reads are zero-copy and unchunked. + Return type is one of :data:`BackedDataStructures`. + """ + + def __call__( + self, + elem: S, + *, + _reader: BackedReader, + ) -> BackedDataStructures: ... + + +# Legacy alias: the old single protocol covered both modes. +_ReadLazyInternal = _ReadDaskInternal @set_module("anndata.experimental") @@ -79,25 +124,54 @@ def __call__(self, elem: S) -> RWAble: ... -class ReadLazy[S](Protocol): +class ReadDask[S](Protocol): + """Public callable type produced by :meth:`~anndata._io.specs.registry.DaskReader.read_elem` + after the internal ``_reader`` argument is bound via :func:`functools.partial`. + Returns one of :data:`DaskDataStructures`. + """ + def __call__( self, elem: S, *, chunks: tuple[int, ...] | None = None - ) -> LazyDataStructures: - """Low-level reading function for a lazy element. + ) -> DaskDataStructures: + """Low-level reading function for a dask-backed element. Parameters ---------- elem The element to read from. chunks - The chunk size to be used. + The dask chunk size to be used. + Returns + ------- + A dask-backed lazy element (see :data:`DaskDataStructures`). + """ + ... + + +class ReadBacked[S](Protocol): + """Public callable type produced by :meth:`~anndata._io.specs.registry.BackedReader.read_elem` + after the internal ``_reader`` argument is bound via :func:`functools.partial`. + Returns one of :data:`BackedDataStructures`. + """ + + def __call__(self, elem: S) -> BackedDataStructures: + """Low-level reading function for a file-backed (non-dask) element. + + Parameters + ---------- + elem + The element to read from. Returns ------- - The lazy element read from the store. + A file-backed lazy element (see :data:`BackedDataStructures`). """ ... +# Legacy alias +ReadLazy = ReadDask + + class _WriteInternal[RWAble: typing.RWAble](Protocol): def __call__( self, diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index 2cf33a918..41fd3b7c6 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from collections.abc import MutableMapping + from typing import Literal from anndata._io.specs.registry import IOSpec from anndata._types import Read, StorageType @@ -34,6 +35,7 @@ def read_lazy( store: PathLike[str] | str | MutableMapping | ZarrGroup | h5py.File | h5py.Group, *, load_annotation_index: bool = True, + method: Literal["dask", "backed"] = "dask", ) -> AnnData: """ Lazily read in on-disk/in-cloud AnnData stores, including `obs` and `var`. @@ -43,13 +45,18 @@ def read_lazy( ---------- store A store-like object to be read in. If :class:`zarr.Group`, it is best for it to be consolidated. - If a path to an ``.h5ad`` file is provided, the open HDF5 file will be attached to the {class}`~anndata.AnnData` at the `file` attribute and it will be the user’s responsibility to close it when done with the returned object. + If a path to an ``.h5ad`` file is provided, the open HDF5 file will be attached to the {class}`~anndata.AnnData` at the `file` attribute and it will be the user's responsibility to close it when done with the returned object. For this reason, it is recommended to use an {class}`h5py.File` as the `store` argument when working with h5 files. It must remain open for at least as long as this returned object is in use. load_annotation_index Whether or not to use a range index for the `{obs,var}` :class:`xarray.Dataset` so as not to load the index into memory. If `False`, the real `index` will be inserted as `{obs,var}_names` in the object but not be one of the `coords` thereby preventing read operations. Access to `adata.obs.index` will also only give the dummy index, and not the "real" index that is file-backed. + method + If ``"dask"`` (the default), array data is backed by :class:`dask.array.Array`. + If ``"backed"``, array data is backed directly by :class:`zarr.Array` or :class:`h5py.Dataset` + for dense arrays and by :class:`~anndata.abc.CSRDataset` / :class:`~anndata.abc.CSCDataset` + for sparse arrays, without requiring dask. Returns ------- @@ -149,8 +156,10 @@ def callback(func: Read, /, elem_name: str, elem: StorageType, *, iospec: IOSpec elem_name[:4] in {"/obs", "/var"} or elem_name[:8] in {"/raw/obs", "/raw/var"} ): - return read_elem_lazy(elem, use_range_index=not load_annotation_index) - return read_elem_lazy(elem) + return read_elem_lazy( + elem, use_range_index=not load_annotation_index, method=method + ) + return read_elem_lazy(elem, method=method) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) elif iospec.encoding_type == "dict": diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index e758c220a..c046fa061 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -198,6 +198,36 @@ def dtype(self) -> BaseMaskedDtype | np.dtypes.StringDType[NAType]: raise RuntimeError(msg) from None +class BackedArray[K: (H5Array | H5AsTypeView, ZarrArray)](ZarrOrHDF5Wrapper): + """ + A wrapper class that exposes a :class:`zarr.Array` or :class:`h5py.Dataset` as an + :class:`xarray.backends.BackendArray` for non-dask lazy reading. + + Used in ``mode="simple"`` by :func:`~anndata.experimental.read_lazy`. + We do not guarantee the stability of this API beyond that guaranteed + by :class:`xarray.backends.BackendArray`. + """ + + +class BackedStringArray[K: (H5Array | H5AsTypeView, ZarrArray)](ZarrOrHDF5Wrapper): + """ + A wrapper class that exposes a :class:`zarr.Array` or :class:`h5py.Dataset` holding + variable-length strings as an :class:`xarray.backends.BackendArray` for non-dask lazy + reading. HDF5 bytes are decoded to str on access. + + Used in ``mode="simple"`` by :func:`~anndata.experimental.read_lazy`. + We do not guarantee the stability of this API beyond that guaranteed + by :class:`xarray.backends.BackendArray`. + """ + + def __init__(self, array: K) -> None: + # For HDF5, wrap with .astype("T") for transparent string decoding. + if isinstance(array, H5Array) and array.dtype.kind == "S": + array = array.astype("T") + super().__init__(array) + self.dtype = np.dtype("O") # object dtype for strings + + @_subset.register(XDataArray) def _subset_masked( a: XDataArray, subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm] @@ -218,3 +248,13 @@ def _(a: MaskedArray): @get_chunksize.register(CategoricalArray) def _(a: CategoricalArray): return get_chunksize(a._codes) + + +@get_chunksize.register(BackedArray) +def _(a: BackedArray): + return get_chunksize(a._wrapper) + + +@get_chunksize.register(BackedStringArray) +def _(a: BackedStringArray): + return get_chunksize(a._wrapper) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 97519ed19..2fabb4199 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1193,12 +1193,8 @@ class AccessTrackingStore(LocalStore): _accessed_keys: defaultdict[str, list[str]] def __init__(self, *args, **kwargs): - import traceback - traceback.print_stack() - print(kwargs) super().__init__(*args, **kwargs) - print(self._read_only) self._access_count = Counter() self._accessed = defaultdict(set) self._accessed_keys = defaultdict(list) diff --git a/tests/lazy/conftest.py b/tests/lazy/conftest.py index 5048a526b..d6bf2ed9a 100644 --- a/tests/lazy/conftest.py +++ b/tests/lazy/conftest.py @@ -35,6 +35,13 @@ def mtx_format(request): return request.param +@pytest.fixture( + params=["dask", "backed"], +) +def method(request) -> Literal["dask", "backed"]: + return request.param + + @pytest.fixture( params=[True, False], ids=["vars_different", "vars_same"], scope="session" ) @@ -231,7 +238,7 @@ def remote_store_tall_skinny( @pytest.fixture def adata_remote_tall_skinny( - remote_store_tall_skinny: AccessTrackingStore, + remote_store_tall_skinny: AccessTrackingStore, method: Literal["dask", "backed"] ) -> AnnData: - remote = read_lazy(remote_store_tall_skinny) + remote = read_lazy(remote_store_tall_skinny, method=method) return remote