From 4b5d7ab17866150279b74f128b64594fe4f92f57 Mon Sep 17 00:00:00 2001 From: Dominik Date: Thu, 8 Jan 2026 12:45:14 +0100 Subject: [PATCH 01/20] implement LazyCategoricalDtype --- .../experimental/backed/_lazy_arrays.py | 222 +++++++++++++++++- tests/lazy/test_read.py | 197 +++++++++++++++- 2 files changed, 399 insertions(+), 20 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index cb3a926ff..273ef9ce1 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -35,6 +35,202 @@ from xarray.core.indexing import ExplicitIndexer +class LazyCategoricalDtype(pd.CategoricalDtype): + """A CategoricalDtype that lazily loads categories from zarr/h5 storage. + + This dtype provides efficient access to categorical metadata without loading + all categories into memory. Categories are loaded lazily on first full access + and cached thereafter. + + Parameters + ---------- + categories_array + The underlying zarr or h5 array containing category values. + ordered + Whether the categorical is ordered. + + Examples + -------- + >>> dtype = col.dtype # LazyCategoricalDtype + >>> dtype.n_categories # cheap, metadata only + 100000 + >>> dtype.head_categories(5) # partial read, first 5 + array(['cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4'], dtype='>> dtype.tail_categories(3) # partial read, last 3 + array(['cat_99997', 'cat_99998', 'cat_99999'], dtype='>> dtype.categories # full load, cached + Index(['cat_0', 'cat_1', ...], dtype='str') + """ + + # Attributes that should be preserved during copying/pickling + _metadata = ("_categories_array", "_ordered_flag", "_cached_n_categories") + + def __new__( + cls, + categories_array: ZarrArray | H5Array | None = None, + *, + ordered: bool = False, + ): + # Create instance without calling parent __init__ with categories + instance = object.__new__(cls) + return instance + + def __init__( + self, + categories_array: ZarrArray | H5Array | None = None, + *, + ordered: bool = False, + ): + self._categories_array = categories_array + self._ordered_flag = bool(ordered) + self._cached_n_categories: int | None = None + self.__categories: pd.Index | None = ( + None # Double underscore to avoid conflicts + ) + + def _get_categories_array(self) -> ZarrArray | H5Array: + """Get the underlying categories array (handles both encodings). + + For string-array encoding: _categories_array is directly the array. + For nullable-string-array encoding: _categories_array is a Group with "values" key. + """ + if isinstance(self._categories_array, (ZarrArray, H5Array)): + return self._categories_array + # nullable-string-array encoding: Group with "values" and "mask" + return self._categories_array["values"] + + @property + def categories(self) -> pd.Index | None: + """Categories index. Loads all categories on first access and caches.""" + if self.__categories is None and self._categories_array is not None: + arr = self._get_categories_array() + if isinstance(arr, ZarrArray): + values = arr[...] + else: + from anndata.io import read_elem + + values = read_elem(self._categories_array) + self.__categories = pd.Index(values) + return self.__categories + + @property + def ordered(self) -> bool: + """Whether the categorical is ordered.""" + return self._ordered_flag + + @property + def n_categories(self) -> int: + """Number of categories (cheap, metadata only).""" + if self._cached_n_categories is not None: + return self._cached_n_categories + if self.__categories is not None: + return len(self.__categories) + if self._categories_array is not None: + n = self._get_categories_array().shape[0] + self._cached_n_categories = n + return n + return 0 + + def head_categories( + self, n: int = 5 + ) -> np.ndarray | pd.api.extensions.ExtensionArray: + """Return first n categories without loading all into memory. + + Parameters + ---------- + n + Number of categories to return. Default 5. + + Returns + ------- + np.ndarray or ExtensionArray + The first n categories. + + Examples + -------- + >>> dtype.head_categories() # first 5 + >>> dtype.head_categories(10) # first 10 + """ + # If already fully loaded, slice from cache + if self.__categories is not None: + return np.asarray(self.__categories[:n]) + + if self._categories_array is None: + return np.array([]) + + from anndata._io.specs.registry import read_elem_partial + + arr = self._get_categories_array() + total = self.n_categories + return read_elem_partial(arr, indices=slice(0, min(n, total))) + + def tail_categories( + self, n: int = 5 + ) -> np.ndarray | pd.api.extensions.ExtensionArray: + """Return last n categories without loading all into memory. + + Parameters + ---------- + n + Number of categories to return. Default 5. + + Returns + ------- + np.ndarray or ExtensionArray + The last n categories. + + Examples + -------- + >>> dtype.tail_categories() # last 5 + >>> dtype.tail_categories(10) # last 10 + """ + # If already fully loaded, slice from cache + if self.__categories is not None: + return np.asarray(self.__categories[-n:]) + + if self._categories_array is None: + return np.array([]) + + from anndata._io.specs.registry import read_elem_partial + + arr = self._get_categories_array() + total = self.n_categories + start = max(total - n, 0) + return read_elem_partial(arr, indices=slice(start, total)) + + def __repr__(self) -> str: + if self.__categories is not None: + # Fully loaded - use standard repr + return f"CategoricalDtype(categories={self.__categories!r}, ordered={self.ordered})" + return f"LazyCategoricalDtype(n_categories={self.n_categories}, ordered={self.ordered})" + + @property + def name(self) -> str: + """String identifier for this dtype.""" + return "category" + + def __hash__(self) -> int: + # Need to be hashable for pandas internals + return hash((id(self._categories_array), self._ordered_flag)) + + def __eq__(self, other) -> bool: + if isinstance(other, LazyCategoricalDtype): + return ( + self._categories_array is other._categories_array + and self._ordered_flag == other._ordered_flag + ) + if isinstance(other, pd.CategoricalDtype): + # Compare with regular CategoricalDtype - need to load categories + if self.ordered != other.ordered: + return False + if other.categories is None: + return self.categories is None + if self.categories is None: + return False + return self.categories.equals(other.categories) + return False + + class ZarrOrHDF5Wrapper[K: (H5Array, ZarrArray)](XZarrArrayWrapper): def __init__(self, array: K) -> None: self.chunks = array.chunks @@ -85,7 +281,7 @@ class CategoricalArray[K: (H5Array, ZarrArray)](XBackendArray): """ _codes: ZarrOrHDF5Wrapper[K] - _categories: ZarrArray | H5Array + _categories_array: ZarrArray | H5Array shape: tuple[int, ...] base_path_or_zarr_group: Path | ZarrGroup elem_name: str @@ -100,21 +296,22 @@ def __init__( ordered: bool, **kwargs, ): - self._categories = categories + self._categories_array = categories self._ordered = ordered self._codes = ZarrOrHDF5Wrapper(codes) self.shape = self._codes.shape self.base_path_or_zarr_group = base_path_or_zarr_group self.file_format = "zarr" if isinstance(codes, ZarrArray) else "h5" self.elem_name = elem_name + # Create the lazy dtype - this is where categories are cached + self._lazy_dtype = LazyCategoricalDtype( + categories_array=categories, ordered=ordered + ) - @cached_property - def categories(self) -> np.ndarray: - if isinstance(self._categories, ZarrArray): - return self._categories[...] - from anndata.io import read_elem - - return read_elem(self._categories) + @property + def categories(self) -> pd.Index | None: + """All categories. Loads and caches on first access.""" + return self._lazy_dtype.categories def __getitem__(self, key: ExplicitIndexer) -> PandasExtensionArray: from xarray.core.extension_array import PandasExtensionArray @@ -127,9 +324,10 @@ def __getitem__(self, key: ExplicitIndexer) -> PandasExtensionArray: categorical_array = categorical_array.remove_unused_categories() return PandasExtensionArray(categorical_array) - @cached_property - def dtype(self): - return pd.CategoricalDtype(categories=self.categories, ordered=self._ordered) + @property + def dtype(self) -> LazyCategoricalDtype: + """The dtype with lazy category loading support.""" + return self._lazy_dtype # circumvent https://github.com/tox-dev/sphinx-autodoc-typehints/issues/580 diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index f65849d21..f1c3c0829 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -111,18 +111,29 @@ def test_access_count_dtype( adata_remote_tall_skinny: AnnData, adata_remote_with_store_tall_skinny_path: Path, ) -> None: - adata_orig = read_zarr(adata_remote_with_store_tall_skinny_path) - remote_store_tall_skinny.initialize_key_trackers(["obs/cat/categories"]) remote_store_tall_skinny.assert_access_count("obs/cat/categories", 0) - count_expected = 2 if adata_orig.obs["cat"].cat.categories.dtype == "string" else 1 - # This should only cause categories to be read in once (and their mask if applicable) - adata_remote_tall_skinny.obs["cat"].dtype # noqa: B018 - remote_store_tall_skinny.assert_access_count("obs/cat/categories", count_expected) - adata_remote_tall_skinny.obs["cat"].dtype # noqa: B018 + # Accessing dtype alone should NOT load categories (lazy loading) adata_remote_tall_skinny.obs["cat"].dtype # noqa: B018 - remote_store_tall_skinny.assert_access_count("obs/cat/categories", count_expected) + remote_store_tall_skinny.assert_access_count("obs/cat/categories", 0) + + # n_categories should also be cheap (metadata only) + _ = adata_remote_tall_skinny.obs["cat"].dtype.n_categories + remote_store_tall_skinny.assert_access_count("obs/cat/categories", 0) + + # Accessing categories should trigger loading (once, then cached) + count_before = remote_store_tall_skinny.get_access_count("obs/cat/categories") + _ = adata_remote_tall_skinny.obs["cat"].dtype.categories + count_after = remote_store_tall_skinny.get_access_count("obs/cat/categories") + assert count_after > count_before, "categories access should trigger read" + + # Subsequent accesses should use cache (no additional reads) + _ = adata_remote_tall_skinny.obs["cat"].dtype.categories + _ = adata_remote_tall_skinny.obs["cat"].dtype.categories + assert ( + remote_store_tall_skinny.get_access_count("obs/cat/categories") == count_after + ), "cached categories should not trigger additional reads" def test_uns_uses_dask(adata_remote: AnnData): @@ -234,3 +245,173 @@ def test_chunks_df( for k in ds: if isinstance(arr := ds[k].data, DaskArray): assert arr.chunksize == expected_chunks + + +@pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) +def test_lazy_categorical_dtype_n_categories(tmp_path: Path, diskfmt: str): + """Test LazyCategoricalDtype.n_categories is cheap (metadata only).""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + n_cats = 100 + categories = [f"Cat_{i:03d}" for i in range(n_cats)] + adata = AnnData( + X=np.zeros((n_cats, 2)), + obs=pd.DataFrame({"cell_type": pd.Categorical(categories)}), + ) + + path = tmp_path / f"test.{diskfmt}" + getattr(adata, f"write_{diskfmt}")(path) + + lazy = read_lazy(path) + dtype = lazy.obs["cell_type"].dtype + + # dtype should be LazyCategoricalDtype + assert isinstance(dtype, LazyCategoricalDtype) + + # n_categories should work without loading all categories + assert dtype.n_categories == n_cats + + # ordered should be accessible + assert dtype.ordered is False + + +@pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) +def test_lazy_categorical_dtype_head_tail_categories(tmp_path: Path, diskfmt: str): + """Test LazyCategoricalDtype.head_categories and tail_categories for partial reads.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + n_cats = 50 + categories = [f"Type_{i:02d}" for i in range(n_cats)] + adata = AnnData( + X=np.zeros((n_cats, 2)), + obs=pd.DataFrame({"cell_type": pd.Categorical(categories)}), + ) + + path = tmp_path / f"test.{diskfmt}" + getattr(adata, f"write_{diskfmt}")(path) + + lazy = read_lazy(path) + dtype = lazy.obs["cell_type"].dtype + assert isinstance(dtype, LazyCategoricalDtype) + + # Test head_categories (first n) + first5 = dtype.head_categories(5) + assert len(first5) == 5 + assert list(first5) == [f"Type_{i:02d}" for i in range(5)] + + # Test head_categories default (first 5) + default_head = dtype.head_categories() + assert len(default_head) == 5 + assert list(default_head) == [f"Type_{i:02d}" for i in range(5)] + + # Test tail_categories (last n) + last3 = dtype.tail_categories(3) + assert len(last3) == 3 + assert list(last3) == [f"Type_{i:02d}" for i in range(47, 50)] + + # Test tail_categories default (last 5) + default_tail = dtype.tail_categories() + assert len(default_tail) == 5 + assert list(default_tail) == [f"Type_{i:02d}" for i in range(45, 50)] + + # Test requesting more than available + all_head = dtype.head_categories(100) + assert len(all_head) == n_cats + assert list(all_head) == categories + + all_tail = dtype.tail_categories(100) + assert len(all_tail) == n_cats + assert list(all_tail) == categories + + +@pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) +def test_lazy_categorical_dtype_categories_caching(tmp_path: Path, diskfmt: str): + """Test that categories are cached after full load.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + categories = ["a", "b", "c", "d", "e"] + adata = AnnData( + X=np.zeros((5, 2)), + obs=pd.DataFrame({"cat": pd.Categorical(categories)}), + ) + + path = tmp_path / f"test.{diskfmt}" + getattr(adata, f"write_{diskfmt}")(path) + + lazy = read_lazy(path) + dtype = lazy.obs["cat"].dtype + assert isinstance(dtype, LazyCategoricalDtype) + + # Before loading, categories should not be cached + # (accessing internal state for testing) + assert dtype._LazyCategoricalDtype__categories is None + + # Load categories + cats = dtype.categories + assert cats is not None + assert list(cats) == categories + + # After loading, should be cached + assert dtype._LazyCategoricalDtype__categories is not None + + # head_categories should now use cache + head = dtype.head_categories(3) + assert list(head) == ["a", "b", "c"] + + +@pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) +def test_lazy_categorical_dtype_ordered(tmp_path: Path, diskfmt: str): + """Test LazyCategoricalDtype with ordered categories.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + adata = AnnData( + X=np.zeros((10, 2)), + obs=pd.DataFrame({ + "ordered_cat": pd.Categorical( + ["low", "medium", "high"] * 3 + ["low"], + categories=["low", "medium", "high"], + ordered=True, + ) + }), + ) + + path = tmp_path / f"test.{diskfmt}" + getattr(adata, f"write_{diskfmt}")(path) + + lazy = read_lazy(path) + dtype = lazy.obs["ordered_cat"].dtype + assert isinstance(dtype, LazyCategoricalDtype) + + assert dtype.ordered is True + assert dtype.n_categories == 3 + assert list(dtype.categories) == ["low", "medium", "high"] + + +def test_lazy_categorical_dtype_repr(tmp_path: Path): + """Test LazyCategoricalDtype repr before and after loading.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + categories = [f"cat_{i}" for i in range(100)] + adata = AnnData( + X=np.zeros((100, 2)), + obs=pd.DataFrame({"cat": pd.Categorical(categories)}), + ) + + path = tmp_path / "test.zarr" + adata.write_zarr(path) + + lazy = read_lazy(path) + dtype = lazy.obs["cat"].dtype + assert isinstance(dtype, LazyCategoricalDtype) + + # Before loading: lazy repr + repr_before = repr(dtype) + assert "LazyCategoricalDtype" in repr_before + assert "n_categories=100" in repr_before + + # Load categories + _ = dtype.categories + + # After loading: standard CategoricalDtype repr + repr_after = repr(dtype) + assert "CategoricalDtype" in repr_after From 7edb51062bcd0bd71d77210242337d35fa3c678f Mon Sep 17 00:00:00 2001 From: Dominik Date: Thu, 8 Jan 2026 13:26:46 +0100 Subject: [PATCH 02/20] fix: LazyCategoricalDtype.__eq__ handle string comparison The merge code checks `dtype == "category"` which requires LazyCategoricalDtype to handle string comparison in __eq__. --- .../experimental/backed/_lazy_arrays.py | 53 +++++++++++-------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 273ef9ce1..0c8397a12 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -51,15 +51,17 @@ class LazyCategoricalDtype(pd.CategoricalDtype): Examples -------- - >>> dtype = col.dtype # LazyCategoricalDtype - >>> dtype.n_categories # cheap, metadata only - 100000 - >>> dtype.head_categories(5) # partial read, first 5 - array(['cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4'], dtype='>> dtype.tail_categories(3) # partial read, last 3 - array(['cat_99997', 'cat_99998', 'cat_99999'], dtype='>> dtype.categories # full load, cached - Index(['cat_0', 'cat_1', ...], dtype='str') + .. code-block:: python + + dtype = col.dtype # LazyCategoricalDtype + dtype.n_categories # cheap, metadata only + # 100000 + dtype.head_categories(5) # partial read, first 5 + # array(['cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4'], dtype='>> dtype.head_categories() # first 5 - >>> dtype.head_categories(10) # first 10 + .. code-block:: python + + dtype.head_categories() # first 5 + dtype.head_categories(10) # first 10 """ # If already fully loaded, slice from cache if self.__categories is not None: @@ -181,8 +185,10 @@ def tail_categories( Examples -------- - >>> dtype.tail_categories() # last 5 - >>> dtype.tail_categories(10) # last 10 + .. code-block:: python + + dtype.tail_categories() # last 5 + dtype.tail_categories(10) # last 10 """ # If already fully loaded, slice from cache if self.__categories is not None: @@ -214,21 +220,22 @@ def __hash__(self) -> int: return hash((id(self._categories_array), self._ordered_flag)) def __eq__(self, other) -> bool: + # Handle string comparison (e.g., dtype == "category") + if isinstance(other, str): + return other == self.name if isinstance(other, LazyCategoricalDtype): return ( self._categories_array is other._categories_array and self._ordered_flag == other._ordered_flag ) - if isinstance(other, pd.CategoricalDtype): - # Compare with regular CategoricalDtype - need to load categories - if self.ordered != other.ordered: - return False - if other.categories is None: - return self.categories is None - if self.categories is None: - return False - return self.categories.equals(other.categories) - return False + if not isinstance(other, pd.CategoricalDtype): + return False + # Compare with regular CategoricalDtype - need to load categories + if self.ordered != other.ordered: + return False + if other.categories is None or self.categories is None: + return other.categories is None and self.categories is None + return self.categories.equals(other.categories) class ZarrOrHDF5Wrapper[K: (H5Array, ZarrArray)](XZarrArrayWrapper): From 90ac52e8372b9cfb9088b4fb69a917c8540c04a1 Mon Sep 17 00:00:00 2001 From: Dominik Date: Thu, 8 Jan 2026 15:13:07 +0100 Subject: [PATCH 03/20] increase testing coverage of LazyCategoricalDtype --- tests/lazy/test_read.py | 171 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index f1c3c0829..17069f5a1 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -415,3 +415,174 @@ def test_lazy_categorical_dtype_repr(tmp_path: Path): # After loading: standard CategoricalDtype repr repr_after = repr(dtype) assert "CategoricalDtype" in repr_after + + +def test_lazy_categorical_dtype_equality(tmp_path: Path): + """Test LazyCategoricalDtype equality comparisons.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + categories = ["a", "b", "c"] + adata = AnnData( + X=np.zeros((3, 2)), + obs=pd.DataFrame({"cat": pd.Categorical(categories)}), + ) + + path = tmp_path / "test.zarr" + adata.write_zarr(path) + + lazy = read_lazy(path) + dtype = lazy.obs["cat"].dtype + assert isinstance(dtype, LazyCategoricalDtype) + + # Test string comparison (dtype == "category") + assert dtype == "category" + assert not (dtype == "int64") + + # Test comparison with regular CategoricalDtype + regular_dtype = pd.CategoricalDtype(categories=["a", "b", "c"], ordered=False) + assert dtype == regular_dtype + + # Test comparison with different categories + different_dtype = pd.CategoricalDtype(categories=["x", "y", "z"], ordered=False) + assert not (dtype == different_dtype) + + # Test comparison with different ordered flag + ordered_dtype = pd.CategoricalDtype(categories=["a", "b", "c"], ordered=True) + assert not (dtype == ordered_dtype) + + # Test comparison with non-CategoricalDtype + assert not (dtype == np.dtype("int64")) + assert not (dtype == 123) + assert not (dtype == None) + + +def test_lazy_categorical_dtype_equality_same_array(tmp_path: Path): + """Test LazyCategoricalDtype equality between instances with same underlying array.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + categories = ["x", "y", "z"] + adata = AnnData( + X=np.zeros((3, 2)), + obs=pd.DataFrame({"cat": pd.Categorical(categories)}), + ) + + path = tmp_path / "test.zarr" + adata.write_zarr(path) + + lazy = read_lazy(path) + dtype1 = lazy.obs["cat"].dtype + dtype2 = lazy.obs["cat"].dtype # Same underlying array + + # Same object should be equal + assert dtype1 is dtype2 # They are the same instance + assert dtype1 == dtype2 + + +def test_lazy_categorical_dtype_hash(tmp_path: Path): + """Test LazyCategoricalDtype is hashable.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + categories = ["a", "b", "c"] + adata = AnnData( + X=np.zeros((3, 2)), + obs=pd.DataFrame({"cat": pd.Categorical(categories)}), + ) + + path = tmp_path / "test.zarr" + adata.write_zarr(path) + + lazy = read_lazy(path) + dtype = lazy.obs["cat"].dtype + assert isinstance(dtype, LazyCategoricalDtype) + + # Should be hashable (required for pandas internals) + h = hash(dtype) + assert isinstance(h, int) + + # Can be used in a set + s = {dtype} + assert dtype in s + + +def test_lazy_categorical_dtype_n_categories_from_cache(tmp_path: Path): + """Test n_categories returns from cache when categories already loaded.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + categories = ["a", "b", "c", "d", "e"] + adata = AnnData( + X=np.zeros((5, 2)), + obs=pd.DataFrame({"cat": pd.Categorical(categories)}), + ) + + path = tmp_path / "test.zarr" + adata.write_zarr(path) + + lazy = read_lazy(path) + dtype = lazy.obs["cat"].dtype + assert isinstance(dtype, LazyCategoricalDtype) + + # Load categories first + cats = dtype.categories + assert cats is not None + + # Now n_categories should return from cached categories + assert dtype.n_categories == 5 + + +def test_lazy_categorical_dtype_empty_array(): + """Test LazyCategoricalDtype with None categories_array.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + # Create dtype with None categories_array + dtype = LazyCategoricalDtype(categories_array=None, ordered=False) + + # Properties should handle None gracefully + assert dtype.n_categories == 0 + assert dtype.categories is None + + # head_categories and tail_categories should return empty arrays + head = dtype.head_categories(5) + assert len(head) == 0 + + tail = dtype.tail_categories(5) + assert len(tail) == 0 + + # repr should still work + r = repr(dtype) + assert "LazyCategoricalDtype" in r + assert "n_categories=0" in r + + +def test_lazy_categorical_dtype_name(tmp_path: Path): + """Test LazyCategoricalDtype.name property.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + categories = ["a", "b"] + adata = AnnData( + X=np.zeros((2, 2)), + obs=pd.DataFrame({"cat": pd.Categorical(categories)}), + ) + + path = tmp_path / "test.zarr" + adata.write_zarr(path) + + lazy = read_lazy(path) + dtype = lazy.obs["cat"].dtype + assert isinstance(dtype, LazyCategoricalDtype) + + # name should be "category" + assert dtype.name == "category" + + +def test_lazy_categorical_dtype_equality_with_none_categories(tmp_path: Path): + """Test LazyCategoricalDtype equality when comparing dtypes with None categories.""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + # Create dtype with None categories + dtype1 = LazyCategoricalDtype(categories_array=None, ordered=False) + + # Regular CategoricalDtype without categories set + dtype2 = pd.CategoricalDtype(categories=None, ordered=False) + + # Both have None categories, should be equal + assert dtype1 == dtype2 From c6a68da71baf3c4d6722f057580abb7fd9d6b804 Mon Sep 17 00:00:00 2001 From: Dominik Date: Thu, 8 Jan 2026 16:18:43 +0100 Subject: [PATCH 04/20] manipulate cache for better testing --- tests/lazy/test_read.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 2b6c6504e..da2ac9c7d 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -354,9 +354,12 @@ def test_lazy_categorical_dtype_categories_caching(tmp_path: Path, diskfmt: str) # After loading, should be cached assert dtype._LazyCategoricalDtype__categories is not None - # head_categories should now use cache + # Verify head/tail_categories use cache by modifying cache + dtype._LazyCategoricalDtype__categories = pd.Index(["x", "y", "z", "w", "v"]) head = dtype.head_categories(3) - assert list(head) == ["a", "b", "c"] + assert list(head) == ["x", "y", "z"] # Returns cached values, not disk values + tail = dtype.tail_categories(3) + assert list(tail) == ["z", "w", "v"] # Returns cached values, not disk values @pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) @@ -524,8 +527,9 @@ def test_lazy_categorical_dtype_n_categories_from_cache(tmp_path: Path): cats = dtype.categories assert cats is not None - # Now n_categories should return from cached categories - assert dtype.n_categories == 5 + # Verify n_categories uses cache by modifying cache + dtype._LazyCategoricalDtype__categories = pd.Index(["x", "y", "z"]) + assert dtype.n_categories == 3 # Returns cached length, not disk length def test_lazy_categorical_dtype_empty_array(): From 92b7bfa8773057b2ea0b3cf93c213dcbe8445db8 Mon Sep 17 00:00:00 2001 From: Dominik Date: Thu, 8 Jan 2026 19:03:55 +0100 Subject: [PATCH 05/20] refactor(LazyCategoricalDtype): implement review suggestions - Use @cached_property for categories (cleaner than manual caching) - Simplify cache detection to "categories" in self.__dict__ - Remove _cached_n_categories double caching (use shape[0] directly) - Rename _categories_array to _categories_elem (reflects group case) - Extract _read_partial_categories helper to deduplicate head/tail - Add ZarrGroup | H5Group to type annotation (code handles it) --- .../experimental/backed/_lazy_arrays.py | 137 ++++++++++-------- tests/lazy/test_read.py | 25 ++-- 2 files changed, 86 insertions(+), 76 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 8e1bd2558..268b39a34 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -27,7 +27,7 @@ from pandas._libs.missing import NAType from pandas.core.dtypes.base import ExtensionDtype - from anndata.compat import ZarrGroup + from anndata.compat import H5Group, ZarrGroup from ...compat import Index1DNorm @@ -40,13 +40,15 @@ class LazyCategoricalDtype(pd.CategoricalDtype): """A CategoricalDtype that lazily loads categories from zarr/h5 storage. This dtype provides efficient access to categorical metadata without loading - all categories into memory. Categories are loaded lazily on first full access - and cached thereafter. + all categories into memory. Use :meth:`n_categories`, :meth:`head_categories`, + and :meth:`tail_categories` for efficient partial access. The full + :attr:`categories` are loaded lazily on first access and cached thereafter. Parameters ---------- - categories_array - The underlying zarr or h5 array containing category values. + categories_elem + The underlying zarr or h5 array (or group for nullable-string-array + encoding) containing category values. Can be None for empty dtype. ordered Whether the categorical is ordered. @@ -66,11 +68,11 @@ class LazyCategoricalDtype(pd.CategoricalDtype): """ # Attributes that should be preserved during copying/pickling - _metadata = ("_categories_array", "_ordered_flag", "_cached_n_categories") + _metadata = ("_categories_elem", "_ordered_flag") def __new__( cls, - categories_array: ZarrArray | H5Array | None = None, + categories_elem: ZarrArray | H5Array | ZarrGroup | H5Group | None = None, *, ordered: bool = False, ): @@ -80,41 +82,40 @@ def __new__( def __init__( self, - categories_array: ZarrArray | H5Array | None = None, + categories_elem: ZarrArray | H5Array | ZarrGroup | H5Group | None = None, *, ordered: bool = False, ): - self._categories_array = categories_array + # Can be None for edge cases (empty dtype). See test_lazy_categorical_dtype_empty_array. + self._categories_elem = categories_elem self._ordered_flag = bool(ordered) - self._cached_n_categories: int | None = None - self.__categories: pd.Index | None = ( - None # Double underscore to avoid conflicts - ) def _get_categories_array(self) -> ZarrArray | H5Array: - """Get the underlying categories array (handles both encodings). + """Get the underlying categories array. - For string-array encoding: _categories_array is directly the array. - For nullable-string-array encoding: _categories_array is a Group with "values" key. + For string-array encoding: _categories_elem is directly the array. + For nullable-string-array encoding: _categories_elem would be a Group + with "values" key (not currently used for categories in anndata, but + handled defensively). """ - if isinstance(self._categories_array, (ZarrArray, H5Array)): - return self._categories_array + if isinstance(self._categories_elem, (ZarrArray, H5Array)): + return self._categories_elem # nullable-string-array encoding: Group with "values" and "mask" - return self._categories_array["values"] + return self._categories_elem["values"] - @property + @cached_property def categories(self) -> pd.Index | None: """Categories index. Loads all categories on first access and caches.""" - if self.__categories is None and self._categories_array is not None: - arr = self._get_categories_array() - if isinstance(arr, ZarrArray): - values = arr[...] - else: - from anndata.io import read_elem + if self._categories_elem is None: + return None + arr = self._get_categories_array() + if isinstance(arr, ZarrArray): + values = arr[...] + else: + from anndata.io import read_elem - values = read_elem(self._categories_array) - self.__categories = pd.Index(values) - return self.__categories + values = read_elem(self._categories_elem) + return pd.Index(values) @property def ordered(self) -> bool: @@ -124,15 +125,23 @@ def ordered(self) -> bool: @property def n_categories(self) -> int: """Number of categories (cheap, metadata only).""" - if self._cached_n_categories is not None: - return self._cached_n_categories - if self.__categories is not None: - return len(self.__categories) - if self._categories_array is not None: - n = self._get_categories_array().shape[0] - self._cached_n_categories = n - return n - return 0 + if self._categories_elem is None: + return 0 + if "categories" in self.__dict__: + return len(self.categories) + return self._get_categories_array().shape[0] + + def _read_partial_categories( + self, start: int, stop: int + ) -> np.ndarray | pd.api.extensions.ExtensionArray: + """Read a slice of categories from disk. + + Uses read_elem_partial for proper HDF5 string decoding. + """ + from anndata._io.specs.registry import read_elem_partial + + arr = self._get_categories_array() + return read_elem_partial(arr, indices=slice(start, stop)) def head_categories( self, n: int = 5 @@ -156,18 +165,15 @@ def head_categories( dtype.head_categories() # first 5 dtype.head_categories(10) # first 10 """ - # If already fully loaded, slice from cache - if self.__categories is not None: - return np.asarray(self.__categories[:n]) - - if self._categories_array is None: + if self._categories_elem is None: return np.array([]) - from anndata._io.specs.registry import read_elem_partial + # If already fully loaded, slice from cache + if "categories" in self.__dict__: + return np.asarray(self.categories[:n]) - arr = self._get_categories_array() total = self.n_categories - return read_elem_partial(arr, indices=slice(0, min(n, total))) + return self._read_partial_categories(0, min(n, total)) def tail_categories( self, n: int = 5 @@ -191,34 +197,39 @@ def tail_categories( dtype.tail_categories() # last 5 dtype.tail_categories(10) # last 10 """ - # If already fully loaded, slice from cache - if self.__categories is not None: - return np.asarray(self.__categories[-n:]) - - if self._categories_array is None: + if self._categories_elem is None: return np.array([]) - from anndata._io.specs.registry import read_elem_partial + # If already fully loaded, slice from cache + if "categories" in self.__dict__: + return np.asarray(self.categories[-n:]) - arr = self._get_categories_array() total = self.n_categories start = max(total - n, 0) - return read_elem_partial(arr, indices=slice(start, total)) + return self._read_partial_categories(start, total) def __repr__(self) -> str: - if self.__categories is not None: + if "categories" in self.__dict__ and self.categories is not None: # Fully loaded - use standard repr - return f"CategoricalDtype(categories={self.__categories!r}, ordered={self.ordered})" + return f"CategoricalDtype(categories={self.categories!r}, ordered={self.ordered})" return f"LazyCategoricalDtype(n_categories={self.n_categories}, ordered={self.ordered})" @property def name(self) -> str: - """String identifier for this dtype.""" + """String identifier for this dtype. + + Required for string comparison (e.g., dtype == "category") used in + anndata merge operations. + """ return "category" def __hash__(self) -> int: - # Need to be hashable for pandas internals - return hash((id(self._categories_array), self._ordered_flag)) + """Hash based on identity of underlying array and ordered flag. + + Required for use in sets and as dictionary keys (e.g., collecting + unique dtypes across AnnData objects). + """ + return hash((id(self._categories_elem), self._ordered_flag)) def __eq__(self, other) -> bool: # Handle string comparison (e.g., dtype == "category") @@ -226,7 +237,7 @@ def __eq__(self, other) -> bool: return other == self.name if isinstance(other, LazyCategoricalDtype): return ( - self._categories_array is other._categories_array + self._categories_elem is other._categories_elem and self._ordered_flag == other._ordered_flag ) if not isinstance(other, pd.CategoricalDtype): @@ -290,7 +301,7 @@ class CategoricalArray[K: (H5Array, ZarrArray)](XBackendArray): """ _codes: ZarrOrHDF5Wrapper[K] - _categories_array: K + _categories_elem: K shape: tuple[int, ...] base_path_or_zarr_group: Path | ZarrGroup elem_name: str @@ -305,7 +316,7 @@ def __init__( ordered: bool, **kwargs, ): - self._categories_array = categories + self._categories_elem = categories self._ordered = ordered self._codes = ZarrOrHDF5Wrapper(codes) self.shape = self._codes.shape @@ -314,7 +325,7 @@ def __init__( self.elem_name = elem_name # Create the lazy dtype - this is where categories are cached self._lazy_dtype = LazyCategoricalDtype( - categories_array=categories, ordered=ordered + categories_elem=categories, ordered=ordered ) @property diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index da2ac9c7d..24540848e 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -342,20 +342,19 @@ def test_lazy_categorical_dtype_categories_caching(tmp_path: Path, diskfmt: str) dtype = lazy.obs["cat"].dtype assert isinstance(dtype, LazyCategoricalDtype) - # Before loading, categories should not be cached - # (accessing internal state for testing) - assert dtype._LazyCategoricalDtype__categories is None + # Before loading, categories should not be cached (uses @cached_property) + assert "categories" not in dtype.__dict__ # Load categories cats = dtype.categories assert cats is not None assert list(cats) == categories - # After loading, should be cached - assert dtype._LazyCategoricalDtype__categories is not None + # After loading, should be cached in __dict__ (cached_property pattern) + assert "categories" in dtype.__dict__ - # Verify head/tail_categories use cache by modifying cache - dtype._LazyCategoricalDtype__categories = pd.Index(["x", "y", "z", "w", "v"]) + # Verify head/tail_categories use cache by modifying the cached value + dtype.__dict__["categories"] = pd.Index(["x", "y", "z", "w", "v"]) head = dtype.head_categories(3) assert list(head) == ["x", "y", "z"] # Returns cached values, not disk values tail = dtype.tail_categories(3) @@ -527,17 +526,17 @@ def test_lazy_categorical_dtype_n_categories_from_cache(tmp_path: Path): cats = dtype.categories assert cats is not None - # Verify n_categories uses cache by modifying cache - dtype._LazyCategoricalDtype__categories = pd.Index(["x", "y", "z"]) + # Verify n_categories uses cache by modifying the cached value + dtype.__dict__["categories"] = pd.Index(["x", "y", "z"]) assert dtype.n_categories == 3 # Returns cached length, not disk length def test_lazy_categorical_dtype_empty_array(): - """Test LazyCategoricalDtype with None categories_array.""" + """Test LazyCategoricalDtype with None categories_elem.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - # Create dtype with None categories_array - dtype = LazyCategoricalDtype(categories_array=None, ordered=False) + # Create dtype with None categories_elem + dtype = LazyCategoricalDtype(categories_elem=None, ordered=False) # Properties should handle None gracefully assert dtype.n_categories == 0 @@ -582,7 +581,7 @@ def test_lazy_categorical_dtype_equality_with_none_categories(tmp_path: Path): from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype # Create dtype with None categories - dtype1 = LazyCategoricalDtype(categories_array=None, ordered=False) + dtype1 = LazyCategoricalDtype(categories_elem=None, ordered=False) # Regular CategoricalDtype without categories set dtype2 = pd.CategoricalDtype(categories=None, ordered=False) From 03fe0b042e408a4e8813a69322fc5cd658d64319 Mon Sep 17 00:00:00 2001 From: Dominik Date: Thu, 8 Jan 2026 19:10:21 +0100 Subject: [PATCH 06/20] remove unnecessary docstring --- .../experimental/backed/_lazy_arrays.py | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 268b39a34..8a710dba0 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -40,9 +40,9 @@ class LazyCategoricalDtype(pd.CategoricalDtype): """A CategoricalDtype that lazily loads categories from zarr/h5 storage. This dtype provides efficient access to categorical metadata without loading - all categories into memory. Use :meth:`n_categories`, :meth:`head_categories`, - and :meth:`tail_categories` for efficient partial access. The full - :attr:`categories` are loaded lazily on first access and cached thereafter. + all categories into memory via :meth:`head_categories`, :meth:`tail_categories`, + and :attr:`n_categories`. Accessing :attr:`categories` will load all categories + into memory. Parameters ---------- @@ -51,20 +51,6 @@ class LazyCategoricalDtype(pd.CategoricalDtype): encoding) containing category values. Can be None for empty dtype. ordered Whether the categorical is ordered. - - Examples - -------- - .. code-block:: python - - dtype = col.dtype # LazyCategoricalDtype - dtype.n_categories # cheap, metadata only - # 100000 - dtype.head_categories(5) # partial read, first 5 - # array(['cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4'], dtype=' Date: Thu, 8 Jan 2026 19:27:32 +0100 Subject: [PATCH 07/20] remove remaining docstring examples --- src/anndata/experimental/backed/_lazy_arrays.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 8a710dba0..d10deba38 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -143,13 +143,6 @@ def head_categories( ------- np.ndarray or ExtensionArray The first n categories. - - Examples - -------- - .. code-block:: python - - dtype.head_categories() # first 5 - dtype.head_categories(10) # first 10 """ if self._categories_elem is None: return np.array([]) @@ -175,13 +168,6 @@ def tail_categories( ------- np.ndarray or ExtensionArray The last n categories. - - Examples - -------- - .. code-block:: python - - dtype.tail_categories() # last 5 - dtype.tail_categories(10) # last 10 """ if self._categories_elem is None: return np.array([]) From cc06639a019ba9b1cce70cdeb77041711d71e6ee Mon Sep 17 00:00:00 2001 From: Dominik Date: Fri, 9 Jan 2026 15:34:02 +0100 Subject: [PATCH 08/20] refactor(LazyCategoricalDtype): address second round of review feedback - Remove `name` property (inherited from CategoricalDtype) - Remove `None` support from type annotations and guards - Simplify `categories` property to use `read_elem` uniformly - Unify `head_categories`/`tail_categories` into `_get_categories_slice` helper - Keep `bool(ordered)` - required because HDF5 returns np.bool_ - Refactor tests to use `write_elem`/`read_elem_lazy` directly - Update equality check for `None` categories comparison --- .../experimental/backed/_lazy_arrays.py | 87 ++++++++---------- tests/lazy/test_read.py | 92 +++++++------------ 2 files changed, 71 insertions(+), 108 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index d10deba38..24a7b01cf 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -48,7 +48,7 @@ class LazyCategoricalDtype(pd.CategoricalDtype): ---------- categories_elem The underlying zarr or h5 array (or group for nullable-string-array - encoding) containing category values. Can be None for empty dtype. + encoding) containing category values. ordered Whether the categorical is ordered. """ @@ -58,7 +58,7 @@ class LazyCategoricalDtype(pd.CategoricalDtype): def __new__( cls, - categories_elem: ZarrArray | H5Array | ZarrGroup | H5Group | None = None, + categories_elem: ZarrArray | H5Array | ZarrGroup | H5Group, *, ordered: bool = False, ): @@ -68,11 +68,10 @@ def __new__( def __init__( self, - categories_elem: ZarrArray | H5Array | ZarrGroup | H5Group | None = None, + categories_elem: ZarrArray | H5Array | ZarrGroup | H5Group, *, ordered: bool = False, ): - # Can be None for edge cases (empty dtype). See test_lazy_categorical_dtype_empty_array. self._categories_elem = categories_elem self._ordered_flag = bool(ordered) @@ -90,18 +89,11 @@ def _get_categories_array(self) -> ZarrArray | H5Array: return self._categories_elem["values"] @cached_property - def categories(self) -> pd.Index | None: + def categories(self) -> pd.Index: """Categories index. Loads all categories on first access and caches.""" - if self._categories_elem is None: - return None - arr = self._get_categories_array() - if isinstance(arr, ZarrArray): - values = arr[...] - else: - from anndata.io import read_elem + from anndata.io import read_elem - values = read_elem(self._categories_elem) - return pd.Index(values) + return pd.Index(read_elem(self._categories_elem)) @property def ordered(self) -> bool: @@ -111,22 +103,41 @@ def ordered(self) -> bool: @property def n_categories(self) -> int: """Number of categories (cheap, metadata only).""" - if self._categories_elem is None: - return 0 if "categories" in self.__dict__: return len(self.categories) return self._get_categories_array().shape[0] - def _read_partial_categories( - self, start: int, stop: int + def _get_categories_slice( + self, n: int, *, from_end: bool = False ) -> np.ndarray | pd.api.extensions.ExtensionArray: - """Read a slice of categories from disk. + """Get n categories from start or end. + + Parameters + ---------- + n + Number of categories to return. + from_end + If True, return last n categories. If False, return first n. - Uses read_elem_partial for proper HDF5 string decoding. + Returns + ------- + np.ndarray or ExtensionArray + The requested categories. """ + # If already fully loaded, slice from cache + if "categories" in self.__dict__: + sliced = self.categories[-n:] if from_end else self.categories[:n] + return np.asarray(sliced) + + # Read partial from disk from anndata._io.specs.registry import read_elem_partial arr = self._get_categories_array() + total = arr.shape[0] + if from_end: + start, stop = max(total - n, 0), total + else: + start, stop = 0, min(n, total) return read_elem_partial(arr, indices=slice(start, stop)) def head_categories( @@ -144,15 +155,7 @@ def head_categories( np.ndarray or ExtensionArray The first n categories. """ - if self._categories_elem is None: - return np.array([]) - - # If already fully loaded, slice from cache - if "categories" in self.__dict__: - return np.asarray(self.categories[:n]) - - total = self.n_categories - return self._read_partial_categories(0, min(n, total)) + return self._get_categories_slice(n, from_end=False) def tail_categories( self, n: int = 5 @@ -169,32 +172,14 @@ def tail_categories( np.ndarray or ExtensionArray The last n categories. """ - if self._categories_elem is None: - return np.array([]) - - # If already fully loaded, slice from cache - if "categories" in self.__dict__: - return np.asarray(self.categories[-n:]) - - total = self.n_categories - start = max(total - n, 0) - return self._read_partial_categories(start, total) + return self._get_categories_slice(n, from_end=True) def __repr__(self) -> str: - if "categories" in self.__dict__ and self.categories is not None: + if "categories" in self.__dict__: # Fully loaded - use standard repr return f"CategoricalDtype(categories={self.categories!r}, ordered={self.ordered})" return f"LazyCategoricalDtype(n_categories={self.n_categories}, ordered={self.ordered})" - @property - def name(self) -> str: - """String identifier for this dtype. - - Required for string comparison (e.g., dtype == "category") used in - anndata merge operations. - """ - return "category" - def __hash__(self) -> int: """Hash based on identity of underlying array and ordered flag. @@ -217,8 +202,8 @@ def __eq__(self, other) -> bool: # Compare with regular CategoricalDtype - need to load categories if self.ordered != other.ordered: return False - if other.categories is None or self.categories is None: - return other.categories is None and self.categories is None + if other.categories is None: + return False # LazyCategoricalDtype always has categories return self.categories.equals(other.categories) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 24540848e..737dd91d1 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -484,16 +484,14 @@ def test_lazy_categorical_dtype_hash(tmp_path: Path): from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype categories = ["a", "b", "c"] - adata = AnnData( - X=np.zeros((3, 2)), - obs=pd.DataFrame({"cat": pd.Categorical(categories)}), - ) + cat = pd.Categorical(categories) - path = tmp_path / "test.zarr" - adata.write_zarr(path) + store = zarr.open(tmp_path / "test.zarr", mode="w") + write_elem(store, "cat", cat) - lazy = read_lazy(path) - dtype = lazy.obs["cat"].dtype + store = zarr.open(tmp_path / "test.zarr", mode="r") + lazy_cat = read_elem_lazy(store["cat"]) + dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) # Should be hashable (required for pandas internals) @@ -510,16 +508,14 @@ def test_lazy_categorical_dtype_n_categories_from_cache(tmp_path: Path): from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype categories = ["a", "b", "c", "d", "e"] - adata = AnnData( - X=np.zeros((5, 2)), - obs=pd.DataFrame({"cat": pd.Categorical(categories)}), - ) + cat = pd.Categorical(categories) - path = tmp_path / "test.zarr" - adata.write_zarr(path) + store = zarr.open(tmp_path / "test.zarr", mode="w") + write_elem(store, "cat", cat) - lazy = read_lazy(path) - dtype = lazy.obs["cat"].dtype + store = zarr.open(tmp_path / "test.zarr", mode="r") + lazy_cat = read_elem_lazy(store["cat"]) + dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) # Load categories first @@ -531,63 +527,45 @@ def test_lazy_categorical_dtype_n_categories_from_cache(tmp_path: Path): assert dtype.n_categories == 3 # Returns cached length, not disk length -def test_lazy_categorical_dtype_empty_array(): - """Test LazyCategoricalDtype with None categories_elem.""" - from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - - # Create dtype with None categories_elem - dtype = LazyCategoricalDtype(categories_elem=None, ordered=False) - - # Properties should handle None gracefully - assert dtype.n_categories == 0 - assert dtype.categories is None - - # head_categories and tail_categories should return empty arrays - head = dtype.head_categories(5) - assert len(head) == 0 - - tail = dtype.tail_categories(5) - assert len(tail) == 0 - - # repr should still work - r = repr(dtype) - assert "LazyCategoricalDtype" in r - assert "n_categories=0" in r - - def test_lazy_categorical_dtype_name(tmp_path: Path): - """Test LazyCategoricalDtype.name property.""" + """Test LazyCategoricalDtype.name property (inherited from CategoricalDtype).""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype categories = ["a", "b"] - adata = AnnData( - X=np.zeros((2, 2)), - obs=pd.DataFrame({"cat": pd.Categorical(categories)}), - ) + cat = pd.Categorical(categories) - path = tmp_path / "test.zarr" - adata.write_zarr(path) + store = zarr.open(tmp_path / "test.zarr", mode="w") + write_elem(store, "cat", cat) - lazy = read_lazy(path) - dtype = lazy.obs["cat"].dtype + store = zarr.open(tmp_path / "test.zarr", mode="r") + lazy_cat = read_elem_lazy(store["cat"]) + dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) - # name should be "category" + # name should be "category" (inherited from CategoricalDtype) assert dtype.name == "category" -def test_lazy_categorical_dtype_equality_with_none_categories(tmp_path: Path): - """Test LazyCategoricalDtype equality when comparing dtypes with None categories.""" +def test_lazy_categorical_dtype_inequality_with_none_categories(tmp_path: Path): + """Test LazyCategoricalDtype is not equal to CategoricalDtype with None categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - # Create dtype with None categories - dtype1 = LazyCategoricalDtype(categories_elem=None, ordered=False) + categories = ["a", "b"] + cat = pd.Categorical(categories) + + store = zarr.open(tmp_path / "test.zarr", mode="w") + write_elem(store, "cat", cat) + + store = zarr.open(tmp_path / "test.zarr", mode="r") + lazy_cat = read_elem_lazy(store["cat"]) + dtype = lazy_cat.dtype + assert isinstance(dtype, LazyCategoricalDtype) # Regular CategoricalDtype without categories set - dtype2 = pd.CategoricalDtype(categories=None, ordered=False) + dtype_none = pd.CategoricalDtype(categories=None, ordered=False) - # Both have None categories, should be equal - assert dtype1 == dtype2 + # LazyCategoricalDtype always has categories, so should not equal None-categories dtype + assert dtype != dtype_none def test_nullable_string_index_decoding(tmp_path: Path): From 9ff164cfca744264687ac6691f26acbba4ce1b6c Mon Sep 17 00:00:00 2001 From: Dominik Date: Fri, 9 Jan 2026 15:41:05 +0100 Subject: [PATCH 09/20] test: refactor LazyCategoricalDtype tests to use write_elem/read_elem_lazy --- tests/lazy/test_read.py | 142 +++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 74 deletions(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 737dd91d1..9b1430dfc 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -247,23 +247,24 @@ def test_chunks_df( assert arr.chunksize == expected_chunks -@pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) -def test_lazy_categorical_dtype_n_categories(tmp_path: Path, diskfmt: str): +def _write_categorical_zarr(tmp_path: Path, cat: pd.Categorical) -> zarr.Group: + """Helper to write categorical to zarr and return read-only store.""" + store = zarr.open(tmp_path / "test.zarr", mode="w") + write_elem(store, "cat", cat) + return zarr.open(tmp_path / "test.zarr", mode="r")["cat"] + + +def test_lazy_categorical_dtype_n_categories(tmp_path: Path): """Test LazyCategoricalDtype.n_categories is cheap (metadata only).""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype n_cats = 100 categories = [f"Cat_{i:03d}" for i in range(n_cats)] - adata = AnnData( - X=np.zeros((n_cats, 2)), - obs=pd.DataFrame({"cell_type": pd.Categorical(categories)}), - ) - - path = tmp_path / f"test.{diskfmt}" - getattr(adata, f"write_{diskfmt}")(path) + cat = pd.Categorical(categories) - lazy = read_lazy(path) - dtype = lazy.obs["cell_type"].dtype + cat_group = _write_categorical_zarr(tmp_path, cat) + lazy_cat = read_elem_lazy(cat_group) + dtype = lazy_cat.dtype # dtype should be LazyCategoricalDtype assert isinstance(dtype, LazyCategoricalDtype) @@ -275,23 +276,17 @@ def test_lazy_categorical_dtype_n_categories(tmp_path: Path, diskfmt: str): assert dtype.ordered is False -@pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) -def test_lazy_categorical_dtype_head_tail_categories(tmp_path: Path, diskfmt: str): +def test_lazy_categorical_dtype_head_tail_categories(tmp_path: Path): """Test LazyCategoricalDtype.head_categories and tail_categories for partial reads.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype n_cats = 50 categories = [f"Type_{i:02d}" for i in range(n_cats)] - adata = AnnData( - X=np.zeros((n_cats, 2)), - obs=pd.DataFrame({"cell_type": pd.Categorical(categories)}), - ) - - path = tmp_path / f"test.{diskfmt}" - getattr(adata, f"write_{diskfmt}")(path) + cat = pd.Categorical(categories) - lazy = read_lazy(path) - dtype = lazy.obs["cell_type"].dtype + cat_group = _write_categorical_zarr(tmp_path, cat) + lazy_cat = read_elem_lazy(cat_group) + dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) # Test head_categories (first n) @@ -324,22 +319,16 @@ def test_lazy_categorical_dtype_head_tail_categories(tmp_path: Path, diskfmt: st assert list(all_tail) == categories -@pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) -def test_lazy_categorical_dtype_categories_caching(tmp_path: Path, diskfmt: str): +def test_lazy_categorical_dtype_categories_caching(tmp_path: Path): """Test that categories are cached after full load.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype categories = ["a", "b", "c", "d", "e"] - adata = AnnData( - X=np.zeros((5, 2)), - obs=pd.DataFrame({"cat": pd.Categorical(categories)}), - ) - - path = tmp_path / f"test.{diskfmt}" - getattr(adata, f"write_{diskfmt}")(path) + cat = pd.Categorical(categories) - lazy = read_lazy(path) - dtype = lazy.obs["cat"].dtype + cat_group = _write_categorical_zarr(tmp_path, cat) + lazy_cat = read_elem_lazy(cat_group) + dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) # Before loading, categories should not be cached (uses @cached_property) @@ -361,27 +350,19 @@ def test_lazy_categorical_dtype_categories_caching(tmp_path: Path, diskfmt: str) assert list(tail) == ["z", "w", "v"] # Returns cached values, not disk values -@pytest.mark.parametrize("diskfmt", ["zarr", "h5ad"]) -def test_lazy_categorical_dtype_ordered(tmp_path: Path, diskfmt: str): +def test_lazy_categorical_dtype_ordered(tmp_path: Path): """Test LazyCategoricalDtype with ordered categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - adata = AnnData( - X=np.zeros((10, 2)), - obs=pd.DataFrame({ - "ordered_cat": pd.Categorical( - ["low", "medium", "high"] * 3 + ["low"], - categories=["low", "medium", "high"], - ordered=True, - ) - }), + cat = pd.Categorical( + ["low", "medium", "high"] * 3 + ["low"], + categories=["low", "medium", "high"], + ordered=True, ) - path = tmp_path / f"test.{diskfmt}" - getattr(adata, f"write_{diskfmt}")(path) - - lazy = read_lazy(path) - dtype = lazy.obs["ordered_cat"].dtype + cat_group = _write_categorical_zarr(tmp_path, cat) + lazy_cat = read_elem_lazy(cat_group) + dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) assert dtype.ordered is True @@ -394,16 +375,11 @@ def test_lazy_categorical_dtype_repr(tmp_path: Path): from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype categories = [f"cat_{i}" for i in range(100)] - adata = AnnData( - X=np.zeros((100, 2)), - obs=pd.DataFrame({"cat": pd.Categorical(categories)}), - ) - - path = tmp_path / "test.zarr" - adata.write_zarr(path) + cat = pd.Categorical(categories) - lazy = read_lazy(path) - dtype = lazy.obs["cat"].dtype + cat_group = _write_categorical_zarr(tmp_path, cat) + lazy_cat = read_elem_lazy(cat_group) + dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) # Before loading: lazy repr @@ -424,16 +400,11 @@ def test_lazy_categorical_dtype_equality(tmp_path: Path): from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype categories = ["a", "b", "c"] - adata = AnnData( - X=np.zeros((3, 2)), - obs=pd.DataFrame({"cat": pd.Categorical(categories)}), - ) - - path = tmp_path / "test.zarr" - adata.write_zarr(path) + cat = pd.Categorical(categories) - lazy = read_lazy(path) - dtype = lazy.obs["cat"].dtype + cat_group = _write_categorical_zarr(tmp_path, cat) + lazy_cat = read_elem_lazy(cat_group) + dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) # Test string comparison (dtype == "category") @@ -458,26 +429,49 @@ def test_lazy_categorical_dtype_equality(tmp_path: Path): assert dtype is not None -def test_lazy_categorical_dtype_equality_same_array(tmp_path: Path): - """Test LazyCategoricalDtype equality between instances with same underlying array.""" +def test_lazy_categorical_roundtrip_via_anndata(tmp_path: Path): + """Integration test: lazy categorical through full AnnData workflow. + + This test uses the full AnnData read/write path rather than write_elem/read_elem_lazy + to verify end-to-end integration including dtype caching and equality. + """ + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - categories = ["x", "y", "z"] + categories = ["type_a", "type_b", "type_c"] adata = AnnData( - X=np.zeros((3, 2)), - obs=pd.DataFrame({"cat": pd.Categorical(categories)}), + X=np.zeros((6, 2)), + obs=pd.DataFrame({ + "cat": pd.Categorical(categories * 2), + "ordered_cat": pd.Categorical( + ["low", "high"] * 3, + categories=["low", "high"], + ordered=True, + ), + }), ) path = tmp_path / "test.zarr" adata.write_zarr(path) + # Read lazy and verify dtype lazy = read_lazy(path) dtype1 = lazy.obs["cat"].dtype dtype2 = lazy.obs["cat"].dtype # Same underlying array - # Same object should be equal - assert dtype1 is dtype2 # They are the same instance + assert isinstance(dtype1, LazyCategoricalDtype) + assert dtype1 is dtype2 # Same instance (cached) assert dtype1 == dtype2 + # Verify ordered categorical + ordered_dtype = lazy.obs["ordered_cat"].dtype + assert isinstance(ordered_dtype, LazyCategoricalDtype) + assert ordered_dtype.ordered is True + + # Round-trip: lazy -> memory should equal original + loaded = lazy.to_memory() + assert loaded.obs["cat"].equals(adata.obs["cat"]) + assert loaded.obs["ordered_cat"].equals(adata.obs["ordered_cat"]) + def test_lazy_categorical_dtype_hash(tmp_path: Path): """Test LazyCategoricalDtype is hashable.""" From d1c4d46ca39c09066d3e290cdddffec3ff275f28 Mon Sep 17 00:00:00 2001 From: Dominik Date: Mon, 12 Jan 2026 09:01:37 -0800 Subject: [PATCH 10/20] address third round review: simplify __eq__, improve repr, refactor fixtures - Simplify __eq__ to defer to pandas base implementation after fast paths: 1. Same Python object (identity check) 2. Same on-disk location (avoids loading categories when comparing dtypes from the same file opened multiple times) - Update __repr__ to always show categories (truncated for large n): small: LazyCategoricalDtype(categories=['a', 'b', 'c']) large: LazyCategoricalDtype(categories=['a', ..., 'z'], n=100) - Extract _N_CATEGORIES_REPR_SHOW constant to module level - Refactor tests to use session-scoped fixtures (write once, read many) instead of creating new categoricals in each test --- .../experimental/backed/_lazy_arrays.py | 64 ++++-- tests/lazy/test_read.py | 203 +++++++++--------- 2 files changed, 146 insertions(+), 121 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 24a7b01cf..c626681c5 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -20,6 +20,21 @@ ZarrArray, ) +# Number of categories to show at head/tail in LazyCategoricalDtype repr +_N_CATEGORIES_REPR_SHOW = 3 + + +def _same_disk_location(a: ZarrArray | H5Array, b: ZarrArray | H5Array) -> bool: + """Check if two arrays reference the same on-disk location.""" + if type(a) is not type(b): + return False + if isinstance(a, ZarrArray): + return a.store.path == b.store.path and a.path == b.path + if isinstance(a, H5Array): + return a.file.filename == b.file.filename and a.name == b.name + return False + + if TYPE_CHECKING: from pathlib import Path from typing import Literal @@ -175,10 +190,22 @@ def tail_categories( return self._get_categories_slice(n, from_end=True) def __repr__(self) -> str: - if "categories" in self.__dict__: - # Fully loaded - use standard repr - return f"CategoricalDtype(categories={self.categories!r}, ordered={self.ordered})" - return f"LazyCategoricalDtype(n_categories={self.n_categories}, ordered={self.ordered})" + n_total = self.n_categories + ordered_str = ", ordered=True" if self.ordered else "" + + if n_total <= _N_CATEGORIES_REPR_SHOW * 2: + # Small enough to show all categories + if "categories" in self.__dict__: + cats = list(self.categories) + else: + cats = list(self.head_categories(n_total)) + return f"LazyCategoricalDtype(categories={cats!r}{ordered_str})" + + # Show truncated: first n ... last n + head = list(self.head_categories(_N_CATEGORIES_REPR_SHOW)) + tail = list(self.tail_categories(_N_CATEGORIES_REPR_SHOW)) + cats_display = head + ["..."] + tail + return f"LazyCategoricalDtype(categories={cats_display!r}, n={n_total}{ordered_str})" def __hash__(self) -> int: """Hash based on identity of underlying array and ordered flag. @@ -189,22 +216,21 @@ def __hash__(self) -> int: return hash((id(self._categories_elem), self._ordered_flag)) def __eq__(self, other) -> bool: - # Handle string comparison (e.g., dtype == "category") - if isinstance(other, str): - return other == self.name if isinstance(other, LazyCategoricalDtype): - return ( - self._categories_elem is other._categories_elem - and self._ordered_flag == other._ordered_flag - ) - if not isinstance(other, pd.CategoricalDtype): - return False - # Compare with regular CategoricalDtype - need to load categories - if self.ordered != other.ordered: - return False - if other.categories is None: - return False # LazyCategoricalDtype always has categories - return self.categories.equals(other.categories) + if self._ordered_flag != other._ordered_flag: + return False + # Fast path: same Python object + if self._categories_elem is other._categories_elem: + return True + # Fast path: same on-disk location (avoids loading categories) + if _same_disk_location( + self._get_categories_array(), other._get_categories_array() + ): + return True + # Defer to pandas base implementation for all other comparisons + # This handles string comparison ("category"), CategoricalDtype comparisons, + # and all edge cases (None categories, ordered vs unordered, etc.) + return super().__eq__(other) class ZarrOrHDF5Wrapper[K: (H5Array | H5AsTypeView, ZarrArray)](XZarrArrayWrapper): diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 9b1430dfc..5e3d65eb0 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -247,45 +247,80 @@ def test_chunks_df( assert arr.chunksize == expected_chunks -def _write_categorical_zarr(tmp_path: Path, cat: pd.Categorical) -> zarr.Group: - """Helper to write categorical to zarr and return read-only store.""" - store = zarr.open(tmp_path / "test.zarr", mode="w") +# Session-scoped fixtures for categorical data (write once, read many) +@pytest.fixture(scope="session") +def cat_small_store(tmp_path_factory) -> zarr.Group: + """Session-scoped fixture: small categorical ['a', 'b', 'c'].""" + cat = pd.Categorical(["a", "b", "c"]) + path = tmp_path_factory.mktemp("cat_small.zarr") + store = zarr.open(path, mode="w") write_elem(store, "cat", cat) - return zarr.open(tmp_path / "test.zarr", mode="r")["cat"] + return zarr.open(path, mode="r")["cat"] -def test_lazy_categorical_dtype_n_categories(tmp_path: Path): - """Test LazyCategoricalDtype.n_categories is cheap (metadata only).""" - from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype +@pytest.fixture(scope="session") +def cat_medium_store(tmp_path_factory) -> zarr.Group: + """Session-scoped fixture: medium categorical ['a', 'b', 'c', 'd', 'e'].""" + cat = pd.Categorical(["a", "b", "c", "d", "e"]) + path = tmp_path_factory.mktemp("cat_medium.zarr") + store = zarr.open(path, mode="w") + write_elem(store, "cat", cat) + return zarr.open(path, mode="r")["cat"] + - n_cats = 100 - categories = [f"Cat_{i:03d}" for i in range(n_cats)] +@pytest.fixture(scope="session") +def cat_large_store(tmp_path_factory) -> zarr.Group: + """Session-scoped fixture: large categorical with 100 categories.""" + categories = [f"cat_{i}" for i in range(100)] cat = pd.Categorical(categories) + path = tmp_path_factory.mktemp("cat_large.zarr") + store = zarr.open(path, mode="w") + write_elem(store, "cat", cat) + return zarr.open(path, mode="r")["cat"] - cat_group = _write_categorical_zarr(tmp_path, cat) - lazy_cat = read_elem_lazy(cat_group) - dtype = lazy_cat.dtype - # dtype should be LazyCategoricalDtype - assert isinstance(dtype, LazyCategoricalDtype) +@pytest.fixture(scope="session") +def cat_ordered_store(tmp_path_factory) -> zarr.Group: + """Session-scoped fixture: ordered categorical ['low', 'medium', 'high'].""" + cat = pd.Categorical( + ["low", "medium", "high"] * 3 + ["low"], + categories=["low", "medium", "high"], + ordered=True, + ) + path = tmp_path_factory.mktemp("cat_ordered.zarr") + store = zarr.open(path, mode="w") + write_elem(store, "cat", cat) + return zarr.open(path, mode="r")["cat"] + + +@pytest.fixture(scope="session") +def cat_fifty_store(tmp_path_factory) -> zarr.Group: + """Session-scoped fixture: 50 categories for head/tail testing.""" + categories = [f"Type_{i:02d}" for i in range(50)] + cat = pd.Categorical(categories) + path = tmp_path_factory.mktemp("cat_fifty.zarr") + store = zarr.open(path, mode="w") + write_elem(store, "cat", cat) + return zarr.open(path, mode="r")["cat"] + - # n_categories should work without loading all categories - assert dtype.n_categories == n_cats +def test_lazy_categorical_dtype_n_categories(cat_large_store: zarr.Group): + """Test LazyCategoricalDtype.n_categories is cheap (metadata only).""" + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + lazy_cat = read_elem_lazy(cat_large_store) + dtype = lazy_cat.dtype - # ordered should be accessible + assert isinstance(dtype, LazyCategoricalDtype) + assert dtype.n_categories == 100 assert dtype.ordered is False -def test_lazy_categorical_dtype_head_tail_categories(tmp_path: Path): +def test_lazy_categorical_dtype_head_tail_categories(cat_fifty_store: zarr.Group): """Test LazyCategoricalDtype.head_categories and tail_categories for partial reads.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - n_cats = 50 - categories = [f"Type_{i:02d}" for i in range(n_cats)] - cat = pd.Categorical(categories) - - cat_group = _write_categorical_zarr(tmp_path, cat) - lazy_cat = read_elem_lazy(cat_group) + lazy_cat = read_elem_lazy(cat_fifty_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -311,23 +346,19 @@ def test_lazy_categorical_dtype_head_tail_categories(tmp_path: Path): # Test requesting more than available all_head = dtype.head_categories(100) - assert len(all_head) == n_cats - assert list(all_head) == categories + assert len(all_head) == 50 + assert list(all_head) == [f"Type_{i:02d}" for i in range(50)] all_tail = dtype.tail_categories(100) - assert len(all_tail) == n_cats - assert list(all_tail) == categories + assert len(all_tail) == 50 + assert list(all_tail) == [f"Type_{i:02d}" for i in range(50)] -def test_lazy_categorical_dtype_categories_caching(tmp_path: Path): +def test_lazy_categorical_dtype_categories_caching(cat_medium_store: zarr.Group): """Test that categories are cached after full load.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - categories = ["a", "b", "c", "d", "e"] - cat = pd.Categorical(categories) - - cat_group = _write_categorical_zarr(tmp_path, cat) - lazy_cat = read_elem_lazy(cat_group) + lazy_cat = read_elem_lazy(cat_medium_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -337,7 +368,7 @@ def test_lazy_categorical_dtype_categories_caching(tmp_path: Path): # Load categories cats = dtype.categories assert cats is not None - assert list(cats) == categories + assert list(cats) == ["a", "b", "c", "d", "e"] # After loading, should be cached in __dict__ (cached_property pattern) assert "categories" in dtype.__dict__ @@ -350,18 +381,11 @@ def test_lazy_categorical_dtype_categories_caching(tmp_path: Path): assert list(tail) == ["z", "w", "v"] # Returns cached values, not disk values -def test_lazy_categorical_dtype_ordered(tmp_path: Path): +def test_lazy_categorical_dtype_ordered(cat_ordered_store: zarr.Group): """Test LazyCategoricalDtype with ordered categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - cat = pd.Categorical( - ["low", "medium", "high"] * 3 + ["low"], - categories=["low", "medium", "high"], - ordered=True, - ) - - cat_group = _write_categorical_zarr(tmp_path, cat) - lazy_cat = read_elem_lazy(cat_group) + lazy_cat = read_elem_lazy(cat_ordered_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -370,40 +394,41 @@ def test_lazy_categorical_dtype_ordered(tmp_path: Path): assert list(dtype.categories) == ["low", "medium", "high"] -def test_lazy_categorical_dtype_repr(tmp_path: Path): - """Test LazyCategoricalDtype repr before and after loading.""" +def test_lazy_categorical_dtype_repr( + cat_large_store: zarr.Group, cat_small_store: zarr.Group +): + """Test LazyCategoricalDtype repr shows truncated categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - categories = [f"cat_{i}" for i in range(100)] - cat = pd.Categorical(categories) - - cat_group = _write_categorical_zarr(tmp_path, cat) - lazy_cat = read_elem_lazy(cat_group) + # Test large number of categories (truncated repr) + lazy_cat = read_elem_lazy(cat_large_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) - # Before loading: lazy repr - repr_before = repr(dtype) - assert "LazyCategoricalDtype" in repr_before - assert "n_categories=100" in repr_before + repr_str = repr(dtype) + assert "LazyCategoricalDtype" in repr_str + assert "n=100" in repr_str + assert "..." in repr_str # Truncation indicator + assert "cat_0" in repr_str # Head category + assert "cat_99" in repr_str # Tail category - # Load categories - _ = dtype.categories + # Test small number of categories (full repr) + small_lazy_cat = read_elem_lazy(cat_small_store) + small_dtype = small_lazy_cat.dtype - # After loading: standard CategoricalDtype repr - repr_after = repr(dtype) - assert "CategoricalDtype" in repr_after + small_repr = repr(small_dtype) + assert "LazyCategoricalDtype" in small_repr + assert "..." not in small_repr # No truncation for small categories + assert "'a'" in small_repr + assert "'b'" in small_repr + assert "'c'" in small_repr -def test_lazy_categorical_dtype_equality(tmp_path: Path): +def test_lazy_categorical_dtype_equality(cat_small_store: zarr.Group): """Test LazyCategoricalDtype equality comparisons.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - categories = ["a", "b", "c"] - cat = pd.Categorical(categories) - - cat_group = _write_categorical_zarr(tmp_path, cat) - lazy_cat = read_elem_lazy(cat_group) + lazy_cat = read_elem_lazy(cat_small_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -473,18 +498,11 @@ def test_lazy_categorical_roundtrip_via_anndata(tmp_path: Path): assert loaded.obs["ordered_cat"].equals(adata.obs["ordered_cat"]) -def test_lazy_categorical_dtype_hash(tmp_path: Path): +def test_lazy_categorical_dtype_hash(cat_small_store: zarr.Group): """Test LazyCategoricalDtype is hashable.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - categories = ["a", "b", "c"] - cat = pd.Categorical(categories) - - store = zarr.open(tmp_path / "test.zarr", mode="w") - write_elem(store, "cat", cat) - - store = zarr.open(tmp_path / "test.zarr", mode="r") - lazy_cat = read_elem_lazy(store["cat"]) + lazy_cat = read_elem_lazy(cat_small_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -497,18 +515,11 @@ def test_lazy_categorical_dtype_hash(tmp_path: Path): assert dtype in s -def test_lazy_categorical_dtype_n_categories_from_cache(tmp_path: Path): +def test_lazy_categorical_dtype_n_categories_from_cache(cat_medium_store: zarr.Group): """Test n_categories returns from cache when categories already loaded.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - categories = ["a", "b", "c", "d", "e"] - cat = pd.Categorical(categories) - - store = zarr.open(tmp_path / "test.zarr", mode="w") - write_elem(store, "cat", cat) - - store = zarr.open(tmp_path / "test.zarr", mode="r") - lazy_cat = read_elem_lazy(store["cat"]) + lazy_cat = read_elem_lazy(cat_medium_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -521,18 +532,11 @@ def test_lazy_categorical_dtype_n_categories_from_cache(tmp_path: Path): assert dtype.n_categories == 3 # Returns cached length, not disk length -def test_lazy_categorical_dtype_name(tmp_path: Path): +def test_lazy_categorical_dtype_name(cat_small_store: zarr.Group): """Test LazyCategoricalDtype.name property (inherited from CategoricalDtype).""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - categories = ["a", "b"] - cat = pd.Categorical(categories) - - store = zarr.open(tmp_path / "test.zarr", mode="w") - write_elem(store, "cat", cat) - - store = zarr.open(tmp_path / "test.zarr", mode="r") - lazy_cat = read_elem_lazy(store["cat"]) + lazy_cat = read_elem_lazy(cat_small_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -540,18 +544,13 @@ def test_lazy_categorical_dtype_name(tmp_path: Path): assert dtype.name == "category" -def test_lazy_categorical_dtype_inequality_with_none_categories(tmp_path: Path): +def test_lazy_categorical_dtype_inequality_with_none_categories( + cat_small_store: zarr.Group, +): """Test LazyCategoricalDtype is not equal to CategoricalDtype with None categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - categories = ["a", "b"] - cat = pd.Categorical(categories) - - store = zarr.open(tmp_path / "test.zarr", mode="w") - write_elem(store, "cat", cat) - - store = zarr.open(tmp_path / "test.zarr", mode="r") - lazy_cat = read_elem_lazy(store["cat"]) + lazy_cat = read_elem_lazy(cat_small_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) From 3d8bbeab0f205cb01c9cf672d0f39f9908468e75 Mon Sep 17 00:00:00 2001 From: Dominik Date: Mon, 12 Jan 2026 13:06:40 -0800 Subject: [PATCH 11/20] fix linting and simplify __eq__ using zarr/h5py built-in location equality - Fix RUF005: use list unpacking [*head, "...", *tail] - Remove _same_disk_location helper - zarr/h5py arrays already compare equal by on-disk location, not content --- .../experimental/backed/_lazy_arrays.py | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index c626681c5..78a6d0526 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -23,18 +23,6 @@ # Number of categories to show at head/tail in LazyCategoricalDtype repr _N_CATEGORIES_REPR_SHOW = 3 - -def _same_disk_location(a: ZarrArray | H5Array, b: ZarrArray | H5Array) -> bool: - """Check if two arrays reference the same on-disk location.""" - if type(a) is not type(b): - return False - if isinstance(a, ZarrArray): - return a.store.path == b.store.path and a.path == b.path - if isinstance(a, H5Array): - return a.file.filename == b.file.filename and a.name == b.name - return False - - if TYPE_CHECKING: from pathlib import Path from typing import Literal @@ -204,7 +192,7 @@ def __repr__(self) -> str: # Show truncated: first n ... last n head = list(self.head_categories(_N_CATEGORIES_REPR_SHOW)) tail = list(self.tail_categories(_N_CATEGORIES_REPR_SHOW)) - cats_display = head + ["..."] + tail + cats_display = [*head, "...", *tail] return f"LazyCategoricalDtype(categories={cats_display!r}, n={n_total}{ordered_str})" def __hash__(self) -> int: @@ -222,10 +210,8 @@ def __eq__(self, other) -> bool: # Fast path: same Python object if self._categories_elem is other._categories_elem: return True - # Fast path: same on-disk location (avoids loading categories) - if _same_disk_location( - self._get_categories_array(), other._get_categories_array() - ): + # Fast path: zarr/h5py arrays compare equal by location + if self._get_categories_array() == other._get_categories_array(): return True # Defer to pandas base implementation for all other comparisons # This handles string comparison ("category"), CategoricalDtype comparisons, From e8ee0052a92cd326b1b7ad7357f964be4e97f58b Mon Sep 17 00:00:00 2001 From: Dominik Date: Mon, 12 Jan 2026 13:15:40 -0800 Subject: [PATCH 12/20] test: add same-location equality check for LazyCategoricalDtype Verify that comparing two dtypes from the same file (opened twice) uses the fast path and doesn't load categories. --- tests/lazy/test_read.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 5e3d65eb0..0aaba453f 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -249,13 +249,19 @@ def test_chunks_df( # Session-scoped fixtures for categorical data (write once, read many) @pytest.fixture(scope="session") -def cat_small_store(tmp_path_factory) -> zarr.Group: - """Session-scoped fixture: small categorical ['a', 'b', 'c'].""" +def cat_small_path(tmp_path_factory) -> Path: + """Session-scoped fixture: path to small categorical ['a', 'b', 'c'].""" cat = pd.Categorical(["a", "b", "c"]) path = tmp_path_factory.mktemp("cat_small.zarr") store = zarr.open(path, mode="w") write_elem(store, "cat", cat) - return zarr.open(path, mode="r")["cat"] + return path + + +@pytest.fixture(scope="session") +def cat_small_store(cat_small_path: Path) -> zarr.Group: + """Session-scoped fixture: small categorical ['a', 'b', 'c'].""" + return zarr.open(cat_small_path, mode="r")["cat"] @pytest.fixture(scope="session") @@ -424,7 +430,9 @@ def test_lazy_categorical_dtype_repr( assert "'c'" in small_repr -def test_lazy_categorical_dtype_equality(cat_small_store: zarr.Group): +def test_lazy_categorical_dtype_equality( + cat_small_store: zarr.Group, cat_small_path: Path +): """Test LazyCategoricalDtype equality comparisons.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -453,6 +461,20 @@ def test_lazy_categorical_dtype_equality(cat_small_store: zarr.Group): assert dtype != 123 assert dtype is not None + # Test same-location equality (file opened twice, different Python objects) + # Use fresh reads to ensure categories aren't cached + store_fresh1 = zarr.open(cat_small_path, mode="r")["cat"] + store_fresh2 = zarr.open(cat_small_path, mode="r")["cat"] + dtype_fresh1 = read_elem_lazy(store_fresh1).dtype + dtype_fresh2 = read_elem_lazy(store_fresh2).dtype + + assert dtype_fresh1._categories_elem is not dtype_fresh2._categories_elem + assert "categories" not in dtype_fresh1.__dict__ # Not yet loaded + assert "categories" not in dtype_fresh2.__dict__ + assert dtype_fresh1 == dtype_fresh2 # Equal via location check + assert "categories" not in dtype_fresh1.__dict__ # Still not loaded + assert "categories" not in dtype_fresh2.__dict__ + def test_lazy_categorical_roundtrip_via_anndata(tmp_path: Path): """Integration test: lazy categorical through full AnnData workflow. From e57ffb08c505fe7b52ae8e6863745e5c5952508e Mon Sep 17 00:00:00 2001 From: Dominik Date: Tue, 13 Jan 2026 13:04:24 -0800 Subject: [PATCH 13/20] test: improve LazyCategoricalDtype equality test to verify no I/O Replace the previous same-location equality test with a more rigorous parametrized test that covers both zarr and h5py backends. The new test uses `unittest.mock.patch.object` to patch `__getitem__` on the underlying category arrays to raise `AssertionError` if called. This proves that both backends use location-based equality comparison that doesn't read array contents: - h5py: compares HDF5 object IDs (file number + object number) - zarr 3.x: compares StorePath (URL string comparison via dataclass) The previous test only verified our `LazyCategoricalDtype.categories` cache wasn't populated, which doesn't prove the storage layer didn't load data internally. --- tests/lazy/test_read.py | 102 +++++++++++++++++++++++++++++++++------- 1 file changed, 84 insertions(+), 18 deletions(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 0aaba453f..4a47f98ea 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -2,7 +2,9 @@ from importlib.util import find_spec from typing import TYPE_CHECKING +from unittest.mock import patch +import h5py import numpy as np import pandas as pd import pytest @@ -249,8 +251,8 @@ def test_chunks_df( # Session-scoped fixtures for categorical data (write once, read many) @pytest.fixture(scope="session") -def cat_small_path(tmp_path_factory) -> Path: - """Session-scoped fixture: path to small categorical ['a', 'b', 'c'].""" +def cat_small_path_zarr(tmp_path_factory) -> Path: + """Session-scoped fixture: path to small categorical ['a', 'b', 'c'] in zarr.""" cat = pd.Categorical(["a", "b", "c"]) path = tmp_path_factory.mktemp("cat_small.zarr") store = zarr.open(path, mode="w") @@ -258,6 +260,23 @@ def cat_small_path(tmp_path_factory) -> Path: return path +@pytest.fixture(scope="session") +def cat_small_path_h5ad(tmp_path_factory) -> Path: + """Session-scoped fixture: path to small categorical ['a', 'b', 'c'] in h5ad.""" + cat = pd.Categorical(["a", "b", "c"]) + path = tmp_path_factory.mktemp("cat_small") / "cat.h5ad" + with h5py.File(path, mode="w") as f: + write_elem(f, "cat", cat) + return path + + +# Backward compatibility alias +@pytest.fixture(scope="session") +def cat_small_path(cat_small_path_zarr: Path) -> Path: + """Alias for cat_small_path_zarr for backward compatibility.""" + return cat_small_path_zarr + + @pytest.fixture(scope="session") def cat_small_store(cat_small_path: Path) -> zarr.Group: """Session-scoped fixture: small categorical ['a', 'b', 'c'].""" @@ -430,9 +449,7 @@ def test_lazy_categorical_dtype_repr( assert "'c'" in small_repr -def test_lazy_categorical_dtype_equality( - cat_small_store: zarr.Group, cat_small_path: Path -): +def test_lazy_categorical_dtype_equality(cat_small_store: zarr.Group): """Test LazyCategoricalDtype equality comparisons.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -461,19 +478,68 @@ def test_lazy_categorical_dtype_equality( assert dtype != 123 assert dtype is not None - # Test same-location equality (file opened twice, different Python objects) - # Use fresh reads to ensure categories aren't cached - store_fresh1 = zarr.open(cat_small_path, mode="r")["cat"] - store_fresh2 = zarr.open(cat_small_path, mode="r")["cat"] - dtype_fresh1 = read_elem_lazy(store_fresh1).dtype - dtype_fresh2 = read_elem_lazy(store_fresh2).dtype - - assert dtype_fresh1._categories_elem is not dtype_fresh2._categories_elem - assert "categories" not in dtype_fresh1.__dict__ # Not yet loaded - assert "categories" not in dtype_fresh2.__dict__ - assert dtype_fresh1 == dtype_fresh2 # Equal via location check - assert "categories" not in dtype_fresh1.__dict__ # Still not loaded - assert "categories" not in dtype_fresh2.__dict__ + +@pytest.mark.parametrize("backend", ["zarr", "h5ad"]) +def test_lazy_categorical_dtype_equality_no_load( + cat_small_path_zarr: Path, cat_small_path_h5ad: Path, backend: str +): + """Test same-location equality doesn't load category data. + + Both h5py (HDF5 object ID comparison) and zarr 3.x (StorePath comparison) use + location-based equality that doesn't read array contents. This test verifies + that behavior by patching __getitem__ to raise if called. + """ + from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + + if backend == "zarr": + path = cat_small_path_zarr + + def open_store(p): + return zarr.open(p, mode="r")["cat"] + + else: + path = cat_small_path_h5ad + # Keep h5py files open for the duration of the test + open_store = lambda p: h5py.File(p, mode="r")["cat"] + + # Open the same file twice to get different Python objects pointing to same location + store1 = open_store(path) + store2 = open_store(path) + dtype1 = read_elem_lazy(store1).dtype + dtype2 = read_elem_lazy(store2).dtype + + assert isinstance(dtype1, LazyCategoricalDtype) + assert isinstance(dtype2, LazyCategoricalDtype) + # Verify these are different Python objects + assert dtype1._categories_elem is not dtype2._categories_elem + + # Patch __getitem__ to raise if data is loaded during comparison + cat_arr1 = dtype1._get_categories_array() + cat_arr2 = dtype2._get_categories_array() + + with ( + patch.object( + cat_arr1, + "__getitem__", + side_effect=AssertionError("Data was loaded from arr1"), + ), + patch.object( + cat_arr2, + "__getitem__", + side_effect=AssertionError("Data was loaded from arr2"), + ), + ): + # This should use location-based comparison without triggering __getitem__ + assert dtype1 == dtype2 + + # Also verify our cache wasn't populated + assert "categories" not in dtype1.__dict__ + assert "categories" not in dtype2.__dict__ + + # Clean up h5py file handles + if backend == "h5ad": + store1.file.close() + store2.file.close() def test_lazy_categorical_roundtrip_via_anndata(tmp_path: Path): From f4b6cd9d3f4945c1db4c1f29f7a45a355a887ffd Mon Sep 17 00:00:00 2001 From: Dominik Date: Tue, 13 Jan 2026 13:09:53 -0800 Subject: [PATCH 14/20] test: parametrize all LazyCategoricalDtype tests for both zarr and h5ad Refactor categorical test fixtures to support both backends: - Add helper functions for writing categorical data to zarr/h5ad - Create path fixtures for each category type and backend (session-scoped) - Create parametrized store fixtures that test both zarr and h5ad All LazyCategoricalDtype tests now run for both backends, increasing test coverage from 12 to 24 tests: - test_lazy_categorical_dtype_n_categories[zarr/h5ad] - test_lazy_categorical_dtype_head_tail_categories[zarr/h5ad] - test_lazy_categorical_dtype_categories_caching[zarr/h5ad] - test_lazy_categorical_dtype_ordered[zarr/h5ad] - test_lazy_categorical_dtype_repr[zarr-zarr/zarr-h5ad/h5ad-zarr/h5ad-h5ad] - test_lazy_categorical_dtype_equality[zarr/h5ad] - test_lazy_categorical_dtype_equality_no_load[zarr/h5ad] - test_lazy_categorical_dtype_hash[zarr/h5ad] - test_lazy_categorical_dtype_n_categories_from_cache[zarr/h5ad] - test_lazy_categorical_dtype_name[zarr/h5ad] - test_lazy_categorical_dtype_inequality_with_none_categories[zarr/h5ad] --- tests/lazy/test_read.py | 195 ++++++++++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 56 deletions(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 4a47f98ea..6683bc5ff 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -250,86 +250,171 @@ def test_chunks_df( # Session-scoped fixtures for categorical data (write once, read many) -@pytest.fixture(scope="session") -def cat_small_path_zarr(tmp_path_factory) -> Path: - """Session-scoped fixture: path to small categorical ['a', 'b', 'c'] in zarr.""" - cat = pd.Categorical(["a", "b", "c"]) - path = tmp_path_factory.mktemp("cat_small.zarr") +# Each category type has zarr and h5ad path fixtures, plus a parametrized store fixture + + +def _write_categorical_zarr(tmp_path_factory, name: str, cat: pd.Categorical) -> Path: + """Helper to write categorical to zarr and return path.""" + path = tmp_path_factory.mktemp(f"{name}.zarr") store = zarr.open(path, mode="w") write_elem(store, "cat", cat) return path -@pytest.fixture(scope="session") -def cat_small_path_h5ad(tmp_path_factory) -> Path: - """Session-scoped fixture: path to small categorical ['a', 'b', 'c'] in h5ad.""" - cat = pd.Categorical(["a", "b", "c"]) - path = tmp_path_factory.mktemp("cat_small") / "cat.h5ad" +def _write_categorical_h5ad(tmp_path_factory, name: str, cat: pd.Categorical) -> Path: + """Helper to write categorical to h5ad and return path.""" + path = tmp_path_factory.mktemp(name) / "cat.h5ad" with h5py.File(path, mode="w") as f: write_elem(f, "cat", cat) return path -# Backward compatibility alias +def _open_categorical_store(path: Path, backend: str): + """Helper to open categorical store for either backend.""" + if backend == "zarr": + return zarr.open(path, mode="r")["cat"] + else: + return h5py.File(path, mode="r")["cat"] + + +# Small categorical ['a', 'b', 'c'] +@pytest.fixture(scope="session") +def cat_small_path_zarr(tmp_path_factory) -> Path: + return _write_categorical_zarr( + tmp_path_factory, "cat_small", pd.Categorical(["a", "b", "c"]) + ) + + @pytest.fixture(scope="session") -def cat_small_path(cat_small_path_zarr: Path) -> Path: - """Alias for cat_small_path_zarr for backward compatibility.""" - return cat_small_path_zarr +def cat_small_path_h5ad(tmp_path_factory) -> Path: + return _write_categorical_h5ad( + tmp_path_factory, "cat_small", pd.Categorical(["a", "b", "c"]) + ) + + +@pytest.fixture(params=["zarr", "h5ad"]) +def cat_small_store(request, cat_small_path_zarr: Path, cat_small_path_h5ad: Path): + """Parametrized fixture: small categorical ['a', 'b', 'c'] for both backends.""" + path = cat_small_path_zarr if request.param == "zarr" else cat_small_path_h5ad + store = _open_categorical_store(path, request.param) + yield store + if request.param == "h5ad": + store.file.close() +# Medium categorical ['a', 'b', 'c', 'd', 'e'] @pytest.fixture(scope="session") -def cat_small_store(cat_small_path: Path) -> zarr.Group: - """Session-scoped fixture: small categorical ['a', 'b', 'c'].""" - return zarr.open(cat_small_path, mode="r")["cat"] +def cat_medium_path_zarr(tmp_path_factory) -> Path: + return _write_categorical_zarr( + tmp_path_factory, "cat_medium", pd.Categorical(["a", "b", "c", "d", "e"]) + ) @pytest.fixture(scope="session") -def cat_medium_store(tmp_path_factory) -> zarr.Group: - """Session-scoped fixture: medium categorical ['a', 'b', 'c', 'd', 'e'].""" - cat = pd.Categorical(["a", "b", "c", "d", "e"]) - path = tmp_path_factory.mktemp("cat_medium.zarr") - store = zarr.open(path, mode="w") - write_elem(store, "cat", cat) - return zarr.open(path, mode="r")["cat"] +def cat_medium_path_h5ad(tmp_path_factory) -> Path: + return _write_categorical_h5ad( + tmp_path_factory, "cat_medium", pd.Categorical(["a", "b", "c", "d", "e"]) + ) +@pytest.fixture(params=["zarr", "h5ad"]) +def cat_medium_store(request, cat_medium_path_zarr: Path, cat_medium_path_h5ad: Path): + """Parametrized fixture: medium categorical for both backends.""" + path = cat_medium_path_zarr if request.param == "zarr" else cat_medium_path_h5ad + store = _open_categorical_store(path, request.param) + yield store + if request.param == "h5ad": + store.file.close() + + +# Large categorical with 100 categories @pytest.fixture(scope="session") -def cat_large_store(tmp_path_factory) -> zarr.Group: - """Session-scoped fixture: large categorical with 100 categories.""" +def cat_large_path_zarr(tmp_path_factory) -> Path: categories = [f"cat_{i}" for i in range(100)] - cat = pd.Categorical(categories) - path = tmp_path_factory.mktemp("cat_large.zarr") - store = zarr.open(path, mode="w") - write_elem(store, "cat", cat) - return zarr.open(path, mode="r")["cat"] + return _write_categorical_zarr( + tmp_path_factory, "cat_large", pd.Categorical(categories) + ) @pytest.fixture(scope="session") -def cat_ordered_store(tmp_path_factory) -> zarr.Group: - """Session-scoped fixture: ordered categorical ['low', 'medium', 'high'].""" +def cat_large_path_h5ad(tmp_path_factory) -> Path: + categories = [f"cat_{i}" for i in range(100)] + return _write_categorical_h5ad( + tmp_path_factory, "cat_large", pd.Categorical(categories) + ) + + +@pytest.fixture(params=["zarr", "h5ad"]) +def cat_large_store(request, cat_large_path_zarr: Path, cat_large_path_h5ad: Path): + """Parametrized fixture: large categorical (100 categories) for both backends.""" + path = cat_large_path_zarr if request.param == "zarr" else cat_large_path_h5ad + store = _open_categorical_store(path, request.param) + yield store + if request.param == "h5ad": + store.file.close() + + +# Ordered categorical ['low', 'medium', 'high'] +@pytest.fixture(scope="session") +def cat_ordered_path_zarr(tmp_path_factory) -> Path: cat = pd.Categorical( ["low", "medium", "high"] * 3 + ["low"], categories=["low", "medium", "high"], ordered=True, ) - path = tmp_path_factory.mktemp("cat_ordered.zarr") - store = zarr.open(path, mode="w") - write_elem(store, "cat", cat) - return zarr.open(path, mode="r")["cat"] + return _write_categorical_zarr(tmp_path_factory, "cat_ordered", cat) @pytest.fixture(scope="session") -def cat_fifty_store(tmp_path_factory) -> zarr.Group: - """Session-scoped fixture: 50 categories for head/tail testing.""" +def cat_ordered_path_h5ad(tmp_path_factory) -> Path: + cat = pd.Categorical( + ["low", "medium", "high"] * 3 + ["low"], + categories=["low", "medium", "high"], + ordered=True, + ) + return _write_categorical_h5ad(tmp_path_factory, "cat_ordered", cat) + + +@pytest.fixture(params=["zarr", "h5ad"]) +def cat_ordered_store( + request, cat_ordered_path_zarr: Path, cat_ordered_path_h5ad: Path +): + """Parametrized fixture: ordered categorical for both backends.""" + path = cat_ordered_path_zarr if request.param == "zarr" else cat_ordered_path_h5ad + store = _open_categorical_store(path, request.param) + yield store + if request.param == "h5ad": + store.file.close() + + +# 50 categories for head/tail testing +@pytest.fixture(scope="session") +def cat_fifty_path_zarr(tmp_path_factory) -> Path: categories = [f"Type_{i:02d}" for i in range(50)] - cat = pd.Categorical(categories) - path = tmp_path_factory.mktemp("cat_fifty.zarr") - store = zarr.open(path, mode="w") - write_elem(store, "cat", cat) - return zarr.open(path, mode="r")["cat"] + return _write_categorical_zarr( + tmp_path_factory, "cat_fifty", pd.Categorical(categories) + ) + + +@pytest.fixture(scope="session") +def cat_fifty_path_h5ad(tmp_path_factory) -> Path: + categories = [f"Type_{i:02d}" for i in range(50)] + return _write_categorical_h5ad( + tmp_path_factory, "cat_fifty", pd.Categorical(categories) + ) -def test_lazy_categorical_dtype_n_categories(cat_large_store: zarr.Group): +@pytest.fixture(params=["zarr", "h5ad"]) +def cat_fifty_store(request, cat_fifty_path_zarr: Path, cat_fifty_path_h5ad: Path): + """Parametrized fixture: 50 categories for head/tail testing, both backends.""" + path = cat_fifty_path_zarr if request.param == "zarr" else cat_fifty_path_h5ad + store = _open_categorical_store(path, request.param) + yield store + if request.param == "h5ad": + store.file.close() + + +def test_lazy_categorical_dtype_n_categories(cat_large_store): """Test LazyCategoricalDtype.n_categories is cheap (metadata only).""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -341,7 +426,7 @@ def test_lazy_categorical_dtype_n_categories(cat_large_store: zarr.Group): assert dtype.ordered is False -def test_lazy_categorical_dtype_head_tail_categories(cat_fifty_store: zarr.Group): +def test_lazy_categorical_dtype_head_tail_categories(cat_fifty_store): """Test LazyCategoricalDtype.head_categories and tail_categories for partial reads.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -379,7 +464,7 @@ def test_lazy_categorical_dtype_head_tail_categories(cat_fifty_store: zarr.Group assert list(all_tail) == [f"Type_{i:02d}" for i in range(50)] -def test_lazy_categorical_dtype_categories_caching(cat_medium_store: zarr.Group): +def test_lazy_categorical_dtype_categories_caching(cat_medium_store): """Test that categories are cached after full load.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -406,7 +491,7 @@ def test_lazy_categorical_dtype_categories_caching(cat_medium_store: zarr.Group) assert list(tail) == ["z", "w", "v"] # Returns cached values, not disk values -def test_lazy_categorical_dtype_ordered(cat_ordered_store: zarr.Group): +def test_lazy_categorical_dtype_ordered(cat_ordered_store): """Test LazyCategoricalDtype with ordered categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -419,9 +504,7 @@ def test_lazy_categorical_dtype_ordered(cat_ordered_store: zarr.Group): assert list(dtype.categories) == ["low", "medium", "high"] -def test_lazy_categorical_dtype_repr( - cat_large_store: zarr.Group, cat_small_store: zarr.Group -): +def test_lazy_categorical_dtype_repr(cat_large_store, cat_small_store): """Test LazyCategoricalDtype repr shows truncated categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -449,7 +532,7 @@ def test_lazy_categorical_dtype_repr( assert "'c'" in small_repr -def test_lazy_categorical_dtype_equality(cat_small_store: zarr.Group): +def test_lazy_categorical_dtype_equality(cat_small_store): """Test LazyCategoricalDtype equality comparisons.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -586,7 +669,7 @@ def test_lazy_categorical_roundtrip_via_anndata(tmp_path: Path): assert loaded.obs["ordered_cat"].equals(adata.obs["ordered_cat"]) -def test_lazy_categorical_dtype_hash(cat_small_store: zarr.Group): +def test_lazy_categorical_dtype_hash(cat_small_store): """Test LazyCategoricalDtype is hashable.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -603,7 +686,7 @@ def test_lazy_categorical_dtype_hash(cat_small_store: zarr.Group): assert dtype in s -def test_lazy_categorical_dtype_n_categories_from_cache(cat_medium_store: zarr.Group): +def test_lazy_categorical_dtype_n_categories_from_cache(cat_medium_store): """Test n_categories returns from cache when categories already loaded.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -620,7 +703,7 @@ def test_lazy_categorical_dtype_n_categories_from_cache(cat_medium_store: zarr.G assert dtype.n_categories == 3 # Returns cached length, not disk length -def test_lazy_categorical_dtype_name(cat_small_store: zarr.Group): +def test_lazy_categorical_dtype_name(cat_small_store): """Test LazyCategoricalDtype.name property (inherited from CategoricalDtype).""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype @@ -633,7 +716,7 @@ def test_lazy_categorical_dtype_name(cat_small_store: zarr.Group): def test_lazy_categorical_dtype_inequality_with_none_categories( - cat_small_store: zarr.Group, + cat_small_store, ): """Test LazyCategoricalDtype is not equal to CategoricalDtype with None categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype From 175aef6bbd66e6f752e05002edc29af56d780621 Mon Sep 17 00:00:00 2001 From: Dominik Date: Tue, 13 Jan 2026 15:04:25 -0800 Subject: [PATCH 15/20] test: consolidate LazyCategoricalDtype tests and add no-load verification Consolidate redundant tests and add proper verification for lazy behavior: 1. Merged n_categories tests: - test_lazy_categorical_dtype_n_categories now verifies: - Metadata-only access (categories not loaded) - Cache behavior after categories are loaded - Removed redundant test_lazy_categorical_dtype_n_categories_from_cache 2. Improved head_tail_categories test: - Added verification that partial reads don't load all categories - Each head/tail call now checks "categories" not in __dict__ 3. Consolidated equality test: - Merged test_lazy_categorical_dtype_name (trivial 1-assertion test) - Merged test_lazy_categorical_dtype_inequality_with_none_categories - Now tests name property and None-categories edge case Test count reduced from 24 to 18 while improving coverage quality: - Tests now verify lazy behavior claims, not just return values - Removed redundant test code without losing coverage --- tests/lazy/test_read.py | 85 ++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 52 deletions(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 6683bc5ff..45a1b9229 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -415,44 +415,63 @@ def cat_fifty_store(request, cat_fifty_path_zarr: Path, cat_fifty_path_h5ad: Pat def test_lazy_categorical_dtype_n_categories(cat_large_store): - """Test LazyCategoricalDtype.n_categories is cheap (metadata only).""" + """Test n_categories is cheap (metadata only) and uses cache when loaded.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype lazy_cat = read_elem_lazy(cat_large_store) dtype = lazy_cat.dtype - assert isinstance(dtype, LazyCategoricalDtype) + + # Before loading: n_categories should work without loading categories + assert "categories" not in dtype.__dict__ assert dtype.n_categories == 100 + assert "categories" not in dtype.__dict__ # Still not loaded - proves metadata-only assert dtype.ordered is False + # After loading: n_categories should use cache + _ = dtype.categories # Force load + assert "categories" in dtype.__dict__ + assert dtype.n_categories == 100 # Uses cache now + + # Verify cache is used by modifying cached value + dtype.__dict__["categories"] = pd.Index(["x", "y", "z"]) + assert dtype.n_categories == 3 # Returns cached length, not disk length + def test_lazy_categorical_dtype_head_tail_categories(cat_fifty_store): - """Test LazyCategoricalDtype.head_categories and tail_categories for partial reads.""" + """Test head_categories and tail_categories perform partial reads without loading all.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype lazy_cat = read_elem_lazy(cat_fifty_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) - # Test head_categories (first n) + # Verify categories not loaded initially + assert "categories" not in dtype.__dict__ + + # Test head_categories (first n) - should NOT load all categories first5 = dtype.head_categories(5) assert len(first5) == 5 assert list(first5) == [f"Type_{i:02d}" for i in range(5)] + assert "categories" not in dtype.__dict__ # Still not fully loaded # Test head_categories default (first 5) default_head = dtype.head_categories() assert len(default_head) == 5 assert list(default_head) == [f"Type_{i:02d}" for i in range(5)] + assert "categories" not in dtype.__dict__ # Still not fully loaded - # Test tail_categories (last n) + # Test tail_categories (last n) - should NOT load all categories last3 = dtype.tail_categories(3) assert len(last3) == 3 assert list(last3) == [f"Type_{i:02d}" for i in range(47, 50)] + assert "categories" not in dtype.__dict__ # Still not fully loaded # Test tail_categories default (last 5) default_tail = dtype.tail_categories() assert len(default_tail) == 5 assert list(default_tail) == [f"Type_{i:02d}" for i in range(45, 50)] + assert "categories" not in dtype.__dict__ # Still not fully loaded # Test requesting more than available all_head = dtype.head_categories(100) @@ -533,13 +552,16 @@ def test_lazy_categorical_dtype_repr(cat_large_store, cat_small_store): def test_lazy_categorical_dtype_equality(cat_small_store): - """Test LazyCategoricalDtype equality comparisons.""" + """Test LazyCategoricalDtype equality comparisons and basic properties.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype lazy_cat = read_elem_lazy(cat_small_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) + # Test name property (inherited from CategoricalDtype) + assert dtype.name == "category" + # Test string comparison (dtype == "category") assert dtype == "category" assert dtype != "int64" @@ -556,6 +578,11 @@ def test_lazy_categorical_dtype_equality(cat_small_store): ordered_dtype = pd.CategoricalDtype(categories=["a", "b", "c"], ordered=True) assert dtype != ordered_dtype + # Test comparison with CategoricalDtype with None categories + # LazyCategoricalDtype always has categories, so should not equal None-categories dtype + dtype_none = pd.CategoricalDtype(categories=None, ordered=False) + assert dtype != dtype_none + # Test comparison with non-CategoricalDtype assert dtype != np.dtype("int64") assert dtype != 123 @@ -686,52 +713,6 @@ def test_lazy_categorical_dtype_hash(cat_small_store): assert dtype in s -def test_lazy_categorical_dtype_n_categories_from_cache(cat_medium_store): - """Test n_categories returns from cache when categories already loaded.""" - from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - - lazy_cat = read_elem_lazy(cat_medium_store) - dtype = lazy_cat.dtype - assert isinstance(dtype, LazyCategoricalDtype) - - # Load categories first - cats = dtype.categories - assert cats is not None - - # Verify n_categories uses cache by modifying the cached value - dtype.__dict__["categories"] = pd.Index(["x", "y", "z"]) - assert dtype.n_categories == 3 # Returns cached length, not disk length - - -def test_lazy_categorical_dtype_name(cat_small_store): - """Test LazyCategoricalDtype.name property (inherited from CategoricalDtype).""" - from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - - lazy_cat = read_elem_lazy(cat_small_store) - dtype = lazy_cat.dtype - assert isinstance(dtype, LazyCategoricalDtype) - - # name should be "category" (inherited from CategoricalDtype) - assert dtype.name == "category" - - -def test_lazy_categorical_dtype_inequality_with_none_categories( - cat_small_store, -): - """Test LazyCategoricalDtype is not equal to CategoricalDtype with None categories.""" - from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - - lazy_cat = read_elem_lazy(cat_small_store) - dtype = lazy_cat.dtype - assert isinstance(dtype, LazyCategoricalDtype) - - # Regular CategoricalDtype without categories set - dtype_none = pd.CategoricalDtype(categories=None, ordered=False) - - # LazyCategoricalDtype always has categories, so should not equal None-categories dtype - assert dtype != dtype_none - - def test_nullable_string_index_decoding(tmp_path: Path): """Test that nullable string indices are properly decoded from bytes. From 6259d1476937da16656fa0a256f32ca121965aa2 Mon Sep 17 00:00:00 2001 From: Dominik Date: Tue, 13 Jan 2026 15:08:16 -0800 Subject: [PATCH 16/20] test: fix misleading comment about hash requirement --- tests/lazy/test_read.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 45a1b9229..26fb25868 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -704,7 +704,7 @@ def test_lazy_categorical_dtype_hash(cat_small_store): dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) - # Should be hashable (required for pandas internals) + # Should be hashable (useful for collecting unique dtypes in sets/dicts) h = hash(dtype) assert isinstance(h, int) From 87d399b52d1baf90c4715151f86c126a7b16a9b4 Mon Sep 17 00:00:00 2001 From: "Dominik J. Otto" Date: Fri, 23 Jan 2026 11:09:22 -0800 Subject: [PATCH 17/20] simplify LazyCategoricalDtype comparison Co-authored-by: Ilan Gold --- src/anndata/experimental/backed/_lazy_arrays.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 78a6d0526..6ca1ad8c4 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -205,14 +205,9 @@ def __hash__(self) -> int: def __eq__(self, other) -> bool: if isinstance(other, LazyCategoricalDtype): - if self._ordered_flag != other._ordered_flag: - return False - # Fast path: same Python object - if self._categories_elem is other._categories_elem: - return True - # Fast path: zarr/h5py arrays compare equal by location - if self._get_categories_array() == other._get_categories_array(): - return True + has_same_ordering = self._ordered_flag == other._ordered_flag + are_arrays_equal = (self._categories_elem is other._categories_elem) or (self._get_categories_array() == other._get_categories_array()) + return has_same_ordering and are_arrays_equal # Defer to pandas base implementation for all other comparisons # This handles string comparison ("category"), CategoricalDtype comparisons, # and all edge cases (None categories, ordered vs unordered, etc.) From f740784ccc37440ce216db356d50e4aa4d3b6311 Mon Sep 17 00:00:00 2001 From: Dominik Date: Fri, 23 Jan 2026 11:10:26 -0800 Subject: [PATCH 18/20] increase number of preview categories in LazyCategoricalDtype --- src/anndata/experimental/backed/_lazy_arrays.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 6ca1ad8c4..26b790b74 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -21,7 +21,7 @@ ) # Number of categories to show at head/tail in LazyCategoricalDtype repr -_N_CATEGORIES_REPR_SHOW = 3 +_N_CATEGORIES_REPR_SHOW = 10 if TYPE_CHECKING: from pathlib import Path @@ -205,9 +205,11 @@ def __hash__(self) -> int: def __eq__(self, other) -> bool: if isinstance(other, LazyCategoricalDtype): - has_same_ordering = self._ordered_flag == other._ordered_flag - are_arrays_equal = (self._categories_elem is other._categories_elem) or (self._get_categories_array() == other._get_categories_array()) - return has_same_ordering and are_arrays_equal + has_same_ordering = self._ordered_flag == other._ordered_flag + are_arrays_equal = (self._categories_elem is other._categories_elem) or ( + self._get_categories_array() == other._get_categories_array() + ) + return has_same_ordering and are_arrays_equal # Defer to pandas base implementation for all other comparisons # This handles string comparison ("category"), CategoricalDtype comparisons, # and all edge cases (None categories, ordered vs unordered, etc.) From ac1cab524e9727c3331d0b10a669acd85f4eb6b3 Mon Sep 17 00:00:00 2001 From: Dominik Date: Fri, 23 Jan 2026 11:33:01 -0800 Subject: [PATCH 19/20] test: consolidate categorical fixtures with factory pattern Reduce repetition in categorical test fixtures by using a config-driven factory pattern instead of separate fixture groups for each category size. Changes: - Replace 15 individual fixtures with 3 generated fixtures + 1 data fixture - Consolidate n50 and n100 into single n100 config (serves both use cases) - Use `_make_cat_fixture()` factory for zarr/h5ad parametrization - Update tests to use new fixture names (cat_n3_store, cat_n100_store) Addresses review feedback about fixture repetitiveness. --- tests/lazy/test_read.py | 247 ++++++++++++---------------------------- 1 file changed, 72 insertions(+), 175 deletions(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 26fb25868..5a0edea46 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -249,176 +249,73 @@ def test_chunks_df( assert arr.chunksize == expected_chunks -# Session-scoped fixtures for categorical data (write once, read many) -# Each category type has zarr and h5ad path fixtures, plus a parametrized store fixture +# Session-scoped categorical fixtures parametrized by (n_categories, ordered) +# Data is written once per session; stores are opened per-test with backend parametrization - -def _write_categorical_zarr(tmp_path_factory, name: str, cat: pd.Categorical) -> Path: - """Helper to write categorical to zarr and return path.""" - path = tmp_path_factory.mktemp(f"{name}.zarr") - store = zarr.open(path, mode="w") - write_elem(store, "cat", cat) - return path - - -def _write_categorical_h5ad(tmp_path_factory, name: str, cat: pd.Categorical) -> Path: - """Helper to write categorical to h5ad and return path.""" - path = tmp_path_factory.mktemp(name) / "cat.h5ad" - with h5py.File(path, mode="w") as f: - write_elem(f, "cat", cat) - return path - - -def _open_categorical_store(path: Path, backend: str): - """Helper to open categorical store for either backend.""" - if backend == "zarr": - return zarr.open(path, mode="r")["cat"] - else: - return h5py.File(path, mode="r")["cat"] - - -# Small categorical ['a', 'b', 'c'] -@pytest.fixture(scope="session") -def cat_small_path_zarr(tmp_path_factory) -> Path: - return _write_categorical_zarr( - tmp_path_factory, "cat_small", pd.Categorical(["a", "b", "c"]) - ) +# Configuration: (name, n_categories, ordered, category_names) +_CAT_CONFIGS: list[tuple[str, int, bool, list[str] | None]] = [ + ("n3", 3, False, ["a", "b", "c"]), # basic tests, equality, hashing + ("n100", 100, False, None), # truncation, n_categories, head/tail + ("ordered", 3, True, ["low", "medium", "high"]), # ordered categories +] @pytest.fixture(scope="session") -def cat_small_path_h5ad(tmp_path_factory) -> Path: - return _write_categorical_h5ad( - tmp_path_factory, "cat_small", pd.Categorical(["a", "b", "c"]) - ) - - -@pytest.fixture(params=["zarr", "h5ad"]) -def cat_small_store(request, cat_small_path_zarr: Path, cat_small_path_h5ad: Path): - """Parametrized fixture: small categorical ['a', 'b', 'c'] for both backends.""" - path = cat_small_path_zarr if request.param == "zarr" else cat_small_path_h5ad - store = _open_categorical_store(path, request.param) - yield store - if request.param == "h5ad": - store.file.close() - - -# Medium categorical ['a', 'b', 'c', 'd', 'e'] -@pytest.fixture(scope="session") -def cat_medium_path_zarr(tmp_path_factory) -> Path: - return _write_categorical_zarr( - tmp_path_factory, "cat_medium", pd.Categorical(["a", "b", "c", "d", "e"]) - ) - +def cat_data_paths(tmp_path_factory) -> dict[tuple[str, str], Path]: + """Create all categorical test data once per session, return paths dict.""" + base = tmp_path_factory.mktemp("categorical_data") + paths: dict[tuple[str, str], Path] = {} -@pytest.fixture(scope="session") -def cat_medium_path_h5ad(tmp_path_factory) -> Path: - return _write_categorical_h5ad( - tmp_path_factory, "cat_medium", pd.Categorical(["a", "b", "c", "d", "e"]) - ) + for name, n_cat, ordered, cat_names in _CAT_CONFIGS: + categories = cat_names or [f"cat_{i:02d}" for i in range(n_cat)] + cat = pd.Categorical(categories, categories=categories, ordered=ordered) + # Write zarr + zarr_path = base / f"{name}.zarr" + store = zarr.open(zarr_path, mode="w") + write_elem(store, "cat", cat) + paths[(name, "zarr")] = zarr_path -@pytest.fixture(params=["zarr", "h5ad"]) -def cat_medium_store(request, cat_medium_path_zarr: Path, cat_medium_path_h5ad: Path): - """Parametrized fixture: medium categorical for both backends.""" - path = cat_medium_path_zarr if request.param == "zarr" else cat_medium_path_h5ad - store = _open_categorical_store(path, request.param) - yield store - if request.param == "h5ad": - store.file.close() + # Write h5ad + h5_path = base / f"{name}.h5ad" + with h5py.File(h5_path, mode="w") as f: + write_elem(f, "cat", cat) + paths[(name, "h5ad")] = h5_path + return paths -# Large categorical with 100 categories -@pytest.fixture(scope="session") -def cat_large_path_zarr(tmp_path_factory) -> Path: - categories = [f"cat_{i}" for i in range(100)] - return _write_categorical_zarr( - tmp_path_factory, "cat_large", pd.Categorical(categories) - ) - -@pytest.fixture(scope="session") -def cat_large_path_h5ad(tmp_path_factory) -> Path: - categories = [f"cat_{i}" for i in range(100)] - return _write_categorical_h5ad( - tmp_path_factory, "cat_large", pd.Categorical(categories) - ) - - -@pytest.fixture(params=["zarr", "h5ad"]) -def cat_large_store(request, cat_large_path_zarr: Path, cat_large_path_h5ad: Path): - """Parametrized fixture: large categorical (100 categories) for both backends.""" - path = cat_large_path_zarr if request.param == "zarr" else cat_large_path_h5ad - store = _open_categorical_store(path, request.param) - yield store - if request.param == "h5ad": - store.file.close() - - -# Ordered categorical ['low', 'medium', 'high'] -@pytest.fixture(scope="session") -def cat_ordered_path_zarr(tmp_path_factory) -> Path: - cat = pd.Categorical( - ["low", "medium", "high"] * 3 + ["low"], - categories=["low", "medium", "high"], - ordered=True, - ) - return _write_categorical_zarr(tmp_path_factory, "cat_ordered", cat) - - -@pytest.fixture(scope="session") -def cat_ordered_path_h5ad(tmp_path_factory) -> Path: - cat = pd.Categorical( - ["low", "medium", "high"] * 3 + ["low"], - categories=["low", "medium", "high"], - ordered=True, - ) - return _write_categorical_h5ad(tmp_path_factory, "cat_ordered", cat) +def _open_cat_store(path: Path, backend: str): + """Open categorical store for either backend.""" + if backend == "zarr": + return zarr.open(path, mode="r")["cat"] + return h5py.File(path, mode="r")["cat"] -@pytest.fixture(params=["zarr", "h5ad"]) -def cat_ordered_store( - request, cat_ordered_path_zarr: Path, cat_ordered_path_h5ad: Path -): - """Parametrized fixture: ordered categorical for both backends.""" - path = cat_ordered_path_zarr if request.param == "zarr" else cat_ordered_path_h5ad - store = _open_categorical_store(path, request.param) - yield store - if request.param == "h5ad": - store.file.close() +def _make_cat_fixture(config_name: str): + """Factory to create categorical store fixtures with zarr/h5ad parametrization.""" + @pytest.fixture(params=["zarr", "h5ad"]) + def _fixture(request, cat_data_paths): + path = cat_data_paths[(config_name, request.param)] + store = _open_cat_store(path, request.param) + yield store + if request.param == "h5ad": + store.file.close() -# 50 categories for head/tail testing -@pytest.fixture(scope="session") -def cat_fifty_path_zarr(tmp_path_factory) -> Path: - categories = [f"Type_{i:02d}" for i in range(50)] - return _write_categorical_zarr( - tmp_path_factory, "cat_fifty", pd.Categorical(categories) - ) + return _fixture -@pytest.fixture(scope="session") -def cat_fifty_path_h5ad(tmp_path_factory) -> Path: - categories = [f"Type_{i:02d}" for i in range(50)] - return _write_categorical_h5ad( - tmp_path_factory, "cat_fifty", pd.Categorical(categories) - ) - +cat_n3_store = _make_cat_fixture("n3") +cat_n100_store = _make_cat_fixture("n100") +cat_ordered_store = _make_cat_fixture("ordered") -@pytest.fixture(params=["zarr", "h5ad"]) -def cat_fifty_store(request, cat_fifty_path_zarr: Path, cat_fifty_path_h5ad: Path): - """Parametrized fixture: 50 categories for head/tail testing, both backends.""" - path = cat_fifty_path_zarr if request.param == "zarr" else cat_fifty_path_h5ad - store = _open_categorical_store(path, request.param) - yield store - if request.param == "h5ad": - store.file.close() - -def test_lazy_categorical_dtype_n_categories(cat_large_store): +def test_lazy_categorical_dtype_n_categories(cat_n100_store): """Test n_categories is cheap (metadata only) and uses cache when loaded.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - lazy_cat = read_elem_lazy(cat_large_store) + lazy_cat = read_elem_lazy(cat_n100_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -438,11 +335,11 @@ def test_lazy_categorical_dtype_n_categories(cat_large_store): assert dtype.n_categories == 3 # Returns cached length, not disk length -def test_lazy_categorical_dtype_head_tail_categories(cat_fifty_store): +def test_lazy_categorical_dtype_head_tail_categories(cat_n100_store): """Test head_categories and tail_categories perform partial reads without loading all.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - lazy_cat = read_elem_lazy(cat_fifty_store) + lazy_cat = read_elem_lazy(cat_n100_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -452,42 +349,42 @@ def test_lazy_categorical_dtype_head_tail_categories(cat_fifty_store): # Test head_categories (first n) - should NOT load all categories first5 = dtype.head_categories(5) assert len(first5) == 5 - assert list(first5) == [f"Type_{i:02d}" for i in range(5)] + assert list(first5) == [f"cat_{i:02d}" for i in range(5)] assert "categories" not in dtype.__dict__ # Still not fully loaded # Test head_categories default (first 5) default_head = dtype.head_categories() assert len(default_head) == 5 - assert list(default_head) == [f"Type_{i:02d}" for i in range(5)] + assert list(default_head) == [f"cat_{i:02d}" for i in range(5)] assert "categories" not in dtype.__dict__ # Still not fully loaded # Test tail_categories (last n) - should NOT load all categories last3 = dtype.tail_categories(3) assert len(last3) == 3 - assert list(last3) == [f"Type_{i:02d}" for i in range(47, 50)] + assert list(last3) == [f"cat_{i:02d}" for i in range(97, 100)] assert "categories" not in dtype.__dict__ # Still not fully loaded # Test tail_categories default (last 5) default_tail = dtype.tail_categories() assert len(default_tail) == 5 - assert list(default_tail) == [f"Type_{i:02d}" for i in range(45, 50)] + assert list(default_tail) == [f"cat_{i:02d}" for i in range(95, 100)] assert "categories" not in dtype.__dict__ # Still not fully loaded # Test requesting more than available - all_head = dtype.head_categories(100) - assert len(all_head) == 50 - assert list(all_head) == [f"Type_{i:02d}" for i in range(50)] + all_head = dtype.head_categories(200) + assert len(all_head) == 100 + assert list(all_head) == [f"cat_{i:02d}" for i in range(100)] - all_tail = dtype.tail_categories(100) - assert len(all_tail) == 50 - assert list(all_tail) == [f"Type_{i:02d}" for i in range(50)] + all_tail = dtype.tail_categories(200) + assert len(all_tail) == 100 + assert list(all_tail) == [f"cat_{i:02d}" for i in range(100)] -def test_lazy_categorical_dtype_categories_caching(cat_medium_store): +def test_lazy_categorical_dtype_categories_caching(cat_n3_store): """Test that categories are cached after full load.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - lazy_cat = read_elem_lazy(cat_medium_store) + lazy_cat = read_elem_lazy(cat_n3_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -497,7 +394,7 @@ def test_lazy_categorical_dtype_categories_caching(cat_medium_store): # Load categories cats = dtype.categories assert cats is not None - assert list(cats) == ["a", "b", "c", "d", "e"] + assert list(cats) == ["a", "b", "c"] # After loading, should be cached in __dict__ (cached_property pattern) assert "categories" in dtype.__dict__ @@ -523,12 +420,12 @@ def test_lazy_categorical_dtype_ordered(cat_ordered_store): assert list(dtype.categories) == ["low", "medium", "high"] -def test_lazy_categorical_dtype_repr(cat_large_store, cat_small_store): +def test_lazy_categorical_dtype_repr(cat_n100_store, cat_n3_store): """Test LazyCategoricalDtype repr shows truncated categories.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype # Test large number of categories (truncated repr) - lazy_cat = read_elem_lazy(cat_large_store) + lazy_cat = read_elem_lazy(cat_n100_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -536,11 +433,11 @@ def test_lazy_categorical_dtype_repr(cat_large_store, cat_small_store): assert "LazyCategoricalDtype" in repr_str assert "n=100" in repr_str assert "..." in repr_str # Truncation indicator - assert "cat_0" in repr_str # Head category + assert "cat_00" in repr_str # Head category assert "cat_99" in repr_str # Tail category # Test small number of categories (full repr) - small_lazy_cat = read_elem_lazy(cat_small_store) + small_lazy_cat = read_elem_lazy(cat_n3_store) small_dtype = small_lazy_cat.dtype small_repr = repr(small_dtype) @@ -551,11 +448,11 @@ def test_lazy_categorical_dtype_repr(cat_large_store, cat_small_store): assert "'c'" in small_repr -def test_lazy_categorical_dtype_equality(cat_small_store): +def test_lazy_categorical_dtype_equality(cat_n3_store): """Test LazyCategoricalDtype equality comparisons and basic properties.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - lazy_cat = read_elem_lazy(cat_small_store) + lazy_cat = read_elem_lazy(cat_n3_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) @@ -591,7 +488,7 @@ def test_lazy_categorical_dtype_equality(cat_small_store): @pytest.mark.parametrize("backend", ["zarr", "h5ad"]) def test_lazy_categorical_dtype_equality_no_load( - cat_small_path_zarr: Path, cat_small_path_h5ad: Path, backend: str + cat_data_paths: dict[tuple[str, str], Path], backend: str ): """Test same-location equality doesn't load category data. @@ -601,14 +498,14 @@ def test_lazy_categorical_dtype_equality_no_load( """ from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype + path = cat_data_paths[("n3", backend)] + if backend == "zarr": - path = cat_small_path_zarr def open_store(p): return zarr.open(p, mode="r")["cat"] else: - path = cat_small_path_h5ad # Keep h5py files open for the duration of the test open_store = lambda p: h5py.File(p, mode="r")["cat"] @@ -696,11 +593,11 @@ def test_lazy_categorical_roundtrip_via_anndata(tmp_path: Path): assert loaded.obs["ordered_cat"].equals(adata.obs["ordered_cat"]) -def test_lazy_categorical_dtype_hash(cat_small_store): +def test_lazy_categorical_dtype_hash(cat_n3_store): """Test LazyCategoricalDtype is hashable.""" from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype - lazy_cat = read_elem_lazy(cat_small_store) + lazy_cat = read_elem_lazy(cat_n3_store) dtype = lazy_cat.dtype assert isinstance(dtype, LazyCategoricalDtype) From edb04fc2d4cf4a2a2c4caf74a4beedbcdc4c7efd Mon Sep 17 00:00:00 2001 From: Dominik Date: Fri, 23 Jan 2026 11:47:13 -0800 Subject: [PATCH 20/20] test: improve equality_no_load test with read_elem patching - Switch from patching __getitem__ to patching read_elem (more reliable) - Add positive control: comparison with pd.CategoricalDtype triggers read_elem - This proves both that the optimization works AND that the patch detects loads --- tests/lazy/test_read.py | 43 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 5a0edea46..23485aa68 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -492,24 +492,21 @@ def test_lazy_categorical_dtype_equality_no_load( ): """Test same-location equality doesn't load category data. - Both h5py (HDF5 object ID comparison) and zarr 3.x (StorePath comparison) use - location-based equality that doesn't read array contents. This test verifies - that behavior by patching __getitem__ to raise if called. + LazyCategoricalDtype uses location-based comparison to avoid loading categories: + - zarr: StorePath comparison + - h5py: HDF5 object ID comparison + + We patch read_elem to verify no data is loaded during comparison. """ from anndata.experimental.backed._lazy_arrays import LazyCategoricalDtype path = cat_data_paths[("n3", backend)] if backend == "zarr": - - def open_store(p): - return zarr.open(p, mode="r")["cat"] - + open_store = lambda p: zarr.open(p, mode="r")["cat"] else: - # Keep h5py files open for the duration of the test open_store = lambda p: h5py.File(p, mode="r")["cat"] - # Open the same file twice to get different Python objects pointing to same location store1 = open_store(path) store2 = open_store(path) dtype1 = read_elem_lazy(store1).dtype @@ -517,33 +514,19 @@ def open_store(p): assert isinstance(dtype1, LazyCategoricalDtype) assert isinstance(dtype2, LazyCategoricalDtype) - # Verify these are different Python objects assert dtype1._categories_elem is not dtype2._categories_elem - # Patch __getitem__ to raise if data is loaded during comparison - cat_arr1 = dtype1._get_categories_array() - cat_arr2 = dtype2._get_categories_array() + # Same-location comparison should NOT call read_elem + with patch("anndata.io.read_elem", side_effect=AssertionError("read_elem called")): + assert dtype1 == dtype2 + # Positive control: comparison with regular CategoricalDtype DOES call read_elem with ( - patch.object( - cat_arr1, - "__getitem__", - side_effect=AssertionError("Data was loaded from arr1"), - ), - patch.object( - cat_arr2, - "__getitem__", - side_effect=AssertionError("Data was loaded from arr2"), - ), + pytest.raises(AssertionError, match="read_elem called"), + patch("anndata.io.read_elem", side_effect=AssertionError("read_elem called")), ): - # This should use location-based comparison without triggering __getitem__ - assert dtype1 == dtype2 - - # Also verify our cache wasn't populated - assert "categories" not in dtype1.__dict__ - assert "categories" not in dtype2.__dict__ + dtype1 == pd.CategoricalDtype(categories=["a", "b", "c"]) # noqa: B015 - # Clean up h5py file handles if backend == "h5ad": store1.file.close() store2.file.close()