fix: update scdl tests for anndata 0.12.11 compatibility

svc-bionemo · svc-bionemo · commit 32c53fec3527 · 2026-05-04T10:08:34.000-07:00
anndata 0.12.11 now validates that the row count of raw X matches n_obs
when constructing AnnData objects. Tests that built AnnData(X=None,
raw=...) without obs failed because n_obs defaulted to 0 while raw X
had rows.

Fix: pass obs=pd.DataFrame(index=range(n_rows)) so anndata infers the
correct n_obs from the obs DataFrame.

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
diff --git a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py
@@ -139,8 +139,12 @@ def _make(tmp_path, dtype1: str, dtype2: str):
 
         h1 = tmp_path / "var1.h5ad"
         h2 = tmp_path / "var2.h5ad"
-        ad.AnnData(X=None, var=pd.DataFrame(index=np.arange(X1.shape[1])), raw={"X": X1}).write_h5ad(h1)
-        ad.AnnData(X=None, var=pd.DataFrame(index=np.arange(X2.shape[1])), raw={"X": X2}).write_h5ad(h2)
+        ad.AnnData(
+            obs=pd.DataFrame(index=range(X1.shape[0])), var=pd.DataFrame(index=np.arange(X1.shape[1])), raw={"X": X1}
+        ).write_h5ad(h1)
+        ad.AnnData(
+            obs=pd.DataFrame(index=range(X2.shape[0])), var=pd.DataFrame(index=np.arange(X2.shape[1])), raw={"X": X2}
+        ).write_h5ad(h2)
 
         ds1 = SingleCellMemMapDataset(tmp_path / "var_ds1", h5ad_path=h1, data_dtype=dtype1)
         ds2 = SingleCellMemMapDataset(tmp_path / "var_ds2", h5ad_path=h2, data_dtype=dtype2)
@@ -164,6 +168,7 @@ def _make(tmp_path):
         indices_small_vals = np.array([0, 11, 5, 7], dtype=np.int64)
         indptr_small_vals = np.array([0, 0, 2, 2, 4], dtype=np.int64)
         X_small = ad.AnnData(
+            obs=pd.DataFrame(index=range(n_rows_small)),
             var=pd.DataFrame(index=np.arange(n_cols_small)),
             raw={
                 "X": sp.csr_matrix(
@@ -180,6 +185,7 @@ def _make(tmp_path):
         indices_large_vals = np.array([10, 65_537], dtype=np.int64)
         indptr_large_vals = np.array([0, 1, 1, 2], dtype=np.int64)
         X_large = ad.AnnData(
+            obs=pd.DataFrame(index=range(n_rows_large)),
             var=pd.DataFrame(index=np.arange(n_cols_large)),
             raw={
                 "X": sp.csr_matrix(
diff --git a/sub-packages/bionemo-scdl/tests/bionemo/scdl/io/test_single_cell_memmap_dataset.py b/sub-packages/bionemo-scdl/tests/bionemo/scdl/io/test_single_cell_memmap_dataset.py
@@ -17,6 +17,7 @@
 
 import anndata as ad
 import numpy as np
+import pandas as pd
 import pytest
 import scipy.sparse as sp
 
@@ -112,7 +113,7 @@ def big_int_h5ad(tmp_path, big_h5ad_data):
     """Create and return the path to an h5ad with large values/columns for dtype promotion tests."""
     d = big_h5ad_data
     X = sp.csr_matrix((d["data"].astype(np.uint32), d["indices"], d["indptr"]), shape=(d["n_rows"], d["n_cols"]))
-    a = ad.AnnData(X=None, raw={"X": X})
+    a = ad.AnnData(obs=pd.DataFrame(index=range(d["n_rows"])), raw={"X": X})
     h5ad_path = tmp_path / "big_dtype.h5ad"
     a.write_h5ad(h5ad_path)
     return h5ad_path
@@ -123,7 +124,7 @@ def big_float_h5ad(tmp_path, big_h5ad_data):
     """Create and return the path to an h5ad with large values/columns for dtype promotion tests."""
     d = big_h5ad_data
     X = sp.csr_matrix((d["data"].astype("float32"), d["indices"], d["indptr"]), shape=(d["n_rows"], d["n_cols"]))
-    a = ad.AnnData(X=None, raw={"X": X})
+    a = ad.AnnData(obs=pd.DataFrame(index=range(d["n_rows"])), raw={"X": X})
     h5ad_path = tmp_path / "big_dtype.h5ad"
     a.write_h5ad(h5ad_path)
     return h5ad_path