add init_pos to umap (#427)

Intron7 · flying-sheep · web-flow · commit 781be593f37c · 2025-08-12T11:35:22.000Z
* add test and implementation

* add igraph for testing

* pytest full

* update marks

* Update src/rapids_singlecell/tools/_umap.py

Co-authored-by: Philipp A. <flying-sheep@web.de>

* fix string

* adds release note

* test only minimal

---------

Co-authored-by: Philipp A. <flying-sheep@web.de>
diff --git a/.github/workflows/test-gpu-dev.yml b/.github/workflows/test-gpu-dev.yml
@@ -49,7 +49,7 @@ jobs:
 
       - name: Install rapids-singlecell
         run: >-
-          pip install -e .[test]
+          pip install -e .[test-minimal]
           "scanpy @ git+https://github.com/scverse/scanpy.git"
           "anndata @ git+https://github.com/scverse/anndata.git"
 
diff --git a/docs/release-notes/0.13.1.md b/docs/release-notes/0.13.1.md
@@ -2,6 +2,7 @@
 
 ```{rubric} Features
 ```
+* adds support in `tl.umap` for passing `init_pos` as an `ndarray`, as `'paga'`, or as a key of `.obsm` {pr}`427` {smaller}`S Dicks`
 
 ```{rubric} Performance
 ```
@@ -14,3 +15,4 @@
 
 ```{rubric} Misc
 ```
+* refactors `testing_utils` {pr}`427` {smaller}`S Dicks`
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,14 +40,18 @@ doc = [
     "dask",
     "pytest",
 ]
-test = [
+test-minimal = [
     "pytest",
     "profimp",
     "scanpy>=1.10.0",
     "bbknn",
     "decoupler",
     "fast-array-utils",
 ]
+test = [
+    "rapids_singlecell[test-minimal]",
+    "igraph",
+]
 
 [project.urls]
 Documentation = "https://rapids-singlecell.readthedocs.io"
@@ -88,8 +92,6 @@ lint.ignore = [
 "docs/*" = [ "I" ]
 "tests/*" = [ "D" ]
 "*/__init__.py" = [ "F401" ]
-"src/rapids_singlecell/decoupler_gpu/_method_mlm.py" = [ "PLR0917" ]
-"src/rapids_singlecell/decoupler_gpu/_method_wsum.py" = [ "PLR0917" ]
 [tool.ruff.lint.isort]
 known-first-party = [ "rapids_singlecell" ]
 required-imports = [ "from __future__ import annotations" ]
@@ -106,7 +108,6 @@ markers = [
 [tool.hatch.build]
 # exclude big files that don’t need to be installed
 exclude = [
-    "src/rapids_singlecell/_testing.py",
     "tests",
     "docs",
     "notebooks",
@@ -118,7 +119,7 @@ version-file = "src/rapids_singlecell/_version.py"
 source = "vcs"
 
 [tool.hatch.build.targets.wheel]
-packages = [ 'src/rapids_singlecell' ]
+packages = [ 'src/rapids_singlecell', 'src/testing' ]
 
 [tool.codespell]
 skip = '*.ipynb,*.csv'
diff --git a/src/rapids_singlecell/tools/_umap.py b/src/rapids_singlecell/tools/_umap.py
@@ -5,12 +5,15 @@
 import cuml
 import cuml.internals.logger as logger
 import cupy as cp
+import numpy as np
 from cuml.manifold.simpl_set import simplicial_set_embedding
 from cuml.manifold.umap import UMAP
 from cuml.manifold.umap_utils import find_ab_params
+from cuml.thirdparty_adapters import check_array as check_array_cuml
 from cupyx.scipy import sparse
 from packaging.version import parse as parse_version
 from scanpy._utils import NeighborsView
+from scanpy.tools._utils import get_init_pos_from_paga
 from sklearn.utils import check_random_state
 
 from rapids_singlecell._utils import _get_logger_level
@@ -20,7 +23,7 @@
 if TYPE_CHECKING:
     from anndata import AnnData
 
-_InitPos = Literal["auto", "spectral", "random"]
+_InitPos = Literal["auto", "spectral", "random", "paga"]
 
 
 def umap(
@@ -32,7 +35,7 @@ def umap(
     maxiter: int | None = None,
     alpha: float = 1.0,
     negative_sample_rate: int = 5,
-    init_pos: _InitPos = "auto",
+    init_pos: _InitPos | np.ndarray | cp.ndarray | str | None = "auto",
     random_state: int = 0,
     a: float | None = None,
     b: float | None = None,
@@ -82,6 +85,9 @@ def umap(
             * 'auto': chooses 'spectral' for `'n_samples' < 1000000`, 'random' otherwise.
             * 'spectral': use a spectral embedding of the graph.
             * 'random': assign initial embedding positions at random.
+            * 'paga': use the :func:`~scanpy.tl.paga` layout as initial embedding positions.
+            * Array of shape (n_obs, 2)
+            * Any key for :attr:`~anndata.AnnData.obsm`
 
         .. note::
             If your embedding looks odd it's recommended setting `init_pos` to 'random'.
@@ -143,8 +149,6 @@ def umap(
         **({"random_state": random_state} if random_state != 0 else {}),
     }
 
-    random_state = check_random_state(random_state)
-
     neigh_params = neighbors["params"]
     X = _choose_representation(
         adata,
@@ -167,6 +171,14 @@ def umap(
         else:
             pre_knn = None
 
+        if init_pos not in ["auto", "spectral", "random"]:
+            raise ValueError(
+                f"Invalid init_pos: {init_pos}",
+                "Valid options are: auto, spectral, random, paga for RAPIDS < 24.10",
+            )
+
+        random_state = check_random_state(random_state)
+
         if init_pos == "auto":
             init_pos = "spectral" if n_obs < 1000000 else "random"
 
@@ -192,8 +204,25 @@ def umap(
     else:
         pre_knn = neighbors["connectivities"]
 
-        if init_pos == "auto":
-            init_pos = "spectral" if n_obs < 1000000 else "random"
+        match init_pos:
+            case str() if init_pos in adata.obsm:
+                init_coords = adata.obsm[init_pos]
+            case str() if init_pos == "paga":
+                init_coords = get_init_pos_from_paga(
+                    adata, random_state=random_state, neighbors_key=neighbors_key
+                )
+            case str() if init_pos == "auto":
+                init_coords = "spectral" if n_obs < 1000000 else "random"
+            case _:
+                init_coords = init_pos
+
+        if hasattr(init_coords, "dtype"):
+            init_coords = check_array_cuml(
+                init_coords, dtype=np.float32, accept_sparse=False
+            )
+
+        random_state = check_random_state(random_state)
+
         logger_level = _get_logger_level(logger)
         X_umap = simplicial_set_embedding(
             data=cp.array(X),
@@ -204,7 +233,7 @@ def umap(
             b=b,
             negative_sample_rate=negative_sample_rate,
             n_epochs=n_epochs,
-            init=init_pos,
+            init=init_coords,
             random_state=random_state,
             metric=neigh_params.get("metric", "euclidean"),
             metric_kwds=neigh_params.get("metric_kwds", None),
diff --git a/src/testing/rapids_singlecell/_helper/__init__.py b/src/testing/rapids_singlecell/_helper/__init__.py
diff --git a/src/testing/rapids_singlecell/_pytest/__init__.py b/src/testing/rapids_singlecell/_pytest/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import annotations
+
+from .marks import needs
diff --git a/src/testing/rapids_singlecell/_pytest/marks.py b/src/testing/rapids_singlecell/_pytest/marks.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from enum import Enum, auto
+from importlib.util import find_spec
+from typing import TYPE_CHECKING
+
+import pytest
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+SKIP_EXTRA: dict[str, Callable[[], str | None]] = {}
+
+
+class QuietMarkDecorator(pytest.MarkDecorator):
+    def __init__(self, mark: pytest.Mark) -> None:
+        super().__init__(mark, _ispytest=True)
+
+
+class needs(QuietMarkDecorator, Enum):
+    """Pytest skip marker evaluated at module import.
+
+    This allows us to see the number of skipped tests at the start of a test run,
+    whereas :func:`pytest.importorskip` skips tests only after they have started running.
+    """
+
+    # _generate_next_value_ needs to come before members
+    @staticmethod
+    def _generate_next_value_(
+        name: str, start: int, count: int, last_values: list[str]
+    ) -> str:
+        """Distribution name for matching modules."""
+        return name.replace("_", "-")
+
+    mod: str
+
+    igraph = auto()
+
+    def __init__(self, mod: str) -> None:
+        self.mod = mod
+        reason = self.skip_reason
+        dec = pytest.mark.skipif(bool(reason), reason=reason or "")
+        super().__init__(dec.mark)
+
+    @property
+    def skip_reason(self) -> str | None:
+        if find_spec(self._name_):
+            if skip_extra := SKIP_EXTRA.get(self._name_):
+                return skip_extra()
+            return None
+        reason = f"needs module `{self._name_}`"
+        if self._name_.casefold() != self.mod.casefold().replace("-", "_"):
+            reason = f"{reason} (`pip install {self.mod}`)"
+        return reason
diff --git a/tests/dask/test_dask_aggr.py b/tests/dask/test_dask_aggr.py
@@ -7,7 +7,7 @@
 from scanpy.datasets import pbmc3k_processed
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/dask/test_dask_mean_var.py b/tests/dask/test_dask_mean_var.py
@@ -5,11 +5,11 @@
 from scanpy.datasets import pbmc3k, pbmc68k_reduced
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from rapids_singlecell.preprocessing._utils import _get_mean_var
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
-from rapids_singlecell.preprocessing._utils import _get_mean_var
 
 from ..test_score_genes import _create_sparse_nan_matrix  # noqa: TID252
 
diff --git a/tests/dask/test_dask_pca.py b/tests/dask/test_dask_pca.py
@@ -8,7 +8,7 @@
 from scipy import sparse
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/dask/test_dask_rank_logreg.py b/tests/dask/test_dask_rank_logreg.py
@@ -5,7 +5,7 @@
 from scanpy.datasets import pbmc3k_processed, pbmc68k_reduced
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/dask/test_dask_score_genes.py b/tests/dask/test_dask_score_genes.py
@@ -5,7 +5,7 @@
 from scanpy.datasets import pbmc3k, pbmc68k_reduced
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/dask/test_get.py b/tests/dask/test_get.py
@@ -7,7 +7,7 @@
 from scipy import sparse
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/dask/test_hvg_dask.py b/tests/dask/test_hvg_dask.py
@@ -8,7 +8,7 @@
 from scanpy.datasets import pbmc3k
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/dask/test_normalize_dask.py b/tests/dask/test_normalize_dask.py
@@ -7,7 +7,7 @@
 from scanpy.datasets import pbmc3k
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/dask/test_qc_dask.py b/tests/dask/test_qc_dask.py
@@ -7,7 +7,7 @@
 from scanpy.datasets import pbmc3k
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/dask/test_scale_dask.py b/tests/dask/test_scale_dask.py
@@ -8,7 +8,7 @@
 from scanpy.datasets import pbmc3k
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import (
+from testing.rapids_singlecell._helper import (
     as_dense_cupy_dask_array,
     as_sparse_cupy_dask_array,
 )
diff --git a/tests/test_aggregated.py b/tests/test_aggregated.py
@@ -13,7 +13,7 @@
 from scipy.sparse import csr_matrix
 
 import rapids_singlecell as rsc
-from rapids_singlecell._testing import ARRAY_TYPES_MEM
+from testing.rapids_singlecell._helper import ARRAY_TYPES_MEM
 
 
 @pytest.fixture
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
@@ -1,8 +1,13 @@
 from __future__ import annotations
 
+import cupy as cp
+import numpy as np
+import pytest
+import scanpy as sc
 from scanpy.datasets import pbmc68k_reduced
 
 from rapids_singlecell.tools import tsne, umap
+from testing.rapids_singlecell._pytest import needs
 
 
 def test_umap():
@@ -16,3 +21,28 @@ def test_tsne():
     pbmc = pbmc68k_reduced()
     tsne(pbmc)
     assert pbmc.obsm["X_tsne"].shape == (700, 2)
+
+
+@needs.igraph
+def test_umap_init_paga():
+    pbmc = pbmc68k_reduced()[:100, :].copy()
+    sc.tl.paga(pbmc)
+    sc.pl.paga(pbmc, show=False)
+    umap(pbmc, init_pos="paga")
+
+
+@pytest.mark.parametrize("init_pos", ["X_pca", "X_tsne", "numpy", "cupy"])
+def test_umap_init_pos(init_pos):
+    pbmc = pbmc68k_reduced()[:100, :].copy()
+    if init_pos == "X_pca":
+        with pytest.raises(ValueError, match="Expected 2 columns but got 50 columns."):
+            umap(pbmc, init_pos=init_pos)
+    elif init_pos == "X_tsne":
+        tsne(pbmc)
+        umap(pbmc, init_pos=init_pos)
+    else:
+        if init_pos == "numpy":
+            init_pos = np.random.random((100, 2))
+        else:
+            init_pos = cp.random.random((100, 2))
+        umap(pbmc, init_pos=init_pos)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from __future__ import annotations`
	`2`	`+`
	`3`	`+from .marks import needs`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`from scanpy.datasets import pbmc3k_processed`
`8`	`8`
`9`	`9`	`import rapids_singlecell as rsc`
`10`		`-from rapids_singlecell._testing import (`
	`10`	`+from testing.rapids_singlecell._helper import (`
`11`	`11`	`as_dense_cupy_dask_array,`
`12`	`12`	`as_sparse_cupy_dask_array,`
`13`	`13`	`)`
Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`from scipy import sparse`
`9`	`9`
`10`	`10`	`import rapids_singlecell as rsc`
`11`		`-from rapids_singlecell._testing import (`
	`11`	`+from testing.rapids_singlecell._helper import (`
`12`	`12`	`as_dense_cupy_dask_array,`
`13`	`13`	`as_sparse_cupy_dask_array,`
`14`	`14`	`)`