Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes/0.13.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
```
* Add support for aggregate operations on CSC matrices, Fortran-ordered arrays, and Dask with sparse CSR and dense matrices {pr}`395` {smaller}`S Dicks`
* Add Dask support for `tl.score_genes` & `tl.score_genes_cell_cycle` {pr}`408` {smaller}`S Dicks`
* Add Dask support for `tl.rank_genes_groups_logreg` {pr}`413` {smaller}`S Dicks`

```{rubric} Performance
```
Expand Down
22 changes: 18 additions & 4 deletions src/rapids_singlecell/tools/_rank_gene_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
import pandas as pd

from rapids_singlecell._compat import DaskArray, _meta_dense

if TYPE_CHECKING:
from collections.abc import Iterable

Expand Down Expand Up @@ -52,7 +54,7 @@ def rank_genes_groups_logreg(
groupby: str,
*,
groups: Literal["all"] | Iterable[str] = "all",
use_raw: bool = None,
use_raw: bool | None = None,
reference: str = "rest",
n_genes: int = None,
layer: str = None,
Expand Down Expand Up @@ -155,7 +157,6 @@ def rank_genes_groups_logreg(
# if reference is not set, then the groups listed will be compared to the rest
# if reference is set, then the groups listed will be compared only to the other groups listed
refname = reference
from cuml.linear_model import LogisticRegression

reference = groups_order[0]
if len(groups) == 1:
Expand All @@ -167,15 +168,28 @@ def rank_genes_groups_logreg(
X = X[grouping_mask.values, :]
# Indexing with a series causes issues, possibly segfault

grouping_logreg = grouping.cat.codes.to_numpy().astype("float32")
grouping_logreg = grouping.cat.codes.to_numpy().astype(X.dtype)
uniques = np.unique(grouping_logreg)
for idx, cat in enumerate(uniques):
grouping_logreg[np.where(grouping_logreg == cat)] = idx

if isinstance(X, DaskArray):
import dask.array as da
from cuml.dask.linear_model import LogisticRegression

grouping_logreg = da.from_array(
grouping_logreg,
chunks=(X.chunks[0]),
meta=_meta_dense(grouping_logreg.dtype),
)
else:
from cuml.linear_model import LogisticRegression

clf = LogisticRegression(**kwds)

clf = LogisticRegression(**kwds)
clf.fit(X, grouping_logreg)
scores_all = cp.array(clf.coef_)

if len(groups_order) == scores_all.shape[1]:
scores_all = scores_all.T
for igroup, _group in enumerate(groups_order):
Expand Down
46 changes: 46 additions & 0 deletions tests/dask/test_dask_rank_logreg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

import cupy as cp
import pandas as pd
import pytest
from scanpy.datasets import pbmc3k_processed, pbmc68k_reduced

import rapids_singlecell as rsc
from rapids_singlecell._testing import (
as_dense_cupy_dask_array,
as_sparse_cupy_dask_array,
)


@pytest.mark.parametrize("data_kind", ["sparse", "dense"])
@pytest.mark.parametrize("dtype", [cp.float32, cp.float64])
def test_rank_genes_groups_logreg(client, data_kind, dtype):
    """Compare logreg ranking scores between in-memory GPU and Dask-backed data.

    The same dataset is prepared twice: once moved to the GPU through
    ``rsc.get.anndata_to_GPU`` and once with ``X`` wrapped as a persisted
    Dask array. ``rank_genes_groups_logreg`` is run on both, and the first
    ten score rows of one group must agree within ``atol=1e-3``.

    ``client`` is only requested as a fixture and is never used directly
    (presumably it provides the Dask cluster -- confirm in conftest).
    """
    # Only the data source, Dask converter and group labels differ per kind.
    if data_kind == "dense":
        adata = pbmc68k_reduced()
        to_dask = as_dense_cupy_dask_array
        groupby, read = "bulk_labels", "Dendritic"
    elif data_kind == "sparse":
        processed = pbmc3k_processed()
        # Rebuild from ``.raw`` but keep only the variables of the processed
        # object.
        adata = processed.raw.to_adata()[:, processed.var_names].copy()
        to_dask = as_sparse_cupy_dask_array
        groupby, read = "louvain", "B cells"

    # Shared preparation: cast, fork a Dask-backed copy, then move the
    # reference object to the GPU.
    adata.X = adata.X.astype(dtype)
    dask_data = adata.copy()
    dask_data.X = to_dask(dask_data.X).persist()
    rsc.get.anndata_to_GPU(adata)

    pair = (adata, dask_data)
    for obj in pair:
        rsc.tl.rank_genes_groups_logreg(obj, groupby=groupby, use_raw=False)

    # The scores are stored as a structured array; go through a DataFrame to
    # get a plain ndarray and keep only the first ten rows.
    scores_gpu, scores_dask = (
        pd.DataFrame(obj.uns["rank_genes_groups"]["scores"][read]).to_numpy()[:10]
        for obj in pair
    )
    cp.testing.assert_allclose(scores_gpu, scores_dask, atol=1e-3)
17 changes: 17 additions & 0 deletions tests/test_rank_genes_groups_logreg.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import numpy as np
import pandas as pd
import scanpy as sc

import rapids_singlecell as rsc
Expand Down Expand Up @@ -38,3 +39,19 @@ def test_rank_genes_groups_with_renamed_categories_use_rep():

rsc.tl.rank_genes_groups_logreg(adata, "blobs")
assert not adata.uns["rank_genes_groups"]["names"][0].tolist() == ("3", "1", "0")


def test_rank_genes_groups_with_unsorted_groups():
    """Scores for a group must not depend on the order of ``groups``.

    The same blobs dataset is ranked twice with identical group members
    listed in a different order; the scores stored for ``"Three"`` have to
    be exactly equal in both runs.
    """

    def scores_for_three(obj):
        # Convert the stored scores for "Three" into a plain ndarray.
        stored = obj.uns["rank_genes_groups"]["scores"]["Three"]
        return pd.DataFrame(stored).to_numpy()

    adata = sc.datasets.blobs(n_variables=10, n_centers=5, n_observations=200)
    adata._sanitize()
    adata.rename_categories("blobs", ["Zero", "One", "Two", "Three", "Four"])
    bdata = adata.copy()

    runs = (
        (adata, ["Zero", "One", "Three"]),
        (bdata, ["One", "Three", "Zero"]),
    )
    for obj, group_order in runs:
        rsc.tl.rank_genes_groups_logreg(obj, "blobs", groups=group_order)

    np.testing.assert_equal(scores_for_three(adata), scores_for_three(bdata))
Loading