Use a kernel for embedding density (#590)

Intron7 · web-flow · commit 119007fda2ff · 2026-02-26T17:41:07.000+01:00
* first test

* fix kernel to run like scipy

* add release note
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -83,6 +83,7 @@ if (RSC_BUILD_EXTENSIONS)
   add_nb_cuda_module(_pv_cuda           src/rapids_singlecell/_cuda/pv/pv.cu)
   add_nb_cuda_module(_edistance_cuda    src/rapids_singlecell/_cuda/edistance/edistance.cu)
   add_nb_cuda_module(_hvg_cuda          src/rapids_singlecell/_cuda/hvg/hvg.cu)
+  add_nb_cuda_module(_kde_cuda          src/rapids_singlecell/_cuda/kde/kde.cu)
   add_nb_cuda_module(_wilcoxon_cuda     src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
   # Harmony CUDA modules
   add_nb_cuda_module(_harmony_scatter_cuda   src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
diff --git a/docs/release-notes/0.15.0.md b/docs/release-notes/0.15.0.md
@@ -3,6 +3,7 @@
 ```{rubric} Features
 ```
 * Improves numerical accuracy and adds parameters to `tl.rank_genes_groups` Wilcoxon methods: uses ``erfc`` for p-values to avoid underflow, adds ``tie_correct`` and ``use_continuity`` to ``wilcoxon_binned``, and refactors ``Aggregate`` with a unified ``count_mean_var()`` dispatcher and raw ``sq_sum`` output for GPU-resident stats computation {pr}`585` {smaller}`S Dicks`
+* Replaces cuML KDE in ``tl.embedding_density`` with a custom CUDA kernel using covariance-aware Gaussian KDE matching ``scipy.stats.gaussian_kde``, removing the cuML dependency and the ``batchsize`` parameter {pr}`590` {smaller}`S Dicks`
 
 ```{rubric} Removals
 ```
diff --git a/src/rapids_singlecell/_cuda/__init__.py b/src/rapids_singlecell/_cuda/__init__.py
@@ -30,6 +30,7 @@
     "_harmony_pen_cuda",
     "_harmony_scatter_cuda",
     "_hvg_cuda",
+    "_kde_cuda",
     "_ligrec_cuda",
     "_mean_var_cuda",
     "_nanmean_cuda",
diff --git a/src/rapids_singlecell/_cuda/kde/kde.cu b/src/rapids_singlecell/_cuda/kde/kde.cu
@@ -0,0 +1,48 @@
+#include "kernels_kde.cuh"
+#include "../nb_types.h"
+
+using namespace nb::literals;
+
+// Enqueues gaussian_kde_2d_kernel on `stream` with one thread per point.
+//
+//   xy     : read by the kernel as n interleaved (x, y) pairs (2 * n values)
+//   out    : receives n values, one per point
+//   a, b, c: coefficients of the quadratic form a*dx^2 + b*dx*dy + c*dy^2
+//            evaluated by the kernel
+//   stream : stream the launch is enqueued on; this function does not wait
+template <typename T>
+inline void launch_gaussian_kde_2d(const T* xy, T* out, int n, T a, T b, T c,
+                                   cudaStream_t stream) {
+    // A zero-sized grid is an invalid launch configuration, so an empty
+    // input returns before the launch instead of recording a CUDA error.
+    if (n <= 0) return;
+
+    constexpr int threads = 256;
+    const int blocks = (n + threads - 1) / threads;
+    gaussian_kde_2d_kernel<<<blocks, threads, 0, stream>>>(xy, out, n, a, b, c);
+}
+
+// Python bindings: two overloads of `gaussian_kde_2d`, one per precision.
+NB_MODULE(_kde_cuda, m) {
+    // float32 overload.
+    m.def(
+        "gaussian_kde_2d",
+        [](cuda_array_c<const float> xy, cuda_array_c<float> out, int n,
+           float a, float b, float c, std::uintptr_t stream) {
+            launch_gaussian_kde_2d(xy.data(), out.data(), n, a, b, c,
+                                   (cudaStream_t)stream);
+        },
+        "xy"_a, nb::kw_only(), "out"_a, "n"_a, "a"_a, "b"_a, "c"_a,
+        "stream"_a = 0);
+
+    // float64 overload.
+    m.def(
+        "gaussian_kde_2d",
+        [](cuda_array_c<const double> xy, cuda_array_c<double> out, int n,
+           double a, double b, double c, std::uintptr_t stream) {
+            launch_gaussian_kde_2d(xy.data(), out.data(), n, a, b, c,
+                                   (cudaStream_t)stream);
+        },
+        "xy"_a, nb::kw_only(), "out"_a, "n"_a, "a"_a, "b"_a, "c"_a,
+        "stream"_a = 0);
+}
diff --git a/src/rapids_singlecell/_cuda/kde/kernels_kde.cuh b/src/rapids_singlecell/_cuda/kde/kernels_kde.cuh
@@ -0,0 +1,65 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <math_constants.h>
+
+// Negative infinity for floating-point type T; specialized for float and double.
+template <typename T>
+__device__ __forceinline__ T neg_infinity();
+
+template <>
+__device__ __forceinline__ float neg_infinity<float>() {
+    return -CUDART_INF_F;
+}
+
+template <>
+__device__ __forceinline__ double neg_infinity<double>() {
+    return -CUDART_INF;
+}
+
+// For every point i, writes
+//     out[i] = log(sum_j exp(a*dx^2 + b*dx*dy + c*dy^2))
+// with dx = x_i - x_j and dy = y_i - y_j, summed over all n points.
+//
+// Launch layout: 1D, one thread per output element; threads whose global
+// index is >= n exit immediately.
+//   xy  : n interleaved (x, y) pairs
+//   out : n results
+//
+// The sum is a streaming log-sum-exp: running_max is the largest exponent
+// seen so far and running_sum the sum of exp(term - running_max), rescaled
+// whenever a new maximum appears.
+template <typename T>
+__global__ void gaussian_kde_2d_kernel(const T* __restrict__ xy,
+                                       T* __restrict__ out, const int n,
+                                       const T a, const T b, const T c) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= n) return;
+
+    const T xi = xy[2 * i];
+    const T yi = xy[2 * i + 1];
+
+    const T neg_inf = neg_infinity<T>();
+    T running_max = neg_inf;
+    T running_sum = T(0);
+
+    for (int j = 0; j < n; j++) {
+        const T dx = xi - xy[2 * j];
+        const T dy = yi - xy[2 * j + 1];
+        const T log_k = a * dx * dx + b * dx * dy + c * dy * dy;
+
+        if (log_k > running_max) {
+            // New maximum: rescale the partial sum to the new reference.
+            running_sum = running_sum * exp(running_max - log_k) + T(1);
+            running_max = log_k;
+        } else if (log_k != neg_inf) {
+            // A term of -inf adds exactly zero, so it is skipped: while
+            // running_max is still -inf, exp(log_k - running_max) would be
+            // exp(NaN) and poison the sum. NaN terms compare unequal to
+            // -inf, so they still propagate to the output.
+            running_sum += exp(log_k - running_max);
+        }
+    }
+
+    out[i] = log(running_sum) + running_max;
+}
diff --git a/src/rapids_singlecell/tools/_embedding_density.py b/src/rapids_singlecell/tools/_embedding_density.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import math
 from typing import TYPE_CHECKING
 
 import cupy as cp
@@ -20,7 +19,6 @@ def embedding_density(
     *,
     groupby: str | None = None,
     key_added: str | None = None,
-    batchsize: int = 10000,
     components: str | Sequence[str] = None,
 ) -> None:
     """\
@@ -34,10 +32,6 @@ def embedding_density(
     the same category.
     This function was written by Sophie Tritschler and implemented into
     Scanpy by Malte Luecken.
-    This function uses cuML's KernelDensity. It returns log Likelihood as does
-    sklearn's implementation. scipy.stats implementation, used
-    in scanpy, returns PDF.
-
     Parameters
     ----------
         adata
@@ -51,8 +45,6 @@ def embedding_density(
         key_added
             Name of the `.obs` covariate that will be added with the density
             estimates.
-        batchsize
-            Number of cells that should be processed together.
         components
             The embedding dimensions over which the density should be calculated.
             This is limited to two components.
@@ -76,7 +68,7 @@ def embedding_density(
     if basis == "fa":
         basis = "draw_graph_fa"
 
-    if f"X_{basis}" not in adata.obsm_keys():
+    if f"X_{basis}" not in adata.obsm:
         raise ValueError(
             "Cannot find the embedded representation "
             f"`adata.obsm['X_{basis}']`. Compute the embedding first."
@@ -117,16 +109,16 @@ def embedding_density(
             embed_x = adata.obsm[f"X_{basis}"][cat_mask, components[0]]
             embed_y = adata.obsm[f"X_{basis}"][cat_mask, components[1]]
 
-            dens_embed = _calc_density(cp.array(embed_x), cp.array(embed_y), batchsize)
+            dens_embed = _calc_density(cp.array(embed_x), cp.array(embed_y))
             density_values[cat_mask] = dens_embed
 
         adata.obs[density_covariate] = density_values
     else:  # if groupby is None
         # Calculate the density over the whole embedding without subsetting
-        embed_x = adata.obsm[f"X_{basis}"][:, components[0]]
-        embed_y = adata.obsm[f"X_{basis}"][:, components[1]]
+        embed_x = cp.asarray(adata.obsm[f"X_{basis}"][:, components[0]])
+        embed_y = cp.asarray(adata.obsm[f"X_{basis}"][:, components[1]])
 
-        adata.obs[density_covariate] = _calc_density(embed_x, embed_y, batchsize)
+        adata.obs[density_covariate] = _calc_density(embed_x, embed_y)
 
     # Reduce diffmap components for labeling
     # Note: plot_scatter takes care of correcting diffmap components
@@ -140,26 +132,62 @@ def embedding_density(
     }
 
 
-def _calc_density(x: cp.ndarray, y: cp.ndarray, batchsize: int):
+def _calc_density(x: cp.ndarray, y: cp.ndarray) -> np.ndarray:
     """\
-    Calculates the density of points in 2 dimensions.
+    Calculates the density of points in 2 dimensions using a Gaussian KDE kernel.
+
+    Uses a covariance-aware bandwidth (Scott's rule) matching
+    :class:`scipy.stats.gaussian_kde`, and min-max scales the PDF.
+    Each GPU thread computes the log-density for one query point via an
+    in-thread streaming logsumexp over all training points.  No intermediate
+    distance matrix is ever materialised.
+
+    Parameters
+    ----------
+        x
+            First embedding coordinate of every point.
+        y
+            Second embedding coordinate of every point.
+
+    Returns
+    -------
+    Host array holding the min-max scaled density at each point.
     """
-    from cuml.neighbors import KernelDensity
-
-    # Calculate the point density
-    xy = np.vstack([x, y]).T
-    bandwidth = np.power(xy.shape[0], (-1.0 / (xy.shape[1] + 4)))
-    kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(xy)
-    z = cp.zeros(xy.shape[0])
-    n_batches = math.ceil(xy.shape[0] / batchsize)
-    for batch in range(n_batches):
-        start_idx = batch * batchsize
-        stop_idx = min(batch * batchsize + batchsize, xy.shape[0])
-        z[start_idx:stop_idx] = cp.array(kde.score_samples(xy[start_idx:stop_idx, :]))
-    min_z = cp.min(z)
-    max_z = cp.max(z)
-
-    # Scale between 0 and 1
-    scaled_z = (z - min_z) / (max_z - min_z)
-
-    return scaled_z.get()
+    from rapids_singlecell._cuda import _kde_cuda
+
+    xy = cp.stack([x, y], axis=1)  # (n, 2), C-contiguous
+    # The CUDA module only provides float32 and float64 overloads, so any
+    # other input dtype (integer, float16, ...) is promoted to float64.
+    if xy.dtype not in (cp.float32, cp.float64):
+        xy = xy.astype(cp.float64)
+    n = xy.shape[0]
+    dtype = xy.dtype
+
+    # Covariance-aware bandwidth matching scipy.stats.gaussian_kde
+    scotts_factor = n ** (-1.0 / 6.0)
+    data_cov = cp.cov(xy.T)  # (2, 2)
+    inv_cov = cp.linalg.inv(scotts_factor**2 * data_cov)
+
+    # Pre-multiply so the kernel just computes a·dx² + b·dx·dy + c·dy²
+    a = -0.5 * float(inv_cov[0, 0])
+    b = -float(inv_cov[0, 1])
+    c = -0.5 * float(inv_cov[1, 1])
+
+    z = cp.empty(n, dtype=dtype)
+
+    _kde_cuda.gaussian_kde_2d(
+        xy,
+        out=z,
+        n=n,
+        a=a,
+        b=b,
+        c=c,
+        stream=cp.cuda.get_current_stream().ptr,
+    )
+
+    # Min-max scale PDF (not log-PDF) to match scipy/scanpy
+    pdf = cp.exp(z)
+    min_pdf = pdf.min()
+    scaled = (pdf - min_pdf) / (pdf.max() - min_pdf)
+
+    return scaled.get()
diff --git a/tests/test_embedding_density.py b/tests/test_embedding_density.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import scanpy as sc
 from anndata import AnnData
 
 import rapids_singlecell as rsc
@@ -153,3 +154,34 @@ def test_fa_alias():
 
     rsc.tl.embedding_density(adata, "fa")
     assert "draw_graph_fa_density" in adata.obs.columns
+
+
+@pytest.fixture
+def pbmc68k():
+    """Load scanpy's ``pbmc68k_reduced`` dataset."""
+    return sc.datasets.pbmc68k_reduced()
+
+
+@pytest.mark.parametrize("groupby", [None, "louvain"])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_matches_scanpy(pbmc68k, groupby, dtype):
+    """GPU density agrees with scanpy's on the pbmc68k_reduced UMAP."""
+    if groupby is None:
+        key = "umap_density"
+    else:
+        key = f"umap_density_{groupby}"
+    atol = 1e-12
+    if dtype == np.float32:
+        atol = 1e-6
+
+    # Run the reference (scanpy) first, then the GPU implementation, each on
+    # its own copy with the embedding cast to the requested dtype.
+    densities = []
+    for run_density in (sc.tl.embedding_density, rsc.tl.embedding_density):
+        adata = pbmc68k.copy()
+        adata.obsm["X_umap"] = adata.obsm["X_umap"].astype(dtype)
+        run_density(adata, "umap", groupby=groupby)
+        densities.append(adata.obs[key].values)
+
+    expected, actual = densities
+    np.testing.assert_allclose(actual, expected, atol=atol)