From a8f2f0f644fcabcaa9b21b84da08e492b45277cf Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 8 Aug 2025 14:45:40 +0200 Subject: [PATCH 1/5] speed up mean_var * update harmony * fix release notes --- docs/release-notes/0.13.1.md | 15 ++++ docs/release-notes/index.md | 2 + .../_kernels/_mean_var_kernel.py | 71 +++++++++++++++++++ src/rapids_singlecell/preprocessing/_utils.py | 21 ++++-- tests/test_mean_var.py | 32 +++++++++ 5 files changed, 134 insertions(+), 7 deletions(-) create mode 100644 docs/release-notes/0.13.1.md create mode 100644 tests/test_mean_var.py diff --git a/docs/release-notes/0.13.1.md b/docs/release-notes/0.13.1.md new file mode 100644 index 00000000..fdb11c97 --- /dev/null +++ b/docs/release-notes/0.13.1.md @@ -0,0 +1,15 @@ +### 0.13.0 {small}`2025-08-06` + +```{rubric} Features +``` + +```{rubric} Performance +``` +* speed up `_get_mean_var` for the minor axis of `csx` matrices {pr}`408` {smaller}`S Dicks` + +```{rubric} Bug fixes +``` + + +```{rubric} Misc +``` diff --git a/docs/release-notes/index.md b/docs/release-notes/index.md index 059730b1..0838be0e 100644 --- a/docs/release-notes/index.md +++ b/docs/release-notes/index.md @@ -3,6 +3,8 @@ # Release notes ## Version 0.13.0 +```{include} /release-notes/0.13.1.md +``` ```{include} /release-notes/0.13.0.md ``` diff --git a/src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py b/src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py index 39c162c3..76aa6d3a 100644 --- a/src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py +++ b/src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py @@ -58,6 +58,71 @@ } """ +_get_mean_var_minor_fast_kernel = r""" +(const long long nnz, +const int* __restrict__ indices, +const {0}* __restrict__ data, +double* __restrict__ g_sum, +double* __restrict__ g_sumsq) +{ + extern __shared__ unsigned shmem[]; + unsigned HASH_SIZE = 1024; + // layout in shared: + // keys[HASH_SIZE] (uint32, 0xFFFFFFFF = empty) + // sum[HASH_SIZE] (double) + // sq[HASH_SIZE] (double) + unsigned* keys = shmem; + double* sum = (double*)(keys + HASH_SIZE); + double* sq = (double*)(sum + HASH_SIZE); + + // init table + for (int i = threadIdx.x; i < HASH_SIZE; i += blockDim.x) { + keys[i] = 0xFFFFFFFFu; + sum[i] = 0.0; + sq[i] = 0.0; + } + __syncthreads(); + + const size_t stride = (size_t)gridDim.x * blockDim.x; + for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + i < nnz; i += stride) + { + unsigned col = (unsigned)__ldg(indices + i); + double dv = (double)__ldg(data + i); + double d2 = dv * dv; + + unsigned h = (col * 2654435761u) & (HASH_SIZE - 1); + bool done = false; + + #pragma unroll 8 + for (int probe = 0; probe < 8; ++probe) { + unsigned pos = (h + probe) & (HASH_SIZE - 1); + unsigned key = atomicCAS(&keys[pos], 0xFFFFFFFFu, col); + if (key == 0xFFFFFFFFu || key == col) { + atomicAdd(&sum[pos], dv); + atomicAdd(&sq[pos], d2); + done = true; + break; + } + } + if (!done) { + atomicAdd(&g_sum[col], dv); + atomicAdd(&g_sumsq[col], d2); + } + } + __syncthreads(); + + // flush + for (int i = threadIdx.x; i < HASH_SIZE; i += blockDim.x) { + unsigned key = keys[i]; + if (key != 0xFFFFFFFFu) { + atomicAdd(&g_sum[key], sum[i]); + atomicAdd(&g_sumsq[key], sq[i]); + } + } +} +""" + sq_sum = cp.ReductionKernel( "T x", # input params @@ -90,3 +155,9 @@ def _get_mean_var_minor(dtype): return cuda_kernel_factory( _get_mean_var_minor_kernel, (dtype,), "_get_mean_var_minor_kernel" ) + + +def _get_mean_var_minor_fast(dtype): + return cuda_kernel_factory( + _get_mean_var_minor_fast_kernel, (dtype,), "_get_mean_var_minor_fast_kernel" + ) diff --git a/src/rapids_singlecell/preprocessing/_utils.py b/src/rapids_singlecell/preprocessing/_utils.py index c06b5d23..98e14c89 100644 --- a/src/rapids_singlecell/preprocessing/_utils.py +++ b/src/rapids_singlecell/preprocessing/_utils.py @@ -84,16 +84,23 @@ def _mean_var_major(X, major, minor): def _mean_var_minor(X, major, minor): - from ._kernels._mean_var_kernel import _get_mean_var_minor + from ._kernels._mean_var_kernel import _get_mean_var_minor_fast mean = cp.zeros(minor, dtype=cp.float64) var = cp.zeros(minor, dtype=cp.float64) - block = (32,) - grid = (int(math.ceil(X.nnz / block[0])),) - get_mean_var_minor = _get_mean_var_minor(X.data.dtype) - get_mean_var_minor(grid, block, (X.indices, X.data, mean, var, major, X.nnz)) - - var = (var - mean**2) * (major / (major - 1)) + block = 256 + sm = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)["multiProcessorCount"] + grid = (min(max((X.nnz + block - 1) // block, sm * 4), 65535),) + shmem_bytes = 1024 * 4 + 1024 * 8 * 2 # keys + two double arrays + + get_mean_var_minor = _get_mean_var_minor_fast(X.data.dtype) + get_mean_var_minor( + grid, (block,), (X.nnz, X.indices, X.data, mean, var), shared_mem=shmem_bytes + ) + mean /= major + var /= major + var -= mean**2 + var *= major / (major - 1) return mean, var diff --git a/tests/test_mean_var.py b/tests/test_mean_var.py new file mode 100644 index 00000000..5046f9ac --- /dev/null +++ b/tests/test_mean_var.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import cupy as cp +import numpy as np +import pytest +from scanpy.datasets import pbmc3k, pbmc68k_reduced +from scanpy.preprocessing._utils import _get_mean_var as sc_get_mean_var + +import rapids_singlecell as rsc +from rapids_singlecell.preprocessing._utils import _get_mean_var as rsc_get_mean_var + + +@pytest.mark.parametrize("data_kind", ["csc", "csr", "dense"]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_mean_var(data_kind, axis, dtype): + if data_kind == "dense": + adata = pbmc68k_reduced() + adata.X = adata.X.astype(dtype) + cudata = rsc.get.anndata_to_GPU(adata, copy=True) + else: + adata = pbmc3k() + if data_kind == "csc": + adata.X = adata.X.tocsc() + adata.X = adata.X.astype(dtype) + cudata = rsc.get.anndata_to_GPU(adata, copy=True) + + mean, var = sc_get_mean_var(adata.X, axis=axis) + rsc_mean, rsc_var = rsc_get_mean_var(cudata.X, axis=axis) + + cp.testing.assert_allclose(mean, rsc_mean) + cp.testing.assert_allclose(var, rsc_var) From 6d6c4b5380a9c10edf814061bf4f9ab1d7579385 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 8 Aug 2025 14:46:07 +0200 Subject: [PATCH 2/5] update release note --- docs/release-notes/0.13.1.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/0.13.1.md b/docs/release-notes/0.13.1.md index fdb11c97..4f3ed1fb 100644 --- a/docs/release-notes/0.13.1.md +++ b/docs/release-notes/0.13.1.md @@ -5,7 +5,7 @@ ```{rubric} Performance ``` -* speed up `_get_mean_var` for the minor axis of `csx` matrices {pr}`408` {smaller}`S Dicks` +* speed up `_get_mean_var` for the minor axis of `csx` matrices {pr}`423` {smaller}`S Dicks` ```{rubric} Bug fixes ``` From 943b1e1882cf04e2c203f8d677f02b447e38c700 Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 8 Aug 2025 14:46:35 +0200 Subject: [PATCH 3/5] update file --- docs/release-notes/0.13.1.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release-notes/0.13.1.md b/docs/release-notes/0.13.1.md index 4f3ed1fb..97348d21 100644 --- a/docs/release-notes/0.13.1.md +++ b/docs/release-notes/0.13.1.md @@ -1,4 +1,4 @@ -### 0.13.0 {small}`2025-08-06` +### 0.13.1 {small}`the-future` ```{rubric} Features ``` From 2e7f34b0cdde16f1eac7c4e5cefb128ca94f7a5f Mon Sep 17 00:00:00 2001 From: Intron7 Date: Fri, 8 Aug 2025 14:58:55 +0200 Subject: [PATCH 4/5] fix test --- pyproject.toml | 1 + tests/test_mean_var.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3c9cfab1..73415088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ test = [ "scanpy>=1.10.0", "bbknn", "decoupler", + "fast-array-utils", ] [project.urls] diff --git a/tests/test_mean_var.py b/tests/test_mean_var.py index 5046f9ac..46f8d6ed 100644 --- a/tests/test_mean_var.py +++ b/tests/test_mean_var.py @@ -3,8 +3,8 @@ import cupy as cp import numpy as np import pytest +from fast_array_utils.stats import mean_var as sc_get_mean_var from scanpy.datasets import pbmc3k, pbmc68k_reduced -from scanpy.preprocessing._utils import _get_mean_var as sc_get_mean_var import rapids_singlecell as rsc from rapids_singlecell.preprocessing._utils import _get_mean_var as rsc_get_mean_var @@ -25,7 +25,7 @@ def test_mean_var(data_kind, axis, dtype): adata.X = adata.X.astype(dtype) cudata = rsc.get.anndata_to_GPU(adata, copy=True) - mean, var = sc_get_mean_var(adata.X, axis=axis) + mean, var = sc_get_mean_var(adata.X, axis=axis, correction=1) rsc_mean, rsc_var = rsc_get_mean_var(cudata.X, axis=axis) cp.testing.assert_allclose(mean, rsc_mean) From 0fa5a42638e18bbca5a45c03881d68aff7de2a13 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Mon, 11 Aug 2025 14:28:37 +0200 Subject: [PATCH 5/5] Update test_mean_var.py --- tests/test_mean_var.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_mean_var.py b/tests/test_mean_var.py index 46f8d6ed..0fbb596e 100644 --- a/tests/test_mean_var.py +++ b/tests/test_mean_var.py @@ -16,14 +16,12 @@ def test_mean_var(data_kind, axis, dtype): if data_kind == "dense": adata = pbmc68k_reduced() - adata.X = adata.X.astype(dtype) - cudata = rsc.get.anndata_to_GPU(adata, copy=True) else: adata = pbmc3k() if data_kind == "csc": adata.X = adata.X.tocsc() - adata.X = adata.X.astype(dtype) - cudata = rsc.get.anndata_to_GPU(adata, copy=True) + adata.X = adata.X.astype(dtype) + cudata = rsc.get.anndata_to_GPU(adata, copy=True) mean, var = sc_get_mean_var(adata.X, axis=axis, correction=1) rsc_mean, rsc_var = rsc_get_mean_var(cudata.X, axis=axis)