diff --git a/docs/release-notes/0.13.1.md b/docs/release-notes/0.13.1.md new file mode 100644 index 00000000..97348d21 --- /dev/null +++ b/docs/release-notes/0.13.1.md @@ -0,0 +1,15 @@ +### 0.13.1 {small}`the-future` + +```{rubric} Features +``` + +```{rubric} Performance +``` +* speed up `_get_mean_var` for the minor axis of `csx` matrices {pr}`423` {smaller}`S Dicks` + +```{rubric} Bug fixes +``` + + +```{rubric} Misc +``` diff --git a/docs/release-notes/index.md b/docs/release-notes/index.md index 059730b1..0838be0e 100644 --- a/docs/release-notes/index.md +++ b/docs/release-notes/index.md @@ -3,6 +3,8 @@ # Release notes ## Version 0.13.0 +```{include} /release-notes/0.13.1.md +``` ```{include} /release-notes/0.13.0.md ``` diff --git a/pyproject.toml b/pyproject.toml index 3c9cfab1..73415088 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ test = [ "scanpy>=1.10.0", "bbknn", "decoupler", + "fast-array-utils", ] [project.urls] diff --git a/src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py b/src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py index 39c162c3..76aa6d3a 100644 --- a/src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py +++ b/src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py @@ -58,6 +58,71 @@ } """ +_get_mean_var_minor_fast_kernel = r""" +(const long long nnz, +const int* __restrict__ indices, +const {0}* __restrict__ data, +double* __restrict__ g_sum, +double* __restrict__ g_sumsq) +{ + extern __shared__ unsigned shmem[]; + unsigned HASH_SIZE = 1024; + // layout in shared: + // keys[HASH_SIZE] (uint32, 0xFFFFFFFF = empty) + // sum[HASH_SIZE] (double) + // sq[HASH_SIZE] (double) + unsigned* keys = shmem; + double* sum = (double*)(keys + HASH_SIZE); + double* sq = (double*)(sum + HASH_SIZE); + + // init table + for (int i = threadIdx.x; i < HASH_SIZE; i += blockDim.x) { + keys[i] = 0xFFFFFFFFu; + sum[i] = 0.0; + sq[i] = 0.0; + } + __syncthreads(); + + const size_t stride = (size_t)gridDim.x * blockDim.x; + for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + i < nnz; i += stride) + { + unsigned col = (unsigned)__ldg(indices + i); + double dv = (double)__ldg(data + i); + double d2 = dv * dv; + + unsigned h = (col * 2654435761u) & (HASH_SIZE - 1); + bool done = false; + + #pragma unroll 8 + for (int probe = 0; probe < 8; ++probe) { + unsigned pos = (h + probe) & (HASH_SIZE - 1); + unsigned key = atomicCAS(&keys[pos], 0xFFFFFFFFu, col); + if (key == 0xFFFFFFFFu || key == col) { + atomicAdd(&sum[pos], dv); + atomicAdd(&sq[pos], d2); + done = true; + break; + } + } + if (!done) { + atomicAdd(&g_sum[col], dv); + atomicAdd(&g_sumsq[col], d2); + } + } + __syncthreads(); + + // flush + for (int i = threadIdx.x; i < HASH_SIZE; i += blockDim.x) { + unsigned key = keys[i]; + if (key != 0xFFFFFFFFu) { + atomicAdd(&g_sum[key], sum[i]); + atomicAdd(&g_sumsq[key], sq[i]); + } + } +} +""" + sq_sum = cp.ReductionKernel( "T x", # input params @@ -90,3 +155,9 @@ def _get_mean_var_minor(dtype): return cuda_kernel_factory( _get_mean_var_minor_kernel, (dtype,), "_get_mean_var_minor_kernel" ) + + +def _get_mean_var_minor_fast(dtype): + return cuda_kernel_factory( + _get_mean_var_minor_fast_kernel, (dtype,), "_get_mean_var_minor_fast_kernel" + ) diff --git a/src/rapids_singlecell/preprocessing/_utils.py b/src/rapids_singlecell/preprocessing/_utils.py index c06b5d23..98e14c89 100644 --- a/src/rapids_singlecell/preprocessing/_utils.py +++ b/src/rapids_singlecell/preprocessing/_utils.py @@ -84,16 +84,23 @@ def _mean_var_major(X, major, minor): def _mean_var_minor(X, major, minor): - from ._kernels._mean_var_kernel import _get_mean_var_minor + from ._kernels._mean_var_kernel import _get_mean_var_minor_fast mean = cp.zeros(minor, dtype=cp.float64) var = cp.zeros(minor, dtype=cp.float64) - block = (32,) - grid = (int(math.ceil(X.nnz / block[0])),) - get_mean_var_minor = _get_mean_var_minor(X.data.dtype) - get_mean_var_minor(grid, block, (X.indices, X.data, mean, var, major, X.nnz)) - - var = (var - mean**2) * (major / (major - 1)) + block = 256 + sm = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)["multiProcessorCount"] + grid = (min(max((X.nnz + block - 1) // block, sm * 4), 65535),) + shmem_bytes = 1024 * 4 + 1024 * 8 * 2 # keys + two double arrays + + get_mean_var_minor = _get_mean_var_minor_fast(X.data.dtype) + get_mean_var_minor( + grid, (block,), (X.nnz, X.indices, X.data, mean, var), shared_mem=shmem_bytes + ) + mean /= major + var /= major + var -= mean**2 + var *= major / (major - 1) return mean, var diff --git a/tests/test_mean_var.py b/tests/test_mean_var.py new file mode 100644 index 00000000..0fbb596e --- /dev/null +++ b/tests/test_mean_var.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import cupy as cp +import numpy as np +import pytest +from fast_array_utils.stats import mean_var as sc_get_mean_var +from scanpy.datasets import pbmc3k, pbmc68k_reduced + +import rapids_singlecell as rsc +from rapids_singlecell.preprocessing._utils import _get_mean_var as rsc_get_mean_var + + +@pytest.mark.parametrize("data_kind", ["csc", "csr", "dense"]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_mean_var(data_kind, axis, dtype): + if data_kind == "dense": + adata = pbmc68k_reduced() + else: + adata = pbmc3k() + if data_kind == "csc": + adata.X = adata.X.tocsc() + adata.X = adata.X.astype(dtype) + cudata = rsc.get.anndata_to_GPU(adata, copy=True) + + mean, var = sc_get_mean_var(adata.X, axis=axis, correction=1) + rsc_mean, rsc_var = rsc_get_mean_var(cudata.X, axis=axis) + + cp.testing.assert_allclose(mean, rsc_mean) + cp.testing.assert_allclose(var, rsc_var)