Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/release-notes/0.13.1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
### 0.13.1 {small}`the-future`

```{rubric} Features
```

```{rubric} Performance
```
* speed up `_get_mean_var` for the minor axis of `csx` matrices {pr}`423` {smaller}`S Dicks`

```{rubric} Bug fixes
```


```{rubric} Misc
```
2 changes: 2 additions & 0 deletions docs/release-notes/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# Release notes

## Version 0.13.0
```{include} /release-notes/0.13.1.md
```
```{include} /release-notes/0.13.0.md
```

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ test = [
"scanpy>=1.10.0",
"bbknn",
"decoupler",
"fast-array-utils",
]

[project.urls]
Expand Down
71 changes: 71 additions & 0 deletions src/rapids_singlecell/preprocessing/_kernels/_mean_var_kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,71 @@
}
"""

_get_mean_var_minor_fast_kernel = r"""
(const long long nnz,
const int* __restrict__ indices,
const {0}* __restrict__ data,
double* __restrict__ g_sum,
double* __restrict__ g_sumsq)
{
extern __shared__ unsigned shmem[];
unsigned HASH_SIZE = 1024;
// layout in shared:
// keys[HASH_SIZE] (uint32, 0xFFFFFFFF = empty)
// sum[HASH_SIZE] (double)
// sq[HASH_SIZE] (double)
unsigned* keys = shmem;
double* sum = (double*)(keys + HASH_SIZE);
double* sq = (double*)(sum + HASH_SIZE);

// init table
for (int i = threadIdx.x; i < HASH_SIZE; i += blockDim.x) {
keys[i] = 0xFFFFFFFFu;
sum[i] = 0.0;
sq[i] = 0.0;
}
__syncthreads();

const size_t stride = (size_t)gridDim.x * blockDim.x;
for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
i < nnz; i += stride)
{
unsigned col = (unsigned)__ldg(indices + i);
double dv = (double)__ldg(data + i);
double d2 = dv * dv;

unsigned h = (col * 2654435761u) & (HASH_SIZE - 1);
bool done = false;

#pragma unroll 8
for (int probe = 0; probe < 8; ++probe) {
unsigned pos = (h + probe) & (HASH_SIZE - 1);
unsigned key = atomicCAS(&keys[pos], 0xFFFFFFFFu, col);
if (key == 0xFFFFFFFFu || key == col) {
atomicAdd(&sum[pos], dv);
atomicAdd(&sq[pos], d2);
done = true;
break;
}
}
if (!done) {
atomicAdd(&g_sum[col], dv);
atomicAdd(&g_sumsq[col], d2);
}
}
__syncthreads();

// flush
for (int i = threadIdx.x; i < HASH_SIZE; i += blockDim.x) {
unsigned key = keys[i];
if (key != 0xFFFFFFFFu) {
atomicAdd(&g_sum[key], sum[i]);
atomicAdd(&g_sumsq[key], sq[i]);
}
}
}
"""


sq_sum = cp.ReductionKernel(
"T x", # input params
Expand Down Expand Up @@ -90,3 +155,9 @@ def _get_mean_var_minor(dtype):
return cuda_kernel_factory(
_get_mean_var_minor_kernel, (dtype,), "_get_mean_var_minor_kernel"
)


def _get_mean_var_minor_fast(dtype):
return cuda_kernel_factory(
_get_mean_var_minor_fast_kernel, (dtype,), "_get_mean_var_minor_fast_kernel"
)
21 changes: 14 additions & 7 deletions src/rapids_singlecell/preprocessing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,23 @@ def _mean_var_major(X, major, minor):


def _mean_var_minor(X, major, minor):
from ._kernels._mean_var_kernel import _get_mean_var_minor
from ._kernels._mean_var_kernel import _get_mean_var_minor_fast

mean = cp.zeros(minor, dtype=cp.float64)
var = cp.zeros(minor, dtype=cp.float64)
block = (32,)
grid = (int(math.ceil(X.nnz / block[0])),)
get_mean_var_minor = _get_mean_var_minor(X.data.dtype)
get_mean_var_minor(grid, block, (X.indices, X.data, mean, var, major, X.nnz))

var = (var - mean**2) * (major / (major - 1))
block = 256
sm = cp.cuda.runtime.getDeviceProperties(cp.cuda.Device().id)["multiProcessorCount"]
grid = (min(max((X.nnz + block - 1) // block, sm * 4), 65535),)
shmem_bytes = 1024 * 4 + 1024 * 8 * 2 # keys + two double arrays

get_mean_var_minor = _get_mean_var_minor_fast(X.data.dtype)
get_mean_var_minor(
grid, (block,), (X.nnz, X.indices, X.data, mean, var), shared_mem=shmem_bytes
)
mean /= major
var /= major
var -= mean**2
var *= major / (major - 1)
return mean, var


Expand Down
30 changes: 30 additions & 0 deletions tests/test_mean_var.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import annotations

import cupy as cp
import numpy as np
import pytest
from fast_array_utils.stats import mean_var as sc_get_mean_var
from scanpy.datasets import pbmc3k, pbmc68k_reduced

import rapids_singlecell as rsc
from rapids_singlecell.preprocessing._utils import _get_mean_var as rsc_get_mean_var


@pytest.mark.parametrize("data_kind", ["csc", "csr", "dense"])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_mean_var(data_kind, axis, dtype):
if data_kind == "dense":
adata = pbmc68k_reduced()
else:
adata = pbmc3k()
if data_kind == "csc":
adata.X = adata.X.tocsc()
adata.X = adata.X.astype(dtype)
cudata = rsc.get.anndata_to_GPU(adata, copy=True)

mean, var = sc_get_mean_var(adata.X, axis=axis, correction=1)
rsc_mean, rsc_var = rsc_get_mean_var(cudata.X, axis=axis)

cp.testing.assert_allclose(mean, rsc_mean)
cp.testing.assert_allclose(var, rsc_var)
Loading