Skip to content

Commit b17de46

Browse files
authored
fix kernels for RMM (#592)
* fix kernels for RMM
* add release note
1 parent 119007f commit b17de46

34 files changed

Lines changed: 1056 additions & 627 deletions

File tree

docs/release-notes/0.15.0.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
* Improves numerical accuracy and adds parameters to `tl.rank_genes_groups` Wilcoxon methods: uses ``erfc`` for p-values to avoid underflow, adds ``tie_correct`` and ``use_continuity`` to ``wilcoxon_binned``, and refactors ``Aggregate`` with a unified ``count_mean_var()`` dispatcher and raw ``sq_sum`` output for GPU-resident stats computation {pr}`585` {smaller}`S Dicks`
66
* Replace cuML KDE in ``tl.embedding_density`` with a custom CUDA kernel using covariance-aware Gaussian KDE matching ``scipy.stats.gaussian_kde``, removing the cuML dependency and the ``batchsize`` parameter {pr}`590` {smaller}`S Dicks`
77

8+
```{rubric} Bug fixes
9+
```
10+
* Fix ``TypeError`` when using nanobind CUDA kernels with RMM managed memory (``managed_memory=True``). Nanobind bindings now accept both ``kDLCUDA`` and ``kDLCUDAManaged`` DLPack device types {pr}`592` {smaller}`S Dicks`
11+
812
```{rubric} Removals
913
```
1014
* Remove `tl.mde` and the `pymde` dependency. The function is still available in `scvi-tools` {pr}`588` {smaller}`S Dicks`

src/rapids_singlecell/_cuda/aggr/aggr.cu

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,16 @@ static inline void launch_sparse_var(const int* indptr, const int* index,
7373
indptr, index, data, mean_data, n_cells, dof, n_groups);
7474
}
7575

76-
template <typename T>
76+
template <typename T, typename Device>
7777
void def_sparse_aggr(nb::module_& m) {
7878
m.def(
7979
"sparse_aggr",
80-
[](cuda_array_c<const int> indptr, cuda_array_c<const int> index,
81-
cuda_array_c<const T> data, cuda_array_c<double> out,
82-
cuda_array_c<const int> cats, cuda_array_c<const bool> mask,
83-
size_t n_cells, size_t n_genes, size_t n_groups, bool is_csc,
84-
std::uintptr_t stream) {
80+
[](gpu_array_c<const int, Device> indptr,
81+
gpu_array_c<const int, Device> index,
82+
gpu_array_c<const T, Device> data, gpu_array_c<double, Device> out,
83+
gpu_array_c<const int, Device> cats,
84+
gpu_array_c<const bool, Device> mask, size_t n_cells, size_t n_genes,
85+
size_t n_groups, bool is_csc, std::uintptr_t stream) {
8586
if (is_csc) {
8687
launch_csc_aggr<T>(indptr.data(), index.data(), data.data(),
8788
out.data(), cats.data(), mask.data(),
@@ -99,13 +100,13 @@ void def_sparse_aggr(nb::module_& m) {
99100
"stream"_a = 0);
100101
}
101102

102-
template <typename T, typename DataContig>
103+
template <typename T, typename DataContig, typename Device>
103104
void def_dense_aggr(nb::module_& m) {
104105
m.def(
105106
"dense_aggr",
106-
[](cuda_array_contig<const T, DataContig> data,
107-
cuda_array_c<double> out, cuda_array_c<const int> cats,
108-
cuda_array_c<const bool> mask, size_t n_cells, size_t n_genes,
107+
[](gpu_array_contig<const T, Device, DataContig> data,
108+
gpu_array_c<double, Device> out, gpu_array_c<const int, Device> cats,
109+
gpu_array_c<const bool, Device> mask, size_t n_cells, size_t n_genes,
109110
size_t n_groups, bool is_fortran, std::uintptr_t stream) {
110111
if constexpr (std::is_same_v<DataContig, nb::f_contig>) {
111112
launch_dense_aggr_F<T>(data.data(), out.data(), cats.data(),
@@ -121,15 +122,18 @@ void def_dense_aggr(nb::module_& m) {
121122
"n_genes"_a, "n_groups"_a, "is_fortran"_a, "stream"_a = 0);
122123
}
123124

124-
template <typename T>
125+
template <typename T, typename Device>
125126
void def_csr_to_coo(nb::module_& m) {
126127
m.def(
127128
"csr_to_coo",
128-
[](cuda_array_c<const int> indptr, cuda_array_c<const int> index,
129-
cuda_array_c<const T> data, cuda_array_c<int> out_row,
130-
cuda_array_c<int> out_col, cuda_array_c<double> out_data,
131-
cuda_array_c<const int> cats, cuda_array_c<const bool> mask,
132-
int n_cells, std::uintptr_t stream) {
129+
[](gpu_array_c<const int, Device> indptr,
130+
gpu_array_c<const int, Device> index,
131+
gpu_array_c<const T, Device> data, gpu_array_c<int, Device> out_row,
132+
gpu_array_c<int, Device> out_col,
133+
gpu_array_c<double, Device> out_data,
134+
gpu_array_c<const int, Device> cats,
135+
gpu_array_c<const bool, Device> mask, int n_cells,
136+
std::uintptr_t stream) {
133137
launch_csr_to_coo<T>(indptr.data(), index.data(), data.data(),
134138
out_row.data(), out_col.data(),
135139
out_data.data(), cats.data(), mask.data(),
@@ -140,24 +144,27 @@ void def_csr_to_coo(nb::module_& m) {
140144
"stream"_a = 0);
141145
}
142146

143-
NB_MODULE(_aggr_cuda, m) {
144-
def_sparse_aggr<float>(m);
145-
def_sparse_aggr<double>(m);
147+
template <typename Device>
148+
void register_bindings(nb::module_& m) {
149+
def_sparse_aggr<float, Device>(m);
150+
def_sparse_aggr<double, Device>(m);
146151

147152
// F-order must come before C-order for proper dispatch
148-
def_dense_aggr<float, nb::f_contig>(m);
149-
def_dense_aggr<float, nb::c_contig>(m);
150-
def_dense_aggr<double, nb::f_contig>(m);
151-
def_dense_aggr<double, nb::c_contig>(m);
153+
def_dense_aggr<float, nb::f_contig, Device>(m);
154+
def_dense_aggr<float, nb::c_contig, Device>(m);
155+
def_dense_aggr<double, nb::f_contig, Device>(m);
156+
def_dense_aggr<double, nb::c_contig, Device>(m);
152157

153-
def_csr_to_coo<float>(m);
154-
def_csr_to_coo<double>(m);
158+
def_csr_to_coo<float, Device>(m);
159+
def_csr_to_coo<double, Device>(m);
155160

156161
m.def(
157162
"sparse_var",
158-
[](cuda_array_c<const int> indptr, cuda_array_c<const int> index,
159-
cuda_array_c<double> data, cuda_array_c<const double> means,
160-
cuda_array_c<double> n_cells, int dof, int n_groups,
163+
[](gpu_array_c<const int, Device> indptr,
164+
gpu_array_c<const int, Device> index,
165+
gpu_array_c<double, Device> data,
166+
gpu_array_c<const double, Device> means,
167+
gpu_array_c<double, Device> n_cells, int dof, int n_groups,
161168
std::uintptr_t stream) {
162169
launch_sparse_var(indptr.data(), index.data(), data.data(),
163170
means.data(), n_cells.data(), dof, n_groups,
@@ -166,3 +173,7 @@ NB_MODULE(_aggr_cuda, m) {
166173
"indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "n_cells"_a,
167174
"dof"_a, "n_groups"_a, "stream"_a = 0);
168175
}
176+
177+
NB_MODULE(_aggr_cuda, m) {
178+
REGISTER_GPU_BINDINGS(register_bindings, m);
179+
}

src/rapids_singlecell/_cuda/aucell/aucell.cu

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,18 +42,24 @@ static inline void launch_auc(const int* ranks, int R, int C, const int* cnct,
4242
n_sets, n_up, max_aucs, es);
4343
}
4444

45-
NB_MODULE(_aucell_cuda, m) {
45+
template <typename Device>
46+
void register_bindings(nb::module_& m) {
4647
m.def(
4748
"auc",
48-
[](cuda_array_c<const int> ranks, int R, int C,
49-
cuda_array_c<const int> cnct, cuda_array_c<const int> starts,
50-
cuda_array_c<const int> lens, int n_sets, int n_up,
51-
cuda_array_c<const float> max_aucs, cuda_array_c<float> es,
52-
std::uintptr_t stream) {
49+
[](gpu_array_c<const int, Device> ranks, int R, int C,
50+
gpu_array_c<const int, Device> cnct,
51+
gpu_array_c<const int, Device> starts,
52+
gpu_array_c<const int, Device> lens, int n_sets, int n_up,
53+
gpu_array_c<const float, Device> max_aucs,
54+
gpu_array_c<float, Device> es, std::uintptr_t stream) {
5355
launch_auc(ranks.data(), R, C, cnct.data(), starts.data(),
5456
lens.data(), n_sets, n_up, max_aucs.data(), es.data(),
5557
(cudaStream_t)stream);
5658
},
5759
"ranks"_a, nb::kw_only(), "R"_a, "C"_a, "cnct"_a, "starts"_a, "lens"_a,
5860
"n_sets"_a, "n_up"_a, "max_aucs"_a, "es"_a, "stream"_a = 0);
5961
}
62+
63+
NB_MODULE(_aucell_cuda, m) {
64+
REGISTER_GPU_BINDINGS(register_bindings, m);
65+
}

src/rapids_singlecell/_cuda/autocorr/autocorr.cu

Lines changed: 72 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -67,15 +67,17 @@ static inline void launch_pre_den_sparse(const int* data_col_ind,
6767
data_col_ind, data_values, nnz, mean_array, den, counter);
6868
}
6969

70-
NB_MODULE(_autocorr_cuda, m) {
70+
template <typename Device>
71+
void register_bindings(nb::module_& m) {
7172
// morans_dense - float32
7273
m.def(
7374
"morans_dense",
74-
[](cuda_array_c<const float> data_centered,
75-
cuda_array_c<const int> adj_row_ptr,
76-
cuda_array_c<const int> adj_col_ind,
77-
cuda_array_c<const float> adj_data, cuda_array_c<float> num,
78-
int n_samples, int n_features, std::uintptr_t stream) {
75+
[](gpu_array_c<const float, Device> data_centered,
76+
gpu_array_c<const int, Device> adj_row_ptr,
77+
gpu_array_c<const int, Device> adj_col_ind,
78+
gpu_array_c<const float, Device> adj_data,
79+
gpu_array_c<float, Device> num, int n_samples, int n_features,
80+
std::uintptr_t stream) {
7981
launch_morans_dense(data_centered.data(), adj_row_ptr.data(),
8082
adj_col_ind.data(), adj_data.data(), num.data(),
8183
n_samples, n_features, (cudaStream_t)stream);
@@ -85,11 +87,12 @@ NB_MODULE(_autocorr_cuda, m) {
8587
// morans_dense - float64
8688
m.def(
8789
"morans_dense",
88-
[](cuda_array_c<const double> data_centered,
89-
cuda_array_c<const int> adj_row_ptr,
90-
cuda_array_c<const int> adj_col_ind,
91-
cuda_array_c<const double> adj_data, cuda_array_c<double> num,
92-
int n_samples, int n_features, std::uintptr_t stream) {
90+
[](gpu_array_c<const double, Device> data_centered,
91+
gpu_array_c<const int, Device> adj_row_ptr,
92+
gpu_array_c<const int, Device> adj_col_ind,
93+
gpu_array_c<const double, Device> adj_data,
94+
gpu_array_c<double, Device> num, int n_samples, int n_features,
95+
std::uintptr_t stream) {
9396
launch_morans_dense(data_centered.data(), adj_row_ptr.data(),
9497
adj_col_ind.data(), adj_data.data(), num.data(),
9598
n_samples, n_features, (cudaStream_t)stream);
@@ -100,14 +103,14 @@ NB_MODULE(_autocorr_cuda, m) {
100103
// morans_sparse - float32
101104
m.def(
102105
"morans_sparse",
103-
[](cuda_array_c<const int> adj_row_ptr,
104-
cuda_array_c<const int> adj_col_ind,
105-
cuda_array_c<const float> adj_data,
106-
cuda_array_c<const int> data_row_ptr,
107-
cuda_array_c<const int> data_col_ind,
108-
cuda_array_c<const float> data_values, int n_samples, int n_features,
109-
cuda_array_c<const float> mean_array, cuda_array_c<float> num,
110-
std::uintptr_t stream) {
106+
[](gpu_array_c<const int, Device> adj_row_ptr,
107+
gpu_array_c<const int, Device> adj_col_ind,
108+
gpu_array_c<const float, Device> adj_data,
109+
gpu_array_c<const int, Device> data_row_ptr,
110+
gpu_array_c<const int, Device> data_col_ind,
111+
gpu_array_c<const float, Device> data_values, int n_samples,
112+
int n_features, gpu_array_c<const float, Device> mean_array,
113+
gpu_array_c<float, Device> num, std::uintptr_t stream) {
111114
launch_morans_sparse(adj_row_ptr.data(), adj_col_ind.data(),
112115
adj_data.data(), data_row_ptr.data(),
113116
data_col_ind.data(), data_values.data(),
@@ -120,14 +123,14 @@ NB_MODULE(_autocorr_cuda, m) {
120123
// morans_sparse - float64
121124
m.def(
122125
"morans_sparse",
123-
[](cuda_array_c<const int> adj_row_ptr,
124-
cuda_array_c<const int> adj_col_ind,
125-
cuda_array_c<const double> adj_data,
126-
cuda_array_c<const int> data_row_ptr,
127-
cuda_array_c<const int> data_col_ind,
128-
cuda_array_c<const double> data_values, int n_samples,
129-
int n_features, cuda_array_c<const double> mean_array,
130-
cuda_array_c<double> num, std::uintptr_t stream) {
126+
[](gpu_array_c<const int, Device> adj_row_ptr,
127+
gpu_array_c<const int, Device> adj_col_ind,
128+
gpu_array_c<const double, Device> adj_data,
129+
gpu_array_c<const int, Device> data_row_ptr,
130+
gpu_array_c<const int, Device> data_col_ind,
131+
gpu_array_c<const double, Device> data_values, int n_samples,
132+
int n_features, gpu_array_c<const double, Device> mean_array,
133+
gpu_array_c<double, Device> num, std::uintptr_t stream) {
131134
launch_morans_sparse(adj_row_ptr.data(), adj_col_ind.data(),
132135
adj_data.data(), data_row_ptr.data(),
133136
data_col_ind.data(), data_values.data(),
@@ -141,10 +144,12 @@ NB_MODULE(_autocorr_cuda, m) {
141144
// gearys_dense - float32
142145
m.def(
143146
"gearys_dense",
144-
[](cuda_array_c<const float> data, cuda_array_c<const int> adj_row_ptr,
145-
cuda_array_c<const int> adj_col_ind,
146-
cuda_array_c<const float> adj_data, cuda_array_c<float> num,
147-
int n_samples, int n_features, std::uintptr_t stream) {
147+
[](gpu_array_c<const float, Device> data,
148+
gpu_array_c<const int, Device> adj_row_ptr,
149+
gpu_array_c<const int, Device> adj_col_ind,
150+
gpu_array_c<const float, Device> adj_data,
151+
gpu_array_c<float, Device> num, int n_samples, int n_features,
152+
std::uintptr_t stream) {
148153
launch_gearys_dense(data.data(), adj_row_ptr.data(),
149154
adj_col_ind.data(), adj_data.data(), num.data(),
150155
n_samples, n_features, (cudaStream_t)stream);
@@ -154,10 +159,12 @@ NB_MODULE(_autocorr_cuda, m) {
154159
// gearys_dense - float64
155160
m.def(
156161
"gearys_dense",
157-
[](cuda_array_c<const double> data, cuda_array_c<const int> adj_row_ptr,
158-
cuda_array_c<const int> adj_col_ind,
159-
cuda_array_c<const double> adj_data, cuda_array_c<double> num,
160-
int n_samples, int n_features, std::uintptr_t stream) {
162+
[](gpu_array_c<const double, Device> data,
163+
gpu_array_c<const int, Device> adj_row_ptr,
164+
gpu_array_c<const int, Device> adj_col_ind,
165+
gpu_array_c<const double, Device> adj_data,
166+
gpu_array_c<double, Device> num, int n_samples, int n_features,
167+
std::uintptr_t stream) {
161168
launch_gearys_dense(data.data(), adj_row_ptr.data(),
162169
adj_col_ind.data(), adj_data.data(), num.data(),
163170
n_samples, n_features, (cudaStream_t)stream);
@@ -168,13 +175,14 @@ NB_MODULE(_autocorr_cuda, m) {
168175
// gearys_sparse - float32
169176
m.def(
170177
"gearys_sparse",
171-
[](cuda_array_c<const int> adj_row_ptr,
172-
cuda_array_c<const int> adj_col_ind,
173-
cuda_array_c<const float> adj_data,
174-
cuda_array_c<const int> data_row_ptr,
175-
cuda_array_c<const int> data_col_ind,
176-
cuda_array_c<const float> data_values, int n_samples, int n_features,
177-
cuda_array_c<float> num, std::uintptr_t stream) {
178+
[](gpu_array_c<const int, Device> adj_row_ptr,
179+
gpu_array_c<const int, Device> adj_col_ind,
180+
gpu_array_c<const float, Device> adj_data,
181+
gpu_array_c<const int, Device> data_row_ptr,
182+
gpu_array_c<const int, Device> data_col_ind,
183+
gpu_array_c<const float, Device> data_values, int n_samples,
184+
int n_features, gpu_array_c<float, Device> num,
185+
std::uintptr_t stream) {
178186
launch_gearys_sparse(
179187
adj_row_ptr.data(), adj_col_ind.data(), adj_data.data(),
180188
data_row_ptr.data(), data_col_ind.data(), data_values.data(),
@@ -186,13 +194,14 @@ NB_MODULE(_autocorr_cuda, m) {
186194
// gearys_sparse - float64
187195
m.def(
188196
"gearys_sparse",
189-
[](cuda_array_c<const int> adj_row_ptr,
190-
cuda_array_c<const int> adj_col_ind,
191-
cuda_array_c<const double> adj_data,
192-
cuda_array_c<const int> data_row_ptr,
193-
cuda_array_c<const int> data_col_ind,
194-
cuda_array_c<const double> data_values, int n_samples,
195-
int n_features, cuda_array_c<double> num, std::uintptr_t stream) {
197+
[](gpu_array_c<const int, Device> adj_row_ptr,
198+
gpu_array_c<const int, Device> adj_col_ind,
199+
gpu_array_c<const double, Device> adj_data,
200+
gpu_array_c<const int, Device> data_row_ptr,
201+
gpu_array_c<const int, Device> data_col_ind,
202+
gpu_array_c<const double, Device> data_values, int n_samples,
203+
int n_features, gpu_array_c<double, Device> num,
204+
std::uintptr_t stream) {
196205
launch_gearys_sparse(
197206
adj_row_ptr.data(), adj_col_ind.data(), adj_data.data(),
198207
data_row_ptr.data(), data_col_ind.data(), data_values.data(),
@@ -205,10 +214,11 @@ NB_MODULE(_autocorr_cuda, m) {
205214
// pre_den_sparse - float32
206215
m.def(
207216
"pre_den_sparse",
208-
[](cuda_array_c<const int> data_col_ind,
209-
cuda_array_c<const float> data_values, int nnz,
210-
cuda_array_c<const float> mean_array, cuda_array_c<float> den,
211-
cuda_array_c<int> counter, std::uintptr_t stream) {
217+
[](gpu_array_c<const int, Device> data_col_ind,
218+
gpu_array_c<const float, Device> data_values, int nnz,
219+
gpu_array_c<const float, Device> mean_array,
220+
gpu_array_c<float, Device> den, gpu_array_c<int, Device> counter,
221+
std::uintptr_t stream) {
212222
launch_pre_den_sparse(data_col_ind.data(), data_values.data(), nnz,
213223
mean_array.data(), den.data(), counter.data(),
214224
(cudaStream_t)stream);
@@ -218,14 +228,19 @@ NB_MODULE(_autocorr_cuda, m) {
218228
// pre_den_sparse - float64
219229
m.def(
220230
"pre_den_sparse",
221-
[](cuda_array_c<const int> data_col_ind,
222-
cuda_array_c<const double> data_values, int nnz,
223-
cuda_array_c<const double> mean_array, cuda_array_c<double> den,
224-
cuda_array_c<int> counter, std::uintptr_t stream) {
231+
[](gpu_array_c<const int, Device> data_col_ind,
232+
gpu_array_c<const double, Device> data_values, int nnz,
233+
gpu_array_c<const double, Device> mean_array,
234+
gpu_array_c<double, Device> den, gpu_array_c<int, Device> counter,
235+
std::uintptr_t stream) {
225236
launch_pre_den_sparse(data_col_ind.data(), data_values.data(), nnz,
226237
mean_array.data(), den.data(), counter.data(),
227238
(cudaStream_t)stream);
228239
},
229240
"data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
230241
"mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
231242
}
243+
244+
NB_MODULE(_autocorr_cuda, m) {
245+
REGISTER_GPU_BINDINGS(register_bindings, m);
246+
}

src/rapids_singlecell/_cuda/bbknn/bbknn.cu

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,13 @@ static inline void launch_cut_smaller(int* indptr, int* index, float* data,
2626
n_rows);
2727
}
2828

29-
NB_MODULE(_bbknn_cuda, m) {
29+
template <typename Device>
30+
void register_bindings(nb::module_& m) {
3031
m.def(
3132
"find_top_k_per_row",
32-
[](cuda_array_c<const float> data, cuda_array_c<const int> indptr,
33-
int n_rows, int trim, cuda_array_c<float> vals,
34-
std::uintptr_t stream) {
33+
[](gpu_array_c<const float, Device> data,
34+
gpu_array_c<const int, Device> indptr, int n_rows, int trim,
35+
gpu_array_c<float, Device> vals, std::uintptr_t stream) {
3536
launch_find_top_k_per_row(data.data(), indptr.data(), n_rows, trim,
3637
vals.data(), (cudaStream_t)stream);
3738
},
@@ -40,12 +41,16 @@ NB_MODULE(_bbknn_cuda, m) {
4041

4142
m.def(
4243
"cut_smaller",
43-
[](cuda_array_c<int> indptr, cuda_array_c<int> index,
44-
cuda_array_c<float> data, cuda_array_c<float> vals, int n_rows,
45-
std::uintptr_t stream) {
44+
[](gpu_array_c<int, Device> indptr, gpu_array_c<int, Device> index,
45+
gpu_array_c<float, Device> data, gpu_array_c<float, Device> vals,
46+
int n_rows, std::uintptr_t stream) {
4647
launch_cut_smaller(indptr.data(), index.data(), data.data(),
4748
vals.data(), n_rows, (cudaStream_t)stream);
4849
},
4950
"indptr"_a, "index"_a, "data"_a, nb::kw_only(), "vals"_a, "n_rows"_a,
5051
"stream"_a = 0);
5152
}
53+
54+
NB_MODULE(_bbknn_cuda, m) {
55+
REGISTER_GPU_BINDINGS(register_bindings, m);
56+
}

0 commit comments

Comments
 (0)