scverse
diff --git a/docs/release-notes/0.15.0.md
Lines changed: 4 additions & 0 deletions b/docs/release-notes/0.15.0.md
Lines changed: 4 additions & 0 deletions
diff --git a/src/rapids_singlecell/_cuda/aggr/aggr.cu
Lines changed: 39 additions & 28 deletions b/src/rapids_singlecell/_cuda/aggr/aggr.cu
Lines changed: 39 additions & 28 deletions
diff --git a/src/rapids_singlecell/_cuda/aucell/aucell.cu
Lines changed: 12 additions & 6 deletions b/src/rapids_singlecell/_cuda/aucell/aucell.cu
Lines changed: 12 additions & 6 deletions
diff --git a/src/rapids_singlecell/_cuda/autocorr/autocorr.cu
Lines changed: 72 additions & 57 deletions b/src/rapids_singlecell/_cuda/autocorr/autocorr.cu
Lines changed: 72 additions & 57 deletions
diff --git a/src/rapids_singlecell/_cuda/bbknn/bbknn.cu
Lines changed: 12 additions & 7 deletions b/src/rapids_singlecell/_cuda/bbknn/bbknn.cu
Lines changed: 12 additions & 7 deletions
@@ -5,6 +5,10 @@
 * Improves numerical accuracy and adds parameters to `tl.rank_genes_groups` Wilcoxon methods: uses ``erfc`` for p-values to avoid underflow, adds ``tie_correct`` and ``use_continuity`` to ``wilcoxon_binned``, and refactors ``Aggregate`` with a unified ``count_mean_var()`` dispatcher and raw ``sq_sum`` output for GPU-resident stats computation {pr}`585` {smaller}`S Dicks`
 * Replace cuML KDE in ``tl.embedding_density`` with a custom CUDA kernel using covariance-aware Gaussian KDE matching ``scipy.stats.gaussian_kde``, removing the cuML dependency and the ``batchsize`` parameter {pr}`590` {smaller}`S Dicks`
 
+```{rubric} Bug fixes
+```
+* Fix ``TypeError`` when using nanobind CUDA kernels with RMM managed memory (``managed_memory=True``). Nanobind bindings now accept both ``kDLCUDA`` and ``kDLCUDAManaged`` DLPack device types {pr}`592` {smaller}`S Dicks`
+
 ```{rubric} Removals
 ```
 * Remove `tl.mde` and the `pymde` dependency. The function is still available in `scvi-tools` {pr}`588` {smaller}`S Dicks`
@@ -73,15 +73,16 @@ static inline void launch_sparse_var(const int* indptr, const int* index,
         indptr, index, data, mean_data, n_cells, dof, n_groups);
 }
 
-template <typename T>
+template <typename T, typename Device>
 void def_sparse_aggr(nb::module_& m) {
     m.def(
         "sparse_aggr",
-        [](cuda_array_c<const int> indptr, cuda_array_c<const int> index,
-           cuda_array_c<const T> data, cuda_array_c<double> out,
-           cuda_array_c<const int> cats, cuda_array_c<const bool> mask,
-           size_t n_cells, size_t n_genes, size_t n_groups, bool is_csc,
-           std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> indptr,
+           gpu_array_c<const int, Device> index,
+           gpu_array_c<const T, Device> data, gpu_array_c<double, Device> out,
+           gpu_array_c<const int, Device> cats,
+           gpu_array_c<const bool, Device> mask, size_t n_cells, size_t n_genes,
+           size_t n_groups, bool is_csc, std::uintptr_t stream) {
             if (is_csc) {
                 launch_csc_aggr<T>(indptr.data(), index.data(), data.data(),
                                    out.data(), cats.data(), mask.data(),
@@ -99,13 +100,13 @@ void def_sparse_aggr(nb::module_& m) {
         "stream"_a = 0);
 }
 
-template <typename T, typename DataContig>
+template <typename T, typename DataContig, typename Device>
 void def_dense_aggr(nb::module_& m) {
     m.def(
         "dense_aggr",
-        [](cuda_array_contig<const T, DataContig> data,
-           cuda_array_c<double> out, cuda_array_c<const int> cats,
-           cuda_array_c<const bool> mask, size_t n_cells, size_t n_genes,
+        [](gpu_array_contig<const T, Device, DataContig> data,
+           gpu_array_c<double, Device> out, gpu_array_c<const int, Device> cats,
+           gpu_array_c<const bool, Device> mask, size_t n_cells, size_t n_genes,
            size_t n_groups, bool is_fortran, std::uintptr_t stream) {
             if constexpr (std::is_same_v<DataContig, nb::f_contig>) {
                 launch_dense_aggr_F<T>(data.data(), out.data(), cats.data(),
@@ -121,15 +122,18 @@ void def_dense_aggr(nb::module_& m) {
         "n_genes"_a, "n_groups"_a, "is_fortran"_a, "stream"_a = 0);
 }
 
-template <typename T>
+template <typename T, typename Device>
 void def_csr_to_coo(nb::module_& m) {
     m.def(
         "csr_to_coo",
-        [](cuda_array_c<const int> indptr, cuda_array_c<const int> index,
-           cuda_array_c<const T> data, cuda_array_c<int> out_row,
-           cuda_array_c<int> out_col, cuda_array_c<double> out_data,
-           cuda_array_c<const int> cats, cuda_array_c<const bool> mask,
-           int n_cells, std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> indptr,
+           gpu_array_c<const int, Device> index,
+           gpu_array_c<const T, Device> data, gpu_array_c<int, Device> out_row,
+           gpu_array_c<int, Device> out_col,
+           gpu_array_c<double, Device> out_data,
+           gpu_array_c<const int, Device> cats,
+           gpu_array_c<const bool, Device> mask, int n_cells,
+           std::uintptr_t stream) {
             launch_csr_to_coo<T>(indptr.data(), index.data(), data.data(),
                                  out_row.data(), out_col.data(),
                                  out_data.data(), cats.data(), mask.data(),
@@ -140,24 +144,27 @@ void def_csr_to_coo(nb::module_& m) {
         "stream"_a = 0);
 }
 
-NB_MODULE(_aggr_cuda, m) {
-    def_sparse_aggr<float>(m);
-    def_sparse_aggr<double>(m);
+template <typename Device>
+void register_bindings(nb::module_& m) {
+    def_sparse_aggr<float, Device>(m);
+    def_sparse_aggr<double, Device>(m);
 
     // F-order must come before C-order for proper dispatch
-    def_dense_aggr<float, nb::f_contig>(m);
-    def_dense_aggr<float, nb::c_contig>(m);
-    def_dense_aggr<double, nb::f_contig>(m);
-    def_dense_aggr<double, nb::c_contig>(m);
+    def_dense_aggr<float, nb::f_contig, Device>(m);
+    def_dense_aggr<float, nb::c_contig, Device>(m);
+    def_dense_aggr<double, nb::f_contig, Device>(m);
+    def_dense_aggr<double, nb::c_contig, Device>(m);
 
-    def_csr_to_coo<float>(m);
-    def_csr_to_coo<double>(m);
+    def_csr_to_coo<float, Device>(m);
+    def_csr_to_coo<double, Device>(m);
 
     m.def(
         "sparse_var",
-        [](cuda_array_c<const int> indptr, cuda_array_c<const int> index,
-           cuda_array_c<double> data, cuda_array_c<const double> means,
-           cuda_array_c<double> n_cells, int dof, int n_groups,
+        [](gpu_array_c<const int, Device> indptr,
+           gpu_array_c<const int, Device> index,
+           gpu_array_c<double, Device> data,
+           gpu_array_c<const double, Device> means,
+           gpu_array_c<double, Device> n_cells, int dof, int n_groups,
            std::uintptr_t stream) {
             launch_sparse_var(indptr.data(), index.data(), data.data(),
                               means.data(), n_cells.data(), dof, n_groups,
@@ -166,3 +173,7 @@ NB_MODULE(_aggr_cuda, m) {
         "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "n_cells"_a,
         "dof"_a, "n_groups"_a, "stream"_a = 0);
 }
+
+NB_MODULE(_aggr_cuda, m) {  // module definition for _aggr_cuda; the m.def calls now live in register_bindings<Device>
+    REGISTER_GPU_BINDINGS(register_bindings, m);  // NOTE(review): macro is defined outside this diff; presumably instantiates register_bindings<Device> once per accepted DLPack device type (kDLCUDA, kDLCUDAManaged) — confirm
+}
@@ -42,18 +42,24 @@ static inline void launch_auc(const int* ranks, int R, int C, const int* cnct,
                                            n_sets, n_up, max_aucs, es);
 }
 
-NB_MODULE(_aucell_cuda, m) {
+template <typename Device>  // Device is threaded into every gpu_array_c<...> argument type of the lambda below
+void register_bindings(nb::module_& m) {  // defines "auc" on m: ranks is positional, arguments after nb::kw_only() are keyword-only, stream defaults to 0
     m.def(
         "auc",
-        [](cuda_array_c<const int> ranks, int R, int C,
-           cuda_array_c<const int> cnct, cuda_array_c<const int> starts,
-           cuda_array_c<const int> lens, int n_sets, int n_up,
-           cuda_array_c<const float> max_aucs, cuda_array_c<float> es,
-           std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> ranks, int R, int C,
+           gpu_array_c<const int, Device> cnct,
+           gpu_array_c<const int, Device> starts,
+           gpu_array_c<const int, Device> lens, int n_sets, int n_up,
+           gpu_array_c<const float, Device> max_aucs,
+           gpu_array_c<float, Device> es, std::uintptr_t stream) {  // stream arrives as an integer and is cast to cudaStream_t at the launch_auc call
             launch_auc(ranks.data(), R, C, cnct.data(), starts.data(),
                        lens.data(), n_sets, n_up, max_aucs.data(), es.data(),
                        (cudaStream_t)stream);
         },
         "ranks"_a, nb::kw_only(), "R"_a, "C"_a, "cnct"_a, "starts"_a, "lens"_a,
         "n_sets"_a, "n_up"_a, "max_aucs"_a, "es"_a, "stream"_a = 0);
 }
+
+NB_MODULE(_aucell_cuda, m) {  // module definition for _aucell_cuda; the "auc" binding is defined in register_bindings<Device>
+    REGISTER_GPU_BINDINGS(register_bindings, m);  // NOTE(review): macro is defined outside this diff; presumably instantiates register_bindings<Device> once per accepted DLPack device type (kDLCUDA, kDLCUDAManaged) — confirm
+}
@@ -67,15 +67,17 @@ static inline void launch_pre_den_sparse(const int* data_col_ind,
         data_col_ind, data_values, nnz, mean_array, den, counter);
 }
 
-NB_MODULE(_autocorr_cuda, m) {
+template <typename Device>
+void register_bindings(nb::module_& m) {
     // morans_dense - float32
     m.def(
         "morans_dense",
-        [](cuda_array_c<const float> data_centered,
-           cuda_array_c<const int> adj_row_ptr,
-           cuda_array_c<const int> adj_col_ind,
-           cuda_array_c<const float> adj_data, cuda_array_c<float> num,
-           int n_samples, int n_features, std::uintptr_t stream) {
+        [](gpu_array_c<const float, Device> data_centered,
+           gpu_array_c<const int, Device> adj_row_ptr,
+           gpu_array_c<const int, Device> adj_col_ind,
+           gpu_array_c<const float, Device> adj_data,
+           gpu_array_c<float, Device> num, int n_samples, int n_features,
+           std::uintptr_t stream) {
             launch_morans_dense(data_centered.data(), adj_row_ptr.data(),
                                 adj_col_ind.data(), adj_data.data(), num.data(),
                                 n_samples, n_features, (cudaStream_t)stream);
@@ -85,11 +87,12 @@ NB_MODULE(_autocorr_cuda, m) {
     // morans_dense - float64
     m.def(
         "morans_dense",
-        [](cuda_array_c<const double> data_centered,
-           cuda_array_c<const int> adj_row_ptr,
-           cuda_array_c<const int> adj_col_ind,
-           cuda_array_c<const double> adj_data, cuda_array_c<double> num,
-           int n_samples, int n_features, std::uintptr_t stream) {
+        [](gpu_array_c<const double, Device> data_centered,
+           gpu_array_c<const int, Device> adj_row_ptr,
+           gpu_array_c<const int, Device> adj_col_ind,
+           gpu_array_c<const double, Device> adj_data,
+           gpu_array_c<double, Device> num, int n_samples, int n_features,
+           std::uintptr_t stream) {
             launch_morans_dense(data_centered.data(), adj_row_ptr.data(),
                                 adj_col_ind.data(), adj_data.data(), num.data(),
                                 n_samples, n_features, (cudaStream_t)stream);
@@ -100,14 +103,14 @@ NB_MODULE(_autocorr_cuda, m) {
     // morans_sparse - float32
     m.def(
         "morans_sparse",
-        [](cuda_array_c<const int> adj_row_ptr,
-           cuda_array_c<const int> adj_col_ind,
-           cuda_array_c<const float> adj_data,
-           cuda_array_c<const int> data_row_ptr,
-           cuda_array_c<const int> data_col_ind,
-           cuda_array_c<const float> data_values, int n_samples, int n_features,
-           cuda_array_c<const float> mean_array, cuda_array_c<float> num,
-           std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> adj_row_ptr,
+           gpu_array_c<const int, Device> adj_col_ind,
+           gpu_array_c<const float, Device> adj_data,
+           gpu_array_c<const int, Device> data_row_ptr,
+           gpu_array_c<const int, Device> data_col_ind,
+           gpu_array_c<const float, Device> data_values, int n_samples,
+           int n_features, gpu_array_c<const float, Device> mean_array,
+           gpu_array_c<float, Device> num, std::uintptr_t stream) {
             launch_morans_sparse(adj_row_ptr.data(), adj_col_ind.data(),
                                  adj_data.data(), data_row_ptr.data(),
                                  data_col_ind.data(), data_values.data(),
@@ -120,14 +123,14 @@ NB_MODULE(_autocorr_cuda, m) {
     // morans_sparse - float64
     m.def(
         "morans_sparse",
-        [](cuda_array_c<const int> adj_row_ptr,
-           cuda_array_c<const int> adj_col_ind,
-           cuda_array_c<const double> adj_data,
-           cuda_array_c<const int> data_row_ptr,
-           cuda_array_c<const int> data_col_ind,
-           cuda_array_c<const double> data_values, int n_samples,
-           int n_features, cuda_array_c<const double> mean_array,
-           cuda_array_c<double> num, std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> adj_row_ptr,
+           gpu_array_c<const int, Device> adj_col_ind,
+           gpu_array_c<const double, Device> adj_data,
+           gpu_array_c<const int, Device> data_row_ptr,
+           gpu_array_c<const int, Device> data_col_ind,
+           gpu_array_c<const double, Device> data_values, int n_samples,
+           int n_features, gpu_array_c<const double, Device> mean_array,
+           gpu_array_c<double, Device> num, std::uintptr_t stream) {
             launch_morans_sparse(adj_row_ptr.data(), adj_col_ind.data(),
                                  adj_data.data(), data_row_ptr.data(),
                                  data_col_ind.data(), data_values.data(),
@@ -141,10 +144,12 @@ NB_MODULE(_autocorr_cuda, m) {
     // gearys_dense - float32
     m.def(
         "gearys_dense",
-        [](cuda_array_c<const float> data, cuda_array_c<const int> adj_row_ptr,
-           cuda_array_c<const int> adj_col_ind,
-           cuda_array_c<const float> adj_data, cuda_array_c<float> num,
-           int n_samples, int n_features, std::uintptr_t stream) {
+        [](gpu_array_c<const float, Device> data,
+           gpu_array_c<const int, Device> adj_row_ptr,
+           gpu_array_c<const int, Device> adj_col_ind,
+           gpu_array_c<const float, Device> adj_data,
+           gpu_array_c<float, Device> num, int n_samples, int n_features,
+           std::uintptr_t stream) {
             launch_gearys_dense(data.data(), adj_row_ptr.data(),
                                 adj_col_ind.data(), adj_data.data(), num.data(),
                                 n_samples, n_features, (cudaStream_t)stream);
@@ -154,10 +159,12 @@ NB_MODULE(_autocorr_cuda, m) {
     // gearys_dense - float64
     m.def(
         "gearys_dense",
-        [](cuda_array_c<const double> data, cuda_array_c<const int> adj_row_ptr,
-           cuda_array_c<const int> adj_col_ind,
-           cuda_array_c<const double> adj_data, cuda_array_c<double> num,
-           int n_samples, int n_features, std::uintptr_t stream) {
+        [](gpu_array_c<const double, Device> data,
+           gpu_array_c<const int, Device> adj_row_ptr,
+           gpu_array_c<const int, Device> adj_col_ind,
+           gpu_array_c<const double, Device> adj_data,
+           gpu_array_c<double, Device> num, int n_samples, int n_features,
+           std::uintptr_t stream) {
             launch_gearys_dense(data.data(), adj_row_ptr.data(),
                                 adj_col_ind.data(), adj_data.data(), num.data(),
                                 n_samples, n_features, (cudaStream_t)stream);
@@ -168,13 +175,14 @@ NB_MODULE(_autocorr_cuda, m) {
     // gearys_sparse - float32
     m.def(
         "gearys_sparse",
-        [](cuda_array_c<const int> adj_row_ptr,
-           cuda_array_c<const int> adj_col_ind,
-           cuda_array_c<const float> adj_data,
-           cuda_array_c<const int> data_row_ptr,
-           cuda_array_c<const int> data_col_ind,
-           cuda_array_c<const float> data_values, int n_samples, int n_features,
-           cuda_array_c<float> num, std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> adj_row_ptr,
+           gpu_array_c<const int, Device> adj_col_ind,
+           gpu_array_c<const float, Device> adj_data,
+           gpu_array_c<const int, Device> data_row_ptr,
+           gpu_array_c<const int, Device> data_col_ind,
+           gpu_array_c<const float, Device> data_values, int n_samples,
+           int n_features, gpu_array_c<float, Device> num,
+           std::uintptr_t stream) {
             launch_gearys_sparse(
                 adj_row_ptr.data(), adj_col_ind.data(), adj_data.data(),
                 data_row_ptr.data(), data_col_ind.data(), data_values.data(),
@@ -186,13 +194,14 @@ NB_MODULE(_autocorr_cuda, m) {
     // gearys_sparse - float64
     m.def(
         "gearys_sparse",
-        [](cuda_array_c<const int> adj_row_ptr,
-           cuda_array_c<const int> adj_col_ind,
-           cuda_array_c<const double> adj_data,
-           cuda_array_c<const int> data_row_ptr,
-           cuda_array_c<const int> data_col_ind,
-           cuda_array_c<const double> data_values, int n_samples,
-           int n_features, cuda_array_c<double> num, std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> adj_row_ptr,
+           gpu_array_c<const int, Device> adj_col_ind,
+           gpu_array_c<const double, Device> adj_data,
+           gpu_array_c<const int, Device> data_row_ptr,
+           gpu_array_c<const int, Device> data_col_ind,
+           gpu_array_c<const double, Device> data_values, int n_samples,
+           int n_features, gpu_array_c<double, Device> num,
+           std::uintptr_t stream) {
             launch_gearys_sparse(
                 adj_row_ptr.data(), adj_col_ind.data(), adj_data.data(),
                 data_row_ptr.data(), data_col_ind.data(), data_values.data(),
@@ -205,10 +214,11 @@ NB_MODULE(_autocorr_cuda, m) {
     // pre_den_sparse - float32
     m.def(
         "pre_den_sparse",
-        [](cuda_array_c<const int> data_col_ind,
-           cuda_array_c<const float> data_values, int nnz,
-           cuda_array_c<const float> mean_array, cuda_array_c<float> den,
-           cuda_array_c<int> counter, std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> data_col_ind,
+           gpu_array_c<const float, Device> data_values, int nnz,
+           gpu_array_c<const float, Device> mean_array,
+           gpu_array_c<float, Device> den, gpu_array_c<int, Device> counter,
+           std::uintptr_t stream) {
             launch_pre_den_sparse(data_col_ind.data(), data_values.data(), nnz,
                                   mean_array.data(), den.data(), counter.data(),
                                   (cudaStream_t)stream);
@@ -218,14 +228,19 @@ NB_MODULE(_autocorr_cuda, m) {
     // pre_den_sparse - float64
     m.def(
         "pre_den_sparse",
-        [](cuda_array_c<const int> data_col_ind,
-           cuda_array_c<const double> data_values, int nnz,
-           cuda_array_c<const double> mean_array, cuda_array_c<double> den,
-           cuda_array_c<int> counter, std::uintptr_t stream) {
+        [](gpu_array_c<const int, Device> data_col_ind,
+           gpu_array_c<const double, Device> data_values, int nnz,
+           gpu_array_c<const double, Device> mean_array,
+           gpu_array_c<double, Device> den, gpu_array_c<int, Device> counter,
+           std::uintptr_t stream) {
             launch_pre_den_sparse(data_col_ind.data(), data_values.data(), nnz,
                                   mean_array.data(), den.data(), counter.data(),
                                   (cudaStream_t)stream);
         },
         "data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
         "mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
 }
+
+NB_MODULE(_autocorr_cuda, m) {  // module definition for _autocorr_cuda; the morans_*/gearys_*/pre_den_sparse bindings are defined in register_bindings<Device>
+    REGISTER_GPU_BINDINGS(register_bindings, m);  // NOTE(review): macro is defined outside this diff; presumably instantiates register_bindings<Device> once per accepted DLPack device type (kDLCUDA, kDLCUDAManaged) — confirm
+}
@@ -26,12 +26,13 @@ static inline void launch_cut_smaller(int* indptr, int* index, float* data,
                                                    n_rows);
 }
 
-NB_MODULE(_bbknn_cuda, m) {
+template <typename Device>
+void register_bindings(nb::module_& m) {
     m.def(
         "find_top_k_per_row",
-        [](cuda_array_c<const float> data, cuda_array_c<const int> indptr,
-           int n_rows, int trim, cuda_array_c<float> vals,
-           std::uintptr_t stream) {
+        [](gpu_array_c<const float, Device> data,
+           gpu_array_c<const int, Device> indptr, int n_rows, int trim,
+           gpu_array_c<float, Device> vals, std::uintptr_t stream) {
             launch_find_top_k_per_row(data.data(), indptr.data(), n_rows, trim,
                                       vals.data(), (cudaStream_t)stream);
         },
@@ -40,12 +41,16 @@ NB_MODULE(_bbknn_cuda, m) {
 
     m.def(
         "cut_smaller",
-        [](cuda_array_c<int> indptr, cuda_array_c<int> index,
-           cuda_array_c<float> data, cuda_array_c<float> vals, int n_rows,
-           std::uintptr_t stream) {
+        [](gpu_array_c<int, Device> indptr, gpu_array_c<int, Device> index,
+           gpu_array_c<float, Device> data, gpu_array_c<float, Device> vals,
+           int n_rows, std::uintptr_t stream) {
             launch_cut_smaller(indptr.data(), index.data(), data.data(),
                                vals.data(), n_rows, (cudaStream_t)stream);
         },
         "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "vals"_a, "n_rows"_a,
         "stream"_a = 0);
 }
+
+NB_MODULE(_bbknn_cuda, m) {  // module definition for _bbknn_cuda; find_top_k_per_row and cut_smaller are defined in register_bindings<Device>
+    REGISTER_GPU_BINDINGS(register_bindings, m);  // NOTE(review): macro is defined outside this diff; presumably instantiates register_bindings<Device> once per accepted DLPack device type (kDLCUDA, kDLCUDAManaged) — confirm
+}