scverse
diff --git a/‎.github/workflows/publish.yml‎
Lines changed: 8 additions & 1 deletion b/‎.github/workflows/publish.yml‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docker/manylinux_2_28_aarch64_cuda12.2.Dockerfile‎
Lines changed: 4 additions & 1 deletion b/‎docker/manylinux_2_28_aarch64_cuda12.2.Dockerfile‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎docker/manylinux_2_28_aarch64_cuda13.0.Dockerfile‎
Lines changed: 4 additions & 1 deletion b/‎docker/manylinux_2_28_aarch64_cuda13.0.Dockerfile‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎docker/manylinux_2_28_x86_64_cuda12.2.Dockerfile‎
Lines changed: 7 additions & 2 deletions b/‎docker/manylinux_2_28_x86_64_cuda12.2.Dockerfile‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎docker/manylinux_2_28_x86_64_cuda13.0.Dockerfile‎
Lines changed: 4 additions & 1 deletion b/‎docker/manylinux_2_28_x86_64_cuda13.0.Dockerfile‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎docs/api/pertpy_gpu.md‎
Lines changed: 32 additions & 0 deletions b/‎docs/api/pertpy_gpu.md‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎docs/release-notes/0.15.3.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/release-notes/0.15.3.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/rapids_singlecell/_cuda/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎src/rapids_singlecell/_cuda/__init__.py‎
Lines changed: 1 addition & 0 deletions
@@ -120,7 +120,14 @@ jobs:
             scikit-build-core cmake ninja nanobind
           CIBW_TEST_SKIP: "*"
           CIBW_TEST_COMMAND: ""
-          CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} -w {dest_dir} {wheel}"
+          # Exclude CUDA libs by SONAME glob (auditwheel >=6.2): the runtime
+          # stack (CuPy / nvidia-* wheels) provides them. Globs are version
+          # agnostic -- cusolver's SONAME is libcusolver.so.11 on CUDA 12 but
+          # .12 on CUDA 13 (nvJitLink: .12 vs .13), so an exclude pinned to the CUDA
+          # major can miss a lib, which auditwheel then grafts into the wheel.
+          # cusolver's transitive deps (cublasLt, cusparse ~186MB, nvJitLink) are reached
+          # by auditwheel's tree walk and must each be excluded or they bloat the wheel.
+          CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude 'libcublas.so.*' --exclude 'libcublasLt.so.*' --exclude 'libcudart.so.*' --exclude 'libcusolver.so.*' --exclude 'libcusparse.so.*' --exclude 'libnvJitLink.so.*' -w {dest_dir} {wheel}"
           CIBW_BUILD_VERBOSITY: "1"
 
       - uses: actions/upload-artifact@v7
 
@@ -73,6 +73,8 @@ if (RSC_BUILD_EXTENSIONS)
   add_nb_cuda_module(_norm_cuda         src/rapids_singlecell/_cuda/norm/norm.cu)
   add_nb_cuda_module(_gmm_cuda          src/rapids_singlecell/_cuda/gmm/gmm.cu)
   target_link_libraries(_gmm_cuda PRIVATE CUDA::cublas)
+  target_link_libraries(_gmm_cuda PRIVATE CUDA::cusolver)
+  add_nb_cuda_module(_mixscale_cuda     src/rapids_singlecell/_cuda/mixscale/mixscale.cu)
   add_nb_cuda_module(_pr_cuda           src/rapids_singlecell/_cuda/pr/pr.cu)
   add_nb_cuda_module(_nn_descent_cuda   src/rapids_singlecell/_cuda/nn_descent/nn_descent.cu)
   add_nb_cuda_module(_aucell_cuda       src/rapids_singlecell/_cuda/aucell/aucell.cu)
 
@@ -14,7 +14,10 @@ RUN yum -y install dnf-plugins-core && \
         libcublas-12-2 \
         libcublas-devel-12-2 \
         libcusparse-12-2 \
-        libcusparse-devel-12-2 && \
+        libcusparse-devel-12-2 \
+        libcusolver-12-2 \
+        libcusolver-devel-12-2 \
+        libnvjitlink-12-2 && \
     yum clean all
 
 ENV CUDA_HOME=/usr/local/cuda
 
@@ -10,7 +10,10 @@ RUN yum -y install dnf-plugins-core && \
         libcublas-13-0 \
         libcublas-devel-13-0 \
         libcusparse-13-0 \
-        libcusparse-devel-13-0 && \
+        libcusparse-devel-13-0 \
+        libcusolver-13-0 \
+        libcusolver-devel-13-0 \
+        libnvjitlink-13-0 && \
     yum clean all
 
 ENV CUDA_HOME=/usr/local/cuda
 
@@ -8,15 +8,20 @@ RUN yum -y install gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ && \
 RUN yum -y install dnf-plugins-core && \
     dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
     yum -y clean all && yum -y makecache && \
-    # Install only what you actually link against
+    # Install what you link against, plus cusolver's runtime nvJitLink dependency
+    # (libcusolver.so has a DT_NEEDED on libnvJitLink that its RPM does not declare,
+    # so install it explicitly or the nanobind stub-generation import fails).
     yum -y install \
       cuda-nvcc-12-2 \
       cuda-cudart-12-2 \
       cuda-cudart-devel-12-2 \
       libcublas-12-2 \
       libcublas-devel-12-2 \
       libcusparse-12-2 \
-      libcusparse-devel-12-2 && \
+      libcusparse-devel-12-2 \
+      libcusolver-12-2 \
+      libcusolver-devel-12-2 \
+      libnvjitlink-12-2 && \
     yum clean all
 
 ENV CUDA_HOME=/usr/local/cuda
 
@@ -12,7 +12,10 @@ RUN yum -y install dnf-plugins-core && \
       libcublas-13-0 \
       libcublas-devel-13-0 \
       libcusparse-13-0 \
-      libcusparse-devel-13-0 && \
+      libcusparse-devel-13-0 \
+      libcusolver-13-0 \
+      libcusolver-devel-13-0 \
+      libnvjitlink-13-0 && \
     yum clean all
 
 ENV CUDA_HOME=/usr/local/cuda
 
@@ -73,3 +73,35 @@
     .. automethod:: assign_mixture_model
         :no-index:
 ```
+
+## Mixscape
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+
+    Mixscape
+```
+
+```{eval-rst}
+.. autoclass:: Mixscape
+    :no-index:
+
+    .. rubric:: Methods
+
+    .. autosummary::
+
+        ~Mixscape.perturbation_signature
+        ~Mixscape.mixscape
+        ~Mixscape.mixscale
+        ~Mixscape.lda
+
+    .. automethod:: perturbation_signature
+        :no-index:
+    .. automethod:: mixscape
+        :no-index:
+    .. automethod:: mixscale
+        :no-index:
+    .. automethod:: lda
+        :no-index:
+```
@@ -2,4 +2,9 @@
 
 ```{rubric} Features
 ```
+* Add {class}`~rapids_singlecell.ptg.Mixscape` for GPU-accelerated Mixscape (`perturbation_signature`, `mixscape`, `mixscale`, `lda`) {pr}`688` {smaller}`S Dicks`
+
+```{rubric} Performance
+```
+* Batched cuSOLVER precision-Cholesky for the full-covariance GMM (`gmm_fit_predict`), ~2-3x faster {pr}`688` {smaller}`S Dicks`
 * {class}`~rapids_singlecell.ptg.Distance` with ``metric="edistance"`` now accepts sparse CSR input (a sparse layer or ``layer_key="X"``), densified inside the CUDA kernel so the dense matrix is never materialized on the GPU {pr}`689` {smaller}`S Dicks`
@@ -35,6 +35,7 @@
     "_kde_cuda",
     "_ligrec_cuda",
     "_mean_var_cuda",
+    "_mixscale_cuda",
     "_nanmean_cuda",
     "_nn_descent_cuda",
     "_norm_cuda",