scverse · Intron7 · Jun 26, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -1,6 +1,5 @@
-# This workflow will build two Docker image and push then to GitHub Packages Container registry:
-# - a base image with the dependencies
-# - a main image with the application code
+# Build/push two GHCR images: dependency base and application image.
+# Release events push; PR/comment runs only validate.
 
 name: Docker
 
@@ -73,8 +72,8 @@ jobs:
         RAPIDS_VER:
           - "26.04"
         CUDA_SUFFIX:
-          - { ver: "12.8.0", label: "cuda12", pkg: "cu12" }
-          - { ver: "13.0.2", label: "cuda13", pkg: "cu13" }
+          - { ver: "12.9.1", label: "cuda12", pkg: "cu12" }
+          - { ver: "13.1.0", label: "cuda13", pkg: "cu13" }
     name: Build Docker images (${{ matrix.CUDA_SUFFIX.label }})
     runs-on: ubuntu-latest
     permissions:

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -69,16 +69,46 @@ jobs:
           path = pathlib.Path("pyproject.toml")
           text = path.read_text()
 
+          def remove_toml_array(text, key):
+              """Return text without the TOML array assigned to key (one- or multi-line)."""
+              header = f"{key} = ["
+              kept = []
+              # Bracket balance of the array being dropped; > 0 while still inside it.
+              depth = 0
+              for line in text.splitlines(keepends=True):
+                  balance = line.count("[") - line.count("]")
+                  if depth > 0:
+                      depth += balance
+                  elif line.startswith(header):
+                      depth = balance
+                  else:
+                      kept.append(line)
+              return "".join(kept)
+
           # Rename package
           text = text.replace(
               'name = "rapids-singlecell"',
               f'name = "rapids-singlecell-cu{cuda}"',
           )
           # Rename matching extra to "rapids", remove the other
-          text = text.replace(f'rapids-cu{cuda} =', 'rapids =')
-          # Remove the other CUDA extra line entirely
-          lines = text.splitlines(keepends=True)
-          text = "".join(l for l in lines if f'rapids-cu{other}' not in l)
+          text = text.replace(f'rapids-cu{cuda} = [', 'rapids = [')
+          text = remove_toml_array(text, f"rapids-cu{other}")
+
+          # CMake links the CUDA extensions against librmm: drop the other CUDA major's
+          # librmm/rmm pins, then add the matching librmm wheel to isolated build requirements.
+          for dep in (
+              f'    "librmm-cu{other}>=25.12",\n',
+              f'    "rmm-cu{other}>=25.12",\n',
+          ):
+              text = text.replace(dep, "")
+          rmm_build_req = f'    "librmm-cu{cuda}>=25.12",\n'
+          requires_end = ']\nbuild-backend = "scikit_build_core.build"'
+          build_system_text = text.split("[project]", 1)[0]
+          if f'"librmm-cu{cuda}>=25.12"' not in build_system_text:
+              # A silent no-op here would publish metadata without the librmm build dep.
+              if requires_end not in text:
+                  raise SystemExit("pyproject.toml: end of build-system requires not found")
+              text = text.replace(requires_end, rmm_build_req + requires_end, 1)
 
           # Set CUDA architectures (replace "native" with CI target archs)
           text = text.replace(
@@ -96,6 +126,7 @@ jobs:
 
       - name: Sanity check pyproject.toml
         run: |
+          python3 -c "import tomllib; tomllib.load(open('pyproject.toml', 'rb'))"
           grep -E "name|rapids|CUDA_ARCH" pyproject.toml
 
       - name: Build CUDA manylinux image
@@ -116,18 +147,25 @@ jobs:
             LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
             PATH=/usr/local/cuda/bin:$PATH
           CIBW_BEFORE_BUILD: >
+            rm -f build/.librmm_dir &&
+            mkdir -p build &&
             python -m pip install -U pip
             scikit-build-core cmake ninja nanobind
+            librmm-cu${{ matrix.cuda_major }} &&
+            RMM_ROOT=$(python -c "import librmm; print(librmm.__path__[0])") &&
+            LOG_ROOT=$(python -c "import rapids_logger; print(rapids_logger.__path__[0])") &&
+            echo "[rsc-build] librmm=$RMM_ROOT" &&
+            echo "[rsc-build] rapids_logger=$LOG_ROOT" &&
+            ln -sf "$RMM_ROOT/lib64/librmm.so" /usr/local/lib/librmm.so &&
+            ln -sf "$LOG_ROOT/lib64/librapids_logger.so" /usr/local/lib/librapids_logger.so &&
+            ldconfig &&
+            python -c "import librmm; print(librmm.__path__[0])" > build/.librmm_dir &&
+            echo "[rsc-build] marker=$(cat build/.librmm_dir)"
           CIBW_TEST_SKIP: "*"
           CIBW_TEST_COMMAND: ""
-          # Exclude CUDA libs by SONAME glob (auditwheel >=6.2): the runtime
-          # stack (CuPy / nvidia-* wheels) provides them. Globs are version
-          # agnostic -- cusolver's SONAME is libcusolver.so.11 on CUDA 12 but
-          # .12 on CUDA 13, and nvJitLink is .12 vs .13, so pinning to the CUDA
-          # major would graft the wrong (or no) lib. cusolver's transitive deps
-          # (cublasLt, cusparse ~186MB, nvJitLink) are reached by auditwheel's
-          # tree walk and must each be excluded or they bloat the wheel.
-          CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude 'libcublas.so.*' --exclude 'libcublasLt.so.*' --exclude 'libcudart.so.*' --exclude 'libcusolver.so.*' --exclude 'libcusparse.so.*' --exclude 'libnvJitLink.so.*' -w {dest_dir} {wheel}"
+          # Exclude CUDA/RAPIDS runtime libs provided by dependency wheels.
+          # Use SONAME globs so CUDA 12/13 suffix changes do not bundle them.
+          CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude 'libcublas.so.*' --exclude 'libcublasLt.so.*' --exclude 'libcudart.so.*' --exclude 'libcusolver.so.*' --exclude 'libcusparse.so.*' --exclude 'libnvJitLink.so.*' --exclude librmm.so --exclude librapids_logger.so -w {dest_dir} {wheel}"
           CIBW_BUILD_VERBOSITY: "1"
 
       - uses: actions/upload-artifact@v7

diff --git a/.gitignore b/.gitignore
@@ -54,3 +54,4 @@ AGENTS.md
 
 # tmp_scripts
 tmp_scripts/
+/benchmarks/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -14,6 +14,130 @@ if (RSC_BUILD_EXTENSIONS)
   find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})
   find_package(nanobind CONFIG REQUIRED)
   find_package(CUDAToolkit REQUIRED)
+  set(RSC_RMM_HINTS)
+  set(RSC_RAPIDS_CMAKE_PREFIXES)
+  set(RSC_CCCL_HINTS)
+  set(RSC_RAPIDS_LOGGER_HINTS)
+  set(RSC_NVTX3_HINTS)
+  macro(_rsc_collect_rapids_python_prefix _rsc_prefix)
+    if (NOT "${_rsc_prefix}" STREQUAL "")
+      file(GLOB _rsc_rmm_dirs "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/cmake/rmm")
+      file(GLOB _rsc_rapids_prefixes
+        "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64"
+        "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/rapids"
+        "${_rsc_prefix}/lib/python*/site-packages/rapids_logger/lib64"
+        "${_rsc_prefix}/lib/python*/site-packages/nvidia/cu*/lib"
+      )
+      file(GLOB _rsc_cccl_dirs
+        "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/rapids/cmake/cccl"
+        "${_rsc_prefix}/lib/python*/site-packages/nvidia/cu*/lib/cmake/cccl"
+      )
+      file(GLOB _rsc_rapids_logger_dirs "${_rsc_prefix}/lib/python*/site-packages/rapids_logger/lib64/cmake/rapids_logger")
+      file(GLOB _rsc_nvtx3_dirs "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/cmake/nvtx3")
+      list(APPEND RSC_RMM_HINTS ${_rsc_rmm_dirs})
+      list(APPEND RSC_RAPIDS_CMAKE_PREFIXES ${_rsc_rapids_prefixes})
+      list(APPEND RSC_CCCL_HINTS ${_rsc_cccl_dirs})
+      list(APPEND RSC_RAPIDS_LOGGER_HINTS ${_rsc_rapids_logger_dirs})
+      list(APPEND RSC_NVTX3_HINTS ${_rsc_nvtx3_dirs})
+    endif()
+  endmacro()
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import importlib.util, pathlib; spec = importlib.util.find_spec('librmm'); print(pathlib.Path(spec.origin).parent / 'lib64' / 'cmake' / 'rmm' if spec else '')"
+    OUTPUT_VARIABLE RSC_PYTHON_RMM_DIR
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_QUIET
+  )
+  if (RSC_PYTHON_RMM_DIR AND EXISTS "${RSC_PYTHON_RMM_DIR}/rmm-config.cmake")
+    set(_rsc_python_rmm_hint "${RSC_PYTHON_RMM_DIR}")
+  else()
+    set(_rsc_python_rmm_hint "")
+  endif()
+  # Prefer $ENV{RSC_LIBRMM_DIR}; otherwise read build/.librmm_dir, which wheel builds write from CIBW_BEFORE_BUILD.
+  # publish.yml symlinks runtime libs so auditwheel excludes them.
+  if(DEFINED ENV{RSC_LIBRMM_DIR} AND EXISTS "$ENV{RSC_LIBRMM_DIR}/lib64/cmake/rmm/rmm-config.cmake")
+    set(_rsc_librmm_marker "$ENV{RSC_LIBRMM_DIR}")
+  elseif(EXISTS "${CMAKE_SOURCE_DIR}/build/.librmm_dir")
+    file(READ "${CMAKE_SOURCE_DIR}/build/.librmm_dir" _rsc_librmm_marker)
+    string(STRIP "${_rsc_librmm_marker}" _rsc_librmm_marker)
+  else()
+    set(_rsc_librmm_marker "")
+  endif()
+  if(NOT "${_rsc_librmm_marker}" STREQUAL "" AND EXISTS "${_rsc_librmm_marker}/lib64/cmake/rmm/rmm-config.cmake")
+    file(GLOB _rsc_marker_rmm_dirs "${_rsc_librmm_marker}/lib64/cmake/rmm")
+    file(GLOB _rsc_marker_rapids_prefixes
+      "${_rsc_librmm_marker}/lib64"
+      "${_rsc_librmm_marker}/lib64/rapids"
+      "${_rsc_librmm_marker}/../rapids_logger/lib64"
+    )
+    file(GLOB _rsc_marker_cccl_dirs
+      "${_rsc_librmm_marker}/lib64/rapids/cmake/cccl"
+    )
+    file(GLOB _rsc_marker_rapids_logger_dirs "${_rsc_librmm_marker}/../rapids_logger/lib64/cmake/rapids_logger")
+    file(GLOB _rsc_marker_nvtx3_dirs "${_rsc_librmm_marker}/lib64/cmake/nvtx3")
+    list(APPEND RSC_RMM_HINTS ${_rsc_marker_rmm_dirs})
+    list(APPEND RSC_RAPIDS_CMAKE_PREFIXES ${_rsc_marker_rapids_prefixes})
+    list(APPEND RSC_CCCL_HINTS ${_rsc_marker_cccl_dirs})
+    list(APPEND RSC_RAPIDS_LOGGER_HINTS ${_rsc_marker_rapids_logger_dirs})
+    list(APPEND RSC_NVTX3_HINTS ${_rsc_marker_nvtx3_dirs})
+  endif()
+  foreach(_rsc_python_prefix IN ITEMS "${Python_ROOT_DIR}" "${Python3_ROOT_DIR}")
+    _rsc_collect_rapids_python_prefix("${_rsc_python_prefix}")
+  endforeach()
+  foreach(_rsc_env_prefix IN ITEMS "$ENV{CONDA_PREFIX}" "$ENV{VIRTUAL_ENV}")
+    _rsc_collect_rapids_python_prefix("${_rsc_env_prefix}")
+  endforeach()
+  string(REPLACE ":" ";" _rsc_path_entries "$ENV{PATH}")
+  foreach(_rsc_path_entry IN LISTS _rsc_path_entries)
+    get_filename_component(_rsc_path_prefix "${_rsc_path_entry}/.." ABSOLUTE)
+    _rsc_collect_rapids_python_prefix("${_rsc_path_prefix}")
+  endforeach()
+  if (NOT RSC_RMM_HINTS
+      AND NOT "${_rsc_python_rmm_hint}" STREQUAL "")
+    list(APPEND RSC_RMM_HINTS "${_rsc_python_rmm_hint}")
+  endif()
+  if (RSC_RAPIDS_CMAKE_PREFIXES)
+    list(APPEND CMAKE_PREFIX_PATH ${RSC_RAPIDS_CMAKE_PREFIXES})
+    if (RSC_CCCL_HINTS)
+      list(GET RSC_CCCL_HINTS 0 _rsc_cccl_dir)
+      set(CCCL_DIR "${_rsc_cccl_dir}" CACHE PATH "Path to CCCL package config" FORCE)
+    endif()
+    if (RSC_RAPIDS_LOGGER_HINTS)
+      list(GET RSC_RAPIDS_LOGGER_HINTS 0 _rsc_rapids_logger_dir)
+      set(rapids_logger_DIR "${_rsc_rapids_logger_dir}" CACHE PATH "Path to rapids_logger package config" FORCE)
+    endif()
+    if (RSC_NVTX3_HINTS)
+      list(GET RSC_NVTX3_HINTS 0 _rsc_nvtx3_dir)
+      set(nvtx3_DIR "${_rsc_nvtx3_dir}" CACHE PATH "Path to nvtx3 package config" FORCE)
+    endif()
+  endif()
+  # Pin rmm_DIR to the first discovered config directory when there is one;
+  # without hints, find_package relies on CMake's regular search locations.
+  if (RSC_RMM_HINTS)
+    list(GET RSC_RMM_HINTS 0 _rsc_rmm_dir)
+    set(rmm_DIR "${_rsc_rmm_dir}" CACHE PATH "Path to rmm package config" FORCE)
+  endif()
+  find_package(rmm CONFIG REQUIRED)
+
+  # CCCL 3.3.0 gates cudaDevAttrHostNumaMemoryPoolsSupported too loosely.
+  # Fail fast for CUDA 12.6-12.8 source builds unless CCCL_VERSION is known to be > 3.3.0.
+  set(_rsc_cccl_buggy_numa_guard TRUE)
+  if (DEFINED CCCL_VERSION AND CCCL_VERSION VERSION_GREATER 3.3.0)
+    set(_rsc_cccl_buggy_numa_guard FALSE)
+  endif()
+  if (NOT RSC_SKIP_CUDA_VERSION_CHECK
+      AND _rsc_cccl_buggy_numa_guard
+      AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.6
+      AND CUDAToolkit_VERSION VERSION_LESS 12.9)
+    message(FATAL_ERROR
+      "Cannot build rapids_singlecell from source with CUDA ${CUDAToolkit_VERSION} against "
+      "CCCL ${CCCL_VERSION} (RAPIDS 26.04): it references cudaDevAttrHostNumaMemoryPoolsSupported, "
+      "which the CUDA 12.6-12.8 toolkit does not define (NVIDIA added it in 12.9). "
+      "Use CUDA >= 12.9 (or <= 12.5), upgrade to RAPIDS >= 26.06 (CCCL > 3.3.0 fixes the guard), "
+      "or install the prebuilt wheel (pip install rapids-singlecell-cu12). "
+      "If your toolkit does define this enum, override with -DRSC_SKIP_CUDA_VERSION_CHECK=ON.")
+  endif()
+
+  message(STATUS "Using RMM for CUDA extension scratch allocations")
   message(STATUS "Building for CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 else()
   message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
@@ -62,6 +186,57 @@ function(add_nb_cuda_module target src)
   endif()
 endfunction()
 
+# RMM-backed nanobind CUDA module: normal module plus shared scratch allocator.
+# Wheels use sibling RAPIDS packages; editable imports still preload fallbacks.
+function(add_rmm_cuda_module target src)
+  add_nb_cuda_module(${target} ${src})
+  if (RSC_BUILD_EXTENSIONS)
+    target_sources(${target} PRIVATE
+        src/rapids_singlecell/_cuda/rmm_scratch.cu)
+    target_link_libraries(${target} PRIVATE rmm::rmm)
+    set(_rsc_rmm_build_rpath)
+    set(_rsc_rmm_have_build_librmm FALSE)
+    set(_rsc_rmm_have_build_rapids_logger FALSE)
+    if (DEFINED ENV{CONDA_PREFIX})
+      set(_rsc_rmm_env_site
+          "$ENV{CONDA_PREFIX}/lib/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages")
+      if (EXISTS "${_rsc_rmm_env_site}/librmm/lib64")
+        list(APPEND _rsc_rmm_build_rpath
+            "${_rsc_rmm_env_site}/librmm/lib64")
+        set(_rsc_rmm_have_build_librmm TRUE)
+      endif()
+      if (EXISTS "${_rsc_rmm_env_site}/rapids_logger/lib64")
+        list(APPEND _rsc_rmm_build_rpath
+            "${_rsc_rmm_env_site}/rapids_logger/lib64")
+        set(_rsc_rmm_have_build_rapids_logger TRUE)
+      endif()
+    endif()
+    if (NOT _rsc_rmm_have_build_librmm AND rmm_DIR)
+      get_filename_component(_rsc_rmm_build_librmm_dir
+          "${rmm_DIR}/../.." REALPATH)
+      list(APPEND _rsc_rmm_build_rpath "${_rsc_rmm_build_librmm_dir}")
+    endif()
+    if (NOT _rsc_rmm_have_build_rapids_logger AND rapids_logger_DIR)
+      get_filename_component(_rsc_rmm_build_rapids_logger_dir
+          "${rapids_logger_DIR}/../.." REALPATH)
+      list(APPEND _rsc_rmm_build_rpath
+          "${_rsc_rmm_build_rapids_logger_dir}")
+    endif()
+    set(_rsc_rmm_install_rpath
+        "\$ORIGIN/../../librmm/lib64"
+        "\$ORIGIN/../../rapids_logger/lib64"
+    )
+    if (CUDAToolkit_LIBRARY_DIR)
+      list(APPEND _rsc_rmm_build_rpath "${CUDAToolkit_LIBRARY_DIR}")
+      list(APPEND _rsc_rmm_install_rpath "${CUDAToolkit_LIBRARY_DIR}")
+    endif()
+    set_target_properties(${target} PROPERTIES
+        BUILD_RPATH "${_rsc_rmm_build_rpath}"
+        INSTALL_RPATH "${_rsc_rmm_install_rpath}"
+    )
+  endif()
+endfunction()
+
 if (RSC_BUILD_EXTENSIONS)
   # CUDA modules
   add_nb_cuda_module(_mean_var_cuda     src/rapids_singlecell/_cuda/mean_var/mean_var.cu)
@@ -91,7 +266,9 @@ if (RSC_BUILD_EXTENSIONS)
   add_nb_cuda_module(_pseudobulk_cuda   src/rapids_singlecell/_cuda/pseudobulk/pseudobulk.cu)
   add_nb_cuda_module(_hvg_cuda          src/rapids_singlecell/_cuda/hvg/hvg.cu)
   add_nb_cuda_module(_kde_cuda          src/rapids_singlecell/_cuda/kde/kde.cu)
-  add_nb_cuda_module(_wilcoxon_cuda     src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
+  add_rmm_cuda_module(_wilcoxon_cuda        src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
+  add_rmm_cuda_module(_wilcoxon_sparse_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_sparse.cu)
+  add_nb_cuda_module(_rank_stats_cuda   src/rapids_singlecell/_cuda/rank_genes/rank_stats.cu)
   # Harmony CUDA modules
   add_nb_cuda_module(_harmony_scatter_cuda   src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
   add_nb_cuda_module(_harmony_outer_cuda     src/rapids_singlecell/_cuda/harmony/outer/outer.cu)

diff --git a/conda/rsc_rapids_26.04_cuda12.yml b/conda/rsc_rapids_26.04_cuda12.yml
@@ -7,7 +7,7 @@ channels:
 dependencies:
  - rapids=26.04
  - python=3.14
- - cuda-version=12.8
+ - cuda-version=12.9
  - cudnn
  - cutensor
  - cusparselt

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -5,6 +5,11 @@ ARG GIT_ID=main
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
 ENV PATH=/opt/conda/bin:$PATH
+# Point CMake's find_package(rmm) at the conda env. The conda RAPIDS env resolved
+# librmm + cuda-version together, so its librmm/rapids_logger headers match the
+# image's CUDA toolkit. This is what lets the --no-build-isolation build below
+# pick up the CUDA-matched librmm instead of a mismatched PyPI wheel.
+ENV CMAKE_PREFIX_PATH=/opt/conda
 ARG CUDA_ARCHS="75-real;80-real;86-real;89-real;90-real;100-real;120"
 
 RUN <<EOF
@@ -18,5 +23,13 @@ git checkout ${GIT_ID}
 # Set CUDA architectures directly in pyproject.toml (avoids SKBUILD_CMAKE_ARGS semicolon splitting)
 sed -i 's/CMAKE_CUDA_ARCHITECTURES = "native"/CMAKE_CUDA_ARCHITECTURES = "'"${CUDA_ARCHS}"'"/' pyproject.toml
 grep CMAKE_CUDA_ARCHITECTURES pyproject.toml
-/opt/conda/bin/python -m pip install --no-cache-dir -e .
+# Build with --no-build-isolation so the compile uses the conda env's
+# CUDA-matched librmm/rapids_logger headers. With isolation, PEP 517 would pull
+# a fresh librmm-cu12 from PyPI (hardcoded in [build-system].requires) that
+# mismatches the image's CUDA toolkit -> "cudaDevAttr* has no global scope"
+# errors on both cu12 (toolkit older than the latest librmm) and cu13 (wrong
+# cu12 variant). Install the PEP 517 backend deps first since isolation is off;
+# the conda env already provides the librmm/rapids_logger headers + cmake config.
+/opt/conda/bin/python -m pip install --no-cache-dir scikit-build-core nanobind setuptools-scm cmake ninja
+/opt/conda/bin/python -m pip install --no-cache-dir --no-build-isolation -e .
 EOF
diff --git a/docker/Dockerfile.deps b/docker/Dockerfile.deps
@@ -1,4 +1,4 @@
-ARG CUDA_VER=13.0.2
+ARG CUDA_VER=13.1.0
 ARG LINUX_VER=ubuntu24.04
 
 FROM nvidia/cuda:${CUDA_VER}-devel-${LINUX_VER}
@@ -7,7 +7,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
 
 ARG PYTHON_VER=3.13
 # Re-declare after FROM so it is available to RUN steps (passed by docker.yml build-args)
-ARG CUDA_VER=13.0.2
+ARG CUDA_VER=13.1.0
 
 ENV PATH=/opt/conda/bin:$PATH
 ENV PYTHON_VERSION=${PYTHON_VER}

diff --git a/docker/docker-push.sh b/docker/docker-push.sh
@@ -6,7 +6,7 @@ rapids_version=26.04
 
 declare -A cuda_versions=(
-    [cu12]="12.8.0"
-    [cu13]="13.0.2"
+    [cu12]="12.9.1"  # keep in sync with docker.yml; CMakeLists.txt rejects CUDA 12.6-12.8 builds
+    [cu13]="13.1.0"
 )
 
 declare -A cuda_archs=(
Original file line number	Diff line number	Diff line change
Expand Up		@@ -54,3 +54,4 @@ AGENTS.md

		# tmp_scripts
		tmp_scripts/
		/benchmarks/