Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
bc51273
first iteration of refactor
Intron7 Apr 24, 2026
4094e6b
add rmm
Intron7 Apr 24, 2026
9c391ed
update publish and cmake
Intron7 Apr 24, 2026
76389bb
update notebooks
Intron7 Apr 24, 2026
f69f1d8
make dense faster
Intron7 Apr 24, 2026
a0e9b0c
update tests and fix issues
Intron7 Apr 24, 2026
2e36351
Merge branch 'main' into wilcoxon-refactor
Intron7 Apr 29, 2026
3682c92
Merge branch 'main' into wilcoxon-refactor
Intron7 May 12, 2026
43bc9c2
Merge branch 'main' into wilcoxon-refactor
Intron7 May 13, 2026
73cda58
fix tests
Intron7 May 13, 2026
9769f3c
Merge branch 'main' into wilcoxon-refactor
Intron7 May 20, 2026
a99fdf7
Merge branch 'main' into wilcoxon-refactor
Intron7 May 26, 2026
ce6bc48
Merge branch 'main' into wilcoxon-refactor
Intron7 May 27, 2026
bcbd4d3
Merge branch 'main' into wilcoxon-refactor
Intron7 Jun 2, 2026
0f5e89e
Merge branch 'main' into wilcoxon-refactor
Intron7 Jun 3, 2026
49d43d0
update
Intron7 Jun 3, 2026
4e4b55d
safety commit
Intron7 Jun 3, 2026
75b810a
start cleanup
Intron7 Jun 3, 2026
f8c00d8
first draft
Intron7 Jun 3, 2026
28bc282
update rmm
Intron7 Jun 5, 2026
e8638de
Merge branch 'main' into wilcoxon-refactor
Intron7 Jun 5, 2026
c165d9f
update ci buildwheel
Intron7 Jun 5, 2026
e8a0ba0
add csr densification columnwise clean up rmm
Intron7 Jun 6, 2026
4c087d3
Merge branch 'main' into wilcoxon-refactor
Intron7 Jun 8, 2026
df988d8
fix docker
Intron7 Jun 8, 2026
490bec2
Merge branch 'main' into wilcoxon-refactor
Intron7 Jun 8, 2026
82703d2
fix issues
Intron7 Jun 8, 2026
00eccb2
redo memory allocation
Intron7 Jun 9, 2026
7836578
clean up
Intron7 Jun 16, 2026
a65155a
more cleanup
Intron7 Jun 16, 2026
8710399
Merge branch 'main' into wilcoxon-refactor
Intron7 Jun 17, 2026
9f7d3e0
improve memory and dtypes and nnz for large datasets
Intron7 Jun 17, 2026
6327d60
update comments
Intron7 Jun 18, 2026
81094a2
Merge branch 'main' into wilcoxon-refactor
Intron7 Jun 18, 2026
8388d86
start dedup
Intron7 Jun 22, 2026
9470896
more dedup
Intron7 Jun 22, 2026
36a5de4
make even smaller
Intron7 Jun 22, 2026
b555a28
update testing
Intron7 Jun 22, 2026
bc8bc9e
add more tests
Intron7 Jun 22, 2026
7ca959d
update kernels and layout
Intron7 Jun 23, 2026
58bbd4a
remove small and tiny and speed up larger paths
Intron7 Jun 24, 2026
a4150cc
fix logreg order
Intron7 Jun 24, 2026
dec3593
add 64 bit
Intron7 Jun 24, 2026
a2c4b3a
update streaming
Intron7 Jun 25, 2026
ad4f811
Merge branch 'main' into wilcoxon-refactor
Intron7 Jun 25, 2026
8567329
add memory safety
Intron7 Jun 25, 2026
45d2a0b
update streaming
Intron7 Jun 25, 2026
3167d31
update python
Intron7 Jun 26, 2026
328697a
make negative fall bag better
Intron7 Jun 26, 2026
cff61e3
slim down comments
Intron7 Jun 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# This workflow will build two Docker images and push them to the GitHub Packages Container registry:
# - a base image with the dependencies
# - a main image with the application code
# Build/push two GHCR images: dependency base and application image.
# Release events push; PR/comment runs only validate.

name: Docker

Expand Down Expand Up @@ -73,8 +72,8 @@ jobs:
RAPIDS_VER:
- "26.04"
CUDA_SUFFIX:
- { ver: "12.8.0", label: "cuda12", pkg: "cu12" }
- { ver: "13.0.2", label: "cuda13", pkg: "cu13" }
- { ver: "12.9.1", label: "cuda12", pkg: "cu12" }
- { ver: "13.1.0", label: "cuda13", pkg: "cu13" }
name: Build Docker images (${{ matrix.CUDA_SUFFIX.label }})
runs-on: ubuntu-latest
permissions:
Expand Down
62 changes: 50 additions & 12 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,46 @@ jobs:
path = pathlib.Path("pyproject.toml")
text = path.read_text()

def remove_toml_array(text, key):
    """Return ``text`` with the array assignment ``<key> = [...]`` removed.

    The array is recognised by a line that starts with ``<key> = [``. That
    line is dropped together with every following line until the square
    brackets opened so far are balanced again. Brackets are counted
    textually per line, so balanced brackets inside entries (for example
    ``"pkg[extra]"``) do not end the array early. All other lines are
    returned unchanged, line endings included. If no line matches, the
    text is returned as is.
    """
    opener = f"{key} = ["
    kept = []
    # Number of unmatched "[" of the array currently being dropped;
    # a value <= 0 means we are outside of any dropped array.
    open_brackets = 0
    for line in text.splitlines(keepends=True):
        if open_brackets > 0:
            # Still inside the dropped array: track nesting, discard line.
            open_brackets += line.count("[") - line.count("]")
            continue
        if line.startswith(opener):
            # Opening line of the array: discard it and start tracking.
            open_brackets = line.count("[") - line.count("]")
            continue
        kept.append(line)
    return "".join(kept)

# Rename package
text = text.replace(
'name = "rapids-singlecell"',
f'name = "rapids-singlecell-cu{cuda}"',
)
# Rename matching extra to "rapids", remove the other
text = text.replace(f'rapids-cu{cuda} =', 'rapids =')
# Remove the other CUDA extra line entirely
lines = text.splitlines(keepends=True)
text = "".join(l for l in lines if f'rapids-cu{other}' not in l)
text = text.replace(f'rapids-cu{cuda} = [', 'rapids = [')
text = remove_toml_array(text, f"rapids-cu{other}")

# CMake links CUDA extensions against librmm.
# Add the matching wheel to isolated build requirements.
for dep in (
f' "librmm-cu{other}>=25.12",\n',
f' "rmm-cu{other}>=25.12",\n',
):
text = text.replace(dep, "")
rmm_build_req = f' "librmm-cu{cuda}>=25.12",\n'
build_system_text = text.split("[project]", 1)[0]
if f'"librmm-cu{cuda}>=25.12"' not in build_system_text:
text = text.replace(
']\nbuild-backend = "scikit_build_core.build"',
f'{rmm_build_req}]\nbuild-backend = "scikit_build_core.build"',
1,
)

# Set CUDA architectures (replace "native" with CI target archs)
text = text.replace(
Expand All @@ -96,6 +126,7 @@ jobs:

- name: Sanity check pyproject.toml
run: |
python3 -c "import tomllib; tomllib.load(open('pyproject.toml', 'rb'))"
grep -E "name|rapids|CUDA_ARCH" pyproject.toml

- name: Build CUDA manylinux image
Expand All @@ -116,18 +147,25 @@ jobs:
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
PATH=/usr/local/cuda/bin:$PATH
CIBW_BEFORE_BUILD: >
rm -f build/.librmm_dir &&
mkdir -p build &&
python -m pip install -U pip
scikit-build-core cmake ninja nanobind
librmm-cu${{ matrix.cuda_major }} &&
RMM_ROOT=$(python -c "import librmm; print(librmm.__path__[0])") &&
LOG_ROOT=$(python -c "import rapids_logger; print(rapids_logger.__path__[0])") &&
echo "[rsc-build] librmm=$RMM_ROOT" &&
echo "[rsc-build] rapids_logger=$LOG_ROOT" &&
ln -sf "$RMM_ROOT/lib64/librmm.so" /usr/local/lib/librmm.so &&
ln -sf "$LOG_ROOT/lib64/librapids_logger.so" /usr/local/lib/librapids_logger.so &&
ldconfig &&
python -c "import librmm; print(librmm.__path__[0])" > build/.librmm_dir &&
echo "[rsc-build] marker=$(cat build/.librmm_dir)"
CIBW_TEST_SKIP: "*"
CIBW_TEST_COMMAND: ""
# Exclude CUDA libs by SONAME glob (auditwheel >=6.2): the runtime
# stack (CuPy / nvidia-* wheels) provides them. Globs are version
# agnostic -- cusolver's SONAME is libcusolver.so.11 on CUDA 12 but
# .12 on CUDA 13, and nvJitLink is .12 vs .13, so pinning to the CUDA
# major would graft the wrong (or no) lib. cusolver's transitive deps
# (cublasLt, cusparse ~186MB, nvJitLink) are reached by auditwheel's
# tree walk and must each be excluded or they bloat the wheel.
CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude 'libcublas.so.*' --exclude 'libcublasLt.so.*' --exclude 'libcudart.so.*' --exclude 'libcusolver.so.*' --exclude 'libcusparse.so.*' --exclude 'libnvJitLink.so.*' -w {dest_dir} {wheel}"
# Exclude CUDA/RAPIDS runtime libs provided by dependency wheels.
# Use SONAME globs so CUDA 12/13 suffix changes do not bundle them.
CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude 'libcublas.so.*' --exclude 'libcublasLt.so.*' --exclude 'libcudart.so.*' --exclude 'libcusolver.so.*' --exclude 'libcusparse.so.*' --exclude 'libnvJitLink.so.*' --exclude librmm.so --exclude librapids_logger.so -w {dest_dir} {wheel}"
CIBW_BUILD_VERBOSITY: "1"

- uses: actions/upload-artifact@v7
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,4 @@ AGENTS.md

# tmp_scripts
tmp_scripts/
/benchmarks/
179 changes: 178 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,130 @@ if (RSC_BUILD_EXTENSIONS)
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})
find_package(nanobind CONFIG REQUIRED)
find_package(CUDAToolkit REQUIRED)
# Accumulators for RAPIDS package locations discovered below. Each starts
# out unset/empty and is appended to by the collector macro.
set(RSC_RMM_HINTS)
set(RSC_RAPIDS_CMAKE_PREFIXES)
set(RSC_CCCL_HINTS)
set(RSC_RAPIDS_LOGGER_HINTS)
set(RSC_NVTX3_HINTS)

# Collect RAPIDS CMake locations that exist under one Python prefix.
#
# For a non-empty <_rsc_prefix>, globs <prefix>/lib/python*/site-packages for
# the librmm, rapids_logger and nvidia/cu* directory layouts and appends every
# match to the RSC_* lists above. An empty prefix is a no-op.
#
# This is a macro, so the RSC_* lists and the _rsc_*_dirs / _rsc_rapids_prefixes
# scratch variables are all written in the caller's scope.
macro(_rsc_collect_rapids_python_prefix _rsc_prefix)
  if (NOT "${_rsc_prefix}" STREQUAL "")
    # rmm package config directory.
    file(GLOB _rsc_rmm_dirs
      "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/cmake/rmm")
    list(APPEND RSC_RMM_HINTS ${_rsc_rmm_dirs})

    # Generic prefixes that get added to CMAKE_PREFIX_PATH later on.
    file(GLOB _rsc_rapids_prefixes
      "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64"
      "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/rapids"
      "${_rsc_prefix}/lib/python*/site-packages/rapids_logger/lib64"
      "${_rsc_prefix}/lib/python*/site-packages/nvidia/cu*/lib"
    )
    list(APPEND RSC_RAPIDS_CMAKE_PREFIXES ${_rsc_rapids_prefixes})

    # CCCL package config: copy shipped inside librmm first, then nvidia/cu*.
    file(GLOB _rsc_cccl_dirs
      "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/rapids/cmake/cccl"
      "${_rsc_prefix}/lib/python*/site-packages/nvidia/cu*/lib/cmake/cccl"
    )
    list(APPEND RSC_CCCL_HINTS ${_rsc_cccl_dirs})

    # rapids_logger package config directory.
    file(GLOB _rsc_rapids_logger_dirs
      "${_rsc_prefix}/lib/python*/site-packages/rapids_logger/lib64/cmake/rapids_logger")
    list(APPEND RSC_RAPIDS_LOGGER_HINTS ${_rsc_rapids_logger_dirs})

    # nvtx3 package config directory (inside librmm).
    file(GLOB _rsc_nvtx3_dirs
      "${_rsc_prefix}/lib/python*/site-packages/librmm/lib64/cmake/nvtx3")
    list(APPEND RSC_NVTX3_HINTS ${_rsc_nvtx3_dirs})
  endif()
endmacro()
# Ask the build interpreter where an importable `librmm` package would keep
# its rmm CMake config (<package dir>/lib64/cmake/rmm). The snippet prints an
# empty string when librmm cannot be found; stderr is suppressed and the exit
# status is not inspected.
execute_process(
  COMMAND "${Python_EXECUTABLE}" -c "import importlib.util, pathlib; spec = importlib.util.find_spec('librmm'); print(pathlib.Path(spec.origin).parent / 'lib64' / 'cmake' / 'rmm' if spec else '')"
  OUTPUT_VARIABLE RSC_PYTHON_RMM_DIR
  ERROR_QUIET
  OUTPUT_STRIP_TRAILING_WHITESPACE
)

# Only trust the reported directory if it really contains rmm-config.cmake;
# otherwise the hint stays empty.
set(_rsc_python_rmm_hint "")
if (RSC_PYTHON_RMM_DIR AND EXISTS "${RSC_PYTHON_RMM_DIR}/rmm-config.cmake")
  set(_rsc_python_rmm_hint "${RSC_PYTHON_RMM_DIR}")
endif()
# Resolve an explicitly provided librmm package root (the "marker").
# Priority:
#   1. the RSC_LIBRMM_DIR environment variable, but only when it contains
#      lib64/cmake/rmm/rmm-config.cmake;
#   2. the file build/.librmm_dir in the source tree, which wheel builds
#      write from CIBW_BEFORE_BUILD (its content is whitespace-stripped).
# Original note: publish.yml symlinks runtime libs so auditwheel excludes them.
set(_rsc_librmm_marker "")
if (DEFINED ENV{RSC_LIBRMM_DIR} AND EXISTS "$ENV{RSC_LIBRMM_DIR}/lib64/cmake/rmm/rmm-config.cmake")
  set(_rsc_librmm_marker "$ENV{RSC_LIBRMM_DIR}")
elseif (EXISTS "${CMAKE_SOURCE_DIR}/build/.librmm_dir")
  file(READ "${CMAKE_SOURCE_DIR}/build/.librmm_dir" _rsc_librmm_marker)
  string(STRIP "${_rsc_librmm_marker}" _rsc_librmm_marker)
endif()

# A usable marker contributes its directories to the hint lists. It is
# re-validated here because the value read from the file is unchecked.
if (NOT "${_rsc_librmm_marker}" STREQUAL "" AND EXISTS "${_rsc_librmm_marker}/lib64/cmake/rmm/rmm-config.cmake")
  # rmm package config directory.
  file(GLOB _rsc_marker_rmm_dirs "${_rsc_librmm_marker}/lib64/cmake/rmm")
  list(APPEND RSC_RMM_HINTS ${_rsc_marker_rmm_dirs})

  # Prefixes for CMAKE_PREFIX_PATH; rapids_logger is looked up as a sibling
  # directory of the librmm package root.
  file(GLOB _rsc_marker_rapids_prefixes
    "${_rsc_librmm_marker}/lib64"
    "${_rsc_librmm_marker}/lib64/rapids"
    "${_rsc_librmm_marker}/../rapids_logger/lib64"
  )
  list(APPEND RSC_RAPIDS_CMAKE_PREFIXES ${_rsc_marker_rapids_prefixes})

  # CCCL package config shipped inside librmm.
  file(GLOB _rsc_marker_cccl_dirs
    "${_rsc_librmm_marker}/lib64/rapids/cmake/cccl"
  )
  list(APPEND RSC_CCCL_HINTS ${_rsc_marker_cccl_dirs})

  # rapids_logger package config in the sibling package.
  file(GLOB _rsc_marker_rapids_logger_dirs
    "${_rsc_librmm_marker}/../rapids_logger/lib64/cmake/rapids_logger")
  list(APPEND RSC_RAPIDS_LOGGER_HINTS ${_rsc_marker_rapids_logger_dirs})

  # nvtx3 package config shipped inside librmm.
  file(GLOB _rsc_marker_nvtx3_dirs "${_rsc_librmm_marker}/lib64/cmake/nvtx3")
  list(APPEND RSC_NVTX3_HINTS ${_rsc_marker_nvtx3_dirs})
endif()
# Scan candidate Python prefixes for RAPIDS packages. Order matters: later
# code only uses the first entry of each hint list, so earlier sources win.

# 1. Roots reported by CMake's Python discovery.
foreach(_rsc_python_prefix IN ITEMS
    "${Python_ROOT_DIR}"
    "${Python3_ROOT_DIR}")
  _rsc_collect_rapids_python_prefix("${_rsc_python_prefix}")
endforeach()

# 2. The active conda environment / virtualenv, when the variables are set
#    (unset variables expand to "" and are ignored by the collector).
foreach(_rsc_env_prefix IN ITEMS
    "$ENV{CONDA_PREFIX}"
    "$ENV{VIRTUAL_ENV}")
  _rsc_collect_rapids_python_prefix("${_rsc_env_prefix}")
endforeach()

# 3. The parent directory of every PATH entry (e.g. <prefix>/bin -> <prefix>).
#    PATH is split on ":" to turn it into a CMake list.
string(REPLACE ":" ";" _rsc_path_entries "$ENV{PATH}")
foreach(_rsc_path_entry IN LISTS _rsc_path_entries)
  get_filename_component(_rsc_path_prefix "${_rsc_path_entry}/.." ABSOLUTE)
  _rsc_collect_rapids_python_prefix("${_rsc_path_prefix}")
endforeach()
# Last resort: if no rmm hint was collected so far, fall back to the
# directory reported by the Python interpreter probe (when it was valid).
if (NOT RSC_RMM_HINTS AND NOT "${_rsc_python_rmm_hint}" STREQUAL "")
  list(APPEND RSC_RMM_HINTS "${_rsc_python_rmm_hint}")
endif()

# Expose the discovered prefixes to find_package() and pin the package
# config directories of rmm's dependencies to the first hit of each list.
# FORCE overwrites any previously cached value.
if (RSC_RAPIDS_CMAKE_PREFIXES)
  list(APPEND CMAKE_PREFIX_PATH ${RSC_RAPIDS_CMAKE_PREFIXES})

  if (RSC_CCCL_HINTS)
    list(GET RSC_CCCL_HINTS 0 _rsc_cccl_dir)
    set(CCCL_DIR "${_rsc_cccl_dir}"
      CACHE PATH "Path to CCCL package config" FORCE)
  endif()

  if (RSC_RAPIDS_LOGGER_HINTS)
    list(GET RSC_RAPIDS_LOGGER_HINTS 0 _rsc_rapids_logger_dir)
    set(rapids_logger_DIR "${_rsc_rapids_logger_dir}"
      CACHE PATH "Path to rapids_logger package config" FORCE)
  endif()

  if (RSC_NVTX3_HINTS)
    list(GET RSC_NVTX3_HINTS 0 _rsc_nvtx3_dir)
    set(nvtx3_DIR "${_rsc_nvtx3_dir}"
      CACHE PATH "Path to nvtx3 package config" FORCE)
  endif()
endif()

# Pin rmm_DIR to the first hint when there is one; rmm is required either way.
if (RSC_RMM_HINTS)
  list(GET RSC_RMM_HINTS 0 _rsc_rmm_dir)
  set(rmm_DIR "${_rsc_rmm_dir}" CACHE PATH "Path to rmm package config" FORCE)
endif()
find_package(rmm CONFIG REQUIRED)

# CCCL 3.3.0 gates cudaDevAttrHostNumaMemoryPoolsSupported too loosely, so a
# source build with a CUDA 12.6-12.8 toolkit would fail to compile.
# Detect that combination at configure time and stop with an explanation.
#
# The CCCL in use is treated as affected unless CCCL_VERSION is defined and
# newer than 3.3.0 (an unknown version counts as affected).
if (DEFINED CCCL_VERSION AND CCCL_VERSION VERSION_GREATER 3.3.0)
  set(_rsc_cccl_buggy_numa_guard FALSE)
else()
  set(_rsc_cccl_buggy_numa_guard TRUE)
endif()

# RSC_SKIP_CUDA_VERSION_CHECK is the escape hatch for toolkits that do define
# the enum despite falling into the rejected version range.
if (NOT RSC_SKIP_CUDA_VERSION_CHECK AND _rsc_cccl_buggy_numa_guard)
  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.6
      AND CUDAToolkit_VERSION VERSION_LESS 12.9)
    message(FATAL_ERROR
      "Cannot build rapids_singlecell from source with CUDA ${CUDAToolkit_VERSION} against "
      "CCCL ${CCCL_VERSION} (RAPIDS 26.04): it references cudaDevAttrHostNumaMemoryPoolsSupported, "
      "which the CUDA 12.6-12.8 toolkit does not define (NVIDIA added it in 12.9). "
      "Use CUDA >= 12.9 (or <= 12.5), upgrade to RAPIDS >= 26.06 (CCCL > 3.3.0 fixes the guard), "
      "or install the prebuilt wheel (pip install rapids-singlecell-cu12). "
      "If your toolkit does define this enum, override with -DRSC_SKIP_CUDA_VERSION_CHECK=ON.")
  endif()
endif()

message(STATUS "Using RMM for CUDA extension scratch allocations")
message(STATUS "Building for CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
Expand Down Expand Up @@ -62,6 +186,57 @@ function(add_nb_cuda_module target src)
endif()
endfunction()

# add_rmm_cuda_module(<target> <src>)
#
# RMM-backed nanobind CUDA module: a regular add_nb_cuda_module() target plus
# the shared scratch allocator source (rmm_scratch.cu), linked against
# rmm::rmm. Does nothing beyond add_nb_cuda_module() when
# RSC_BUILD_EXTENSIONS is off.
#
# RPATH handling:
#   * BUILD_RPATH   - librmm / rapids_logger lib64 directories of the active
#                     conda env when they exist; otherwise the directory two
#                     levels above rmm_DIR / rapids_logger_DIR. The CUDA
#                     toolkit library dir is appended last when known.
#   * INSTALL_RPATH - $ORIGIN-relative paths to sibling librmm and
#                     rapids_logger packages, plus the CUDA toolkit library
#                     dir when known.
# Original note: wheels use sibling RAPIDS packages; editable imports still
# preload fallbacks (the preload is not part of this function).
function(add_rmm_cuda_module target src)
  add_nb_cuda_module(${target} ${src})
  if (NOT RSC_BUILD_EXTENSIONS)
    return()
  endif()

  target_sources(${target} PRIVATE
    src/rapids_singlecell/_cuda/rmm_scratch.cu)
  target_link_libraries(${target} PRIVATE rmm::rmm)

  # --- build-tree RPATH ---------------------------------------------------
  set(_rsc_build_rpath_dirs)
  set(_rsc_conda_has_librmm FALSE)
  set(_rsc_conda_has_rapids_logger FALSE)

  # Prefer the libraries installed in the active conda environment.
  if (DEFINED ENV{CONDA_PREFIX})
    set(_rsc_conda_site_packages
      "$ENV{CONDA_PREFIX}/lib/python${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}/site-packages")
    if (EXISTS "${_rsc_conda_site_packages}/librmm/lib64")
      list(APPEND _rsc_build_rpath_dirs
        "${_rsc_conda_site_packages}/librmm/lib64")
      set(_rsc_conda_has_librmm TRUE)
    endif()
    if (EXISTS "${_rsc_conda_site_packages}/rapids_logger/lib64")
      list(APPEND _rsc_build_rpath_dirs
        "${_rsc_conda_site_packages}/rapids_logger/lib64")
      set(_rsc_conda_has_rapids_logger TRUE)
    endif()
  endif()

  # Otherwise derive the library directory from the package config location
  # (<libdir>/cmake/<pkg> -> <libdir>).
  if (NOT _rsc_conda_has_librmm AND rmm_DIR)
    get_filename_component(_rsc_librmm_libdir
      "${rmm_DIR}/../.." REALPATH)
    list(APPEND _rsc_build_rpath_dirs "${_rsc_librmm_libdir}")
  endif()
  if (NOT _rsc_conda_has_rapids_logger AND rapids_logger_DIR)
    get_filename_component(_rsc_rapids_logger_libdir
      "${rapids_logger_DIR}/../.." REALPATH)
    list(APPEND _rsc_build_rpath_dirs "${_rsc_rapids_logger_libdir}")
  endif()

  # --- install RPATH ------------------------------------------------------
  # Relative to the installed module so sibling RAPIDS packages are found.
  set(_rsc_install_rpath_dirs
    "\$ORIGIN/../../librmm/lib64"
    "\$ORIGIN/../../rapids_logger/lib64"
  )

  # The CUDA toolkit library directory goes last in both lists.
  if (CUDAToolkit_LIBRARY_DIR)
    list(APPEND _rsc_build_rpath_dirs "${CUDAToolkit_LIBRARY_DIR}")
    list(APPEND _rsc_install_rpath_dirs "${CUDAToolkit_LIBRARY_DIR}")
  endif()

  set_target_properties(${target} PROPERTIES
    BUILD_RPATH "${_rsc_build_rpath_dirs}"
    INSTALL_RPATH "${_rsc_install_rpath_dirs}"
  )
endfunction()

if (RSC_BUILD_EXTENSIONS)
# CUDA modules
add_nb_cuda_module(_mean_var_cuda src/rapids_singlecell/_cuda/mean_var/mean_var.cu)
Expand Down Expand Up @@ -91,7 +266,9 @@ if (RSC_BUILD_EXTENSIONS)
add_nb_cuda_module(_pseudobulk_cuda src/rapids_singlecell/_cuda/pseudobulk/pseudobulk.cu)
add_nb_cuda_module(_hvg_cuda src/rapids_singlecell/_cuda/hvg/hvg.cu)
add_nb_cuda_module(_kde_cuda src/rapids_singlecell/_cuda/kde/kde.cu)
add_nb_cuda_module(_wilcoxon_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
add_rmm_cuda_module(_wilcoxon_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
add_rmm_cuda_module(_wilcoxon_sparse_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_sparse.cu)
add_nb_cuda_module(_rank_stats_cuda src/rapids_singlecell/_cuda/rank_genes/rank_stats.cu)
# Harmony CUDA modules
add_nb_cuda_module(_harmony_scatter_cuda src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
add_nb_cuda_module(_harmony_outer_cuda src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
Expand Down
2 changes: 1 addition & 1 deletion conda/rsc_rapids_26.04_cuda12.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ channels:
dependencies:
- rapids=26.04
- python=3.14
- cuda-version=12.8
- cuda-version=12.9
- cudnn
- cutensor
- cusparselt
Expand Down
15 changes: 14 additions & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ ARG GIT_ID=main
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

ENV PATH=/opt/conda/bin:$PATH
# Point CMake's find_package(rmm) at the conda env. The conda RAPIDS env resolved
# librmm + cuda-version together, so its librmm/rapids_logger headers match the
# image's CUDA toolkit. This is what lets the --no-build-isolation build below
# pick up the CUDA-matched librmm instead of a mismatched PyPI wheel.
ENV CMAKE_PREFIX_PATH=/opt/conda
ARG CUDA_ARCHS="75-real;80-real;86-real;89-real;90-real;100-real;120"

RUN <<EOF
Expand All @@ -18,5 +23,13 @@ git checkout ${GIT_ID}
# Set CUDA architectures directly in pyproject.toml (avoids SKBUILD_CMAKE_ARGS semicolon splitting)
sed -i 's/CMAKE_CUDA_ARCHITECTURES = "native"/CMAKE_CUDA_ARCHITECTURES = "'"${CUDA_ARCHS}"'"/' pyproject.toml
grep CMAKE_CUDA_ARCHITECTURES pyproject.toml
/opt/conda/bin/python -m pip install --no-cache-dir -e .
# Build with --no-build-isolation so the compile uses the conda env's
# CUDA-matched librmm/rapids_logger headers. With isolation, PEP 517 would pull
# a fresh librmm-cu12 from PyPI (hardcoded in [build-system].requires) that
# mismatches the image's CUDA toolkit -> "cudaDevAttr* has no global scope"
# errors on both cu12 (toolkit older than the latest librmm) and cu13 (wrong
# cu12 variant). Install the PEP 517 backend deps first since isolation is off;
# the conda env already provides the librmm/rapids_logger headers + cmake config.
/opt/conda/bin/python -m pip install --no-cache-dir scikit-build-core nanobind setuptools-scm cmake ninja
/opt/conda/bin/python -m pip install --no-cache-dir --no-build-isolation -e .
EOF
4 changes: 2 additions & 2 deletions docker/Dockerfile.deps
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG CUDA_VER=13.0.2
ARG CUDA_VER=13.1.0
ARG LINUX_VER=ubuntu24.04

FROM nvidia/cuda:${CUDA_VER}-devel-${LINUX_VER}
Expand All @@ -7,7 +7,7 @@ SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

ARG PYTHON_VER=3.13
# Re-declare after FROM so it is available to RUN steps (passed by docker.yml build-args)
ARG CUDA_VER=13.0.2
ARG CUDA_VER=13.1.0

ENV PATH=/opt/conda/bin:$PATH
ENV PYTHON_VERSION=${PYTHON_VER}
Expand Down
2 changes: 1 addition & 1 deletion docker/docker-push.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ rapids_version=26.04

# CUDA toolkit version used for each image flavour, keyed by package suffix.
# Keep in sync with the CUDA_SUFFIX matrix in .github/workflows/docker.yml.
# cu12 must not fall in the 12.6-12.8 range: CMakeLists.txt aborts source
# builds for those toolkits unless CCCL is newer than 3.3.0.
declare -A cuda_versions=(
[cu12]="12.9.1"
[cu13]="13.1.0"
)

declare -A cuda_archs=(
Expand Down
Loading
Loading