Skip to content

Commit 55fc707

Browse files
authored
Upgrade to CUDA 12.9 (#77)
* Upgrade to CUDA 12.9
* Use latest NCCL
* fix
1 parent 21b7e27 commit 55fc707

5 files changed

Lines changed: 11 additions & 7 deletions

File tree

containers/ci_container.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ x-rapids_versions:
77
stable: &rapids_version "25.12"
88

99
x-cuda_versions:
10-
cuda: &cuda_version "12.8.0"
11-
nccl: &nccl_version "2.27.7-1"
10+
cuda: &cuda_version "12.9.0"
11+
nccl: &nccl_version "2.29.2-1"
1212

1313
xgb-ci.gpu_build_rockylinux8:
1414
container_def: gpu_build_rockylinux8

containers/dockerfile/Dockerfile.gpu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@ ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
1818
# Install all basic requirements
1919
RUN \
2020
{ [ $ARCH = "aarch64" ] && export CUDA_REPO_ARCH="sbsa" || export CUDA_REPO_ARCH="x86_64"; } && \
21+
# Derive MAJOR.MINOR from CUDA_VERSION (e.g. 12.9.0 -> 12.9); cut keeps multi-digit minors such as 12.10 intact.
export CUDA_SHORT="$(echo "$CUDA_VERSION" | cut -d. -f1,2)" && \
2122
export NCCL_VERSION=$NCCL_VERSION && \
2223
sed -i 's/ports.ubuntu.com/mirrors.ocf.berkeley.edu/g' /etc/apt/sources.list && \
2324
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${CUDA_REPO_ARCH}/3bf863cc.pub && \
2425
apt-get update && \
2526
apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \
26-
apt-get install "libnccl2=${NCCL_VERSION}+cuda12.9" \
27-
"libnccl-dev=${NCCL_VERSION}+cuda12.9" -y --allow-change-held-packages && \
27+
apt-get install "libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT}" \
28+
"libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}" -y --allow-change-held-packages && \
2829
# Miniforge
2930
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/$MINIFORGE_VERSION/Miniforge3-$MINIFORGE_VERSION-Linux-${ARCH}.sh && \
3031
bash conda.sh -b -p /opt/miniforge

containers/dockerfile/Dockerfile.gpu_build_cuda13_rockylinux8

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,11 @@ RUN \
3535
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
3636
RUN \
3737
{ [ $ARCH = "aarch64" ] && export CUDA_REPO_ARCH="sbsa" || export CUDA_REPO_ARCH="x86_64"; } && \
38+
# Derive MAJOR.MINOR from CUDA_VERSION (e.g. 13.0.1 -> 13.0); cut keeps multi-digit minors such as 13.10 intact.
# NOTE(review): the dnf install line in this RUN hard-codes cuda13.1 and does not reference CUDA_SHORT; confirm whether that is intentional.
export CUDA_SHORT="$(echo "$CUDA_VERSION" | cut -d. -f1,2)" && \
3839
export NCCL_VERSION=$NCCL_VERSION && \
3940
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${CUDA_REPO_ARCH}/cuda-rhel8.repo && \
4041
dnf -y update && \
41-
dnf install -y libnccl-${NCCL_VERSION}+cuda13.0 libnccl-devel-${NCCL_VERSION}+cuda13.0 libnccl-static-${NCCL_VERSION}+cuda13.0
42+
dnf install -y libnccl-${NCCL_VERSION}+cuda13.1 libnccl-devel-${NCCL_VERSION}+cuda13.1 libnccl-static-${NCCL_VERSION}+cuda13.1
4243

4344
# Install lightweight sudo (not bound to TTY)
4445
RUN set -ex; \

containers/dockerfile/Dockerfile.gpu_build_rockylinux8

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,11 @@ RUN \
3636
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
3737
RUN \
3838
{ [ $ARCH = "aarch64" ] && export CUDA_REPO_ARCH="sbsa" || export CUDA_REPO_ARCH="x86_64"; } && \
39+
# Derive MAJOR.MINOR from CUDA_VERSION (e.g. 12.9.0 -> 12.9); cut keeps multi-digit minors such as 12.10 intact.
export CUDA_SHORT="$(echo "$CUDA_VERSION" | cut -d. -f1,2)" && \
3940
export NCCL_VERSION=$NCCL_VERSION && \
4041
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${CUDA_REPO_ARCH}/cuda-rhel8.repo && \
4142
dnf -y update && \
42-
dnf install -y libnccl-${NCCL_VERSION}+cuda12.9 libnccl-devel-${NCCL_VERSION}+cuda12.9
43+
dnf install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT}
4344

4445
# Install gRPC
4546
# Patch Abseil to apply https://github.com/abseil/abseil-cpp/issues/1629

containers/dockerfile/Dockerfile.jvm_gpu_build

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,11 @@ RUN \
3535

3636
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
3737
RUN \
38+
# Derive MAJOR.MINOR from CUDA_VERSION (e.g. 12.9.0 -> 12.9); cut keeps multi-digit minors such as 12.10 intact.
export CUDA_SHORT="$(echo "$CUDA_VERSION" | cut -d. -f1,2)" && \
3839
export NCCL_VERSION=$NCCL_VERSION && \
3940
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
4041
dnf -y update && \
41-
dnf install -y libnccl-${NCCL_VERSION}+cuda12.9 libnccl-devel-${NCCL_VERSION}+cuda12.9 libnccl-static-${NCCL_VERSION}+cuda12.9
42+
dnf install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT}
4243

4344
# Install Python packages
4445
RUN pip install numpy pytest scipy scikit-learn wheel kubernetes awscli

0 commit comments

Comments
 (0)