diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index d98664bdc..60ef0104c 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -316,5 +316,5 @@ standalone_container_registry_port: "5000" # Configuration for NGC-Ready playbook # ################################################################################ ngc_ready_cuda_container: "nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04" -ngc_ready_pytorch: "nvcr.io/nvidia/pytorch:24.04-py3" -ngc_ready_tensorflow: "nvcr.io/nvidia/tensorflow:24.04-tf2-py3" +ngc_ready_pytorch: "nvcr.io/nvidia/pytorch:26.04-py3" +ngc_ready_tensorflow: "nvcr.io/nvidia/tensorflow:25.02-tf2-py3" diff --git a/config.example/helm/rapids-dask.yml b/config.example/helm/rapids-dask.yml index 246bd3d14..21788310b 100644 --- a/config.example/helm/rapids-dask.yml +++ b/config.example/helm/rapids-dask.yml @@ -4,10 +4,8 @@ # Specify the resources used for each worker as well as the number of workers. worker: image: - # repository: nvcr.io/nvidia/rapidsai/rapidsai - # repository: dask-rapids - repository: supertetelman/k8s-rapids-dask - tag: cuda9.2-runtime-ubuntu16.04 + repository: nvcr.io/nvidia/rapidsai/notebooks + tag: 26.04-cuda12-py3.13 env: replicas: 1 resources: @@ -18,15 +16,15 @@ worker: scheduler: image: - repository: supertetelman/k8s-rapids-dask - tag: cuda9.2-runtime-ubuntu16.04 + repository: nvcr.io/nvidia/rapidsai/notebooks + tag: 26.04-cuda12-py3.13 # By default we should be doing all Dask works on workers using calls to distributed.Client() # If you would like to run/test your GPU code without using workers you may comment the resources section jupyter: image: - repository: supertetelman/k8s-rapids-dask - tag: cuda9.2-runtime-ubuntu16.04 + repository: nvcr.io/nvidia/rapidsai/notebooks + tag: 26.04-cuda12-py3.13 resources: requests: nvidia.com/gpu: 0 diff --git a/docs/airgap/ngc-ready.md b/docs/airgap/ngc-ready.md index 0efbe0483..d93757c56 100644 --- 
a/docs/airgap/ngc-ready.md +++ b/docs/airgap/ngc-ready.md @@ -37,8 +37,8 @@ For instructions on setting up an HTTP mirror, see the [doc on HTTP mirrors](./m Container images are only needed if you want to run the tests built into the playbook: - nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 -- nvcr.io/nvidia/pytorch:24.04-py3 -- nvcr.io/nvidia/tensorflow:24.04-tf2-py3 +- nvcr.io/nvidia/pytorch:26.04-py3 +- nvcr.io/nvidia/tensorflow:25.02-tf2-py3 For instructions on setting up a Docker registry mirror, see the [doc on Docker mirrors](./mirror-docker-images.md). @@ -62,8 +62,8 @@ For instructions on setting up an HTTP mirror, see the [doc on HTTP mirrors](./m Container images (how to mirror) are only needed if you want to run the tests built into the playbook: - nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 -- nvcr.io/nvidia/pytorch:24.04-py3 -- nvcr.io/nvidia/tensorflow:24.04-tf2-py3 +- nvcr.io/nvidia/pytorch:26.04-py3 +- nvcr.io/nvidia/tensorflow:25.02-tf2-py3 For instructions on setting up a Docker registry mirror, see the [doc on Docker mirrors](./mirror-docker-images.md). 
@@ -177,8 +177,8 @@ If running the container tests as part of the NGC-Ready playbook, set the follow ```bash ngc_ready_cuda_container: "/nvidia/cuda:12.4.1-base-ubuntu22.04" -ngc_ready_pytorch: "/nvidia/pytorch:24.04-py3" -ngc_ready_tensorflow: "/nvidia/tensorflow:24.04-tf2-py3" +ngc_ready_pytorch: "/nvidia/pytorch:26.04-py3" +ngc_ready_tensorflow: "/nvidia/tensorflow:25.02-tf2-py3" ``` ## Running the NGC-Ready playbook diff --git a/docs/container/docker-rootless.md b/docs/container/docker-rootless.md index 9bfe3dd13..c40cd8098 100644 --- a/docs/container/docker-rootless.md +++ b/docs/container/docker-rootless.md @@ -77,7 +77,7 @@ module load rootless-docker start_rootless_docker.sh # specify --quiet option to hide rootles docker messages -docker run --gpus all -it --rm nvcr.io/nvidia/cuda:11.0-base-ubuntu18.04 +docker run --gpus all -it --rm nvcr.io/nvidia/cuda:13.0.2-base-ubuntu24.04 root@445bf5cca686:/# echo NGPUS: $(nvidia-smi -L | wc -l) NGPUS: 1 diff --git a/docs/container/nginx-docker-cache.md b/docs/container/nginx-docker-cache.md index 178172537..34366ad32 100644 --- a/docs/container/nginx-docker-cache.md +++ b/docs/container/nginx-docker-cache.md @@ -42,8 +42,8 @@ The following variables are the most common configuration you may want to adjust | Variable | Default value | Description | | ------------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------- | -| `nginx_docker_cache_image` | `"rpardini/docker-registry-proxy:0.6.1"` | Container image used to deploy the proxy | -| `nginx_docker_cache_registry_string` | `"quay.io k8s.gcr.io gcr.io nvcr.io"` | Space-separated list of registries to proxy | +| `nginx_docker_cache_image` | `"rpardini/docker-registry-proxy:0.6.5"` | Container image used to deploy the proxy | +| `nginx_docker_cache_registry_string` | `"registry.k8s.io quay.io k8s.gcr.io gcr.io nvcr.io"` | Space-separated list of registries to proxy; 
`k8s.gcr.io` is retained for older clusters while current Kubernetes images use `registry.k8s.io` | | `nginx_docker_cache_manifests` | `"false"` | Flag to determine whether to cache image manifests | | `nginx_docker_cache_manifest_default_time` | "1h" | If manifests are cached, time to cache them | | `nginx_docker_cache_hostgroup` | `"cache"` | Ansible inventory host group where proxy is deployed | diff --git a/docs/k8s-cluster/kubernetes-usage.md b/docs/k8s-cluster/kubernetes-usage.md index 106f4ecc2..2511509f6 100644 --- a/docs/k8s-cluster/kubernetes-usage.md +++ b/docs/k8s-cluster/kubernetes-usage.md @@ -10,7 +10,7 @@ Kubernetes Usage Guide ## Introduction -Most of the following examples can be configured and executed through the Kubernetes Dashboard. For a basic run-through on how to leverage the Kubernetes Dashboard, please see the [official documentation](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/). The following examples `kubectl` on the master node instead. +Most of the following examples can be configured and executed through the Kubernetes Dashboard. For a basic run-through on how to leverage the Kubernetes Dashboard, please see the [official documentation](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/). The following examples use `kubectl` on the master node instead. ## Simple Commands @@ -63,12 +63,12 @@ kubectl get pods --all-namespaces 4. Delete the job (and the corresponding pod). ```bash - kubectl delete job cuda-job + kubectl delete job pytorch-job ``` ## Using NGC Containers with Kubernetes and Launching Jobs -[NVIDIA GPU Cloud (NGC)](https://docs.nvidia.com/ngc/ngc-introduction) manages a catalog of fully integrated and optimized DL framework containers that take full advantage of NVIDIA GPUs in both single and multi-GPU configurations. 
They include NVIDIA CUDA® Toolkit, DIGITS workflow, and the following DL frameworks: NVCaffe, Caffe2, Microsoft Cognitive Toolkit (CNTK), MXNet, PyTorch, TensorFlow, Theano, and Torch. These framework containers are delivered ready-to-run, including all necessary dependencies such as the CUDA runtime and NVIDIA libraries. +[NVIDIA GPU Cloud (NGC)](https://docs.nvidia.com/ngc/ngc-introduction) manages a catalog of optimized GPU containers for CUDA, PyTorch, TensorFlow, Triton Inference Server, RAPIDS, and other NVIDIA software. Use the NGC catalog and the NVIDIA framework container release notes to choose the current image for your workload. To access the NGC container registry via Kubernetes, add a secret which will be employed when Kubernetes asks NGC to pull container images from it. @@ -105,9 +105,9 @@ To access the NGC container registry via Kubernetes, add a secret which will be - name: nvcr.dgxkey containers: - name: pytorch-container - image: nvcr.io/nvidia/pytorch:19.02-py3 + image: nvcr.io/nvidia/pytorch:26.04-py3 command: ["/bin/sh"] - args: ["-c", "python /workspace/examples/upstream/mnist/main.py"] + args: ["-c", "python -c 'import torch; print(\"cuda_available=\", torch.cuda.is_available()); print(\"device_count=\", torch.cuda.device_count())'"] resources: limits: nvidia.com/gpu: 1 diff --git a/docs/k8s-cluster/roce_backend.md b/docs/k8s-cluster/roce_backend.md index aa5c9dba6..0bee96738 100644 --- a/docs/k8s-cluster/roce_backend.md +++ b/docs/k8s-cluster/roce_backend.md @@ -80,11 +80,15 @@ dev_id: 101c num_vf: 8 -5. Mellanox Ofed place and image name - mofed_site_place, mofed_file_name. - -mofed_site_place: "MLNX_OFED-4.6-1.0.1.1" - -mofed_file_name: "MLNX_OFED_LINUX-4.6-1.0.1.1-ubuntu18.04-x86_64.iso" +5. NVIDIA OFED version, site place and image name - mofed_version, mofed_site_place, mofed_file_name. + +For new Kubernetes RDMA/RoCE deployments, prefer NVIDIA Network Operator coverage with DOCA-OFED driver management. 
This DeepOps role remains a legacy direct-host install path for environments that still need it, so verify the OFED package against your exact operating system and kernel before using it in production. + +mofed_version: "24.10-4.1.4.0" + +mofed_site_place: "MLNX_OFED-24.10-4.1.4.0" + +mofed_file_name: "MLNX_OFED_LINUX-24.10-4.1.4.0-ubuntu24.04-x86_64.iso" ## Dependencies diff --git a/docs/slurm-cluster/README.md b/docs/slurm-cluster/README.md index e5842cd4f..5d60986c4 100644 --- a/docs/slurm-cluster/README.md +++ b/docs/slurm-cluster/README.md @@ -87,7 +87,7 @@ default parameters that can be overriden: ```bash # String; Container for nccl performance/validation tests. Either docker # tag or can be path to sqsh file. - base_container: "nvcr.io/nvidia/tensorflow:21.09-tf2-py3" + base_container: "nvcr.io/nvidia/pytorch:26.04-py3" # String; Container to be created or one that might exist with nccl tests. # If `compile_nccl_tests` is True, it must be a sqsh file. @@ -166,17 +166,17 @@ NOTE: This will use Pyxis to download a container. ```bash ansible-playbook -l slurm-cluster playbooks/slurm-cluster/slurm-validation.yml \ - -e '{base_container: nvcr.io/nvidia/pytorch:21.09-py3}' \ + -e '{base_container: nvcr.io/nvidia/pytorch:26.04-py3}' \ -e '{nccl_tests_container: "${HOME}/enroot_images/nccl_tests_torch_val.sqsh"}' \ -e '{num_nodes: 2}' \ -e '{srun_exports: "NCCL_DEBUG=INFO,OMPI_MCA_pml=^ucx,OMPI_MCA_coll=^hcoll"}' \ -e '{cleanup: True}' ``` -3. Example to run on 1 node using existing NCCL container from a docker repo. +3. Example to run on 1 node using an existing NCCL test container from a site registry. 
```bash ansible-playbook -l slurm-cluster playbooks/slurm-cluster/slurm-validation.yml \ - -e '{nccl_tests_container: deepops/nccl-tests-tf20.06-ubuntu18.04:latest}' \ + -e '{nccl_tests_container: registry.example.com/hpc/nccl-tests:latest}' \ -e '{compile_nccl_tests: False}' \ -e '{num_nodes: 1}' ``` diff --git a/docs/slurm-cluster/slurm-perf-cluster.md b/docs/slurm-cluster/slurm-perf-cluster.md index 6b252f0d8..7aa92aec0 100644 --- a/docs/slurm-cluster/slurm-perf-cluster.md +++ b/docs/slurm-cluster/slurm-perf-cluster.md @@ -254,7 +254,7 @@ If errors are noticed when running `sinfo -R`, it's also helpful to search the l sudo journalctl -e | grep slurm ``` -To re-run the test manually, from the slurm login node... +To re-run the test manually from the slurm login node, use the commands below. Replace `registry.example.com/hpc/nccl-tests:latest` with your site's current NCCL tests image or a `.sqsh` image built by `playbooks/slurm-cluster/slurm-validation.yml`. ```bash # on the slurm login node @@ -269,7 +269,7 @@ scancel sudo scontrol update nodename= state=idle # run the test again -srun -N --mpi=pmix --exclusive --container-image=deepops/nccl-tests-tf20.06-ubuntu18.04 --ntasks-per-node=8 -G all_reduce_perf -b 1M -e 4G -f 2 -g +srun -N --mpi=pmix --exclusive --container-image=registry.example.com/hpc/nccl-tests:latest --ntasks-per-node=8 -G all_reduce_perf -b 1M -e 4G -f 2 -g ``` ### Performance validation test results are suboptimal @@ -289,7 +289,7 @@ Try running the test from the slurm login node, but with debug output enabled... 
```bash # from the slurm login node -$ NCCL_DEBUG=INFO srun -N --mpi=pmix --exclusive --container-image=deepops/nccl-tests-tf20.06-ubuntu18.04 --ntasks-per-node=8 -G all_reduce_perf -b 1M -e 4G -f 2 -g +$ NCCL_DEBUG=INFO srun -N --mpi=pmix --exclusive --container-image=registry.example.com/hpc/nccl-tests:latest --ntasks-per-node=8 -G all_reduce_perf -b 1M -e 4G -f 2 -g # examine the output, looking for any mention of `GDRDMA` # for example: `NET/IB/0/GDRDMA` diff --git a/docs/slurm-cluster/slurm-single-node.md b/docs/slurm-cluster/slurm-single-node.md index fda8d8dde..94518be98 100644 --- a/docs/slurm-cluster/slurm-single-node.md +++ b/docs/slurm-cluster/slurm-single-node.md @@ -368,11 +368,11 @@ compute-session:start_rootless_docker.sh ``` An option “--quiet” can be passed to the “start_rootless_docker.sh” script to -hide rootless docker messages. Pull/run a docker image: +hide rootless docker messages. Pull/run a site-maintained NCCL tests image: ```bash compute-session:docker run --gpus=all --rm -it \ - deepops/nccl-tests-tf20.06-ubuntu18.04:latest \ + registry.example.com/hpc/nccl-tests:latest \ mpirun --allow-run-as-root -np 2 all_reduce_perf -b 1M -e 4G -f 2 -g 1 ``` @@ -386,7 +386,7 @@ module load rootless-docker start_rootless_docker.sh --quiet -docker run --gpus=all --rm -t deepops/nccl-tests-tf20.06-ubuntu18.04:latest \ +docker run --gpus=all --rm -t registry.example.com/hpc/nccl-tests:latest \ mpirun --allow-run-as-root -np 2 all_reduce_perf -b 1M -e 4G -f 2 -g 1 stop_rootless_docker.sh @@ -403,7 +403,7 @@ starting the container and checking the number of GPUs and CPUs available. ```bash compute-session:docker run --gpus=all --rm -it \ - deepops/nccl-tests-tf20.06-ubuntu18.04:latest \ + registry.example.com/hpc/nccl-tests:latest \ bash -c 'echo NGPUS: $(nvidia-smi -L | wc -l) NCPUS: $(nproc)' NGPUS: 2 NCPUS: 2 ``` @@ -416,7 +416,7 @@ already does not have permission to outside of the container. 
```bash compute-session:docker run --gpus=all --rm -it -v ${PWD}:${PWD} --workdir=${PWD} \ - deepops/nccl-tests-tf20.06-ubuntu18.04:latest bash -c 'touch somefile-in-container' + registry.example.com/hpc/nccl-tests:latest bash -c 'touch somefile-in-container' ``` Then outside of the container. @@ -434,7 +434,7 @@ outside of the container. ```bash compute-session:docker run --gpus=all --rm -it -v /etc/slurm:/slurm --workdir=${PWD} \ - deepops/nccl-tests-tf20.06-ubuntu18.04:latest bash -c 'cat /slurm/slurmdbd.conf' + registry.example.com/hpc/nccl-tests:latest bash -c 'cat /slurm/slurmdbd.conf' cat: /slurm/slurmdbd.conf: Permission denied ``` @@ -464,13 +464,15 @@ Singularity and enroot could also be deployed via DeepOps. These would be useful for multi-node jobs if running on more than one DGX system. Enroot with pyxis can be tested by running: +The examples below use `registry.example.com/hpc/nccl-tests:latest` as a placeholder for a site-maintained NCCL tests image. + ```bash login-session:srun --mpi=pmi2 --ntasks=2 --gpus-per-task=1 \ - --container-image=deepops/nccl-tests-tf20.06-ubuntu18.04:latest \ + --container-image=registry.example.com/hpc/nccl-tests:latest \ all_reduce_perf -b 1M -e 4G -f 2 -g 1 ``` -The pyxis+enroot is invoked via option “ --container-image=deepops/nccl-tests-tf20.06-ubuntu18.04:latest” +The pyxis+enroot is invoked via option “ --container-image=registry.example.com/hpc/nccl-tests:latest” to run the “all_reduce_perf” nccl test. Refer to enroot and pyxis documentation for further details. @@ -490,7 +492,7 @@ Then invoke as: ```bash login-session:srun --ntasks=2 --gpus-per-task=1 --no-container-remap-root \ - --container-image=deepops/nccl-tests-tf20.06-ubuntu18.04:latest --container-workdir=${PWD} \ + --container-image=registry.example.com/hpc/nccl-tests:latest --container-workdir=${PWD} \ test-allreduce.sh ``` @@ -507,7 +509,7 @@ Singularity could be used in a similar fashion to enroot. 
Don’t forget the ```bash login-session:srun --mpi=pmi2 --ntasks=2 --gpus-per-task=1 \ - singularity exec --nv docker://deepops/nccl-tests-tf20.06-ubuntu18.04:latest \ + singularity exec --nv docker://registry.example.com/hpc/nccl-tests:latest \ all_reduce_perf -b 1M -e 4G -f 2 -g 1 ``` @@ -516,7 +518,7 @@ with enroot): ```bash login-session:srun --ntasks=2 --gpus-per-task=1 \ - singularity exec --nv docker://deepops/nccl-tests-tf20.06-ubuntu18.04:latest \ + singularity exec --nv docker://registry.example.com/hpc/nccl-tests:latest \ ${PWD}/test_allreduce.sh ``` diff --git a/playbooks/slurm-cluster/slurm-validation.yml b/playbooks/slurm-cluster/slurm-validation.yml index 0cc03417d..5e575941c 100644 --- a/playbooks/slurm-cluster/slurm-validation.yml +++ b/playbooks/slurm-cluster/slurm-validation.yml @@ -11,7 +11,7 @@ vars: # String; Container for nccl performance/validation tests. Either docker # repo or can be path to sqsh file. - base_container: "nvcr.io/nvidia/tensorflow:21.09-tf2-py3" + base_container: "nvcr.io/nvidia/pytorch:26.04-py3" # String; Container to be created or one that might exist with nccl tests. # If `compile_nccl_tests` is True, it must be a sqsh file. 
nccl_tests_container: "${HOME}/enroot_images/nccl_tests_slurm_val.sqsh" diff --git a/roles/nginx-docker-registry-cache/defaults/main.yml b/roles/nginx-docker-registry-cache/defaults/main.yml index 6824bc7d5..cea6bada1 100644 --- a/roles/nginx-docker-registry-cache/defaults/main.yml +++ b/roles/nginx-docker-registry-cache/defaults/main.yml @@ -5,7 +5,7 @@ nginx_docker_cache_image: "rpardini/docker-registry-proxy:0.6.5" nginx_docker_cache_mirror_path: "/opt/deepops/nginx-docker-cache/mirror" nginx_docker_cache_ca_path: "/opt/deepops/nginx-docker-cache/ca" -nginx_docker_cache_registry_string: "quay.io k8s.gcr.io gcr.io nvcr.io" +nginx_docker_cache_registry_string: "registry.k8s.io quay.io k8s.gcr.io gcr.io nvcr.io" nginx_docker_cache_manifests: "false" nginx_docker_cache_manifest_default_time: "1h" diff --git a/roles/roce_backend/README.md b/roles/roce_backend/README.md index 08aad44e4..e3d72a807 100755 --- a/roles/roce_backend/README.md +++ b/roles/roce_backend/README.md @@ -71,13 +71,15 @@ dev_id: "101c" num_vf: 8 -5. Mellanox Ofed version, site place and image name - mofed_version, mofed_site_place, mofed_file_name. -``` -#Mellanox OFED parameters -mofed_version: "4.7-3.2.9.0" -mofed_site_place: "MLNX_OFED-4.7-3.2.9.0" -mofed_file_name: "MLNX_OFED_LINUX-4.7-3.2.9.0-ubuntu18.04-x86_64.iso" -``` +5. NVIDIA OFED version, site place and image name - mofed_version, mofed_site_place, mofed_file_name. + +For new Kubernetes RDMA/RoCE deployments, prefer NVIDIA Network Operator coverage with DOCA-OFED driver management. This DeepOps role remains a legacy direct-host install path for environments that still need it, so verify the OFED package against your exact operating system and kernel before using it in production. 
+``` +# NVIDIA OFED parameters +mofed_version: "24.10-4.1.4.0" +mofed_site_place: "MLNX_OFED-24.10-4.1.4.0" +mofed_file_name: "MLNX_OFED_LINUX-24.10-4.1.4.0-ubuntu24.04-x86_64.iso" +``` Dependencies diff --git a/roles/roce_backend/tasks/mofed-install.yaml b/roles/roce_backend/tasks/mofed-install.yaml index 491c060ef..12ce4e82f 100755 --- a/roles/roce_backend/tasks/mofed-install.yaml +++ b/roles/roce_backend/tasks/mofed-install.yaml @@ -22,7 +22,7 @@ shell: | cd /tmp rm -f /tmp/{{ mofed_file_name }} - wget http://content.mellanox.com/ofed/{{ mofed_site_place }}/{{ mofed_file_name }} + wget https://content.mellanox.com/ofed/{{ mofed_site_place }}/{{ mofed_file_name }} mkdir -p /mnt/iso mount -o loop /tmp/{{ mofed_file_name }} /mnt/iso /mnt/iso/mlnxofedinstall --all -q diff --git a/roles/roce_backend/vars/main.yml b/roles/roce_backend/vars/main.yml index e3eeea120..0353a4e08 100755 --- a/roles/roce_backend/vars/main.yml +++ b/roles/roce_backend/vars/main.yml @@ -31,18 +31,12 @@ vendor: "15b3" dev_id: "101c" num_vf: 8 -#Mellanox OFED parameters -mofed_version: "4.7-3.2.9.0" -mofed_site_place: "MLNX_OFED-4.7-3.2.9.0" -mofed_file_name: "MLNX_OFED_LINUX-4.7-3.2.9.0-ubuntu18.04-x86_64.iso" - -# before K8s 1.16 -multus_ds: "https://raw.githubusercontent.com/intel/multus-cni/master/images/multus-daemonset-pre-1.16.yml" -sriov_dp_ds: "https://raw.githubusercontent.com/intel/sriov-network-device-plugin/master/deployments/k8s-v1.10-v1.15/sriovdp-daemonset.yaml" -sriov_cni_ds: "https://raw.githubusercontent.com/intel/sriov-cni/master/images/k8s-v1.10-v1.15/sriov-cni-daemonset.yaml" - -# from K8s 1.16 -# multus_ds: "https://raw.githubusercontent.com/intel/multus-cni/master/images/multus-daemonset.yml" -# sriov_dp_ds: "https://raw.githubusercontent.com/intel/sriov-network-device-plugin/master/deployments/k8s-v1.16/sriovdp-daemonset.yaml" -# sriov_cni_ds: "https://raw.githubusercontent.com/intel/sriov-cni/master/images/k8s-v1.16/sriov-cni-daemonset.yaml" +# NVIDIA OFED parameters. 
Prefer NVIDIA Network Operator with DOCA-OFED for +# new Kubernetes RDMA/RoCE deployments; this role is a legacy direct-host path. +mofed_version: "24.10-4.1.4.0" +mofed_site_place: "MLNX_OFED-24.10-4.1.4.0" +mofed_file_name: "MLNX_OFED_LINUX-24.10-4.1.4.0-ubuntu24.04-x86_64.iso" +multus_ds: "https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/master/deployments/multus-daemonset.yml" +sriov_dp_ds: "https://raw.githubusercontent.com/k8snetworkplumbingwg/sriov-network-device-plugin/master/deployments/sriovdp-daemonset.yaml" +sriov_cni_ds: "https://raw.githubusercontent.com/k8snetworkplumbingwg/sriov-cni/master/images/sriov-cni-daemonset.yaml" diff --git a/workloads/examples/k8s/dask-rapids/README.md b/workloads/examples/k8s/dask-rapids/README.md index da96e6bf6..590d4f46e 100644 --- a/workloads/examples/k8s/dask-rapids/README.md +++ b/workloads/examples/k8s/dask-rapids/README.md @@ -34,6 +34,8 @@ DeepOps includes support for running a local registry, but configuration of that In my workflow below, I am pushing the image to [Docker Hub](https://hub.docker.com). If you haven't used Docker Hub before, the [quickstart documentation](https://docs.docker.com/docker-hub/) provides a good tutorial. +The included Dockerfile extends the current NGC RAPIDS notebooks image, `nvcr.io/nvidia/rapidsai/notebooks:26.04-cuda12-py3.13`, which provides RAPIDS, Dask CUDA, and JupyterLab on Ubuntu 24.04. If you move to a newer RAPIDS release, update both the Dockerfile and Helm values together. + ### Editing the deployment scripts We'll deploy the RAPIDS/Dask container in one step using the `deploy.sh` script. @@ -140,9 +142,8 @@ Feel free to adjust the number of CPU cores or GPUs used and the parameters for ## Experimenting further -The base container we used for this benchmark contains more examples using RAPIDS in the `cuml/` directory, -as well as an end-to-end workflow example based on a Fannie Mae mortgage dataset in the `mortgage/` directory. 
-Both directories can be accessed easily via JupyterLab. +The base container used for this benchmark includes RAPIDS libraries, Dask CUDA, and JupyterLab. +For more examples and current image tags, use the RAPIDS documentation and NGC container catalog. You can also experiment with the custom container by making changes to the `Dockerfile` used to create it, in `examples/k8s/dask-rapids/docker`. diff --git a/workloads/examples/k8s/dask-rapids/docker/Dockerfile b/workloads/examples/k8s/dask-rapids/docker/Dockerfile index b84ea955e..6db4abe1e 100644 --- a/workloads/examples/k8s/dask-rapids/docker/Dockerfile +++ b/workloads/examples/k8s/dask-rapids/docker/Dockerfile @@ -1,24 +1,16 @@ -# Base our new image on the CUDA 9.2 RAPIDS image from upstream -FROM nvcr.io/nvidia/rapidsai/rapidsai:cuda9.2-runtime-ubuntu16.04 +# Base the example on the current RAPIDS notebooks image from NGC. +FROM nvcr.io/nvidia/rapidsai/notebooks:26.04-cuda12-py3.13 + +USER root -# Fix font-manager package RUN apt-get update && \ - apt-get install -y --fix-missing font-manager && \ + apt-get install -y --no-install-recommends font-manager && \ + mkdir -p /opt/rapids/notebooks && \ + chown -R rapids:conda /opt/rapids && \ rm -rf /var/lib/apt/lists/* -# The name of the Anaconda Python environment we'll use (from upstream) -ENV CONDA_ENV rapids - -# Install additional Python packages into the environment -# (If you want to install more packages, add them here!) 
-RUN source activate $CONDA_ENV && \ - conda install -y unzip python-graphviz && \ - pip install ipyvolume dask-kubernetes matplotlib cupy-cuda92 +USER rapids +WORKDIR /opt/rapids/notebooks # Copy the parallel sum notebook in -COPY ParallelSum.ipynb /rapids/notebooks/ParallelSum.ipynb - -# Set up image to be run -COPY prepare.sh /usr/bin/prepare.sh -CMD ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root", "--no-browser", "--NotebookApp.token='dask'"] -ENTRYPOINT ["tini", "--", "/usr/bin/prepare.sh"] +COPY --chown=rapids:conda ParallelSum.ipynb ./ParallelSum.ipynb diff --git a/workloads/examples/k8s/dask-rapids/docker/prepare.sh b/workloads/examples/k8s/dask-rapids/docker/prepare.sh index 8dd3df26c..7b6251c59 100755 --- a/workloads/examples/k8s/dask-rapids/docker/prepare.sh +++ b/workloads/examples/k8s/dask-rapids/docker/prepare.sh @@ -27,11 +27,9 @@ if [ "$EXTRA_PIP_PACKAGES" ]; then fi # Activate the specified or default conda environment -#conda_env="rapids" -#if [ "$CONDA_ENV" ]; then -# conda_env="$CONDA_ENV" -#fi -source activate $CONDA_ENV +CONDA_ENV="${CONDA_ENV:-base}" +source /opt/conda/etc/profile.d/conda.sh +conda activate "$CONDA_ENV" # Run pre-commands if [ "$PRE_RUN_HOOK" ]; then diff --git a/workloads/examples/k8s/pytorch-job.yml b/workloads/examples/k8s/pytorch-job.yml index 9f20d8728..69a38d749 100644 --- a/workloads/examples/k8s/pytorch-job.yml +++ b/workloads/examples/k8s/pytorch-job.yml @@ -8,9 +8,9 @@ spec: spec: containers: - name: pytorch-container - image: nvcr.io/nvidia/pytorch:19.02-py3 + image: nvcr.io/nvidia/pytorch:26.04-py3 command: ["/bin/sh"] - args: ["-c", "python /workspace/examples/upstream/mnist/main.py"] + args: ["-c", "python -c 'import torch; print(\"cuda_available=\", torch.cuda.is_available()); print(\"device_count=\", torch.cuda.device_count())'"] resources: limits: nvidia.com/gpu: 1