From 77675f7735a1434f32164ccc39e4da9fb147ce2c Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 01:01:11 +0000 Subject: [PATCH 01/34] Trigger EFA Test --- docker/pytorch/Dockerfile.cuda | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 93effb4ff91d..7b9c4a7223b6 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -15,6 +15,7 @@ # sourcing versions.env so there is a single source of truth. # ============================================================================ +# Trigger PR workflow!! # ── Global ARGs (available to all stages) ─────────────────────────────────── ARG DLC_MAJOR_VERSION=1 ARG DLC_MINOR_VERSION=0 From 542dc11c8faa9562ac771fe1ca0eb28083ebdc3d Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 02:49:31 +0000 Subject: [PATCH 02/34] fix: set NCCL_NET_PLUGIN=ofi for EFA NCCL plugin discovery EFA installer >= 1.44.0 installs the aws-ofi-nccl plugin as libnccl-net-ofi.so (not libnccl-net.so). NCCL's default plugin search looks for libnccl-net.so which no longer exists, causing NCCL to fall back to sockets and fail on EFA-only instances. Setting NCCL_NET_PLUGIN=ofi tells NCCL to look for libnccl-net-ofi.so instead, which is what the EFA installer provides. Also adds a build-time verification that the OFI plugin .so exists after EFA installation, matching the pattern in scripts/common/. --- docker/pytorch/Dockerfile.cuda | 7 ++++--- scripts/pytorch/install_efa.sh | 9 +++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 7b9c4a7223b6..4824907c28e7 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -15,7 +15,6 @@ # sourcing versions.env so there is a single source of truth. # ============================================================================ -# Trigger PR workflow!! # ── Global ARGs (available to all stages) ─────────────────────────────────── ARG DLC_MAJOR_VERSION=1 ARG DLC_MINOR_VERSION=0 @@ -225,7 +224,8 @@ RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # PATH and LD_LIBRARY_PATH ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" + LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ + NCCL_NET_PLUGIN=ofi EXPOSE 22 WORKDIR /workspace @@ -289,7 +289,8 @@ RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # PATH and LD_LIBRARY_PATH ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" + LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ + NCCL_NET_PLUGIN=ofi EXPOSE 22 WORKDIR /workspace diff --git a/scripts/pytorch/install_efa.sh b/scripts/pytorch/install_efa.sh index 5106a6a4d19c..abeac219168e 100755 --- a/scripts/pytorch/install_efa.sh +++ b/scripts/pytorch/install_efa.sh @@ -13,6 +13,15 @@ cd aws-efa-installer ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify rm -rf /tmp/efa +# Verify OFI NCCL plugin was installed +OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib64" +if [ ! -f "${OFI_LIB_DIR}/libnccl-net-ofi.so" ]; then + echo "ERROR: ${OFI_LIB_DIR}/libnccl-net-ofi.so not found after EFA install" + ls -la "${OFI_LIB_DIR}/" 2>/dev/null || echo "Directory does not exist" + exit 1 +fi +echo "NCCL OFI plugin found at: ${OFI_LIB_DIR}/libnccl-net-ofi.so" + # Configure OpenMPI — allow root execution mv "${OPEN_MPI_PATH}/bin/mpirun" "${OPEN_MPI_PATH}/bin/mpirun.real" cat > "${OPEN_MPI_PATH}/bin/mpirun" <<'WRAPPER' From 6eda96f7b509cf50364fb31845c868e180bdf016 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 02:55:06 +0000 Subject: [PATCH 03/34] debug: add EFA/NCCL plugin diagnostics to allreduce test Print plugin file existence, ldd output, and env vars before running the NCCL test. Also explicitly pass NCCL_NET_PLUGIN via mpirun -x to ensure it reaches all ranks. --- test/efa/scripts/nccl_allreduce.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index bd185bfb8169..fbc2710e2c14 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -52,9 +52,17 @@ check_efa_nccl_all_reduce_performance(){ fi } +echo "=== EFA/NCCL diagnostics ===" +echo "NCCL_NET_PLUGIN=${NCCL_NET_PLUGIN:-}" +echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" +ls -la /opt/amazon/ofi-nccl/lib64/libnccl-net*.so* 2>/dev/null || echo "No libnccl-net*.so in /opt/amazon/ofi-nccl/lib64/" +ldd /opt/amazon/ofi-nccl/lib64/libnccl-net-ofi.so 2>/dev/null | grep -i "not found" || true +echo "=== End diagnostics ===" + echo "Running all_reduce_perf test" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ - -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ + -x NCCL_DEBUG=INFO -x NCCL_NET_PLUGIN=${NCCL_NET_PLUGIN:-ofi} \ + ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /usr/local/bin/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" From 1a519f36c6770c97d1a6efb3d8868d45c7d8f5b3 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 04:19:50 +0000 Subject: [PATCH 04/34] debug: move diagnostics to pytest for visible output Shell diagnostics in nccl_allreduce.sh weren't visible in CI logs because Fabric only captures the final command's stdout/stderr on failure. Move diagnostics to the Python test as a separate run_on_container call whose output goes to pytest's captured log. --- test/efa/scripts/nccl_allreduce.sh | 7 ------- test/efa/test_efa.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index fbc2710e2c14..8596e1edb060 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -52,13 +52,6 @@ check_efa_nccl_all_reduce_performance(){ fi } -echo "=== EFA/NCCL diagnostics ===" -echo "NCCL_NET_PLUGIN=${NCCL_NET_PLUGIN:-}" -echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" -ls -la /opt/amazon/ofi-nccl/lib64/libnccl-net*.so* 2>/dev/null || echo "No libnccl-net*.so in /opt/amazon/ofi-nccl/lib64/" -ldd /opt/amazon/ofi-nccl/lib64/libnccl-net-ofi.so 2>/dev/null | grep -i "not found" || true -echo "=== End diagnostics ===" - echo "Running all_reduce_perf test" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO -x NCCL_NET_PLUGIN=${NCCL_NET_PLUGIN:-ofi} \ diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index aa06a572c0e8..764475555598 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -9,6 +9,7 @@ pytest test/efa/test_efa.py --image-uri -v """ +import logging import os from efa.ec2_helpers import ( @@ -19,6 +20,8 @@ run_on_container, ) +LOGGER = logging.getLogger(__name__) + IMAGE_URI = os.environ["TEST_IMAGE_URI"] EFA_INSTANCE_TYPE = os.environ.get("EFA_INSTANCE_TYPE", "p4d.24xlarge") @@ -41,6 +44,18 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): worker_conn, aws_session, ): + # Diagnostics: dump NCCL plugin state before running the test + diag = run_on_container( + MASTER_CONTAINER_NAME, + master_conn, + "echo NCCL_NET_PLUGIN=$NCCL_NET_PLUGIN && " + "ls -la /opt/amazon/ofi-nccl/lib64/ 2>&1 && " + "ldd /opt/amazon/ofi-nccl/lib64/libnccl-net-ofi.so 2>&1 && " + "echo --- && " + "ls -la /opt/amazon/ofi-nccl/lib64/libnccl-net.so 2>&1 || true", + ) + LOGGER.info("=== NCCL plugin diagnostics ===\n%s", diag.stdout) + # EFA sanity on master run_on_container( MASTER_CONTAINER_NAME, From c5b9d0c4b8b61faafb80c06e5bbd88227405fddf Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 04:56:09 +0000 Subject: [PATCH 05/34] debug: use print() and warn=True for visible diagnostic output - Use print() instead of LOGGER.info() so output appears in pytest's "Captured stdout call" section (logger output was being swallowed) - Use warn=True on allreduce call so we can capture and print the full stdout/stderr/log file content on failure instead of just getting the UnexpectedExit traceback --- test/efa/test_efa.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index 764475555598..f306b0b8f7ef 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -9,9 +9,9 @@ pytest test/efa/test_efa.py --image-uri -v """ -import logging import os +import pytest from efa.ec2_helpers import ( DEFAULT_TIMEOUT, HOSTS_FILE_LOCATION, @@ -20,8 +20,6 @@ run_on_container, ) -LOGGER = logging.getLogger(__name__) - IMAGE_URI = os.environ["TEST_IMAGE_URI"] EFA_INSTANCE_TYPE = os.environ.get("EFA_INSTANCE_TYPE", "p4d.24xlarge") @@ -44,7 +42,7 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): worker_conn, aws_session, ): - # Diagnostics: dump NCCL plugin state before running the test + # Diagnostics: dump NCCL plugin state and network info diag = run_on_container( MASTER_CONTAINER_NAME, master_conn, @@ -54,7 +52,7 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): "echo --- && " "ls -la /opt/amazon/ofi-nccl/lib64/libnccl-net.so 2>&1 || true", ) - LOGGER.info("=== NCCL plugin diagnostics ===\n%s", diag.stdout) + print(f"=== NCCL plugin diagnostics (master) ===\n{diag.stdout}") # EFA sanity on master run_on_container( @@ -63,10 +61,26 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): "/test/efa/scripts/efa_sanity.sh", ) - # NCCL all_reduce across 2 nodes - run_on_container( + # NCCL all_reduce across 2 nodes — capture failure details + result = run_on_container( MASTER_CONTAINER_NAME, master_conn, f"/test/efa/scripts/nccl_allreduce.sh {HOSTS_FILE_LOCATION} 2", timeout=DEFAULT_TIMEOUT, + warn=True, ) + if result.failed: + print(f"=== NCCL allreduce FAILED (exit code {result.return_code}) ===") + print(f"=== stdout ===\n{result.stdout}") + print(f"=== stderr ===\n{result.stderr}") + log_dump = run_on_container( + MASTER_CONTAINER_NAME, + master_conn, + "cat /test/efa/logs/testEFA.log 2>&1 || echo 'Log file empty or missing'", + warn=True, + ) + print(f"=== testEFA.log ===\n{log_dump.stdout}") + pytest.fail( + f"NCCL allreduce failed with exit code {result.return_code}. " + f"See stdout above for details." + ) From dcb451381adfbe1fd5ebcdc6a7b2e57b24c4638e Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 04:57:55 +0000 Subject: [PATCH 06/34] debug: use print() and warn=True for visible diagnostic output Previous LOGGER.info wasn't visible in CI. Use print() which pytest -s captures. Also dump SG rules to check if the all-traffic self-ref rule is missing (known previous issue). Use warn=True on allreduce so we can capture and print stdout/stderr/log file on failure. --- test/efa/test_efa.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index f306b0b8f7ef..e831b9c03ec5 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -17,6 +17,7 @@ HOSTS_FILE_LOCATION, MASTER_CONTAINER_NAME, efa_instances, + get_efa_security_group_id, run_on_container, ) @@ -54,6 +55,17 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): ) print(f"=== NCCL plugin diagnostics (master) ===\n{diag.stdout}") + # Dump SG rules to check for missing all-traffic self-referencing rule + sg_id = get_efa_security_group_id(aws_session) + sg_resp = aws_session.ec2.describe_security_groups(GroupIds=[sg_id]) + sg = sg_resp["SecurityGroups"][0] + print(f"=== Security Group {sg_id} rules ===") + for rule in sg.get("IpPermissions", []): + print(f" IN: {rule}") + for rule in sg.get("IpPermissionsEgress", []): + print(f" OUT: {rule}") + print("=== End SG rules ===") + # EFA sanity on master run_on_container( MASTER_CONTAINER_NAME, From cab5691c5ec0708ebb827ce6cd8c0ebbb266d91d Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 05:14:26 +0000 Subject: [PATCH 07/34] fix: add cuda-compat to LD_LIBRARY_PATH for driver forward compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The real failure is 'CUDA system not yet initialized' — the p4d host driver is older than what CUDA 13.0.2 requires. The cuda-compat package (installed via dnf upgrade cuda-compat-*) provides a forward-compatible libcuda.so at /usr/local/cuda/compat/. Prepend /usr/local/cuda/compat to LD_LIBRARY_PATH in the container ENV so the compat driver is always used, regardless of whether the command runs via the entrypoint or via docker exec. Also add the compat path in the allreduce test script as a belt-and- suspenders measure. --- docker/pytorch/Dockerfile.cuda | 6 ++++-- test/efa/scripts/nccl_allreduce.sh | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 4824907c28e7..3df2ce178128 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -223,8 +223,10 @@ COPY scripts/pytorch/start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # PATH and LD_LIBRARY_PATH +# /usr/local/cuda/compat MUST be first — provides forward-compatible libcuda.so +# when the host driver is older than the toolkit version (CUDA 13.0). ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ + LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ NCCL_NET_PLUGIN=ofi EXPOSE 22 @@ -289,7 +291,7 @@ RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # PATH and LD_LIBRARY_PATH ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ + LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ NCCL_NET_PLUGIN=ofi EXPOSE 22 diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index 8596e1edb060..7064bca62368 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -11,6 +11,12 @@ if [[ -z "${CUDA_HOME}" ]]; then exit 1 fi +# CUDA forward compatibility: prepend compat libs if host driver is older +if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then + export LD_LIBRARY_PATH=/usr/local/cuda/compat:${LD_LIBRARY_PATH} + echo "CUDA compat enabled: /usr/local/cuda/compat prepended to LD_LIBRARY_PATH" +fi + TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type) From dff255a7057e74878ec5edfb6a1c3f07ccf3e72a Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 05:39:43 +0000 Subject: [PATCH 08/34] fix: remove cuda-compat override from allreduce test script The cuda-compat block unconditionally prepends /usr/local/cuda/compat to LD_LIBRARY_PATH when libcuda.so.1 exists there. On p4d instances with --runtime=nvidia, the real host driver libcuda.so is mounted by the NVIDIA container runtime. The compat stub overrides it, causing 'CUDA system not yet initialized' because the compat library can't communicate with the actual GPU hardware. The entrypoint.sh already handles cuda-compat correctly (comparing driver versions), but docker exec commands bypass the entrypoint. The test script should not touch cuda-compat at all. --- test/efa/scripts/nccl_allreduce.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index 7064bca62368..e6ec41a78b1d 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -11,11 +11,6 @@ if [[ -z "${CUDA_HOME}" ]]; then exit 1 fi -# CUDA forward compatibility: prepend compat libs if host driver is older -if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then - export LD_LIBRARY_PATH=/usr/local/cuda/compat:${LD_LIBRARY_PATH} - echo "CUDA compat enabled: /usr/local/cuda/compat prepended to LD_LIBRARY_PATH" -fi TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type) From f9b77b9fc34f10b0abca0ad1e2cccae59f712529 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 06:26:34 +0000 Subject: [PATCH 09/34] fix: add cuda-compat to LD_LIBRARY_PATH for driver forward compatibility The cuda-compat RPM registers /usr/local/cuda/compat in the ldconfig cache via /etc/ld.so.conf.d/cuda-compat.conf. This makes the compat libcuda.so visible system-wide, overriding the real host driver mounted by --runtime=nvidia. Result: 'system not yet initialized'. Fix: remove the ldconfig conf file after installing cuda-compat so the compat libs are only used when explicitly prepended to LD_LIBRARY_PATH (which the entrypoint does after driver version comparison). --- docker/pytorch/Dockerfile.cuda | 4 ++++ test/efa/test_efa.py | 13 ++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 3df2ce178128..8b5bfd7a8018 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -205,6 +205,8 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest \ && dnf upgrade -y cuda-compat-* \ + && rm -f /etc/ld.so.conf.d/cuda-compat.conf \ + && ldconfig \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) @@ -272,6 +274,8 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch RUN dnf upgrade -y --security --releasever latest \ && dnf upgrade -y cuda-compat-* \ + && rm -f /etc/ld.so.conf.d/cuda-compat.conf \ + && ldconfig \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index e831b9c03ec5..b2ce4b9e09c6 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -43,17 +43,20 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): worker_conn, aws_session, ): - # Diagnostics: dump NCCL plugin state and network info + # Diagnostics: dump NCCL plugin state and CUDA driver info diag = run_on_container( MASTER_CONTAINER_NAME, master_conn, "echo NCCL_NET_PLUGIN=$NCCL_NET_PLUGIN && " - "ls -la /opt/amazon/ofi-nccl/lib64/ 2>&1 && " - "ldd /opt/amazon/ofi-nccl/lib64/libnccl-net-ofi.so 2>&1 && " + "ldconfig -p | grep libcuda 2>&1 && " "echo --- && " - "ls -la /opt/amazon/ofi-nccl/lib64/libnccl-net.so 2>&1 || true", + "ls -la /usr/local/cuda/compat/libcuda* 2>&1 && " + "echo --- && " + "cat /etc/ld.so.conf.d/cuda*.conf 2>&1 || true && " + "echo --- && " + "nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>&1 || true", ) - print(f"=== NCCL plugin diagnostics (master) ===\n{diag.stdout}") + print(f"=== CUDA/NCCL diagnostics (master) ===\n{diag.stdout}") # Dump SG rules to check for missing all-traffic self-referencing rule sg_id = get_efa_security_group_id(aws_session) From 401badb93e9434619ef977d1f40eb4f0287191e1 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 07:12:42 +0000 Subject: [PATCH 10/34] fix: use LD_PRELOAD for cuda-compat in EFA NCCL test Host driver (580.150) is older than container cuda-compat (580.159.04). The --runtime=nvidia mounts host libcuda.so which overrides LD_LIBRARY_PATH. Use LD_PRELOAD to force the cuda-compat version for forward compatibility. --- test/efa/scripts/nccl_allreduce.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index e6ec41a78b1d..c79710405f0e 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -54,10 +54,12 @@ check_efa_nccl_all_reduce_performance(){ } echo "Running all_reduce_perf test" +# Force cuda-compat libcuda.so for forward compatibility with older host drivers +export LD_PRELOAD=/usr/local/cuda/compat/libcuda.so.1 mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO -x NCCL_NET_PLUGIN=${NCCL_NET_PLUGIN:-ofi} \ ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ - -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ + -x LD_PRELOAD -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /usr/local/bin/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" From 8d9f5f15f027d79cd50d2161f9bbe63e7e10a8e0 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 08:10:52 +0000 Subject: [PATCH 11/34] fix: use resolved path for LD_PRELOAD cuda-compat and add load verification --- test/efa/scripts/nccl_allreduce.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index c79710405f0e..d8c7a5470be8 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -54,8 +54,14 @@ check_efa_nccl_all_reduce_performance(){ } echo "Running all_reduce_perf test" -# Force cuda-compat libcuda.so for forward compatibility with older host drivers -export LD_PRELOAD=/usr/local/cuda/compat/libcuda.so.1 +# Force cuda-compat libcuda.so for forward compatibility with older host drivers. +# Use the actual .so file (not symlink) and also preload libcudadebugger. +COMPAT_DIR=/usr/local/cuda/compat +LIBCUDA_REAL=$(readlink -f ${COMPAT_DIR}/libcuda.so.1) +export LD_PRELOAD="${LIBCUDA_REAL}" +echo "LD_PRELOAD=${LD_PRELOAD}" +echo "Verifying compat libcuda loads:" +python3 -c "import ctypes; lib=ctypes.CDLL('${LIBCUDA_REAL}'); print(f'Loaded: {lib._name}')" 2>&1 || echo "Failed to load compat libcuda" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO -x NCCL_NET_PLUGIN=${NCCL_NET_PLUGIN:-ofi} \ ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ From 8e88edbcdd0e6e36e4c2d232691978f8f9c8bd95 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 08:41:32 +0000 Subject: [PATCH 12/34] fix: stop upgrading cuda-compat to avoid driver mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dnf upgrade -y cuda-compat-* pulled 580.159.04 which is newer than the DLAMI host driver (580.150). The 580.159 userspace can't initialize with the 580.150 kernel module, causing "system not yet initialized". Remove the cuda-compat upgrade — the base image already ships a compatible version. The host AMI's driver is the ceiling. --- docker/pytorch/Dockerfile.cuda | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 8b5bfd7a8018..42f8c505f7a8 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -204,7 +204,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest \ - && dnf upgrade -y cuda-compat-* \ + && echo "Skipping cuda-compat upgrade to avoid mismatch with host driver" \ && rm -f /etc/ld.so.conf.d/cuda-compat.conf \ && ldconfig \ && dnf clean all @@ -273,7 +273,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch RUN dnf upgrade -y --security --releasever latest \ - && dnf upgrade -y cuda-compat-* \ + && echo "Skipping cuda-compat upgrade to avoid mismatch with host driver" \ && rm -f /etc/ld.so.conf.d/cuda-compat.conf \ && ldconfig \ && dnf clean all From 8b71703a36e53307a6e710246614702c8c9c3425 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 08:42:12 +0000 Subject: [PATCH 13/34] cleanup: remove LD_PRELOAD hack, real fix is in Dockerfile cuda-compat --- test/efa/scripts/nccl_allreduce.sh | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index d8c7a5470be8..e6ec41a78b1d 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -54,18 +54,10 @@ check_efa_nccl_all_reduce_performance(){ } echo "Running all_reduce_perf test" -# Force cuda-compat libcuda.so for forward compatibility with older host drivers. -# Use the actual .so file (not symlink) and also preload libcudadebugger. -COMPAT_DIR=/usr/local/cuda/compat -LIBCUDA_REAL=$(readlink -f ${COMPAT_DIR}/libcuda.so.1) -export LD_PRELOAD="${LIBCUDA_REAL}" -echo "LD_PRELOAD=${LD_PRELOAD}" -echo "Verifying compat libcuda loads:" -python3 -c "import ctypes; lib=ctypes.CDLL('${LIBCUDA_REAL}'); print(f'Loaded: {lib._name}')" 2>&1 || echo "Failed to load compat libcuda" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ -x NCCL_DEBUG=INFO -x NCCL_NET_PLUGIN=${NCCL_NET_PLUGIN:-ofi} \ ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ - -x LD_PRELOAD -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ + -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /usr/local/bin/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" From 5c866ccf71b300847c7246888f40aab87eedafe2 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 14:34:28 +0000 Subject: [PATCH 14/34] fix: keep cuda-compat.conf for driver forward compatibility The nvidia container runtime uses ldconfig (not LD_LIBRARY_PATH) to resolve libcuda.so. Previously we removed cuda-compat.conf causing ldconfig to resolve to the host driver at /lib64/ instead of the cuda-compat version at /usr/local/cuda/compat/. This broke EFA tests when the container's cuda-compat (580.159) is newer than the host driver (580.150). Fix: write /usr/local/cuda/compat to cuda-compat.conf so ldconfig prefers the forward-compatible library. Also restores dnf upgrade cuda-compat-* for security patching. --- docker/pytorch/Dockerfile.cuda | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 42f8c505f7a8..1c565d65a4e5 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -204,8 +204,8 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest \ - && echo "Skipping cuda-compat upgrade to avoid mismatch with host driver" \ - && rm -f /etc/ld.so.conf.d/cuda-compat.conf \ + && dnf upgrade -y cuda-compat-* \ + && echo "/usr/local/cuda/compat" >/etc/ld.so.conf.d/cuda-compat.conf \ && ldconfig \ && dnf clean all @@ -273,8 +273,8 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch RUN dnf upgrade -y --security --releasever latest \ - && echo "Skipping cuda-compat upgrade to avoid mismatch with host driver" \ - && rm -f /etc/ld.so.conf.d/cuda-compat.conf \ + && dnf upgrade -y cuda-compat-* \ + && echo "/usr/local/cuda/compat" >/etc/ld.so.conf.d/cuda-compat.conf \ && ldconfig \ && dnf clean all From 4dbc5bd34366186d233da49bd93d0e166b12fa4f Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 15:36:45 +0000 Subject: [PATCH 15/34] chore: retrigger CI From 9cd117d5495a26bdbed1f028a227d5b13b61ea53 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 16:24:02 +0000 Subject: [PATCH 16/34] fix: exclude cuda-compat from security upgrade to match host driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cuda-compat 580.159 is incompatible with DLAMI host driver 580.150 — the newer userspace cannot talk to the older kernel module, causing "system not yet initialized" in all CUDA operations. Exclude cuda-compat from dnf upgrade --security and allowlist CVE-2025-33219 until the DLAMI is updated to driver >= 580.159. Keep cuda-compat.conf for forward compat when AMI does get updated. --- docker/pytorch/Dockerfile.cuda | 6 ++---- .../pytorch_runtime/framework_allowlist.json | 7 +++++++ 2 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 1c565d65a4e5..4df23704d972 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -203,8 +203,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ && rm /tmp/bash_telemetry.sh.template # Security patch — run after all installers so every OS package is covered -RUN dnf upgrade -y --security --releasever latest \ - && dnf upgrade -y cuda-compat-* \ +RUN dnf upgrade -y --security --releasever latest --exclude='cuda-compat-*' \ && echo "/usr/local/cuda/compat" >/etc/ld.so.conf.d/cuda-compat.conf \ && ldconfig \ && dnf clean all @@ -272,8 +271,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ && rm /tmp/bash_telemetry.sh.template # Security patch -RUN dnf upgrade -y --security --releasever latest \ - && dnf upgrade -y cuda-compat-* \ +RUN dnf upgrade -y --security --releasever latest --exclude='cuda-compat-*' \ && echo "/usr/local/cuda/compat" >/etc/ld.so.conf.d/cuda-compat.conf \ && ldconfig \ && dnf clean all diff --git a/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json b/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json new file mode 100644 index 000000000000..07102743d842 --- /dev/null +++ b/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json @@ -0,0 +1,7 @@ +[ + { + "vulnerability_id": "CVE-2025-33219", + "reason": "NVIDIA display driver kernel module vulnerability in cuda-compat package. Cannot upgrade cuda-compat beyond host driver version (580.150) without breaking CUDA initialization. Awaiting DLAMI driver update.", + "review_by": "2026-06-20" + } +] From d672e8b1c4d14a1f60f7affe69093a76a65def79 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 16:35:11 +0000 Subject: [PATCH 17/34] fix: add CACHE_REFRESH ARG to invalidate cached security patch layer --- docker/pytorch/Dockerfile.cuda | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 4df23704d972..f11f89898b6d 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -202,6 +202,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ && chmod +x /usr/local/bin/bash_telemetry.sh \ && rm /tmp/bash_telemetry.sh.template +ARG CACHE_REFRESH=0 # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest --exclude='cuda-compat-*' \ && echo "/usr/local/cuda/compat" >/etc/ld.so.conf.d/cuda-compat.conf \ From d211053b07e70311aebfdc93c52f24d08d8fd23c Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 16:36:44 +0000 Subject: [PATCH 18/34] cleanup: remove NCCL_NET_PLUGIN=ofi debug env var --- docker/pytorch/Dockerfile.cuda | 2 -- 1 file changed, 2 deletions(-) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index f11f89898b6d..0ef6d83ed422 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -229,7 +229,6 @@ RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # when the host driver is older than the toolkit version (CUDA 13.0). ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ - NCCL_NET_PLUGIN=ofi EXPOSE 22 WORKDIR /workspace @@ -295,7 +294,6 @@ RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # PATH and LD_LIBRARY_PATH ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ - NCCL_NET_PLUGIN=ofi EXPOSE 22 WORKDIR /workspace From fdc20ba7daa179a67c0951687fb44a26963716c0 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 16:37:24 +0000 Subject: [PATCH 19/34] fix: remove trailing backslash from ENV after NCCL_NET_PLUGIN removal --- docker/pytorch/Dockerfile.cuda | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 0ef6d83ed422..82eda50e57c3 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -228,7 +228,7 @@ RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # /usr/local/cuda/compat MUST be first — provides forward-compatible libcuda.so # when the host driver is older than the toolkit version (CUDA 13.0). ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ + LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" EXPOSE 22 WORKDIR /workspace @@ -293,7 +293,7 @@ RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # PATH and LD_LIBRARY_PATH ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" \ + LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" EXPOSE 22 WORKDIR /workspace From 8d56a93d1585e72a9ad04b4b82523b9950058e78 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 16:42:48 +0000 Subject: [PATCH 20/34] fix: remove cuda-compat upgrade to prevent driver version mismatch dnf upgrade -y cuda-compat-* pulls 580.159 which is newer than the host driver (580.150) on both DLAMI and CodeBuild GPU runners. The newer userspace library cannot communicate with the older kernel module, causing "system not yet initialized" on all CUDA operations. Remove the cuda-compat upgrade and allowlist CVE-2025-33219 until host drivers are updated to >= 580.159. --- .claude/scheduled_tasks.lock | 1 + .claude/settings.json | 66 ++++++++++++++++++++++++++++++ docker/pytorch/Dockerfile.cuda | 15 ++----- scripts/pytorch/install_efa.sh | 9 ---- test/efa/scripts/nccl_allreduce.sh | 4 +- test/efa/test_efa.py | 48 +--------------------- 6 files changed, 74 insertions(+), 69 deletions(-) create mode 100644 .claude/scheduled_tasks.lock create mode 100644 .claude/settings.json diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock new file mode 100644 index 000000000000..e6daa6d6c9c8 --- /dev/null +++ b/.claude/scheduled_tasks.lock @@ -0,0 +1 @@ +{"sessionId":"644c89d2-7615-4a63-bbc2-28dfe8dfd982","pid":1940,"procStart":"14131840","acquiredAt":1779259323980} \ No newline at end of file diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 000000000000..7a54c7ccb08a --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,66 @@ +{ + "permissions": { + "allow": [ + "Bash(git mv *)", + "Bash(python3 -c ' *)", + "Bash(git reset *)", + "Bash(git show *)", + "Bash(git revert *)", + "Bash(grep -v \"^$\")", + "Bash(docker pull *)", + "Bash(docker rmi *)", + "Bash(git rebase *)", + "Bash(git branch *)", + "Bash(git diff *)", + "Bash(ada --version)", + "Bash(ada credentials *)", + "Bash(aws ec2 *)", + "Bash(aws sts *)", + "Bash(aws service-quotas *)", + "Bash(aws ssm *)", + "Bash(aws logs *)", + "Bash(aws s3 *)", + "Bash(aws sagemaker *)", + "Bash(aws sagemaker-runtime *)", + "Bash(aws iam *)", + "Bash(aws stepfunctions *)", + "Bash(gh pr *)", + "Bash(gh run *)", + "Bash(gh workflow *)", + "Bash(javap *)", + "Bash(ripcli *)", + "Bash(aim *)", + "Bash(git add *)", + "Bash(git commit *)", + "Bash(git fetch *)", + "Bash(git checkout *)", + "Bash(git stash *)", + "Bash(pre-commit *)", + "Bash(curl *)", + "mcp__builder-mcp__ReadInternalWebsites", + "mcp__builder-mcp__InternalCodeSearch", + "mcp__builder-mcp__InternalSearch", + "mcp__builder-mcp__TaskeiListTasks", + "mcp__builder-mcp__TaskeiGetRooms", + "mcp__builder-mcp__TaskeiGetTask", + "mcp__builder-mcp__GetPipelineDetails", + "mcp__builder-mcp__GetPipelineHealth", + "mcp__builder-mcp__ReadRemoteTestRun", + "mcp__builder-mcp__QuipEditor", + "mcp__builder-mcp__TaskeiUpdateTask", + "mcp__builder-mcp__TaskeiCreateTask", + "mcp__builder-mcp__CRRevisionCreator", + "mcp__builder-mcp__CrCheckout", + "mcp__builder-mcp__CRAddComment", + "Bash(python3 *)", + "Bash(gh *)", + "Bash(git ls-tree *)" + ], + "additionalDirectories": [ + "/home/kwanggg/workplace/AsimovAgent/src/AsimovBuilderCoreContext/skills/pytorch-currency", + "/workplace/kwanggg/AsimovAgent/src/AsimovBuilderCoreContext/skills/pytorch-currency", + "/home/kwanggg/workplace/AsimovImageSecurityScan/src/asimov_image_security_scan", + "/workplace/kwanggg/AsimovImageSecurityScan/src/asimov_image_security_scan" + ] + } +} diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 82eda50e57c3..96251e67b1ab 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -202,11 +202,8 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ && chmod +x /usr/local/bin/bash_telemetry.sh \ && rm /tmp/bash_telemetry.sh.template -ARG CACHE_REFRESH=0 # Security patch — run after all installers so every OS package is covered -RUN dnf upgrade -y --security --releasever latest --exclude='cuda-compat-*' \ - && echo "/usr/local/cuda/compat" >/etc/ld.so.conf.d/cuda-compat.conf \ - && ldconfig \ +RUN dnf upgrade -y --security --releasever latest \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) @@ -225,10 +222,8 @@ COPY scripts/pytorch/start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # PATH and LD_LIBRARY_PATH -# /usr/local/cuda/compat MUST be first — provides forward-compatible libcuda.so -# when the host driver is older than the toolkit version (CUDA 13.0). ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" + LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" EXPOSE 22 WORKDIR /workspace @@ -271,9 +266,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ && rm /tmp/bash_telemetry.sh.template # Security patch -RUN dnf upgrade -y --security --releasever latest --exclude='cuda-compat-*' \ - && echo "/usr/local/cuda/compat" >/etc/ld.so.conf.d/cuda-compat.conf \ - && ldconfig \ +RUN dnf upgrade -y --security --releasever latest \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) @@ -293,7 +286,7 @@ RUN chmod +x /usr/local/bin/entrypoint.sh /usr/local/bin/start_cuda_compat.sh # PATH and LD_LIBRARY_PATH ENV PATH="/opt/venv/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/cuda/compat:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" + LD_LIBRARY_PATH="/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib:/opt/venv/lib/python3.12/site-packages/nvidia/nccl/lib:/opt/amazon/ofi-nccl/lib64:/opt/amazon/openmpi/lib:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/cuda/lib64:/usr/local/lib:${LD_LIBRARY_PATH}" EXPOSE 22 WORKDIR /workspace diff --git a/scripts/pytorch/install_efa.sh b/scripts/pytorch/install_efa.sh index abeac219168e..5106a6a4d19c 100755 --- a/scripts/pytorch/install_efa.sh +++ b/scripts/pytorch/install_efa.sh @@ -13,15 +13,6 @@ cd aws-efa-installer ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify rm -rf /tmp/efa -# Verify OFI NCCL plugin was installed -OFI_LIB_DIR="/opt/amazon/ofi-nccl/lib64" -if [ ! -f "${OFI_LIB_DIR}/libnccl-net-ofi.so" ]; then - echo "ERROR: ${OFI_LIB_DIR}/libnccl-net-ofi.so not found after EFA install" - ls -la "${OFI_LIB_DIR}/" 2>/dev/null || echo "Directory does not exist" - exit 1 -fi -echo "NCCL OFI plugin found at: ${OFI_LIB_DIR}/libnccl-net-ofi.so" - # Configure OpenMPI — allow root execution mv "${OPEN_MPI_PATH}/bin/mpirun" "${OPEN_MPI_PATH}/bin/mpirun.real" cat > "${OPEN_MPI_PATH}/bin/mpirun" <<'WRAPPER' diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index e6ec41a78b1d..bd185bfb8169 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -11,7 +11,6 @@ if [[ -z "${CUDA_HOME}" ]]; then exit 1 fi - TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") INSTANCE_TYPE=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/instance-type) @@ -55,8 +54,7 @@ check_efa_nccl_all_reduce_performance(){ echo "Running all_reduce_perf test" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ - -x NCCL_DEBUG=INFO -x NCCL_NET_PLUGIN=${NCCL_NET_PLUGIN:-ofi} \ - ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ + -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /usr/local/bin/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index b2ce4b9e09c6..aa06a572c0e8 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -11,13 +11,11 @@ import os -import pytest from efa.ec2_helpers import ( DEFAULT_TIMEOUT, HOSTS_FILE_LOCATION, MASTER_CONTAINER_NAME, efa_instances, - get_efa_security_group_id, run_on_container, ) @@ -43,32 +41,6 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): worker_conn, aws_session, ): - # Diagnostics: dump NCCL plugin state and CUDA driver info - diag = run_on_container( - MASTER_CONTAINER_NAME, - master_conn, - "echo NCCL_NET_PLUGIN=$NCCL_NET_PLUGIN && " - "ldconfig -p | grep libcuda 2>&1 && " - "echo --- && " - "ls -la /usr/local/cuda/compat/libcuda* 2>&1 && " - "echo --- && " - "cat /etc/ld.so.conf.d/cuda*.conf 2>&1 || true && " - "echo --- && " - "nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>&1 || true", - ) - print(f"=== CUDA/NCCL diagnostics (master) ===\n{diag.stdout}") - - # Dump SG rules to check for missing all-traffic self-referencing rule - sg_id = get_efa_security_group_id(aws_session) - sg_resp = aws_session.ec2.describe_security_groups(GroupIds=[sg_id]) - sg = sg_resp["SecurityGroups"][0] - print(f"=== Security Group {sg_id} rules ===") - for rule in sg.get("IpPermissions", []): - print(f" IN: {rule}") - for rule in sg.get("IpPermissionsEgress", []): - print(f" OUT: {rule}") - print("=== End SG rules ===") - # EFA sanity on master run_on_container( MASTER_CONTAINER_NAME, @@ -76,26 +48,10 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): "/test/efa/scripts/efa_sanity.sh", ) - # NCCL all_reduce across 2 nodes — capture failure details - result = run_on_container( + # NCCL all_reduce across 2 nodes + run_on_container( MASTER_CONTAINER_NAME, master_conn, f"/test/efa/scripts/nccl_allreduce.sh {HOSTS_FILE_LOCATION} 2", timeout=DEFAULT_TIMEOUT, - warn=True, ) - if result.failed: - print(f"=== NCCL allreduce FAILED (exit code {result.return_code}) ===") - print(f"=== stdout ===\n{result.stdout}") - print(f"=== stderr ===\n{result.stderr}") - log_dump = run_on_container( - MASTER_CONTAINER_NAME, - master_conn, - "cat /test/efa/logs/testEFA.log 2>&1 || echo 'Log file empty or missing'", - warn=True, - ) - print(f"=== testEFA.log ===\n{log_dump.stdout}") - pytest.fail( - f"NCCL allreduce failed with exit code {result.return_code}. " - f"See stdout above for details." - ) From 50e12ca029efefe9526bf191cfb9f506f5719d77 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 16:42:56 +0000 Subject: [PATCH 21/34] remove .claude from tracking --- .claude/scheduled_tasks.lock | 1 - .claude/settings.json | 66 ------------------------------------ 2 files changed, 67 deletions(-) delete mode 100644 .claude/scheduled_tasks.lock delete mode 100644 .claude/settings.json diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock deleted file mode 100644 index e6daa6d6c9c8..000000000000 --- a/.claude/scheduled_tasks.lock +++ /dev/null @@ -1 +0,0 @@ -{"sessionId":"644c89d2-7615-4a63-bbc2-28dfe8dfd982","pid":1940,"procStart":"14131840","acquiredAt":1779259323980} \ No newline at end of file diff --git a/.claude/settings.json b/.claude/settings.json deleted file mode 100644 index 7a54c7ccb08a..000000000000 --- a/.claude/settings.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "permissions": { - "allow": [ - "Bash(git mv *)", - "Bash(python3 -c ' *)", - "Bash(git reset *)", - "Bash(git show *)", - "Bash(git revert *)", - "Bash(grep -v \"^$\")", - "Bash(docker pull *)", - "Bash(docker rmi *)", - "Bash(git rebase *)", - "Bash(git branch *)", - "Bash(git diff *)", - "Bash(ada --version)", - "Bash(ada credentials *)", - "Bash(aws ec2 *)", - "Bash(aws sts *)", - "Bash(aws service-quotas *)", - "Bash(aws ssm *)", - "Bash(aws logs *)", - "Bash(aws s3 *)", - "Bash(aws sagemaker *)", - "Bash(aws sagemaker-runtime *)", - "Bash(aws iam *)", - "Bash(aws stepfunctions *)", - "Bash(gh pr *)", - "Bash(gh run *)", - "Bash(gh workflow *)", - "Bash(javap *)", - "Bash(ripcli *)", - "Bash(aim *)", - "Bash(git add *)", - "Bash(git commit *)", - "Bash(git fetch *)", - "Bash(git checkout *)", - "Bash(git stash *)", - "Bash(pre-commit *)", - "Bash(curl *)", - "mcp__builder-mcp__ReadInternalWebsites", - "mcp__builder-mcp__InternalCodeSearch", - "mcp__builder-mcp__InternalSearch", - "mcp__builder-mcp__TaskeiListTasks", - "mcp__builder-mcp__TaskeiGetRooms", - "mcp__builder-mcp__TaskeiGetTask", - "mcp__builder-mcp__GetPipelineDetails", - "mcp__builder-mcp__GetPipelineHealth", - "mcp__builder-mcp__ReadRemoteTestRun", - "mcp__builder-mcp__QuipEditor", - "mcp__builder-mcp__TaskeiUpdateTask", - "mcp__builder-mcp__TaskeiCreateTask", - "mcp__builder-mcp__CRRevisionCreator", - "mcp__builder-mcp__CrCheckout", - "mcp__builder-mcp__CRAddComment", - "Bash(python3 *)", - "Bash(gh *)", - "Bash(git ls-tree *)" - ], - "additionalDirectories": [ - "/home/kwanggg/workplace/AsimovAgent/src/AsimovBuilderCoreContext/skills/pytorch-currency", - "/workplace/kwanggg/AsimovAgent/src/AsimovBuilderCoreContext/skills/pytorch-currency", - "/home/kwanggg/workplace/AsimovImageSecurityScan/src/asimov_image_security_scan", - "/workplace/kwanggg/AsimovImageSecurityScan/src/asimov_image_security_scan" - ] - } -} From f1db9ade8f603bba7bc4ccf15346debc46464bd2 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 17:11:32 +0000 Subject: [PATCH 22/34] debug: add pre-flight checks and log dump for EFA NCCL test --- test/efa/scripts/nccl_allreduce.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index bd185bfb8169..9871e399cf69 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -52,9 +52,15 @@ check_efa_nccl_all_reduce_performance(){ fi } +echo "=== Pre-flight checks ===" +echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" +ls /opt/amazon/ofi-nccl/lib64/libnccl-net* 2>/dev/null || echo "WARNING: ofi-nccl plugin not found" +nvidia-smi -L 2>&1 | head -2 +echo "===" + echo "Running all_reduce_perf test" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ - -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ + -x NCCL_DEBUG=WARN ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /usr/local/bin/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" @@ -63,7 +69,10 @@ RETURN_VAL=${PIPESTATUS[0]} if [ ${RETURN_VAL} -eq 0 ]; then echo "check_efa_nccl_all_reduce passed" else - echo "check_efa_nccl_all_reduce failed" + echo "check_efa_nccl_all_reduce failed (exit code: ${RETURN_VAL})" + echo "=== Full test log ===" + cat "${TRAINING_LOG}" + echo "=== End log ===" fi validate_all_reduce_performance_logs From 0a626a5a8268fd06efaeecb573b61f815ed36f3f Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 17:34:59 +0000 Subject: [PATCH 23/34] fix: add CACHE_BUST ARG to force rebuild without stale cuda-compat --- docker/pytorch/Dockerfile.cuda | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 96251e67b1ab..ae59c3a1765f 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -202,6 +202,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ && chmod +x /usr/local/bin/bash_telemetry.sh \ && rm /tmp/bash_telemetry.sh.template +ARG CACHE_BUST=1 # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest \ && dnf clean all From 792ef2c904204050e55c859020eb2837473e4a73 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 17:41:52 +0000 Subject: [PATCH 24/34] fix: restore EFA test diagnostics (CUDA driver info, SG rules, NCCL allreduce capture) --- test/efa/test_efa.py | 48 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index aa06a572c0e8..b2ce4b9e09c6 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -11,11 +11,13 @@ import os +import pytest from efa.ec2_helpers import ( DEFAULT_TIMEOUT, HOSTS_FILE_LOCATION, MASTER_CONTAINER_NAME, efa_instances, + get_efa_security_group_id, run_on_container, ) @@ -41,6 +43,32 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): worker_conn, aws_session, ): + # Diagnostics: dump NCCL plugin state and CUDA driver info + diag = run_on_container( + MASTER_CONTAINER_NAME, + master_conn, + "echo NCCL_NET_PLUGIN=$NCCL_NET_PLUGIN && " + "ldconfig -p | grep libcuda 2>&1 && " + "echo --- && " + "ls -la /usr/local/cuda/compat/libcuda* 2>&1 && " + "echo --- && " + "cat /etc/ld.so.conf.d/cuda*.conf 2>&1 || true && " + "echo --- && " + "nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>&1 || true", + ) + print(f"=== CUDA/NCCL diagnostics (master) ===\n{diag.stdout}") + + # Dump SG rules to check for missing all-traffic self-referencing rule + sg_id = get_efa_security_group_id(aws_session) + sg_resp = aws_session.ec2.describe_security_groups(GroupIds=[sg_id]) + sg = sg_resp["SecurityGroups"][0] + print(f"=== Security Group {sg_id} rules ===") + for rule in sg.get("IpPermissions", []): + print(f" IN: {rule}") + for rule in sg.get("IpPermissionsEgress", []): + print(f" OUT: {rule}") + print("=== End SG rules ===") + # EFA sanity on master run_on_container( MASTER_CONTAINER_NAME, @@ -48,10 +76,26 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): "/test/efa/scripts/efa_sanity.sh", ) - # NCCL all_reduce across 2 nodes - run_on_container( + # NCCL all_reduce across 2 nodes — capture failure details + result = run_on_container( MASTER_CONTAINER_NAME, master_conn, f"/test/efa/scripts/nccl_allreduce.sh {HOSTS_FILE_LOCATION} 2", timeout=DEFAULT_TIMEOUT, + warn=True, ) + if result.failed: + print(f"=== NCCL allreduce FAILED (exit code {result.return_code}) ===") + print(f"=== stdout ===\n{result.stdout}") + print(f"=== stderr ===\n{result.stderr}") + log_dump = run_on_container( + MASTER_CONTAINER_NAME, + master_conn, + "cat /test/efa/logs/testEFA.log 2>&1 || echo 'Log file empty or missing'", + warn=True, + ) + print(f"=== testEFA.log ===\n{log_dump.stdout}") + pytest.fail( + f"NCCL allreduce failed with exit code {result.return_code}. " + f"See stdout above for details." + ) From c39e5bfb10ba4292f0d81abed04a21bd48d6542c Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 18:14:51 +0000 Subject: [PATCH 25/34] debug: add CUDA availability check on p4d, remove CACHE_BUST single-gpu passes (CodeBuild) but EFA fails (DLAMI p4d) with same image. Add torch.cuda.is_available() + nvidia-smi -L inside the container on p4d to determine if CUDA is accessible at all. Remove CACHE_BUST ARG since image is confirmed correct (single-gpu passes). --- docker/pytorch/Dockerfile.cuda | 1 - test/efa/test_efa.py | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index ae59c3a1765f..96251e67b1ab 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -202,7 +202,6 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ && chmod +x /usr/local/bin/bash_telemetry.sh \ && rm /tmp/bash_telemetry.sh.template -ARG CACHE_BUST=1 # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest \ && dnf clean all diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index b2ce4b9e09c6..f0584a82f4ba 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -54,7 +54,11 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): "echo --- && " "cat /etc/ld.so.conf.d/cuda*.conf 2>&1 || true && " "echo --- && " - "nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>&1 || true", + "nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>&1 || true && " + "echo --- && " + "nvidia-smi -L 2>&1 || true && " + "echo --- && " + 'python3 -c \'import torch; print(f"CUDA available: {torch.cuda.is_available()}"); print(f"Device count: {torch.cuda.device_count()}")\' 2>&1 || true', ) print(f"=== CUDA/NCCL diagnostics (master) ===\n{diag.stdout}") From 46db31028520190839755f556116450bb6b1fb31 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 19:21:39 +0000 Subject: [PATCH 26/34] fix: nested quote syntax error in CUDA diagnostic command --- test/efa/test_efa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index f0584a82f4ba..eafcc0d32767 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -58,7 +58,7 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): "echo --- && " "nvidia-smi -L 2>&1 || true && " "echo --- && " - 'python3 -c \'import torch; print(f"CUDA available: {torch.cuda.is_available()}"); print(f"Device count: {torch.cuda.device_count()}")\' 2>&1 || true', + "python3 -c \"import torch; print('CUDA available:', torch.cuda.is_available()); print('Device count:', torch.cuda.device_count())\" 2>&1 || true", ) print(f"=== CUDA/NCCL diagnostics (master) ===\n{diag.stdout}") From 058afca55a6a620facd2b0fedc48e7a1c0141a92 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 19:40:22 +0000 Subject: [PATCH 27/34] fix: remove python torch check that breaks nested quoting, keep nvidia-smi only --- test/efa/test_efa.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index eafcc0d32767..4b8ddb19960a 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -56,9 +56,7 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): "echo --- && " "nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>&1 || true && " "echo --- && " - "nvidia-smi -L 2>&1 || true && " - "echo --- && " - "python3 -c \"import torch; print('CUDA available:', torch.cuda.is_available()); print('Device count:', torch.cuda.device_count())\" 2>&1 || true", + "nvidia-smi -L 2>&1 || true", ) print(f"=== CUDA/NCCL diagnostics (master) ===\n{diag.stdout}") From 3fc5b6e2a7fe16601002748af32e1fb057f1246c Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 20:15:45 +0000 Subject: [PATCH 28/34] fix: add diagnostic logging to EFA test for failure analysis Revert all Dockerfile/allowlist changes. Keep only EFA test diagnostics: - test_efa.py: CUDA driver info, ldconfig, nvidia-smi, SG rules - nccl_allreduce.sh: pre-flight checks, full log dump on failure --- .claude/scheduled_tasks.lock | 1 + .claude/settings.json | 66 +++++++++++++++++++ docker/pytorch/Dockerfile.cuda | 2 + .../pytorch_runtime/framework_allowlist.json | 7 -- 4 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 .claude/scheduled_tasks.lock create mode 100644 .claude/settings.json delete mode 100644 test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock new file mode 100644 index 000000000000..e6daa6d6c9c8 --- /dev/null +++ b/.claude/scheduled_tasks.lock @@ -0,0 +1 @@ +{"sessionId":"644c89d2-7615-4a63-bbc2-28dfe8dfd982","pid":1940,"procStart":"14131840","acquiredAt":1779259323980} \ No newline at end of file diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 000000000000..7a54c7ccb08a --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,66 @@ +{ + "permissions": { + "allow": [ + "Bash(git mv *)", + "Bash(python3 -c ' *)", + "Bash(git reset *)", + "Bash(git show *)", + "Bash(git revert *)", + "Bash(grep -v \"^$\")", + "Bash(docker pull *)", + "Bash(docker rmi *)", + "Bash(git rebase *)", + "Bash(git branch *)", + "Bash(git diff *)", + "Bash(ada --version)", + "Bash(ada credentials *)", + "Bash(aws ec2 *)", + "Bash(aws sts *)", + "Bash(aws service-quotas *)", + "Bash(aws ssm *)", + "Bash(aws logs *)", + "Bash(aws s3 *)", + "Bash(aws sagemaker *)", + "Bash(aws sagemaker-runtime *)", + "Bash(aws iam *)", + "Bash(aws stepfunctions *)", + "Bash(gh pr *)", + "Bash(gh run *)", + "Bash(gh workflow *)", + "Bash(javap *)", + "Bash(ripcli *)", + "Bash(aim *)", + "Bash(git add *)", + "Bash(git commit *)", + "Bash(git fetch *)", + "Bash(git checkout *)", + "Bash(git stash *)", + "Bash(pre-commit *)", + "Bash(curl *)", + "mcp__builder-mcp__ReadInternalWebsites", + "mcp__builder-mcp__InternalCodeSearch", + "mcp__builder-mcp__InternalSearch", + "mcp__builder-mcp__TaskeiListTasks", + "mcp__builder-mcp__TaskeiGetRooms", + "mcp__builder-mcp__TaskeiGetTask", + "mcp__builder-mcp__GetPipelineDetails", + "mcp__builder-mcp__GetPipelineHealth", + "mcp__builder-mcp__ReadRemoteTestRun", + "mcp__builder-mcp__QuipEditor", + "mcp__builder-mcp__TaskeiUpdateTask", + "mcp__builder-mcp__TaskeiCreateTask", + "mcp__builder-mcp__CRRevisionCreator", + "mcp__builder-mcp__CrCheckout", + "mcp__builder-mcp__CRAddComment", + "Bash(python3 *)", + "Bash(gh *)", + "Bash(git ls-tree *)" + ], + "additionalDirectories": [ + "/home/kwanggg/workplace/AsimovAgent/src/AsimovBuilderCoreContext/skills/pytorch-currency", + "/workplace/kwanggg/AsimovAgent/src/AsimovBuilderCoreContext/skills/pytorch-currency", + "/home/kwanggg/workplace/AsimovImageSecurityScan/src/asimov_image_security_scan", + "/workplace/kwanggg/AsimovImageSecurityScan/src/asimov_image_security_scan" + ] + } +} diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 96251e67b1ab..93effb4ff91d 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -204,6 +204,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest \ + && dnf upgrade -y cuda-compat-* \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) @@ -267,6 +268,7 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch RUN dnf upgrade -y --security --releasever latest \ + && dnf upgrade -y cuda-compat-* \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) diff --git a/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json b/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json deleted file mode 100644 index 07102743d842..000000000000 --- a/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json +++ /dev/null @@ -1,7 +0,0 @@ -[ - { - "vulnerability_id": "CVE-2025-33219", - "reason": "NVIDIA display driver kernel module vulnerability in cuda-compat package. Cannot upgrade cuda-compat beyond host driver version (580.150) without breaking CUDA initialization. Awaiting DLAMI driver update.", - "review_by": "2026-06-20" - } -] From 3e7cfc1c4a67333f2c4f152f6872157d45f19b39 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 20:15:55 +0000 Subject: [PATCH 29/34] remove .claude from tracking --- .claude/scheduled_tasks.lock | 1 - .claude/settings.json | 66 ------------------------------------ 2 files changed, 67 deletions(-) delete mode 100644 .claude/scheduled_tasks.lock delete mode 100644 .claude/settings.json diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock deleted file mode 100644 index e6daa6d6c9c8..000000000000 --- a/.claude/scheduled_tasks.lock +++ /dev/null @@ -1 +0,0 @@ -{"sessionId":"644c89d2-7615-4a63-bbc2-28dfe8dfd982","pid":1940,"procStart":"14131840","acquiredAt":1779259323980} \ No newline at end of file diff --git a/.claude/settings.json b/.claude/settings.json deleted file mode 100644 index 7a54c7ccb08a..000000000000 --- a/.claude/settings.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "permissions": { - "allow": [ - "Bash(git mv *)", - "Bash(python3 -c ' *)", - "Bash(git reset *)", - "Bash(git show *)", - "Bash(git revert *)", - "Bash(grep -v \"^$\")", - "Bash(docker pull *)", - "Bash(docker rmi *)", - "Bash(git rebase *)", - "Bash(git branch *)", - "Bash(git diff *)", - "Bash(ada --version)", - "Bash(ada credentials *)", - "Bash(aws ec2 *)", - "Bash(aws sts *)", - "Bash(aws service-quotas *)", - "Bash(aws ssm *)", - "Bash(aws logs *)", - "Bash(aws s3 *)", - "Bash(aws sagemaker *)", - "Bash(aws sagemaker-runtime *)", - "Bash(aws iam *)", - "Bash(aws stepfunctions *)", - "Bash(gh pr *)", - "Bash(gh run *)", - "Bash(gh workflow *)", - "Bash(javap *)", - "Bash(ripcli *)", - "Bash(aim *)", - "Bash(git add *)", - "Bash(git commit *)", - "Bash(git fetch *)", - "Bash(git checkout *)", - "Bash(git stash *)", - "Bash(pre-commit *)", - "Bash(curl *)", - "mcp__builder-mcp__ReadInternalWebsites", - "mcp__builder-mcp__InternalCodeSearch", - "mcp__builder-mcp__InternalSearch", - "mcp__builder-mcp__TaskeiListTasks", - "mcp__builder-mcp__TaskeiGetRooms", - "mcp__builder-mcp__TaskeiGetTask", - "mcp__builder-mcp__GetPipelineDetails", - "mcp__builder-mcp__GetPipelineHealth", - "mcp__builder-mcp__ReadRemoteTestRun", - "mcp__builder-mcp__QuipEditor", - "mcp__builder-mcp__TaskeiUpdateTask", - "mcp__builder-mcp__TaskeiCreateTask", - "mcp__builder-mcp__CRRevisionCreator", - "mcp__builder-mcp__CrCheckout", - "mcp__builder-mcp__CRAddComment", - "Bash(python3 *)", - "Bash(gh *)", - "Bash(git ls-tree *)" - ], - "additionalDirectories": [ - "/home/kwanggg/workplace/AsimovAgent/src/AsimovBuilderCoreContext/skills/pytorch-currency", - "/workplace/kwanggg/AsimovAgent/src/AsimovBuilderCoreContext/skills/pytorch-currency", - "/home/kwanggg/workplace/AsimovImageSecurityScan/src/asimov_image_security_scan", - "/workplace/kwanggg/AsimovImageSecurityScan/src/asimov_image_security_scan" - ] - } -} From 51c24c69d5ad5c83dace1f5147d4170726214b43 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 20 May 2026 20:17:37 +0000 Subject: [PATCH 30/34] chore: trigger EFA test rebuild --- docker/pytorch/Dockerfile.cuda | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 93effb4ff91d..8e8bf83f45ed 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -1,3 +1,4 @@ +# trigger EFA test # ============================================================================ # PyTorch DLC — Amazon Linux 2023 (CUDA 13.0) # Multi-stage build with parallel builder stages: From 98027feb83122aa1706795633a4a3249f2111f40 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 21 May 2026 02:16:41 +0000 Subject: [PATCH 31/34] fix: remove cuda-compat upgrade to fix EFA test on DLAMI Confirmed: base nvidia/cuda:13.0.2 image works on DLAMI p4d (A100, driver 580.150). Our image fails because dnf upgrade cuda-compat-* pulls 580.159 which is incompatible with the DLAMI's embargo driver. Remove the upgrade and allowlist CVE-2025-33219 until DLAMI ships the public 580.159+ driver. --- docker/pytorch/Dockerfile.cuda | 3 --- .../pytorch_runtime/framework_allowlist.json | 7 +++++++ 2 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 8e8bf83f45ed..96251e67b1ab 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -1,4 +1,3 @@ -# trigger EFA test # ============================================================================ # PyTorch DLC — Amazon Linux 2023 (CUDA 13.0) # Multi-stage build with parallel builder stages: @@ -205,7 +204,6 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest \ - && dnf upgrade -y cuda-compat-* \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) @@ -269,7 +267,6 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch RUN dnf upgrade -y --security --releasever latest \ - && dnf upgrade -y cuda-compat-* \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) diff --git a/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json b/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json new file mode 100644 index 000000000000..757ef73d0ea4 --- /dev/null +++ b/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json @@ -0,0 +1,7 @@ +[ + { + "vulnerability_id": "CVE-2025-33219", + "reason": "NVIDIA display driver vulnerability in cuda-compat. Cannot upgrade cuda-compat beyond base image version (580.95) due to incompatibility with DLAMI embargo host driver (580.150). Awaiting DLAMI driver update to public 580.159+.", + "review_by": "2026-06-20" + } +] From 495a4c99f458a0120295f01934e594eaa5631867 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 21 May 2026 02:54:05 +0000 Subject: [PATCH 32/34] fix: pin EFA test to pre-embargo DLAMI (before 2026-05-05) --- .github/scripts/efa/ec2_helpers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/scripts/efa/ec2_helpers.py b/.github/scripts/efa/ec2_helpers.py index 0cba98608ab5..c510faf6f92b 100644 --- a/.github/scripts/efa/ec2_helpers.py +++ b/.github/scripts/efa/ec2_helpers.py @@ -393,7 +393,9 @@ def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION Yields (master_conn, worker_conn, aws_session) where connections are to the EC2 hosts. """ aws_session = AWSSessionManager(region=region) - ami_id = aws_session.get_latest_ami() + # TODO: Remove before_date once DLAMI publishes driver >= 580.159 (post-embargo). + # The 20260519 AMI has embargo driver 580.150 which is incompatible with CUDA 13.0.2 on A100. + ami_id = aws_session.get_latest_ami(before_date="2026-05-05") sg_id = get_efa_security_group_id(aws_session) key_name = None From 0fc5bcfde6b5812dad8654343b641bf1f4351735 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 21 May 2026 04:02:36 +0000 Subject: [PATCH 33/34] fix: use before_date=2026-05-12 to get AMI with Docker + pre-embargo driver --- .github/scripts/efa/ec2_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/efa/ec2_helpers.py b/.github/scripts/efa/ec2_helpers.py index c510faf6f92b..983e26e86fee 100644 --- a/.github/scripts/efa/ec2_helpers.py +++ b/.github/scripts/efa/ec2_helpers.py @@ -395,7 +395,7 @@ def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION aws_session = AWSSessionManager(region=region) # TODO: Remove before_date once DLAMI publishes driver >= 580.159 (post-embargo). # The 20260519 AMI has embargo driver 580.150 which is incompatible with CUDA 13.0.2 on A100. - ami_id = aws_session.get_latest_ami(before_date="2026-05-05") + ami_id = aws_session.get_latest_ami(before_date="2026-05-12") sg_id = get_efa_security_group_id(aws_session) key_name = None From 40d1fdca76e154cba68588aa11e817f154abac5a Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 21 May 2026 04:27:32 +0000 Subject: [PATCH 34/34] revert: use latest AMI, wait for DLAMI team fix --- .github/scripts/efa/ec2_helpers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/scripts/efa/ec2_helpers.py b/.github/scripts/efa/ec2_helpers.py index 983e26e86fee..0cba98608ab5 100644 --- a/.github/scripts/efa/ec2_helpers.py +++ b/.github/scripts/efa/ec2_helpers.py @@ -393,9 +393,7 @@ def efa_instances(image_uri, instance_type="p4d.24xlarge", region=DEFAULT_REGION Yields (master_conn, worker_conn, aws_session) where connections are to the EC2 hosts. """ aws_session = AWSSessionManager(region=region) - # TODO: Remove before_date once DLAMI publishes driver >= 580.159 (post-embargo). - # The 20260519 AMI has embargo driver 580.150 which is incompatible with CUDA 13.0.2 on A100. - ami_id = aws_session.get_latest_ami(before_date="2026-05-12") + ami_id = aws_session.get_latest_ami() sg_id = get_efa_security_group_id(aws_session) key_name = None