diff --git a/docker/pytorch/Dockerfile.cuda b/docker/pytorch/Dockerfile.cuda index 93effb4ff91d..96251e67b1ab 100644 --- a/docker/pytorch/Dockerfile.cuda +++ b/docker/pytorch/Dockerfile.cuda @@ -204,7 +204,6 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch — run after all installers so every OS package is covered RUN dnf upgrade -y --security --releasever latest \ - && dnf upgrade -y cuda-compat-* \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) @@ -268,7 +267,6 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \ # Security patch RUN dnf upgrade -y --security --releasever latest \ - && dnf upgrade -y cuda-compat-* \ && dnf clean all # Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc) diff --git a/test/efa/scripts/nccl_allreduce.sh b/test/efa/scripts/nccl_allreduce.sh index bd185bfb8169..9871e399cf69 100755 --- a/test/efa/scripts/nccl_allreduce.sh +++ b/test/efa/scripts/nccl_allreduce.sh @@ -52,9 +52,15 @@ check_efa_nccl_all_reduce_performance(){ fi } +echo "=== Pre-flight checks ===" +echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" +ls /opt/amazon/ofi-nccl/lib64/libnccl-net* 2>/dev/null || echo "WARNING: ofi-nccl plugin not found" +nvidia-smi -L 2>&1 | head -2 +echo "===" + echo "Running all_reduce_perf test" mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \ - -x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ + -x NCCL_DEBUG=WARN ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \ -x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \ -x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ /usr/local/bin/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}" @@ -63,7 +69,10 @@ RETURN_VAL=${PIPESTATUS[0]} if [ ${RETURN_VAL} -eq 0 ]; then echo "check_efa_nccl_all_reduce passed" else - echo "check_efa_nccl_all_reduce failed" + echo "check_efa_nccl_all_reduce failed (exit code: ${RETURN_VAL})" + echo "=== Full test log ===" + cat "${TRAINING_LOG}" + echo "=== End log ===" fi validate_all_reduce_performance_logs diff --git a/test/efa/test_efa.py b/test/efa/test_efa.py index aa06a572c0e8..4b8ddb19960a 100644 --- a/test/efa/test_efa.py +++ b/test/efa/test_efa.py @@ -11,11 +11,13 @@ import os +import pytest from efa.ec2_helpers import ( DEFAULT_TIMEOUT, HOSTS_FILE_LOCATION, MASTER_CONTAINER_NAME, efa_instances, + get_efa_security_group_id, run_on_container, ) @@ -41,6 +43,34 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): worker_conn, aws_session, ): + # Diagnostics: dump NCCL plugin state and CUDA driver info + diag = run_on_container( + MASTER_CONTAINER_NAME, + master_conn, + "echo NCCL_NET_PLUGIN=$NCCL_NET_PLUGIN && " + "ldconfig -p | grep libcuda 2>&1 && " + "echo --- && " + "ls -la /usr/local/cuda/compat/libcuda* 2>&1 && " + "echo --- && " + "cat /etc/ld.so.conf.d/cuda*.conf 2>&1 || true && " + "echo --- && " + "nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>&1 || true && " + "echo --- && " + "nvidia-smi -L 2>&1 || true", + ) + print(f"=== CUDA/NCCL diagnostics (master) ===\n{diag.stdout}") + + # Dump SG rules to check for missing all-traffic self-referencing rule + sg_id = get_efa_security_group_id(aws_session) + sg_resp = aws_session.ec2.describe_security_groups(GroupIds=[sg_id]) + sg = sg_resp["SecurityGroups"][0] + print(f"=== Security Group {sg_id} rules ===") + for rule in sg.get("IpPermissions", []): + print(f" IN: {rule}") + for rule in sg.get("IpPermissionsEgress", []): + print(f" OUT: {rule}") + print("=== End SG rules ===") + # EFA sanity on master run_on_container( MASTER_CONTAINER_NAME, @@ -48,10 +78,26 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI): "/test/efa/scripts/efa_sanity.sh", ) - # NCCL all_reduce across 2 nodes - run_on_container( + # NCCL all_reduce across 2 nodes — capture failure details + result = run_on_container( MASTER_CONTAINER_NAME, master_conn, f"/test/efa/scripts/nccl_allreduce.sh {HOSTS_FILE_LOCATION} 2", timeout=DEFAULT_TIMEOUT, + warn=True, ) + if result.failed: + print(f"=== NCCL allreduce FAILED (exit code {result.return_code}) ===") + print(f"=== stdout ===\n{result.stdout}") + print(f"=== stderr ===\n{result.stderr}") + log_dump = run_on_container( + MASTER_CONTAINER_NAME, + master_conn, + "cat /test/efa/logs/testEFA.log 2>&1 || echo 'Log file empty or missing'", + warn=True, + ) + print(f"=== testEFA.log ===\n{log_dump.stdout}") + pytest.fail( + f"NCCL allreduce failed with exit code {result.return_code}. " + f"See stdout above for details." + ) diff --git a/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json b/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json new file mode 100644 index 000000000000..757ef73d0ea4 --- /dev/null +++ b/test/security/data/ecr_scan_allowlist/pytorch_runtime/framework_allowlist.json @@ -0,0 +1,7 @@ +[ + { + "vulnerability_id": "CVE-2025-33219", + "reason": "NVIDIA display driver vulnerability in cuda-compat. Cannot upgrade cuda-compat beyond base image version (580.95) due to incompatibility with DLAMI embargo host driver (580.150). Awaiting DLAMI driver update to public 580.159+.", + "review_by": "2026-06-20" + } +]