Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
77675f7
Trigger EFA Test
Eren-Jeager123 May 20, 2026
542dc11
fix: set NCCL_NET_PLUGIN=ofi for EFA NCCL plugin discovery
Eren-Jeager123 May 20, 2026
6eda96f
debug: add EFA/NCCL plugin diagnostics to allreduce test
Eren-Jeager123 May 20, 2026
1a519f3
debug: move diagnostics to pytest for visible output
Eren-Jeager123 May 20, 2026
c5b9d0c
debug: use print() and warn=True for visible diagnostic output
Eren-Jeager123 May 20, 2026
dcb4513
debug: use print() and warn=True for visible diagnostic output
Eren-Jeager123 May 20, 2026
cab5691
fix: add cuda-compat to LD_LIBRARY_PATH for driver forward compatibility
Eren-Jeager123 May 20, 2026
dff255a
fix: remove cuda-compat override from allreduce test script
Eren-Jeager123 May 20, 2026
f9b77b9
fix: add cuda-compat to LD_LIBRARY_PATH for driver forward compatibility
Eren-Jeager123 May 20, 2026
401badb
fix: use LD_PRELOAD for cuda-compat in EFA NCCL test
Eren-Jeager123 May 20, 2026
8d9f5f1
fix: use resolved path for LD_PRELOAD cuda-compat and add load verifi…
Eren-Jeager123 May 20, 2026
8e88edb
fix: stop upgrading cuda-compat to avoid driver mismatch
Eren-Jeager123 May 20, 2026
8b71703
cleanup: remove LD_PRELOAD hack, real fix is in Dockerfile cuda-compat
Eren-Jeager123 May 20, 2026
5c866cc
fix: keep cuda-compat.conf for driver forward compatibility
Eren-Jeager123 May 20, 2026
4dbc5bd
chore: retrigger CI
Eren-Jeager123 May 20, 2026
9cd117d
fix: exclude cuda-compat from security upgrade to match host driver
Eren-Jeager123 May 20, 2026
d672e8b
fix: add CACHE_REFRESH ARG to invalidate cached security patch layer
Eren-Jeager123 May 20, 2026
d211053
cleanup: remove NCCL_NET_PLUGIN=ofi debug env var
Eren-Jeager123 May 20, 2026
fdc20ba
fix: remove trailing backslash from ENV after NCCL_NET_PLUGIN removal
Eren-Jeager123 May 20, 2026
8d56a93
fix: remove cuda-compat upgrade to prevent driver version mismatch
Eren-Jeager123 May 20, 2026
50e12ca
remove .claude from tracking
Eren-Jeager123 May 20, 2026
f1db9ad
debug: add pre-flight checks and log dump for EFA NCCL test
Eren-Jeager123 May 20, 2026
0a626a5
fix: add CACHE_BUST ARG to force rebuild without stale cuda-compat
Eren-Jeager123 May 20, 2026
792ef2c
fix: restore EFA test diagnostics (CUDA driver info, SG rules, NCCL a…
Eren-Jeager123 May 20, 2026
c39e5bf
debug: add CUDA availability check on p4d, remove CACHE_BUST
Eren-Jeager123 May 20, 2026
46db310
fix: nested quote syntax error in CUDA diagnostic command
Eren-Jeager123 May 20, 2026
058afca
fix: remove python torch check that breaks nested quoting, keep nvidi…
Eren-Jeager123 May 20, 2026
3fc5b6e
fix: add diagnostic logging to EFA test for failure analysis
Eren-Jeager123 May 20, 2026
3e7cfc1
remove .claude from tracking
Eren-Jeager123 May 20, 2026
51c24c6
chore: trigger EFA test rebuild
Eren-Jeager123 May 20, 2026
98027fe
fix: remove cuda-compat upgrade to fix EFA test on DLAMI
Eren-Jeager123 May 21, 2026
495a4c9
fix: pin EFA test to pre-embargo DLAMI (before 2026-05-05)
Eren-Jeager123 May 21, 2026
0fc5bcf
fix: use before_date=2026-05-12 to get AMI with Docker + pre-embargo …
Eren-Jeager123 May 21, 2026
40d1fdc
revert: use latest AMI, wait for DLAMI team fix
Eren-Jeager123 May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions docker/pytorch/Dockerfile.cuda
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,6 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \

# Security patch — run after all installers so every OS package is covered
RUN dnf upgrade -y --security --releasever latest \
&& dnf upgrade -y cuda-compat-* \
&& dnf clean all

# Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc)
Expand Down Expand Up @@ -268,7 +267,6 @@ RUN chmod +x /usr/local/bin/deep_learning_container.py \

# Security patch
RUN dnf upgrade -y --security --releasever latest \
&& dnf upgrade -y cuda-compat-* \
&& dnf clean all

# Telemetry bashrc hook — must be after security patch (dnf may replace /etc/bashrc)
Expand Down
13 changes: 11 additions & 2 deletions test/efa/scripts/nccl_allreduce.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,15 @@ check_efa_nccl_all_reduce_performance(){
fi
}

echo "=== Pre-flight checks ==="
echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
ls /opt/amazon/ofi-nccl/lib64/libnccl-net* 2>/dev/null || echo "WARNING: ofi-nccl plugin not found"
nvidia-smi -L 2>&1 | head -2
echo "==="

echo "Running all_reduce_perf test"
mpirun -x FI_PROVIDER="efa" -x FI_EFA_FORK_SAFE=1 -n $NODES -N $GPU_COUNT --hostfile $NUM_HOSTS_FILE \
-x NCCL_DEBUG=INFO ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \
-x NCCL_DEBUG=WARN ${USE_DEVICE_RDMA_ARG} -x NCCL_PROTO=simple -x NCCL_ALGO=ring -x RDMAV_FORK_SAFE=1 \
-x PATH -x LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:$LD_LIBRARY_PATH \
-x NCCL_SOCKET_IFNAME=^lo --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \
/usr/local/bin/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 2>&1 | tee "${TRAINING_LOG}"
Expand All @@ -63,7 +69,10 @@ RETURN_VAL=${PIPESTATUS[0]}
if [ ${RETURN_VAL} -eq 0 ]; then
echo "check_efa_nccl_all_reduce passed"
else
echo "check_efa_nccl_all_reduce failed"
echo "check_efa_nccl_all_reduce failed (exit code: ${RETURN_VAL})"
echo "=== Full test log ==="
cat "${TRAINING_LOG}"
echo "=== End log ==="
fi

validate_all_reduce_performance_logs
Expand Down
50 changes: 48 additions & 2 deletions test/efa/test_efa.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@

import os

import pytest
from efa.ec2_helpers import (
DEFAULT_TIMEOUT,
HOSTS_FILE_LOCATION,
MASTER_CONTAINER_NAME,
efa_instances,
get_efa_security_group_id,
run_on_container,
)

Expand All @@ -41,17 +43,61 @@ def test_efa_sanity_and_nccl(image_uri=IMAGE_URI):
worker_conn,
aws_session,
):
# Diagnostics: dump NCCL plugin state and CUDA driver info
diag = run_on_container(
MASTER_CONTAINER_NAME,
master_conn,
"echo NCCL_NET_PLUGIN=$NCCL_NET_PLUGIN && "
"ldconfig -p | grep libcuda 2>&1 && "
"echo --- && "
"ls -la /usr/local/cuda/compat/libcuda* 2>&1 && "
"echo --- && "
"cat /etc/ld.so.conf.d/cuda*.conf 2>&1 || true && "
"echo --- && "
"nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>&1 || true && "
"echo --- && "
"nvidia-smi -L 2>&1 || true",
)
print(f"=== CUDA/NCCL diagnostics (master) ===\n{diag.stdout}")

# Dump SG rules to check for missing all-traffic self-referencing rule
sg_id = get_efa_security_group_id(aws_session)
sg_resp = aws_session.ec2.describe_security_groups(GroupIds=[sg_id])
sg = sg_resp["SecurityGroups"][0]
print(f"=== Security Group {sg_id} rules ===")
for rule in sg.get("IpPermissions", []):
print(f" IN: {rule}")
for rule in sg.get("IpPermissionsEgress", []):
print(f" OUT: {rule}")
print("=== End SG rules ===")

# EFA sanity on master
run_on_container(
MASTER_CONTAINER_NAME,
master_conn,
"/test/efa/scripts/efa_sanity.sh",
)

# NCCL all_reduce across 2 nodes
run_on_container(
# NCCL all_reduce across 2 nodes — capture failure details
result = run_on_container(
MASTER_CONTAINER_NAME,
master_conn,
f"/test/efa/scripts/nccl_allreduce.sh {HOSTS_FILE_LOCATION} 2",
timeout=DEFAULT_TIMEOUT,
warn=True,
)
if result.failed:
print(f"=== NCCL allreduce FAILED (exit code {result.return_code}) ===")
print(f"=== stdout ===\n{result.stdout}")
print(f"=== stderr ===\n{result.stderr}")
log_dump = run_on_container(
MASTER_CONTAINER_NAME,
master_conn,
"cat /test/efa/logs/testEFA.log 2>&1 || echo 'Log file empty or missing'",
warn=True,
)
print(f"=== testEFA.log ===\n{log_dump.stdout}")
pytest.fail(
f"NCCL allreduce failed with exit code {result.return_code}. "
f"See stdout above for details."
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[
{
"vulnerability_id": "CVE-2025-33219",
"reason": "NVIDIA display driver vulnerability in cuda-compat. Cannot upgrade cuda-compat beyond base image version (580.95) due to incompatibility with DLAMI embargo host driver (580.150). Awaiting DLAMI driver update to public 580.159+.",
"review_by": "2026-06-20"
}
]
Loading