Skip to content

Commit fb2d771

Browse files
ichbinblau and claude committed
fix docker detection: per-node probe since group membership varies
Export DOCKER_CMD_DETECT as a shell snippet that each srun participant evaluates locally, instead of testing a single node and assuming that all nodes have the same docker socket permissions.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
1 parent 672e693 commit fb2d771

1 file changed

Lines changed: 9 additions & 11 deletions

File tree

benchmarks/multi_node/amd_utils/job.slurm

Lines changed: 9 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -199,15 +199,9 @@ FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
199199
SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES)
200200
SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//')
201201

202-
# Docker privilege detection — test on a compute node, not the batch host.
203-
FIRST_NODE=$(echo "$SELECTED_NODES" | head -1)
204-
if srun --nodelist="$FIRST_NODE" -N1 -n1 --overlap bash -c 'docker ps &>/dev/null'; then
205-
DOCKER_CMD="docker"
206-
else
207-
DOCKER_CMD="sudo docker"
208-
fi
209-
export DOCKER_CMD
210-
echo "[docker-detect] DOCKER_CMD=$DOCKER_CMD (tested on $FIRST_NODE)"
202+
# Docker privilege detection — evaluated per-node since group membership varies.
203+
# Exported as a snippet so every srun participant resolves it locally.
204+
# Single-quoted on purpose: nothing expands here. Each consumer runs
# `eval "$DOCKER_CMD_DETECT"` on its own node, which sets DOCKER_CMD to
# "docker" when `docker ps` succeeds there and to "sudo docker" otherwise.
# `&>` already silences both stdout and stderr, so no extra `2>&1` is needed.
export DOCKER_CMD_DETECT='if docker ps &>/dev/null; then DOCKER_CMD=docker; else DOCKER_CMD="sudo docker"; fi'
211205

212206
# Update SLURM environment variables
213207
export SLURM_NNODES=$NUM_NODES
@@ -402,6 +396,10 @@ set -euo pipefail
402396
403397
echo \"Rank \$SLURM_PROCID on \$(hostname)\"
404398
399+
# Per-node docker privilege detection
400+
eval \"\$DOCKER_CMD_DETECT\"
401+
echo \"[docker-detect] rank \$SLURM_PROCID: DOCKER_CMD=\$DOCKER_CMD\"
402+
405403
# Pre-clean (idempotent)
406404
\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true
407405
\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true
@@ -484,12 +482,12 @@ exit \$DOCKER_EXIT_CODE
484482
"
485483

486484
if [[ "${KEEP_CONTAINERS}" != "1" ]]; then
487-
srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true'
485+
srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true'
488486

489487
# Clean up vLLM external router container on node 0
490488
if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then
491489
srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c '
492-
'"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true
490+
eval "$DOCKER_CMD_DETECT"; $DOCKER_CMD rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true
493491
'
494492
fi
495493
fi

0 commit comments

Comments
 (0)