From 6a71383dc744c763b1279ba16e66beb2dab9b0ba Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 17 May 2026 22:42:34 -0400
Subject: [PATCH] runners(mi325x): exclude broken enroot node chi-mi325x-pod1-121
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root-caused via the failed sweeps on #1467, #1468, #1469 (all three
[Klaud Cold] vLLM v0.21 bumps on different mi325x recipes): every
failure landed on chi-mi325x-pod1-121 with

    enroot-aufs2ovlfs: failed to set capabilities: Operation not permitted

before the .sqsh import even completes; the subsequent pyxis mount then
fails with "No such file or directory". The same image works cleanly on
every other up node (017/018/019/020/027) — confirmed not OOM and not a
recipe issue.

This matches the existing pattern for mi300x in #1462 (pin salloc away
from chronically-bad nodes). For mi325x there's currently only the one
node to exclude, so use --exclude rather than --nodelist; that way we
don't have to maintain an allow-list as nodes come and go.

pod1-121 has separately been drained on the controller with a watchdog
(per KLAUD_DEBUG.md §5.6) so it stays out of the pool until ops fix the
underlying setcap regression.
Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_mi325x-amds.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 144b54646..810cbde2f 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -9,7 +9,11 @@ LOCK_FILE="${SQUASH_FILE}.lock" set -x -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +# Exclude known-broken mi325x nodes: +# chi-mi325x-pod1-121: enroot-aufs2ovlfs setcap fails on this node's NFS-backed +# squash dir; container image import never completes +# (root-caused via #1467/#1468/#1469 sweep failures). +JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi325x-pod1-121.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=480 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') if [ -z "$JOB_ID" ]; then echo "ERROR: salloc failed to allocate a job"