Skip to content

Commit 9fe4591

Browse files
committed
[NV] llm-d: probe envoy admin /ready with timeout, bundle logs on cancel
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
1 parent 40fe5fc commit 9fe4591

2 files changed

Lines changed: 39 additions & 13 deletions

File tree

benchmarks/multi_node/llm-d/server.sh

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,28 @@ PY
290290
envoy -c /etc/envoy/envoy.yaml > "$ENVOY_LOG" 2>&1 &
291291
ENVOY_PID=$!
292292

293-
wait_for_server_ready --port "$ENVOY_PORT" --server-log "$ENVOY_LOG" --server-pid "$ENVOY_PID"
293+
# Probe Envoy's admin /ready (port 9901) instead of /health on :8080.
# /health on :8080 routes through ext_proc -> EPP -> ORIGINAL_DST, which
# only resolves once a request has the right model/profile metadata for
# EPP to set x-gateway-destination-endpoint. Health-style requests
# without that metadata get 503 and the wait loop spins forever.
#
# The loop ends in one of three ways:
#   - /ready answers 200            -> continue with the script
#   - the envoy process is gone     -> dump the log tail, exit 1
#   - the overall deadline passes   -> dump the log tail, exit 1
envoy_admin_addr="127.0.0.1:9901"
envoy_ready_timeout=120   # seconds allowed for the whole wait
envoy_probe_interval=2    # seconds between probes

echo "Waiting for envoy admin on ${envoy_admin_addr}/ready"
ENVOY_WAIT_DEADLINE=$(( $(date +%s) + envoy_ready_timeout ))

# --connect-timeout/--max-time bound every single probe. curl has no
# default total timeout, so without them one probe against a socket that
# accepts the connection but never answers would block indefinitely and
# the deadline check below would never be reached.
until [[ "$(curl --output /dev/null --silent --write-out '%{http_code}' \
        --connect-timeout 2 --max-time 5 \
        "http://${envoy_admin_addr}/ready" 2>/dev/null)" == "200" ]]; do
    # kill -0 sends no signal; it only tests that the PID still exists.
    if ! kill -0 "$ENVOY_PID" 2>/dev/null; then
        echo "ERROR: envoy died before admin /ready returned 200" >&2
        tail -n 80 "$ENVOY_LOG" >&2 || true
        exit 1
    fi
    if (( $(date +%s) >= ENVOY_WAIT_DEADLINE )); then
        echo "ERROR: envoy admin /ready did not return 200 within ${envoy_ready_timeout}s" >&2
        tail -n 80 "$ENVOY_LOG" >&2 || true
        exit 1
    fi
    sleep "$envoy_probe_interval"
done
echo "Envoy admin ready; listener should be on $ENVOY_PORT"
294315

295316
# Wait for the prefill leader's sidecar before starting the bench.
296317
# wait_for_server_ready can only probe localhost; the prefill leader

runners/launch_h200-dgxc-slurm.sh

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,20 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
106106
fi
107107
echo "Submitted llm-d job: $JOB_ID"
108108

109+
# Make sure the server-log tarball ships even when this step is
# killed mid-flight (e.g. workflow cancel): without the traps below the
# body of this branch is interrupted before any tar+cp can run,
# the `Upload server logs` step finds no file, and the user sees
# nothing about what happened inside the container.

#######################################
# Archive the contents of $BENCHMARK_LOGS_DIR into
# $GITHUB_WORKSPACE/multinode_server_logs.tar.gz.
# Does nothing when the directory is missing or nothing in it matches "*".
# A tar failure is reported as a warning; the function still returns 0.
# Globals:   BENCHMARK_LOGS_DIR (read), GITHUB_WORKSPACE (read)
# Outputs:   warning on stderr when tar fails
# Returns:   0 always
#######################################
bundle_server_logs() {
    [[ -d "$BENCHMARK_LOGS_DIR" ]] || return 0
    compgen -G "$BENCHMARK_LOGS_DIR/*" >/dev/null 2>&1 || return 0
    if ! tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" \
            -C "$BENCHMARK_LOGS_DIR" . 2>/dev/null; then
        echo "WARNING: failed to bundle multinode_server_logs.tar.gz" >&2
    fi
}

#######################################
# One-shot cleanup: bundle the logs, then cancel the Slurm job.
# Globals:   JOB_ID (read)
# Returns:   0 always (scancel errors are deliberately ignored, e.g. when
#            the job has already finished)
#######################################
cleanup_llmd_job() {
    # Run at most once: drop the EXIT trap so a signal-triggered cleanup is
    # not repeated on the way out, and ignore further signals so a follow-up
    # TERM cannot cut the bundling short.
    trap - EXIT
    trap '' INT TERM HUP
    bundle_server_logs
    scancel "$JOB_ID" 2>/dev/null || true
}

# A signal handler that merely returns lets bash resume the script after the
# interrupted command, so each signal trap exits explicitly with the
# conventional 128+signal status once cleanup is done.
trap cleanup_llmd_job EXIT
trap 'cleanup_llmd_job; exit 130' INT
trap 'cleanup_llmd_job; exit 143' TERM
trap 'cleanup_llmd_job; exit 129' HUP
122+
109123
LOG_FILE="${BENCHMARK_LOGS_DIR}/slurm_job-${JOB_ID}.out"
110124

111125
# Wait for log file (also catch early failures).
@@ -129,18 +143,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
129143
tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
130144
wait $POLL_PID
131145

132-
# Bundle every server-side log into the artifact path the
133-
# `Upload server logs` workflow step picks up. Without this, the
134-
# llm-d-vllm path (this branch) leaves the logs on the runner
135-
# host where the user cannot reach them - epp.log, vllm_*.log,
136-
# sidecar_*.log, envoy_*.log, slurm_job-*.{out,err}, etc.
137-
if [[ -d "$BENCHMARK_LOGS_DIR" ]] && compgen -G "$BENCHMARK_LOGS_DIR/*" >/dev/null 2>&1; then
138-
tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" \
139-
-C "$BENCHMARK_LOGS_DIR" . 2>/dev/null || \
140-
echo "WARNING: failed to bundle multinode_server_logs.tar.gz" >&2
141-
fi
142-
143-
# Result collection: same shape as AMD path.
146+
# Result collection: same shape as AMD path. The server-log
147+
# tarball is produced by the EXIT trap above (so it ships even
148+
# when this step is cancelled mid-flight).
144149
for result_file in $(find "${BENCHMARK_LOGS_DIR}" -name "${RESULT_FILENAME}*.json" 2>/dev/null); do
145150
file_name=$(basename "$result_file")
146151
cp "$result_file" "$GITHUB_WORKSPACE/${file_name}"

0 commit comments

Comments
 (0)