@@ -106,6 +106,20 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
106106 fi
107107 echo " Submitted llm-d job: $JOB_ID "
108108
# Make sure the server-log tarball ships even when this step is
# killed mid-flight (e.g. workflow cancel): without the trap the
# body of this branch is interrupted before the result collection
# below runs, the `Upload server logs` step finds no file, and the
# user sees nothing about what happened inside the container.

# Pack everything under $BENCHMARK_LOGS_DIR into
# $GITHUB_WORKSPACE/multinode_server_logs.tar.gz.
# Globals:   BENCHMARK_LOGS_DIR (read), GITHUB_WORKSPACE (read)
# Outputs:   a warning on stderr when tar fails
# Returns:   0 always (best effort; a missing or empty log dir is a no-op)
bundle_server_logs() {
    if [[ -d "$BENCHMARK_LOGS_DIR" ]] && compgen -G "$BENCHMARK_LOGS_DIR/*" > /dev/null 2>&1; then
        tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" \
            -C "$BENCHMARK_LOGS_DIR" . 2> /dev/null ||
            echo "WARNING: failed to bundle multinode_server_logs.tar.gz" >&2
    fi
}

# EXIT handler: bundle the logs, then cancel the Slurm job.
# Globals:   JOB_ID (read)
# Returns:   0 always; the shell's original exit status is preserved
#            because this handler never calls `exit` itself.
cleanup_llmd_job() {
    # Run at most once, and do not let a second signal interrupt the
    # tar/scancel below half-way through.
    trap - EXIT
    trap '' INT TERM HUP
    bundle_server_logs
    # Best effort: the job may already be gone, or scancel unavailable.
    scancel "$JOB_ID" 2> /dev/null || true
}

# Bash resumes the script after a signal trap returns, so the signal
# handlers must `exit` explicitly; otherwise a cancelled step would
# keep running and the cleanup would fire a second time at EXIT.
# Exiting here triggers the EXIT trap, which does the real cleanup.
trap cleanup_llmd_job EXIT
trap 'exit 130' INT   # 128 + SIGINT(2)
trap 'exit 143' TERM  # 128 + SIGTERM(15)
trap 'exit 129' HUP   # 128 + SIGHUP(1)
122+
109123 LOG_FILE=" ${BENCHMARK_LOGS_DIR} /slurm_job-${JOB_ID} .out"
110124
111125 # Wait for log file (also catch early failures).
@@ -129,18 +143,9 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
129143 tail -F -s 2 -n+1 " $LOG_FILE " --pid=$POLL_PID 2> /dev/null
130144 wait $POLL_PID
131145
132- # Bundle every server-side log into the artifact path the
133- # `Upload server logs` workflow step picks up. Without this, the
134- # llm-d-vllm path (this branch) leaves the logs on the runner
135- # host where the user cannot reach them - epp.log, vllm_*.log,
136- # sidecar_*.log, envoy_*.log, slurm_job-*.{out,err}, etc.
137- if [[ -d " $BENCHMARK_LOGS_DIR " ]] && compgen -G " $BENCHMARK_LOGS_DIR /*" > /dev/null 2>&1 ; then
138- tar czf " $GITHUB_WORKSPACE /multinode_server_logs.tar.gz" \
139- -C " $BENCHMARK_LOGS_DIR " . 2> /dev/null || \
140- echo " WARNING: failed to bundle multinode_server_logs.tar.gz" >&2
141- fi
142-
143- # Result collection: same shape as AMD path.
146+ # Result collection: same shape as AMD path. The server-log
147+ # tarball is produced by the EXIT trap above (so it ships even
148+ # when this step is cancelled mid-flight).
144149 for result_file in $( find " ${BENCHMARK_LOGS_DIR} " -name " ${RESULT_FILENAME} *.json" 2> /dev/null) ; do
145150 file_name=$( basename " $result_file " )
146151 cp " $result_file " " $GITHUB_WORKSPACE /${file_name} "
0 commit comments