Skip to content

Commit 784f17c

Browse files
committed
emulator: fail-fast on provision errors, diagnose smoke test failures
Provisioning used to silently wait out the full 6000s timeout on any guest-side failure, because the cleanup trap only logged the error. Now the trap writes STACK_CLOUD_INIT_FAILED and shuts the VM down, and the host-side waiter breaks on that marker and reports the failure distinctly. Also: bump the smoke-test timeout from 120s to 300s; dump docker ps, container logs, free -m, and verbose curl output when the smoke test fails; log the QEMU accel path; and enable /dev/kvm on the CI runner so the VM isn't stuck in TCG.
1 parent cd087c5 commit 784f17c

3 files changed

Lines changed: 48 additions & 8 deletions

File tree

.github/workflows/qemu-emulator-build.yaml

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,20 @@ jobs:
4747
- name: Install QEMU dependencies
4848
run: |
4949
sudo apt-get update
50-
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-utils genisoimage socat qemu-efi-aarch64
50+
sudo apt-get install -y qemu-system-x86 qemu-system-arm qemu-kvm qemu-utils genisoimage socat qemu-efi-aarch64
51+
52+
- name: Enable KVM access
53+
run: |
54+
echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' \
55+
| sudo tee /etc/udev/rules.d/99-kvm4all.rules
56+
sudo udevadm control --reload-rules
57+
sudo udevadm trigger --name-match=kvm || true
58+
ls -la /dev/kvm || echo "no /dev/kvm present"
59+
if [ -w /dev/kvm ]; then
60+
echo "KVM is writable — hardware acceleration will be used"
61+
else
62+
echo "WARNING: /dev/kvm is not writable — will fall back to TCG (very slow)"
63+
fi
5164
5265
- name: Build QEMU image
5366
run: |

docker/local-emulator/qemu/build-image.sh

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -237,6 +237,7 @@ build_one() {
237237
local qemu_base pid elapsed total_build_lines
238238
local last_build_lines=0
239239
local guest_exited=false
240+
local guest_failed=false
240241
local start_time=$SECONDS
241242

242243
cp "$base_img" "$tmp_img"
@@ -258,6 +259,7 @@ build_one() {
258259
: > "$serial_log"
259260
: > "$provision_log"
260261
qemu_base="$(qemu_cmd_prefix_for_arch "$arch")"
262+
log "QEMU command prefix (${arch}): $qemu_base"
261263

262264
# shellcheck disable=SC2086
263265
$qemu_base \
@@ -282,6 +284,11 @@ build_one() {
282284
break
283285
fi
284286

287+
if contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_FAILED"; then
288+
guest_failed=true
289+
break
290+
fi
291+
285292
if [ -f "$provision_log" ]; then
286293
total_build_lines="$(line_count "$provision_log")"
287294
if [ "$total_build_lines" -gt "$last_build_lines" ]; then
@@ -308,7 +315,9 @@ build_one() {
308315
echo ""
309316

310317
if ! contains_provision_marker "$provision_log" "$serial_log" "STACK_CLOUD_INIT_DONE"; then
311-
if [ "$guest_exited" = true ]; then
318+
if [ "$guest_failed" = true ]; then
319+
err "Guest provisioning reported failure for emulator (${arch})"
320+
elif [ "$guest_exited" = true ]; then
312321
err "Provisioning exited before completion for emulator (${arch})"
313322
else
314323
err "Provisioning timed out for emulator (${arch})"

docker/local-emulator/qemu/cloud-init/emulator/user-data

Lines changed: 24 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -273,7 +273,7 @@ write_files:
273273
-v stack-inbucket-data:/data/inbucket \
274274
-d stack-local-emulator-slim
275275

276-
smoke_timeout=120
276+
smoke_timeout=300
277277
smoke_elapsed=0
278278
smoke_passed=false
279279
while [ "$smoke_elapsed" -lt "$smoke_timeout" ]; do
@@ -286,13 +286,22 @@ write_files:
286286
smoke_elapsed=$((smoke_elapsed + 2))
287287
done
288288

289-
docker stop smoke-test 2>/dev/null || true
290-
sleep 2
291-
292289
if [ "$smoke_passed" = "false" ]; then
293290
log "SMOKE TEST FAILED: backend /health?db=1 did not return 200 within ${smoke_timeout}s"
291+
log "--- docker ps -a ---"
292+
docker ps -a 2>&1 | while IFS= read -r line; do log "ps: $line"; done || true
293+
log "--- smoke-test container logs (last 200 lines) ---"
294+
docker logs --tail 200 smoke-test 2>&1 | while IFS= read -r line; do log "smoke-test: $line"; done || true
295+
log "--- free -m ---"
296+
free -m 2>&1 | while IFS= read -r line; do log "mem: $line"; done || true
297+
log "--- curl -v /health?db=1 ---"
298+
curl -v --max-time 5 http://127.0.0.1:8102/health?db=1 2>&1 | while IFS= read -r line; do log "curl: $line"; done || true
299+
docker stop smoke-test 2>/dev/null || true
294300
exit 1
295301
fi
302+
303+
docker stop smoke-test 2>/dev/null || true
304+
sleep 2
296305
log "Smoke test passed (${smoke_elapsed}s)."
297306

298307
log "Flattening image (docker export/import)..."
@@ -363,8 +372,17 @@ write_files:
363372

364373
cleanup() {
365374
local status=$?
366-
if [ "$status" -ne 0 ] && [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
367-
printf 'ERROR: provision-build exited with code %s\n' "$status" >> "$STACK_PROVISION_LOG_FILE"
375+
if [ "$status" -ne 0 ]; then
376+
if [ -n "${STACK_PROVISION_LOG_FILE:-}" ]; then
377+
printf 'ERROR: provision-build exited with code %s\n' "$status" >> "$STACK_PROVISION_LOG_FILE"
378+
printf '%s\n' "STACK_CLOUD_INIT_FAILED" >> "$STACK_PROVISION_LOG_FILE"
379+
fi
380+
for dev in /dev/console /dev/ttyAMA0 /dev/ttyS0; do
381+
echo "STACK_CLOUD_INIT_FAILED" > "$dev" 2>/dev/null || true
382+
done
383+
sync || true
384+
(sleep 2 && shutdown -P now) &
385+
(sleep 15 && poweroff -f) &
368386
fi
369387
}
370388
trap cleanup EXIT

0 commit comments

Comments
 (0)