Skip to content

Commit e92110f

Browse files
committed
ui-smoke: fail fast and explain when the GUI process dies
After a GUI crash the driver kept polling NML for up to 60s and then blamed whatever stage happened to time out (e.g. homing), because a dead task keeps serving its last stat buffer. Watch the launcher PID and fail in ~1s with a message pointing at the crash. Enable PYTHONFAULTHANDLER to get a Python traceback on fatal signals.
1 parent 0ad17aa commit e92110f

2 files changed

Lines changed: 89 additions & 19 deletions

File tree

tests/ui-smoke/_lib/drive.py

Lines changed: 84 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import argparse
1515
import linuxcnc
16+
import os
1617
import sys
1718
import time
1819

@@ -32,6 +33,55 @@
3233
STATE_STABILITY_S = 0.5
3334
STATE_RETRY_BUDGET = 6
3435

36+
# linuxcnc launcher PID, written to linuxcnc.pid by the launcher and read
37+
# once at startup. The driver watches it so a GUI crash, which tears
38+
# linuxcnc down, fails the test in ~1s with a clear message instead of
39+
# waiting out a long NML poll. A dead task keeps serving its last stat
40+
# buffer, so process liveness is the only reliable crash signal.
41+
# None means the launcher PID is unknown; _watchdog() then skips the
# PID liveness check entirely.
_WATCH_PID = None
42+
43+
44+
class LauncherGone(Exception):
    """Signal that linuxcnc exited underneath the driver, either
    because the GUI crashed or because task died."""
46+
47+
48+
def _read_pid(path):
    """Return the PID stored in the file at *path*, or None if unusable.

    None is returned when the file cannot be opened or read, when its
    content is not an integer, or when the integer is not a positive
    PID.

    path: location of the PID file (e.g. "linuxcnc.pid").
    """
    try:
        with open(path) as f:
            pid = int(f.read().strip())
    except (OSError, ValueError):
        return None
    # os.kill() interprets pid 0 and negative pids as process-group
    # selectors, so such a value could never report the launcher as
    # gone; treat it the same as an unreadable file.
    return pid if pid > 0 else None
54+
55+
56+
# Crash markers faulthandler and scripts/linuxcnc write to linuxcnc.err
57+
# the instant the GUI dies. The launcher PID can linger in Cleanup, so
58+
# scanning these catches the crash sooner and regardless of which GUI.
59+
_CRASH_MARKERS = ("Fatal Python error", "Segmentation fault", "Aborted")


def _crash_marker_seen():
    """Return True if linuxcnc.err contains any of _CRASH_MARKERS.

    The log is looked up relative to the current working directory. A
    log that is missing or cannot be read counts as "no crash seen" and
    yields False.
    """
    try:
        # errors="replace": undecodable bytes in the log must not abort
        # the scan with a UnicodeDecodeError.
        with open("linuxcnc.err", errors="replace") as f:
            # Read exactly once. A second f.read() on the same handle
            # returns "" and would silently skip every marker after the
            # first one.
            log = f.read()
    except OSError:
        return False
    return any(marker in log for marker in _CRASH_MARKERS)
68+
69+
70+
def _watchdog():
    """Raise LauncherGone once the GUI is known to have crashed.

    Two signals are checked in order: the launcher PID no longer
    exists, or linuxcnc.err contains a crash marker. An unknown PID and
    a missing log both count as "still alive", so a file that has not
    been written yet cannot false-fail the test.
    """
    watched = _WATCH_PID
    if watched is not None:
        try:
            # Signal 0 delivers nothing; it only probes for existence.
            os.kill(watched, 0)
        except PermissionError:
            # The process exists but is not ours to signal: alive.
            pass
        except ProcessLookupError:
            raise LauncherGone()
    if _crash_marker_seen():
        raise LauncherGone()
84+
3585

3686
def connect_and_wait_ready(timeout):
3787
"""Wait until linuxcnc.stat().poll() returns without error and
@@ -47,6 +97,7 @@ def connect_and_wait_ready(timeout):
4797
deadline = time.monotonic() + timeout
4898
last_err = None
4999
while time.monotonic() < deadline:
100+
_watchdog()
50101
try:
51102
stat = linuxcnc.stat()
52103
stat.poll()
@@ -70,6 +121,7 @@ def wait_until_quiet(stat, predicate, timeout):
70121
must not happen."""
71122
deadline = time.monotonic() + timeout
72123
while time.monotonic() < deadline:
124+
_watchdog()
73125
stat.poll()
74126
if predicate(stat):
75127
return True
@@ -195,6 +247,7 @@ def wait_program_started(stat, timeout):
195247
IDLE; we then read stat.position at (0,0,0)."""
196248
deadline = time.monotonic() + timeout
197249
while time.monotonic() < deadline:
250+
_watchdog()
198251
stat.poll()
199252
if stat.interp_state != linuxcnc.INTERP_IDLE:
200253
return True
@@ -214,6 +267,7 @@ def wait_program_idle(stat, timeout):
214267
deadline = time.monotonic() + timeout
215268
consecutive = 0
216269
while time.monotonic() < deadline:
270+
_watchdog()
217271
stat.poll()
218272
idle = (
219273
stat.interp_state == linuxcnc.INTERP_IDLE
@@ -311,30 +365,41 @@ def main():
311365
if args.run_program and args.expect_delta_mm is None:
312366
ap.error("--run-program requires --expect-delta-mm DX,DY,DZ")
313367

314-
cmd, stat = connect_and_wait_ready(CONNECT_TIMEOUT_S)
315-
if cmd is None:
316-
return 1
317-
318-
# Give the GUI process enough time to finish constructing itself
319-
# (load .ui files, compile resources.py if needed, etc.) and
320-
# settle. If the GUI was going to crash on startup it has crashed
321-
# by now.
322-
time.sleep(SETTLE_S)
368+
global _WATCH_PID
369+
_WATCH_PID = _read_pid("linuxcnc.pid")
323370

324-
# Re-check task is still alive; a GUI crash may have torn linuxcnc
325-
# down via Cleanup.
326371
try:
327-
stat.poll()
328-
except linuxcnc.error as e:
329-
sys.stderr.write(f"UI_SMOKE_FAIL: task disappeared after GUI startup: {e}\n")
330-
return 1
372+
cmd, stat = connect_and_wait_ready(CONNECT_TIMEOUT_S)
373+
if cmd is None:
374+
return 1
331375

332-
if args.run_program:
333-
if not run_program(cmd, stat,
334-
args.run_program, args.expect_delta_mm,
335-
args.tol, args.run_timeout):
376+
# Give the GUI process enough time to finish constructing itself
377+
# (load .ui files, compile resources.py if needed, etc.) and
378+
# settle. If the GUI was going to crash on startup it has crashed
379+
# by now.
380+
time.sleep(SETTLE_S)
381+
_watchdog()
382+
383+
# Re-check task is still alive; a GUI crash may have torn linuxcnc
384+
# down via Cleanup.
385+
try:
386+
stat.poll()
387+
except linuxcnc.error as e:
388+
sys.stderr.write(f"UI_SMOKE_FAIL: task disappeared after GUI startup: {e}\n")
336389
return 1
337390

391+
if args.run_program:
392+
if not run_program(cmd, stat,
393+
args.run_program, args.expect_delta_mm,
394+
args.tol, args.run_timeout):
395+
return 1
396+
except LauncherGone:
397+
sys.stderr.write(
398+
"UI_SMOKE_FAIL: linuxcnc exited before the driver finished; "
399+
"the GUI crashed or task died. See linuxcnc.out / linuxcnc.err "
400+
"above for the backtrace.\n")
401+
return 1
402+
338403
print("UI_SMOKE_OK")
339404
return 0
340405

tests/ui-smoke/_lib/launch-env.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,8 @@ export CANBERRA_DRIVER=null
2424
export GST_PLUGIN_FEATURE_RANK="pulsesink:NONE,alsasink:NONE,osssink:NONE,oss4sink:NONE,jackaudiosink:NONE,pipewiresink:NONE,openalsink:NONE"
2525
export PULSE_SERVER=/dev/null
2626
export SDL_AUDIODRIVER=dummy
27+
28+
# Dump a Python traceback on a fatal signal. For a pure-Python crash this
29+
# names the line; for a C/C++ crash (Qt, dbus, GL) it shows the Python
30+
# frame that called in. The native side is captured by crashdump.sh.
31+
export PYTHONFAULTHANDLER=1

0 commit comments

Comments
 (0)