|
1 | 1 | import faulthandler |
| 2 | +import os |
| 3 | +import threading |
2 | 4 |
|
3 | 5 | import pytest |
4 | 6 |
|
| 7 | +DEFAULT_TIMEOUT = 300 # 5 minutes |
5 | 8 |
|
6 | | -def pytest_runtest_setup(item: pytest.Item) -> None: |
7 | | - """Set faulthandler alarm as a backup timeout mechanism. |
| 9 | +_timeout_timer: threading.Timer | None = None |
8 | 10 |
|
9 | | - This fires even if the process is stuck in C code (e.g., gRPC core). |
10 | | - Set to pytest-timeout value + 30s so pytest-timeout handles it first. |
11 | 11 |
|
12 | | - Uses exit=False to avoid killing xdist worker processes — a killed worker |
13 | | - causes 'node down: Not properly terminated' and loses the stack trace output. |
14 | | - With exit=False, faulthandler dumps tracebacks to stderr (relayed by xdist) |
15 | | - without terminating the process, letting pytest-timeout handle the interruption. |
16 | | - """ |
def _get_timeout(item: pytest.Item) -> float:
    """Return the watchdog timeout, in seconds, that applies to *item*.

    The value comes from the closest ``timeout`` marker on the item. Both
    spellings are honoured: ``@pytest.mark.timeout(60)`` (first positional
    argument) and ``@pytest.mark.timeout(timeout=60)`` (keyword argument).
    When no marker supplies a value, ``DEFAULT_TIMEOUT`` is used.
    """
    marker = item.get_closest_marker("timeout")
    if marker is not None:
        if marker.args:
            return float(marker.args[0])
        # The keyword spelling would otherwise be ignored and the test would
        # silently run under the default timeout.
        keyword_value = marker.kwargs.get("timeout")
        if keyword_value is not None:
            return float(keyword_value)
    return float(DEFAULT_TIMEOUT)
22 | 17 |
|
23 | | - if test_timeout and float(test_timeout) > 0: |
24 | | - faulthandler.dump_traceback_later(float(test_timeout) + 30, exit=False) |
| 18 | + |
def pytest_runtest_setup(item: pytest.Item) -> None:
    """Start a watchdog timer that dumps all thread stack traces on timeout.

    Unlike pytest-timeout, this does NOT raise KeyboardInterrupt (which crashes
    xdist workers and corrupts asyncio event loops). Instead it:
      1. Writes the test name + all thread tracebacks directly to fd 2 (stderr).
         With --capture=sys in pytest.ini, fd 2 is the real stderr (not captured),
         so the output goes directly to the CI log even under xdist.
      2. Calls os._exit(1) to terminate the worker process.

    xdist will report 'node down: Not properly terminated' which is expected —
    the diagnostic output will already be in the CI logs above that message.

    A timeout of zero or less disables the watchdog for this test.
    """
    global _timeout_timer

    # Disarm any watchdog left over from an earlier item before replacing the
    # reference; otherwise it could no longer be cancelled and would kill the
    # process while an unrelated test is running.
    if _timeout_timer is not None:
        _timeout_timer.cancel()
        _timeout_timer = None

    timeout = _get_timeout(item)
    if timeout <= 0:
        return

    def _on_timeout() -> None:
        banner = "=" * 70
        header = f"\n\n{banner}\nTIMEOUT: {item.nodeid} exceeded {timeout}s\n{banner}\n\n"
        try:
            os.write(2, header.encode())
            # faulthandler accepts a raw file descriptor, so write straight to
            # fd 2 instead of wrapping a dup of it in a Python file object.
            faulthandler.dump_traceback(file=2, all_threads=True)
            os.write(2, f"\n{banner}\n\n".encode())
        finally:
            # Terminate even if emitting diagnostics failed; otherwise the
            # timer thread would die and the hung test would never be killed.
            os._exit(1)

    _timeout_timer = threading.Timer(timeout, _on_timeout)
    # Daemon thread: a pending watchdog must not keep the interpreter alive.
    _timeout_timer.daemon = True
    _timeout_timer.start()
25 | 52 |
|
26 | 53 |
|
def pytest_runtest_teardown(item: pytest.Item, nextitem: pytest.Item | None) -> None:
    """Disarm the per-test watchdog timer once the test has finished."""
    global _timeout_timer
    active_timer = _timeout_timer
    if active_timer is None:
        # No watchdog was armed for this test (e.g. non-positive timeout).
        return
    active_timer.cancel()
    _timeout_timer = None
0 commit comments