Add test to verify behaviour with xdist, change behaviour to match test

tsmith023 · tsmith023 · commit 1630b0b1c519 · 2026-04-16T17:31:09.000+01:00
diff --git a/conftest.py b/conftest.py
@@ -1,29 +1,59 @@
 import faulthandler
+import os
+import threading
 
 import pytest
 
+DEFAULT_TIMEOUT = 300  # 5 minutes
 
-def pytest_runtest_setup(item: pytest.Item) -> None:
-    """Set faulthandler alarm as a backup timeout mechanism.
+_timeout_timer: threading.Timer | None = None
 
-    This fires even if the process is stuck in C code (e.g., gRPC core).
-    Set to pytest-timeout value + 30s so pytest-timeout handles it first.
 
-    Uses exit=False to avoid killing xdist worker processes — a killed worker
-    causes 'node down: Not properly terminated' and loses the stack trace output.
-    With exit=False, faulthandler dumps tracebacks to stderr (relayed by xdist)
-    without terminating the process, letting pytest-timeout handle the interruption.
-    """
+def _get_timeout(item: pytest.Item) -> float:
     marker = item.get_closest_marker("timeout")
     if marker and marker.args:
-        test_timeout = marker.args[0]
-    else:
-        test_timeout = item.config.getini("timeout") or 300
+        return float(marker.args[0])
+    return float(DEFAULT_TIMEOUT)
 
-    if test_timeout and float(test_timeout) > 0:
-        faulthandler.dump_traceback_later(float(test_timeout) + 30, exit=False)
+
+def pytest_runtest_setup(item: pytest.Item) -> None:
+    """Start a watchdog timer that dumps all thread stack traces on timeout.
+
+    Unlike pytest-timeout, this does NOT raise KeyboardInterrupt (which crashes
+    xdist workers and corrupts asyncio event loops). Instead it:
+    1. Writes the test name + all thread tracebacks directly to fd 2 (stderr).
+       With --capture=sys in pytest.ini, fd 2 is the real stderr (not captured),
+       so the output goes directly to the CI log even under xdist.
+    2. Calls os._exit(1) to terminate the worker, even if step 1 itself fails.
+
+    xdist will report 'node down: Not properly terminated' which is expected —
+    the diagnostic output will already be in the CI logs above that message.
+    """
+    global _timeout_timer
+    timeout = _get_timeout(item)
+    if timeout <= 0:
+        return
+
+    def _on_timeout() -> None:
+        banner = "=" * 70
+        try:
+            os.write(2, f"\n\n{banner}\n".encode())
+            os.write(2, f"TIMEOUT: {item.nodeid} exceeded {timeout}s\n".encode())
+            os.write(2, f"{banner}\n\n".encode())
+            # faulthandler accepts a raw fd, so no wrapper file object is needed
+            faulthandler.dump_traceback(file=2)
+            os.write(2, f"\n{banner}\n\n".encode())
+        finally:
+            os._exit(1)  # must run even if writing diagnostics raised
+
+    _timeout_timer = threading.Timer(timeout, _on_timeout)
+    _timeout_timer.daemon = True
+    _timeout_timer.start()
 
 
 def pytest_runtest_teardown(item: pytest.Item, nextitem: pytest.Item | None) -> None:
-    """Cancel the faulthandler alarm after each test."""
-    faulthandler.cancel_dump_traceback_later()
+    """Cancel the watchdog timer after each test completes."""
+    global _timeout_timer
+    if _timeout_timer is not None:
+        _timeout_timer.cancel()
+        _timeout_timer = None
diff --git a/pytest.ini b/pytest.ini
@@ -1,5 +1,6 @@
 [pytest]
-addopts = -m 'not profiling' --benchmark-skip -l --timeout=300 --timeout_method=signal
+addopts = -m 'not profiling' --benchmark-skip -l --capture=sys --max-worker-restart=3
 markers =
     profiling: marks tests that can be profiled
+    timeout: marks tests with a custom timeout in seconds (default: 300)
 asyncio_default_fixture_loop_scope = function
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -3,7 +3,6 @@ pytest-cov==6.2.1
 pytest-asyncio==1.3.0
 pytest-benchmark==5.1.0
 pytest-profiling==1.8.1
-pytest-timeout==2.3.1
 coverage==7.10.7
 pytest-xdist==3.7.0
 werkzeug==3.1.6
diff --git a/test/test_timeout.py b/test/test_timeout.py
@@ -0,0 +1,90 @@
+"""Tests for the custom per-test timeout mechanism in conftest.py.
+
+Uses subprocess because the timeout mechanism calls os._exit(1).
+"""
+
+import subprocess
+import sys
+import textwrap
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).parent.parent
+
+
+def _run_pytest(tmp_path: Path, test_code: str, *extra_args: str) -> subprocess.CompletedProcess:
+    """Run pytest under xdist in a subprocess with a copy of our timeout conftest.
+
+    ``test_code`` is dedented and written to ``test_it.py`` in ``tmp_path``, which
+    is also the working directory; ``extra_args`` are appended to the command.
+    Returns the completed process with stdout/stderr captured as text, so callers
+    can assert on ``returncode`` and ``stderr``.
+    """
+    # Copy bytes, not text: the conftest holds non-ASCII characters, so a
+    # read_text/write_text round trip would depend on the locale encoding.
+    (tmp_path / "conftest.py").write_bytes((PROJECT_ROOT / "conftest.py").read_bytes())
+    # --max-worker-restart=0: do not replace the worker the watchdog kills.
+    (tmp_path / "pytest.ini").write_text(
+        "[pytest]\naddopts = --capture=sys --max-worker-restart=0\nmarkers =\n    timeout: custom timeout\n"
+    )
+    (tmp_path / "test_it.py").write_text(textwrap.dedent(test_code))
+    xdist_args = ["-n", "auto", "--dist", "loadgroup"]
+    # Run out of process: the watchdog ends its worker via os._exit(1).
+    return subprocess.run(
+        [sys.executable, "-m", "pytest", "-v", *xdist_args, "test_it.py", *extra_args],
+        capture_output=True,
+        text=True,
+        timeout=60,
+        cwd=str(tmp_path),
+    )
+
+
+def test_timeout_prints_test_name_and_stacktrace(tmp_path: Path) -> None:
+    """A hanging test is killed; its node id and a thread dump reach stderr."""
+    test_code = """\
+        import time
+        import pytest
+
+        @pytest.mark.timeout(2)
+        def test_hangs():
+            time.sleep(999)
+        """
+    result = _run_pytest(tmp_path, test_code)
+    assert result.returncode != 0
+    assert "TIMEOUT: test_it.py::test_hangs exceeded 2.0s" in result.stderr
+    # The banner above already contains "test_hangs"; check the faulthandler dump itself.
+    assert "most recent call first" in result.stderr
+
+
+def test_fast_test_not_killed(tmp_path: Path) -> None:
+    """A test finishing well inside its timeout passes with no watchdog output."""
+    quick_source = """\
+        import pytest
+
+        @pytest.mark.timeout(10)
+        def test_fast():
+            assert True
+        """
+    completed = _run_pytest(tmp_path, quick_source)
+    # Exit code 0 means no worker was killed; stderr must carry no TIMEOUT banner.
+    assert completed.returncode == 0
+    assert "TIMEOUT" not in completed.stderr
+
+
+def test_timeout_with_passing_and_hanging_test(tmp_path: Path) -> None:
+    """A hanging test fails the run with diagnostics even beside a passing test."""
+    test_code = """\
+        import time
+        import pytest
+
+        @pytest.mark.timeout(2)
+        def test_hangs_in_worker():
+            time.sleep(999)
+
+        def test_passes():
+            assert True
+        """
+    result = _run_pytest(tmp_path, test_code)
+    assert result.returncode != 0
+    assert "TIMEOUT: test_it.py::test_hangs_in_worker exceeded 2.0s" in result.stderr
+    # The banner above already names the test; check the faulthandler dump itself.
+    assert "most recent call first" in result.stderr