Skip to content

Commit 4dc7323

Browse files
authored
test: deflake subprocess kill test (#9423)
The async reap test patched asyncio.sleep to a no-op to skip the reaper's own waits, but that left no time for the kernel to tear down the SIGKILL'd child before the test asserted it was dead. Replace the bare liveness assertions with a bounded poll.
1 parent ddcf720 commit 4dc7323

1 file changed

Lines changed: 23 additions & 10 deletions

File tree

tests/_utils/test_subprocess.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,19 @@ def _is_alive(pid: int) -> bool:
107107
return True
108108

109109

110+
def _wait_until_dead(pid: int, timeout_s: float = 5.0) -> None:
111+
"""Poll until the kernel has reaped `pid`, then assert.
112+
113+
Tolerates the latency between SIGKILL delivery and the kernel actually
114+
tearing the process down — a bare `assert not _is_alive(pid)` immediately
115+
after a kill is racy.
116+
"""
117+
deadline = time.monotonic() + timeout_s
118+
while time.monotonic() < deadline and _is_alive(pid):
119+
time.sleep(0.05)
120+
assert not _is_alive(pid)
121+
122+
110123
@pytest.mark.skipif(sys.platform == "win32", reason="POSIX only")
111124
def test_try_kill_process_and_group_kills_pgroup_spares_new_session() -> None:
112125
"""Covers the shutdown path used by KernelManagerImpl, IPCKernelManagerImpl,
@@ -129,11 +142,7 @@ def test_try_kill_process_and_group_kills_pgroup_spares_new_session() -> None:
129142
# wait() reaps the child so it doesn't linger as a zombie.
130143
assert leader.wait(timeout=5) is not None
131144

132-
deadline = time.monotonic() + 5.0
133-
while time.monotonic() < deadline and _is_alive(child_pg_pid):
134-
time.sleep(0.05)
135-
136-
assert not _is_alive(child_pg_pid)
145+
_wait_until_dead(child_pg_pid)
137146
assert _is_alive(child_newpg_pid)
138147
finally:
139148
for pid in (leader.pid, child_pg_pid, child_newpg_pid):
@@ -153,13 +162,17 @@ async def test_try_kill_process_and_group_sigkills_stubborn_child(
153162
"""try_kill_process_and_group schedules a reap task that escalates to
154163
SIGKILL when the child ignores SIGTERM.
155164
"""
156-
# Skip the reaper's real 5s/1s waits so the test is fast.
165+
# Cap the reaper's real 5s/1s waits to 50ms so the test is fast, but
166+
# leave enough wall-clock time for the kernel to deliver SIGKILL and
167+
# for the reaper's own `poll()` to reap the zombie. Zeroing the delay
168+
# entirely lets the reaper exit before the child is reaped, after which
169+
# `_is_alive` (which sees zombies as alive) would spin to timeout.
157170
real_sleep = asyncio.sleep
158171

159-
async def no_wait(_delay: float) -> None:
160-
await real_sleep(0)
172+
async def short_sleep(_delay: float) -> None:
173+
await real_sleep(0.05)
161174

162-
monkeypatch.setattr(asyncio, "sleep", no_wait)
175+
monkeypatch.setattr(asyncio, "sleep", short_sleep)
163176

164177
script = (
165178
"import signal, time\n"
@@ -174,7 +187,7 @@ async def no_wait(_delay: float) -> None:
174187
try_kill_process_and_group(cast(ProcessLike, proc))
175188
# Drain the reap task(s) scheduled on the current loop.
176189
await asyncio.gather(*list(_REAP_TASKS), return_exceptions=True)
177-
assert not _is_alive(proc.pid)
190+
_wait_until_dead(proc.pid)
178191
finally:
179192
with contextlib.suppress(ProcessLookupError):
180193
os.killpg(pgid, signal.SIGKILL)

0 commit comments

Comments
 (0)