Skip to content

Commit 53d8082

Browse files
georgeh0claude
andauthored
fix: make sure daemons are brought down cleanly (#84)
* fix: make sure daemons are brought down cleanly * chore: be more verbose in pre-commit workflow * fix: split CI workflow to avoid prek output capture deadlock on Windows prek --verbose captures subprocess stdout via pipe. On Windows, the 4KB pipe buffer fills when pytest produces verbose output, causing a deadlock (pytest blocks writing, prek blocks waiting for exit). Split into two steps: prek --skip pytest for lint checks, and pytest running directly for streaming output. Also: - Add OSError catch in _pid_alive for Windows edge cases - Properly shut down daemon thread in test_daemon.py session fixture Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: catch SystemError from os.kill on Windows On Windows, os.kill(pid, 0) raises SystemError when the target process has exited but its handle state is in transition (WinError 87). This corrupts CPython C exception state, causing subsequent built-in calls like time.monotonic() to also raise SystemError. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: avoid os.kill on Windows due to CPython C exception state corruption os.kill(pid, 0) on Windows corrupts CPython C-level exception state even after the raised OSError is caught. This causes subsequent calls to C built-ins (time.monotonic, time.sleep) to raise SystemError. Use ctypes OpenProcess instead, which is the proper Win32 API for checking process existence. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b8ed86f commit 53d8082

5 files changed

Lines changed: 142 additions & 56 deletions

File tree

.github/workflows/pre-commit.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,8 @@ jobs:
2929
- name: Install dependencies
3030
run: uv sync --python ${{ matrix.python-version }} --python-preference only-managed
3131

32-
- name: Run prek
33-
run: uv run prek run --all-files
32+
- name: Lint & format (prek)
33+
run: uv run prek run --all-files --verbose --show-diff-on-failure --skip pytest
34+
35+
- name: Test (pytest)
36+
run: uv run pytest tests/ -v

src/cocoindex_code/cli.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -474,26 +474,26 @@ def daemon_restart() -> None:
474474
@daemon_app.command("stop")
475475
def daemon_stop() -> None:
476476
"""Stop the daemon."""
477-
from .client import stop_daemon
477+
from .client import is_daemon_running, stop_daemon
478478
from .daemon import daemon_pid_path
479479

480480
pid_path = daemon_pid_path()
481-
if not pid_path.exists():
481+
if not pid_path.exists() and not is_daemon_running():
482482
_typer.echo("Daemon is not running.")
483483
return
484484

485485
stop_daemon()
486486

487-
# Wait for process to exit
487+
# Wait for process to exit (check both pid file and socket)
488488
import time
489489

490490
deadline = time.monotonic() + 5.0
491491
while time.monotonic() < deadline:
492-
if not pid_path.exists():
492+
if not pid_path.exists() and not is_daemon_running():
493493
break
494494
time.sleep(0.1)
495495

496-
if pid_path.exists():
496+
if pid_path.exists() or is_daemon_running():
497497
_typer.echo("Warning: daemon may not have stopped cleanly.", err=True)
498498
else:
499499
_typer.echo("Daemon stopped.")

src/cocoindex_code/client.py

Lines changed: 107 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -183,13 +183,26 @@ def start_daemon() -> None:
183183
cmd = [sys.executable, "-m", "cocoindex_code.cli", "run-daemon"]
184184

185185
log_fd = open(log_path, "a")
186-
subprocess.Popen(
187-
cmd,
188-
start_new_session=True,
189-
stdout=log_fd,
190-
stderr=log_fd,
191-
stdin=subprocess.DEVNULL,
192-
)
186+
if sys.platform == "win32":
187+
# DETACHED_PROCESS fully detaches the daemon from the parent console,
188+
# preventing its exit code from leaking back to the calling shell.
189+
_create_new_process_group = 0x00000200
190+
_detached_process = 0x00000008
191+
subprocess.Popen(
192+
cmd,
193+
stdout=log_fd,
194+
stderr=log_fd,
195+
stdin=subprocess.DEVNULL,
196+
creationflags=_create_new_process_group | _detached_process,
197+
)
198+
else:
199+
subprocess.Popen(
200+
cmd,
201+
start_new_session=True,
202+
stdout=log_fd,
203+
stderr=log_fd,
204+
stdin=subprocess.DEVNULL,
205+
)
193206
log_fd.close()
194207

195208

@@ -205,12 +218,49 @@ def _find_ccc_executable() -> str | None:
205218
return None
206219

207220

221+
def _pid_alive(pid: int) -> bool:
222+
"""Return True if *pid* is still running."""
223+
if sys.platform == "win32":
224+
# Avoid os.kill(pid, 0) on Windows — it has a CPython bug that corrupts
225+
# the C-level exception state, causing subsequent C function calls
226+
# (time.monotonic, time.sleep) to raise SystemError even after the
227+
# OSError is caught. Use OpenProcess via ctypes instead.
228+
import ctypes
229+
230+
kernel32 = getattr(ctypes, "windll").kernel32
231+
handle = kernel32.OpenProcess(0x1000, False, pid) # PROCESS_QUERY_LIMITED_INFORMATION
232+
if handle:
233+
kernel32.CloseHandle(handle)
234+
return True
235+
return False
236+
try:
237+
os.kill(pid, 0) # signal 0: check existence without killing
238+
return True
239+
except ProcessLookupError:
240+
return False
241+
except PermissionError:
242+
return True # process exists but we can't signal it
243+
244+
208245
def stop_daemon() -> None:
209246
"""Stop the daemon gracefully.
210247
211-
Sends a StopRequest, waits for the process to exit, falls back to SIGTERM.
248+
Sends a StopRequest, waits for the process to exit, falls back to
249+
SIGTERM → SIGKILL. Only removes the PID file after confirming that
250+
the specific PID is no longer alive.
212251
"""
213-
# Step 1: try sending StopRequest
252+
pid_path = daemon_pid_path()
253+
254+
# Read the PID early so we can track the actual process.
255+
pid: int | None = None
256+
try:
257+
pid = int(pid_path.read_text().strip())
258+
if pid == os.getpid():
259+
pid = None # safety: never kill ourselves
260+
except (FileNotFoundError, ValueError):
261+
pass
262+
263+
# Step 1: try sending StopRequest via socket
214264
try:
215265
client = DaemonClient.connect()
216266
client.handshake()
@@ -220,65 +270,78 @@ def stop_daemon() -> None:
220270
pass
221271

222272
# Step 2: wait for process to exit (up to 5s)
223-
pid_path = daemon_pid_path()
224-
deadline = time.monotonic() + 5.0
225-
while time.monotonic() < deadline and pid_path.exists():
226-
time.sleep(0.1)
227-
228-
if not pid_path.exists():
229-
return # Clean exit
273+
if pid is not None:
274+
deadline = time.monotonic() + 5.0
275+
while time.monotonic() < deadline and _pid_alive(pid):
276+
time.sleep(0.1)
277+
if not _pid_alive(pid):
278+
_cleanup_stale_files(pid_path, pid)
279+
return
230280

231281
# Step 3: if still running, try SIGTERM
232-
pid: int | None = None
233-
if pid_path.exists():
282+
if pid is not None and _pid_alive(pid):
234283
try:
235-
pid = int(pid_path.read_text().strip())
236-
if pid != os.getpid():
237-
os.kill(pid, signal.SIGTERM)
238-
else:
239-
pid = None
240-
except (ValueError, ProcessLookupError, PermissionError):
284+
os.kill(pid, signal.SIGTERM)
285+
except (ProcessLookupError, PermissionError):
241286
pass
242287

243-
# Wait a bit more
244288
deadline = time.monotonic() + 2.0
245-
while time.monotonic() < deadline and pid_path.exists():
289+
while time.monotonic() < deadline and _pid_alive(pid):
246290
time.sleep(0.1)
247291

248-
# Step 4: if still running, escalate to SIGKILL (Unix only;
292+
if not _pid_alive(pid):
293+
_cleanup_stale_files(pid_path, pid)
294+
return
295+
296+
# Step 4: escalate to SIGKILL (Unix only;
249297
# on Windows SIGTERM already calls TerminateProcess)
250-
if sys.platform != "win32" and pid_path.exists():
298+
if sys.platform != "win32" and pid is not None and _pid_alive(pid):
251299
try:
252-
pid = int(pid_path.read_text().strip())
253-
if pid != os.getpid():
254-
os.kill(pid, signal.SIGKILL)
255-
else:
256-
pid = None
257-
except (ValueError, ProcessLookupError, PermissionError):
300+
os.kill(pid, signal.SIGKILL)
301+
except (ProcessLookupError, PermissionError):
258302
pass
259303

304+
# SIGKILL is async; give the kernel a moment to reap
305+
deadline = time.monotonic() + 1.0
306+
while time.monotonic() < deadline and _pid_alive(pid):
307+
time.sleep(0.1)
308+
260309
# Step 4b: on Windows, wait for the process to fully exit after TerminateProcess
261310
# so that named pipe handles are released before starting a new daemon.
262311
if sys.platform == "win32" and pid is not None:
263312
deadline = time.monotonic() + 3.0
264-
while time.monotonic() < deadline:
265-
try:
266-
os.kill(pid, 0) # Check if process still exists
267-
time.sleep(0.1)
268-
except (ProcessLookupError, PermissionError, OSError):
269-
break # Process has exited
313+
while time.monotonic() < deadline and _pid_alive(pid):
314+
time.sleep(0.1)
270315

271316
# Step 5: clean up stale files
317+
_cleanup_stale_files(pid_path, pid)
318+
319+
320+
def _cleanup_stale_files(pid_path: Path, pid: int | None) -> None:
321+
"""Remove socket and PID file after the daemon has exited.
322+
323+
Only removes the PID file when *pid* matches what is on disk, to
324+
avoid accidentally deleting a newer daemon's PID file.
325+
"""
272326
if sys.platform != "win32":
273327
sock = daemon_socket_path()
274328
try:
275329
Path(sock).unlink(missing_ok=True)
276330
except Exception:
277331
pass
278-
try:
279-
pid_path.unlink(missing_ok=True)
280-
except Exception:
281-
pass
332+
if pid is not None:
333+
try:
334+
stored = pid_path.read_text().strip()
335+
if stored == str(pid):
336+
pid_path.unlink(missing_ok=True)
337+
except (FileNotFoundError, ValueError):
338+
pass
339+
else:
340+
# No PID known — cautiously remove if file exists
341+
try:
342+
pid_path.unlink(missing_ok=True)
343+
except Exception:
344+
pass
282345

283346

284347
def _wait_for_daemon(timeout: float = 30.0) -> None:

src/cocoindex_code/daemon.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -454,17 +454,24 @@ def run_daemon() -> None:
454454
try:
455455
asyncio.run(_async_daemon_main(embedder, settings_mtime_us))
456456
finally:
457-
# Clean up PID file and socket (named pipes on Windows clean up automatically)
458-
try:
459-
pid_path.unlink(missing_ok=True)
460-
except Exception:
461-
pass
457+
# Clean up socket first, then PID file last.
458+
# The PID file is the authoritative "daemon is alive" indicator, so it
459+
# must be the very last thing removed to avoid races where a client
460+
# sees the PID gone but the socket (or process) is still lingering.
462461
if sys.platform != "win32":
463462
sock = daemon_socket_path()
464463
try:
465464
Path(sock).unlink(missing_ok=True)
466465
except Exception:
467466
pass
467+
# Only remove the PID file if it still contains *our* PID.
468+
# A new daemon may have already overwritten it during a restart race.
469+
try:
470+
stored = pid_path.read_text().strip()
471+
if stored == str(os.getpid()):
472+
pid_path.unlink(missing_ok=True)
473+
except Exception:
474+
pass
468475
logger.info("Daemon stopped")
469476

470477

tests/test_daemon.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
RemoveProjectRequest,
3030
Response,
3131
SearchRequest,
32+
StopRequest,
3233
decode_response,
3334
encode_request,
3435
)
@@ -92,6 +93,18 @@ def daemon_sock() -> Iterator[str]:
9293

9394
yield sock_path
9495

96+
# Gracefully shut down the daemon thread so named pipes are released on Windows
97+
try:
98+
conn = Client(sock_path, family=_connection_family())
99+
conn.send_bytes(encode_request(HandshakeRequest(version=__version__)))
100+
conn.recv_bytes()
101+
conn.send_bytes(encode_request(StopRequest()))
102+
conn.recv_bytes()
103+
conn.close()
104+
except Exception:
105+
pass
106+
thread.join(timeout=5)
107+
95108
# Restore patches and env var
96109
dm.create_embedder = _orig_create_embedder # type: ignore[attr-defined]
97110
if old_env is None:

0 commit comments

Comments
 (0)