Skip to content

Commit dfcbbce

Browse files
committed
fix: prevent zombie MCP processes via handshake timeout + deferred init
Adds a 30-second handshake timer that exits the process if no MCP client sends an initialize message within the timeout window. Defers CPU-heavy indexing and file-watcher setup to the oninitialized callback so a misfire consumes near-zero resources before exiting. Closes the recurring zombie process bug where Codex CLI spawns `npx codebase-context <path>` without a subcommand, starting the MCP server with no client to connect. The 4 previous guards (ppid poller, stdin close, server.onclose, SIGHUP) all failed because cmd.exe holds the pipe open while waiting for the child to produce output. The fix is tool-agnostic: it does not rely on who is calling it or how. Any invocation that does not result in an MCP initialize handshake within 30 seconds exits cleanly with code 1.
1 parent 66824f9 commit dfcbbce

File tree

3 files changed

+171
-16
lines changed

3 files changed

+171
-16
lines changed

src/eval/discovery-harness.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,9 @@ export async function evaluateDiscoveryFixture({
429429

430430
for (const task of fixture.tasks) {
431431
const runner = runners[task.surface];
432+
if (!runner) {
433+
throw new Error(`No runner registered for surface: ${task.surface}`);
434+
}
432435
const payload = await runner(task, rootPath);
433436
results.push(evaluateDiscoveryTask(task, payload));
434437
}

src/index.ts

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1748,8 +1748,7 @@ async function main() {
17481748
const transport = new StdioServerTransport();
17491749
await server.connect(transport);
17501750

1751-
// Register cleanup before any handler that calls process.exit(), so the
1752-
// exit listener is always in place when stdin/onclose/signals fire.
1751+
// ── Cleanup guards (normal MCP lifecycle) ──────────────────────────────────
17531752
const stopAllWatchers = () => {
17541753
for (const project of getAllProjects()) {
17551754
project.stopWatcher?.();
@@ -1770,27 +1769,58 @@ async function main() {
17701769
process.exit(0);
17711770
});
17721771

1773-
// Detect stdin pipe closure — the primary signal that the MCP client is gone.
1774-
// StdioServerTransport only listens for 'data'/'error', never 'end'.
17751772
process.stdin.on('end', () => process.exit(0));
17761773
process.stdin.on('close', () => process.exit(0));
1777-
1778-
// Handle graceful MCP protocol-level disconnect.
1779-
// Fires after SDK internal cleanup when transport.close() is called.
17801774
server.onclose = () => process.exit(0);
17811775

1782-
if (process.env.CODEBASE_CONTEXT_DEBUG) console.error('[DEBUG] Server ready');
1776+
// ── Zombie process prevention ──────────────────────────────────────────────
1777+
// If no MCP client sends an `initialize` message within 30 seconds, this
1778+
// process was started incorrectly (e.g. `npx codebase-context <path>` from
1779+
// a shell or AI agent without a subcommand). Exit cleanly to avoid a zombie.
1780+
const HANDSHAKE_TIMEOUT_MS =
1781+
Number.parseInt(process.env.CODEBASE_CONTEXT_HANDSHAKE_TIMEOUT_MS ?? '', 10) || 30_000;
1782+
let mcpClientInitialized = false;
1783+
1784+
const handshakeTimer = setTimeout(() => {
1785+
if (!mcpClientInitialized) {
1786+
console.error(
1787+
'No MCP client connected within ' +
1788+
Math.round(HANDSHAKE_TIMEOUT_MS / 1000) +
1789+
's - exiting.\n' +
1790+
'If you meant to use CLI commands:\n' +
1791+
' npx codebase-context memory list\n' +
1792+
' npx codebase-context search --query "..."\n' +
1793+
' npx codebase-context --help'
1794+
);
1795+
process.exit(1);
1796+
}
1797+
}, HANDSHAKE_TIMEOUT_MS);
1798+
handshakeTimer.unref();
17831799

1784-
await refreshKnownRootsFromClient();
1800+
// ── Deferred project initialization ────────────────────────────────────────
1801+
// Don't start CPU-heavy indexing or file-watchers until the MCP handshake
1802+
// completes. A misfire (no real client) consumes near-zero resources during
1803+
// the timeout window and exits cleanly.
1804+
server.oninitialized = async () => {
1805+
mcpClientInitialized = true;
1806+
clearTimeout(handshakeTimer);
17851807

1786-
// Keep the current single-project auto-select behavior when exactly one startup project is known.
1787-
const startupRoots = getKnownRootPaths();
1788-
if (startupRoots.length === 1) {
1789-
await initProject(startupRoots[0], watcherDebounceMs, { enableWatcher: true });
1790-
setActiveProject(startupRoots[0]);
1791-
}
1808+
if (process.env.CODEBASE_CONTEXT_DEBUG) console.error('[DEBUG] Server ready');
1809+
1810+
try {
1811+
await refreshKnownRootsFromClient();
1812+
1813+
const startupRoots = getKnownRootPaths();
1814+
if (startupRoots.length === 1) {
1815+
await initProject(startupRoots[0], watcherDebounceMs, { enableWatcher: true });
1816+
setActiveProject(startupRoots[0]);
1817+
}
1818+
} catch (error) {
1819+
console.error('[codebase-context] Project initialization failed:', error);
1820+
}
1821+
};
17921822

1793-
// Subscribe to root changes
1823+
// Subscribe to root changes (lightweight — no project init cost)
17941824
server.setNotificationHandler(RootsListChangedNotificationSchema, async () => {
17951825
try {
17961826
await refreshKnownRootsFromClient();

tests/zombie-guard.test.ts

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/**
2+
* Integration tests for zombie process prevention.
3+
*
4+
* These tests verify that the MCP server exits cleanly when no client connects
5+
* (handshake timeout) and that project initialization is deferred until after
6+
* the MCP handshake completes.
7+
*
8+
* The tests spawn real child processes to exercise the actual startup path.
9+
*/
10+
11+
import { describe, it, expect, beforeAll } from 'vitest';
12+
import { spawn } from 'node:child_process';
13+
import { existsSync } from 'node:fs';
14+
import path from 'node:path';
15+
import os from 'node:os';
16+
import { fileURLToPath } from 'node:url';
17+
18+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
19+
const ENTRY_POINT = path.resolve(__dirname, '..', 'dist', 'index.js');
20+
21+
/**
 * Spawn the MCP server as a child process and wait for it to exit.
 *
 * Nothing is ever written to the child's stdin, which simulates the zombie
 * scenario where no MCP client sends an `initialize` message.
 *
 * @param args - CLI arguments appended after the entry point.
 * @param env - Extra environment variables layered on top of `process.env`.
 * @param timeoutMs - Passed to `spawn` as its `timeout` option, acting as a
 *   safety net so a child that never exits does not hang the test run.
 * @returns The `code` and `signal` reported by the child's `close` event, all
 *   text the child wrote to `stderr`, and `elapsed` wall-clock milliseconds
 *   measured from just before the spawn until `close`.
 */
function spawnServer(
  args: string[],
  env: Record<string, string> = {},
  timeoutMs = 45_000
): Promise<{ code: number | null; signal: string | null; stderr: string; elapsed: number }> {
  return new Promise((resolve, reject) => {
    const start = Date.now();
    let stderr = '';

    const child = spawn(process.execPath, [ENTRY_POINT, ...args], {
      stdio: ['pipe', 'pipe', 'pipe'],
      env: { ...process.env, ...env },
      timeout: timeoutMs
    });

    // stdout is piped but its content is irrelevant here. Put it in flowing
    // mode so unread output can neither stall the child on a full pipe nor
    // keep the stream (and therefore the `close` event) from finishing.
    child.stdout?.resume();

    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted by per-chunk Buffer decoding.
    child.stderr?.setEncoding('utf8');
    child.stderr?.on('data', (chunk: string) => {
      stderr += chunk;
    });

    child.on('error', reject);
    child.on('close', (code, signal) => {
      resolve({ code, signal, stderr, elapsed: Date.now() - start });
    });

    // Don't write anything to stdin — simulate the zombie scenario
    // where no MCP client sends an `initialize` message.
  });
}
53+
54+
describe('zombie process prevention', () => {
  // The suite spawns the compiled entry point, so fail fast with an actionable
  // message when the build output is missing.
  beforeAll(() => {
    if (!existsSync(ENTRY_POINT)) {
      throw new Error(
        `dist/index.js not found - run \`npm run build\` before the zombie-guard tests.`
      );
    }
  });

  it('exits with code 1 when no MCP client connects within timeout', async () => {
    // Use a short timeout for the test (2 seconds instead of the default 30).
    // Use os.tmpdir() as a real existing directory so path validation passes —
    // this tests the realistic scenario where a valid path IS provided but no
    // MCP client connects (which is exactly the Codex zombie scenario).
    const result = await spawnServer(
      [os.tmpdir()],
      { CODEBASE_CONTEXT_HANDSHAKE_TIMEOUT_MS: '2000' }
    );

    expect(result.code).toBe(1);
    expect(result.stderr).toContain('No MCP client connected within');
    expect(result.stderr).toContain('npx codebase-context --help');
    // Should exit roughly around the timeout (2s), not hang forever
    expect(result.elapsed).toBeLessThan(10_000);
  }, 15_000);

  it('exits with code 1 even when invoked with no arguments at all', async () => {
    const result = await spawnServer(
      [],
      { CODEBASE_CONTEXT_HANDSHAKE_TIMEOUT_MS: '2000' }
    );

    expect(result.code).toBe(1);
    expect(result.stderr).toContain('No MCP client connected within');
    expect(result.elapsed).toBeLessThan(10_000);
  }, 15_000);

  it('does not start indexing or file watchers before handshake', async () => {
    // With DEBUG on, the server logs "[DEBUG] Server ready" inside oninitialized.
    // Since no client ever connects, that log must never appear.
    // Use os.tmpdir() so path validation passes before the handshake timer runs.
    const result = await spawnServer(
      [os.tmpdir()],
      {
        CODEBASE_CONTEXT_HANDSHAKE_TIMEOUT_MS: '2000',
        CODEBASE_CONTEXT_DEBUG: '1'
      }
    );

    expect(result.code).toBe(1);
    // Tie the exit to the handshake timer: without this, any unrelated startup
    // failure that exits with code 1 before logging would pass vacuously.
    expect(result.stderr).toContain('No MCP client connected within');
    // "[DEBUG] Server ready" is printed inside oninitialized — should NOT appear
    // because no client ever sends `initialize`.
    expect(result.stderr).not.toContain('[DEBUG] Server ready');
  }, 15_000);

  it('respects custom timeout via environment variable', async () => {
    const result = await spawnServer(
      [],
      { CODEBASE_CONTEXT_HANDSHAKE_TIMEOUT_MS: '1000' }
    );

    expect(result.code).toBe(1);
    // Make sure the exit was caused by the handshake timer, not another failure.
    expect(result.stderr).toContain('No MCP client connected within');
    // Should exit around 1 second, definitely under 5. spawnServer already
    // reports the spawn-to-close duration, so reuse it instead of re-timing.
    expect(result.elapsed).toBeGreaterThan(800);
    expect(result.elapsed).toBeLessThan(5_000);
  }, 10_000);
});

0 commit comments

Comments
 (0)