Skip to content

Commit 5260ac8

Browse files
authored
fix: harden Node runtime cleanup (#457)
* fix: harden node runtime cleanup * fix: propagate command cancellation * fix: address runtime cleanup review * fix: close partial artifact writes before cleanup
1 parent d77a921 commit 5260ac8

25 files changed

Lines changed: 806 additions & 525 deletions

.github/actions/setup-node-pnpm/action.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@ inputs:
55
node-version:
66
description: 'Node.js version'
77
required: false
8-
default: '22'
8+
default: '22.19'
99
pnpm-version:
1010
description: 'pnpm version'
1111
required: false
12-
default: '9.12.2'
12+
default: '10.33.2'
1313
install-deps:
1414
description: 'Whether to install dependencies with pnpm install --frozen-lockfile'
1515
required: false

package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"url": "https://github.com/callstackincubator/agent-device/issues"
1414
},
1515
"type": "module",
16+
"packageManager": "pnpm@10.33.2",
1617
"main": "dist/src/index.js",
1718
"types": "dist/src/index.d.ts",
1819
"exports": {
@@ -62,7 +63,7 @@
6263
}
6364
},
6465
"engines": {
65-
"node": ">=22"
66+
"node": ">=22.19"
6667
},
6768
"bin": {
6869
"agent-device": "bin/agent-device.mjs"
@@ -82,7 +83,7 @@
8283
"build:all": "pnpm build:node && pnpm build:xcuitest",
8384
"ad": "node bin/agent-device.mjs",
8485
"lint": "oxlint . --deny-warnings",
85-
"format": "oxfmt --write src test skills package.json tsconfig.json .oxlintrc.json .oxfmtrc.json '!test/skillgym/.skillgym-results/**'",
86+
"format": "oxfmt --write src test skills package.json tsconfig.json tsconfig.lib.json rslib.config.ts vitest.config.ts .github/actions/setup-node-pnpm/action.yml .oxlintrc.json .oxfmtrc.json '!test/skillgym/.skillgym-results/**'",
8687
"fallow": "fallow --summary",
8788
"fallow:baseline": "(fallow dead-code --save-baseline fallow-baselines/dead-code.json --summary || true) && (fallow dupes --save-baseline fallow-baselines/dupes.json --summary || true) && (fallow health --save-baseline fallow-baselines/health.json --summary || true)",
8889
"check:fallow": "fallow audit",

src/__tests__/client-companion-tunnel-worker.test.ts

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,15 @@ function createDeferred<T>(): Deferred<T> {
3838
}
3939

4040
function waitFor<T>(promise: Promise<T>, timeoutMs: number, label: string): Promise<T> {
41-
return Promise.race([
42-
promise,
43-
delay(timeoutMs).then(() => {
44-
throw new Error(`Timed out waiting for ${label}.`);
45-
}),
46-
]);
41+
let timeout: NodeJS.Timeout | null = null;
42+
const timeoutPromise = new Promise<never>((_, reject) => {
43+
timeout = setTimeout(() => {
44+
reject(new Error(`Timed out waiting for ${label}.`));
45+
}, timeoutMs);
46+
});
47+
return Promise.race([promise, timeoutPromise]).finally(() => {
48+
if (timeout) clearTimeout(timeout);
49+
});
4750
}
4851

4952
function encodeTextFrame(text: string): Buffer {

src/__tests__/client-metro.test.ts

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import os from 'node:os';
1010
import path from 'node:path';
1111
import { prepareMetroRuntime, reloadMetro } from '../client-metro.ts';
1212
import { AppError } from '../utils/errors.ts';
13+
import { isProcessAlive, waitForProcessExit } from '../utils/process-identity.ts';
1314

1415
const TEST_TOKEN = 'agent-device-proxy-test-token';
1516

@@ -163,11 +164,7 @@ test('prepareMetroRuntime starts Metro, bridges through proxy, and writes runtim
163164
socket.destroy();
164165
}
165166
await closeServer(proxyServer);
166-
if (pid) {
167-
try {
168-
process.kill(pid);
169-
} catch {}
170-
}
167+
await stopProcess(pid);
171168
rmSync(tempRoot, { recursive: true, force: true });
172169
}
173170
});
@@ -326,3 +323,19 @@ async function closeServer(server: ReturnType<typeof createServer>): Promise<voi
326323
});
327324
});
328325
}
326+
327+
async function stopProcess(pid: number): Promise<void> {
328+
if (!pid || !isProcessAlive(pid)) return;
329+
try {
330+
process.kill(pid, 'SIGTERM');
331+
} catch {
332+
return;
333+
}
334+
if (await waitForProcessExit(pid, 1_500)) return;
335+
try {
336+
process.kill(pid, 'SIGKILL');
337+
} catch {
338+
return;
339+
}
340+
await waitForProcessExit(pid, 1_500);
341+
}

src/daemon-client.ts

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import http from 'node:http';
33
import https from 'node:https';
44
import fs from 'node:fs';
55
import path from 'node:path';
6+
import { pipeline } from 'node:stream/promises';
67
import { sleep } from './utils/timeouts.ts';
78
import { AppError, toAppErrorCode } from './utils/errors.ts';
89
import type {
@@ -1334,13 +1335,6 @@ export async function downloadRemoteArtifact(params: {
13341335
});
13351336
return;
13361337
}
1337-
const output = fs.createWriteStream(params.destinationPath);
1338-
output.on('error', (error) => {
1339-
settle(error instanceof Error ? error : new Error(String(error)));
1340-
});
1341-
res.on('error', (error) => {
1342-
settle(error instanceof Error ? error : new Error(String(error)));
1343-
});
13441338
res.on('aborted', () => {
13451339
settle(
13461340
new AppError('COMMAND_FAILED', 'Remote artifact download was interrupted', {
@@ -1349,10 +1343,10 @@ export async function downloadRemoteArtifact(params: {
13491343
}),
13501344
);
13511345
});
1352-
output.on('finish', () => {
1353-
output.close(() => settle());
1354-
});
1355-
res.pipe(output);
1346+
void pipeline(res, fs.createWriteStream(params.destinationPath)).then(
1347+
() => settle(),
1348+
(error: unknown) => settle(error instanceof Error ? error : new Error(String(error))),
1349+
);
13561350
},
13571351
);
13581352
const timeoutHandle = setTimeout(() => {

src/daemon.ts

Lines changed: 114 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@ import { createDaemonHttpServer } from './daemon/http-server.ts';
88
import { trackDownloadableArtifact } from './daemon/artifact-tracking.ts';
99
import { LeaseRegistry } from './daemon/lease-registry.ts';
1010
import { createRequestHandler } from './daemon/request-router.ts';
11+
import { teardownSessionResources } from './daemon/handlers/session-close.ts';
12+
import { closeDaemonServers } from './daemon/server-shutdown.ts';
13+
import type { SessionState } from './daemon/types.ts';
14+
import {
15+
emitDiagnostic,
16+
flushDiagnosticsToSessionFile,
17+
withDiagnosticsScope,
18+
} from './utils/diagnostics.ts';
1119
import {
1220
acquireDaemonLock,
1321
parseIntegerEnv,
@@ -18,7 +26,15 @@ import {
1826
resolveDaemonCodeSignature,
1927
writeInfo,
2028
} from './daemon/server-lifecycle.ts';
21-
import { createSocketServer, listenHttpServer, listenNetServer } from './daemon/transport.ts';
29+
import {
30+
createSocketServer,
31+
listenHttpServer,
32+
listenNetServer,
33+
type DaemonServer,
34+
} from './daemon/transport.ts';
35+
import { sleep } from './utils/timeouts.ts';
36+
37+
const DAEMON_SESSION_TEARDOWN_TIMEOUT_MS = 5_000;
2238

2339
const daemonPaths = resolveDaemonPaths(process.env.AGENT_DEVICE_STATE_DIR);
2440
const { baseDir, infoPath, lockPath, logPath, sessionsDir } = daemonPaths;
@@ -45,6 +61,90 @@ const handleRequest = createRequestHandler({
4561
trackDownloadableArtifact,
4662
});
4763

64+
async function emitFatalDiagnostic(error: unknown): Promise<void> {
65+
await withDiagnosticsScope(
66+
{ command: 'daemon', session: 'daemon', logPath, debug: true },
67+
async () => {
68+
emitDiagnostic({
69+
level: 'error',
70+
phase: 'daemon_fatal',
71+
data: {
72+
error: error instanceof Error ? error.message : String(error),
73+
},
74+
});
75+
flushDiagnosticsToSessionFile({ force: true });
76+
},
77+
);
78+
}
79+
80+
async function teardownDaemonSessions(): Promise<void> {
81+
const sessionsToStop = sessionStore.toArray();
82+
await Promise.all(sessionsToStop.map(teardownDaemonSession));
83+
}
84+
85+
async function teardownDaemonSession(session: SessionState) {
86+
const teardown = teardownSessionResources(session, session.name).catch((error) => {
87+
process.stderr.write(
88+
`Daemon session teardown error (${session.name}): ${
89+
error instanceof Error ? error.message : String(error)
90+
}\n`,
91+
);
92+
});
93+
await Promise.race([
94+
teardown,
95+
sleep(DAEMON_SESSION_TEARDOWN_TIMEOUT_MS).then(() => {
96+
process.stderr.write(`Daemon session teardown timed out (${session.name}).\n`);
97+
}),
98+
]);
99+
sessionStore.writeSessionLog(session);
100+
sessionStore.delete(session.name);
101+
}
102+
103+
async function openDaemonServers(): Promise<{
104+
servers: DaemonServer[];
105+
socketPort?: number;
106+
httpPort?: number;
107+
}> {
108+
const servers: DaemonServer[] = [];
109+
let socketPort: number | undefined;
110+
let httpPort: number | undefined;
111+
const startSocketServer = daemonServerMode !== 'http';
112+
const startHttpServer = daemonServerMode !== 'socket';
113+
if (startSocketServer) {
114+
const socketServer = createSocketServer(handleRequest);
115+
servers.push(socketServer);
116+
socketPort = await listenNetServer(socketServer);
117+
}
118+
119+
if (startHttpServer) {
120+
const httpServer = await createDaemonHttpServer({ handleRequest, token });
121+
servers.push(httpServer);
122+
httpPort = await listenHttpServer(httpServer);
123+
}
124+
return { servers, socketPort, httpPort };
125+
}
126+
127+
function publishDaemonInfo(socketPort: number | undefined, httpPort: number | undefined): void {
128+
writeInfo(baseDir, infoPath, logPath, {
129+
socketPort,
130+
httpPort,
131+
token,
132+
version,
133+
codeSignature: daemonCodeSignature,
134+
processStartTime: daemonProcessStartTime,
135+
});
136+
if (socketPort) process.stdout.write(`AGENT_DEVICE_DAEMON_PORT=${socketPort}\n`);
137+
if (httpPort) process.stdout.write(`AGENT_DEVICE_DAEMON_HTTP_PORT=${httpPort}\n`);
138+
}
139+
140+
function closeServersBestEffort(servers: DaemonServer[]): void {
141+
for (const server of servers) {
142+
try {
143+
server.close(() => {});
144+
} catch {}
145+
}
146+
}
147+
48148
async function start(): Promise<void> {
49149
const lockData = {
50150
pid: process.pid,
@@ -58,73 +158,34 @@ async function start(): Promise<void> {
58158
return;
59159
}
60160

61-
const servers: Array<{ close: (cb: (err?: Error) => void) => void }> = [];
62-
let socketPort: number | undefined;
63-
let httpPort: number | undefined;
64-
161+
let servers: DaemonServer[] = [];
65162
try {
66-
if (daemonServerMode === 'socket' || daemonServerMode === 'dual') {
67-
const socketServer = createSocketServer(handleRequest);
68-
servers.push(socketServer);
69-
socketPort = await listenNetServer(socketServer);
70-
}
71-
72-
if (daemonServerMode === 'http' || daemonServerMode === 'dual') {
73-
const httpServer = await createDaemonHttpServer({ handleRequest, token });
74-
servers.push(httpServer);
75-
httpPort = await listenHttpServer(httpServer);
76-
}
77-
78-
writeInfo(baseDir, infoPath, logPath, {
79-
socketPort,
80-
httpPort,
81-
token,
82-
version,
83-
codeSignature: daemonCodeSignature,
84-
processStartTime: daemonProcessStartTime,
85-
});
86-
if (socketPort) process.stdout.write(`AGENT_DEVICE_DAEMON_PORT=${socketPort}\n`);
87-
if (httpPort) process.stdout.write(`AGENT_DEVICE_DAEMON_HTTP_PORT=${httpPort}\n`);
163+
const opened = await openDaemonServers();
164+
servers = opened.servers;
165+
publishDaemonInfo(opened.socketPort, opened.httpPort);
88166
} catch (error) {
89167
const appErr = asAppError(error);
90168
process.stderr.write(`Daemon error: ${appErr.message}\n`);
91-
for (const server of servers) {
92-
try {
93-
server.close(() => {});
94-
} catch {}
95-
}
169+
closeServersBestEffort(servers);
96170
removeInfo(infoPath);
97171
releaseDaemonLock(lockPath);
98172
process.exit(1);
99173
return;
100174
}
101175

102176
let shuttingDown = false;
103-
const closeServers = async (): Promise<void> => {
104-
await Promise.all(
105-
servers.map(async (server) => {
106-
await new Promise<void>((resolve) => {
107-
try {
108-
server.close(() => resolve());
109-
} catch {
110-
resolve();
111-
}
112-
});
113-
}),
114-
);
115-
};
116-
const shutdown = async () => {
177+
const shutdown = async (options: { exitCode?: number; cause?: unknown } = {}) => {
117178
if (shuttingDown) return;
118179
shuttingDown = true;
119-
await closeServers();
120-
const sessionsToStop = sessionStore.toArray();
121-
for (const session of sessionsToStop) {
122-
sessionStore.writeSessionLog(session);
180+
if (options.cause) {
181+
await emitFatalDiagnostic(options.cause);
123182
}
183+
await closeDaemonServers(servers);
184+
await teardownDaemonSessions();
124185
await stopAllIosRunnerSessions();
125186
removeInfo(infoPath);
126187
releaseDaemonLock(lockPath);
127-
process.exit(0);
188+
process.exit(options.exitCode ?? 0);
128189
};
129190

130191
process.on('SIGINT', () => {
@@ -139,13 +200,13 @@ async function start(): Promise<void> {
139200
process.on('uncaughtException', (err) => {
140201
const appErr = err instanceof AppError ? err : asAppError(err);
141202
process.stderr.write(`Daemon error: ${appErr.message}\n`);
142-
void shutdown();
203+
void shutdown({ exitCode: 1, cause: err });
143204
});
144205
process.on('unhandledRejection', (reason) => {
145206
const err = reason instanceof Error ? reason : new Error(String(reason));
146207
const appErr = err instanceof AppError ? err : asAppError(err);
147208
process.stderr.write(`Daemon error: ${appErr.message}\n`);
148-
void shutdown();
209+
void shutdown({ exitCode: 1, cause: err });
149210
});
150211
}
151212

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import { test } from 'vitest';
2+
import assert from 'node:assert/strict';
3+
import type { DaemonServer } from '../transport.ts';
4+
import { closeDaemonServers } from '../server-shutdown.ts';
5+
6+
test('closeDaemonServers forces stuck servers after timeout', async () => {
7+
let destroyed = false;
8+
const server = {
9+
close: () => {},
10+
destroyConnections: () => {
11+
destroyed = true;
12+
},
13+
} as unknown as DaemonServer;
14+
15+
const startedAt = Date.now();
16+
await closeDaemonServers([server], 20);
17+
18+
assert.equal(destroyed, true);
19+
assert.ok(Date.now() - startedAt < 500);
20+
});

0 commit comments

Comments
 (0)