Skip to content

Commit 6b86524

Browse files
committed
fix: stabilize runner websocket across suspend/restore
1 parent f2a756a commit 6b86524

File tree

4 files changed

+24
-1
lines changed

4 files changed

+24
-1
lines changed

.changeset/slow-lions-tickle.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@basicblock/trigger-cli": patch
3+
---
4+
5+
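Reset the supervisor socket around checkpoint suspend/restore, and add a configurable websocket ping interval and ping timeout to the supervisor (`TRIGGER_WORKLOAD_API_WS_PING_INTERVAL_MS`, `TRIGGER_WORKLOAD_API_WS_PING_TIMEOUT_MS`), to reduce disconnects during runner execution.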

apps/supervisor/src/env.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ const Env = z.object({
2424
TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"),
2525
TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on
2626
TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller
27+
TRIGGER_WORKLOAD_API_WS_PING_INTERVAL_MS: z.coerce.number().int().positive().default(25000),
28+
TRIGGER_WORKLOAD_API_WS_PING_TIMEOUT_MS: z.coerce.number().int().positive().default(120000),
2729

2830
// Runner settings
2931
RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(),

apps/supervisor/src/workloadServer/index.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,10 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
398398
}
399399

400400
private createWebsocketServer() {
401-
const io = new Server(this.httpServer.server);
401+
const io = new Server(this.httpServer.server, {
402+
pingInterval: env.TRIGGER_WORKLOAD_API_WS_PING_INTERVAL_MS,
403+
pingTimeout: env.TRIGGER_WORKLOAD_API_WS_PING_TIMEOUT_MS,
404+
});
402405

403406
const websocketServer: Namespace<
404407
WorkloadClientToServerEvents,

packages/cli-v3/src/entryPoints/managed/execution.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -865,6 +865,15 @@ export class RunExecution {
865865
// Short delay to give websocket time to reconnect
866866
await sleep(100);
867867

868+
// Reset the supervisor socket after checkpoint restore.
869+
// Socket transports don't reliably survive process snapshot/restore boundaries.
870+
try {
871+
this.supervisorSocket.disconnect();
872+
} catch (_) {
873+
// noop
874+
}
875+
this.supervisorSocket.connect();
876+
868877
// Process any env overrides
869878
await this.processEnvOverrides("restore");
870879

@@ -1204,6 +1213,10 @@ export class RunExecution {
12041213
}
12051214

12061215
this.sendDebugLog("suspending, any day now 🚬", { suspendableSnapshot });
1216+
1217+
// Disconnect before snapshotting so the restored process does a fresh socket handshake.
1218+
// This avoids carrying a stale websocket transport through checkpoint/restore.
1219+
this.supervisorSocket.disconnect();
12071220
}
12081221

12091222
/**

0 commit comments

Comments
 (0)