File tree Expand file tree Collapse file tree 4 files changed +24
-1
lines changed
packages/cli-v3/src/entryPoints/managed Expand file tree Collapse file tree 4 files changed +24
-1
lines changed Original file line number Diff line number Diff line change 1+ ---
2+ "@basicblock/trigger-cli": patch
3+ ---
4+
5+ Reset the supervisor socket around checkpoint suspend/restore, and add a configurable WebSocket ping interval and timeout to the supervisor (`TRIGGER_WORKLOAD_API_WS_PING_INTERVAL_MS`, `TRIGGER_WORKLOAD_API_WS_PING_TIMEOUT_MS`) to reduce disconnects during runner execution.
Original file line number Diff line number Diff line change @@ -24,6 +24,8 @@ const Env = z.object({
2424 TRIGGER_WORKLOAD_API_HOST_INTERNAL : z . string ( ) . default ( "0.0.0.0" ) ,
2525 TRIGGER_WORKLOAD_API_PORT_INTERNAL : z . coerce . number ( ) . default ( 8020 ) , // This is the port the workload API listens on
2626 TRIGGER_WORKLOAD_API_PORT_EXTERNAL : z . coerce . number ( ) . default ( 8020 ) , // This is the exposed port passed to the run controller
27+ TRIGGER_WORKLOAD_API_WS_PING_INTERVAL_MS : z . coerce . number ( ) . int ( ) . positive ( ) . default ( 25000 ) ,
28+ TRIGGER_WORKLOAD_API_WS_PING_TIMEOUT_MS : z . coerce . number ( ) . int ( ) . positive ( ) . default ( 120000 ) ,
2729
2830 // Runner settings
2931 RUNNER_HEARTBEAT_INTERVAL_SECONDS : z . coerce . number ( ) . optional ( ) ,
Original file line number Diff line number Diff line change @@ -398,7 +398,10 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
398398 }
399399
400400 private createWebsocketServer ( ) {
401- const io = new Server ( this . httpServer . server ) ;
401+ const io = new Server ( this . httpServer . server , {
402+ pingInterval : env . TRIGGER_WORKLOAD_API_WS_PING_INTERVAL_MS ,
403+ pingTimeout : env . TRIGGER_WORKLOAD_API_WS_PING_TIMEOUT_MS ,
404+ } ) ;
402405
403406 const websocketServer : Namespace <
404407 WorkloadClientToServerEvents ,
Original file line number Diff line number Diff line change @@ -865,6 +865,15 @@ export class RunExecution {
865865 // Short delay to give websocket time to reconnect
866866 await sleep ( 100 ) ;
867867
868+ // Reset the supervisor socket after checkpoint restore.
869+ // Socket transports don't reliably survive process snapshot/restore boundaries.
870+ try {
871+ this . supervisorSocket . disconnect ( ) ;
872+ } catch ( _ ) {
873+ // noop
874+ }
875+ this . supervisorSocket . connect ( ) ;
876+
868877 // Process any env overrides
869878 await this . processEnvOverrides ( "restore" ) ;
870879
@@ -1204,6 +1213,10 @@ export class RunExecution {
12041213 }
12051214
12061215 this . sendDebugLog ( "suspending, any day now 🚬" , { suspendableSnapshot } ) ;
1216+
1217+ // Disconnect before snapshotting so the restored process does a fresh socket handshake.
1218+ // This avoids carrying a stale websocket transport through checkpoint/restore.
1219+ this . supervisorSocket . disconnect ( ) ;
12071220 }
12081221
12091222 /**
You can’t perform that action at this time.
0 commit comments