@@ -6,12 +6,41 @@ import {
66 type WorkloadManagerCreateOptions ,
77 type WorkloadManagerOptions ,
88} from "./types.js" ;
9- import { ComputeClient , stripImageDigest } from "@internal/compute" ;
9+ import { ComputeClient , ComputeClientError , stripImageDigest } from "@internal/compute" ;
10+ import { setTimeout as sleep } from "node:timers/promises" ;
1011import { extractTraceparent , getRunnerId } from "../util.js" ;
1112import type { OtlpTraceService } from "../services/otlpTraceService.js" ;
1213import { tryCatch } from "@trigger.dev/core" ;
1314import { encodeBaggage , fromContext } from "../wideEvents/index.js" ;
1415
16+ const CREATE_MAX_ATTEMPTS = 3 ;
17+ const CREATE_RETRY_BASE_DELAY_MS = 250 ;
18+
19+ /**
20+ * Whether a failed instance create is worth retrying. Only statuses where
21+ * the create definitely did NOT commit are retried: 500 means the agent or
22+ * fcrun returned a create error (e.g. a netns slot holding the tap busy, a
23+ * full node disk - placement may differ on retry), 503 means the gateway
24+ * had nowhere to place it. 502/504 are excluded: the gateway emits those
25+ * when it fails to reach the node or read its response, which can happen
26+ * AFTER the agent committed the create - and the gateway only records the
27+ * instance name on a clean 201, so a same-name retry would miss the
28+ * collision check and could double-create the VM on another node. 4xx won't
29+ * heal on retry, and timeouts may still be provisioning. Network-level
30+ * fetch failures are safe: if the gateway processed the create, its name
31+ * index is populated and the retry 409s harmlessly.
32+ */
33+ export function isRetryableCreateError ( error : unknown ) : boolean {
34+ if ( error instanceof ComputeClientError ) {
35+ return error . status === 500 || error . status === 503 ;
36+ }
37+ if ( error instanceof DOMException && error . name === "TimeoutError" ) {
38+ return false ;
39+ }
40+ // Network-level fetch failures (gateway briefly unreachable)
41+ return error instanceof TypeError ;
42+ }
43+
1544type ComputeWorkloadManagerOptions = WorkloadManagerOptions & {
1645 gateway : {
1746 url : string ;
@@ -165,27 +194,48 @@ export class ComputeWorkloadManager implements WorkloadManager {
165194 const startMs = performance . now ( ) ;
166195
167196 try {
168- const [ error , data ] = await tryCatch (
169- this . compute . instances . create ( {
170- name : runnerId ,
171- image : imageRef ,
172- env : envVars ,
173- cpu : opts . machine . cpu ,
174- memory_gb : opts . machine . memory ,
175- metadata : {
176- runId : opts . runFriendlyId ,
177- envId : opts . envId ,
178- envType : opts . envType ,
179- orgId : opts . orgId ,
180- projectId : opts . projectId ,
181- deploymentVersion : opts . deploymentVersion ,
182- machine : opts . machine . name ,
183- } ,
184- ...( Object . keys ( labels ) . length > 0 ? { labels } : { } ) ,
185- } )
186- ) ;
197+ const createRequest = {
198+ name : runnerId ,
199+ image : imageRef ,
200+ env : envVars ,
201+ cpu : opts . machine . cpu ,
202+ memory_gb : opts . machine . memory ,
203+ metadata : {
204+ runId : opts . runFriendlyId ,
205+ envId : opts . envId ,
206+ envType : opts . envType ,
207+ orgId : opts . orgId ,
208+ projectId : opts . projectId ,
209+ deploymentVersion : opts . deploymentVersion ,
210+ machine : opts . machine . name ,
211+ } ,
212+ ...( Object . keys ( labels ) . length > 0 ? { labels } : { } ) ,
213+ } ;
214+
215+ // Retry transient placement failures instead of abandoning the run: a
216+ // swallowed create error leaves the run waiting for the run engine's
217+ // PENDING_EXECUTING timeout (minutes) before it is redriven, while a
218+ // retried create typically succeeds in under a second (TRI-10293).
219+ let error : unknown ;
220+ let data : Awaited < ReturnType < typeof this . compute . instances . create > > | null | undefined ;
221+ let attempt = 1 ;
222+ for ( ; attempt <= CREATE_MAX_ATTEMPTS ; attempt ++ ) {
223+ [ error , data ] = await tryCatch ( this . compute . instances . create ( createRequest ) ) ;
224+
225+ if ( ! error ) break ;
226+
227+ this . logger . warn ( "create instance attempt failed" , {
228+ runnerId,
229+ attempt,
230+ error : error instanceof Error ? error . message : String ( error ) ,
231+ } ) ;
232+
233+ if ( ! isRetryableCreateError ( error ) || attempt === CREATE_MAX_ATTEMPTS ) break ;
234+ await sleep ( CREATE_RETRY_BASE_DELAY_MS * attempt ) ;
235+ }
236+ event . createAttempts = attempt ;
187237
188- if ( error ) {
238+ if ( error || ! data ) {
189239 event . error = error instanceof Error ? error . message : String ( error ) ;
190240 event . errorType =
191241 error instanceof DOMException && error . name === "TimeoutError" ? "timeout" : "fetch" ;
0 commit comments