@@ -6,12 +6,58 @@ import {
66 type WorkloadManagerCreateOptions ,
77 type WorkloadManagerOptions ,
88} from "./types.js" ;
9- import { ComputeClient , stripImageDigest } from "@internal/compute" ;
9+ import { ComputeClient , ComputeClientError , stripImageDigest } from "@internal/compute" ;
10+ import { setTimeout as sleep } from "node:timers/promises" ;
1011import { extractTraceparent , getRunnerId } from "../util.js" ;
1112import type { OtlpTraceService } from "../services/otlpTraceService.js" ;
1213import { tryCatch } from "@trigger.dev/core" ;
1314import { encodeBaggage , fromContext } from "../wideEvents/index.js" ;
1415
16+ const DEFAULT_CREATE_MAX_ATTEMPTS = 3 ;
17+ const DEFAULT_CREATE_RETRY_BASE_DELAY_MS = 250 ;
18+
19+ /**
20+ * TEMPORARY (TRI-10293): a failed create can leave its instance name
21+ * registered gateway/fcrun-side until async cleanup runs, so a same-name
22+ * retry can 409 against our own residue. Until the gateway cleans up
23+ * failed-create registrations properly, retry attempts get a deterministic
24+ * suffix. Attempt 1 keeps the unsuffixed name so the non-retry path is
25+ * unchanged; the suffixed name flows into both the instance name and
26+ * TRIGGER_RUNNER_ID, which downstream flows treat as one opaque
27+ * self-reported token. Only attempts following a ComputeClientError are
28+ * suffixed - network-failure retries keep the same name on purpose, because
29+ * the gateway's name-collision 409 is their safety net against
30+ * double-creating an instance whose create response was lost.
31+ */
32+ export function runnerNameForAttempt ( runnerId : string , attempt : number ) : string {
33+ return attempt === 1 ? runnerId : `${ runnerId } -r${ attempt } ` ;
34+ }
35+
36+ /**
37+ * Whether a failed instance create is worth retrying. Only statuses where
38+ * the create definitely did NOT commit are retried: 500 means the agent or
39+ * fcrun returned a create error (e.g. a netns slot holding the tap busy, a
40+ * full node disk - placement may differ on retry), 503 means the gateway
41+ * had nowhere to place it. 502/504 are excluded: the gateway emits those
42+ * when it fails to reach the node or read its response, which can happen
43+ * AFTER the agent committed the create - and the gateway only records the
44+ * instance name on a clean 201, so a same-name retry would miss the
45+ * collision check and could double-create the VM on another node. 4xx won't
46+ * heal on retry, and timeouts may still be provisioning. Network-level
47+ * fetch failures are safe: if the gateway processed the create, its name
48+ * index is populated and the retry 409s harmlessly.
49+ */
50+ export function isRetryableCreateError ( error : unknown ) : boolean {
51+ if ( error instanceof ComputeClientError ) {
52+ return error . status === 500 || error . status === 503 ;
53+ }
54+ if ( error instanceof DOMException && error . name === "TimeoutError" ) {
55+ return false ;
56+ }
57+ // Network-level fetch failures (gateway briefly unreachable)
58+ return error instanceof TypeError ;
59+ }
60+
1561type ComputeWorkloadManagerOptions = WorkloadManagerOptions & {
1662 gateway : {
1763 url : string ;
@@ -30,13 +76,23 @@ type ComputeWorkloadManagerOptions = WorkloadManagerOptions & {
3076 otelEndpoint : string ;
3177 prettyLogs : boolean ;
3278 } ;
79+ createRetry ?: {
80+ maxAttempts : number ;
81+ baseDelayMs : number ;
82+ } ;
3383} ;
3484
3585export class ComputeWorkloadManager implements WorkloadManager {
3686 private readonly logger = new SimpleStructuredLogger ( "compute-workload-manager" ) ;
3787 private readonly compute : ComputeClient ;
88+ private readonly createMaxAttempts : number ;
89+ private readonly createRetryBaseDelayMs : number ;
3890
3991 constructor ( private opts : ComputeWorkloadManagerOptions ) {
92+ this . createMaxAttempts = opts . createRetry ?. maxAttempts ?? DEFAULT_CREATE_MAX_ATTEMPTS ;
93+ this . createRetryBaseDelayMs =
94+ opts . createRetry ?. baseDelayMs ?? DEFAULT_CREATE_RETRY_BASE_DELAY_MS ;
95+
4096 if ( opts . workloadApiDomain ) {
4197 this . logger . warn ( "⚠️ Custom workload API domain" , {
4298 domain : opts . workloadApiDomain ,
@@ -163,27 +219,71 @@ export class ComputeWorkloadManager implements WorkloadManager {
163219 const startMs = performance . now ( ) ;
164220
165221 try {
166- const [ error , data ] = await tryCatch (
167- this . compute . instances . create ( {
168- name : runnerId ,
169- image : imageRef ,
170- env : envVars ,
171- cpu : opts . machine . cpu ,
172- memory_gb : opts . machine . memory ,
173- metadata : {
174- runId : opts . runFriendlyId ,
175- envId : opts . envId ,
176- envType : opts . envType ,
177- orgId : opts . orgId ,
178- projectId : opts . projectId ,
179- deploymentVersion : opts . deploymentVersion ,
180- machine : opts . machine . name ,
181- } ,
182- ...( Object . keys ( labels ) . length > 0 ? { labels } : { } ) ,
183- } )
184- ) ;
222+ const createRequest = {
223+ name : runnerId ,
224+ image : imageRef ,
225+ env : envVars ,
226+ cpu : opts . machine . cpu ,
227+ memory_gb : opts . machine . memory ,
228+ metadata : {
229+ runId : opts . runFriendlyId ,
230+ envId : opts . envId ,
231+ envType : opts . envType ,
232+ orgId : opts . orgId ,
233+ projectId : opts . projectId ,
234+ deploymentVersion : opts . deploymentVersion ,
235+ machine : opts . machine . name ,
236+ } ,
237+ ...( Object . keys ( labels ) . length > 0 ? { labels } : { } ) ,
238+ } ;
239+
240+ // Retry transient placement failures instead of abandoning the run: a
241+ // swallowed create error leaves the run waiting for the run engine's
242+ // PENDING_EXECUTING timeout (minutes) before it is redriven, while a
243+ // retried create typically succeeds in under a second (TRI-10293).
244+ let error : unknown ;
245+ let data : Awaited < ReturnType < typeof this . compute . instances . create > > | null | undefined ;
246+ let attempt = 1 ;
247+ // Set after a ComputeClientError: the failed create may have left its
248+ // name registered, so subsequent attempts use a suffixed name.
249+ let suffixAttempts = false ;
250+ for ( ; attempt <= this . createMaxAttempts ; attempt ++ ) {
251+ const attemptRunnerId = suffixAttempts
252+ ? runnerNameForAttempt ( runnerId , attempt )
253+ : runnerId ;
254+ [ error , data ] = await tryCatch (
255+ this . compute . instances . create (
256+ attemptRunnerId === runnerId
257+ ? createRequest
258+ : {
259+ ...createRequest ,
260+ name : attemptRunnerId ,
261+ env : { ...envVars , TRIGGER_RUNNER_ID : attemptRunnerId } ,
262+ }
263+ )
264+ ) ;
265+
266+ if ( ! error ) {
267+ event . runnerId = attemptRunnerId ;
268+ break ;
269+ }
270+
271+ if ( error instanceof ComputeClientError ) {
272+ suffixAttempts = true ;
273+ }
274+
275+ this . logger . warn ( "create instance attempt failed" , {
276+ runnerId : attemptRunnerId ,
277+ attempt,
278+ error : error instanceof Error ? error . message : String ( error ) ,
279+ } ) ;
280+
281+ if ( ! isRetryableCreateError ( error ) || attempt === this . createMaxAttempts ) break ;
282+ await sleep ( this . createRetryBaseDelayMs * attempt ) ;
283+ }
284+ event . createAttempts = attempt ;
185285
186- if ( error ) {
286+ if ( error || ! data ) {
187287 event . error = error instanceof Error ? error . message : String ( error ) ;
188288 event . errorType =
189289 error instanceof DOMException && error . name === "TimeoutError" ? "timeout" : "fetch" ;
0 commit comments