Skip to content

Commit 2397ca2

Browse files
authored
fix(supervisor): retry transient instance create failures in compute workload manager (#3902)
`ComputeWorkloadManager.create` swallows gateway errors currently, so a cold start that fails placement (e.g. a netns slot with a busy tap, a full node disk) silently abandons the dequeued run until the run engine's `PENDING_EXECUTING` heartbeat timeout redrives it via stall detection. ### Changes - Retry `instances.create` with short backoff (default 3 attempts, 250ms backoff), recording `createAttempts` in the wide event. - **Only statuses where the create definitely did not commit are retried**: 500 (agent/fcrun create failed) and 503 (no placement). 502/504 are excluded — the gateway emits those when it fails to reach the node or read its response, which can happen *after* the agent committed the create; the gateway only records the instance name on a clean 201, so a same-name retry would miss the collision check and could double-create the VM on another node. Network-level fetch failures are retried (if the gateway processed the create, its name index is populated and the retry 409s harmlessly). Timeouts are not retried. - **Retry attempts after a 5xx use a deterministic `-rN` name suffix**: a failed create can leave its name registered until async cleanup runs. Attempt 1 keeps the unsuffixed name.
1 parent 7b4443a commit 2397ca2

5 files changed

Lines changed: 195 additions & 21 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
area: supervisor
3+
type: fix
4+
---
5+
6+
Retry transient instance create failures during cold starts instead of waiting minutes for the run to be requeued.

apps/supervisor/src/env.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,14 @@ const Env = z
114114
COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset)
115115
COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000),
116116
COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10),
117+
// Instance create retries for transient placement failures (1 = no retries)
118+
COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS: z.coerce.number().int().min(1).max(10).default(3),
119+
COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS: z.coerce
120+
.number()
121+
.int()
122+
.min(0)
123+
.max(10_000)
124+
.default(250),
117125

118126
// Kubernetes settings
119127
KUBERNETES_FORCE_ENABLED: BoolEnv.default(false),

apps/supervisor/src/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ class ManagedSupervisor {
144144
otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT,
145145
prettyLogs: env.RUNNER_PRETTY_LOGS,
146146
},
147+
createRetry: {
148+
maxAttempts: env.COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS,
149+
baseDelayMs: env.COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS,
150+
},
147151
});
148152
this.computeManager = computeManager;
149153
this.workloadManager = computeManager;
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import { describe, expect, it } from "vitest";
2+
import { ComputeClientError } from "@internal/compute";
3+
import { isRetryableCreateError, runnerNameForAttempt } from "./compute.js";
4+
5+
describe("runnerNameForAttempt", () => {
6+
it("keeps the unsuffixed name for the first attempt", () => {
7+
expect(runnerNameForAttempt("runner-abc123", 1)).toBe("runner-abc123");
8+
});
9+
10+
it("suffixes retry attempts deterministically", () => {
11+
expect(runnerNameForAttempt("runner-abc123", 2)).toBe("runner-abc123-r2");
12+
expect(runnerNameForAttempt("runner-abc123", 3)).toBe("runner-abc123-r3");
13+
});
14+
});
15+
16+
describe("isRetryableCreateError", () => {
17+
it("retries statuses where the create definitely did not commit", () => {
18+
expect(isRetryableCreateError(new ComputeClientError(500, "tap busy", "http://gw"))).toBe(
19+
true
20+
);
21+
expect(isRetryableCreateError(new ComputeClientError(503, "no placement", "http://gw"))).toBe(
22+
true
23+
);
24+
});
25+
26+
it("does not retry lost-response statuses (create may have committed)", () => {
27+
expect(isRetryableCreateError(new ComputeClientError(502, "bad gateway", "http://gw"))).toBe(
28+
false
29+
);
30+
expect(
31+
isRetryableCreateError(new ComputeClientError(504, "gateway timeout", "http://gw"))
32+
).toBe(false);
33+
});
34+
35+
it("does not retry 4xx responses", () => {
36+
expect(isRetryableCreateError(new ComputeClientError(400, "bad request", "http://gw"))).toBe(
37+
false
38+
);
39+
expect(isRetryableCreateError(new ComputeClientError(409, "conflict", "http://gw"))).toBe(
40+
false
41+
);
42+
});
43+
44+
it("does not retry timeouts (instance may still be provisioning)", () => {
45+
expect(isRetryableCreateError(new DOMException("timed out", "TimeoutError"))).toBe(false);
46+
});
47+
48+
it("retries network-level fetch failures", () => {
49+
expect(isRetryableCreateError(new TypeError("fetch failed"))).toBe(true);
50+
});
51+
52+
it("does not retry unknown errors", () => {
53+
expect(isRetryableCreateError(new Error("something else"))).toBe(false);
54+
expect(isRetryableCreateError("string error")).toBe(false);
55+
});
56+
});

apps/supervisor/src/workloadManager/compute.ts

Lines changed: 121 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,58 @@ import {
66
type WorkloadManagerCreateOptions,
77
type WorkloadManagerOptions,
88
} from "./types.js";
9-
import { ComputeClient, stripImageDigest } from "@internal/compute";
9+
import { ComputeClient, ComputeClientError, stripImageDigest } from "@internal/compute";
10+
import { setTimeout as sleep } from "node:timers/promises";
1011
import { extractTraceparent, getRunnerId } from "../util.js";
1112
import type { OtlpTraceService } from "../services/otlpTraceService.js";
1213
import { tryCatch } from "@trigger.dev/core";
1314
import { encodeBaggage, fromContext } from "../wideEvents/index.js";
1415

16+
const DEFAULT_CREATE_MAX_ATTEMPTS = 3;
17+
const DEFAULT_CREATE_RETRY_BASE_DELAY_MS = 250;
18+
19+
/**
20+
* TEMPORARY (TRI-10293): a failed create can leave its instance name
21+
* registered gateway/fcrun-side until async cleanup runs, so a same-name
22+
* retry can 409 against our own residue. Until the gateway cleans up
23+
* failed-create registrations properly, retry attempts get a deterministic
24+
* suffix. Attempt 1 keeps the unsuffixed name so the non-retry path is
25+
* unchanged; the suffixed name flows into both the instance name and
26+
* TRIGGER_RUNNER_ID, which downstream flows treat as one opaque
27+
* self-reported token. Only attempts following a ComputeClientError are
28+
* suffixed - network-failure retries keep the same name on purpose, because
29+
* the gateway's name-collision 409 is their safety net against
30+
* double-creating an instance whose create response was lost.
31+
*/
32+
export function runnerNameForAttempt(runnerId: string, attempt: number): string {
33+
return attempt === 1 ? runnerId : `${runnerId}-r${attempt}`;
34+
}
35+
36+
/**
37+
* Whether a failed instance create is worth retrying. Only statuses where
38+
* the create definitely did NOT commit are retried: 500 means the agent or
39+
* fcrun returned a create error (e.g. a netns slot holding the tap busy, a
40+
* full node disk - placement may differ on retry), 503 means the gateway
41+
* had nowhere to place it. 502/504 are excluded: the gateway emits those
42+
* when it fails to reach the node or read its response, which can happen
43+
* AFTER the agent committed the create - and the gateway only records the
44+
* instance name on a clean 201, so a same-name retry would miss the
45+
* collision check and could double-create the VM on another node. 4xx won't
46+
* heal on retry, and timeouts may still be provisioning. Network-level
47+
* fetch failures are safe: if the gateway processed the create, its name
48+
* index is populated and the retry 409s harmlessly.
49+
*/
50+
export function isRetryableCreateError(error: unknown): boolean {
51+
if (error instanceof ComputeClientError) {
52+
return error.status === 500 || error.status === 503;
53+
}
54+
if (error instanceof DOMException && error.name === "TimeoutError") {
55+
return false;
56+
}
57+
// Network-level fetch failures (gateway briefly unreachable)
58+
return error instanceof TypeError;
59+
}
60+
1561
type ComputeWorkloadManagerOptions = WorkloadManagerOptions & {
1662
gateway: {
1763
url: string;
@@ -30,13 +76,23 @@ type ComputeWorkloadManagerOptions = WorkloadManagerOptions & {
3076
otelEndpoint: string;
3177
prettyLogs: boolean;
3278
};
79+
createRetry?: {
80+
maxAttempts: number;
81+
baseDelayMs: number;
82+
};
3383
};
3484

3585
export class ComputeWorkloadManager implements WorkloadManager {
3686
private readonly logger = new SimpleStructuredLogger("compute-workload-manager");
3787
private readonly compute: ComputeClient;
88+
private readonly createMaxAttempts: number;
89+
private readonly createRetryBaseDelayMs: number;
3890

3991
constructor(private opts: ComputeWorkloadManagerOptions) {
92+
this.createMaxAttempts = opts.createRetry?.maxAttempts ?? DEFAULT_CREATE_MAX_ATTEMPTS;
93+
this.createRetryBaseDelayMs =
94+
opts.createRetry?.baseDelayMs ?? DEFAULT_CREATE_RETRY_BASE_DELAY_MS;
95+
4096
if (opts.workloadApiDomain) {
4197
this.logger.warn("⚠️ Custom workload API domain", {
4298
domain: opts.workloadApiDomain,
@@ -163,27 +219,71 @@ export class ComputeWorkloadManager implements WorkloadManager {
163219
const startMs = performance.now();
164220

165221
try {
166-
const [error, data] = await tryCatch(
167-
this.compute.instances.create({
168-
name: runnerId,
169-
image: imageRef,
170-
env: envVars,
171-
cpu: opts.machine.cpu,
172-
memory_gb: opts.machine.memory,
173-
metadata: {
174-
runId: opts.runFriendlyId,
175-
envId: opts.envId,
176-
envType: opts.envType,
177-
orgId: opts.orgId,
178-
projectId: opts.projectId,
179-
deploymentVersion: opts.deploymentVersion,
180-
machine: opts.machine.name,
181-
},
182-
...(Object.keys(labels).length > 0 ? { labels } : {}),
183-
})
184-
);
222+
const createRequest = {
223+
name: runnerId,
224+
image: imageRef,
225+
env: envVars,
226+
cpu: opts.machine.cpu,
227+
memory_gb: opts.machine.memory,
228+
metadata: {
229+
runId: opts.runFriendlyId,
230+
envId: opts.envId,
231+
envType: opts.envType,
232+
orgId: opts.orgId,
233+
projectId: opts.projectId,
234+
deploymentVersion: opts.deploymentVersion,
235+
machine: opts.machine.name,
236+
},
237+
...(Object.keys(labels).length > 0 ? { labels } : {}),
238+
};
239+
240+
// Retry transient placement failures instead of abandoning the run: a
241+
// swallowed create error leaves the run waiting for the run engine's
242+
// PENDING_EXECUTING timeout (minutes) before it is redriven, while a
243+
// retried create typically succeeds in under a second (TRI-10293).
244+
let error: unknown;
245+
let data: Awaited<ReturnType<typeof this.compute.instances.create>> | null | undefined;
246+
let attempt = 1;
247+
// Set after a ComputeClientError: the failed create may have left its
248+
// name registered, so subsequent attempts use a suffixed name.
249+
let suffixAttempts = false;
250+
for (; attempt <= this.createMaxAttempts; attempt++) {
251+
const attemptRunnerId = suffixAttempts
252+
? runnerNameForAttempt(runnerId, attempt)
253+
: runnerId;
254+
[error, data] = await tryCatch(
255+
this.compute.instances.create(
256+
attemptRunnerId === runnerId
257+
? createRequest
258+
: {
259+
...createRequest,
260+
name: attemptRunnerId,
261+
env: { ...envVars, TRIGGER_RUNNER_ID: attemptRunnerId },
262+
}
263+
)
264+
);
265+
266+
if (!error) {
267+
event.runnerId = attemptRunnerId;
268+
break;
269+
}
270+
271+
if (error instanceof ComputeClientError) {
272+
suffixAttempts = true;
273+
}
274+
275+
this.logger.warn("create instance attempt failed", {
276+
runnerId: attemptRunnerId,
277+
attempt,
278+
error: error instanceof Error ? error.message : String(error),
279+
});
280+
281+
if (!isRetryableCreateError(error) || attempt === this.createMaxAttempts) break;
282+
await sleep(this.createRetryBaseDelayMs * attempt);
283+
}
284+
event.createAttempts = attempt;
185285

186-
if (error) {
286+
if (error || !data) {
187287
event.error = error instanceof Error ? error.message : String(error);
188288
event.errorType =
189289
error instanceof DOMException && error.name === "TimeoutError" ? "timeout" : "fetch";

0 commit comments

Comments
 (0)