Skip to content

Commit dbbe9b3

Browse files
committed
fix: retry transient instance create failures instead of abandoning the run
ComputeWorkloadManager.create swallows gateway errors by design, so a cold start that fails placement (e.g. a netns slot with a busy tap, a full node disk) silently abandons the dequeued run until the run engine's PENDING_EXECUTING timeout redrives it minutes later. These failures are transient per placement - redriven runs virtually always succeed - so retry the create up to 3 times with short backoff before giving up. Gateway 5xx and network-level fetch failures are retried; 4xx responses (won't heal) and timeouts (the instance may still be provisioning) are not.
1 parent 829fec6 commit dbbe9b3

2 files changed

Lines changed: 116 additions & 21 deletions

File tree

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import { describe, expect, it } from "vitest";
2+
import { ComputeClientError } from "@internal/compute";
3+
import { isRetryableCreateError } from "./compute.js";
4+
5+
describe("isRetryableCreateError", () => {
6+
it("retries statuses where the create definitely did not commit", () => {
7+
expect(isRetryableCreateError(new ComputeClientError(500, "tap busy", "http://gw"))).toBe(
8+
true
9+
);
10+
expect(isRetryableCreateError(new ComputeClientError(503, "no placement", "http://gw"))).toBe(
11+
true
12+
);
13+
});
14+
15+
it("does not retry lost-response statuses (create may have committed)", () => {
16+
expect(isRetryableCreateError(new ComputeClientError(502, "bad gateway", "http://gw"))).toBe(
17+
false
18+
);
19+
expect(
20+
isRetryableCreateError(new ComputeClientError(504, "gateway timeout", "http://gw"))
21+
).toBe(false);
22+
});
23+
24+
it("does not retry 4xx responses", () => {
25+
expect(isRetryableCreateError(new ComputeClientError(400, "bad request", "http://gw"))).toBe(
26+
false
27+
);
28+
expect(isRetryableCreateError(new ComputeClientError(409, "conflict", "http://gw"))).toBe(
29+
false
30+
);
31+
});
32+
33+
it("does not retry timeouts (instance may still be provisioning)", () => {
34+
expect(isRetryableCreateError(new DOMException("timed out", "TimeoutError"))).toBe(false);
35+
});
36+
37+
it("retries network-level fetch failures", () => {
38+
expect(isRetryableCreateError(new TypeError("fetch failed"))).toBe(true);
39+
});
40+
41+
it("does not retry unknown errors", () => {
42+
expect(isRetryableCreateError(new Error("something else"))).toBe(false);
43+
expect(isRetryableCreateError("string error")).toBe(false);
44+
});
45+
});

apps/supervisor/src/workloadManager/compute.ts

Lines changed: 71 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,41 @@ import {
66
type WorkloadManagerCreateOptions,
77
type WorkloadManagerOptions,
88
} from "./types.js";
9-
import { ComputeClient, stripImageDigest } from "@internal/compute";
9+
import { ComputeClient, ComputeClientError, stripImageDigest } from "@internal/compute";
10+
import { setTimeout as sleep } from "node:timers/promises";
1011
import { extractTraceparent, getRunnerId } from "../util.js";
1112
import type { OtlpTraceService } from "../services/otlpTraceService.js";
1213
import { tryCatch } from "@trigger.dev/core";
1314
import { encodeBaggage, fromContext } from "../wideEvents/index.js";
1415

16+
const CREATE_MAX_ATTEMPTS = 3;
17+
const CREATE_RETRY_BASE_DELAY_MS = 250;
18+
19+
/**
20+
* Whether a failed instance create is worth retrying. Only statuses where
21+
* the create definitely did NOT commit are retried: 500 means the agent or
22+
* fcrun returned a create error (e.g. a netns slot holding the tap busy, a
23+
* full node disk - placement may differ on retry), 503 means the gateway
24+
* had nowhere to place it. 502/504 are excluded: the gateway emits those
25+
* when it fails to reach the node or read its response, which can happen
26+
* AFTER the agent committed the create - and the gateway only records the
27+
* instance name on a clean 201, so a same-name retry would miss the
28+
* collision check and could double-create the VM on another node. 4xx won't
29+
* heal on retry, and timeouts may still be provisioning. Network-level
30+
* fetch failures are safe: if the gateway processed the create, its name
31+
* index is populated and the retry 409s harmlessly.
32+
*/
33+
export function isRetryableCreateError(error: unknown): boolean {
34+
if (error instanceof ComputeClientError) {
35+
return error.status === 500 || error.status === 503;
36+
}
37+
if (error instanceof DOMException && error.name === "TimeoutError") {
38+
return false;
39+
}
40+
// Network-level fetch failures (gateway briefly unreachable)
41+
return error instanceof TypeError;
42+
}
43+
1544
type ComputeWorkloadManagerOptions = WorkloadManagerOptions & {
1645
gateway: {
1746
url: string;
@@ -165,27 +194,48 @@ export class ComputeWorkloadManager implements WorkloadManager {
165194
const startMs = performance.now();
166195

167196
try {
168-
const [error, data] = await tryCatch(
169-
this.compute.instances.create({
170-
name: runnerId,
171-
image: imageRef,
172-
env: envVars,
173-
cpu: opts.machine.cpu,
174-
memory_gb: opts.machine.memory,
175-
metadata: {
176-
runId: opts.runFriendlyId,
177-
envId: opts.envId,
178-
envType: opts.envType,
179-
orgId: opts.orgId,
180-
projectId: opts.projectId,
181-
deploymentVersion: opts.deploymentVersion,
182-
machine: opts.machine.name,
183-
},
184-
...(Object.keys(labels).length > 0 ? { labels } : {}),
185-
})
186-
);
197+
const createRequest = {
198+
name: runnerId,
199+
image: imageRef,
200+
env: envVars,
201+
cpu: opts.machine.cpu,
202+
memory_gb: opts.machine.memory,
203+
metadata: {
204+
runId: opts.runFriendlyId,
205+
envId: opts.envId,
206+
envType: opts.envType,
207+
orgId: opts.orgId,
208+
projectId: opts.projectId,
209+
deploymentVersion: opts.deploymentVersion,
210+
machine: opts.machine.name,
211+
},
212+
...(Object.keys(labels).length > 0 ? { labels } : {}),
213+
};
214+
215+
// Retry transient placement failures instead of abandoning the run: a
216+
// swallowed create error leaves the run waiting for the run engine's
217+
// PENDING_EXECUTING timeout (minutes) before it is redriven, while a
218+
// retried create typically succeeds in under a second (TRI-10293).
219+
let error: unknown;
220+
let data: Awaited<ReturnType<typeof this.compute.instances.create>> | null | undefined;
221+
let attempt = 1;
222+
for (; attempt <= CREATE_MAX_ATTEMPTS; attempt++) {
223+
[error, data] = await tryCatch(this.compute.instances.create(createRequest));
224+
225+
if (!error) break;
226+
227+
this.logger.warn("create instance attempt failed", {
228+
runnerId,
229+
attempt,
230+
error: error instanceof Error ? error.message : String(error),
231+
});
232+
233+
if (!isRetryableCreateError(error) || attempt === CREATE_MAX_ATTEMPTS) break;
234+
await sleep(CREATE_RETRY_BASE_DELAY_MS * attempt);
235+
}
236+
event.createAttempts = attempt;
187237

188-
if (error) {
238+
if (error || !data) {
189239
event.error = error instanceof Error ? error.message : String(error);
190240
event.errorType =
191241
error instanceof DOMException && error.name === "TimeoutError" ? "timeout" : "fetch";

0 commit comments

Comments
 (0)