Skip to content

Commit 461dc89

Browse files
committed
fix: suffix retried instance create names to dodge stale registrations
A failed create can leave its instance name registered gateway/fcrun-side until async cleanup runs, so a same-name retry can 409 against our own residue (observed: tap-EBUSY 500 at 18:29Z followed by 409 name_conflict on the retry 2.7s later, costing the full redrive anyway). Give retry attempts a deterministic -rN suffix; attempt 1 keeps the unsuffixed name so the non-retry path is unchanged. The suffixed name flows into both the instance name and TRIGGER_RUNNER_ID from the same variable - every downstream flow (suspend scheduling, snapshot dispatch, cancel guards, run-engine fields) treats it as one opaque self-reported token, and restored VMs already carry deterministic name suffixes. Temporary measure (TRI-10293): the proper fix is gateway-side cleanup of failed-create registrations.
1 parent 25f868e commit 461dc89

2 files changed

Lines changed: 55 additions & 4 deletions

File tree

apps/supervisor/src/workloadManager/compute.test.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,17 @@
11
import { describe, expect, it } from "vitest";
22
import { ComputeClientError } from "@internal/compute";
3-
import { isRetryableCreateError } from "./compute.js";
3+
import { isRetryableCreateError, runnerNameForAttempt } from "./compute.js";
4+
5+
describe("runnerNameForAttempt", () => {
6+
it("keeps the unsuffixed name for the first attempt", () => {
7+
expect(runnerNameForAttempt("runner-abc123", 1)).toBe("runner-abc123");
8+
});
9+
10+
it("suffixes retry attempts deterministically", () => {
11+
expect(runnerNameForAttempt("runner-abc123", 2)).toBe("runner-abc123-r2");
12+
expect(runnerNameForAttempt("runner-abc123", 3)).toBe("runner-abc123-r3");
13+
});
14+
});
415

516
describe("isRetryableCreateError", () => {
617
it("retries statuses where the create definitely did not commit", () => {

apps/supervisor/src/workloadManager/compute.ts

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,23 @@ import { encodeBaggage, fromContext } from "../wideEvents/index.js";
1616
const CREATE_MAX_ATTEMPTS = 3;
1717
const CREATE_RETRY_BASE_DELAY_MS = 250;
1818

19+
/**
20+
* TEMPORARY (TRI-10293): a failed create can leave its instance name
21+
* registered gateway/fcrun-side until async cleanup runs, so a same-name
22+
* retry can 409 against our own residue. Until the gateway cleans up
23+
* failed-create registrations properly, retry attempts get a deterministic
24+
* suffix. Attempt 1 keeps the unsuffixed name so the non-retry path is
25+
* unchanged; the suffixed name flows into both the instance name and
26+
* TRIGGER_RUNNER_ID, which downstream flows treat as one opaque
27+
* self-reported token. Only attempts following a ComputeClientError are
28+
* suffixed - network-failure retries keep the same name on purpose, because
29+
* the gateway's name-collision 409 is their safety net against
30+
* double-creating an instance whose create response was lost.
31+
*/
32+
export function runnerNameForAttempt(runnerId: string, attempt: number): string {
33+
return attempt === 1 ? runnerId : `${runnerId}-r${attempt}`;
34+
}
35+
1936
/**
2037
* Whether a failed instance create is worth retrying. Only statuses where
2138
* the create definitely did NOT commit are retried: 500 means the agent or
@@ -219,13 +236,36 @@ export class ComputeWorkloadManager implements WorkloadManager {
219236
let error: unknown;
220237
let data: Awaited<ReturnType<typeof this.compute.instances.create>> | null | undefined;
221238
let attempt = 1;
239+
// Set after a ComputeClientError: the failed create may have left its
240+
// name registered, so subsequent attempts use a suffixed name.
241+
let suffixAttempts = false;
222242
for (; attempt <= CREATE_MAX_ATTEMPTS; attempt++) {
223-
[error, data] = await tryCatch(this.compute.instances.create(createRequest));
243+
const attemptRunnerId = suffixAttempts
244+
? runnerNameForAttempt(runnerId, attempt)
245+
: runnerId;
246+
[error, data] = await tryCatch(
247+
this.compute.instances.create(
248+
attemptRunnerId === runnerId
249+
? createRequest
250+
: {
251+
...createRequest,
252+
name: attemptRunnerId,
253+
env: { ...envVars, TRIGGER_RUNNER_ID: attemptRunnerId },
254+
}
255+
)
256+
);
257+
258+
if (!error) {
259+
event.runnerId = attemptRunnerId;
260+
break;
261+
}
224262

225-
if (!error) break;
263+
if (error instanceof ComputeClientError) {
264+
suffixAttempts = true;
265+
}
226266

227267
this.logger.warn("create instance attempt failed", {
228-
runnerId,
268+
runnerId: attemptRunnerId,
229269
attempt,
230270
error: error instanceof Error ? error.message : String(error),
231271
});

0 commit comments

Comments
 (0)