Skip to content

Commit af4962b

Browse files
committed
feat: make instance create retry attempts and backoff configurable
COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS (default 3; 1 disables retries) and COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS (default 250 ms) are threaded through ComputeWorkloadManagerOptions.createRetry. The delay before each retry is the base delay multiplied by the attempt number. Behavior is unchanged at the defaults.
1 parent 461dc89 commit af4962b

3 files changed

Lines changed: 27 additions & 5 deletions

File tree

apps/supervisor/src/env.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,14 @@ const Env = z
114114
COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset)
115115
COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000),
116116
COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10),
117+
// Instance create retries for transient placement failures (1 = no retries)
118+
COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS: z.coerce.number().int().min(1).max(10).default(3),
119+
COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS: z.coerce
120+
.number()
121+
.int()
122+
.min(0)
123+
.max(10_000)
124+
.default(250),
117125

118126
// Kubernetes settings
119127
KUBERNETES_FORCE_ENABLED: BoolEnv.default(false),

apps/supervisor/src/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ class ManagedSupervisor {
144144
otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT,
145145
prettyLogs: env.RUNNER_PRETTY_LOGS,
146146
},
147+
createRetry: {
148+
maxAttempts: env.COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS,
149+
baseDelayMs: env.COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS,
150+
},
147151
});
148152
this.computeManager = computeManager;
149153
this.workloadManager = computeManager;

apps/supervisor/src/workloadManager/compute.ts

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ import type { OtlpTraceService } from "../services/otlpTraceService.js";
1313
import { tryCatch } from "@trigger.dev/core";
1414
import { encodeBaggage, fromContext } from "../wideEvents/index.js";
1515

16-
const CREATE_MAX_ATTEMPTS = 3;
17-
const CREATE_RETRY_BASE_DELAY_MS = 250;
16+
const DEFAULT_CREATE_MAX_ATTEMPTS = 3;
17+
const DEFAULT_CREATE_RETRY_BASE_DELAY_MS = 250;
1818

1919
/**
2020
* TEMPORARY (TRI-10293): a failed create can leave its instance name
@@ -76,13 +76,23 @@ type ComputeWorkloadManagerOptions = WorkloadManagerOptions & {
7676
otelEndpoint: string;
7777
prettyLogs: boolean;
7878
};
79+
createRetry?: {
80+
maxAttempts: number;
81+
baseDelayMs: number;
82+
};
7983
};
8084

8185
export class ComputeWorkloadManager implements WorkloadManager {
8286
private readonly logger = new SimpleStructuredLogger("compute-workload-manager");
8387
private readonly compute: ComputeClient;
88+
private readonly createMaxAttempts: number;
89+
private readonly createRetryBaseDelayMs: number;
8490

8591
constructor(private opts: ComputeWorkloadManagerOptions) {
92+
this.createMaxAttempts = opts.createRetry?.maxAttempts ?? DEFAULT_CREATE_MAX_ATTEMPTS;
93+
this.createRetryBaseDelayMs =
94+
opts.createRetry?.baseDelayMs ?? DEFAULT_CREATE_RETRY_BASE_DELAY_MS;
95+
8696
if (opts.workloadApiDomain) {
8797
this.logger.warn("⚠️ Custom workload API domain", {
8898
domain: opts.workloadApiDomain,
@@ -239,7 +249,7 @@ export class ComputeWorkloadManager implements WorkloadManager {
239249
// Set after a ComputeClientError: the failed create may have left its
240250
// name registered, so subsequent attempts use a suffixed name.
241251
let suffixAttempts = false;
242-
for (; attempt <= CREATE_MAX_ATTEMPTS; attempt++) {
252+
for (; attempt <= this.createMaxAttempts; attempt++) {
243253
const attemptRunnerId = suffixAttempts
244254
? runnerNameForAttempt(runnerId, attempt)
245255
: runnerId;
@@ -270,8 +280,8 @@ export class ComputeWorkloadManager implements WorkloadManager {
270280
error: error instanceof Error ? error.message : String(error),
271281
});
272282

273-
if (!isRetryableCreateError(error) || attempt === CREATE_MAX_ATTEMPTS) break;
274-
await sleep(CREATE_RETRY_BASE_DELAY_MS * attempt);
283+
if (!isRetryableCreateError(error) || attempt === this.createMaxAttempts) break;
284+
await sleep(this.createRetryBaseDelayMs * attempt);
275285
}
276286
event.createAttempts = attempt;
277287

0 commit comments

Comments
 (0)