Skip to content
Merged
6 changes: 6 additions & 0 deletions .changeset/chat-boot-cursor.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@trigger.dev/sdk": patch
"@trigger.dev/core": patch
---

Continuation chat boots no longer stall for around 10 seconds before the first turn. The `session.in` resume cursor is now found with a non-blocking records read instead of draining an SSE long-poll (which always waited out its full 5 second inactivity window, twice per boot), the boot reads run concurrently, and chat snapshots carry the cursor so subsequent boots skip the scan entirely.
5 changes: 5 additions & 0 deletions .changeset/chat-headstart-hydrate.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@trigger.dev/sdk": patch
---

Fix `chat.headStart` when `hydrateMessages` is registered. The warm route's step-1 partial now reaches the agent's accumulator on the hydrate path, so `onTurnComplete` carries the full first turn (the head-start user message included), tool-call handovers resume from step 2 instead of re-running step 1, and the assistant `messageId` stays stable across the handover.
5 changes: 5 additions & 0 deletions .changeset/chat-headstart-reasoning.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@trigger.dev/sdk": patch
---

Preserve reasoning parts across the `chat.headStart` handover. Extended-thinking models' step-1 reasoning now lands in the durable session history (and `onTurnComplete`) under the same assistant `messageId`, with provider metadata intact so Anthropic thinking signatures survive replays.
6 changes: 6 additions & 0 deletions .server-changes/cancel-stale-delayed-snapshots.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: supervisor
type: fix
---

Cancel pending delayed snapshots when a run completes or disconnects, preventing stale snapshots from pausing microVMs that have moved on to new work.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: webapp
type: fix
---

Speed up the environment variables page for projects with many archived preview branches. The page now only loads variable values for the environments it displays instead of every value ever created, including those left behind by archived branches.
6 changes: 6 additions & 0 deletions .server-changes/hipaa-addon-pricing-cta.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: webapp
type: feature
---

Request a HIPAA BAA add-on directly from any paid pricing tier in the dashboard.
6 changes: 6 additions & 0 deletions .server-changes/retry-transient-instance-create-failures.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: supervisor
type: fix
---

Retry transient instance create failures during cold starts instead of waiting minutes for the run to be requeued.
6 changes: 6 additions & 0 deletions .server-changes/trace-page-payload-diet.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: webapp
type: improvement
---

Shrinks the run trace page loader payload by keeping raw span events server-side and makes large trace trees render more efficiently. Also adds an optional `TRACE_VIEW_EMERGENCY_SPAN_CAP` env var that clamps trace summary and detailed summary span limits on both event store paths.
8 changes: 8 additions & 0 deletions apps/supervisor/src/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,14 @@ const Env = z
COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset)
COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000),
COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10),
// Instance create retries for transient placement failures (1 = no retries)
COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS: z.coerce.number().int().min(1).max(10).default(3),
COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS: z.coerce
.number()
.int()
.min(0)
.max(10_000)
.default(250),

// Kubernetes settings
KUBERNETES_FORCE_ENABLED: BoolEnv.default(false),
Expand Down
4 changes: 4 additions & 0 deletions apps/supervisor/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ class ManagedSupervisor {
otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT,
prettyLogs: env.RUNNER_PRETTY_LOGS,
},
createRetry: {
maxAttempts: env.COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS,
baseDelayMs: env.COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS,
},
});
this.computeManager = computeManager;
this.workloadManager = computeManager;
Expand Down
130 changes: 130 additions & 0 deletions apps/supervisor/src/services/computeSnapshotService.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import { describe, expect, it, vi } from "vitest";
import { setTimeout as sleep } from "node:timers/promises";
import { ComputeSnapshotService } from "./computeSnapshotService.js";
import type { ComputeWorkloadManager } from "../workloadManager/compute.js";
import type { SupervisorHttpClient } from "@trigger.dev/core/v3/workers";

// The TimerWheel ticks every 100ms, so a 200ms delay dispatches within ~300ms.
const DELAY_MS = 200;
// Long enough that a pending snapshot would certainly have dispatched.
const SETTLE_MS = 600;

function createService() {
const snapshot = vi.fn(async (_opts: { runnerId: string; metadata: Record<string, string> }) => true);

const computeManager = {
snapshotDelayMs: DELAY_MS,
snapshotDispatchLimit: 1,
snapshot,
} as unknown as ComputeWorkloadManager;

const service = new ComputeSnapshotService({
computeManager,
workerClient: {} as SupervisorHttpClient,
wideEventOpts: { service: "supervisor-test", env: {}, enabled: false },
});

return { service, snapshot };
}

function delayedSnapshot(runnerId = "runner-1") {
return {
runnerId,
runFriendlyId: "run_1",
snapshotFriendlyId: "snapshot_1",
};
}

describe("ComputeSnapshotService", () => {
it("dispatches a scheduled snapshot after the delay", async () => {
const { service, snapshot } = createService();
try {
service.schedule("run_1", delayedSnapshot());

await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 });
expect(snapshot).toHaveBeenCalledWith({
runnerId: "runner-1",
metadata: { runId: "run_1", snapshotFriendlyId: "snapshot_1" },
});
} finally {
service.stop();
}
});

it("cancel before the delay expires prevents the dispatch", async () => {
const { service, snapshot } = createService();
try {
service.schedule("run_1", delayedSnapshot());

expect(service.cancel("run_1")).toBe(true);

await sleep(SETTLE_MS);
expect(snapshot).not.toHaveBeenCalled();
} finally {
service.stop();
}
});

it("cancel returns false when nothing is pending", () => {
const { service } = createService();
try {
expect(service.cancel("run_1")).toBe(false);
} finally {
service.stop();
}
});

it("cancel with a matching runnerId cancels the pending snapshot", async () => {
const { service, snapshot } = createService();
try {
service.schedule("run_1", delayedSnapshot("runner-a"));

expect(service.cancel("run_1", "runner-a")).toBe(true);

await sleep(SETTLE_MS);
expect(snapshot).not.toHaveBeenCalled();
} finally {
service.stop();
}
});

it("cancel with a different runnerId leaves the pending snapshot alone", async () => {
const { service, snapshot } = createService();
try {
service.schedule("run_1", delayedSnapshot("runner-a"));

// A stale runner for a reassigned run must not cancel the new runner's snapshot.
expect(service.cancel("run_1", "runner-b")).toBe(false);

await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 });
expect(snapshot).toHaveBeenCalledWith(
expect.objectContaining({ runnerId: "runner-a" })
);
} finally {
service.stop();
}
});

it("re-scheduling the same run replaces the pending snapshot", async () => {
const { service, snapshot } = createService();
try {
service.schedule("run_1", delayedSnapshot());
service.schedule("run_1", {
runnerId: "runner-1",
runFriendlyId: "run_1",
snapshotFriendlyId: "snapshot_2",
});

await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 });
await sleep(SETTLE_MS);

expect(snapshot).toHaveBeenCalledTimes(1);
expect(snapshot).toHaveBeenCalledWith({
runnerId: "runner-1",
metadata: { runId: "run_1", snapshotFriendlyId: "snapshot_2" },
});
} finally {
service.stop();
}
});
});
15 changes: 13 additions & 2 deletions apps/supervisor/src/services/computeSnapshotService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,19 @@ export class ComputeSnapshotService {
});
}

/** Cancel a pending delayed snapshot. Returns true if one was cancelled. */
cancel(runFriendlyId: string): boolean {
/**
* Cancel a pending delayed snapshot. Returns true if one was cancelled.
* When `runnerId` is given, only a snapshot scheduled for that same runner
* is cancelled - a stale runner for a run that has since been reassigned
* must not cancel the new runner's pending snapshot.
*/
cancel(runFriendlyId: string, runnerId?: string): boolean {
if (runnerId) {
const pending = this.timerWheel.peek(runFriendlyId);
if (pending && pending.data.runnerId !== runnerId) {
return false;
}
}
const cancelled = this.timerWheel.cancel(runFriendlyId);
if (cancelled) {
emitOneShot({
Expand Down
17 changes: 17 additions & 0 deletions apps/supervisor/src/services/timerWheel.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,23 @@ describe("TimerWheel", () => {
wheel.stop();
});

it("peek returns the pending item without removing it", () => {
const wheel = new TimerWheel<string>({ delayMs: 3000, onExpire: () => {} });

wheel.start();
wheel.submit("run-1", "data");

expect(wheel.peek("run-1")).toEqual({ key: "run-1", data: "data" });
expect(wheel.size).toBe(1);
expect(wheel.peek("run-2")).toBeUndefined();

// Dispatched items are no longer peekable
vi.advanceTimersByTime(3100);
expect(wheel.peek("run-1")).toBeUndefined();

wheel.stop();
});

it("cancel returns false for unknown key", () => {
const wheel = new TimerWheel<string>({
delayMs: 3000,
Expand Down
6 changes: 6 additions & 0 deletions apps/supervisor/src/services/timerWheel.ts
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,12 @@ export class TimerWheel<T> {
return true;
}

/** Look up a pending item without removing it. */
peek(key: string): TimerWheelItem<T> | undefined {
const entry = this.entries.get(key);
return entry ? { key, data: entry.data } : undefined;
}

/** Number of pending items in the wheel. */
get size(): number {
return this.entries.size;
Expand Down
56 changes: 56 additions & 0 deletions apps/supervisor/src/workloadManager/compute.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import { describe, expect, it } from "vitest";
import { ComputeClientError } from "@internal/compute";
import { isRetryableCreateError, runnerNameForAttempt } from "./compute.js";

describe("runnerNameForAttempt", () => {
it("keeps the unsuffixed name for the first attempt", () => {
expect(runnerNameForAttempt("runner-abc123", 1)).toBe("runner-abc123");
});

it("suffixes retry attempts deterministically", () => {
expect(runnerNameForAttempt("runner-abc123", 2)).toBe("runner-abc123-r2");
expect(runnerNameForAttempt("runner-abc123", 3)).toBe("runner-abc123-r3");
});
});

describe("isRetryableCreateError", () => {
it("retries statuses where the create definitely did not commit", () => {
expect(isRetryableCreateError(new ComputeClientError(500, "tap busy", "http://gw"))).toBe(
true
);
expect(isRetryableCreateError(new ComputeClientError(503, "no placement", "http://gw"))).toBe(
true
);
});

it("does not retry lost-response statuses (create may have committed)", () => {
expect(isRetryableCreateError(new ComputeClientError(502, "bad gateway", "http://gw"))).toBe(
false
);
expect(
isRetryableCreateError(new ComputeClientError(504, "gateway timeout", "http://gw"))
).toBe(false);
});

it("does not retry 4xx responses", () => {
expect(isRetryableCreateError(new ComputeClientError(400, "bad request", "http://gw"))).toBe(
false
);
expect(isRetryableCreateError(new ComputeClientError(409, "conflict", "http://gw"))).toBe(
false
);
});

it("does not retry timeouts (instance may still be provisioning)", () => {
expect(isRetryableCreateError(new DOMException("timed out", "TimeoutError"))).toBe(false);
});

it("retries network-level fetch failures", () => {
expect(isRetryableCreateError(new TypeError("fetch failed"))).toBe(true);
});

it("does not retry unknown errors", () => {
expect(isRetryableCreateError(new Error("something else"))).toBe(false);
expect(isRetryableCreateError("string error")).toBe(false);
});
});
Loading
Loading