Dustin4444 · pull · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.changeset/chat-boot-cursor.md b/.changeset/chat-boot-cursor.md
@@ -0,0 +1,6 @@
+---
+"@trigger.dev/sdk": patch
+"@trigger.dev/core": patch
+---
+
+Continuation chat boots no longer stall for around 10 seconds before the first turn. The `session.in` resume cursor is now found with a non-blocking records read instead of draining an SSE long-poll (which always waited out its full 5 second inactivity window, twice per boot), the boot reads run concurrently, and chat snapshots carry the cursor so subsequent boots skip the scan entirely.
diff --git a/.changeset/chat-headstart-hydrate.md b/.changeset/chat-headstart-hydrate.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/sdk": patch
+---
+
+Fix `chat.headStart` when `hydrateMessages` is registered. The warm route's step-1 partial now reaches the agent's accumulator on the hydrate path, so `onTurnComplete` carries the full first turn (the head-start user message included), tool-call handovers resume from step 2 instead of re-running step 1, and the assistant `messageId` stays stable across the handover.
diff --git a/.changeset/chat-headstart-reasoning.md b/.changeset/chat-headstart-reasoning.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/sdk": patch
+---
+
+Preserve reasoning parts across the `chat.headStart` handover. Extended-thinking models' step-1 reasoning now lands in the durable session history (and `onTurnComplete`) under the same assistant `messageId`, with provider metadata intact so Anthropic thinking signatures survive replays.
diff --git a/.server-changes/cancel-stale-delayed-snapshots.md b/.server-changes/cancel-stale-delayed-snapshots.md
@@ -0,0 +1,6 @@
+---
+area: supervisor
+type: fix
+---
+
+Cancel pending delayed snapshots when a run completes or disconnects, preventing stale snapshots from pausing microVMs that have moved on to new work.
diff --git a/.server-changes/env-vars-page-scope-values-to-visible-environments.md b/.server-changes/env-vars-page-scope-values-to-visible-environments.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: fix
+---
+
+Speed up the environment variables page for projects with many archived preview branches. The page now only loads variable values for the environments it displays instead of every value ever created, including those left behind by archived branches.
diff --git a/.server-changes/hipaa-addon-pricing-cta.md b/.server-changes/hipaa-addon-pricing-cta.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: feature
+---
+
+Request a HIPAA BAA add-on directly from any paid pricing tier in the dashboard.
diff --git a/.server-changes/retry-transient-instance-create-failures.md b/.server-changes/retry-transient-instance-create-failures.md
@@ -0,0 +1,6 @@
+---
+area: supervisor
+type: fix
+---
+
+Retry transient instance create failures during cold starts instead of waiting minutes for the run to be requeued.
diff --git a/.server-changes/trace-page-payload-diet.md b/.server-changes/trace-page-payload-diet.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: improvement
+---
+
+Shrinks the run trace page loader payload by keeping raw span events server-side and makes large trace trees render more efficiently. Also adds an optional `TRACE_VIEW_EMERGENCY_SPAN_CAP` env var that clamps trace summary and detailed summary span limits on both event store paths.
diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts
@@ -114,6 +114,14 @@ const Env = z
     COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset)
     COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000),
     COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10),
+    // Instance create retries for transient placement failures (1 = no retries)
+    COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS: z.coerce.number().int().min(1).max(10).default(3),
+    COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS: z.coerce
+      .number()
+      .int()
+      .min(0)
+      .max(10_000)
+      .default(250),
 
     // Kubernetes settings
     KUBERNETES_FORCE_ENABLED: BoolEnv.default(false),

diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts
@@ -144,6 +144,10 @@ class ManagedSupervisor {
           otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT,
           prettyLogs: env.RUNNER_PRETTY_LOGS,
         },
+        createRetry: {
+          maxAttempts: env.COMPUTE_INSTANCE_CREATE_MAX_ATTEMPTS,
+          baseDelayMs: env.COMPUTE_INSTANCE_CREATE_RETRY_BASE_DELAY_MS,
+        },
       });
       this.computeManager = computeManager;
       this.workloadManager = computeManager;

diff --git a/apps/supervisor/src/services/computeSnapshotService.test.ts b/apps/supervisor/src/services/computeSnapshotService.test.ts
@@ -0,0 +1,130 @@
+import { describe, expect, it, vi } from "vitest";
+import { setTimeout as sleep } from "node:timers/promises";
+import { ComputeSnapshotService } from "./computeSnapshotService.js";
+import type { ComputeWorkloadManager } from "../workloadManager/compute.js";
+import type { SupervisorHttpClient } from "@trigger.dev/core/v3/workers";
+
+// The TimerWheel ticks every 100ms, so a 200ms delay dispatches within ~300ms.
+const DELAY_MS = 200;
+// Long enough that a pending snapshot would certainly have dispatched.
+const SETTLE_MS = 600;
+
+function createService() {
+  const snapshot = vi.fn(async (_opts: { runnerId: string; metadata: Record<string, string> }) => true);
+
+  const computeManager = {
+    snapshotDelayMs: DELAY_MS,
+    snapshotDispatchLimit: 1,
+    snapshot,
+  } as unknown as ComputeWorkloadManager;
+
+  const service = new ComputeSnapshotService({
+    computeManager,
+    workerClient: {} as SupervisorHttpClient,
+    wideEventOpts: { service: "supervisor-test", env: {}, enabled: false },
+  });
+
+  return { service, snapshot };
+}
+
+function delayedSnapshot(runnerId = "runner-1") {
+  return {
+    runnerId,
+    runFriendlyId: "run_1",
+    snapshotFriendlyId: "snapshot_1",
+  };
+}
+
+describe("ComputeSnapshotService", () => {
+  it("dispatches a scheduled snapshot after the delay", async () => {
+    const { service, snapshot } = createService();
+    try {
+      service.schedule("run_1", delayedSnapshot());
+
+      await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 });
+      expect(snapshot).toHaveBeenCalledWith({
+        runnerId: "runner-1",
+        metadata: { runId: "run_1", snapshotFriendlyId: "snapshot_1" },
+      });
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("cancel before the delay expires prevents the dispatch", async () => {
+    const { service, snapshot } = createService();
+    try {
+      service.schedule("run_1", delayedSnapshot());
+
+      expect(service.cancel("run_1")).toBe(true);
+
+      await sleep(SETTLE_MS);
+      expect(snapshot).not.toHaveBeenCalled();
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("cancel returns false when nothing is pending", () => {
+    const { service } = createService();
+    try {
+      expect(service.cancel("run_1")).toBe(false);
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("cancel with a matching runnerId cancels the pending snapshot", async () => {
+    const { service, snapshot } = createService();
+    try {
+      service.schedule("run_1", delayedSnapshot("runner-a"));
+
+      expect(service.cancel("run_1", "runner-a")).toBe(true);
+
+      await sleep(SETTLE_MS);
+      expect(snapshot).not.toHaveBeenCalled();
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("cancel with a different runnerId leaves the pending snapshot alone", async () => {
+    const { service, snapshot } = createService();
+    try {
+      service.schedule("run_1", delayedSnapshot("runner-a"));
+
+      // A stale runner for a reassigned run must not cancel the new runner's snapshot.
+      expect(service.cancel("run_1", "runner-b")).toBe(false);
+
+      await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 });
+      expect(snapshot).toHaveBeenCalledWith(
+        expect.objectContaining({ runnerId: "runner-a" })
+      );
+    } finally {
+      service.stop();
+    }
+  });
+
+  it("re-scheduling the same run replaces the pending snapshot", async () => {
+    const { service, snapshot } = createService();
+    try {
+      service.schedule("run_1", delayedSnapshot());
+      service.schedule("run_1", {
+        runnerId: "runner-1",
+        runFriendlyId: "run_1",
+        snapshotFriendlyId: "snapshot_2",
+      });
+
+      await vi.waitFor(() => expect(snapshot).toHaveBeenCalledTimes(1), { timeout: 2_000 });
+      await sleep(SETTLE_MS);
+
+      expect(snapshot).toHaveBeenCalledTimes(1);
+      expect(snapshot).toHaveBeenCalledWith({
+        runnerId: "runner-1",
+        metadata: { runId: "run_1", snapshotFriendlyId: "snapshot_2" },
+      });
+    } finally {
+      service.stop();
+    }
+  });
+});
diff --git a/apps/supervisor/src/services/computeSnapshotService.ts b/apps/supervisor/src/services/computeSnapshotService.ts
@@ -92,8 +92,19 @@ export class ComputeSnapshotService {
     });
   }
 
-  /** Cancel a pending delayed snapshot. Returns true if one was cancelled. */
-  cancel(runFriendlyId: string): boolean {
+  /**
+   * Cancel a pending delayed snapshot. Returns true if one was cancelled.
+   * When `runnerId` is given, only a snapshot scheduled for that same runner
+   * is cancelled - a stale runner for a run that has since been reassigned
+   * must not cancel the new runner's pending snapshot.
+   */
+  cancel(runFriendlyId: string, runnerId?: string): boolean {
+    if (runnerId) {
+      const pending = this.timerWheel.peek(runFriendlyId);
+      if (pending && pending.data.runnerId !== runnerId) {
+        return false;
+      }
+    }
     const cancelled = this.timerWheel.cancel(runFriendlyId);
     if (cancelled) {
       emitOneShot({

diff --git a/apps/supervisor/src/services/timerWheel.test.ts b/apps/supervisor/src/services/timerWheel.test.ts
@@ -51,6 +51,23 @@ describe("TimerWheel", () => {
     wheel.stop();
   });
 
+  it("peek returns the pending item without removing it", () => {
+    const wheel = new TimerWheel<string>({ delayMs: 3000, onExpire: () => {} });
+
+    wheel.start();
+    wheel.submit("run-1", "data");
+
+    expect(wheel.peek("run-1")).toEqual({ key: "run-1", data: "data" });
+    expect(wheel.size).toBe(1);
+    expect(wheel.peek("run-2")).toBeUndefined();
+
+    // Dispatched items are no longer peekable
+    vi.advanceTimersByTime(3100);
+    expect(wheel.peek("run-1")).toBeUndefined();
+
+    wheel.stop();
+  });
+
   it("cancel returns false for unknown key", () => {
     const wheel = new TimerWheel<string>({
       delayMs: 3000,

diff --git a/apps/supervisor/src/services/timerWheel.ts b/apps/supervisor/src/services/timerWheel.ts
@@ -121,6 +121,12 @@ export class TimerWheel<T> {
     return true;
   }
 
+  /** Look up a pending item without removing it. */
+  peek(key: string): TimerWheelItem<T> | undefined {
+    const entry = this.entries.get(key);
+    return entry ? { key, data: entry.data } : undefined;
+  }
+
   /** Number of pending items in the wheel. */
   get size(): number {
     return this.entries.size;

diff --git a/apps/supervisor/src/workloadManager/compute.test.ts b/apps/supervisor/src/workloadManager/compute.test.ts
@@ -0,0 +1,56 @@
+import { describe, expect, it } from "vitest";
+import { ComputeClientError } from "@internal/compute";
+import { isRetryableCreateError, runnerNameForAttempt } from "./compute.js";
+
+describe("runnerNameForAttempt", () => {
+  it("keeps the unsuffixed name for the first attempt", () => {
+    expect(runnerNameForAttempt("runner-abc123", 1)).toBe("runner-abc123");
+  });
+
+  it("suffixes retry attempts deterministically", () => {
+    expect(runnerNameForAttempt("runner-abc123", 2)).toBe("runner-abc123-r2");
+    expect(runnerNameForAttempt("runner-abc123", 3)).toBe("runner-abc123-r3");
+  });
+});
+
+describe("isRetryableCreateError", () => {
+  it("retries statuses where the create definitely did not commit", () => {
+    expect(isRetryableCreateError(new ComputeClientError(500, "tap busy", "http://gw"))).toBe(
+      true
+    );
+    expect(isRetryableCreateError(new ComputeClientError(503, "no placement", "http://gw"))).toBe(
+      true
+    );
+  });
+
+  it("does not retry lost-response statuses (create may have committed)", () => {
+    expect(isRetryableCreateError(new ComputeClientError(502, "bad gateway", "http://gw"))).toBe(
+      false
+    );
+    expect(
+      isRetryableCreateError(new ComputeClientError(504, "gateway timeout", "http://gw"))
+    ).toBe(false);
+  });
+
+  it("does not retry 4xx responses", () => {
+    expect(isRetryableCreateError(new ComputeClientError(400, "bad request", "http://gw"))).toBe(
+      false
+    );
+    expect(isRetryableCreateError(new ComputeClientError(409, "conflict", "http://gw"))).toBe(
+      false
+    );
+  });
+
+  it("does not retry timeouts (instance may still be provisioning)", () => {
+    expect(isRetryableCreateError(new DOMException("timed out", "TimeoutError"))).toBe(false);
+  });
+
+  it("retries network-level fetch failures", () => {
+    expect(isRetryableCreateError(new TypeError("fetch failed"))).toBe(true);
+  });
+
+  it("does not retry unknown errors", () => {
+    expect(isRetryableCreateError(new Error("something else"))).toBe(false);
+    expect(isRetryableCreateError("string error")).toBe(false);
+  });
+});