vercel
diff --git a/.changeset/four-donuts-glow.md b/.changeset/four-donuts-glow.md
Lines changed: 9 additions & 0 deletions
diff --git a/docs/content/docs/changelog/meta.json b/docs/content/docs/changelog/meta.json
Lines changed: 1 addition & 1 deletion
diff --git a/docs/content/docs/changelog/resilient-start.mdx b/docs/content/docs/changelog/resilient-start.mdx
Lines changed: 327 additions & 0 deletions
diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts
Lines changed: 54 additions & 0 deletions
diff --git a/packages/core/src/runtime.ts b/packages/core/src/runtime.ts
Lines changed: 30 additions & 6 deletions
diff --git a/packages/core/src/runtime/run.ts b/packages/core/src/runtime/run.ts
Lines changed: 29 additions & 1 deletion
diff --git a/packages/core/src/runtime/start.test.ts b/packages/core/src/runtime/start.test.ts
Lines changed: 72 additions & 1 deletion
@@ -0,0 +1,9 @@
+---
+"@workflow/world-postgres": patch
+"@workflow/world-vercel": patch
+"@workflow/world-local": patch
+"@workflow/world": patch
+"@workflow/core": patch
+---
+
+Allow workflow invocation to create the run if the initial storage call in `start` did not succeed. Send the run input through the queue to enable this. Allow creating the `run_created` and `run_started` events together in World, and skip the first event list call by returning events directly.
@@ -1,5 +1,5 @@
 {
   "title": "Changelog",
-  "pages": ["index", "eager-processing"],
+  "pages": ["index", "eager-processing", "resilient-start"],
   "defaultOpen": false
 }
@@ -4,7 +4,9 @@ import { setTimeout as sleep } from 'node:timers/promises';
 import {
   WorkflowRunCancelledError,
   WorkflowRunFailedError,
+  WorkflowWorldError,
 } from '@workflow/errors';
+import type { World } from '@workflow/world';
 import {
   afterAll,
   assert,
@@ -2172,4 +2174,56 @@ describe('e2e', () => {
       expect(returnValue.attempt).toBeGreaterThanOrEqual(1);
     }
   );
+
+  // ============================================================
+  // Resilient start: run completes even when run_created fails
+  // ============================================================
+  // TODO: Switch this to a stream-based workflow (e.g. readableStreamWorkflow)
+  // to also verify that serialization, flushing, and binary data work correctly
+  // over the queue boundary. Currently using addTenWorkflow to avoid the
+  // skipIf(isLocalDeployment()) barrier that stream tests require.
+  test(
+    'resilient start: addTenWorkflow completes when run_created returns 500',
+    { timeout: 60_000 },
+    async () => {
+      // Get the real world and wrap it so that the first events.create call
+      // (run_created) throws a 500 server error. The queue message should
+      // still be dispatched with runInput, and the runtime should bootstrap
+      // the run via the run_started fallback path.
+      const realWorld = getWorld();
+      let createCallCount = 0;
+      const stubbedWorld: World = {
+        ...realWorld,
+        events: {
+          ...realWorld.events,
+          create: (async (...args: Parameters<World['events']['create']>) => {
+            createCallCount++;
+            if (createCallCount === 1) {
+              // Fail the very first call (run_created from start())
+              throw new WorkflowWorldError('Simulated storage outage', {
+                status: 500,
+              });
+            }
+            return realWorld.events.create(...args);
+          }) as World['events']['create'],
+        },
+      };
+
+      const run = await start(await e2e('addTenWorkflow'), [123], {
+        world: stubbedWorld,
+      });
+
+      // Verify the stub intercepted the run_created call. That is the only
+      // call made through the stubbed world — the server-side runtime uses
+      // its own world instance for run_started and subsequent events.
+      expect(createCallCount).toBe(1);
+
+      // The run should still complete despite run_created failing.
+      // The runtime's resilient start path creates the run from
+      // run_started, so returnValue polling may initially get
+      // WorkflowRunNotFoundError before the queue delivers.
+      const returnValue = await run.returnValue;
+      expect(returnValue).toBe(133);
+    }
+  );
 });
@@ -114,6 +114,7 @@ export function workflowEntrypoint(
         runId,
         traceCarrier: traceContext,
         requestedAt,
+        runInput,
       } = WorkflowInvokePayloadSchema.parse(message_);
       const { requestId } = metadata;
       // Extract the workflow name from the topic name
@@ -239,7 +240,7 @@ export function workflowEntrypoint(
                 let workflowStartedAt = -1;
                 let workflowRun: WorkflowRun | undefined;
                 // Pre-loaded events from the run_started response.
-                // When present, we skip the events.list call to reduce TTFB.
+                // When present, we skip the events.list call.
                 let preloadedEvents: Event[] | undefined;
 
                 // --- Infrastructure: prepare the run state ---
@@ -257,7 +258,25 @@ export function workflowEntrypoint(
                     runId,
                     {
                       eventType: 'run_started',
-                      specVersion: SPEC_VERSION_CURRENT,
+                      // Use the spec version from the original start() call
+                      // when available, so the resilient start path creates
+                      // the run with the correct version (not always current).
+                      specVersion:
+                        runInput?.specVersion ?? SPEC_VERSION_CURRENT,
+                      // Pass run input from queue so the server can
+                      // create the run if run_created was missed.
+                      // Uint8Array values survive the queue natively
+                      // (CBOR on world-vercel, JSON reviver on world-local).
+                      ...(runInput
+                        ? {
+                            eventData: {
+                              input: runInput.input,
+                              deploymentId: runInput.deploymentId,
+                              workflowName: runInput.workflowName,
+                              executionContext: runInput.executionContext,
+                            },
+                          }
+                        : {}),
                     },
                     { requestId }
                   );
@@ -268,7 +287,7 @@ export function workflowEntrypoint(
                   }
                   workflowRun = result.run;
 
-                  // If the world returned events, use them to skip
+                  // If the response includes events, use them to skip
                   // the initial events.list call and reduce TTFB.
                   if (result.events && result.events.length > 0) {
                     preloadedEvents = result.events;
@@ -282,13 +301,16 @@ export function workflowEntrypoint(
                 } catch (err) {
                   // Run was concurrently completed/failed/cancelled
                   if (EntityConflictError.is(err) || RunExpiredError.is(err)) {
+                    // EntityConflictError: run was concurrently
+                    // completed/failed/cancelled during setup.
+                    // RunExpiredError: run already in terminal state.
+                    // In both cases, skip processing this message.
                     runtimeLogger.info(
                       'Run already finished during setup, skipping',
                       { workflowRunId: runId, message: err.message }
                     );
                     return;
-                  }
-                  if (err instanceof WorkflowRuntimeError) {
+                  } else if (err instanceof WorkflowRuntimeError) {
                     runtimeLogger.error(
                       'Fatal runtime error during workflow setup',
                       { workflowRunId: runId, error: err.message }
@@ -319,9 +341,11 @@ export function workflowEntrypoint(
                       throw failErr;
                     }
                     return;
+                  } else {
+                    throw err;
                   }
-                  throw err;
                 }
+
                 workflowStartedAt = +workflowRun.startedAt;
 
                 span?.setAttributes({
 
@@ -87,9 +87,19 @@ export class Run<TResult> {
    */
   private encryptionKeyPromise: Promise<CryptoKey | undefined> | null = null;
 
-  constructor(runId: string) {
+  /**
+   * When true, run_created failed and the run may not exist yet (the
+   * resilient start path will create it via run_started). pollReturnValue
+   * retries on WorkflowRunNotFoundError only when this flag is set so
+   * that normal runs fail fast on 404.
+   * @internal
+   */
+  private resilientStart = false;
+
+  constructor(runId: string, opts?: { resilientStart?: boolean }) {
     this.runId = runId;
     this.world = getWorld();
+    this.resilientStart = opts?.resilientStart ?? false;
   }
 
   /**
@@ -243,6 +253,15 @@ export class Run<TResult> {
    * @returns The workflow return value.
    */
   private async pollReturnValue(): Promise<TResult> {
+    // When resilientStart is true, run_created failed and the run may
+    // not exist yet. Retry on WorkflowRunNotFoundError up to 3 times
+    // (1s + 3s + 6s = 10s total) to give the queue time to deliver
+    // and the runtime to create the run via run_started.
+    // When resilientStart is false, 404 is a real error — fail fast.
+    let notFoundRetries = 0;
+    const NOT_FOUND_MAX_RETRIES = this.resilientStart ? 3 : 0;
+    const NOT_FOUND_DELAYS = [1_000, 3_000, 6_000];
+
     while (true) {
       try {
         const run = await this.world.runs.get(this.runId);
@@ -270,6 +289,15 @@ export class Run<TResult> {
           await new Promise((resolve) => setTimeout(resolve, 1_000));
           continue;
         }
+        if (
+          WorkflowRunNotFoundError.is(error) &&
+          notFoundRetries < NOT_FOUND_MAX_RETRIES
+        ) {
+          const delay = NOT_FOUND_DELAYS[notFoundRetries]!;
+          notFoundRetries++;
+          await new Promise((resolve) => setTimeout(resolve, delay));
+          continue;
+        }
         throw error;
       }
     }
 
@@ -1,4 +1,4 @@
-import { WorkflowRuntimeError } from '@workflow/errors';
+import { WorkflowRuntimeError, WorkflowWorldError } from '@workflow/errors';
 import { SPEC_VERSION_CURRENT, SPEC_VERSION_LEGACY } from '@workflow/world';
 import {
   afterEach,
@@ -391,6 +391,77 @@ describe('start', () => {
     });
   });
 
+  describe('resilient start (run_created failure)', () => {
+    const validWorkflow = Object.assign(() => Promise.resolve('result'), {
+      workflowId: 'test-workflow',
+    });
+
+    afterEach(() => {
+      vi.clearAllMocks();
+    });
+
+    it('should succeed when events.create throws a 500 error (queue still dispatched)', async () => {
+      const mockQueue = vi.fn().mockResolvedValue({ messageId: null });
+      const serverError = new WorkflowWorldError('Internal Server Error', {
+        status: 500,
+      });
+      const mockEventsCreate = vi.fn().mockRejectedValue(serverError);
+
+      vi.mocked(getWorld).mockReturnValue({
+        getDeploymentId: vi.fn().mockResolvedValue('deploy_123'),
+        events: { create: mockEventsCreate },
+        queue: mockQueue,
+      } as any);
+
+      // start() should NOT throw — the queue was still dispatched
+      const run = await start(validWorkflow, [42]);
+      expect(run.runId).toMatch(/^wrun_/);
+
+      // Queue should have been called with runInput
+      expect(mockQueue).toHaveBeenCalledTimes(1);
+      const [, queuePayload] = mockQueue.mock.calls[0];
+      expect(queuePayload.runInput).toBeDefined();
+      expect(queuePayload.runInput.deploymentId).toBe('deploy_123');
+      expect(queuePayload.runInput.workflowName).toBe('test-workflow');
+      expect(queuePayload.runInput.specVersion).toBe(SPEC_VERSION_CURRENT);
+    });
+
+    it('should throw when queue fails even if events.create succeeds', async () => {
+      const mockEventsCreate = vi.fn().mockResolvedValue({
+        run: { runId: 'wrun_test', status: 'pending' },
+      });
+      const mockQueue = vi
+        .fn()
+        .mockRejectedValue(new Error('Queue unavailable'));
+
+      vi.mocked(getWorld).mockReturnValue({
+        getDeploymentId: vi.fn().mockResolvedValue('deploy_123'),
+        events: { create: mockEventsCreate },
+        queue: mockQueue,
+      } as any);
+
+      await expect(start(validWorkflow, [])).rejects.toThrow(
+        'Queue unavailable'
+      );
+    });
+
+    it('should throw when events.create fails with a non-retryable error (e.g. 400)', async () => {
+      const badRequest = new WorkflowWorldError('Bad Request', {
+        status: 400,
+      });
+      const mockEventsCreate = vi.fn().mockRejectedValue(badRequest);
+      const mockQueue = vi.fn().mockResolvedValue({ messageId: null });
+
+      vi.mocked(getWorld).mockReturnValue({
+        getDeploymentId: vi.fn().mockResolvedValue('deploy_123'),
+        events: { create: mockEventsCreate },
+        queue: mockQueue,
+      } as any);
+
+      await expect(start(validWorkflow, [])).rejects.toThrow('Bad Request');
+    });
+  });
+
   describe('overload type inference', () => {
     // Type-only assertions that don't execute start() at runtime.
     // We use expectTypeOf on the function signature's return type directly.
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`	`2`	`"title": "Changelog",`
`3`		`- "pages": ["index", "eager-processing"],`
	`3`	`+ "pages": ["index", "eager-processing", "resilient-start"],`
`4`	`4`	`"defaultOpen": false`
`5`	`5`	`}`