test(e2e): fix flaky multimodal + approval-flow specs (#818) (#819)

tombeckenham · claude · web-flow · commit 614576cb605e · 2026-06-24T14:54:11.000+10:00
The multimodal and approval-flow E2E specs intermittently failed with empty
assistant responses (surfaced as `chatStream fatal`). Root cause was a
test-harness race, not aimock or the library — every request the harness
actually sent succeeded.

Multimodal: `sendMessageWithImage` typed into a controlled React input and then
attached the image, which auto-sends using that input's value. Under CPU load
this failed in two ways. First, `pressSequentially` dropped leading characters,
so the prompt reached aimock truncated (e.g. "cribe this image") and 404'd as
"No fixture matched". Second, React state could lag the committed DOM value, so
the auto-send fired with empty text and dispatched no request at all.

- helpers: type until the full prompt is committed, then retry the typing +
  attach until the send actually fires (user bubble renders).
- ChatUI: the image auto-send reads the live input DOM value instead of
  possibly-stale React state.

Approval-flow: `runTest` treated the optimistic user-message bump as "run
started", returning before any real stream activity — a stalled run then timed
out waiting for an approval that never appeared.

- runTest: require real stream activity (loading on, a tool call, completion, or
  an assistant message) before returning, and retry the click otherwise.

Verified: multimodal 200/200 with 0 flaky at 20x (4 workers, retries=2) and at
6 workers/retries=0; approval-flow 450/450 at 25x/8 workers/retries=0; full E2E
suite green.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/testing/e2e/src/components/ChatUI.tsx b/testing/e2e/src/components/ChatUI.tsx
@@ -40,6 +40,7 @@ export function ChatUI({
 }: ChatUIProps) {
   const [input, setInput] = useState('')
   const messagesRef = useRef<HTMLDivElement>(null)
+  const inputRef = useRef<HTMLInputElement>(null)
 
   useEffect(() => {
     if (messagesRef.current) {
@@ -203,14 +204,20 @@ export function ChatUI({
             className="text-xs text-gray-400"
             onChange={(e) => {
               const file = e.target.files?.[0]
-              if (file && input.trim() && onSendMessageWithImage) {
-                onSendMessageWithImage(input.trim(), file)
+              // Read the prompt from the live input DOM value rather than the
+              // `input` React state. Attaching a file auto-sends, and under
+              // load a controlled input's state can lag the committed DOM
+              // value — reading state here would send an empty/partial prompt.
+              const text = (inputRef.current?.value ?? input).trim()
+              if (file && text && onSendMessageWithImage) {
+                onSendMessageWithImage(text, file)
                 setInput('')
               }
             }}
           />
         )}
         <input
+          ref={inputRef}
           data-testid="chat-input"
           type="text"
           value={input}
diff --git a/testing/e2e/tests/helpers.ts b/testing/e2e/tests/helpers.ts
@@ -37,11 +37,31 @@ export async function sendMessageWithImage(
   imagePath: string,
 ) {
   const input = page.getByTestId('chat-input')
-  await input.click()
-  await input.pressSequentially(text, { delay: 30 })
-  // Wait for React state to settle before attaching file
-  await page.waitForTimeout(200)
-  await page.getByTestId('image-attachment-input').setInputFiles(imagePath)
+  const fileInput = page.getByTestId('image-attachment-input')
+  const userMessages = page.getByTestId('user-message')
+
+  // Attaching the image auto-sends, using the prompt currently in the chat
+  // input, and the matched aimock fixture keys on the exact user text. A
+  // *controlled* React input is fragile here under CPU load (CI, parallel
+  // workers) in two ways: typing char-by-char can drop characters, leaving a
+  // truncated value like "cribe this image" (which 404s as "No fixture
+  // matched" → empty `chatStream fatal`); and the attach's onChange can land
+  // before the typed value is committed, dispatching nothing at all. So drive
+  // the interaction to its observable outcome — the user bubble rendering —
+  // retrying both the typing and the attach until the send actually fires with
+  // the full prompt. A redundant re-attach is harmless: the client ignores a
+  // second send while the first is still streaming.
+  await expect(async () => {
+    await input.click()
+    await input.fill('')
+    await input.pressSequentially(text, { delay: 15 })
+    // Confirm the full prompt is committed before attaching.
+    expect(await input.inputValue()).toBe(text)
+    // Reset the selection so re-attaching the same path re-fires onChange.
+    await fileInput.setInputFiles([])
+    await fileInput.setInputFiles(imagePath)
+    await expect(userMessages.first()).toBeVisible({ timeout: 2_000 })
+  }).toPass({ timeout: 15_000, intervals: [250, 500, 1000] })
 }
 
 export async function waitForResponse(page: Page, timeout = 15_000) {
diff --git a/testing/e2e/tests/tools-test/helpers.ts b/testing/e2e/tests/tools-test/helpers.ts
@@ -108,23 +108,47 @@ export async function runTest(page: Page): Promise<void> {
   for (let attempt = 0; attempt < 5; attempt++) {
     const baselineMessageCount = await readMessageCount()
     await page.click('#run-test-button')
-    await page.waitForTimeout(300)
 
-    const started = await page.evaluate((baseline) => {
-      const metadata = document.getElementById('test-metadata')
-      if (metadata?.getAttribute('data-is-loading') === 'true') {
-        return true
-      }
-
-      const text =
-        document.getElementById('messages-json-content')?.textContent || '[]'
-      try {
-        const parsed = JSON.parse(text)
-        return Array.isArray(parsed) && parsed.length > baseline
-      } catch {
-        return false
-      }
-    }, baselineMessageCount)
+    // A run "starts" only when real stream activity appears — not when the
+    // optimistic user message lands. Clicking adds one user message
+    // synchronously (baseline + 1); that alone must NOT count as started, or a
+    // stalled run (the click registered but the stream produced nothing) would
+    // be reported as started and the test would later time out waiting for an
+    // approval / completion that never comes. Real activity is: loading turned
+    // on, a tool call appeared, the test completed, or a *second* message (the
+    // assistant response) was added beyond the optimistic user message. Poll
+    // briefly so a slow-but-real run under CI load isn't mistaken for a stall.
+    const started = await page
+      .waitForFunction(
+        (baseline) => {
+          const metadata = document.getElementById('test-metadata')
+          if (metadata?.getAttribute('data-is-loading') === 'true') return true
+          if (
+            parseInt(
+              metadata?.getAttribute('data-tool-call-count') || '0',
+              10,
+            ) > 0
+          )
+            return true
+          if (metadata?.getAttribute('data-test-complete') === 'true')
+            return true
+          const text =
+            document.getElementById('messages-json-content')?.textContent ||
+            '[]'
+          try {
+            const parsed = JSON.parse(text)
+            // > baseline + 1: the assistant message arrived (a real response),
+            // not just the optimistic user message.
+            return Array.isArray(parsed) && parsed.length > baseline + 1
+          } catch {
+            return false
+          }
+        },
+        baselineMessageCount,
+        { timeout: 2000 },
+      )
+      .then(() => true)
+      .catch(() => false)
 
     if (started) {
       return