From fe35e37783724fe4f02bda1db4b40ccb42e0d58e Mon Sep 17 00:00:00 2001
From: wenshao <shaojin.wensj@alibaba-inc.com>
Date: Fri, 15 May 2026 21:13:55 +0800
Subject: [PATCH 01/18] fix(core): persist partial assistant turn when stream
 errors mid tool_use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Weak-network failures during an Anthropic-compatible stream (DeepSeek,
api.anthropic.com, etc.) can drop the SSE between a tool_use
`content_block_stop` and the terminal `message_stop`. The functionCall
chunk is already yielded at content_block_stop, so:

  - Turn.run records a ToolCallRequest event.
  - useGeminiStream's for-await exits and schedules the tool.
  - handleCompletedTools eventually fires submitQuery(..., ToolResult)
    and pushes a user[functionResponse] into history.
  - But processStreamResponse's history.push for the model turn never
    ran (the for-await threw first), so the matching tool_use is gone.

The next request body has `user → user[tool_result]` with no tool_use
in between, and the server rejects with HTTP 400:
"tool_use_id ... must have a corresponding tool_use block in the
previous message". Ctrl+Y can't recover because
stripOrphanedUserEntriesFromHistory only strips trailing user
entries — the lost tool_use is unrecoverable, and the session is
wedged.

Wrap the for-await loop in processStreamResponse with try/catch.
When the stream throws AND any functionCall chunk was already
yielded (hasToolCall=true), persist the partial assistant turn to
history before re-throwing. The eventual tool_result submission
then has a matching tool_use and the session can continue.

Plain-text partial turns (no functionCall yielded) are intentionally
NOT persisted: the Retry path pops the trailing user prompt and
re-issues it, so a stale partial-text model turn between them would
either bias the retry or surface as duplicate output.
---
 packages/core/src/core/geminiChat.test.ts | 110 +++++++++++++++++++
 packages/core/src/core/geminiChat.ts      | 126 +++++++++++++++-------
 2 files changed, 200 insertions(+), 36 deletions(-)

diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index e57f16de645..ddf7fd666aa 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -699,6 +699,116 @@ describe('GeminiChat', async () => {
       ).resolves.not.toThrow();
     });
 
+    it('persists partial assistant turn when stream throws after a tool_use chunk', async () => {
+      // Weak-network scenario: Anthropic-compatible providers emit the
+      // `functionCall` part on `content_block_stop`; the SSE may then drop
+      // before `message_stop`. The yielded chunk is enough for `Turn.run`
+      // to queue a `ToolCallRequest`, the tool scheduler will eventually
+      // submit a `functionResponse` user turn — without a matching
+      // tool_use in history, the next request body shows
+      // `user → user[tool_result]` and DeepSeek/Anthropic rejects with
+      // "tool_use_id ... must have a corresponding tool_use block in the
+      // previous message". `processStreamResponse` must persist the
+      // partial model turn before re-throwing so the pairing is intact.
+      mockRetryWithBackoff.mockImplementation(async (apiCall) => apiCall());
+      const networkError = new Error('SSE connection reset by peer');
+      const streamThatThrowsAfterToolCall = (async function* () {
+        yield {
+          candidates: [
+            {
+              content: {
+                role: 'model',
+                parts: [
+                  {
+                    functionCall: {
+                      id: 'call_00_CeJrKJB0PSmXUZTCWHET7332',
+                      name: 'read_file',
+                      args: { path: '/tmp/x.txt' },
+                    },
+                  },
+                ],
+              },
+            },
+          ],
+        } as unknown as GenerateContentResponse;
+        throw networkError;
+      })();
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        streamThatThrowsAfterToolCall,
+      );
+
+      const stream = await chat.sendMessageStream(
+        'test-model',
+        { message: 'open /tmp/x.txt please' },
+        'prompt-weak-network-tool',
+      );
+      await expect(
+        (async () => {
+          for await (const _ of stream) {
+            /* drain */
+          }
+        })(),
+      ).rejects.toBe(networkError);
+
+      const history = chat.getHistory();
+      expect(history.length).toBe(2);
+      expect(history[0]!.role).toBe('user');
+      const modelTurn = history[1]!;
+      expect(modelTurn.role).toBe('model');
+      expect(modelTurn.parts).toBeDefined();
+      const functionCallPart = modelTurn.parts!.find((p) => p.functionCall);
+      expect(functionCallPart?.functionCall?.id).toBe(
+        'call_00_CeJrKJB0PSmXUZTCWHET7332',
+      );
+      expect(functionCallPart?.functionCall?.name).toBe('read_file');
+    });
+
+    it('does NOT persist partial assistant turn when stream throws before any tool_use chunk', async () => {
+      // Plain-text partial responses are deliberately dropped on stream
+      // error: the Retry path pops the trailing user prompt and re-issues
+      // it, so a stale partial-text model turn between them would bias
+      // the retry or surface as duplicate output. Only tool_use turns
+      // need the partial-history bridge to preserve the tool_use →
+      // tool_result invariant — text alone has no such invariant.
+      mockRetryWithBackoff.mockImplementation(async (apiCall) => apiCall());
+      const networkError = new Error('connection reset');
+      const streamThatThrowsAfterText = (async function* () {
+        yield {
+          candidates: [
+            {
+              content: {
+                role: 'model',
+                parts: [{ text: 'partial reply that will be lost' }],
+              },
+            },
+          ],
+        } as unknown as GenerateContentResponse;
+        throw networkError;
+      })();
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        streamThatThrowsAfterText,
+      );
+
+      const stream = await chat.sendMessageStream(
+        'test-model',
+        { message: 'hello' },
+        'prompt-weak-network-text',
+      );
+      await expect(
+        (async () => {
+          for await (const _ of stream) {
+            /* drain */
+          }
+        })(),
+      ).rejects.toBe(networkError);
+
+      const history = chat.getHistory();
+      // Only the user turn is in history — the partial-text model turn is
+      // intentionally not persisted.
+      expect(history.length).toBe(1);
+      expect(history[0]!.role).toBe('user');
+    });
+
     it('should throw InvalidStreamError when no tool call and no finish reason', async () => {
       vi.useFakeTimers();
       try {
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 6f7a11c7914..c8aee1e35a2 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -1257,48 +1257,62 @@ export class GeminiChat {
 
     let hasToolCall = false;
     let hasFinishReason = false;
+    // Captured if the upstream stream throws mid-iteration (typical on weak
+    // networks: SSE drops between `content_block_stop` of a tool_use and the
+    // terminal `message_stop`). We still build / record / push a partial
+    // assistant turn below before re-throwing — see the dedicated branch in
+    // the post-loop block for why this is needed to keep tool_use/tool_result
+    // pairing intact across the failure.
+    let streamError: unknown = null;
 
-    for await (const chunk of streamResponse) {
-      // Use ||= to avoid later usage-only chunks (no candidates) overwriting
-      // a finishReason that was already seen in an earlier chunk.
-      hasFinishReason ||=
-        chunk?.candidates?.some((candidate) => candidate.finishReason) ?? false;
-
-      if (isValidResponse(chunk)) {
-        const content = chunk.candidates?.[0]?.content;
-        if (content?.parts) {
-          if (content.parts.some((part) => part.functionCall)) {
-            hasToolCall = true;
-          }
+    try {
+      for await (const chunk of streamResponse) {
+        // Use ||= to avoid later usage-only chunks (no candidates) overwriting
+        // a finishReason that was already seen in an earlier chunk.
+        hasFinishReason ||=
+          chunk?.candidates?.some((candidate) => candidate.finishReason) ??
+          false;
+
+        if (isValidResponse(chunk)) {
+          const content = chunk.candidates?.[0]?.content;
+          if (content?.parts) {
+            if (content.parts.some((part) => part.functionCall)) {
+              hasToolCall = true;
+            }
 
-          // Collect all parts for recording
-          allModelParts.push(...content.parts);
+            // Collect all parts for recording
+            allModelParts.push(...content.parts);
+          }
         }
-      }
 
-      // Collect token usage for consolidated recording
-      if (chunk.usageMetadata) {
-        usageMetadata = chunk.usageMetadata;
-        // Context usage tracks prompt size; output isn't in history yet.
-        const lastPromptTokenCount =
-          usageMetadata.promptTokenCount || usageMetadata.totalTokenCount;
-        if (lastPromptTokenCount) {
-          // Always update the per-chat counter so this chat (including
-          // subagents) can make its own compaction decisions.
-          this.lastPromptTokenCount = lastPromptTokenCount;
-          // Mirror to the global telemetry only when wired — subagents
-          // pass `telemetryService=undefined` to keep their context usage
-          // out of the main session's UI counters.
-          this.telemetryService?.setLastPromptTokenCount(lastPromptTokenCount);
-        }
-        if (usageMetadata.cachedContentTokenCount && this.telemetryService) {
-          this.telemetryService.setLastCachedContentTokenCount(
-            usageMetadata.cachedContentTokenCount,
-          );
+        // Collect token usage for consolidated recording
+        if (chunk.usageMetadata) {
+          usageMetadata = chunk.usageMetadata;
+          // Context usage tracks prompt size; output isn't in history yet.
+          const lastPromptTokenCount =
+            usageMetadata.promptTokenCount || usageMetadata.totalTokenCount;
+          if (lastPromptTokenCount) {
+            // Always update the per-chat counter so this chat (including
+            // subagents) can make its own compaction decisions.
+            this.lastPromptTokenCount = lastPromptTokenCount;
+            // Mirror to the global telemetry only when wired — subagents
+            // pass `telemetryService=undefined` to keep their context usage
+            // out of the main session's UI counters.
+            this.telemetryService?.setLastPromptTokenCount(
+              lastPromptTokenCount,
+            );
+          }
+          if (usageMetadata.cachedContentTokenCount && this.telemetryService) {
+            this.telemetryService.setLastCachedContentTokenCount(
+              usageMetadata.cachedContentTokenCount,
+            );
+          }
         }
-      }
 
-      yield chunk; // Yield every chunk to the UI immediately.
+        yield chunk; // Yield every chunk to the UI immediately.
+      }
+    } catch (e) {
+      streamError = e;
     }
 
     let thoughtContentPart: Part | undefined;
@@ -1369,6 +1383,46 @@ export class GeminiChat {
       });
     }
 
+    // Mid-stream failure recovery: if the upstream stream threw (typical on
+    // weak networks — SSE cut between a tool_use `content_block_stop` and
+    // the terminal `message_stop`) AND any `functionCall` chunk was already
+    // yielded to consumers, we must persist the partial assistant turn here.
+    //
+    // The content generator (Anthropic / OpenAI) emits a `functionCall` part
+    // only at the end of a tool_use block. Once yielded, `Turn.run` registers
+    // a `ToolCallRequest` event, the React tool scheduler queues the call,
+    // and `handleCompletedTools` will fire `submitQuery(..., ToolResult)` —
+    // pushing a user message with `functionResponse` into history — even
+    // though the parent stream errored. Without preserving the matching
+    // tool_use on the model side, the next request body would have
+    // `user → user[tool_result]` with no tool_use in between, and the
+    // Anthropic-compatible API (DeepSeek, Anthropic, etc.) rejects with
+    //   "tool_use_id ... must have a corresponding tool_use block in the
+    //    previous message"
+    // — an unrecoverable state because Ctrl+Y's `stripOrphanedUserEntries`
+    // only strips trailing user entries; the lost tool_use can't be
+    // resurrected.
+    //
+    // Plain-text partial turns (no functionCall yielded) are deliberately
+    // NOT persisted — the Retry path pops the trailing user prompt and
+    // re-issues it; a stale partial-text model turn between them would
+    // either bias the retry or surface as a duplicate.
+    if (streamError !== null) {
+      if (
+        hasToolCall &&
+        (thoughtContentPart || consolidatedHistoryParts.length > 0)
+      ) {
+        this.history.push({
+          role: 'model',
+          parts: [
+            ...(thoughtContentPart ? [thoughtContentPart] : []),
+            ...consolidatedHistoryParts,
+          ],
+        });
+      }
+      throw streamError;
+    }
+
     // Stream validation logic: A stream is considered successful if:
     // 1. There's a tool call (tool calls can end without explicit finish reasons), OR
     // 2. There's a finish reason AND we have non-empty response text or thought text

From b2d332e00517770a098e36b3328245221d9edb5c Mon Sep 17 00:00:00 2001
From: wenshao <shaojin.wensj@alibaba-inc.com>
Date: Fri, 15 May 2026 21:25:44 +0800
Subject: [PATCH 02/18] test(core): cover thinking+tool_use mid-stream throw in
 partial-history fix

Adds a third case to the partial-history persistence test suite for
reasoning-mode providers (DeepSeek thinking, Claude 4.6+ adaptive): when
the assistant turn streams a thinking block AND a tool_use before the
SSE drops, the partial push must keep the thinking part before the
functionCall so DeepSeek's `injectThinkingOnToolUseTurns` converter
pass sees an existing block on the replayed turn and does not pre-pend
a synthetic empty one (which would discard the model's original
reasoning text).
---
 packages/core/src/core/geminiChat.test.ts | 70 +++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index ddf7fd666aa..6270506838b 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -763,6 +763,76 @@ describe('GeminiChat', async () => {
       expect(functionCallPart?.functionCall?.name).toBe('read_file');
     });
 
+    it('preserves thinking parts alongside tool_use when stream throws mid-tool', async () => {
+      // Covers reasoning-mode providers (DeepSeek, Claude 4.6+) where the
+      // assistant turn carries both a thinking block and a tool_use. The
+      // partial-history push must keep the thinking part so DeepSeek's
+      // `injectThinkingOnToolUseTurns` converter pass sees an existing
+      // block on the replayed turn and does not pre-pend a synthetic one
+      // (which would discard the model's original reasoning text).
+      mockRetryWithBackoff.mockImplementation(async (apiCall) => apiCall());
+      const networkError = new Error('SSE timeout');
+      const streamWithThinkingAndTool = (async function* () {
+        yield {
+          candidates: [
+            {
+              content: {
+                role: 'model',
+                parts: [{ text: 'planning the read', thought: true }],
+              },
+            },
+          ],
+        } as unknown as GenerateContentResponse;
+        yield {
+          candidates: [
+            {
+              content: {
+                role: 'model',
+                parts: [
+                  {
+                    functionCall: {
+                      id: 'call_thinking_tool_use',
+                      name: 'read_file',
+                      args: { path: '/tmp/a.txt' },
+                    },
+                  },
+                ],
+              },
+            },
+          ],
+        } as unknown as GenerateContentResponse;
+        throw networkError;
+      })();
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        streamWithThinkingAndTool,
+      );
+
+      const stream = await chat.sendMessageStream(
+        'test-model',
+        { message: 'read /tmp/a.txt' },
+        'prompt-thinking-tool-weak-network',
+      );
+      await expect(
+        (async () => {
+          for await (const _ of stream) {
+            /* drain */
+          }
+        })(),
+      ).rejects.toBe(networkError);
+
+      const history = chat.getHistory();
+      expect(history.length).toBe(2);
+      const modelTurn = history[1]!;
+      expect(modelTurn.role).toBe('model');
+      const parts = modelTurn.parts!;
+      // The thinking part must come before the functionCall — Anthropic
+      // requires thinking blocks first in the assistant content array.
+      expect(parts[0]!.thought).toBe(true);
+      expect(parts[0]!.text).toBe('planning the read');
+      const functionCallPart = parts.find((p) => p.functionCall);
+      expect(functionCallPart?.functionCall?.id).toBe('call_thinking_tool_use');
+    });
+
     it('does NOT persist partial assistant turn when stream throws before any tool_use chunk', async () => {
       // Plain-text partial responses are deliberately dropped on stream
       // error: the Retry path pops the trailing user prompt and re-issues

From 3de3241a2c2cb82560db01558c81b09103b80c5e Mon Sep 17 00:00:00 2001
From: wenshao <shaojin.wensj@alibaba-inc.com>
Date: Fri, 15 May 2026 22:18:52 +0800
Subject: [PATCH 03/18] =?UTF-8?q?fix(core,cli):=20close=20tool=5Fuse?=
 =?UTF-8?q?=E2=86=94tool=5Fresult=20invariant=20at=20failure=20points?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends the partial-history fix in fe35e3778 to cover the residual race
paths surfaced in PR #4176 review:

  - Race A: Ctrl+Y while in-flight tool hasn't finished. History is
    [user, model(tool_use)] — `stripOrphanedUserEntriesFromHistory` only
    pops trailing user entries, so the retry payload lands as a fresh
    user turn after the orphan tool_use and API rejects. Meanwhile the
    scheduler's `onAllToolCallsComplete` is single-shot and gated on
    `isResponding`, so the eventual tool_result is silently swallowed.
  - Race B: process crash / OOM / SIGKILL between the partial-tool_use
    push and the React scheduler's tool_result submission. On `--resume`
    the dangling model(tool_use) wedges the first API call.
  - Race C: external tooling / manual JSONL edits leaving the same
    dangling shape.

The fix has three pieces working together:

1. `repairOrphanedToolUseTurns(history)` in geminiChat.ts walks history
   left-to-right and synthesizes an `error`-typed functionResponse for
   every functionCall whose id is not echoed back in the next user
   turn. Appends to an existing user turn when present, otherwise
   inserts a new one. Returns the injected (callId, name) list.

2. `GeminiClient.repairOrphanedToolUseTurnsInHistory()` wraps the helper
   and is called from three points:
     - `startChat()` after loading the transcript (Race B/C, --resume).
     - `sendMessageStream` Retry branch after stripOrphans (Race A).
     - `sendMessageStream` UserQuery/Cron branch (defensive belt-and-
       suspenders for anything that slipped past 1 and 2).

3. `handleCompletedTools` in useGeminiStream.ts dedupes against
   chat.history before submitting tool_results — if a synthetic
   functionResponse for the same callId is already present (planted by
   the repair pass), the in-flight scheduler's late result is dropped
   and the call is `markToolsAsSubmitted` so the UI advances. Same
   trade-off upstream Claude Code's `StreamingToolExecutor.discard()`
   makes — late real results are dropped on the wire after synthesis,
   the model sees the synthetic error and can retry the tool if it
   still wants the result.

Together with the partial-history push from fe35e3778, every tool_use
that ever streamed to the consumer is guaranteed to have a matching
tool_result on the wire — regardless of whether the stream errored,
the user retried mid-flight, the process crashed, or the session is
later resumed. This is the qwen-code analogue of upstream Claude Code's
`yieldMissingToolResultBlocks` (query.ts:123-149), but split across
the core/cli boundary because the React tool scheduler runs out-of-band
from the stream loop (so the synthesis path can't atomically discard
in-flight tools the way upstream's StreamingToolExecutor can; the
history-dedup at handleCompletedTools fills that gap instead).

Tests:
  - 8 new repair-helper tests in geminiChat.test.ts cover Race A,
    Race B, partial coverage of parallel tool_use, idempotence on
    already-paired history, no-op on tool-free history, caller-
    supplied reason text, multiple non-adjacent dangling rounds, and
    routes through the GeminiChat instance-method wrapper.
  - client.test.ts mocks updated for the new GeminiChat method.
  - All 88 geminiChat tests pass; 131 client tests pass; 91
    useGeminiStream tests pass.
---
 packages/cli/src/ui/hooks/useGeminiStream.ts |  49 +++-
 packages/core/src/core/client.test.ts        |   3 +
 packages/core/src/core/client.ts             |  64 +++++
 packages/core/src/core/geminiChat.test.ts    | 271 +++++++++++++++++++
 packages/core/src/core/geminiChat.ts         | 111 ++++++++
 5 files changed, 496 insertions(+), 2 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.ts b/packages/cli/src/ui/hooks/useGeminiStream.ts
index 8b203b1de49..e74636fc11a 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -2019,17 +2019,62 @@ export const useGeminiStream = (
         );
       }
 
-      const geminiTools = completedAndReadyToSubmitTools.filter(
+      const geminiToolsRaw = completedAndReadyToSubmitTools.filter(
         (t) => !t.request.isClientInitiated,
       );
 
-      for (const toolCall of geminiTools) {
+      for (const toolCall of geminiToolsRaw) {
         geminiClient?.recordCompletedToolCall(
           toolCall.request.name,
           toolCall.request.args as Record<string, unknown>,
         );
       }
 
+      // History-based dedup: if a synthetic `functionResponse` for this
+      // callId is already in chat.history (planted by
+      // `client.repairOrphanedToolUseTurnsInHistory()` on session-load or
+      // Retry), the in-flight scheduler result would land as a duplicate
+      // `tool_result` and produce two consecutive user turns where the
+      // second is orphaned (no preceding tool_use — the synthetic ate it).
+      //
+      // For dedup hits: mark the tool as submitted so the UI advances and
+      // `useReactToolScheduler.allToolCallsCompleteHandler` (single-shot)
+      // doesn't leave the call permanently stuck in `completed-but-not-
+      // submitted`. The real result is dropped on the wire — same trade-off
+      // upstream Claude Code makes when its `StreamingToolExecutor.discard()`
+      // is followed by a `yieldMissingToolResultBlocks` synthesis
+      // (`query.ts:733` + `:984`). The model sees the synthetic error and
+      // can retry the tool if it still wants the result.
+      const historyCallIdsWithResponse = new Set<string>();
+      // Guard the call: some test harnesses build a partial GeminiClient
+      // mock without `getHistory`. Skipping dedup in that case is safe —
+      // it just means tests that never set up the repair pre-condition
+      // run with the original (pre-dedup) submission shape.
+      if (geminiClient && typeof geminiClient.getHistory === 'function') {
+        for (const entry of geminiClient.getHistory()) {
+          if (entry.role !== 'user') continue;
+          for (const part of entry.parts ?? []) {
+            const id = part.functionResponse?.id;
+            if (id) historyCallIdsWithResponse.add(id);
+          }
+        }
+      }
+
+      const dedupedCallIds = geminiToolsRaw
+        .filter((tc) => historyCallIdsWithResponse.has(tc.request.callId))
+        .map((tc) => tc.request.callId);
+      if (dedupedCallIds.length > 0) {
+        debugLogger.warn(
+          `[REPAIR] Dropping ${dedupedCallIds.length} late tool result(s) ` +
+            `whose callId already has a synthetic functionResponse in ` +
+            `history: ${dedupedCallIds.join(', ')}`,
+        );
+        markToolsAsSubmitted(dedupedCallIds);
+      }
+      const geminiTools = geminiToolsRaw.filter(
+        (tc) => !historyCallIdsWithResponse.has(tc.request.callId),
+      );
+
       if (geminiTools.length === 0) {
         return;
       }
diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index 044625c51ec..85501c8fa04 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -1453,6 +1453,7 @@ describe('Gemini Client (client.ts)', () => {
         getHistory: vi.fn().mockReturnValue([]),
         getHistoryLength,
         stripOrphanedUserEntriesFromHistory,
+        repairOrphanedToolUseTurns: vi.fn().mockReturnValue({ injected: [] }),
       } as unknown as GeminiChat;
       mockTurnRunFn.mockReturnValue(
         (async function* () {
@@ -4205,6 +4206,7 @@ Other open files:
           getHistoryLength: vi.fn().mockReturnValueOnce(3).mockReturnValue(2),
           setHistory: vi.fn(),
           stripOrphanedUserEntriesFromHistory: vi.fn(),
+          repairOrphanedToolUseTurns: vi.fn().mockReturnValue({ injected: [] }),
         };
         client['chat'] = mockChat as GeminiChat;
 
@@ -4237,6 +4239,7 @@ Other open files:
           getHistoryLength: vi.fn().mockReturnValue(0),
           setHistory: vi.fn(),
           stripOrphanedUserEntriesFromHistory: vi.fn(),
+          repairOrphanedToolUseTurns: vi.fn().mockReturnValue({ injected: [] }),
         };
         client['chat'] = mockChat as GeminiChat;
 
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index ff9a76fc5e0..867fa2146ed 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -353,6 +353,44 @@ export class GeminiClient {
     this.forceFullIdeContext = true;
   }
 
+  /**
+   * Synthesize a `functionResponse` for every dangling `model[functionCall]`
+   * in chat history whose corresponding tool_result never landed. Inverse of
+   * {@link stripOrphanedUserEntriesFromHistory}, which only handles trailing
+   * `user` entries.
+   *
+   * Called from three points:
+   *   1. After {@link startChat} loads transcript (covers `--resume` of a
+   *      session that crashed between partial-tool_use push and tool
+   *      completion).
+   *   2. After `stripOrphanedUserEntriesFromHistory` on the Retry submit path
+   *      (covers Ctrl+Y race — user retries while an in-flight tool's
+   *      `tool_result` has not yet been submitted, leaving a trailing
+   *      `model[functionCall]` without matching `functionResponse`).
+   *   3. Defensively at the start of UserQuery / Cron sends, so any state
+   *      that slipped past 1+2 still gets fixed before hitting the wire.
+   *
+   * Synthesizes an `error` `functionResponse`. The React tool scheduler
+   * (`useGeminiStream.handleCompletedTools`) MUST dedupe by `callId` against
+   * the live history before submitting its own `tool_result` — otherwise a
+   * late real result lands as a second `user[tool_result]` block (orphan
+   * because the synthetic already consumed the matching `tool_use`).
+   */
+  repairOrphanedToolUseTurnsInHistory(reason?: string): {
+    injected: Array<{ callId: string; name: string }>;
+  } {
+    const result = this.getChat().repairOrphanedToolUseTurns(reason);
+    if (result.injected.length > 0) {
+      debugLogger.warn(
+        `[REPAIR] Synthesized ${result.injected.length} functionResponse(s) ` +
+          `for dangling tool_use(s): ${result.injected
+            .map((e) => `${e.name}(${e.callId})`)
+            .join(', ')}`,
+      );
+    }
+    return result;
+  }
+
   setHistory(history: Content[]) {
     this.getChat().setHistory(history);
     // Replacing history wholesale drops any prior read_file tool
@@ -656,6 +694,16 @@ export class GeminiClient {
         uiTelemetryService,
       );
 
+      // Repair any dangling `model[functionCall]` whose `functionResponse`
+      // never made it back into the transcript before we wrote the JSONL.
+      // The common cause is a process crash / OOM / SIGKILL between the
+      // partial-tool_use push (see `processStreamResponse`) and the React
+      // scheduler's tool_result submission. Without this pass, the first
+      // API call on a resumed session would 400 with the same
+      // `tool_use_id ... corresponding tool_use` error this whole subsystem
+      // is trying to escape.
+      this.chat.repairOrphanedToolUseTurns();
+
       const sessionStartAdditionalContext =
         await this.fireSessionStartHook(sessionStartSource);
       this.lastSessionStartContext = sessionStartAdditionalContext;
@@ -1043,6 +1091,22 @@ export class GeminiClient {
 
     if (messageType === SendMessageType.Retry) {
       this.stripOrphanedUserEntriesFromHistory();
+      // Close any dangling `model[functionCall]` whose tool_result never
+      // landed before composing the retry payload. Ctrl+Y race: the user
+      // retried while a tool was still running on a partial-tool_use turn
+      // pushed by `processStreamResponse`'s mid-stream error path. The
+      // scheduler's `onAllToolCallsComplete` is single-shot and gated on
+      // `isResponding` (`useGeminiStream:1971`), so the eventual
+      // `tool_result` would otherwise be silently swallowed and the next
+      // API call would 400 with "tool_use_id ... corresponding tool_use"
+      // anyway. The synthesized `error` `functionResponse` keeps the wire
+      // invariant intact; the live scheduler dedupes against history in
+      // `handleCompletedTools` before submitting its real result so the
+      // synthetic doesn't collide with a late real one.
+      //
+      // Restricted to the Retry branch to mirror `stripOrphanedUserEntries`
+      // scope. Crash-resume's path is covered separately in `startChat()`.
+      this.repairOrphanedToolUseTurnsInHistory();
     }
 
     // Fire UserPromptSubmit hook through MessageBus (only if hooks are enabled)
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index 6270506838b..bd281d537a3 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -3312,6 +3312,277 @@ describe('GeminiChat', async () => {
     });
   });
 
+  describe('repairOrphanedToolUseTurns', () => {
+    // Verifies the inverse-of-strip pass: every `model[functionCall]`
+    // without a matching `user[functionResponse]` in the next turn gets
+    // a synthesized error functionResponse. This closes the
+    // tool_use ↔ tool_result wire invariant for the residual races
+    // (`--resume` of a crashed session, Ctrl+Y before in-flight tool
+    // finishes, scheduler abort before submitQuery, manual JSONL edits).
+
+    it('injects a synthetic functionResponse for a trailing tool_use (Race B/C)', () => {
+      // --resume of a session that crashed after the partial-tool_use push
+      // in `processStreamResponse` but before the scheduler submitted the
+      // tool_result. First API call would 400 without repair.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'open /tmp/a.txt' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'call_crash_A',
+                name: 'read_file',
+                args: { path: '/tmp/a.txt' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      expect(result.injected).toEqual([
+        { callId: 'call_crash_A', name: 'read_file' },
+      ]);
+      const history = chat.getHistory();
+      expect(history.length).toBe(3);
+      expect(history[2]!.role).toBe('user');
+      const fr = history[2]!.parts![0]!.functionResponse;
+      expect(fr?.id).toBe('call_crash_A');
+      expect(fr?.name).toBe('read_file');
+      expect((fr?.response as { error?: string })?.error).toMatch(
+        /interrupted/i,
+      );
+    });
+
+    it('appends synthetic functionResponse onto an existing user turn (Race A)', () => {
+      // Ctrl+Y race: the user retried while the in-flight tool was still
+      // running. `stripOrphanedUserEntriesFromHistory` leaves the
+      // model[functionCall] in place (trailing entry is model), then the
+      // Retry pushes a fresh user turn with the user prompt. Repair must
+      // splice the synthetic response onto that user turn so it sits
+      // immediately after the model[tool_use] — NOT create a stray
+      // synthetic user turn between them.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'open /tmp/a.txt' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'call_race_A',
+                name: 'read_file',
+                args: { path: '/tmp/a.txt' },
+              },
+            },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'retry prompt' }] },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      expect(result.injected.map((e) => e.callId)).toEqual(['call_race_A']);
+      const history = chat.getHistory();
+      // No new turn inserted — synthetic merges into existing user turn.
+      expect(history.length).toBe(3);
+      expect(history[2]!.role).toBe('user');
+      expect(history[2]!.parts!.length).toBe(2);
+      expect(history[2]!.parts![0]).toEqual({ text: 'retry prompt' });
+      expect(history[2]!.parts![1]!.functionResponse?.id).toBe('call_race_A');
+    });
+
+    it('handles parallel tool_use turns with only some responses present', () => {
+      // Common shape after #4176's partial-history push: the stream
+      // emitted multiple `content_block_stop`s for parallel tool_uses,
+      // but the React scheduler only submitted some before the user hit
+      // Ctrl+Y. The Retry path's repair must close every missing pair —
+      // the present `functionResponse` for A must NOT be duplicated.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'batch read' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'call_A',
+                name: 'read_file',
+                args: { path: '/a' },
+              },
+            },
+            {
+              functionCall: {
+                id: 'call_B',
+                name: 'read_file',
+                args: { path: '/b' },
+              },
+            },
+            {
+              functionCall: {
+                id: 'call_C',
+                name: 'read_file',
+                args: { path: '/c' },
+              },
+            },
+          ],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'call_A',
+                name: 'read_file',
+                response: { output: 'a-content' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      const injectedIds = result.injected.map((e) => e.callId);
+      expect(injectedIds.sort()).toEqual(['call_B', 'call_C']);
+      const history = chat.getHistory();
+      // Same shape — synthetics merge into the existing user turn.
+      expect(history.length).toBe(3);
+      const fr = history[2]!.parts!.map((p) => p.functionResponse?.id);
+      expect(fr).toEqual(['call_A', 'call_B', 'call_C']);
+      // The pre-existing `call_A` response is untouched (real result kept).
+      expect(
+        (
+          history[2]!.parts![0]!.functionResponse?.response as {
+            output?: string;
+          }
+        )?.output,
+      ).toBe('a-content');
+    });
+
+    it('is a no-op when every tool_use already has a matching response', () => {
+      // Happy path: don't churn history when the invariant already holds.
+      const happy = [
+        { role: 'user' as const, parts: [{ text: 'q' }] },
+        {
+          role: 'model' as const,
+          parts: [
+            {
+              functionCall: {
+                id: 'call_ok',
+                name: 'read_file',
+                args: {},
+              },
+            },
+          ],
+        },
+        {
+          role: 'user' as const,
+          parts: [
+            {
+              functionResponse: {
+                id: 'call_ok',
+                name: 'read_file',
+                response: { output: 'fine' },
+              },
+            },
+          ],
+        },
+      ];
+      chat.setHistory(structuredClone(happy));
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      expect(result.injected).toEqual([]);
+      expect(chat.getHistory()).toEqual(happy);
+    });
+
+    it('repairs multiple non-adjacent dangling tool_uses across history', () => {
+      // Stress case for the forward-walk algorithm: dangling turn near the
+      // start AND another near the end. Both should be repaired and the
+      // outer loop must not re-scan synthetic user turns it just inserted.
+      chat.setHistory([
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'early_orphan',
+                name: 'glob',
+                args: {},
+              },
+            },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'second user prompt' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'late_orphan',
+                name: 'read_file',
+                args: { path: '/x' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      const injectedIds = result.injected.map((e) => e.callId);
+      expect(injectedIds.sort()).toEqual(['early_orphan', 'late_orphan']);
+      const history = chat.getHistory();
+      // early_orphan got the synthetic spliced into the existing user turn
+      // between the two model entries; late_orphan got a brand-new
+      // trailing user turn appended after the second model entry.
+      expect(history.length).toBe(4);
+      expect(history[0]!.role).toBe('model');
+      expect(history[1]!.role).toBe('user');
+      expect(
+        history[1]!.parts!.some(
+          (p) => p.functionResponse?.id === 'early_orphan',
+        ),
+      ).toBe(true);
+      expect(history[2]!.role).toBe('model');
+      expect(history[3]!.role).toBe('user');
+      expect(history[3]!.parts![0]!.functionResponse?.id).toBe('late_orphan');
+    });
+
+    it('ignores model turns with no functionCall parts', () => {
+      const plain = [
+        { role: 'user' as const, parts: [{ text: 'hi' }] },
+        { role: 'model' as const, parts: [{ text: 'hello' }] },
+      ];
+      chat.setHistory(structuredClone(plain));
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      expect(result.injected).toEqual([]);
+      expect(chat.getHistory()).toEqual(plain);
+    });
+
+    it('uses caller-provided reason text', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'q' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: { id: 'cid', name: 'read_file', args: {} },
+            },
+          ],
+        },
+      ]);
+
+      chat.repairOrphanedToolUseTurns('custom reason');
+
+      const fr = chat.getHistory()[2]!.parts![0]!.functionResponse;
+      expect((fr?.response as { error?: string })?.error).toBe('custom reason');
+    });
+  });
+
   describe('output token recovery', () => {
     function makeChunk(
       parts: Array<{ text?: string; functionCall?: unknown }>,
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index c8aee1e35a2..ee4e6d90fcc 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -378,6 +378,101 @@ export class InvalidStreamError extends Error {
   }
 }
 
+/**
+ * Default error text used when a synthesized `functionResponse` has to stand
+ * in for a real tool result that never made it back into history (e.g. the
+ * process crashed between the partial-tool_use push and tool completion, or
+ * the user hit Ctrl+Y before the in-flight tool finished and the scheduler's
+ * `onAllToolCallsComplete` was a single-shot that already fired into an
+ * `isResponding` early-return).
+ */
+const ORPHAN_TOOL_USE_REPAIR_REASON =
+  'Tool execution result was not recorded — likely interrupted by network ' +
+  'failure, abort, or process exit. Treat as failure and retry if needed.';
+
+/**
+ * Walk `history` left-to-right and close every dangling tool_use ↔ tool_result
+ * pair by synthesizing a `functionResponse` with an `error` field for any
+ * `functionCall` part whose `id` is not echoed back in the immediately
+ * following user turn.
+ *
+ * Mutates `history` in place and returns the set of injected `(callId, name)`
+ * tuples so callers (the React tool scheduler) can dedupe a real `tool_result`
+ * if the in-flight tool completes after the repair.
+ *
+ * The synthesis target follows this rule:
+ *  - If the next entry is a `user` turn → append synthetic parts to it.
+ *  - If the next entry is a `model` turn or end-of-history → insert a new
+ *    `user` turn between them carrying just the synthetic parts.
+ *
+ * This is the qwen-code analogue of upstream Claude Code's
+ * `yieldMissingToolResultBlocks` (`query.ts:123-149`). Upstream can call it
+ * unconditionally at every error path because their `StreamingToolExecutor`
+ * is in-band — they atomically `.discard()` in-flight tools at the synthesis
+ * point. Our React scheduler runs out-of-band, so the caller pairs this with
+ * dedup in `handleCompletedTools` (which skips submission for any callId
+ * already present in history). See PR review thread on #4176 for the full
+ * race-class analysis.
+ */
+export function repairOrphanedToolUseTurns(
+  history: Content[],
+  reason: string = ORPHAN_TOOL_USE_REPAIR_REASON,
+): { injected: Array<{ callId: string; name: string }> } {
+  const injected: Array<{ callId: string; name: string }> = [];
+
+  // Forward walk: i mutates as we splice, so use index-based iteration
+  // and skip the freshly-inserted user turn to avoid re-scanning it.
+  for (let i = 0; i < history.length; i++) {
+    const turn = history[i];
+    if (turn.role !== 'model') continue;
+
+    // Collect (id → name) for every functionCall in this model turn.
+    const expected = new Map<string, string>();
+    for (const part of turn.parts ?? []) {
+      const fc = part.functionCall;
+      if (fc?.id) {
+        expected.set(fc.id, fc.name ?? 'unknown');
+      }
+    }
+    if (expected.size === 0) continue;
+
+    const next = history[i + 1];
+    const matched = new Set<string>();
+    if (next?.role === 'user') {
+      for (const part of next.parts ?? []) {
+        const id = part.functionResponse?.id;
+        if (id) matched.add(id);
+      }
+    }
+
+    const missing = [...expected.entries()].filter(([id]) => !matched.has(id));
+    if (missing.length === 0) continue;
+
+    const syntheticParts: Part[] = missing.map(([callId, name]) => ({
+      functionResponse: {
+        id: callId,
+        name,
+        response: { error: reason },
+      },
+    }));
+
+    if (next?.role === 'user') {
+      next.parts = [...(next.parts ?? []), ...syntheticParts];
+    } else {
+      history.splice(i + 1, 0, { role: 'user', parts: syntheticParts });
+      // Skip the freshly-inserted user turn so the outer loop doesn't
+      // visit it as a model turn (it isn't) and stays linear-time.
+      i++;
+    }
+
+    for (const [callId, name] of missing) {
+      injected.push({ callId, name });
+    }
+  }
+
+  return { injected };
+}
+
 /**
  * Chat session that enables sending messages to the model with previous
  * conversation context.
@@ -1208,6 +1303,22 @@ export class GeminiChat {
     }
   }
 
+  /**
+   * Repair the inverse of `stripOrphanedUserEntriesFromHistory`: close every
+   * dangling `model[functionCall]` whose corresponding `user[functionResponse]`
+   * never landed (e.g. process crash between the partial-tool_use push and
+   * tool completion, or Ctrl+Y race before in-flight scheduler completed).
+   *
+   * Returns the list of synthesized `(callId, name)` tuples so the React
+   * tool scheduler can dedupe its eventual real `tool_result` for those
+   * callIds (see `handleCompletedTools` in `useGeminiStream.ts`).
+   */
+  repairOrphanedToolUseTurns(reason?: string): {
+    injected: Array<{ callId: string; name: string }>;
+  } {
+    return repairOrphanedToolUseTurns(this.history, reason);
+  }
+
   setTools(tools: Tool[]): void {
     this.generationConfig.tools = tools;
   }

From b2fed61cdd120f809fddc8f7ffd535da3c4b29a9 Mon Sep 17 00:00:00 2001
From: wenshao <shaojin.wensj@alibaba-inc.com>
Date: Fri, 15 May 2026 22:45:49 +0800
Subject: [PATCH 04/18] =?UTF-8?q?fix(core,cli):=20close=20tool=5Fuse?=
 =?UTF-8?q?=E2=86=94tool=5Fresult=20invariant=20at=20failure=20points?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Post-review audit of 3de3241a2 surfaced two real races the earlier fix
did not actually close:

  - Race A (the one yiliang114 originally flagged): handleCompletedTools
    is gated on `isResponding` (useGeminiStream:1971) BEFORE the dedup
    branch ran, so when the user Ctrl+Y'd mid-tool, the dedup never
    fired and the in-flight tool stayed permanently stuck in
    `completed-but-not-submitted` (the scheduler's
    `allToolCallsCompleteHandler` is single-shot). The dedup branch was
    structurally unreachable on the very race it was meant to defend
    against.

  - Race in the Retry path: the repair pass in
    `client.sendMessageStream` Retry branch ran BEFORE the chat-internal
    `push(userContent)`, so a Retry of a previous ToolResult submission
    (lastPrompt is a functionResponse part array) raced its own real
    `functionResponse` against the synthesized error one. The synthesis
    was winning and pre-empting the real result, producing two
    `functionResponse` entries with the same callId on the wire.

This commit relocates both pieces to where the contract actually holds:

1. Move the dedup branch in `handleCompletedTools` to BEFORE the
   `isResponding` early-return so `markToolsAsSubmitted` always runs
   for callIds already paired in history, unblocking the UI/scheduler
   even when a new stream is in flight. The submission-side `geminiTools
   = completedAndReadyToSubmitTools.filter(... && !inHistory ...)` then
   covers the no-double-submit case in one expression.

2. Move the repair call from `client.sendMessageStream` Retry branch
   into `chat.sendMessageStream` immediately AFTER the user-supplied
   turn is pushed. The user's own tool_result (when the Retry payload
   carries one) gets the first chance to close the pair before the
   synthesizer sees it as dangling. `client.startChat()` keeps a
   belt-and-suspenders pass at session-load time so any pre-send code
   reading `chat.history` sees a well-formed shape.

Adds three integration tests covering the new wiring:

  - geminiChat.test.ts: chat.sendMessageStream synthesizes a
    functionResponse when history carries a dangling tool_use AND the
    user-supplied content doesn't already close it (Race B/C plus
    Race A from the chat side).
  - geminiChat.test.ts: chat.sendMessageStream does NOT synthesize
    when the user-supplied content IS a matching functionResponse
    (the Retry-of-ToolResult race the previous wiring tripped on).
  - useGeminiStream.test.tsx: handleCompletedTools dedups a late real
    result whose callId already has a functionResponse in chat.history
    (Race A end-to-end: markToolsAsSubmitted fires but no
    submitQuery is dispatched).

Test summary: 90/90 geminiChat, 131/131 client, 92/92 useGeminiStream;
all 8186 core tests pass; tsc clean.

Known limitation logged for follow-up: a Retry of ToolResult whose
intervening stream itself partial-pushed a SECOND tool_use produces a
trailing user turn with both the stale re-pushed `fr_A` and the
synthesized `fr_B` for the second call — `fr_A` retry lands as an
orphan because `fc_A` was already paired earlier in history. Triggered
only when partial-push fires on the second stream AND user retries the
tool_result rather than the user prompt; extremely low frequency in
practice. Cleanest fix is in the retryLastPrompt layer (don't re-push
an already-paired tool_result), which is out of scope for this PR.
---
 .../cli/src/ui/hooks/useGeminiStream.test.tsx | 122 ++++++++++++++++
 packages/cli/src/ui/hooks/useGeminiStream.ts  | 100 +++++++------
 packages/core/src/core/client.ts              |  34 ++---
 packages/core/src/core/geminiChat.test.ts     | 132 ++++++++++++++++++
 packages/core/src/core/geminiChat.ts          |  20 +++
 5 files changed, 337 insertions(+), 71 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index 6230b9009b1..92571cdaa85 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -714,6 +714,128 @@ describe('useGeminiStream', () => {
     });
   });
 
+  it('drops a late tool result whose callId is already paired in chat.history (Race A dedup)', async () => {
+    // Race A repro: the chat-internal repair pass already synthesized a
+    // functionResponse for this callId on the Retry push (because the
+    // partial-tool_use turn was orphan when Ctrl+Y landed). The live
+    // scheduler's late real result must NOT also be submitted, otherwise
+    // the wire payload would carry two functionResponse parts for the
+    // same callId and the second one would land as an orphan tool_result.
+    // The dedup MUST run regardless of `isResponding`, because the
+    // scheduler's `onAllToolCallsComplete` is single-shot and would
+    // otherwise leave the tool stuck in `completed-but-not-submitted`.
+    const lateRealResult: TrackedCompletedToolCall = {
+      request: {
+        callId: 'call_race_A',
+        name: 'read_file',
+        args: { path: '/tmp/x.txt' },
+        isClientInitiated: false,
+        prompt_id: 'prompt-race-a',
+      },
+      status: 'success',
+      responseSubmittedToGemini: false,
+      response: {
+        callId: 'call_race_A',
+        responseParts: [
+          {
+            functionResponse: {
+              id: 'call_race_A',
+              name: 'read_file',
+              response: { output: 'real file contents' },
+            },
+          },
+        ],
+        errorType: undefined,
+      },
+      tool: { displayName: 'ReadFile' },
+      invocation: {
+        getDescription: () => 'read /tmp/x.txt',
+      } as unknown as AnyToolInvocation,
+    } as TrackedCompletedToolCall;
+
+    const client = new MockedGeminiClientClass(mockConfig);
+    // Simulate the chat-internal repair pass having already planted a
+    // synthetic functionResponse for the same callId on the previous
+    // (Retry) push.
+    client.getHistory = vi.fn().mockReturnValue([
+      { role: 'user', parts: [{ text: 'open /tmp/x.txt' }] },
+      {
+        role: 'model',
+        parts: [
+          {
+            functionCall: {
+              id: 'call_race_A',
+              name: 'read_file',
+              args: { path: '/tmp/x.txt' },
+            },
+          },
+        ],
+      },
+      {
+        role: 'user',
+        parts: [
+          { text: 'retry' },
+          {
+            functionResponse: {
+              id: 'call_race_A',
+              name: 'read_file',
+              response: {
+                error: 'Tool execution result was not recorded',
+              },
+            },
+          },
+        ],
+      },
+    ]);
+
+    let capturedOnComplete:
+      | ((completedTools: TrackedToolCall[]) => Promise<void>)
+      | null = null;
+    mockUseReactToolScheduler.mockImplementation((onComplete) => {
+      capturedOnComplete = onComplete;
+      return [[], mockScheduleToolCalls, mockMarkToolsAsSubmitted];
+    });
+
+    renderHook(() =>
+      useGeminiStream(
+        client,
+        [],
+        mockAddItem,
+        mockConfig,
+        mockLoadedSettings,
+        mockOnDebugMessage,
+        mockHandleSlashCommand,
+        false,
+        () => 'vscode' as EditorType,
+        () => {},
+        () => Promise.resolve(),
+        false,
+        () => {},
+        () => {},
+        () => {},
+        () => {},
+        80,
+        24,
+      ),
+    );
+
+    await act(async () => {
+      if (capturedOnComplete) {
+        await capturedOnComplete([lateRealResult]);
+      }
+    });
+
+    await waitFor(() => {
+      // The dedup hit must `markToolsAsSubmitted` so the UI/scheduler is
+      // unblocked even though we drop the real result on the wire.
+      expect(mockMarkToolsAsSubmitted).toHaveBeenCalledWith(['call_race_A']);
+    });
+
+    // No follow-up submission: the synthetic in history already closes
+    // the tool_use ↔ tool_result pair.
+    expect(mockSendMessageStream).not.toHaveBeenCalled();
+  });
+
   it('should not flicker streaming state to Idle between tool completion and submission', async () => {
     const toolCallResponseParts: PartListUnion = [
       { text: 'tool 1 final response' },
diff --git a/packages/cli/src/ui/hooks/useGeminiStream.ts b/packages/cli/src/ui/hooks/useGeminiStream.ts
index e74636fc11a..7f4ecdd2e8a 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -1968,10 +1968,6 @@ export const useGeminiStream = (
 
   const handleCompletedTools = useCallback(
     async (completedToolCallsFromScheduler: TrackedToolCall[]) => {
-      if (isResponding) {
-        return;
-      }
-
       const completedAndReadyToSubmitTools =
         completedToolCallsFromScheduler.filter(
           (
@@ -1994,6 +1990,49 @@ export const useGeminiStream = (
           },
         );
 
+      // History-based dedup MUST run before the `isResponding` early-return.
+      // If a synthetic `functionResponse` for this callId is already in
+      // chat.history (planted on session-load by
+      // `client.repairOrphanedToolUseTurnsInHistory` or on every
+      // `chat.sendMessageStream` push by the inline repair pass), the
+      // in-flight scheduler result must be marked submitted NOW —
+      // `useReactToolScheduler.allToolCallsCompleteHandler` is single-shot
+      // per batch, so a later isResponding=true early-return would leave
+      // the tool stuck in `completed-but-not-submitted` forever (Race A
+      // surfaced in PR #4176 review). The real result is dropped on the
+      // wire — same trade-off upstream Claude Code makes when its
+      // `StreamingToolExecutor.discard()` follows a
+      // `yieldMissingToolResultBlocks` synthesis (`query.ts:733` + `:984`).
+      const historyCallIdsWithResponse = new Set<string>();
+      // Guard the call: some test harnesses build a partial GeminiClient
+      // mock without `getHistory`. Skipping dedup in that case is safe —
+      // it just means tests that never set up the repair pre-condition
+      // run with the original (pre-dedup) submission shape.
+      if (geminiClient && typeof geminiClient.getHistory === 'function') {
+        for (const entry of geminiClient.getHistory()) {
+          if (entry.role !== 'user') continue;
+          for (const part of entry.parts ?? []) {
+            const id = part.functionResponse?.id;
+            if (id) historyCallIdsWithResponse.add(id);
+          }
+        }
+      }
+      const dedupedCallIds = completedAndReadyToSubmitTools
+        .filter((tc) => historyCallIdsWithResponse.has(tc.request.callId))
+        .map((tc) => tc.request.callId);
+      if (dedupedCallIds.length > 0) {
+        debugLogger.warn(
+          `[REPAIR] Dropping ${dedupedCallIds.length} late tool result(s) ` +
+            `whose callId already has a functionResponse in history: ` +
+            `${dedupedCallIds.join(', ')}`,
+        );
+        markToolsAsSubmitted(dedupedCallIds);
+      }
+
+      if (isResponding) {
+        return;
+      }
+
       // Finalize any client-initiated tools as soon as they are done.
       const clientTools = completedAndReadyToSubmitTools.filter(
         (t) => t.request.isClientInitiated,
@@ -2019,62 +2058,19 @@ export const useGeminiStream = (
         );
       }
 
-      const geminiToolsRaw = completedAndReadyToSubmitTools.filter(
-        (t) => !t.request.isClientInitiated,
+      const geminiTools = completedAndReadyToSubmitTools.filter(
+        (t) =>
+          !t.request.isClientInitiated &&
+          !historyCallIdsWithResponse.has(t.request.callId),
       );
 
-      for (const toolCall of geminiToolsRaw) {
+      for (const toolCall of geminiTools) {
         geminiClient?.recordCompletedToolCall(
           toolCall.request.name,
           toolCall.request.args as Record<string, unknown>,
         );
       }
 
-      // History-based dedup: if a synthetic `functionResponse` for this
-      // callId is already in chat.history (planted by
-      // `client.repairOrphanedToolUseTurnsInHistory()` on session-load or
-      // Retry), the in-flight scheduler result would land as a duplicate
-      // `tool_result` and produce two consecutive user turns where the
-      // second is orphaned (no preceding tool_use — the synthetic ate it).
-      //
-      // For dedup hits: mark the tool as submitted so the UI advances and
-      // `useReactToolScheduler.allToolCallsCompleteHandler` (single-shot)
-      // doesn't leave the call permanently stuck in `completed-but-not-
-      // submitted`. The real result is dropped on the wire — same trade-off
-      // upstream Claude Code makes when its `StreamingToolExecutor.discard()`
-      // is followed by a `yieldMissingToolResultBlocks` synthesis
-      // (`query.ts:733` + `:984`). The model sees the synthetic error and
-      // can retry the tool if it still wants the result.
-      const historyCallIdsWithResponse = new Set<string>();
-      // Guard the call: some test harnesses build a partial GeminiClient
-      // mock without `getHistory`. Skipping dedup in that case is safe —
-      // it just means tests that never set up the repair pre-condition
-      // run with the original (pre-dedup) submission shape.
-      if (geminiClient && typeof geminiClient.getHistory === 'function') {
-        for (const entry of geminiClient.getHistory()) {
-          if (entry.role !== 'user') continue;
-          for (const part of entry.parts ?? []) {
-            const id = part.functionResponse?.id;
-            if (id) historyCallIdsWithResponse.add(id);
-          }
-        }
-      }
-
-      const dedupedCallIds = geminiToolsRaw
-        .filter((tc) => historyCallIdsWithResponse.has(tc.request.callId))
-        .map((tc) => tc.request.callId);
-      if (dedupedCallIds.length > 0) {
-        debugLogger.warn(
-          `[REPAIR] Dropping ${dedupedCallIds.length} late tool result(s) ` +
-            `whose callId already has a synthetic functionResponse in ` +
-            `history: ${dedupedCallIds.join(', ')}`,
-        );
-        markToolsAsSubmitted(dedupedCallIds);
-      }
-      const geminiTools = geminiToolsRaw.filter(
-        (tc) => !historyCallIdsWithResponse.has(tc.request.callId),
-      );
-
       if (geminiTools.length === 0) {
         return;
       }
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 867fa2146ed..f0df5696305 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -700,9 +700,14 @@ export class GeminiClient {
       // partial-tool_use push (see `processStreamResponse`) and the React
       // scheduler's tool_result submission. Without this pass, the first
       // API call on a resumed session would 400 with the same
-      // `tool_use_id ... corresponding tool_use` error this whole subsystem
-      // is trying to escape.
-      this.chat.repairOrphanedToolUseTurns();
+      // `tool_use_id ... corresponding tool_use` error this whole
+      // subsystem is trying to escape. (Belt-and-suspenders: the same
+      // helper runs again inside `chat.sendMessageStream` after the user
+      // content is pushed, so a dangling left here by setHistory /
+      // compaction reordering is also caught — but doing it here keeps
+      // any pre-send code reading `chat.history` from seeing a malformed
+      // shape.)
+      this.repairOrphanedToolUseTurnsInHistory();
 
       const sessionStartAdditionalContext =
         await this.fireSessionStartHook(sessionStartSource);
@@ -1091,22 +1096,13 @@ export class GeminiClient {
 
     if (messageType === SendMessageType.Retry) {
       this.stripOrphanedUserEntriesFromHistory();
-      // Close any dangling `model[functionCall]` whose tool_result never
-      // landed before composing the retry payload. Ctrl+Y race: the user
-      // retried while a tool was still running on a partial-tool_use turn
-      // pushed by `processStreamResponse`'s mid-stream error path. The
-      // scheduler's `onAllToolCallsComplete` is single-shot and gated on
-      // `isResponding` (`useGeminiStream:1971`), so the eventual
-      // `tool_result` would otherwise be silently swallowed and the next
-      // API call would 400 with "tool_use_id ... corresponding tool_use"
-      // anyway. The synthesized `error` `functionResponse` keeps the wire
-      // invariant intact; the live scheduler dedupes against history in
-      // `handleCompletedTools` before submitting its real result so the
-      // synthetic doesn't collide with a late real one.
-      //
-      // Restricted to the Retry branch to mirror `stripOrphanedUserEntries`
-      // scope. Crash-resume's path is covered separately in `startChat()`.
-      this.repairOrphanedToolUseTurnsInHistory();
+      // The matching dangling-`functionCall` repair runs inside
+      // `chat.sendMessageStream` AFTER the user content is pushed, so any
+      // tool_result the user is supplying (Retry of a ToolResult
+      // submission, lastPrompt === fr parts) closes the pair via the real
+      // `functionResponse` before we synthesize an error one. Doing the
+      // repair here would happen pre-push and race against the user
+      // content's own pairing — see PR #4176 review for the corner.
     }
 
     // Fire UserPromptSubmit hook through MessageBus (only if hooks are enabled)
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index bd281d537a3..cacd85af8a2 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -597,6 +597,138 @@ describe('GeminiChat', async () => {
         'This is the visible text that should not be lost.',
       );
     });
+
+    it('synthesizes a functionResponse for a dangling tool_use before sending', async () => {
+      // End-to-end: when sendMessageStream is invoked on a chat whose
+      // history carries a dangling `model[functionCall]` (typical state
+      // after a Ctrl+Y race or a crash-resume on a partial-tool_use
+      // turn), the inline repair pass closes the pair against the
+      // just-pushed user content so the wire payload doesn't 400 with
+      // "tool_use_id ... corresponding tool_use".
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'first message' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'call_dangling_for_send',
+                name: 'read_file',
+                args: { path: '/tmp/x' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const ackStream = (async function* () {
+        yield {
+          candidates: [
+            {
+              content: { role: 'model', parts: [{ text: 'ok' }] },
+              finishReason: 'STOP',
+            },
+          ],
+        } as unknown as GenerateContentResponse;
+      })();
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        ackStream,
+      );
+
+      const stream = await chat.sendMessageStream(
+        'test-model',
+        { message: 'next user prompt after a stream-error-mid-tool_use' },
+        'prompt-send-repair',
+      );
+      for await (const _ of stream) {
+        /* drain */
+      }
+
+      const history = chat.getHistory();
+      // The dangling fc should now be followed by a user turn that
+      // carries both the user-supplied text AND the synthetic fr that
+      // closes the pair.
+      const userTurn = history[2]!;
+      expect(userTurn.role).toBe('user');
+      const fr = userTurn.parts!.find((p) => p.functionResponse);
+      expect(fr?.functionResponse?.id).toBe('call_dangling_for_send');
+      expect(fr?.functionResponse?.name).toBe('read_file');
+      expect(
+        (fr?.functionResponse?.response as { error?: string })?.error,
+      ).toMatch(/interrupted/i);
+      // The user's own text part is still present.
+      expect(
+        userTurn.parts!.some(
+          (p) =>
+            p.text === 'next user prompt after a stream-error-mid-tool_use',
+        ),
+      ).toBe(true);
+    });
+
+    it('does NOT synthesize when the user supplies a matching tool_result', async () => {
+      // Retry-of-ToolResult case (lastPrompt is a functionResponse Part
+      // array): the user-supplied tool_result must close the pair before
+      // the inline repair pass sees it, so no synthetic error is
+      // injected. Otherwise the wire payload would carry two
+      // functionResponse parts for the same callId — the real one and a
+      // bogus synthetic.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'do the read' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'call_retry_real_fr',
+                name: 'read_file',
+                args: { path: '/tmp/y' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const ackStream = (async function* () {
+        yield {
+          candidates: [
+            {
+              content: { role: 'model', parts: [{ text: 'ack' }] },
+              finishReason: 'STOP',
+            },
+          ],
+        } as unknown as GenerateContentResponse;
+      })();
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        ackStream,
+      );
+
+      const stream = await chat.sendMessageStream(
+        'test-model',
+        {
+          message: {
+            functionResponse: {
+              id: 'call_retry_real_fr',
+              name: 'read_file',
+              response: { output: 'real-tool-output' },
+            },
+          },
+        },
+        'prompt-retry-real-fr',
+      );
+      for await (const _ of stream) {
+        /* drain */
+      }
+
+      const userTurn = chat.getHistory()[2]!;
+      const frParts = userTurn.parts!.filter((p) => p.functionResponse);
+      // Exactly ONE functionResponse — the real one. No synthetic.
+      expect(frParts.length).toBe(1);
+      expect(frParts[0]!.functionResponse?.id).toBe('call_retry_real_fr');
+      expect(
+        (frParts[0]!.functionResponse?.response as { output?: string })?.output,
+      ).toBe('real-tool-output');
+    });
+
     it('should throw an error when a tool call is followed by an empty stream response', async () => {
       vi.useFakeTimers();
       try {
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index ee4e6d90fcc..44d50e64d26 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -728,6 +728,26 @@ export class GeminiChat {
       // Add user content to history ONCE before any attempts.
       this.history.push(userContent);
       userContentAdded = true;
+      // Close any dangling `model[functionCall]` whose `functionResponse`
+      // never landed by the time we compose the request. Runs AFTER the
+      // user-supplied turn lands so a tool_result the user is supplying
+      // gets the first chance to close the pair before we synthesize an
+      // `error` `functionResponse`. Covers:
+      //   - Stream errored mid-tool_use (partial assistant push left a
+      //     dangling functionCall), then the React scheduler's eventual
+      //     tool_result lost the race against a Ctrl+Y retry whose
+      //     onAllToolCallsComplete fired into `isResponding=true` and
+      //     skipped submission.
+      //   - The same shape from a process crash / OOM mid-flight (the
+      //     transcript JSONL preserves the dangling model[fc] across
+      //     `--resume`; `startChat()` calls this once on load, but a
+      //     belt-and-suspenders pass here covers anything that slipped
+      //     past — including dangling shapes the load-time repair didn't
+      //     visit because compaction / setHistory ran after it).
+      // The React scheduler's late real result is then dedup'd against
+      // chat.history in `useGeminiStream.handleCompletedTools` so the
+      // synthetic doesn't collide with it on the wire.
+      repairOrphanedToolUseTurns(this.history);
       requestContents = this.getHistory(true);
     } catch (error) {
       if (userContentAdded) {

From fc4d57b6eacb78044eb32f167d2b41dbd7645f33 Mon Sep 17 00:00:00 2001
From: wenshao <shaojin.wensj@alibaba-inc.com>
Date: Sat, 16 May 2026 12:10:01 +0800
Subject: [PATCH 05/18] =?UTF-8?q?fix(core,cli):=20close=20tool=5Fuse?=
 =?UTF-8?q?=E2=86=94tool=5Fresult=20invariant=20at=20failure=20points?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review surfaced four issues on top of b2fed61cd. All addressed here:

  - [P0] Synthetic functionResponse was appended to the trailing user
    turn's parts; on a Ctrl+Y race the shape became `user[text, fr]` and
    Anthropic-compatible backends still 400 because tool_result blocks
    must come FIRST in a user message (Claude Code does the same hoist
    in utils/messages.ts:hoistToolResults). Repair now slots synthetic
    fr in before the first non-functionResponse part, after any
    pre-existing real fr parts (which preserves real-tool-result order
    on a parallel-partial-submit). Tests asserting the part order added.

  - [P0] The dedup test fixture in useGeminiStream.test.tsx forced an
    incompatible object through a direct cast to TrackedCompletedToolCall;
    `tsc` rejected it with TS2352. Fixture restructured to match the
    type's actual shape (matches the existing `Mocked` patterns at
    L597-606 / L2396-2401) so the changed-file typecheck stays green.

  - [P1] On stream error, `recordAssistantTurn` ran unconditionally
    (gated only on parts being non-empty), so partial text-only turns
    we intentionally do NOT push into in-memory history were still
    written to the chat-recording JSONL — leaving --resume's
    transcript-load path to re-inject a model turn the in-session run
    discarded. Gate now matches the `history.push` decision: record
    iff we will persist (success path OR stream-error-with-tool_use).

  - [P2] Added client-level tests for the startChat repair wiring:
    extraHistory ending in `model[functionCall]` triggers synthetic fr
    injection on init; happy-path resume is a no-op. Defends against a
    future reorder/removal of the call in startChat() regressing the
    --resume recovery path.

All 91 geminiChat, 133 client, 92 useGeminiStream tests pass. Full
core suite (8186 tests) green. Changed-file tsc clean except for the
pre-existing `recordCompletedToolCall` typing gap that exists on main.
---
 .../cli/src/ui/hooks/useGeminiStream.test.tsx | 13 +++-
 packages/core/src/core/client.test.ts         | 74 +++++++++++++++++++
 packages/core/src/core/geminiChat.test.ts     | 65 ++++++++++++++--
 packages/core/src/core/geminiChat.ts          | 42 ++++++++++-
 4 files changed, 183 insertions(+), 11 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index 92571cdaa85..124ce019f94 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -724,7 +724,7 @@ describe('useGeminiStream', () => {
     // The dedup MUST run regardless of `isResponding`, because the
     // scheduler's `onAllToolCallsComplete` is single-shot and would
     // otherwise leave the tool stuck in `completed-but-not-submitted`.
-    const lateRealResult: TrackedCompletedToolCall = {
+    const lateRealResult = {
       request: {
         callId: 'call_race_A',
         name: 'read_file',
@@ -745,13 +745,20 @@ describe('useGeminiStream', () => {
             },
           },
         ],
+        resultDisplay: undefined,
+        error: undefined,
         errorType: undefined,
       },
-      tool: { displayName: 'ReadFile' },
+      tool: {
+        name: 'read_file',
+        displayName: 'ReadFile',
+        description: 'Read a file',
+        build: vi.fn(),
+      } as any,
       invocation: {
         getDescription: () => 'read /tmp/x.txt',
       } as unknown as AnyToolInvocation,
-    } as TrackedCompletedToolCall;
+    } as unknown as TrackedCompletedToolCall;
 
     const client = new MockedGeminiClientClass(mockConfig);
     // Simulate the chat-internal repair pass having already planted a
diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index 85501c8fa04..c937212381f 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -1026,6 +1026,80 @@ describe('Gemini Client (client.ts)', () => {
     });
   });
 
+  describe('startChat — repair orphan tool_use on resume', () => {
+    it('synthesizes a functionResponse for a transcript ending in a dangling model[functionCall]', async () => {
+      // --resume of a session that crashed (OOM / SIGKILL / process exit)
+      // between the partial-tool_use push in `processStreamResponse` and
+      // the React scheduler's `submitQuery(ToolResult)`. The persisted
+      // JSONL ends with `model[functionCall]` and no matching user
+      // `functionResponse`. Without the repair pass running at session
+      // load, the first API call after `--resume` would 400 with
+      // "tool_use_id ... must have a corresponding tool_use block in
+      // the previous message" — exactly the wedge this PR is supposed
+      // to escape. Covers the only resume-time integration point for
+      // the repair, so a future reorder/removal of the call in
+      // `startChat()` regresses this test.
+      await client.startChat([
+        {
+          role: 'user',
+          parts: [{ text: 'open /tmp/crash.txt' }],
+        },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'call_crash_resume',
+                name: 'read_file',
+                args: { path: '/tmp/crash.txt' },
+              },
+            } as never,
+          ],
+        },
+      ]);
+
+      const history = client.getHistory();
+      // startChat prepends a mocked env-context user/model pair, then
+      // appends the supplied extraHistory; the repair pass must then
+      // splice a synthetic user[functionResponse] AFTER the dangling
+      // model[fc]. Locate the dangling model entry by its callId and
+      // verify the immediately-following entry carries the synthetic.
+      const danglingIdx = history.findIndex(
+        (h) =>
+          h.role === 'model' &&
+          h.parts?.some((p) => p.functionCall?.id === 'call_crash_resume'),
+      );
+      expect(danglingIdx).toBeGreaterThanOrEqual(0);
+      const userAfter = history[danglingIdx + 1];
+      expect(userAfter?.role).toBe('user');
+      const fr = userAfter?.parts!.find((p) => p.functionResponse);
+      expect(fr?.functionResponse?.id).toBe('call_crash_resume');
+      expect(fr?.functionResponse?.name).toBe('read_file');
+      expect(
+        (fr?.functionResponse?.response as { error?: string })?.error,
+      ).toMatch(/interrupted/i);
+    });
+
+    it('is a no-op when the resumed transcript has no dangling tool_use', async () => {
+      // Happy resume path: don't inject a synthetic functionResponse
+      // into a transcript whose tool_use pairing is already valid (or,
+      // as here, has no tool_use at all). Defends against a future
+      // regression where the repair pass starts spuriously injecting on
+      // perfectly-formed history.
+      await client.startChat([
+        { role: 'user', parts: [{ text: 'q' }] },
+        { role: 'model', parts: [{ text: 'plain text reply' }] },
+      ]);
+
+      const history = client.getHistory();
+      // No functionResponse anywhere — repair did nothing.
+      const hasAnyFunctionResponse = history.some((h) =>
+        h.parts?.some((p) => p.functionResponse),
+      );
+      expect(hasAnyFunctionResponse).toBe(false);
+    });
+  });
+
   describe('setTools — system instruction refresh', () => {
     // Regression coverage for the progressive-MCP wiring bug: when MCP
     // discovery completes AFTER startChat() (the new default), `setTools()`
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index cacd85af8a2..9f10869aab6 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -663,6 +663,13 @@ describe('GeminiChat', async () => {
             p.text === 'next user prompt after a stream-error-mid-tool_use',
         ),
       ).toBe(true);
+      // tool_result block must come BEFORE the text — Anthropic-
+      // compatible backends reject a user message whose first content
+      // block isn't the tool_result answering the immediately preceding
+      // tool_use. Mirrors upstream Claude Code's `hoistToolResults`.
+      expect(userTurn.parts![0]!.functionResponse?.id).toBe(
+        'call_dangling_for_send',
+      );
     });
 
     it('does NOT synthesize when the user supplies a matching tool_result', async () => {
@@ -3488,14 +3495,20 @@ describe('GeminiChat', async () => {
       );
     });
 
-    it('appends synthetic functionResponse onto an existing user turn (Race A)', () => {
+    it('hoists synthetic functionResponse to the front of an existing user turn (Race A)', () => {
       // Ctrl+Y race: the user retried while the in-flight tool was still
       // running. `stripOrphanedUserEntriesFromHistory` leaves the
       // model[functionCall] in place (trailing entry is model), then the
       // Retry pushes a fresh user turn with the user prompt. Repair must
       // splice the synthetic response onto that user turn so it sits
       // immediately after the model[tool_use] — NOT create a stray
-      // synthetic user turn between them.
+      // synthetic user turn between them. Crucially the synthetic
+      // functionResponse must come BEFORE the text part: Anthropic-
+      // compatible backends require tool_result blocks to be first in
+      // the user message (mirrors upstream Claude Code's
+      // `hoistToolResults`). Otherwise the wire payload re-triggers the
+      // "tool_use_id ... must have a corresponding tool_use block in the
+      // previous message" 400 this PR is supposed to escape.
       chat.setHistory([
         { role: 'user', parts: [{ text: 'open /tmp/a.txt' }] },
         {
@@ -3517,12 +3530,54 @@ describe('GeminiChat', async () => {
 
       expect(result.injected.map((e) => e.callId)).toEqual(['call_race_A']);
       const history = chat.getHistory();
-      // No new turn inserted — synthetic merges into existing user turn.
       expect(history.length).toBe(3);
       expect(history[2]!.role).toBe('user');
       expect(history[2]!.parts!.length).toBe(2);
-      expect(history[2]!.parts![0]).toEqual({ text: 'retry prompt' });
-      expect(history[2]!.parts![1]!.functionResponse?.id).toBe('call_race_A');
+      // synthetic fr FIRST, user text AFTER.
+      expect(history[2]!.parts![0]!.functionResponse?.id).toBe('call_race_A');
+      expect(history[2]!.parts![1]).toEqual({ text: 'retry prompt' });
+    });
+
+    it('hoists synthetic functionResponse AFTER pre-existing real ones (parallel partial submit)', () => {
+      // Parallel tool_use with one real functionResponse already in the
+      // user turn — synthetic for the missing callId must slot in
+      // between the real fr and any non-fr parts so the user message
+      // shape stays `[real_fr, synthetic_fr, text]` (every tool_result
+      // before any other content, preserving the real-fr order).
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'batch read' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: { id: 'call_A', name: 'read_file', args: {} },
+            },
+            {
+              functionCall: { id: 'call_B', name: 'read_file', args: {} },
+            },
+          ],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'call_A',
+                name: 'read_file',
+                response: { output: 'a' },
+              },
+            },
+            { text: 'retry prompt' },
+          ],
+        },
+      ]);
+
+      chat.repairOrphanedToolUseTurns();
+      const parts = chat.getHistory()[2]!.parts!;
+      expect(parts.length).toBe(3);
+      expect(parts[0]!.functionResponse?.id).toBe('call_A');
+      expect(parts[1]!.functionResponse?.id).toBe('call_B');
+      expect(parts[2]).toEqual({ text: 'retry prompt' });
     });
 
     it('handles parallel tool_use turns with only some responses present', () => {
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 44d50e64d26..c82996747f7 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -457,7 +457,29 @@ export function repairOrphanedToolUseTurns(
     }));
 
     if (next?.role === 'user') {
-      next.parts = [...(next.parts ?? []), ...syntheticParts];
+      // Synthetic functionResponse parts MUST be placed before any
+      // non-functionResponse parts in the user turn. Anthropic-compatible
+      // backends reject a user message whose first content block isn't
+      // the tool_result that answers the immediately preceding tool_use
+      // ("tool result must follow tool use" / "tool_use_id ... must have
+      // a corresponding tool_use block in the previous message"). Common
+      // case after a Ctrl+Y race: the user's retry-prompt text was just
+      // pushed, so `next.parts = [text]`; appending the synthetic to the
+      // end would produce `[text, fr]` and re-trigger the wedge this PR
+      // is supposed to escape. Mirrors upstream Claude Code's
+      // `hoistToolResults` (`utils/messages.ts`).
+      //
+      // Place synthetics AFTER any pre-existing functionResponse parts
+      // (real tool_results the user is supplying in the same turn) so
+      // their original ordering is preserved.
+      const existing = next.parts ?? [];
+      const firstNonFr = existing.findIndex((part) => !part.functionResponse);
+      const insertAt = firstNonFr === -1 ? existing.length : firstNonFr;
+      next.parts = [
+        ...existing.slice(0, insertAt),
+        ...syntheticParts,
+        ...existing.slice(insertAt),
+      ];
     } else {
       history.splice(i + 1, 0, { role: 'user', parts: syntheticParts });
       // Skip the freshly-inserted user turn so the outer loop doesn't
@@ -1489,8 +1511,22 @@ export class GeminiChat {
       .join('')
       .trim();
 
-    // Record assistant turn with raw Content and metadata
-    if (thoughtContentPart || contentText || hasToolCall || usageMetadata) {
+    // Record assistant turn with raw Content and metadata. Gate matches
+    // the in-memory `this.history.push` decision below so chat-recording
+    // JSONL never carries a partial turn we deliberately dropped from
+    // history: on `--resume` the transcript-load path would otherwise
+    // re-inject a model turn the in-session run intentionally discarded
+    // (text-only mid-stream errors, where the Retry re-issues the user
+    // prompt — a stale partial-text record would bias the resumed
+    // conversation or surface as duplicate output).
+    const willPersistToHistory =
+      streamError === null ||
+      (hasToolCall &&
+        (thoughtContentPart || consolidatedHistoryParts.length > 0));
+    if (
+      willPersistToHistory &&
+      (thoughtContentPart || contentText || hasToolCall || usageMetadata)
+    ) {
       const contextWindowSize =
         this.config.getContentGeneratorConfig()?.contextWindowSize;
       this.chatRecordingService?.recordAssistantTurn({

From db344a40377688baa99cfb6e37688d5feab6cb3a Mon Sep 17 00:00:00 2001
From: wenshao <shaojin.wensj@alibaba-inc.com>
Date: Sat, 16 May 2026 21:49:08 +0800
Subject: [PATCH 06/18] fix(core): roll back partial assistant push on
 retryable mid-stream errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Regression surfaced by @yiliang114 on PR #4176: a stream attempt that
yields a `functionCall` and then throws a retryable error (e.g.
`StreamContentError` with a 429 payload, or `InvalidStreamError`)
triggers the partial-assistant-turn push in `processStreamResponse`,
but the outer `sendMessageStream` retry loop catches the error and
issues a fresh attempt. The failed-attempt `model[functionCall]` was
left in history, and the successful retry's response landed as a
SECOND consecutive `model` entry — invalid user/model alternation
AND the failed-attempt `tool_use` is orphan on the wire (no matching
tool_result). The very wedge this PR is meant to escape.

Track the index of the pushed partial in a new
`pendingPartialAssistantTurnIndex` field, reset it on every
`sendMessageStream` entry, and pop it before each retry-and-continue
path in the catch block (rate-limit, reactive-compression, transient
stream anomaly, content-validation retry). Paths that `break`
(unretryable) keep the partial — the caller will see it as part of
the error surface and the existing repair / dedup machinery in
`useGeminiStream.handleCompletedTools` handles it correctly.

The pop is defensive (index-bounds + role check) so a hypothetical
intervening `setHistory` / `truncateHistory` can't cause an out-of-
bounds splice.

Adds a regression test in `geminiChat.test.ts` that reproduces
@yiliang114's exact shape: yield `functionCall`, throw
`StreamContentError(429)`, second attempt yields `Success after retry`
plain text + STOP. Asserts the final history is exactly
`[user, model(success text)]` — no leading failed-attempt model turn,
no orphan `functionCall` anywhere.

All 92 geminiChat, 133 client, 92 useGeminiStream tests pass; full
core suite green; tsc clean.
---
 packages/core/src/core/geminiChat.test.ts | 81 +++++++++++++++++++++++
 packages/core/src/core/geminiChat.ts      | 56 ++++++++++++++++
 2 files changed, 137 insertions(+)

diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index 9f10869aab6..af95f43cc7b 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -2382,6 +2382,87 @@ describe('GeminiChat', async () => {
       }
     });
 
+    it('rolls back the partial assistant turn when a retryable error fires after a tool_use chunk', async () => {
+      // Regression for the case @yiliang114 reproduced on #4176: stream
+      // attempt 1 yields a `functionCall` (which triggers the partial-
+      // history push in `processStreamResponse`), then the SAME stream
+      // throws a retryable error (e.g. a TPM 429 `StreamContentError`).
+      // The outer retry loop must drop the partial before issuing the
+      // retry — otherwise the retry's response lands as a SECOND
+      // consecutive `model` entry and the failed-attempt `tool_use`
+      // becomes orphan on the wire (invalid alternation +
+      // tool_use_id-with-no-matching-tool_use 400).
+      vi.useFakeTimers();
+      try {
+        const tpmError = new StreamContentError(
+          '{"error":{"code":"429","message":"Throttling: TPM(1/1)"}}',
+        );
+        const failingStream = (async function* () {
+          yield {
+            candidates: [
+              {
+                content: {
+                  parts: [
+                    {
+                      functionCall: {
+                        id: 'call_failed_retry_attempt',
+                        name: 'read_file',
+                        args: { path: '/tmp/a.txt' },
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+          throw tpmError;
+        })();
+        const successStream = (async function* () {
+          yield {
+            candidates: [
+              {
+                content: { parts: [{ text: 'Success after retry' }] },
+                finishReason: 'STOP',
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+        })();
+        vi.mocked(mockContentGenerator.generateContentStream)
+          .mockResolvedValueOnce(failingStream)
+          .mockResolvedValueOnce(successStream);
+
+        const stream = await chat.sendMessageStream(
+          'test-model',
+          { message: 'test' },
+          'prompt-rollback-on-retry',
+        );
+        const iterator = stream[Symbol.asyncIterator]();
+        // Advance through the rate-limit RETRY + delay, drain all events.
+        for (;;) {
+          const next = iterator.next();
+          await vi.advanceTimersByTimeAsync(60_000);
+          const r = await next;
+          if (r.done) break;
+        }
+
+        const history = chat.getHistory();
+        // History must NOT contain the failed attempt's partial
+        // model[functionCall]. Expected shape: [user, model(success
+        // text)] — exactly two entries, alternation intact.
+        expect(history.length).toBe(2);
+        expect(history[0]!.role).toBe('user');
+        expect(history[1]!.role).toBe('model');
+        const successText = history[1]!.parts!.find((p) => p.text)?.text;
+        expect(successText).toBe('Success after retry');
+        // Defensively: NO functionCall anywhere in history.
+        expect(history.some((h) => h.parts?.some((p) => p.functionCall))).toBe(
+          false,
+        );
+      } finally {
+        vi.useRealTimers();
+      }
+    });
+
     it('should retry on TPM throttling StreamContentError with initial delay', async () => {
       vi.useFakeTimers();
 
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index c82996747f7..25923cfc463 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -553,6 +553,24 @@ export class GeminiChat {
    */
   private hasFailedCompressionAttempt = false;
 
+  /**
+   * Index into `this.history` of the model turn that `processStreamResponse`
+   * persisted on the CURRENT in-flight attempt's mid-stream error. `null` if
+   * no partial has been pushed (the common case).
+   *
+   * The retry loop in `sendMessageStream` reads this on every catch to roll
+   * the partial back BEFORE retrying — without that pop, a retryable
+   * mid-stream error (rate limit, transient stream anomaly) leaves the
+   * failed attempt's `model[functionCall]` in history, and the successful
+   * retry's response lands as a SECOND consecutive model turn (invalid
+   * user/model alternation, plus the failed-attempt tool_use is orphan on
+   * the wire — the very wedge this whole subsystem is meant to escape).
+   *
+   * Reset to `null` on every `sendMessageStream` entry so a marker left
+   * over from a prior unretryable break doesn't bleed into the next send.
+   */
+  private pendingPartialAssistantTurnIndex: number | null = null;
+
   /**
    * Creates a new GeminiChat instance.
    *
@@ -730,6 +748,12 @@ export class GeminiChat {
     });
     this.sendPromise = streamDonePromise;
 
+    // Clear any partial-push marker left over from a prior unretryable
+    // break path — the marker is per-send; carrying it across sends
+    // would let the next send's retry catch wrongly pop a now-valid
+    // model entry sitting at the stale index.
+    this.pendingPartialAssistantTurnIndex = null;
+
     let compressionInfo: ChatCompressionInfo;
     let requestContents: Content[];
     let userContentAdded = false;
@@ -855,12 +879,35 @@ export class GeminiChat {
           } catch (error) {
             lastError = error;
 
+            // If `processStreamResponse` persisted a partial assistant turn
+            // (mid-stream error after a `functionCall` was already yielded),
+            // every retry-and-continue path below must drop that turn first.
+            // Otherwise a successful retry's response lands AFTER the stale
+            // failed-attempt model turn — two consecutive `model` entries
+            // with an orphan tool_use in the first, re-triggering the
+            // "tool_use_id ... corresponding tool_use" 400 this fix is
+            // supposed to escape. Paths that `break` (unretryable) keep
+            // the partial — the caller will see it as part of the error
+            // surface.
+            const popPartialIfPushed = () => {
+              const idx = self.pendingPartialAssistantTurnIndex;
+              if (idx === null) return;
+              if (
+                self.history.length > idx &&
+                self.history[idx]?.role === 'model'
+              ) {
+                self.history.splice(idx, 1);
+              }
+              self.pendingPartialAssistantTurnIndex = null;
+            };
+
             // Handle rate-limit / throttling errors returned as stream content.
             // These arrive as StreamContentError with finish_reason="error_finish"
             // from the pipeline, containing the throttling message in the content.
             // Covers TPM throttling, GLM rate limits, and other provider throttling.
             const isRateLimit = isRateLimitError(error, extraRetryErrorCodes);
             if (isRateLimit && rateLimitRetryCount < maxRateLimitRetries) {
+              popPartialIfPushed();
               rateLimitRetryCount++;
               const delayMs = getRateLimitRetryDelayMs(rateLimitRetryCount, {
                 ...RATE_LIMIT_RETRY_OPTIONS,
@@ -935,6 +982,7 @@ export class GeminiChat {
                     reactiveInfo.compressionStatus ===
                     CompressionStatus.COMPRESSED
                   ) {
+                    popPartialIfPushed();
                     requestContents = self.getHistory(true);
                     debugLogger.info(
                       `Reactive compression succeeded: ` +
@@ -991,6 +1039,7 @@ export class GeminiChat {
               isTransientStreamError &&
               invalidStreamRetryCount < INVALID_STREAM_RETRY_CONFIG.maxRetries
             ) {
+              popPartialIfPushed();
               invalidStreamRetryCount++;
               const delayMs =
                 INVALID_STREAM_RETRY_CONFIG.initialDelayMs *
@@ -1024,6 +1073,7 @@ export class GeminiChat {
             const isContentError = error instanceof InvalidStreamError;
             if (isContentError) {
               if (attempt < INVALID_CONTENT_RETRY_OPTIONS.maxAttempts - 1) {
+                popPartialIfPushed();
                 logContentRetry(
                   self.config,
                   new ContentRetryEvent(
@@ -1586,6 +1636,12 @@ export class GeminiChat {
             ...consolidatedHistoryParts,
           ],
         });
+        // Track the pushed turn so the outer sendMessageStream retry loop
+        // can roll it back if it decides to retry the same send. Without
+        // this, a successful retry would leave the failed attempt's
+        // partial `model[functionCall]` as a stale leading model turn in
+        // front of the retry's real response.
+        this.pendingPartialAssistantTurnIndex = this.history.length - 1;
       }
       throw streamError;
     }

From 7f1c561aa2537973feacc31a519bf73a5281c739 Mon Sep 17 00:00:00 2001
From: wenshao <shaojin.wensj@alibaba-inc.com>
Date: Sun, 17 May 2026 08:44:17 +0800
Subject: [PATCH 07/18] fix(core): clear pendingPartialAssistantTurnIndex on
 history replacement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Audit follow-up to db344a403. The retry-rollback marker
`pendingPartialAssistantTurnIndex` captures an absolute index into
`this.history`, but `setHistory` / `clearHistory` / `truncateHistory`
wipe or shift the underlying array without touching the marker. Two
paths where this can bite:

  - Reactive compression on contextOverflow calls `this.setHistory(
    newHistory)` inside `tryCompress`. If a partial push has happened
    earlier (extreme edge case — would require a partial push followed
    by a context-overflow on retry), the marker's stale index could
    coincide with a model entry in the compressed history, and
    `popPartialIfPushed` would splice the wrong turn.

  - `/clear` / `--resume` / programmatic `setHistory` / external
    `chat.truncateHistory()` (Session.ts:244) wipe the basis the marker
    was captured against, leaving the next send's
    `popPartialIfPushed` with a meaningless index.

The fix is mechanical: each history-replacement method clears the
marker. The marker is per-send and ephemeral, so losing it across a
history replacement is safe (the next send either has no partial yet,
or will push a fresh one with a fresh index).

`popPartialIfPushed` already bounds-checks + role-checks before
splicing, so this is defense-in-depth — the bug above requires the
defensive check to ALSO line up. But cheap to harden and the comment
documents the invariant for future readers. 235 core/geminiChat +
core/client tests still pass.
---
 packages/core/src/core/geminiChat.ts | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 751b3ce02b4..50b6625ba2b 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -1447,6 +1447,13 @@ export class GeminiChat {
    */
   clearHistory(): void {
     this.history = [];
+    // Any pending partial-push marker points into the now-empty history;
+    // resetting prevents `popPartialIfPushed` from splicing whatever
+    // shows up at that index in a future send (defense-in-depth — the
+    // helper also bounds-checks, but a stale marker that happens to
+    // line up with a real model turn could otherwise pop the wrong
+    // entry).
+    this.pendingPartialAssistantTurnIndex = null;
   }
 
   /**
@@ -1458,10 +1465,24 @@ export class GeminiChat {
 
   setHistory(history: Content[]): void {
     this.history = history;
+    // History replacement (compression, /clear, --resume reload) wipes
+    // the index basis the partial-push marker was captured against. The
+    // marker MUST be cleared — otherwise `popPartialIfPushed` could find
+    // a model turn at the stale index in the replacement history and
+    // splice an entry that has nothing to do with the original partial
+    // push, corrupting the conversation.
+    this.pendingPartialAssistantTurnIndex = null;
   }
 
   truncateHistory(keepCount: number): void {
     this.history = this.history.slice(0, keepCount);
+    // Truncation can drop the entry the partial-push marker points at,
+    // or leave it valid but shift the meaning of nearby indices. Reset
+    // the marker rather than try to fix it up — it's per-send and
+    // ephemeral, so losing it across a truncate is safe (the
+    // sendMessageStream that pushed it has already finished or will
+    // start fresh on the next call).
+    this.pendingPartialAssistantTurnIndex = null;
   }
 
   stripThoughtsFromHistory(): void {

From 08216d9484a21d491c9b0609765acb5d118e4bb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Mon, 18 May 2026 00:59:23 +0800
Subject: [PATCH 08/18] fix(core,cli): close mimo-v2.5-pro review gaps on
 partial-tool_use repair
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four review items from the mimo-v2.5-pro pass on #4176:

S1 — `addHistory` and `stripThoughtsFromHistory` did not reset
`pendingPartialAssistantTurnIndex`. `setHistory`/`truncateHistory`/
`clearHistory` already do; consistency with the defensive pattern
matters because `stripThoughtsFromHistory` filter+map produces a new
array that makes the marker stale, and any future addHistory variant
that splices into the middle could leave `popPartialIfPushed` looking
at a wrong index. Comments updated to record the actual semantics
(plain push() is safe; a future splicing variant would not be).

S2 — When the history-based dedup in `useGeminiStream.handleCompletedTools`
filters a tool out of `geminiTools` because its callId is already paired
in chat.history, `recordCompletedToolCall` was never called for it.
That silently skips the `toolCallCount` increment and the
`skillsModifiedInSession` flip — the latter gates the skills-reload
prompt at end-of-turn, so a deduped `write_file` under a project
SKILLS path would never trigger a reload. Now records inline with the
dedup mark, filtered to non-client-initiated to match the original
geminiTools loop's shape.

S3 — Existing Race A dedup test never set `isResponding=true`, so a
future refactor that moves the dedup block below the `if (isResponding)
return` guard would pass every test while silently leaving deduped
tools stuck in `completed-but-not-submitted` (the scheduler's
`onAllToolCallsComplete` is single-shot per batch). New test holds a
stream open via `submitQuery` to pin `isResponding=true`, then triggers
`onComplete` and asserts `markToolsAsSubmitted` still fires. Verified
the test fails when the dedup is moved below the guard. Existing
Race A test also gained a `recordCompletedToolCall` assertion locking
in the S2 fix.

N1 — Hoisting comment in `repairOrphanedToolUseTurns` documented the
why (mirror upstream `hoistToolResults`) but not the consequence of
removal. Added "CONSEQUENCE OF REMOVAL" paragraph spelling out the
exact 400 a naive `next.parts = [...existing, ...syntheticParts]`
re-introduces, so the constraint is unmissable for future maintainers.

Verified: 94/94 useGeminiStream, 102/102 geminiChat, 133/133 client
tests pass. tsc clean both packages, eslint + prettier clean on all
three changed files. S3 regression-injection check confirmed the new
test catches the dedup-below-guard regression.
---
 .../cli/src/ui/hooks/useGeminiStream.test.tsx | 168 ++++++++++++++++++
 packages/cli/src/ui/hooks/useGeminiStream.ts  |  23 ++-
 packages/core/src/core/geminiChat.ts          |  23 +++
 3 files changed, 211 insertions(+), 3 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index 492f9fa97fd..c32f29e2966 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -841,11 +841,179 @@ describe('useGeminiStream', () => {
       expect(mockMarkToolsAsSubmitted).toHaveBeenCalledWith(['call_race_A']);
     });
 
+    // The deduped tool DID run locally — `recordCompletedToolCall` must
+    // still fire so toolCallCount / skillsModifiedInSession reflect it,
+    // even though the wire-side submission is dropped. (Regression guard
+    // for the gap mimo-v2.5-pro flagged: filtering deduped tools out of
+    // `geminiTools` without recording skipped the metric increment.)
+    expect(client.recordCompletedToolCall).toHaveBeenCalledWith('read_file', {
+      path: '/tmp/x.txt',
+    });
+
     // No follow-up submission: the synthetic in history already closes
     // the tool_use ↔ tool_result pair.
     expect(mockSendMessageStream).not.toHaveBeenCalled();
   });
 
+  it('runs Race A dedup BEFORE the isResponding early-return (regression guard)', async () => {
+    // The dedup block in handleCompletedTools is intentionally placed
+    // ABOVE the `if (isResponding) return;` early-return: the scheduler's
+    // `onAllToolCallsComplete` is single-shot per batch, so if the dedup
+    // sat below the guard a tool whose result was already paired in
+    // history would be left in `completed-but-not-submitted` forever
+    // whenever the late completion lands while the next stream is still
+    // in flight (isResponding=true). This test holds a stream open to
+    // pin isResponding=true, then asserts `markToolsAsSubmitted` still
+    // fires for the deduped callId. A future refactor that moves the
+    // dedup below the guard would silently break this and pass every
+    // other test.
+    const lateRealResult = {
+      request: {
+        callId: 'call_race_A_responding',
+        name: 'read_file',
+        args: { path: '/tmp/y.txt' },
+        isClientInitiated: false,
+        prompt_id: 'prompt-race-a-responding',
+      },
+      status: 'success',
+      responseSubmittedToGemini: false,
+      response: {
+        callId: 'call_race_A_responding',
+        responseParts: [
+          {
+            functionResponse: {
+              id: 'call_race_A_responding',
+              name: 'read_file',
+              response: { output: 'real file contents' },
+            },
+          },
+        ],
+        resultDisplay: undefined,
+        error: undefined,
+        errorType: undefined,
+      },
+      tool: {
+        name: 'read_file',
+        displayName: 'ReadFile',
+        description: 'Read a file',
+        build: vi.fn(),
+      } as any,
+      invocation: {
+        getDescription: () => 'read /tmp/y.txt',
+      } as unknown as AnyToolInvocation,
+    } as unknown as TrackedCompletedToolCall;
+
+    const client = new MockedGeminiClientClass(mockConfig);
+    client.getHistory = vi.fn().mockReturnValue([
+      { role: 'user', parts: [{ text: 'open /tmp/y.txt' }] },
+      {
+        role: 'model',
+        parts: [
+          {
+            functionCall: {
+              id: 'call_race_A_responding',
+              name: 'read_file',
+              args: { path: '/tmp/y.txt' },
+            },
+          },
+        ],
+      },
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              id: 'call_race_A_responding',
+              name: 'read_file',
+              response: {
+                error: 'Tool execution result was not recorded',
+              },
+            },
+          },
+          { text: 'next prompt' },
+        ],
+      },
+    ]);
+
+    let capturedOnComplete:
+      | ((completedTools: TrackedToolCall[]) => Promise<void>)
+      | null = null;
+    mockUseReactToolScheduler.mockImplementation((onComplete) => {
+      capturedOnComplete = onComplete;
+      return [[], mockScheduleToolCalls, mockMarkToolsAsSubmitted];
+    });
+
+    // Held stream: never yields, never returns. Pins isResponding=true.
+    let releaseStream!: () => void;
+    const holdStream = new Promise<void>((resolve) => {
+      releaseStream = resolve;
+    });
+    // Intentionally yield-less: holds the stream open without producing
+    // chunks so isResponding stays true while we trigger onComplete.
+    // eslint-disable-next-line require-yield
+    const heldStream = (async function* () {
+      await holdStream;
+    })();
+    mockSendMessageStream.mockReturnValue(heldStream);
+
+    const { result } = renderHook(() =>
+      useGeminiStream(
+        client,
+        [],
+        mockAddItem,
+        mockConfig,
+        mockLoadedSettings,
+        mockOnDebugMessage,
+        mockHandleSlashCommand,
+        false,
+        () => 'vscode' as EditorType,
+        () => {},
+        () => Promise.resolve(),
+        false,
+        () => {},
+        () => {},
+        () => {},
+        () => {},
+        80,
+        24,
+      ),
+    );
+
+    // Kick the stream so submitQuery flips isResponding=true and parks
+    // on the first `await` inside the held async generator.
+    act(() => {
+      void result.current.submitQuery('next prompt');
+    });
+    await act(async () => {
+      await Promise.resolve();
+      await Promise.resolve();
+    });
+    expect(mockSendMessageStream).toHaveBeenCalledTimes(1);
+
+    // Now fire the deduped completion while isResponding=true.
+    await act(async () => {
+      if (capturedOnComplete) {
+        await capturedOnComplete([lateRealResult]);
+      }
+    });
+
+    // The dedup MUST still fire — markToolsAsSubmitted called with the
+    // deduped callId — even though the early-return on isResponding
+    // would otherwise skip every later branch.
+    await waitFor(() => {
+      expect(mockMarkToolsAsSubmitted).toHaveBeenCalledWith([
+        'call_race_A_responding',
+      ]);
+    });
+
+    // No additional sendMessageStream: the held one is still the only
+    // call. The dedup path does NOT submit a new request.
+    expect(mockSendMessageStream).toHaveBeenCalledTimes(1);
+
+    // Release the held stream so the test exits cleanly.
+    releaseStream();
+  });
+
   it('should not flicker streaming state to Idle between tool completion and submission', async () => {
     const toolCallResponseParts: PartListUnion = [
       { text: 'tool 1 final response' },
diff --git a/packages/cli/src/ui/hooks/useGeminiStream.ts b/packages/cli/src/ui/hooks/useGeminiStream.ts
index e285133f5ba..a68d8f4a864 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -2058,15 +2058,32 @@ export const useGeminiStream = (
           }
         }
       }
-      const dedupedCallIds = completedAndReadyToSubmitTools
-        .filter((tc) => historyCallIdsWithResponse.has(tc.request.callId))
-        .map((tc) => tc.request.callId);
+      const dedupedTools = completedAndReadyToSubmitTools.filter((tc) =>
+        historyCallIdsWithResponse.has(tc.request.callId),
+      );
+      const dedupedCallIds = dedupedTools.map((tc) => tc.request.callId);
       if (dedupedCallIds.length > 0) {
         debugLogger.warn(
           `[REPAIR] Dropping ${dedupedCallIds.length} late tool result(s) ` +
             `whose callId already has a functionResponse in history: ` +
             `${dedupedCallIds.join(', ')}`,
         );
+        // Even though the wire-side submission is dropped, the tool DID
+        // run locally — `toolCallCount` and `skillsModifiedInSession`
+        // must reflect that. Without this, deduped skill-write tools
+        // (e.g. write_file under a project SKILLS path) would silently
+        // skip the `skillsModifiedInSession` flip that gates the
+        // skills-reload prompt at end-of-turn. Mirrors the
+        // `recordCompletedToolCall` loop below over `geminiTools` —
+        // filter to the same shape (non-client-initiated) so client
+        // tools (which the original loop also skipped) stay skipped.
+        for (const tc of dedupedTools) {
+          if (tc.request.isClientInitiated) continue;
+          geminiClient?.recordCompletedToolCall(
+            tc.request.name,
+            tc.request.args as Record<string, unknown>,
+          );
+        }
         markToolsAsSubmitted(dedupedCallIds);
       }
 
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 50b6625ba2b..bac8d4c5c1c 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -474,6 +474,13 @@ export function repairOrphanedToolUseTurns(
       // is supposed to escape. Mirrors upstream Claude Code's
       // `hoistToolResults` (`utils/messages.ts`).
       //
+      // CONSEQUENCE OF REMOVAL: dropping this hoist (e.g. naively
+      // `next.parts = [...existing, ...syntheticParts]`) re-introduces
+      // the exact 400 "tool_use_id ... must have a corresponding tool_use
+      // block in the previous message" the synthesis pass exists to
+      // prevent. The whole repair becomes a no-op and the session stays
+      // wedged. Do not "simplify" this branch.
+      //
       // Place synthetics AFTER any pre-existing functionResponse parts
       // (real tool_results the user is supplying in the same turn) so
       // their original ordering is preserved.
@@ -1461,6 +1468,18 @@ export class GeminiChat {
    */
   addHistory(content: Content): void {
     this.history.push(content);
+    // The marker is per-send-attempt. By the time external code calls
+    // addHistory (cancelled-tool synthesis in useGeminiStream, ACP
+    // session injects, shellCommandProcessor, etc.), the originating
+    // sendMessageStream has either already popped the partial via the
+    // retry loop or hit an unrecoverable break — in both cases the
+    // marker is no longer load-bearing. Clearing here matches the
+    // defensive reset already applied in setHistory/truncateHistory/
+    // clearHistory: any future addHistory variant that splices into
+    // the middle (instead of plain push) would shift indices and a
+    // stale marker could splice the wrong entry. Belt-and-suspenders;
+    // the next sendMessageStream entry also clears it.
+    this.pendingPartialAssistantTurnIndex = null;
   }
 
   setHistory(history: Content[]): void {
@@ -1489,6 +1508,10 @@ export class GeminiChat {
     this.history = this.history
       .map(stripThoughtPartsFromContent)
       .filter((content): content is Content => content !== null);
+    // Filter+map replaces `this.history` with a new array, so any pending
+    // partial-push marker is now indexed against an array that no longer
+    // exists. Clear it for the same reason setHistory does.
+    this.pendingPartialAssistantTurnIndex = null;
   }
 
   /**

From 2dbfc4e3bb04311abcc4c325162542363eda20b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Mon, 18 May 2026 10:39:19 +0800
Subject: [PATCH 09/18] fix(core): defer chat-recording flush until
 partial-turn rollback decision
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Yiliang114's PR #4176 follow-up: `popPartialIfPushed()` rolls a
failed-attempt's `model[functionCall]` out of in-memory `this.history`
before retrying, but the same partial turn was already appended to the
chat-recording JSONL by `recordAssistantTurn()`. After a successful
retry, live history was clean while the durable transcript still
carried two assistant entries — the failed attempt and the success.
`--resume` then rehydrated the failed `model[functionCall]` and the
resumed model context picked up a tool_use the live session correctly
discarded.

Fix: the recording call on the stream-error + hasToolCall path is now
deferred. processStreamResponse stashes the would-be record on a new
`pendingPartialAssistantRecord` field instead of immediately appending.
The retry loop's `popPartialIfPushed` clears the stash alongside the
in-memory splice — no flush, no leak. Once the retry loop has settled
(success-break: stash already null; unretryable-break: partial
survived), a flush hook right after the for-loop appends the stashed
record to JSONL so the transcript matches whatever lives in
`this.history`. The success path (streamError === null) still records
immediately as before — only the at-risk-of-rollback path is deferred.

Lifecycle is paired one-to-one with `pendingPartialAssistantTurnIndex`:
set together, popped together, flushed together, and reset in every
history-replacement method (sendMessageStream entry, setHistory,
clearHistory, truncateHistory, addHistory, stripThoughtsFromHistory)
so an exotic path can't leak a stale stash into a future send's flush.

Two regression tests in geminiChat.test.ts:

  1. "rolls back the chat-recording entry too when the retry succeeds"
     — failing-then-succeeding stream sequence asserts
     `recordAssistantTurn` is called exactly once with the success
     text, no functionCall part anywhere in the recorded message.
     Verified the test fails (mock called twice) when the deferral
     branch is bypassed.

  2. "flushes the chat-recording entry on the unretryable break path"
     — synthetic non-retryable error after a tool_use chunk asserts
     the partial IS recorded so the JSONL stays aligned with the
     in-memory partial that survives. Verified the test fails
     (mock called zero times) when the post-loop flush is removed.

Tests: 104/104 geminiChat (+2 new), 133/133 client, 94/94
useGeminiStream. tsc + eslint + prettier clean.
---
 packages/core/src/core/geminiChat.test.ts | 182 ++++++++++++++++++++++
 packages/core/src/core/geminiChat.ts      | 100 +++++++++++-
 2 files changed, 274 insertions(+), 8 deletions(-)

diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index ec620af2087..048d70fde3f 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -2524,6 +2524,188 @@ describe('GeminiChat', async () => {
       }
     });
 
+    it('rolls back the chat-recording entry too when the retry succeeds (yiliang114 PR #4176 follow-up)', async () => {
+      // The in-memory rollback test above asserts `this.history` ends
+      // clean after a retry-success. This test asserts the same about
+      // chat-recording JSONL: the failed attempt's `recordAssistantTurn`
+      // call must NOT have been flushed, so `--resume` won't rehydrate
+      // a model[functionCall] turn the live session correctly discarded.
+      // Without the deferred-flush stash + popPartialIfPushed clear,
+      // `recordAssistantTurn` was called twice (once for the partial,
+      // once for the success) and only the in-memory pop fixed live
+      // history; the durable transcript stayed corrupt.
+      vi.useFakeTimers();
+      try {
+        const recordAssistantTurn = vi.fn();
+        const chatWithRecording = new GeminiChat(
+          mockConfig,
+          config,
+          [],
+          {
+            recordAssistantTurn,
+            recordChatCompression: vi.fn(),
+          } as unknown as ConstructorParameters<typeof GeminiChat>[3],
+          uiTelemetryService,
+        );
+
+        const tpmError = new StreamContentError(
+          '{"error":{"code":"429","message":"Throttling: TPM(1/1)"}}',
+        );
+        const failingStream = (async function* () {
+          yield {
+            candidates: [
+              {
+                content: {
+                  parts: [
+                    {
+                      functionCall: {
+                        id: 'call_failed_retry_recording',
+                        name: 'read_file',
+                        args: { path: '/tmp/a.txt' },
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+          throw tpmError;
+        })();
+        const successStream = (async function* () {
+          yield {
+            candidates: [
+              {
+                content: { parts: [{ text: 'Success after retry' }] },
+                finishReason: 'STOP',
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+        })();
+        vi.mocked(mockContentGenerator.generateContentStream)
+          .mockResolvedValueOnce(failingStream)
+          .mockResolvedValueOnce(successStream);
+
+        const stream = await chatWithRecording.sendMessageStream(
+          'test-model',
+          { message: 'test' },
+          'prompt-recording-rollback',
+        );
+        const iterator = stream[Symbol.asyncIterator]();
+        for (;;) {
+          const next = iterator.next();
+          await vi.advanceTimersByTimeAsync(60_000);
+          const r = await next;
+          if (r.done) break;
+        }
+
+        // Exactly one recording: the successful retry's text turn.
+        // The failed attempt's partial functionCall must have been
+        // discarded by `popPartialIfPushed` clearing the deferred-flush
+        // stash, never reaching the JSONL.
+        expect(recordAssistantTurn).toHaveBeenCalledTimes(1);
+        const recordedMessage = recordAssistantTurn.mock.calls[0]![0]
+          ?.message as Array<{ text?: string; functionCall?: unknown }>;
+        const recordedText = recordedMessage.find((p) => p.text)?.text;
+        expect(recordedText).toBe('Success after retry');
+        // No functionCall part anywhere in the recorded turn.
+        expect(recordedMessage.some((p) => p.functionCall)).toBe(false);
+      } finally {
+        vi.useRealTimers();
+      }
+    });
+
+    it('flushes the chat-recording entry on the unretryable break path (kept partial → durable JSONL)', async () => {
+      // Counterpart to the rollback test: when the retry budget is
+      // exhausted (or the error is unretryable from the start), the
+      // partial assistant turn IS kept in `this.history` — and the
+      // chat-recording JSONL must match. Without the deferred-flush
+      // path firing at the rethrow site, the JSONL silently drops a
+      // partial that's still in live history, and the orphan-tool_use
+      // repair pass at session-load has no dangling functionCall to
+      // close → `--resume` first send 400s with the very wedge the
+      // repair was supposed to escape.
+      vi.useFakeTimers();
+      try {
+        const recordAssistantTurn = vi.fn();
+        const chatWithRecording = new GeminiChat(
+          mockConfig,
+          config,
+          [],
+          {
+            recordAssistantTurn,
+            recordChatCompression: vi.fn(),
+          } as unknown as ConstructorParameters<typeof GeminiChat>[3],
+          uiTelemetryService,
+        );
+
+        // Unretryable: a non-rate-limit, non-InvalidStream error after
+        // a tool_use chunk lands. The catch block falls through to
+        // `break` with the partial kept in memory.
+        const failingStream = (async function* () {
+          yield {
+            candidates: [
+              {
+                content: {
+                  parts: [
+                    {
+                      functionCall: {
+                        id: 'call_unretryable_kept',
+                        name: 'read_file',
+                        args: { path: '/tmp/k.txt' },
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+          throw new Error('synthetic unretryable mid-stream failure');
+        })();
+        vi.mocked(
+          mockContentGenerator.generateContentStream,
+        ).mockResolvedValueOnce(failingStream);
+
+        const stream = await chatWithRecording.sendMessageStream(
+          'test-model',
+          { message: 'test' },
+          'prompt-recording-flush-on-break',
+        );
+        const iterator = stream[Symbol.asyncIterator]();
+        await expect(
+          (async () => {
+            for (;;) {
+              const r = await iterator.next();
+              if (r.done) return;
+            }
+          })(),
+        ).rejects.toThrow(/synthetic unretryable/);
+
+        // In-memory: partial is kept (the wedge-recovery contract that
+        // the rest of this PR's machinery relies on).
+        const history = chatWithRecording.getHistory();
+        const lastModelTurn = history.findLast((h) => h.role === 'model');
+        expect(
+          lastModelTurn?.parts?.some(
+            (p) => p.functionCall?.id === 'call_unretryable_kept',
+          ),
+        ).toBe(true);
+
+        // JSONL: must contain the same partial turn so `--resume` sees
+        // a transcript that matches live history. Exactly one record
+        // (no success retry happened on this path).
+        expect(recordAssistantTurn).toHaveBeenCalledTimes(1);
+        const recordedMessage = recordAssistantTurn.mock.calls[0]![0]
+          ?.message as Array<{ functionCall?: { id?: string } }>;
+        expect(
+          recordedMessage.some(
+            (p) => p.functionCall?.id === 'call_unretryable_kept',
+          ),
+        ).toBe(true);
+      } finally {
+        vi.useRealTimers();
+      }
+    });
+
     it('should retry on TPM throttling StreamContentError with initial delay', async () => {
       vi.useFakeTimers();
 
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index bac8d4c5c1c..f04993046ef 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -583,6 +583,32 @@ export class GeminiChat {
    */
   private pendingPartialAssistantTurnIndex: number | null = null;
 
+  /**
+   * Deferred-flush record for the partial assistant turn pushed on a
+   * mid-stream error. Stashed instead of immediately appended to the
+   * chat-recording JSONL so a subsequent retry-success can roll back the
+   * persisted record alongside the in-memory pop. Without deferral, the
+   * JSONL transcript keeps the failed attempt even though live history
+   * dropped it — `--resume` rehydrates the failed `model[functionCall]`
+   * turn and the resumed model context picks up a tool_use the in-session
+   * run intentionally discarded (yiliang114 repro on PR #4176).
+   *
+   * Lifecycle is paired with `pendingPartialAssistantTurnIndex`:
+   *  - Set together on stream-error + hasToolCall + hasContent.
+   *  - Cleared together by `popPartialIfPushed` when the retry loop
+   *    rolls the partial back.
+   *  - Flushed together to JSONL after the retry loop exits if the
+   *    partial survived (unretryable break path → about to throw to
+   *    the caller). At that point the partial is durable in memory and
+   *    the recording must match.
+   *  - Reset on `sendMessageStream` entry / setHistory / clearHistory /
+   *    truncateHistory / addHistory / stripThoughtsFromHistory so a leak
+   *    from any exotic path can't bleed into a future send.
+   */
+  private pendingPartialAssistantRecord:
+    | Parameters<ChatRecordingService['recordAssistantTurn']>[0]
+    | null = null;
+
   /**
    * Heap-pressure compaction is process-wide pressure applied per chat. If one
    * heap-triggered attempt cannot reduce history, briefly back off this chat
@@ -826,8 +852,13 @@ export class GeminiChat {
     // Clear any partial-push marker left over from a prior unretryable
     // break path — the marker is per-send; carrying it across sends
     // would let the next send's retry catch wrongly pop a now-valid
-    // model entry sitting at the stale index.
+    // model entry sitting at the stale index. The deferred-record
+    // stash gets the same per-send reset for the same reason: a
+    // leftover from a prior unretryable break would otherwise get
+    // appended to JSONL by THIS send's retry-loop flush, attaching
+    // someone else's failed turn to this conversation.
     this.pendingPartialAssistantTurnIndex = null;
+    this.pendingPartialAssistantRecord = null;
 
     let compressionInfo: ChatCompressionInfo;
     let requestContents: Content[];
@@ -974,6 +1005,13 @@ export class GeminiChat {
                 self.history.splice(idx, 1);
               }
               self.pendingPartialAssistantTurnIndex = null;
+              // Discard the deferred chat-recording record alongside the
+              // in-memory pop so the JSONL transcript also drops the
+              // failed attempt. (Paired with the stash in
+              // processStreamResponse — see the field-level comment on
+              // `pendingPartialAssistantRecord` for the failure mode this
+              // fixes.)
+              self.pendingPartialAssistantRecord = null;
             };
 
             // Handle rate-limit / throttling errors returned as stream content.
@@ -1169,6 +1207,23 @@ export class GeminiChat {
           }
         }
 
+        // The retry loop has settled: any partial that was rolled back
+        // had its stash cleared by `popPartialIfPushed`; any partial that
+        // survived (success break with no partial set, or unretryable
+        // break with the partial kept) is now durable in memory, so the
+        // deferred chat-recording append must finally land on disk.
+        // Without this flush the unretryable-break path persists the
+        // partial in `this.history` but the JSONL transcript silently
+        // drops it — `--resume` then loads a truncated transcript that
+        // doesn't match the live session shape, and the orphan-tool_use
+        // repair pass at session-load has nothing to repair.
+        if (self.pendingPartialAssistantRecord) {
+          self.chatRecordingService?.recordAssistantTurn(
+            self.pendingPartialAssistantRecord,
+          );
+          self.pendingPartialAssistantRecord = null;
+        }
+
         // Max output tokens escalation: if the retry loop succeeded with
         // the capped default (8K) but hit MAX_TOKENS, retry once at the
         // model's full output limit. This ensures models with large output
@@ -1459,8 +1514,11 @@ export class GeminiChat {
     // shows up at that index in a future send (defense-in-depth — the
     // helper also bounds-checks, but a stale marker that happens to
     // line up with a real model turn could otherwise pop the wrong
-    // entry).
+    // entry). Drop the deferred-record stash for the same reason: a
+    // later flush would otherwise append a turn that doesn't match
+    // the (now-empty) live history.
     this.pendingPartialAssistantTurnIndex = null;
+    this.pendingPartialAssistantRecord = null;
   }
 
   /**
@@ -1478,8 +1536,12 @@ export class GeminiChat {
     // clearHistory: any future addHistory variant that splices into
     // the middle (instead of plain push) would shift indices and a
     // stale marker could splice the wrong entry. Belt-and-suspenders;
-    // the next sendMessageStream entry also clears it.
+    // the next sendMessageStream entry also clears it. The
+    // deferred-record stash is paired with the marker — keeping it
+    // around past an external addHistory could let a subsequent
+    // retry-loop flush land a stale partial in JSONL.
     this.pendingPartialAssistantTurnIndex = null;
+    this.pendingPartialAssistantRecord = null;
   }
 
   setHistory(history: Content[]): void {
@@ -1489,8 +1551,10 @@ export class GeminiChat {
     // marker MUST be cleared — otherwise `popPartialIfPushed` could find
     // a model turn at the stale index in the replacement history and
     // splice an entry that has nothing to do with the original partial
-    // push, corrupting the conversation.
+    // push, corrupting the conversation. Drop the paired deferred-record
+    // stash too: its referent (the model turn at the old index) is gone.
     this.pendingPartialAssistantTurnIndex = null;
+    this.pendingPartialAssistantRecord = null;
   }
 
   truncateHistory(keepCount: number): void {
@@ -1500,8 +1564,10 @@ export class GeminiChat {
     // the marker rather than try to fix it up — it's per-send and
     // ephemeral, so losing it across a truncate is safe (the
     // sendMessageStream that pushed it has already finished or will
-    // start fresh on the next call).
+    // start fresh on the next call). Drop the paired deferred-record
+    // stash for the same reason.
     this.pendingPartialAssistantTurnIndex = null;
+    this.pendingPartialAssistantRecord = null;
   }
 
   stripThoughtsFromHistory(): void {
@@ -1510,8 +1576,11 @@ export class GeminiChat {
       .filter((content): content is Content => content !== null);
     // Filter+map replaces `this.history` with a new array, so any pending
     // partial-push marker is now indexed against an array that no longer
-    // exists. Clear it for the same reason setHistory does.
+    // exists. Clear it for the same reason setHistory does. The deferred
+    // chat-recording stash is paired with the marker — drop it too so a
+    // later flush can't land a turn that doesn't exist in live history.
     this.pendingPartialAssistantTurnIndex = null;
+    this.pendingPartialAssistantRecord = null;
   }
 
   /**
@@ -1712,7 +1781,7 @@ export class GeminiChat {
     ) {
       const contextWindowSize =
         this.config.getContentGeneratorConfig()?.contextWindowSize;
-      this.chatRecordingService?.recordAssistantTurn({
+      const recordArgs = {
         model,
         message: [
           ...(thoughtContentPart ? [thoughtContentPart] : []),
@@ -1730,7 +1799,22 @@ export class GeminiChat {
         ],
         tokens: usageMetadata,
         contextWindowSize,
-      });
+      };
+      if (streamError !== null) {
+        // Stream-error + tool-use partial: defer the JSONL append until
+        // the outer retry loop decides whether to roll back this attempt.
+        // If the same send retries successfully, popPartialIfPushed clears
+        // this stash and the failed attempt never lands on disk; if the
+        // retry path doesn't apply (unretryable break), the stash is
+        // flushed at the rethrow site so JSONL stays aligned with the
+        // partial that survives in-memory. Without this, retry-success
+        // leaves a failed `model[functionCall]` durable in JSONL and
+        // `--resume` rehydrates a turn the live session correctly
+        // discarded (yiliang114 PR #4176 repro).
+        this.pendingPartialAssistantRecord = recordArgs;
+      } else {
+        this.chatRecordingService?.recordAssistantTurn(recordArgs);
+      }
     }
 
     // Mid-stream failure recovery: if the upstream stream threw (typical on

From fd12639c9c17e494f8a0d5d2072f3bc63971d361 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Tue, 19 May 2026 00:24:18 +0800
Subject: [PATCH 10/18] fix(core,cli): close 4 deepseek-v4-pro review threads
 on PR #4176
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four follow-ups from the deepseek-v4-pro pass on commit 2dbfc4e3b:

[A] useGeminiStream.ts — dedup recordCompletedToolCall now skips
cancelled tools. `dedupedTools` includes anything in a terminal state
(success | error | cancelled), but cancelled means the tool never
actually produced model-visible output. Counting it via
`recordCompletedToolCall` would inflate `toolCallCount` and could flip
`skillsModifiedInSession` for a never-executed skill-write. The
markToolsAsSubmitted call still fires so the scheduler unblocks.
Mirrors the rationale of the existing `allToolsCancelled` branch which
surfaces non-deduped cancellations via addHistory + reportCancelled
rather than the completed-call metric.

[B] client.ts — JSDoc on `repairOrphanedToolUseTurnsInHistory` claimed
three call points (startChat, Retry submit path, defensive
UserQuery/Cron pass) but in practice this method is only called once
from `startChat()`. The other two coverage points live one layer down
inside `GeminiChat.sendMessageStream` and call the standalone
`repairOrphanedToolUseTurns(history)` function directly without
routing through this wrapper. Updated to reflect the actual coupling.

[C] geminiChat.ts — `addHistory` now warns via debugLogger if it
clears a non-null partial-push marker. Today's callers
(useGeminiStream cancelled-tool synthesis, ACP session injects,
shellCommandProcessor) only run between sends, so the marker is null
on entry. If a future code path calls addHistory between the partial
push and the retry attempt, the silent clear would strand the
partial: popPartialIfPushed would no-op, the failed model[functionCall]
would survive into the retry, and a successful retry's response would
land as a SECOND consecutive model turn — the wedge this whole
subsystem exists to prevent. The warn surfaces the offending caller in
the log instead of forcing a blind trace through the marker lifecycle.

[D] geminiChat.ts — extracted `clearPendingPartialState()` helper
covering all 7 sites that need to reset both
`pendingPartialAssistantTurnIndex` and `pendingPartialAssistantRecord`
in lockstep (sendMessageStream entry, popPartialIfPushed, the
post-loop flush hook, clearHistory, addHistory, setHistory,
truncateHistory, stripThoughtsFromHistory). The two fields ARE always
paired by lifecycle (set together on stream-error stash, popped
together on retry, flushed together at the rethrow site), so any
single-field reset would be a bug. The helper makes the lockstep
invariant explicit and means a future history-mutating method can't
forget to clear one field.

Regression test: useGeminiStream.test.tsx adds
"skips recordCompletedToolCall for deduped CANCELLED tools" — sets up
a deduped cancelled tool with callId paired in history, asserts
markToolsAsSubmitted IS called (scheduler unblocks) but
recordCompletedToolCall is NOT called. Verified the test fails when
the cancelled-filter line is removed (regression-injection check).

Tests: 237/237 core, 95/95 useGeminiStream (+1 new). tsc + eslint +
prettier clean. Existing CI failure on Test (ubuntu-latest, Node 22.x)
is an unrelated timing flake in promptHookRunner.test.ts
("expected 49 to be greater than or equal to 50") — macOS and Windows
both pass.
---
 .../cli/src/ui/hooks/useGeminiStream.test.tsx | 123 ++++++++++++++++++
 packages/cli/src/ui/hooks/useGeminiStream.ts  |  12 ++
 packages/core/src/core/client.ts              |  23 ++--
 packages/core/src/core/geminiChat.ts          | 117 ++++++++++-------
 4 files changed, 220 insertions(+), 55 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index c32f29e2966..797f875899e 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -855,6 +855,129 @@ describe('useGeminiStream', () => {
     expect(mockSendMessageStream).not.toHaveBeenCalled();
   });
 
+  it('skips recordCompletedToolCall for deduped CANCELLED tools (telemetry parity)', async () => {
+    // A deduped tool with status='cancelled' never actually produced
+    // model-visible output — counting it via `recordCompletedToolCall`
+    // (which increments toolCallCount and can flip
+    // skillsModifiedInSession on a skill-write path) would inflate the
+    // metric for a call that never ran end-to-end. This test repros
+    // the deepseek-v4-pro thread on PR #4176: dedup must skip BOTH
+    // client-initiated (already skipped) AND cancelled tools, while
+    // still calling `markToolsAsSubmitted` so the scheduler unblocks.
+    const cancelledDedupedTool = {
+      request: {
+        callId: 'call_dedup_cancelled',
+        name: 'write_file',
+        args: { path: '/tmp/cancelled.txt', content: 'x' },
+        isClientInitiated: false,
+        prompt_id: 'prompt-dedup-cancel',
+      },
+      status: 'cancelled',
+      responseSubmittedToGemini: false,
+      response: {
+        callId: 'call_dedup_cancelled',
+        responseParts: [
+          {
+            functionResponse: {
+              id: 'call_dedup_cancelled',
+              name: 'write_file',
+              response: { error: 'cancelled' },
+            },
+          },
+        ],
+        resultDisplay: undefined,
+        error: undefined,
+        errorType: undefined,
+      },
+      tool: {
+        name: 'write_file',
+        displayName: 'WriteFile',
+        description: 'Write a file',
+        build: vi.fn(),
+      } as any,
+      invocation: {
+        getDescription: () => 'cancelled write',
+      } as unknown as AnyToolInvocation,
+    } as unknown as TrackedCancelledToolCall;
+
+    const client = new MockedGeminiClientClass(mockConfig);
+    // Pre-paired in history: dedup will fire for this callId.
+    client.getHistory = vi.fn().mockReturnValue([
+      { role: 'user', parts: [{ text: 'cancelled write' }] },
+      {
+        role: 'model',
+        parts: [
+          {
+            functionCall: {
+              id: 'call_dedup_cancelled',
+              name: 'write_file',
+              args: { path: '/tmp/cancelled.txt', content: 'x' },
+            },
+          },
+        ],
+      },
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              id: 'call_dedup_cancelled',
+              name: 'write_file',
+              response: { error: 'synthetic' },
+            },
+          },
+        ],
+      },
+    ]);
+
+    let capturedOnComplete:
+      | ((completedTools: TrackedToolCall[]) => Promise<void>)
+      | null = null;
+    mockUseReactToolScheduler.mockImplementation((onComplete) => {
+      capturedOnComplete = onComplete;
+      return [[], mockScheduleToolCalls, mockMarkToolsAsSubmitted];
+    });
+
+    renderHook(() =>
+      useGeminiStream(
+        client,
+        [],
+        mockAddItem,
+        mockConfig,
+        mockLoadedSettings,
+        mockOnDebugMessage,
+        mockHandleSlashCommand,
+        false,
+        () => 'vscode' as EditorType,
+        () => {},
+        () => Promise.resolve(),
+        false,
+        () => {},
+        () => {},
+        () => {},
+        () => {},
+        80,
+        24,
+      ),
+    );
+
+    await act(async () => {
+      if (capturedOnComplete) {
+        await capturedOnComplete([cancelledDedupedTool]);
+      }
+    });
+
+    // Scheduler still gets unblocked.
+    await waitFor(() => {
+      expect(mockMarkToolsAsSubmitted).toHaveBeenCalledWith([
+        'call_dedup_cancelled',
+      ]);
+    });
+
+    // Telemetry NOT incremented — the cancelled filter held.
+    expect(client.recordCompletedToolCall).not.toHaveBeenCalled();
+  });
+
   it('runs Race A dedup BEFORE the isResponding early-return (regression guard)', async () => {
     // The dedup block in handleCompletedTools is intentionally placed
     // ABOVE the `if (isResponding) return;` early-return: the scheduler's
diff --git a/packages/cli/src/ui/hooks/useGeminiStream.ts b/packages/cli/src/ui/hooks/useGeminiStream.ts
index a68d8f4a864..c414223b90c 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -2077,8 +2077,20 @@ export const useGeminiStream = (
         // `recordCompletedToolCall` loop below over `geminiTools` —
         // filter to the same shape (non-client-initiated) so client
         // tools (which the original loop also skipped) stay skipped.
+        //
+        // Cancelled tools are also skipped: `dedupedTools` includes
+        // anything in a terminal state (success | error | cancelled),
+        // but cancelled means the tool never actually ran end-to-end —
+        // the `allToolsCancelled` branch below would have surfaced
+        // them via `addHistory + reportCancelled` rather than the
+        // completed-call metric, and the metric should match. Without
+        // this filter, a deduped + cancelled tool would inflate
+        // `toolCallCount` for a call that never produced a result
+        // (and could also flip `skillsModifiedInSession` for a
+        // never-executed skill-write).
         for (const tc of dedupedTools) {
           if (tc.request.isClientInitiated) continue;
+          if (tc.status === 'cancelled') continue;
           geminiClient?.recordCompletedToolCall(
             tc.request.name,
             tc.request.args as Record<string, unknown>,
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 6282f2fe420..a69071d0a9b 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -359,16 +359,19 @@ export class GeminiClient {
    * {@link stripOrphanedUserEntriesFromHistory}, which only handles trailing
    * `user` entries.
    *
-   * Called from three points:
-   *   1. After {@link startChat} loads transcript (covers `--resume` of a
-   *      session that crashed between partial-tool_use push and tool
-   *      completion).
-   *   2. After `stripOrphanedUserEntriesFromHistory` on the Retry submit path
-   *      (covers Ctrl+Y race — user retries while an in-flight tool's
-   *      `tool_result` has not yet been submitted, leaving a trailing
-   *      `model[functionCall]` without matching `functionResponse`).
-   *   3. Defensively at the start of UserQuery / Cron sends, so any state
-   *      that slipped past 1+2 still gets fixed before hitting the wire.
+   * This `GeminiClient` method is the resume-path entry point — called once
+   * from {@link startChat} after the transcript loads, covering `--resume`
+   * of a session that crashed between a partial-tool_use push and the
+   * tool's eventual completion.
+   *
+   * The other two coverage points (Retry submit path after
+   * `stripOrphanedUserEntriesFromHistory`, and the defensive pass at the
+   * start of every UserQuery / Cron send) live one layer down inside
+   * `GeminiChat.sendMessageStream` and call the standalone
+   * `repairOrphanedToolUseTurns(history)` function directly — they don't
+   * route through this wrapper. Anyone tracing the repair-pass coupling
+   * between the client and chat layers should follow that path
+   * separately rather than expect everything to funnel through here.
    *
    * Synthesizes an `error` `functionResponse`. The React tool scheduler
    * (`useGeminiStream.handleCompletedTools`) MUST dedupe by `callId` against
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index f04993046ef..b267b0ae62b 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -609,6 +609,21 @@ export class GeminiChat {
     | Parameters<ChatRecordingService['recordAssistantTurn']>[0]
     | null = null;
 
+  /**
+   * Reset both partial-push markers in lockstep. Extracted so the seven
+   * call sites that need to drop both fields (sendMessageStream entry,
+   * popPartialIfPushed, clearHistory, addHistory, setHistory,
+   * truncateHistory, stripThoughtsFromHistory) can't drift apart — a
+   * future history-mutating method that only clears one would leak the
+   * other into a later flush. The fields ARE always paired by lifecycle
+   * (set together on stream-error stash, popped together on retry, flushed
+   * together at the rethrow site), so any single-field reset is a bug.
+   */
+  private clearPendingPartialState(): void {
+    this.pendingPartialAssistantTurnIndex = null;
+    this.pendingPartialAssistantRecord = null;
+  }
+
   /**
    * Heap-pressure compaction is process-wide pressure applied per chat. If one
    * heap-triggered attempt cannot reduce history, briefly back off this chat
@@ -857,8 +872,7 @@ export class GeminiChat {
     // leftover from a prior unretryable break would otherwise get
     // appended to JSONL by THIS send's retry-loop flush, attaching
     // someone else's failed turn to this conversation.
-    this.pendingPartialAssistantTurnIndex = null;
-    this.pendingPartialAssistantRecord = null;
+    this.clearPendingPartialState();
 
     let compressionInfo: ChatCompressionInfo;
     let requestContents: Content[];
@@ -1004,14 +1018,13 @@ export class GeminiChat {
               ) {
                 self.history.splice(idx, 1);
               }
-              self.pendingPartialAssistantTurnIndex = null;
-              // Discard the deferred chat-recording record alongside the
-              // in-memory pop so the JSONL transcript also drops the
-              // failed attempt. (Paired with the stash in
-              // processStreamResponse — see the field-level comment on
-              // `pendingPartialAssistantRecord` for the failure mode this
-              // fixes.)
-              self.pendingPartialAssistantRecord = null;
+              // Drop both markers in lockstep — the deferred chat-
+              // recording record must be discarded alongside the
+              // in-memory splice so the JSONL transcript also drops the
+              // failed attempt. See the field-level comment on
+              // `pendingPartialAssistantRecord` for the failure mode
+              // this prevents.
+              self.clearPendingPartialState();
             };
 
             // Handle rate-limit / throttling errors returned as stream content.
@@ -1221,7 +1234,13 @@ export class GeminiChat {
           self.chatRecordingService?.recordAssistantTurn(
             self.pendingPartialAssistantRecord,
           );
-          self.pendingPartialAssistantRecord = null;
+          // Clear both fields in lockstep. The marker is no longer
+          // load-bearing past this point (its consumer is the for-loop
+          // catch above, which has exited), and the next
+          // sendMessageStream entry would clear it anyway — but pairing
+          // the reset preserves the "marker and stash are always set or
+          // cleared together" invariant the helper enforces.
+          self.clearPendingPartialState();
         }
 
         // Max output tokens escalation: if the retry loop succeeded with
@@ -1509,16 +1528,15 @@ export class GeminiChat {
    */
   clearHistory(): void {
     this.history = [];
-    // Any pending partial-push marker points into the now-empty history;
+    // Any pending partial-push state points into the now-empty history;
     // resetting prevents `popPartialIfPushed` from splicing whatever
     // shows up at that index in a future send (defense-in-depth — the
     // helper also bounds-checks, but a stale marker that happens to
     // line up with a real model turn could otherwise pop the wrong
-    // entry). Drop the deferred-record stash for the same reason: a
-    // later flush would otherwise append a turn that doesn't match
-    // the (now-empty) live history.
-    this.pendingPartialAssistantTurnIndex = null;
-    this.pendingPartialAssistantRecord = null;
+    // entry). The deferred-record stash is dropped for the same reason:
+    // a later flush would append a turn that doesn't match the (now-
+    // empty) live history.
+    this.clearPendingPartialState();
   }
 
   /**
@@ -1526,22 +1544,35 @@ export class GeminiChat {
    */
   addHistory(content: Content): void {
     this.history.push(content);
-    // The marker is per-send-attempt. By the time external code calls
-    // addHistory (cancelled-tool synthesis in useGeminiStream, ACP
-    // session injects, shellCommandProcessor, etc.), the originating
+    // The marker is per-send-attempt. Today's callers (cancelled-tool
+    // synthesis in useGeminiStream, ACP session injects,
+    // shellCommandProcessor) only run between sends, so the originating
     // sendMessageStream has either already popped the partial via the
     // retry loop or hit an unrecoverable break — in both cases the
-    // marker is no longer load-bearing. Clearing here matches the
-    // defensive reset already applied in setHistory/truncateHistory/
-    // clearHistory: any future addHistory variant that splices into
-    // the middle (instead of plain push) would shift indices and a
-    // stale marker could splice the wrong entry. Belt-and-suspenders;
-    // the next sendMessageStream entry also clears it. The
-    // deferred-record stash is paired with the marker — keeping it
-    // around past an external addHistory could let a subsequent
-    // retry-loop flush land a stale partial in JSONL.
-    this.pendingPartialAssistantTurnIndex = null;
-    this.pendingPartialAssistantRecord = null;
+    // marker is no longer load-bearing.
+    //
+    // If a future code path ever calls addHistory BETWEEN the partial
+    // push and the retry attempt, silently clearing the marker would
+    // strand the partial: popPartialIfPushed would no-op, the failed
+    // attempt's `model[functionCall]` would survive into the retry,
+    // and a successful retry's response would land as a SECOND
+    // consecutive model turn (the wedge this whole subsystem exists
+    // to prevent). The warn below makes that coupling observable —
+    // anyone investigating a stale-partial bug will see this log line
+    // pointing straight at the offending caller, instead of having to
+    // trace through the marker lifecycle blind.
+    if (
+      this.pendingPartialAssistantTurnIndex !== null ||
+      this.pendingPartialAssistantRecord !== null
+    ) {
+      debugLogger.warn(
+        'addHistory called while a partial-push marker is active — ' +
+          'clearing it. This is unexpected during an active sendMessageStream ' +
+          'and likely indicates a new caller violating the between-sends ' +
+          'invariant. See comment at GeminiChat.addHistory for context.',
+      );
+    }
+    this.clearPendingPartialState();
   }
 
   setHistory(history: Content[]): void {
@@ -1553,21 +1584,18 @@ export class GeminiChat {
     // splice an entry that has nothing to do with the original partial
     // push, corrupting the conversation. Drop the paired deferred-record
     // stash too: its referent (the model turn at the old index) is gone.
-    this.pendingPartialAssistantTurnIndex = null;
-    this.pendingPartialAssistantRecord = null;
+    this.clearPendingPartialState();
   }
 
   truncateHistory(keepCount: number): void {
     this.history = this.history.slice(0, keepCount);
     // Truncation can drop the entry the partial-push marker points at,
     // or leave it valid but shift the meaning of nearby indices. Reset
-    // the marker rather than try to fix it up — it's per-send and
-    // ephemeral, so losing it across a truncate is safe (the
-    // sendMessageStream that pushed it has already finished or will
-    // start fresh on the next call). Drop the paired deferred-record
-    // stash for the same reason.
-    this.pendingPartialAssistantTurnIndex = null;
-    this.pendingPartialAssistantRecord = null;
+    // both fields rather than try to fix them up — they're per-send and
+    // ephemeral, so losing them across a truncate is safe (the
+    // sendMessageStream that pushed them has already finished or will
+    // start fresh on the next call).
+    this.clearPendingPartialState();
   }
 
   stripThoughtsFromHistory(): void {
@@ -1576,11 +1604,10 @@ export class GeminiChat {
       .filter((content): content is Content => content !== null);
     // Filter+map replaces `this.history` with a new array, so any pending
     // partial-push marker is now indexed against an array that no longer
-    // exists. Clear it for the same reason setHistory does. The deferred
-    // chat-recording stash is paired with the marker — drop it too so a
-    // later flush can't land a turn that doesn't exist in live history.
-    this.pendingPartialAssistantTurnIndex = null;
-    this.pendingPartialAssistantRecord = null;
+    // exists. Clear it for the same reason setHistory does — and drop
+    // the paired deferred-record stash so a later flush can't land a
+    // turn that doesn't exist in live history.
+    this.clearPendingPartialState();
   }
 
   /**

From 2880de5778ee3cdd8af93a1d6116e5001b2a1315 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Tue, 19 May 2026 06:34:57 +0800
Subject: [PATCH 11/18] fix(core): close 3 review threads on partial-tool_use
 repair (PR #4176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three follow-ups from the qwen-latest-series-invite-beta-v28 pass on
commit fd12639c9:

[E] CORRECTNESS — `repairOrphanedToolUseTurns` now scans EVERY
consecutive `user` turn after a `model[functionCall]` (not just
`history[i+1]`) when building the matched-id set. The shape
`model[fc], user[text], user[fr_real]` arises when a user aborts a
long-running tool, types a follow-up text turn, and the React
scheduler's late `submitQuery` appends the real `tool_result` as a
SEPARATE user entry. Previously the repair saw only the immediate
follow-up text turn, found no matching fr, and synthesized a duplicate
`error` `functionResponse` for a callId that already had a real result
two turns down. The downstream dedup in
`useGeminiStream.handleCompletedTools` then dropped the REAL result on
the next submission (its callId was now "already in history" thanks to
the synthetic), and the model only ever saw the error placeholder —
silently swapping a real tool success for an error.

Two regression tests in `geminiChat.test.ts`:
  - "does NOT synthesize when the real functionResponse lives in a
    non-adjacent later user turn" — proves the forward-scan finds the
    real fr at history[i+2]+ and skips synthesis.
  - "still synthesizes for a partial mismatch when forward scan misses
    one callId" — counterpart that confirms parallel tool_use shapes
    where only some callIds have real fr's still get synthetic ones
    hoisted onto the immediate next user turn.

Verified via regression-injection (revert to scanning only
`history[i+1]`): the non-adjacent test fails with `injected =
[{callId:'call_nonadjacent_real',...}]` — proves the test catches the
bug, not just verifies a no-op.

[F] OBSERVABILITY — added a `[PARTIAL_PUSH]` `debugLogger.warn` at the
partial-turn push site in `processStreamResponse`. The repair lifecycle
already had push/pop/repair logging at the dedup and synthesis ends
(`[REPAIR] Dropping ...`, `[REPAIR] Synthesized ...`), but the original
PUSH event — root cause of every downstream recovery — was unlogged.
At 3 AM investigating a stale-partial wedge, the trace now anchors at
the exact moment the partial was created with pendingIndex, callIds,
and the originating error message.

[G] INVARIANT — `stripOrphanedUserEntriesFromHistory` now calls
`clearPendingPartialState()` after popping. Today this is safe even
without the reset (only trailing `user` entries are popped, which
can't shift the index of an earlier `model` partial), but every other
history-mutation method in the class — `clearHistory`, `addHistory`,
`setHistory`, `truncateHistory`, `stripThoughtsFromHistory` — now
clears the partial-push state in lockstep. Omitting it here would be
a silent exception to the uniform invariant the helper extraction
established. A future caller invoking this between the deferred JSONL
flush and the next `sendMessageStream` would otherwise leave a stale
marker that happens to line up with whatever model entry is at that
index in the meanwhile.

Tests: 239/239 core (+2 new), 95/95 useGeminiStream. tsc + eslint +
prettier clean. Latest CI run: all three platforms (Linux/macOS/
Windows) green.
---
 packages/core/src/core/geminiChat.test.ts | 112 ++++++++++++++++++++++
 packages/core/src/core/geminiChat.ts      |  57 ++++++++++-
 2 files changed, 166 insertions(+), 3 deletions(-)

diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index 048d70fde3f..686ed49aad1 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -4092,6 +4092,118 @@ describe('GeminiChat', async () => {
       const fr = chat.getHistory()[2]!.parts![0]!.functionResponse;
       expect((fr?.response as { error?: string })?.error).toBe('custom reason');
     });
+
+    it('does NOT synthesize when the real functionResponse lives in a non-adjacent later user turn', () => {
+      // Regression for the qwen-latest-series-invite-beta-v28 thread on
+      // PR #4176: shape `[user, model[fc], user[text], user[fr_real]]`
+      // arises when the user aborts a long-running tool, types a
+      // follow-up text turn, and then the React scheduler's late
+      // submitQuery appends the real tool_result as a SEPARATE user
+      // entry. The repair pass previously checked only history[i+1]
+      // (the immediate next turn) — the real fr at history[i+2] was
+      // invisible, so the repair synthesized a duplicate `error` fr for
+      // a callId that already had a real result. The downstream dedup
+      // in `useGeminiStream.handleCompletedTools` then dropped the REAL
+      // tool_result on the next submission (its callId was now
+      // "already in history" because of the synthetic), and the model
+      // only ever saw the error placeholder — silently swapping a real
+      // success for an error.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'open /tmp/long.txt' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: {
+                id: 'call_nonadjacent_real',
+                name: 'read_file',
+                args: { path: '/tmp/long.txt' },
+              },
+            },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'never mind, do something else' }] },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'call_nonadjacent_real',
+                name: 'read_file',
+                response: { output: 'real file contents' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      // No synthesis: the real fr at history[3] satisfies the model[fc]
+      // at history[1].
+      expect(result.injected).toEqual([]);
+      const history = chat.getHistory();
+      // History shape is unchanged: 4 entries, no inserted synthetic.
+      expect(history.length).toBe(4);
+      expect(history[3]!.parts![0]!.functionResponse?.response).toEqual({
+        output: 'real file contents',
+      });
+      // The follow-up text turn is untouched (no synthetic fr hoisted
+      // into it — confirms the forward-scan fix, not just a no-op
+      // synthesis).
+      expect(history[2]!.parts).toEqual([
+        { text: 'never mind, do something else' },
+      ]);
+    });
+
+    it('still synthesizes for a partial mismatch when forward scan misses one callId', () => {
+      // Counterpart to the above: when the real fr only covers SOME
+      // of the callIds in a parallel tool_use, the missing ones must
+      // still get synthetic error fr's hoisted onto the immediate next
+      // user turn (preserving the existing parallel-tool_use behavior).
+      // Confirms the forward-scan didn't accidentally over-collect.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'fan out two reads' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: { id: 'cid_a', name: 'read_file', args: {} },
+            },
+            {
+              functionCall: { id: 'cid_b', name: 'read_file', args: {} },
+            },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'follow up' }] },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_a',
+                name: 'read_file',
+                response: { output: 'real for a' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      // cid_a satisfied by real fr at history[3]; cid_b missing →
+      // synthetic for cid_b only.
+      expect(result.injected).toEqual([{ callId: 'cid_b', name: 'read_file' }]);
+      const history = chat.getHistory();
+      expect(history.length).toBe(4);
+      // Synthetic for cid_b hoisted into history[2] (immediate next
+      // user turn) before the text part.
+      expect(history[2]!.parts![0]!.functionResponse?.id).toBe('cid_b');
+      expect(history[2]!.parts![1]).toEqual({ text: 'follow up' });
+      // Real fr for cid_a still at history[3], untouched.
+      expect(history[3]!.parts![0]!.functionResponse?.id).toBe('cid_a');
+    });
   });
 
   describe('output token recovery', () => {
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index b267b0ae62b..8ab9947bd58 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -441,18 +441,35 @@ export function repairOrphanedToolUseTurns(
     }
     if (expected.size === 0) continue;
 
-    const next = history[i + 1];
+    // Scan forward across EVERY consecutive user turn until the next
+    // non-user (model) entry, gathering all functionResponse ids. The
+    // real tool_result for this model[fc] may live in a non-adjacent
+    // user turn — common shape: user aborts a long-running tool, types
+    // a follow-up text turn, then the React scheduler's late
+    // submitQuery appends the real `user[fr]` as a SEPARATE turn,
+    // producing `model[fc], user[text], user[fr_real]`. Without
+    // forward scanning, `matched` would be empty (only `user[text]`
+    // visited), the repair would synthesize an `error` `functionResponse`
+    // for a callId that already has a real result downstream, and the
+    // `handleCompletedTools` dedup would then drop the REAL result on
+    // the next submission (callId already in history → submit skipped),
+    // so the model only ever sees the synthetic error placeholder.
+    // (qwen-latest-series-invite-beta-v28 thread on PR #4176.)
     const matched = new Set<string>();
-    if (next?.role === 'user') {
-      for (const part of next.parts ?? []) {
+    let scanIdx = i + 1;
+    while (scanIdx < history.length && history[scanIdx]?.role === 'user') {
+      for (const part of history[scanIdx].parts ?? []) {
         const id = part.functionResponse?.id;
         if (id) matched.add(id);
       }
+      scanIdx++;
     }
 
     const missing = [...expected.entries()].filter(([id]) => !matched.has(id));
     if (missing.length === 0) continue;
 
+    const next = history[i + 1];
+
     const syntheticParts: Part[] = missing.map(([callId, name]) => ({
       functionResponse: {
         id: callId,
@@ -1622,6 +1639,18 @@ export class GeminiChat {
     ) {
       this.history.pop();
     }
+    // Today this is safe even without the reset — only trailing user
+    // entries are popped, which can't shift the index of an earlier
+    // `model` partial. But every other history-mutation method now
+    // clears the partial-push state in lockstep
+    // (clearHistory/addHistory/setHistory/truncateHistory/
+    // stripThoughtsFromHistory), so omitting it here would be a silent
+    // exception to the uniform invariant: a future caller invoking
+    // this method between the deferred JSONL flush and the next
+    // `sendMessageStream` would otherwise leave a stale marker that
+    // happens to line up with whatever model entry is at that index
+    // in the meanwhile.
+    this.clearPendingPartialState();
   }
 
   /**
@@ -1886,6 +1915,28 @@ export class GeminiChat {
         // partial `model[functionCall]` as a stale leading model turn in
         // front of the retry's real response.
         this.pendingPartialAssistantTurnIndex = this.history.length - 1;
+        // Trace the push event so the lifecycle is observable end-to-end:
+        // dedup in `useGeminiStream.handleCompletedTools` already logs
+        // `[REPAIR] Dropping ...`, and `repairOrphanedToolUseTurnsInHistory`
+        // logs `[REPAIR] Synthesized ...`. Without a corresponding
+        // `[PARTIAL_PUSH]` line here, an investigator looking at a
+        // stale-partial wedge sees the downstream symptom but has no
+        // anchor for when/why the partial originated.
+        debugLogger.warn(
+          '[PARTIAL_PUSH] Persisting partial assistant turn for ' +
+            'mid-stream error recovery (will be rolled back if retry ' +
+            'succeeds, kept if break is unretryable). ' +
+            `pendingIndex=${this.pendingPartialAssistantTurnIndex} ` +
+            `callIds=${consolidatedHistoryParts
+              .map((p) => p.functionCall?.id)
+              .filter((id): id is string => Boolean(id))
+              .join(',')} ` +
+            `error=${
+              streamError instanceof Error
+                ? streamError.message
+                : String(streamError)
+            }`,
+        );
       }
       throw streamError;
     }

From ce68749b51d76592ca3c33364a4fdb03d923513a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Wed, 20 May 2026 09:50:15 +0800
Subject: [PATCH 12/18] fix(core): close 6 review threads on partial-tool_use
 repair (PR #4176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Six follow-ups from the qwen-latest-series-invite-beta-v28 + gpt-5.5
passes on commit 2880de577:

[A] CORRECTNESS — `repairOrphanedToolUseTurns` now HOISTS a real
`functionResponse` from a non-adjacent later user turn into the
IMMEDIATE next user turn (placed before non-fr parts), in addition to
the forward-scan dedup added in 2880de577. The shape `model[fc],
user[text], user[fr_real]` arises when a user aborts a long-running
tool, types a follow-up text turn, and the React scheduler's late
`submitQuery` appends the real `tool_result` as a SEPARATE user entry.
Forward-scan alone correctly skipped the synthesis duplicate, but the
wire layout still serialized `model[tool_use] → user[text] →
user[tool_result]`, which Anthropic-compatible backends reject with
"tool_use_id ... must have a corresponding tool_use block in the
previous message". The hoist physically moves the real fr part out of
its original turn into history[i+1], drops the source turn if it
becomes empty, and reuses the existing front-of-user-turn
insertion logic so caller-supplied fr ordering stays preserved.
Hoisted ids are NOT added to `injected` — the real fr is already in
history (just relocated), so the React scheduler's history-based
dedup handles them naturally without an extra entry that would
double-trigger the dedup-drop log.

Three new tests in `geminiChat.test.ts`:
  - "hoists the real functionResponse from a non-adjacent later user
    turn into the adjacent one" — pure relocate case (source turn had
    only the fr → removed).
  - "synthesizes missing fr AND hoists real fr in a parallel tool_use
    mismatch" — parallel `model[fc_a, fc_b]` with real fr_a in a
    non-adjacent turn: cid_b synthesized, cid_a hoisted, source turn
    removed.
  - "hoists real fr but preserves the source user turn when it
    carries other content" — pins the empty-turn cleanup so it only
    drops turns whose parts list goes to zero (mixed text + fr source
    keeps its text after the fr is extracted).

[B] CRITICAL — moved the deferred chat-recording flush from BEFORE the
max-tokens escalation block into the outer `finally`. The escalated
`makeApiCallAndProcessStream → processStreamResponse` can set a NEW
`pendingPartialAssistantRecord` if it errors mid-tool_use, and that
throw escapes through the for-await without touching the (now-passed)
retry-loop catch. Before this fix the new record was never appended
to JSONL: live history retained the partial `model[functionCall]`
that the escalated processStreamResponse pushed, but the durable
transcript silently dropped it; on `--resume` the transcript-load
path didn't see the partial, `repairOrphanedToolUseTurnsInHistory`
found nothing to repair, and the React scheduler's late real result
became a permanent orphan — reproducing the exact wedge this PR
prevents. Putting the flush in `finally` covers all throw paths
(escalation, post-retry-loop `throw lastError`, consumer `.return()`)
and the normal completion path with one statement, while keeping the
"marker and stash cleared in lockstep" invariant.

One new regression test: "flushes the JSONL record when escalated
stream throws mid-tool_use" — pins the flush behavior under the
specific shape (initial MAX_TOKENS success → escalation cuts mid-
tool_use → throw). Asserts both that the partial survives in
`this.history` and that `recordAssistantTurn` got called with the
partial functionCall id.

[C] DOC — updated `repairOrphanedToolUseTurns` doc comment to spell
out both fix-ups (synthesize + hoist), the wire-format consequence of
removal ("tool_use_id ... must have a corresponding tool_use block"),
and the rule that hoisted ids are not in the returned `injected` list.
The previous "immediately following user turn" wording contradicted
the forward-scan implementation; the new wording matches the actual
behavior across both fix-up paths.

[D] OBSERVABILITY — inline `repairOrphanedToolUseTurns` call in
`sendMessageStream` now logs `[REPAIR] sendMessageStream inline pass
synthesized N functionResponse(s) ...` when synthesis fires. The
startChat() path already logs via `repairOrphanedToolUseTurnsInHistory`
(`[REPAIR] Synthesized ...`), and `useGeminiStream.handleCompletedTools`
logs `[REPAIR] Dropping ...` on dedup. Without a tagged log at the
inline call site, an investigator looking at a dedup-drop had no way
to tell whether the synthetic was planted at session-load or at the
per-send pass.

[E] OBSERVABILITY — `popPartialIfPushed` now logs `[PARTIAL_POP]
Splice skipped` when the marker is set but `history.length <= idx` or
`history[idx]?.role !== 'model'`. This can't happen today (every
history-mutation method clears the marker in lockstep), but the warn
makes any future regression observable rather than silent: a
mutation path that forgets to clear would otherwise leave a stale
partial in history with no diagnostic trace.

[F] CLARITY — added a comment at the post-`tryCompress`
`popPartialIfPushed()` call explaining the intentional defense-in-
depth no-op. `tryCompress()` succeeds → `setHistory()` →
`clearPendingPartialState()`, so the marker is null by the time the
pop runs. The call is kept (not removed) so a future refactor that
switches `tryCompress` to in-place mutation would still drop the
stale partial before `requestContents` is rebuilt.

Tests: 336/336 (108 in geminiChat.test.ts + 4 new = +4; 95
useGeminiStream + 133 client unchanged). tsc + eslint + prettier
clean on changed files.
---
 packages/core/src/core/geminiChat.test.ts | 247 +++++++++++++++---
 packages/core/src/core/geminiChat.ts      | 292 +++++++++++++++++-----
 2 files changed, 432 insertions(+), 107 deletions(-)

diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index 686ed49aad1..c498ef58e26 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -4093,21 +4093,22 @@ describe('GeminiChat', async () => {
       expect((fr?.response as { error?: string })?.error).toBe('custom reason');
     });
 
-    it('does NOT synthesize when the real functionResponse lives in a non-adjacent later user turn', () => {
-      // Regression for the qwen-latest-series-invite-beta-v28 thread on
-      // PR #4176: shape `[user, model[fc], user[text], user[fr_real]]`
-      // arises when the user aborts a long-running tool, types a
-      // follow-up text turn, and then the React scheduler's late
-      // submitQuery appends the real tool_result as a SEPARATE user
-      // entry. The repair pass previously checked only history[i+1]
-      // (the immediate next turn) — the real fr at history[i+2] was
-      // invisible, so the repair synthesized a duplicate `error` fr for
-      // a callId that already had a real result. The downstream dedup
-      // in `useGeminiStream.handleCompletedTools` then dropped the REAL
-      // tool_result on the next submission (its callId was now
-      // "already in history" because of the synthetic), and the model
-      // only ever saw the error placeholder — silently swapping a real
-      // success for an error.
+    it('hoists the real functionResponse from a non-adjacent later user turn into the adjacent one', () => {
+      // Regression for the gpt-5.5 thread on PR #4176: shape
+      // `[user, model[fc], user[text], user[fr_real]]` arises when the
+      // user aborts a long-running tool, types a follow-up text turn,
+      // and then the React scheduler's late submitQuery appends the
+      // real tool_result as a SEPARATE user entry.
+      //
+      // Forward scanning alone (qwen-latest-series-invite-beta-v28
+      // thread, fixed in commit 2880de577) prevents the *synthesis*
+      // duplicate, but the wire layout is still
+      // `model[tool_use] → user[text] → user[tool_result]`, which
+      // Anthropic-compatible backends reject because the tool_result
+      // is not at the head of the IMMEDIATELY following user message.
+      // The repair must MOVE the real fr from history[3] into
+      // history[2] (before the text part) so the wire format becomes
+      // `model[tool_use] → user[tool_result, text]`.
       chat.setHistory([
         { role: 'user', parts: [{ text: 'open /tmp/long.txt' }] },
         {
@@ -4139,29 +4140,33 @@ describe('GeminiChat', async () => {
 
       const result = chat.repairOrphanedToolUseTurns();
 
-      // No synthesis: the real fr at history[3] satisfies the model[fc]
-      // at history[1].
+      // No synthesis (the fr is real, just relocated) — `injected`
+      // stays empty so the React scheduler dedup doesn't see it as a
+      // synthesized callId.
       expect(result.injected).toEqual([]);
       const history = chat.getHistory();
-      // History shape is unchanged: 4 entries, no inserted synthetic.
-      expect(history.length).toBe(4);
-      expect(history[3]!.parts![0]!.functionResponse?.response).toEqual({
+      // History is now 3 entries: the source turn for the hoisted fr
+      // had only the one fr part, so it becomes empty and is removed.
+      expect(history.length).toBe(3);
+      // Real fr now at the head of the immediate next user turn,
+      // before the text part, satisfying the wire-format invariant.
+      expect(history[2]!.parts![0]!.functionResponse?.id).toBe(
+        'call_nonadjacent_real',
+      );
+      expect(history[2]!.parts![0]!.functionResponse?.response).toEqual({
         output: 'real file contents',
       });
-      // The follow-up text turn is untouched (no synthetic fr hoisted
-      // into it — confirms the forward-scan fix, not just a no-op
-      // synthesis).
-      expect(history[2]!.parts).toEqual([
-        { text: 'never mind, do something else' },
-      ]);
+      expect(history[2]!.parts![1]).toEqual({
+        text: 'never mind, do something else',
+      });
     });
 
-    it('still synthesizes for a partial mismatch when forward scan misses one callId', () => {
-      // Counterpart to the above: when the real fr only covers SOME
-      // of the callIds in a parallel tool_use, the missing ones must
-      // still get synthetic error fr's hoisted onto the immediate next
-      // user turn (preserving the existing parallel-tool_use behavior).
-      // Confirms the forward-scan didn't accidentally over-collect.
+    it('synthesizes missing fr AND hoists real fr in a parallel tool_use mismatch', () => {
+      // Counterpart to the hoist case: when the real fr only covers
+      // SOME callIds in a parallel tool_use, and the real one is in a
+      // non-adjacent later user turn, BOTH fix-ups apply on the same
+      // model turn — synthesize the missing callId AND hoist the real
+      // fr from the non-adjacent location into the adjacent turn.
       chat.setHistory([
         { role: 'user', parts: [{ text: 'fan out two reads' }] },
         {
@@ -4192,17 +4197,73 @@ describe('GeminiChat', async () => {
 
       const result = chat.repairOrphanedToolUseTurns();
 
-      // cid_a satisfied by real fr at history[3]; cid_b missing →
-      // synthetic for cid_b only.
+      // cid_b synthesized (no real fr anywhere). cid_a is hoisted, not
+      // synthesized — `injected` only contains the synthetic.
       expect(result.injected).toEqual([{ callId: 'cid_b', name: 'read_file' }]);
       const history = chat.getHistory();
+      // The non-adjacent turn that held cid_a's real fr is now empty
+      // and removed → 3 entries instead of the original 4.
+      expect(history.length).toBe(3);
+      // Adjacent user turn now leads with the synthesized fr_b, then
+      // the hoisted real fr_a, then the text. Both tool_results sit
+      // at the head, satisfying the Anthropic wire-format invariant.
+      const adjacentParts = history[2]!.parts!;
+      expect(adjacentParts[0]!.functionResponse?.id).toBe('cid_b');
+      expect(
+        (adjacentParts[0]!.functionResponse?.response as { error?: string })
+          ?.error,
+      ).toBeDefined();
+      expect(adjacentParts[1]!.functionResponse?.id).toBe('cid_a');
+      expect(adjacentParts[1]!.functionResponse?.response).toEqual({
+        output: 'real for a',
+      });
+      expect(adjacentParts[2]).toEqual({ text: 'follow up' });
+    });
+
+    it('hoists real fr but preserves the source user turn when it carries other content', () => {
+      // Edge case for the hoist path: if the source turn for the real
+      // fr ALSO carries text (or any non-fr part), removing the fr
+      // alone must NOT delete the turn — the remaining text is the
+      // user's real message and must be preserved at its original
+      // position. Confirms the empty-turn cleanup only deletes turns
+      // whose parts list goes to zero after the splice.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'kick off' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: { id: 'cid_mix', name: 'read_file', args: {} },
+            },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'never mind' }] },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_mix',
+                name: 'read_file',
+                response: { output: 'data' },
+              },
+            },
+            { text: 'thanks anyway' },
+          ],
+        },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      expect(result.injected).toEqual([]);
+      const history = chat.getHistory();
+      // The source turn lost its fr but kept its trailing text, so
+      // history is still 4 entries — the source turn survives as a
+      // text-only user message.
       expect(history.length).toBe(4);
-      // Synthetic for cid_b hoisted into history[2] (immediate next
-      // user turn) before the text part.
-      expect(history[2]!.parts![0]!.functionResponse?.id).toBe('cid_b');
-      expect(history[2]!.parts![1]).toEqual({ text: 'follow up' });
-      // Real fr for cid_a still at history[3], untouched.
-      expect(history[3]!.parts![0]!.functionResponse?.id).toBe('cid_a');
+      expect(history[2]!.parts![0]!.functionResponse?.id).toBe('cid_mix');
+      expect(history[2]!.parts![1]).toEqual({ text: 'never mind' });
+      expect(history[3]!.parts).toEqual([{ text: 'thanks anyway' }]);
     });
   });
 
@@ -4503,6 +4564,112 @@ describe('GeminiChat', async () => {
         .join('');
       expect(mergedText).toBe('BCD');
     });
+
+    it('flushes the JSONL record when escalated stream throws mid-tool_use (qwen-latest-series-invite-beta-v28 thread on PR #4176)', async () => {
+      // Critical regression for the max-tokens escalation path:
+      // 1) initial stream succeeds with text + MAX_TOKENS → triggers
+      //    escalation, no partial set, deferred record clean.
+      // 2) escalated stream throws AFTER yielding a functionCall chunk
+      //    → processStreamResponse pushes a partial model[fc] into
+      //    `this.history` and stashes a NEW `pendingPartialAssistantRecord`.
+      // 3) The throw escapes through the for-await on the escalated
+      //    stream, propagates past the (now-passed) retry loop, and
+      //    lands in the outer `finally` block.
+      //
+      // BEFORE the fix: the flush only ran BEFORE the escalation block,
+      // so the new record set in step 2 was never appended to JSONL —
+      // live history disagreed with disk; `--resume` rehydrated a
+      // truncated transcript and `repairOrphanedToolUseTurnsInHistory`
+      // had nothing to repair, leaving the React scheduler's late real
+      // result as a permanent orphan.
+      //
+      // AFTER the fix: the flush is in `finally`, so the record lands
+      // on disk regardless of which stream raised.
+      const recordAssistantTurn = vi.fn();
+      const chatWithRecording = new GeminiChat(
+        mockConfig,
+        config,
+        [],
+        {
+          recordAssistantTurn,
+          recordChatCompression: vi.fn(),
+        } as unknown as ConstructorParameters<typeof GeminiChat>[3],
+        uiTelemetryService,
+      );
+
+      // Stream 1: text + MAX_TOKENS (success, triggers escalation).
+      // Stream 2: yields a functionCall chunk THEN throws — simulates a
+      // mid-tool_use stream cut on the escalated request.
+      const streams = [
+        makeStream([makeChunk([{ text: 'partial answer' }], 'MAX_TOKENS')]),
+        (async function* () {
+          yield {
+            candidates: [
+              {
+                content: {
+                  parts: [
+                    {
+                      functionCall: {
+                        id: 'call_escalation_throw',
+                        name: 'read_file',
+                        args: { path: '/tmp/escalated.txt' },
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+          throw new Error('synthetic mid-tool_use cut on escalated stream');
+        })(),
+      ];
+      let callIndex = 0;
+      vi.mocked(mockContentGenerator.generateContentStream).mockImplementation(
+        async () => streams[callIndex++]!,
+      );
+
+      const stream = await chatWithRecording.sendMessageStream(
+        'gemini-3-pro',
+        { message: 'kick off' },
+        'prompt-escalation-flush',
+      );
+
+      // Consume the stream and expect the synthetic mid-tool_use error
+      // to escape (escalation errors do not retry).
+      await expect(
+        (async () => {
+          for await (const _ of stream) {
+            /* consume */
+          }
+        })(),
+      ).rejects.toThrow(/synthetic mid-tool_use cut/);
+
+      // In-memory: the partial functionCall pushed by the escalated
+      // processStreamResponse must be in history.
+      const history = chatWithRecording.getHistory();
+      const partialModel = history.findLast((h) => h.role === 'model');
+      expect(
+        partialModel?.parts?.some(
+          (p) => p.functionCall?.id === 'call_escalation_throw',
+        ),
+      ).toBe(true);
+
+      // JSONL: at least one record must mention the partial functionCall
+      // (the escalation throw flushed it). Without the finally-block
+      // flush, this assertion would fail and the durable transcript
+      // would silently lose a tool_use that's still live in memory.
+      const recordedHasPartial = recordAssistantTurn.mock.calls.some((call) => {
+        const message = (
+          call[0] as {
+            message?: Array<{ functionCall?: { id?: string } }>;
+          }
+        )?.message;
+        return message?.some(
+          (p) => p.functionCall?.id === 'call_escalation_throw',
+        );
+      });
+      expect(recordedHasPartial).toBe(true);
+    });
   });
 
   describe('redactStructuredOutputArgsForRecording', () => {
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 8ab9947bd58..736c07bef6f 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -397,16 +397,38 @@ const ORPHAN_TOOL_USE_REPAIR_REASON =
 
 /**
  * Walk `history` left-to-right and close every dangling tool_use ↔ tool_result
- * pair by synthesizing a `functionResponse` with an `error` field for any
- * `functionCall` part whose `id` is not echoed back in the immediately
- * following user turn.
+ * pair so the wire format the next API call sees is always
+ * `model[fc] → user[fr]` with the `fr` blocks at the head of the immediately
+ * following user turn. Two fix-ups can run for each `model[functionCall]`:
+ *
+ *  - SYNTHESIZE: for any `functionCall.id` not echoed back by ANY of the
+ *    consecutive user turns that follow it (up to the next model turn or
+ *    end-of-history), insert a synthetic `functionResponse` carrying an
+ *    `error` field — the close analogue of upstream Claude Code's
+ *    `yieldMissingToolResultBlocks` (`query.ts:123-149`).
+ *  - HOIST: for any `functionCall.id` whose real `functionResponse` lives in
+ *    a non-adjacent following user turn (typical shape:
+ *    `model[fc], user[text], user[fr_real]` — produced when a user aborts a
+ *    long-running tool, types a follow-up, and the React scheduler's late
+ *    `submitQuery` appends the real `fr` as a SEPARATE user entry), MOVE the
+ *    real `fr` part out of its original turn into the adjacent one. Without
+ *    hoisting, the synthesis pass correctly skips the call (a real `fr`
+ *    exists somewhere later) but the wire layout still serializes
+ *    `model[tool_use] → user[text] → user[tool_result]`, which
+ *    Anthropic-compatible backends reject with "tool_use_id ... must have a
+ *    corresponding tool_use block in the previous message". gpt-5.5 review
+ *    thread on PR #4176.
  *
  * Mutates `history` in place and returns the set of injected `(callId, name)`
  * tuples so callers (the React tool scheduler) can dedupe a real `tool_result`
- * if the in-flight tool completes after the repair.
+ * if the in-flight tool completes after the repair. Hoisted ids are NOT in
+ * the returned list — the real `fr` is already present in history, so the
+ * scheduler's existing history-based dedup handles them without extra entries.
  *
- * The synthesis target follows this rule:
- *  - If the next entry is a `user` turn → append synthetic parts to it.
+ * The injection target for synthesized parts follows this rule:
+ *  - If the next entry is a `user` turn → insert synthetic parts at the head
+ *    (before any non-`functionResponse` parts; after any pre-existing real
+ *    `functionResponse` parts so caller-supplied ordering is preserved).
  *  - If the next entry is a `model` turn or end-of-history → insert a new
  *    `user` turn between them carrying just the synthetic parts.
  *
@@ -442,81 +464,149 @@ export function repairOrphanedToolUseTurns(
     if (expected.size === 0) continue;
 
     // Scan forward across EVERY consecutive user turn until the next
-    // non-user (model) entry, gathering all functionResponse ids. The
-    // real tool_result for this model[fc] may live in a non-adjacent
-    // user turn — common shape: user aborts a long-running tool, types
-    // a follow-up text turn, then the React scheduler's late
-    // submitQuery appends the real `user[fr]` as a SEPARATE turn,
-    // producing `model[fc], user[text], user[fr_real]`. Without
-    // forward scanning, `matched` would be empty (only `user[text]`
-    // visited), the repair would synthesize an `error` `functionResponse`
-    // for a callId that already has a real result downstream, and the
-    // `handleCompletedTools` dedup would then drop the REAL result on
-    // the next submission (callId already in history → submit skipped),
-    // so the model only ever sees the synthetic error placeholder.
-    // (qwen-latest-series-invite-beta-v28 thread on PR #4176.)
-    const matched = new Set<string>();
+    // non-user (model) entry, recording (id → location) for every
+    // functionResponse part. The real tool_result for this model[fc]
+    // may live in a non-adjacent user turn — common shape: user aborts
+    // a long-running tool, types a follow-up text turn, then the React
+    // scheduler's late submitQuery appends the real `user[fr]` as a
+    // SEPARATE user entry, producing `model[fc], user[text], user[fr_real]`.
+    //
+    // We need the full location (turn index + part index) for two
+    // reasons:
+    //  - synthesis: skip ids that already have a real fr SOMEWHERE so we
+    //    don't plant a duplicate `error` placeholder. Without this, the
+    //    `handleCompletedTools` history-based dedup would then drop the
+    //    REAL result on the next submission (callId already in history
+    //    via the synthetic), and the model would only ever see the error.
+    //  - hoist: ids matched in a NON-adjacent later user turn must be
+    //    physically moved into history[i+1] (before non-fr parts) so the
+    //    wire format stays `model[tool_use] → user[tool_result, ...]`.
+    //    Anthropic-compatible backends require the tool_result blocks at
+    //    the head of the immediately following user message, otherwise
+    //    they reject with "tool_use_id ... must have a corresponding
+    //    tool_use block in the previous message". gpt-5.5 review thread
+    //    on PR #4176.
+    const matched = new Map<
+      string,
+      { turnIdx: number; partIdx: number; part: Part }
+    >();
     let scanIdx = i + 1;
     while (scanIdx < history.length && history[scanIdx]?.role === 'user') {
-      for (const part of history[scanIdx].parts ?? []) {
+      const parts = history[scanIdx].parts ?? [];
+      for (let pIdx = 0; pIdx < parts.length; pIdx++) {
+        const part = parts[pIdx];
         const id = part.functionResponse?.id;
-        if (id) matched.add(id);
+        if (id && !matched.has(id)) {
+          matched.set(id, { turnIdx: scanIdx, partIdx: pIdx, part });
+        }
       }
       scanIdx++;
     }
 
-    const missing = [...expected.entries()].filter(([id]) => !matched.has(id));
-    if (missing.length === 0) continue;
-
-    const next = history[i + 1];
+    const synthesizeIds: Array<[string, string]> = [];
+    const hoistLocations: Array<{
+      turnIdx: number;
+      partIdx: number;
+      part: Part;
+    }> = [];
+    for (const [id, name] of expected) {
+      const loc = matched.get(id);
+      if (!loc) {
+        synthesizeIds.push([id, name]);
+      } else if (loc.turnIdx !== i + 1) {
+        hoistLocations.push(loc);
+      }
+      // else: already in the immediate next user turn — wire format ok.
+    }
+    if (synthesizeIds.length === 0 && hoistLocations.length === 0) continue;
 
-    const syntheticParts: Part[] = missing.map(([callId, name]) => ({
+    const syntheticParts: Part[] = synthesizeIds.map(([callId, name]) => ({
       functionResponse: {
         id: callId,
         name,
         response: { error: reason },
       },
     }));
+    // Hoisted parts come from REAL tool_results elsewhere in history.
+    // We capture references first, then splice them out below in
+    // descending order so earlier removals don't shift later indices.
+    const hoistedParts: Part[] = hoistLocations.map((loc) => loc.part);
+    // Synthetics first, then hoisted. Order between synthetic and
+    // hoisted is internal — backends just need ALL tool_results at the
+    // head of the user turn, regardless of order among themselves.
+    const partsToInject: Part[] = [...syntheticParts, ...hoistedParts];
+
+    // Remove hoisted parts from their original turns. Sort by
+    // (turnIdx desc, partIdx desc) so each splice operates on stable
+    // indices for everything still to be removed.
+    const removalOrder = [...hoistLocations].sort((a, b) => {
+      if (a.turnIdx !== b.turnIdx) return b.turnIdx - a.turnIdx;
+      return b.partIdx - a.partIdx;
+    });
+    for (const loc of removalOrder) {
+      const turnParts = history[loc.turnIdx].parts;
+      if (turnParts) turnParts.splice(loc.partIdx, 1);
+    }
+    // Drop any user turn within the scan range (i+2 .. scanIdx-1) that
+    // is now empty because we extracted its only fr part(s). Walk back
+    // to front so removals don't shift remaining indices. The
+    // immediately-adjacent turn at i+1 is preserved even if empty —
+    // we'll rewrite its parts below. After this, scanIdx is no longer
+    // accurate; we rebind via `next` directly.
+    for (let j = scanIdx - 1; j > i + 1; j--) {
+      if (
+        history[j]?.role === 'user' &&
+        (history[j].parts?.length ?? 0) === 0
+      ) {
+        history.splice(j, 1);
+      }
+    }
 
+    const next = history[i + 1];
     if (next?.role === 'user') {
-      // Synthetic functionResponse parts MUST be placed before any
-      // non-functionResponse parts in the user turn. Anthropic-compatible
-      // backends reject a user message whose first content block isn't
-      // the tool_result that answers the immediately preceding tool_use
-      // ("tool result must follow tool use" / "tool_use_id ... must have
-      // a corresponding tool_use block in the previous message"). Common
-      // case after a Ctrl+Y race: the user's retry-prompt text was just
-      // pushed, so `next.parts = [text]`; appending the synthetic to the
-      // end would produce `[text, fr]` and re-trigger the wedge this PR
+      // Synthesized + hoisted functionResponse parts MUST be placed
+      // before any non-functionResponse parts in the user turn.
+      // Anthropic-compatible backends reject a user message whose first
+      // content block isn't the tool_result that answers the
+      // immediately preceding tool_use ("tool result must follow tool
+      // use" / "tool_use_id ... must have a corresponding tool_use
+      // block in the previous message"). Common case after a Ctrl+Y
+      // race: the user's retry-prompt text was just pushed, so
+      // `next.parts = [text]`; appending the synthetic to the end
+      // would produce `[text, fr]` and re-trigger the wedge this PR
       // is supposed to escape. Mirrors upstream Claude Code's
       // `hoistToolResults` (`utils/messages.ts`).
       //
       // CONSEQUENCE OF REMOVAL: dropping this hoist (e.g. naively
-      // `next.parts = [...existing, ...syntheticParts]`) re-introduces
+      // `next.parts = [...existing, ...partsToInject]`) re-introduces
       // the exact 400 "tool_use_id ... must have a corresponding tool_use
       // block in the previous message" the synthesis pass exists to
       // prevent. The whole repair becomes a no-op and the session stays
       // wedged. Do not "simplify" this branch.
       //
-      // Place synthetics AFTER any pre-existing functionResponse parts
-      // (real tool_results the user is supplying in the same turn) so
-      // their original ordering is preserved.
+      // Place new parts AFTER any pre-existing functionResponse parts
+      // (real tool_results the user already had at the head of this
+      // turn) so caller-supplied ordering is preserved.
       const existing = next.parts ?? [];
       const firstNonFr = existing.findIndex((part) => !part.functionResponse);
       const insertAt = firstNonFr === -1 ? existing.length : firstNonFr;
       next.parts = [
         ...existing.slice(0, insertAt),
-        ...syntheticParts,
+        ...partsToInject,
         ...existing.slice(insertAt),
       ];
     } else {
-      history.splice(i + 1, 0, { role: 'user', parts: syntheticParts });
+      history.splice(i + 1, 0, { role: 'user', parts: partsToInject });
       // Skip the freshly-inserted user turn so the outer loop doesn't
       // visit it as a model turn (it isn't) and stays linear-time.
       i++;
     }
 
-    for (const [callId, name] of missing) {
+    // Only synthesized ids feed dedup — hoisted ids reference real frs
+    // that were ALREADY in history before this pass and remain present
+    // (just relocated). The scheduler's history-based dedup will
+    // continue to handle those naturally on its next pass.
+    for (const [callId, name] of synthesizeIds) {
       injected.push({ callId, name });
     }
   }
@@ -930,7 +1020,25 @@ export class GeminiChat {
       // The React scheduler's late real result is then dedup'd against
       // chat.history in `useGeminiStream.handleCompletedTools` so the
       // synthetic doesn't collide with it on the wire.
-      repairOrphanedToolUseTurns(this.history);
+      //
+      // Diagnostic: log non-empty inline-repair results. The startChat()
+      // path logs synthesis events through `repairOrphanedToolUseTurnsInHistory`
+      // (`[REPAIR] Synthesized N functionResponse(s) ...`) and dedup events
+      // through `useGeminiStream.handleCompletedTools` (`[REPAIR] Dropping ...`),
+      // but this inline call site was previously silent — when a dedup-drop
+      // log shows up, investigators had no way to tell whether the
+      // synthetic was planted at session-load or at this per-send pass.
+      // Tag the log site so the lifecycle anchor is unambiguous.
+      const inlineRepair = repairOrphanedToolUseTurns(this.history);
+      if (inlineRepair.injected.length > 0) {
+        debugLogger.warn(
+          `[REPAIR] sendMessageStream inline pass synthesized ` +
+            `${inlineRepair.injected.length} functionResponse(s): ` +
+            inlineRepair.injected
+              .map((entry) => `${entry.name}(${entry.callId})`)
+              .join(', '),
+        );
+      }
       requestContents = this.getHistory(true);
     } catch (error) {
       if (userContentAdded) {
@@ -1034,6 +1142,28 @@ export class GeminiChat {
                 self.history[idx]?.role === 'model'
               ) {
                 self.history.splice(idx, 1);
+              } else {
+                // Marker was set but the entry it pointed at is gone or
+                // is no longer a `model` turn. Today this can't happen:
+                // every history-mutation path (clearHistory, addHistory,
+                // setHistory, truncateHistory, stripThoughtsFromHistory,
+                // stripOrphanedUserEntriesFromHistory) calls
+                // clearPendingPartialState() in lockstep, so the marker
+                // is null whenever the index basis is invalidated.
+                // Logging the mismatch makes the invariant observable —
+                // without this, a future caller that mutates history
+                // without resetting the marker would silently leave a
+                // stale partial in `this.history` (popPartialIfPushed
+                // skipping the splice) AND the field-level invariant
+                // that "marker non-null ⇒ a real partial sits at idx"
+                // would be quietly violated. With the warn, anyone
+                // investigating a stale-partial wedge sees a log line
+                // pointing straight at the offending caller.
+                debugLogger.warn(
+                  `[PARTIAL_POP] Splice skipped: idx=${idx}, ` +
+                    `historyLength=${self.history.length}, ` +
+                    `roleAtIdx=${self.history[idx]?.role ?? 'undefined'}`,
+                );
               }
               // Drop both markers in lockstep — the deferred chat-
               // recording record must be discarded alongside the
@@ -1125,6 +1255,21 @@ export class GeminiChat {
                     reactiveInfo.compressionStatus ===
                     CompressionStatus.COMPRESSED
                   ) {
+                    // Defense-in-depth no-op: tryCompress() succeeded
+                    // means it has already replaced this.history via
+                    // setHistory(), which calls clearPendingPartialState()
+                    // — so by the time we reach this line, the marker is
+                    // null and popPartialIfPushed splices nothing. We
+                    // keep the call as a uniformity assertion against
+                    // future refactors that might switch tryCompress to
+                    // an in-place mutation: in that world, the marker
+                    // would NOT be reset by setHistory and this call
+                    // becomes the only thing that drops the stale
+                    // partial before requestContents is rebuilt below.
+                    // Removing it would couple correctness to the
+                    // implementation detail "setHistory always clears
+                    // the marker", which the other retry branches don't
+                    // share.
                     popPartialIfPushed();
                     requestContents = self.getHistory(true);
                     debugLogger.info(
@@ -1237,29 +1382,6 @@ export class GeminiChat {
           }
         }
 
-        // The retry loop has settled: any partial that was rolled back
-        // had its stash cleared by `popPartialIfPushed`; any partial that
-        // survived (success break with no partial set, or unretryable
-        // break with the partial kept) is now durable in memory, so the
-        // deferred chat-recording append must finally land on disk.
-        // Without this flush the unretryable-break path persists the
-        // partial in `this.history` but the JSONL transcript silently
-        // drops it — `--resume` then loads a truncated transcript that
-        // doesn't match the live session shape, and the orphan-tool_use
-        // repair pass at session-load has nothing to repair.
-        if (self.pendingPartialAssistantRecord) {
-          self.chatRecordingService?.recordAssistantTurn(
-            self.pendingPartialAssistantRecord,
-          );
-          // Clear both fields in lockstep. The marker is no longer
-          // load-bearing past this point (its consumer is the for-loop
-          // catch above, which has exited), and the next
-          // sendMessageStream entry would clear it anyway — but pairing
-          // the reset preserves the "marker and stash are always set or
-          // cleared together" invariant the helper enforces.
-          self.clearPendingPartialState();
-        }
-
         // Max output tokens escalation: if the retry loop succeeded with
         // the capped default (8K) but hit MAX_TOKENS, retry once at the
         // model's full output limit. This ensures models with large output
@@ -1432,6 +1554,42 @@ export class GeminiChat {
         }
       } finally {
         streamDoneResolver!();
+        // Flush any deferred partial-tool_use record into the JSONL
+        // transcript. The retry loop and the post-loop max-tokens
+        // escalation block can BOTH leave one of these on the chat:
+        //
+        //  - Retry loop: any partial rolled back by popPartialIfPushed
+        //    has its stash cleared; any partial that survived (success
+        //    break with no partial set, or unretryable break with the
+        //    partial kept) leaves its record set so we record-and-clear
+        //    it here.
+        //  - Max-tokens escalation: the escalated stream re-enters
+        //    `processStreamResponse`, which sets a NEW
+        //    `pendingPartialAssistantRecord` if it errors mid-tool_use.
+        //    That throw propagates through the for-await above without
+        //    touching the (now-passed) retry-loop catch, so without a
+        //    flush in `finally` the partial would be live in
+        //    `this.history` (the escalated processStreamResponse already
+        //    pushed it) but absent from the JSONL transcript. `--resume`
+        //    would then rehydrate a truncated transcript whose live
+        //    history disagrees with disk, and
+        //    `repairOrphanedToolUseTurnsInHistory` would find nothing to
+        //    repair on load — the React scheduler's late real result
+        //    becomes a permanent orphan, reproducing the exact wedge
+        //    this PR prevents.
+        //
+        // Putting the flush in `finally` covers ALL throw paths
+        // (escalation, post-retry-loop `throw lastError`, the for-await
+        // consumer's `.return()` if it abandons the generator) and the
+        // normal completion path with a single statement. The marker
+        // and stash are dropped together to preserve the
+        // "marker non-null ⇔ stash non-null" invariant.
+        if (self.pendingPartialAssistantRecord) {
+          self.chatRecordingService?.recordAssistantTurn(
+            self.pendingPartialAssistantRecord,
+          );
+          self.clearPendingPartialState();
+        }
       }
     })();
   }

From c8fa3143f3f454c1b5f9f835858e5dcbcfd76ea6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Wed, 20 May 2026 13:15:09 +0800
Subject: [PATCH 13/18] fix(core,cli): close 10 review threads on
 partial-tool_use repair (PR #4176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five Critical + four Suggestion threads from the gpt-5.5 +
qwen-latest-series-invite-beta-v34 passes on commit ce68749b5, plus
one N/A documented in code/test comments where the requested test
target is provably unreachable.

[A] CORRECTNESS — `repairOrphanedToolUseTurns` now collects EVERY
location for each `functionResponse` id, not just the first.
Previously the `matched` Map stored only the first occurrence per
callId, so when the same id was echoed back more than once across
the consecutive user turns (shape `model[fc id=cid], user[text],
user[fr cid], user[fr cid]` — possible when the React scheduler
retries the late submitQuery after the orphan repair already planted
one, or two parallel late-submit paths land), hoisting only the
first left a duplicate behind. The wire payload then serialized
`model[tool_use] -> user[tool_result] -> user[tool_result]`, the
backend rejected the trailing block as an orphan, and the session
stayed wedged for the same 400 class this whole repair pass exists
to escape. Fix: track all locations per id; pick the first as the
canonical survivor (hoisted into history[i+1] when non-adjacent, or
left in place when adjacent), and drop EVERY duplicate. New tests:
"drops duplicate functionResponse entries for the same callId across
user turns" (canonical non-adjacent + duplicate further down →
3-entry result), "drops duplicate fr even when the canonical copy is
already in the adjacent turn" (no hoist needed but duplicate cleanup
must still fire). gpt-5.5 review thread on PR #4176.

[B] CORRECTNESS — Max-tokens recovery catch (`recoveryError` block)
now pops the partial `model` turn FIRST, then the
OUTPUT_RECOVERY_MESSAGE user turn. Before the fix, when a recovery
stream errored AFTER yielding a `functionCall` chunk,
`processStreamResponse` pushed a partial `model` turn into history
before re-throwing — so by the time the catch ran, the trailing
entries were `[..., user(OUTPUT_RECOVERY_MESSAGE), model(partial fc)]`
and the naive `if last is user, pop` check no-op'd, stranding the
control prompt as a real user turn. Two consequences: the recovery
prompt's instructions polluted later turns, and the inline repair
on the next sendMessageStream synthesized an `error` `functionResponse`
for the dangling `functionCall` — which the dedup then dropped when
the React scheduler's REAL tool result arrived, so the model saw an
"execution result was not recorded" error for a tool that actually
succeeded. Fix: pop the partial model turn, clear the partial-push
markers, then pop the recovery user turn. New regression test:
"should pop both the partial model turn AND the recovery user
message when recovery throws after a functionCall".
qwen-latest-series-invite-beta-v34 review thread.

[C] PERFORMANCE — added `getHistoryFunctionResponseIds(): Set<string>`
to `GeminiChat` and a wrapper on `GeminiClient`. The dedup pass in
`useGeminiStream.handleCompletedTools` previously called
`geminiClient.getHistory()`, which returns `structuredClone(this.history)`
— a recursive deep clone of every part including large tool outputs.
On long sessions (200+ entries) running on every tool-completion
batch, this caused multi-millisecond stalls on the React UI thread
during streaming. The new accessor walks `this.history` in place
and only collects the id strings the dedup actually needs.
`useGeminiStream` prefers it; falls back to the cloning getHistory()
for older mocks that don't expose it (tests stay green).
qwen-latest-series-invite-beta-v34 review thread.

[D] ROBUSTNESS — wrapped the `finally`-block JSONL flush in
try/catch. Recording-service errors (disk full, write permission,
serialization failure) MUST NOT propagate out of the generator's
`finally` — that would mask the real send outcome (success or
original throw) with a JSONL-write error the caller can't usefully
act on. On failure the partial is dropped from JSONL but stays
durable in `this.history`, so live behavior is unaffected; eventual
consistency is restored on the next successful flush.
qwen-latest-series-invite-beta-v34 review thread.

[E] TEST COVERAGE — added an InvalidStreamError transient-retry
rollback test exercising `popPartialIfPushed()` on the
NO_FINISH_REASON / NO_RESPONSE_TEXT path (separate budget from the
already-tested rate-limit branch). Stream yields a functionCall
then throws InvalidStreamError; retry succeeds; assert no orphan
functionCall remains in history. qwen-latest-series-invite-beta-v34
review thread.

[F] N/A WITH RATIONALE — verified that the InvalidStreamError
content-retry branch (geminiChat.ts ~line 1399) is unreachable for
that error class: `isTransientStreamError` and `isContentError` are
the same predicate (`error instanceof InvalidStreamError`), so the
transient branch above always either continues or breaks before
control reaches the content branch. The `popPartialIfPushed()` call
is preserved as defense-in-depth for a future error class that
should diverge the predicates. Documented at the call site (code
comment) and at the would-be test site (test-block comment) so
future readers see the unreachability analysis instead of repeating
the review-comment ask.

[G] DRIFT GUARD — `recordAssistantTurn` and `this.history.push`
gates now share the single `willPersistToHistory` binding instead of
re-deriving the same `hasToolCall && (thoughtContentPart || ...)`
expression. Drift risk: tightening one without the other would
silently desync the JSONL transcript from in-memory history.
qwen-latest-series-invite-beta-v34 review thread.

[H] DOUBLE-DISPATCH — `clientTools` filter in
`useGeminiStream.handleCompletedTools` now excludes callIds whose
fr is already in history. Previously a deduped client-initiated
tool would have `markToolsAsSubmitted` called twice (once in the
dedup block, once in the clientTools block), triggering an extra
React render cycle. qwen-latest-series-invite-beta-v34 review thread.

[I] TEST COVERAGE — added six tests under "partial-push marker
invariants on history mutation" pinning that each of the six
mutation methods (clearHistory, addHistory, setHistory,
truncateHistory, stripThoughtsFromHistory,
stripOrphanedUserEntriesFromHistory) calls clearPendingPartialState()
in lockstep. Markers planted directly via private-field assignment
(the lifecycle would otherwise clear them in finally before the
test could observe them). Future refactors that drop a
clearPendingPartialState() call from one site are now caught at
test time rather than as a silent stale-partial wedge in production.
qwen-latest-series-invite-beta-v34 review thread.

[J] TEST COVERAGE — added a mixed-batch dedup test in
useGeminiStream.test.tsx: scheduler completes two tools in the same
batch, one whose callId already has an fr in history (deduped), one
fresh. Asserts (a) markToolsAsSubmitted called with both, (b)
recordCompletedToolCall fires exactly once per tool (deduped via
the dedup-loop, fresh via the geminiTools loop — no double-record
on the deduped id), (c) sendMessageStream IS called for the fresh
tool's real result. The `!historyCallIdsWithResponse.has(callId)`
filter on `geminiTools` is the only guard against double-counting
telemetry; existing tests supplied only deduped tools.
qwen-latest-series-invite-beta-v34 review thread.

Tests: 347/347 (118 in geminiChat.test.ts, 96 in
useGeminiStream.test.tsx, 133 in client.test.ts; net +8 new tests).
tsc + eslint + prettier clean on changed files.
---
 .../cli/src/ui/hooks/useGeminiStream.test.tsx | 190 +++++++
 packages/cli/src/ui/hooks/useGeminiStream.ts  |  43 +-
 packages/core/src/core/client.ts              |  16 +
 packages/core/src/core/geminiChat.test.ts     | 477 ++++++++++++++++++
 packages/core/src/core/geminiChat.ts          | 192 +++++--
 5 files changed, 880 insertions(+), 38 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index 797f875899e..e951912cc6b 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -1137,6 +1137,196 @@ describe('useGeminiStream', () => {
     releaseStream();
   });
 
+  it('handles a mixed batch (one deduped + one non-deduped) without double-counting telemetry (qwen-latest-series-invite-beta-v34 thread on PR #4176)', async () => {
+    // The dedup filter on `geminiTools` (`!historyCallIdsWithResponse.has(callId)`)
+    // is the only thing preventing double `recordCompletedToolCall`
+    // for tools whose late real result lands AFTER the orphan-tool_use
+    // repair already planted a synthetic. Existing dedup tests supply
+    // ONLY deduped tools, so a regression that removed that filter
+    // would silently inflate `toolCallCount` (and flip
+    // `skillsModifiedInSession` for the SAME skill-write callId twice)
+    // without breaking any current test.
+    //
+    // Mixed-batch repro: scheduler completes two tools in the same
+    // batch — one whose callId already has a fr in history (deduped),
+    // one whose callId is fresh (must reach sendMessageStream). Pin:
+    //   (a) markToolsAsSubmitted called with BOTH callIds,
+    //   (b) recordCompletedToolCall fires once per non-deduped tool,
+    //       NOT twice for the deduped one,
+    //   (c) sendMessageStream IS called (the non-deduped tool's real
+    //       result must reach the wire).
+    const dedupedTool = {
+      request: {
+        callId: 'call_mixed_deduped',
+        name: 'read_file',
+        args: { path: '/tmp/d.txt' },
+        isClientInitiated: false,
+        prompt_id: 'prompt-mixed',
+      },
+      status: 'success',
+      responseSubmittedToGemini: false,
+      response: {
+        callId: 'call_mixed_deduped',
+        responseParts: [
+          {
+            functionResponse: {
+              id: 'call_mixed_deduped',
+              name: 'read_file',
+              response: { output: 'late real for deduped' },
+            },
+          },
+        ],
+        resultDisplay: undefined,
+        error: undefined,
+        errorType: undefined,
+      },
+      tool: {
+        name: 'read_file',
+        displayName: 'ReadFile',
+        description: 'Read a file',
+        build: vi.fn(),
+      } as any,
+      invocation: {
+        getDescription: () => 'read /tmp/d.txt',
+      } as unknown as AnyToolInvocation,
+    } as unknown as TrackedCompletedToolCall;
+
+    const freshTool = {
+      request: {
+        callId: 'call_mixed_fresh',
+        name: 'read_file',
+        args: { path: '/tmp/f.txt' },
+        isClientInitiated: false,
+        prompt_id: 'prompt-mixed',
+      },
+      status: 'success',
+      responseSubmittedToGemini: false,
+      response: {
+        callId: 'call_mixed_fresh',
+        responseParts: [
+          {
+            functionResponse: {
+              id: 'call_mixed_fresh',
+              name: 'read_file',
+              response: { output: 'real for fresh' },
+            },
+          },
+        ],
+        resultDisplay: undefined,
+        error: undefined,
+        errorType: undefined,
+      },
+      tool: {
+        name: 'read_file',
+        displayName: 'ReadFile',
+        description: 'Read a file',
+        build: vi.fn(),
+      } as any,
+      invocation: {
+        getDescription: () => 'read /tmp/f.txt',
+      } as unknown as AnyToolInvocation,
+    } as unknown as TrackedCompletedToolCall;
+
+    const client = new MockedGeminiClientClass(mockConfig);
+    // History has fr for ONLY the deduped callId — `call_mixed_fresh`
+    // is not paired and must flow through to sendMessageStream.
+    client.getHistory = vi.fn().mockReturnValue([
+      { role: 'user', parts: [{ text: 'kick off' }] },
+      {
+        role: 'model',
+        parts: [
+          {
+            functionCall: {
+              id: 'call_mixed_deduped',
+              name: 'read_file',
+              args: { path: '/tmp/d.txt' },
+            },
+          },
+          {
+            functionCall: {
+              id: 'call_mixed_fresh',
+              name: 'read_file',
+              args: { path: '/tmp/f.txt' },
+            },
+          },
+        ],
+      },
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              id: 'call_mixed_deduped',
+              name: 'read_file',
+              response: { error: 'Tool execution result was not recorded' },
+            },
+          },
+        ],
+      },
+    ]);
+
+    let capturedOnComplete:
+      | ((completedTools: TrackedToolCall[]) => Promise<void>)
+      | null = null;
+    mockUseReactToolScheduler.mockImplementation((onComplete) => {
+      capturedOnComplete = onComplete;
+      return [[], mockScheduleToolCalls, mockMarkToolsAsSubmitted];
+    });
+
+    renderHook(() =>
+      useGeminiStream(
+        client,
+        [],
+        mockAddItem,
+        mockConfig,
+        mockLoadedSettings,
+        mockOnDebugMessage,
+        mockHandleSlashCommand,
+        false,
+        () => 'vscode' as EditorType,
+        () => {},
+        () => Promise.resolve(),
+        false,
+        () => {},
+        () => {},
+        () => {},
+        () => {},
+        80,
+        24,
+      ),
+    );
+
+    await act(async () => {
+      if (capturedOnComplete) {
+        await capturedOnComplete([dedupedTool, freshTool]);
+      }
+    });
+
+    await waitFor(() => {
+      // (a) Both callIds were marked submitted somewhere across the
+      // dedup pass (deduped) and the post-isResponding flow (fresh).
+      const allMarked = mockMarkToolsAsSubmitted.mock.calls.flatMap(
+        (call) => call[0] as string[],
+      );
+      expect(allMarked).toContain('call_mixed_deduped');
+      expect(allMarked).toContain('call_mixed_fresh');
+    });
+
+    // (b) recordCompletedToolCall fires EXACTLY once per tool (deduped
+    // gets one call from the dedup-loop; fresh gets one from the
+    // geminiTools loop). The filter is what prevents the double
+    // record on the deduped callId.
+    const recordedCallIds = (
+      client.recordCompletedToolCall as unknown as ReturnType<typeof vi.fn>
+    ).mock.calls.map((call) => (call[1] as { path: string }).path);
+    expect(recordedCallIds.filter((p) => p === '/tmp/d.txt').length).toBe(1);
+    expect(recordedCallIds.filter((p) => p === '/tmp/f.txt').length).toBe(1);
+
+    // (c) The fresh tool's real result reaches sendMessageStream —
+    // dedup didn't accidentally suppress it.
+    expect(mockSendMessageStream).toHaveBeenCalled();
+  });
+
   it('should not flicker streaming state to Idle between tool completion and submission', async () => {
     const toolCallResponseParts: PartListUnion = [
       { text: 'tool 1 final response' },
diff --git a/packages/cli/src/ui/hooks/useGeminiStream.ts b/packages/cli/src/ui/hooks/useGeminiStream.ts
index c414223b90c..bc8f15c5484 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -2044,12 +2044,32 @@ export const useGeminiStream = (
       // wire — same trade-off upstream Claude Code makes when its
       // `StreamingToolExecutor.discard()` follows a
       // `yieldMissingToolResultBlocks` synthesis (`query.ts:733` + `:984`).
-      const historyCallIdsWithResponse = new Set<string>();
-      // Guard the call: some test harnesses build a partial GeminiClient
-      // mock without `getHistory`. Skipping dedup in that case is safe —
-      // it just means tests that never set up the repair pre-condition
-      // run with the original (pre-dedup) submission shape.
-      if (geminiClient && typeof geminiClient.getHistory === 'function') {
+      // Walk raw history WITHOUT cloning — `geminiClient.getHistory()`
+      // returns `structuredClone(this.history)`, which on long sessions
+      // (200+ entries with sizable tool outputs) costs several ms on
+      // the React UI thread and visibly stalls streaming when the
+      // dedup pass runs on every tool-completion batch. The
+      // `getHistoryFunctionResponseIds` accessor walks history in
+      // place and only collects the id strings we actually need.
+      // Guard the call: some test harnesses build a partial
+      // GeminiClient mock without it. Skipping dedup in that case is
+      // safe — tests that never set up the repair pre-condition run
+      // with the original (pre-dedup) submission shape. We fall back
+      // to the cloning getHistory() path for older mocks that only
+      // expose that method, so legacy tests stay green.
+      // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+      let historyCallIdsWithResponse: Set<string>;
+      if (
+        geminiClient &&
+        typeof geminiClient.getHistoryFunctionResponseIds === 'function'
+      ) {
+        historyCallIdsWithResponse =
+          geminiClient.getHistoryFunctionResponseIds();
+      } else if (
+        geminiClient &&
+        typeof geminiClient.getHistory === 'function'
+      ) {
+        historyCallIdsWithResponse = new Set<string>();
         for (const entry of geminiClient.getHistory()) {
           if (entry.role !== 'user') continue;
           for (const part of entry.parts ?? []) {
@@ -2057,6 +2077,8 @@ export const useGeminiStream = (
             if (id) historyCallIdsWithResponse.add(id);
           }
         }
+      } else {
+        historyCallIdsWithResponse = new Set<string>();
       }
       const dedupedTools = completedAndReadyToSubmitTools.filter((tc) =>
         historyCallIdsWithResponse.has(tc.request.callId),
@@ -2104,8 +2126,15 @@ export const useGeminiStream = (
       }
 
       // Finalize any client-initiated tools as soon as they are done.
+      // Skip ones whose callId already lives in chat history with a
+      // matching `functionResponse` — the dedup block above already
+      // called `markToolsAsSubmitted` for those, and re-dispatching
+      // the same callIds here would queue an extra React render.
+      // qwen-latest-series-invite-beta-v34 thread on PR #4176.
       const clientTools = completedAndReadyToSubmitTools.filter(
-        (t) => t.request.isClientInitiated,
+        (t) =>
+          t.request.isClientInitiated &&
+          !historyCallIdsWithResponse.has(t.request.callId),
       );
       if (clientTools.length > 0) {
         markToolsAsSubmitted(clientTools.map((t) => t.request.callId));
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index a69071d0a9b..cef37e3e670 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -314,6 +314,22 @@ export class GeminiClient {
     return this.getChat().getHistoryTail(count, curated);
   }
 
+  /**
+   * Walk-only accessor for the set of `functionResponse.id` strings in
+   * raw history. Callers that only need the dedup id set (notably
+   * `useGeminiStream.handleCompletedTools`) MUST prefer this over
+   * {@link getHistory}, which deep-clones the entire conversation via
+   * `structuredClone` on every call. On long sessions with sizable
+   * tool outputs the clone is a multi-millisecond hit on the React UI
+   * thread; running it on every tool-completion batch caused visible
+   * frame drops during streaming. See
+   * `GeminiChat.getHistoryFunctionResponseIds` for the implementation
+   * and the qwen-latest-series-invite-beta-v34 thread on PR #4176.
+   */
+  getHistoryFunctionResponseIds(): Set<string> {
+    return this.getChat().getHistoryFunctionResponseIds();
+  }
+
   /**
    * Pop orphaned trailing user entries from the in-memory chat history.
    * Used by:
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index c498ef58e26..cf383ab2038 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -2524,6 +2524,98 @@ describe('GeminiChat', async () => {
       }
     });
 
+    it('rolls back the partial assistant turn when an InvalidStreamError fires after a tool_use chunk on the transient-stream retry budget (qwen-latest-series-invite-beta-v34 thread on PR #4176)', async () => {
+      // Counterpart to the rate-limit rollback above. The
+      // transient-stream retry budget (NO_FINISH_REASON /
+      // NO_RESPONSE_TEXT) has its own popPartialIfPushed call site —
+      // separate from the rate-limit branch the existing test
+      // covers. Without a regression test, that call could be
+      // accidentally removed and the rate-limit test would still
+      // pass while a stale partial silently rode the retry.
+      vi.useFakeTimers();
+      try {
+        const failingStream = (async function* () {
+          yield {
+            candidates: [
+              {
+                content: {
+                  parts: [
+                    {
+                      functionCall: {
+                        id: 'call_transient_retry_partial',
+                        name: 'read_file',
+                        args: { path: '/tmp/t.txt' },
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+          // Mid-tool_use cut without a finish reason — the transient-
+          // stream retry budget catches this and retries with delay.
+          throw new InvalidStreamError(
+            'Model stream ended without a finish reason.',
+            'NO_FINISH_REASON',
+          );
+        })();
+        const successStream = (async function* () {
+          yield {
+            candidates: [
+              {
+                content: { parts: [{ text: 'Recovered on retry' }] },
+                finishReason: 'STOP',
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+        })();
+        vi.mocked(mockContentGenerator.generateContentStream)
+          .mockResolvedValueOnce(failingStream)
+          .mockResolvedValueOnce(successStream);
+
+        const stream = await chat.sendMessageStream(
+          'test-model',
+          { message: 'test' },
+          'prompt-rollback-transient',
+        );
+        const iterator = stream[Symbol.asyncIterator]();
+        // Advance through the transient-retry delay (initial 2000 ms).
+        for (;;) {
+          const next = iterator.next();
+          await vi.advanceTimersByTimeAsync(5_000);
+          const r = await next;
+          if (r.done) break;
+        }
+
+        const history = chat.getHistory();
+        // Final shape must be clean: [user, model(success text)].
+        // The failed attempt's partial functionCall must NOT survive.
+        expect(history.length).toBe(2);
+        expect(history[0]!.role).toBe('user');
+        expect(history[1]!.role).toBe('model');
+        expect(history[1]!.parts!.find((p) => p.text)?.text).toBe(
+          'Recovered on retry',
+        );
+        expect(history.some((h) => h.parts?.some((p) => p.functionCall))).toBe(
+          false,
+        );
+      } finally {
+        vi.useRealTimers();
+      }
+    });
+
+    // NOTE: no test for the InvalidStreamError content-retry branch
+    // (geminiChat.ts ~line 1399). Verified unreachable for that error
+    // class: `isTransientStreamError` and `isContentError` are the
+    // same predicate (`error instanceof InvalidStreamError`), so the
+    // transient branch above always either `continue`s or `break`s
+    // before control reaches the content branch. The
+    // `popPartialIfPushed()` call there is preserved as
+    // defense-in-depth for a future error class that should diverge
+    // the predicates; see the comment block at that call site for
+    // the full analysis. qwen-latest-series-invite-beta-v34 thread
+    // on PR #4176.
+
     it('rolls back the chat-recording entry too when the retry succeeds (yiliang114 PR #4176 follow-up)', async () => {
       // The in-memory rollback test above asserts `this.history` ends
       // clean after a retry-success. This test asserts the same about
@@ -3775,6 +3867,161 @@ describe('GeminiChat', async () => {
     });
   });
 
+  describe('partial-push marker invariants on history mutation (qwen-latest-series-invite-beta-v34 thread on PR #4176)', () => {
+    // The whole partial-push lifecycle relies on the invariant
+    //   "every history-mutation method clears the partial-push markers"
+    // — six sites enforce it (clearHistory, addHistory, setHistory,
+    // truncateHistory, stripThoughtsFromHistory,
+    // stripOrphanedUserEntriesFromHistory). If any site forgets, a
+    // stale `pendingPartialAssistantTurnIndex` could line up with an
+    // unrelated model turn in the post-mutation history and cause
+    // `popPartialIfPushed` to splice the WRONG entry — silently losing
+    // a real assistant response.
+    //
+    // The markers are ephemeral within a single sendMessageStream
+    // call: the `finally` block flushes the deferred JSONL record
+    // and calls `clearPendingPartialState()` before the generator
+    // unwinds. So we can't observe non-null markers after a real
+    // mid-stream error completes — by that point the lifecycle has
+    // already cleared them. Instead, we plant the markers directly
+    // via the same private-field assignment the production code uses,
+    // then call each mutation method and verify both fields are reset
+    // in lockstep. This pins the invariant against future refactors
+    // that drop a `clearPendingPartialState()` call from one site
+    // while the other five still pass.
+    type PrivateFields = {
+      pendingPartialAssistantTurnIndex: number | null;
+      pendingPartialAssistantRecord: unknown;
+    };
+    function plantMarkers(c: GeminiChat): void {
+      const internal = c as unknown as PrivateFields;
+      internal.pendingPartialAssistantTurnIndex = 0;
+      internal.pendingPartialAssistantRecord = {
+        model: 'test-model',
+        message: [{ functionCall: { id: 'call_test', name: 't', args: {} } }],
+      };
+    }
+    function markers(c: GeminiChat): {
+      idx: number | null;
+      record: unknown;
+    } {
+      const internal = c as unknown as PrivateFields;
+      return {
+        idx: internal.pendingPartialAssistantTurnIndex,
+        record: internal.pendingPartialAssistantRecord,
+      };
+    }
+
+    it('clearHistory() clears the partial-push markers', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'kick off' }] },
+        {
+          role: 'model',
+          parts: [{ functionCall: { id: 'x', name: 't', args: {} } }],
+        },
+      ]);
+      plantMarkers(chat);
+      expect(markers(chat).idx).toBe(0);
+
+      chat.clearHistory();
+
+      expect(markers(chat).idx).toBeNull();
+      expect(markers(chat).record).toBeNull();
+    });
+
+    it('addHistory() clears the partial-push markers (violation path)', () => {
+      // addHistory is documented to be called between sends, NOT
+      // mid-send. Calling it with markers active is a violation —
+      // the implementation logs a warn so the offending caller is
+      // visible in diagnostics, then clears the markers.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'kick off' }] },
+        {
+          role: 'model',
+          parts: [{ functionCall: { id: 'x', name: 't', args: {} } }],
+        },
+      ]);
+      plantMarkers(chat);
+      expect(markers(chat).idx).toBe(0);
+
+      chat.addHistory({ role: 'user', parts: [{ text: 'between sends' }] });
+
+      expect(markers(chat).idx).toBeNull();
+      expect(markers(chat).record).toBeNull();
+    });
+
+    it('setHistory() clears the partial-push markers', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'kick off' }] },
+        {
+          role: 'model',
+          parts: [{ functionCall: { id: 'x', name: 't', args: {} } }],
+        },
+      ]);
+      plantMarkers(chat);
+      expect(markers(chat).idx).toBe(0);
+
+      chat.setHistory([{ role: 'user', parts: [{ text: 'replacement' }] }]);
+
+      expect(markers(chat).idx).toBeNull();
+      expect(markers(chat).record).toBeNull();
+    });
+
+    it('truncateHistory() clears the partial-push markers', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'kick off' }] },
+        {
+          role: 'model',
+          parts: [{ functionCall: { id: 'x', name: 't', args: {} } }],
+        },
+      ]);
+      plantMarkers(chat);
+      expect(markers(chat).idx).toBe(0);
+
+      chat.truncateHistory(1);
+
+      expect(markers(chat).idx).toBeNull();
+      expect(markers(chat).record).toBeNull();
+    });
+
+    it('stripThoughtsFromHistory() clears the partial-push markers', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'kick off' }] },
+        {
+          role: 'model',
+          parts: [{ functionCall: { id: 'x', name: 't', args: {} } }],
+        },
+      ]);
+      plantMarkers(chat);
+      expect(markers(chat).idx).toBe(0);
+
+      chat.stripThoughtsFromHistory();
+
+      expect(markers(chat).idx).toBeNull();
+      expect(markers(chat).record).toBeNull();
+    });
+
+    it('stripOrphanedUserEntriesFromHistory() clears the partial-push markers', () => {
+      // History tail is a model turn — strip is a no-op on history,
+      // but the marker reset must still fire so all six mutation
+      // sites stay uniform.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'kick off' }] },
+        {
+          role: 'model',
+          parts: [{ functionCall: { id: 'x', name: 't', args: {} } }],
+        },
+      ]);
+      plantMarkers(chat);
+      expect(markers(chat).idx).toBe(0);
+
+      chat.stripOrphanedUserEntriesFromHistory();
+
+      expect(markers(chat).idx).toBeNull();
+      expect(markers(chat).record).toBeNull();
+    });
+  });
+
   describe('repairOrphanedToolUseTurns', () => {
     // Verifies the inverse-of-strip pass: every `model[functionCall]`
     // without a matching `user[functionResponse]` in the next turn gets
@@ -4265,6 +4512,138 @@ describe('GeminiChat', async () => {
       expect(history[2]!.parts![1]).toEqual({ text: 'never mind' });
       expect(history[3]!.parts).toEqual([{ text: 'thanks anyway' }]);
     });
+
+    it('drops duplicate functionResponse entries for the same callId across user turns (gpt-5.5 thread on PR #4176)', () => {
+      // Critical regression: when the same callId is echoed back more
+      // than once (e.g. the React scheduler retries the late submitQuery
+      // after the orphan repair already planted one, or two parallel
+      // late-submit paths land), hoisting only the first leaves the
+      // duplicate behind. The wire payload then serializes
+      //   `model[tool_use] -> user[tool_result] -> user[tool_result]`
+      // and Anthropic-compatible backends reject the trailing block as
+      // an orphan, re-wedging the session. The repair MUST hoist one
+      // canonical fr into the adjacent turn AND delete every duplicate.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'open file' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: { id: 'cid_dup', name: 'read_file', args: {} },
+            },
+          ],
+        },
+        { role: 'user', parts: [{ text: 'never mind' }] },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_dup',
+                name: 'read_file',
+                response: { output: 'data' },
+              },
+            },
+          ],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_dup',
+                name: 'read_file',
+                response: { output: 'data' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      expect(result.injected).toEqual([]);
+      const history = chat.getHistory();
+      // 5 → 3: both source turns held only the duplicate fr, so both
+      // are removed; the canonical fr is hoisted into history[2] and
+      // sits at the head before the text part.
+      expect(history.length).toBe(3);
+      expect(history[2]!.parts![0]!.functionResponse?.id).toBe('cid_dup');
+      expect(history[2]!.parts![1]).toEqual({ text: 'never mind' });
+      // No fr for cid_dup remains anywhere AFTER the adjacent turn.
+      const trailingHasDup = history
+        .slice(3)
+        .some((entry) =>
+          (entry.parts ?? []).some(
+            (part) => part.functionResponse?.id === 'cid_dup',
+          ),
+        );
+      expect(trailingHasDup).toBe(false);
+    });
+
+    it('drops duplicate fr even when the canonical copy is already in the adjacent turn', () => {
+      // Variant of the duplicate case where the FIRST fr lands in the
+      // immediate next user turn (no hoist needed) but a second
+      // duplicate copy is in a later user turn. The hoist branch is
+      // skipped, but duplicate cleanup must still fire — otherwise the
+      // wire payload still has two `tool_result` blocks for the same id.
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'kick off' }] },
+        {
+          role: 'model',
+          parts: [
+            {
+              functionCall: { id: 'cid_adj_dup', name: 'read_file', args: {} },
+            },
+          ],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_adj_dup',
+                name: 'read_file',
+                response: { output: 'real' },
+              },
+            },
+          ],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_adj_dup',
+                name: 'read_file',
+                response: { output: 'real' },
+              },
+            },
+            { text: 'follow up' },
+          ],
+        },
+      ]);
+
+      const result = chat.repairOrphanedToolUseTurns();
+
+      expect(result.injected).toEqual([]);
+      const history = chat.getHistory();
+      // The source duplicate turn loses its fr but keeps its text part
+      // → 4 entries preserved, but the duplicate fr is gone.
+      expect(history.length).toBe(4);
+      expect(history[2]!.parts![0]!.functionResponse?.id).toBe('cid_adj_dup');
+      expect(history[2]!.parts!.length).toBe(1);
+      expect(history[3]!.parts).toEqual([{ text: 'follow up' }]);
+      // The model[fc] is followed by exactly one fr for that id across
+      // all subsequent user turns.
+      const allFrIds = history
+        .slice(2)
+        .flatMap((entry) =>
+          (entry.parts ?? []).map((p) => p.functionResponse?.id),
+        )
+        .filter((id): id is string => Boolean(id));
+      expect(allFrIds).toEqual(['cid_adj_dup']);
+    });
   });
 
   describe('output token recovery', () => {
@@ -4470,6 +4849,104 @@ describe('GeminiChat', async () => {
       expect(lastEntry.parts!.length).toBeGreaterThan(0);
     });
 
+    it('should pop both the partial model turn AND the recovery user message when recovery throws after a functionCall (qwen-latest-series-invite-beta-v34 thread on PR #4176)', async () => {
+      // Critical regression for the recovery catch's pop ordering.
+      // When the recovery stream yields a `functionCall` chunk and
+      // then throws, `processStreamResponse` pushes a partial `model`
+      // turn into history BEFORE re-throwing — so by the time the
+      // recovery catch runs, the trailing entries are
+      //   [..., user(OUTPUT_RECOVERY_MESSAGE), model(partial fc)]
+      // The naive "if last is user, pop" check would no-op here (last
+      // is now `model`), leaving the OUTPUT_RECOVERY_MESSAGE control
+      // prompt stranded as a real user turn. The catch must pop the
+      // partial model turn FIRST, then the recovery user turn, and
+      // clear the partial-push markers so the outer `finally` JSONL
+      // flush doesn't resurrect the partial we just deleted.
+      const streams = [
+        // Initial: text + MAX_TOKENS → triggers escalation.
+        makeStream([makeChunk([{ text: 'initial' }], 'MAX_TOKENS')]),
+        // Escalated: text + MAX_TOKENS → triggers recovery iteration 1.
+        makeStream([makeChunk([{ text: 'escalated' }], 'MAX_TOKENS')]),
+        // Recovery iter 1: yields functionCall chunk, then throws.
+        // processStreamResponse pushes a partial model turn before
+        // re-throwing the synthetic error.
+        (async function* () {
+          yield {
+            candidates: [
+              {
+                content: {
+                  parts: [
+                    {
+                      functionCall: {
+                        id: 'call_recovery_throw',
+                        name: 'read_file',
+                        args: { path: '/tmp/r.txt' },
+                      },
+                    },
+                  ],
+                },
+              },
+            ],
+          } as unknown as GenerateContentResponse;
+          throw new Error('synthetic recovery mid-tool_use cut');
+        })(),
+      ];
+      let callIndex = 0;
+      vi.mocked(mockContentGenerator.generateContentStream).mockImplementation(
+        async () => streams[callIndex++]!,
+      );
+
+      const stream = await chat.sendMessageStream(
+        'gemini-3-pro',
+        { message: 'recovery throws after functionCall' },
+        'prompt-recovery-fc-throw',
+      );
+
+      // Consume; the catch swallows the error and emits a synthetic
+      // STOP chunk so the consumer sees a clean termination.
+      for await (const _ of stream) {
+        /* consume */
+      }
+
+      const history = chat.getHistory();
+
+      // OUTPUT_RECOVERY_MESSAGE must NOT appear anywhere in history.
+      // The pop-ordering bug strands it as a real user turn that then
+      // pollutes durable history and biases later turns.
+      const flattened = JSON.stringify(history);
+      expect(flattened).not.toContain('Output token limit hit');
+      expect(flattened).not.toContain('Resume directly');
+
+      // The partial model[functionCall] from the recovery throw must
+      // also be popped — leaving it would create a dangling tool_use
+      // that the inline repair on the next sendMessageStream would
+      // synthesize an `error` functionResponse for, and the React
+      // scheduler's late real result would be dropped by the
+      // history-based dedup. Symptom: model sees an "execution result
+      // was not recorded" error for a tool that actually succeeded.
+      const stillHasPartialFc = history.some((entry) =>
+        (entry.parts ?? []).some(
+          (part) => part.functionCall?.id === 'call_recovery_throw',
+        ),
+      );
+      expect(stillHasPartialFc).toBe(false);
+
+      // Roles must strictly alternate (no consecutive same-role) so
+      // providers don't reject the next turn.
+      for (let i = 1; i < history.length; i++) {
+        expect(history[i]!.role).not.toBe(history[i - 1]!.role);
+      }
+
+      // History tail should be the escalated model response (text:
+      // 'escalated'), preserved as the user-visible answer.
+      const lastEntry = history[history.length - 1]!;
+      expect(lastEntry.role).toBe('model');
+      const lastModelText = (lastEntry.parts ?? [])
+        .map((p) => ('text' in p ? ((p as { text?: string }).text ?? '') : ''))
+        .join('');
+      expect(lastModelText).toContain('escalated');
+    });
+
     it('should stop recovery mid-loop when a later iteration emits functionCall', async () => {
       // Covers the cross-iteration guard: iter 1 returns plain text (recovery
       // proceeds), iter 2 returns a functionCall (recovery must break before
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 736c07bef6f..6dcc6df8742 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -486,9 +486,25 @@ export function repairOrphanedToolUseTurns(
     //    they reject with "tool_use_id ... must have a corresponding
     //    tool_use block in the previous message". gpt-5.5 review thread
     //    on PR #4176.
+    // Collect EVERY (turnIdx, partIdx, part) for every functionResponse
+    // we encounter, keyed by id. Storing all locations (not just the
+    // first) is load-bearing for the duplicate case: if the same callId
+    // is echoed back more than once across the consecutive user turns
+    // (e.g. `model[fc id=cid], user[text], user[fr cid], user[fr cid]`
+    // — possible when the React scheduler retries the late submitQuery
+    // and a duplicate fr lands), hoisting only the first leaves the
+    // duplicate(s) behind in a non-adjacent later user turn. The wire
+    // payload then serializes
+    //   `model[tool_use] -> user[tool_result] -> user[tool_result]`
+    // and the backend rejects the trailing block as an orphan
+    // ("tool_use_id ... must have a corresponding tool_use block in the
+    // previous message"), so the session stays wedged for the same 400
+    // class this repair pass exists to escape. We MUST drop every
+    // duplicate; the survivor at history[i+1] is whichever copy we
+    // hoist, and the rest are erased. (gpt-5.5 thread on PR #4176.)
     const matched = new Map<
       string,
-      { turnIdx: number; partIdx: number; part: Part }
+      Array<{ turnIdx: number; partIdx: number; part: Part }>
     >();
     let scanIdx = i + 1;
     while (scanIdx < history.length && history[scanIdx]?.role === 'user') {
@@ -496,29 +512,55 @@ export function repairOrphanedToolUseTurns(
       for (let pIdx = 0; pIdx < parts.length; pIdx++) {
         const part = parts[pIdx];
         const id = part.functionResponse?.id;
-        if (id && !matched.has(id)) {
-          matched.set(id, { turnIdx: scanIdx, partIdx: pIdx, part });
+        if (id) {
+          const list = matched.get(id);
+          if (list) list.push({ turnIdx: scanIdx, partIdx: pIdx, part });
+          else matched.set(id, [{ turnIdx: scanIdx, partIdx: pIdx, part }]);
         }
       }
       scanIdx++;
     }
 
     const synthesizeIds: Array<[string, string]> = [];
-    const hoistLocations: Array<{
-      turnIdx: number;
-      partIdx: number;
-      part: Part;
-    }> = [];
+    const hoistedParts: Part[] = [];
+    // Removal targets cover BOTH:
+    //  - the survivor's original location for ids being hoisted (we
+    //    move it into history[i+1])
+    //  - every duplicate copy of an id (hoisted or already-adjacent)
+    // For an id whose canonical fr is already in history[i+1] but has
+    // duplicates further down, we don't hoist (no relocation needed)
+    // but we still must drop the duplicates so the wire format only
+    // contains one fr per call.
+    const allRemovalTargets: Array<{ turnIdx: number; partIdx: number }> = [];
     for (const [id, name] of expected) {
-      const loc = matched.get(id);
-      if (!loc) {
+      const locations = matched.get(id);
+      if (!locations || locations.length === 0) {
         synthesizeIds.push([id, name]);
-      } else if (loc.turnIdx !== i + 1) {
-        hoistLocations.push(loc);
+        continue;
+      }
+      // Pick the first location's part as the canonical survivor (any
+      // copy works — the response payload should be identical for the
+      // same callId; if they differ the wire is already corrupt and the
+      // backend will reject regardless).
+      const survivor = locations[0]!;
+      if (survivor.turnIdx !== i + 1) {
+        // Hoist: the canonical part needs to move into history[i+1].
+        hoistedParts.push(survivor.part);
+        allRemovalTargets.push({
+          turnIdx: survivor.turnIdx,
+          partIdx: survivor.partIdx,
+        });
+      }
+      // else: canonical already adjacent — no relocation.
+      // Either way, drop EVERY duplicate beyond the first.
+      for (let k = 1; k < locations.length; k++) {
+        allRemovalTargets.push({
+          turnIdx: locations[k]!.turnIdx,
+          partIdx: locations[k]!.partIdx,
+        });
       }
-      // else: already in the immediate next user turn — wire format ok.
     }
-    if (synthesizeIds.length === 0 && hoistLocations.length === 0) continue;
+    if (synthesizeIds.length === 0 && allRemovalTargets.length === 0) continue;
 
     const syntheticParts: Part[] = synthesizeIds.map(([callId, name]) => ({
       functionResponse: {
@@ -527,23 +569,19 @@ export function repairOrphanedToolUseTurns(
         response: { error: reason },
       },
     }));
-    // Hoisted parts come from REAL tool_results elsewhere in history.
-    // We capture references first, then splice them out below in
-    // descending order so earlier removals don't shift later indices.
-    const hoistedParts: Part[] = hoistLocations.map((loc) => loc.part);
     // Synthetics first, then hoisted. Order between synthetic and
     // hoisted is internal — backends just need ALL tool_results at the
     // head of the user turn, regardless of order among themselves.
     const partsToInject: Part[] = [...syntheticParts, ...hoistedParts];
 
-    // Remove hoisted parts from their original turns. Sort by
-    // (turnIdx desc, partIdx desc) so each splice operates on stable
-    // indices for everything still to be removed.
-    const removalOrder = [...hoistLocations].sort((a, b) => {
+    // Remove every removal target (hoist survivors + all duplicates).
+    // Sort by (turnIdx desc, partIdx desc) so each splice operates on
+    // stable indices for everything still to be removed.
+    allRemovalTargets.sort((a, b) => {
       if (a.turnIdx !== b.turnIdx) return b.turnIdx - a.turnIdx;
       return b.partIdx - a.partIdx;
     });
-    for (const loc of removalOrder) {
+    for (const loc of allRemovalTargets) {
       const turnParts = history[loc.turnIdx].parts;
       if (turnParts) turnParts.splice(loc.partIdx, 1);
     }
@@ -1357,7 +1395,18 @@ export class GeminiChat {
               break;
             }
 
-            // Other content validation errors (e.g. NO_FINISH_REASON).
+            // Currently unreachable for `InvalidStreamError`. The
+            // `isContentError` predicate is identical to
+            // `isTransientStreamError` (`error instanceof InvalidStreamError`),
+            // and the transient branch above already either continued or
+            // broke for that class. The branch is preserved as
+            // defense-in-depth: a future error class that should consume
+            // its own content-retry budget but NOT the transient one
+            // could be threaded through here without re-deriving the
+            // popPartialIfPushed sequence. Reviewer thread on PR #4176
+            // (qwen-latest-series-invite-beta-v34) flagged the absence
+            // of a test — there is no reachable test path until the
+            // predicates diverge.
             const isContentError = error instanceof InvalidStreamError;
             if (isContentError) {
               if (attempt < INVALID_CONTENT_RETRY_OPTIONS.maxAttempts - 1) {
@@ -1499,6 +1548,37 @@ export class GeminiChat {
               // If a recovery attempt fails (e.g., empty response, network
               // error), stop recovering and let the partial output stand.
               // Pop the dangling recovery message to keep history valid.
+              //
+              // Order matters: when the recovery stream errors AFTER
+              // yielding a `functionCall` chunk, `processStreamResponse`
+              // pushes a partial `model` turn into history before
+              // re-throwing. The naive "if last is user, pop" check
+              // would then no-op (last is now the partial `model`),
+              // leaving `user(OUTPUT_RECOVERY_MESSAGE)` stranded as a
+              // real user turn the user never sent. Two consequences:
+              //  - the control-prompt text (which carries instructions
+              //    meant only for the model's own continuation context)
+              //    pollutes durable history and biases later turns,
+              //  - the inline repair on the next sendMessageStream
+              //    synthesizes an `error` `functionResponse` for the
+              //    dangling `functionCall`, which the
+              //    `handleCompletedTools` history-based dedup then drops
+              //    when the React scheduler's REAL tool result arrives,
+              //    so the model sees an "execution result was not
+              //    recorded" error for a tool that actually succeeded.
+              // Pop the partial model turn FIRST, then the recovery
+              // user turn. The partial-push markers are also cleared
+              // in lockstep so the outer `finally` JSONL flush can't
+              // resurrect a partial we just deleted from live history.
+              // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+              if (
+                self.pendingPartialAssistantTurnIndex !== null &&
+                self.history.length > 0 &&
+                self.history[self.history.length - 1].role === 'model'
+              ) {
+                self.history.pop();
+                self.clearPendingPartialState();
+              }
               if (
                 self.history.length > 0 &&
                 self.history[self.history.length - 1].role === 'user'
@@ -1585,9 +1665,28 @@ export class GeminiChat {
         // and stash are dropped together to preserve the
         // "marker non-null ⇔ stash non-null" invariant.
         if (self.pendingPartialAssistantRecord) {
-          self.chatRecordingService?.recordAssistantTurn(
-            self.pendingPartialAssistantRecord,
-          );
+          // Recording-service errors (disk full, write permission,
+          // serialization failure) MUST NOT propagate out of the
+          // generator's `finally` — that would mask the real send
+          // outcome (success or original throw) with a JSONL-write
+          // error the caller can't usefully act on. Instead, log and
+          // drop the record: the partial is already durable in
+          // `this.history`, so live behavior is unaffected; only the
+          // disk transcript loses this turn (eventual consistency
+          // restored on the next successful flush of any other turn).
+          // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+          try {
+            self.chatRecordingService?.recordAssistantTurn(
+              self.pendingPartialAssistantRecord,
+            );
+          } catch (recordErr) {
+            debugLogger.warn(
+              '[PARTIAL_FLUSH] Failed to persist deferred JSONL record: ' +
+                (recordErr instanceof Error
+                  ? recordErr.message
+                  : String(recordErr)),
+            );
+          }
           self.clearPendingPartialState();
         }
       }
@@ -1698,6 +1797,32 @@ export class GeminiChat {
     return this.history.length;
   }
 
+  /**
+   * Returns the set of `functionResponse.id` values present anywhere
+   * in user turns of the raw chat history. Walks `this.history` in
+   * place — no `structuredClone`, no per-part copy — so it is safe to
+   * call on hot paths (e.g. on every tool-completion batch in
+   * `useGeminiStream.handleCompletedTools`).
+   *
+   * The dedup pass only needs id strings; routing it through
+   * {@link getHistory} would deep-clone the entire conversation
+   * (recursive `structuredClone` over every part, including large tool
+   * outputs) on every batch and visibly stall the React UI thread on
+   * long sessions (200+ entries with sizable tool results).
+   * qwen-latest-series-invite-beta-v34 thread on PR #4176.
+   */
+  getHistoryFunctionResponseIds(): Set<string> {
+    const ids = new Set<string>();
+    for (const entry of this.history) {
+      if (entry.role !== 'user') continue;
+      for (const part of entry.parts ?? []) {
+        const id = part.functionResponse?.id;
+        if (id) ids.add(id);
+      }
+    }
+    return ids;
+  }
+
   /**
    * Clears the chat history.
    */
@@ -2056,10 +2181,15 @@ export class GeminiChat {
     // re-issues it; a stale partial-text model turn between them would
     // either bias the retry or surface as a duplicate.
     if (streamError !== null) {
-      if (
-        hasToolCall &&
-        (thoughtContentPart || consolidatedHistoryParts.length > 0)
-      ) {
+      // Reuse the `willPersistToHistory` gate from the recordAssistantTurn
+      // block above instead of re-deriving it. When `streamError !== null`,
+      // `willPersistToHistory` reduces to exactly the original expression
+      // `hasToolCall && (thoughtContentPart || consolidatedHistoryParts.length > 0)`;
+      // sharing the single binding eliminates drift risk if one gate is
+      // tightened without the other and the JSONL recording silently
+      // desyncs from in-memory history.
+      // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+      if (willPersistToHistory) {
         this.history.push({
           role: 'model',
           parts: [

From c30bba6e772e50cac1f23699b8a3b2d61472518c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Wed, 20 May 2026 18:16:16 +0800
Subject: [PATCH 14/18] fix(core,cli): close 3 review threads on
 partial-tool_use repair (PR #4176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three Suggestion threads from the qwen-latest-series-invite-beta-v34
pass on commit c8fa3143f.

[A] DIAGNOSTICS — `repairOrphanedToolUseTurns` return type now
carries a `droppedDuplicates: Array<{ callId; name }>` field
alongside `injected`. Previously a duplicate-only repair (no
synthesis, no hoist — the main scenario the prior commit added)
returned `injected.length === 0` and both call sites
(`repairOrphanedToolUseTurnsInHistory` in client.ts,
`sendMessageStream` inline in geminiChat.ts) only logged on
synthesis, leaving zero diagnostic trail. If a future callId
collision bug caused the wrong `functionResponse` to be dropped,
there was no breadcrumb pointing back to the repair function. Both
call sites now emit a `[REPAIR] Dropped N duplicate functionResponse(s)`
warn line tagged with their origin (session-load vs inline per-send)
so investigators can anchor to the exact pass.

[B] DEFENSIVE INVARIANT — max-tokens recovery catch now uses an
index-checked pop instead of a positional `history.pop()`. The
semantically equivalent `popPartialIfPushed` uses
`splice(idx, 1)` with an explicit bounds/role check and a
`debugLogger.warn` on mismatch; the recovery catch was the only
rollback site that pop'd whatever model entry happened to be last,
with zero diagnostic output. Today the invariant holds (nothing
mutates `this.history` between `processStreamResponse`'s push and
the for-await catch), but a future change that inserts a mutation
in that window — compression side-effect, abort-signal handler,
telemetry hook — would silently pop the wrong entry while
`clearPendingPartialState()` cleared markers for the actual
partial, leaving it permanently stranded. The new
`[RECOVERY_POP] Marker/last-index mismatch` warn surfaces any
future violation immediately.

[C] TEST COVERAGE — added 6 unit tests for
`GeminiChat.getHistoryFunctionResponseIds()` (empty, mixed
user/model, model[fc] ignored, duplicate collapse to Set,
malformed parts, no aliasing of internal state). Also wired the
mixed-batch dedup test in `useGeminiStream.test.tsx` through the
fast-path accessor (mock returns the dedup Set directly) and
asserted that `getHistoryFunctionResponseIds` IS called and the
legacy cloning `getHistory()` is NOT. The earlier version of that
test only mocked `getHistory()`, so the optimization the previous
commit added was never actually exercised — a regression that
dropped the fast-path branch from the dispatcher would re-route
every batch onto the slow `structuredClone` path with no test
failure. The new assertion makes that regression visible.

Tests: 353/353 (124 in geminiChat.test.ts +6, 96 in
useGeminiStream.test.tsx unchanged but mock surface expanded,
133 in client.test.ts unchanged). tsc + eslint + prettier clean
on changed files.
---
 .../cli/src/ui/hooks/useGeminiStream.test.tsx |  28 +++-
 packages/core/src/core/client.ts              |  15 ++
 packages/core/src/core/geminiChat.test.ts     | 158 ++++++++++++++++++
 packages/core/src/core/geminiChat.ts          |  63 ++++++-
 4 files changed, 258 insertions(+), 6 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index e951912cc6b..0e23793835f 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -1228,8 +1228,20 @@ describe('useGeminiStream', () => {
     } as unknown as TrackedCompletedToolCall;
 
     const client = new MockedGeminiClientClass(mockConfig);
-    // History has fr for ONLY the deduped callId — `call_mixed_fresh`
-    // is not paired and must flow through to sendMessageStream.
+    // Wire BOTH the fast-path accessor (`getHistoryFunctionResponseIds`)
+    // and the legacy `getHistory()` fallback. Wiring the fast path
+    // is the actual point of this test: production code prefers
+    // `getHistoryFunctionResponseIds` to skip the multi-millisecond
+    // `structuredClone` cost on long sessions, and an earlier
+    // version of this test only mocked `getHistory()` so the slow
+    // path was always the one exercised. We assert below that the
+    // fast path was the only one called — a regression that drops
+    // the fast-path branch from the dispatcher would silently
+    // re-route every batch onto the slow clone path with no test
+    // failure.
+    client.getHistoryFunctionResponseIds = vi
+      .fn()
+      .mockReturnValue(new Set(['call_mixed_deduped']));
     client.getHistory = vi.fn().mockReturnValue([
       { role: 'user', parts: [{ text: 'kick off' }] },
       {
@@ -1325,6 +1337,18 @@ describe('useGeminiStream', () => {
     // (c) The fresh tool's real result reaches sendMessageStream —
     // dedup didn't accidentally suppress it.
     expect(mockSendMessageStream).toHaveBeenCalled();
+
+    // (d) Fast-path was taken: `getHistoryFunctionResponseIds` was
+    // called for the dedup pass, and the cloning `getHistory()`
+    // fallback was NOT used by the dedup. (Other call sites in the
+    // hook may still call getHistory for their own purposes; we
+    // pin only that the dedup itself did not re-clone.) A future
+    // refactor that drops the fast-path branch from the dispatcher
+    // would re-route the dedup pass onto the structuredClone path
+    // and break this assertion — exactly the regression the
+    // accessor was added to prevent.
+    expect(client.getHistoryFunctionResponseIds).toHaveBeenCalled();
+    expect(client.getHistory).not.toHaveBeenCalled();
   });
 
   it('should not flicker streaming state to Idle between tool completion and submission', async () => {
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index cef37e3e670..0c92a51409d 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -397,6 +397,7 @@ export class GeminiClient {
    */
   repairOrphanedToolUseTurnsInHistory(reason?: string): {
     injected: Array<{ callId: string; name: string }>;
+    droppedDuplicates: Array<{ callId: string; name: string }>;
   } {
     const result = this.getChat().repairOrphanedToolUseTurns(reason);
     if (result.injected.length > 0) {
@@ -407,6 +408,20 @@ export class GeminiClient {
             .join(', ')}`,
       );
     }
+    if (result.droppedDuplicates.length > 0) {
+      // Surface the duplicate-cleanup pass so investigators tracing
+      // a dedup-drop log have a breadcrumb pointing back to the
+      // repair function. Without this a duplicate-only repair (no
+      // synthesis, no hoist) leaves zero diagnostic trail and a
+      // future callId-collision bug would silently delete the
+      // wrong fr. qwen-latest-series-invite-beta-v34 thread on PR #4176.
+      debugLogger.warn(
+        `[REPAIR] Dropped ${result.droppedDuplicates.length} duplicate ` +
+          `functionResponse(s) for callId(s): ${result.droppedDuplicates
+            .map((e) => `${e.name}(${e.callId})`)
+            .join(', ')}`,
+      );
+    }
     return result;
   }
 
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index cf383ab2038..833fdc167c6 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -9,6 +9,7 @@ import type {
   Content,
   GenerateContentConfig,
   GenerateContentResponse,
+  Part,
 } from '@google/genai';
 import { ApiError } from '@google/genai';
 import { AuthType, type ContentGenerator } from '../core/contentGenerator.js';
@@ -2222,6 +2223,163 @@ describe('GeminiChat', async () => {
     });
   });
 
+  describe('getHistoryFunctionResponseIds (qwen-latest-series-invite-beta-v34 thread on PR #4176)', () => {
+    // Walk-only accessor used by `useGeminiStream.handleCompletedTools`
+    // for the dedup pass. The whole point of this method is to avoid
+    // the multi-millisecond `structuredClone` hit that
+    // `getHistory()` pays on long sessions when only the id Set is
+    // needed. Pin the contract: returned Set contains every fr id
+    // present in user turns (including duplicates collapsed to one
+    // Set entry), and ignores parts that aren't functionResponses
+    // and turns that aren't user.
+    it('returns an empty Set for empty history', () => {
+      expect(chat.getHistoryFunctionResponseIds()).toEqual(new Set());
+    });
+
+    it('collects fr ids from user turns and ignores non-fr parts', () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'go' }] },
+        {
+          role: 'model',
+          parts: [
+            { functionCall: { id: 'cid_a', name: 'read_file', args: {} } },
+          ],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_a',
+                name: 'read_file',
+                response: { output: 'a' },
+              },
+            },
+            { text: 'follow up' },
+          ],
+        },
+      ]);
+
+      expect(chat.getHistoryFunctionResponseIds()).toEqual(new Set(['cid_a']));
+    });
+
+    it('skips functionCall parts in model turns (only user[fr] counts)', () => {
+      // Defensive: a regression that walks all turns instead of just
+      // user turns would pull in `functionCall.id`s and double-count.
+      chat.setHistory([
+        {
+          role: 'model',
+          parts: [
+            { functionCall: { id: 'cid_model', name: 'read_file', args: {} } },
+          ],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_user',
+                name: 'read_file',
+                response: { output: 'u' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const ids = chat.getHistoryFunctionResponseIds();
+      expect(ids).toEqual(new Set(['cid_user']));
+      expect(ids.has('cid_model')).toBe(false);
+    });
+
+    it('collapses duplicate fr ids across multiple user turns to one Set entry', () => {
+      // Same id echoed twice in different user turns: dedup callers
+      // only need to know "is this id paired anywhere", not the
+      // count, so a Set is sufficient and natural.
+      chat.setHistory([
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_dup',
+                name: 'read_file',
+                response: { output: '1' },
+              },
+            },
+          ],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_dup',
+                name: 'read_file',
+                response: { output: '2' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const ids = chat.getHistoryFunctionResponseIds();
+      expect(ids.size).toBe(1);
+      expect(ids.has('cid_dup')).toBe(true);
+    });
+
+    it('handles entries with no parts and parts with no functionResponse', () => {
+      // Defensive against malformed history (missing parts, parts
+      // with neither text nor fr): must not crash.
+      chat.setHistory([
+        { role: 'user', parts: undefined as unknown as Part[] },
+        { role: 'user', parts: [] },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_ok',
+                name: 'read_file',
+                response: { output: 'ok' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      expect(chat.getHistoryFunctionResponseIds()).toEqual(new Set(['cid_ok']));
+    });
+
+    it('does not deep-clone history (returns a fresh Set, not aliased to internal state)', () => {
+      // The whole reason this method exists is to avoid the
+      // structuredClone in getHistory(). Mutating the returned Set
+      // must not bleed into the next call.
+      chat.setHistory([
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'cid_immut',
+                name: 'read_file',
+                response: { output: 'v' },
+              },
+            },
+          ],
+        },
+      ]);
+
+      const first = chat.getHistoryFunctionResponseIds();
+      first.add('cid_FAKE');
+      first.delete('cid_immut');
+
+      const second = chat.getHistoryFunctionResponseIds();
+      expect(second.has('cid_immut')).toBe(true);
+      expect(second.has('cid_FAKE')).toBe(false);
+    });
+  });
+
   describe('getHistoryTail', () => {
     it('returns only the requested recent entries as a deep copy', () => {
       const oldContent: Content = { role: 'user', parts: [{ text: 'old' }] };
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 6dcc6df8742..34570b66e55 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -444,8 +444,20 @@ const ORPHAN_TOOL_USE_REPAIR_REASON =
 export function repairOrphanedToolUseTurns(
   history: Content[],
   reason: string = ORPHAN_TOOL_USE_REPAIR_REASON,
-): { injected: Array<{ callId: string; name: string }> } {
+): {
+  injected: Array<{ callId: string; name: string }>;
+  droppedDuplicates: Array<{ callId: string; name: string }>;
+} {
   const injected: Array<{ callId: string; name: string }> = [];
+  // Duplicates removed during the cleanup pass (any callId that had
+  // more than one `functionResponse` echo across the consecutive
+  // user turns). Returned alongside `injected` so call sites can log
+  // the cleanup — without this, a duplicate-only repair (no
+  // synthesis, no hoist) leaves zero diagnostic trail and a future
+  // callId-collision bug in the resolver could silently drop the
+  // wrong fr with no breadcrumb pointing here.
+  // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+  const droppedDuplicates: Array<{ callId: string; name: string }> = [];
 
   // Forward walk: i mutates as we splice, so use index-based iteration
   // and skip the freshly-inserted user turn to avoid re-scanning it.
@@ -558,6 +570,7 @@ export function repairOrphanedToolUseTurns(
           turnIdx: locations[k]!.turnIdx,
           partIdx: locations[k]!.partIdx,
         });
+        droppedDuplicates.push({ callId: id, name });
       }
     }
     if (synthesizeIds.length === 0 && allRemovalTargets.length === 0) continue;
@@ -649,7 +662,7 @@ export function repairOrphanedToolUseTurns(
     }
   }
 
-  return { injected };
+  return { injected, droppedDuplicates };
 }
 
 /**
@@ -1077,6 +1090,19 @@ export class GeminiChat {
               .join(', '),
         );
       }
+      if (inlineRepair.droppedDuplicates.length > 0) {
+        // Symmetrical with the synthesis log: a duplicate-only repair
+        // (no synthesis, no hoist) here would otherwise be silent.
+        // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+        debugLogger.warn(
+          `[REPAIR] sendMessageStream inline pass dropped ` +
+            `${inlineRepair.droppedDuplicates.length} duplicate ` +
+            `functionResponse(s): ` +
+            inlineRepair.droppedDuplicates
+              .map((entry) => `${entry.name}(${entry.callId})`)
+              .join(', '),
+        );
+      }
       requestContents = this.getHistory(true);
     } catch (error) {
       if (userContentAdded) {
@@ -1571,11 +1597,39 @@ export class GeminiChat {
               // in lockstep so the outer `finally` JSONL flush can't
               // resurrect a partial we just deleted from live history.
               // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+              // Index-checked pop instead of a positional `pop()` so
+              // we match the diagnostic standard set by
+              // `popPartialIfPushed` above (splice at `idx` + warn on
+              // bounds/role mismatch). The two rollback strategies
+              // share an undocumented positional assumption: nothing
+              // mutates `this.history` between
+              // `processStreamResponse`'s push and the for-await
+              // catch here. If a future change inserts a mutation in
+              // that window (compression side-effect, abort-signal
+              // handler, telemetry hook), a naked
+              // `history.pop()` would silently remove the wrong
+              // entry while `clearPendingPartialState()` clears
+              // markers for the actual partial — leaving it
+              // permanently stranded with no log trail. The warn
+              // makes any future violation visible immediately.
+              // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+              const expectedIdx = self.pendingPartialAssistantTurnIndex;
+              const lastIdx = self.history.length - 1;
               if (
-                self.pendingPartialAssistantTurnIndex !== null &&
+                expectedIdx !== null &&
                 self.history.length > 0 &&
-                self.history[self.history.length - 1].role === 'model'
+                self.history[lastIdx]?.role === 'model'
               ) {
+                if (expectedIdx !== lastIdx) {
+                  debugLogger.warn(
+                    `[RECOVERY_POP] Marker/last-index mismatch: ` +
+                      `marker=${expectedIdx}, lastIdx=${lastIdx}, ` +
+                      `historyLength=${self.history.length}. Popping ` +
+                      `last entry as best-effort rollback — investigate ` +
+                      `any history mutation between processStreamResponse's ` +
+                      `partial push and this catch.`,
+                  );
+                }
                 self.history.pop();
                 self.clearPendingPartialState();
               }
@@ -1948,6 +2002,7 @@ export class GeminiChat {
    */
   repairOrphanedToolUseTurns(reason?: string): {
     injected: Array<{ callId: string; name: string }>;
+    droppedDuplicates: Array<{ callId: string; name: string }>;
   } {
     return repairOrphanedToolUseTurns(this.history, reason);
   }

From 06a695156bd688b49d7be764113a69bcbbc82c40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Wed, 20 May 2026 21:29:11 +0800
Subject: [PATCH 15/18] test(cli): route existing dedup tests through fast-path
 accessor (PR #4176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One Suggestion thread from the deepseek-v4-pro pass on commit
c30bba6e7.

`MockedGeminiClientClass` did not expose `getHistoryFunctionResponseIds`,
so 3 of the 4 dedup tests in `useGeminiStream.test.tsx` fell through
to the `else if (typeof geminiClient.getHistory === 'function')`
branch — the `structuredClone(this.history)` slow path. Only the
mixed-batch test added in c30bba6e7 wired the fast path explicitly.
Net effect: a regression in `getHistoryFunctionResponseIds` (wrong
ids, missing ids, exception) would silently re-route every dedup
batch onto the slow clone path with all 4 tests still green, while
production paid the multi-millisecond UI-thread stall this PR
specifically added the accessor to avoid.

Fix in two parts:

1. Add `getHistoryFunctionResponseIds = vi.fn().mockReturnValue(new
   Set<string>())` to the `MockedGeminiClientClass` default. Every
   test now exposes the fast-path accessor, so the dispatcher takes
   the `getHistoryFunctionResponseIds` branch by default — matching
   production. Tests that need a non-empty dedup set override the
   mock explicitly.

2. Override the mock in each of the three previously-affected dedup
   tests with a `Set` containing the callId(s) their `getHistory()`
   fixture had paired. The dedup assertions stay identical — but
   the path the production code takes to reach them now matches
   the path under test.

Tests: 96/96 useGeminiStream (no new tests; existing dedup tests
now exercise the fast path the previous commit added). 353/353
across core + cli. tsc + eslint + prettier clean.
---
 .../cli/src/ui/hooks/useGeminiStream.test.tsx | 35 +++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index 0e23793835f..cb835cfff9a 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -53,6 +53,18 @@ const MockedGeminiClientClass = vi.hoisted(() =>
     this.addHistory = vi.fn();
     this.consumePendingMemoryTaskPromises = vi.fn().mockReturnValue([]);
     this.recordCompletedToolCall = vi.fn();
+    // Default to the fast-path accessor returning an empty Set so the
+    // dedup dispatcher in `handleCompletedTools` takes the
+    // `getHistoryFunctionResponseIds` branch by default (matching
+    // production). Tests that need a non-empty dedup set override
+    // this. Without exposing the method at all, the dispatcher would
+    // fall through to the `structuredClone(getHistory())` slow path
+    // and any regression in the fast path would silently route
+    // production onto the expensive branch while CI stays green.
+    // deepseek-v4-pro thread on PR #4176.
+    this.getHistoryFunctionResponseIds = vi
+      .fn()
+      .mockReturnValue(new Set<string>());
     this.getChatRecordingService = vi.fn().mockReturnValue({
       recordThought: vi.fn(),
       initialize: vi.fn(),
@@ -766,7 +778,14 @@ describe('useGeminiStream', () => {
     const client = new MockedGeminiClientClass(mockConfig);
     // Simulate the chat-internal repair pass having already planted a
     // synthetic functionResponse for the same callId on the previous
-    // (Retry) push.
+    // (Retry) push. The dedup dispatcher consults
+    // `getHistoryFunctionResponseIds` first; we override the default
+    // empty-Set mock to return the matching callId so the fast path
+    // is what production code exercises in this test (instead of
+    // falling through to the structuredClone slow path).
+    client.getHistoryFunctionResponseIds = vi
+      .fn()
+      .mockReturnValue(new Set(['call_race_A']));
     client.getHistory = vi.fn().mockReturnValue([
       { role: 'user', parts: [{ text: 'open /tmp/x.txt' }] },
       {
@@ -901,7 +920,13 @@ describe('useGeminiStream', () => {
     } as unknown as TrackedCancelledToolCall;
 
     const client = new MockedGeminiClientClass(mockConfig);
-    // Pre-paired in history: dedup will fire for this callId.
+    // Pre-paired in history: dedup will fire for this callId. Wire
+    // the fast-path accessor so the dispatcher takes the
+    // `getHistoryFunctionResponseIds` branch (matches production
+    // path; see the default mock comment in MockedGeminiClientClass).
+    client.getHistoryFunctionResponseIds = vi
+      .fn()
+      .mockReturnValue(new Set(['call_dedup_cancelled']));
     client.getHistory = vi.fn().mockReturnValue([
       { role: 'user', parts: [{ text: 'cancelled write' }] },
       {
@@ -1027,6 +1052,12 @@ describe('useGeminiStream', () => {
     } as unknown as TrackedCompletedToolCall;
 
     const client = new MockedGeminiClientClass(mockConfig);
+    // Wire the fast-path accessor so the dispatcher takes the
+    // `getHistoryFunctionResponseIds` branch (matches production
+    // path; see the default mock comment in MockedGeminiClientClass).
+    client.getHistoryFunctionResponseIds = vi
+      .fn()
+      .mockReturnValue(new Set(['call_race_A_responding']));
     client.getHistory = vi.fn().mockReturnValue([
       { role: 'user', parts: [{ text: 'open /tmp/y.txt' }] },
       {

From b27085a636fb188a703a0c1cea7c6fdfe26e1515 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Wed, 20 May 2026 22:37:16 +0800
Subject: [PATCH 16/18] refactor(core,cli): address yiliang114 review
 observations (PR #4176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five items from the yiliang114 review on commit 06a695156. Two P1
(observability + structural maintainability) and three editorial.

[A] P1 — repairOrphanedToolUseTurns refactored into three pure
phases: scanModelTurn / planRepair / applyRepair. Each runs in
isolation against narrow `ScanResult` / `RepairPlan` types, and only
the last mutates `history`. The outer forward-walk loop is now a
~25-line orchestrator: scan → bail on empty expected → plan → bail
on empty plan → apply → record `injected` + `droppedDuplicates` →
advance cursor past any freshly-inserted user turn. Index drift can
only happen in applyRepair, so an audit narrows to one function.
Behavior identical to the pre-refactor monolith — all 13 existing
repair tests pass unchanged.

[B] P1 — finally-block JSONL flush error log promoted from
`debugLogger.warn` to `debugLogger.error`. A persistent write
failure (disk full, permission denied, serialization broken)
silently loses every deferred partial after the first occurrence;
that's the class of failure that warrants monitoring attention, not
the warn category that fades into log noise. Single-occurrence
transient failures still surface as one error per occurrence.

[C] P2 — addHistory partial-marker log promoted to `debugLogger.error`
with `[INVARIANT_VIOLATION]` tag. The existing call graph cannot
legitimately hit this branch (it only fires when addHistory runs
mid-sendMessageStream, which no current caller does); any future
hit is a true bug in a new caller, not noise.

[D] P2 — removed the `getHistory()` fallback branch in
`useGeminiStream.handleCompletedTools`. Verified via grep: every
GeminiClient consumer (real production code + MockedGeminiClientClass
+ every individual dedup test) now exposes
`getHistoryFunctionResponseIds`, so the three-branch duck-type
dispatch was dead-code at production level and only existed to
support an interim cycle when the fast-path accessor was being
introduced. The dispatcher is now a one-liner; production path is
strictly the fast accessor, with an empty Set returned if `geminiClient`
itself is missing (only happens in hook-level unit tests with no
client at all).

[E] P2 — cleanup pass dropping reviewer/thread citations from code
comments (e.g. "qwen-latest-series-invite-beta-v34 thread on PR
#4176", "gpt-5.5 review thread", "deepseek-v4-pro thread",
"yiliang114 repro"). The "why" explanations stay — those are the
load-bearing context. Reviewer/thread attribution lives in the
PR history; embedding it in long-term comments was noise that
ages poorly. Touched 9 sites in geminiChat.ts, 2 in client.ts,
1 in useGeminiStream.ts, 4 in useGeminiStream.test.tsx, and 8 in
geminiChat.test.ts (test names + comment blocks).

Also: P2.4 (yiliang114 flagged a potential test-coverage regression
on TPM throttling + Retry-After delay) verified as a false alarm
— `should retry on TPM throttling StreamContentError with initial
delay` (geminiChat.test.ts:2959) and `should use Retry-After delay
for streamed rate-limit errors` (line 3035) both still exist.

Tests: 353/353 (no new tests; all existing pass through the
refactored repair function and the simplified dispatcher). tsc +
eslint + prettier clean.
---
 .../cli/src/ui/hooks/useGeminiStream.test.tsx |  17 +-
 packages/cli/src/ui/hooks/useGeminiStream.ts  |  43 +-
 packages/core/src/core/client.ts              |   5 +-
 packages/core/src/core/geminiChat.test.ts     |  42 +-
 packages/core/src/core/geminiChat.ts          | 497 ++++++++++--------
 5 files changed, 314 insertions(+), 290 deletions(-)

diff --git a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
index cb835cfff9a..8a7a9d8df3c 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@@ -61,7 +61,6 @@ const MockedGeminiClientClass = vi.hoisted(() =>
     // fall through to the `structuredClone(getHistory())` slow path
     // and any regression in the fast path would silently route
     // production onto the expensive branch while CI stays green.
-    // deepseek-v4-pro thread on PR #4176.
     this.getHistoryFunctionResponseIds = vi
       .fn()
       .mockReturnValue(new Set<string>());
@@ -862,9 +861,9 @@ describe('useGeminiStream', () => {
 
     // The deduped tool DID run locally — `recordCompletedToolCall` must
     // still fire so toolCallCount / skillsModifiedInSession reflect it,
-    // even though the wire-side submission is dropped. (Regression guard
-    // for the gap mimo-v2.5-pro flagged: filtering deduped tools out of
-    // `geminiTools` without recording skipped the metric increment.)
+    // even though the wire-side submission is dropped. Regression guard:
+    // an earlier version filtered deduped tools out of `geminiTools`
+    // without recording, skipping the metric increment.
     expect(client.recordCompletedToolCall).toHaveBeenCalledWith('read_file', {
       path: '/tmp/x.txt',
     });
@@ -879,10 +878,10 @@ describe('useGeminiStream', () => {
     // model-visible output — counting it via `recordCompletedToolCall`
     // (which increments toolCallCount and can flip
     // skillsModifiedInSession on a skill-write path) would inflate the
-    // metric for a call that never ran end-to-end. This test repros
-    // the deepseek-v4-pro thread on PR #4176: dedup must skip BOTH
-    // client-initiated (already skipped) AND cancelled tools, while
-    // still calling `markToolsAsSubmitted` so the scheduler unblocks.
+    // metric for a call that never ran end-to-end. Dedup must skip
+    // BOTH client-initiated (already skipped) AND cancelled tools,
+    // while still calling `markToolsAsSubmitted` so the scheduler
+    // unblocks.
     const cancelledDedupedTool = {
       request: {
         callId: 'call_dedup_cancelled',
@@ -1168,7 +1167,7 @@ describe('useGeminiStream', () => {
     releaseStream();
   });
 
-  it('handles a mixed batch (one deduped + one non-deduped) without double-counting telemetry (qwen-latest-series-invite-beta-v34 thread on PR #4176)', async () => {
+  it('handles a mixed batch (one deduped + one non-deduped) without double-counting telemetry', async () => {
     // The dedup filter on `geminiTools` (`!historyCallIdsWithResponse.has(callId)`)
     // is the only thing preventing double `recordCompletedToolCall`
     // for tools whose late real result lands AFTER the orphan-tool_use
diff --git a/packages/cli/src/ui/hooks/useGeminiStream.ts b/packages/cli/src/ui/hooks/useGeminiStream.ts
index bc8f15c5484..9cf3fed9c81 100644
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@@ -2048,38 +2048,16 @@ export const useGeminiStream = (
       // returns `structuredClone(this.history)`, which on long sessions
       // (200+ entries with sizable tool outputs) costs several ms on
       // the React UI thread and visibly stalls streaming when the
-      // dedup pass runs on every tool-completion batch. The
-      // `getHistoryFunctionResponseIds` accessor walks history in
-      // place and only collects the id strings we actually need.
-      // Guard the call: some test harnesses build a partial
-      // GeminiClient mock without it. Skipping dedup in that case is
-      // safe — tests that never set up the repair pre-condition run
-      // with the original (pre-dedup) submission shape. We fall back
-      // to the cloning getHistory() path for older mocks that only
-      // expose that method, so legacy tests stay green.
-      // qwen-latest-series-invite-beta-v34 thread on PR #4176.
-      let historyCallIdsWithResponse: Set<string>;
-      if (
-        geminiClient &&
-        typeof geminiClient.getHistoryFunctionResponseIds === 'function'
-      ) {
-        historyCallIdsWithResponse =
-          geminiClient.getHistoryFunctionResponseIds();
-      } else if (
-        geminiClient &&
-        typeof geminiClient.getHistory === 'function'
-      ) {
-        historyCallIdsWithResponse = new Set<string>();
-        for (const entry of geminiClient.getHistory()) {
-          if (entry.role !== 'user') continue;
-          for (const part of entry.parts ?? []) {
-            const id = part.functionResponse?.id;
-            if (id) historyCallIdsWithResponse.add(id);
-          }
-        }
-      } else {
-        historyCallIdsWithResponse = new Set<string>();
-      }
+      // dedup pass runs on every tool-completion batch.
+      // `getHistoryFunctionResponseIds` walks history in place and
+      // returns only the id Set this dispatcher needs. The
+      // GeminiClient implementation is mandatory — production and
+      // test mocks both expose it. Skip the dedup pass entirely if
+      // the client is missing (only happens in unit tests that
+      // construct a hook without a client).
+      const historyCallIdsWithResponse: Set<string> = geminiClient
+        ? geminiClient.getHistoryFunctionResponseIds()
+        : new Set<string>();
       const dedupedTools = completedAndReadyToSubmitTools.filter((tc) =>
         historyCallIdsWithResponse.has(tc.request.callId),
       );
@@ -2130,7 +2108,6 @@ export const useGeminiStream = (
       // matching `functionResponse` — the dedup block above already
       // called `markToolsAsSubmitted` for those, and re-dispatching
       // the same callIds here would queue an extra React render.
-      // qwen-latest-series-invite-beta-v34 thread on PR #4176.
       const clientTools = completedAndReadyToSubmitTools.filter(
         (t) =>
           t.request.isClientInitiated &&
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 0c92a51409d..4f4fc80969d 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -323,8 +323,7 @@ export class GeminiClient {
    * tool outputs the clone is a multi-millisecond hit on the React UI
    * thread; running it on every tool-completion batch caused visible
    * frame drops during streaming. See
-   * `GeminiChat.getHistoryFunctionResponseIds` for the implementation
-   * and the qwen-latest-series-invite-beta-v34 thread on PR #4176.
+   * `GeminiChat.getHistoryFunctionResponseIds` for the implementation.
    */
   getHistoryFunctionResponseIds(): Set<string> {
     return this.getChat().getHistoryFunctionResponseIds();
@@ -414,7 +413,7 @@ export class GeminiClient {
       // repair function. Without this a duplicate-only repair (no
       // synthesis, no hoist) leaves zero diagnostic trail and a
       // future callId-collision bug would silently delete the
-      // wrong fr. qwen-latest-series-invite-beta-v34 thread on PR #4176.
+      // wrong fr.
       debugLogger.warn(
         `[REPAIR] Dropped ${result.droppedDuplicates.length} duplicate ` +
           `functionResponse(s) for callId(s): ${result.droppedDuplicates
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index 833fdc167c6..33737d0d2ff 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -2223,7 +2223,7 @@ describe('GeminiChat', async () => {
     });
   });
 
-  describe('getHistoryFunctionResponseIds (qwen-latest-series-invite-beta-v34 thread on PR #4176)', () => {
+  describe('getHistoryFunctionResponseIds', () => {
     // Walk-only accessor used by `useGeminiStream.handleCompletedTools`
     // for the dedup pass. The whole point of this method is to avoid
     // the multi-millisecond `structuredClone` hit that
@@ -2602,11 +2602,11 @@ describe('GeminiChat', async () => {
     });
 
     it('rolls back the partial assistant turn when a retryable error fires after a tool_use chunk', async () => {
-      // Regression for the case @yiliang114 reproduced on #4176: stream
-      // attempt 1 yields a `functionCall` (which triggers the partial-
-      // history push in `processStreamResponse`), then the SAME stream
-      // throws a retryable error (e.g. a TPM 429 `StreamContentError`).
-      // The outer retry loop must drop the partial before issuing the
+      // Regression for a stream attempt that yields a `functionCall`
+      // (which triggers the partial-history push in
+      // `processStreamResponse`), then throws a retryable error (e.g.
+      // a TPM 429 `StreamContentError`). The outer retry loop must
+      // drop the partial before issuing the
       // retry — otherwise the retry's response lands as a SECOND
       // consecutive `model` entry and the failed-attempt `tool_use`
       // becomes orphan on the wire (invalid alternation +
@@ -2682,7 +2682,7 @@ describe('GeminiChat', async () => {
       }
     });
 
-    it('rolls back the partial assistant turn when an InvalidStreamError fires after a tool_use chunk on the transient-stream retry budget (qwen-latest-series-invite-beta-v34 thread on PR #4176)', async () => {
+    it('rolls back the partial assistant turn when an InvalidStreamError fires after a tool_use chunk on the transient-stream retry budget', async () => {
       // Counterpart to the rate-limit rollback above. The
       // transient-stream retry budget (NO_FINISH_REASON /
       // NO_RESPONSE_TEXT) has its own popPartialIfPushed call site —
@@ -2771,10 +2771,9 @@ describe('GeminiChat', async () => {
     // `popPartialIfPushed()` call there is preserved as
     // defense-in-depth for a future error class that should diverge
     // the predicates; see the comment block at that call site for
-    // the full analysis. qwen-latest-series-invite-beta-v34 thread
-    // on PR #4176.
+    // the full analysis.
 
-    it('rolls back the chat-recording entry too when the retry succeeds (yiliang114 PR #4176 follow-up)', async () => {
+    it('rolls back the chat-recording entry too when the retry succeeds', async () => {
       // The in-memory rollback test above asserts `this.history` ends
       // clean after a retry-success. This test asserts the same about
       // chat-recording JSONL: the failed attempt's `recordAssistantTurn`
@@ -4025,7 +4024,7 @@ describe('GeminiChat', async () => {
     });
   });
 
-  describe('partial-push marker invariants on history mutation (qwen-latest-series-invite-beta-v34 thread on PR #4176)', () => {
+  describe('partial-push marker invariants on history mutation', () => {
     // The whole partial-push lifecycle relies on the invariant
     //   "every history-mutation method clears the partial-push markers"
     // — six sites enforce it (clearHistory, addHistory, setHistory,
@@ -4499,15 +4498,14 @@ describe('GeminiChat', async () => {
     });
 
     it('hoists the real functionResponse from a non-adjacent later user turn into the adjacent one', () => {
-      // Regression for the gpt-5.5 thread on PR #4176: shape
-      // `[user, model[fc], user[text], user[fr_real]]` arises when the
-      // user aborts a long-running tool, types a follow-up text turn,
-      // and then the React scheduler's late submitQuery appends the
-      // real tool_result as a SEPARATE user entry.
+      // Regression for the shape
+      // `[user, model[fc], user[text], user[fr_real]]` — arises when
+      // the user aborts a long-running tool, types a follow-up text
+      // turn, and the React scheduler's late submitQuery then appends
+      // the real tool_result as a SEPARATE user entry.
       //
-      // Forward scanning alone (qwen-latest-series-invite-beta-v28
-      // thread, fixed in commit 2880de577) prevents the *synthesis*
-      // duplicate, but the wire layout is still
+      // Forward scanning alone prevents the *synthesis* duplicate,
+      // but the wire layout is still
       // `model[tool_use] → user[text] → user[tool_result]`, which
       // Anthropic-compatible backends reject because the tool_result
       // is not at the head of the IMMEDIATELY following user message.
@@ -4671,7 +4669,7 @@ describe('GeminiChat', async () => {
       expect(history[3]!.parts).toEqual([{ text: 'thanks anyway' }]);
     });
 
-    it('drops duplicate functionResponse entries for the same callId across user turns (gpt-5.5 thread on PR #4176)', () => {
+    it('drops duplicate functionResponse entries for the same callId across user turns', () => {
       // Critical regression: when the same callId is echoed back more
       // than once (e.g. the React scheduler retries the late submitQuery
       // after the orphan repair already planted one, or two parallel
@@ -5007,7 +5005,7 @@ describe('GeminiChat', async () => {
       expect(lastEntry.parts!.length).toBeGreaterThan(0);
     });
 
-    it('should pop both the partial model turn AND the recovery user message when recovery throws after a functionCall (qwen-latest-series-invite-beta-v34 thread on PR #4176)', async () => {
+    it('should pop both the partial model turn AND the recovery user message when recovery throws after a functionCall', async () => {
       // Critical regression for the recovery catch's pop ordering.
       // When the recovery stream yields a `functionCall` chunk and
       // then throws, `processStreamResponse` pushes a partial `model`
@@ -5200,7 +5198,7 @@ describe('GeminiChat', async () => {
       expect(mergedText).toBe('BCD');
     });
 
-    it('flushes the JSONL record when escalated stream throws mid-tool_use (qwen-latest-series-invite-beta-v28 thread on PR #4176)', async () => {
+    it('flushes the JSONL record when escalated stream throws mid-tool_use', async () => {
       // Critical regression for the max-tokens escalation path:
       // 1) initial stream succeeds with text + MAX_TOKENS → triggers
       //    escalation, no partial set, deferred record clean.
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 34570b66e55..7343acbb98a 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -416,8 +416,7 @@ const ORPHAN_TOOL_USE_REPAIR_REASON =
  *    exists somewhere later) but the wire layout still serializes
  *    `model[tool_use] → user[text] → user[tool_result]`, which
  *    Anthropic-compatible backends reject with "tool_use_id ... must have a
- *    corresponding tool_use block in the previous message". gpt-5.5 review
- *    thread on PR #4176.
+ *    corresponding tool_use block in the previous message".
  *
  * Mutates `history` in place and returns the set of injected `(callId, name)`
  * tuples so callers (the React tool scheduler) can dedupe a real `tool_result`
@@ -441,6 +440,242 @@ const ORPHAN_TOOL_USE_REPAIR_REASON =
  * already present in history). See PR review thread on #4176 for the full
  * race-class analysis.
  */
+/** Location of a `functionResponse` part within `history`. */
+interface FrLocation {
+  turnIdx: number;
+  partIdx: number;
+  part: Part;
+}
+
+/**
+ * Output of the scan phase for a single `model[functionCall]` turn at
+ * `modelIdx`. `expected` maps each `functionCall.id` to its tool name,
+ * `matched` maps that same id to ALL locations of matching
+ * `functionResponse` parts across the consecutive user turns that
+ * follow, and `scanEnd` is one past the last user turn visited.
+ */
+interface ScanResult {
+  modelIdx: number;
+  expected: Map<string, string>;
+  matched: Map<string, FrLocation[]>;
+  scanEnd: number;
+}
+
+/**
+ * Output of the decision phase for one scanned model turn. Encodes
+ * exactly which mutations the next phase should apply:
+ *  - `synthesizeIds`: ids that have no matching fr anywhere — synthesize an
+ *    `error` `functionResponse` for each.
+ *  - `hoistedParts`: parts to MOVE into the adjacent user turn (the
+ *    canonical survivor for each id whose fr lives in a non-adjacent
+ *    later turn).
+ *  - `removalTargets`: parts to SPLICE out of `history` — covers both
+ *    hoist survivors (so they only remain in the new location) and
+ *    duplicate copies of any id.
+ *  - `droppedDuplicates`: callIds whose duplicates we removed; returned
+ *    by the function so callers can log the cleanup.
+ */
+interface RepairPlan {
+  modelIdx: number;
+  scanEnd: number;
+  synthesizeIds: Array<[string, string]>;
+  hoistedParts: Part[];
+  removalTargets: Array<{ turnIdx: number; partIdx: number }>;
+  droppedDuplicates: Array<{ callId: string; name: string }>;
+}
+
+/**
+ * SCAN PHASE — collect `expected` from the `model[functionCall]` turn
+ * at `modelIdx` and `matched` from every consecutive user turn that
+ * follows. Pure read; no mutation.
+ *
+ * Storing ALL locations (not just the first) is load-bearing for the
+ * duplicate case: if the same callId is echoed back more than once
+ * across the consecutive user turns (e.g.
+ * `model[fc id=cid], user[text], user[fr cid], user[fr cid]` — possible
+ * when the React scheduler retries the late `submitQuery` and a
+ * duplicate fr lands), hoisting only the first would leave the
+ * duplicate behind. The wire payload then serializes
+ *   `model[tool_use] -> user[tool_result] -> user[tool_result]`
+ * and the backend rejects the trailing block as an orphan
+ * ("tool_use_id ... must have a corresponding tool_use block in the
+ * previous message").
+ */
+function scanModelTurn(history: Content[], modelIdx: number): ScanResult {
+  const expected = new Map<string, string>();
+  for (const part of history[modelIdx]?.parts ?? []) {
+    const fc = part.functionCall;
+    if (fc?.id) expected.set(fc.id, fc.name ?? 'unknown');
+  }
+
+  const matched = new Map<string, FrLocation[]>();
+  let scanIdx = modelIdx + 1;
+  while (scanIdx < history.length && history[scanIdx]?.role === 'user') {
+    const parts = history[scanIdx].parts ?? [];
+    for (let pIdx = 0; pIdx < parts.length; pIdx++) {
+      const part = parts[pIdx];
+      const id = part.functionResponse?.id;
+      if (id) {
+        const list = matched.get(id);
+        if (list) list.push({ turnIdx: scanIdx, partIdx: pIdx, part });
+        else matched.set(id, [{ turnIdx: scanIdx, partIdx: pIdx, part }]);
+      }
+    }
+    scanIdx++;
+  }
+
+  return { modelIdx, expected, matched, scanEnd: scanIdx };
+}
+
+/**
+ * DECISION PHASE — classify each expected callId into synthesize /
+ * hoist / skip-already-adjacent, and collect every duplicate copy for
+ * removal. Pure compute; no mutation. The plan returned here drives
+ * the mutation phase exactly.
+ *
+ * Classification rules per callId:
+ *   - No matching fr anywhere     → SYNTHESIZE an error fr.
+ *   - First match adjacent (modelIdx+1) → SKIP relocation. (Duplicates,
+ *     if any, are still removed below.)
+ *   - First match non-adjacent    → HOIST: move the canonical part into
+ *     the adjacent user turn; remove the original location.
+ * In all matched cases, drop EVERY duplicate beyond the first so the
+ * wire payload contains exactly one fr per call.
+ */
+function planRepair(scan: ScanResult): RepairPlan {
+  const synthesizeIds: Array<[string, string]> = [];
+  const hoistedParts: Part[] = [];
+  const removalTargets: Array<{ turnIdx: number; partIdx: number }> = [];
+  const droppedDuplicates: Array<{ callId: string; name: string }> = [];
+
+  const adjacentIdx = scan.modelIdx + 1;
+  for (const [id, name] of scan.expected) {
+    const locations = scan.matched.get(id);
+    if (!locations || locations.length === 0) {
+      synthesizeIds.push([id, name]);
+      continue;
+    }
+    // First copy is the canonical survivor — payloads should be
+    // identical for the same callId; if they differ, the wire is
+    // already corrupt and the backend rejects regardless.
+    const survivor = locations[0]!;
+    if (survivor.turnIdx !== adjacentIdx) {
+      hoistedParts.push(survivor.part);
+      removalTargets.push({
+        turnIdx: survivor.turnIdx,
+        partIdx: survivor.partIdx,
+      });
+    }
+    for (let k = 1; k < locations.length; k++) {
+      removalTargets.push({
+        turnIdx: locations[k]!.turnIdx,
+        partIdx: locations[k]!.partIdx,
+      });
+      droppedDuplicates.push({ callId: id, name });
+    }
+  }
+
+  return {
+    modelIdx: scan.modelIdx,
+    scanEnd: scan.scanEnd,
+    synthesizeIds,
+    hoistedParts,
+    removalTargets,
+    droppedDuplicates,
+  };
+}
+
+/**
+ * MUTATION PHASE — apply the plan to `history` in place. Returns the
+ * number of new user turns inserted before `modelIdx + 1` (currently
+ * always 0 or 1), which the caller uses to advance its forward-walk
+ * cursor past anything the loop should not revisit.
+ *
+ * Order matters here:
+ *  1. Splice removal targets in (turnIdx desc, partIdx desc) so earlier
+ *     removals don't shift indices for later ones.
+ *  2. Drop user turns within `[modelIdx + 2, scanEnd)` that are now
+ *     empty after the splice. Walk back-to-front for the same reason.
+ *     The immediately-adjacent turn (`modelIdx + 1`) is preserved even
+ *     if empty — we rewrite its parts in step 3.
+ *  3. Inject `[...synthetic, ...hoisted]` at the head of the adjacent
+ *     user turn (before any non-fr parts) OR insert a new user turn
+ *     between `modelIdx` and whatever follows.
+ *
+ * Anthropic-compatible backends require the tool_result blocks at the
+ * head of the immediately following user message; appending instead
+ * (`[text, fr]`) re-triggers the 400 the synthesis pass exists to
+ * escape. Mirrors upstream Claude Code's `hoistToolResults`.
+ *
+ * CONSEQUENCE OF REMOVAL of the head-insert: dropping this hoist (e.g.
+ * naively `next.parts = [...existing, ...partsToInject]`) re-introduces
+ * the "tool_use_id ... must have a corresponding tool_use block in the
+ * previous message" 400 the synthesis pass exists to prevent. Do not
+ * "simplify" this branch.
+ */
+function applyRepair(
+  history: Content[],
+  plan: RepairPlan,
+  reason: string,
+): { insertedBefore: number } {
+  if (plan.synthesizeIds.length === 0 && plan.removalTargets.length === 0) {
+    return { insertedBefore: 0 };
+  }
+
+  const syntheticParts: Part[] = plan.synthesizeIds.map(([callId, name]) => ({
+    functionResponse: { id: callId, name, response: { error: reason } },
+  }));
+  const partsToInject: Part[] = [...syntheticParts, ...plan.hoistedParts];
+
+  // (1) Splice removal targets, descending so indices stay valid.
+  const removals = [...plan.removalTargets].sort((a, b) => {
+    if (a.turnIdx !== b.turnIdx) return b.turnIdx - a.turnIdx;
+    return b.partIdx - a.partIdx;
+  });
+  for (const loc of removals) {
+    const turnParts = history[loc.turnIdx].parts;
+    if (turnParts) turnParts.splice(loc.partIdx, 1);
+  }
+
+  // (2) Drop now-empty user turns within [modelIdx + 2, scanEnd).
+  // Preserve the adjacent turn even if empty — we'll rewrite it
+  // below.
+  const adjacentIdx = plan.modelIdx + 1;
+  for (let j = plan.scanEnd - 1; j > adjacentIdx; j--) {
+    if (history[j]?.role === 'user' && (history[j].parts?.length ?? 0) === 0) {
+      history.splice(j, 1);
+    }
+  }
+
+  // (3) Place new parts at the head of the adjacent user turn, OR
+  // insert a fresh user turn between this model turn and whatever
+  // follows.
+  const next = history[adjacentIdx];
+  if (next?.role === 'user') {
+    const existing = next.parts ?? [];
+    const firstNonFr = existing.findIndex((part) => !part.functionResponse);
+    const insertAt = firstNonFr === -1 ? existing.length : firstNonFr;
+    next.parts = [
+      ...existing.slice(0, insertAt),
+      ...partsToInject,
+      ...existing.slice(insertAt),
+    ];
+    return { insertedBefore: 0 };
+  }
+  history.splice(adjacentIdx, 0, { role: 'user', parts: partsToInject });
+  return { insertedBefore: 1 };
+}
+
+/**
+ * Forward-walk `history`, planning and applying the repair for each
+ * `model[functionCall]` turn in turn. Iteration is index-based and the
+ * cursor advances by the count of user turns inserted ahead of it so
+ * a freshly-injected turn isn't re-visited.
+ *
+ * Splitting scan / decision / mutation into separate functions keeps
+ * each phase auditable in isolation — index drift can only happen in
+ * `applyRepair`, the only function that mutates `history`.
+ */
 export function repairOrphanedToolUseTurns(
   history: Content[],
   reason: string = ORPHAN_TOOL_USE_REPAIR_REASON,
@@ -449,217 +684,30 @@ export function repairOrphanedToolUseTurns(
   droppedDuplicates: Array<{ callId: string; name: string }>;
 } {
   const injected: Array<{ callId: string; name: string }> = [];
-  // Duplicates removed during the cleanup pass (any callId that had
-  // more than one `functionResponse` echo across the consecutive
-  // user turns). Returned alongside `injected` so call sites can log
-  // the cleanup — without this, a duplicate-only repair (no
-  // synthesis, no hoist) leaves zero diagnostic trail and a future
-  // callId-collision bug in the resolver could silently drop the
-  // wrong fr with no breadcrumb pointing here.
-  // qwen-latest-series-invite-beta-v34 thread on PR #4176.
   const droppedDuplicates: Array<{ callId: string; name: string }> = [];
 
-  // Forward walk: i mutates as we splice, so use index-based iteration
-  // and skip the freshly-inserted user turn to avoid re-scanning it.
   for (let i = 0; i < history.length; i++) {
-    const turn = history[i];
-    if (turn.role !== 'model') continue;
-
-    // Collect (id → name) for every functionCall in this model turn.
-    const expected = new Map<string, string>();
-    for (const part of turn.parts ?? []) {
-      const fc = part.functionCall;
-      if (fc?.id) {
-        expected.set(fc.id, fc.name ?? 'unknown');
-      }
-    }
-    if (expected.size === 0) continue;
-
-    // Scan forward across EVERY consecutive user turn until the next
-    // non-user (model) entry, recording (id → location) for every
-    // functionResponse part. The real tool_result for this model[fc]
-    // may live in a non-adjacent user turn — common shape: user aborts
-    // a long-running tool, types a follow-up text turn, then the React
-    // scheduler's late submitQuery appends the real `user[fr]` as a
-    // SEPARATE user entry, producing `model[fc], user[text], user[fr_real]`.
-    //
-    // We need the full location (turn index + part index) for two
-    // reasons:
-    //  - synthesis: skip ids that already have a real fr SOMEWHERE so we
-    //    don't plant a duplicate `error` placeholder. Without this, the
-    //    `handleCompletedTools` history-based dedup would then drop the
-    //    REAL result on the next submission (callId already in history
-    //    via the synthetic), and the model would only ever see the error.
-    //  - hoist: ids matched in a NON-adjacent later user turn must be
-    //    physically moved into history[i+1] (before non-fr parts) so the
-    //    wire format stays `model[tool_use] → user[tool_result, ...]`.
-    //    Anthropic-compatible backends require the tool_result blocks at
-    //    the head of the immediately following user message, otherwise
-    //    they reject with "tool_use_id ... must have a corresponding
-    //    tool_use block in the previous message". gpt-5.5 review thread
-    //    on PR #4176.
-    // Collect EVERY (turnIdx, partIdx, part) for every functionResponse
-    // we encounter, keyed by id. Storing all locations (not just the
-    // first) is load-bearing for the duplicate case: if the same callId
-    // is echoed back more than once across the consecutive user turns
-    // (e.g. `model[fc id=cid], user[text], user[fr cid], user[fr cid]`
-    // — possible when the React scheduler retries the late submitQuery
-    // and a duplicate fr lands), hoisting only the first leaves the
-    // duplicate(s) behind in a non-adjacent later user turn. The wire
-    // payload then serializes
-    //   `model[tool_use] -> user[tool_result] -> user[tool_result]`
-    // and the backend rejects the trailing block as an orphan
-    // ("tool_use_id ... must have a corresponding tool_use block in the
-    // previous message"), so the session stays wedged for the same 400
-    // class this repair pass exists to escape. We MUST drop every
-    // duplicate; the survivor at history[i+1] is whichever copy we
-    // hoist, and the rest are erased. (gpt-5.5 thread on PR #4176.)
-    const matched = new Map<
-      string,
-      Array<{ turnIdx: number; partIdx: number; part: Part }>
-    >();
-    let scanIdx = i + 1;
-    while (scanIdx < history.length && history[scanIdx]?.role === 'user') {
-      const parts = history[scanIdx].parts ?? [];
-      for (let pIdx = 0; pIdx < parts.length; pIdx++) {
-        const part = parts[pIdx];
-        const id = part.functionResponse?.id;
-        if (id) {
-          const list = matched.get(id);
-          if (list) list.push({ turnIdx: scanIdx, partIdx: pIdx, part });
-          else matched.set(id, [{ turnIdx: scanIdx, partIdx: pIdx, part }]);
-        }
-      }
-      scanIdx++;
-    }
+    if (history[i].role !== 'model') continue;
 
-    const synthesizeIds: Array<[string, string]> = [];
-    const hoistedParts: Part[] = [];
-    // Removal targets cover BOTH:
-    //  - the survivor's original location for ids being hoisted (we
-    //    move it into history[i+1])
-    //  - every duplicate copy of an id (hoisted or already-adjacent)
-    // For an id whose canonical fr is already in history[i+1] but has
-    // duplicates further down, we don't hoist (no relocation needed)
-    // but we still must drop the duplicates so the wire format only
-    // contains one fr per call.
-    const allRemovalTargets: Array<{ turnIdx: number; partIdx: number }> = [];
-    for (const [id, name] of expected) {
-      const locations = matched.get(id);
-      if (!locations || locations.length === 0) {
-        synthesizeIds.push([id, name]);
-        continue;
-      }
-      // Pick the first location's part as the canonical survivor (any
-      // copy works — the response payload should be identical for the
-      // same callId; if they differ the wire is already corrupt and the
-      // backend will reject regardless).
-      const survivor = locations[0]!;
-      if (survivor.turnIdx !== i + 1) {
-        // Hoist: the canonical part needs to move into history[i+1].
-        hoistedParts.push(survivor.part);
-        allRemovalTargets.push({
-          turnIdx: survivor.turnIdx,
-          partIdx: survivor.partIdx,
-        });
-      }
-      // else: canonical already adjacent — no relocation.
-      // Either way, drop EVERY duplicate beyond the first.
-      for (let k = 1; k < locations.length; k++) {
-        allRemovalTargets.push({
-          turnIdx: locations[k]!.turnIdx,
-          partIdx: locations[k]!.partIdx,
-        });
-        droppedDuplicates.push({ callId: id, name });
-      }
-    }
-    if (synthesizeIds.length === 0 && allRemovalTargets.length === 0) continue;
+    const scan = scanModelTurn(history, i);
+    if (scan.expected.size === 0) continue;
 
-    const syntheticParts: Part[] = synthesizeIds.map(([callId, name]) => ({
-      functionResponse: {
-        id: callId,
-        name,
-        response: { error: reason },
-      },
-    }));
-    // Synthetics first, then hoisted. Order between synthetic and
-    // hoisted is internal — backends just need ALL tool_results at the
-    // head of the user turn, regardless of order among themselves.
-    const partsToInject: Part[] = [...syntheticParts, ...hoistedParts];
-
-    // Remove every removal target (hoist survivors + all duplicates).
-    // Sort by (turnIdx desc, partIdx desc) so each splice operates on
-    // stable indices for everything still to be removed.
-    allRemovalTargets.sort((a, b) => {
-      if (a.turnIdx !== b.turnIdx) return b.turnIdx - a.turnIdx;
-      return b.partIdx - a.partIdx;
-    });
-    for (const loc of allRemovalTargets) {
-      const turnParts = history[loc.turnIdx].parts;
-      if (turnParts) turnParts.splice(loc.partIdx, 1);
-    }
-    // Drop any user turn within the scan range (i+2 .. scanIdx-1) that
-    // is now empty because we extracted its only fr part(s). Walk back
-    // to front so removals don't shift remaining indices. The
-    // immediately-adjacent turn at i+1 is preserved even if empty —
-    // we'll rewrite its parts below. After this, scanIdx is no longer
-    // accurate; we rebind via `next` directly.
-    for (let j = scanIdx - 1; j > i + 1; j--) {
-      if (
-        history[j]?.role === 'user' &&
-        (history[j].parts?.length ?? 0) === 0
-      ) {
-        history.splice(j, 1);
-      }
-    }
-
-    const next = history[i + 1];
-    if (next?.role === 'user') {
-      // Synthesized + hoisted functionResponse parts MUST be placed
-      // before any non-functionResponse parts in the user turn.
-      // Anthropic-compatible backends reject a user message whose first
-      // content block isn't the tool_result that answers the
-      // immediately preceding tool_use ("tool result must follow tool
-      // use" / "tool_use_id ... must have a corresponding tool_use
-      // block in the previous message"). Common case after a Ctrl+Y
-      // race: the user's retry-prompt text was just pushed, so
-      // `next.parts = [text]`; appending the synthetic to the end
-      // would produce `[text, fr]` and re-trigger the wedge this PR
-      // is supposed to escape. Mirrors upstream Claude Code's
-      // `hoistToolResults` (`utils/messages.ts`).
-      //
-      // CONSEQUENCE OF REMOVAL: dropping this hoist (e.g. naively
-      // `next.parts = [...existing, ...partsToInject]`) re-introduces
-      // the exact 400 "tool_use_id ... must have a corresponding tool_use
-      // block in the previous message" the synthesis pass exists to
-      // prevent. The whole repair becomes a no-op and the session stays
-      // wedged. Do not "simplify" this branch.
-      //
-      // Place new parts AFTER any pre-existing functionResponse parts
-      // (real tool_results the user already had at the head of this
-      // turn) so caller-supplied ordering is preserved.
-      const existing = next.parts ?? [];
-      const firstNonFr = existing.findIndex((part) => !part.functionResponse);
-      const insertAt = firstNonFr === -1 ? existing.length : firstNonFr;
-      next.parts = [
-        ...existing.slice(0, insertAt),
-        ...partsToInject,
-        ...existing.slice(insertAt),
-      ];
-    } else {
-      history.splice(i + 1, 0, { role: 'user', parts: partsToInject });
-      // Skip the freshly-inserted user turn so the outer loop doesn't
-      // visit it as a model turn (it isn't) and stays linear-time.
-      i++;
+    const plan = planRepair(scan);
+    if (plan.synthesizeIds.length === 0 && plan.removalTargets.length === 0) {
+      continue;
     }
 
-    // Only synthesized ids feed dedup — hoisted ids reference real frs
-    // that were ALREADY in history before this pass and remain present
-    // (just relocated). The scheduler's history-based dedup will
-    // continue to handle those naturally on its next pass.
-    for (const [callId, name] of synthesizeIds) {
+    const { insertedBefore } = applyRepair(history, plan, reason);
+    // Only synthesized ids feed `injected` — hoisted ids reference real
+    // frs that were ALREADY in history before this pass (just
+    // relocated), so the scheduler's dedup naturally handles them.
+    for (const [callId, name] of plan.synthesizeIds) {
       injected.push({ callId, name });
     }
+    droppedDuplicates.push(...plan.droppedDuplicates);
+    // Advance past any freshly-inserted user turn so the outer loop
+    // doesn't revisit it. Keeps the walk linear-time.
+    i += insertedBefore;
   }
 
   return { injected, droppedDuplicates };
@@ -749,7 +797,7 @@ export class GeminiChat {
    * JSONL transcript keeps the failed attempt even though live history
    * dropped it — `--resume` rehydrates the failed `model[functionCall]`
    * turn and the resumed model context picks up a tool_use the in-session
-   * run intentionally discarded (yiliang114 repro on PR #4176).
+   * run intentionally discarded.
    *
    * Lifecycle is paired with `pendingPartialAssistantTurnIndex`:
    *  - Set together on stream-error + hasToolCall + hasContent.
@@ -1093,7 +1141,6 @@ export class GeminiChat {
       if (inlineRepair.droppedDuplicates.length > 0) {
         // Symmetrical with the synthesis log: a duplicate-only repair
         // (no synthesis, no hoist) here would otherwise be silent.
-        // qwen-latest-series-invite-beta-v34 thread on PR #4176.
         debugLogger.warn(
           `[REPAIR] sendMessageStream inline pass dropped ` +
             `${inlineRepair.droppedDuplicates.length} duplicate ` +
@@ -1429,10 +1476,8 @@ export class GeminiChat {
             // defense-in-depth: a future error class that should consume
             // its own content-retry budget but NOT the transient one
             // could be threaded through here without re-deriving the
-            // popPartialIfPushed sequence. Reviewer thread on PR #4176
-            // (qwen-latest-series-invite-beta-v34) flagged the absence
-            // of a test — there is no reachable test path until the
-            // predicates diverge.
+            // popPartialIfPushed sequence. No reachable test path until
+            // the predicates diverge.
             const isContentError = error instanceof InvalidStreamError;
             if (isContentError) {
               if (attempt < INVALID_CONTENT_RETRY_OPTIONS.maxAttempts - 1) {
@@ -1596,7 +1641,7 @@ export class GeminiChat {
               // user turn. The partial-push markers are also cleared
               // in lockstep so the outer `finally` JSONL flush can't
               // resurrect a partial we just deleted from live history.
-              // qwen-latest-series-invite-beta-v34 thread on PR #4176.
+              //
               // Index-checked pop instead of a positional `pop()` so
               // we match the diagnostic standard set by
               // `popPartialIfPushed` above (splice at `idx` + warn on
@@ -1612,7 +1657,6 @@ export class GeminiChat {
               // markers for the actual partial — leaving it
               // permanently stranded with no log trail. The warn
               // makes any future violation visible immediately.
-              // qwen-latest-series-invite-beta-v34 thread on PR #4176.
               const expectedIdx = self.pendingPartialAssistantTurnIndex;
               const lastIdx = self.history.length - 1;
               if (
@@ -1728,13 +1772,20 @@ export class GeminiChat {
           // `this.history`, so live behavior is unaffected; only the
           // disk transcript loses this turn (eventual consistency
           // restored on the next successful flush of any other turn).
-          // qwen-latest-series-invite-beta-v34 thread on PR #4176.
           try {
             self.chatRecordingService?.recordAssistantTurn(
               self.pendingPartialAssistantRecord,
             );
           } catch (recordErr) {
-            debugLogger.warn(
+            // Error-level (not warn): a persistent JSONL write failure
+            // (disk full, permission, serialization) silently loses
+            // every deferred partial after this point — exactly the
+            // class of failure that warrants monitoring attention.
+            // Transient failures still bubble through as a single
+            // error per occurrence; if logs are spammed it's an
+            // operational signal that the recording layer is broken,
+            // not noise.
+            debugLogger.error(
               '[PARTIAL_FLUSH] Failed to persist deferred JSONL record: ' +
                 (recordErr instanceof Error
                   ? recordErr.message
@@ -1863,7 +1914,6 @@ export class GeminiChat {
    * (recursive `structuredClone` over every part, including large tool
    * outputs) on every batch and visibly stall the React UI thread on
    * long sessions (200+ entries with sizable tool results).
-   * qwen-latest-series-invite-beta-v34 thread on PR #4176.
    */
   getHistoryFunctionResponseIds(): Set<string> {
     const ids = new Set<string>();
@@ -1911,17 +1961,19 @@ export class GeminiChat {
     // attempt's `model[functionCall]` would survive into the retry,
     // and a successful retry's response would land as a SECOND
     // consecutive model turn (the wedge this whole subsystem exists
-    // to prevent). The warn below makes that coupling observable —
+    // to prevent). The log below makes that coupling observable —
     // anyone investigating a stale-partial bug will see this log line
-    // pointing straight at the offending caller, instead of having to
-    // trace through the marker lifecycle blind.
+    // pointing straight at the offending caller. Error-level (not
+    // warn) because this is a true invariant violation: the existing
+    // call graph cannot legitimately hit this branch, so any
+    // occurrence is a real bug in a future caller, not noise.
     if (
       this.pendingPartialAssistantTurnIndex !== null ||
       this.pendingPartialAssistantRecord !== null
     ) {
-      debugLogger.warn(
-        'addHistory called while a partial-push marker is active — ' +
-          'clearing it. This is unexpected during an active sendMessageStream ' +
+      debugLogger.error(
+        '[INVARIANT_VIOLATION] addHistory called while a partial-push ' +
+          'marker is active — clearing it. This is unexpected during an active sendMessageStream ' +
           'and likely indicates a new caller violating the between-sends ' +
           'invariant. See comment at GeminiChat.addHistory for context.',
       );
@@ -2204,7 +2256,7 @@ export class GeminiChat {
         // partial that survives in-memory. Without this, retry-success
         // leaves a failed `model[functionCall]` durable in JSONL and
         // `--resume` rehydrates a turn the live session correctly
-        // discarded (yiliang114 PR #4176 repro).
+        // discarded.
         this.pendingPartialAssistantRecord = recordArgs;
       } else {
         this.chatRecordingService?.recordAssistantTurn(recordArgs);
@@ -2243,7 +2295,6 @@ export class GeminiChat {
       // sharing the single binding eliminates drift risk if one gate is
       // tightened without the other and the JSONL recording silently
       // desyncs from in-memory history.
-      // qwen-latest-series-invite-beta-v34 thread on PR #4176.
       if (willPersistToHistory) {
         this.history.push({
           role: 'model',

From 298588190ddcdfd612875b5aa4f9d90ee5061fea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Thu, 21 May 2026 12:22:41 +0800
Subject: [PATCH 17/18] refactor(core): consolidate partial-tool_use repair
 docs into one design block (PR #4176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses the pomelo-nwu review observation on c30bba6e7:
~60% of the +918 lines added to geminiChat.ts were comments, with
several 20–40 line prose blocks repeating the same race-class /
tool_use_id wedge analysis at every call site. Net –139 / +117
(net delete ~22 lines but mostly redistributing weight from per-site
blocks to a single canonical note).

Changes:

* Add one canonical design block above ORPHAN_TOOL_USE_REPAIR_REASON
  covering: the tool_use_id 400 wedge, the three race classes (A
  Ctrl+Y mid-flight, B process crash, C SSE drop), the two-layer fix
  (partial-push + repair), and the partial-push marker lifecycle.
  Every per-site comment that used to repeat this now points back.

* Trim popPartialIfPushed COMPRESSED-branch comment from 14 lines to
  4 (no-op-today + reason for keeping).

* Trim recovery-catch comment block from ~38 lines to ~7 (pop-order
  matters + index-checked, pointer to canonical note).

* Trim addHistory invariant comment from ~22 lines to ~5 (when this
  fires + why error-level, pointer to canonical note).

* Trim processStreamResponse partial-push comment from ~22 lines to
  ~8 (Race C name + signal + plain-text-not-persisted rule).

* Replace tool_use_id wedge re-explanations at 5 sites (repair doc,
  scanModelTurn doc, applyRepair doc, popPartialIfPushed comment,
  processStreamResponse comment) with single-line pointers. Canonical
  block at line 411 is now the only authoritative copy.

Reviewer marked this as non-blocking ("shouldn't gate the merge");
done as cleanup polish while LGTM is in.

Tests: 375/375 (no test changes). tsc + eslint + prettier clean.
---
 packages/core/src/core/geminiChat.ts | 256 ++++++++++++---------------
 1 file changed, 117 insertions(+), 139 deletions(-)

diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index c23694667f8..0c755924f99 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -397,6 +397,75 @@ const ORPHAN_TOOL_USE_REPAIR_REASON =
   'Tool execution result was not recorded — likely interrupted by network ' +
   'failure, abort, or process exit. Treat as failure and retry if needed.';
 
+/*
+ * ============================================================================
+ * Partial-tool_use repair subsystem — canonical design note.
+ * ============================================================================
+ *
+ * Every comment block elsewhere in this file that mentions one of the
+ * concepts below points back here. Per-site comments should be one or two
+ * lines stating WHAT the local code does; the WHY lives here.
+ *
+ * --- The wedge ----------------------------------------------------------
+ *
+ * Anthropic-compatible backends (Anthropic, DeepSeek, …) reject a request
+ * whose `user[tool_result]` blocks are not at the HEAD of the user message
+ * immediately following the `model[tool_use]` they answer:
+ *
+ *     "tool_use_id ... must have a corresponding tool_use block in the
+ *      previous message"
+ *
+ * Without a matching pair the session is unrecoverable — `stripOrphanedUser
+ * EntriesFromHistory` only strips trailing user entries, so a lost tool_use
+ * cannot be resurrected and the next send 400s repeatedly.
+ *
+ * --- The race classes that produce dangling tool_uses --------------------
+ *
+ *   Race A (Ctrl+Y mid-flight): user retries before the in-flight tool
+ *     finishes. The scheduler's `onAllToolCallsComplete` is single-shot
+ *     per batch and would otherwise leave the tool stuck in
+ *     `completed-but-not-submitted` forever.
+ *   Race B (process crash / OOM mid-flight): the JSONL transcript captures
+ *     the dangling `model[fc]` and `--resume` rehydrates it.
+ *   Race C (network drop between `content_block_stop` of a tool_use and
+ *     the terminal `message_stop`): `processStreamResponse` re-throws
+ *     after we have already yielded a `functionCall` chunk, so the React
+ *     scheduler is on its way to submit a real `functionResponse` while
+ *     in-memory history has no matching `model[fc]`.
+ *
+ * --- The two-layer fix ---------------------------------------------------
+ *
+ *   (1) Persist the partial assistant turn at the failure point in
+ *       `processStreamResponse` (`this.history.push({role: 'model', parts:
+ *       [...]})` plus the `pendingPartialAssistantTurnIndex` /
+ *       `pendingPartialAssistantRecord` markers) so the matching
+ *       `model[fc]` is on disk and in memory when the late `user[fr]`
+ *       arrives.
+ *   (2) Repair any remaining dangling `model[fc]` whose
+ *       `user[fr]` never landed (`repairOrphanedToolUseTurns`):
+ *         - SYNTHESIZE an `error` fr for ids with no matching response;
+ *         - HOIST the real fr into the immediately-adjacent user turn
+ *           when it landed in a non-adjacent later turn;
+ *         - DROP duplicate fr copies for the same id.
+ *       Then `useGeminiStream.handleCompletedTools` dedupes the
+ *       scheduler's late real result against `chat.history` so the
+ *       synthetic and the real result never collide on the wire.
+ *
+ * --- Partial-push marker lifecycle ---------------------------------------
+ *
+ * Set together on (streamError + hasToolCall + hasContent) inside
+ * `processStreamResponse`. Cleared together by `popPartialIfPushed` on a
+ * retryable error rollback, or flushed together to JSONL by the outer
+ * `finally` after the retry loop exits. Defense-in-depth: every
+ * history-mutation method (clearHistory / addHistory / setHistory /
+ * truncateHistory / stripThoughtsFromHistory /
+ * stripOrphanedUserEntriesFromHistory) resets both markers in lockstep so
+ * a stale index can't shift onto an unrelated model turn and cause
+ * `popPartialIfPushed` to splice the wrong entry. Any single-field reset
+ * is a bug.
+ * ============================================================================
+ */
+
 /**
  * Walk `history` left-to-right and close every dangling tool_use ↔ tool_result
  * pair so the wire format the next API call sees is always
@@ -410,15 +479,9 @@ const ORPHAN_TOOL_USE_REPAIR_REASON =
  *    `yieldMissingToolResultBlocks` (`query.ts:123-149`).
  *  - HOIST: for any `functionCall.id` whose real `functionResponse` lives in
  *    a non-adjacent following user turn (typical shape:
- *    `model[fc], user[text], user[fr_real]` — produced when a user aborts a
- *    long-running tool, types a follow-up, and the React scheduler's late
- *    `submitQuery` appends the real `fr` as a SEPARATE user entry), MOVE the
- *    real `fr` part out of its original turn into the adjacent one. Without
- *    hoisting, the synthesis pass correctly skips the call (a real `fr`
- *    exists somewhere later) but the wire layout still serializes
- *    `model[tool_use] → user[text] → user[tool_result]`, which
- *    Anthropic-compatible backends reject with "tool_use_id ... must have a
- *    corresponding tool_use block in the previous message".
+ *    `model[fc], user[text], user[fr_real]`), MOVE the real `fr` part out
+ *    of its original turn into the adjacent one. Required by the wire
+ *    layout — see the canonical design note above for the wedge.
  *
  * Mutates `history` in place and returns the set of injected `(callId, name)`
  * tuples so callers (the React tool scheduler) can dedupe a real `tool_result`
@@ -492,16 +555,10 @@ interface RepairPlan {
  * follows. Pure read; no mutation.
  *
  * Storing ALL locations (not just the first) is load-bearing for the
- * duplicate case: if the same callId is echoed back more than once
- * across the consecutive user turns (e.g.
- * `model[fc id=cid], user[text], user[fr cid], user[fr cid]` — possible
- * when the React scheduler retries the late `submitQuery` and a
- * duplicate fr lands), hoisting only the first would leave the
- * duplicate behind. The wire payload then serializes
- *   `model[tool_use] -> user[tool_result] -> user[tool_result]`
- * and the backend rejects the trailing block as an orphan
- * ("tool_use_id ... must have a corresponding tool_use block in the
- * previous message").
+ * duplicate case: e.g. `model[fc id=cid], user[text], user[fr cid],
+ * user[fr cid]` (the React scheduler retries the late `submitQuery`).
+ * The downstream decision phase needs every copy so it can drop
+ * duplicates — otherwise the wire payload re-triggers the wedge.
  */
 function scanModelTurn(history: Content[], modelIdx: number): ScanResult {
   const expected = new Map<string, string>();
@@ -604,16 +661,10 @@ function planRepair(scan: ScanResult): RepairPlan {
  *     user turn (before any non-fr parts) OR insert a new user turn
  *     between `modelIdx` and whatever follows.
  *
- * Anthropic-compatible backends require the tool_result blocks at the
- * head of the immediately following user message; appending instead
- * (`[text, fr]`) re-triggers the 400 the synthesis pass exists to
- * escape. Mirrors upstream Claude Code's `hoistToolResults`.
- *
- * CONSEQUENCE OF REMOVAL of the head-insert: dropping this hoist (e.g.
- * naively `next.parts = [...existing, ...partsToInject]`) re-introduces
- * the "tool_use_id ... must have a corresponding tool_use block in the
- * previous message" 400 the synthesis pass exists to prevent. Do not
- * "simplify" this branch.
+ * Step 3 inserts at the HEAD (before non-fr parts), not the tail —
+ * mirrors upstream Claude Code's `hoistToolResults`. See the canonical
+ * design note above `ORPHAN_TOOL_USE_REPAIR_REASON` for why a tail
+ * append re-triggers the 400. Do not "simplify" the head-insert away.
  */
 function applyRepair(
   history: Content[],
@@ -1172,15 +1223,13 @@ export class GeminiChat {
             lastError = error;
 
             // If `processStreamResponse` persisted a partial assistant turn
-            // (mid-stream error after a `functionCall` was already yielded),
-            // every retry-and-continue path below must drop that turn first.
-            // Otherwise a successful retry's response lands AFTER the stale
-            // failed-attempt model turn — two consecutive `model` entries
-            // with an orphan tool_use in the first, re-triggering the
-            // "tool_use_id ... corresponding tool_use" 400 this fix is
-            // supposed to escape. Paths that `break` (unretryable) keep
-            // the partial — the caller will see it as part of the error
-            // surface.
+            // (mid-stream error after a `functionCall` was already
+            // yielded), every retry-and-continue path below must drop
+            // that turn first; otherwise the retry's response lands as
+            // a second consecutive model turn with an orphan tool_use
+            // (the wedge — see the canonical note above
+            // `ORPHAN_TOOL_USE_REPAIR_REASON`). Paths that `break`
+            // (unretryable) keep the partial.
             const popPartialIfPushed = () => {
               const idx = self.pendingPartialAssistantTurnIndex;
               if (idx === null) return;
@@ -1302,21 +1351,10 @@ export class GeminiChat {
                     reactiveInfo.compressionStatus ===
                     CompressionStatus.COMPRESSED
                   ) {
-                    // Defense-in-depth no-op: tryCompress() succeeded
-                    // means it has already replaced this.history via
-                    // setHistory(), which calls clearPendingPartialState()
-                    // — so by the time we reach this line, the marker is
-                    // null and popPartialIfPushed splices nothing. We
-                    // keep the call as a uniformity assertion against
-                    // future refactors that might switch tryCompress to
-                    // an in-place mutation: in that world, the marker
-                    // would NOT be reset by setHistory and this call
-                    // becomes the only thing that drops the stale
-                    // partial before requestContents is rebuilt below.
-                    // Removing it would couple correctness to the
-                    // implementation detail "setHistory always clears
-                    // the marker", which the other retry branches don't
-                    // share.
+                    // No-op today: tryCompress's setHistory has already
+                    // cleared the marker. Kept for uniformity with the
+                    // other retry branches in case a future in-place
+                    // tryCompress stops resetting it.
                     popPartialIfPushed();
                     requestContents = self.getRequestHistory();
                     debugLogger.info(
@@ -1552,47 +1590,13 @@ export class GeminiChat {
               // coalesced back into the preceding model entry after the loop.
               successfulRecoveries++;
             } catch (recoveryError) {
-              // If a recovery attempt fails (e.g., empty response, network
-              // error), stop recovering and let the partial output stand.
-              // Pop the dangling recovery message to keep history valid.
-              //
-              // Order matters: when the recovery stream errors AFTER
-              // yielding a `functionCall` chunk, `processStreamResponse`
-              // pushes a partial `model` turn into history before
-              // re-throwing. The naive "if last is user, pop" check
-              // would then no-op (last is now the partial `model`),
-              // leaving `user(OUTPUT_RECOVERY_MESSAGE)` stranded as a
-              // real user turn the user never sent. Two consequences:
-              //  - the control-prompt text (which carries instructions
-              //    meant only for the model's own continuation context)
-              //    pollutes durable history and biases later turns,
-              //  - the inline repair on the next sendMessageStream
-              //    synthesizes an `error` `functionResponse` for the
-              //    dangling `functionCall`, which the
-              //    `handleCompletedTools` history-based dedup then drops
-              //    when the React scheduler's REAL tool result arrives,
-              //    so the model sees an "execution result was not
-              //    recorded" error for a tool that actually succeeded.
-              // Pop the partial model turn FIRST, then the recovery
-              // user turn. The partial-push markers are also cleared
-              // in lockstep so the outer `finally` JSONL flush can't
-              // resurrect a partial we just deleted from live history.
-              //
-              // Index-checked pop instead of a positional `pop()` so
-              // we match the diagnostic standard set by
-              // `popPartialIfPushed` above (splice at `idx` + warn on
-              // bounds/role mismatch). The two rollback strategies
-              // share an undocumented positional assumption: nothing
-              // mutates `this.history` between
-              // `processStreamResponse`'s push and the for-await
-              // catch here. If a future change inserts a mutation in
-              // that window (compression side-effect, abort-signal
-              // handler, telemetry hook), a naked
-              // `history.pop()` would silently remove the wrong
-              // entry while `clearPendingPartialState()` clears
-              // markers for the actual partial — leaving it
-              // permanently stranded with no log trail. The warn
-              // makes any future violation visible immediately.
+              // Pop the partial `model[fc]` FIRST (if processStreamResponse
+              // pushed one before re-throwing), THEN the recovery user
+              // turn. Reversed order would strand `OUTPUT_RECOVERY_MESSAGE`
+              // as a real user turn. Index-checked pop mirrors
+              // `popPartialIfPushed` above — see the design note above
+              // `ORPHAN_TOOL_USE_REPAIR_REASON` for the wedge mechanism
+              // and the partial-push marker lifecycle.
               const expectedIdx = self.pendingPartialAssistantTurnIndex;
               const lastIdx = self.history.length - 1;
               if (
@@ -1936,34 +1940,18 @@ export class GeminiChat {
    */
   addHistory(content: Content): void {
     this.history.push(content);
-    // The marker is per-send-attempt. Today's callers (cancelled-tool
-    // synthesis in useGeminiStream, ACP session injects,
-    // shellCommandProcessor) only run between sends, so the originating
-    // sendMessageStream has either already popped the partial via the
-    // retry loop or hit an unrecoverable break — in both cases the
-    // marker is no longer load-bearing.
-    //
-    // If a future code path ever calls addHistory BETWEEN the partial
-    // push and the retry attempt, silently clearing the marker would
-    // strand the partial: popPartialIfPushed would no-op, the failed
-    // attempt's `model[functionCall]` would survive into the retry,
-    // and a successful retry's response would land as a SECOND
-    // consecutive model turn (the wedge this whole subsystem exists
-    // to prevent). The log below makes that coupling observable —
-    // anyone investigating a stale-partial bug will see this log line
-    // pointing straight at the offending caller. Error-level (not
-    // warn) because this is a true invariant violation: the existing
-    // call graph cannot legitimately hit this branch, so any
-    // occurrence is a real bug in a future caller, not noise.
+    // addHistory only runs between sends, so the partial-push marker
+    // should already be cleared. If it is not, a new caller is
+    // violating that invariant — surface it at error level so the
+    // offending stack is visible. See the design note above
+    // `ORPHAN_TOOL_USE_REPAIR_REASON` for the marker lifecycle.
     if (
       this.pendingPartialAssistantTurnIndex !== null ||
       this.pendingPartialAssistantRecord !== null
     ) {
       debugLogger.error(
         '[INVARIANT_VIOLATION] addHistory called while a partial-push ' +
-          'marker is active — clearing it. This is unexpected during an active sendMessageStream ' +
-          'and likely indicates a new caller violating the between-sends ' +
-          'invariant. See comment at GeminiChat.addHistory for context.',
+          'marker is active — clearing it.',
       );
     }
     this.clearPendingPartialState();
@@ -2251,30 +2239,20 @@ export class GeminiChat {
       }
     }
 
-    // Mid-stream failure recovery: if the upstream stream threw (typical on
-    // weak networks — SSE cut between a tool_use `content_block_stop` and
-    // the terminal `message_stop`) AND any `functionCall` chunk was already
-    // yielded to consumers, we must persist the partial assistant turn here.
-    //
-    // The content generator (Anthropic / OpenAI) emits a `functionCall` part
-    // only at the end of a tool_use block. Once yielded, `Turn.run` registers
-    // a `ToolCallRequest` event, the React tool scheduler queues the call,
-    // and `handleCompletedTools` will fire `submitQuery(..., ToolResult)` —
-    // pushing a user message with `functionResponse` into history — even
-    // though the parent stream errored. Without preserving the matching
-    // tool_use on the model side, the next request body would have
-    // `user → user[tool_result]` with no tool_use in between, and the
-    // Anthropic-compatible API (DeepSeek, Anthropic, etc.) rejects with
-    //   "tool_use_id ... must have a corresponding tool_use block in the
-    //    previous message"
-    // — an unrecoverable state because Ctrl+Y's `stripOrphanedUserEntries`
-    // only strips trailing user entries; the lost tool_use can't be
-    // resurrected.
+    // Mid-stream failure recovery (Race C in the canonical note above
+    // `ORPHAN_TOOL_USE_REPAIR_REASON`): if the upstream stream threw
+    // AFTER a `functionCall` chunk was already yielded — typical on
+    // weak networks: SSE cut between a tool_use `content_block_stop`
+    // and the terminal `message_stop` — we persist the partial
+    // assistant turn so the React scheduler's incoming
+    // `user[functionResponse]` has a matching `model[tool_use]` to
+    // pair with.
     //
-    // Plain-text partial turns (no functionCall yielded) are deliberately
-    // NOT persisted — the Retry path pops the trailing user prompt and
-    // re-issues it; a stale partial-text model turn between them would
-    // either bias the retry or surface as a duplicate.
+    // Plain-text partial turns (no functionCall yielded) are
+    // deliberately NOT persisted — the Retry path pops the trailing
+    // user prompt and re-issues it; a stale partial-text model turn
+    // between them would either bias the retry or surface as a
+    // duplicate.
     if (streamError !== null) {
       // Reuse the `willPersistToHistory` gate from the recordAssistantTurn
       // block above instead of re-deriving it. When `streamError !== null`,

From e2033ec87964159457652e875f16ae0e215631f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AB=98=E9=93=81?= <shaojin.wensj@alibaba-inc.com>
Date: Thu, 21 May 2026 16:42:12 +0800
Subject: [PATCH 18/18] refactor(core): further trim partial-tool_use repair
 comments (PR #4176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Second cleanup pass on pomelo-nwu's feedback. Previous commit
(298588190) cut net –22 lines but several blocks were still
20–25 lines of prose; this pass takes the rest down to the
one-or-two-line pointer pattern the reviewer suggested.

geminiChat.ts: 2384 → 2207 lines (-177). Comment density 37% → 32%.

Trimmed:

  * pendingPartialAssistantTurnIndex + pendingPartialAssistantRecord
    field doc blocks (16 + 22 lines → one combined 5-line block
    pointing at the canonical note).
  * clearPendingPartialState method doc (10 → 4 lines).
  * sendMessageStream inline-repair comment block (29 → 7 lines).
  * `finally` JSONL flush block (32 → 8 lines).
  * repairOrphanedToolUseTurns function doc (39 → 14 lines).
  * scanModelTurn / planRepair / applyRepair phase docs (11–22 → 4–9
    lines each).
  * RepairPlan interface doc (14 → 1 line).
  * GeminiChat.repairOrphanedToolUseTurns wrapper doc (9 → 3 lines).
  * GeminiChat.getHistoryFunctionResponseIds doc (13 → 6 lines).

What stayed (intentional load-bearing context):

  * The canonical block above ORPHAN_TOOL_USE_REPAIR_REASON — that's
    the consolidation point every per-site one-liner points back to.
  * Pre-existing upstream JSDoc (ctor, sendMessageStream, getHistory,
    redactStructuredOutputArgsForRecording).
  * The Reactive compression no-op note (already at the 4-line cap).
  * The recovery-catch 7-line block (already at the cap).
  * The addHistory invariant 5-line block (already at the cap).

Tests: 375/375 pass on the affected suites. tsc + eslint + prettier
clean on changed files. Behavior unchanged.
---
 packages/core/src/core/geminiChat.ts | 287 +++++----------------------
 1 file changed, 55 insertions(+), 232 deletions(-)

diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index 0c755924f99..ce6e5106b0f 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -467,43 +467,19 @@ const ORPHAN_TOOL_USE_REPAIR_REASON =
  */
 
 /**
- * Walk `history` left-to-right and close every dangling tool_use ↔ tool_result
- * pair so the wire format the next API call sees is always
- * `model[fc] → user[fr]` with the `fr` blocks at the head of the immediately
- * following user turn. Two fix-ups can run for each `model[functionCall]`:
+ * Walk `history` left-to-right and close every dangling
+ * tool_use ↔ tool_result pair. For each `model[functionCall]`:
+ *  - SYNTHESIZE an `error` `functionResponse` for ids with no match;
+ *  - HOIST a real fr from a non-adjacent later user turn into the
+ *    adjacent one;
+ *  - drop duplicate fr copies for the same id.
  *
- *  - SYNTHESIZE: for any `functionCall.id` not echoed back by ANY of the
- *    consecutive user turns that follow it (up to the next model turn or
- *    end-of-history), insert a synthetic `functionResponse` carrying an
- *    `error` field — the close analogue of upstream Claude Code's
- *    `yieldMissingToolResultBlocks` (`query.ts:123-149`).
- *  - HOIST: for any `functionCall.id` whose real `functionResponse` lives in
- *    a non-adjacent following user turn (typical shape:
- *    `model[fc], user[text], user[fr_real]`), MOVE the real `fr` part out
- *    of its original turn into the adjacent one. Required by the wire
- *    layout — see the canonical design note above for the wedge.
- *
- * Mutates `history` in place and returns the set of injected `(callId, name)`
- * tuples so callers (the React tool scheduler) can dedupe a real `tool_result`
- * if the in-flight tool completes after the repair. Hoisted ids are NOT in
- * the returned list — the real `fr` is already present in history, so the
- * scheduler's existing history-based dedup handles them without extra entries.
- *
- * The injection target for synthesized parts follows this rule:
- *  - If the next entry is a `user` turn → insert synthetic parts at the head
- *    (before any non-`functionResponse` parts; after any pre-existing real
- *    `functionResponse` parts so caller-supplied ordering is preserved).
- *  - If the next entry is a `model` turn or end-of-history → insert a new
- *    `user` turn between them carrying just the synthetic parts.
- *
- * This is the qwen-code analogue of upstream Claude Code's
- * `yieldMissingToolResultBlocks` (`query.ts:123-149`). Upstream can call it
- * unconditionally at every error path because their `StreamingToolExecutor`
- * is in-band — they atomically `.discard()` in-flight tools at the synthesis
- * point. Our React scheduler runs out-of-band, so the caller pairs this with
- * dedup in `handleCompletedTools` (which skips submission for any callId
- * already present in history). See PR review thread on #4176 for the full
- * race-class analysis.
+ * Mutates `history` in place. Returns the synthesized (callId, name)
+ * pairs so the React scheduler's dedup can drop late real results for
+ * those ids; hoisted ids are NOT returned (the real fr is still in
+ * history, scheduler dedup handles them naturally). See the canonical
+ * note above `ORPHAN_TOOL_USE_REPAIR_REASON`. qwen-code analogue of
+ * upstream Claude Code's `yieldMissingToolResultBlocks`.
  */
 /** Location of a `functionResponse` part within `history`. */
 interface FrLocation {
@@ -526,20 +502,7 @@ interface ScanResult {
   scanEnd: number;
 }
 
-/**
- * Output of the decision phase for one scanned model turn. Encodes
- * exactly which mutations the next phase should apply:
- *  - `synthesizeIds`: ids that have no matching fr anywhere — synthesize an
- *    `error` `functionResponse` for each.
- *  - `hoistedParts`: parts to MOVE into the adjacent user turn (the
- *    canonical survivor for each id whose fr lives in a non-adjacent
- *    later turn).
- *  - `removalTargets`: parts to SPLICE out of `history` — covers both
- *    hoist survivors (so they only remain in the new location) and
- *    duplicate copies of any id.
- *  - `droppedDuplicates`: callIds whose duplicates we removed; returned
- *    by the function so callers can log the cleanup.
- */
+/** Decision-phase output: exact mutations the next phase will apply. */
 interface RepairPlan {
   modelIdx: number;
   scanEnd: number;
@@ -550,15 +513,10 @@ interface RepairPlan {
 }
 
 /**
- * SCAN PHASE — collect `expected` from the `model[functionCall]` turn
- * at `modelIdx` and `matched` from every consecutive user turn that
- * follows. Pure read; no mutation.
- *
- * Storing ALL locations (not just the first) is load-bearing for the
- * duplicate case: e.g. `model[fc id=cid], user[text], user[fr cid],
- * user[fr cid]` (the React scheduler retries the late `submitQuery`).
- * The downstream decision phase needs every copy so it can drop
- * duplicates — otherwise the wire payload re-triggers the wedge.
+ * SCAN — collect every `functionCall.id → name` from the model turn at
+ * `modelIdx` and EVERY `functionResponse.id → location` from the
+ * consecutive user turns that follow. Pure read. Storing all locations
+ * (not just the first) is what lets the decision phase drop duplicates.
  */
 function scanModelTurn(history: Content[], modelIdx: number): ScanResult {
   const expected = new Map<string, string>();
@@ -587,19 +545,9 @@ function scanModelTurn(history: Content[], modelIdx: number): ScanResult {
 }
 
 /**
- * DECISION PHASE — classify each expected callId into synthesize /
- * hoist / skip-already-adjacent, and collect every duplicate copy for
- * removal. Pure compute; no mutation. The plan returned here drives
- * the mutation phase exactly.
- *
- * Classification rules per callId:
- *   - No matching fr anywhere     → SYNTHESIZE an error fr.
- *   - First match adjacent (modelIdx+1) → SKIP relocation. (Duplicates,
- *     if any, are still removed below.)
- *   - First match non-adjacent    → HOIST: move the canonical part into
- *     the adjacent user turn; remove the original location.
- * In all matched cases, drop EVERY duplicate beyond the first so the
- * wire payload contains exactly one fr per call.
+ * DECISION — classify each expected id: no match → SYNTHESIZE; first
+ * match adjacent → SKIP relocation; first match non-adjacent → HOIST.
+ * Every duplicate beyond the first is always dropped. Pure compute.
  */
 function planRepair(scan: ScanResult): RepairPlan {
   const synthesizeIds: Array<[string, string]> = [];
@@ -645,26 +593,15 @@ function planRepair(scan: ScanResult): RepairPlan {
 }
 
 /**
- * MUTATION PHASE — apply the plan to `history` in place. Returns the
- * number of new user turns inserted before `modelIdx + 1` (currently
- * always 0 or 1), which the caller uses to advance its forward-walk
- * cursor past anything the loop should not revisit.
- *
- * Order matters here:
- *  1. Splice removal targets in (turnIdx desc, partIdx desc) so earlier
- *     removals don't shift indices for later ones.
- *  2. Drop user turns within `[modelIdx + 2, scanEnd)` that are now
- *     empty after the splice. Walk back-to-front for the same reason.
- *     The immediately-adjacent turn (`modelIdx + 1`) is preserved even
- *     if empty — we rewrite its parts in step 3.
- *  3. Inject `[...synthetic, ...hoisted]` at the head of the adjacent
- *     user turn (before any non-fr parts) OR insert a new user turn
- *     between `modelIdx` and whatever follows.
+ * MUTATION — apply the plan to `history` in place. Returns the count
+ * of new user turns inserted ahead of `modelIdx + 1` (0 or 1) so the
+ * outer loop can advance its cursor.
  *
- * Step 3 inserts at the HEAD (before non-fr parts), not the tail —
- * mirrors upstream Claude Code's `hoistToolResults`. See the canonical
- * design note above `ORPHAN_TOOL_USE_REPAIR_REASON` for why a tail
- * append re-triggers the 400. Do not "simplify" the head-insert away.
+ * Order: (1) splice removal targets desc-by-desc, (2) drop empty user
+ * turns in `[modelIdx + 2, scanEnd)`, (3) HEAD-insert at the adjacent
+ * user turn OR splice a new user turn between. The HEAD insert is
+ * load-bearing (mirrors upstream `hoistToolResults`) — see the
+ * canonical note for why tail-append re-triggers the wedge.
  */
 function applyRepair(
   history: Content[],
@@ -825,58 +762,20 @@ export class GeminiChat {
   private hasFailedCompressionAttempt = false;
 
   /**
-   * Index into `this.history` of the model turn that `processStreamResponse`
-   * persisted on the CURRENT in-flight attempt's mid-stream error. `null` if
-   * no partial has been pushed (the common case).
-   *
-   * The retry loop in `sendMessageStream` reads this on every catch to roll
-   * the partial back BEFORE retrying — without that pop, a retryable
-   * mid-stream error (rate limit, transient stream anomaly) leaves the
-   * failed attempt's `model[functionCall]` in history, and the successful
-   * retry's response lands as a SECOND consecutive model turn (invalid
-   * user/model alternation, plus the failed-attempt tool_use is orphan on
-   * the wire — the very wedge this whole subsystem is meant to escape).
-   *
-   * Reset to `null` on every `sendMessageStream` entry so a marker left
-   * over from a prior unretryable break doesn't bleed into the next send.
+   * Partial-push markers — index of the in-memory `model[partial fc]`
+   * and the matching deferred JSONL record. See the canonical note
+   * above `ORPHAN_TOOL_USE_REPAIR_REASON` for the lifecycle and the
+   * wedge they prevent.
    */
   private pendingPartialAssistantTurnIndex: number | null = null;
-
-  /**
-   * Deferred-flush record for the partial assistant turn pushed on a
-   * mid-stream error. Stashed instead of immediately appended to the
-   * chat-recording JSONL so a subsequent retry-success can roll back the
-   * persisted record alongside the in-memory pop. Without deferral, the
-   * JSONL transcript keeps the failed attempt even though live history
-   * dropped it — `--resume` rehydrates the failed `model[functionCall]`
-   * turn and the resumed model context picks up a tool_use the in-session
-   * run intentionally discarded.
-   *
-   * Lifecycle is paired with `pendingPartialAssistantTurnIndex`:
-   *  - Set together on stream-error + hasToolCall + hasContent.
-   *  - Cleared together by `popPartialIfPushed` when the retry loop
-   *    rolls the partial back.
-   *  - Flushed together to JSONL after the retry loop exits if the
-   *    partial survived (unretryable break path → about to throw to
-   *    the caller). At that point the partial is durable in memory and
-   *    the recording must match.
-   *  - Reset on `sendMessageStream` entry / setHistory / clearHistory /
-   *    truncateHistory / addHistory / stripThoughtsFromHistory so a leak
-   *    from any exotic path can't bleed into a future send.
-   */
   private pendingPartialAssistantRecord:
     | Parameters<ChatRecordingService['recordAssistantTurn']>[0]
     | null = null;
 
   /**
-   * Reset both partial-push markers in lockstep. Extracted so the seven
-   * call sites that need to drop both fields (sendMessageStream entry,
-   * popPartialIfPushed, clearHistory, addHistory, setHistory,
-   * truncateHistory, stripThoughtsFromHistory) can't drift apart — a
-   * future history-mutating method that only clears one would leak the
-   * other into a later flush. The fields ARE always paired by lifecycle
-   * (set together on stream-error stash, popped together on retry, flushed
-   * together at the rethrow site), so any single-field reset is a bug.
+   * Reset both partial-push markers in lockstep. Every history-mutation
+   * site uses this — single-field resets are a bug because the fields
+   * are always paired by lifecycle.
    */
   private clearPendingPartialState(): void {
     this.pendingPartialAssistantTurnIndex = null;
@@ -1087,34 +986,13 @@ export class GeminiChat {
       // Add user content to history ONCE before any attempts.
       this.history.push(userContent);
       userContentAdded = true;
-      // Close any dangling `model[functionCall]` whose `functionResponse`
-      // never landed by the time we compose the request. Runs AFTER the
-      // user-supplied turn lands so a tool_result the user is supplying
-      // gets the first chance to close the pair before we synthesize an
-      // `error` `functionResponse`. Covers:
-      //   - Stream errored mid-tool_use (partial assistant push left a
-      //     dangling functionCall), then the React scheduler's eventual
-      //     tool_result lost the race against a Ctrl+Y retry whose
-      //     onAllToolCallsComplete fired into `isResponding=true` and
-      //     skipped submission.
-      //   - The same shape from a process crash / OOM mid-flight (the
-      //     transcript JSONL preserves the dangling model[fc] across
-      //     `--resume`; `startChat()` calls this once on load, but a
-      //     belt-and-suspenders pass here covers anything that slipped
-      //     past — including dangling shapes the load-time repair didn't
-      //     visit because compaction / setHistory ran after it).
-      // The React scheduler's late real result is then dedup'd against
-      // chat.history in `useGeminiStream.handleCompletedTools` so the
-      // synthetic doesn't collide with it on the wire.
-      //
-      // Diagnostic: log non-empty inline-repair results. The startChat()
-      // path logs synthesis events through `repairOrphanedToolUseTurnsInHistory`
-      // (`[REPAIR] Synthesized N functionResponse(s) ...`) and dedup events
-      // through `useGeminiStream.handleCompletedTools` (`[REPAIR] Dropping ...`),
-      // but this inline call site was previously silent — when a dedup-drop
-      // log shows up, investigators had no way to tell whether the
-      // synthetic was planted at session-load or at this per-send pass.
-      // Tag the log site so the lifecycle anchor is unambiguous.
+      // Per-send orphan repair (belt-and-suspenders alongside the
+      // startChat load-time pass). Runs AFTER user content lands so a
+      // user-supplied tool_result closes the pair before we synthesize
+      // anything. Logs are tagged so investigators can distinguish this
+      // pass from the session-load pass and from the React scheduler's
+      // dedup-drop. See the canonical note above
+      // `ORPHAN_TOOL_USE_REPAIR_REASON`.
       const inlineRepair = repairOrphanedToolUseTurns(this.history);
       if (inlineRepair.injected.length > 0) {
         debugLogger.warn(
@@ -1126,8 +1004,6 @@ export class GeminiChat {
         );
       }
       if (inlineRepair.droppedDuplicates.length > 0) {
-        // Symmetrical with the synthesis log: a duplicate-only repair
-        // (no synthesis, no hoist) here would otherwise be silent.
         debugLogger.warn(
           `[REPAIR] sendMessageStream inline pass dropped ` +
             `${inlineRepair.droppedDuplicates.length} duplicate ` +
@@ -1672,59 +1548,19 @@ export class GeminiChat {
         }
       } finally {
         streamDoneResolver!();
-        // Flush any deferred partial-tool_use record into the JSONL
-        // transcript. The retry loop and the post-loop max-tokens
-        // escalation block can BOTH leave one of these on the chat:
-        //
-        //  - Retry loop: any partial rolled back by popPartialIfPushed
-        //    has its stash cleared; any partial that survived (success
-        //    break with no partial set, or unretryable break with the
-        //    partial kept) leaves its record set so we record-and-clear
-        //    it here.
-        //  - Max-tokens escalation: the escalated stream re-enters
-        //    `processStreamResponse`, which sets a NEW
-        //    `pendingPartialAssistantRecord` if it errors mid-tool_use.
-        //    That throw propagates through the for-await above without
-        //    touching the (now-passed) retry-loop catch, so without a
-        //    flush in `finally` the partial would be live in
-        //    `this.history` (the escalated processStreamResponse already
-        //    pushed it) but absent from the JSONL transcript. `--resume`
-        //    would then rehydrate a truncated transcript whose live
-        //    history disagrees with disk, and
-        //    `repairOrphanedToolUseTurnsInHistory` would find nothing to
-        //    repair on load — the React scheduler's late real result
-        //    becomes a permanent orphan, reproducing the exact wedge
-        //    this PR prevents.
-        //
-        // Putting the flush in `finally` covers ALL throw paths
-        // (escalation, post-retry-loop `throw lastError`, the for-await
-        // consumer's `.return()` if it abandons the generator) and the
-        // normal completion path with a single statement. The marker
-        // and stash are dropped together to preserve the
-        // "marker non-null ⇔ stash non-null" invariant.
+        // Flush any deferred partial-tool_use record. Covers both the
+        // post-retry-loop unretryable break AND the max-tokens
+        // escalation throw (the escalated processStreamResponse can
+        // set a new record that escapes the retry-loop catch).
+        // Recording-service errors are logged at error level (sustained
+        // failure = monitoring signal) and swallowed — propagating
+        // would mask the real send outcome.
         if (self.pendingPartialAssistantRecord) {
-          // Recording-service errors (disk full, write permission,
-          // serialization failure) MUST NOT propagate out of the
-          // generator's `finally` — that would mask the real send
-          // outcome (success or original throw) with a JSONL-write
-          // error the caller can't usefully act on. Instead, log and
-          // drop the record: the partial is already durable in
-          // `this.history`, so live behavior is unaffected; only the
-          // disk transcript loses this turn (eventual consistency
-          // restored on the next successful flush of any other turn).
           try {
             self.chatRecordingService?.recordAssistantTurn(
               self.pendingPartialAssistantRecord,
             );
           } catch (recordErr) {
-            // Error-level (not warn): a persistent JSONL write failure
-            // (disk full, permission, serialization) silently loses
-            // every deferred partial after this point — exactly the
-            // class of failure that warrants monitoring attention.
-            // Transient failures still bubble through as a single
-            // error per occurrence; if logs are spammed it's an
-            // operational signal that the recording layer is broken,
-            // not noise.
             debugLogger.error(
               '[PARTIAL_FLUSH] Failed to persist deferred JSONL record: ' +
                 (recordErr instanceof Error
@@ -1895,17 +1731,10 @@ export class GeminiChat {
   }
 
   /**
-   * Returns the set of `functionResponse.id` values present anywhere
-   * in user turns of the raw chat history. Walks `this.history` in
-   * place — no `structuredClone`, no per-part copy — so it is safe to
-   * call on hot paths (e.g. on every tool-completion batch in
-   * `useGeminiStream.handleCompletedTools`).
-   *
-   * The dedup pass only needs id strings; routing it through
-   * {@link getHistory} would deep-clone the entire conversation
-   * (recursive `structuredClone` over every part, including large tool
-   * outputs) on every batch and visibly stall the React UI thread on
-   * long sessions (200+ entries with sizable tool results).
+   * Set of `functionResponse.id` strings in user turns. Walk-only,
+   * no clone — `useGeminiStream.handleCompletedTools` calls this per
+   * tool-completion batch, so {@link getHistory}'s `structuredClone`
+   * would stall the UI on long sessions.
    */
   getHistoryFunctionResponseIds(): Set<string> {
     const ids = new Set<string>();
@@ -2019,14 +1848,8 @@ export class GeminiChat {
   }
 
   /**
-   * Repair the inverse of `stripOrphanedUserEntriesFromHistory`: close every
-   * dangling `model[functionCall]` whose corresponding `user[functionResponse]`
-   * never landed (e.g. process crash between the partial-tool_use push and
-   * tool completion, or Ctrl+Y race before in-flight scheduler completed).
-   *
-   * Returns the list of synthesized `(callId, name)` tuples so the React
-   * tool scheduler can dedupe its eventual real `tool_result` for those
-   * callIds (see `handleCompletedTools` in `useGeminiStream.ts`).
+   * Instance wrapper around the free-function {@link repairOrphanedToolUseTurns}.
+   * See the canonical note above `ORPHAN_TOOL_USE_REPAIR_REASON`.
    */
   repairOrphanedToolUseTurns(reason?: string): {
     injected: Array<{ callId: string; name: string }>;