From 4a597bce3db7cab1dfcef53a6dccc0ef0478ef0d Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 15:44:37 +0800
Subject: [PATCH 01/15] docs: add async memory recall design spec and
 implementation plan

---
 .../2026-05-15-async-memory-recall-design.md  | 172 ++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 docs/design/2026-05-15-async-memory-recall-design.md

diff --git a/docs/design/2026-05-15-async-memory-recall-design.md b/docs/design/2026-05-15-async-memory-recall-design.md
new file mode 100644
index 00000000000..b078d52c97a
--- /dev/null
+++ b/docs/design/2026-05-15-async-memory-recall-design.md
@@ -0,0 +1,172 @@
+# Async Memory Recall — Design Spec
+
+**Date:** 2026-05-15
+**Status:** Approved
+**Related issues:** #3761, #3759
+**Related PRs:** #3814, #3866
+
+---
+
+## Problem
+
+`relevanceSelector.ts` uses `AbortSignal.timeout(1_000)` (introduced by #3866). On first-session cold starts, qwen3.5-flash averages ~908 ms — consistently hitting the 1 s threshold. The outer 2.5 s deadline in `resolveAutoMemoryWithDeadline` means every UserQuery can block for up to 2.5 s even when recall always fails.
+
+Root cause: the main-agent request path `await`s the recall result before sending to the model. Any slowness in the recall side-query directly adds to user-visible latency.
+
+---
+
+## Design
+
+### Core idea
+
+Fire recall on UserQuery and never await it. Consume the result at two opportunistic points — whichever fires first:
+
+1. **UserQuery consume point** — synchronous `settledAt !== null` check just before `turn.run()`. Zero-wait: if already settled, use it; if not, skip.
+2. **ToolResult inject point** — same check on every ToolResult turn. Injects memory as a `system-reminder` prepended to the tool-result message, giving the model memory context before its next response.
+
+This matches the pattern used by Claude Code upstream (`startRelevantMemoryPrefetch` / `settledAt` polling in `query.ts`).
+
+---
+
+## Data structures
+
+### New type `MemoryPrefetchHandle` (in `client.ts`)
+
+```typescript
+type MemoryPrefetchHandle = {
+  promise: Promise<RelevantAutoMemoryPromptResult>;
+  /** Set by promise.finally(). null until the promise settles. */
+  settledAt: number | null;
+  /** True after memory has been injected — prevents double-inject. */
+  consumed: boolean;
+  controller: AbortController;
+};
+```
+
+### Field change on `GeminiClient`
+
+| Remove                                                       | Add                                                        |
+| ------------------------------------------------------------ | ---------------------------------------------------------- |
+| `pendingRecallAbortController: AbortController \| undefined` | `pendingMemoryPrefetch: MemoryPrefetchHandle \| undefined` |
+
+---
+
+## Changes
+
+### 1. `client.ts` — remove `resolveAutoMemoryWithDeadline`
+
+Delete the function entirely. It is replaced by the `settledAt` flag mechanism.
+
+### 2. `client.ts` — UserQuery fire path
+
+Replace the `resolveAutoMemoryWithDeadline` call with:
+
+```typescript
+const controller = new AbortController();
+const promise = this.config
+  .getMemoryManager()
+  .recall(projectRoot, partToString(request), {
+    config: this.config,
+    excludedFilePaths: this.surfacedRelevantAutoMemoryPaths,
+    abortSignal: controller.signal,
+  })
+  .catch((error: unknown) => {
+    if (!(error instanceof DOMException && error.name === 'AbortError')) {
+      debugLogger.warn('Managed auto-memory recall prefetch failed.', error);
+    }
+    return EMPTY_RELEVANT_AUTO_MEMORY_RESULT;
+  });
+
+const handle: MemoryPrefetchHandle = {
+  promise,
+  settledAt: null,
+  consumed: false,
+  controller,
+};
+void promise.finally(() => {
+  handle.settledAt = Date.now();
+});
+this.pendingMemoryPrefetch = handle;
+// no await — continue immediately
+```
+
+### 3. `client.ts` — UserQuery consume point (replaces `await relevantAutoMemoryPromise`)
+
+```typescript
+const prefetchHandle = this.pendingMemoryPrefetch;
+if (
+  prefetchHandle &&
+  prefetchHandle.settledAt !== null &&
+  !prefetchHandle.consumed
+) {
+  prefetchHandle.consumed = true;
+  this.pendingMemoryPrefetch = undefined;
+  const result = await prefetchHandle.promise; // already settled, returns immediately
+  if (result.prompt) {
+    systemReminders.push(result.prompt);
+    for (const doc of result.selectedDocs) {
+      this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+    }
+  }
+}
+```
+
+### 4. `client.ts` — ToolResult inject point (new)
+
+After `requestToSend` is assembled, before `turn.run()`, add:
+
+```typescript
+if (messageType === SendMessageType.ToolResult) {
+  const prefetchHandle = this.pendingMemoryPrefetch;
+  if (
+    prefetchHandle &&
+    prefetchHandle.settledAt !== null &&
+    !prefetchHandle.consumed
+  ) {
+    prefetchHandle.consumed = true;
+    this.pendingMemoryPrefetch = undefined;
+    const result = await prefetchHandle.promise;
+    if (result.prompt) {
+      requestToSend = [result.prompt, ...requestToSend];
+      for (const doc of result.selectedDocs) {
+        this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+      }
+    }
+  }
+}
+```
+
+### 5. `client.ts` — cleanup paths (6 locations)
+
+Replace all `pendingRecallAbortController?.abort()` + `= undefined` with:
+
+```typescript
+this.pendingMemoryPrefetch?.controller.abort();
+this.pendingMemoryPrefetch = undefined;
+```
+
+Locations: `resetChat()`, MaxSessionTurns early-return, boundedTurns=0 early-return, SessionTokenLimitExceeded early-return, Arena control signal early-return, post-consume clear.
+
+### 6. `relevanceSelector.ts` — remove `AbortSignal.timeout(1_000)`
+
+Remove the combined `AbortSignal.any([AbortSignal.timeout(1_000), callerAbortSignal])` and pass `callerAbortSignal` directly.
+
+---
+
+## Behaviour comparison
+
+| Scenario                                     | Before                         | After                                                  |
+| -------------------------------------------- | ------------------------------ | ------------------------------------------------------ |
+| recall completes before model prep           | inject on UserQuery, ~0 wait   | inject on UserQuery, ~0 wait                           |
+| recall slow (cold start)                     | block up to 2.5 s              | skip UserQuery, inject on first ToolResult             |
+| recall times out (1 s)                       | abort, empty result, no memory | no hard timeout; inject whenever settled               |
+| no tool calls, recall slow                   | block up to 2.5 s, then skip   | skip UserQuery, no ToolResult opportunity — miss       |
+| user sends 2nd message before recall settles | 2nd recall races 1st handle    | 1st handle aborted when 2nd UserQuery fires new handle |
+
+---
+
+## Out of scope
+
+- Changing the memory injection format from `system-reminder` to `tool-result` attachment (CC style)
+- Per-session byte budget skip gate
+- Single-word prompt skip gate

From 2d0b4151df3fa73d1792e72c77d7e08ce64d5128 Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 15:50:04 +0800
Subject: [PATCH 02/15] refactor(core): introduce MemoryPrefetchHandle, replace
 pendingRecallAbortController field

---
 packages/core/src/core/client.ts | 46 +++++++-------------------------
 1 file changed, 9 insertions(+), 37 deletions(-)

diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 69447064c2c..d29316b24cb 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -149,42 +149,14 @@ function wrapIdeContext(contextText: string): string {
   return `<system-reminder>\n${safeContextText}\n</system-reminder>`;
 }
 
-/**
- * Resolve the auto-memory recall promise with a hard deadline.
- * If the recall (model-driven selection + heuristic fallback) does not complete
- * within the deadline, return an empty result so the main request is not delayed.
- *
- * The deadline is set slightly above the model-driven selector's own
- * AbortSignal.timeout (2s) to give the heuristic fallback time to complete,
- * but low enough that the user does not perceive a delay on every turn.
- */
-async function resolveAutoMemoryWithDeadline(
-  promise: Promise<RelevantAutoMemoryPromptResult> | undefined,
-  onDeadline: () => void,
-): Promise<RelevantAutoMemoryPromptResult> {
-  if (!promise) {
-    return EMPTY_RELEVANT_AUTO_MEMORY_RESULT;
-  }
-
-  let timer: ReturnType<typeof setTimeout> | undefined;
-  const deadline = new Promise<RelevantAutoMemoryPromptResult>((resolve) => {
-    timer = setTimeout(() => {
-      try {
-        onDeadline();
-      } finally {
-        resolve(EMPTY_RELEVANT_AUTO_MEMORY_RESULT);
-      }
-    }, 2_500);
-  });
-
-  try {
-    return await Promise.race([promise, deadline]);
-  } finally {
-    if (timer !== undefined) {
-      clearTimeout(timer);
-    }
-  }
-}
+type MemoryPrefetchHandle = {
+  promise: Promise<RelevantAutoMemoryPromptResult>;
+  /** Set by promise.finally(). null until the promise settles. */
+  settledAt: number | null;
+  /** True after memory has been injected — prevents double-inject. */
+  consumed: boolean;
+  controller: AbortController;
+};
 
 /** Tools that can write to the skills directory, used to detect skillsModifiedInSession. */
 const SKILL_WRITE_TOOL_NAMES: ReadonlySet<string> = new Set([
@@ -204,7 +176,7 @@ export class GeminiClient {
   private lastPromptId: string | undefined = undefined;
   private lastSentIdeContext: IdeContext | undefined;
   private forceFullIdeContext = true;
-  private pendingRecallAbortController: AbortController | undefined;
+  private pendingMemoryPrefetch: MemoryPrefetchHandle | undefined;
   private lastSessionStartContext: string | undefined;
   private lastSessionStartSource: SessionStartSource | undefined;
 

From 069045e7cf064a63d71e50dcbf0bc3fbbbe3d1c0 Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 15:51:02 +0800
Subject: [PATCH 03/15] refactor(core): fire memory recall as non-blocking
 prefetch with settledAt flag

---
 packages/core/src/core/client.ts | 35 +++++++++++++-------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index d29316b24cb..9f013bc0b99 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -974,9 +974,6 @@ export class GeminiClient {
     turns: number = MAX_TURNS,
   ): AsyncGenerator<ServerGeminiStreamEvent, Turn> {
     const messageType = options?.type ?? SendMessageType.UserQuery;
-    let relevantAutoMemoryPromise:
-      | Promise<RelevantAutoMemoryPromptResult>
-      | undefined;
 
     if (messageType === SendMessageType.Retry) {
       this.stripOrphanedUserEntriesFromHistory();
@@ -1062,24 +1059,18 @@ export class GeminiClient {
         messageType === SendMessageType.Cron
       ) {
         if (this.config.getManagedAutoMemoryEnabled()) {
-          const recallAbortController = new AbortController();
-          const rawRecallPromise = this.config
+          const controller = new AbortController();
+          const promise = this.config
             .getMemoryManager()
             .recall(this.config.getProjectRoot(), partToString(request), {
               config: this.config,
               excludedFilePaths: this.surfacedRelevantAutoMemoryPaths,
-              abortSignal: recallAbortController.signal,
+              abortSignal: controller.signal,
             })
             .catch((error: unknown) => {
               if (
-                error instanceof DOMException &&
-                error.name === 'AbortError'
+                !(error instanceof DOMException && error.name === 'AbortError')
               ) {
-                debugLogger.debug(
-                  'Auto-memory recall aborted by deadline.',
-                  error,
-                );
-              } else {
                 debugLogger.warn(
                   'Managed auto-memory recall prefetch failed.',
                   error,
@@ -1087,14 +1078,16 @@ export class GeminiClient {
               }
               return EMPTY_RELEVANT_AUTO_MEMORY_RESULT;
             });
-          this.pendingRecallAbortController = recallAbortController;
-          // Race the recall against the deadline at initiation time so the 2.5s
-          // budget is not consumed by intermediate work (microcompact, compression,
-          // token checks, IDE context) between initiation and consumption.
-          relevantAutoMemoryPromise = resolveAutoMemoryWithDeadline(
-            rawRecallPromise,
-            () => recallAbortController.abort(),
-          );
+          const handle: MemoryPrefetchHandle = {
+            promise,
+            settledAt: null,
+            consumed: false,
+            controller,
+          };
+          void promise.finally(() => {
+            handle.settledAt = Date.now();
+          });
+          this.pendingMemoryPrefetch = handle;
         }
 
         // Track prompt count for commit attribution. Only the user typing a

From 347091bc380322f01545b82aad2bba37dabe547d Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 15:53:44 +0800
Subject: [PATCH 04/15] refactor(core): replace blocking await with zero-wait
 settledAt poll at UserQuery consume point

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.ts | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 9f013bc0b99..d29712b9a88 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -1276,18 +1276,22 @@ export class GeminiClient {
         messageType === SendMessageType.Cron
       ) {
         const systemReminders = [];
-        // The recall promise was already raced against the 2.5s deadline at
-        // initiation time; this await just collects the result.
-        this.pendingRecallAbortController = undefined;
-        const relevantAutoMemory = relevantAutoMemoryPromise
-          ? await relevantAutoMemoryPromise
-          : EMPTY_RELEVANT_AUTO_MEMORY_RESULT;
-        const relevantAutoMemoryPrompt = relevantAutoMemory.prompt;
-
-        if (relevantAutoMemoryPrompt) {
-          systemReminders.push(relevantAutoMemoryPrompt);
-          for (const doc of relevantAutoMemory.selectedDocs) {
-            this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+        // Zero-wait poll: consume only if the prefetch has already settled.
+        // If not settled yet, skip — the ToolResult inject point will retry.
+        const prefetchHandle = this.pendingMemoryPrefetch;
+        if (
+          prefetchHandle &&
+          prefetchHandle.settledAt !== null &&
+          !prefetchHandle.consumed
+        ) {
+          prefetchHandle.consumed = true;
+          this.pendingMemoryPrefetch = undefined;
+          const result = await prefetchHandle.promise; // already settled, returns immediately
+          if (result.prompt) {
+            systemReminders.push(result.prompt);
+            for (const doc of result.selectedDocs) {
+              this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+            }
           }
         }
 

From 5a0b5fe9d73052c27e4c13f63bdc385c789ce4b2 Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 15:54:28 +0800
Subject: [PATCH 05/15] feat(core): inject recalled memory on first ToolResult
 when UserQuery consume point misses

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.ts | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index d29712b9a88..e15b323be88 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -1331,6 +1331,25 @@ export class GeminiClient {
         requestToSend = [...systemReminders, ...requestToSend];
       }
 
+      if (messageType === SendMessageType.ToolResult) {
+        const prefetchHandle = this.pendingMemoryPrefetch;
+        if (
+          prefetchHandle &&
+          prefetchHandle.settledAt !== null &&
+          !prefetchHandle.consumed
+        ) {
+          prefetchHandle.consumed = true;
+          this.pendingMemoryPrefetch = undefined;
+          const result = await prefetchHandle.promise;
+          if (result.prompt) {
+            requestToSend = [result.prompt, ...requestToSend];
+            for (const doc of result.selectedDocs) {
+              this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+            }
+          }
+        }
+      }
+
       const resultStream = turn.run(model, requestToSend, signal);
       let didUpdateIdeContextState = false;
       for await (const event of resultStream) {

From 4bd5905c03d96cec67158d00d5bff8e49c09198f Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 15:55:38 +0800
Subject: [PATCH 06/15] refactor(core): replace pendingRecallAbortController
 with pendingMemoryPrefetch in all cleanup paths

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.ts | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index e15b323be88..6e4a50bc864 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -386,10 +386,8 @@ export class GeminiClient {
     this.config.getBaseLlmClient().clearPerModelGeneratorCache();
     // Abort any in-flight auto-memory recall so the stale controller
     // does not leak into the next session.
-    if (this.pendingRecallAbortController) {
-      this.pendingRecallAbortController.abort();
-      this.pendingRecallAbortController = undefined;
-    }
+    this.pendingMemoryPrefetch?.controller.abort();
+    this.pendingMemoryPrefetch = undefined;
     // Drop any deferred tools revealed this session so /clear really gives
     // a clean slate. We don't clear inside startChat itself because that path
     // is also taken by compression (which preserves the session), and
@@ -1158,8 +1156,8 @@ export class GeminiClient {
           this.config.getMaxSessionTurns() > 0 &&
           this.sessionTurnCount > this.config.getMaxSessionTurns()
         ) {
-          this.pendingRecallAbortController?.abort();
-          this.pendingRecallAbortController = undefined;
+          this.pendingMemoryPrefetch?.controller.abort();
+          this.pendingMemoryPrefetch = undefined;
           yield { type: GeminiEventType.MaxSessionTurns };
           if (isTopLevelInteraction)
             endInteractionSpan('error', {
@@ -1172,8 +1170,8 @@ export class GeminiClient {
       // Ensure turns never exceeds MAX_TURNS to prevent infinite loops
       const boundedTurns = Math.min(turns, MAX_TURNS);
       if (!boundedTurns) {
-        this.pendingRecallAbortController?.abort();
-        this.pendingRecallAbortController = undefined;
+        this.pendingMemoryPrefetch?.controller.abort();
+        this.pendingMemoryPrefetch = undefined;
         if (isTopLevelInteraction)
           endInteractionSpan('error', { errorMessage: 'max turns exhausted' });
         return new Turn(this.getChat(), prompt_id);
@@ -1188,8 +1186,8 @@ export class GeminiClient {
         const lastPromptTokenCount =
           uiTelemetryService.getLastPromptTokenCount();
         if (lastPromptTokenCount > sessionTokenLimit) {
-          this.pendingRecallAbortController?.abort();
-          this.pendingRecallAbortController = undefined;
+          this.pendingMemoryPrefetch?.controller.abort();
+          this.pendingMemoryPrefetch = undefined;
           yield {
             type: GeminiEventType.SessionTokenLimitExceeded,
             value: {
@@ -1249,8 +1247,8 @@ export class GeminiClient {
             `Arena control signal received: ${controlSignal.type} - ${controlSignal.reason}`,
           );
           await arenaAgentClient.reportCancelled();
-          this.pendingRecallAbortController?.abort();
-          this.pendingRecallAbortController = undefined;
+          this.pendingMemoryPrefetch?.controller.abort();
+          this.pendingMemoryPrefetch = undefined;
           if (isTopLevelInteraction) endInteractionSpan('cancelled');
           return new Turn(this.getChat(), prompt_id);
         }

From 65bd06176dc687c04df1219736c802e036ca2f8f Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 15:56:40 +0800
Subject: [PATCH 07/15] =?UTF-8?q?refactor(memory):=20remove=201s=20AbortSi?=
 =?UTF-8?q?gnal.timeout=20from=20relevanceSelector=20=E2=80=94=20caller=20?=
 =?UTF-8?q?controls=20lifetime?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/memory/relevanceSelector.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/packages/core/src/memory/relevanceSelector.ts b/packages/core/src/memory/relevanceSelector.ts
index 8962eb94d0c..6fa8bed6d53 100644
--- a/packages/core/src/memory/relevanceSelector.ts
+++ b/packages/core/src/memory/relevanceSelector.ts
@@ -91,9 +91,7 @@ export async function selectRelevantAutoMemoryDocumentsByModel(
     purpose: 'auto-memory-recall',
     contents,
     schema: RESPONSE_SCHEMA,
-    abortSignal: callerAbortSignal
-      ? AbortSignal.any([AbortSignal.timeout(1_000), callerAbortSignal])
-      : AbortSignal.timeout(1_000),
+    abortSignal: callerAbortSignal ?? new AbortController().signal,
 
     // Uses runSideQuery's default side-query model policy: fast model first,
     // then main session model when no fast model is configured.

From 064427c4c2db15c78325c433d5ed881005e8c27e Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 16:01:02 +0800
Subject: [PATCH 08/15] =?UTF-8?q?test(core):=20update=20auto-memory=20test?=
 =?UTF-8?q?s=20for=20async=20prefetch=20pattern=20=E2=80=94=20drop=20fake?=
 =?UTF-8?q?=20timers=20and=20deadline=20references?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.test.ts | 65 +++++++++------------------
 1 file changed, 22 insertions(+), 43 deletions(-)

diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index 682853d1e3b..39e215f6ec2 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -2303,20 +2303,9 @@ hello
     });
 
     it('should not block the main request when auto-memory recall is slow', async () => {
-      // Simulate a recall that takes longer than the 2.5s deadline
-      mockMemoryManager.recall.mockReturnValue(
-        new Promise((resolve) =>
-          setTimeout(
-            () =>
-              resolve({
-                prompt: '## Relevant memory\n\nSlow memory result.',
-                selectedDocs: [],
-                strategy: 'model',
-              }),
-            10_000,
-          ),
-        ),
-      );
+      // Recall never settles — settledAt stays null so the UserQuery consume
+      // point skips it and turn.run() is called immediately without memory.
+      mockMemoryManager.recall.mockReturnValue(new Promise(() => {}));
 
       const mockStream = (async function* () {
         yield { type: 'content', value: 'Hello' };
@@ -2329,38 +2318,28 @@ hello
       };
       client['chat'] = mockChat as GeminiChat;
 
-      vi.useFakeTimers();
-      try {
-        const streamPromise = (async () => {
-          const stream = client.sendMessageStream(
-            [{ text: 'Quick question' }],
-            new AbortController().signal,
-            'prompt-id-slow-memory',
-          );
-          for await (const _ of stream) {
-            // consume stream
-          }
-        })();
-
-        // Advance past the 2.5s deadline — the main request should proceed
-        await vi.advanceTimersByTimeAsync(3_000);
-        await streamPromise;
-
-        // The main request should have been called without the slow memory
-        expect(mockTurnRunFn).toHaveBeenCalledWith(
-          'test-model',
-          expect.not.arrayContaining([
-            expect.stringContaining('Slow memory result'),
-          ]),
-          expect.any(AbortSignal),
-        );
-      } finally {
-        vi.useRealTimers();
+      const stream = client.sendMessageStream(
+        [{ text: 'Quick question' }],
+        new AbortController().signal,
+        'prompt-id-slow-memory',
+      );
+      for await (const _ of stream) {
+        // consume stream
       }
+
+      // turn.run() must have been called without the slow memory
+      expect(mockTurnRunFn).toHaveBeenCalledWith(
+        'test-model',
+        expect.not.arrayContaining([
+          expect.stringContaining('Slow memory result'),
+        ]),
+        expect.any(AbortSignal),
+      );
     });
 
-    it('should include auto-memory prompt when recall completes within deadline', async () => {
-      // Simulate a fast recall that completes well within the deadline
+    it('should inject auto-memory at UserQuery consume point when recall already settled', async () => {
+      // mockResolvedValue settles synchronously; by the time the consume-point
+      // check runs (after at least one await), settledAt is set.
       mockMemoryManager.recall.mockResolvedValue({
         prompt: '## Relevant memory\n\nFast memory result.',
         selectedDocs: [],

From d9946a395409703b323ff874574c37def7208ff7 Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 16:04:30 +0800
Subject: [PATCH 09/15] =?UTF-8?q?test(core):=20add=20ToolResult=20inject?=
 =?UTF-8?q?=20test=20=E2=80=94=20memory=20injected=20on=20first=20ToolResu?=
 =?UTF-8?q?lt=20when=20recall=20settles=20after=20UserQuery?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.test.ts | 81 +++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index 39e215f6ec2..e6bb73b56e7 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -2373,6 +2373,87 @@ hello
       );
     });
 
+    it('should inject auto-memory on first ToolResult when recall settles after UserQuery', async () => {
+      // Controllable promise — recall stays pending across the UserQuery turn
+      // and only settles before the ToolResult turn runs.
+      let resolveRecall:
+        | ((value: {
+            prompt: string;
+            selectedDocs: never[];
+            strategy: 'model';
+          }) => void)
+        | undefined;
+      mockMemoryManager.recall.mockReturnValue(
+        new Promise((resolve) => {
+          resolveRecall = resolve;
+        }),
+      );
+
+      const mockStream = (async function* () {
+        yield { type: 'content', value: 'Hello' };
+      })();
+      mockTurnRunFn.mockReturnValue(mockStream);
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+
+      // Turn 1: UserQuery — recall still pending, no injection
+      const userStream = client.sendMessageStream(
+        [{ text: 'What is my name?' }],
+        new AbortController().signal,
+        'prompt-id-user-query',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of userStream) {
+        // consume
+      }
+
+      expect(mockTurnRunFn).toHaveBeenLastCalledWith(
+        'test-model',
+        expect.not.arrayContaining([
+          expect.stringContaining('Deferred memory result'),
+        ]),
+        expect.any(AbortSignal),
+      );
+
+      // Recall settles between turns
+      resolveRecall!({
+        prompt: '## Relevant memory\n\nDeferred memory result.',
+        selectedDocs: [],
+        strategy: 'model',
+      });
+      // Drain microtasks so the settledAt finally() callback runs
+      await Promise.resolve();
+      await Promise.resolve();
+
+      // Turn 2: ToolResult — settledAt is now non-null, memory should inject
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'world' };
+        })(),
+      );
+      const toolStream = client.sendMessageStream(
+        [{ functionResponse: { name: 'foo', response: { ok: true } } }],
+        new AbortController().signal,
+        'prompt-id-tool-result',
+        { type: SendMessageType.ToolResult },
+      );
+      for await (const _ of toolStream) {
+        // consume
+      }
+
+      expect(mockTurnRunFn).toHaveBeenLastCalledWith(
+        'test-model',
+        expect.arrayContaining([
+          '## Relevant memory\n\nDeferred memory result.',
+        ]),
+        expect.any(AbortSignal),
+      );
+    });
+
     it('should proceed without auto-memory when managed auto-memory is disabled', async () => {
       // When getManagedAutoMemoryEnabled returns false, no recall is initiated
       // and sendMessageStream completes without memory content

From ce02bd001d7f858b7a5432d5ba027391c6dfaeec Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 17:10:50 +0800
Subject: [PATCH 10/15] fix(core): address codex review findings on async
 memory recall

Three findings fixed:

1. Abort previous prefetch before installing a new one (line 1059):
   A new UserQuery/Cron used to overwrite pendingMemoryPrefetch without
   aborting the old controller, leaking an unbounded background recall now
   that the 1s side-query timeout is gone.

2. Move the UserQuery consume poll AFTER the async reminder setup:
   ensureTool + listSubagents are awaited between the old poll location and
   the final assembly, so recalls that settled during those awaits used to
   be missed (and a tool-less turn never got a ToolResult retry). The poll
   now runs immediately before requestToSend assembly, and unshifts memory
   to the front of systemReminders to preserve ordering.

3. Append memory after functionResponse on ToolResult turns:
   The Qwen API requires the functionResponse part to immediately follow
   the model's functionCall (see lines 1209-1213). Prepending memory text
   risked breaking that pairing on the native Gemini path. Appending keeps
   the pair intact on Gemini and produces the same OpenAI output (text
   becomes a separate user message after the tool messages).

Tests:
- Updated ToolResult inject test to assert memory index > functionResponse
- Added abort-previous-prefetch test (mid-flight UserQuery aborts old handle)

224/224 tests pass; tsc clean on changed files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.test.ts | 70 ++++++++++++++++++++++++---
 packages/core/src/core/client.ts      | 53 ++++++++++++--------
 2 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index e6bb73b56e7..0ae77516d86 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -2445,13 +2445,71 @@ hello
         // consume
       }
 
-      expect(mockTurnRunFn).toHaveBeenLastCalledWith(
-        'test-model',
-        expect.arrayContaining([
-          '## Relevant memory\n\nDeferred memory result.',
-        ]),
-        expect.any(AbortSignal),
+      // Memory must come AFTER the functionResponse part so the Qwen API
+      // call/response pairing isn't broken (see client.ts:1209-1213).
+      const lastCallArgs = mockTurnRunFn.mock.lastCall;
+      const requestArr = lastCallArgs![1] as unknown[];
+      const functionResponseIdx = requestArr.findIndex(
+        (p) => typeof p === 'object' && p !== null && 'functionResponse' in p,
+      );
+      const memoryIdx = requestArr.findIndex(
+        (p) => p === '## Relevant memory\n\nDeferred memory result.',
+      );
+      expect(functionResponseIdx).toBeGreaterThanOrEqual(0);
+      expect(memoryIdx).toBeGreaterThan(functionResponseIdx);
+    });
+
+    it('should abort the previous prefetch when a new UserQuery arrives mid-flight', async () => {
+      // Pending recall on first UserQuery — never resolves on its own.
+      const abortSignals: AbortSignal[] = [];
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        abortSignals.push(opts.abortSignal as AbortSignal);
+        return new Promise(() => {});
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'Hello' };
+        })(),
       );
+
+      // First UserQuery — installs prefetch #1
+      const stream1 = client.sendMessageStream(
+        [{ text: 'first' }],
+        new AbortController().signal,
+        'prompt-id-1',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream1) {
+        // consume
+      }
+      expect(abortSignals.length).toBe(1);
+      expect(abortSignals[0].aborted).toBe(false);
+
+      // Second UserQuery — should abort #1 before installing #2
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'Hello again' };
+        })(),
+      );
+      const stream2 = client.sendMessageStream(
+        [{ text: 'second' }],
+        new AbortController().signal,
+        'prompt-id-2',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream2) {
+        // consume
+      }
+
+      expect(abortSignals.length).toBe(2);
+      expect(abortSignals[0].aborted).toBe(true);
+      expect(abortSignals[1].aborted).toBe(false);
     });
 
     it('should proceed without auto-memory when managed auto-memory is disabled', async () => {
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 6e4a50bc864..69c420f322c 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -1057,6 +1057,11 @@ export class GeminiClient {
         messageType === SendMessageType.Cron
       ) {
         if (this.config.getManagedAutoMemoryEnabled()) {
+          // A previous recall may still be pending (slow side-query, new user
+          // turn arrived before it settled). Abort it before installing the
+          // new handle so the orphan doesn't keep running indefinitely.
+          this.pendingMemoryPrefetch?.controller.abort();
+          this.pendingMemoryPrefetch = undefined;
           const controller = new AbortController();
           const promise = this.config
             .getMemoryManager()
@@ -1274,24 +1279,6 @@ export class GeminiClient {
         messageType === SendMessageType.Cron
       ) {
         const systemReminders = [];
-        // Zero-wait poll: consume only if the prefetch has already settled.
-        // If not settled yet, skip — the ToolResult inject point will retry.
-        const prefetchHandle = this.pendingMemoryPrefetch;
-        if (
-          prefetchHandle &&
-          prefetchHandle.settledAt !== null &&
-          !prefetchHandle.consumed
-        ) {
-          prefetchHandle.consumed = true;
-          this.pendingMemoryPrefetch = undefined;
-          const result = await prefetchHandle.promise; // already settled, returns immediately
-          if (result.prompt) {
-            systemReminders.push(result.prompt);
-            for (const doc of result.selectedDocs) {
-              this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
-            }
-          }
-        }
 
         // add subagent system reminder if there are subagents
         const hasAgentTool = await this.config
@@ -1326,6 +1313,27 @@ export class GeminiClient {
           }
         }
 
+        // Zero-wait poll: consume only if the prefetch has already settled.
+        // Done AFTER the async reminder setup above so recall settling during
+        // those awaits still gets caught here. If still not settled, skip —
+        // the ToolResult inject point will retry on the next turn.
+        const prefetchHandle = this.pendingMemoryPrefetch;
+        if (
+          prefetchHandle &&
+          prefetchHandle.settledAt !== null &&
+          !prefetchHandle.consumed
+        ) {
+          prefetchHandle.consumed = true;
+          this.pendingMemoryPrefetch = undefined;
+          const result = await prefetchHandle.promise; // already settled, returns immediately
+          if (result.prompt) {
+            systemReminders.unshift(result.prompt);
+            for (const doc of result.selectedDocs) {
+              this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+            }
+          }
+        }
+
         requestToSend = [...systemReminders, ...requestToSend];
       }
 
@@ -1340,7 +1348,14 @@ export class GeminiClient {
           this.pendingMemoryPrefetch = undefined;
           const result = await prefetchHandle.promise;
           if (result.prompt) {
-            requestToSend = [result.prompt, ...requestToSend];
+            // Append (not prepend): on a ToolResult turn, requestToSend leads
+            // with functionResponse parts that must immediately follow the
+            // model's functionCall (Qwen API constraint, see lines 1209-1213).
+            // Putting the memory text after the functionResponse parts keeps
+            // the call/response pairing intact under native Gemini, while the
+            // OpenAI converter still emits the text as a separate user message
+            // after the tool messages.
+            requestToSend = [...requestToSend, result.prompt];
             for (const doc of result.selectedDocs) {
               this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
             }

From 2c41fa1970f17782e21b3e8d8a91d24231d190ba Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 17:35:40 +0800
Subject: [PATCH 11/15] docs(core): add JSDoc + clarifying comments per review
 feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Annotations only, no behavior change:
- MemoryPrefetchHandle: full JSDoc covering lifecycle (create → consume → discard)
- UserQuery consume site: explain why we unshift (front of systemReminders)
- ToolResult inject site: reference hasPendingToolCall pattern instead of
  brittle line numbers when citing the Qwen functionCall/Response constraint
- relevanceSelector.ts: explain why the side-query has no inline timeout
  (caller controls lifetime via MemoryPrefetchHandle.controller)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.ts              | 28 +++++++++++++++----
 packages/core/src/memory/relevanceSelector.ts |  4 +++
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 69c420f322c..f67cd9f3d91 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -149,6 +149,18 @@ function wrapIdeContext(contextText: string): string {
   return `<system-reminder>\n${safeContextText}\n</system-reminder>`;
 }
 
+/**
+ * Handle for a non-blocking auto-memory recall prefetch.
+ *
+ * Lifecycle:
+ *  1. Created on UserQuery/Cron — the recall promise fires immediately,
+ *     `pendingMemoryPrefetch` is set to this handle.
+ *  2. Consumed at either of two opportunistic points: a zero-wait
+ *     `settledAt !== null` poll just before the UserQuery main request,
+ *     or — if recall hadn't settled yet — on the first ToolResult turn.
+ *  3. Aborted-and-discarded by every cleanup path (resetChat,
+ *     MaxSessionTurns, etc.) or replaced when a new UserQuery arrives.
+ */
 type MemoryPrefetchHandle = {
   promise: Promise<RelevantAutoMemoryPromptResult>;
   /** Set by promise.finally(). null until the promise settles. */
@@ -1327,6 +1339,11 @@ export class GeminiClient {
           this.pendingMemoryPrefetch = undefined;
           const result = await prefetchHandle.promise; // already settled, returns immediately
           if (result.prompt) {
+            // Unshift to the front of systemReminders: on a UserQuery turn
+            // requestToSend leads with user text, so positioning memory at
+            // the very start of the system-reminder block keeps it close to
+            // the user prompt. Contrast the ToolResult path below, which
+            // must append to avoid splitting functionCall / functionResponse.
             systemReminders.unshift(result.prompt);
             for (const doc of result.selectedDocs) {
               this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
@@ -1350,11 +1367,12 @@ export class GeminiClient {
           if (result.prompt) {
             // Append (not prepend): on a ToolResult turn, requestToSend leads
             // with functionResponse parts that must immediately follow the
-            // model's functionCall (Qwen API constraint, see lines 1209-1213).
-            // Putting the memory text after the functionResponse parts keeps
-            // the call/response pairing intact under native Gemini, while the
-            // OpenAI converter still emits the text as a separate user message
-            // after the tool messages.
+            // model's functionCall (Qwen API constraint — same reason the
+            // IDE-context block above is skipped while a tool call is pending,
+            // see the `hasPendingToolCall` guard). Putting the memory text
+            // after the functionResponse parts keeps the call/response pairing
+            // intact under native Gemini; the OpenAI converter then emits the
+            // text as a separate user message after the tool messages.
             requestToSend = [...requestToSend, result.prompt];
             for (const doc of result.selectedDocs) {
               this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
diff --git a/packages/core/src/memory/relevanceSelector.ts b/packages/core/src/memory/relevanceSelector.ts
index 6fa8bed6d53..a7effd4fa38 100644
--- a/packages/core/src/memory/relevanceSelector.ts
+++ b/packages/core/src/memory/relevanceSelector.ts
@@ -91,6 +91,10 @@ export async function selectRelevantAutoMemoryDocumentsByModel(
     purpose: 'auto-memory-recall',
     contents,
     schema: RESPONSE_SCHEMA,
+    // No inline timeout — caller (`GeminiClient.MemoryPrefetchHandle`)
+    // owns cancellation and aborts via its controller on cleanup paths.
+    // A bare unsignalled AbortController fills the type when the caller
+    // doesn't pass one, since runSideQuery's `abortSignal` is required.
     abortSignal: callerAbortSignal ?? new AbortController().signal,
 
     // Uses runSideQuery's default side-query model policy: fast model first,

From f4b6aae125fad5249b5d8a439b7a70e269bafcca Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Fri, 15 May 2026 17:45:04 +0800
Subject: [PATCH 12/15] fix(core): bridge caller abort signal into memory
 prefetch + doc accuracy fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Behavior fix (addresses copilot review on client.ts:1071):
- When the parent sendMessageStream signal aborts (user Ctrl-C / Esc),
  the prefetch controller now aborts too. Previously the recall side-query
  would keep running until a later cleanup (next UserQuery / /clear / etc),
  wasting fast-model tokens on work whose result no one would consume.
- Listener uses { once: true } and is also removed in the promise's
  finally() so a long-lived parent signal doesn't accumulate listeners
  across many turns under normal completion.
- Edge case: if signal is already aborted when fire runs, abort the
  controller synchronously instead of attaching a listener.

Test:
- New regression guard: "should abort the pending prefetch when the caller
  signal aborts" — verifies the abort handler installed on the recall side
  fires once the parent signal aborts.

Doc accuracy (addresses copilot review on the design spec):
- ToolResult inject: was documented as "prepend", actual implementation
  appends to preserve functionCall/functionResponse pairing. Updated both
  the prose summary and the code sample.
- Cleanup section: was documented as 6 abort-locations including the
  "post-consume clear"; the consume sites don't actually abort (the promise
  has already settled). Reorganized as 5 abort-and-clear sites + 2
  clear-only sites with the distinction made explicit.
- Fire path snippet: added the abort-previous-prefetch line and the
  caller-signal bridge so the spec matches the current implementation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../2026-05-15-async-memory-recall-design.md  | 40 ++++++++++++++++---
 packages/core/src/core/client.test.ts         | 36 +++++++++++++++++
 packages/core/src/core/client.ts              | 13 ++++++
 3 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/docs/design/2026-05-15-async-memory-recall-design.md b/docs/design/2026-05-15-async-memory-recall-design.md
index b078d52c97a..4cbb33ede32 100644
--- a/docs/design/2026-05-15-async-memory-recall-design.md
+++ b/docs/design/2026-05-15-async-memory-recall-design.md
@@ -22,7 +22,7 @@ Root cause: the main-agent request path `await`s the recall result before sendin
 Fire recall on UserQuery and never await it. Consume the result at two opportunistic points — whichever fires first:
 
 1. **UserQuery consume point** — synchronous `settledAt !== null` check just before `turn.run()`. Zero-wait: if already settled, use it; if not, skip.
-2. **ToolResult inject point** — same check on every ToolResult turn. Injects memory as a `system-reminder` prepended to the tool-result message, giving the model memory context before its next response.
+2. **ToolResult inject point** — same check on every ToolResult turn. Injects memory as a `system-reminder` **appended after** the functionResponse parts in `requestToSend`, giving the model memory context before its next response. (Append, not prepend: the Qwen API requires the functionResponse to immediately follow the model's functionCall — see the existing `hasPendingToolCall` IDE-context skip for the same constraint.)
 
 This matches the pattern used by Claude Code upstream (`startRelevantMemoryPrefetch` / `settledAt` polling in `query.ts`).
 
@@ -62,7 +62,22 @@ Delete the function entirely. It is replaced by the `settledAt` flag mechanism.
 Replace the `resolveAutoMemoryWithDeadline` call with:
 
 ```typescript
+// Abort any in-flight prefetch from a previous UserQuery before installing
+// the new handle (prevents orphan side-queries when the user types again
+// before recall settles).
+this.pendingMemoryPrefetch?.controller.abort();
+this.pendingMemoryPrefetch = undefined;
+
 const controller = new AbortController();
+// Bridge the caller's signal into the prefetch controller so a user abort
+// (Ctrl-C / Esc) on the parent turn also terminates the recall side-query.
+const onParentAbort = () => controller.abort();
+if (signal.aborted) {
+  controller.abort();
+} else {
+  signal.addEventListener('abort', onParentAbort, { once: true });
+}
+
 const promise = this.config
   .getMemoryManager()
   .recall(projectRoot, partToString(request), {
@@ -85,6 +100,7 @@ const handle: MemoryPrefetchHandle = {
 };
 void promise.finally(() => {
   handle.settledAt = Date.now();
+  signal.removeEventListener('abort', onParentAbort);
 });
 this.pendingMemoryPrefetch = handle;
 // no await — continue immediately
@@ -127,7 +143,10 @@ if (messageType === SendMessageType.ToolResult) {
     this.pendingMemoryPrefetch = undefined;
     const result = await prefetchHandle.promise;
     if (result.prompt) {
-      requestToSend = [result.prompt, ...requestToSend];
+      // Append (not prepend) so functionResponse parts stay first
+      // and the model's functionCall/functionResponse pairing
+      // isn't broken on the native Gemini path.
+      requestToSend = [...requestToSend, result.prompt];
       for (const doc of result.selectedDocs) {
         this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
       }
@@ -136,16 +155,27 @@ if (messageType === SendMessageType.ToolResult) {
 }
 ```
 
-### 5. `client.ts` — cleanup paths (6 locations)
+### 5. `client.ts` — cleanup paths
+
+The handle is released by two distinct mechanisms:
 
-Replace all `pendingRecallAbortController?.abort()` + `= undefined` with:
+**5 abort-and-clear sites** (the prefetch is still pending, abort the controller before dropping the reference). Replace `pendingRecallAbortController?.abort()` + `= undefined` with:
 
 ```typescript
 this.pendingMemoryPrefetch?.controller.abort();
 this.pendingMemoryPrefetch = undefined;
 ```
 
-Locations: `resetChat()`, MaxSessionTurns early-return, boundedTurns=0 early-return, SessionTokenLimitExceeded early-return, Arena control signal early-return, post-consume clear.
+Sites: `resetChat()`, `MaxSessionTurns` early-return, `boundedTurns=0` early-return, `SessionTokenLimitExceeded` early-return, Arena control-signal early-return. The fire path itself also performs this abort-then-replace when a new UserQuery arrives while the previous prefetch is still in flight.
+
+**2 clear-only sites** (the prefetch has already settled and we're consuming it — no controller to abort, just drop the reference):
+
+```typescript
+prefetchHandle.consumed = true;
+this.pendingMemoryPrefetch = undefined;
+```
+
+Sites: UserQuery consume point, ToolResult inject point.
 
 ### 6. `relevanceSelector.ts` — remove `AbortSignal.timeout(1_000)`
 
diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index 0ae77516d86..e99fa9b4f0b 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -2459,6 +2459,42 @@ hello
       expect(memoryIdx).toBeGreaterThan(functionResponseIdx);
     });
 
+    it('should abort the pending prefetch when the caller signal aborts', async () => {
+      let abortHandlerInvoked = false;
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        opts.abortSignal?.addEventListener('abort', () => {
+          abortHandlerInvoked = true;
+        });
+        return new Promise(() => {});
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'Hello' };
+        })(),
+      );
+
+      const callerController = new AbortController();
+      const stream = client.sendMessageStream(
+        [{ text: 'user typed but then aborted' }],
+        callerController.signal,
+        'prompt-id-aborted',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream) {
+        // consume
+      }
+
+      expect(abortHandlerInvoked).toBe(false);
+      callerController.abort();
+      expect(abortHandlerInvoked).toBe(true);
+    });
+
     it('should abort the previous prefetch when a new UserQuery arrives mid-flight', async () => {
       // Pending recall on first UserQuery — never resolves on its own.
       const abortSignals: AbortSignal[] = [];
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index f67cd9f3d91..c82008fa357 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -1075,6 +1075,18 @@ export class GeminiClient {
           this.pendingMemoryPrefetch?.controller.abort();
           this.pendingMemoryPrefetch = undefined;
           const controller = new AbortController();
+          // Bridge the caller's signal into the prefetch controller so a user
+          // abort (Ctrl-C / Esc) on the parent turn also terminates the
+          // recall side-query. `{ once: true }` lets the listener clean itself
+          // up after firing; we still call removeEventListener on the promise's
+          // finally to cover the normal-completion case so a long-lived parent
+          // signal doesn't accumulate listeners across many turns.
+          const onParentAbort = () => controller.abort();
+          if (signal.aborted) {
+            controller.abort();
+          } else {
+            signal.addEventListener('abort', onParentAbort, { once: true });
+          }
           const promise = this.config
             .getMemoryManager()
             .recall(this.config.getProjectRoot(), partToString(request), {
@@ -1101,6 +1113,7 @@ export class GeminiClient {
           };
           void promise.finally(() => {
             handle.settledAt = Date.now();
+            signal.removeEventListener('abort', onParentAbort);
           });
           this.pendingMemoryPrefetch = handle;
         }

From 41ca832a87e0ada8b83b165818dd5848f9d0db1d Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Mon, 18 May 2026 10:17:59 +0800
Subject: [PATCH 13/15] refactor(core): consolidate memory-prefetch lifecycle +
 safety nets per round-3 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Architectural (root-cause fix for cleanup-path sibling drift):
- New private cancelPendingMemoryPrefetch() consolidates the abort+clear
  idiom (was duplicated across 6 sites). Logs at debug when discarding a
  settled-but-unconsumed handle so missing-memory scenarios are diagnosable.
- New private tryConsumeMemoryPrefetch() consolidates the
  consume-and-mark-consumed dance (was duplicated UserQuery + ToolResult).
- All existing cleanup sites + the two newly-flagged early-return sites
  (LoopDetected, Error) now use the helper; future early-returns can rely
  on the finally-block safety net.
- sendMessageStream try-finally now uses a `normalCompletion` flag:
  only the bottom-of-try return path preserves the prefetch (intentional
  — next ToolResult turn may consume it); every other exit (uncaught
  exception, abnormal early-return) goes through cancelPendingMemoryPrefetch
  in finally.

Diagnostics:
- Restored AbortError debug log in fire-path catch (was silent after
  removing the deadline mechanism; aborts now come from 4+ sources so a
  trace is valuable).
- Updated stale "deadline" log in recall.ts to reflect current abort
  sources (caller signal / new UserQuery / cleanup / 30 s safety timeout).

Safety net:
- Added 30 s ceiling in relevanceSelector via AbortSignal.any(...).
  Generous enough that normal ~1 s recalls don't trip it; bounds zombie
  side-queries if the model API hangs and the caller never aborts.
  Replaces the uncancellable `new AbortController().signal` fallback that
  would have left callerless invocations running indefinitely.

Doc sync:
- Design doc updated: UserQuery consume code sample now shows `unshift`
  (matches implementation) with an inline note on the prepend-vs-append
  contrast.

Tests:
- New regression guard: resetChat aborts pending prefetch and clears the
  handle.
- New regression guard: LoopDetected mid-stream aborts pending prefetch
  and clears the handle (catches the sibling-drift bug this round caught).

227/227 tests pass; tsc clean on changed files.

Declined from this round:
- `await Promise.resolve()` after fire path: defensive — current code has
  multiple natural microtask drains before consume point. Added comment
  documenting the dependency instead.
- Renaming `settledAt: number | null` to `settled: boolean`: timestamp
  has diagnostic value for future instrumentation; current consumers'
  null-check usage is documented in the JSDoc.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../2026-05-15-async-memory-recall-design.md  |   6 +-
 packages/core/src/core/client.test.ts         |  80 +++++++++
 packages/core/src/core/client.ts              | 165 ++++++++++++------
 packages/core/src/memory/recall.ts            |  11 +-
 packages/core/src/memory/relevanceSelector.ts |  16 +-
 5 files changed, 209 insertions(+), 69 deletions(-)

diff --git a/docs/design/2026-05-15-async-memory-recall-design.md b/docs/design/2026-05-15-async-memory-recall-design.md
index 4cbb33ede32..f11b2ac5d23 100644
--- a/docs/design/2026-05-15-async-memory-recall-design.md
+++ b/docs/design/2026-05-15-async-memory-recall-design.md
@@ -119,7 +119,11 @@ if (
   this.pendingMemoryPrefetch = undefined;
   const result = await prefetchHandle.promise; // already settled, returns immediately
   if (result.prompt) {
-    systemReminders.push(result.prompt);
+    // unshift, not push: keep memory at the front of systemReminders so
+    // it leads the system-reminder block on UserQuery turns. (ToolResult
+    // turns instead append to requestToSend to preserve functionCall /
+    // functionResponse pairing — see below.)
+    systemReminders.unshift(result.prompt);
     for (const doc of result.selectedDocs) {
       this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
     }
diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index e99fa9b4f0b..ec1afea6e30 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -2548,6 +2548,86 @@ hello
       expect(abortSignals[1].aborted).toBe(false);
     });
 
+    it('should abort the pending prefetch on resetChat', async () => {
+      let abortHandlerInvoked = false;
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        opts.abortSignal?.addEventListener('abort', () => {
+          abortHandlerInvoked = true;
+        });
+        return new Promise(() => {});
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'Hello' };
+        })(),
+      );
+
+      const stream = client.sendMessageStream(
+        [{ text: 'first' }],
+        new AbortController().signal,
+        'prompt-id-reset-1',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream) {
+        // consume
+      }
+
+      expect(abortHandlerInvoked).toBe(false);
+      await client.resetChat();
+      expect(abortHandlerInvoked).toBe(true);
+      expect(client['pendingMemoryPrefetch']).toBeUndefined();
+    });
+
+    it('should abort the pending prefetch when LoopDetected fires mid-stream', async () => {
+      let abortHandlerInvoked = false;
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        opts.abortSignal?.addEventListener('abort', () => {
+          abortHandlerInvoked = true;
+        });
+        return new Promise(() => {});
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+
+      // Force LoopDetector to trip on the first event.
+      const loopDetector = client['loopDetector'];
+      vi.spyOn(loopDetector, 'addAndCheck').mockReturnValue(true);
+      vi.spyOn(loopDetector, 'getLastLoopType').mockReturnValue(undefined);
+
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'looping' };
+        })(),
+      );
+
+      const stream = client.sendMessageStream(
+        [{ text: 'trigger a loop' }],
+        new AbortController().signal,
+        'prompt-id-loop',
+        { type: SendMessageType.UserQuery },
+      );
+      const events = [];
+      for await (const event of stream) {
+        events.push(event);
+      }
+
+      expect(events.some((e) => e.type === GeminiEventType.LoopDetected)).toBe(
+        true,
+      );
+      expect(abortHandlerInvoked).toBe(true);
+      expect(client['pendingMemoryPrefetch']).toBeUndefined();
+    });
+
     it('should proceed without auto-memory when managed auto-memory is disabled', async () => {
       // When getManagedAutoMemoryEnabled returns false, no recall is initiated
       // and sendMessageStream completes without memory content
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index c82008fa357..e0fe0367f04 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -385,6 +385,50 @@ export class GeminiClient {
     });
   }
 
+  /**
+   * Abort and release the pending auto-memory prefetch in one step.
+   * Safe to call when no prefetch is pending — does nothing. Centralises
+   * the abort-then-clear idiom so every cleanup path (resetChat, early
+   * returns, finally) cannot half-fix one without the other.
+   *
+   * If the handle has already settled (recall completed but consume point
+   * hadn't run yet), the settled result is discarded — logged at debug so
+   * operators can diagnose missing-memory scenarios.
+   */
+  private cancelPendingMemoryPrefetch(): void {
+    const handle = this.pendingMemoryPrefetch;
+    if (!handle) return;
+    if (handle.settledAt !== null && !handle.consumed) {
+      debugLogger.debug('Discarding settled but unconsumed memory prefetch.');
+    }
+    handle.controller.abort();
+    this.pendingMemoryPrefetch = undefined;
+  }
+
+  /**
+   * Atomically consume the pending prefetch if it has already settled.
+   * Returns the recall result (caller decides where to inject it in
+   * `requestToSend`), or `null` if there's nothing to consume yet.
+   *
+   * Centralises the consume-and-mark dance so the UserQuery and ToolResult
+   * inject sites can't drift on the guard logic.
+   */
+  private async tryConsumeMemoryPrefetch(): Promise<RelevantAutoMemoryPromptResult | null> {
+    const handle = this.pendingMemoryPrefetch;
+    if (!handle || handle.settledAt === null || handle.consumed) {
+      return null;
+    }
+    handle.consumed = true;
+    this.pendingMemoryPrefetch = undefined;
+    const result = await handle.promise; // already settled, returns immediately
+    if (result.prompt) {
+      for (const doc of result.selectedDocs) {
+        this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+      }
+    }
+    return result;
+  }
+
   async resetChat(): Promise<void> {
     this.initializedSessionId = undefined;
     this.surfacedRelevantAutoMemoryPaths.clear();
@@ -398,8 +442,7 @@ export class GeminiClient {
     this.config.getBaseLlmClient().clearPerModelGeneratorCache();
     // Abort any in-flight auto-memory recall so the stale controller
     // does not leak into the next session.
-    this.pendingMemoryPrefetch?.controller.abort();
-    this.pendingMemoryPrefetch = undefined;
+    this.cancelPendingMemoryPrefetch();
     // Drop any deferred tools revealed this session so /clear really gives
     // a clean slate. We don't clear inside startChat itself because that path
     // is also taken by compression (which preserves the session), and
@@ -1063,6 +1106,13 @@ export class GeminiClient {
       });
     }
 
+    // Tracks whether the generator reached its natural end (the bottom-of-try
+    // `return turn`). Only on that path do we want to preserve the pending
+    // memory prefetch so the next ToolResult turn can consume it. Any other
+    // exit (LoopDetected, Error, signal abort, uncaught exception, abnormal
+    // early-return) leaves this `false`, and the `finally` block aborts the
+    // prefetch as a safety net.
+    let normalCompletion = false;
     try {
       if (
         messageType === SendMessageType.UserQuery ||
@@ -1072,8 +1122,7 @@ export class GeminiClient {
           // A previous recall may still be pending (slow side-query, new user
           // turn arrived before it settled). Abort it before installing the
           // new handle so the orphan doesn't keep running indefinitely.
-          this.pendingMemoryPrefetch?.controller.abort();
-          this.pendingMemoryPrefetch = undefined;
+          this.cancelPendingMemoryPrefetch();
           const controller = new AbortController();
           // Bridge the caller's signal into the prefetch controller so a user
           // abort (Ctrl-C / Esc) on the parent turn also terminates the
@@ -1095,9 +1144,18 @@ export class GeminiClient {
               abortSignal: controller.signal,
             })
             .catch((error: unknown) => {
+              // Abort sources are now numerous (caller signal, new UserQuery,
+              // cleanup paths, safety-net timeout). Keep a debug trace so
+              // operators can diagnose missing-memory scenarios without
+              // raising noise on the common abort path.
               if (
-                !(error instanceof DOMException && error.name === 'AbortError')
+                error instanceof DOMException &&
+                error.name === 'AbortError'
               ) {
+                debugLogger.debug(
+                  'Managed auto-memory recall prefetch aborted.',
+                );
+              } else {
                 debugLogger.warn(
                   'Managed auto-memory recall prefetch failed.',
                   error,
@@ -1186,8 +1244,7 @@ export class GeminiClient {
           this.config.getMaxSessionTurns() > 0 &&
           this.sessionTurnCount > this.config.getMaxSessionTurns()
         ) {
-          this.pendingMemoryPrefetch?.controller.abort();
-          this.pendingMemoryPrefetch = undefined;
+          this.cancelPendingMemoryPrefetch();
           yield { type: GeminiEventType.MaxSessionTurns };
           if (isTopLevelInteraction)
             endInteractionSpan('error', {
@@ -1200,8 +1257,7 @@ export class GeminiClient {
       // Ensure turns never exceeds MAX_TURNS to prevent infinite loops
       const boundedTurns = Math.min(turns, MAX_TURNS);
       if (!boundedTurns) {
-        this.pendingMemoryPrefetch?.controller.abort();
-        this.pendingMemoryPrefetch = undefined;
+        this.cancelPendingMemoryPrefetch();
         if (isTopLevelInteraction)
           endInteractionSpan('error', { errorMessage: 'max turns exhausted' });
         return new Turn(this.getChat(), prompt_id);
@@ -1216,8 +1272,7 @@ export class GeminiClient {
         const lastPromptTokenCount =
           uiTelemetryService.getLastPromptTokenCount();
         if (lastPromptTokenCount > sessionTokenLimit) {
-          this.pendingMemoryPrefetch?.controller.abort();
-          this.pendingMemoryPrefetch = undefined;
+          this.cancelPendingMemoryPrefetch();
           yield {
             type: GeminiEventType.SessionTokenLimitExceeded,
             value: {
@@ -1277,8 +1332,7 @@ export class GeminiClient {
             `Arena control signal received: ${controlSignal.type} - ${controlSignal.reason}`,
           );
           await arenaAgentClient.reportCancelled();
-          this.pendingMemoryPrefetch?.controller.abort();
-          this.pendingMemoryPrefetch = undefined;
+          this.cancelPendingMemoryPrefetch();
           if (isTopLevelInteraction) endInteractionSpan('cancelled');
           return new Turn(this.getChat(), prompt_id);
         }
@@ -1340,57 +1394,36 @@ export class GeminiClient {
 
         // Zero-wait poll: consume only if the prefetch has already settled.
         // Done AFTER the async reminder setup above so recall settling during
-        // those awaits still gets caught here. If still not settled, skip —
-        // the ToolResult inject point will retry on the next turn.
-        const prefetchHandle = this.pendingMemoryPrefetch;
-        if (
-          prefetchHandle &&
-          prefetchHandle.settledAt !== null &&
-          !prefetchHandle.consumed
-        ) {
-          prefetchHandle.consumed = true;
-          this.pendingMemoryPrefetch = undefined;
-          const result = await prefetchHandle.promise; // already settled, returns immediately
-          if (result.prompt) {
-            // Unshift to the front of systemReminders: on a UserQuery turn
-            // requestToSend leads with user text, so positioning memory at
-            // the very start of the system-reminder block keeps it close to
-            // the user prompt. Contrast the ToolResult path below, which
-            // must append to avoid splitting functionCall / functionResponse.
-            systemReminders.unshift(result.prompt);
-            for (const doc of result.selectedDocs) {
-              this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
-            }
-          }
+        // those awaits still gets caught here. (settledAt is set in
+        // promise.finally(); microtask ordering guarantees it's visible
+        // after any await prior to this point — flatMapTextParts above is
+        // the natural drain.) If still not settled, skip — the ToolResult
+        // inject point will retry on the next turn.
+        const userQueryMemory = await this.tryConsumeMemoryPrefetch();
+        if (userQueryMemory?.prompt) {
+          // Unshift to the front of systemReminders: on a UserQuery turn
+          // requestToSend leads with user text, so positioning memory at
+          // the very start of the system-reminder block keeps it close to
+          // the user prompt. Contrast the ToolResult path below, which
+          // must append to avoid splitting functionCall / functionResponse.
+          systemReminders.unshift(userQueryMemory.prompt);
         }
 
         requestToSend = [...systemReminders, ...requestToSend];
       }
 
       if (messageType === SendMessageType.ToolResult) {
-        const prefetchHandle = this.pendingMemoryPrefetch;
-        if (
-          prefetchHandle &&
-          prefetchHandle.settledAt !== null &&
-          !prefetchHandle.consumed
-        ) {
-          prefetchHandle.consumed = true;
-          this.pendingMemoryPrefetch = undefined;
-          const result = await prefetchHandle.promise;
-          if (result.prompt) {
-            // Append (not prepend): on a ToolResult turn, requestToSend leads
-            // with functionResponse parts that must immediately follow the
-            // model's functionCall (Qwen API constraint — same reason the
-            // IDE-context block above is skipped while a tool call is pending,
-            // see the `hasPendingToolCall` guard). Putting the memory text
-            // after the functionResponse parts keeps the call/response pairing
-            // intact under native Gemini; the OpenAI converter then emits the
-            // text as a separate user message after the tool messages.
-            requestToSend = [...requestToSend, result.prompt];
-            for (const doc of result.selectedDocs) {
-              this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
-            }
-          }
+        const toolResultMemory = await this.tryConsumeMemoryPrefetch();
+        if (toolResultMemory?.prompt) {
+          // Append (not prepend): on a ToolResult turn, requestToSend leads
+          // with functionResponse parts that must immediately follow the
+          // model's functionCall (Qwen API constraint — same reason the
+          // IDE-context block above is skipped while a tool call is pending,
+          // see the `hasPendingToolCall` guard). Putting the memory text
+          // after the functionResponse parts keeps the call/response pairing
+          // intact under native Gemini; the OpenAI converter then emits the
+          // text as a separate user message after the tool messages.
+          requestToSend = [...requestToSend, toolResultMemory.prompt];
         }
       }
 
@@ -1416,6 +1449,9 @@ export class GeminiClient {
             this.lastApiCompletionTimestamp = Date.now();
             if (isTopLevelInteraction)
               endInteractionSpan('error', { errorMessage: 'loop detected' });
+            // finally cleanup catches this, but cancel explicitly to match
+            // the cleanup pattern at other early-return sites.
+            this.cancelPendingMemoryPrefetch();
             return turn;
           }
         }
@@ -1466,6 +1502,9 @@ export class GeminiClient {
               event.value instanceof Error ? '[API error]' : 'unknown error';
             endInteractionSpan('error', { errorMessage: errMsg });
           }
+          // finally cleanup catches this, but cancel explicitly to match
+          // the cleanup pattern at other early-return sites.
+          this.cancelPendingMemoryPrefetch();
           return turn;
         }
       }
@@ -1661,8 +1700,18 @@ export class GeminiClient {
       if (isTopLevelInteraction) {
         endInteractionSpan(signal?.aborted ? 'cancelled' : 'ok');
       }
+      // Reached the bottom of the try — this turn ended cleanly. Preserve
+      // any still-pending memory prefetch so the next ToolResult turn can
+      // consume it (the whole point of the fire-and-forget design).
+      normalCompletion = true;
       return turn;
     } finally {
+      // Belt-and-suspenders: abort the prefetch on any exit other than the
+      // bottom-of-try `return turn`. Catches uncaught exceptions and guards
+      // against future early-return sites that forget to call cancel.
+      if (!normalCompletion) {
+        this.cancelPendingMemoryPrefetch();
+      }
       if (isTopLevelInteraction) {
         endInteractionSpan(signal?.aborted ? 'cancelled' : 'error', {
           errorMessage: 'unexpected exit',
diff --git a/packages/core/src/memory/recall.ts b/packages/core/src/memory/recall.ts
index 8697496f953..e9a402a3a69 100644
--- a/packages/core/src/memory/recall.ts
+++ b/packages/core/src/memory/recall.ts
@@ -219,11 +219,12 @@ export async function resolveRelevantAutoMemoryPromptForQuery(
         strategy,
       };
     } catch (error) {
-      // Distinguish deadline-triggered cancellation from real model errors
-      // so oncall debugging is not misled by the fallback log.
+      // Distinguish caller-driven aborts (user signal / new UserQuery /
+      // session cleanup / 30 s safety timeout) from real model errors so
+      // oncall debugging is not misled by the fallback log.
       if (error instanceof DOMException && error.name === 'AbortError') {
         debugLogger.debug(
-          'Model-driven auto-memory recall cancelled by deadline; heuristic result discarded.',
+          'Model-driven auto-memory recall aborted; heuristic result discarded.',
         );
       } else {
         debugLogger.warn(
@@ -234,8 +235,8 @@ export async function resolveRelevantAutoMemoryPromptForQuery(
     }
   }
 
-  // If the caller's abort signal is already set (e.g. deadline fired), skip the
-  // heuristic fallback — the result would be discarded anyway.
+  // If the caller's abort signal is already set, skip the heuristic
+  // fallback — the result would be discarded anyway.
   if (options.abortSignal?.aborted) {
     return {
       prompt: '',
diff --git a/packages/core/src/memory/relevanceSelector.ts b/packages/core/src/memory/relevanceSelector.ts
index a7effd4fa38..1ef957671f7 100644
--- a/packages/core/src/memory/relevanceSelector.ts
+++ b/packages/core/src/memory/relevanceSelector.ts
@@ -91,11 +91,17 @@ export async function selectRelevantAutoMemoryDocumentsByModel(
     purpose: 'auto-memory-recall',
     contents,
     schema: RESPONSE_SCHEMA,
-    // No inline timeout — caller (`GeminiClient.MemoryPrefetchHandle`)
-    // owns cancellation and aborts via its controller on cleanup paths.
-    // A bare unsignalled AbortController fills the type when the caller
-    // doesn't pass one, since runSideQuery's `abortSignal` is required.
-    abortSignal: callerAbortSignal ?? new AbortController().signal,
+    // Caller (`GeminiClient.MemoryPrefetchHandle`) owns lifecycle and aborts
+    // via its controller on cleanup paths. The 30 s ceiling is a generous
+    // safety net that only fires if the model API hangs (network partition,
+    // server stall, runaway retry) AND the caller never aborts. Normal
+    // recalls take ~1 s; 30 s is far above the long tail so this doesn't
+    // re-introduce the 1 s timeout regression that motivated this redesign.
+    // Without this ceiling, a callerless invocation would use an
+    // unsignalled AbortController and run indefinitely.
+    abortSignal: callerAbortSignal
+      ? AbortSignal.any([AbortSignal.timeout(30_000), callerAbortSignal])
+      : AbortSignal.timeout(30_000),
 
     // Uses runSideQuery's default side-query model policy: fast model first,
     // then main session model when no fast model is configured.

From 904f2c47ace31764440e48b66d32f407e2e2c3e9 Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Mon, 18 May 2026 10:23:18 +0800
Subject: [PATCH 14/15] =?UTF-8?q?fix(test):=20correct=20getLastLoopType=20?=
 =?UTF-8?q?mock=20return=20type=20=E2=80=94=20null,=20not=20undefined?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI tsc --build (stricter than --noEmit) caught:
  src/core/client.test.ts(2996,65): error TS2345: Argument of type
  'undefined' is not assignable to parameter of type 'LoopType | null'.

getLastLoopType()'s contract returns LoopType | null; the test mock was
returning undefined. Switched to null to match the type.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.test.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index ec1afea6e30..af5444bd1cf 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -2602,7 +2602,7 @@ hello
       // Force LoopDetector to trip on the first event.
       const loopDetector = client['loopDetector'];
       vi.spyOn(loopDetector, 'addAndCheck').mockReturnValue(true);
-      vi.spyOn(loopDetector, 'getLastLoopType').mockReturnValue(undefined);
+      vi.spyOn(loopDetector, 'getLastLoopType').mockReturnValue(null);
 
       mockTurnRunFn.mockReturnValue(
         (async function* () {

From a0b8bedf53e3b7237dab600a6dacc112a086af21 Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Tue, 19 May 2026 10:33:28 +0800
Subject: [PATCH 15/15] fix(core): preserve memory prefetch across
 hook/next-speaker continuations + accurate recall abort log
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-4 review findings (self-inflicted regression from round-3):

1. Preserve pending prefetch on `return hookTurn` (Stop-hook continuation)
   and `return continueTurn` (next-speaker continuation). The round-3
   `normalCompletion = true` was only set at the bottom-of-try `return turn`,
   leaving these two recursive-yield paths to trip the finally cleanup.
   When the inner Hook turn produced tool calls, the subsequent ToolResult
   turn found `pendingMemoryPrefetch === undefined` and memory was silently
   dropped.

2. recall.ts catch log distinguishes caller-driven aborts (heuristic
   genuinely skipped below) from the 30s safety-net timeout in
   relevanceSelector (the caller's signal is NOT aborted by that path,
   so the heuristic fallback actually runs).

Regression guard added:
- "should PRESERVE the pending prefetch when next-speaker continueTurn
  returns" — was red before this commit, green after.

258/258 tests pass; tsc --build clean.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/core/src/core/client.test.ts | 68 ++++++++++++++++++++++++++-
 packages/core/src/core/client.ts      |  9 ++++
 packages/core/src/memory/recall.ts    | 24 +++++++---
 3 files changed, 94 insertions(+), 7 deletions(-)

diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index d13da2a7fb5..bde6856c67a 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -39,7 +39,12 @@ import {
 import type { ModelsConfig } from '../models/modelsConfig.js';
 import { UnauthorizedError } from '../utils/errors.js';
 import { retryWithBackoff } from '../utils/retry.js';
-import { CompressionStatus, GeminiEventType, Turn } from './turn.js';
+import {
+  CompressionStatus,
+  GeminiEventType,
+  Turn,
+  type ServerGeminiStreamEvent,
+} from './turn.js';
 
 vi.mock('../utils/retry.js', () => ({
   retryWithBackoff: vi.fn(async (fn) => await fn()),
@@ -3025,6 +3030,67 @@ hello
       expect(client['pendingMemoryPrefetch']).toBeUndefined();
     });
 
+    it('should PRESERVE the pending prefetch when next-speaker continueTurn returns', async () => {
+      // Self-inflicted-regression guard for the round-4 finding:
+      // the bottom-of-try `normalCompletion = true` doesn't cover the
+      // `return continueTurn;` path, so the outer's finally used to cancel
+      // the still-pending prefetch — meaning a subsequent ToolResult turn
+      // would have no memory to consume.
+      let abortHandlerInvoked = false;
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        opts.abortSignal?.addEventListener('abort', () => {
+          abortHandlerInvoked = true;
+        });
+        return new Promise(() => {}); // never settles
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'outer reply' };
+        })(),
+      );
+
+      // Force the next-speaker check to recurse so we hit `return continueTurn`.
+      // The recursion call passes through this same mock stream and returns.
+      const { checkNextSpeaker } = await import(
+        '../utils/nextSpeakerChecker.js'
+      );
+      const mockedCheckNextSpeaker = vi.mocked(checkNextSpeaker);
+      mockedCheckNextSpeaker
+        .mockResolvedValueOnce({
+          reasoning: 'forced',
+          next_speaker: 'model',
+        })
+        .mockResolvedValue(null); // inner recursion: stop
+      // Each recursive sendMessageStream call asks turn.run() for a new stream.
+      mockTurnRunFn.mockImplementation(
+        () =>
+          (async function* () {
+            yield { type: 'content', value: 'reply' };
+          })() as unknown as AsyncGenerator<ServerGeminiStreamEvent>,
+      );
+
+      const stream = client.sendMessageStream(
+        [{ text: 'hello' }],
+        new AbortController().signal,
+        'prompt-id-continueturn',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream) {
+        // consume
+      }
+
+      // The prefetch must survive the continueTurn return so a follow-up
+      // ToolResult turn can consume it.
+      expect(abortHandlerInvoked).toBe(false);
+      expect(client['pendingMemoryPrefetch']).not.toBeUndefined();
+    });
+
     it('should proceed without auto-memory when managed auto-memory is disabled', async () => {
       // When getManagedAutoMemoryEnabled returns false, no recall is initiated
       // and sendMessageStream completes without memory content
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index f5b3fb353ff..e66362900dd 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -1794,6 +1794,10 @@ export class GeminiClient {
           );
           if (isTopLevelInteraction)
             endInteractionSpan(signal.aborted ? 'cancelled' : 'ok');
+          // Preserve the pending prefetch: the inner Hook turn we just
+          // yielded may have produced tool calls, and the caller's next
+          // ToolResult turn still needs to consume the recall result.
+          normalCompletion = true;
           return hookTurn;
         }
 
@@ -1860,6 +1864,11 @@ export class GeminiClient {
           );
           if (isTopLevelInteraction)
             endInteractionSpan(signal.aborted ? 'cancelled' : 'ok');
+          // Preserve the pending prefetch: same reasoning as the
+          // `return hookTurn` site above — the recursive Hook turn may
+          // have produced tool calls whose ToolResult turn still needs
+          // the recall result.
+          normalCompletion = true;
           return continueTurn;
         }
 
diff --git a/packages/core/src/memory/recall.ts b/packages/core/src/memory/recall.ts
index e9a402a3a69..5b476b21ce1 100644
--- a/packages/core/src/memory/recall.ts
+++ b/packages/core/src/memory/recall.ts
@@ -219,13 +219,25 @@ export async function resolveRelevantAutoMemoryPromptForQuery(
         strategy,
       };
     } catch (error) {
-      // Distinguish caller-driven aborts (user signal / new UserQuery /
-      // session cleanup / 30 s safety timeout) from real model errors so
-      // oncall debugging is not misled by the fallback log.
+      // Distinguish three cases so oncall debugging isn't misled:
+      //   - caller-driven abort (user signal / new UserQuery / session
+      //     cleanup): caller signal is aborted → heuristic fallback is
+      //     skipped below at `options.abortSignal?.aborted`, so the
+      //     result really is discarded.
+      //   - 30 s safety-net timeout in relevanceSelector: only the inner
+      //     combined signal aborts; the caller's signal is NOT aborted,
+      //     so the heuristic fallback below DOES run.
+      //   - real model error: warn at the higher level.
       if (error instanceof DOMException && error.name === 'AbortError') {
-        debugLogger.debug(
-          'Model-driven auto-memory recall aborted; heuristic result discarded.',
-        );
+        if (options.abortSignal?.aborted) {
+          debugLogger.debug(
+            'Model-driven auto-memory recall aborted by caller; heuristic result discarded.',
+          );
+        } else {
+          debugLogger.debug(
+            'Model-driven auto-memory recall timed out (30 s safety net); heuristic fallback will run.',
+          );
+        }
       } else {
         debugLogger.warn(
           'Model-driven auto-memory recall failed; falling back to heuristic selection.',