diff --git a/docs/design/2026-05-15-async-memory-recall-design.md b/docs/design/2026-05-15-async-memory-recall-design.md
new file mode 100644
index 00000000000..f11b2ac5d23
--- /dev/null
+++ b/docs/design/2026-05-15-async-memory-recall-design.md
@@ -0,0 +1,206 @@
+# Async Memory Recall — Design Spec
+
+**Date:** 2026-05-15
+**Status:** Approved
+**Related issues:** #3761, #3759
+**Related PRs:** #3814, #3866
+
+---
+
+## Problem
+
+`relevanceSelector.ts` uses `AbortSignal.timeout(1_000)` (introduced by #3866). On first-session cold starts, qwen3.5-flash averages ~908 ms — consistently hitting the 1 s threshold. The outer 2.5 s deadline in `resolveAutoMemoryWithDeadline` means every UserQuery can block for up to 2.5 s even when recall always fails.
+
+Root cause: the main-agent request path `await`s the recall result before sending to the model. Any slowness in the recall side-query directly adds to user-visible latency.
+
+---
+
+## Design
+
+### Core idea
+
+Fire recall on UserQuery and never await it. Consume the result at two opportunistic points — whichever fires first:
+
+1. **UserQuery consume point** — synchronous `settledAt !== null` check just before `turn.run()`. Zero-wait: if already settled, use it; if not, skip.
+2. **ToolResult inject point** — same check on every ToolResult turn. Injects memory as a `system-reminder` **appended after** the functionResponse parts in `requestToSend`, giving the model memory context before its next response. (Append, not prepend: the Qwen API requires the functionResponse to immediately follow the model's functionCall — see the existing `hasPendingToolCall` IDE-context skip for the same constraint.)
+
+This matches the pattern used by Claude Code upstream (`startRelevantMemoryPrefetch` / `settledAt` polling in `query.ts`).
+
+---
+
+## Data structures
+
+### New type `MemoryPrefetchHandle` (in `client.ts`)
+
+```typescript
+type MemoryPrefetchHandle = {
+  promise: Promise<RelevantAutoMemoryPromptResult>;
+  /** Set by promise.finally(). null until the promise settles. */
+  settledAt: number | null;
+  /** True after memory has been injected — prevents double-inject. */
+  consumed: boolean;
+  controller: AbortController;
+};
+```
+
+### Field change on `GeminiClient`
+
+| Remove                                                       | Add                                                        |
+| ------------------------------------------------------------ | ---------------------------------------------------------- |
+| `pendingRecallAbortController: AbortController \| undefined` | `pendingMemoryPrefetch: MemoryPrefetchHandle \| undefined` |
+
+---
+
+## Changes
+
+### 1. `client.ts` — remove `resolveAutoMemoryWithDeadline`
+
+Delete the function entirely. It is replaced by the `settledAt` flag mechanism.
+
+### 2. `client.ts` — UserQuery fire path
+
+Replace the `resolveAutoMemoryWithDeadline` call with:
+
+```typescript
+// Abort any in-flight prefetch from a previous UserQuery before installing
+// the new handle (prevents orphan side-queries when the user types again
+// before recall settles).
+this.pendingMemoryPrefetch?.controller.abort();
+this.pendingMemoryPrefetch = undefined;
+
+const controller = new AbortController();
+// Bridge the caller's signal into the prefetch controller so a user abort
+// (Ctrl-C / Esc) on the parent turn also terminates the recall side-query.
+const onParentAbort = () => controller.abort();
+if (signal.aborted) {
+  controller.abort();
+} else {
+  signal.addEventListener('abort', onParentAbort, { once: true });
+}
+
+const promise = this.config
+  .getMemoryManager()
+  .recall(projectRoot, partToString(request), {
+    config: this.config,
+    excludedFilePaths: this.surfacedRelevantAutoMemoryPaths,
+    abortSignal: controller.signal,
+  })
+  .catch((error: unknown) => {
+    if (!(error instanceof DOMException && error.name === 'AbortError')) {
+      debugLogger.warn('Managed auto-memory recall prefetch failed.', error);
+    }
+    return EMPTY_RELEVANT_AUTO_MEMORY_RESULT;
+  });
+
+const handle: MemoryPrefetchHandle = {
+  promise,
+  settledAt: null,
+  consumed: false,
+  controller,
+};
+void promise.finally(() => {
+  handle.settledAt = Date.now();
+  signal.removeEventListener('abort', onParentAbort);
+});
+this.pendingMemoryPrefetch = handle;
+// no await — continue immediately
+```
+
+### 3. `client.ts` — UserQuery consume point (replaces `await relevantAutoMemoryPromise`)
+
+```typescript
+const prefetchHandle = this.pendingMemoryPrefetch;
+if (
+  prefetchHandle &&
+  prefetchHandle.settledAt !== null &&
+  !prefetchHandle.consumed
+) {
+  prefetchHandle.consumed = true;
+  this.pendingMemoryPrefetch = undefined;
+  const result = await prefetchHandle.promise; // already settled, returns immediately
+  if (result.prompt) {
+    // unshift, not push: keep memory at the front of systemReminders so
+    // it leads the system-reminder block on UserQuery turns. (ToolResult
+    // turns instead append to requestToSend to preserve functionCall /
+    // functionResponse pairing — see below.)
+    systemReminders.unshift(result.prompt);
+    for (const doc of result.selectedDocs) {
+      this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+    }
+  }
+}
+```
+
+### 4. `client.ts` — ToolResult inject point (new)
+
+After `requestToSend` is assembled, before `turn.run()`, add:
+
+```typescript
+if (messageType === SendMessageType.ToolResult) {
+  const prefetchHandle = this.pendingMemoryPrefetch;
+  if (
+    prefetchHandle &&
+    prefetchHandle.settledAt !== null &&
+    !prefetchHandle.consumed
+  ) {
+    prefetchHandle.consumed = true;
+    this.pendingMemoryPrefetch = undefined;
+    const result = await prefetchHandle.promise;
+    if (result.prompt) {
+      // Append (not prepend) so functionResponse parts stay first
+      // and the model's functionCall/functionResponse pairing
+      // isn't broken on the native Gemini path.
+      requestToSend = [...requestToSend, result.prompt];
+      for (const doc of result.selectedDocs) {
+        this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+      }
+    }
+  }
+}
+```
+
+### 5. `client.ts` — cleanup paths
+
+The handle is released by two distinct mechanisms:
+
+**5 abort-and-clear sites** (the prefetch is still pending, abort the controller before dropping the reference). Replace `pendingRecallAbortController?.abort()` + `= undefined` with:
+
+```typescript
+this.pendingMemoryPrefetch?.controller.abort();
+this.pendingMemoryPrefetch = undefined;
+```
+
+Sites: `resetChat()`, `MaxSessionTurns` early-return, `boundedTurns=0` early-return, `SessionTokenLimitExceeded` early-return, Arena control-signal early-return. The fire path itself also performs this abort-then-replace when a new UserQuery arrives while the previous prefetch is still in flight.
+
+**2 clear-only sites** (the prefetch has already settled and we're consuming it — no controller to abort, just drop the reference):
+
+```typescript
+prefetchHandle.consumed = true;
+this.pendingMemoryPrefetch = undefined;
+```
+
+Sites: UserQuery consume point, ToolResult inject point.
+
+### 6. `relevanceSelector.ts` — remove `AbortSignal.timeout(1_000)`
+
+Remove the combined `AbortSignal.any([AbortSignal.timeout(1_000), callerAbortSignal])` and pass `callerAbortSignal` directly.
+
+---
+
+## Behaviour comparison
+
+| Scenario                                     | Before                         | After                                                  |
+| -------------------------------------------- | ------------------------------ | ------------------------------------------------------ |
+| recall completes before model prep           | inject on UserQuery, ~0 wait   | inject on UserQuery, ~0 wait                           |
+| recall slow (cold start)                     | block up to 2.5 s              | skip UserQuery, inject on first ToolResult             |
+| recall times out (1 s)                       | abort, empty result, no memory | no hard timeout; inject whenever settled               |
+| no tool calls, recall slow                   | block up to 2.5 s, then skip   | skip UserQuery, no ToolResult opportunity — miss       |
+| user sends 2nd message before recall settles | 2nd recall races 1st handle    | 1st handle aborted when 2nd UserQuery fires new handle |
+
+---
+
+## Out of scope
+
+- Changing the memory injection format from `system-reminder` to `tool-result` attachment (CC style)
+- Per-session byte budget skip gate
+- Single-word prompt skip gate
diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts
index a45c51eda1a..bde6856c67a 100644
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@@ -39,7 +39,12 @@ import {
 import type { ModelsConfig } from '../models/modelsConfig.js';
 import { UnauthorizedError } from '../utils/errors.js';
 import { retryWithBackoff } from '../utils/retry.js';
-import { CompressionStatus, GeminiEventType, Turn } from './turn.js';
+import {
+  CompressionStatus,
+  GeminiEventType,
+  Turn,
+  type ServerGeminiStreamEvent,
+} from './turn.js';
 
 vi.mock('../utils/retry.js', () => ({
   retryWithBackoff: vi.fn(async (fn) => await fn()),
@@ -2700,20 +2705,9 @@ hello
     });
 
     it('should not block the main request when auto-memory recall is slow', async () => {
-      // Simulate a recall that takes longer than the 2.5s deadline
-      mockMemoryManager.recall.mockReturnValue(
-        new Promise((resolve) =>
-          setTimeout(
-            () =>
-              resolve({
-                prompt: '## Relevant memory\n\nSlow memory result.',
-                selectedDocs: [],
-                strategy: 'model',
-              }),
-            10_000,
-          ),
-        ),
-      );
+      // Recall never settles — settledAt stays null so the UserQuery consume
+      // point skips it and turn.run() is called immediately without memory.
+      mockMemoryManager.recall.mockReturnValue(new Promise(() => {}));
 
       const mockStream = (async function* () {
         yield { type: 'content', value: 'Hello' };
@@ -2726,38 +2720,28 @@ hello
       };
       client['chat'] = mockChat as GeminiChat;
 
-      vi.useFakeTimers();
-      try {
-        const streamPromise = (async () => {
-          const stream = client.sendMessageStream(
-            [{ text: 'Quick question' }],
-            new AbortController().signal,
-            'prompt-id-slow-memory',
-          );
-          for await (const _ of stream) {
-            // consume stream
-          }
-        })();
-
-        // Advance past the 2.5s deadline — the main request should proceed
-        await vi.advanceTimersByTimeAsync(3_000);
-        await streamPromise;
-
-        // The main request should have been called without the slow memory
-        expect(mockTurnRunFn).toHaveBeenCalledWith(
-          'test-model',
-          expect.not.arrayContaining([
-            expect.stringContaining('Slow memory result'),
-          ]),
-          expect.any(AbortSignal),
-        );
-      } finally {
-        vi.useRealTimers();
+      const stream = client.sendMessageStream(
+        [{ text: 'Quick question' }],
+        new AbortController().signal,
+        'prompt-id-slow-memory',
+      );
+      for await (const _ of stream) {
+        // consume stream
       }
+
+      // turn.run() must have been called without the slow memory
+      expect(mockTurnRunFn).toHaveBeenCalledWith(
+        'test-model',
+        expect.not.arrayContaining([
+          expect.stringContaining('Slow memory result'),
+        ]),
+        expect.any(AbortSignal),
+      );
     });
 
-    it('should include auto-memory prompt when recall completes within deadline', async () => {
-      // Simulate a fast recall that completes well within the deadline
+    it('should inject auto-memory at UserQuery consume point when recall already settled', async () => {
+      // mockResolvedValue settles synchronously; by the time the consume-point
+      // check runs (after at least one await), settledAt is set.
       mockMemoryManager.recall.mockResolvedValue({
         prompt: '## Relevant memory\n\nFast memory result.',
         selectedDocs: [],
@@ -2791,6 +2775,322 @@ hello
       );
     });
 
+    it('should inject auto-memory on first ToolResult when recall settles after UserQuery', async () => {
+      // Controllable promise — recall stays pending across the UserQuery turn
+      // and only settles before the ToolResult turn runs.
+      let resolveRecall:
+        | ((value: {
+            prompt: string;
+            selectedDocs: never[];
+            strategy: 'model';
+          }) => void)
+        | undefined;
+      mockMemoryManager.recall.mockReturnValue(
+        new Promise((resolve) => {
+          resolveRecall = resolve;
+        }),
+      );
+
+      const mockStream = (async function* () {
+        yield { type: 'content', value: 'Hello' };
+      })();
+      mockTurnRunFn.mockReturnValue(mockStream);
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+
+      // Turn 1: UserQuery — recall still pending, no injection
+      const userStream = client.sendMessageStream(
+        [{ text: 'What is my name?' }],
+        new AbortController().signal,
+        'prompt-id-user-query',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of userStream) {
+        // consume
+      }
+
+      expect(mockTurnRunFn).toHaveBeenLastCalledWith(
+        'test-model',
+        expect.not.arrayContaining([
+          expect.stringContaining('Deferred memory result'),
+        ]),
+        expect.any(AbortSignal),
+      );
+
+      // Recall settles between turns
+      resolveRecall!({
+        prompt: '## Relevant memory\n\nDeferred memory result.',
+        selectedDocs: [],
+        strategy: 'model',
+      });
+      // Drain microtasks so the settledAt finally() callback runs
+      await Promise.resolve();
+      await Promise.resolve();
+
+      // Turn 2: ToolResult — settledAt is now non-null, memory should inject
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'world' };
+        })(),
+      );
+      const toolStream = client.sendMessageStream(
+        [{ functionResponse: { name: 'foo', response: { ok: true } } }],
+        new AbortController().signal,
+        'prompt-id-tool-result',
+        { type: SendMessageType.ToolResult },
+      );
+      for await (const _ of toolStream) {
+        // consume
+      }
+
+      // Memory must come AFTER the functionResponse part so the Qwen API
+      // call/response pairing isn't broken (see client.ts:1209-1213).
+      const lastCallArgs = mockTurnRunFn.mock.lastCall;
+      const requestArr = lastCallArgs![1] as unknown[];
+      const functionResponseIdx = requestArr.findIndex(
+        (p) => typeof p === 'object' && p !== null && 'functionResponse' in p,
+      );
+      const memoryIdx = requestArr.findIndex(
+        (p) => p === '## Relevant memory\n\nDeferred memory result.',
+      );
+      expect(functionResponseIdx).toBeGreaterThanOrEqual(0);
+      expect(memoryIdx).toBeGreaterThan(functionResponseIdx);
+    });
+
+    it('should abort the pending prefetch when the caller signal aborts', async () => {
+      let abortHandlerInvoked = false;
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        opts.abortSignal?.addEventListener('abort', () => {
+          abortHandlerInvoked = true;
+        });
+        return new Promise(() => {});
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'Hello' };
+        })(),
+      );
+
+      const callerController = new AbortController();
+      const stream = client.sendMessageStream(
+        [{ text: 'user typed but then aborted' }],
+        callerController.signal,
+        'prompt-id-aborted',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream) {
+        // consume
+      }
+
+      expect(abortHandlerInvoked).toBe(false);
+      callerController.abort();
+      expect(abortHandlerInvoked).toBe(true);
+    });
+
+    it('should abort the previous prefetch when a new UserQuery arrives mid-flight', async () => {
+      // Pending recall on first UserQuery — never resolves on its own.
+      const abortSignals: AbortSignal[] = [];
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        abortSignals.push(opts.abortSignal as AbortSignal);
+        return new Promise(() => {});
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'Hello' };
+        })(),
+      );
+
+      // First UserQuery — installs prefetch #1
+      const stream1 = client.sendMessageStream(
+        [{ text: 'first' }],
+        new AbortController().signal,
+        'prompt-id-1',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream1) {
+        // consume
+      }
+      expect(abortSignals.length).toBe(1);
+      expect(abortSignals[0].aborted).toBe(false);
+
+      // Second UserQuery — should abort #1 before installing #2
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'Hello again' };
+        })(),
+      );
+      const stream2 = client.sendMessageStream(
+        [{ text: 'second' }],
+        new AbortController().signal,
+        'prompt-id-2',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream2) {
+        // consume
+      }
+
+      expect(abortSignals.length).toBe(2);
+      expect(abortSignals[0].aborted).toBe(true);
+      expect(abortSignals[1].aborted).toBe(false);
+    });
+
+    it('should abort the pending prefetch on resetChat', async () => {
+      let abortHandlerInvoked = false;
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        opts.abortSignal?.addEventListener('abort', () => {
+          abortHandlerInvoked = true;
+        });
+        return new Promise(() => {});
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'Hello' };
+        })(),
+      );
+
+      const stream = client.sendMessageStream(
+        [{ text: 'first' }],
+        new AbortController().signal,
+        'prompt-id-reset-1',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream) {
+        // consume
+      }
+
+      expect(abortHandlerInvoked).toBe(false);
+      await client.resetChat();
+      expect(abortHandlerInvoked).toBe(true);
+      expect(client['pendingMemoryPrefetch']).toBeUndefined();
+    });
+
+    it('should abort the pending prefetch when LoopDetected fires mid-stream', async () => {
+      let abortHandlerInvoked = false;
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        opts.abortSignal?.addEventListener('abort', () => {
+          abortHandlerInvoked = true;
+        });
+        return new Promise(() => {});
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+
+      // Force LoopDetector to trip on the first event.
+      const loopDetector = client['loopDetector'];
+      vi.spyOn(loopDetector, 'addAndCheck').mockReturnValue(true);
+      vi.spyOn(loopDetector, 'getLastLoopType').mockReturnValue(null);
+
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'looping' };
+        })(),
+      );
+
+      const stream = client.sendMessageStream(
+        [{ text: 'trigger a loop' }],
+        new AbortController().signal,
+        'prompt-id-loop',
+        { type: SendMessageType.UserQuery },
+      );
+      const events = [];
+      for await (const event of stream) {
+        events.push(event);
+      }
+
+      expect(events.some((e) => e.type === GeminiEventType.LoopDetected)).toBe(
+        true,
+      );
+      expect(abortHandlerInvoked).toBe(true);
+      expect(client['pendingMemoryPrefetch']).toBeUndefined();
+    });
+
+    it('should PRESERVE the pending prefetch when next-speaker continueTurn returns', async () => {
+      // Self-inflicted-regression guard for the round-4 finding:
+      // the bottom-of-try `normalCompletion = true` doesn't cover the
+      // `return continueTurn;` path, so the outer's finally used to cancel
+      // the still-pending prefetch — meaning a subsequent ToolResult turn
+      // would have no memory to consume.
+      let abortHandlerInvoked = false;
+      mockMemoryManager.recall.mockImplementation((_root, _query, opts) => {
+        opts.abortSignal?.addEventListener('abort', () => {
+          abortHandlerInvoked = true;
+        });
+        return new Promise(() => {}); // never settles
+      });
+
+      const mockChat: Partial<GeminiChat> = {
+        addHistory: vi.fn(),
+        getHistory: vi.fn().mockReturnValue([]),
+      };
+      client['chat'] = mockChat as GeminiChat;
+      mockTurnRunFn.mockReturnValue(
+        (async function* () {
+          yield { type: 'content', value: 'outer reply' };
+        })(),
+      );
+
+      // Force the next-speaker check to recurse so we hit `return continueTurn`.
+      // The recursion call passes through this same mock stream and returns.
+      const { checkNextSpeaker } = await import(
+        '../utils/nextSpeakerChecker.js'
+      );
+      const mockedCheckNextSpeaker = vi.mocked(checkNextSpeaker);
+      mockedCheckNextSpeaker
+        .mockResolvedValueOnce({
+          reasoning: 'forced',
+          next_speaker: 'model',
+        })
+        .mockResolvedValue(null); // inner recursion: stop
+      // Each recursive sendMessageStream call asks turn.run() for a new stream.
+      mockTurnRunFn.mockImplementation(
+        () =>
+          (async function* () {
+            yield { type: 'content', value: 'reply' };
+          })() as unknown as AsyncGenerator<ServerGeminiStreamEvent>,
+      );
+
+      const stream = client.sendMessageStream(
+        [{ text: 'hello' }],
+        new AbortController().signal,
+        'prompt-id-continueturn',
+        { type: SendMessageType.UserQuery },
+      );
+      for await (const _ of stream) {
+        // consume
+      }
+
+      // The prefetch must survive the continueTurn return so a follow-up
+      // ToolResult turn can consume it.
+      expect(abortHandlerInvoked).toBe(false);
+      expect(client['pendingMemoryPrefetch']).not.toBeUndefined();
+    });
+
     it('should proceed without auto-memory when managed auto-memory is disabled', async () => {
       // When getManagedAutoMemoryEnabled returns false, no recall is initiated
       // and sendMessageStream completes without memory content
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 2ff9f1bc6f7..e66362900dd 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -153,41 +153,25 @@ function wrapIdeContext(contextText: string): string {
 }
 
 /**
- * Resolve the auto-memory recall promise with a hard deadline.
- * If the recall (model-driven selection + heuristic fallback) does not complete
- * within the deadline, return an empty result so the main request is not delayed.
+ * Handle for a non-blocking auto-memory recall prefetch.
  *
- * The deadline is set slightly above the model-driven selector's own
- * AbortSignal.timeout (2s) to give the heuristic fallback time to complete,
- * but low enough that the user does not perceive a delay on every turn.
+ * Lifecycle:
+ *  1. Created on UserQuery/Cron — the recall promise fires immediately,
+ *     `pendingMemoryPrefetch` is set to this handle.
+ *  2. Consumed at either of two opportunistic points: a zero-wait
+ *     `settledAt !== null` poll just before the UserQuery main request,
+ *     or — if recall hadn't settled yet — on the first ToolResult turn.
+ *  3. Aborted-and-discarded by every cleanup path (resetChat,
+ *     MaxSessionTurns, etc.) or replaced when a new UserQuery arrives.
  */
-async function resolveAutoMemoryWithDeadline(
-  promise: Promise<RelevantAutoMemoryPromptResult> | undefined,
-  onDeadline: () => void,
-): Promise<RelevantAutoMemoryPromptResult> {
-  if (!promise) {
-    return EMPTY_RELEVANT_AUTO_MEMORY_RESULT;
-  }
-
-  let timer: ReturnType<typeof setTimeout> | undefined;
-  const deadline = new Promise<RelevantAutoMemoryPromptResult>((resolve) => {
-    timer = setTimeout(() => {
-      try {
-        onDeadline();
-      } finally {
-        resolve(EMPTY_RELEVANT_AUTO_MEMORY_RESULT);
-      }
-    }, 2_500);
-  });
-
-  try {
-    return await Promise.race([promise, deadline]);
-  } finally {
-    if (timer !== undefined) {
-      clearTimeout(timer);
-    }
-  }
-}
+type MemoryPrefetchHandle = {
+  promise: Promise<RelevantAutoMemoryPromptResult>;
+  /** Set by promise.finally(). null until the promise settles. */
+  settledAt: number | null;
+  /** True after memory has been injected — prevents double-inject. */
+  consumed: boolean;
+  controller: AbortController;
+};
 
 /** Tools that can write to the skills directory, used to detect skillsModifiedInSession. */
 const SKILL_WRITE_TOOL_NAMES: ReadonlySet<string> = new Set([
@@ -207,7 +191,7 @@ export class GeminiClient {
   private lastPromptId: string | undefined = undefined;
   private lastSentIdeContext: IdeContext | undefined;
   private forceFullIdeContext = true;
-  private pendingRecallAbortController: AbortController | undefined;
+  private pendingMemoryPrefetch: MemoryPrefetchHandle | undefined;
   private lastSessionStartContext: string | undefined;
   private lastSessionStartSource: SessionStartSource | undefined;
 
@@ -432,6 +416,50 @@ export class GeminiClient {
     });
   }
 
+  /**
+   * Abort and release the pending auto-memory prefetch in one step.
+   * Safe to call when no prefetch is pending — does nothing. Centralises
+   * the abort-then-clear idiom so every cleanup path (resetChat, early
+   * returns, finally) cannot half-fix one without the other.
+   *
+   * If the handle has already settled (recall completed but consume point
+   * hadn't run yet), the settled result is discarded — logged at debug so
+   * operators can diagnose missing-memory scenarios.
+   */
+  private cancelPendingMemoryPrefetch(): void {
+    const handle = this.pendingMemoryPrefetch;
+    if (!handle) return;
+    if (handle.settledAt !== null && !handle.consumed) {
+      debugLogger.debug('Discarding settled but unconsumed memory prefetch.');
+    }
+    handle.controller.abort();
+    this.pendingMemoryPrefetch = undefined;
+  }
+
+  /**
+   * Atomically consume the pending prefetch if it has already settled.
+   * Returns the recall result (caller decides where to inject it in
+   * `requestToSend`), or `null` if there's nothing to consume yet.
+   *
+   * Centralises the consume-and-mark dance so the UserQuery and ToolResult
+   * inject sites can't drift on the guard logic.
+   */
+  private async tryConsumeMemoryPrefetch(): Promise<RelevantAutoMemoryPromptResult | null> {
+    const handle = this.pendingMemoryPrefetch;
+    if (!handle || handle.settledAt === null || handle.consumed) {
+      return null;
+    }
+    handle.consumed = true;
+    this.pendingMemoryPrefetch = undefined;
+    const result = await handle.promise; // already settled, returns immediately
+    if (result.prompt) {
+      for (const doc of result.selectedDocs) {
+        this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
+      }
+    }
+    return result;
+  }
+
   async resetChat(): Promise<void> {
     this.initializedSessionId = undefined;
     this.surfacedRelevantAutoMemoryPaths.clear();
@@ -445,10 +473,7 @@ export class GeminiClient {
     this.config.getBaseLlmClient().clearPerModelGeneratorCache();
     // Abort any in-flight auto-memory recall so the stale controller
     // does not leak into the next session.
-    if (this.pendingRecallAbortController) {
-      this.pendingRecallAbortController.abort();
-      this.pendingRecallAbortController = undefined;
-    }
+    this.cancelPendingMemoryPrefetch();
     // Drop any deferred tools revealed this session so /clear really gives
     // a clean slate. We don't clear inside startChat itself because that path
     // is also taken by compression (which preserves the session), and
@@ -1044,9 +1069,6 @@ export class GeminiClient {
     turns: number = MAX_TURNS,
   ): AsyncGenerator<ServerGeminiStreamEvent, Turn> {
     const messageType = options?.type ?? SendMessageType.UserQuery;
-    let relevantAutoMemoryPromise:
-      | Promise<RelevantAutoMemoryPromptResult>
-      | undefined;
 
     if (messageType === SendMessageType.Retry) {
       this.stripOrphanedUserEntriesFromHistory();
@@ -1139,28 +1161,54 @@ export class GeminiClient {
       }
     }
 
+    // Tracks whether the generator reached its natural end (the bottom-of-try
+    // `return turn`). Only on that path do we want to preserve the pending
+    // memory prefetch so the next ToolResult turn can consume it. Any other
+    // exit (LoopDetected, Error, signal abort, uncaught exception, abnormal
+    // early-return) leaves this `false`, and the `finally` block aborts the
+    // prefetch as a safety net.
+    let normalCompletion = false;
     try {
       if (
         messageType === SendMessageType.UserQuery ||
         messageType === SendMessageType.Cron
       ) {
         if (this.config.getManagedAutoMemoryEnabled()) {
-          const recallAbortController = new AbortController();
-          const rawRecallPromise = this.config
+          // A previous recall may still be pending (slow side-query, new user
+          // turn arrived before it settled). Abort it before installing the
+          // new handle so the orphan doesn't keep running indefinitely.
+          this.cancelPendingMemoryPrefetch();
+          const controller = new AbortController();
+          // Bridge the caller's signal into the prefetch controller so a user
+          // abort (Ctrl-C / Esc) on the parent turn also terminates the
+          // recall side-query. `{ once: true }` lets the listener clean itself
+          // up after firing; we still call removeEventListener on the promise's
+          // finally to cover the normal-completion case so a long-lived parent
+          // signal doesn't accumulate listeners across many turns.
+          const onParentAbort = () => controller.abort();
+          if (signal.aborted) {
+            controller.abort();
+          } else {
+            signal.addEventListener('abort', onParentAbort, { once: true });
+          }
+          const promise = this.config
             .getMemoryManager()
             .recall(this.config.getProjectRoot(), partToString(request), {
               config: this.config,
               excludedFilePaths: this.surfacedRelevantAutoMemoryPaths,
-              abortSignal: recallAbortController.signal,
+              abortSignal: controller.signal,
             })
             .catch((error: unknown) => {
+              // Abort sources are now numerous (caller signal, new UserQuery,
+              // cleanup paths, safety-net timeout). Keep a debug trace so
+              // operators can diagnose missing-memory scenarios without
+              // raising noise on the common abort path.
               if (
                 error instanceof DOMException &&
                 error.name === 'AbortError'
               ) {
                 debugLogger.debug(
-                  'Auto-memory recall aborted by deadline.',
-                  error,
+                  'Managed auto-memory recall prefetch aborted.',
                 );
               } else {
                 debugLogger.warn(
@@ -1170,14 +1218,17 @@ export class GeminiClient {
               }
               return EMPTY_RELEVANT_AUTO_MEMORY_RESULT;
             });
-          this.pendingRecallAbortController = recallAbortController;
-          // Race the recall against the deadline at initiation time so the 2.5s
-          // budget is not consumed by intermediate work (microcompact, compression,
-          // token checks, IDE context) between initiation and consumption.
-          relevantAutoMemoryPromise = resolveAutoMemoryWithDeadline(
-            rawRecallPromise,
-            () => recallAbortController.abort(),
-          );
+          const handle: MemoryPrefetchHandle = {
+            promise,
+            settledAt: null,
+            consumed: false,
+            controller,
+          };
+          void promise.finally(() => {
+            handle.settledAt = Date.now();
+            signal.removeEventListener('abort', onParentAbort);
+          });
+          this.pendingMemoryPrefetch = handle;
         }
 
         // Track prompt count for commit attribution. Only the user typing a
@@ -1289,8 +1340,7 @@ export class GeminiClient {
           this.config.getMaxSessionTurns() > 0 &&
           this.sessionTurnCount > this.config.getMaxSessionTurns()
         ) {
-          this.pendingRecallAbortController?.abort();
-          this.pendingRecallAbortController = undefined;
+          this.cancelPendingMemoryPrefetch();
           yield { type: GeminiEventType.MaxSessionTurns };
           if (isTopLevelInteraction)
             endInteractionSpan('error', {
@@ -1303,8 +1353,7 @@ export class GeminiClient {
       // Ensure turns never exceeds MAX_TURNS to prevent infinite loops
       const boundedTurns = Math.min(turns, MAX_TURNS);
       if (!boundedTurns) {
-        this.pendingRecallAbortController?.abort();
-        this.pendingRecallAbortController = undefined;
+        this.cancelPendingMemoryPrefetch();
         if (isTopLevelInteraction)
           endInteractionSpan('error', { errorMessage: 'max turns exhausted' });
         return new Turn(this.getChat(), prompt_id);
@@ -1319,8 +1368,7 @@ export class GeminiClient {
         const lastPromptTokenCount =
           uiTelemetryService.getLastPromptTokenCount();
         if (lastPromptTokenCount > sessionTokenLimit) {
-          this.pendingRecallAbortController?.abort();
-          this.pendingRecallAbortController = undefined;
+          this.cancelPendingMemoryPrefetch();
           yield {
             type: GeminiEventType.SessionTokenLimitExceeded,
             value: {
@@ -1380,8 +1428,7 @@ export class GeminiClient {
             `Arena control signal received: ${controlSignal.type} - ${controlSignal.reason}`,
           );
           await arenaAgentClient.reportCancelled();
-          this.pendingRecallAbortController?.abort();
-          this.pendingRecallAbortController = undefined;
+          this.cancelPendingMemoryPrefetch();
           if (isTopLevelInteraction) endInteractionSpan('cancelled');
           return new Turn(this.getChat(), prompt_id);
         }
@@ -1407,20 +1454,6 @@ export class GeminiClient {
         messageType === SendMessageType.Cron
       ) {
         const systemReminders = [];
-        // The recall promise was already raced against the 2.5s deadline at
-        // initiation time; this await just collects the result.
-        this.pendingRecallAbortController = undefined;
-        const relevantAutoMemory = relevantAutoMemoryPromise
-          ? await relevantAutoMemoryPromise
-          : EMPTY_RELEVANT_AUTO_MEMORY_RESULT;
-        const relevantAutoMemoryPrompt = relevantAutoMemory.prompt;
-
-        if (relevantAutoMemoryPrompt) {
-          systemReminders.push(relevantAutoMemoryPrompt);
-          for (const doc of relevantAutoMemory.selectedDocs) {
-            this.surfacedRelevantAutoMemoryPaths.add(doc.filePath);
-          }
-        }
 
         // add subagent system reminder if there are subagents
         const hasAgentTool = await this.config
@@ -1455,9 +1488,41 @@ export class GeminiClient {
           }
         }
 
+        // Zero-wait poll: consume only if the prefetch has already settled.
+        // Done AFTER the async reminder setup above so recall settling during
+        // those awaits still gets caught here. (settledAt is set in
+        // promise.finally(); microtask ordering guarantees it's visible
+        // after any await prior to this point — flatMapTextParts above is
+        // the natural drain.) If still not settled, skip — the ToolResult
+        // inject point will retry on the next turn.
+        const userQueryMemory = await this.tryConsumeMemoryPrefetch();
+        if (userQueryMemory?.prompt) {
+          // Unshift to the front of systemReminders: on a UserQuery turn
+          // requestToSend leads with user text, so positioning memory at
+          // the very start of the system-reminder block keeps it close to
+          // the user prompt. Contrast the ToolResult path below, which
+          // must append to avoid splitting functionCall / functionResponse.
+          systemReminders.unshift(userQueryMemory.prompt);
+        }
+
         requestToSend = [...systemReminders, ...requestToSend];
       }
 
+      if (messageType === SendMessageType.ToolResult) {
+        const toolResultMemory = await this.tryConsumeMemoryPrefetch();
+        if (toolResultMemory?.prompt) {
+          // Append (not prepend): on a ToolResult turn, requestToSend leads
+          // with functionResponse parts that must immediately follow the
+          // model's functionCall (Qwen API constraint — same reason the
+          // IDE-context block above is skipped while a tool call is pending,
+          // see the `hasPendingToolCall` guard). Putting the memory text
+          // after the functionResponse parts keeps the call/response pairing
+          // intact under native Gemini; the OpenAI converter then emits the
+          // text as a separate user message after the tool messages.
+          requestToSend = [...requestToSend, toolResultMemory.prompt];
+        }
+      }
+
       const activeGoalAtTurnStart = getActiveGoal(this.config.getSessionId());
       if (activeGoalAtTurnStart) {
         yield {
@@ -1503,6 +1568,9 @@ export class GeminiClient {
             this.lastApiCompletionTimestamp = Date.now();
             if (isTopLevelInteraction)
               endInteractionSpan('error', { errorMessage: 'loop detected' });
+            // finally cleanup catches this, but cancel explicitly to match
+            // the cleanup pattern at other early-return sites.
+            this.cancelPendingMemoryPrefetch();
             return turn;
           }
         }
@@ -1553,6 +1621,9 @@ export class GeminiClient {
               event.value instanceof Error ? '[API error]' : 'unknown error';
             endInteractionSpan('error', { errorMessage: errMsg });
           }
+          // finally cleanup catches this, but cancel explicitly to match
+          // the cleanup pattern at other early-return sites.
+          this.cancelPendingMemoryPrefetch();
           return turn;
         }
       }
@@ -1723,6 +1794,10 @@ export class GeminiClient {
           );
           if (isTopLevelInteraction)
             endInteractionSpan(signal.aborted ? 'cancelled' : 'ok');
+          // Preserve the pending prefetch: the inner Hook turn we just
+          // yielded may have produced tool calls, and the caller's next
+          // ToolResult turn still needs to consume the recall result.
+          normalCompletion = true;
           return hookTurn;
         }
 
@@ -1789,6 +1864,11 @@ export class GeminiClient {
           );
           if (isTopLevelInteraction)
             endInteractionSpan(signal.aborted ? 'cancelled' : 'ok');
+          // Preserve the pending prefetch: same reasoning as the
+          // `return hookTurn` site above — the recursive Hook turn may
+          // have produced tool calls whose ToolResult turn still needs
+          // the recall result.
+          normalCompletion = true;
           return continueTurn;
         }
 
@@ -1808,8 +1888,18 @@ export class GeminiClient {
       if (isTopLevelInteraction) {
         endInteractionSpan(signal?.aborted ? 'cancelled' : 'ok');
       }
+      // Reached the bottom of the try — this turn ended cleanly. Preserve
+      // any still-pending memory prefetch so the next ToolResult turn can
+      // consume it (the whole point of the fire-and-forget design).
+      normalCompletion = true;
       return turn;
     } finally {
+      // Belt-and-suspenders: abort the prefetch on any exit other than the
+      // bottom-of-try `return turn`. Catches uncaught exceptions and guards
+      // against future early-return sites that forget to call cancel.
+      if (!normalCompletion) {
+        this.cancelPendingMemoryPrefetch();
+      }
       if (isTopLevelInteraction) {
         endInteractionSpan(signal?.aborted ? 'cancelled' : 'error', {
           errorMessage: 'unexpected exit',
diff --git a/packages/core/src/memory/recall.ts b/packages/core/src/memory/recall.ts
index 8697496f953..5b476b21ce1 100644
--- a/packages/core/src/memory/recall.ts
+++ b/packages/core/src/memory/recall.ts
@@ -219,12 +219,25 @@ export async function resolveRelevantAutoMemoryPromptForQuery(
         strategy,
       };
     } catch (error) {
-      // Distinguish deadline-triggered cancellation from real model errors
-      // so oncall debugging is not misled by the fallback log.
+      // Distinguish three cases so oncall debugging isn't misled:
+      //   - caller-driven abort (user signal / new UserQuery / session
+      //     cleanup): caller signal is aborted → heuristic fallback is
+      //     skipped below at `options.abortSignal?.aborted`, so the
+      //     result really is discarded.
+      //   - 30 s safety-net timeout in relevanceSelector: only the inner
+      //     combined signal aborts; the caller's signal is NOT aborted,
+      //     so the heuristic fallback below DOES run.
+      //   - real model error: warn at the higher level.
       if (error instanceof DOMException && error.name === 'AbortError') {
-        debugLogger.debug(
-          'Model-driven auto-memory recall cancelled by deadline; heuristic result discarded.',
-        );
+        if (options.abortSignal?.aborted) {
+          debugLogger.debug(
+            'Model-driven auto-memory recall aborted by caller; heuristic result discarded.',
+          );
+        } else {
+          debugLogger.debug(
+            'Model-driven auto-memory recall timed out (30 s safety net); heuristic fallback will run.',
+          );
+        }
       } else {
         debugLogger.warn(
           'Model-driven auto-memory recall failed; falling back to heuristic selection.',
@@ -234,8 +247,8 @@ export async function resolveRelevantAutoMemoryPromptForQuery(
     }
   }
 
-  // If the caller's abort signal is already set (e.g. deadline fired), skip the
-  // heuristic fallback — the result would be discarded anyway.
+  // If the caller's abort signal is already set, skip the heuristic
+  // fallback — the result would be discarded anyway.
   if (options.abortSignal?.aborted) {
     return {
       prompt: '',
diff --git a/packages/core/src/memory/relevanceSelector.ts b/packages/core/src/memory/relevanceSelector.ts
index 8962eb94d0c..1ef957671f7 100644
--- a/packages/core/src/memory/relevanceSelector.ts
+++ b/packages/core/src/memory/relevanceSelector.ts
@@ -91,9 +91,17 @@ export async function selectRelevantAutoMemoryDocumentsByModel(
     purpose: 'auto-memory-recall',
     contents,
     schema: RESPONSE_SCHEMA,
+    // Caller (`GeminiClient.MemoryPrefetchHandle`) owns lifecycle and aborts
+    // via its controller on cleanup paths. The 30 s ceiling is a generous
+    // safety net that only fires if the model API hangs (network partition,
+    // server stall, runaway retry) AND the caller never aborts. Normal
+    // recalls take ~1 s; 30 s is far above the long tail so this doesn't
+    // re-introduce the 1 s timeout regression that motivated this redesign.
+    // Without this ceiling, a callerless invocation would use an
+    // unsignalled AbortController and run indefinitely.
     abortSignal: callerAbortSignal
-      ? AbortSignal.any([AbortSignal.timeout(1_000), callerAbortSignal])
-      : AbortSignal.timeout(1_000),
+      ? AbortSignal.any([AbortSignal.timeout(30_000), callerAbortSignal])
+      : AbortSignal.timeout(30_000),
 
     // Uses runSideQuery's default side-query model policy: fast model first,
     // then main session model when no fast model is configured.