diff --git a/docs/design/2026-05-15-async-memory-recall-design.md b/docs/design/2026-05-15-async-memory-recall-design.md new file mode 100644 index 00000000000..f11b2ac5d23 --- /dev/null +++ b/docs/design/2026-05-15-async-memory-recall-design.md @@ -0,0 +1,206 @@ +# Async Memory Recall — Design Spec + +**Date:** 2026-05-15 +**Status:** Approved +**Related issues:** #3761, #3759 +**Related PRs:** #3814, #3866 + +--- + +## Problem + +`relevanceSelector.ts` uses `AbortSignal.timeout(1_000)` (introduced by #3866). On first-session cold starts, qwen3.5-flash averages ~908 ms — consistently hitting the 1 s threshold. The outer 2.5 s deadline in `resolveAutoMemoryWithDeadline` means every UserQuery can block for up to 2.5 s even when recall always fails. + +Root cause: the main-agent request path `await`s the recall result before sending to the model. Any slowness in the recall side-query directly adds to user-visible latency. + +--- + +## Design + +### Core idea + +Fire recall on UserQuery and never await it. Consume the result at two opportunistic points — whichever fires first: + +1. **UserQuery consume point** — synchronous `settledAt !== null` check just before `turn.run()`. Zero-wait: if already settled, use it; if not, skip. +2. **ToolResult inject point** — same check on every ToolResult turn. Injects memory as a `system-reminder` **appended after** the functionResponse parts in `requestToSend`, giving the model memory context before its next response. (Append, not prepend: the Qwen API requires the functionResponse to immediately follow the model's functionCall — see the existing `hasPendingToolCall` IDE-context skip for the same constraint.) + +This matches the pattern used by Claude Code upstream (`startRelevantMemoryPrefetch` / `settledAt` polling in `query.ts`). + +--- + +## Data structures + +### New type `MemoryPrefetchHandle` (in `client.ts`) + +```typescript +type MemoryPrefetchHandle = { + promise: Promise; + /** Set by promise.finally(). null until the promise settles. */ + settledAt: number | null; + /** True after memory has been injected — prevents double-inject. */ + consumed: boolean; + controller: AbortController; +}; +``` + +### Field change on `GeminiClient` + +| Remove | Add | +| ------------------------------------------------------------ | ---------------------------------------------------------- | +| `pendingRecallAbortController: AbortController \| undefined` | `pendingMemoryPrefetch: MemoryPrefetchHandle \| undefined` | + +--- + +## Changes + +### 1. `client.ts` — remove `resolveAutoMemoryWithDeadline` + +Delete the function entirely. It is replaced by the `settledAt` flag mechanism. + +### 2. `client.ts` — UserQuery fire path + +Replace the `resolveAutoMemoryWithDeadline` call with: + +```typescript +// Abort any in-flight prefetch from a previous UserQuery before installing +// the new handle (prevents orphan side-queries when the user types again +// before recall settles). +this.pendingMemoryPrefetch?.controller.abort(); +this.pendingMemoryPrefetch = undefined; + +const controller = new AbortController(); +// Bridge the caller's signal into the prefetch controller so a user abort +// (Ctrl-C / Esc) on the parent turn also terminates the recall side-query. +const onParentAbort = () => controller.abort(); +if (signal.aborted) { + controller.abort(); +} else { + signal.addEventListener('abort', onParentAbort, { once: true }); +} + +const promise = this.config + .getMemoryManager() + .recall(projectRoot, partToString(request), { + config: this.config, + excludedFilePaths: this.surfacedRelevantAutoMemoryPaths, + abortSignal: controller.signal, + }) + .catch((error: unknown) => { + if (!(error instanceof DOMException && error.name === 'AbortError')) { + debugLogger.warn('Managed auto-memory recall prefetch failed.', error); + } + return EMPTY_RELEVANT_AUTO_MEMORY_RESULT; + }); + +const handle: MemoryPrefetchHandle = { + promise, + settledAt: null, + consumed: false, + controller, +}; +void promise.finally(() => { + handle.settledAt = Date.now(); + signal.removeEventListener('abort', onParentAbort); +}); +this.pendingMemoryPrefetch = handle; +// no await — continue immediately +``` + +### 3. `client.ts` — UserQuery consume point (replaces `await relevantAutoMemoryPromise`) + +```typescript +const prefetchHandle = this.pendingMemoryPrefetch; +if ( + prefetchHandle && + prefetchHandle.settledAt !== null && + !prefetchHandle.consumed +) { + prefetchHandle.consumed = true; + this.pendingMemoryPrefetch = undefined; + const result = await prefetchHandle.promise; // already settled, returns immediately + if (result.prompt) { + // unshift, not push: keep memory at the front of systemReminders so + // it leads the system-reminder block on UserQuery turns. (ToolResult + // turns instead append to requestToSend to preserve functionCall / + // functionResponse pairing — see below.) + systemReminders.unshift(result.prompt); + for (const doc of result.selectedDocs) { + this.surfacedRelevantAutoMemoryPaths.add(doc.filePath); + } + } +} +``` + +### 4. `client.ts` — ToolResult inject point (new) + +After `requestToSend` is assembled, before `turn.run()`, add: + +```typescript +if (messageType === SendMessageType.ToolResult) { + const prefetchHandle = this.pendingMemoryPrefetch; + if ( + prefetchHandle && + prefetchHandle.settledAt !== null && + !prefetchHandle.consumed + ) { + prefetchHandle.consumed = true; + this.pendingMemoryPrefetch = undefined; + const result = await prefetchHandle.promise; + if (result.prompt) { + // Append (not prepend) so functionResponse parts stay first + // and the model's functionCall/functionResponse pairing + // isn't broken on the native Gemini path. + requestToSend = [...requestToSend, result.prompt]; + for (const doc of result.selectedDocs) { + this.surfacedRelevantAutoMemoryPaths.add(doc.filePath); + } + } + } +} +``` + +### 5. `client.ts` — cleanup paths + +The handle is released by two distinct mechanisms: + +**5 abort-and-clear sites** (the prefetch is still pending, abort the controller before dropping the reference). Replace `pendingRecallAbortController?.abort()` + `= undefined` with: + +```typescript +this.pendingMemoryPrefetch?.controller.abort(); +this.pendingMemoryPrefetch = undefined; +``` + +Sites: `resetChat()`, `MaxSessionTurns` early-return, `boundedTurns=0` early-return, `SessionTokenLimitExceeded` early-return, Arena control-signal early-return. The fire path itself also performs this abort-then-replace when a new UserQuery arrives while the previous prefetch is still in flight. + +**2 clear-only sites** (the prefetch has already settled and we're consuming it — no controller to abort, just drop the reference): + +```typescript +prefetchHandle.consumed = true; +this.pendingMemoryPrefetch = undefined; +``` + +Sites: UserQuery consume point, ToolResult inject point. + +### 6. `relevanceSelector.ts` — remove `AbortSignal.timeout(1_000)` + +Remove the combined `AbortSignal.any([AbortSignal.timeout(1_000), callerAbortSignal])` and pass `callerAbortSignal` directly. + +--- + +## Behaviour comparison + +| Scenario | Before | After | +| -------------------------------------------- | ------------------------------ | ------------------------------------------------------ | +| recall completes before model prep | inject on UserQuery, ~0 wait | inject on UserQuery, ~0 wait | +| recall slow (cold start) | block up to 2.5 s | skip UserQuery, inject on first ToolResult | +| recall times out (1 s) | abort, empty result, no memory | no hard timeout; inject whenever settled | +| no tool calls, recall slow | block up to 2.5 s, then skip | skip UserQuery, no ToolResult opportunity — miss | +| user sends 2nd message before recall settles | 2nd recall races 1st handle | 1st handle aborted when 2nd UserQuery fires new handle | + +--- + +## Out of scope + +- Changing the memory injection format from `system-reminder` to `tool-result` attachment (CC style) +- Per-session byte budget skip gate +- Single-word prompt skip gate diff --git a/packages/core/src/core/client.test.ts b/packages/core/src/core/client.test.ts index a45c51eda1a..bde6856c67a 100644 --- a/packages/core/src/core/client.test.ts +++ b/packages/core/src/core/client.test.ts @@ -39,7 +39,12 @@ import { import type { ModelsConfig } from '../models/modelsConfig.js'; import { UnauthorizedError } from '../utils/errors.js'; import { retryWithBackoff } from '../utils/retry.js'; -import { CompressionStatus, GeminiEventType, Turn } from './turn.js'; +import { + CompressionStatus, + GeminiEventType, + Turn, + type ServerGeminiStreamEvent, +} from './turn.js'; vi.mock('../utils/retry.js', () => ({ retryWithBackoff: vi.fn(async (fn) => await fn()), @@ -2700,20 +2705,9 @@ hello }); it('should not block the main request when auto-memory recall is slow', async () => { - // Simulate a recall that takes longer than the 2.5s deadline - mockMemoryManager.recall.mockReturnValue( - new Promise((resolve) => - setTimeout( - () => - resolve({ - prompt: '## Relevant memory\n\nSlow memory result.', - selectedDocs: [], - strategy: 'model', - }), - 10_000, - ), - ), - ); + // Recall never settles — settledAt stays null so the UserQuery consume + // point skips it and turn.run() is called immediately without memory. + mockMemoryManager.recall.mockReturnValue(new Promise(() => {})); const mockStream = (async function* () { yield { type: 'content', value: 'Hello' }; @@ -2726,38 +2720,28 @@ hello }; client['chat'] = mockChat as GeminiChat; - vi.useFakeTimers(); - try { - const streamPromise = (async () => { - const stream = client.sendMessageStream( - [{ text: 'Quick question' }], - new AbortController().signal, - 'prompt-id-slow-memory', - ); - for await (const _ of stream) { - // consume stream - } - })(); - - // Advance past the 2.5s deadline — the main request should proceed - await vi.advanceTimersByTimeAsync(3_000); - await streamPromise; - - // The main request should have been called without the slow memory - expect(mockTurnRunFn).toHaveBeenCalledWith( - 'test-model', - expect.not.arrayContaining([ - expect.stringContaining('Slow memory result'), - ]), - expect.any(AbortSignal), - ); - } finally { - vi.useRealTimers(); + const stream = client.sendMessageStream( + [{ text: 'Quick question' }], + new AbortController().signal, + 'prompt-id-slow-memory', + ); + for await (const _ of stream) { + // consume stream } + + // turn.run() must have been called without the slow memory + expect(mockTurnRunFn).toHaveBeenCalledWith( + 'test-model', + expect.not.arrayContaining([ + expect.stringContaining('Slow memory result'), + ]), + expect.any(AbortSignal), + ); }); - it('should include auto-memory prompt when recall completes within deadline', async () => { - // Simulate a fast recall that completes well within the deadline + it('should inject auto-memory at UserQuery consume point when recall already settled', async () => { + // mockResolvedValue settles synchronously; by the time the consume-point + // check runs (after at least one await), settledAt is set. mockMemoryManager.recall.mockResolvedValue({ prompt: '## Relevant memory\n\nFast memory result.', selectedDocs: [], @@ -2791,6 +2775,322 @@ hello ); }); + it('should inject auto-memory on first ToolResult when recall settles after UserQuery', async () => { + // Controllable promise — recall stays pending across the UserQuery turn + // and only settles before the ToolResult turn runs. + let resolveRecall: + | ((value: { + prompt: string; + selectedDocs: never[]; + strategy: 'model'; + }) => void) + | undefined; + mockMemoryManager.recall.mockReturnValue( + new Promise((resolve) => { + resolveRecall = resolve; + }), + ); + + const mockStream = (async function* () { + yield { type: 'content', value: 'Hello' }; + })(); + mockTurnRunFn.mockReturnValue(mockStream); + + const mockChat: Partial = { + addHistory: vi.fn(), + getHistory: vi.fn().mockReturnValue([]), + }; + client['chat'] = mockChat as GeminiChat; + + // Turn 1: UserQuery — recall still pending, no injection + const userStream = client.sendMessageStream( + [{ text: 'What is my name?' }], + new AbortController().signal, + 'prompt-id-user-query', + { type: SendMessageType.UserQuery }, + ); + for await (const _ of userStream) { + // consume + } + + expect(mockTurnRunFn).toHaveBeenLastCalledWith( + 'test-model', + expect.not.arrayContaining([ + expect.stringContaining('Deferred memory result'), + ]), + expect.any(AbortSignal), + ); + + // Recall settles between turns + resolveRecall!({ + prompt: '## Relevant memory\n\nDeferred memory result.', + selectedDocs: [], + strategy: 'model', + }); + // Drain microtasks so the settledAt finally() callback runs + await Promise.resolve(); + await Promise.resolve(); + + // Turn 2: ToolResult — settledAt is now non-null, memory should inject + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { type: 'content', value: 'world' }; + })(), + ); + const toolStream = client.sendMessageStream( + [{ functionResponse: { name: 'foo', response: { ok: true } } }], + new AbortController().signal, + 'prompt-id-tool-result', + { type: SendMessageType.ToolResult }, + ); + for await (const _ of toolStream) { + // consume + } + + // Memory must come AFTER the functionResponse part so the Qwen API + // call/response pairing isn't broken (see client.ts:1209-1213). + const lastCallArgs = mockTurnRunFn.mock.lastCall; + const requestArr = lastCallArgs![1] as unknown[]; + const functionResponseIdx = requestArr.findIndex( + (p) => typeof p === 'object' && p !== null && 'functionResponse' in p, + ); + const memoryIdx = requestArr.findIndex( + (p) => p === '## Relevant memory\n\nDeferred memory result.', + ); + expect(functionResponseIdx).toBeGreaterThanOrEqual(0); + expect(memoryIdx).toBeGreaterThan(functionResponseIdx); + }); + + it('should abort the pending prefetch when the caller signal aborts', async () => { + let abortHandlerInvoked = false; + mockMemoryManager.recall.mockImplementation((_root, _query, opts) => { + opts.abortSignal?.addEventListener('abort', () => { + abortHandlerInvoked = true; + }); + return new Promise(() => {}); + }); + + const mockChat: Partial = { + addHistory: vi.fn(), + getHistory: vi.fn().mockReturnValue([]), + }; + client['chat'] = mockChat as GeminiChat; + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { type: 'content', value: 'Hello' }; + })(), + ); + + const callerController = new AbortController(); + const stream = client.sendMessageStream( + [{ text: 'user typed but then aborted' }], + callerController.signal, + 'prompt-id-aborted', + { type: SendMessageType.UserQuery }, + ); + for await (const _ of stream) { + // consume + } + + expect(abortHandlerInvoked).toBe(false); + callerController.abort(); + expect(abortHandlerInvoked).toBe(true); + }); + + it('should abort the previous prefetch when a new UserQuery arrives mid-flight', async () => { + // Pending recall on first UserQuery — never resolves on its own. + const abortSignals: AbortSignal[] = []; + mockMemoryManager.recall.mockImplementation((_root, _query, opts) => { + abortSignals.push(opts.abortSignal as AbortSignal); + return new Promise(() => {}); + }); + + const mockChat: Partial = { + addHistory: vi.fn(), + getHistory: vi.fn().mockReturnValue([]), + }; + client['chat'] = mockChat as GeminiChat; + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { type: 'content', value: 'Hello' }; + })(), + ); + + // First UserQuery — installs prefetch #1 + const stream1 = client.sendMessageStream( + [{ text: 'first' }], + new AbortController().signal, + 'prompt-id-1', + { type: SendMessageType.UserQuery }, + ); + for await (const _ of stream1) { + // consume + } + expect(abortSignals.length).toBe(1); + expect(abortSignals[0].aborted).toBe(false); + + // Second UserQuery — should abort #1 before installing #2 + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { type: 'content', value: 'Hello again' }; + })(), + ); + const stream2 = client.sendMessageStream( + [{ text: 'second' }], + new AbortController().signal, + 'prompt-id-2', + { type: SendMessageType.UserQuery }, + ); + for await (const _ of stream2) { + // consume + } + + expect(abortSignals.length).toBe(2); + expect(abortSignals[0].aborted).toBe(true); + expect(abortSignals[1].aborted).toBe(false); + }); + + it('should abort the pending prefetch on resetChat', async () => { + let abortHandlerInvoked = false; + mockMemoryManager.recall.mockImplementation((_root, _query, opts) => { + opts.abortSignal?.addEventListener('abort', () => { + abortHandlerInvoked = true; + }); + return new Promise(() => {}); + }); + + const mockChat: Partial = { + addHistory: vi.fn(), + getHistory: vi.fn().mockReturnValue([]), + }; + client['chat'] = mockChat as GeminiChat; + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { type: 'content', value: 'Hello' }; + })(), + ); + + const stream = client.sendMessageStream( + [{ text: 'first' }], + new AbortController().signal, + 'prompt-id-reset-1', + { type: SendMessageType.UserQuery }, + ); + for await (const _ of stream) { + // consume + } + + expect(abortHandlerInvoked).toBe(false); + await client.resetChat(); + expect(abortHandlerInvoked).toBe(true); + expect(client['pendingMemoryPrefetch']).toBeUndefined(); + }); + + it('should abort the pending prefetch when LoopDetected fires mid-stream', async () => { + let abortHandlerInvoked = false; + mockMemoryManager.recall.mockImplementation((_root, _query, opts) => { + opts.abortSignal?.addEventListener('abort', () => { + abortHandlerInvoked = true; + }); + return new Promise(() => {}); + }); + + const mockChat: Partial = { + addHistory: vi.fn(), + getHistory: vi.fn().mockReturnValue([]), + }; + client['chat'] = mockChat as GeminiChat; + + // Force LoopDetector to trip on the first event. + const loopDetector = client['loopDetector']; + vi.spyOn(loopDetector, 'addAndCheck').mockReturnValue(true); + vi.spyOn(loopDetector, 'getLastLoopType').mockReturnValue(null); + + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { type: 'content', value: 'looping' }; + })(), + ); + + const stream = client.sendMessageStream( + [{ text: 'trigger a loop' }], + new AbortController().signal, + 'prompt-id-loop', + { type: SendMessageType.UserQuery }, + ); + const events = []; + for await (const event of stream) { + events.push(event); + } + + expect(events.some((e) => e.type === GeminiEventType.LoopDetected)).toBe( + true, + ); + expect(abortHandlerInvoked).toBe(true); + expect(client['pendingMemoryPrefetch']).toBeUndefined(); + }); + + it('should PRESERVE the pending prefetch when next-speaker continueTurn returns', async () => { + // Self-inflicted-regression guard for the round-4 finding: + // the bottom-of-try `normalCompletion = true` doesn't cover the + // `return continueTurn;` path, so the outer's finally used to cancel + // the still-pending prefetch — meaning a subsequent ToolResult turn + // would have no memory to consume. + let abortHandlerInvoked = false; + mockMemoryManager.recall.mockImplementation((_root, _query, opts) => { + opts.abortSignal?.addEventListener('abort', () => { + abortHandlerInvoked = true; + }); + return new Promise(() => {}); // never settles + }); + + const mockChat: Partial = { + addHistory: vi.fn(), + getHistory: vi.fn().mockReturnValue([]), + }; + client['chat'] = mockChat as GeminiChat; + mockTurnRunFn.mockReturnValue( + (async function* () { + yield { type: 'content', value: 'outer reply' }; + })(), + ); + + // Force the next-speaker check to recurse so we hit `return continueTurn`. + // The recursion call passes through this same mock stream and returns. + const { checkNextSpeaker } = await import( + '../utils/nextSpeakerChecker.js' + ); + const mockedCheckNextSpeaker = vi.mocked(checkNextSpeaker); + mockedCheckNextSpeaker + .mockResolvedValueOnce({ + reasoning: 'forced', + next_speaker: 'model', + }) + .mockResolvedValue(null); // inner recursion: stop + // Each recursive sendMessageStream call asks turn.run() for a new stream. + mockTurnRunFn.mockImplementation( + () => + (async function* () { + yield { type: 'content', value: 'reply' }; + })() as unknown as AsyncGenerator, + ); + + const stream = client.sendMessageStream( + [{ text: 'hello' }], + new AbortController().signal, + 'prompt-id-continueturn', + { type: SendMessageType.UserQuery }, + ); + for await (const _ of stream) { + // consume + } + + // The prefetch must survive the continueTurn return so a follow-up + // ToolResult turn can consume it. + expect(abortHandlerInvoked).toBe(false); + expect(client['pendingMemoryPrefetch']).not.toBeUndefined(); + }); + it('should proceed without auto-memory when managed auto-memory is disabled', async () => { // When getManagedAutoMemoryEnabled returns false, no recall is initiated // and sendMessageStream completes without memory content diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts index 2ff9f1bc6f7..e66362900dd 100644 --- a/packages/core/src/core/client.ts +++ b/packages/core/src/core/client.ts @@ -153,41 +153,25 @@ function wrapIdeContext(contextText: string): string { } /** - * Resolve the auto-memory recall promise with a hard deadline. - * If the recall (model-driven selection + heuristic fallback) does not complete - * within the deadline, return an empty result so the main request is not delayed. + * Handle for a non-blocking auto-memory recall prefetch. * - * The deadline is set slightly above the model-driven selector's own - * AbortSignal.timeout (2s) to give the heuristic fallback time to complete, - * but low enough that the user does not perceive a delay on every turn. + * Lifecycle: + * 1. Created on UserQuery/Cron — the recall promise fires immediately, + * `pendingMemoryPrefetch` is set to this handle. + * 2. Consumed at either of two opportunistic points: a zero-wait + * `settledAt !== null` poll just before the UserQuery main request, + * or — if recall hadn't settled yet — on the first ToolResult turn. + * 3. Aborted-and-discarded by every cleanup path (resetChat, + * MaxSessionTurns, etc.) or replaced when a new UserQuery arrives. */ -async function resolveAutoMemoryWithDeadline( - promise: Promise | undefined, - onDeadline: () => void, -): Promise { - if (!promise) { - return EMPTY_RELEVANT_AUTO_MEMORY_RESULT; - } - - let timer: ReturnType | undefined; - const deadline = new Promise((resolve) => { - timer = setTimeout(() => { - try { - onDeadline(); - } finally { - resolve(EMPTY_RELEVANT_AUTO_MEMORY_RESULT); - } - }, 2_500); - }); - - try { - return await Promise.race([promise, deadline]); - } finally { - if (timer !== undefined) { - clearTimeout(timer); - } - } -} +type MemoryPrefetchHandle = { + promise: Promise; + /** Set by promise.finally(). null until the promise settles. */ + settledAt: number | null; + /** True after memory has been injected — prevents double-inject. */ + consumed: boolean; + controller: AbortController; +}; /** Tools that can write to the skills directory, used to detect skillsModifiedInSession. */ const SKILL_WRITE_TOOL_NAMES: ReadonlySet = new Set([ @@ -207,7 +191,7 @@ export class GeminiClient { private lastPromptId: string | undefined = undefined; private lastSentIdeContext: IdeContext | undefined; private forceFullIdeContext = true; - private pendingRecallAbortController: AbortController | undefined; + private pendingMemoryPrefetch: MemoryPrefetchHandle | undefined; private lastSessionStartContext: string | undefined; private lastSessionStartSource: SessionStartSource | undefined; @@ -432,6 +416,50 @@ export class GeminiClient { }); } + /** + * Abort and release the pending auto-memory prefetch in one step. + * Safe to call when no prefetch is pending — does nothing. Centralises + * the abort-then-clear idiom so every cleanup path (resetChat, early + * returns, finally) cannot half-fix one without the other. + * + * If the handle has already settled (recall completed but consume point + * hadn't run yet), the settled result is discarded — logged at debug so + * operators can diagnose missing-memory scenarios. + */ + private cancelPendingMemoryPrefetch(): void { + const handle = this.pendingMemoryPrefetch; + if (!handle) return; + if (handle.settledAt !== null && !handle.consumed) { + debugLogger.debug('Discarding settled but unconsumed memory prefetch.'); + } + handle.controller.abort(); + this.pendingMemoryPrefetch = undefined; + } + + /** + * Atomically consume the pending prefetch if it has already settled. + * Returns the recall result (caller decides where to inject it in + * `requestToSend`), or `null` if there's nothing to consume yet. + * + * Centralises the consume-and-mark dance so the UserQuery and ToolResult + * inject sites can't drift on the guard logic. + */ + private async tryConsumeMemoryPrefetch(): Promise { + const handle = this.pendingMemoryPrefetch; + if (!handle || handle.settledAt === null || handle.consumed) { + return null; + } + handle.consumed = true; + this.pendingMemoryPrefetch = undefined; + const result = await handle.promise; // already settled, returns immediately + if (result.prompt) { + for (const doc of result.selectedDocs) { + this.surfacedRelevantAutoMemoryPaths.add(doc.filePath); + } + } + return result; + } + async resetChat(): Promise { this.initializedSessionId = undefined; this.surfacedRelevantAutoMemoryPaths.clear(); @@ -445,10 +473,7 @@ export class GeminiClient { this.config.getBaseLlmClient().clearPerModelGeneratorCache(); // Abort any in-flight auto-memory recall so the stale controller // does not leak into the next session. - if (this.pendingRecallAbortController) { - this.pendingRecallAbortController.abort(); - this.pendingRecallAbortController = undefined; - } + this.cancelPendingMemoryPrefetch(); // Drop any deferred tools revealed this session so /clear really gives // a clean slate. We don't clear inside startChat itself because that path // is also taken by compression (which preserves the session), and @@ -1044,9 +1069,6 @@ export class GeminiClient { turns: number = MAX_TURNS, ): AsyncGenerator { const messageType = options?.type ?? SendMessageType.UserQuery; - let relevantAutoMemoryPromise: - | Promise - | undefined; if (messageType === SendMessageType.Retry) { this.stripOrphanedUserEntriesFromHistory(); @@ -1139,28 +1161,54 @@ export class GeminiClient { } } + // Tracks whether the generator reached its natural end (the bottom-of-try + // `return turn`). Only on that path do we want to preserve the pending + // memory prefetch so the next ToolResult turn can consume it. Any other + // exit (LoopDetected, Error, signal abort, uncaught exception, abnormal + // early-return) leaves this `false`, and the `finally` block aborts the + // prefetch as a safety net. + let normalCompletion = false; try { if ( messageType === SendMessageType.UserQuery || messageType === SendMessageType.Cron ) { if (this.config.getManagedAutoMemoryEnabled()) { - const recallAbortController = new AbortController(); - const rawRecallPromise = this.config + // A previous recall may still be pending (slow side-query, new user + // turn arrived before it settled). Abort it before installing the + // new handle so the orphan doesn't keep running indefinitely. + this.cancelPendingMemoryPrefetch(); + const controller = new AbortController(); + // Bridge the caller's signal into the prefetch controller so a user + // abort (Ctrl-C / Esc) on the parent turn also terminates the + // recall side-query. `{ once: true }` lets the listener clean itself + // up after firing; we still call removeEventListener on the promise's + // finally to cover the normal-completion case so a long-lived parent + // signal doesn't accumulate listeners across many turns. + const onParentAbort = () => controller.abort(); + if (signal.aborted) { + controller.abort(); + } else { + signal.addEventListener('abort', onParentAbort, { once: true }); + } + const promise = this.config .getMemoryManager() .recall(this.config.getProjectRoot(), partToString(request), { config: this.config, excludedFilePaths: this.surfacedRelevantAutoMemoryPaths, - abortSignal: recallAbortController.signal, + abortSignal: controller.signal, }) .catch((error: unknown) => { + // Abort sources are now numerous (caller signal, new UserQuery, + // cleanup paths, safety-net timeout). Keep a debug trace so + // operators can diagnose missing-memory scenarios without + // raising noise on the common abort path. if ( error instanceof DOMException && error.name === 'AbortError' ) { debugLogger.debug( - 'Auto-memory recall aborted by deadline.', - error, + 'Managed auto-memory recall prefetch aborted.', ); } else { debugLogger.warn( @@ -1170,14 +1218,17 @@ export class GeminiClient { } return EMPTY_RELEVANT_AUTO_MEMORY_RESULT; }); - this.pendingRecallAbortController = recallAbortController; - // Race the recall against the deadline at initiation time so the 2.5s - // budget is not consumed by intermediate work (microcompact, compression, - // token checks, IDE context) between initiation and consumption. - relevantAutoMemoryPromise = resolveAutoMemoryWithDeadline( - rawRecallPromise, - () => recallAbortController.abort(), - ); + const handle: MemoryPrefetchHandle = { + promise, + settledAt: null, + consumed: false, + controller, + }; + void promise.finally(() => { + handle.settledAt = Date.now(); + signal.removeEventListener('abort', onParentAbort); + }); + this.pendingMemoryPrefetch = handle; } // Track prompt count for commit attribution. Only the user typing a @@ -1289,8 +1340,7 @@ export class GeminiClient { this.config.getMaxSessionTurns() > 0 && this.sessionTurnCount > this.config.getMaxSessionTurns() ) { - this.pendingRecallAbortController?.abort(); - this.pendingRecallAbortController = undefined; + this.cancelPendingMemoryPrefetch(); yield { type: GeminiEventType.MaxSessionTurns }; if (isTopLevelInteraction) endInteractionSpan('error', { @@ -1303,8 +1353,7 @@ export class GeminiClient { // Ensure turns never exceeds MAX_TURNS to prevent infinite loops const boundedTurns = Math.min(turns, MAX_TURNS); if (!boundedTurns) { - this.pendingRecallAbortController?.abort(); - this.pendingRecallAbortController = undefined; + this.cancelPendingMemoryPrefetch(); if (isTopLevelInteraction) endInteractionSpan('error', { errorMessage: 'max turns exhausted' }); return new Turn(this.getChat(), prompt_id); @@ -1319,8 +1368,7 @@ export class GeminiClient { const lastPromptTokenCount = uiTelemetryService.getLastPromptTokenCount(); if (lastPromptTokenCount > sessionTokenLimit) { - this.pendingRecallAbortController?.abort(); - this.pendingRecallAbortController = undefined; + this.cancelPendingMemoryPrefetch(); yield { type: GeminiEventType.SessionTokenLimitExceeded, value: { @@ -1380,8 +1428,7 @@ export class GeminiClient { `Arena control signal received: ${controlSignal.type} - ${controlSignal.reason}`, ); await arenaAgentClient.reportCancelled(); - this.pendingRecallAbortController?.abort(); - this.pendingRecallAbortController = undefined; + this.cancelPendingMemoryPrefetch(); if (isTopLevelInteraction) endInteractionSpan('cancelled'); return new Turn(this.getChat(), prompt_id); } @@ -1407,20 +1454,6 @@ export class GeminiClient { messageType === SendMessageType.Cron ) { const systemReminders = []; - // The recall promise was already raced against the 2.5s deadline at - // initiation time; this await just collects the result. - this.pendingRecallAbortController = undefined; - const relevantAutoMemory = relevantAutoMemoryPromise - ? await relevantAutoMemoryPromise - : EMPTY_RELEVANT_AUTO_MEMORY_RESULT; - const relevantAutoMemoryPrompt = relevantAutoMemory.prompt; - - if (relevantAutoMemoryPrompt) { - systemReminders.push(relevantAutoMemoryPrompt); - for (const doc of relevantAutoMemory.selectedDocs) { - this.surfacedRelevantAutoMemoryPaths.add(doc.filePath); - } - } // add subagent system reminder if there are subagents const hasAgentTool = await this.config @@ -1455,9 +1488,41 @@ export class GeminiClient { } } + // Zero-wait poll: consume only if the prefetch has already settled. + // Done AFTER the async reminder setup above so recall settling during + // those awaits still gets caught here. (settledAt is set in + // promise.finally(); microtask ordering guarantees it's visible + // after any await prior to this point — flatMapTextParts above is + // the natural drain.) If still not settled, skip — the ToolResult + // inject point will retry on the next turn. + const userQueryMemory = await this.tryConsumeMemoryPrefetch(); + if (userQueryMemory?.prompt) { + // Unshift to the front of systemReminders: on a UserQuery turn + // requestToSend leads with user text, so positioning memory at + // the very start of the system-reminder block keeps it close to + // the user prompt. Contrast the ToolResult path below, which + // must append to avoid splitting functionCall / functionResponse. + systemReminders.unshift(userQueryMemory.prompt); + } + requestToSend = [...systemReminders, ...requestToSend]; } + if (messageType === SendMessageType.ToolResult) { + const toolResultMemory = await this.tryConsumeMemoryPrefetch(); + if (toolResultMemory?.prompt) { + // Append (not prepend): on a ToolResult turn, requestToSend leads + // with functionResponse parts that must immediately follow the + // model's functionCall (Qwen API constraint — same reason the + // IDE-context block above is skipped while a tool call is pending, + // see the `hasPendingToolCall` guard). Putting the memory text + // after the functionResponse parts keeps the call/response pairing + // intact under native Gemini; the OpenAI converter then emits the + // text as a separate user message after the tool messages. + requestToSend = [...requestToSend, toolResultMemory.prompt]; + } + } + const activeGoalAtTurnStart = getActiveGoal(this.config.getSessionId()); if (activeGoalAtTurnStart) { yield { @@ -1503,6 +1568,9 @@ export class GeminiClient { this.lastApiCompletionTimestamp = Date.now(); if (isTopLevelInteraction) endInteractionSpan('error', { errorMessage: 'loop detected' }); + // finally cleanup catches this, but cancel explicitly to match + // the cleanup pattern at other early-return sites. + this.cancelPendingMemoryPrefetch(); return turn; } } @@ -1553,6 +1621,9 @@ export class GeminiClient { event.value instanceof Error ? '[API error]' : 'unknown error'; endInteractionSpan('error', { errorMessage: errMsg }); } + // finally cleanup catches this, but cancel explicitly to match + // the cleanup pattern at other early-return sites. + this.cancelPendingMemoryPrefetch(); return turn; } } @@ -1723,6 +1794,10 @@ export class GeminiClient { ); if (isTopLevelInteraction) endInteractionSpan(signal.aborted ? 'cancelled' : 'ok'); + // Preserve the pending prefetch: the inner Hook turn we just + // yielded may have produced tool calls, and the caller's next + // ToolResult turn still needs to consume the recall result. + normalCompletion = true; return hookTurn; } @@ -1789,6 +1864,11 @@ export class GeminiClient { ); if (isTopLevelInteraction) endInteractionSpan(signal.aborted ? 'cancelled' : 'ok'); + // Preserve the pending prefetch: same reasoning as the + // `return hookTurn` site above — the recursive Hook turn may + // have produced tool calls whose ToolResult turn still needs + // the recall result. + normalCompletion = true; return continueTurn; } @@ -1808,8 +1888,18 @@ export class GeminiClient { if (isTopLevelInteraction) { endInteractionSpan(signal?.aborted ? 'cancelled' : 'ok'); } + // Reached the bottom of the try — this turn ended cleanly. Preserve + // any still-pending memory prefetch so the next ToolResult turn can + // consume it (the whole point of the fire-and-forget design). + normalCompletion = true; return turn; } finally { + // Belt-and-suspenders: abort the prefetch on any exit other than the + // bottom-of-try `return turn`. Catches uncaught exceptions and guards + // against future early-return sites that forget to call cancel. + if (!normalCompletion) { + this.cancelPendingMemoryPrefetch(); + } if (isTopLevelInteraction) { endInteractionSpan(signal?.aborted ? 'cancelled' : 'error', { errorMessage: 'unexpected exit', diff --git a/packages/core/src/memory/recall.ts b/packages/core/src/memory/recall.ts index 8697496f953..5b476b21ce1 100644 --- a/packages/core/src/memory/recall.ts +++ b/packages/core/src/memory/recall.ts @@ -219,12 +219,25 @@ export async function resolveRelevantAutoMemoryPromptForQuery( strategy, }; } catch (error) { - // Distinguish deadline-triggered cancellation from real model errors - // so oncall debugging is not misled by the fallback log. + // Distinguish three cases so oncall debugging isn't misled: + // - caller-driven abort (user signal / new UserQuery / session + // cleanup): caller signal is aborted → heuristic fallback is + // skipped below at `options.abortSignal?.aborted`, so the + // result really is discarded. + // - 30 s safety-net timeout in relevanceSelector: only the inner + // combined signal aborts; the caller's signal is NOT aborted, + // so the heuristic fallback below DOES run. + // - real model error: warn at the higher level. if (error instanceof DOMException && error.name === 'AbortError') { - debugLogger.debug( - 'Model-driven auto-memory recall cancelled by deadline; heuristic result discarded.', - ); + if (options.abortSignal?.aborted) { + debugLogger.debug( + 'Model-driven auto-memory recall aborted by caller; heuristic result discarded.', + ); + } else { + debugLogger.debug( + 'Model-driven auto-memory recall timed out (30 s safety net); heuristic fallback will run.', + ); + } } else { debugLogger.warn( 'Model-driven auto-memory recall failed; falling back to heuristic selection.', @@ -234,8 +247,8 @@ export async function resolveRelevantAutoMemoryPromptForQuery( } } - // If the caller's abort signal is already set (e.g. deadline fired), skip the - // heuristic fallback — the result would be discarded anyway. + // If the caller's abort signal is already set, skip the heuristic + // fallback — the result would be discarded anyway. if (options.abortSignal?.aborted) { return { prompt: '', diff --git a/packages/core/src/memory/relevanceSelector.ts b/packages/core/src/memory/relevanceSelector.ts index 8962eb94d0c..1ef957671f7 100644 --- a/packages/core/src/memory/relevanceSelector.ts +++ b/packages/core/src/memory/relevanceSelector.ts @@ -91,9 +91,17 @@ export async function selectRelevantAutoMemoryDocumentsByModel( purpose: 'auto-memory-recall', contents, schema: RESPONSE_SCHEMA, + // Caller (`GeminiClient.MemoryPrefetchHandle`) owns lifecycle and aborts + // via its controller on cleanup paths. The 30 s ceiling is a generous + // safety net that only fires if the model API hangs (network partition, + // server stall, runaway retry) AND the caller never aborts. Normal + // recalls take ~1 s; 30 s is far above the long tail so this doesn't + // re-introduce the 1 s timeout regression that motivated this redesign. + // Without this ceiling, a callerless invocation would use an + // unsignalled AbortController and run indefinitely. abortSignal: callerAbortSignal - ? AbortSignal.any([AbortSignal.timeout(1_000), callerAbortSignal]) - : AbortSignal.timeout(1_000), + ? AbortSignal.any([AbortSignal.timeout(30_000), callerAbortSignal]) + : AbortSignal.timeout(30_000), // Uses runSideQuery's default side-query model policy: fast model first, // then main session model when no fast model is configured.