Skip to content

Commit 797b3d3

Browse files
authored
fix(vision): retain image context in follow-up turns (#81)
Persist provider-owned image replay text in segment stateful markers so that image context is not lost across follow-up turns or after a Reload Window. Only the current pending image message is sent to the vision proxy. Historical images are replayed from their matching assistant markers; when no marker matches, the image is omitted rather than re-running vision. Segment marker metadata is encoded as unpadded base64url JSON so that it survives Copilot replay framing. Also update diagnostics, request dumps, and token estimates for marker-based replay.
1 parent 0605da9 commit 797b3d3

15 files changed

Lines changed: 490 additions & 436 deletions

File tree

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@
148148
"deepseek-copilot.visionPrompt": {
149149
"type": "string",
150150
"editPresentation": "multilineText",
151-
"default": "Describe the visual contents of this image in detail, including any text, objects, people, or context that would be relevant for understanding it. Focus on factual visual elements.",
151+
"default": "Describe all image attachments in this message.\n\nIf there is one image, describe it directly.\nIf there are multiple images:\n1. Describe each image separately, preserving their order.\n2. Then provide a combined description explaining the overall context and relationships across the images.\n\nReturn one concise factual description suitable for inserting into a text-only chat prompt. Include visible text, objects, UI elements, people, and relevant context. Do not invent details.",
152152
"description": "%deepseek-copilot.config.visionPrompt.description%"
153153
},
154154
"deepseek-copilot.debugMode": {

src/consts.ts

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -26,34 +26,6 @@ export const WELCOME_SHOWN_KEY = 'deepseek-copilot.welcomeShown';
2626
/** Walkthrough contribution ID. */
2727
export const WALKTHROUGH_ID = 'Vizards.deepseek-v4-for-copilot#deepseekGettingStarted';
2828

29-
// ---- Vision proxy ----
30-
31-
/** Default model ID used for the vision proxy when auto-detection is enabled. */
32-
export const DEFAULT_VISION_MODEL_ID = 'oswe-vscode-prime';
33-
34-
/**
35-
* Prompt sent to the vision proxy model when describing image attachments
36-
* before forwarding them to text-only DeepSeek models.
37-
*/
38-
export const IMAGE_DESCRIPTION_PROMPT =
39-
'Describe the visual contents of this image in detail, including any text, objects, people, or context that would be relevant for understanding it. Focus on factual visual elements.';
40-
41-
/**
42-
* Stable fallback marker inserted into the chat prompt when the vision proxy
43-
* fails to describe an image. Keep this in English and out of i18n so prompt
44-
* shape and cache behaviour do not vary by VS Code display language.
45-
*/
46-
export const IMAGE_DESCRIPTION_UNAVAILABLE = '[Image Description unavailable]';
47-
48-
/**
49-
* Wrapper applied to vision model descriptions before they are inserted into
50-
* the chat prompt. The full format is: `[Image Description: <description>]`.
51-
* Keep these in English and out of i18n so cache keys and token estimates
52-
* stay stable regardless of VS Code display language.
53-
*/
54-
export const IMAGE_DESCRIPTION_PREFIX = '[Image Description: ';
55-
export const IMAGE_DESCRIPTION_SUFFIX = ']';
56-
5729
// ---- Reasoning cache ----
5830

5931
/** Directory name under globalStorageUri for persisted DeepSeek reasoning_content. */

src/provider/diagnostics.ts

Lines changed: 54 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,12 @@
11
import { createHash } from 'crypto';
22
import vscode from 'vscode';
33
import { getDebugLoggingEnabled } from '../config';
4-
import {
5-
IMAGE_DESCRIPTION_UNAVAILABLE,
6-
LANGUAGE_MODEL_CHAT_SYSTEM_ROLE,
7-
REASONING_CACHE_TTL_MS,
8-
} from '../consts';
4+
import { LANGUAGE_MODEL_CHAT_SYSTEM_ROLE, REASONING_CACHE_TTL_MS } from '../consts';
95
import { logger } from '../logger';
106
import type { DeepSeekMessage, DeepSeekRequest, DeepSeekTool, DeepSeekUsage } from '../types';
117
import type { ConversationSegment } from './segment';
12-
import type { VisionDescriptionCacheStats } from './vision/index';
8+
import { IMAGE_DESCRIPTION_UNAVAILABLE } from './vision/consts';
9+
import type { VisionResolutionStats as VisionPipelineStats } from './vision/index';
1310

1411
const LARGE_MESSAGE_CHARS = 10_000;
1512
const HASH_WINDOW_CHARS = 2_048;
@@ -124,7 +121,7 @@ export interface BeginCacheDiagnosticsOptions {
124121
inputMessages: readonly vscode.LanguageModelChatRequestMessage[];
125122
resolvedMessages: readonly vscode.LanguageModelChatRequestMessage[];
126123
visionModelId?: string;
127-
visionCacheStats?: VisionDescriptionCacheStats;
124+
visionStats?: VisionPipelineStats;
128125
}
129126

130127
export interface CacheDiagnosticsDoneInfo {
@@ -149,6 +146,8 @@ export interface SegmentMarkerReportInfo {
149146
segment: ConversationSegment;
150147
status: SegmentMarkerReportStatus;
151148
trigger?: SegmentMarkerReportTrigger;
149+
markerBytes?: number;
150+
visionTextChars?: number;
152151
reason?: 'cancelled' | 'stream-error';
153152
error?: unknown;
154153
}
@@ -183,7 +182,7 @@ export function createCacheDiagnosticsRecorder(): CacheDiagnosticsRecorder {
183182
return new DefaultCacheDiagnosticsRecorder();
184183
}
185184

186-
interface VisionResolutionStats {
185+
interface VisionMessageStats {
187186
inputImageParts: number;
188187
inputImageMessages: number;
189188
describedImageMessages: number;
@@ -264,7 +263,7 @@ class DefaultCacheDiagnosticsRecorder implements CacheDiagnosticsRecorder {
264263
for (const detailLine of formatCacheTraceDetailLines(cacheTrace)) {
265264
logger.info(`[cache-trace #${requestId}] ${detailLine}`);
266265
}
267-
const visionTrace = formatVisionTrace(visionResolution, options.visionCacheStats);
266+
const visionTrace = formatVisionTrace(visionResolution, options.visionStats);
268267
if (visionTrace) {
269268
logger.info(`[cache-trace #${requestId}] ${visionTrace}`);
270269
}
@@ -404,13 +403,18 @@ function formatSegmentTrace(segment: ConversationSegment): string {
404403

405404
function formatSegmentMarkerReport(info: SegmentMarkerReportInfo): string {
406405
const trigger = info.trigger ? ` trigger=${info.trigger}` : '';
406+
const markerBytes = info.markerBytes === undefined ? '' : ` markerBytes=${info.markerBytes}`;
407+
const visionTextChars =
408+
info.visionTextChars === undefined ? '' : ` visionTextChars=${info.visionTextChars}`;
407409
const reason = info.reason ? ` reason=${info.reason}` : '';
408410
const error = info.error ? ` error=${formatError(info.error)}` : '';
409411
return (
410412
`segmentMarker status=${info.status}` +
411413
` segment=${info.segment.segmentId}` +
412414
` segmentReason=${info.segment.reason}` +
413415
trigger +
416+
markerBytes +
417+
visionTextChars +
414418
reason +
415419
error
416420
);
@@ -518,8 +522,8 @@ function summarizeVisionResolution(
518522
inputMessages: readonly vscode.LanguageModelChatRequestMessage[],
519523
resolvedMessages: readonly vscode.LanguageModelChatRequestMessage[],
520524
visionModelId: string | undefined,
521-
): VisionResolutionStats {
522-
const stats: VisionResolutionStats = {
525+
): VisionMessageStats {
526+
const stats: VisionMessageStats = {
523527
inputImageParts: 0,
524528
inputImageMessages: 0,
525529
describedImageMessages: 0,
@@ -588,17 +592,21 @@ function getMessageText(message: vscode.LanguageModelChatRequestMessage): string
588592
}
589593

590594
function formatVisionTrace(
591-
stats: VisionResolutionStats,
592-
cacheStats: VisionDescriptionCacheStats | undefined,
595+
stats: VisionMessageStats,
596+
pipelineStats: VisionPipelineStats | undefined,
593597
): string | undefined {
594-
if (stats.inputImageParts === 0 && stats.historyDescriptionMessages === 0) {
598+
if (
599+
stats.inputImageParts === 0 &&
600+
stats.historyDescriptionMessages === 0 &&
601+
!hasVisionPipelineActivity(pipelineStats)
602+
) {
595603
return undefined;
596604
}
597605

598606
const note =
599607
stats.inputImageParts === 0 && stats.historyDescriptionMessages > 0 ? ' note=history-only' : '';
600608
const visionModel = formatVisionModel(stats);
601-
const cacheTrace = formatVisionCacheStats(stats, cacheStats);
609+
const pipelineTrace = formatVisionPipelineStats(pipelineStats);
602610
return (
603611
`vision rawImageParts=${stats.inputImageParts}` +
604612
` rawImageMessages=${stats.inputImageMessages}` +
@@ -607,42 +615,50 @@ function formatVisionTrace(
607615
` droppedImageParts=${stats.droppedImageParts}` +
608616
` visionModel=${visionModel}` +
609617
` historyDescriptionMessages=${stats.historyDescriptionMessages}` +
610-
cacheTrace +
618+
pipelineTrace +
611619
note
612620
);
613621
}
614622

615-
function formatVisionCacheStats(
616-
resolutionStats: VisionResolutionStats,
617-
cacheStats: VisionDescriptionCacheStats | undefined,
618-
): string {
619-
if (!cacheStats) {
620-
return '';
623+
function hasVisionPipelineActivity(stats: VisionPipelineStats | undefined): boolean {
624+
if (!stats) {
625+
return false;
621626
}
627+
return (
628+
stats.inputImageParts > 0 ||
629+
stats.currentImageMessages > 0 ||
630+
stats.generatedImageMessages > 0 ||
631+
stats.replayedImageMessages > 0 ||
632+
stats.omittedImageMessages > 0 ||
633+
stats.unavailableImageMessages > 0 ||
634+
stats.failedImageMessages > 0 ||
635+
stats.invalidMarkerVisionMetadata > 0
636+
);
637+
}
622638

623-
const hasCacheActivity =
624-
cacheStats.hits > 0 ||
625-
cacheStats.misses > 0 ||
626-
cacheStats.deduplicatedDescriptions > 0 ||
627-
cacheStats.generatedDescriptions > 0 ||
628-
cacheStats.failedDescriptions > 0 ||
629-
cacheStats.droppedImageParts > 0;
630-
if (!hasCacheActivity && resolutionStats.inputImageParts === 0) {
639+
function formatVisionPipelineStats(stats: VisionPipelineStats | undefined): string {
640+
if (!stats) {
641+
return '';
642+
}
643+
if (!hasVisionPipelineActivity(stats)) {
631644
return '';
632645
}
633646

634647
return (
635-
` cache(enabled=${cacheStats.enabled}` +
636-
`,hits=${cacheStats.hits}` +
637-
`,misses=${cacheStats.misses}` +
638-
`,deduped=${cacheStats.deduplicatedDescriptions}` +
639-
`,entries=${cacheStats.entries}` +
640-
`,generated=${cacheStats.generatedDescriptions}` +
641-
`,failed=${cacheStats.failedDescriptions})`
648+
` markerReplay(inputImages=${stats.inputImageParts}` +
649+
`,current=${stats.currentImageMessages}` +
650+
`,generated=${stats.generatedImageMessages}` +
651+
`,replayed=${stats.replayedImageMessages}` +
652+
`,omitted=${stats.omittedImageMessages}` +
653+
`,unavailable=${stats.unavailableImageMessages}` +
654+
`,failed=${stats.failedImageMessages}` +
655+
`,droppedParts=${stats.droppedImageParts}` +
656+
`,markerChars=${stats.markerVisionTextChars}` +
657+
`,invalidMarkerVision=${stats.invalidMarkerVisionMetadata})`
642658
);
643659
}
644660

645-
function formatVisionModel(stats: VisionResolutionStats): string {
661+
function formatVisionModel(stats: VisionMessageStats): string {
646662
if (stats.visionModelId) {
647663
return stats.visionModelId;
648664
}

src/provider/dump.ts

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import { safeStringify, toWellFormedString } from '../json';
99
import { logger } from '../logger';
1010
import type { DeepSeekMessage, DeepSeekRequest } from '../types';
1111
import { parseSegmentMarkerData, SEGMENT_MARKER_MIME, type ConversationSegment } from './segment';
12-
import type { VisionDescriptionCacheStats } from './vision/index';
12+
import type { VisionResolutionStats } from './vision/index';
1313

1414
let dumpCounter = 0;
1515
let providerInputDumpCounter = 0;
@@ -84,7 +84,7 @@ export interface DumpDeepSeekRequestOptions {
8484
resolvedMessages: readonly vscode.LanguageModelChatRequestMessage[];
8585
requestOptions: vscode.ProvideLanguageModelChatResponseOptions;
8686
visionModelId?: string;
87-
visionCacheStats?: VisionDescriptionCacheStats;
87+
visionStats?: VisionResolutionStats;
8888
}
8989

9090
export interface DumpProviderInputOptions {
@@ -308,7 +308,7 @@ function createPipelineSnapshot(
308308
stage === 'resolved'
309309
? {
310310
modelId: options.visionModelId ?? null,
311-
stats: options.visionCacheStats ?? null,
311+
stats: options.visionStats ?? null,
312312
}
313313
: undefined,
314314
deepSeekPromptSummary: summarizeDeepSeekSystemPrompt(request.messages),
@@ -399,6 +399,8 @@ type SerializedContentPart =
399399
segmentMarker?: {
400400
valid: boolean;
401401
segmentId?: string;
402+
visionTextChars?: number;
403+
visionTextIgnoredReason?: string;
402404
error?: string;
403405
};
404406
}
@@ -479,7 +481,9 @@ function serializeContentPart(part: unknown, index: number): SerializedContentPa
479481

480482
if (part instanceof vscode.LanguageModelDataPart) {
481483
const segmentMarker =
482-
part.mimeType === SEGMENT_MARKER_MIME ? parseSegmentMarkerData(part.data) : undefined;
484+
part.mimeType === SEGMENT_MARKER_MIME
485+
? summarizeSegmentMarker(parseSegmentMarkerData(part.data))
486+
: undefined;
483487
return {
484488
index,
485489
type: 'data',
@@ -503,6 +507,22 @@ function serializeContentPart(part: unknown, index: number): SerializedContentPa
503507
};
504508
}
505509

510+
function summarizeSegmentMarker(marker: ReturnType<typeof parseSegmentMarkerData>): {
511+
valid: boolean;
512+
segmentId?: string;
513+
visionTextChars?: number;
514+
visionTextIgnoredReason?: string;
515+
error?: string;
516+
} {
517+
return {
518+
valid: marker.valid,
519+
segmentId: marker.segmentId,
520+
visionTextChars: marker.visionText?.length,
521+
visionTextIgnoredReason: marker.visionTextIgnoredReason,
522+
error: marker.error,
523+
};
524+
}
525+
506526
function serializeTools(
507527
tools: readonly vscode.LanguageModelChatTool[] | undefined,
508528
): object[] | undefined {

src/provider/request.ts

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { convertMessages, convertTools, countMessageChars } from './convert';
1010
import type { CacheDiagnosticsRecorder, CacheDiagnosticsRun } from './diagnostics';
1111
import { dumpDeepSeekRequest } from './dump';
1212
import { getConfiguredThinkingEffort, type ModelConfigurationOptions } from './models';
13-
import type { ConversationSegment } from './segment';
13+
import type { ConversationSegment, SegmentMarkerMetadata } from './segment';
1414
import { resolveImageMessages } from './vision/index';
1515

1616
export interface PreparedChatRequest {
@@ -21,6 +21,8 @@ export interface PreparedChatRequest {
2121
trailingToolResultIds: string[];
2222
cacheDiagnostics: CacheDiagnosticsRun;
2323
segment: ConversationSegment;
24+
segmentMarkerMetadata: SegmentMarkerMetadata;
25+
visionMarkerTextChars?: number;
2426
}
2527

2628
export interface PrepareChatRequestOptions {
@@ -61,7 +63,12 @@ export async function prepareChatRequest({
6163
const thinkingEffort = getConfiguredThinkingEffort(options as ModelConfigurationOptions);
6264
const maxTokens = getMaxTokens();
6365

64-
const visionResolution = await resolveImageMessages(messages, token, getVisionModel);
66+
const visionResolution = await resolveImageMessages(
67+
messages,
68+
token,
69+
getVisionModel,
70+
segment.segmentId,
71+
);
6572
const resolvedMessages = visionResolution.messages;
6673
const deepseekMessages = convertMessages(resolvedMessages, isThinkingModel, reasoningLookup);
6774
const tools = modelDef?.capabilities.toolCalling ? convertTools(options.tools) : undefined;
@@ -94,7 +101,7 @@ export async function prepareChatRequest({
94101
resolvedMessages,
95102
requestOptions: options,
96103
visionModelId: visionResolution.visionModelId,
97-
visionCacheStats: visionResolution.stats,
104+
visionStats: visionResolution.stats,
98105
});
99106

100107
const diagnosticsRun = cacheDiagnostics.beginRequest({
@@ -108,7 +115,7 @@ export async function prepareChatRequest({
108115
inputMessages: messages,
109116
resolvedMessages,
110117
visionModelId: visionResolution.visionModelId,
111-
visionCacheStats: visionResolution.stats,
118+
visionStats: visionResolution.stats,
112119
});
113120

114121
return {
@@ -119,6 +126,8 @@ export async function prepareChatRequest({
119126
trailingToolResultIds: collectTrailingToolResultIds(deepseekMessages),
120127
cacheDiagnostics: diagnosticsRun,
121128
segment,
129+
segmentMarkerMetadata: visionResolution.segmentMarkerMetadata,
130+
visionMarkerTextChars: visionResolution.stats.markerVisionTextChars || undefined,
122131
};
123132
}
124133

0 commit comments

Comments
 (0)