Skip to content

Commit 1bd8636

Browse files
authored
fix(provider): recursively count all part types in provideTokenCount (#51)
1 parent 82501b1 commit 1bd8636

5 files changed

Lines changed: 192 additions & 13 deletions

File tree

src/consts.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ export const IMAGE_DESCRIPTION_PROMPT =
4242
*/
4343
export const IMAGE_DESCRIPTION_UNAVAILABLE = '[Image Description unavailable]';
4444

45+
/**
46+
* Prefix and suffix wrapped around a vision model description before it is
47+
* inserted into the chat prompt. The full format is: `[Image Description: <description>]`.
48+
* Keep these in English and out of i18n so cache keys and token estimates
49+
* stay stable regardless of VS Code display language.
50+
*/
51+
export const IMAGE_DESCRIPTION_PREFIX = '[Image Description: ';
52+
export const IMAGE_DESCRIPTION_SUFFIX = ']';
53+
4554
// ---- Cache ----
4655

4756
/** Max entries in the reasoning-content cache before eviction kicks in. */

src/provider/convert.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ export function countMessageChars(messages: DeepSeekMessage[]): number {
151151
let total = 0;
152152
for (const msg of messages) {
153153
total += msg.content?.length ?? 0;
154+
total += msg.reasoning_content?.length ?? 0;
154155
if (msg.tool_calls) {
155156
for (const tc of msg.tool_calls) {
156157
total += tc.function?.name?.length ?? 0;

src/provider/tokens.ts

Lines changed: 121 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,122 @@
11
import vscode from 'vscode';
2+
import { IMAGE_DESCRIPTION_PREFIX, IMAGE_DESCRIPTION_SUFFIX } from '../consts';
3+
import { computeDataHash, getCachedDescriptionByDataHash } from './vision/cache';
4+
5+
/** Images larger than this are not hashed for a description-cache lookup. */
const MAX_HASHABLE_IMAGE_BYTES = 500_000;

/**
 * Character estimate for an image with no cached description
 * (~255 tokens at 4 chars/token).
 */
const UNCACHED_IMAGE_CHAR_ESTIMATE = 1020;

/** Upper bound on the character estimate for a non-image data part. */
const MAX_DOCUMENT_CHAR_ESTIMATE = 10000;

/** Characters charged for a tool-call input that cannot be serialized. */
const UNSERIALIZABLE_INPUT_CHAR_ESTIMATE = 2;

/**
 * Length of `JSON.stringify(value)`, or `fallback` when the value cannot be
 * serialized.
 *
 * `JSON.stringify` throws on circular references and BigInt values, and
 * returns `undefined` (not a string) for `undefined`, functions and symbols;
 * both outcomes yield `fallback`.
 */
function jsonLengthOr(value: unknown, fallback: number): number {
  try {
    const json: string | undefined = JSON.stringify(value);
    return json === undefined ? fallback : json.length;
  } catch {
    return fallback;
  }
}

/**
 * Estimate the character count for a data part.
 *
 * Images use the length of the cached vision description (plus its
 * prefix/suffix wrapper) when one is found by data hash, and a fixed estimate
 * otherwise. Every other MIME type uses the byte length capped at
 * `MAX_DOCUMENT_CHAR_ESTIMATE`, so a single large attachment cannot dominate
 * the budget.
 */
function estimateDataPartChars(part: vscode.LanguageModelDataPart): number {
  // Treat `data` as possibly missing in both branches, not just the document one.
  const data: Uint8Array | undefined = part.data;

  if (!part.mimeType.startsWith('image/')) {
    return Math.min(data?.byteLength ?? 0, MAX_DOCUMENT_CHAR_ESTIMATE);
  }

  // Skip the SHA-256 for very large images: the hash cost outweighs the
  // benefit of a cache lookup.
  if (data && data.byteLength <= MAX_HASHABLE_IMAGE_BYTES) {
    const cached = getCachedDescriptionByDataHash(computeDataHash(data));
    if (cached !== undefined) {
      return IMAGE_DESCRIPTION_PREFIX.length + cached.length + IMAGE_DESCRIPTION_SUFFIX.length;
    }
  }

  // Cold cache, missing data, or image too large to hash.
  return UNCACHED_IMAGE_CHAR_ESTIMATE;
}

/**
 * Recursively estimate the character count for a single content part.
 *
 * @param part A message content part of any kind; unknown shapes are accepted.
 * @returns A character count. The caller divides it by its chars-per-token
 *   ratio to obtain a token estimate. Non-object values that match no known
 *   part type count as 0.
 */
function estimatePartChars(part: unknown): number {
  // 1. Text: the most common case.
  if (part instanceof vscode.LanguageModelTextPart) {
    return part.value.length;
  }

  // 2. Tool call: callId + name + JSON-serialized input.
  if (part instanceof vscode.LanguageModelToolCallPart) {
    return (
      part.callId.length +
      part.name.length +
      jsonLengthOr(part.input, UNSERIALIZABLE_INPUT_CHAR_ESTIMATE)
    );
  }

  // 3. Tool result: callId plus every nested content part, counted recursively.
  if (part instanceof vscode.LanguageModelToolResultPart) {
    let chars = part.callId.length;
    if (Array.isArray(part.content)) {
      for (const item of part.content) {
        chars += estimatePartChars(item);
      }
    }
    return chars;
  }

  // 4. Binary data: a capped heuristic, never the raw byte length for images.
  if (part instanceof vscode.LanguageModelDataPart) {
    return estimateDataPartChars(part);
  }

  // 5. Thinking part (proposed API): value is a string or an array of strings.
  if (isLanguageModelThinkingPart(part)) {
    const { value } = part;
    if (typeof value === 'string') {
      return value.length;
    }
    if (Array.isArray(value)) {
      return value.reduce((sum: number, segment: string) => sum + segment.length, 0);
    }
    return 0;
  }

  // 6. Prompt-TSX part: matched by constructor name rather than instanceof,
  //    then measured by its serialized value.
  if (
    part &&
    typeof part === 'object' &&
    'value' in part &&
    part.constructor?.name === 'LanguageModelPromptTsxPart'
  ) {
    return jsonLengthOr((part as { value: unknown }).value, 0);
  }

  // Fallback: serialize any other object; everything else counts as nothing.
  if (part && typeof part === 'object') {
    return jsonLengthOr(part, 0);
  }

  return 0;
}
110+
111+
/**
 * Type guard for `LanguageModelThinkingPart`.
 *
 * The class comes from a proposed API and may be absent at runtime, so the
 * export is probed first; when it is not a function the guard answers `false`
 * instead of evaluating `instanceof` against a missing constructor.
 */
function isLanguageModelThinkingPart(part: unknown): part is vscode.LanguageModelThinkingPart {
  const thinkingPartExport = (vscode as Record<string, unknown>).LanguageModelThinkingPart;
  if (typeof thinkingPartExport !== 'function') {
    return false;
  }
  return part instanceof vscode.LanguageModelThinkingPart;
}
2120

3121
export function estimateTokenCount(
4122
text: string | vscode.LanguageModelChatRequestMessage,
@@ -12,11 +130,9 @@ export function estimateTokenCount(
12130
return 1;
13131
}
14132

15-
let total = 0;
133+
let totalChars = 0;
16134
for (const part of text.content) {
17-
if (part instanceof vscode.LanguageModelTextPart) {
18-
total += part.value.length;
19-
}
135+
totalChars += estimatePartChars(part);
20136
}
21-
return Math.max(1, Math.ceil(total / charsPerToken));
137+
return Math.max(1, Math.ceil(totalChars / charsPerToken));
22138
}

src/provider/vision/cache.ts

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,16 @@ const MAX_VISION_DESCRIPTION_CACHE_ENTRIES = 100;
66

77
/** Value stored in the vision description cache for a single cache key. */
interface VisionDescriptionCacheEntry {
88
/** Description text remembered for this cache key. */
description: string;
9+
/** SHA-256 of the original image bytes, for secondary index eviction. */
10+
dataHash?: string;
911
}
1012

1113
// Primary cache: cache key -> entry. Insertion order doubles as recency order;
// rememberDescription re-inserts on write and evicts from the front.
const visionDescriptionCache = new Map<string, VisionDescriptionCacheEntry>();
1214
// Promise-only single-flight: caller cancellation does not abort shared vision work.
1315
const pendingVisionDescriptions = new Map<string, Promise<string>>();
16+
// Secondary index keyed by data hash, for lookup without knowing vision model/prompt.
17+
// Used by provideTokenCount to find cached descriptions for image DataParts.
18+
const dataHashToDescription = new Map<string, string>();
1419

1520
export function createVisionDescriptionCacheStats(): VisionDescriptionCacheStats {
1621
return {
@@ -36,10 +41,10 @@ export function createVisionDescriptionCacheKey(
3641
part: vscode.LanguageModelDataPart,
3742
visionModelId: string,
3843
visionPrompt: string,
44+
dataHash?: string,
3945
): string {
40-
return hashString(
41-
['v1', part.mimeType, hashBytes(part.data), visionModelId, hashString(visionPrompt)].join('\0'),
42-
);
46+
const dh = dataHash ?? hashBytes(part.data);
47+
return hashString(['v1', part.mimeType, dh, visionModelId, hashString(visionPrompt)].join('\0'));
4348
}
4449

4550
export function getCachedDescription(key: string): string | undefined {
@@ -53,17 +58,47 @@ export function getCachedDescription(key: string): string | undefined {
5358
return entry.description;
5459
}
5560

56-
export function rememberDescription(key: string, description: string): void {
61+
/**
 * Recompute the secondary-index entry for `dataHash` from the entries that are
 * still in `visionDescriptionCache`.
 *
 * The most recently inserted surviving entry wins, matching the
 * last-writer-wins behavior of `rememberDescription`. When no entry references
 * the hash any more, the index entry is removed.
 */
function refreshDataHashIndex(dataHash: string): void {
  let newestDescription: string | undefined;
  // Map iteration follows insertion order, so the last match is the newest.
  for (const entry of visionDescriptionCache.values()) {
    if (entry.dataHash === dataHash) {
      newestDescription = entry.description;
    }
  }
  if (newestDescription === undefined) {
    dataHashToDescription.delete(dataHash);
  } else {
    dataHashToDescription.set(dataHash, newestDescription);
  }
}

/**
 * Store `description` under `key` as the most recently used cache entry and
 * evict the oldest entries while the cache holds more than
 * `MAX_VISION_DESCRIPTION_CACHE_ENTRIES`.
 *
 * @param key Cache key for the description.
 * @param description Description text to remember.
 * @param dataHash Optional hash of the image bytes. When given, the secondary
 *   index used by `getCachedDescriptionByDataHash` is pointed at `description`.
 */
export function rememberDescription(key: string, description: string, dataHash?: string): void {
  const previous = visionDescriptionCache.get(key);

  // Delete before set to refresh insertion order; Map.set on an existing key
  // preserves the original insertion position.
  visionDescriptionCache.delete(key);
  visionDescriptionCache.set(key, { description, dataHash });

  if (dataHash) {
    dataHashToDescription.set(dataHash, description);
  }
  // The overwritten entry may have been the only one backing a different hash.
  if (previous?.dataHash && previous.dataHash !== dataHash) {
    refreshDataHashIndex(previous.dataHash);
  }

  while (visionDescriptionCache.size > MAX_VISION_DESCRIPTION_CACHE_ENTRIES) {
    const oldest = visionDescriptionCache.entries().next();
    if (oldest.done) {
      break;
    }
    const [oldestKey, evicted] = oldest.value;
    visionDescriptionCache.delete(oldestKey);
    if (evicted.dataHash) {
      // The same image bytes may still be cached under another vision
      // model/prompt key; keep the index pointing at a surviving description.
      refreshDataHashIndex(evicted.dataHash);
    }
  }
}
69104

@@ -82,6 +117,14 @@ export function rememberPendingDescription(key: string, description: Promise<str
82117
.catch(() => undefined);
83118
}
84119

120+
/**
 * Look up a cached image description through the secondary data-hash index.
 *
 * @param dataHash Hash of the image bytes, as produced by `computeDataHash`.
 * @returns The indexed description, or `undefined` when the hash is not indexed.
 */
export function getCachedDescriptionByDataHash(dataHash: string): string | undefined {
  const indexedDescription = dataHashToDescription.get(dataHash);
  return indexedDescription;
}
123+
124+
/**
 * Exported wrapper around the module's byte hasher.
 *
 * @param data Raw bytes to hash.
 * @returns The digest string produced by `hashBytes`.
 */
export function computeDataHash(data: Uint8Array): string {
  const digest = hashBytes(data);
  return digest;
}
127+
85128
/** Return the hex-encoded SHA-256 digest of `value`. */
function hashBytes(value: Uint8Array): string {
  const hasher = createHash('sha256');
  hasher.update(value);
  return hasher.digest('hex');
}

src/provider/vision/resolve.ts

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
import vscode from 'vscode';
2-
import { IMAGE_DESCRIPTION_UNAVAILABLE } from '../../consts';
2+
import {
3+
IMAGE_DESCRIPTION_PREFIX,
4+
IMAGE_DESCRIPTION_SUFFIX,
5+
IMAGE_DESCRIPTION_UNAVAILABLE,
6+
} from '../../consts';
37
import { t } from '../../i18n';
48
import { logger } from '../../logger';
59
import {
10+
computeDataHash,
611
createVisionDescriptionCacheKey,
712
createVisionDescriptionCacheStats,
813
finalizeVisionDescriptionCacheStats,
@@ -95,7 +100,10 @@ async function resolveImageDescription(
95100
stats: VisionDescriptionCacheStats,
96101
token: vscode.CancellationToken,
97102
): Promise<string> {
98-
const cacheKey = createVisionDescriptionCacheKey(part, visionModel.id, visionPrompt);
103+
// Compute dataHash once; reused for cache key construction and
104+
// the secondary index to avoid double SHA-256 on the same bytes.
105+
const dataHash = computeDataHash(part.data);
106+
const cacheKey = createVisionDescriptionCacheKey(part, visionModel.id, visionPrompt, dataHash);
99107
const cachedDescription = getCachedDescription(cacheKey);
100108
if (cachedDescription !== undefined) {
101109
stats.hits += 1;
@@ -120,6 +128,7 @@ async function resolveImageDescription(
120128
part,
121129
visionModel,
122130
visionPrompt,
131+
dataHash,
123132
);
124133
rememberPendingDescription(cacheKey, pendingDescriptionRequest);
125134
const description = await resolvePendingDescription(
@@ -139,11 +148,12 @@ function createPendingDescriptionRequest(
139148
part: vscode.LanguageModelDataPart,
140149
visionModel: vscode.LanguageModelChat,
141150
visionPrompt: string,
151+
dataHash: string,
142152
): Promise<string> {
143153
return describeImagePart(part, visionModel, visionPrompt).then(
144154
(description) => {
145155
if (description.length > 0) {
146-
rememberDescription(cacheKey, description);
156+
rememberDescription(cacheKey, description, dataHash);
147157
}
148158
return description;
149159
},
@@ -241,7 +251,7 @@ async function describeImagePart(
241251
}
242252

243253
/**
 * Wrap a vision model description in the shared prefix and suffix so the
 * inserted text reads `[Image Description: <description>]`.
 */
function createImageDescriptionText(description: string): string {
  return `${IMAGE_DESCRIPTION_PREFIX}${description}${IMAGE_DESCRIPTION_SUFFIX}`;
}
246256

247257
function isImageDataPart(part: unknown): part is vscode.LanguageModelDataPart {

0 commit comments

Comments
 (0)