fix(provider): recursively count all part types in provideTokenCount (#51)

Vizards · web-flow · commit 1bd8636d04e9 · 2026-05-06T22:02:48.000+08:00
diff --git a/src/consts.ts b/src/consts.ts
@@ -42,6 +42,15 @@ export const IMAGE_DESCRIPTION_PROMPT =
  */
 export const IMAGE_DESCRIPTION_UNAVAILABLE = '[Image Description unavailable]';
 
+/**
+ * Wrapper applied to vision model descriptions before they are inserted into
+ * the chat prompt. The full format is: `[Image Description: <description>]`.
+ * Keep these in English and out of i18n so cache keys and token estimates
+ * stay stable regardless of VS Code display language.
+ */
+export const IMAGE_DESCRIPTION_PREFIX = '[Image Description: ';
+export const IMAGE_DESCRIPTION_SUFFIX = ']';
+
 // ---- Cache ----
 
 /** Max entries in the reasoning-content cache before eviction kicks in. */
diff --git a/src/provider/convert.ts b/src/provider/convert.ts
@@ -151,6 +151,7 @@ export function countMessageChars(messages: DeepSeekMessage[]): number {
 	let total = 0;
 	for (const msg of messages) {
 		total += msg.content?.length ?? 0;
+		total += msg.reasoning_content?.length ?? 0;
 		if (msg.tool_calls) {
 			for (const tc of msg.tool_calls) {
 				total += tc.function?.name?.length ?? 0;
diff --git a/src/provider/tokens.ts b/src/provider/tokens.ts
@@ -1,4 +1,122 @@
 import vscode from 'vscode';
+import { IMAGE_DESCRIPTION_PREFIX, IMAGE_DESCRIPTION_SUFFIX } from '../consts';
+import { computeDataHash, getCachedDescriptionByDataHash } from './vision/cache';
+
+/**
+ * Recursively estimate the character count for a single content part.
+ * Returns a character count, which the caller divides by charsPerToken to get a token estimate.
+ */
+function estimatePartChars(part: unknown): number {
+	// 1. LanguageModelTextPart — the most common case
+	if (part instanceof vscode.LanguageModelTextPart) {
+		return part.value.length;
+	}
+
+	// 2. LanguageModelToolCallPart — count callId + name + JSON-serialized input
+	if (part instanceof vscode.LanguageModelToolCallPart) {
+		let chars = part.callId.length + part.name.length;
+		try {
+			chars += JSON.stringify(part.input).length;
+		} catch {
+			// If input can't be stringified (e.g. contains circular refs), fall back to a nominal 2 chars
+			chars += 2;
+		}
+		return chars;
+	}
+
+	// 3. LanguageModelToolResultPart — recursively count nested content parts
+	if (part instanceof vscode.LanguageModelToolResultPart) {
+		let chars = part.callId.length;
+		if (Array.isArray(part.content)) {
+			for (const item of part.content) {
+				chars += estimatePartChars(item);
+			}
+		}
+		return chars;
+	}
+
+	// 4. LanguageModelDataPart — use a capped heuristic because our model never
+	//    receives binary data directly. Images are resolved to text descriptions
+	//    by the vision pipeline; raw byteLength would massively overestimate.
+	if (part instanceof vscode.LanguageModelDataPart) {
+		const mime = part.mimeType;
+		// Images: try the vision description cache first. If this image was
+		// already resolved, the cached description length is the most accurate
+		// estimate of what the model will actually receive.
+		if (mime.startsWith('image/')) {
+			// Skip SHA-256 for very large images — the hash cost outweighs the
+			// benefit of a cache lookup, and such images are unlikely to be
+			// processed by the vision pipeline anyway.
+			if (part.data.byteLength <= 500_000) {
+				const cached = getCachedDescriptionByDataHash(computeDataHash(part.data));
+				if (cached !== undefined) {
+					return IMAGE_DESCRIPTION_PREFIX.length + cached.length + IMAGE_DESCRIPTION_SUFFIX.length;
+				}
+			}
+			// Cold cache (or image too large to hash): use a conservative
+			// fixed estimate (~255 tokens at 4 chars/tok, roughly matching
+			// OpenAI auto-detail for a moderate image).
+			// The vision pipeline will replace such images with text descriptions
+			// whose actual token cost is counted via LanguageModelTextPart
+			// on the next pass.
+			return 1020;
+		}
+		// PDFs and other documents: use byteLength as a rough proxy but cap it
+		// to prevent a single large attachment from dominating the budget.
+		return Math.min(part.data?.byteLength ?? 0, 10000);
+	}
+
+	// 5. LanguageModelThinkingPart (proposed API) — handle string | string[]
+	if (isLanguageModelThinkingPart(part)) {
+		if (typeof part.value === 'string') {
+			return part.value.length;
+		}
+		if (Array.isArray(part.value)) {
+			let chars = 0;
+			for (const s of part.value) {
+				chars += s.length;
+			}
+			return chars;
+		}
+		return 0;
+	}
+
+	// 6. LanguageModelPromptTsxPart — stringify the value if present
+	// Duck-type check since PromptTsxPart may not always be available
+	if (
+		part &&
+		typeof part === 'object' &&
+		'value' in part &&
+		part.constructor?.name === 'LanguageModelPromptTsxPart'
+	) {
+		try {
+			return JSON.stringify((part as { value: unknown }).value).length;
+		} catch {
+			return 0;
+		}
+	}
+
+	// Fallback: try to stringify unknown part types
+	if (part && typeof part === 'object') {
+		try {
+			return JSON.stringify(part).length;
+		} catch {
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Check for LanguageModelThinkingPart (proposed API, may not be available at runtime).
+ */
+function isLanguageModelThinkingPart(part: unknown): part is vscode.LanguageModelThinkingPart {
+	return (
+		typeof (vscode as Record<string, unknown>).LanguageModelThinkingPart === 'function' &&
+		part instanceof vscode.LanguageModelThinkingPart
+	);
+}
 
 export function estimateTokenCount(
 	text: string | vscode.LanguageModelChatRequestMessage,
@@ -12,11 +130,9 @@ export function estimateTokenCount(
 		return 1;
 	}
 
-	let total = 0;
+	let totalChars = 0;
 	for (const part of text.content) {
-		if (part instanceof vscode.LanguageModelTextPart) {
-			total += part.value.length;
-		}
+		totalChars += estimatePartChars(part);
 	}
-	return Math.max(1, Math.ceil(total / charsPerToken));
+	return Math.max(1, Math.ceil(totalChars / charsPerToken));
 }
diff --git a/src/provider/vision/cache.ts b/src/provider/vision/cache.ts
@@ -6,11 +6,16 @@ const MAX_VISION_DESCRIPTION_CACHE_ENTRIES = 100;
 
 interface VisionDescriptionCacheEntry {
 	description: string;
+	/** SHA-256 of the original image bytes, for secondary index eviction. */
+	dataHash?: string;
 }
 
 const visionDescriptionCache = new Map<string, VisionDescriptionCacheEntry>();
 // Promise-only single-flight: caller cancellation does not abort shared vision work.
 const pendingVisionDescriptions = new Map<string, Promise<string>>();
+// Secondary index keyed by data hash, for lookup without knowing vision model/prompt.
+// Used by provideTokenCount to find cached descriptions for image DataParts.
+const dataHashToDescription = new Map<string, string>();
 
 export function createVisionDescriptionCacheStats(): VisionDescriptionCacheStats {
 	return {
@@ -36,10 +41,10 @@ export function createVisionDescriptionCacheKey(
 	part: vscode.LanguageModelDataPart,
 	visionModelId: string,
 	visionPrompt: string,
+	dataHash?: string,
 ): string {
-	return hashString(
-		['v1', part.mimeType, hashBytes(part.data), visionModelId, hashString(visionPrompt)].join('\0'),
-	);
+	const dh = dataHash ?? hashBytes(part.data);
+	return hashString(['v1', part.mimeType, dh, visionModelId, hashString(visionPrompt)].join('\0'));
 }
 
 export function getCachedDescription(key: string): string | undefined {
@@ -53,17 +58,47 @@ export function getCachedDescription(key: string): string | undefined {
 	return entry.description;
 }
 
-export function rememberDescription(key: string, description: string): void {
+export function rememberDescription(key: string, description: string, dataHash?: string): void {
+	// Delete before set to refresh LRU insertion order; Map.set on an
+	// existing key preserves the original insertion position.
+	visionDescriptionCache.delete(key);
 	visionDescriptionCache.set(key, {
 		description,
+		dataHash,
 	});
 
+	if (dataHash) {
+		dataHashToDescription.set(dataHash, description);
+	}
+
 	while (visionDescriptionCache.size > MAX_VISION_DESCRIPTION_CACHE_ENTRIES) {
 		const oldestKey = visionDescriptionCache.keys().next().value;
 		if (!oldestKey) {
 			break;
 		}
+		const evicted = visionDescriptionCache.get(oldestKey);
 		visionDescriptionCache.delete(oldestKey);
+		if (evicted?.dataHash) {
+			// Only delete the secondary index mapping if no other cached
+			// entry still references the same data hash (same image bytes
+			// may be cached under different vision model/prompt keys).
+			let remainingEntry: typeof evicted | undefined;
+			for (const entry of visionDescriptionCache.values()) {
+				if (entry.dataHash === evicted.dataHash) {
+					remainingEntry = entry;
+					break;
+				}
+			}
+			if (remainingEntry) {
+				// Another entry still references this hash — update the
+				// index to the remaining entry's description (the evicted
+				// one may have had a different description from another
+				// vision model/prompt combination).
+				dataHashToDescription.set(evicted.dataHash, remainingEntry.description);
+			} else {
+				dataHashToDescription.delete(evicted.dataHash);
+			}
+		}
 	}
 }
 
@@ -82,6 +117,14 @@ export function rememberPendingDescription(key: string, description: Promise<str
 		.catch(() => undefined);
 }
 
+export function getCachedDescriptionByDataHash(dataHash: string): string | undefined {
+	return dataHashToDescription.get(dataHash);
+}
+
+export function computeDataHash(data: Uint8Array): string {
+	return hashBytes(data);
+}
+
 function hashBytes(value: Uint8Array): string {
 	return createHash('sha256').update(value).digest('hex');
 }
diff --git a/src/provider/vision/resolve.ts b/src/provider/vision/resolve.ts
@@ -1,8 +1,13 @@
 import vscode from 'vscode';
-import { IMAGE_DESCRIPTION_UNAVAILABLE } from '../../consts';
+import {
+	IMAGE_DESCRIPTION_PREFIX,
+	IMAGE_DESCRIPTION_SUFFIX,
+	IMAGE_DESCRIPTION_UNAVAILABLE,
+} from '../../consts';
 import { t } from '../../i18n';
 import { logger } from '../../logger';
 import {
+	computeDataHash,
 	createVisionDescriptionCacheKey,
 	createVisionDescriptionCacheStats,
 	finalizeVisionDescriptionCacheStats,
@@ -95,7 +100,10 @@ async function resolveImageDescription(
 	stats: VisionDescriptionCacheStats,
 	token: vscode.CancellationToken,
 ): Promise<string> {
-	const cacheKey = createVisionDescriptionCacheKey(part, visionModel.id, visionPrompt);
+	// Compute dataHash once; reused for cache key construction and
+	// the secondary index to avoid double SHA-256 on the same bytes.
+	const dataHash = computeDataHash(part.data);
+	const cacheKey = createVisionDescriptionCacheKey(part, visionModel.id, visionPrompt, dataHash);
 	const cachedDescription = getCachedDescription(cacheKey);
 	if (cachedDescription !== undefined) {
 		stats.hits += 1;
@@ -120,6 +128,7 @@ async function resolveImageDescription(
 		part,
 		visionModel,
 		visionPrompt,
+		dataHash,
 	);
 	rememberPendingDescription(cacheKey, pendingDescriptionRequest);
 	const description = await resolvePendingDescription(
@@ -139,11 +148,12 @@ function createPendingDescriptionRequest(
 	part: vscode.LanguageModelDataPart,
 	visionModel: vscode.LanguageModelChat,
 	visionPrompt: string,
+	dataHash: string,
 ): Promise<string> {
 	return describeImagePart(part, visionModel, visionPrompt).then(
 		(description) => {
 			if (description.length > 0) {
-				rememberDescription(cacheKey, description);
+				rememberDescription(cacheKey, description, dataHash);
 			}
 			return description;
 		},
@@ -241,7 +251,7 @@ async function describeImagePart(
 }
 
 function createImageDescriptionText(description: string): string {
-	return `[Image Description: ${description}]`;
+	return IMAGE_DESCRIPTION_PREFIX + description + IMAGE_DESCRIPTION_SUFFIX;
 }
 
 function isImageDataPart(part: unknown): part is vscode.LanguageModelDataPart {