/**
 * Sniffs the leading "magic" bytes of an image payload to work out its real format.
 * The declared MIME type is usually derived from a file extension and may be wrong
 * (for example JPEG content stored under a `.png` name), so the byte signature is
 * treated as the more trustworthy source.
 *
 * Signatures are tried in a fixed order: JPEG, PNG, GIF, WebP, BMP. The first match wins.
 *
 * @param data Raw bytes of the candidate image.
 * @returns The matching `ChatImageMimeType`, or `undefined` when fewer than four bytes
 * are supplied or when no known signature matches.
 */
export function detectImageMimeType(data: Uint8Array): ChatImageMimeType | undefined {
	// Nothing is classified unless at least four bytes are available.
	const minimumHeaderLength = 4;
	if (data.length < minimumHeaderLength) {
		return undefined;
	}

	// True when `expected` occurs verbatim in `data` starting at `offset`.
	// Reading past the end yields `undefined`, which never equals a byte value,
	// so a payload that is too short for a signature simply fails to match it.
	const hasBytesAt = (offset: number, expected: readonly number[]): boolean =>
		expected.every((byte, index) => data[offset + index] === byte);

	// Ordered signature table. Each entry lists the [offset, bytes] segments that must all be present.
	const signatures: ReadonlyArray<{
		readonly mimeType: ChatImageMimeType;
		readonly segments: ReadonlyArray<readonly [number, readonly number[]]>;
	}> = [
		// JPEG: FF D8 FF
		{ mimeType: ChatImageMimeType.JPEG, segments: [[0, [0xFF, 0xD8, 0xFF]]] },
		// PNG: 89 50 4E 47
		{ mimeType: ChatImageMimeType.PNG, segments: [[0, [0x89, 0x50, 0x4E, 0x47]]] },
		// GIF: 47 49 46 38 ("GIF8")
		{ mimeType: ChatImageMimeType.GIF, segments: [[0, [0x47, 0x49, 0x46, 0x38]]] },
		// WebP: "RIFF" at offset 0, four bytes that are not inspected, then "WEBP" at offset 8
		{ mimeType: ChatImageMimeType.WEBP, segments: [[0, [0x52, 0x49, 0x46, 0x46]], [8, [0x57, 0x45, 0x42, 0x50]]] },
		// BMP: 42 4D ("BM")
		{ mimeType: ChatImageMimeType.BMP, segments: [[0, [0x42, 0x4D]]] },
	];

	const hit = signatures.find(({ segments }) =>
		segments.every(([offset, expected]) => hasBytesAt(offset, expected)));
	return hit?.mimeType;
}
@@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +import { describe, expect, test } from 'vitest'; +import { ChatImageMimeType, detectImageMimeType } from '../languageModelChatMessageHelpers'; + +describe('detectImageMimeType', () => { + test('detects JPEG from magic bytes', () => { + const data = new Uint8Array([0xFF, 0xD8, 0xFF, 0xE0, 0x00]); + expect(detectImageMimeType(data)).toBe(ChatImageMimeType.JPEG); + }); + + test('detects PNG from magic bytes', () => { + const data = new Uint8Array([0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]); + expect(detectImageMimeType(data)).toBe(ChatImageMimeType.PNG); + }); + + test('detects GIF from magic bytes', () => { + const data = new Uint8Array([0x47, 0x49, 0x46, 0x38, 0x39, 0x61]); + expect(detectImageMimeType(data)).toBe(ChatImageMimeType.GIF); + }); + + test('detects WebP from magic bytes', () => { + // RIFF....WEBP + const data = new Uint8Array([0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50]); + expect(detectImageMimeType(data)).toBe(ChatImageMimeType.WEBP); + }); + + test('detects BMP from magic bytes', () => { + const data = new Uint8Array([0x42, 0x4D, 0x00, 0x00]); + expect(detectImageMimeType(data)).toBe(ChatImageMimeType.BMP); + }); + + test('returns undefined for unknown format', () => { + const data = new Uint8Array([0x00, 0x01, 0x02, 0x03]); + expect(detectImageMimeType(data)).toBeUndefined(); + }); + + test('returns undefined for data shorter than 4 bytes', () => { + const data = new Uint8Array([0xFF, 0xD8]); + expect(detectImageMimeType(data)).toBeUndefined(); + }); + + test('returns undefined for empty data', () => { + const data = new Uint8Array(0); + 
expect(detectImageMimeType(data)).toBeUndefined(); + }); + + test('correctly identifies JPEG when file extension might suggest PNG', () => { + // This is the actual bug scenario: file named .png but content is JPEG + const jpegData = new Uint8Array([0xFF, 0xD8, 0xFF, 0xE1, 0x00, 0x10]); + expect(detectImageMimeType(jpegData)).toBe(ChatImageMimeType.JPEG); + }); +}); diff --git a/src/extension/prompts/node/panel/toolCalling.tsx b/src/extension/prompts/node/panel/toolCalling.tsx index c379cb4500..c59383633b 100644 --- a/src/extension/prompts/node/panel/toolCalling.tsx +++ b/src/extension/prompts/node/panel/toolCalling.tsx @@ -32,7 +32,7 @@ import { URI, UriComponents } from '../../../../util/vs/base/common/uri'; import { IInstantiationService, ServicesAccessor } from '../../../../util/vs/platform/instantiation/common/instantiation'; import { ServiceCollection } from '../../../../util/vs/platform/instantiation/common/serviceCollection'; import { LanguageModelDataPart, LanguageModelDataPart2, LanguageModelPartAudience, LanguageModelPromptTsxPart, LanguageModelTextPart, LanguageModelTextPart2, LanguageModelToolMCPSource, LanguageModelToolResult } from '../../../../vscodeTypes'; -import { isImageDataPart } from '../../../conversation/common/languageModelChatMessageHelpers'; +import { detectImageMimeType, isImageDataPart } from '../../../conversation/common/languageModelChatMessageHelpers'; import { IResultMetadata } from '../../../prompt/common/conversation'; import { IBuildPromptContext, IToolCall, IToolCallRound } from '../../../prompt/common/intents'; import { toJsonSchema } from '../../../tools/common/toJsonSchema'; @@ -456,7 +456,7 @@ enum ToolInvocationOutcome { export async function imageDataPartToTSX(part: LanguageModelDataPart, githubToken?: string, urlOrRequestMetadata?: string | RequestMetadata, logService?: ILogService, imageService?: IImageService) { if (isImageDataPart(part)) { let imageData: Uint8Array = part.data; - let mimeType = part.mimeType; + let 
mimeType = detectImageMimeType(part.data) ?? part.mimeType; if (imageService) { try { @@ -473,7 +473,7 @@ export async function imageDataPartToTSX(part: LanguageModelDataPart, githubToke const isChatRequest = typeof urlOrRequestMetadata !== 'string' && (urlOrRequestMetadata?.type === RequestType.ChatCompletions || urlOrRequestMetadata?.type === RequestType.ChatMessages); if (githubToken && isChatRequest && imageService) { try { - const uri = await imageService.uploadChatImageAttachment(imageData, 'tool-result-image', mimeType ?? 'image/png', githubToken); + const uri = await imageService.uploadChatImageAttachment(imageData, 'tool-result-image', mimeType ?? 'image/png', githubToken); if (uri) { imageSource = uri.toString(); } @@ -638,6 +638,12 @@ class PrimitiveToolResult extends PromptEle */ private imageSizeBudgetLeft = (5 * 1024 * 1024) / 2; // 5MB + /** + * Track total image count to stay within model limits (e.g. Gemini's max_prompt_images). + * Reserve some budget for user-attached images by using half the model's limit. + */ + private imageCountBudgetLeft: number; + constructor( props: T, @IPromptEndpoint protected readonly endpoint: IPromptEndpoint, @@ -649,6 +655,9 @@ class PrimitiveToolResult extends PromptEle ) { super(props); this.linkedResources = this.props.content.filter((c): c is LanguageModelDataPart => c instanceof LanguageModelDataPart && c.mimeType === McpLinkedResourceToolResult.mimeType); + this.imageCountBudgetLeft = endpoint?.maxPromptImages !== undefined + ? 
Math.max(1, Math.floor(endpoint.maxPromptImages / 2)) + : Infinity; } async render(): Promise { @@ -696,13 +705,19 @@ class PrimitiveToolResult extends PromptEle return '[Image content is not available because vision is not supported by the current model or is disabled by your organization.]'; } + // Check image count budget first + if (this.imageCountBudgetLeft <= 0) { + return ''; + } + this.imageCountBudgetLeft--; + const githubToken = (await this.authService.getGitHubSession('any', { silent: true }))?.accessToken; const uploadsEnabled = this.configurationService && this.experimentationService ? this.configurationService.getExperimentBasedConfig(ConfigKey.EnableChatImageUpload, this.experimentationService) : false; // Anthropic (from CAPI) currently does not support image uploads from tool calls. - const uploadToken = uploadsEnabled && modelCanUseMcpResultImageURL(this.endpoint) ? githubToken : undefined; + const uploadToken = uploadsEnabled && this.endpoint && modelCanUseMcpResultImageURL(this.endpoint) ? githubToken : undefined; if (!uploadToken) { if (this.imageSizeBudgetLeft < 0) { @@ -715,7 +730,7 @@ class PrimitiveToolResult extends PromptEle } } - return Promise.resolve(imageDataPartToTSX(part, uploadToken, this.endpoint.urlOrRequestMetadata, this.logService, this.imageService)); + return Promise.resolve(imageDataPartToTSX(part, uploadToken, this.endpoint?.urlOrRequestMetadata, this.logService, this.imageService)); } protected onTSX(part: JSONTree.PromptElementJSON) {