Skip to content

Commit ee696c1

Browse files
tessaherself and claude committed
feat: send PDFs as native file content parts for OpenAI
When a model declares `acceptedFileMimetypes` including "application/pdf", PDFs are now sent as OpenAI `file` content parts with base64 data instead of being wrapped in XML tags. This lets OpenAI (and compatible providers) process PDFs natively.

Non-PDF files and models without `acceptedFileMimetypes` are unaffected — existing text extraction behavior is preserved.

Depends on: huggingface#2189

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 182ee28 commit ee696c1

3 files changed

Lines changed: 57 additions & 19 deletions

File tree

src/lib/server/endpoints/openai/endpointOai.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,12 @@ export async function endpointOai(
174174
}) => {
175175
// Format messages for the chat API, handling multimodal content if supported
176176
let messagesOpenAI: OpenAI.Chat.Completions.ChatCompletionMessageParam[] =
177-
await prepareMessagesWithFiles(messages, imageProcessor, isMultimodal ?? model.multimodal);
177+
await prepareMessagesWithFiles(
178+
messages,
179+
imageProcessor,
180+
isMultimodal ?? model.multimodal,
181+
model.acceptedFileMimetypes
182+
);
178183

179184
// Normalize preprompt and handle empty values
180185
const normalizedPreprompt = typeof preprompt === "string" ? preprompt.trim() : "";

src/lib/server/textGeneration/mcp/runMcpFlow.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,8 @@ export async function* runMcpFlow({
344344
let messagesOpenAI: ChatCompletionMessageParam[] = await prepareMessagesWithFiles(
345345
messages,
346346
imageProcessor,
347-
mmEnabled
347+
mmEnabled,
348+
model.acceptedFileMimetypes
348349
);
349350
const toolPreprompt = buildToolPreprompt(oaTools);
350351
const prepromptPieces: string[] = [];

src/lib/server/textGeneration/utils/prepareFiles.ts

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,51 @@ import type { OpenAI } from "openai";
44
import { TEXT_MIME_ALLOWLIST } from "$lib/constants/mime";
55
import type { makeImageProcessor } from "$lib/server/endpoints/images";
66

7+
/** MIME types that OpenAI handles natively as file content parts */
8+
const NATIVE_FILE_MIMETYPES = ["application/pdf"] as const;
9+
10+
/**
 * Check whether a MIME type is covered by an allowlist of MIME patterns.
 *
 * Each allowlist entry has the form `type/subtype`, where either half may be
 * the wildcard `*` (for example `text/*`). Matching is case-insensitive.
 * Anything after a `;` (parameters such as `; charset=utf-8`) and any
 * surrounding whitespace is ignored on both sides, so
 * `application/pdf; charset=binary` matches `application/pdf`.
 *
 * @param mime - MIME type to test; a falsy value is treated as an empty string.
 * @param allowlist - Patterns to test against.
 * @returns `true` when at least one pattern matches both the type and the subtype.
 */
function matchesMimeAllowlist(mime: string, allowlist: readonly string[]): boolean {
	// Reduce a MIME string to its bare `type/subtype` form: drop parameters,
	// trim whitespace, and lowercase so comparisons are case-insensitive.
	const toEssence = (value: string): string =>
		((value || "").split(";")[0] ?? "").trim().toLowerCase();

	const [fileType, fileSubtype] = toEssence(mime).split("/");

	return allowlist.some((allowed) => {
		const [type, subtype] = toEssence(allowed).split("/");
		const typeOk = type === "*" || type === fileType;
		const subOk = subtype === "*" || subtype === fileSubtype;
		return typeOk && subOk;
	});
}
20+
721
/**
822
* Prepare chat messages for OpenAI-compatible multimodal payloads.
923
* - Processes images via the provided imageProcessor (resize/convert) when multimodal is enabled.
24+
* - Sends PDFs as native file content parts when the model accepts them.
1025
* - Injects text-file content into the user message text.
1126
* - Leaves messages untouched when no files or multimodal disabled.
1227
*/
1328
export async function prepareMessagesWithFiles(
1429
messages: EndpointMessage[],
1530
imageProcessor: ReturnType<typeof makeImageProcessor>,
16-
isMultimodal: boolean
31+
isMultimodal: boolean,
32+
acceptedFileMimetypes?: string[]
1733
): Promise<OpenAI.Chat.Completions.ChatCompletionMessageParam[]> {
1834
return Promise.all(
1935
messages.map(async (message) => {
2036
if (message.from === "user" && message.files && message.files.length > 0) {
21-
const { imageParts, textContent } = await prepareFiles(
37+
const { imageParts, fileParts, textContent } = await prepareFiles(
2238
imageProcessor,
2339
message.files,
24-
isMultimodal
40+
isMultimodal,
41+
acceptedFileMimetypes
2542
);
2643

2744
let messageText = message.content;
2845
if (textContent.length > 0) {
2946
messageText = textContent + "\n\n" + message.content;
3047
}
3148

32-
if (imageParts.length > 0 && isMultimodal) {
33-
const parts = [{ type: "text" as const, text: messageText }, ...imageParts];
49+
const multimodalParts = [...imageParts, ...fileParts];
50+
if (multimodalParts.length > 0) {
51+
const parts = [{ type: "text" as const, text: messageText }, ...multimodalParts];
3452
return { role: message.from, content: parts };
3553
}
3654

@@ -44,22 +62,25 @@ export async function prepareMessagesWithFiles(
4462
async function prepareFiles(
4563
imageProcessor: ReturnType<typeof makeImageProcessor>,
4664
files: MessageFile[],
47-
isMultimodal: boolean
65+
isMultimodal: boolean,
66+
acceptedFileMimetypes?: string[]
4867
): Promise<{
4968
imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[];
69+
fileParts: OpenAI.Chat.Completions.ChatCompletionContentPart.File[];
5070
textContent: string;
5171
}> {
5272
const imageFiles = files.filter((file) => file.mime.startsWith("image/"));
53-
const textFiles = files.filter((file) => {
54-
const mime = (file.mime || "").toLowerCase();
55-
const [fileType, fileSubtype] = mime.split("/");
56-
return TEXT_MIME_ALLOWLIST.some((allowed) => {
57-
const [type, subtype] = allowed.toLowerCase().split("/");
58-
const typeOk = type === "*" || type === fileType;
59-
const subOk = subtype === "*" || subtype === fileSubtype;
60-
return typeOk && subOk;
61-
});
62-
});
73+
const textFiles = files.filter((file) => matchesMimeAllowlist(file.mime, TEXT_MIME_ALLOWLIST));
74+
75+
// Files that the model accepts natively (e.g. PDFs via OpenAI's file content part)
76+
const nativeFiles = files.filter(
77+
(file) =>
78+
!file.mime.startsWith("image/") &&
79+
!matchesMimeAllowlist(file.mime, TEXT_MIME_ALLOWLIST) &&
80+
acceptedFileMimetypes &&
81+
matchesMimeAllowlist(file.mime, acceptedFileMimetypes) &&
82+
matchesMimeAllowlist(file.mime, NATIVE_FILE_MIMETYPES)
83+
);
6384

6485
let imageParts: OpenAI.Chat.Completions.ChatCompletionContentPartImage[] = [];
6586
if (isMultimodal && imageFiles.length > 0) {
@@ -73,6 +94,17 @@ async function prepareFiles(
7394
}));
7495
}
7596

97+
// Send natively-supported files as OpenAI file content parts
98+
const fileParts: OpenAI.Chat.Completions.ChatCompletionContentPart.File[] = nativeFiles.map(
99+
(file) => ({
100+
type: "file" as const,
101+
file: {
102+
filename: file.name,
103+
file_data: `data:${file.mime};base64,${file.value}`,
104+
},
105+
})
106+
);
107+
76108
let textContent = "";
77109
if (textFiles.length > 0) {
78110
const textParts = await Promise.all(
@@ -84,5 +116,5 @@ async function prepareFiles(
84116
textContent = textParts.join("\n\n");
85117
}
86118

87-
return { imageParts, textContent };
119+
return { imageParts, fileParts, textContent };
88120
}

0 commit comments

Comments
 (0)