Skip to content

Commit 112b037

Browse files
author
Jurij Skornik
committed
fix(document-to-markdown): address PR review comments and fix validation bugs
- Fail fast on provider init instead of lazy-loading (zsculac review) - Add REST endpoint POST /document-to-markdown with multipart upload and OpenAPI docs (zsculac review) - Move file type/size validation into processDocument() so both MCP and REST paths validate consistently - Add busboy error and close handlers for malformed/empty multipart requests - Remove redundant imageFilename variable in blob-integration - Remove unused MistralProvider import in providers/index - Update tests to use mock providers and fresh MCP servers
1 parent c9dc5e2 commit 112b037

4 files changed

Lines changed: 167 additions & 95 deletions

File tree

packages/plugin-dkg-essentials/src/plugins/document-to-markdown/blob-integration.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,10 @@ async function uploadImages(
5555
folderId: string,
5656
): Promise<void> {
5757
for (const image of images) {
58-
const imageFilename = `${image.id}`; // ID already includes extension (e.g., "img-0.jpeg")
59-
const imageBlobId = `${BLOB_PREFIX}/${folderId}/${imageFilename}`;
58+
const imageBlobId = `${BLOB_PREFIX}/${folderId}/${image.id}`; // ID already includes extension (e.g., "img-0.jpeg")
6059
const imageStream = Readable.toWeb(Readable.from(image.data));
6160
await ctx.blob.put(imageBlobId, imageStream, {
62-
name: imageFilename,
61+
name: image.id,
6362
mimeType: `image/${image.originalFormat}`,
6463
});
6564
image.blobId = imageBlobId;

packages/plugin-dkg-essentials/src/plugins/document-to-markdown/index.ts

Lines changed: 124 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,12 @@
99
* - DOCUMENT_CONVERSION_PROVIDER (optional, default: "mistral")
1010
*/
1111

12+
import { Readable } from "stream";
1213
import consumers from "stream/consumers";
1314
import { defineDkgPlugin, DkgContext } from "@dkg/plugins";
1415
import { z } from "@dkg/plugins/helpers";
16+
import { openAPIRoute } from "@dkg/plugin-swagger";
17+
import busboy from "busboy";
1518

1619
import type {
1720
DocumentConversionProvider,
@@ -20,6 +23,7 @@ import type {
2023
DocumentConversionConfig,
2124
} from "./types";
2225
import { integrateWithBlobStorage } from "./blob-integration";
26+
import { validateFileType, validateFileSize } from "./validation";
2327
import { createProvider } from "./providers";
2428

2529
// Re-export types for external use
@@ -56,7 +60,7 @@ export {
5660

5761
/**
5862
* Process a document and convert it to markdown with image extraction.
59-
* Uses the provided provider for conversion, then integrates with blob storage.
63+
* Validates file type and size, converts via provider, then integrates with blob storage.
6064
*/
6165
async function processDocument(
6266
ctx: DkgContext,
@@ -65,6 +69,10 @@ async function processDocument(
6569
filename: string,
6670
options?: DocumentConversionOptions,
6771
): Promise<ConversionResult> {
72+
// Validate before handing off to provider (providers may also validate internally)
73+
validateFileType(filename);
74+
validateFileSize(documentBuffer.length);
75+
6876
console.log(`Converting document using provider: ${provider.name}`);
6977

7078
// Convert document using provider
@@ -84,7 +92,7 @@ async function processDocument(
8492
export function createDocumentToMarkdownPlugin(
8593
config?: DocumentConversionConfig,
8694
) {
87-
return defineDkgPlugin((ctx, mcp) => {
95+
return defineDkgPlugin((ctx, mcp, api) => {
8896
// Resolve provider: use provided instance, create by name, or use default
8997
let provider: DocumentConversionProvider;
9098

@@ -98,24 +106,126 @@ export function createDocumentToMarkdownPlugin(
98106
process.env.DOCUMENT_CONVERSION_PROVIDER ??
99107
"mistral";
100108

101-
try {
102-
provider = createProvider(providerName);
103-
} catch (error) {
104-
// Log warning but don't fail plugin initialization
105-
// Provider errors will surface when the tool is actually used
106-
console.warn(
107-
`Document conversion provider "${providerName}" initialization deferred: ` +
108-
`${error instanceof Error ? error.message : String(error)}`,
109-
);
110-
// Create a lazy provider that initializes on first use
111-
provider = createLazyProvider(providerName);
112-
}
109+
provider = createProvider(providerName);
113110
}
114111

115112
console.log(
116113
`Document-to-markdown plugin initialized with provider: ${provider.name}`,
117114
);
118115

116+
// REST endpoint for document-to-markdown conversion
117+
api.post(
118+
"/document-to-markdown",
119+
openAPIRoute(
120+
{
121+
summary: "Convert document to Markdown",
122+
description:
123+
"Upload a PDF, DOCX, or PPTX file and convert it to Markdown using OCR. " +
124+
"Returns the extracted markdown content and any images stored in blob storage.",
125+
tag: "Documents",
126+
response: {
127+
schema: z.object({
128+
markdown: z.string().openapi({
129+
description: "The extracted markdown content",
130+
}),
131+
markdownBlobId: z.string().openapi({
132+
description: "Blob ID of the stored markdown file",
133+
}),
134+
outputFolderId: z.string().openapi({
135+
description: "Blob folder ID containing all conversion outputs",
136+
}),
137+
pageCount: z.number().openapi({
138+
description: "Number of pages processed",
139+
}),
140+
images: z.array(
141+
z.object({
142+
id: z.string(),
143+
blobId: z.string(),
144+
}),
145+
).openapi({
146+
description: "List of extracted images with their blob IDs",
147+
}),
148+
}),
149+
},
150+
finalizeRouteConfig(cfg) {
151+
cfg.request = {
152+
body: {
153+
required: true,
154+
description:
155+
"Document file to convert. Supported formats: PDF, DOCX, PPTX.",
156+
content: {
157+
"multipart/form-data": {
158+
schema: z.object({
159+
file: z.string().openapi({
160+
description: "The document file to convert",
161+
format: "binary",
162+
}),
163+
}),
164+
},
165+
},
166+
},
167+
};
168+
return cfg;
169+
},
170+
},
171+
async (req, res) => {
172+
let fileReceived = false;
173+
174+
const bb = busboy({ headers: req.headers });
175+
176+
bb.on("file", async (_name, file, info) => {
177+
fileReceived = true;
178+
try {
179+
const documentBuffer = await consumers.buffer(
180+
Readable.toWeb(file),
181+
);
182+
183+
const result = await processDocument(
184+
ctx,
185+
provider,
186+
documentBuffer,
187+
info.filename,
188+
);
189+
190+
res.status(200).json({
191+
markdown: result.markdown,
192+
markdownBlobId: result.markdownBlobId,
193+
outputFolderId: result.outputFolderId,
194+
pageCount: result.pageCount,
195+
images: result.images.map((img) => ({
196+
id: img.id,
197+
blobId: img.blobId ?? "",
198+
})),
199+
});
200+
} catch (error) {
201+
const message =
202+
error instanceof Error ? error.message : String(error);
203+
console.error(
204+
"Error converting document to markdown:",
205+
message,
206+
);
207+
res.status(400).json({ error: message });
208+
}
209+
});
210+
211+
bb.on("error", (error: Error) => {
212+
console.error("Error parsing multipart request:", error.message);
213+
res.status(400).json({ error: "Invalid multipart request." });
214+
});
215+
216+
bb.on("close", () => {
217+
if (!fileReceived) {
218+
res
219+
.status(400)
220+
.json({ error: "No file provided in the request." });
221+
}
222+
});
223+
224+
req.pipe(bb);
225+
},
226+
),
227+
);
228+
119229
mcp.registerTool(
120230
"document-to-markdown",
121231
{
@@ -232,27 +342,6 @@ export function createDocumentToMarkdownPlugin(
232342
});
233343
}
234344

235-
/**
236-
* Create a lazy provider that defers initialization until first use.
237-
* This allows the plugin to initialize even if provider config is missing.
238-
*/
239-
function createLazyProvider(providerName: string): DocumentConversionProvider {
240-
let actualProvider: DocumentConversionProvider | null = null;
241-
242-
return {
243-
get name() {
244-
return actualProvider?.name ?? `${providerName} (lazy)`;
245-
},
246-
247-
async convert(buffer, filename, options) {
248-
if (!actualProvider) {
249-
actualProvider = createProvider(providerName);
250-
}
251-
return actualProvider.convert(buffer, filename, options);
252-
},
253-
};
254-
}
255-
256345
/**
257346
* Default plugin export - uses Mistral provider.
258347
* For custom provider configuration, use createDocumentToMarkdownPlugin().

packages/plugin-dkg-essentials/src/plugins/document-to-markdown/providers/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
*/
55

66
import type { DocumentConversionProvider } from "../types";
7-
import { MistralProvider, createMistralProvider } from "./mistral";
7+
import { createMistralProvider } from "./mistral";
88

99
// Re-export provider implementations
1010
export { MistralProvider, createMistralProvider } from "./mistral";

0 commit comments

Comments
 (0)