99 * - DOCUMENT_CONVERSION_PROVIDER (optional, default: "mistral")
1010 */
1111
12+ import { Readable } from "stream" ;
1213import consumers from "stream/consumers" ;
1314import { defineDkgPlugin , DkgContext } from "@dkg/plugins" ;
1415import { z } from "@dkg/plugins/helpers" ;
16+ import { openAPIRoute } from "@dkg/plugin-swagger" ;
17+ import busboy from "busboy" ;
1518
1619import type {
1720 DocumentConversionProvider ,
@@ -20,6 +23,7 @@ import type {
2023 DocumentConversionConfig ,
2124} from "./types" ;
2225import { integrateWithBlobStorage } from "./blob-integration" ;
26+ import { validateFileType , validateFileSize } from "./validation" ;
2327import { createProvider } from "./providers" ;
2428
2529// Re-export types for external use
@@ -56,7 +60,7 @@ export {
5660
5761/**
5862 * Process a document and convert it to markdown with image extraction.
59- * Uses the provided provider for conversion , then integrates with blob storage.
63+ * Validates file type and size, converts via provider , then integrates with blob storage.
6064 */
6165async function processDocument (
6266 ctx : DkgContext ,
@@ -65,6 +69,10 @@ async function processDocument(
6569 filename : string ,
6670 options ?: DocumentConversionOptions ,
6771) : Promise < ConversionResult > {
72+ // Validate before handing off to provider (providers may also validate internally)
73+ validateFileType ( filename ) ;
74+ validateFileSize ( documentBuffer . length ) ;
75+
6876 console . log ( `Converting document using provider: ${ provider . name } ` ) ;
6977
7078 // Convert document using provider
@@ -84,7 +92,7 @@ async function processDocument(
8492export function createDocumentToMarkdownPlugin (
8593 config ?: DocumentConversionConfig ,
8694) {
87- return defineDkgPlugin ( ( ctx , mcp ) => {
95+ return defineDkgPlugin ( ( ctx , mcp , api ) => {
8896 // Resolve provider: use provided instance, create by name, or use default
8997 let provider : DocumentConversionProvider ;
9098
@@ -98,24 +106,126 @@ export function createDocumentToMarkdownPlugin(
98106 process . env . DOCUMENT_CONVERSION_PROVIDER ??
99107 "mistral" ;
100108
101- try {
102- provider = createProvider ( providerName ) ;
103- } catch ( error ) {
104- // Log warning but don't fail plugin initialization
105- // Provider errors will surface when the tool is actually used
106- console . warn (
107- `Document conversion provider "${ providerName } " initialization deferred: ` +
108- `${ error instanceof Error ? error . message : String ( error ) } ` ,
109- ) ;
110- // Create a lazy provider that initializes on first use
111- provider = createLazyProvider ( providerName ) ;
112- }
109+ provider = createProvider ( providerName ) ;
113110 }
114111
115112 console . log (
116113 `Document-to-markdown plugin initialized with provider: ${ provider . name } ` ,
117114 ) ;
118115
116+ // REST endpoint for document-to-markdown conversion
117+ api . post (
118+ "/document-to-markdown" ,
119+ openAPIRoute (
120+ {
121+ summary : "Convert document to Markdown" ,
122+ description :
123+ "Upload a PDF, DOCX, or PPTX file and convert it to Markdown using OCR. " +
124+ "Returns the extracted markdown content and any images stored in blob storage." ,
125+ tag : "Documents" ,
126+ response : {
127+ schema : z . object ( {
128+ markdown : z . string ( ) . openapi ( {
129+ description : "The extracted markdown content" ,
130+ } ) ,
131+ markdownBlobId : z . string ( ) . openapi ( {
132+ description : "Blob ID of the stored markdown file" ,
133+ } ) ,
134+ outputFolderId : z . string ( ) . openapi ( {
135+ description : "Blob folder ID containing all conversion outputs" ,
136+ } ) ,
137+ pageCount : z . number ( ) . openapi ( {
138+ description : "Number of pages processed" ,
139+ } ) ,
140+ images : z . array (
141+ z . object ( {
142+ id : z . string ( ) ,
143+ blobId : z . string ( ) ,
144+ } ) ,
145+ ) . openapi ( {
146+ description : "List of extracted images with their blob IDs" ,
147+ } ) ,
148+ } ) ,
149+ } ,
150+ finalizeRouteConfig ( cfg ) {
151+ cfg . request = {
152+ body : {
153+ required : true ,
154+ description :
155+ "Document file to convert. Supported formats: PDF, DOCX, PPTX." ,
156+ content : {
157+ "multipart/form-data" : {
158+ schema : z . object ( {
159+ file : z . string ( ) . openapi ( {
160+ description : "The document file to convert" ,
161+ format : "binary" ,
162+ } ) ,
163+ } ) ,
164+ } ,
165+ } ,
166+ } ,
167+ } ;
168+ return cfg ;
169+ } ,
170+ } ,
171+ async ( req , res ) => {
172+ let fileReceived = false ;
173+
174+ const bb = busboy ( { headers : req . headers } ) ;
175+
176+ bb . on ( "file" , async ( _name , file , info ) => {
177+ fileReceived = true ;
178+ try {
179+ const documentBuffer = await consumers . buffer (
180+ Readable . toWeb ( file ) ,
181+ ) ;
182+
183+ const result = await processDocument (
184+ ctx ,
185+ provider ,
186+ documentBuffer ,
187+ info . filename ,
188+ ) ;
189+
190+ res . status ( 200 ) . json ( {
191+ markdown : result . markdown ,
192+ markdownBlobId : result . markdownBlobId ,
193+ outputFolderId : result . outputFolderId ,
194+ pageCount : result . pageCount ,
195+ images : result . images . map ( ( img ) => ( {
196+ id : img . id ,
197+ blobId : img . blobId ?? "" ,
198+ } ) ) ,
199+ } ) ;
200+ } catch ( error ) {
201+ const message =
202+ error instanceof Error ? error . message : String ( error ) ;
203+ console . error (
204+ "Error converting document to markdown:" ,
205+ message ,
206+ ) ;
207+ res . status ( 400 ) . json ( { error : message } ) ;
208+ }
209+ } ) ;
210+
211+ bb . on ( "error" , ( error : Error ) => {
212+ console . error ( "Error parsing multipart request:" , error . message ) ;
213+ res . status ( 400 ) . json ( { error : "Invalid multipart request." } ) ;
214+ } ) ;
215+
216+ bb . on ( "close" , ( ) => {
217+ if ( ! fileReceived ) {
218+ res
219+ . status ( 400 )
220+ . json ( { error : "No file provided in the request." } ) ;
221+ }
222+ } ) ;
223+
224+ req . pipe ( bb ) ;
225+ } ,
226+ ) ,
227+ ) ;
228+
119229 mcp . registerTool (
120230 "document-to-markdown" ,
121231 {
@@ -232,27 +342,6 @@ export function createDocumentToMarkdownPlugin(
232342 } ) ;
233343}
234344
235- /**
236- * Create a lazy provider that defers initialization until first use.
237- * This allows the plugin to initialize even if provider config is missing.
238- */
239- function createLazyProvider ( providerName : string ) : DocumentConversionProvider {
240- let actualProvider : DocumentConversionProvider | null = null ;
241-
242- return {
243- get name ( ) {
244- return actualProvider ?. name ?? `${ providerName } (lazy)` ;
245- } ,
246-
247- async convert ( buffer , filename , options ) {
248- if ( ! actualProvider ) {
249- actualProvider = createProvider ( providerName ) ;
250- }
251- return actualProvider . convert ( buffer , filename , options ) ;
252- } ,
253- } ;
254- }
255-
256345/**
257346 * Default plugin export - uses Mistral provider.
258347 * For custom provider configuration, use createDocumentToMarkdownPlugin().
0 commit comments