Skip to content

Commit 62b3fa0

Browse files
Jurij Skornik and claude committed
fix(document-to-markdown): clamp page ranges, update descriptions, and improve tests
- Clamp pageStart/pageEnd to valid bounds to fix wrong page numbering when callers pass out-of-range values (e.g. pageStart: 0)
- Update MCP tool and REST endpoint descriptions to remove "using OCR" and note that capabilities depend on the configured provider
- Fix stale docstring: providerName default is "unpdf", not "mistral"
- Remove 2 duplicate tests in Provider Registry
- Add 3 edge case tests: pageStart-only, pageEnd-only, single-page PDF

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 112b037 commit 62b3fa0

8 files changed

Lines changed: 773 additions & 79 deletions

File tree

package-lock.json

Lines changed: 479 additions & 61 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/plugin-dkg-essentials/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,15 @@
3232
"@mistralai/mistralai": "^1.3.0",
3333
"busboy": "^1.6.0",
3434
"sparqljs": "^3.7.3",
35-
"undici": "^6.19.0"
35+
"undici": "^6.19.0",
36+
"unpdf": "^0.12.1"
3637
},
3738
"devDependencies": {
3839
"@dkg/eslint-config": "*",
3940
"@dkg/typescript-config": "*",
4041
"@types/busboy": "^1.5.4",
4142
"@types/sparqljs": "^3.1.12",
43+
"pdf-lib": "^1.17.1",
4244
"tsup": "^8.5.0"
4345
}
4446
}

packages/plugin-dkg-essentials/src/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ export {
2929
isProviderAvailable,
3030
MistralProvider,
3131
createMistralProvider,
32+
UnpdfProvider,
33+
createUnpdfProvider,
3234
} from "./plugins/document-to-markdown";
3335

3436
export default defineDkgPlugin((ctx, mcp, api) => {

packages/plugin-dkg-essentials/src/plugins/document-to-markdown/index.ts

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
* Document to Markdown MCP Tool
33
*
44
* Converts PDF, DOCX, and PPTX documents to Markdown using configurable conversion providers.
5-
* Default provider: Mistral OCR.
5+
* Default provider: unpdf (zero-config PDF text extraction).
66
*
77
* Environment variables:
8-
* - MISTRAL_API_KEY (required for Mistral provider)
9-
* - DOCUMENT_CONVERSION_PROVIDER (optional, default: "mistral")
8+
* - DOCUMENT_CONVERSION_PROVIDER (optional, default: "unpdf")
9+
* - MISTRAL_API_KEY (required when using Mistral provider)
1010
*/
1111

1212
import { Readable } from "stream";
@@ -44,6 +44,8 @@ export {
4444
isProviderAvailable,
4545
MistralProvider,
4646
createMistralProvider,
47+
UnpdfProvider,
48+
createUnpdfProvider,
4749
} from "./providers";
4850

4951
// Re-export validation utilities
@@ -104,7 +106,7 @@ export function createDocumentToMarkdownPlugin(
104106
config?.providerName ??
105107
// eslint-disable-next-line turbo/no-undeclared-env-vars
106108
process.env.DOCUMENT_CONVERSION_PROVIDER ??
107-
"mistral";
109+
"unpdf";
108110

109111
provider = createProvider(providerName);
110112
}
@@ -120,7 +122,8 @@ export function createDocumentToMarkdownPlugin(
120122
{
121123
summary: "Convert document to Markdown",
122124
description:
123-
"Upload a PDF, DOCX, or PPTX file and convert it to Markdown using OCR. " +
125+
"Upload a PDF, DOCX, or PPTX file and convert it to Markdown. " +
126+
"Supported document types and image extraction capabilities depend on the configured provider. " +
124127
"Returns the extracted markdown content and any images stored in blob storage.",
125128
tag: "Documents",
126129
response: {
@@ -231,8 +234,9 @@ export function createDocumentToMarkdownPlugin(
231234
{
232235
title: "Document to Markdown",
233236
description:
234-
"Convert PDF, DOCX, or PPTX documents to Markdown using OCR. " +
235-
"Extracts text and images from documents. Use this as the first step when publishing documents to the DKG - " +
237+
"Convert PDF, DOCX, or PPTX documents to Markdown. " +
238+
"Supported document types and image extraction capabilities depend on the configured provider. " +
239+
"Use this as the first step when publishing documents to the DKG - " +
236240
"the markdown output can then be transformed to JSON-LD and published using dkg-create.",
237241
inputSchema: {
238242
blobId: z
@@ -343,7 +347,7 @@ export function createDocumentToMarkdownPlugin(
343347
}
344348

345349
/**
346-
* Default plugin export - uses Mistral provider.
347-
* For custom provider configuration, use createDocumentToMarkdownPlugin().
350+
* Default plugin export - uses unpdf provider (zero-config).
351+
* For Mistral OCR, use createDocumentToMarkdownPlugin({ providerName: "mistral" }).
348352
*/
349353
export default createDocumentToMarkdownPlugin();

packages/plugin-dkg-essentials/src/plugins/document-to-markdown/providers/index.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55

66
import type { DocumentConversionProvider } from "../types";
77
import { createMistralProvider } from "./mistral";
8+
import { createUnpdfProvider } from "./unpdf";
89

910
// Re-export provider implementations
1011
export { MistralProvider, createMistralProvider } from "./mistral";
12+
export { UnpdfProvider, createUnpdfProvider } from "./unpdf";
1113

1214
/**
1315
* Map of provider names to factory functions
@@ -17,12 +19,13 @@ const PROVIDER_FACTORIES: Record<
1719
(options?: Record<string, unknown>) => DocumentConversionProvider
1820
> = {
1921
mistral: (options) => createMistralProvider(options?.apiKey as string),
22+
unpdf: () => createUnpdfProvider(),
2023
};
2124

2225
/**
2326
* Default provider name
2427
*/
25-
const DEFAULT_PROVIDER = "mistral";
28+
const DEFAULT_PROVIDER = "unpdf";
2629

2730
/**
2831
* Get list of available provider names
@@ -41,7 +44,7 @@ export function isProviderAvailable(name: string): boolean {
4144
/**
4245
* Create a provider by name.
4346
*
44-
* @param name - Provider name (default: "mistral")
47+
* @param name - Provider name (default: "unpdf")
4548
* @param options - Provider-specific options
4649
* @throws Error if provider is not found
4750
*/
@@ -60,8 +63,8 @@ export function createProvider(
6063
}
6164

6265
/**
63-
* Get the default provider (Mistral).
64-
* This is the most common use case - just get a working provider.
66+
* Get the default provider (unpdf).
67+
* This is the most common use case - just get a working provider with zero config.
6568
*/
6669
export function getDefaultProvider(): DocumentConversionProvider {
6770
return createProvider(DEFAULT_PROVIDER);
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/**
2+
* unpdf provider for document-to-markdown conversion.
3+
* Uses Mozilla pdf.js (via unpdf) to extract text from non-scanned PDFs.
4+
* Zero-config — no API key required.
5+
*/
6+
7+
import { extractText } from "unpdf";
8+
9+
import type {
10+
DocumentConversionProvider,
11+
DocumentConversionOptions,
12+
DocumentConversionOutput,
13+
} from "../types";
14+
import { getFileExtension, validateFileSize } from "../validation";
15+
16+
const PROVIDER_NAME = "unpdf";
17+
18+
/**
 * unpdf provider for PDF text extraction.
 * Supports .pdf files only — for .docx/.pptx or scanned PDFs, use Mistral.
 */
export class UnpdfProvider implements DocumentConversionProvider {
  readonly name = PROVIDER_NAME;

  /**
   * Extract the text of a PDF and return it as Markdown, one section per page.
   *
   * Each page is emitted as `<!-- Page N -->` followed by that page's text,
   * where N is the 1-indexed page number in the original document.
   *
   * @param buffer - Raw bytes of the document.
   * @param filename - Name of the document; only its extension is inspected.
   * @param options - Optional `pageStart` / `pageEnd` (1-indexed, inclusive).
   *   Out-of-range values are clamped to the document; an empty range yields
   *   empty markdown rather than an error.
   * @returns The markdown, an always-empty `images` list (this provider does
   *   not extract images), and `pageCount` set to the document's total pages
   *   (not the number of pages selected).
   * @throws Error if the file extension is not `.pdf`.
   */
  async convert(
    buffer: Buffer,
    filename: string,
    options?: DocumentConversionOptions,
  ): Promise<DocumentConversionOutput> {
    // Only PDF is supported — reject other formats with a helpful message
    const ext = getFileExtension(filename);
    if (ext !== ".pdf") {
      throw new Error(
        `The unpdf provider only supports .pdf files, got '${ext || "(no extension)"}'. ` +
          `For .docx and .pptx support, use the Mistral provider ` +
          `(set DOCUMENT_CONVERSION_PROVIDER=mistral and provide MISTRAL_API_KEY).`,
      );
    }

    // NOTE(review): presumably throws when the buffer exceeds the size limit — confirm in ../validation
    validateFileSize(buffer.length);

    // Extract text from PDF using pdf.js; mergePages: false keeps one entry per page
    const { totalPages, text } = await extractText(new Uint8Array(buffer), {
      mergePages: false,
    });

    // Clamp page range to valid bounds (1-indexed inputs, 0-indexed array).
    // Both bounds are truncated to integers so page labels stay whole numbers.
    const effectiveStart = Math.max(1, Math.trunc(options?.pageStart ?? 1));
    // The end bound needs a floor of 0 as well as a ceiling: Array.prototype.slice
    // treats a negative end as an offset from the END of the array, so an
    // unclamped pageEnd of -1 would return every page except the last.
    const effectiveEnd = Math.max(
      0,
      Math.min(totalPages, Math.trunc(options?.pageEnd ?? totalPages)),
    );

    let pages = text;
    if (options?.pageStart != null || options?.pageEnd != null) {
      // slice() returns [] when start >= end, so an inverted or fully
      // out-of-range request produces no pages.
      pages = text.slice(effectiveStart - 1, effectiveEnd);
    }

    // Format pages with separators
    const markdown = pages
      .map((pageText, i) => {
        const pageNum = effectiveStart + i;
        return `<!-- Page ${pageNum} -->\n\n${pageText}`;
      })
      .join("\n\n");

    return {
      markdown,
      images: [],
      pageCount: totalPages,
    };
  }
}
71+
72+
/**
73+
* Create an unpdf provider instance. Takes no arguments and needs no configuration; each call returns a new UnpdfProvider.
74+
*/
75+
export function createUnpdfProvider(): UnpdfProvider {
76+
return new UnpdfProvider();
77+
}

packages/plugin-dkg-essentials/src/plugins/document-to-markdown/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,6 @@ export interface DocumentConversionProvider {
8787
export interface DocumentConversionConfig {
8888
/** Provider to use for conversion (default: auto-detected) */
8989
provider?: DocumentConversionProvider;
90-
/** Provider name to use if no provider instance given (default: "mistral") */
90+
/** Provider name to use if no provider instance given (default: "unpdf") */
9191
providerName?: string;
9292
}

0 commit comments

Comments
 (0)