From e33fc468b2aef53e753d8403627a00520268e7cb Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Thu, 23 Apr 2026 14:16:08 +0100 Subject: [PATCH 1/2] perf(pdf-server): share cache across server instances and dedupe form parsing In stateless HTTP deployments createServer() is called per request, so the per-instance pdfCache and the 4 MB viewer HTML were discarded after every call. Hoist both to module scope. Also refactor extractFormSchema/extractFormFieldInfo to accept an already-parsed PDFDocumentProxy so display_pdf downloads and parses the PDF once instead of twice. --- examples/pdf-server/server.ts | 203 ++++++++++++++-------------------- 1 file changed, 84 insertions(+), 119 deletions(-) diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts index 80a549953..14fb247c8 100644 --- a/examples/pdf-server/server.ts +++ b/examples/pdf-server/server.ts @@ -34,6 +34,7 @@ import { VerbosityLevel, version as PDFJS_VERSION, } from "pdfjs-dist/legacy/build/pdf.mjs"; +import type { PDFDocumentProxy } from "pdfjs-dist/types/src/display/api.js"; /** * PDF Standard-14 fonts from CDN. Used by both server and viewer so we @@ -943,112 +944,69 @@ interface FormFieldInfo { * from a PDF. Bounding boxes are converted to model coordinates (top-left origin). */ async function extractFormFieldInfo( - url: string, - readRange: ( - url: string, - offset: number, - byteCount: number, - ) => Promise<{ data: Uint8Array; totalBytes: number }>, + pdfDoc: PDFDocumentProxy, ): Promise { - const { totalBytes } = await readRange(url, 0, 1); - const { data } = await readRange(url, 0, totalBytes); - - const loadingTask = getDocument({ - data, - standardFontDataUrl: STANDARD_FONT_DATA_URL, - StandardFontDataFactory: FetchStandardFontDataFactory, - // We only introspect form fields (never render) — silence residual - // warnings like "Unimplemented border style: inset". - verbosity: VerbosityLevel.ERRORS, - }); - const pdfDoc = await loadingTask.promise; - const fields: FormFieldInfo[] = []; - try { - for (let i = 1; i <= pdfDoc.numPages; i++) { - const page = await pdfDoc.getPage(i); - const pageHeight = page.getViewport({ scale: 1.0 }).height; - const annotations = await page.getAnnotations(); - for (const ann of annotations) { - // Only include form widgets (annotationType 20) - if (ann.annotationType !== 20) continue; - if (!ann.rect) continue; - - const fieldName = ann.fieldName || ""; - const fieldType = ann.fieldType || "unknown"; - - // PDF rect is [x1, y1, x2, y2] in bottom-left origin - const x1 = Math.min(ann.rect[0], ann.rect[2]); - const y1 = Math.min(ann.rect[1], ann.rect[3]); - const x2 = Math.max(ann.rect[0], ann.rect[2]); - const y2 = Math.max(ann.rect[1], ann.rect[3]); - const width = x2 - x1; - const height = y2 - y1; - - // Convert to model coords (top-left origin): modelY = pageHeight - pdfY - height - const modelY = pageHeight - y2; - - // Choice widgets (combo/listbox) carry `options` as - // [{exportValue, displayValue}]. Expose export values — that's - // what fill_form needs. - let options: string[] | undefined; - if (Array.isArray(ann.options) && ann.options.length > 0) { - options = ann.options - .map((o: { exportValue?: string }) => o?.exportValue) - .filter((v: unknown): v is string => typeof v === "string"); - } - - fields.push({ - name: fieldName, - type: fieldType, - page: i, - x: Math.round(x1), - y: Math.round(modelY), - width: Math.round(width), - height: Math.round(height), - ...(ann.alternativeText ? { label: ann.alternativeText } : undefined), - // Radio: buttonValue is the per-widget export value — the only - // thing distinguishing three `size [Btn]` lines from each other. - ...(ann.radioButton && ann.buttonValue != null - ? { exportValue: String(ann.buttonValue) } - : undefined), - ...(options?.length ? { options } : undefined), - }); + for (let i = 1; i <= pdfDoc.numPages; i++) { + const page = await pdfDoc.getPage(i); + const pageHeight = page.getViewport({ scale: 1.0 }).height; + const annotations = await page.getAnnotations(); + for (const ann of annotations) { + // Only include form widgets (annotationType 20) + if (ann.annotationType !== 20) continue; + if (!ann.rect) continue; + + const fieldName = ann.fieldName || ""; + const fieldType = ann.fieldType || "unknown"; + + // PDF rect is [x1, y1, x2, y2] in bottom-left origin + const x1 = Math.min(ann.rect[0], ann.rect[2]); + const y1 = Math.min(ann.rect[1], ann.rect[3]); + const x2 = Math.max(ann.rect[0], ann.rect[2]); + const y2 = Math.max(ann.rect[1], ann.rect[3]); + const width = x2 - x1; + const height = y2 - y1; + + // Convert to model coords (top-left origin): modelY = pageHeight - pdfY - height + const modelY = pageHeight - y2; + + // Choice widgets (combo/listbox) carry `options` as + // [{exportValue, displayValue}]. Expose export values — that's + // what fill_form needs. + let options: string[] | undefined; + if (Array.isArray(ann.options) && ann.options.length > 0) { + options = ann.options + .map((o: { exportValue?: string }) => o?.exportValue) + .filter((v: unknown): v is string => typeof v === "string"); } + + fields.push({ + name: fieldName, + type: fieldType, + page: i, + x: Math.round(x1), + y: Math.round(modelY), + width: Math.round(width), + height: Math.round(height), + ...(ann.alternativeText ? { label: ann.alternativeText } : undefined), + // Radio: buttonValue is the per-widget export value — the only + // thing distinguishing three `size [Btn]` lines from each other. + ...(ann.radioButton && ann.buttonValue != null + ? { exportValue: String(ann.buttonValue) } + : undefined), + ...(options?.length ? { options } : undefined), + }); } - } finally { - pdfDoc.destroy(); } return fields; } -async function extractFormSchema( - url: string, - readRange: ( - url: string, - offset: number, - byteCount: number, - ) => Promise<{ data: Uint8Array; totalBytes: number }>, -): Promise<{ +async function extractFormSchema(pdfDoc: PDFDocumentProxy): Promise<{ type: "object"; properties: Record; required?: string[]; } | null> { - // Read full PDF bytes - const { totalBytes } = await readRange(url, 0, 1); - const { data } = await readRange(url, 0, totalBytes); - - const loadingTask = getDocument({ - data, - standardFontDataUrl: STANDARD_FONT_DATA_URL, - StandardFontDataFactory: FetchStandardFontDataFactory, - // We only introspect form fields (never render) — silence residual - // warnings like "Unimplemented border style: inset". - verbosity: VerbosityLevel.ERRORS, - }); - const pdfDoc = await loadingTask.promise; - let fieldObjects: Record | null; try { fieldObjects = (await pdfDoc.getFieldObjects()) as Record< @@ -1056,11 +1014,9 @@ async function extractFormSchema( PdfJsFieldObject[] > | null; } catch { - pdfDoc.destroy(); return null; } if (!fieldObjects || Object.keys(fieldObjects).length === 0) { - pdfDoc.destroy(); return null; } @@ -1131,7 +1087,6 @@ async function extractFormSchema( return /[[\]().]/.test(name) || /^[A-Z0-9_]+$/.test(name); }); - pdfDoc.destroy(); if (Object.keys(properties).length === 0) return null; if (hasMechanicalNames) return null; @@ -1178,6 +1133,12 @@ export interface CreateServerOptions { debug?: boolean; } +// Module-level singletons so they survive across createServer() calls — in +// stateless HTTP deployments a fresh server is created per request, and +// per-instance caches are discarded immediately. +const sharedPdfCache = createPdfCache(); +let cachedAppHtml: string | undefined; + export function createServer(options: CreateServerOptions = {}): McpServer { const { enableInteract = false, useClientRoots = false } = options; const debug = options.debug ?? false; @@ -1197,8 +1158,7 @@ export function createServer(options: CreateServerOptions = {}): McpServer { ); } - // Create session-local cache (isolated per server instance) - const { readPdfRange } = createPdfCache(); + const { readPdfRange } = sharedPdfCache; // Tool: list_pdfs - List available PDFs server.tool( @@ -1484,33 +1444,38 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before } } - // Extract form field schema (used for elicitation and field name validation) + // Extract form field schema + detailed field info from a single + // download/parse pass. let formSchema: Awaited> = null; + let fieldInfo: FormFieldInfo[] = []; try { - formSchema = await extractFormSchema(normalized, readPdfRange); + const { data } = await readPdfRange(normalized, 0, totalBytes); + const pdfDoc = await getDocument({ + data, + standardFontDataUrl: STANDARD_FONT_DATA_URL, + StandardFontDataFactory: FetchStandardFontDataFactory, + verbosity: VerbosityLevel.ERRORS, + }).promise; + try { + formSchema = await extractFormSchema(pdfDoc); + fieldInfo = await extractFormFieldInfo(pdfDoc); + } finally { + pdfDoc.destroy(); + } } catch { - // Non-fatal — PDF may not have form fields + // Non-fatal — PDF may not have form fields or may fail to parse } if (formSchema) { viewFieldNames.set(uuid, new Set(Object.keys(formSchema.properties))); } - - // Extract detailed form field info (page, bounding box, label) - let fieldInfo: FormFieldInfo[] = []; - try { - fieldInfo = await extractFormFieldInfo(normalized, readPdfRange); - if (fieldInfo.length > 0) { - viewFieldInfo.set(uuid, fieldInfo); - // Also populate viewFieldNames from field info if not already set - if (!viewFieldNames.has(uuid)) { - viewFieldNames.set( - uuid, - new Set(fieldInfo.map((f) => f.name).filter(Boolean)), - ); - } + if (fieldInfo.length > 0) { + viewFieldInfo.set(uuid, fieldInfo); + if (!viewFieldNames.has(uuid)) { + viewFieldNames.set( + uuid, + new Set(fieldInfo.map((f) => f.name).filter(Boolean)), + ); } - } catch { - // Non-fatal } // Elicit form field values if requested and client supports it @@ -2833,10 +2798,10 @@ Example — add a signature image and a stamp, then screenshot to verify: RESOURCE_URI, { mimeType: RESOURCE_MIME_TYPE }, async (): Promise => { - const html = await fs.promises.readFile( + const html = (cachedAppHtml ??= await fs.promises.readFile( path.join(DIST_DIR, "mcp-app.html"), "utf-8", - ); + )); return { contents: [ { From 4702eb6e09b9f9495a2967d0a63abce291ca2f7f Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Thu, 23 Apr 2026 15:07:46 +0100 Subject: [PATCH 2/2] perf(pdf-server): cap total cache bytes with LRU eviction The module-level sharedPdfCache could grow unbounded within the 60s lifetime window under a burst of distinct URLs. Track running total bytes and evict least-recently-used entries on insert when it would exceed CACHE_MAX_TOTAL_BYTES (256MB). getCacheEntry now bumps the accessed entry to the end of insertion order so eviction targets the LRU entry rather than the oldest insert. createPdfCache takes an optional maxTotalBytes for testability. --- examples/pdf-server/server.test.ts | 66 ++++++++++++++++++++++++++++++ examples/pdf-server/server.ts | 20 ++++++++- 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/examples/pdf-server/server.test.ts b/examples/pdf-server/server.test.ts index a2f64ee49..f63ae6737 100644 --- a/examples/pdf-server/server.test.ts +++ b/examples/pdf-server/server.test.ts @@ -71,6 +71,72 @@ describe("PDF Cache with Timeouts", () => { }); }); + describe("byte-cap LRU eviction", () => { + const tenBytes = new Uint8Array(10); + + async function fill(cache: PdfCache, url: string): Promise { + const m = spyOn(globalThis, "fetch").mockResolvedValueOnce( + new Response(tenBytes, { status: 200 }), + ); + try { + await cache.readPdfRange(url, 0, 1024); + } finally { + m.mockRestore(); + } + } + + it("evicts least-recently-used entry when total exceeds cap", async () => { + const cache = createPdfCache(25); + try { + await fill(cache, "https://arxiv.org/pdf/a"); + await fill(cache, "https://arxiv.org/pdf/b"); + expect(cache.getCacheSize()).toBe(2); + + // Touch A so B becomes least-recently-used + await cache.readPdfRange("https://arxiv.org/pdf/a", 0, 1); + + // Inserting C (10B) pushes total to 30 > 25 → evict LRU (B) + await fill(cache, "https://arxiv.org/pdf/c"); + expect(cache.getCacheSize()).toBe(2); + + // A and C still served from cache; B re-fetches + const m = spyOn(globalThis, "fetch").mockResolvedValue( + new Response(tenBytes, { status: 200 }), + ); + try { + await cache.readPdfRange("https://arxiv.org/pdf/a", 0, 1); + await cache.readPdfRange("https://arxiv.org/pdf/c", 0, 1); + expect(m).toHaveBeenCalledTimes(0); + await cache.readPdfRange("https://arxiv.org/pdf/b", 0, 1); + expect(m).toHaveBeenCalledTimes(1); + } finally { + m.mockRestore(); + } + } finally { + cache.clearCache(); + } + }); + + it("evicts multiple entries if a single insert exceeds the cap", async () => { + const cache = createPdfCache(25); + try { + await fill(cache, "https://arxiv.org/pdf/a"); + await fill(cache, "https://arxiv.org/pdf/b"); + const big = spyOn(globalThis, "fetch").mockResolvedValueOnce( + new Response(new Uint8Array(20), { status: 200 }), + ); + try { + await cache.readPdfRange("https://arxiv.org/pdf/big", 0, 1024); + } finally { + big.mockRestore(); + } + expect(cache.getCacheSize()).toBe(1); + } finally { + cache.clearCache(); + } + }); + }); + describe("readPdfRange caching behavior", () => { const testUrl = "https://arxiv.org/pdf/test-pdf"; const testData = new Uint8Array([0x25, 0x50, 0x44, 0x46]); // %PDF header diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts index 14fb247c8..062b26301 100644 --- a/examples/pdf-server/server.ts +++ b/examples/pdf-server/server.ts @@ -84,6 +84,9 @@ export const CACHE_MAX_LIFETIME_MS = 60_000; // 60 seconds /** Max size for cached PDFs (defensive limit to prevent memory exhaustion) */ export const CACHE_MAX_PDF_SIZE_BYTES = 50 * 1024 * 1024; // 50MB +/** Max total bytes across all cache entries; oldest evicted first when exceeded. */ +export const CACHE_MAX_TOTAL_BYTES = 256 * 1024 * 1024; // 256MB + /** Allowed local file paths (CLI args + file roots — read access). */ export const allowedLocalFiles = new Set(); @@ -696,8 +699,11 @@ export interface PdfCache { * - CACHE_INACTIVITY_TIMEOUT_MS of no access (resets on each access) * - CACHE_MAX_LIFETIME_MS from creation (absolute timeout) */ -export function createPdfCache(): PdfCache { +export function createPdfCache( + maxTotalBytes: number = CACHE_MAX_TOTAL_BYTES, +): PdfCache { const cache = new Map(); + let totalBytes = 0; /** Delete a cache entry and clear its timers */ function deleteCacheEntry(url: string): void { @@ -705,6 +711,7 @@ export function createPdfCache(): PdfCache { if (entry) { clearTimeout(entry.inactivityTimer); clearTimeout(entry.maxLifetimeTimer); + totalBytes -= entry.data.length; cache.delete(url); } } @@ -720,6 +727,10 @@ export function createPdfCache(): PdfCache { deleteCacheEntry(url); }, CACHE_INACTIVITY_TIMEOUT_MS); + // Move to end of insertion order so size-cap eviction is LRU. + cache.delete(url); + cache.set(url, entry); + return entry.data; } @@ -728,6 +739,12 @@ export function createPdfCache(): PdfCache { // Clear any existing entry first deleteCacheEntry(url); + // Evict least-recently-used entries until under the byte cap. + for (const oldest of cache.keys()) { + if (totalBytes + data.length <= maxTotalBytes) break; + deleteCacheEntry(oldest); + } + const entry: CacheEntry = { data, createdAt: Date.now(), @@ -740,6 +757,7 @@ export function createPdfCache(): PdfCache { }; cache.set(url, entry); + totalBytes += data.length; } /** Slice a cached or freshly-fetched full body to the requested range. */