modelcontextprotocol
diff --git a/examples/pdf-server/server.test.ts b/examples/pdf-server/server.test.ts
Lines changed: 37 additions & 44 deletions
diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts
Lines changed: 58 additions & 51 deletions
diff --git a/examples/pdf-server/src/mcp-app.ts b/examples/pdf-server/src/mcp-app.ts
Lines changed: 46 additions & 32 deletions
@@ -6,6 +6,7 @@ import { Client } from "@modelcontextprotocol/sdk/client/index.js";
 import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js";
 import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
 import { PDFDocument } from "pdf-lib";
+import { makeRandomJpeg } from "../../tests/helpers/range-counting-server";
 import {
   createPdfCache,
   createServer,
@@ -350,69 +351,56 @@ describe("PdfCacheRangeTransport", () => {
     // hand pdfjs a single onDataRange(begin, fullBuffer). This test fails if
     // deliver() either truncates or calls onDataRange more than once per
     // requestDataRange (pdf.mjs _onReceiveData matches by exact begin).
-    function makeRandomJpeg(len: number): Uint8Array {
-      const header = Uint8Array.from([
-        0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
-        0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0xff, 0xc0, 0x00, 0x0b,
-        0x08, 0x00, 0x08, 0x00, 0x08, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00,
-        0x14, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xda, 0x00, 0x08, 0x01,
-        0x01, 0x00, 0x00, 0x3f, 0x00,
-      ]);
-      const scan = new Uint8Array(len);
-      for (let i = 0; i < len; i++) {
-        const b = (i * 1103515245 + 12345) & 0xff;
-        scan[i] = b === 0xff ? 0xfe : b;
-      }
-      const eoi = Uint8Array.from([0xff, 0xd9]);
-      const out = new Uint8Array(header.length + len + 2);
-      out.set(header, 0);
-      out.set(scan, header.length);
-      out.set(eoi, header.length + len);
-      return out;
-    }
-
     const d = await PDFDocument.create();
     const img = await d.embedJpg(makeRandomJpeg(1_100_000));
     const page = d.addPage([612, 792]);
     page.drawImage(img, { x: 36, y: 36, width: 540, height: 720 });
     const bytes = await d.save();
     expect(bytes.length).toBeGreaterThan(2 * MAX_CHUNK_BYTES);
 
-    let maxReadLen = 0;
     const readClamped: PdfCache["readPdfRange"] = async (_u, off, n) => {
       const len = Math.min(n, MAX_CHUNK_BYTES, bytes.length - off);
-      maxReadLen = Math.max(maxReadLen, len);
       return { data: bytes.slice(off, off + len), totalBytes: bytes.length };
     };
-    const transport = new PdfCacheRangeTransport(
+    // Record the spans pdfjs actually requests so the test fails fast if it
+    // never asks for >MAX_CHUNK_BYTES (i.e. can't go vacuously green).
+    const spans: number[] = [];
+    class RecordingTransport extends PdfCacheRangeTransport {
+      override requestDataRange(begin: number, end: number): void {
+        spans.push(end - begin);
+        super.requestDataRange(begin, end);
+      }
+    }
+    const transport = new RecordingTransport(
       "mem://big",
       bytes.length,
       readClamped,
     );
 
-    const doc = await Promise.race([
+    // Race `p` against a transport failure and a 5 s watchdog so a stalled
+    // pdf.js call fails the test with a named step instead of hanging. The
+    // watchdog is cleared as soon as the race settles; otherwise every call
+    // would leave a live 5 s timer behind after the awaited step finished.
+    const orHang = <T>(p: Promise<T>, what: string): Promise<T> => {
+      let watchdog: ReturnType<typeof setTimeout> | undefined;
+      const timedOut = new Promise<never>((_, rej) => {
+        watchdog = setTimeout(() => rej(new Error(`${what} hung`)), 5000);
+      });
+      return Promise.race([p, transport.failed, timedOut]).finally(() =>
+        clearTimeout(watchdog),
+      );
+    };
+
+    const doc = await orHang(
       getDocument({
         range: transport,
         length: bytes.length,
         disableAutoFetch: true,
         disableStream: true,
         rangeChunkSize: 64 * 1024,
       }).promise,
-      transport.failed,
-      new Promise<never>((_, rej) =>
-        setTimeout(() => rej(new Error("getDocument hung")), 5000),
-      ),
-    ]);
-    const p1 = await Promise.race([
-      doc.getPage(1),
-      transport.failed,
-      new Promise<never>((_, rej) =>
-        setTimeout(() => rej(new Error("getPage hung")), 5000),
-      ),
-    ]);
-    expect(p1).toBeDefined();
-    expect(maxReadLen).toBeLessThanOrEqual(MAX_CHUNK_BYTES);
+      "getDocument",
+    );
+    const p1 = await orHang(doc.getPage(1), "getPage");
+    // getPage() alone doesn't decode the image XObject; getOperatorList() does,
+    // which is what triggers the >512KB coalesced range request.
+    await orHang(p1.getOperatorList(), "getOperatorList");
+    expect(Math.max(...spans)).toBeGreaterThan(MAX_CHUNK_BYTES);
     doc.destroy();
   });
 });
@@ -470,7 +458,10 @@ describe("extractFormSchema field-tree handling", () => {
   async function schemaFor(bytes: Uint8Array) {
     const doc = await getDocument({ data: bytes }).promise;
     try {
-      return await extractFormSchema(doc);
+      const fo = (await doc.getFieldObjects()) as Parameters<
+        typeof extractFormSchema
+      >[1];
+      return await extractFormSchema(doc, fo);
     } finally {
       doc.destroy();
     }
@@ -525,11 +516,13 @@ describe("extractFormSchema field-tree handling", () => {
     );
     const doc = await getDocument({ data: new Uint8Array(bytes) }).promise;
     try {
-      const fo = (await doc.getFieldObjects()) as Record<string, unknown[]>;
+      const fo = (await doc.getFieldObjects()) as Parameters<
+        typeof extractFormSchema
+      >[1];
       // Container nodes (no leaf type) should not crash extraction
-      expect(fo["topmostSubform[0]"]).toBeDefined();
+      expect(fo!["topmostSubform[0]"]).toBeDefined();
       // Schema is null for W-9 (mechanical names), but extraction must not throw
-      const schema = await extractFormSchema(doc);
+      const schema = await extractFormSchema(doc, fo);
       expect(schema).toBeNull();
     } finally {
       doc.destroy();
 
@@ -1018,6 +1018,58 @@ interface FormFieldInfo {
   options?: string[];
 }
 
+/**
+ * Open `url` via {@link PdfCacheRangeTransport} and return form metadata.
+ * Uses `disableAutoFetch` so PDFs without an AcroForm are probed with only
+ * the trailer/xref/catalog (~5-25% of bytes); PDFs with forms still walk
+ * every page via {@link extractFormFieldInfo} but those are typically small.
+ *
+ * Every pdf.js call is raced against {@link PdfCacheRangeTransport.failed}.
+ * All errors (including range-fetch failures surfaced that way) resolve to
+ * empty results, so this function never rejects. The pdf.js loading task is
+ * always destroyed before returning, including when loading itself fails.
+ *
+ * @param url - Document identifier handed to the range transport.
+ * @param totalBytes - Full size of the PDF, passed to pdf.js as `length`.
+ * @param readPdfRange - Range reader given to the transport.
+ * @returns The form schema (or `null`) and per-field info (or `[]`).
+ */
+async function probeFormFields(
+  url: string,
+  totalBytes: number,
+  readPdfRange: PdfCache["readPdfRange"],
+): Promise<{
+  formSchema: Awaited<ReturnType<typeof extractFormSchema>>;
+  fieldInfo: FormFieldInfo[];
+}> {
+  // Hold the loading task rather than only the resolved document: when
+  // `transport.failed` wins the first race (or loading rejects) there is no
+  // PDFDocumentProxy to destroy, and the task would otherwise never be torn
+  // down.
+  let loadingTask: ReturnType<typeof getDocument> | undefined;
+  try {
+    const transport = new PdfCacheRangeTransport(url, totalBytes, readPdfRange);
+    const orFail = <T>(p: Promise<T>): Promise<T> =>
+      Promise.race([p, transport.failed]);
+    loadingTask = getDocument({
+      range: transport,
+      length: totalBytes,
+      disableAutoFetch: true,
+      disableStream: true,
+      rangeChunkSize: 64 * 1024,
+      standardFontDataUrl: STANDARD_FONT_DATA_URL,
+      StandardFontDataFactory: FetchStandardFontDataFactory,
+      verbosity: VerbosityLevel.ERRORS,
+    });
+    const pdfDoc = await orFail(loadingTask.promise);
+    const fieldObjects = (await orFail(pdfDoc.getFieldObjects())) as Record<
+      string,
+      PdfJsFieldObject[]
+    > | null;
+    if (!fieldObjects || Object.keys(fieldObjects).length === 0) {
+      return { formSchema: null, fieldInfo: [] };
+    }
+    // Kept sequential, in the same order as before: schema first, then the
+    // per-field info extraction.
+    const formSchema = await orFail(extractFormSchema(pdfDoc, fieldObjects));
+    const fieldInfo = await orFail(extractFormFieldInfo(pdfDoc));
+    return { formSchema, fieldInfo };
+  } catch {
+    return { formSchema: null, fieldInfo: [] };
+  } finally {
+    // Best-effort teardown. PDFDocumentProxy.destroy() delegates to the
+    // loading task, so this covers the success path as well. A rejected
+    // teardown must not surface as an unhandled rejection.
+    void loadingTask?.destroy().catch(() => {});
+  }
+}
+
 /**
  * Extract detailed form field info (name, type, page, bounding box, label)
  * from a PDF. Bounding boxes are converted to model coordinates (top-left origin).
@@ -1083,22 +1135,12 @@ async function extractFormFieldInfo(
 
 export async function extractFormSchema(
   pdfDoc: PDFDocumentProxy,
-  fieldObjects?: Record<string, PdfJsFieldObject[]> | null,
+  fieldObjects: Record<string, PdfJsFieldObject[]> | null,
 ): Promise<{
   type: "object";
   properties: Record<string, PrimitiveSchemaDefinition>;
   required?: string[];
 } | null> {
-  if (fieldObjects === undefined) {
-    try {
-      fieldObjects = (await pdfDoc.getFieldObjects()) as Record<
-        string,
-        PdfJsFieldObject[]
-      > | null;
-    } catch {
-      return null;
-    }
-  }
   if (!fieldObjects || Object.keys(fieldObjects).length === 0) {
     return null;
   }
@@ -1531,46 +1573,11 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before
         }
       }
 
-      // Extract form field schema + detailed field info via range transport so
-      // PDFs without forms only fetch the trailer/xref/catalog (~5% of bytes).
-      // PDFs with forms still pull most of the file once getAnnotations walks
-      // every page, but those are typically small.
-      let formSchema: Awaited<ReturnType<typeof extractFormSchema>> = null;
-      let fieldInfo: FormFieldInfo[] = [];
-      try {
-        const transport = new PdfCacheRangeTransport(
-          normalized,
-          totalBytes,
-          readPdfRange,
-        );
-        const orFail = <T>(p: Promise<T>): Promise<T> =>
-          Promise.race([p, transport.failed]);
-        const pdfDoc = await orFail(
-          getDocument({
-            range: transport,
-            length: totalBytes,
-            disableAutoFetch: true,
-            disableStream: true,
-            rangeChunkSize: 64 * 1024,
-            standardFontDataUrl: STANDARD_FONT_DATA_URL,
-            StandardFontDataFactory: FetchStandardFontDataFactory,
-            verbosity: VerbosityLevel.ERRORS,
-          }).promise,
-        );
-        try {
-          const fieldObjects = (await orFail(
-            pdfDoc.getFieldObjects(),
-          )) as Record<string, PdfJsFieldObject[]> | null;
-          if (fieldObjects && Object.keys(fieldObjects).length > 0) {
-            formSchema = await orFail(extractFormSchema(pdfDoc, fieldObjects));
-            fieldInfo = await orFail(extractFormFieldInfo(pdfDoc));
-          }
-        } finally {
-          pdfDoc.destroy();
-        }
-      } catch {
-        // Non-fatal — PDF may not have form fields or may fail to parse
-      }
+      const { formSchema, fieldInfo } = await probeFormFields(
+        normalized,
+        totalBytes,
+        readPdfRange,
+      );
       if (formSchema) {
         viewFieldNames.set(uuid, new Set(Object.keys(formSchema.properties)));
       }
 
@@ -2696,36 +2696,47 @@ function scanPageBaselineAnnotations(
   baselineScannedPages.add(pageNum);
   let imported = 0;
   for (let i = 0; i < annotations.length; i++) {
-    const ann = annotations[i] as {
-      annotationType?: number;
-      subtype?: string;
-      name?: string;
-      rect?: number[];
-    };
-    const def = importPdfjsAnnotation(ann, pageNum, i);
-    if (def) {
-      pdfBaselineAnnotations.push(def);
-      imported++;
-      if (!annotationMap.has(def.id) && !restoredRemovedIds.has(def.id)) {
-        annotationMap.set(def.id, { def, elements: [] });
+    // Isolate each annotation: a malformed one must not bubble up to the
+    // caller's form-layer try in renderPage() (which would skip
+    // AnnotationLayer.render and hide form widgets for the whole page).
+    try {
+      const ann = annotations[i] as {
+        annotationType?: number;
+        subtype?: string;
+        name?: string;
+        rect?: number[];
+      };
+      const def = importPdfjsAnnotation(ann, pageNum, i);
+      if (def) {
+        pdfBaselineAnnotations.push(def);
+        imported++;
+        if (!annotationMap.has(def.id) && !restoredRemovedIds.has(def.id)) {
+          annotationMap.set(def.id, { def, elements: [] });
+        }
+      } else if (ann.annotationType !== 20) {
+        // Widget (type 20) is expected to be skipped; anything else we
+        // don't import will still be painted by page.render() onto the
+        // canvas as unselectable pixels. Log so we can diagnose
+        // "ghost annotations" (visible but not in panel, not clickable).
+        log.info(
+          `[WARN] Baseline: skipped PDF annotation on page ${pageNum}`,
+          `type=${ann.annotationType}`,
+          `subtype=${ann.subtype ?? "?"}`,
+          `name=${ann.name ?? "?"}`,
+          `rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`,
+        );
       }
-    } else if (ann.annotationType !== 20) {
-      // Widget (type 20) is expected to be skipped; anything else we
-      // don't import will still be painted by page.render() onto the
-      // canvas as unselectable pixels. Log so we can diagnose
-      // "ghost annotations" (visible but not in panel, not clickable).
-      log.info(
-        `[WARN] Baseline: skipped PDF annotation on page ${pageNum}`,
-        `type=${ann.annotationType}`,
-        `subtype=${ann.subtype ?? "?"}`,
-        `name=${ann.name ?? "?"}`,
-        `rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`,
-      );
+    } catch (err) {
+      log.info(`Baseline: page ${pageNum} annotation import failed`, err);
     }
   }
   if (imported > 0) {
-    updateAnnotationsBadge();
-    renderAnnotationPanel();
+    try {
+      updateAnnotationsBadge();
+      renderAnnotationPanel();
+    } catch (err) {
+      log.info(`Baseline: page ${pageNum} panel update failed`, err);
+    }
   }
 }
 
@@ -2744,12 +2755,15 @@ function persistAnnotations(): void {
 
   // computeDiff only sees baseline ids from pages we've already scanned.
   // Carry forward restored tombstones for unvisited pages so the first
-  // persist after restore doesn't drop them. Once the page is scanned the id
-  // appears in pdfBaselineAnnotations and computeDiff produces it itself,
-  // hence the includes() guard.
-  for (const id of restoredRemovedIds) {
-    if (!annotationMap.has(id) && !diff.removed.includes(id)) {
-      diff.removed.push(id);
+  // persist after restore doesn't drop them. Once every page is scanned the
+  // baseline is complete and computeDiff is authoritative on its own —
+  // dropping the carry-forward then also stops a stale id (no longer in the
+  // file) from pinning dirty=true forever.
+  if (baselineScannedPages.size < totalPages) {
+    for (const id of restoredRemovedIds) {
+      if (!annotationMap.has(id) && !diff.removed.includes(id)) {
+        diff.removed.push(id);
+      }
     }
   }