Skip to content

Commit f51529f

Browse files
committed
refactor(pdf-server): simplify per round-2 review and fix false-pass test
server.ts: extract probeFormFields() so display_pdf calls one helper instead of inlining 40 lines of transport/orFail plumbing; make extractFormSchema's fieldObjects param required (the optional branch only existed for a test helper). server.test.ts: the >1MB integration test now forces the image XObject fetch via getOperatorList() and asserts max requestDataRange span > MAX_CHUNK_BYTES, so it can't go vacuous. Dedupe makeRandomJpeg by importing from the fixture. mcp-app.ts: restore per-annotation try/catch isolation in the lazy baseline scan (a throw was skipping AnnotationLayer.render); only carry forward restoredRemovedIds while the baseline scan is incomplete so a stale id can't pin dirty=true once every page is scanned. tests: bump fixture JPEG to 1.1MB; consolidate 7 e2e tests to 4 (merge overlap into the byte-budget test, merge stall+page-2, drop the error-e2e duplicate of the unit integration test); delete run-fixture.mjs; drop /error.pdf and ERROR_AFTER_BYTES; rename tombstone test's describe and clarify page-2 covers the viewer transport.
1 parent 5caaa37 commit f51529f

6 files changed

Lines changed: 179 additions & 239 deletions

File tree

examples/pdf-server/server.test.ts

Lines changed: 37 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { Client } from "@modelcontextprotocol/sdk/client/index.js";
66
import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js";
77
import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
88
import { PDFDocument } from "pdf-lib";
9+
import { makeRandomJpeg } from "../../tests/helpers/range-counting-server";
910
import {
1011
createPdfCache,
1112
createServer,
@@ -350,69 +351,56 @@ describe("PdfCacheRangeTransport", () => {
350351
// hand pdfjs a single onDataRange(begin, fullBuffer). This test fails if
351352
// deliver() either truncates or calls onDataRange more than once per
352353
// requestDataRange (pdf.mjs _onReceiveData matches by exact begin).
353-
function makeRandomJpeg(len: number): Uint8Array {
354-
const header = Uint8Array.from([
355-
0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
356-
0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, 0xff, 0xc0, 0x00, 0x0b,
357-
0x08, 0x00, 0x08, 0x00, 0x08, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00,
358-
0x14, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
359-
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xda, 0x00, 0x08, 0x01,
360-
0x01, 0x00, 0x00, 0x3f, 0x00,
361-
]);
362-
const scan = new Uint8Array(len);
363-
for (let i = 0; i < len; i++) {
364-
const b = (i * 1103515245 + 12345) & 0xff;
365-
scan[i] = b === 0xff ? 0xfe : b;
366-
}
367-
const eoi = Uint8Array.from([0xff, 0xd9]);
368-
const out = new Uint8Array(header.length + len + 2);
369-
out.set(header, 0);
370-
out.set(scan, header.length);
371-
out.set(eoi, header.length + len);
372-
return out;
373-
}
374-
375354
const d = await PDFDocument.create();
376355
const img = await d.embedJpg(makeRandomJpeg(1_100_000));
377356
const page = d.addPage([612, 792]);
378357
page.drawImage(img, { x: 36, y: 36, width: 540, height: 720 });
379358
const bytes = await d.save();
380359
expect(bytes.length).toBeGreaterThan(2 * MAX_CHUNK_BYTES);
381360

382-
let maxReadLen = 0;
383361
const readClamped: PdfCache["readPdfRange"] = async (_u, off, n) => {
384362
const len = Math.min(n, MAX_CHUNK_BYTES, bytes.length - off);
385-
maxReadLen = Math.max(maxReadLen, len);
386363
return { data: bytes.slice(off, off + len), totalBytes: bytes.length };
387364
};
388-
const transport = new PdfCacheRangeTransport(
365+
// Record the spans pdfjs actually requests so the test fails fast if it
366+
// never asks for >MAX_CHUNK_BYTES (i.e. can't go vacuously green).
367+
const spans: number[] = [];
368+
class RecordingTransport extends PdfCacheRangeTransport {
369+
override requestDataRange(begin: number, end: number): void {
370+
spans.push(end - begin);
371+
super.requestDataRange(begin, end);
372+
}
373+
}
374+
const transport = new RecordingTransport(
389375
"mem://big",
390376
bytes.length,
391377
readClamped,
392378
);
393379

394-
const doc = await Promise.race([
380+
const orHang = <T>(p: Promise<T>, what: string): Promise<T> =>
381+
Promise.race([
382+
p,
383+
transport.failed,
384+
new Promise<never>((_, rej) =>
385+
setTimeout(() => rej(new Error(`${what} hung`)), 5000),
386+
),
387+
]);
388+
389+
const doc = await orHang(
395390
getDocument({
396391
range: transport,
397392
length: bytes.length,
398393
disableAutoFetch: true,
399394
disableStream: true,
400395
rangeChunkSize: 64 * 1024,
401396
}).promise,
402-
transport.failed,
403-
new Promise<never>((_, rej) =>
404-
setTimeout(() => rej(new Error("getDocument hung")), 5000),
405-
),
406-
]);
407-
const p1 = await Promise.race([
408-
doc.getPage(1),
409-
transport.failed,
410-
new Promise<never>((_, rej) =>
411-
setTimeout(() => rej(new Error("getPage hung")), 5000),
412-
),
413-
]);
414-
expect(p1).toBeDefined();
415-
expect(maxReadLen).toBeLessThanOrEqual(MAX_CHUNK_BYTES);
397+
"getDocument",
398+
);
399+
const p1 = await orHang(doc.getPage(1), "getPage");
400+
// getPage() alone doesn't decode the image XObject; getOperatorList() does,
401+
// which is what triggers the >512KB coalesced range request.
402+
await orHang(p1.getOperatorList(), "getOperatorList");
403+
expect(Math.max(...spans)).toBeGreaterThan(MAX_CHUNK_BYTES);
416404
doc.destroy();
417405
});
418406
});
@@ -470,7 +458,10 @@ describe("extractFormSchema field-tree handling", () => {
470458
async function schemaFor(bytes: Uint8Array) {
471459
const doc = await getDocument({ data: bytes }).promise;
472460
try {
473-
return await extractFormSchema(doc);
461+
const fo = (await doc.getFieldObjects()) as Parameters<
462+
typeof extractFormSchema
463+
>[1];
464+
return await extractFormSchema(doc, fo);
474465
} finally {
475466
doc.destroy();
476467
}
@@ -525,11 +516,13 @@ describe("extractFormSchema field-tree handling", () => {
525516
);
526517
const doc = await getDocument({ data: new Uint8Array(bytes) }).promise;
527518
try {
528-
const fo = (await doc.getFieldObjects()) as Record<string, unknown[]>;
519+
const fo = (await doc.getFieldObjects()) as Parameters<
520+
typeof extractFormSchema
521+
>[1];
529522
// Container nodes (no leaf type) should not crash extraction
530-
expect(fo["topmostSubform[0]"]).toBeDefined();
523+
expect(fo!["topmostSubform[0]"]).toBeDefined();
531524
// Schema is null for W-9 (mechanical names), but extraction must not throw
532-
const schema = await extractFormSchema(doc);
525+
const schema = await extractFormSchema(doc, fo);
533526
expect(schema).toBeNull();
534527
} finally {
535528
doc.destroy();

examples/pdf-server/server.ts

Lines changed: 58 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,58 @@ interface FormFieldInfo {
10181018
options?: string[];
10191019
}
10201020

1021+
/**
1022+
* Open `url` via {@link PdfCacheRangeTransport} and return form metadata.
1023+
* Uses `disableAutoFetch` so PDFs without an AcroForm are probed with only
1024+
* the trailer/xref/catalog (~5-25% of bytes); PDFs with forms still walk
1025+
* every page via {@link extractFormFieldInfo} but those are typically small.
1026+
* All errors (including range-fetch failures surfaced via
1027+
* {@link PdfCacheRangeTransport.failed}) resolve to empty results.
1028+
*/
1029+
async function probeFormFields(
1030+
url: string,
1031+
totalBytes: number,
1032+
readPdfRange: PdfCache["readPdfRange"],
1033+
): Promise<{
1034+
formSchema: Awaited<ReturnType<typeof extractFormSchema>>;
1035+
fieldInfo: FormFieldInfo[];
1036+
}> {
1037+
try {
1038+
const transport = new PdfCacheRangeTransport(url, totalBytes, readPdfRange);
1039+
const orFail = <T>(p: Promise<T>): Promise<T> =>
1040+
Promise.race([p, transport.failed]);
1041+
const pdfDoc = await orFail(
1042+
getDocument({
1043+
range: transport,
1044+
length: totalBytes,
1045+
disableAutoFetch: true,
1046+
disableStream: true,
1047+
rangeChunkSize: 64 * 1024,
1048+
standardFontDataUrl: STANDARD_FONT_DATA_URL,
1049+
StandardFontDataFactory: FetchStandardFontDataFactory,
1050+
verbosity: VerbosityLevel.ERRORS,
1051+
}).promise,
1052+
);
1053+
try {
1054+
const fieldObjects = (await orFail(pdfDoc.getFieldObjects())) as Record<
1055+
string,
1056+
PdfJsFieldObject[]
1057+
> | null;
1058+
if (!fieldObjects || Object.keys(fieldObjects).length === 0) {
1059+
return { formSchema: null, fieldInfo: [] };
1060+
}
1061+
return {
1062+
formSchema: await orFail(extractFormSchema(pdfDoc, fieldObjects)),
1063+
fieldInfo: await orFail(extractFormFieldInfo(pdfDoc)),
1064+
};
1065+
} finally {
1066+
pdfDoc.destroy();
1067+
}
1068+
} catch {
1069+
return { formSchema: null, fieldInfo: [] };
1070+
}
1071+
}
1072+
10211073
/**
10221074
* Extract detailed form field info (name, type, page, bounding box, label)
10231075
* from a PDF. Bounding boxes are converted to model coordinates (top-left origin).
@@ -1083,22 +1135,12 @@ async function extractFormFieldInfo(
10831135

10841136
export async function extractFormSchema(
10851137
pdfDoc: PDFDocumentProxy,
1086-
fieldObjects?: Record<string, PdfJsFieldObject[]> | null,
1138+
fieldObjects: Record<string, PdfJsFieldObject[]> | null,
10871139
): Promise<{
10881140
type: "object";
10891141
properties: Record<string, PrimitiveSchemaDefinition>;
10901142
required?: string[];
10911143
} | null> {
1092-
if (fieldObjects === undefined) {
1093-
try {
1094-
fieldObjects = (await pdfDoc.getFieldObjects()) as Record<
1095-
string,
1096-
PdfJsFieldObject[]
1097-
> | null;
1098-
} catch {
1099-
return null;
1100-
}
1101-
}
11021144
if (!fieldObjects || Object.keys(fieldObjects).length === 0) {
11031145
return null;
11041146
}
@@ -1531,46 +1573,11 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before
15311573
}
15321574
}
15331575

1534-
// Extract form field schema + detailed field info via range transport so
1535-
// PDFs without forms only fetch the trailer/xref/catalog (~5% of bytes).
1536-
// PDFs with forms still pull most of the file once getAnnotations walks
1537-
// every page, but those are typically small.
1538-
let formSchema: Awaited<ReturnType<typeof extractFormSchema>> = null;
1539-
let fieldInfo: FormFieldInfo[] = [];
1540-
try {
1541-
const transport = new PdfCacheRangeTransport(
1542-
normalized,
1543-
totalBytes,
1544-
readPdfRange,
1545-
);
1546-
const orFail = <T>(p: Promise<T>): Promise<T> =>
1547-
Promise.race([p, transport.failed]);
1548-
const pdfDoc = await orFail(
1549-
getDocument({
1550-
range: transport,
1551-
length: totalBytes,
1552-
disableAutoFetch: true,
1553-
disableStream: true,
1554-
rangeChunkSize: 64 * 1024,
1555-
standardFontDataUrl: STANDARD_FONT_DATA_URL,
1556-
StandardFontDataFactory: FetchStandardFontDataFactory,
1557-
verbosity: VerbosityLevel.ERRORS,
1558-
}).promise,
1559-
);
1560-
try {
1561-
const fieldObjects = (await orFail(
1562-
pdfDoc.getFieldObjects(),
1563-
)) as Record<string, PdfJsFieldObject[]> | null;
1564-
if (fieldObjects && Object.keys(fieldObjects).length > 0) {
1565-
formSchema = await orFail(extractFormSchema(pdfDoc, fieldObjects));
1566-
fieldInfo = await orFail(extractFormFieldInfo(pdfDoc));
1567-
}
1568-
} finally {
1569-
pdfDoc.destroy();
1570-
}
1571-
} catch {
1572-
// Non-fatal — PDF may not have form fields or may fail to parse
1573-
}
1576+
const { formSchema, fieldInfo } = await probeFormFields(
1577+
normalized,
1578+
totalBytes,
1579+
readPdfRange,
1580+
);
15741581
if (formSchema) {
15751582
viewFieldNames.set(uuid, new Set(Object.keys(formSchema.properties)));
15761583
}

examples/pdf-server/src/mcp-app.ts

Lines changed: 46 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2696,36 +2696,47 @@ function scanPageBaselineAnnotations(
26962696
baselineScannedPages.add(pageNum);
26972697
let imported = 0;
26982698
for (let i = 0; i < annotations.length; i++) {
2699-
const ann = annotations[i] as {
2700-
annotationType?: number;
2701-
subtype?: string;
2702-
name?: string;
2703-
rect?: number[];
2704-
};
2705-
const def = importPdfjsAnnotation(ann, pageNum, i);
2706-
if (def) {
2707-
pdfBaselineAnnotations.push(def);
2708-
imported++;
2709-
if (!annotationMap.has(def.id) && !restoredRemovedIds.has(def.id)) {
2710-
annotationMap.set(def.id, { def, elements: [] });
2699+
// Isolate each annotation: a malformed one must not bubble up to the
2700+
// caller's form-layer try in renderPage() (which would skip
2701+
// AnnotationLayer.render and hide form widgets for the whole page).
2702+
try {
2703+
const ann = annotations[i] as {
2704+
annotationType?: number;
2705+
subtype?: string;
2706+
name?: string;
2707+
rect?: number[];
2708+
};
2709+
const def = importPdfjsAnnotation(ann, pageNum, i);
2710+
if (def) {
2711+
pdfBaselineAnnotations.push(def);
2712+
imported++;
2713+
if (!annotationMap.has(def.id) && !restoredRemovedIds.has(def.id)) {
2714+
annotationMap.set(def.id, { def, elements: [] });
2715+
}
2716+
} else if (ann.annotationType !== 20) {
2717+
// Widget (type 20) is expected to be skipped; anything else we
2718+
// don't import will still be painted by page.render() onto the
2719+
// canvas as unselectable pixels. Log so we can diagnose
2720+
// "ghost annotations" (visible but not in panel, not clickable).
2721+
log.info(
2722+
`[WARN] Baseline: skipped PDF annotation on page ${pageNum}`,
2723+
`type=${ann.annotationType}`,
2724+
`subtype=${ann.subtype ?? "?"}`,
2725+
`name=${ann.name ?? "?"}`,
2726+
`rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`,
2727+
);
27112728
}
2712-
} else if (ann.annotationType !== 20) {
2713-
// Widget (type 20) is expected to be skipped; anything else we
2714-
// don't import will still be painted by page.render() onto the
2715-
// canvas as unselectable pixels. Log so we can diagnose
2716-
// "ghost annotations" (visible but not in panel, not clickable).
2717-
log.info(
2718-
`[WARN] Baseline: skipped PDF annotation on page ${pageNum}`,
2719-
`type=${ann.annotationType}`,
2720-
`subtype=${ann.subtype ?? "?"}`,
2721-
`name=${ann.name ?? "?"}`,
2722-
`rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`,
2723-
);
2729+
} catch (err) {
2730+
log.info(`Baseline: page ${pageNum} annotation import failed`, err);
27242731
}
27252732
}
27262733
if (imported > 0) {
2727-
updateAnnotationsBadge();
2728-
renderAnnotationPanel();
2734+
try {
2735+
updateAnnotationsBadge();
2736+
renderAnnotationPanel();
2737+
} catch (err) {
2738+
log.info(`Baseline: page ${pageNum} panel update failed`, err);
2739+
}
27292740
}
27302741
}
27312742

@@ -2744,12 +2755,15 @@ function persistAnnotations(): void {
27442755

27452756
// computeDiff only sees baseline ids from pages we've already scanned.
27462757
// Carry forward restored tombstones for unvisited pages so the first
2747-
// persist after restore doesn't drop them. Once the page is scanned the id
2748-
// appears in pdfBaselineAnnotations and computeDiff produces it itself,
2749-
// hence the includes() guard.
2750-
for (const id of restoredRemovedIds) {
2751-
if (!annotationMap.has(id) && !diff.removed.includes(id)) {
2752-
diff.removed.push(id);
2758+
// persist after restore doesn't drop them. Once every page is scanned the
2759+
// baseline is complete and computeDiff is authoritative on its own —
2760+
// dropping the carry-forward then also stops a stale id (no longer in the
2761+
// file) from pinning dirty=true forever.
2762+
if (baselineScannedPages.size < totalPages) {
2763+
for (const id of restoredRemovedIds) {
2764+
if (!annotationMap.has(id) && !diff.removed.includes(id)) {
2765+
diff.removed.push(id);
2766+
}
27532767
}
27542768
}
27552769

0 commit comments

Comments
 (0)