embedpdf · Phauks · Jun 13, 2026 · Jun 13, 2026
diff --git a/packages/engines/examples/node/sanitize/assert-helpers.mjs b/packages/engines/examples/node/sanitize/assert-helpers.mjs
@@ -0,0 +1,33 @@
+// Shared assertion helpers for the sanitize tests. These re-parse saved output
+// with pdf-lib independently of the engine that produced it, so a "vector gone"
+// assertion is a true second-opinion check, not the engine confirming itself.
+import { PDFDocument, PDFName, PDFDict } from 'pdf-lib';
+
+export async function loadDoc(bytes) {
+  // updateMetadata: false so load() does not itself rewrite the Info dict.
+  const doc = await PDFDocument.load(bytes, { updateMetadata: false });
+  return doc;
+}
+
+export function catalogHas(doc, key) {
+  return doc.catalog.get(PDFName.of(key)) !== undefined;
+}
+
+export function namesSubtree(doc, key) {
+  // lookupMaybe (not lookup) so a missing/removed /Names tree returns undefined
+  // instead of throwing.
+  const names = doc.catalog.lookupMaybe(PDFName.of('Names'), PDFDict);
+  if (!names) return undefined;
+  return names.get(PDFName.of(key));
+}
+
+export function anyPageHasThumb(doc) {
+  return doc.getPages().some((p) => p.node.get(PDFName.of('Thumb')) !== undefined);
+}
+
+export function assert(cond, msg) {
+  if (!cond) {
+    console.error('ASSERT FAILED:', msg);
+    process.exit(1);
+  }
+}
diff --git a/packages/engines/examples/node/sanitize/build-dirty-fixture.mjs b/packages/engines/examples/node/sanitize/build-dirty-fixture.mjs
@@ -0,0 +1,76 @@
+// Builds a deterministic "dirty" PDF carrying every non-content hidden vector the
+// sanitize primitive must reach: Info metadata, an XMP /Metadata stream, document
+// JavaScript (/OpenAction + /Names /JavaScript), a page /Thumb, and an attachment.
+//
+// Run: node build-dirty-fixture.mjs   ->  writes dirty.pdf next to this file.
+//
+// pdf-lib notes: context.obj() turns string VALUES into PDFString, so every
+// name-valued entry (/S, /Type, ...) is wrapped in PDFName.of(). doc.attach()
+// only adds /Names /EmbeddedFiles at save() time, so our own /Names dict with
+// /JavaScript is set after attach() and the two subtrees are merged on save.
+import { writeFileSync } from 'node:fs';
+import { fileURLToPath } from 'node:url';
+import { PDFDocument, PDFName, PDFRawStream, PDFString } from 'pdf-lib';
+
+const doc = await PDFDocument.create();
+const page = doc.addPage([200, 200]);
+page.drawText('Secret 123-45-6789', { x: 20, y: 100, size: 12 });
+
+// (a) Info-dictionary metadata
+doc.setAuthor('Jane Privileged');
+doc.setTitle('PRIVILEGED - draft settlement');
+doc.setProducer('MagnaCartaFixture');
+
+// (b) XMP /Metadata stream on the catalog
+const xmp = `<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
+<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/">
+<dc:creator><rdf:Seq><rdf:li>Jane Privileged</rdf:li></rdf:Seq></dc:creator>
+<dc:title><rdf:Alt><rdf:li xml:lang="x-default">PRIVILEGED - draft settlement</rdf:li></rdf:Alt></dc:title>
+</rdf:Description></rdf:RDF></x:xmpmeta><?xpacket end="w"?>`;
+const xmpBytes = new TextEncoder().encode(xmp);
+const xmpStream = PDFRawStream.of(
+  doc.context.obj({ Type: PDFName.of('Metadata'), Subtype: PDFName.of('XML'), Length: xmpBytes.length }),
+  xmpBytes,
+);
+doc.catalog.set(PDFName.of('Metadata'), doc.context.register(xmpStream));
+
+// (c) Document JavaScript: /OpenAction (a JS action) + /Names /JavaScript name tree
+const jsCode = 'app.alert("phone home");';
+const jsAction = doc.context.obj({
+  Type: PDFName.of('Action'),
+  S: PDFName.of('JavaScript'),
+  JS: PDFString.of(jsCode),
+});
+const jsActionRef = doc.context.register(jsAction); // reused by /OpenAction and the name tree
+doc.catalog.set(PDFName.of('OpenAction'), jsActionRef);
+
+// (d) Page /Thumb: 2x2 8-bit gray image (presence is the test; pixel content is irrelevant)
+const thumbBytes = new Uint8Array([0x00, 0x7f, 0xff, 0x80]);
+const thumb = PDFRawStream.of(
+  doc.context.obj({
+    Type: PDFName.of('XObject'), Subtype: PDFName.of('Image'),
+    Width: 2, Height: 2, ColorSpace: PDFName.of('DeviceGray'),
+    BitsPerComponent: 8, Length: thumbBytes.length,
+  }),
+  thumbBytes,
+);
+page.node.set(PDFName.of('Thumb'), doc.context.register(thumb));
+
+// (e) Attachment (pdf-lib adds it under catalog /Names /EmbeddedFiles when saving)
+await doc.attach(new Uint8Array([1, 2, 3, 4]), 'evidence.bin', {
+  mimeType: 'application/octet-stream',
+  description: 'embedded file',
+});
+
+// (c continued) build our own /Names dict carrying /JavaScript. doc.attach() defers
+// creating the /Names tree until save(), where it merges /EmbeddedFiles INTO this
+// same dict (pdf-lib's embedAll uses lookupMaybe), so both subtrees coexist.
+const jsNameTree = doc.context.obj({ Names: [PDFString.of('MagnaCartaJS'), jsActionRef] });
+const namesDict = doc.context.obj({ JavaScript: doc.context.register(jsNameTree) });
+doc.catalog.set(PDFName.of('Names'), doc.context.register(namesDict));
+
+const bytes = await doc.save({ useObjectStreams: false });
+const out = fileURLToPath(new URL('./dirty.pdf', import.meta.url)); // next to this module, not the cwd
+writeFileSync(out, bytes);
+console.log('wrote', out, bytes.length, 'bytes');
diff --git a/packages/engines/examples/node/sanitize/build-ocg-fixture.mjs b/packages/engines/examples/node/sanitize/build-ocg-fixture.mjs
@@ -0,0 +1,60 @@
+// Builds a PDF with a hidden optional-content (OCG) layer for the OCG-removal
+// spike/follow-up. The page has VISIBLE text plus a marked-content section
+// (/OC /MC0 BDC ... EMC) governed by an OCG whose default config is OFF, so the
+// "HIDDEN-LAYER-SECRET" text is hidden by default yet physically present.
+//
+// The content stream is an unfiltered PDFRawStream, so the BDC/OC operators and
+// the hidden text are visible to a raw byte scan (no inflate needed to verify).
+//
+// Run: node build-ocg-fixture.mjs  ->  writes ocg-dirty.pdf next to this file.
+import { writeFileSync } from 'node:fs';
+import { fileURLToPath } from 'node:url';
+import { PDFDocument, PDFName, PDFString, PDFRawStream } from 'pdf-lib';
+
+const doc = await PDFDocument.create();
+const page = doc.addPage([612, 792]);
+
+// Standard Type1 font (no embedding needed) referenced as /F1.
+const fontRef = doc.context.register(
+  doc.context.obj({ Type: PDFName.of('Font'), Subtype: PDFName.of('Type1'), BaseFont: PDFName.of('Helvetica') }),
+);
+
+// The OCG, plus catalog /OCProperties whose default config /D lists it under /OFF (hidden).
+const ocgRef = doc.context.register(
+  doc.context.obj({ Type: PDFName.of('OCG'), Name: PDFString.of('HiddenLayer') }),
+);
+const ocPropsRef = doc.context.register(
+  doc.context.obj({
+    OCGs: [ocgRef],
+    D: doc.context.obj({ OFF: [ocgRef], ON: [], Order: [ocgRef] }),
+  }),
+);
+doc.catalog.set(PDFName.of('OCProperties'), ocPropsRef);
+
+// Unfiltered content stream (its dict has only /Length): a visible run, then an OCG-marked run.
+const content = `BT /F1 18 Tf 50 700 Td (VISIBLE) Tj ET
+/OC /MC0 BDC
+BT /F1 18 Tf 50 650 Td (HIDDEN-LAYER-SECRET) Tj ET
+EMC
+`;
+const contentBytes = new TextEncoder().encode(content);
+const contentRef = doc.context.register(
+  PDFRawStream.of(doc.context.obj({ Length: contentBytes.length }), contentBytes),
+);
+page.node.set(PDFName.of('Contents'), contentRef);
+
+// Resources: the font as /F1 and the OCG as the /MC0 marked-content property.
+page.node.set(
+  PDFName.of('Resources'),
+  doc.context.register(
+    doc.context.obj({
+      Font: doc.context.obj({ F1: fontRef }),
+      Properties: doc.context.obj({ MC0: ocgRef }),
+    }),
+  ),
+);
+
+const bytes = await doc.save({ useObjectStreams: false });
+const out = fileURLToPath(new URL('./ocg-dirty.pdf', import.meta.url)); // next to this module, not the cwd
+writeFileSync(out, bytes);
+console.log('wrote', out, bytes.length, 'bytes');
diff --git a/packages/engines/examples/node/sanitize/engine-setup.mjs b/packages/engines/examples/node/sanitize/engine-setup.mjs
@@ -0,0 +1,20 @@
+// Shared headless engine setup for the sanitize tests (sharp-free, quiet logger).
+import { readFile } from 'node:fs/promises';
+import { fileURLToPath } from 'node:url';
+
+import { init } from '@embedpdf/pdfium';
+import { PdfiumNative, PdfEngine } from '@embedpdf/engines/pdfium';
+import { NoopLogger } from '@embedpdf/models';
+
+export async function makeEngine() {
+  const logger = new NoopLogger();
+  const pdfiumModule = await init();
+  const native = new PdfiumNative(pdfiumModule, { logger });
+  // No imageConverter: sanitize/metadata/save ops never render.
+  return new PdfEngine(native, { logger });
+}
+
+export async function openDirty(engine, id = 'dirty') {
+  const content = await readFile(fileURLToPath(new URL('./dirty.pdf', import.meta.url)));
+  return engine.openDocumentBuffer({ id, content }).toPromise();
+}
diff --git a/packages/engines/examples/node/sanitize/test-remove-ocg.mjs b/packages/engines/examples/node/sanitize/test-remove-ocg.mjs
@@ -0,0 +1,37 @@
+// Hidden optional-content (OCG) layer: its content must be physically removed
+// (not merely hidden), and /OCProperties dropped, while visible content stays.
+// Text extraction reads hidden-layer text too, so it is a faithful oracle:
+// before the scrub it would include HIDDEN-LAYER-SECRET; after, only VISIBLE.
+import { readFile } from 'node:fs/promises';
+import { fileURLToPath } from 'node:url';
+import { makeEngine } from './engine-setup.mjs';
+import { loadDoc, catalogHas, assert } from './assert-helpers.mjs';
+
+const engine = await makeEngine();
+const content = await readFile(fileURLToPath(new URL('./ocg-dirty.pdf', import.meta.url)));
+const doc = await engine.openDocumentBuffer({ id: 'ocg', content }).toPromise();
+
+const out = await engine
+  .sanitizeDocument(doc, {
+    xmp: false,
+    javascript: false,
+    embeddedThumbnails: false,
+    attachments: false,
+    optionalContentGroups: true,
+  })
+  .toPromise();
+
+// /OCProperties removed (independent pdf-lib re-parse).
+const parsed = await loadDoc(out);
+assert(!catalogHas(parsed, 'OCProperties'), '/OCProperties removed');
+
+// Hidden-layer content physically gone; visible content preserved.
+const doc2 = await engine.openDocumentBuffer({ id: 'ocg-check', content: out }).toPromise();
+const text = await engine.extractText(doc2, [0]).toPromise();
+assert(!text.includes('HIDDEN-LAYER-SECRET'), `hidden OCG content removed (text=${JSON.stringify(text)})`);
+assert(text.includes('VISIBLE'), `visible content preserved (text=${JSON.stringify(text)})`);
+
+await engine.closeDocument(doc).toPromise();
+await engine.closeDocument(doc2).toPromise();
+console.log('PASS test-remove-ocg: hidden-layer content + /OCProperties removed, visible kept');
+process.exit(0);
diff --git a/packages/engines/examples/node/sanitize/test-sanitize-document.mjs b/packages/engines/examples/node/sanitize/test-sanitize-document.mjs
@@ -0,0 +1,33 @@
+// Full scrub: sanitizeDocument() with defaults must remove every hidden vector
+// and emit a single-revision (non-incremental) document. Re-parses the output
+// independently with pdf-lib (and the engine for attachments).
+import { makeEngine, openDirty } from './engine-setup.mjs';
+import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';
+
+const engine = await makeEngine();
+const doc = await openDirty(engine);
+
+const out = await engine.sanitizeDocument(doc).toPromise(); // all vectors default true
+
+// Catalog-level vectors, via an independent pdf-lib re-parse.
+const parsed = await loadDoc(out);
+assert(!catalogHas(parsed, 'Metadata'), 'XMP /Metadata removed');
+assert(!catalogHas(parsed, 'OpenAction'), '/OpenAction removed');
+assert(!catalogHas(parsed, 'AA'), 'catalog /AA removed');
+assert(namesSubtree(parsed, 'JavaScript') === undefined, '/Names /JavaScript removed');
+assert(!anyPageHasThumb(parsed), 'page /Thumb removed');
+
+// Attachments, via the engine's own reader on the re-opened output.
+const doc2 = await engine.openDocumentBuffer({ id: 'check', content: out }).toPromise();
+const attachments = await engine.getAttachments(doc2).toPromise();
+assert(attachments.length === 0, `attachments removed (got ${attachments.length})`);
+
+// Non-incremental: a full rewrite has exactly one %%EOF (no retained prior revision).
+const text = Buffer.from(out).toString('latin1');
+const eofCount = (text.match(/%%EOF/g) || []).length;
+assert(eofCount === 1, `single-revision output expected (one %%EOF), got ${eofCount}`);
+
+await engine.closeDocument(doc).toPromise();
+await engine.closeDocument(doc2).toPromise();
+console.log('PASS test-sanitize-document: all vectors scrubbed, single-revision output');
+process.exit(0);
diff --git a/packages/engines/examples/node/sanitize/test-vector-isolation.mjs b/packages/engines/examples/node/sanitize/test-vector-isolation.mjs
@@ -0,0 +1,50 @@
+// Each vector flag must remove only its own vector and leave the others intact —
+// proving the EPDF_* exports are independently effective and correctly scoped.
+import { makeEngine, openDirty } from './engine-setup.mjs';
+import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';
+
+const engine = await makeEngine();
+
+// XMP only.
+{
+  const doc = await openDirty(engine, 'xmp');
+  const out = await engine
+    .sanitizeDocument(doc, { xmp: true, javascript: false, embeddedThumbnails: false, attachments: false })
+    .toPromise();
+  const p = await loadDoc(out);
+  assert(!catalogHas(p, 'Metadata'), 'xmp-only: XMP /Metadata removed');
+  assert(catalogHas(p, 'OpenAction'), 'xmp-only: JS /OpenAction preserved');
+  assert(anyPageHasThumb(p), 'xmp-only: /Thumb preserved');
+  await engine.closeDocument(doc).toPromise();
+}
+
+// JavaScript only.
+{
+  const doc = await openDirty(engine, 'js');
+  const out = await engine
+    .sanitizeDocument(doc, { xmp: false, javascript: true, embeddedThumbnails: false, attachments: false })
+    .toPromise();
+  const p = await loadDoc(out);
+  assert(!catalogHas(p, 'OpenAction'), 'js-only: /OpenAction removed');
+  assert(!catalogHas(p, 'AA'), 'js-only: /AA removed');
+  assert(namesSubtree(p, 'JavaScript') === undefined, 'js-only: /Names /JavaScript removed');
+  assert(catalogHas(p, 'Metadata'), 'js-only: XMP preserved');
+  assert(anyPageHasThumb(p), 'js-only: /Thumb preserved');
+  await engine.closeDocument(doc).toPromise();
+}
+
+// Embedded thumbnails only.
+{
+  const doc = await openDirty(engine, 'thumb');
+  const out = await engine
+    .sanitizeDocument(doc, { xmp: false, javascript: false, embeddedThumbnails: true, attachments: false })
+    .toPromise();
+  const p = await loadDoc(out);
+  assert(!anyPageHasThumb(p), 'thumb-only: /Thumb removed');
+  assert(catalogHas(p, 'Metadata'), 'thumb-only: XMP preserved');
+  assert(catalogHas(p, 'OpenAction'), 'thumb-only: /OpenAction preserved');
+  await engine.closeDocument(doc).toPromise();
+}
+
+console.log('PASS test-vector-isolation: each vector removed independently, others preserved');
+process.exit(0);
diff --git a/packages/engines/examples/node/sanitize/verify-dirty-fixture.mjs b/packages/engines/examples/node/sanitize/verify-dirty-fixture.mjs
@@ -0,0 +1,13 @@
+import { readFileSync } from 'node:fs';
+import { fileURLToPath } from 'node:url';
+import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';
+const bytes = readFileSync(fileURLToPath(new URL('./dirty.pdf', import.meta.url)));
+const doc = await loadDoc(bytes);
+assert(catalogHas(doc, 'Metadata'), 'XMP /Metadata present');
+assert(catalogHas(doc, 'OpenAction'), '/OpenAction present');
+assert(namesSubtree(doc, 'JavaScript') !== undefined, '/Names /JavaScript present');
+assert(namesSubtree(doc, 'EmbeddedFiles') !== undefined, '/Names /EmbeddedFiles present');
+assert(anyPageHasThumb(doc), 'page /Thumb present');
+const info = doc.getAuthor();
+assert(info === 'Jane Privileged', 'Info author present, got: ' + info);
+console.log('FIXTURE OK: all vectors present (XMP, OpenAction, JS name tree, EmbeddedFiles, Thumb, Info author)');
diff --git a/packages/engines/package.json b/packages/engines/package.json
@@ -95,6 +95,7 @@
     "@types/jest": "^29.5.14",
     "@types/react": "^18.2.0",
     "jest": "^29.7.0",
+    "pdf-lib": "^1.17.1",
     "ts-jest": "^29.4.6",
     "typescript": "^5.0.0"
   },

diff --git a/packages/engines/src/lib/orchestrator/pdf-engine.ts b/packages/engines/src/lib/orchestrator/pdf-engine.ts
@@ -4,6 +4,7 @@ import {
   NoopLogger,
   PdfEngine as IPdfEngine,
   PdfDocumentObject,
+  SanitizeOptions,
   PdfPageObject,
   PdfTask,
   PdfErrorReason,
@@ -1144,6 +1145,16 @@ export class PdfEngine<T = Blob> implements IPdfEngine<T> {
     );
   }
 
+  sanitizeDocument(doc: PdfDocumentObject, options?: SanitizeOptions): PdfTask<ArrayBuffer> {
+    return this.workerQueue.enqueue(
+      {
+        execute: () => this.executor.sanitizeDocument(doc, options), // options may be undefined
+        meta: { docId: doc.id, operation: 'sanitizeDocument' },
+      },
+      { priority: Priority.MEDIUM },
+    );
+  }
+
   closeDocument(doc: PdfDocumentObject): PdfTask<boolean> {
     return this.workerQueue.enqueue(
       {

diff --git a/packages/engines/src/lib/orchestrator/remote-executor.ts b/packages/engines/src/lib/orchestrator/remote-executor.ts
@@ -3,6 +3,7 @@ import {
   Logger,
   NoopLogger,
   PdfDocumentObject,
+  SanitizeOptions,
   PdfPageObject,
   PdfTask,
   PdfErrorReason,
@@ -128,6 +129,7 @@ type MessageType =
   | 'mergePages'
   | 'preparePrintDocument'
   | 'saveAsCopy'
+  | 'sanitizeDocument'
   | 'closeDocument'
   | 'closeAllDocuments'
   | 'setDocumentEncryption'
@@ -656,6 +658,10 @@ export class RemoteExecutor implements IPdfiumExecutor {
     return this.send<ArrayBuffer>('saveAsCopy', [doc]);
   }
 
+  sanitizeDocument(doc: PdfDocumentObject, options?: SanitizeOptions): PdfTask<ArrayBuffer> {
+    return this.send<ArrayBuffer>('sanitizeDocument', [doc, options]); // options may be undefined
+  }
+
   closeDocument(doc: PdfDocumentObject): PdfTask<boolean> {
     return this.send<boolean>('closeDocument', [doc]);
   }