Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions packages/engines/examples/node/sanitize/assert-helpers.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Shared assertion helpers for the sanitize tests. These re-parse saved output
// with pdf-lib independently of the engine that produced it, so a "vector gone"
// assertion is a true second-opinion check, not the engine confirming itself.
import { PDFDocument, PDFName, PDFDict } from 'pdf-lib';

/**
 * Parse saved PDF bytes with pdf-lib so assertions can inspect them
 * independently of the engine that produced them.
 *
 * @param bytes PDF file contents.
 * @returns a promise for the parsed pdf-lib document.
 */
export async function loadDoc(bytes) {
  // updateMetadata: false so load() does not itself rewrite the Info dict.
  const loadOptions = { updateMetadata: false };
  return PDFDocument.load(bytes, loadOptions);
}

/**
 * Report whether the document catalog carries an entry named `key`.
 *
 * @param doc parsed pdf-lib document.
 * @param key catalog entry name without the leading slash (e.g. 'Metadata').
 * @returns true when the catalog lookup yields a value, false otherwise.
 */
export function catalogHas(doc, key) {
  const entry = doc.catalog.get(PDFName.of(key));
  return entry !== undefined;
}

/**
 * Fetch one subtree (e.g. 'JavaScript', 'EmbeddedFiles') from the catalog's
 * /Names dictionary.
 *
 * @param doc parsed pdf-lib document.
 * @param key subtree name without the leading slash.
 * @returns the entry stored under `key`, or undefined when either the /Names
 *          dictionary or the entry is absent.
 */
export function namesSubtree(doc, key) {
  // lookupMaybe rather than lookup: an absent /Names dictionary yields
  // undefined instead of an exception.
  const namesDict = doc.catalog.lookupMaybe(PDFName.of('Names'), PDFDict);
  return namesDict ? namesDict.get(PDFName.of(key)) : undefined;
}

/**
 * Report whether at least one page dictionary still has a /Thumb entry.
 *
 * @param doc parsed pdf-lib document.
 * @returns true as soon as a page with /Thumb is found, false if none has one.
 */
export function anyPageHasThumb(doc) {
  for (const page of doc.getPages()) {
    const thumbEntry = page.node.get(PDFName.of('Thumb'));
    if (thumbEntry !== undefined) {
      return true;
    }
  }
  return false;
}

/**
 * Script-level assertion: when `cond` is falsy, log the message to stderr and
 * terminate the process with exit code 1. Returns normally otherwise.
 *
 * @param cond condition that is expected to be truthy.
 * @param msg  description printed after the 'ASSERT FAILED:' prefix.
 */
export function assert(cond, msg) {
  if (cond) return;
  console.error('ASSERT FAILED:', msg);
  process.exit(1);
}
76 changes: 76 additions & 0 deletions packages/engines/examples/node/sanitize/build-dirty-fixture.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Builds a deterministic "dirty" PDF carrying every non-content hidden vector the
// sanitize primitive must reach: Info metadata, an XMP /Metadata stream, document
// JavaScript (/OpenAction + /Names /JavaScript), a page /Thumb, and an attachment.
//
// Run: node build-dirty-fixture.mjs -> writes dirty.pdf next to this file.
//
// pdf-lib notes: context.obj() turns string VALUES into PDFString, so every
// name-valued entry (/S, /Type, ...) is wrapped in PDFName.of(). doc.attach()
// does not create the catalog /Names tree until save(), so the /Names dict
// carrying /JavaScript is set AFTER attaching and /EmbeddedFiles joins it on save.
import { writeFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { PDFDocument, PDFName, PDFRawStream, PDFString } from 'pdf-lib';

// Minimal one-page document with visible text, so the fixture has ordinary page
// content alongside the hidden vectors added below.
const doc = await PDFDocument.create();
const page = doc.addPage([200, 200]);
page.drawText('Secret 123-45-6789', { x: 20, y: 100, size: 12 });

// (a) Info-dictionary metadata. verify-dirty-fixture.mjs checks the author value.
doc.setAuthor('Jane Privileged');
doc.setTitle('PRIVILEGED - draft settlement');
doc.setProducer('MagnaCartaFixture');

// (b) XMP /Metadata stream on the catalog. The packet repeats the same creator
// and title that were written to the Info dictionary above.
const xmp = `<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:creator><rdf:Seq><rdf:li>Jane Privileged</rdf:li></rdf:Seq></dc:creator>
<dc:title><rdf:Alt><rdf:li xml:lang="x-default">PRIVILEGED - draft settlement</rdf:li></rdf:Alt></dc:title>
</rdf:Description></rdf:RDF></x:xmpmeta><?xpacket end="w"?>`;
// The packet is wrapped in a raw stream whose /Length is the encoded byte
// count; no /Filter entry is set on the stream dictionary.
const xmpBytes = new TextEncoder().encode(xmp);
const xmpStream = PDFRawStream.of(
doc.context.obj({ Type: PDFName.of('Metadata'), Subtype: PDFName.of('XML'), Length: xmpBytes.length }),
xmpBytes,
);
// Register the stream as an indirect object and point the catalog at it.
doc.catalog.set(PDFName.of('Metadata'), doc.context.register(xmpStream));

// (c) Document JavaScript: /OpenAction (a JS action) + /Names /JavaScript name tree
const jsCode = 'app.alert("phone home");';
// Name-valued entries go through PDFName.of(); the script text is a PDFString.
const jsAction = doc.context.obj({
Type: PDFName.of('Action'),
S: PDFName.of('JavaScript'),
JS: PDFString.of(jsCode),
});
// Keep the ref: it is used for /OpenAction here and again later in this script
// as the value in the /Names /JavaScript name tree.
const jsActionRef = doc.context.register(jsAction);
doc.catalog.set(PDFName.of('OpenAction'), jsActionRef);

// (d) Page /Thumb (presence is the test; pixel content is irrelevant)
// Four bytes = 2x2 pixels at 8 bits per component in DeviceGray, matching the
// Width/Height/BitsPerComponent/ColorSpace entries declared below.
const thumbBytes = new Uint8Array([0x00, 0x7f, 0xff, 0x80]);
const thumb = PDFRawStream.of(
doc.context.obj({
Type: PDFName.of('XObject'), Subtype: PDFName.of('Image'),
Width: 2, Height: 2, ColorSpace: PDFName.of('DeviceGray'),
BitsPerComponent: 8, Length: thumbBytes.length,
}),
thumbBytes,
);
// Attach the thumbnail stream to the page dictionary under /Thumb.
page.node.set(PDFName.of('Thumb'), doc.context.register(thumb));

// (e) Attachment (pdf-lib adds catalog /Names /EmbeddedFiles when the document
// is saved; see the note on the /Names dict below)
await doc.attach(new Uint8Array([1, 2, 3, 4]), 'evidence.bin', {
mimeType: 'application/octet-stream',
description: 'embedded file',
});

// (c continued) build our own /Names dict carrying /JavaScript. doc.attach() defers
// creating the /Names tree until save(), where it merges /EmbeddedFiles INTO this
// same dict (pdf-lib's embedAll uses lookupMaybe), so both subtrees coexist.
// The name tree has a single entry: the key 'MagnaCartaJS' mapped to the same
// JS action that /OpenAction references.
const jsNameTree = doc.context.obj({ Names: [PDFString.of('MagnaCartaJS'), jsActionRef] });
const namesDict = doc.context.obj({ JavaScript: doc.context.register(jsNameTree) });
doc.catalog.set(PDFName.of('Names'), doc.context.register(namesDict));

// Serialize without object streams and write dirty.pdf next to this script.
// NOTE(review): presumably object streams are disabled so individual objects
// stay visible in the raw file — confirm the intent.
const bytes = await doc.save({ useObjectStreams: false });
const out = fileURLToPath(new URL('./dirty.pdf', import.meta.url));
writeFileSync(out, bytes);
console.log('wrote', out, bytes.length, 'bytes');
60 changes: 60 additions & 0 deletions packages/engines/examples/node/sanitize/build-ocg-fixture.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Builds a PDF with a hidden optional-content (OCG) layer for the OCG-removal
// spike/follow-up. The page has VISIBLE text plus a marked-content section
// (/OC /MC0 BDC ... EMC) governed by an OCG whose default config is OFF, so the
// "HIDDEN-LAYER-SECRET" text is hidden by default yet physically present.
//
// The content stream is an unfiltered PDFRawStream, so the BDC/OC operators and
// the hidden text are visible to a raw byte scan (no inflate needed to verify).
//
// Run: node build-ocg-fixture.mjs -> writes ocg-dirty.pdf next to this file.
import { writeFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { PDFDocument, PDFName, PDFString, PDFRawStream } from 'pdf-lib';

// One page; its content stream and resources are replaced by hand further down.
const doc = await PDFDocument.create();
const page = doc.addPage([612, 792]);

// Standard Type1 font (no embedding needed) referenced as /F1.
const fontRef = doc.context.register(
doc.context.obj({ Type: PDFName.of('Font'), Subtype: PDFName.of('Type1'), BaseFont: PDFName.of('Helvetica') }),
);

// The OCG and the catalog /OCProperties with the layer defaulted OFF (hidden).
const ocgRef = doc.context.register(
doc.context.obj({ Type: PDFName.of('OCG'), Name: PDFString.of('HiddenLayer') }),
);
// /D is the default configuration: the single OCG appears in /OFF (and in
// /Order), while /ON is left empty.
const ocPropsRef = doc.context.register(
doc.context.obj({
OCGs: [ocgRef],
D: doc.context.obj({ OFF: [ocgRef], ON: [], Order: [ocgRef] }),
}),
);
doc.catalog.set(PDFName.of('OCProperties'), ocPropsRef);

// Unfiltered content stream: a visible run, then a hidden OCG-marked run.
// The /MC0 name in the BDC operator is resolved through the page's
// /Resources /Properties dictionary, which is set below to point at the OCG.
const content = `BT /F1 18 Tf 50 700 Td (VISIBLE) Tj ET
/OC /MC0 BDC
BT /F1 18 Tf 50 650 Td (HIDDEN-LAYER-SECRET) Tj ET
EMC
`;
const contentBytes = new TextEncoder().encode(content);
const contentRef = doc.context.register(
PDFRawStream.of(doc.context.obj({ Length: contentBytes.length }), contentBytes),
);
// Replace the page's /Contents with the hand-written stream.
page.node.set(PDFName.of('Contents'), contentRef);

// Resources: the font as /F1 and the OCG as the /MC0 marked-content property.
page.node.set(
PDFName.of('Resources'),
doc.context.register(
doc.context.obj({
Font: doc.context.obj({ F1: fontRef }),
Properties: doc.context.obj({ MC0: ocgRef }),
}),
),
);

// Serialize without object streams and write ocg-dirty.pdf next to this script.
const bytes = await doc.save({ useObjectStreams: false });
const out = fileURLToPath(new URL('./ocg-dirty.pdf', import.meta.url));
writeFileSync(out, bytes);
console.log('wrote', out, bytes.length, 'bytes');
20 changes: 20 additions & 0 deletions packages/engines/examples/node/sanitize/engine-setup.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Shared headless engine setup for the sanitize tests (sharp-free, quiet logger).
import { readFile } from 'node:fs/promises';
import { fileURLToPath } from 'node:url';

import { init } from '@embedpdf/pdfium';
import { PdfiumNative, PdfEngine } from '@embedpdf/engines/pdfium';
import { NoopLogger } from '@embedpdf/models';

/**
 * Build a headless PdfEngine on top of a freshly initialised PDFium module,
 * with a NoopLogger handed to both the native layer and the engine.
 *
 * @returns a promise for the ready-to-use engine.
 */
export async function makeEngine() {
  const quietLogger = new NoopLogger();
  const wasmModule = await init();
  const nativeLayer = new PdfiumNative(wasmModule, { logger: quietLogger });
  // No imageConverter: sanitize/metadata/save ops never render.
  const engine = new PdfEngine(nativeLayer, { logger: quietLogger });
  return engine;
}

/**
 * Read dirty.pdf (located next to this module) and open it in the engine.
 *
 * @param engine engine exposing openDocumentBuffer().
 * @param id     document id to register the buffer under; defaults to 'dirty'.
 * @returns a promise for the opened document object.
 */
export async function openDirty(engine, id = 'dirty') {
  const fixturePath = fileURLToPath(new URL('./dirty.pdf', import.meta.url));
  const content = await readFile(fixturePath);
  const openTask = engine.openDocumentBuffer({ id, content });
  return openTask.toPromise();
}
37 changes: 37 additions & 0 deletions packages/engines/examples/node/sanitize/test-remove-ocg.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Hidden optional-content (OCG) layer: its content must be physically removed
// (not merely hidden), and /OCProperties dropped, while visible content stays.
// Text extraction reads hidden-layer text too, so it is a faithful oracle:
// before the scrub it would include HIDDEN-LAYER-SECRET; after, only VISIBLE.
import { readFile } from 'node:fs/promises';
import { fileURLToPath } from 'node:url';
import { makeEngine } from './engine-setup.mjs';
import { loadDoc, catalogHas, assert } from './assert-helpers.mjs';

// Open the OCG fixture (ocg-dirty.pdf, next to this script) in a headless engine.
const engine = await makeEngine();
const content = await readFile(fileURLToPath(new URL('./ocg-dirty.pdf', import.meta.url)));
const doc = await engine.openDocumentBuffer({ id: 'ocg', content }).toPromise();

// Oracle sanity check: the "hidden text is gone" assertion below only means
// something if text extraction can see the hidden-layer text in the first
// place. Verify that on the untouched fixture, so the test cannot pass
// vacuously when extraction skips hidden layers or the fixture is wrong.
const textBefore = await engine.extractText(doc, [0]).toPromise();
assert(
  textBefore.includes('HIDDEN-LAYER-SECRET'),
  `fixture sanity: hidden OCG text extractable before scrub (text=${JSON.stringify(textBefore)})`,
);

// Scrub optional content only; every other vector is explicitly switched off.
const out = await engine
  .sanitizeDocument(doc, {
    xmp: false,
    javascript: false,
    embeddedThumbnails: false,
    attachments: false,
    optionalContentGroups: true,
  })
  .toPromise();

// /OCProperties removed (independent pdf-lib re-parse).
const parsed = await loadDoc(out);
assert(!catalogHas(parsed, 'OCProperties'), '/OCProperties removed');

// Hidden-layer content physically gone; visible content preserved.
const doc2 = await engine.openDocumentBuffer({ id: 'ocg-check', content: out }).toPromise();
const text = await engine.extractText(doc2, [0]).toPromise();
assert(!text.includes('HIDDEN-LAYER-SECRET'), `hidden OCG content removed (text=${JSON.stringify(text)})`);
assert(text.includes('VISIBLE'), `visible content preserved (text=${JSON.stringify(text)})`);

// Release both documents, report success, and exit explicitly with status 0.
await engine.closeDocument(doc).toPromise();
await engine.closeDocument(doc2).toPromise();
console.log('PASS test-remove-ocg: hidden-layer content + /OCProperties removed, visible kept');
process.exit(0);
33 changes: 33 additions & 0 deletions packages/engines/examples/node/sanitize/test-sanitize-document.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Full scrub: sanitizeDocument() with defaults must remove every hidden vector
// and emit a single-revision (non-incremental) document. Re-parses the output
// independently with pdf-lib (and the engine for attachments).
import { makeEngine, openDirty } from './engine-setup.mjs';
import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';

// Open the dirty fixture and run the scrub with no options supplied.
const engine = await makeEngine();
const dirtyDoc = await openDirty(engine);

// No options object: all vectors default true.
const scrubbed = await engine.sanitizeDocument(dirtyDoc).toPromise();

// Second opinion on catalog- and page-level vectors from a pdf-lib re-parse.
const reparsed = await loadDoc(scrubbed);
assert(catalogHas(reparsed, 'Metadata') === false, 'XMP /Metadata removed');
assert(catalogHas(reparsed, 'OpenAction') === false, '/OpenAction removed');
assert(catalogHas(reparsed, 'AA') === false, 'catalog /AA removed');
assert(namesSubtree(reparsed, 'JavaScript') === undefined, '/Names /JavaScript removed');
assert(anyPageHasThumb(reparsed) === false, 'page /Thumb removed');

// Attachments are checked with the engine's own reader on the re-opened bytes.
const reopened = await engine.openDocumentBuffer({ id: 'check', content: scrubbed }).toPromise();
const remaining = await engine.getAttachments(reopened).toPromise();
assert(remaining.length === 0, `attachments removed (got ${remaining.length})`);

// A full (non-incremental) rewrite carries exactly one %%EOF marker; count the
// markers in a byte-preserving latin1 view of the output.
const rawOutput = Buffer.from(scrubbed).toString('latin1');
const eofMarkers = rawOutput.split('%%EOF').length - 1;
assert(eofMarkers === 1, `single-revision output expected (one %%EOF), got ${eofMarkers}`);

// Release both documents, report success, and exit explicitly with status 0.
await engine.closeDocument(dirtyDoc).toPromise();
await engine.closeDocument(reopened).toPromise();
console.log('PASS test-sanitize-document: all vectors scrubbed, single-revision output');
process.exit(0);
50 changes: 50 additions & 0 deletions packages/engines/examples/node/sanitize/test-vector-isolation.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Each vector flag must remove only its own vector and leave the others intact —
// proving the EPDF_* exports are independently effective and correctly scoped.
import { makeEngine, openDirty } from './engine-setup.mjs';
import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';

const engine = await makeEngine();

// Every vector switched off, including optionalContentGroups. Omitted flags
// default to true, so leaving one out would mean a run is not single-vector.
const ALL_VECTORS_OFF = {
  xmp: false,
  javascript: false,
  embeddedThumbnails: false,
  attachments: false,
  optionalContentGroups: false,
};

// Options that enable exactly one vector and disable all the others.
const only = (vector) => ({ ...ALL_VECTORS_OFF, [vector]: true });

// XMP only.
{
  const doc = await openDirty(engine, 'xmp');
  const out = await engine.sanitizeDocument(doc, only('xmp')).toPromise();
  const p = await loadDoc(out);
  assert(!catalogHas(p, 'Metadata'), 'xmp-only: XMP /Metadata removed');
  assert(catalogHas(p, 'OpenAction'), 'xmp-only: JS /OpenAction preserved');
  assert(namesSubtree(p, 'JavaScript') !== undefined, 'xmp-only: /Names /JavaScript preserved');
  assert(namesSubtree(p, 'EmbeddedFiles') !== undefined, 'xmp-only: /Names /EmbeddedFiles preserved');
  assert(anyPageHasThumb(p), 'xmp-only: /Thumb preserved');
  await engine.closeDocument(doc).toPromise();
}

// JavaScript only.
{
  const doc = await openDirty(engine, 'js');
  const out = await engine.sanitizeDocument(doc, only('javascript')).toPromise();
  const p = await loadDoc(out);
  assert(!catalogHas(p, 'OpenAction'), 'js-only: /OpenAction removed');
  assert(!catalogHas(p, 'AA'), 'js-only: /AA removed');
  assert(namesSubtree(p, 'JavaScript') === undefined, 'js-only: /Names /JavaScript removed');
  // Removing the JavaScript subtree must not take the sibling subtree with it.
  assert(namesSubtree(p, 'EmbeddedFiles') !== undefined, 'js-only: /Names /EmbeddedFiles preserved');
  assert(catalogHas(p, 'Metadata'), 'js-only: XMP preserved');
  assert(anyPageHasThumb(p), 'js-only: /Thumb preserved');
  await engine.closeDocument(doc).toPromise();
}

// Embedded thumbnails only.
{
  const doc = await openDirty(engine, 'thumb');
  const out = await engine.sanitizeDocument(doc, only('embeddedThumbnails')).toPromise();
  const p = await loadDoc(out);
  assert(!anyPageHasThumb(p), 'thumb-only: /Thumb removed');
  assert(catalogHas(p, 'Metadata'), 'thumb-only: XMP preserved');
  assert(catalogHas(p, 'OpenAction'), 'thumb-only: /OpenAction preserved');
  assert(namesSubtree(p, 'JavaScript') !== undefined, 'thumb-only: /Names /JavaScript preserved');
  assert(namesSubtree(p, 'EmbeddedFiles') !== undefined, 'thumb-only: /Names /EmbeddedFiles preserved');
  await engine.closeDocument(doc).toPromise();
}

console.log('PASS test-vector-isolation: each vector removed independently, others preserved');
process.exit(0);
13 changes: 13 additions & 0 deletions packages/engines/examples/node/sanitize/verify-dirty-fixture.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';
// Sanity-check dirty.pdf itself: every vector the sanitize tests expect to
// remove must be present in the fixture to begin with.
const fixturePath = fileURLToPath(new URL('./dirty.pdf', import.meta.url));
const doc = await loadDoc(readFileSync(fixturePath));

// Presence checks, evaluated lazily and in order so each one is asserted
// before the next is computed.
const presenceChecks = [
  [() => catalogHas(doc, 'Metadata'), 'XMP /Metadata present'],
  [() => catalogHas(doc, 'OpenAction'), '/OpenAction present'],
  [() => namesSubtree(doc, 'JavaScript') !== undefined, '/Names /JavaScript present'],
  [() => namesSubtree(doc, 'EmbeddedFiles') !== undefined, '/Names /EmbeddedFiles present'],
  [() => anyPageHasThumb(doc), 'page /Thumb present'],
];
for (const [check, label] of presenceChecks) {
  assert(check(), label);
}

// Info-dictionary metadata: the author must match what the fixture builder set.
const author = doc.getAuthor();
assert(author === 'Jane Privileged', 'Info author present, got: ' + author);
console.log('FIXTURE OK: all vectors present (XMP, OpenAction, JS name tree, EmbeddedFiles, Thumb, Info author)');
1 change: 1 addition & 0 deletions packages/engines/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
"@types/jest": "^29.5.14",
"@types/react": "^18.2.0",
"jest": "^29.7.0",
"pdf-lib": "^1.17.1",
"ts-jest": "^29.4.6",
"typescript": "^5.0.0"
},
Expand Down
11 changes: 11 additions & 0 deletions packages/engines/src/lib/orchestrator/pdf-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import {
NoopLogger,
PdfEngine as IPdfEngine,
PdfDocumentObject,
SanitizeOptions,
PdfPageObject,
PdfTask,
PdfErrorReason,
Expand Down Expand Up @@ -1144,6 +1145,16 @@ export class PdfEngine<T = Blob> implements IPdfEngine<T> {
);
}

/**
 * Queue a sanitize operation for `doc`.
 *
 * The work itself is delegated to `this.executor.sanitizeDocument`; this
 * method only wraps it in a worker-queue task tagged with the document id and
 * the operation name, enqueued at MEDIUM priority.
 *
 * @param doc - document to sanitize.
 * @param options - passed through unchanged to the executor; may be omitted.
 * @returns the task returned by the worker queue, typed as yielding an
 *   ArrayBuffer.
 */
sanitizeDocument(doc: PdfDocumentObject, options?: SanitizeOptions): PdfTask<ArrayBuffer> {
return this.workerQueue.enqueue(
{
execute: () => this.executor.sanitizeDocument(doc, options),
meta: { docId: doc.id, operation: 'sanitizeDocument' },
},
{ priority: Priority.MEDIUM },
);
}

closeDocument(doc: PdfDocumentObject): PdfTask<boolean> {
return this.workerQueue.enqueue(
{
Expand Down
6 changes: 6 additions & 0 deletions packages/engines/src/lib/orchestrator/remote-executor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
Logger,
NoopLogger,
PdfDocumentObject,
SanitizeOptions,
PdfPageObject,
PdfTask,
PdfErrorReason,
Expand Down Expand Up @@ -128,6 +129,7 @@ type MessageType =
| 'mergePages'
| 'preparePrintDocument'
| 'saveAsCopy'
| 'sanitizeDocument'
| 'closeDocument'
| 'closeAllDocuments'
| 'setDocumentEncryption'
Expand Down Expand Up @@ -656,6 +658,10 @@ export class RemoteExecutor implements IPdfiumExecutor {
return this.send<ArrayBuffer>('saveAsCopy', [doc]);
}

/**
 * Forward a sanitize request through `this.send` as a 'sanitizeDocument'
 * message whose argument list is `[doc, options]`.
 *
 * @param doc - document to sanitize.
 * @param options - forwarded as-is; `undefined` when the caller omits it.
 * @returns the task produced by `send`, typed as yielding an ArrayBuffer.
 */
sanitizeDocument(doc: PdfDocumentObject, options?: SanitizeOptions): PdfTask<ArrayBuffer> {
return this.send<ArrayBuffer>('sanitizeDocument', [doc, options]);
}

closeDocument(doc: PdfDocumentObject): PdfTask<boolean> {
return this.send<boolean>('closeDocument', [doc]);
}
Expand Down
Loading