Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions packages/engines/examples/node/sanitize/assert-helpers.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Shared assertion helpers for the sanitize tests. These re-parse saved output
// with pdf-lib independently of the engine that produced it, so a "vector gone"
// assertion is a true second-opinion check, not the engine confirming itself.
import { PDFDocument, PDFName, PDFDict } from 'pdf-lib';

/**
 * Parse PDF bytes with pdf-lib so the tests can inspect them independently of
 * the engine that produced them.
 *
 * @param bytes - the PDF file contents to parse
 * @returns the parsed pdf-lib document
 */
export async function loadDoc(bytes) {
  // Keep load() from rewriting the Info dict on its own (updateMetadata: false).
  const loadOptions = { updateMetadata: false };
  return await PDFDocument.load(bytes, loadOptions);
}

/**
 * Report whether the document catalog carries an entry under `key`.
 *
 * @param doc - a pdf-lib document
 * @param key - catalog key without the leading slash, e.g. 'Metadata'
 * @returns true when the catalog has a value stored for that key
 */
export function catalogHas(doc, key) {
  const entry = doc.catalog.get(PDFName.of(key));
  return entry !== undefined;
}

/**
 * Fetch one entry (e.g. 'JavaScript', 'EmbeddedFiles') from the catalog's
 * /Names dictionary.
 *
 * @param doc - a pdf-lib document
 * @param key - entry name inside /Names, without the leading slash
 * @returns the stored value, or undefined when /Names or the entry is absent
 */
export function namesSubtree(doc, key) {
  // lookupMaybe yields undefined for a missing /Names instead of throwing.
  const namesDict = doc.catalog.lookupMaybe(PDFName.of('Names'), PDFDict);
  return namesDict ? namesDict.get(PDFName.of(key)) : undefined;
}

/**
 * Report whether at least one page dictionary has a /Thumb entry.
 *
 * @param doc - a pdf-lib document
 * @returns true as soon as one page with /Thumb is found, false otherwise
 */
export function anyPageHasThumb(doc) {
  for (const page of doc.getPages()) {
    if (page.node.get(PDFName.of('Thumb')) !== undefined) {
      return true;
    }
  }
  return false;
}

/**
 * Minimal test assertion: on a falsy condition, print the message to stderr
 * and terminate the process with exit code 1. Does nothing when `cond` is truthy.
 *
 * @param cond - the condition that must hold
 * @param msg - description printed after 'ASSERT FAILED:' on failure
 */
export function assert(cond, msg) {
  if (cond) return;
  console.error('ASSERT FAILED:', msg);
  process.exit(1);
}
76 changes: 76 additions & 0 deletions packages/engines/examples/node/sanitize/build-dirty-fixture.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Builds a deterministic "dirty" PDF carrying every non-content hidden vector the
// sanitize primitive must reach: Info metadata, an XMP /Metadata stream, document
// JavaScript (/OpenAction + /Names /JavaScript), a page /Thumb, and an attachment.
//
// Run: node build-dirty-fixture.mjs -> writes dirty.pdf next to this file.
//
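// pdf-lib notes: context.obj() turns string VALUES into PDFString, so every
// name-valued entry (/S, /Type, ...) is wrapped in PDFName.of(). doc.attach()
// does not write the catalog /Names tree immediately; /EmbeddedFiles is added
// during save(), into whatever /Names dict the catalog holds by then. The
// /JavaScript name tree therefore lives in a /Names dict this script creates
// itself before saving, and both subtrees end up in that one dict.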
import { writeFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { PDFDocument, PDFName, PDFRawStream, PDFString } from 'pdf-lib';

// Start from an empty document with a single 200x200 page. The drawn text is
// ordinary page content, not one of the hidden vectors added below.
const doc = await PDFDocument.create();
const page = doc.addPage([200, 200]);
page.drawText('Secret 123-45-6789', { x: 20, y: 100, size: 12 });

// (a) Info-dictionary metadata
doc.setAuthor('Jane Privileged');
doc.setTitle('PRIVILEGED - draft settlement');
doc.setProducer('MagnaCartaFixture');

// (b) XMP /Metadata stream on the catalog. The packet repeats the same author
// and title that were written to the Info dictionary above.
const xmp = `<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:creator><rdf:Seq><rdf:li>Jane Privileged</rdf:li></rdf:Seq></dc:creator>
<dc:title><rdf:Alt><rdf:li xml:lang="x-default">PRIVILEGED - draft settlement</rdf:li></rdf:Alt></dc:title>
</rdf:Description></rdf:RDF></x:xmpmeta><?xpacket end="w"?>`;
const xmpBytes = new TextEncoder().encode(xmp);
// Raw (unfiltered) stream: the dict's Length is the byte length of the packet.
const xmpStream = PDFRawStream.of(
doc.context.obj({ Type: PDFName.of('Metadata'), Subtype: PDFName.of('XML'), Length: xmpBytes.length }),
xmpBytes,
);
doc.catalog.set(PDFName.of('Metadata'), doc.context.register(xmpStream));

// (c) Document JavaScript: /OpenAction (a JS action) + /Names /JavaScript name tree
const jsCode = 'app.alert("phone home");';
const jsAction = doc.context.obj({
Type: PDFName.of('Action'),
S: PDFName.of('JavaScript'),
JS: PDFString.of(jsCode),
});
// The same action reference is reused further down as the /JavaScript name-tree value.
const jsActionRef = doc.context.register(jsAction);
doc.catalog.set(PDFName.of('OpenAction'), jsActionRef);

// (d) Page /Thumb (presence is the test; pixel content is irrelevant)
// Four bytes = a 2x2 image at 8 bits per component in DeviceGray.
const thumbBytes = new Uint8Array([0x00, 0x7f, 0xff, 0x80]);
const thumb = PDFRawStream.of(
doc.context.obj({
Type: PDFName.of('XObject'), Subtype: PDFName.of('Image'),
Width: 2, Height: 2, ColorSpace: PDFName.of('DeviceGray'),
BitsPerComponent: 8, Length: thumbBytes.length,
}),
thumbBytes,
);
page.node.set(PDFName.of('Thumb'), doc.context.register(thumb));

// (e) Attachment (queued here; catalog /Names /EmbeddedFiles is written at save())
await doc.attach(new Uint8Array([1, 2, 3, 4]), 'evidence.bin', {
mimeType: 'application/octet-stream',
description: 'embedded file',
});

// (c continued) build our own /Names dict carrying /JavaScript. doc.attach() defers
// creating the /Names tree until save(), where it merges /EmbeddedFiles INTO this
// same dict (pdf-lib's embedAll uses lookupMaybe), so both subtrees coexist.
const jsNameTree = doc.context.obj({ Names: [PDFString.of('MagnaCartaJS'), jsActionRef] });
const namesDict = doc.context.obj({ JavaScript: doc.context.register(jsNameTree) });
doc.catalog.set(PDFName.of('Names'), doc.context.register(namesDict));

// Serialize and write dirty.pdf next to this script.
// NOTE(review): useObjectStreams: false presumably keeps objects out of compressed
// object streams so the fixture stays easy to inspect — confirm.
const bytes = await doc.save({ useObjectStreams: false });
const out = fileURLToPath(new URL('./dirty.pdf', import.meta.url));
writeFileSync(out, bytes);
console.log('wrote', out, bytes.length, 'bytes');
20 changes: 20 additions & 0 deletions packages/engines/examples/node/sanitize/engine-setup.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Shared headless engine setup for the sanitize tests (sharp-free, quiet logger).
import { readFile } from 'node:fs/promises';
import { fileURLToPath } from 'node:url';

import { init } from '@embedpdf/pdfium';
import { PdfiumNative, PdfEngine } from '@embedpdf/engines/pdfium';
import { NoopLogger } from '@embedpdf/models';

/**
 * Build a headless PdfEngine on top of a freshly initialised PDFium module,
 * sharing one NoopLogger between the native layer and the engine.
 *
 * @returns the constructed PdfEngine
 */
export async function makeEngine() {
  const quietLogger = new NoopLogger();
  const wasmModule = await init();
  const nativeExecutor = new PdfiumNative(wasmModule, { logger: quietLogger });

  // No imageConverter: sanitize/metadata/save ops never render.
  const engine = new PdfEngine(nativeExecutor, { logger: quietLogger });
  return engine;
}

/**
 * Read dirty.pdf (located next to this module) and open it in the engine.
 *
 * @param engine - engine exposing openDocumentBuffer()
 * @param id - document id to open the buffer under (defaults to 'dirty')
 * @returns the promise produced by the open task
 */
export async function openDirty(engine, id = 'dirty') {
  const fixturePath = fileURLToPath(new URL('./dirty.pdf', import.meta.url));
  const content = await readFile(fixturePath);
  const openTask = engine.openDocumentBuffer({ id, content });
  return openTask.toPromise();
}
33 changes: 33 additions & 0 deletions packages/engines/examples/node/sanitize/test-sanitize-document.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Full scrub: sanitizeDocument() with defaults must remove every hidden vector
// and emit a single-revision (non-incremental) document. Re-parses the output
// independently with pdf-lib (and the engine for attachments).
import { makeEngine, openDirty } from './engine-setup.mjs';
import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';

const engine = await makeEngine();
const dirtyDoc = await openDirty(engine);

// No options passed, so every vector flag takes its default (true).
const sanitized = await engine.sanitizeDocument(dirtyDoc).toPromise();

// Catalog-level vectors, checked on an independent pdf-lib re-parse.
const reparsed = await loadDoc(sanitized);
const removedCatalogKeys = [
  ['Metadata', 'XMP /Metadata removed'],
  ['OpenAction', '/OpenAction removed'],
  ['AA', 'catalog /AA removed'],
];
for (const [key, label] of removedCatalogKeys) {
  assert(!catalogHas(reparsed, key), label);
}
assert(namesSubtree(reparsed, 'JavaScript') === undefined, '/Names /JavaScript removed');
assert(!anyPageHasThumb(reparsed), 'page /Thumb removed');

// Attachments are read back through the engine itself, on the re-opened output.
const reopened = await engine.openDocumentBuffer({ id: 'check', content: sanitized }).toPromise();
const remaining = await engine.getAttachments(reopened).toPromise();
assert(remaining.length === 0, `attachments removed (got ${remaining.length})`);

// A full rewrite carries exactly one %%EOF marker; an incremental save would
// leave the earlier revision's marker in place as well.
const rawText = Buffer.from(sanitized).toString('latin1');
const eofCount = rawText.split('%%EOF').length - 1;
assert(eofCount === 1, `single-revision output expected (one %%EOF), got ${eofCount}`);

await engine.closeDocument(dirtyDoc).toPromise();
await engine.closeDocument(reopened).toPromise();
console.log('PASS test-sanitize-document: all vectors scrubbed, single-revision output');
process.exit(0);
50 changes: 50 additions & 0 deletions packages/engines/examples/node/sanitize/test-vector-isolation.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Each vector flag must remove only its own vector and leave the others intact —
// proving the EPDF_* exports are independently effective and correctly scoped.
import { makeEngine, openDirty } from './engine-setup.mjs';
import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';

const engine = await makeEngine();

// Every case opens its own copy of the fixture under a distinct id, because
// sanitizeDocument() is handed the open document and each case needs a pristine
// starting point.

// XMP only.
{
  const doc = await openDirty(engine, 'xmp');
  const out = await engine
    .sanitizeDocument(doc, { xmp: true, javascript: false, embeddedThumbnails: false, attachments: false })
    .toPromise();
  const p = await loadDoc(out);
  assert(!catalogHas(p, 'Metadata'), 'xmp-only: XMP /Metadata removed');
  assert(catalogHas(p, 'OpenAction'), 'xmp-only: JS /OpenAction preserved');
  assert(anyPageHasThumb(p), 'xmp-only: /Thumb preserved');
  await engine.closeDocument(doc).toPromise();
}

// JavaScript only.
{
  const doc = await openDirty(engine, 'js');
  const out = await engine
    .sanitizeDocument(doc, { xmp: false, javascript: true, embeddedThumbnails: false, attachments: false })
    .toPromise();
  const p = await loadDoc(out);
  assert(!catalogHas(p, 'OpenAction'), 'js-only: /OpenAction removed');
  assert(!catalogHas(p, 'AA'), 'js-only: /AA removed');
  assert(namesSubtree(p, 'JavaScript') === undefined, 'js-only: /Names /JavaScript removed');
  assert(catalogHas(p, 'Metadata'), 'js-only: XMP preserved');
  assert(anyPageHasThumb(p), 'js-only: /Thumb preserved');
  await engine.closeDocument(doc).toPromise();
}

// Embedded thumbnails only.
{
  const doc = await openDirty(engine, 'thumb');
  const out = await engine
    .sanitizeDocument(doc, { xmp: false, javascript: false, embeddedThumbnails: true, attachments: false })
    .toPromise();
  const p = await loadDoc(out);
  assert(!anyPageHasThumb(p), 'thumb-only: /Thumb removed');
  assert(catalogHas(p, 'Metadata'), 'thumb-only: XMP preserved');
  assert(catalogHas(p, 'OpenAction'), 'thumb-only: /OpenAction preserved');
  await engine.closeDocument(doc).toPromise();
}

// Attachments only. Without this case the `attachments` flag would be the one
// vector whose isolation is never exercised.
{
  const doc = await openDirty(engine, 'att');
  const out = await engine
    .sanitizeDocument(doc, { xmp: false, javascript: false, embeddedThumbnails: false, attachments: true })
    .toPromise();
  const p = await loadDoc(out);
  assert(catalogHas(p, 'Metadata'), 'att-only: XMP preserved');
  assert(catalogHas(p, 'OpenAction'), 'att-only: /OpenAction preserved');
  assert(namesSubtree(p, 'JavaScript') !== undefined, 'att-only: /Names /JavaScript preserved');
  assert(anyPageHasThumb(p), 'att-only: /Thumb preserved');

  // Removal is checked through the engine's attachment reader on the re-opened
  // output rather than by the presence of /Names /EmbeddedFiles.
  // NOTE(review): deleting entries may leave an empty name tree behind — confirm.
  const reopened = await engine.openDocumentBuffer({ id: 'att-check', content: out }).toPromise();
  const attachments = await engine.getAttachments(reopened).toPromise();
  assert(attachments.length === 0, `att-only: attachments removed (got ${attachments.length})`);

  await engine.closeDocument(reopened).toPromise();
  await engine.closeDocument(doc).toPromise();
}

console.log('PASS test-vector-isolation: each vector removed independently, others preserved');
process.exit(0);
13 changes: 13 additions & 0 deletions packages/engines/examples/node/sanitize/verify-dirty-fixture.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { loadDoc, catalogHas, namesSubtree, anyPageHasThumb, assert } from './assert-helpers.mjs';
// Sanity-check the fixture itself: every hidden vector must be present in
// dirty.pdf before the sanitize tests can meaningfully assert its removal.
const fixturePath = fileURLToPath(new URL('./dirty.pdf', import.meta.url));
const doc = await loadDoc(readFileSync(fixturePath));

// Catalog-level entries.
const requiredCatalogKeys = [
  ['Metadata', 'XMP /Metadata present'],
  ['OpenAction', '/OpenAction present'],
];
for (const [key, label] of requiredCatalogKeys) {
  assert(catalogHas(doc, key), label);
}

// Entries of the catalog /Names dictionary.
const requiredNameTrees = [
  ['JavaScript', '/Names /JavaScript present'],
  ['EmbeddedFiles', '/Names /EmbeddedFiles present'],
];
for (const [key, label] of requiredNameTrees) {
  assert(namesSubtree(doc, key) !== undefined, label);
}

assert(anyPageHasThumb(doc), 'page /Thumb present');

// Info-dictionary author, read back through pdf-lib's accessor.
const author = doc.getAuthor();
assert(author === 'Jane Privileged', `Info author present, got: ${author}`);

console.log('FIXTURE OK: all vectors present (XMP, OpenAction, JS name tree, EmbeddedFiles, Thumb, Info author)');
1 change: 1 addition & 0 deletions packages/engines/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
"@types/jest": "^29.5.14",
"@types/react": "^18.2.0",
"jest": "^29.7.0",
"pdf-lib": "^1.17.1",
"ts-jest": "^29.4.6",
"typescript": "^5.0.0"
},
Expand Down
11 changes: 11 additions & 0 deletions packages/engines/src/lib/orchestrator/pdf-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import {
NoopLogger,
PdfEngine as IPdfEngine,
PdfDocumentObject,
SanitizeOptions,
PdfPageObject,
PdfTask,
PdfErrorReason,
Expand Down Expand Up @@ -1144,6 +1145,16 @@ export class PdfEngine<T = Blob> implements IPdfEngine<T> {
);
}

/**
 * Queue a sanitize pass for `doc` on the worker queue.
 *
 * The queued work delegates to the executor's `sanitizeDocument` with the same
 * arguments; the task is tagged with the document id and the operation name
 * and enqueued at MEDIUM priority.
 *
 * @param doc - the document to sanitize
 * @param options - optional sanitize options, forwarded unchanged to the executor
 * @returns the task returned by the worker queue
 */
sanitizeDocument(doc: PdfDocumentObject, options?: SanitizeOptions): PdfTask<ArrayBuffer> {
return this.workerQueue.enqueue(
{
// Deferred: the executor is only invoked when the queue runs this entry.
execute: () => this.executor.sanitizeDocument(doc, options),
meta: { docId: doc.id, operation: 'sanitizeDocument' },
},
{ priority: Priority.MEDIUM },
);
}

closeDocument(doc: PdfDocumentObject): PdfTask<boolean> {
return this.workerQueue.enqueue(
{
Expand Down
6 changes: 6 additions & 0 deletions packages/engines/src/lib/orchestrator/remote-executor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
Logger,
NoopLogger,
PdfDocumentObject,
SanitizeOptions,
PdfPageObject,
PdfTask,
PdfErrorReason,
Expand Down Expand Up @@ -128,6 +129,7 @@ type MessageType =
| 'mergePages'
| 'preparePrintDocument'
| 'saveAsCopy'
| 'sanitizeDocument'
| 'closeDocument'
| 'closeAllDocuments'
| 'setDocumentEncryption'
Expand Down Expand Up @@ -656,6 +658,10 @@ export class RemoteExecutor implements IPdfiumExecutor {
return this.send<ArrayBuffer>('saveAsCopy', [doc]);
}

/**
 * Forward a sanitize request through `send` as a 'sanitizeDocument' message
 * whose arguments are `[doc, options]`.
 *
 * @param doc - the document to sanitize
 * @param options - optional sanitize options; passed along as-is, so it may be undefined
 * @returns the task produced by `send`
 */
sanitizeDocument(doc: PdfDocumentObject, options?: SanitizeOptions): PdfTask<ArrayBuffer> {
return this.send<ArrayBuffer>('sanitizeDocument', [doc, options]);
}

closeDocument(doc: PdfDocumentObject): PdfTask<boolean> {
return this.send<boolean>('closeDocument', [doc]);
}
Expand Down
53 changes: 53 additions & 0 deletions packages/engines/src/lib/pdfium/engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
PdfDestinationObject,
PdfBookmarkObject,
PdfDocumentObject,
SanitizeOptions,
PdfPageObject,
PdfPageBoxes,
Box,
Expand Down Expand Up @@ -3237,6 +3238,58 @@ export class PdfiumNative implements IPdfiumExecutor {
return PdfTaskHelper.resolve(buffer);
}

/**
 * {@inheritDoc @embedpdf/models!PdfEngine.sanitizeDocument}
 *
 * @public
 */
sanitizeDocument(doc: PdfDocumentObject, options: SanitizeOptions = {}) {
  this.logger.debug(LOG_SOURCE, LOG_CATEGORY, 'sanitizeDocument', doc, options);
  this.logger.perf(LOG_SOURCE, LOG_CATEGORY, `SanitizeDocument`, 'Begin', doc.id);

  const ctx = this.cache.getContext(doc.id);

  if (!ctx) {
    this.logger.perf(LOG_SOURCE, LOG_CATEGORY, `SanitizeDocument`, 'End', doc.id);
    return PdfTaskHelper.reject({
      code: PdfErrorCode.DocNotOpen,
      message: 'document does not open',
    });
  }

  // Every vector is removed unless the caller explicitly passes `false`.
  // Flags are resolved one by one with `??` instead of spreading `options` over
  // a defaults object: a spread copies keys whose value is undefined, so a call
  // like `{ xmp: undefined }` would replace the default `true` and silently
  // skip that vector. The fallback to `{}` keeps tolerating a null `options`
  // from untyped callers, which the spread also accepted.
  const requested: SanitizeOptions = options ?? {};
  const removeXmp = requested.xmp ?? true;
  const removeJavaScript = requested.javascript ?? true;
  const removeThumbnails = requested.embeddedThumbnails ?? true;
  const removeAttachments = requested.attachments ?? true;

  // The removal calls below receive the open document pointer directly.
  // NOTE(review): their return values are not inspected, so a failed removal is
  // not reported to the caller — confirm that is intended for a sanitizer.
  if (removeXmp) {
    this.pdfiumModule.EPDF_RemoveXMPMetadata(ctx.docPtr);
  }
  if (removeJavaScript) {
    this.pdfiumModule.EPDF_RemoveAllJavaScript(ctx.docPtr);
  }
  if (removeThumbnails) {
    this.pdfiumModule.EPDF_RemoveEmbeddedThumbnails(ctx.docPtr);
  }
  if (removeAttachments) {
    // Delete from the end so earlier indices stay valid as entries are removed.
    const count = this.pdfiumModule.FPDFDoc_GetAttachmentCount(ctx.docPtr);
    for (let i = count - 1; i >= 0; i--) {
      this.pdfiumModule.FPDFDoc_DeleteAttachment(ctx.docPtr, i);
    }
  }

  // Full, non-incremental rewrite (saveDocument -> PDFiumExt_SaveAsCopy with no
  // incremental flag), so prior-revision content cannot survive in the output.
  const buffer = this.saveDocument(ctx.docPtr);

  this.logger.perf(LOG_SOURCE, LOG_CATEGORY, `SanitizeDocument`, 'End', doc.id);
  return PdfTaskHelper.resolve(buffer);
}

/**
* {@inheritDoc @embedpdf/models!PdfEngine.closeDocument}
*
Expand Down
17 changes: 17 additions & 0 deletions packages/engines/src/lib/webworker/engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import {
PdfAddAttachmentParams,
AnnotationAppearanceMap,
ImageDataLike,
SanitizeOptions,
} from '@embedpdf/models';
import { ExecuteRequest, Response, SpecificExecuteRequest } from './runner';

Expand Down Expand Up @@ -774,6 +775,22 @@ export class WebWorkerEngine implements PdfEngine {
return task;
}

/**
 * {@inheritDoc @embedpdf/models!PdfEngine.sanitizeDocument}
 *
 * @public
 */
sanitizeDocument(doc: PdfDocumentObject, options?: SanitizeOptions) {
  this.logger.debug(LOG_SOURCE, LOG_CATEGORY, 'sanitizeDocument', doc, options);

  // The same request id is given to both the pending task and the request.
  const id = this.generateRequestId(doc.id);
  const pending = new WorkerTask<ArrayBuffer>(this.worker, id);
  const message: ExecuteRequest = createRequest(id, 'sanitizeDocument', [doc, options]);

  this.proxy(pending, message);
  return pending;
}

/**
* {@inheritDoc @embedpdf/models!PdfEngine.getAttachments}
*
Expand Down
3 changes: 3 additions & 0 deletions packages/engines/src/lib/webworker/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,9 @@ export class EngineRunner {
case 'saveAsCopy':
this.handleTask(request.id, engine.saveAsCopy!(...args));
return;
case 'sanitizeDocument':
this.handleTask(request.id, engine.sanitizeDocument!(...args));
return;
case 'getAttachments':
this.handleTask(request.id, engine.getAttachments!(...args));
return;
Expand Down
Loading