From 609802dd78c12a94a44c68ba1ed60b219b9febba Mon Sep 17 00:00:00 2001
From: Phauks <61893497+Phauks@users.noreply.github.com>
Date: Sat, 13 Jun 2026 03:38:39 -0700
Subject: [PATCH 1/2] feat: document-sanitization EPDF_* exports (XMP, thumbnails, JavaScript)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add three EmbedPDF extension functions for redaction-defensibility scrubbing of
non-content hidden vectors, mirroring the existing EPDF_SetMetaText style:

- EPDF_RemoveXMPMetadata: drop the catalog /Metadata XMP stream (survives an
  Info-dict clear — the #1 sanitization miss).
- EPDF_RemoveEmbeddedThumbnails: drop every page /Thumb.
- EPDF_RemoveAllJavaScript: drop /Names /JavaScript, JS /OpenAction, and /AA.

Declared in public/fpdf_doc.h (auto-exported by the WASM build's generator),
implemented in fpdfsdk/fpdf_doc.cpp via GetMutableRoot()/RemoveFor.
---
Hand-edited after generation: the blob ids on the "index" lines are not
recomputed, and leading whitespace in context lines was reconstructed and
should be checked against the target tree.

 fpdfsdk/fpdf_doc.cpp | 63 ++++++++++++++++++++++++++++++++++++++++
 public/fpdf_doc.h    | 34 ++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/fpdfsdk/fpdf_doc.cpp b/fpdfsdk/fpdf_doc.cpp
index 799c5a592..c9f84a07b 100644
--- a/fpdfsdk/fpdf_doc.cpp
+++ b/fpdfsdk/fpdf_doc.cpp
@@ -773,4 +773,67 @@ EPDF_GetMetaKeyName(FPDF_DOCUMENT document,
     }
   }
   return 0;
+}
+
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+EPDF_RemoveXMPMetadata(FPDF_DOCUMENT document) {
+  CPDF_Document* pDoc = CPDFDocumentFromFPDFDocument(document);
+  if (!pDoc)
+    return false;
+
+  RetainPtr<CPDF_Dictionary> root = pDoc->GetMutableRoot();
+  if (!root)
+    return false;
+
+  // /Metadata is the catalog-level XMP stream (ISO 32000 §14.3.2). It is stored
+  // separately from /Info, so clearing Info via EPDF_SetMetaText() does not
+  // touch it. Removing the key detaches the XMP stream from the catalog.
+  // NOTE(review): only the catalog entry is unlinked here; confirm the save
+  // path does not still emit the now-unreferenced stream object.
+  root->RemoveFor("Metadata");
+  return true;
+}
+
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+EPDF_RemoveEmbeddedThumbnails(FPDF_DOCUMENT document) {
+  CPDF_Document* pDoc = CPDFDocumentFromFPDFDocument(document);
+  if (!pDoc)
+    return false;
+
+  const int count = pDoc->GetPageCount();
+  for (int i = 0; i < count; ++i) {
+    RetainPtr<CPDF_Dictionary> page = pDoc->GetMutablePageDictionary(i);
+    if (page)
+      page->RemoveFor("Thumb");
+  }
+  return true;
+}
+
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+EPDF_RemoveAllJavaScript(FPDF_DOCUMENT document) {
+  CPDF_Document* pDoc = CPDFDocumentFromFPDFDocument(document);
+  if (!pDoc)
+    return false;
+
+  RetainPtr<CPDF_Dictionary> root = pDoc->GetMutableRoot();
+  if (!root)
+    return false;
+
+  // (1) Catalog /Names /JavaScript name tree (document-level scripts).
+  RetainPtr<CPDF_Dictionary> names = root->GetMutableDictFor("Names");
+  if (names)
+    names->RemoveFor("JavaScript");
+
+  // (2) /OpenAction, but only when it is a JavaScript action — a GoTo
+  // destination OpenAction is legitimate navigation and is left intact.
+  // NOTE(review): only the action's own /S is inspected; confirm a kept
+  // action cannot carry JavaScript through a /Next chain.
+  RetainPtr<const CPDF_Dictionary> open_action = root->GetDictFor("OpenAction");
+  if (open_action && open_action->GetNameFor("S") == "JavaScript")
+    root->RemoveFor("OpenAction");
+
+  // (3) Catalog-level /AA additional-actions (e.g. WillClose/WillPrint scripts).
+  root->RemoveFor("AA");
+
+  return true;
 }
\ No newline at end of file
diff --git a/public/fpdf_doc.h b/public/fpdf_doc.h
index 2feee5303..d7c5bfe52 100644
--- a/public/fpdf_doc.h
+++ b/public/fpdf_doc.h
@@ -517,6 +517,40 @@ EPDF_GetMetaKeyName(FPDF_DOCUMENT document,
                     void* buffer,
                     unsigned long buflen);
 
+// Experimental EmbedPDF Extension API.
+// Remove the document's XMP metadata stream (the catalog /Metadata entry).
+//
+//   document - handle to the document.
+//
+// XMP metadata (ISO 32000 §14.3.2) is stored separately from the Info
+// dictionary, so clearing Info via EPDF_SetMetaText() does not remove it. This
+// is the #1 redaction-sanitization miss: author/title/history can survive in
+// XMP. Returns true on success, including when no /Metadata is present.
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+EPDF_RemoveXMPMetadata(FPDF_DOCUMENT document);
+
+// Experimental EmbedPDF Extension API.
+// Remove every page's embedded thumbnail (the page /Thumb entry).
+//
+//   document - handle to the document.
+//
+// An embedded thumbnail can retain a pre-redaction image of the page. Returns
+// true on success, including when no thumbnails are present.
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+EPDF_RemoveEmbeddedThumbnails(FPDF_DOCUMENT document);
+
+// Experimental EmbedPDF Extension API.
+// Remove all document-level JavaScript from |document|: the catalog
+// /Names /JavaScript name tree, /OpenAction when it is a JavaScript action
+// (GoTo destinations are preserved), and the catalog /AA additional-actions
+// dictionary.
+//
+//   document - handle to the document.
+//
+// Returns true on success, including when no JavaScript is present.
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+EPDF_RemoveAllJavaScript(FPDF_DOCUMENT document);
+
 // Experimental EmbedPDF Extension API.
 // Create a new destination array of the form [page /XYZ left top zoom].
 //

From 8b8a678782a173feea2064430e3a9a51c4904508 Mon Sep 17 00:00:00 2001
From: Phauks <61893497+Phauks@users.noreply.github.com>
Date: Sat, 13 Jun 2026 04:19:19 -0700
Subject: [PATCH 2/2] =?UTF-8?q?feat:=20EPDF=5FRemoveOptionalContentGroups?=
 =?UTF-8?q?=20=E2=80=94=20strip=20hidden=20optional-content=20layers?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-page, drop page objects not visible under the default (View) OC config
(CPDF_OCContext::CheckPageObjectVisible resolves OCG/OCMD/VE), regenerate the
content, then remove the catalog /OCProperties. Excises hidden-layer content
rather than just deleting /OCProperties (which would reveal it).
Declared in public/fpdf_doc.h, implemented in fpdfsdk/fpdf_editpage.cpp (reuses
FPDF_LoadPage for content parsing + CPDF_PageContentGenerator for regen).

If any page cannot be loaded and scanned, /OCProperties is kept and the call
returns false, so hidden content left on that page is not exposed.
---
Hand-edited after generation: the blob ids on the "index" lines are not
recomputed, and the <algorithm>/<memory> context includes plus leading
whitespace in context lines were reconstructed and should be checked against
the target tree.

 fpdfsdk/fpdf_editpage.cpp | 68 ++++++++++++++++++++++++++++++++++++++++
 public/fpdf_doc.h         | 16 +++++++++
 2 files changed, 84 insertions(+)

diff --git a/fpdfsdk/fpdf_editpage.cpp b/fpdfsdk/fpdf_editpage.cpp
index 06ed956c3..464ef4da6 100644
--- a/fpdfsdk/fpdf_editpage.cpp
+++ b/fpdfsdk/fpdf_editpage.cpp
@@ -5,6 +5,7 @@
 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
 
 #include "public/fpdf_edit.h"
+#include "public/fpdfview.h"
 
 #include <algorithm>
 #include <memory>
@@ -18,6 +19,7 @@
 #include "core/fpdfapi/page/cpdf_form.h"
 #include "core/fpdfapi/page/cpdf_formobject.h"
 #include "core/fpdfapi/page/cpdf_imageobject.h"
+#include "core/fpdfapi/page/cpdf_occontext.h"
 #include "core/fpdfapi/page/cpdf_page.h"
 #include "core/fpdfapi/page/cpdf_pageimagecache.h"
 #include "core/fpdfapi/page/cpdf_pageobject.h"
@@ -1218,3 +1220,69 @@ FPDFFormObj_RemoveObject(FPDF_PAGEOBJECT form_object,
   removed_object.release();
   return true;
 }
+
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+EPDF_RemoveOptionalContentGroups(FPDF_DOCUMENT document) {
+  CPDF_Document* pDoc = CPDFDocumentFromFPDFDocument(document);
+  if (!pDoc) {
+    return false;
+  }
+
+  // Visibility under the default (View) configuration: an object is "hidden"
+  // when an OFF optional-content group (or OCMD / VE expression) suppresses it.
+  auto oc = pdfium::MakeRetain<CPDF_OCContext>(pDoc, CPDF_OCContext::kView);
+
+  // Set when a page could not be scanned. /OCProperties must then stay in
+  // place: deleting it would un-hide whatever is still on the skipped page.
+  bool skipped_page = false;
+
+  const int page_count = FPDF_GetPageCount(document);
+  for (int i = 0; i < page_count; ++i) {
+    // FPDF_LoadPage parses the page content, so the object list is populated.
+    FPDF_PAGE page = FPDF_LoadPage(document, i);
+    if (!page) {
+      skipped_page = true;
+      continue;
+    }
+
+    CPDF_Page* pPage = CPDFPageFromFPDFPage(page);
+    if (IsPageObject(pPage)) {
+      // Collect hidden objects first, then remove, so the list is not mutated
+      // mid-iteration.
+      // NOTE(review): only the page's own object list is tested here; confirm
+      // whether /OC on XObject dictionaries or nested form content needs care.
+      std::vector<CPDF_PageObject*> hidden;
+      const size_t count = pPage->GetPageObjectCount();
+      for (size_t k = 0; k < count; ++k) {
+        CPDF_PageObject* obj = pPage->GetPageObjectByIndex(k);
+        if (obj && !oc->CheckPageObjectVisible(obj)) {
+          hidden.push_back(obj);
+        }
+      }
+      if (!hidden.empty()) {
+        for (CPDF_PageObject* obj : hidden) {
+          // Dropping the returned unique_ptr frees the removed object.
+          pPage->RemovePageObject(obj);
+        }
+        CPDF_PageContentGenerator generator(pPage);
+        generator.GenerateContent();
+      }
+    } else {
+      skipped_page = true;
+    }
+
+    FPDF_ClosePage(page);
+  }
+
+  if (skipped_page) {
+    // Fail closed: hidden content may remain, so keep it governed.
+    return false;
+  }
+
+  // With the governed content removed, drop the optional-content machinery.
+  RetainPtr<CPDF_Dictionary> root = pDoc->GetMutableRoot();
+  if (root) {
+    root->RemoveFor("OCProperties");
+  }
+  return true;
+}
diff --git a/public/fpdf_doc.h b/public/fpdf_doc.h
index d7c5bfe52..a4dd9a8de 100644
--- a/public/fpdf_doc.h
+++ b/public/fpdf_doc.h
@@ -551,6 +551,22 @@ EPDF_RemoveEmbeddedThumbnails(FPDF_DOCUMENT document);
 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
 EPDF_RemoveAllJavaScript(FPDF_DOCUMENT document);
 
+// Experimental EmbedPDF Extension API.
+// Remove content governed by hidden optional-content groups (OCGs / layers):
+// for each page, drop page objects not visible under the default (View)
+// configuration, regenerate the page content, then remove the catalog
+// /OCProperties. This excises hidden-layer content rather than merely deleting
+// /OCProperties (which would make that content visible).
+//
+//   document - handle to the document.
+//
+// Returns true on success, including when no optional content is present.
+// Returns false if |document| is invalid or if any page could not be loaded
+// and scanned; in that case /OCProperties is left in place so content on the
+// unscanned pages stays hidden.
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+EPDF_RemoveOptionalContentGroups(FPDF_DOCUMENT document);
+
 // Experimental EmbedPDF Extension API.
 // Create a new destination array of the form [page /XYZ left top zoom].
 //