Skip to content

Commit 8fc3ec7

Browse files
authored
feat: AES-256, LTV signatures, PDF/A conversion, PDF/UA validation (#35)
* feat(encryption): add AES-256 (V=5, R=6) encryption PDF 2.0 AES-256 with SHA-based key derivation, random file encryption key, and per-object encryption without MD5 key derivation. * feat(signatures): add LTV signatures with TSA timestamps CMS signed attributes (content-type, message-digest, signing-time), RFC 3161 timestamp client, full cert chain support, and timestamp/revocation detection in verification. * feat(conversion): add PDF/A conversion with XMP and ICC profile Converts documents to PDF/A by writing XMP metadata, embedding sRGB ICC profile, removing JS/actions, and stripping transparency (PDF/A-1). * feat(validation): add PDF/UA accessibility validation Checks MarkInfo, Lang, StructTreeRoot, figure alt text, heading hierarchy, tab order, annotation accessibility, and tagged content. * feat(wasm): expose AES-256, PDF/A conversion, and PDF/UA validation Add algorithm parameter to encrypt(), and new validatePdfA(), validatePdfUa(), and convertToPdfA() WASM bindings. * docs: update guides, API reference, and type stubs for new features Add AES-256, LTV signatures, PDF/A conversion, and PDF/UA validation to security guide, signatures guide, API docs, and TYPE_CHECKING stubs. * refactor: replace monkey-patching with mixin classes Convert all 12 feature modules from Document.method = _func monkey-patching to mixin classes that Document inherits from. Eliminates all type: ignore [method-assign] comments and the 700-line TYPE_CHECKING stub block in _document.py. Adds _protocols.py with DocumentBase/PageBase for mypy.
1 parent 586a535 commit 8fc3ec7

56 files changed

Lines changed: 4868 additions & 2802 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,5 @@ rsa = "0.9"
3232
p256 = { version = "0.13", features = ["ecdsa"] }
3333
pkcs8 = "0.10"
3434
roxmltree = "0.20"
35+
ureq = "2"
3536
tokio = { version = "1", features = ["rt-multi-thread"] }

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,13 @@ md = await doc.ato_markdown()
4242
- **Metadata & bookmarks** — read and edit document properties and outline
4343
- **Annotations & watermarks** — add, read, remove annotations; text watermarks
4444
- **Forms** — inspect, fill, create, and modify form fields
45-
- **Security** — encryption (AES-128/RC4), sanitization, true content-stream redaction
45+
- **Security** — encryption (AES-128/256, RC4), sanitization, true content-stream redaction
4646
- **PDF diff** — text-level comparison of two documents
4747
- **Layout analysis** — multi-column detection, header/footer identification
4848
- **Native async** — powered by Rust and tokio, no Python thread pools
49+
- **Digital signatures** — sign, verify, and inspect with LTV timestamp support
50+
- **PDF/A** — validation and conversion (XMP, ICC profiles, transparency removal)
51+
- **PDF/UA** — accessibility validation (structure tree, alt text, tagged content)
4952
- **WASM playground** — try it in the browser at [docs.byteveda.org/paperjam](https://docs.byteveda.org/paperjam/)
5053

5154
## Documentation

crates/paperjam-core/Cargo.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,20 @@ image = { workspace = true, optional = true }
2626
x509-parser = { workspace = true, optional = true }
2727
cms = { workspace = true, optional = true }
2828
der = { workspace = true, optional = true }
29-
sha2 = { workspace = true, optional = true }
29+
sha2 = { workspace = true }
3030
spki = { workspace = true, optional = true }
3131
sha1 = { workspace = true, optional = true }
3232
rsa = { workspace = true, optional = true }
3333
p256 = { workspace = true, optional = true }
3434
pkcs8 = { workspace = true, optional = true }
3535
roxmltree = { workspace = true, optional = true }
36+
ureq = { workspace = true, optional = true }
3637

3738
[features]
3839
default = ["parallel"]
3940
parallel = ["dep:rayon"]
4041
mmap = ["dep:memmap2"]
4142
render = ["dep:pdfium-render", "dep:image"]
42-
signatures = ["dep:x509-parser", "dep:cms", "dep:der", "dep:sha2", "dep:sha1", "dep:rsa", "dep:p256", "dep:pkcs8", "dep:spki"]
43+
signatures = ["dep:x509-parser", "dep:cms", "dep:der", "dep:sha1", "dep:rsa", "dep:p256", "dep:pkcs8", "dep:spki"]
44+
ltv = ["signatures", "dep:ureq"]
4345
validation = ["dep:roxmltree"]

crates/paperjam-core/data/sRGB.icc

500 Bytes
Binary file not shown.
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
//! JavaScript and action removal for PDF/A conversion.
2+
//!
3+
//! Reuses logic from the sanitize module.
4+
5+
use crate::document::Document;
6+
use crate::error::Result;
7+
use crate::sanitize::{SanitizeOptions, SanitizedItem};
8+
9+
/// Remove JavaScript and prohibited actions for PDF/A compliance.
10+
///
11+
/// Returns the list of items that were removed.
12+
pub fn remove_prohibited_actions(doc: &Document) -> Result<(Document, Vec<String>)> {
13+
let options = SanitizeOptions {
14+
remove_javascript: true,
15+
remove_embedded_files: false,
16+
remove_actions: true,
17+
remove_links: false,
18+
};
19+
20+
let (sanitized, result) = crate::sanitize::sanitize(doc, &options)?;
21+
let mut actions = Vec::new();
22+
23+
for item in &result.items {
24+
actions.push(format_action(item));
25+
}
26+
27+
Ok((sanitized, actions))
28+
}
29+
30+
fn format_action(item: &SanitizedItem) -> String {
31+
if let Some(page) = item.page {
32+
format!(
33+
"Removed {} on page {}: {}",
34+
item.category, page, item.description
35+
)
36+
} else {
37+
format!("Removed {}: {}", item.category, item.description)
38+
}
39+
}
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
//! PDF/A conversion: convert a document to PDF/A conformance.
2+
3+
mod actions;
4+
mod output_intent;
5+
mod transparency;
6+
mod xmp;
7+
8+
use crate::document::Document;
9+
use crate::error::{PdfError, Result};
10+
use crate::validation::{PdfALevel, Severity, ValidationIssue};
11+
12+
/// Options for PDF/A conversion.
13+
pub struct ConversionOptions {
14+
/// Target PDF/A conformance level.
15+
pub level: PdfALevel,
16+
/// If true, proceed even if some issues (like unembedded fonts) cannot be fixed.
17+
pub force: bool,
18+
}
19+
20+
/// One change applied to the document while converting it.
#[derive(Debug, Clone)]
pub struct ConversionAction {
    /// Coarse grouping derived from the description text, e.g. `"metadata"`,
    /// `"color"`, `"encryption"`, `"transparency"`, `"actions"` or `"other"`.
    pub category: String,
    /// Human-readable description of what was changed.
    pub description: String,
    /// Page the action applies to, when known. `convert_to_pdf_a` currently
    /// always leaves this as `None`.
    pub page: Option<u32>,
}
27+
28+
/// Result of a PDF/A conversion.
29+
#[derive(Debug, Clone)]
30+
pub struct ConversionResult {
31+
pub level: PdfALevel,
32+
pub success: bool,
33+
pub actions_taken: Vec<ConversionAction>,
34+
pub remaining_issues: Vec<ValidationIssue>,
35+
}
36+
37+
/// Convert a PDF document to PDF/A conformance.
38+
///
39+
/// Performs the following transformations:
40+
/// 1. Removes encryption dictionary (if present)
41+
/// 2. Writes/updates XMP metadata with PDF/A identification
42+
/// 3. Adds sRGB OutputIntent with ICC profile (if missing)
43+
/// 4. Strips JavaScript and prohibited actions
44+
/// 5. Removes transparency (PDF/A-1 only)
45+
///
46+
/// Font embedding is NOT performed — documents with unembedded fonts will
47+
/// have those reported as remaining issues.
48+
pub fn convert_to_pdf_a(
49+
doc: &Document,
50+
options: &ConversionOptions,
51+
) -> Result<(Document, ConversionResult)> {
52+
let mut inner = doc.inner().clone();
53+
let mut all_actions = Vec::new();
54+
55+
// 1. Remove encryption
56+
let encryption_actions = remove_encryption(&mut inner);
57+
all_actions.extend(encryption_actions);
58+
59+
// 2. XMP metadata
60+
let xmp_actions = xmp::ensure_xmp_metadata(&mut inner, options.level)?;
61+
all_actions.extend(xmp_actions);
62+
63+
// 3. OutputIntent with ICC profile
64+
let oi_actions = output_intent::ensure_output_intent(&mut inner)?;
65+
all_actions.extend(oi_actions);
66+
67+
// 4. Remove JS and actions (uses sanitize module, needs a Document wrapper)
68+
let temp_doc = Document::from_lopdf(inner)?;
69+
let (sanitized_doc, action_descriptions) = actions::remove_prohibited_actions(&temp_doc)?;
70+
inner = sanitized_doc.inner().clone();
71+
all_actions.extend(action_descriptions);
72+
73+
// 5. Remove transparency (PDF/A-1 only)
74+
if matches!(options.level, PdfALevel::A1b | PdfALevel::A1a) {
75+
let page_map = inner.get_pages();
76+
let trans_actions = transparency::remove_transparency(&mut inner, &page_map);
77+
all_actions.extend(trans_actions);
78+
}
79+
80+
// Build result document
81+
let result_doc = Document::from_lopdf(inner)?;
82+
83+
// Validate the result to find remaining issues
84+
let report = crate::validation::validate_pdf_a(&result_doc, options.level)?;
85+
let remaining_issues = report.issues;
86+
87+
// Check for unresolvable issues (like unembedded fonts)
88+
let has_errors = remaining_issues
89+
.iter()
90+
.any(|i| i.severity == Severity::Error);
91+
92+
if has_errors && !options.force {
93+
return Err(PdfError::Conversion(format!(
94+
"PDF/A conversion has {} remaining error(s) that cannot be automatically fixed. \
95+
Use force=true to proceed. Issues: {}",
96+
remaining_issues
97+
.iter()
98+
.filter(|i| i.severity == Severity::Error)
99+
.count(),
100+
remaining_issues
101+
.iter()
102+
.filter(|i| i.severity == Severity::Error)
103+
.map(|i| i.message.as_str())
104+
.collect::<Vec<_>>()
105+
.join("; "),
106+
)));
107+
}
108+
109+
let actions_taken = all_actions
110+
.into_iter()
111+
.map(|desc| ConversionAction {
112+
category: categorize_action(&desc),
113+
description: desc,
114+
page: None,
115+
})
116+
.collect();
117+
118+
Ok((
119+
result_doc,
120+
ConversionResult {
121+
level: options.level,
122+
success: !has_errors,
123+
actions_taken,
124+
remaining_issues,
125+
},
126+
))
127+
}
128+
129+
/// Remove encryption dictionary from the document.
130+
fn remove_encryption(doc: &mut lopdf::Document) -> Vec<String> {
131+
let mut actions = Vec::new();
132+
133+
if let Ok(encrypt_ref) = doc.trailer.get(b"Encrypt") {
134+
if let Ok(id) = encrypt_ref.as_reference() {
135+
doc.objects.remove(&id);
136+
}
137+
doc.trailer.remove(b"Encrypt");
138+
actions.push("Removed encryption dictionary".to_string());
139+
}
140+
141+
actions
142+
}
143+
144+
/// Derive a coarse category from a free-form action description.
///
/// Rules are tried in order and the first one with a matching keyword wins;
/// matching is case-sensitive. Descriptions that match nothing fall into
/// `"other"`.
fn categorize_action(desc: &str) -> String {
    // (keywords, category) pairs, listed in precedence order.
    const RULES: [(&[&str], &str); 5] = [
        (&["XMP", "metadata"], "metadata"),
        (&["OutputIntent", "ICC"], "color"),
        (&["encryption"], "encryption"),
        (&["transparency", "blend", "mask"], "transparency"),
        (&["javascript", "action", "Removed"], "actions"),
    ];

    RULES
        .iter()
        .find(|(keywords, _)| keywords.iter().any(|keyword| desc.contains(*keyword)))
        .map_or("other", |(_, category)| *category)
        .to_string()
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//! sRGB ICC profile embedding and OutputIntent creation for PDF/A.
2+
3+
use lopdf::{dictionary, Object};
4+
5+
use crate::error::{PdfError, Result};
6+
7+
/// Embedded sRGB IEC61966-2.1 ICC profile.
8+
const SRGB_ICC_PROFILE: &[u8] = include_bytes!("../../data/sRGB.icc");
9+
10+
/// Ensure the document has an OutputIntent with an sRGB ICC profile.
11+
pub fn ensure_output_intent(doc: &mut lopdf::Document) -> Result<Vec<String>> {
12+
let mut actions = Vec::new();
13+
14+
let catalog_id = doc
15+
.trailer
16+
.get(b"Root")
17+
.map_err(|_| PdfError::Conversion("No /Root".to_string()))?
18+
.as_reference()
19+
.map_err(|_| PdfError::Conversion("/Root not ref".to_string()))?;
20+
21+
// Check if a PDF/A OutputIntent already exists
22+
let has_pdfa_intent = {
23+
let catalog = doc
24+
.get_object(catalog_id)
25+
.map_err(|e| PdfError::Conversion(format!("catalog: {}", e)))?
26+
.as_dict()
27+
.map_err(|_| PdfError::Conversion("catalog not dict".to_string()))?;
28+
29+
if let Ok(Object::Array(arr)) = catalog.get(b"OutputIntents") {
30+
arr.iter().any(|item| {
31+
let d = match item {
32+
Object::Reference(id) => {
33+
doc.get_object(*id).ok().and_then(|o| o.as_dict().ok())
34+
}
35+
Object::Dictionary(d) => Some(d),
36+
_ => None,
37+
};
38+
d.is_some_and(|d| matches!(d.get(b"S"), Ok(Object::Name(n)) if n == b"GTS_PDFA1"))
39+
})
40+
} else {
41+
false
42+
}
43+
};
44+
45+
if has_pdfa_intent {
46+
return Ok(actions);
47+
}
48+
49+
// Create ICC profile stream
50+
let icc_stream = lopdf::Stream::new(
51+
dictionary! {
52+
"N" => Object::Integer(3),
53+
"Alternate" => Object::Name(b"DeviceRGB".to_vec()),
54+
"Length" => Object::Integer(SRGB_ICC_PROFILE.len() as i64)
55+
},
56+
SRGB_ICC_PROFILE.to_vec(),
57+
);
58+
let icc_id = doc.add_object(Object::Stream(icc_stream));
59+
60+
// Create OutputIntent dictionary
61+
let output_intent = dictionary! {
62+
"Type" => Object::Name(b"OutputIntent".to_vec()),
63+
"S" => Object::Name(b"GTS_PDFA1".to_vec()),
64+
"OutputConditionIdentifier" => Object::String(
65+
b"sRGB IEC61966-2.1".to_vec(),
66+
lopdf::StringFormat::Literal,
67+
),
68+
"RegistryName" => Object::String(
69+
b"http://www.color.org".to_vec(),
70+
lopdf::StringFormat::Literal,
71+
),
72+
"DestOutputProfile" => Object::Reference(icc_id)
73+
};
74+
let oi_id = doc.add_object(Object::Dictionary(output_intent));
75+
76+
// Add to catalog /OutputIntents
77+
let catalog = doc
78+
.get_object_mut(catalog_id)
79+
.map_err(|e| PdfError::Conversion(format!("catalog: {}", e)))?
80+
.as_dict_mut()
81+
.map_err(|_| PdfError::Conversion("catalog not dict".to_string()))?;
82+
83+
match catalog.get_mut(b"OutputIntents") {
84+
Ok(Object::Array(arr)) => {
85+
arr.push(Object::Reference(oi_id));
86+
}
87+
_ => {
88+
catalog.set(
89+
"OutputIntents",
90+
Object::Array(vec![Object::Reference(oi_id)]),
91+
);
92+
}
93+
}
94+
95+
actions.push("Added sRGB OutputIntent with ICC profile".to_string());
96+
Ok(actions)
97+
}

0 commit comments

Comments
 (0)