Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions crates/paperjam-docx/src/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,29 @@ impl DocxDocument {
}
}

/// Per-entry decompressed byte limit (16 MiB) when reading the DOCX archive.
/// `docProps/core.xml` is a tiny metadata file; anything claiming more than
/// this is almost certainly malicious.
const MAX_ENTRY_BYTES: u64 = 16 * 1024 * 1024;

/// Read `docProps/core.xml` from the DOCX ZIP archive with a size cap.
///
/// Returns `None` when the archive cannot be opened, the entry is missing,
/// the entry is larger than `MAX_ENTRY_BYTES` (by its declared size or by the
/// number of bytes actually produced), or the contents cannot be read into a
/// UTF-8 `String`.
fn read_core_xml_from_zip(bytes: &[u8]) -> Option<String> {
    let cursor = Cursor::new(bytes);
    let mut archive = zip::ZipArchive::new(cursor).ok()?;
    let mut file = archive.by_name("docProps/core.xml").ok()?;

    // Cheap rejection based on the size recorded in the archive metadata.
    if file.size() > MAX_ENTRY_BYTES {
        return None;
    }

    // The bounded read below must be the ONLY read of the entry: an earlier
    // unbounded `read_to_string` would pull the whole stream into memory and
    // leave nothing for the capped reader to measure. Reading one byte past
    // the cap distinguishes "exactly at the limit" from "over the limit".
    let mut contents = String::new();
    let read = (&mut file)
        .take(MAX_ENTRY_BYTES + 1)
        .read_to_string(&mut contents)
        .ok()?;
    if read as u64 > MAX_ENTRY_BYTES {
        return None;
    }
    Some(contents)
}

Expand Down
3 changes: 3 additions & 0 deletions crates/paperjam-epub/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ pub enum EpubError {

#[error("invalid EPUB structure: {0}")]
InvalidStructure(String),

#[error("EPUB entry `{name}` is too large ({size} bytes, limit {limit})")]
EntryTooLarge { name: String, size: u64, limit: u64 },
}

impl From<quick_xml::Error> for EpubError {
Expand Down
1 change: 1 addition & 0 deletions crates/paperjam-epub/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ mod image;
mod markdown;
mod metadata;
pub mod parser;
mod safe_read;
mod structure;
mod table;
mod text;
Expand Down
38 changes: 7 additions & 31 deletions crates/paperjam-epub/src/parser.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use std::collections::HashMap;
use std::io::Read;

use quick_xml::events::Event;
use quick_xml::Reader;

use crate::document::{ChapterData, EpubDocument, OpfMetadata, TocEntry};
use crate::error::{EpubError, Result};
use crate::safe_read::{read_entry_bytes, read_entry_string};
use crate::toc;

/// Parse an EPUB document from raw bytes.
Expand All @@ -14,15 +14,15 @@ pub fn parse_epub(bytes: &[u8]) -> Result<EpubDocument> {
let mut archive = zip::ZipArchive::new(cursor)?;

// 1. Find the OPF path from container.xml.
let container_xml = read_zip_entry_string(&mut archive, "META-INF/container.xml")?;
let container_xml = read_entry_string(&mut archive, "META-INF/container.xml")?;
let opf_path = parse_container_xml(&container_xml)?;
let opf_base_dir = opf_path
.rsplit_once('/')
.map(|(d, _)| d.to_string())
.unwrap_or_default();

// 2. Parse OPF: metadata, manifest, spine.
let opf_xml = read_zip_entry_string(&mut archive, &opf_path)?;
let opf_xml = read_entry_string(&mut archive, &opf_path)?;
let (opf_metadata, manifest, spine) = parse_opf(&opf_xml)?;

// 3. Parse TOC.
Expand All @@ -33,7 +33,7 @@ pub fn parse_epub(bytes: &[u8]) -> Result<EpubDocument> {
for (idx, spine_idref) in spine.iter().enumerate() {
if let Some(href) = manifest.get(spine_idref) {
let full_path = resolve_path(&opf_base_dir, href);
match read_zip_entry_bytes(&mut archive, &full_path) {
match read_entry_bytes(&mut archive, &full_path) {
Ok(html_bytes) => {
let html_doc = paperjam_html::HtmlDocument::from_bytes(&html_bytes)?;
let title = find_toc_title(&toc_entries, href);
Expand Down Expand Up @@ -67,30 +67,6 @@ pub fn parse_epub(bytes: &[u8]) -> Result<EpubDocument> {
// Helpers
// ---------------------------------------------------------------------------

fn read_zip_entry_string(
archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
name: &str,
) -> Result<String> {
let mut file = archive
.by_name(name)
.map_err(|_| EpubError::MissingEntry(name.to_string()))?;
let mut buf = String::new();
file.read_to_string(&mut buf)?;
Ok(buf)
}

fn read_zip_entry_bytes(
archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
name: &str,
) -> Result<Vec<u8>> {
let mut file = archive
.by_name(name)
.map_err(|_| EpubError::MissingEntry(name.to_string()))?;
let mut buf = Vec::new();
file.read_to_end(&mut buf)?;
Ok(buf)
}

fn resolve_path(base_dir: &str, href: &str) -> String {
if base_dir.is_empty() {
href.to_string()
Expand Down Expand Up @@ -284,7 +260,7 @@ fn parse_toc_from_manifest(
for (id, href) in manifest {
if id == "ncx" || href.ends_with(".ncx") {
let full_path = resolve_path(opf_base_dir, href);
if let Ok(xml) = read_zip_entry_string(archive, &full_path) {
if let Ok(xml) = read_entry_string(archive, &full_path) {
let entries = toc::parse_ncx(&xml);
if !entries.is_empty() {
return entries;
Expand All @@ -297,7 +273,7 @@ fn parse_toc_from_manifest(
for href in manifest.values() {
if href.contains("nav") && (href.ends_with(".xhtml") || href.ends_with(".html")) {
let full_path = resolve_path(opf_base_dir, href);
if let Ok(html_bytes) = read_zip_entry_bytes(archive, &full_path) {
if let Ok(html_bytes) = read_entry_bytes(archive, &full_path) {
let entries = toc::parse_nav_xhtml(&html_bytes);
if !entries.is_empty() {
return entries;
Expand All @@ -322,7 +298,7 @@ fn collect_images(
let lower = href.to_ascii_lowercase();
if image_extensions.iter().any(|ext| lower.ends_with(ext)) {
let full_path = resolve_path(opf_base_dir, href);
if let Ok(data) = read_zip_entry_bytes(archive, &full_path) {
if let Ok(data) = read_entry_bytes(archive, &full_path) {
images.push((href.clone(), data));
}
}
Expand Down
115 changes: 115 additions & 0 deletions crates/paperjam-epub/src/safe_read.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
//! Bounded readers for ZIP entries to mitigate decompression-bomb attacks.
//!
//! EPUB files are ZIP archives; malicious inputs can declare tiny compressed
//! sizes that expand to gigabytes. We cap the decompressed length on every
//! entry we pull out of the archive.

use std::io::Read;

use crate::error::{EpubError, Result};

/// Per-entry decompressed byte limit (100 MiB). Normal EPUB entries (XHTML
/// chapters, cover images, fonts) are comfortably under this. An entry that
/// exceeds it is either pathological or malicious.
pub const MAX_ENTRY_BYTES: u64 = 100 * 1024 * 1024;

/// Read the archive entry `name` into a `String`, refusing entries whose
/// decompressed length exceeds [`MAX_ENTRY_BYTES`].
///
/// # Errors
///
/// * `EpubError::MissingEntry` if the archive lookup for `name` fails.
/// * `EpubError::EntryTooLarge` if either the declared size or the number of
///   bytes actually read is above the limit.
/// * Any error produced by the underlying read (propagated via `?`).
pub fn read_entry_string(
    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
    name: &str,
) -> Result<String> {
    // Both size checks report the same error shape; build it in one place.
    let too_large = |size: u64| EpubError::EntryTooLarge {
        name: name.to_string(),
        size,
        limit: MAX_ENTRY_BYTES,
    };

    let mut entry = match archive.by_name(name) {
        Ok(found) => found,
        Err(_) => return Err(EpubError::MissingEntry(name.to_string())),
    };

    // First gate: the size the archive itself reports for this entry.
    let header_size = entry.size();
    if header_size > MAX_ENTRY_BYTES {
        return Err(too_large(header_size));
    }

    // Second gate: bound the stream itself. Allowing one byte beyond the cap
    // is what lets us tell "exactly at the limit" apart from "over it".
    let mut text = String::new();
    let mut capped = entry.by_ref().take(MAX_ENTRY_BYTES + 1);
    let consumed = capped.read_to_string(&mut text)? as u64;
    if consumed > MAX_ENTRY_BYTES {
        return Err(too_large(consumed));
    }

    Ok(text)
}

/// Read the archive entry `name` into a byte vector, refusing entries whose
/// decompressed length exceeds [`MAX_ENTRY_BYTES`].
///
/// # Errors
///
/// * `EpubError::MissingEntry` if the archive lookup for `name` fails.
/// * `EpubError::EntryTooLarge` if either the declared size or the number of
///   bytes actually read is above the limit.
/// * Any error produced by the underlying read (propagated via `?`).
pub fn read_entry_bytes(
    archive: &mut zip::ZipArchive<std::io::Cursor<&[u8]>>,
    name: &str,
) -> Result<Vec<u8>> {
    // Both size checks report the same error shape; build it in one place.
    let too_large = |size: u64| EpubError::EntryTooLarge {
        name: name.to_string(),
        size,
        limit: MAX_ENTRY_BYTES,
    };

    let mut entry = match archive.by_name(name) {
        Ok(found) => found,
        Err(_) => return Err(EpubError::MissingEntry(name.to_string())),
    };

    // First gate: the size the archive itself reports for this entry.
    let header_size = entry.size();
    if header_size > MAX_ENTRY_BYTES {
        return Err(too_large(header_size));
    }

    // Second gate: bound the stream itself. Allowing one byte beyond the cap
    // is what lets us tell "exactly at the limit" apart from "over it".
    let mut data = Vec::new();
    let mut capped = entry.by_ref().take(MAX_ENTRY_BYTES + 1);
    let consumed = capped.read_to_end(&mut data)? as u64;
    if consumed > MAX_ENTRY_BYTES {
        return Err(too_large(consumed));
    }

    Ok(data)
}

#[cfg(test)]
mod tests {
    use super::*;

    use std::io::{Cursor, Write};

    /// Produce the bytes of an in-memory ZIP archive holding exactly one
    /// entry called `name` with the given `contents`.
    fn build_archive_with_entry(name: &str, contents: &[u8]) -> Vec<u8> {
        let mut archive_bytes = Vec::new();
        {
            // Scope the writer so its borrow of `archive_bytes` ends before
            // the vector is returned.
            let mut writer = zip::ZipWriter::new(Cursor::new(&mut archive_bytes));
            let options = zip::write::SimpleFileOptions::default();
            writer.start_file::<_, ()>(name, options).unwrap();
            writer.write_all(contents).unwrap();
            writer.finish().unwrap();
        }
        archive_bytes
    }

    #[test]
    fn small_entry_reads_normally() {
        let raw = build_archive_with_entry("hello.txt", b"hello world");
        let mut archive = zip::ZipArchive::new(Cursor::new(raw.as_slice())).unwrap();

        let text = read_entry_string(&mut archive, "hello.txt").unwrap();

        assert_eq!(text, "hello world");
    }

    #[test]
    fn oversized_entry_is_rejected_by_declared_size() {
        // A single entry that is one byte over the per-entry cap.
        let payload = vec![b'a'; (MAX_ENTRY_BYTES as usize) + 1];
        let raw = build_archive_with_entry("big.bin", &payload);
        let mut archive = zip::ZipArchive::new(Cursor::new(raw.as_slice())).unwrap();

        let outcome = read_entry_bytes(&mut archive, "big.bin");

        assert!(matches!(
            outcome.unwrap_err(),
            EpubError::EntryTooLarge { .. }
        ));
    }
}
3 changes: 3 additions & 0 deletions crates/paperjam-mcp/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ pub enum McpError {
#[error("pipeline error: {0}")]
Pipeline(#[from] paperjam_pipeline::PipelineError),

#[error("path `{0}` escapes the working directory; pass --allow-absolute-paths to the server to opt out of sandboxing")]
PathEscapesSandbox(String),

#[error("{0}")]
Other(String),
}
Loading
Loading