Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions napi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ pub struct PdfResult {
pub processing_time_ms: u32,
/// 1-indexed page numbers that need OCR.
pub pages_needing_ocr: Vec<u32>,
/// 1-indexed page numbers containing embedded images.
pub pages_with_images: Vec<u32>,
pub title: Option<String>,
pub confidence: f64,
pub is_complex_layout: bool,
Expand All @@ -55,6 +57,8 @@ pub struct PdfClassification {
pub page_count: u32,
/// 0-indexed page numbers that need OCR.
pub pages_needing_ocr: Vec<u32>,
/// 0-indexed page numbers containing embedded images.
pub pages_with_images: Vec<u32>,
pub confidence: f64,
}

Expand Down Expand Up @@ -119,6 +123,7 @@ fn to_napi_result(r: pdf_inspector::PdfProcessResult) -> PdfResult {
page_count: r.page_count,
processing_time_ms: r.processing_time_ms as u32,
pages_needing_ocr: r.pages_needing_ocr,
pages_with_images: r.pages_with_images,
title: r.title,
confidence: r.confidence as f64,
is_complex_layout: r.layout.is_complex,
Expand Down Expand Up @@ -208,6 +213,7 @@ pub fn classify_pdf(buffer: Buffer) -> Result<PdfClassification> {
pdf_type: convert_pdf_type(result.pdf_type),
page_count: result.page_count,
pages_needing_ocr: result.pages_needing_ocr,
pages_with_images: result.pages_with_images,
confidence: result.confidence as f64,
})
})
Expand Down
16 changes: 14 additions & 2 deletions pdf_inspector.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ from typing import Optional

class PdfResult:
"""Result of processing a PDF file."""

pdf_type: str
"""'text_based', 'scanned', 'image_based', or 'mixed'."""
markdown: Optional[str]
page_count: int
processing_time_ms: int
pages_needing_ocr: list[int]
pages_with_images: list[int]
title: Optional[str]
confidence: float
is_complex_layout: bool
Expand All @@ -19,15 +21,19 @@ class PdfResult:

class PdfClassification:
"""Lightweight PDF classification result."""

pdf_type: str
"""'text_based', 'scanned', 'image_based', or 'mixed'."""
page_count: int
pages_needing_ocr: list[int]
"""0-indexed page numbers that need OCR."""
pages_with_images: list[int]
"""0-indexed page numbers containing embedded images."""
confidence: float

class TextItem:
"""A positioned text item extracted from a PDF."""

text: str
x: float
y: float
Expand All @@ -42,12 +48,14 @@ class TextItem:

class RegionText:
"""Extracted text for a single region."""

text: str
needs_ocr: bool
"""True when the text should not be trusted."""

class PageRegionTexts:
"""Extracted text for one page's regions."""

page: int
"""0-indexed page number."""
regions: list[RegionText]
Expand Down Expand Up @@ -106,11 +114,15 @@ def extract_text_bytes(data: bytes) -> str:
"""Extract plain text from PDF bytes."""
...

def extract_text_with_positions(path: str, pages: Optional[list[int]] = None) -> list[TextItem]:
def extract_text_with_positions(
path: str, pages: Optional[list[int]] = None
) -> list[TextItem]:
"""Extract text with position information."""
...

def extract_text_with_positions_bytes(data: bytes, pages: Optional[list[int]] = None) -> list[TextItem]:
def extract_text_with_positions_bytes(
data: bytes, pages: Optional[list[int]] = None
) -> list[TextItem]:
"""Extract text with position information from bytes."""
...

Expand Down
25 changes: 23 additions & 2 deletions src/bin/detect_pdf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ fn run_analyze(pdf_path: &str, json_output: bool, start: Instant) {
.iter()
.map(|p| p.to_string())
.collect();
let image_pages: Vec<String> = result
.pages_with_images
.iter()
.map(|p| p.to_string())
.collect();
let table_pages: Vec<String> = result
.layout
.pages_with_tables
Expand All @@ -118,10 +123,11 @@ fn run_analyze(pdf_path: &str, json_output: bool, start: Instant) {
.map(|p| p.to_string())
.collect();
println!(
r#"{{"pdf_type":"{}","page_count":{},"pages_needing_ocr":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"detection_time_ms":{}}}"#,
r#"{{"pdf_type":"{}","page_count":{},"pages_needing_ocr":[{}],"pages_with_images":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"detection_time_ms":{}}}"#,
pdf_type_str(&result.pdf_type),
result.page_count,
ocr_pages.join(","),
image_pages.join(","),
result.layout.is_complex,
table_pages.join(","),
col_pages.join(","),
Expand All @@ -145,6 +151,9 @@ fn run_analyze(pdf_path: &str, json_output: bool, start: Instant) {
if !result.pages_needing_ocr.is_empty() {
println!("Pages needing OCR: {:?}", result.pages_needing_ocr);
}
if !result.pages_with_images.is_empty() {
println!("Pages with images: {:?}", result.pages_with_images);
}
println!();
if result.layout.is_complex {
println!("Layout: COMPLEX");
Expand Down Expand Up @@ -184,8 +193,13 @@ fn run_detect_only(pdf_path: &str, json_output: bool, start: Instant) {
.iter()
.map(|p| p.to_string())
.collect();
let image_pages: Vec<String> = result
.pages_with_images
.iter()
.map(|p| p.to_string())
.collect();
println!(
r#"{{"pdf_type":"{}","page_count":{},"pages_sampled":{},"pages_with_text":{},"confidence":{:.2},"title":{},"ocr_recommended":{},"pages_needing_ocr":[{}],"detection_time_ms":{}}}"#,
r#"{{"pdf_type":"{}","page_count":{},"pages_sampled":{},"pages_with_text":{},"confidence":{:.2},"title":{},"ocr_recommended":{},"pages_needing_ocr":[{}],"pages_with_images":[{}],"detection_time_ms":{}}}"#,
pdf_type_str(&result.pdf_type),
result.page_count,
result.pages_sampled,
Expand All @@ -198,6 +212,7 @@ fn run_detect_only(pdf_path: &str, json_output: bool, start: Instant) {
.unwrap_or_else(|| "null".to_string()),
result.ocr_recommended,
ocr_pages.join(","),
image_pages.join(","),
elapsed.as_millis()
);
} else {
Expand Down Expand Up @@ -233,6 +248,12 @@ fn run_detect_only(pdf_path: &str, json_output: bool, start: Instant) {
);
}
}
if !result.pages_with_images.is_empty() {
println!(
"Pages with images: {:?} (of {})",
result.pages_with_images, result.page_count
);
}
if let Some(title) = &result.title {
println!("Title: {}", title);
}
Expand Down
28 changes: 26 additions & 2 deletions src/bin/pdf2md.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,11 @@ fn main() {
.iter()
.map(|p| p.to_string())
.collect();
let image_pages: Vec<String> = result
.pages_with_images
.iter()
.map(|p| p.to_string())
.collect();
let table_pages: Vec<String> = result
.layout
.pages_with_tables
Expand All @@ -178,11 +183,12 @@ fn main() {
.map(|p| p.to_string())
.collect();
println!(
r#"{{"pdf_type":"{}","page_count":{},"processing_time_ms":{},"pages_needing_ocr":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"has_encoding_issues":{}}}"#,
r#"{{"pdf_type":"{}","page_count":{},"processing_time_ms":{},"pages_needing_ocr":[{}],"pages_with_images":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"has_encoding_issues":{}}}"#,
pdf_type_str,
result.page_count,
result.processing_time_ms,
ocr_pages.join(","),
image_pages.join(","),
result.layout.is_complex,
table_pages.join(","),
col_pages.join(","),
Expand All @@ -195,6 +201,9 @@ fn main() {
if !result.pages_needing_ocr.is_empty() {
eprintln!("Pages needing OCR: {:?}", result.pages_needing_ocr);
}
if !result.pages_with_images.is_empty() {
eprintln!("Pages with images: {:?}", result.pages_with_images);
}
if analyze {
print_layout_info(&result.layout);
}
Expand All @@ -211,6 +220,11 @@ fn main() {
.iter()
.map(|p| p.to_string())
.collect();
let image_pages: Vec<String> = result
.pages_with_images
.iter()
.map(|p| p.to_string())
.collect();
let table_pages: Vec<String> = result
.layout
.pages_with_tables
Expand All @@ -224,7 +238,7 @@ fn main() {
.map(|p| p.to_string())
.collect();
println!(
r#"{{"pdf_type":"{}","page_count":{},"has_text":{},"processing_time_ms":{},"markdown_length":{},"pages_needing_ocr":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"has_encoding_issues":{},"markdown":"{}"}}"#,
r#"{{"pdf_type":"{}","page_count":{},"has_text":{},"processing_time_ms":{},"markdown_length":{},"pages_needing_ocr":[{}],"pages_with_images":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"has_encoding_issues":{},"markdown":"{}"}}"#,
match result.pdf_type {
PdfType::TextBased => "text_based",
PdfType::Scanned => "scanned",
Expand All @@ -236,6 +250,7 @@ fn main() {
result.processing_time_ms,
result.markdown.as_ref().map(|m| m.len()).unwrap_or(0),
ocr_pages.join(","),
image_pages.join(","),
result.layout.is_complex,
table_pages.join(","),
col_pages.join(","),
Expand Down Expand Up @@ -271,6 +286,9 @@ fn main() {
if !result.pages_needing_ocr.is_empty() {
eprintln!("Pages needing OCR: {:?}", result.pages_needing_ocr);
}
if !result.pages_with_images.is_empty() {
eprintln!("Pages with images: {:?}", result.pages_with_images);
}

if let Some(markdown) = &result.markdown {
if let Some(output) = output_file {
Expand All @@ -297,6 +315,9 @@ fn main() {
);
eprintln!("Pages: {}", result.page_count);
eprintln!("Processing time: {}ms", result.processing_time_ms);
if !result.pages_with_images.is_empty() {
eprintln!("Pages with images: {:?}", result.pages_with_images);
}
eprintln!();
eprintln!("This PDF requires OCR for text extraction.");
eprintln!("Consider using MinerU or similar OCR tool.");
Expand All @@ -315,6 +336,9 @@ fn main() {
} else {
eprintln!("Pages needing OCR: {:?}", result.pages_needing_ocr);
}
if !result.pages_with_images.is_empty() {
eprintln!("Pages with images: {:?}", result.pages_with_images);
}
eprintln!();

if let Some(output) = output_file {
Expand Down
40 changes: 28 additions & 12 deletions src/detector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ pub struct PdfTypeResult {
/// 1-indexed page numbers that need OCR (image-only or insufficient text).
/// Empty for TextBased. All pages for Scanned/ImageBased. Specific pages for Mixed.
pub pages_needing_ocr: Vec<u32>,
/// 1-indexed page numbers containing embedded images.
/// Includes image XObjects and inline images.
pub pages_with_images: Vec<u32>,
}

/// Configuration for PDF type detection
Expand Down Expand Up @@ -382,7 +385,9 @@ pub(crate) fn detect_from_document(
let analysis = if let Some(cached) = analysis_cache.get(&page_num) {
cached.clone()
} else if let Some(&page_id) = pages.get(&page_num) {
analyze_page_content(doc, page_id)
let analysis = analyze_page_content(doc, page_id);
analysis_cache.insert(page_num, analysis.clone());
analysis
} else {
continue;
};
Expand Down Expand Up @@ -418,24 +423,34 @@ pub(crate) fn detect_from_document(
pages_needing_ocr.push(page_num);
}
}
// Check uncached pages too (when not all pages were sampled).
// Check uncached pages too and populate the cache for later passes.
// Use analyze_page_content to get usage-based font checks (P1 + P2 fix).
if pages_needing_ocr.len() < total_pages as usize {
for page_num in 1..=total_pages {
if analysis_cache.contains_key(&page_num) || pages_needing_ocr.contains(&page_num) {
continue;
}
if let Some(&page_id) = pages.get(&page_num) {
let analysis = analyze_page_content(doc, page_id);
if analysis.has_identity_h_no_tounicode || analysis.has_only_type3_fonts {
pages_needing_ocr.push(page_num);
}
for page_num in 1..=total_pages {
if analysis_cache.contains_key(&page_num) {
continue;
}
if let Some(&page_id) = pages.get(&page_num) {
let analysis = analyze_page_content(doc, page_id);
let has_undecodable_fonts =
analysis.has_identity_h_no_tounicode || analysis.has_only_type3_fonts;
analysis_cache.insert(page_num, analysis);
if has_undecodable_fonts && !pages_needing_ocr.contains(&page_num) {
pages_needing_ocr.push(page_num);
}
}
}
pages_needing_ocr.sort();
pages_needing_ocr.dedup();

// Phase 4: Build per-page image list from cached analyses.
let pages_with_images: Vec<u32> = (1..=total_pages)
.filter(|page_num| {
analysis_cache
.get(page_num)
.is_some_and(|analysis| analysis.has_images)
})
.collect();

// Try to get title from metadata
let title = get_document_title(doc);

Expand All @@ -448,6 +463,7 @@ pub(crate) fn detect_from_document(
title,
ocr_recommended,
pages_needing_ocr,
pages_with_images,
})
}

Expand Down
Loading