firecrawl · tspython · Apr 15, 2026 · Apr 15, 2026 · Apr 16, 2026 · Apr 27, 2026
diff --git a/napi/src/lib.rs b/napi/src/lib.rs
@@ -40,6 +40,8 @@ pub struct PdfResult {
     pub processing_time_ms: u32,
     /// 1-indexed page numbers that need OCR.
     pub pages_needing_ocr: Vec<u32>,
+    /// 1-indexed page numbers containing embedded images.
+    pub pages_with_images: Vec<u32>,
     pub title: Option<String>,
     pub confidence: f64,
     pub is_complex_layout: bool,
@@ -55,6 +57,8 @@ pub struct PdfClassification {
     pub page_count: u32,
     /// 0-indexed page numbers that need OCR.
     pub pages_needing_ocr: Vec<u32>,
+    /// 0-indexed page numbers containing embedded images.
+    pub pages_with_images: Vec<u32>,
     pub confidence: f64,
 }
 
@@ -119,6 +123,7 @@ fn to_napi_result(r: pdf_inspector::PdfProcessResult) -> PdfResult {
         page_count: r.page_count,
         processing_time_ms: r.processing_time_ms as u32,
         pages_needing_ocr: r.pages_needing_ocr,
+        pages_with_images: r.pages_with_images,
         title: r.title,
         confidence: r.confidence as f64,
         is_complex_layout: r.layout.is_complex,
@@ -208,6 +213,7 @@ pub fn classify_pdf(buffer: Buffer) -> Result<PdfClassification> {
             pdf_type: convert_pdf_type(result.pdf_type),
             page_count: result.page_count,
             pages_needing_ocr: result.pages_needing_ocr,
+            pages_with_images: result.pages_with_images,
             confidence: result.confidence as f64,
         })
     })

diff --git a/pdf_inspector.pyi b/pdf_inspector.pyi
@@ -4,12 +4,14 @@ from typing import Optional
 
 class PdfResult:
     """Result of processing a PDF file."""
+
     pdf_type: str
     """'text_based', 'scanned', 'image_based', or 'mixed'."""
     markdown: Optional[str]
     page_count: int
     processing_time_ms: int
     pages_needing_ocr: list[int]
+    pages_with_images: list[int]
     title: Optional[str]
     confidence: float
     is_complex_layout: bool
@@ -19,15 +21,19 @@ class PdfResult:
 
 class PdfClassification:
     """Lightweight PDF classification result."""
+
     pdf_type: str
     """'text_based', 'scanned', 'image_based', or 'mixed'."""
     page_count: int
     pages_needing_ocr: list[int]
     """0-indexed page numbers that need OCR."""
+    pages_with_images: list[int]
+    """0-indexed page numbers containing embedded images."""
     confidence: float
 
 class TextItem:
     """A positioned text item extracted from a PDF."""
+
     text: str
     x: float
     y: float
@@ -42,12 +48,14 @@ class TextItem:
 
 class RegionText:
     """Extracted text for a single region."""
+
     text: str
     needs_ocr: bool
     """True when the text should not be trusted."""
 
 class PageRegionTexts:
     """Extracted text for one page's regions."""
+
     page: int
     """0-indexed page number."""
     regions: list[RegionText]
@@ -106,11 +114,15 @@ def extract_text_bytes(data: bytes) -> str:
     """Extract plain text from PDF bytes."""
     ...
 
-def extract_text_with_positions(path: str, pages: Optional[list[int]] = None) -> list[TextItem]:
+def extract_text_with_positions(
+    path: str, pages: Optional[list[int]] = None
+) -> list[TextItem]:
     """Extract text with position information."""
     ...
 
-def extract_text_with_positions_bytes(data: bytes, pages: Optional[list[int]] = None) -> list[TextItem]:
+def extract_text_with_positions_bytes(
+    data: bytes, pages: Optional[list[int]] = None
+) -> list[TextItem]:
     """Extract text with position information from bytes."""
     ...
 

diff --git a/src/bin/detect_pdf.rs b/src/bin/detect_pdf.rs
@@ -105,6 +105,11 @@ fn run_analyze(pdf_path: &str, json_output: bool, start: Instant) {
                     .iter()
                     .map(|p| p.to_string())
                     .collect();
+                let image_pages: Vec<String> = result
+                    .pages_with_images
+                    .iter()
+                    .map(|p| p.to_string())
+                    .collect();
                 let table_pages: Vec<String> = result
                     .layout
                     .pages_with_tables
@@ -118,10 +123,11 @@ fn run_analyze(pdf_path: &str, json_output: bool, start: Instant) {
                     .map(|p| p.to_string())
                     .collect();
                 println!(
-                    r#"{{"pdf_type":"{}","page_count":{},"pages_needing_ocr":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"detection_time_ms":{}}}"#,
+                    r#"{{"pdf_type":"{}","page_count":{},"pages_needing_ocr":[{}],"pages_with_images":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"detection_time_ms":{}}}"#,
                     pdf_type_str(&result.pdf_type),
                     result.page_count,
                     ocr_pages.join(","),
+                    image_pages.join(","),
                     result.layout.is_complex,
                     table_pages.join(","),
                     col_pages.join(","),
@@ -145,6 +151,9 @@ fn run_analyze(pdf_path: &str, json_output: bool, start: Instant) {
                 if !result.pages_needing_ocr.is_empty() {
                     println!("Pages needing OCR: {:?}", result.pages_needing_ocr);
                 }
+                if !result.pages_with_images.is_empty() {
+                    println!("Pages with images: {:?}", result.pages_with_images);
+                }
                 println!();
                 if result.layout.is_complex {
                     println!("Layout: COMPLEX");
@@ -184,8 +193,13 @@ fn run_detect_only(pdf_path: &str, json_output: bool, start: Instant) {
                     .iter()
                     .map(|p| p.to_string())
                     .collect();
+                let image_pages: Vec<String> = result
+                    .pages_with_images
+                    .iter()
+                    .map(|p| p.to_string())
+                    .collect();
                 println!(
-                    r#"{{"pdf_type":"{}","page_count":{},"pages_sampled":{},"pages_with_text":{},"confidence":{:.2},"title":{},"ocr_recommended":{},"pages_needing_ocr":[{}],"detection_time_ms":{}}}"#,
+                    r#"{{"pdf_type":"{}","page_count":{},"pages_sampled":{},"pages_with_text":{},"confidence":{:.2},"title":{},"ocr_recommended":{},"pages_needing_ocr":[{}],"pages_with_images":[{}],"detection_time_ms":{}}}"#,
                     pdf_type_str(&result.pdf_type),
                     result.page_count,
                     result.pages_sampled,
@@ -198,6 +212,7 @@ fn run_detect_only(pdf_path: &str, json_output: bool, start: Instant) {
                         .unwrap_or_else(|| "null".to_string()),
                     result.ocr_recommended,
                     ocr_pages.join(","),
+                    image_pages.join(","),
                     elapsed.as_millis()
                 );
             } else {
@@ -233,6 +248,12 @@ fn run_detect_only(pdf_path: &str, json_output: bool, start: Instant) {
                         );
                     }
                 }
+                if !result.pages_with_images.is_empty() {
+                    println!(
+                        "Pages with images: {:?} (of {})",
+                        result.pages_with_images, result.page_count
+                    );
+                }
                 if let Some(title) = &result.title {
                     println!("Title: {}", title);
                 }

diff --git a/src/bin/pdf2md.rs b/src/bin/pdf2md.rs
@@ -165,6 +165,11 @@ fn main() {
                         .iter()
                         .map(|p| p.to_string())
                         .collect();
+                    let image_pages: Vec<String> = result
+                        .pages_with_images
+                        .iter()
+                        .map(|p| p.to_string())
+                        .collect();
                     let table_pages: Vec<String> = result
                         .layout
                         .pages_with_tables
@@ -178,11 +183,12 @@ fn main() {
                         .map(|p| p.to_string())
                         .collect();
                     println!(
-                        r#"{{"pdf_type":"{}","page_count":{},"processing_time_ms":{},"pages_needing_ocr":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"has_encoding_issues":{}}}"#,
+                        r#"{{"pdf_type":"{}","page_count":{},"processing_time_ms":{},"pages_needing_ocr":[{}],"pages_with_images":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"has_encoding_issues":{}}}"#,
                         pdf_type_str,
                         result.page_count,
                         result.processing_time_ms,
                         ocr_pages.join(","),
+                        image_pages.join(","),
                         result.layout.is_complex,
                         table_pages.join(","),
                         col_pages.join(","),
@@ -195,6 +201,9 @@ fn main() {
                     if !result.pages_needing_ocr.is_empty() {
                         eprintln!("Pages needing OCR: {:?}", result.pages_needing_ocr);
                     }
+                    if !result.pages_with_images.is_empty() {
+                        eprintln!("Pages with images: {:?}", result.pages_with_images);
+                    }
                     if analyze {
                         print_layout_info(&result.layout);
                     }
@@ -211,6 +220,11 @@ fn main() {
                     .iter()
                     .map(|p| p.to_string())
                     .collect();
+                let image_pages: Vec<String> = result
+                    .pages_with_images
+                    .iter()
+                    .map(|p| p.to_string())
+                    .collect();
                 let table_pages: Vec<String> = result
                     .layout
                     .pages_with_tables
@@ -224,7 +238,7 @@ fn main() {
                     .map(|p| p.to_string())
                     .collect();
                 println!(
-                    r#"{{"pdf_type":"{}","page_count":{},"has_text":{},"processing_time_ms":{},"markdown_length":{},"pages_needing_ocr":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"has_encoding_issues":{},"markdown":"{}"}}"#,
+                    r#"{{"pdf_type":"{}","page_count":{},"has_text":{},"processing_time_ms":{},"markdown_length":{},"pages_needing_ocr":[{}],"pages_with_images":[{}],"is_complex":{},"pages_with_tables":[{}],"pages_with_columns":[{}],"has_encoding_issues":{},"markdown":"{}"}}"#,
                     match result.pdf_type {
                         PdfType::TextBased => "text_based",
                         PdfType::Scanned => "scanned",
@@ -236,6 +250,7 @@ fn main() {
                     result.processing_time_ms,
                     result.markdown.as_ref().map(|m| m.len()).unwrap_or(0),
                     ocr_pages.join(","),
+                    image_pages.join(","),
                     result.layout.is_complex,
                     table_pages.join(","),
                     col_pages.join(","),
@@ -271,6 +286,9 @@ fn main() {
                         if !result.pages_needing_ocr.is_empty() {
                             eprintln!("Pages needing OCR: {:?}", result.pages_needing_ocr);
                         }
+                        if !result.pages_with_images.is_empty() {
+                            eprintln!("Pages with images: {:?}", result.pages_with_images);
+                        }
 
                         if let Some(markdown) = &result.markdown {
                             if let Some(output) = output_file {
@@ -297,6 +315,9 @@ fn main() {
                         );
                         eprintln!("Pages: {}", result.page_count);
                         eprintln!("Processing time: {}ms", result.processing_time_ms);
+                        if !result.pages_with_images.is_empty() {
+                            eprintln!("Pages with images: {:?}", result.pages_with_images);
+                        }
                         eprintln!();
                         eprintln!("This PDF requires OCR for text extraction.");
                         eprintln!("Consider using MinerU or similar OCR tool.");
@@ -315,6 +336,9 @@ fn main() {
                             } else {
                                 eprintln!("Pages needing OCR: {:?}", result.pages_needing_ocr);
                             }
+                            if !result.pages_with_images.is_empty() {
+                                eprintln!("Pages with images: {:?}", result.pages_with_images);
+                            }
                             eprintln!();
 
                             if let Some(output) = output_file {

diff --git a/src/detector.rs b/src/detector.rs
@@ -60,6 +60,9 @@ pub struct PdfTypeResult {
     /// 1-indexed page numbers that need OCR (image-only or insufficient text).
     /// Empty for TextBased. All pages for Scanned/ImageBased. Specific pages for Mixed.
     pub pages_needing_ocr: Vec<u32>,
+    /// 1-indexed page numbers containing embedded images.
+    /// Includes image XObjects and inline images.
+    pub pages_with_images: Vec<u32>,
 }
 
 /// Configuration for PDF type detection
@@ -382,7 +385,9 @@ pub(crate) fn detect_from_document(
                 let analysis = if let Some(cached) = analysis_cache.get(&page_num) {
                     cached.clone()
                 } else if let Some(&page_id) = pages.get(&page_num) {
-                    analyze_page_content(doc, page_id)
+                    let analysis = analyze_page_content(doc, page_id);
+                    analysis_cache.insert(page_num, analysis.clone());
+                    analysis
                 } else {
                     continue;
                 };
@@ -418,24 +423,34 @@ pub(crate) fn detect_from_document(
             pages_needing_ocr.push(page_num);
         }
     }
-    // Check uncached pages too (when not all pages were sampled).
+    // Analyze every uncached page: flag undecodable fonts for OCR, cache the result for Phase 4.
     // Use analyze_page_content to get usage-based font checks (P1 + P2 fix).
-    if pages_needing_ocr.len() < total_pages as usize {
-        for page_num in 1..=total_pages {
-            if analysis_cache.contains_key(&page_num) || pages_needing_ocr.contains(&page_num) {
-                continue;
-            }
-            if let Some(&page_id) = pages.get(&page_num) {
-                let analysis = analyze_page_content(doc, page_id);
-                if analysis.has_identity_h_no_tounicode || analysis.has_only_type3_fonts {
-                    pages_needing_ocr.push(page_num);
-                }
+    for page_num in 1..=total_pages {
+        if analysis_cache.contains_key(&page_num) {
+            continue;
+        }
+        if let Some(&page_id) = pages.get(&page_num) {
+            let analysis = analyze_page_content(doc, page_id);
+            let has_undecodable_fonts =
+                analysis.has_identity_h_no_tounicode || analysis.has_only_type3_fonts;
+            analysis_cache.insert(page_num, analysis);
+            if has_undecodable_fonts && !pages_needing_ocr.contains(&page_num) {
+                pages_needing_ocr.push(page_num);
             }
         }
     }
     pages_needing_ocr.sort();
     pages_needing_ocr.dedup();
 
+    // Phase 4: Collect the 1-indexed pages (ascending) whose cached analysis reports images.
+    let pages_with_images: Vec<u32> = (1..=total_pages)
+        .filter(|page_num| {
+            analysis_cache
+                .get(page_num)
+                .is_some_and(|analysis| analysis.has_images)
+        })
+        .collect();
+
     // Try to get title from metadata
     let title = get_document_title(doc);
 
@@ -448,6 +463,7 @@ pub(crate) fn detect_from_document(
         title,
         ocr_recommended,
         pages_needing_ocr,
+        pages_with_images,
     })
 }