From cc85057a0edcb5d0e4f3acd50684a9e1541a5aa7 Mon Sep 17 00:00:00 2001 From: Abimael Martell Date: Wed, 15 Apr 2026 10:51:43 -0700 Subject: [PATCH] feat: add extractFormulasInRegions for native formula text extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new region extraction endpoint that uses formula-specific quality checks instead of the generic text garbage detector. Formula text is legitimately symbol-heavy (Greek letters, math operators, subscripts), so the standard is_garbage_text check — which requires >50% alphanumeric characters — would false-positive on valid formula regions. The new is_formula_garbage validator catches actual decode failures: PUA characters from undecoded TeX extensible delimiters (>10%) and control characters from broken font encodings (>30%). Also refactors the shared page-extraction boilerplate into prepare_region_extraction, eliminating duplication across extract_text_in_regions_mem and extract_tables_in_regions_mem. Co-Authored-By: Claude Opus 4.6 (1M context) --- napi/src/lib.rs | 23 ++++ src/lib.rs | 325 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 257 insertions(+), 91 deletions(-) diff --git a/napi/src/lib.rs b/napi/src/lib.rs index 5689b5d..8924d65 100644 --- a/napi/src/lib.rs +++ b/napi/src/lib.rs @@ -317,6 +317,29 @@ pub fn extract_tables_in_regions( }) } +/// Extract formula text within bounding-box regions from a PDF. +/// +/// Like `extractTextInRegions` but uses formula-specific quality checks. +/// Formula text is legitimately symbol-heavy (Greek letters, math operators) +/// so the generic garbage-text check is relaxed. When the text decodes +/// cleanly, `needsOcr` is `false` — the caller can skip GPU OCR. +/// +/// Coordinates are PDF points with top-left origin. 
+#[napi] +pub fn extract_formulas_in_regions( + buffer: Buffer, + page_regions: Vec, +) -> Result> { + let bytes: Vec = buffer.to_vec(); + let regions = parse_page_regions(&page_regions); + + catch_panic("extract_formulas_in_regions", move || { + let results = pdf_inspector::extract_formulas_in_regions_mem(&bytes, ®ions) + .map_err(|e| to_napi_err(e, "extract_formulas_in_regions"))?; + Ok(to_page_region_texts(results)) + }) +} + /// Per-page markdown extraction result. #[napi(object)] pub struct PageMarkdownResult { diff --git a/src/lib.rs b/src/lib.rs index afc7161..aee2673 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -462,34 +462,28 @@ pub struct PageRegionResult { pub regions: Vec, } -/// Extract text within bounding-box regions from a PDF in memory. -/// -/// This is designed for hybrid OCR pipelines: a layout model detects regions -/// in a rendered page image, and this function extracts the PDF text that -/// falls within each region — avoiding GPU OCR for text-based pages. -/// -/// Each region result includes a `needs_ocr` flag that is set when extraction -/// quality is suspect (empty text, GID-encoded fonts, garbage/encoding issues). -/// -/// # Arguments -/// -/// * `buffer` — PDF file bytes -/// * `page_regions` — list of `(page_number_0indexed, Vec<[x1, y1, x2, y2]>)`. -/// Coordinates are in **PDF points** with **top-left origin** (matching typical -/// layout model output after coordinate conversion). -/// -/// # Returns +/// Shared page extraction state for region-based extraction functions. +struct RegionExtractionData { + items_by_page: HashMap>, + page_heights: HashMap, + #[allow(dead_code)] + gid_pages: HashSet, + page_thresholds: HashMap, + rotated_pages: HashSet, +} + +/// Extract text items, page heights, and metadata for the pages needed by region queries. /// -/// A `Vec` parallel to `page_regions`. 
-pub fn extract_text_in_regions_mem( +/// This is the shared boilerplate for `extract_text_in_regions_mem`, +/// `extract_tables_in_regions_mem`, and `extract_formulas_in_regions_mem`. +fn prepare_region_extraction( buffer: &[u8], page_regions: &[(u32, Vec<[f32; 4]>)], -) -> Result, PdfError> { +) -> Result { validate_pdf_bytes(buffer)?; let (doc, _page_count) = load_document_from_mem(buffer)?; let pages = doc.get_pages(); - // Build a set of pages we need to extract (1-indexed for lopdf) let needed_pages: HashSet = page_regions.iter().map(|(p, _)| p + 1).collect(); // Fast mode: skip expensive TrueType font fallback parsing. @@ -497,7 +491,6 @@ pub fn extract_text_in_regions_mem( // text, triggering needs_ocr=true → GPU OCR fallback in the pipeline. let font_cmaps = FontCMaps::from_doc_pages_fast(&doc, Some(&needed_pages)); - // Extract text items for needed pages only let mut items_by_page: HashMap> = HashMap::new(); let mut page_heights: HashMap = HashMap::new(); let mut gid_pages: HashSet = HashSet::new(); @@ -509,11 +502,9 @@ pub fn extract_text_in_regions_mem( continue; } - // Get page height from MediaBox for coordinate flip let height = get_page_height(&doc, page_id).unwrap_or(792.0); page_heights.insert(*page_num, height); - // Extract text items for this page let ((mut items, _rects, _lines), has_gid, coords_rotated) = extractor::content_stream::extract_page_text_items( &doc, @@ -535,20 +526,65 @@ pub fn extract_text_in_regions_mem( items_by_page.insert(*page_num, items); } - // For each page's regions, filter and assemble text + Ok(RegionExtractionData { + items_by_page, + page_heights, + gid_pages, + page_thresholds, + rotated_pages, + }) +} + +/// Resolve per-page coord space and adaptive threshold for a given page. 
+fn page_region_context( + data: &RegionExtractionData, + page_1idx: u32, +) -> (f32, f32, RegionCoordSpace) { + let page_h = data.page_heights.get(&page_1idx).copied().unwrap_or(792.0); + let adaptive_threshold = data + .page_thresholds + .get(&page_1idx) + .copied() + .unwrap_or(0.10); + let coords = if data.rotated_pages.contains(&page_1idx) { + RegionCoordSpace::Rotated90Ccw + } else { + RegionCoordSpace::Standard + }; + (page_h, adaptive_threshold, coords) +} + +/// Extract text within bounding-box regions from a PDF in memory. +/// +/// This is designed for hybrid OCR pipelines: a layout model detects regions +/// in a rendered page image, and this function extracts the PDF text that +/// falls within each region — avoiding GPU OCR for text-based pages. +/// +/// Each region result includes a `needs_ocr` flag that is set when extraction +/// quality is suspect (empty text, GID-encoded fonts, garbage/encoding issues). +/// +/// # Arguments +/// +/// * `buffer` — PDF file bytes +/// * `page_regions` — list of `(page_number_0indexed, Vec<[x1, y1, x2, y2]>)`. +/// Coordinates are in **PDF points** with **top-left origin** (matching typical +/// layout model output after coordinate conversion). +/// +/// # Returns +/// +/// A `Vec` parallel to `page_regions`. 
+pub fn extract_text_in_regions_mem( + buffer: &[u8], + page_regions: &[(u32, Vec<[f32; 4]>)], +) -> Result, PdfError> { + let data = prepare_region_extraction(buffer, page_regions)?; + let mut results = Vec::with_capacity(page_regions.len()); for (page_0idx, regions) in page_regions { let page_1idx = page_0idx + 1; - let items = items_by_page.get(&page_1idx); - let page_h = page_heights.get(&page_1idx).copied().unwrap_or(792.0); - let _page_has_gid = gid_pages.contains(&page_1idx); - let adaptive_threshold = page_thresholds.get(&page_1idx).copied().unwrap_or(0.10); - let coords = if rotated_pages.contains(&page_1idx) { - RegionCoordSpace::Rotated90Ccw - } else { - RegionCoordSpace::Standard - }; + let items = data.items_by_page.get(&page_1idx); + let (page_h, adaptive_threshold, coords) = page_region_context(&data, page_1idx); let mut page_results = Vec::with_capacity(regions.len()); @@ -602,74 +638,20 @@ pub fn extract_tables_in_regions_mem( buffer: &[u8], page_regions: &[(u32, Vec<[f32; 4]>)], ) -> Result, PdfError> { - validate_pdf_bytes(buffer)?; - let (doc, _page_count) = load_document_from_mem(buffer)?; - let pages = doc.get_pages(); - - let needed_pages: HashSet = page_regions.iter().map(|(p, _)| p + 1).collect(); - let font_cmaps = FontCMaps::from_doc_pages_fast(&doc, Some(&needed_pages)); - - let mut items_by_page: HashMap> = HashMap::new(); - let mut page_heights: HashMap = HashMap::new(); - let mut gid_pages: HashSet = HashSet::new(); - let mut page_thresholds: HashMap = HashMap::new(); - let mut rotated_pages: HashSet = HashSet::new(); - - for (page_num, &page_id) in pages.iter() { - if !needed_pages.contains(page_num) { - continue; - } - let height = get_page_height(&doc, page_id).unwrap_or(792.0); - page_heights.insert(*page_num, height); - - let ((mut items, _rects, _lines), has_gid, coords_rotated) = - extractor::content_stream::extract_page_text_items( - &doc, - page_id, - *page_num, - &font_cmaps, - false, - )?; - let threshold = 
text_utils::fix_letterspaced_items(&mut items); - if threshold > 0.10 { - page_thresholds.insert(*page_num, threshold); - } - if has_gid { - gid_pages.insert(*page_num); - } - if coords_rotated { - rotated_pages.insert(*page_num); - } - items_by_page.insert(*page_num, items); - } + let data = prepare_region_extraction(buffer, page_regions)?; let mut results = Vec::with_capacity(page_regions.len()); for (page_0idx, regions) in page_regions { let page_1idx = page_0idx + 1; - let items = items_by_page.get(&page_1idx); - let page_h = page_heights.get(&page_1idx).copied().unwrap_or(792.0); - let _page_has_gid = gid_pages.contains(&page_1idx); - let coords = if rotated_pages.contains(&page_1idx) { - RegionCoordSpace::Rotated90Ccw - } else { - RegionCoordSpace::Standard - }; + let items = data.items_by_page.get(&page_1idx); + let (page_h, _adaptive_threshold, coords) = page_region_context(&data, page_1idx); let mut page_results = Vec::with_capacity(regions.len()); for rect in regions { let [rx1, ry1, rx2, ry2] = *rect; - // Note: we intentionally DO NOT bail on page_has_gid here. - // The GID flag means some font on the page uses unresolvable - // glyph IDs, but that font may only appear in a logo or - // header — not in the table region. Instead we let the - // per-region text quality checks (is_garbage_text, is_cid_garbage, - // detect_encoding_issues) reject based on the actual extracted - // content. This avoids rejecting clean tables just because an - // unrelated decorative font on the same page is GID-encoded. - let matched: Vec = match items { Some(items) => { let bounds = region_bounds(rx1, ry1, rx2, ry2, page_h, coords); @@ -750,6 +732,70 @@ pub fn extract_tables_in_regions_mem( Ok(results) } +/// Extract formula text within bounding-box regions from a PDF in memory. +/// +/// Similar to [`extract_text_in_regions_mem`] but uses formula-specific quality +/// checks. 
Formula text is legitimately symbol-heavy (Greek letters, math +/// operators, subscripts) so the generic `is_garbage_text` check — which rejects +/// text with <50% alphanumeric characters — would false-positive on valid +/// formula regions. +/// +/// When the extracted text decodes cleanly, `needs_ocr` is `false` and the +/// caller can skip GPU OCR. When extraction fails (empty, PUA-heavy, encoding +/// issues), `needs_ocr` is `true` for OCR fallback. +pub fn extract_formulas_in_regions_mem( + buffer: &[u8], + page_regions: &[(u32, Vec<[f32; 4]>)], +) -> Result, PdfError> { + let data = prepare_region_extraction(buffer, page_regions)?; + + let mut results = Vec::with_capacity(page_regions.len()); + + for (page_0idx, regions) in page_regions { + let page_1idx = page_0idx + 1; + let items = data.items_by_page.get(&page_1idx); + let (page_h, adaptive_threshold, coords) = page_region_context(&data, page_1idx); + + let mut page_results = Vec::with_capacity(regions.len()); + + for rect in regions { + let [rx1, ry1, rx2, ry2] = *rect; + + let text = match items { + Some(items) => collect_text_in_region_with_options( + items, + rx1, + ry1, + rx2, + ry2, + page_h, + coords, + adaptive_threshold, + ), + None => String::new(), + }; + + // Formula-specific quality checks: + // - Skip is_garbage_text (formulas are legitimately symbol-heavy) + // - Keep CID/encoding checks (broken font decode is still broken) + // - Add PUA check (extensible delimiters that didn't decode) + let needs_ocr = text.trim().is_empty() + || is_cid_garbage(&text) + || detect_encoding_issues(&text) + || is_formula_garbage(&text); + + page_results.push(RegionText { text, needs_ocr }); + } + + results.push(PageRegionResult { + page: *page_0idx, + regions: page_results, + }); + } + + Ok(results) +} + /// Get page height in points from MediaBox. 
/// Detect formula text that is unlikely to be usable despite passing generic checks.
///
/// Formula text (Greek letters, math operators, variables) is legitimately
/// symbol-heavy, so `is_garbage_text` would false-positive. This check instead
/// looks at the non-whitespace characters of `text` and counts:
///
/// 1. **Private Use Area (PUA) characters** — TeX extensible delimiter glyphs
///    (large brackets from CMEX fonts) often map to PUA code points when the
///    ToUnicode CMap is missing. Both the BMP range U+E000–F8FF and the
///    supplementary private-use planes (U+F0000–FFFFD, U+100000–10FFFD) are
///    counted. >10% PUA means significant undecoded content.
///
/// 2. **Control characters** — any non-whitespace Unicode control character
///    (C0 controls, DEL, and C1 controls U+0080–009F) indicates broken font
///    encoding, not formula content. >30% is rejected.
///
/// Returns `true` when the text should be treated as garbage.
///
/// With fewer than three non-whitespace characters the ratios are not
/// meaningful, so short text is rejected only when *every* character is PUA or
/// control (for example a lone undecoded delimiter). Empty or whitespace-only
/// text returns `false`.
fn is_formula_garbage(text: &str) -> bool {
    let mut total = 0usize;
    let mut pua = 0usize;
    let mut control = 0usize;

    // Whitespace (including whitespace control characters such as tab or
    // newline) never counts toward any of the totals.
    for ch in text.chars().filter(|c| !c.is_whitespace()) {
        total += 1;
        let is_private_use = matches!(
            ch,
            '\u{E000}'..='\u{F8FF}'
                | '\u{F0000}'..='\u{FFFFD}'
                | '\u{100000}'..='\u{10FFFD}'
        );
        if is_private_use {
            pua += 1;
        } else if ch.is_control() {
            // `char::is_control` covers C0, DEL and the C1 block.
            control += 1;
        }
    }

    if total < 3 {
        // Too few characters for a percentage; reject only when nothing
        // usable was decoded at all.
        return total > 0 && pua + control == total;
    }

    // >10% PUA — significant undecoded extensible delimiters
    // >30% control chars — broken encoding
    pua * 10 > total || control * 10 > total * 3
}
@@ -2064,4 +2153,58 @@ mod tests { "Valid Japanese text should not be flagged as garbage" ); } + + #[test] + fn test_is_formula_garbage_accepts_math_text() { + // Greek letters, math operators, variables — typical formula text + let formula = "Φ(ν) = ∫ ∞ dze −it r1 − iνr2 sinh z"; + assert!( + !is_formula_garbage(formula), + "Valid formula text should not be flagged as garbage" + ); + + // Dense operator text + let operators = "α + β − γ × δ ÷ ε ≤ ζ ≥ η ≈ θ ≠ ι ± κ"; + assert!( + !is_formula_garbage(operators), + "Math operator text should not be flagged as garbage" + ); + + // Short formula (e.g. single equation variable) + let short = "αβ"; + assert!( + !is_formula_garbage(short), + "Short formula text should not be flagged" + ); + } + + #[test] + fn test_is_formula_garbage_rejects_pua_heavy() { + // Simulates extensible delimiters from CMEX fonts mapping to PUA + let pua_heavy = "x \u{F8EB} \u{F8EC} \u{F8ED} \u{F8F6} \u{F8F7} \u{F8F8} y"; + assert!( + is_formula_garbage(pua_heavy), + "PUA-heavy text should be flagged as formula garbage" + ); + } + + #[test] + fn test_is_formula_garbage_rejects_control_chars() { + // Control characters indicate broken encoding + let control_heavy = "a\x01b\x02c\x03d\x04e\x05f\x06g\x07h\x08i"; + assert!( + is_formula_garbage(control_heavy), + "Control-char-heavy text should be flagged as formula garbage" + ); + } + + #[test] + fn test_is_formula_garbage_accepts_few_pua() { + // A few PUA chars among many valid chars is fine (<10% threshold) + let mostly_good = "Φ(ν) = ∫ dze r1 − iνr2 sinh z α β γ δ ε ζ η θ \u{F8EB}"; + assert!( + !is_formula_garbage(mostly_good), + "Mostly-good text with rare PUA should pass" + ); + } }