Improve PaddleOCR efficiency by skipping conversion to images

dilithjay · web-flow · commit 35156ddd28d3 · 2025-11-13T07:36:11.000-04:00
diff --git a/lexoid/api.py b/lexoid/api.py
@@ -29,13 +29,17 @@
     LATEX_USER_PROMPT,
 )
 from lexoid.core.utils import (
+    DEFAULT_LLM,
+    DEFAULT_STATIC_FRAMEWORK,
     bbox_router,
     create_sub_pdf,
     download_file,
+    get_file_type,
     get_webpage_soup,
     is_supported_file_type,
     is_supported_url_file_type,
     recursive_read_html,
+    resize_image_if_needed,
     router,
     split_pdf,
 )
@@ -136,7 +140,7 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
     return_bboxes = kwargs.get("return_bboxes", False)
     has_bboxes = bool(result["segments"][0].get("bboxes"))
     bbox_framework = kwargs.get("bbox_framework", None)
-    framework = kwargs.get("framework", None)
+    framework = kwargs.get("framework", DEFAULT_STATIC_FRAMEWORK)
     bbox_framework_different = bbox_framework and bbox_framework != framework
     if return_bboxes and (not has_bboxes or bbox_framework_different):
         logger.debug("Extracting bounding boxes...")
@@ -263,6 +267,13 @@ def parse(
             f"Unsupported file type {os.path.splitext(path)[1]}"
         )
 
+        if "image" in get_file_type(path):
+            # Resize image if too large
+            max_dimension = kwargs.get("max_image_dimension", 1500)
+            path = resize_image_if_needed(
+                path, max_dimension=max_dimension, tmpdir=temp_dir
+            )
+
         if as_pdf and not path.lower().endswith(".pdf"):
             pdf_path = os.path.join(temp_dir, "converted.pdf")
             logger.debug("Converting file to PDF")
@@ -328,9 +339,7 @@ def parse(
             else:
                 raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
 
-            api_cost = api_cost_mapping.get(
-                kwargs.get("model", "gemini-2.0-flash"), None
-            )
+            api_cost = api_cost_mapping.get(kwargs.get("model", DEFAULT_LLM), None)
             if api_cost:
                 token_usage = result["token_usage"]
                 token_cost = {
diff --git a/lexoid/core/conversion_utils.py b/lexoid/core/conversion_utils.py
@@ -22,12 +22,23 @@
 
 
 def convert_pdf_page_to_base64(
-    pdf_document: pdfium.PdfDocument, page_number: int
+    pdf_document: pdfium.PdfDocument, page_number: int, max_dimension: int = 1500
 ) -> str:
     """Convert a PDF page to a base64-encoded PNG string."""
     page = pdf_document[page_number]
-    # Render with 4x scaling for better quality
-    pil_image = page.render(scale=4).to_pil()
+    pil_image = page.render(scale=1).to_pil()
+
+    # Resize image if too large
+    if pil_image.width > max_dimension or pil_image.height > max_dimension:
+        scaling_factor = min(
+            max_dimension / pil_image.width, max_dimension / pil_image.height
+        )
+        new_size = (
+            int(pil_image.width * scaling_factor),
+            int(pil_image.height * scaling_factor),
+        )
+        pil_image = pil_image.resize(new_size, Image.Resampling.LANCZOS)
+        logger.debug(f"Resized page {page_number} to {new_size} for base64 conversion.")
 
     # Convert to base64
     img_byte_arr = io.BytesIO()
@@ -36,12 +47,15 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
 
 
-def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
+def convert_doc_to_base64_images(
+    path: str, max_dimension: int = 1500
+) -> List[Tuple[int, str]]:
     """
     Converts a document (PDF or image) to a base64 encoded string.
 
     Args:
         path (str): Path to the document.
+        max_dimension (int): Maximum dimension (width or height) for the output images. Default is 1500.
 
     Returns:
         List[Tuple[int, str]]: A list of tuples where each tuple contains the page number
@@ -52,7 +66,7 @@ def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
         images = [
             (
                 page_num,
-                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num, max_dimension)}",
             )
             for page_num in range(len(pdf_document))
         ]
diff --git a/lexoid/core/parse_type/llm_parser.py b/lexoid/core/parse_type/llm_parser.py
@@ -8,7 +8,6 @@
 from functools import wraps
 from typing import Dict, List, Optional, Tuple
 
-import pypdfium2 as pdfium
 import requests
 import torch
 from anthropic import Anthropic
@@ -23,56 +22,69 @@
 
 from lexoid.core.conversion_utils import (
     convert_image_to_pdf,
-    convert_pdf_page_to_base64,
+    convert_doc_to_base64_images,
 )
 from lexoid.core.prompt_templates import (
     INSTRUCTIONS_ADD_PG_BREAK,
     LLAMA_PARSER_PROMPT,
     OPENAI_USER_PROMPT,
     PARSER_PROMPT,
 )
-from lexoid.core.utils import get_api_provider_for_model, get_file_type
+from lexoid.core.utils import (
+    DEFAULT_LLM,
+    DEFAULT_LOCAL_LM,
+    get_api_provider_for_model,
+    get_file_type,
+)
 
 
 def retry_on_error(func):
     @wraps(func)
     def wrapper(*args, **kwargs):
+        return_dict = {
+            "raw": "",
+            "segments": [],
+            "title": kwargs.get("title", ""),  # built on every call, so must not raise
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
         try:
             return func(*args, **kwargs)
         except HTTPError as e:
             logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
+            if not kwargs.get("retry_on_fail", True):
+                return_dict["error"] = (
+                    f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}"
+                )
+                return return_dict
             time.sleep(10)
             try:
                 logger.debug(f"Retry {func.__name__}")
                 return func(*args, **kwargs)
             except HTTPError as e:
                 logger.error(f"Retry failed: {e}")
-                return {
-                    "raw": "",
-                    "segments": [],
-                    "title": kwargs["title"],
-                    "url": kwargs.get("url", ""),
-                    "parent_title": kwargs.get("parent_title", ""),
-                    "recursive_docs": [],
-                    "error": f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}",
-                }
+                return_dict["error"] = (
+                    f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}"
+                )
+                return return_dict
         except ValueError as e:
             logger.error(f"ValueError encountered: {e}")
+            if not kwargs.get("retry_on_fail", True):
+                return_dict["error"] = (
+                    f"ValueError encountered on page {kwargs.get('start', 0)}: {e}"
+                )
+                return return_dict
             time.sleep(10)
             try:
                 logger.debug(f"Retry {func.__name__}")
                 return func(*args, **kwargs)
             except ValueError as e:
                 logger.error(f"Retry failed: {e}")
-                return {
-                    "raw": "",
-                    "segments": [],
-                    "title": kwargs["title"],
-                    "url": kwargs.get("url", ""),
-                    "parent_title": kwargs.get("parent_title", ""),
-                    "recursive_docs": [],
-                    "error": f"ValueError encountered on page {kwargs.get('start', 0)}: {e}",
-                }
+                return_dict["error"] = (
+                    f"ValueError encountered on page {kwargs.get('start', 0)}: {e}"
+                )
+                return return_dict
 
     return wrapper
 
@@ -90,7 +102,7 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
         elif kwargs["api_provider"]:
             return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
 
-    model = kwargs.get("model", "gemini-2.0-flash")
+    model = kwargs.get("model", DEFAULT_LLM)
     kwargs["model"] = model
 
     api_provider = get_api_provider_for_model(model)
@@ -287,7 +299,7 @@ def flush_row():
 
 def parse_with_local_model(path: str, **kwargs) -> Dict:
     # Source: https://huggingface.co/ibm-granite/granite-docling-258M
-    model_name = kwargs.get("model", "ds4sd/SmolDocling-256M-preview")
+    model_name = kwargs.get("model", DEFAULT_LOCAL_LM)
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
     processor = AutoProcessor.from_pretrained(model_name)
@@ -296,7 +308,8 @@ def parse_with_local_model(path: str, **kwargs) -> Dict:
         torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
     ).to(device)
 
-    images = convert_path_to_images(path)
+    max_dimension = kwargs.get("max_image_dimension", 1500)
+    images = convert_doc_to_base64_images(path, max_dimension=max_dimension)
     proc_images = [
         Image.open(io.BytesIO(base64.b64decode(image_b64.split(",")[1]))).convert("RGB")
         for _, image_b64 in images
@@ -479,27 +492,6 @@ def parse_image_with_gemini(
     }
 
 
-def convert_path_to_images(path):
-    mime_type, _ = mimetypes.guess_type(path)
-    if mime_type and mime_type.startswith("image"):
-        # Single image processing
-        with open(path, "rb") as img_file:
-            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-            return [(0, f"data:{mime_type};base64,{image_base64}")]
-    elif mime_type and mime_type.startswith("application/pdf"):
-        # PDF processing
-        pdf_document = pdfium.PdfDocument(path)
-        return [
-            (
-                page_num,
-                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
-            )
-            for page_num in range(len(pdf_document))
-        ]
-    else:
-        raise ValueError(f"Unsupported file type: {mime_type}")
-
-
 def get_messages(
     system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str]
 ) -> List[Dict]:
@@ -696,25 +688,8 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
         Dict: Dictionary containing parsed document data
     """
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
-
-    # Handle different input types
-    mime_type, _ = mimetypes.guess_type(path)
-    if mime_type and mime_type.startswith("image"):
-        # Single image processing
-        with open(path, "rb") as img_file:
-            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-            images = [(0, f"data:{mime_type};base64,{image_base64}")]
-    else:
-        # PDF processing
-        pdf_document = pdfium.PdfDocument(path)
-        images = [
-            (
-                page_num,
-                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
-            )
-            for page_num in range(len(pdf_document))
-        ]
-    images = convert_path_to_images(path)
+    max_dimension = kwargs.get("max_image_dimension", 1500)
+    images = convert_doc_to_base64_images(path, max_dimension=max_dimension)
 
     # Process each page/image
     all_results = []
diff --git a/lexoid/core/parse_type/static_parser.py b/lexoid/core/parse_type/static_parser.py
@@ -15,10 +15,6 @@
 from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
 from pptx2md import ConversionConfig, convert
 
-from lexoid.core.conversion_utils import (
-    base64_to_np_array,
-    convert_doc_to_base64_images,
-)
 from lexoid.core.utils import (
     get_file_type,
     get_uri_rect,
@@ -240,7 +236,6 @@ def embed_links_in_text(page, text, links):
             offset += len(uri) + 4  # Adjust offset for added link syntax
         else:
             logger.warning(f"No matching text found for link: {uri}")
-    logger.debug(f"Embedded {len(links)} links into text: {text}.")
     return text
 
 
@@ -761,23 +756,23 @@ def parse_with_paddleocr(path: str, **kwargs) -> Dict:
     Returns:
         Dict: Dictionary containing parsed document data with segments per page.
     """
-    ocr = PaddleOCR(use_textline_orientation=False, lang="en")
-
-    base64_images = convert_doc_to_base64_images(path)
+    ocr = PaddleOCR(
+        use_doc_orientation_classify=False,
+        use_doc_unwarping=False,
+        use_textline_orientation=False,
+    )
 
     segments = []
     all_texts = []
 
-    for page_num, base64_img_str in base64_images:
-        image_np = base64_to_np_array(base64_img_str, gray_scale=False)
-
-        results = ocr.predict(image_np, use_doc_unwarping=False)
-
+    results = ocr.predict(path)
+    for result in results:
         page_texts = []
         page_bboxes = []
 
-        height_img, width_img = image_np.shape[:2]
-        for text, bbox in zip(results[0]["rec_texts"], results[0]["dt_polys"]):
+        page_num = result["page_index"]
+        height_img, width_img, _ = result["doc_preprocessor_res"]["output_img"].shape
+        for text, bbox in zip(result["rec_texts"], result["dt_polys"]):
             x_coords = bbox[:, 0]
             y_coords = bbox[:, 1]
             x_min = x_coords.min().item()
diff --git a/lexoid/core/utils.py b/lexoid/core/utils.py
@@ -21,6 +21,9 @@
 from lexoid.core.llm_selector import DocumentRankedLLMSelector
 
 HTML_TAG_PATTERN = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
+DEFAULT_LLM = "gemini-2.0-flash"
+DEFAULT_LOCAL_LM = "ds4sd/SmolDocling-256M-preview"
+DEFAULT_STATIC_FRAMEWORK = "pdfplumber"
 
 
 def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
@@ -58,6 +61,50 @@ def get_file_type(path: str) -> str:
     return mimetypes.guess_type(path)[0] or ""
 
 
+def resize_image_if_needed(
+    path: str, max_dimension: int = 1500, tmpdir: Optional[str] = None
+) -> str:
+    """Downscale an image whose longest side exceeds ``max_dimension``.
+
+    The aspect ratio is preserved. The file at ``path`` is never modified; a
+    resized copy named ``resized_<basename>`` is written instead.
+
+    Args:
+        path (str): Path to the image file.
+        max_dimension (int): Largest allowed width or height, in pixels.
+        tmpdir (Optional[str]): Directory for the resized copy. Falls back to
+            the directory containing ``path`` when not given.
+
+    Returns:
+        str: ``path`` unchanged when the image already fits, otherwise the
+            path of the resized copy.
+    """
+    from PIL import Image
+
+    with Image.open(path) as img:
+        width, height = img.size
+        longest_side = max(width, height)
+        if longest_side <= max_dimension:
+            return path
+
+        logger.debug(
+            f"Resizing image to fit within max dimensions of {max_dimension}."
+        )
+        scaling_factor = max_dimension / float(longest_side)
+        # Clamp to 1px: int() truncation would otherwise yield a zero-sized side
+        # for extreme aspect ratios, which is not a valid resize target.
+        new_size = (
+            max(1, int(width * scaling_factor)),
+            max(1, int(height * scaling_factor)),
+        )
+        resized = img.resize(new_size, Image.Resampling.LANCZOS)
+
+    folder = tmpdir or os.path.dirname(path)
+    resized_path = os.path.join(folder, f"resized_{os.path.basename(path)}")
+    resized.save(resized_path)
+    return resized_path
+
+
 def is_supported_file_type(path: str) -> bool:
     """Check if the file type is supported for parsing."""
     file_type = get_file_type(path)