Skip to content

Commit 35156dd

Browse files
authored
Improve PaddleOCR efficiency by skipping conversion to images
1 parent bf1af58 commit 35156dd

5 files changed

Lines changed: 109 additions & 88 deletions

File tree

lexoid/api.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,17 @@
2929
LATEX_USER_PROMPT,
3030
)
3131
from lexoid.core.utils import (
32+
DEFAULT_LLM,
33+
DEFAULT_STATIC_FRAMEWORK,
3234
bbox_router,
3335
create_sub_pdf,
3436
download_file,
37+
get_file_type,
3538
get_webpage_soup,
3639
is_supported_file_type,
3740
is_supported_url_file_type,
3841
recursive_read_html,
42+
resize_image_if_needed,
3943
router,
4044
split_pdf,
4145
)
@@ -136,7 +140,7 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
136140
return_bboxes = kwargs.get("return_bboxes", False)
137141
has_bboxes = bool(result["segments"][0].get("bboxes"))
138142
bbox_framework = kwargs.get("bbox_framework", None)
139-
framework = kwargs.get("framework", None)
143+
framework = kwargs.get("framework", DEFAULT_STATIC_FRAMEWORK)
140144
bbox_framework_different = bbox_framework and bbox_framework != framework
141145
if return_bboxes and (not has_bboxes or bbox_framework_different):
142146
logger.debug("Extracting bounding boxes...")
@@ -263,6 +267,13 @@ def parse(
263267
f"Unsupported file type {os.path.splitext(path)[1]}"
264268
)
265269

270+
if "image" in get_file_type(path):
271+
# Resize image if too large
272+
max_dimension = kwargs.get("max_image_dimension", 1500)
273+
path = resize_image_if_needed(
274+
path, max_dimension=max_dimension, tmpdir=temp_dir
275+
)
276+
266277
if as_pdf and not path.lower().endswith(".pdf"):
267278
pdf_path = os.path.join(temp_dir, "converted.pdf")
268279
logger.debug("Converting file to PDF")
@@ -328,9 +339,7 @@ def parse(
328339
else:
329340
raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
330341

331-
api_cost = api_cost_mapping.get(
332-
kwargs.get("model", "gemini-2.0-flash"), None
333-
)
342+
api_cost = api_cost_mapping.get(kwargs.get("model", DEFAULT_LLM), None)
334343
if api_cost:
335344
token_usage = result["token_usage"]
336345
token_cost = {

lexoid/core/conversion_utils.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,23 @@
2222

2323

2424
def convert_pdf_page_to_base64(
25-
pdf_document: pdfium.PdfDocument, page_number: int
25+
pdf_document: pdfium.PdfDocument, page_number: int, max_dimension: int = 1500
2626
) -> str:
2727
"""Convert a PDF page to a base64-encoded PNG string."""
2828
page = pdf_document[page_number]
29-
# Render with 4x scaling for better quality
30-
pil_image = page.render(scale=4).to_pil()
29+
pil_image = page.render(scale=1).to_pil()
30+
31+
# Resize image if too large
32+
if pil_image.width > max_dimension or pil_image.height > max_dimension:
33+
scaling_factor = min(
34+
max_dimension / pil_image.width, max_dimension / pil_image.height
35+
)
36+
new_size = (
37+
int(pil_image.width * scaling_factor),
38+
int(pil_image.height * scaling_factor),
39+
)
40+
pil_image = pil_image.resize(new_size, Image.Resampling.LANCZOS)
41+
logger.debug(f"Resized page {page_number} to {new_size} for base64 conversion.")
3142

3243
# Convert to base64
3344
img_byte_arr = io.BytesIO()
@@ -36,12 +47,15 @@ def convert_pdf_page_to_base64(
3647
return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
3748

3849

39-
def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
50+
def convert_doc_to_base64_images(
51+
path: str, max_dimension: int = 1500
52+
) -> List[Tuple[int, str]]:
4053
"""
4154
Converts a document (PDF or image) to a base64 encoded string.
4255
4356
Args:
4457
path (str): Path to the document.
58+
max_dimension (int): Maximum dimension (width or height) for the output images. Default is 1500.
4559
4660
Returns:
4761
List[Tuple[int, str]]: A list of tuples where each tuple contains the page number
@@ -52,7 +66,7 @@ def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
5266
images = [
5367
(
5468
page_num,
55-
f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
69+
f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num, max_dimension)}",
5670
)
5771
for page_num in range(len(pdf_document))
5872
]

lexoid/core/parse_type/llm_parser.py

Lines changed: 39 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from functools import wraps
99
from typing import Dict, List, Optional, Tuple
1010

11-
import pypdfium2 as pdfium
1211
import requests
1312
import torch
1413
from anthropic import Anthropic
@@ -23,56 +22,69 @@
2322

2423
from lexoid.core.conversion_utils import (
2524
convert_image_to_pdf,
26-
convert_pdf_page_to_base64,
25+
convert_doc_to_base64_images,
2726
)
2827
from lexoid.core.prompt_templates import (
2928
INSTRUCTIONS_ADD_PG_BREAK,
3029
LLAMA_PARSER_PROMPT,
3130
OPENAI_USER_PROMPT,
3231
PARSER_PROMPT,
3332
)
34-
from lexoid.core.utils import get_api_provider_for_model, get_file_type
33+
from lexoid.core.utils import (
34+
DEFAULT_LLM,
35+
DEFAULT_LOCAL_LM,
36+
get_api_provider_for_model,
37+
get_file_type,
38+
)
3539

3640

3741
def retry_on_error(func):
    """Decorate a parser so HTTP/value errors are retried once, then reported.

    The wrapped function is called normally. If it raises ``HTTPError`` or
    ``ValueError`` the call is repeated once after a 10 second pause, unless
    the caller passed ``retry_on_fail=False``. When no retry is allowed, or
    the retry raises the same kind of error again, an error payload is
    returned instead of raising.

    Keyword arguments read by the wrapper (all are also forwarded to ``func``):
        retry_on_fail: Set to ``False`` to skip the retry. Defaults to ``True``.
        title, url, parent_title: Copied into the error payload.
        start: Page number quoted in the error message. Defaults to ``0``.

    Returns:
        The decorated callable. On failure it returns a dict with the keys
        ``raw``, ``segments``, ``title``, ``url``, ``parent_title``,
        ``recursive_docs`` and ``error``.
    """

    def _error_result(kwargs, label, error):
        # Built only when a failure is actually reported, so a successful
        # call never depends on reporting-only keys such as "title".
        return {
            "raw": "",
            "segments": [],
            "title": kwargs.get("title", ""),
            "url": kwargs.get("url", ""),
            "parent_title": kwargs.get("parent_title", ""),
            "recursive_docs": [],
            "error": f"{label} encountered on page {kwargs.get('start', 0)}: {error}",
        }

    def _retry_once(error_type, label, first_error, args, kwargs):
        # Shared by both error families; only ``error_type`` is absorbed on
        # the retry, any other exception propagates to the caller.
        if not kwargs.get("retry_on_fail", True):
            return _error_result(kwargs, label, first_error)
        time.sleep(10)
        try:
            logger.debug(f"Retry {func.__name__}")
            return func(*args, **kwargs)
        except error_type as e:
            logger.error(f"Retry failed: {e}")
            return _error_result(kwargs, label, e)

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except HTTPError as e:
            logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
            return _retry_once(HTTPError, "HTTPError", e, args, kwargs)
        except ValueError as e:
            logger.error(f"ValueError encountered: {e}")
            return _retry_once(ValueError, "ValueError", e, args, kwargs)

    return wrapper
7890

@@ -90,7 +102,7 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
90102
elif kwargs["api_provider"]:
91103
return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
92104

93-
model = kwargs.get("model", "gemini-2.0-flash")
105+
model = kwargs.get("model", DEFAULT_LLM)
94106
kwargs["model"] = model
95107

96108
api_provider = get_api_provider_for_model(model)
@@ -287,7 +299,7 @@ def flush_row():
287299

288300
def parse_with_local_model(path: str, **kwargs) -> Dict:
289301
# Source: https://huggingface.co/ibm-granite/granite-docling-258M
290-
model_name = kwargs.get("model", "ds4sd/SmolDocling-256M-preview")
302+
model_name = kwargs.get("model", DEFAULT_LOCAL_LM)
291303
device = "cuda" if torch.cuda.is_available() else "cpu"
292304

293305
processor = AutoProcessor.from_pretrained(model_name)
@@ -296,7 +308,8 @@ def parse_with_local_model(path: str, **kwargs) -> Dict:
296308
torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
297309
).to(device)
298310

299-
images = convert_path_to_images(path)
311+
max_dimension = kwargs.get("max_image_dimension", 1500)
312+
images = convert_doc_to_base64_images(path, max_dimension=max_dimension)
300313
proc_images = [
301314
Image.open(io.BytesIO(base64.b64decode(image_b64.split(",")[1]))).convert("RGB")
302315
for _, image_b64 in images
@@ -479,27 +492,6 @@ def parse_image_with_gemini(
479492
}
480493

481494

482-
def convert_path_to_images(path):
483-
mime_type, _ = mimetypes.guess_type(path)
484-
if mime_type and mime_type.startswith("image"):
485-
# Single image processing
486-
with open(path, "rb") as img_file:
487-
image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
488-
return [(0, f"data:{mime_type};base64,{image_base64}")]
489-
elif mime_type and mime_type.startswith("application/pdf"):
490-
# PDF processing
491-
pdf_document = pdfium.PdfDocument(path)
492-
return [
493-
(
494-
page_num,
495-
f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
496-
)
497-
for page_num in range(len(pdf_document))
498-
]
499-
else:
500-
raise ValueError(f"Unsupported file type: {mime_type}")
501-
502-
503495
def get_messages(
504496
system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str]
505497
) -> List[Dict]:
@@ -696,25 +688,8 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
696688
Dict: Dictionary containing parsed document data
697689
"""
698690
logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
699-
700-
# Handle different input types
701-
mime_type, _ = mimetypes.guess_type(path)
702-
if mime_type and mime_type.startswith("image"):
703-
# Single image processing
704-
with open(path, "rb") as img_file:
705-
image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
706-
images = [(0, f"data:{mime_type};base64,{image_base64}")]
707-
else:
708-
# PDF processing
709-
pdf_document = pdfium.PdfDocument(path)
710-
images = [
711-
(
712-
page_num,
713-
f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
714-
)
715-
for page_num in range(len(pdf_document))
716-
]
717-
images = convert_path_to_images(path)
691+
max_dimension = kwargs.get("max_image_dimension", 1500)
692+
images = convert_doc_to_base64_images(path, max_dimension=max_dimension)
718693

719694
# Process each page/image
720695
all_results = []

lexoid/core/parse_type/static_parser.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,6 @@
1515
from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
1616
from pptx2md import ConversionConfig, convert
1717

18-
from lexoid.core.conversion_utils import (
19-
base64_to_np_array,
20-
convert_doc_to_base64_images,
21-
)
2218
from lexoid.core.utils import (
2319
get_file_type,
2420
get_uri_rect,
@@ -240,7 +236,6 @@ def embed_links_in_text(page, text, links):
240236
offset += len(uri) + 4 # Adjust offset for added link syntax
241237
else:
242238
logger.warning(f"No matching text found for link: {uri}")
243-
logger.debug(f"Embedded {len(links)} links into text: {text}.")
244239
return text
245240

246241

@@ -761,23 +756,23 @@ def parse_with_paddleocr(path: str, **kwargs) -> Dict:
761756
Returns:
762757
Dict: Dictionary containing parsed document data with segments per page.
763758
"""
764-
ocr = PaddleOCR(use_textline_orientation=False, lang="en")
765-
766-
base64_images = convert_doc_to_base64_images(path)
759+
ocr = PaddleOCR(
760+
use_doc_orientation_classify=False,
761+
use_doc_unwarping=False,
762+
use_textline_orientation=False,
763+
)
767764

768765
segments = []
769766
all_texts = []
770767

771-
for page_num, base64_img_str in base64_images:
772-
image_np = base64_to_np_array(base64_img_str, gray_scale=False)
773-
774-
results = ocr.predict(image_np, use_doc_unwarping=False)
775-
768+
results = ocr.predict(path)
769+
for result in results:
776770
page_texts = []
777771
page_bboxes = []
778772

779-
height_img, width_img = image_np.shape[:2]
780-
for text, bbox in zip(results[0]["rec_texts"], results[0]["dt_polys"]):
773+
page_num = result["page_index"]
774+
height_img, width_img, _ = result["doc_preprocessor_res"]["output_img"].shape
775+
for text, bbox in zip(result["rec_texts"], result["dt_polys"]):
781776
x_coords = bbox[:, 0]
782777
y_coords = bbox[:, 1]
783778
x_min = x_coords.min().item()

lexoid/core/utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
from lexoid.core.llm_selector import DocumentRankedLLMSelector
2222

2323
HTML_TAG_PATTERN = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
24+
DEFAULT_LLM = "gemini-2.0-flash"
25+
DEFAULT_LOCAL_LM = "ds4sd/SmolDocling-256M-preview"
26+
DEFAULT_STATIC_FRAMEWORK = "pdfplumber"
2427

2528

2629
def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
@@ -58,6 +61,31 @@ def get_file_type(path: str) -> str:
5861
return mimetypes.guess_type(path)[0] or ""
5962

6063

64+
def resize_image_if_needed(
    path: str, max_dimension: int = 1500, tmpdir: Optional[str] = None
) -> str:
    """Downscale an image whose longest side exceeds ``max_dimension``.

    The aspect ratio is preserved. The source file is never overwritten: when
    resizing is needed, a copy named ``resized_<basename>`` is written and its
    path is returned.

    Args:
        path: Path to the source image.
        max_dimension: Largest allowed width or height, in pixels. Must be
            positive.
        tmpdir: Directory for the resized copy. When empty or ``None`` the
            copy is written next to the source image.

    Returns:
        ``path`` unchanged when the image already fits, otherwise the path of
        the resized copy.

    Raises:
        ValueError: If ``max_dimension`` is not positive.
    """
    from PIL import Image

    if max_dimension <= 0:
        raise ValueError(f"max_dimension must be positive, got {max_dimension}")

    with Image.open(path) as img:
        width, height = img.size
        longest_side = max(width, height)
        if longest_side <= max_dimension:
            return path

        logger.debug(
            f"Resizing image to fit within max dimensions of {max_dimension}."
        )
        scaling_factor = max_dimension / float(longest_side)
        # int() truncates, so the short side of a very elongated image could
        # become 0, which is not a usable target size; keep at least 1 pixel.
        new_size = (
            max(1, int(width * scaling_factor)),
            max(1, int(height * scaling_factor)),
        )
        folder = tmpdir or os.path.dirname(path)
        resized_path = os.path.join(folder, f"resized_{os.path.basename(path)}")
        # Use a separate name so the handle managed by ``with`` is not rebound.
        resized = img.resize(new_size, Image.Resampling.LANCZOS)
        resized.save(resized_path)
        return resized_path
87+
88+
6189
def is_supported_file_type(path: str) -> bool:
6290
"""Check if the file type is supported for parsing."""
6391
file_type = get_file_type(path)

0 commit comments

Comments
 (0)