88from functools import wraps
99from typing import Dict , List , Optional , Tuple
1010
11- import pypdfium2 as pdfium
1211import requests
1312import torch
1413from anthropic import Anthropic
2322
2423from lexoid .core .conversion_utils import (
2524 convert_image_to_pdf ,
26- convert_pdf_page_to_base64 ,
25+ convert_doc_to_base64_images ,
2726)
2827from lexoid .core .prompt_templates import (
2928 INSTRUCTIONS_ADD_PG_BREAK ,
3029 LLAMA_PARSER_PROMPT ,
3130 OPENAI_USER_PROMPT ,
3231 PARSER_PROMPT ,
3332)
34- from lexoid .core .utils import get_api_provider_for_model , get_file_type
33+ from lexoid .core .utils import (
34+ DEFAULT_LLM ,
35+ DEFAULT_LOCAL_LM ,
36+ get_api_provider_for_model ,
37+ get_file_type ,
38+ )
3539
3640
def retry_on_error(func):
    """Decorate *func* so an ``HTTPError`` or ``ValueError`` is retried once.

    The wrapped callable is invoked normally. If it raises ``HTTPError`` or
    ``ValueError``, the wrapper waits 10 seconds and calls it a second time
    with the same arguments. If the second call raises the same kind of
    exception again, an error payload is returned instead of raising.

    Keyword arguments read by the wrapper (all are still forwarded to *func*):
        retry_on_fail: when present and falsy, skip the retry and return the
            error payload right after the first failure. Defaults to True.
        title, url, parent_title: copied into the error payload; each
            defaults to "" when absent.
        start: page number quoted in the error message; defaults to 0.

    Returns:
        The decorated function. On failure it returns a dict with the keys
        "raw", "segments", "title", "url", "parent_title", "recursive_docs"
        and "error".

    Exceptions other than ``HTTPError``/``ValueError`` propagate unchanged,
    as does a retry that fails with a different exception family than the
    first attempt.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        def failure_result(kind, exc):
            # Built only when a failure is actually reported, so a successful
            # call never depends on optional keywords such as "title".
            return {
                "raw": "",
                "segments": [],
                "title": kwargs.get("title", ""),
                "url": kwargs.get("url", ""),
                "parent_title": kwargs.get("parent_title", ""),
                "recursive_docs": [],
                "error": (
                    f"{kind} encountered on page {kwargs.get('start', 0)}: {exc}"
                ),
            }

        try:
            return func(*args, **kwargs)
        except (HTTPError, ValueError) as e:
            # HTTPError is tested first so an exception matching both keeps
            # being reported as an HTTPError.
            if isinstance(e, HTTPError):
                kind, exc_type = "HTTPError", HTTPError
            else:
                kind, exc_type = "ValueError", ValueError

            if not kwargs.get("retry_on_fail", True):
                logger.error(f"{kind} encountered: {e}")
                return failure_result(kind, e)

            logger.error(f"{kind} encountered: {e}. Retrying in 10 seconds...")
            time.sleep(10)
            try:
                logger.debug(f"Retry {func.__name__}")
                return func(*args, **kwargs)
            except exc_type as retry_error:
                # Only the same exception family is absorbed on the retry;
                # anything else propagates to the caller.
                logger.error(f"Retry failed: {retry_error}")
                return failure_result(kind, retry_error)

    return wrapper
7890
@@ -90,7 +102,7 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
90102 elif kwargs ["api_provider" ]:
91103 return parse_with_api (path , api = kwargs ["api_provider" ], ** kwargs )
92104
93- model = kwargs .get ("model" , "gemini-2.0-flash" )
105+ model = kwargs .get ("model" , DEFAULT_LLM )
94106 kwargs ["model" ] = model
95107
96108 api_provider = get_api_provider_for_model (model )
@@ -287,7 +299,7 @@ def flush_row():
287299
288300def parse_with_local_model (path : str , ** kwargs ) -> Dict :
289301 # Source: https://huggingface.co/ibm-granite/granite-docling-258M
290- model_name = kwargs .get ("model" , "ds4sd/SmolDocling-256M-preview" )
302+ model_name = kwargs .get ("model" , DEFAULT_LOCAL_LM )
291303 device = "cuda" if torch .cuda .is_available () else "cpu"
292304
293305 processor = AutoProcessor .from_pretrained (model_name )
@@ -296,7 +308,8 @@ def parse_with_local_model(path: str, **kwargs) -> Dict:
296308 torch_dtype = torch .bfloat16 if device == "cuda" else torch .float32 ,
297309 ).to (device )
298310
299- images = convert_path_to_images (path )
311+ max_dimension = kwargs .get ("max_image_dimension" , 1500 )
312+ images = convert_doc_to_base64_images (path , max_dimension = max_dimension )
300313 proc_images = [
301314 Image .open (io .BytesIO (base64 .b64decode (image_b64 .split ("," )[1 ]))).convert ("RGB" )
302315 for _ , image_b64 in images
@@ -479,27 +492,6 @@ def parse_image_with_gemini(
479492 }
480493
481494
482- def convert_path_to_images (path ):
483- mime_type , _ = mimetypes .guess_type (path )
484- if mime_type and mime_type .startswith ("image" ):
485- # Single image processing
486- with open (path , "rb" ) as img_file :
487- image_base64 = base64 .b64encode (img_file .read ()).decode ("utf-8" )
488- return [(0 , f"data:{ mime_type } ;base64,{ image_base64 } " )]
489- elif mime_type and mime_type .startswith ("application/pdf" ):
490- # PDF processing
491- pdf_document = pdfium .PdfDocument (path )
492- return [
493- (
494- page_num ,
495- f"data:image/png;base64,{ convert_pdf_page_to_base64 (pdf_document , page_num )} " ,
496- )
497- for page_num in range (len (pdf_document ))
498- ]
499- else :
500- raise ValueError (f"Unsupported file type: { mime_type } " )
501-
502-
503495def get_messages (
504496 system_prompt : Optional [str ], user_prompt : Optional [str ], image_url : Optional [str ]
505497) -> List [Dict ]:
@@ -696,25 +688,8 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
696688 Dict: Dictionary containing parsed document data
697689 """
698690 logger .debug (f"Parsing with { api } API and model { kwargs ['model' ]} " )
699-
700- # Handle different input types
701- mime_type , _ = mimetypes .guess_type (path )
702- if mime_type and mime_type .startswith ("image" ):
703- # Single image processing
704- with open (path , "rb" ) as img_file :
705- image_base64 = base64 .b64encode (img_file .read ()).decode ("utf-8" )
706- images = [(0 , f"data:{ mime_type } ;base64,{ image_base64 } " )]
707- else :
708- # PDF processing
709- pdf_document = pdfium .PdfDocument (path )
710- images = [
711- (
712- page_num ,
713- f"data:image/png;base64,{ convert_pdf_page_to_base64 (pdf_document , page_num )} " ,
714- )
715- for page_num in range (len (pdf_document ))
716- ]
717- images = convert_path_to_images (path )
691+ max_dimension = kwargs .get ("max_image_dimension" , 1500 )
692+ images = convert_doc_to_base64_images (path , max_dimension = max_dimension )
718693
719694 # Process each page/image
720695 all_results = []
0 commit comments