@@ -42,26 +42,37 @@ def parse(self, src: Path) -> ParseResult:
4242 ) from exc
4343
4444 client = Mistral (api_key = api_key )
45- uploaded = client .files .upload (
46- file = {"file_name" : src .name , "content" : src .read_bytes ()}, purpose = "ocr"
47- )
48- signed = client .files .get_signed_url (file_id = uploaded .id )
49- resp = client .ocr .process (
50- model = self .model ,
51- document = {"type" : "document_url" , "document_url" : signed .url },
52- include_image_base64 = True ,
53- )
45+ uploaded = None
46+ try :
47+ uploaded = client .files .upload (
48+ file = {"file_name" : src .name , "content" : src .read_bytes ()}, purpose = "ocr"
49+ )
50+ signed = client .files .get_signed_url (file_id = uploaded .id )
51+ resp = client .ocr .process (
52+ model = self .model ,
53+ document = {"type" : "document_url" , "document_url" : signed .url },
54+ include_image_base64 = True ,
55+ )
5456
55- parts : list [str ] = []
56- images : dict [str , bytes ] = {}
57- for page in resp .pages :
58- parts .append (page .markdown or "" )
59- for img in getattr (page , "images" , None ) or []:
60- raw = img .image_base64 or ""
61- raw = _DATA_URI_RE .sub ("" , raw )
57+ parts : list [str ] = []
58+ images : dict [str , bytes ] = {}
59+ for page in resp .pages :
60+ parts .append (page .markdown or "" )
61+ for img in getattr (page , "images" , None ) or []:
62+ raw = img .image_base64 or ""
63+ raw = _DATA_URI_RE .sub ("" , raw )
64+ try :
65+ images [img .id ] = base64 .b64decode (raw , validate = True )
66+ except Exception :
67+ logger .warning ("Skipping undecodable Mistral image: %s" , getattr (img , "id" , "?" ))
68+ continue
69+ return ParseResult (markdown = "\n \n " .join (parts ), images = images )
70+ finally :
71+ if uploaded is not None :
6272 try :
63- images [ img . id ] = base64 . b64decode ( raw , validate = True )
73+ client . files . delete ( file_id = uploaded . id )
6474 except Exception :
65- logger .warning ("Skipping undecodable Mistral image: %s" , getattr (img , "id" , "?" ))
66- continue
67- return ParseResult (markdown = "\n \n " .join (parts ), images = images )
75+ logger .warning (
76+ "Failed to delete uploaded Mistral OCR file %s" ,
77+ getattr (uploaded , "id" , "?" ),
78+ )
0 commit comments