11import json
22import re
33from pathlib import Path
4- from typing import Any , Optional , Union
4+ from typing import Any
55
66from haystack import Document , component , default_from_dict , default_to_dict , logging
77from haystack .components .converters .utils import (
@@ -103,9 +103,9 @@ def __init__(
103103 api_key : Secret = Secret .from_env_var ("MISTRAL_API_KEY" ),
104104 model : str = "mistral-ocr-2505" ,
105105 include_image_base64 : bool = False ,
106- pages : Optional [ list [int ]] = None ,
107- image_limit : Optional [ int ] = None ,
108- image_min_size : Optional [ int ] = None ,
106+ pages : list [int ] | None = None ,
107+ image_limit : int | None = None ,
108+ image_min_size : int | None = None ,
109109 cleanup_uploaded_files : bool = True ,
110110 ):
111111 """
@@ -175,10 +175,10 @@ def from_dict(cls, data: dict[str, Any]) -> "MistralOCRDocumentConverter":
175175 @component .output_types (documents = list [Document ], raw_mistral_response = list [dict [str , Any ]])
176176 def run (
177177 self ,
178- sources : list [Union [ str , Path , ByteStream , DocumentURLChunk , FileChunk , ImageURLChunk ] ],
179- meta : Optional [ Union [ dict [str , Any ], list [dict [str , Any ]]]] = None ,
180- bbox_annotation_schema : Optional [ type [BaseModel ]] = None ,
181- document_annotation_schema : Optional [ type [BaseModel ]] = None ,
178+ sources : list [str | Path | ByteStream | DocumentURLChunk | FileChunk | ImageURLChunk ],
179+ meta : dict [str , Any ] | list [dict [str , Any ]] | None = None ,
180+ bbox_annotation_schema : type [BaseModel ] | None = None ,
181+ document_annotation_schema : type [BaseModel ] | None = None ,
182182 ) -> dict [str , Any ]:
183183 """
184184 Extract text from documents using Mistral OCR.
@@ -234,7 +234,7 @@ def run(
234234 raw_responses = []
235235 uploaded_file_ids = []
236236
237- for source , user_metadata in zip (sources , meta_list ):
237+ for source , user_metadata in zip (sources , meta_list , strict = True ):
238238 document , raw_response , uploaded_file_id = self ._process_single_source (
239239 source ,
240240 user_metadata ,
@@ -259,12 +259,12 @@ def run(
259259
260260 def _process_single_source (
261261 self ,
262- source : Union [ str , Path , ByteStream , DocumentURLChunk , FileChunk , ImageURLChunk ] ,
262+ source : str | Path | ByteStream | DocumentURLChunk | FileChunk | ImageURLChunk ,
263263 user_metadata : dict [str , Any ],
264- bbox_annotation_format : Optional [ Any ] ,
265- document_annotation_format : Optional [ Any ] ,
266- document_annotation_schema : Optional [ type [BaseModel ]] ,
267- ) -> tuple [Optional [ Document ], Optional [ dict [str , Any ]], Optional [ str ] ]:
264+ bbox_annotation_format : Any | None ,
265+ document_annotation_format : Any | None ,
266+ document_annotation_schema : type [BaseModel ] | None ,
267+ ) -> tuple [Document | None , dict [str , Any ] | None , str | None ]:
268268 """
269269 Process a single source and return the document, raw response, and file_id if uploaded.
270270
@@ -334,8 +334,8 @@ def _cleanup_uploaded_files(self, file_ids: list[str]) -> None:
334334
335335 def _convert_source_to_chunk (
336336 self ,
337- source : Union [ str , Path , ByteStream , DocumentURLChunk , FileChunk , ImageURLChunk ] ,
338- ) -> Union [ DocumentURLChunk , FileChunk , ImageURLChunk ] :
337+ source : str | Path | ByteStream | DocumentURLChunk | FileChunk | ImageURLChunk ,
338+ ) -> DocumentURLChunk | FileChunk | ImageURLChunk :
339339 """
340340 Convert various source types to Mistral-compatible chunk format.
341341
@@ -371,7 +371,7 @@ def _process_ocr_response(
371371 self ,
372372 ocr_response : OCRResponse ,
373373 user_metadata : dict [str , Any ],
374- document_annotation_schema : Optional [ type [BaseModel ]] ,
374+ document_annotation_schema : type [BaseModel ] | None ,
375375 ) -> Document :
376376 """
377377 Convert an OCR response from Mistral API into a single Haystack Document.
0 commit comments