33# SPDX-License-Identifier: Apache-2.0
44import base64
55from pathlib import Path
6- from typing import Any , Literal , Optional , Union
6+ from typing import Any , Literal
77
88import requests
99from haystack import Document , component , default_from_dict , default_to_dict , logging
2424logger = logging .getLogger (__name__ )
2525
2626
27- FileTypeInput = Union [ Literal ["pdf" , "image" ], None ]
27+ FileTypeInput = Literal ["pdf" , "image" ] | None
2828
2929# Supported image file extensions
3030_IMAGE_EXTENSIONS = {
4141
4242
4343def _infer_file_type_from_source (
44- source : Union [ str , Path , ByteStream ] ,
45- mime_type : Optional [ str ] = None ,
46- ) -> Optional [ FileType ] :
44+ source : str | Path | ByteStream ,
45+ mime_type : str | None = None ,
46+ ) -> FileType | None :
4747 """
4848 Infer file type from file extension or MIME type.
4949
@@ -56,7 +56,7 @@ def _infer_file_type_from_source(
5656 determined.
5757 """
5858 # Try to get extension from file path
59- file_path : Optional [ str ] = None
59+ file_path : str | None = None
6060
6161 # Check if source is a file path
6262 if isinstance (source , (str , Path )):
@@ -86,7 +86,7 @@ def _infer_file_type_from_source(
8686 return None
8787
8888
89- def _normalize_file_type (file_type : Optional [ FileTypeInput ] ) -> Optional [ FileType ] :
89+ def _normalize_file_type (file_type : FileTypeInput | None ) -> FileType | None :
9090 """
9191 Normalize file type input to the numeric format expected by the API.
9292
@@ -145,26 +145,26 @@ def __init__(
145145 * ,
146146 api_url : str ,
147147 access_token : Secret = Secret .from_env_var ("AISTUDIO_ACCESS_TOKEN" ),
148- file_type : Optional [ FileTypeInput ] = None ,
149- use_doc_orientation_classify : Optional [ bool ] = None ,
150- use_doc_unwarping : Optional [ bool ] = None ,
151- use_layout_detection : Optional [ bool ] = None ,
152- use_chart_recognition : Optional [ bool ] = None ,
153- layout_threshold : Optional [ Union [ float , dict ]] = None ,
154- layout_nms : Optional [ bool ] = None ,
155- layout_unclip_ratio : Optional [ Union [ float , tuple [float , float ], dict ]] = None ,
156- layout_merge_bboxes_mode : Optional [ Union [ str , dict ]] = None ,
157- prompt_label : Optional [ str ] = None ,
158- format_block_content : Optional [ bool ] = None ,
159- repetition_penalty : Optional [ float ] = None ,
160- temperature : Optional [ float ] = None ,
161- top_p : Optional [ float ] = None ,
162- min_pixels : Optional [ int ] = None ,
163- max_pixels : Optional [ int ] = None ,
164- prettify_markdown : Optional [ bool ] = None ,
165- show_formula_number : Optional [ bool ] = None ,
166- visualize : Optional [ bool ] = None ,
167- additional_params : Optional [ dict [str , Any ]] = None ,
148+ file_type : FileTypeInput | None = None ,
149+ use_doc_orientation_classify : bool | None = None ,
150+ use_doc_unwarping : bool | None = None ,
151+ use_layout_detection : bool | None = None ,
152+ use_chart_recognition : bool | None = None ,
153+ layout_threshold : float | dict | None = None ,
154+ layout_nms : bool | None = None ,
155+ layout_unclip_ratio : float | tuple [float , float ] | dict | None = None ,
156+ layout_merge_bboxes_mode : str | dict | None = None ,
157+ prompt_label : str | None = None ,
158+ format_block_content : bool | None = None ,
159+ repetition_penalty : float | None = None ,
160+ temperature : float | None = None ,
161+ top_p : float | None = None ,
162+ min_pixels : int | None = None ,
163+ max_pixels : int | None = None ,
164+ prettify_markdown : bool | None = None ,
165+ show_formula_number : bool | None = None ,
166+ visualize : bool | None = None ,
167+ additional_params : dict [str , Any ] | None = None ,
168168 ):
169169 """
170170 Create a `PaddleOCRVLDocumentConverter` component.
@@ -421,8 +421,8 @@ def _parse(self, data: bytes, file_type: FileType) -> tuple[str, dict[str, Any]]
421421 @component .output_types (documents = list [Document ], raw_paddleocr_responses = list [dict [str , Any ]])
422422 def run (
423423 self ,
424- sources : list [Union [ str , Path , ByteStream ] ],
425- meta : Optional [ Union [ dict [str , Any ], list [dict [str , Any ]]]] = None ,
424+ sources : list [str | Path | ByteStream ],
425+ meta : dict [str , Any ] | list [dict [str , Any ]] | None = None ,
426426 ) -> dict [str , Any ]:
427427 """
428428 Convert image or PDF files to Documents.
@@ -448,7 +448,7 @@ def run(
448448
449449 meta_list = normalize_metadata (meta , sources_count = len (sources ))
450450
451- for source , metadata in zip (sources , meta_list ):
451+ for source , metadata in zip (sources , meta_list , strict = True ):
452452 try :
453453 bytestream = get_bytestream_from_source (source )
454454 except Exception as e :
0 commit comments