From 58ee1f77baa566cfe2ec84c3687b602459dacdc5 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 12 Jan 2026 15:35:27 +0100 Subject: [PATCH 1/2] chore!: paddleocr - drop Python 3.9 and use X|Y typing --- integrations/paddleocr/pyproject.toml | 10 +--- .../paddleocr_vl_document_converter.py | 60 +++++++++---------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/integrations/paddleocr/pyproject.toml b/integrations/paddleocr/pyproject.toml index 95790bcac7..7dd50823d8 100644 --- a/integrations/paddleocr/pyproject.toml +++ b/integrations/paddleocr/pyproject.toml @@ -7,7 +7,7 @@ name = "paddleocr-haystack" dynamic = ["version"] description = 'An integration of PaddleOCR with Haystack' readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = "Apache-2.0" keywords = [] authors = [ @@ -16,7 +16,6 @@ authors = [ classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -24,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai>=2.19.0", + "haystack-ai>=2.22.0", "paddleocr>=3.3.2", "paddlex[serving]>=3.3.10", "requests>=2.25.0", @@ -80,7 +79,6 @@ check_untyped_defs = true disallow_incomplete_defs = true [tool.ruff] -target-version = "py39" line-length = 120 [tool.ruff.lint] @@ -127,10 +125,6 @@ ignore = [ "B008", "S101", ] -unfixable = [ - # Don't touch unused imports - "F401", -] [tool.ruff.lint.isort] known-first-party = ["haystack_integrations"] diff --git a/integrations/paddleocr/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py b/integrations/paddleocr/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py index cfef1c48e9..fdc50a7ba4 100644 --- a/integrations/paddleocr/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py +++ b/integrations/paddleocr/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import base64 from pathlib import Path -from typing import Any, Literal, Optional, Union +from typing import Any, Literal import requests from haystack import Document, component, default_from_dict, default_to_dict, logging @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) -FileTypeInput = Union[Literal["pdf", "image"], None] +FileTypeInput = Literal["pdf", "image"] | None # Supported image file extensions _IMAGE_EXTENSIONS = { @@ -41,9 +41,9 @@ def _infer_file_type_from_source( - source: Union[str, Path, ByteStream], - mime_type: Optional[str] = None, -) -> Optional[FileType]: + source: str | Path | ByteStream, + mime_type: str | None = None, +) -> FileType | None: """ Infer file type from file extension or MIME type. @@ -56,7 +56,7 @@ def _infer_file_type_from_source( determined. """ # Try to get extension from file path - file_path: Optional[str] = None + file_path: str | None = None # Check if source is a file path if isinstance(source, (str, Path)): @@ -86,7 +86,7 @@ def _infer_file_type_from_source( return None -def _normalize_file_type(file_type: Optional[FileTypeInput]) -> Optional[FileType]: +def _normalize_file_type(file_type: FileTypeInput | None) -> FileType | None: """ Normalize file type input to the numeric format expected by the API. @@ -145,26 +145,26 @@ def __init__( *, api_url: str, access_token: Secret = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN"), - file_type: Optional[FileTypeInput] = None, - use_doc_orientation_classify: Optional[bool] = None, - use_doc_unwarping: Optional[bool] = None, - use_layout_detection: Optional[bool] = None, - use_chart_recognition: Optional[bool] = None, - layout_threshold: Optional[Union[float, dict]] = None, - layout_nms: Optional[bool] = None, - layout_unclip_ratio: Optional[Union[float, tuple[float, float], dict]] = None, - layout_merge_bboxes_mode: Optional[Union[str, dict]] = None, - prompt_label: Optional[str] = None, - format_block_content: Optional[bool] = None, - repetition_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - prettify_markdown: Optional[bool] = None, - show_formula_number: Optional[bool] = None, - visualize: Optional[bool] = None, - additional_params: Optional[dict[str, Any]] = None, + file_type: FileTypeInput | None = None, + use_doc_orientation_classify: bool | None = None, + use_doc_unwarping: bool | None = None, + use_layout_detection: bool | None = None, + use_chart_recognition: bool | None = None, + layout_threshold: float | dict | None = None, + layout_nms: bool | None = None, + layout_unclip_ratio: float | tuple[float, float] | dict | None = None, + layout_merge_bboxes_mode: str | dict | None = None, + prompt_label: str | None = None, + format_block_content: bool | None = None, + repetition_penalty: float | None = None, + temperature: float | None = None, + top_p: float | None = None, + min_pixels: int | None = None, + max_pixels: int | None = None, + prettify_markdown: bool | None = None, + show_formula_number: bool | None = None, + visualize: bool | None = None, + additional_params: dict[str, Any] | None = None, ): """ Create a `PaddleOCRVLDocumentConverter` component. @@ -421,8 +421,8 @@ def _parse(self, data: bytes, file_type: FileType) -> tuple[str, dict[str, Any]] @component.output_types(documents=list[Document], raw_paddleocr_responses=list[dict[str, Any]]) def run( self, - sources: list[Union[str, Path, ByteStream]], - meta: Optional[Union[dict[str, Any], list[dict[str, Any]]]] = None, + sources: list[str | Path | ByteStream], + meta: dict[str, Any] | list[dict[str, Any]] | None = None, ) -> dict[str, Any]: """ Convert image or PDF files to Documents. @@ -448,7 +448,7 @@ def run( meta_list = normalize_metadata(meta, sources_count=len(sources)) - for source, metadata in zip(sources, meta_list): + for source, metadata in zip(sources, meta_list, strict=True): try: bytestream = get_bytestream_from_source(source) except Exception as e: From 349e20d48dc9068b870536dcdcec8d558149faf7 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 12 Jan 2026 15:38:51 +0100 Subject: [PATCH 2/2] chore!: paddleocr - drop Python 3.9 and use X|Y typing --- .../converters/paddleocr/paddleocr_vl_document_converter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/paddleocr/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py b/integrations/paddleocr/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py index fdc50a7ba4..8a1aa24ae7 100644 --- a/integrations/paddleocr/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py +++ b/integrations/paddleocr/src/haystack_integrations/components/converters/paddleocr/paddleocr_vl_document_converter.py @@ -86,7 +86,7 @@ def _infer_file_type_from_source( return None -def _normalize_file_type(file_type: FileTypeInput | None) -> FileType | None: +def _normalize_file_type(file_type: FileTypeInput) -> FileType | None: """ Normalize file type input to the numeric format expected by the API. @@ -145,7 +145,7 @@ def __init__( *, api_url: str, access_token: Secret = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN"), - file_type: FileTypeInput | None = None, + file_type: FileTypeInput = None, use_doc_orientation_classify: bool | None = None, use_doc_unwarping: bool | None = None, use_layout_detection: bool | None = None,