diff --git a/langchain-paddleocr/README.md b/langchain-paddleocr/README.md index 3411266a4fb..12010f6c8be 100644 --- a/langchain-paddleocr/README.md +++ b/langchain-paddleocr/README.md @@ -41,6 +41,63 @@ for doc in docs[:2]: print("---") ``` +### `PaddleOCRLoader` + +The `PaddleOCRLoader` wraps the **local** PaddleOCR library to extract text from PDF and image files — no cloud API or access token required. + +It supports two modes: + +- **Basic OCR** (default) — fast text extraction using PP-OCRv5. +- **Structure mode** — layout-aware extraction (tables, titles, figures) using PP-StructureV3. + +#### Basic OCR + +```python +from langchain_paddleocr import PaddleOCRLoader + +loader = PaddleOCRLoader(file_path="path/to/document.pdf") +docs = loader.load() + +for doc in docs: + print(f"Page {doc.metadata['page']}: {doc.page_content[:100]}...") + print(f"Confidence: {doc.metadata['confidence']:.2f}") +``` + +#### Structure mode + +```python +from langchain_paddleocr import PaddleOCRLoader +from langchain_paddleocr.document_loaders.paddleocr import PaddleOCRConfig + +config = PaddleOCRConfig(lang="en", use_table_recognition=True) +loader = PaddleOCRLoader( + file_path=["page1.png", "page2.png"], + use_structure=True, + config=config, +) + +for doc in loader.lazy_load(): + print(doc.page_content) + print(doc.metadata["layout_blocks"]) +``` + +#### Configuration + +Use `PaddleOCRConfig` to pass engine parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `lang` | `str` | Language code (`"ch"`, `"en"`, `"fr"`, etc.) | +| `ocr_version` | `str` | Pipeline version (`"PP-OCRv3"`, `"PP-OCRv4"`, `"PP-OCRv5"`) | +| `use_doc_orientation_classify` | `bool` | Enable document orientation classification | +| `use_doc_unwarping` | `bool` | Enable document de-warping | +| `text_det_thresh` | `float` | Detection confidence threshold | +| `text_rec_score_thresh` | `float` | Recognition confidence threshold | +| `use_table_recognition` | `bool` | Enable table recognition (structure mode) | +| `use_chart_recognition` | `bool` | Enable chart recognition (structure mode) | + +See the full list in `PaddleOCRConfig`. + ## 📖 Documentation For full documentation, see the [LangChain Docs](https://docs.langchain.com/oss/python/integrations/providers/baidu). diff --git a/langchain-paddleocr/README_cn.md b/langchain-paddleocr/README_cn.md index 9ed724f8105..57a79493473 100644 --- a/langchain-paddleocr/README_cn.md +++ b/langchain-paddleocr/README_cn.md @@ -42,6 +42,63 @@ for doc in docs[:2]: ``` +### `PaddleOCRLoader` + +`PaddleOCRLoader` 封装了 **本地** PaddleOCR 库,从 PDF 和图像文件中提取文本 — 无需云 API 或访问令牌。 + +支持两种模式: + +- **基础 OCR**(默认)— 使用 PP-OCRv5 进行快速文本提取。 +- **版面分析模式** — 使用 PP-StructureV3 进行版面感知提取(表格、标题、图片等)。 + +#### 基础 OCR + +```python +from langchain_paddleocr import PaddleOCRLoader + +loader = PaddleOCRLoader(file_path="path/to/document.pdf") +docs = loader.load() + +for doc in docs: + print(f"页面 {doc.metadata['page']}: {doc.page_content[:100]}...") + print(f"置信度: {doc.metadata['confidence']:.2f}") +``` + +#### 版面分析模式 + +```python +from langchain_paddleocr import PaddleOCRLoader +from langchain_paddleocr.document_loaders.paddleocr import PaddleOCRConfig + +config = PaddleOCRConfig(lang="ch", use_table_recognition=True) +loader = PaddleOCRLoader( + file_path=["page1.png", "page2.png"], + use_structure=True, + config=config, +) + +for doc in loader.lazy_load(): + print(doc.page_content) + print(doc.metadata["layout_blocks"]) +``` + +#### 配置 + +使用 `PaddleOCRConfig` 传递引擎参数: + +| 参数 | 类型 | 说明 | +|------|------|------| +| `lang` | `str` | 语言代码(`"ch"`、`"en"`、`"fr"` 等) | +| `ocr_version` | `str` | 流水线版本(`"PP-OCRv3"`、`"PP-OCRv4"`、`"PP-OCRv5"`) | +| `use_doc_orientation_classify` | `bool` | 启用文档方向分类 | +| `use_doc_unwarping` | `bool` | 启用文档去弯曲 | +| `text_det_thresh` | `float` | 检测置信度阈值 | +| `text_rec_score_thresh` | `float` | 识别置信度阈值 | +| `use_table_recognition` | `bool` | 启用表格识别(版面分析模式) | +| `use_chart_recognition` | `bool` | 启用图表识别(版面分析模式) | + +完整参数请参阅 `PaddleOCRConfig`。 + ## 📖 文档 完整文档请参阅 [LangChain 文档](https://docs.langchain.com/oss/python/integrations/providers/baidu)。 diff --git a/langchain-paddleocr/langchain_paddleocr/__init__.py b/langchain-paddleocr/langchain_paddleocr/__init__.py index d817f231c81..6e53363e42f 100644 --- a/langchain-paddleocr/langchain_paddleocr/__init__.py +++ b/langchain-paddleocr/langchain_paddleocr/__init__.py @@ -1,3 +1,12 @@ -from .document_loaders import PaddleOCRVLLoader +from .document_loaders import PaddleOCRLoader -__all__ = ["PaddleOCRVLLoader"] +__all__ = ["PaddleOCRLoader", "PaddleOCRVLLoader"] + + +def __getattr__(name: str) -> object: + if name == "PaddleOCRVLLoader": + from .document_loaders import PaddleOCRVLLoader + + return PaddleOCRVLLoader + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/langchain-paddleocr/langchain_paddleocr/document_loaders/__init__.py b/langchain-paddleocr/langchain_paddleocr/document_loaders/__init__.py index e487c09f765..aec7031c0d8 100644 --- a/langchain-paddleocr/langchain_paddleocr/document_loaders/__init__.py +++ b/langchain-paddleocr/langchain_paddleocr/document_loaders/__init__.py @@ -1,3 +1,12 @@ -from .paddleocr_vl import PaddleOCRVLLoader +from .paddleocr import PaddleOCRLoader -__all__ = ["PaddleOCRVLLoader"] +__all__ = ["PaddleOCRLoader", "PaddleOCRVLLoader"] + + +def __getattr__(name: str) -> object: + if name == "PaddleOCRVLLoader": + from .paddleocr_vl import PaddleOCRVLLoader + + return PaddleOCRVLLoader + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/langchain-paddleocr/langchain_paddleocr/document_loaders/paddleocr.py b/langchain-paddleocr/langchain_paddleocr/document_loaders/paddleocr.py new file mode 100644 index 00000000000..9a610abf42f --- /dev/null +++ b/langchain-paddleocr/langchain_paddleocr/document_loaders/paddleocr.py @@ -0,0 +1,422 @@ +"""PaddleOCR document loader for local OCR inference. + +This module provides ``PaddleOCRLoader``, a LangChain document loader that wraps +the local PaddleOCR library (PP-OCRv5 and PP-StructureV3) to extract text from +PDF and image files -- without requiring any cloud API. +""" + +from __future__ import annotations + +import dataclasses +import logging +from collections.abc import Iterable, Iterator +from pathlib import Path +from typing import Any + +from langchain_core.document_loaders import BaseLoader +from langchain_core.documents import Document + +logger = logging.getLogger(__name__) + +_IMAGE_EXTENSIONS = { + ".jpg", + ".jpeg", + ".png", + ".bmp", + ".tiff", + ".tif", + ".webp", +} + +_PDF_EXTENSIONS = {".pdf"} + +_SUPPORTED_EXTENSIONS = _IMAGE_EXTENSIONS | _PDF_EXTENSIONS + +_PAGES_DELIMITER = "\n\f" + + +# --------------------------------------------------------------------------- +# Exceptions +# --------------------------------------------------------------------------- + + +class PaddleOCRLoaderError(Exception): + """Base exception for all errors raised by :class:`PaddleOCRLoader`.""" + + +class UnsupportedFileTypeError(PaddleOCRLoaderError): + """Raised when a file has an unsupported extension.""" + + +class FileReadError(PaddleOCRLoaderError): + """Raised when a file cannot be read from disk.""" + + +class OCREngineError(PaddleOCRLoaderError): + """Raised when the PaddleOCR engine fails during inference.""" + + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + + +@dataclasses.dataclass +class PaddleOCRConfig: + """Configuration for the PaddleOCR local inference engine. + + All fields mirror the PaddleOCR / PP-StructureV3 constructor parameters. + Fields left as ``None`` use the library defaults. + + Example:: + + config = PaddleOCRConfig(lang="en", use_doc_orientation_classify=True) + loader = PaddleOCRLoader(file_path="doc.pdf", config=config) + """ + + # -- Language & version ------------------------------------------------- + lang: str | None = None + """Language code for OCR (e.g. ``"ch"``, ``"en"``, ``"fr"``). + Defaults to the library default (``"ch"``).""" + + ocr_version: str | None = None + """OCR pipeline version (``"PP-OCRv3"``, ``"PP-OCRv4"``, ``"PP-OCRv5"``).""" + + # -- Document pre-processing -------------------------------------------- + use_doc_orientation_classify: bool | None = None + """Enable automatic document orientation classification.""" + + use_doc_unwarping: bool | None = None + """Enable document de-warping (straightening curved text).""" + + use_textline_orientation: bool | None = None + """Enable text-line orientation correction.""" + + # -- Text detection ----------------------------------------------------- + text_detection_model_name: str | None = None + text_detection_model_dir: str | None = None + text_det_limit_side_len: int | None = None + text_det_limit_type: str | None = None + text_det_thresh: float | None = None + text_det_box_thresh: float | None = None + text_det_unclip_ratio: float | None = None + + # -- Text recognition --------------------------------------------------- + text_recognition_model_name: str | None = None + text_recognition_model_dir: str | None = None + text_recognition_batch_size: int | None = None + text_rec_score_thresh: float | None = None + + # -- Structure mode (PP-StructureV3) ------------------------------------ + use_table_recognition: bool | None = None + """Enable table structure recognition (structure mode only).""" + + use_formula_recognition: bool | None = None + """Enable formula recognition (structure mode only).""" + + use_chart_recognition: bool | None = None + """Enable chart recognition (structure mode only).""" + + use_seal_recognition: bool | None = None + """Enable seal text recognition (structure mode only).""" + + use_region_detection: bool | None = None + """Enable region detection (structure mode only).""" + + layout_detection_model_name: str | None = None + layout_detection_model_dir: str | None = None + layout_threshold: float | None = None + layout_nms: bool | None = None + layout_unclip_ratio: float | None = None + layout_merge_bboxes_mode: str | None = None + + format_block_content: bool | None = None + """Whether to format block content in structure mode output.""" + + markdown_ignore_labels: list[str] | None = None + """Layout labels to skip when generating markdown (structure mode only).""" + + def to_engine_kwargs(self) -> dict[str, Any]: + """Return a dict of non-``None`` fields suitable for engine construction.""" + return { + field.name: value + for field in dataclasses.fields(self) + if (value := getattr(self, field.name)) is not None + } + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _validate_file_path(path: Path) -> None: + """Validate that *path* exists and has a supported extension. + + Raises: + FileReadError: If the path does not exist or is not a file. + UnsupportedFileTypeError: If the file extension is not supported. + """ + if not path.exists(): + msg = f"File not found: '{path}'" + raise FileReadError(msg) + if not path.is_file(): + msg = f"Path is not a file: '{path}'" + raise FileReadError(msg) + suffix = path.suffix.lower() + if suffix not in _SUPPORTED_EXTENSIONS: + msg = ( + f"Unsupported file extension '{suffix}' for '{path}'. " + f"Supported extensions: {sorted(_SUPPORTED_EXTENSIONS)}" + ) + raise UnsupportedFileTypeError(msg) + + +def _extract_text_from_ocr_result(result: dict[str, Any]) -> str: + """Join recognised text lines from a single-page OCR result dict.""" + texts: list[str] = result.get("rec_texts", []) + return "\n".join(texts) + + +def _mean_confidence(result: dict[str, Any]) -> float: + """Compute the mean recognition confidence for one page.""" + scores: list[float] = result.get("rec_scores", []) + if not scores: + return 0.0 + return sum(scores) / len(scores) + + +# --------------------------------------------------------------------------- +# Loader +# --------------------------------------------------------------------------- + + +class PaddleOCRLoader(BaseLoader): + """Load documents using the local PaddleOCR library. + + Supports two modes of operation: + + * **Basic OCR** (default) -- uses :class:`paddleocr.PaddleOCR` to extract + raw text lines with bounding boxes and confidence scores. + * **Structure mode** -- uses :class:`paddleocr.PPStructureV3` for + layout-aware extraction including tables, titles, figures, and formulas. + + Example -- basic OCR:: + + from langchain_paddleocr import PaddleOCRLoader + + loader = PaddleOCRLoader(file_path="invoice.pdf") + docs = loader.load() + + Example -- structure mode with custom config:: + + from langchain_paddleocr import PaddleOCRLoader + from langchain_paddleocr.document_loaders.paddleocr import PaddleOCRConfig + + config = PaddleOCRConfig(lang="en", use_table_recognition=True) + loader = PaddleOCRLoader( + file_path=["page1.png", "page2.png"], + use_structure=True, + config=config, + ) + for doc in loader.lazy_load(): + print(doc.page_content) + """ + + def __init__( + self, + file_path: str | Iterable[str], + *, + use_structure: bool = False, + config: PaddleOCRConfig | None = None, + ) -> None: + """Initialise the loader. + + Args: + file_path: Single path or iterable of paths to PDF / image files. + use_structure: If ``True``, use PP-StructureV3 for layout-aware + extraction. Otherwise use basic PaddleOCR text extraction. + config: Optional :class:`PaddleOCRConfig` with engine parameters. + When ``None``, library defaults are used. + """ + self._file_paths: list[str] = ( + list(file_path) + if isinstance(file_path, Iterable) and not isinstance(file_path, str) + else [file_path] + ) + self._use_structure = use_structure + self._config = config or PaddleOCRConfig() + + # -- Engine helpers (lazy-imported) ------------------------------------- + + def _build_ocr_engine(self) -> Any: + """Create a :class:`paddleocr.PaddleOCR` engine instance.""" + try: + from paddleocr import PaddleOCR # Lazy import -- slow first load + except ImportError as exc: + msg = ( + "The 'paddleocr' package is required for PaddleOCRLoader. " + "Install it with: pip install paddleocr" + ) + raise ImportError(msg) from exc + + kwargs = self._config.to_engine_kwargs() + logger.debug("Initialising PaddleOCR engine with params: %s", kwargs) + return PaddleOCR(**kwargs) + + def _build_structure_engine(self) -> Any: + """Create a :class:`paddleocr.PPStructureV3` engine instance.""" + try: + from paddleocr import PPStructureV3 # Lazy import + except ImportError as exc: + msg = ( + "The 'paddleocr' package is required for PaddleOCRLoader " + "structure mode. Install it with: pip install paddleocr" + ) + raise ImportError(msg) from exc + + kwargs = self._config.to_engine_kwargs() + logger.debug( + "Initialising PPStructureV3 engine with params: %s", + kwargs, + ) + return PPStructureV3(**kwargs) + + # -- Core inference ----------------------------------------------------- + + def _process_with_ocr( + self, + engine: Any, + file_path: str, + ) -> Iterator[Document]: + """Run basic OCR on *file_path* and yield one Document per page.""" + try: + results: list[dict[str, Any]] = engine.predict(file_path) + except Exception as exc: + msg = f"PaddleOCR engine failed on '{file_path}': {exc}" + raise OCREngineError(msg) from exc + + if not results: + logger.warning( + "%s: OCR returned no results for '%s'. Yielding empty document.", + self.__class__.__name__, + file_path, + ) + yield Document( + page_content="", + metadata={"source": file_path, "engine": "paddleocr"}, + ) + return + + for page_index, page_result in enumerate(results): + text = _extract_text_from_ocr_result(page_result) + confidence = _mean_confidence(page_result) + + if not text: + logger.warning( + "%s: No text extracted from page %d of '%s'.", + self.__class__.__name__, + page_index, + file_path, + ) + + metadata: dict[str, Any] = { + "source": file_path, + "page": page_index, + "confidence": confidence, + "language": self._config.lang or "ch", + "engine": "paddleocr", + } + yield Document(page_content=text, metadata=metadata) + + def _process_with_structure( + self, + engine: Any, + file_path: str, + ) -> Iterator[Document]: + """Run PP-StructureV3 on *file_path* and yield one Document per page.""" + try: + results: list[dict[str, Any]] = engine.predict(file_path) + except Exception as exc: + msg = f"PPStructureV3 engine failed on '{file_path}': {exc}" + raise OCREngineError(msg) from exc + + if not results: + logger.warning( + "%s: Structure analysis returned no results for '%s'. " + "Yielding empty document.", + self.__class__.__name__, + file_path, + ) + yield Document( + page_content="", + metadata={"source": file_path, "engine": "ppstructurev3"}, + ) + return + + for page_index, page_result in enumerate(results): + # --- Extract text from overall OCR result if available --------- + overall_ocr: dict[str, Any] = page_result.get("overall_ocr_res", {}) + ocr_text = _extract_text_from_ocr_result(overall_ocr) + confidence = _mean_confidence(overall_ocr) + + # --- Collect layout blocks ------------------------------------ + layout_blocks: list[dict[str, Any]] = page_result.get("layout_blocks", []) + + if not ocr_text and not layout_blocks: + logger.warning( + "%s: No content extracted from page %d of '%s'.", + self.__class__.__name__, + page_index, + file_path, + ) + + metadata: dict[str, Any] = { + "source": file_path, + "page": page_index, + "confidence": confidence, + "language": self._config.lang or "ch", + "engine": "ppstructurev3", + "layout_blocks": layout_blocks, + } + yield Document(page_content=ocr_text, metadata=metadata) + + # -- BaseLoader interface ----------------------------------------------- + + def lazy_load(self) -> Iterator[Document]: + """Lazily load documents from the configured file paths. + + Yields one :class:`~langchain_core.documents.Document` per page per + file. Multi-page PDFs produce multiple documents. + + Raises: + FileReadError: If a file path does not exist or is not a file. + UnsupportedFileTypeError: If a file has an unsupported extension. + OCREngineError: If the PaddleOCR engine fails during inference. + """ + # Validate all paths up-front before expensive engine initialisation. + resolved_paths: list[Path] = [] + for raw_path in self._file_paths: + path = Path(raw_path) + _validate_file_path(path) + resolved_paths.append(path) + + # Build the engine once for all files. + engine = ( + self._build_structure_engine() + if self._use_structure + else self._build_ocr_engine() + ) + + for path in resolved_paths: + file_str = str(path) + logger.info( + "%s: Processing '%s' (mode=%s).", + self.__class__.__name__, + file_str, + "structure" if self._use_structure else "ocr", + ) + if self._use_structure: + yield from self._process_with_structure(engine, file_str) + else: + yield from self._process_with_ocr(engine, file_str) diff --git a/langchain-paddleocr/tests/integration_tests/document_loaders/test_paddleocr_loader.py b/langchain-paddleocr/tests/integration_tests/document_loaders/test_paddleocr_loader.py new file mode 100644 index 00000000000..be7cde6bb54 --- /dev/null +++ b/langchain-paddleocr/tests/integration_tests/document_loaders/test_paddleocr_loader.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from langchain_paddleocr.document_loaders.paddleocr import ( + PaddleOCRConfig, + PaddleOCRLoader, +) + + +@pytest.mark.requires("paddleocr") +def test_paddleocr_loader_ocr_mode_on_image() -> None: + """Integration test: basic OCR on a sample image. + + Requires a working PaddleOCR installation with downloaded models. + """ + tests_dir = Path(__file__).resolve().parents[2] + sample_image = tests_dir / "data" / "sample_img.jpg" + + if not sample_image.exists(): + pytest.skip(f"Sample image not found: {sample_image}") + + config = PaddleOCRConfig(lang="en") + loader = PaddleOCRLoader(file_path=str(sample_image), config=config) + docs = list(loader.lazy_load()) + + assert len(docs) >= 1 + for doc in docs: + assert isinstance(doc.page_content, str) + assert doc.metadata["source"] == str(sample_image) + assert doc.metadata["engine"] == "paddleocr" + assert "page" in doc.metadata + assert "confidence" in doc.metadata + + +@pytest.mark.requires("paddleocr") +def test_paddleocr_loader_ocr_mode_on_pdf() -> None: + """Integration test: basic OCR on a sample PDF.""" + tests_dir = Path(__file__).resolve().parents[2] + sample_pdf = tests_dir / "data" / "sample_pdf.pdf" + + if not sample_pdf.exists(): + pytest.skip(f"Sample PDF not found: {sample_pdf}") + + config = PaddleOCRConfig(lang="en") + loader = PaddleOCRLoader(file_path=str(sample_pdf), config=config) + docs = list(loader.lazy_load()) + + assert len(docs) >= 1 + for doc in docs: + assert isinstance(doc.page_content, str) + assert doc.metadata["source"] == str(sample_pdf) + assert doc.metadata["engine"] == "paddleocr" + + +@pytest.mark.requires("paddleocr") +def test_paddleocr_loader_structure_mode_on_image() -> None: + """Integration test: structure mode on a sample image.""" + tests_dir = Path(__file__).resolve().parents[2] + sample_image = tests_dir / "data" / "sample_img.jpg" + + if not sample_image.exists(): + pytest.skip(f"Sample image not found: {sample_image}") + + config = PaddleOCRConfig(lang="en") + loader = PaddleOCRLoader( + file_path=str(sample_image), + use_structure=True, + config=config, + ) + docs = list(loader.lazy_load()) + + assert len(docs) >= 1 + for doc in docs: + assert isinstance(doc.page_content, str) + assert doc.metadata["engine"] == "ppstructurev3" + assert "layout_blocks" in doc.metadata diff --git a/langchain-paddleocr/tests/unit_tests/document_loaders/test_paddleocr_loader.py b/langchain-paddleocr/tests/unit_tests/document_loaders/test_paddleocr_loader.py new file mode 100644 index 00000000000..aab5f8fe47e --- /dev/null +++ b/langchain-paddleocr/tests/unit_tests/document_loaders/test_paddleocr_loader.py @@ -0,0 +1,386 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_paddleocr.document_loaders.paddleocr import ( + FileReadError, + OCREngineError, + PaddleOCRConfig, + PaddleOCRLoader, + UnsupportedFileTypeError, + _extract_text_from_ocr_result, + _mean_confidence, +) + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +SAMPLE_OCR_PAGE: dict[str, Any] = { + "rec_texts": ["Hello", "World"], + "rec_scores": [0.98, 0.95], + "dt_polys": [ + [[0, 0], [100, 0], [100, 20], [0, 20]], + [[0, 30], [100, 30], [100, 50], [0, 50]], + ], +} + +SAMPLE_OCR_PAGE_EMPTY: dict[str, Any] = { + "rec_texts": [], + "rec_scores": [], + "dt_polys": [], +} + +SAMPLE_STRUCTURE_PAGE: dict[str, Any] = { + "overall_ocr_res": { + "rec_texts": ["Title", "Some body text"], + "rec_scores": [0.99, 0.92], + }, + "layout_blocks": [ + {"type": "title", "bbox": [0, 0, 200, 40], "res": {}}, + {"type": "text", "bbox": [0, 50, 200, 200], "res": {}}, + ], +} + + +@pytest.fixture +def tmp_image(tmp_path: Path) -> Path: + img = tmp_path / "test.png" + img.write_bytes(b"\x89PNG\r\n\x1a\n") + return img + + +@pytest.fixture +def tmp_pdf(tmp_path: Path) -> Path: + pdf = tmp_path / "test.pdf" + pdf.write_bytes(b"%PDF-1.4") + return pdf + + +@pytest.fixture +def tmp_unsupported(tmp_path: Path) -> Path: + f = tmp_path / "data.xyz" + f.write_bytes(b"some data") + return f + + +# --------------------------------------------------------------------------- +# PaddleOCRConfig tests +# --------------------------------------------------------------------------- + + +class TestPaddleOCRConfig: + def test_default_config_produces_empty_kwargs(self) -> None: + config = PaddleOCRConfig() + assert config.to_engine_kwargs() == {} + + def test_non_none_fields_are_included(self) -> None: + config = PaddleOCRConfig(lang="en", text_det_thresh=0.5) + kwargs = config.to_engine_kwargs() + assert kwargs == {"lang": "en", "text_det_thresh": 0.5} + + def test_none_fields_are_excluded(self) -> None: + config = PaddleOCRConfig(lang="fr") + kwargs = config.to_engine_kwargs() + assert "ocr_version" not in kwargs + assert "text_detection_model_dir" not in kwargs + + +# --------------------------------------------------------------------------- +# Helper function tests +# --------------------------------------------------------------------------- + + +class TestHelpers: + def test_extract_text_joins_lines(self) -> None: + text = _extract_text_from_ocr_result(SAMPLE_OCR_PAGE) + assert text == "Hello\nWorld" + + def test_extract_text_empty_result(self) -> None: + assert _extract_text_from_ocr_result(SAMPLE_OCR_PAGE_EMPTY) == "" + + def test_extract_text_missing_key(self) -> None: + assert _extract_text_from_ocr_result({}) == "" + + def test_mean_confidence_normal(self) -> None: + conf = _mean_confidence(SAMPLE_OCR_PAGE) + assert conf == pytest.approx(0.965) + + def test_mean_confidence_empty(self) -> None: + assert _mean_confidence(SAMPLE_OCR_PAGE_EMPTY) == 0.0 + + def test_mean_confidence_missing_key(self) -> None: + assert _mean_confidence({}) == 0.0 + + +# --------------------------------------------------------------------------- +# File validation tests +# --------------------------------------------------------------------------- + + +class TestFileValidation: + def test_nonexistent_file_raises_file_read_error(self) -> None: + loader = PaddleOCRLoader(file_path="nonexistent.png") + with pytest.raises(FileReadError, match="File not found"): + list(loader.lazy_load()) + + def test_directory_path_raises_file_read_error(self, tmp_path: Path) -> None: + loader = PaddleOCRLoader(file_path=str(tmp_path)) + with pytest.raises(FileReadError, match="not a file"): + list(loader.lazy_load()) + + def test_unsupported_extension_raises(self, tmp_unsupported: Path) -> None: + loader = PaddleOCRLoader(file_path=str(tmp_unsupported)) + with pytest.raises(UnsupportedFileTypeError, match="Unsupported file"): + list(loader.lazy_load()) + + def test_supported_image_extensions(self, tmp_path: Path) -> None: + for ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"): + f = tmp_path / f"test{ext}" + f.write_bytes(b"fake") + loader = PaddleOCRLoader(file_path=str(f)) + # Should not raise during validation -- will fail at engine build + # which is separate from validation. + with patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) as mock_build: + mock_engine = MagicMock() + mock_engine.predict.return_value = [SAMPLE_OCR_PAGE] + mock_build.return_value = mock_engine + docs = list(loader.lazy_load()) + assert len(docs) == 1 + + def test_pdf_extension_supported(self, tmp_pdf: Path) -> None: + loader = PaddleOCRLoader(file_path=str(tmp_pdf)) + with patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) as mock_build: + mock_engine = MagicMock() + mock_engine.predict.return_value = [SAMPLE_OCR_PAGE] + mock_build.return_value = mock_engine + docs = list(loader.lazy_load()) + assert len(docs) == 1 + + +# --------------------------------------------------------------------------- +# Basic OCR mode tests +# --------------------------------------------------------------------------- + + +class TestOCRMode: + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) + def test_single_page_document(self, mock_build: MagicMock, tmp_image: Path) -> None: + mock_engine = MagicMock() + mock_engine.predict.return_value = [SAMPLE_OCR_PAGE] + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=str(tmp_image)) + docs = list(loader.lazy_load()) + + assert len(docs) == 1 + doc = docs[0] + assert doc.page_content == "Hello\nWorld" + assert doc.metadata["source"] == str(tmp_image) + assert doc.metadata["page"] == 0 + assert doc.metadata["confidence"] == pytest.approx(0.965) + assert doc.metadata["engine"] == "paddleocr" + + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) + def test_multi_page_pdf(self, mock_build: MagicMock, tmp_pdf: Path) -> None: + page1 = {"rec_texts": ["Page one"], "rec_scores": [0.90]} + page2 = {"rec_texts": ["Page two"], "rec_scores": [0.85]} + mock_engine = MagicMock() + mock_engine.predict.return_value = [page1, page2] + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=str(tmp_pdf)) + docs = list(loader.lazy_load()) + + assert len(docs) == 2 + assert docs[0].page_content == "Page one" + assert docs[0].metadata["page"] == 0 + assert docs[1].page_content == "Page two" + assert docs[1].metadata["page"] == 1 + + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) + def test_empty_ocr_result_yields_empty_document( + self, mock_build: MagicMock, tmp_image: Path + ) -> None: + mock_engine = MagicMock() + mock_engine.predict.return_value = [] + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=str(tmp_image)) + docs = list(loader.lazy_load()) + + assert len(docs) == 1 + assert docs[0].page_content == "" + + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) + def test_engine_failure_raises_ocr_engine_error( + self, mock_build: MagicMock, tmp_image: Path + ) -> None: + mock_engine = MagicMock() + mock_engine.predict.side_effect = RuntimeError("GPU out of memory") + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=str(tmp_image)) + with pytest.raises(OCREngineError, match="GPU out of memory"): + list(loader.lazy_load()) + + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) + def test_multiple_files( + self, mock_build: MagicMock, tmp_image: Path, tmp_pdf: Path + ) -> None: + mock_engine = MagicMock() + mock_engine.predict.return_value = [SAMPLE_OCR_PAGE] + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=[str(tmp_image), str(tmp_pdf)]) + docs = list(loader.lazy_load()) + + assert len(docs) == 2 + assert docs[0].metadata["source"] == str(tmp_image) + assert docs[1].metadata["source"] == str(tmp_pdf) + + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) + def test_custom_config_passed_to_engine( + self, mock_build: MagicMock, tmp_image: Path + ) -> None: + mock_engine = MagicMock() + mock_engine.predict.return_value = [SAMPLE_OCR_PAGE] + mock_build.return_value = mock_engine + + config = PaddleOCRConfig(lang="en", text_det_thresh=0.3) + loader = PaddleOCRLoader( + file_path=str(tmp_image), + config=config, + ) + docs = list(loader.lazy_load()) + + assert len(docs) == 1 + assert docs[0].metadata["language"] == "en" + + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_ocr_engine" + ) + def test_default_language_is_ch( + self, mock_build: MagicMock, tmp_image: Path + ) -> None: + mock_engine = MagicMock() + mock_engine.predict.return_value = [SAMPLE_OCR_PAGE] + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=str(tmp_image)) + docs = list(loader.lazy_load()) + assert docs[0].metadata["language"] == "ch" + + +# --------------------------------------------------------------------------- +# Structure mode tests +# --------------------------------------------------------------------------- + + +class TestStructureMode: + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_structure_engine" + ) + def test_structure_mode_document( + self, mock_build: MagicMock, tmp_image: Path + ) -> None: + mock_engine = MagicMock() + mock_engine.predict.return_value = [SAMPLE_STRUCTURE_PAGE] + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=str(tmp_image), use_structure=True) + docs = list(loader.lazy_load()) + + assert len(docs) == 1 + doc = docs[0] + assert doc.page_content == "Title\nSome body text" + assert doc.metadata["engine"] == "ppstructurev3" + assert len(doc.metadata["layout_blocks"]) == 2 + assert doc.metadata["layout_blocks"][0]["type"] == "title" + assert doc.metadata["layout_blocks"][1]["type"] == "text" + + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_structure_engine" + ) + def test_structure_mode_empty_result( + self, mock_build: MagicMock, tmp_image: Path + ) -> None: + mock_engine = MagicMock() + mock_engine.predict.return_value = [] + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=str(tmp_image), use_structure=True) + docs = list(loader.lazy_load()) + + assert len(docs) == 1 + assert docs[0].page_content == "" + assert docs[0].metadata["engine"] == "ppstructurev3" + + @patch( + "langchain_paddleocr.document_loaders.paddleocr.PaddleOCRLoader._build_structure_engine" + ) + def test_structure_engine_failure_raises( + self, mock_build: MagicMock, tmp_image: Path + ) -> None: + mock_engine = MagicMock() + mock_engine.predict.side_effect = RuntimeError("Model load failed") + mock_build.return_value = mock_engine + + loader = PaddleOCRLoader(file_path=str(tmp_image), use_structure=True) + with pytest.raises(OCREngineError, match="Model load failed"): + list(loader.lazy_load()) + + +# --------------------------------------------------------------------------- +# Constructor / init tests +# --------------------------------------------------------------------------- + + +class TestConstructor: + def test_single_string_path_normalised_to_list(self) -> None: + loader = PaddleOCRLoader(file_path="test.png") + assert loader._file_paths == ["test.png"] + + def test_iterable_paths_stored(self) -> None: + loader = PaddleOCRLoader(file_path=["a.png", "b.pdf"]) + assert loader._file_paths == ["a.png", "b.pdf"] + + def test_default_config_created_when_none(self) -> None: + loader = PaddleOCRLoader(file_path="test.png") + assert isinstance(loader._config, PaddleOCRConfig) + + def test_use_structure_default_false(self) -> None: + loader = PaddleOCRLoader(file_path="test.png") + assert loader._use_structure is False + + +# --------------------------------------------------------------------------- +# Import test +# --------------------------------------------------------------------------- + + +def test_import_from_package() -> None: + """Verify PaddleOCRLoader is importable from the top-level package.""" + from langchain_paddleocr import PaddleOCRLoader as LoaderAlias + + assert LoaderAlias is PaddleOCRLoader