diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49bab66762..acb8fda9cc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,9 @@
 ### Enhancements
 - **Add page number support to v1 HTML parser**: The v1 HTML parser now reads `data-page-number` attributes from ancestor elements and includes the page number in element metadata, consistent with the v2 parser behavior.
 
+### Fixes
+- **Fix PaddleOCR language code conversion in `OCRAgent.get_agent()`**: When PaddleOCR is configured as the OCR agent, `get_agent()` now converts Tesseract language codes (e.g., `"eng"`) to PaddleOCR language codes (e.g., `"en"`) before instantiating the agent. Previously, Tesseract-format codes were passed directly to PaddleOCR in the `ocr_only` strategy path and `table_structure` module, causing language detection failures.
+
 ## 0.22.17
 
 ### Fixes
diff --git a/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py b/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py
index 8140f4635e..8e5e3f2489 100644
--- a/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py
+++ b/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py
@@ -41,6 +41,42 @@ def it_provides_access_to_the_configured_OCR_agent(
         get_instance_.assert_called_once_with(OCR_AGENT_TESSERACT, "eng")
         assert ocr_agent is ocr_agent_
 
+    def it_converts_tesseract_language_to_paddle_when_paddle_is_configured(
+        self, _get_ocr_agent_cls_qname_: Mock, get_instance_: Mock, ocr_agent_: Mock
+    ):
+        _get_ocr_agent_cls_qname_.return_value = OCR_AGENT_PADDLE
+        get_instance_.return_value = ocr_agent_
+
+        ocr_agent = OCRAgent.get_agent(language="eng")
+
+        _get_ocr_agent_cls_qname_.assert_called_once_with()
+        get_instance_.assert_called_once_with(OCR_AGENT_PADDLE, "en")
+        assert ocr_agent is ocr_agent_
+
+    @pytest.mark.parametrize(
+        ("tesseract_lang", "expected_paddle_lang"),
+        [
+            ("eng", "en"),
+            ("ara", "ar"),
+            ("chi_sim", "ch"),
+            ("deu", "german"),
+        ],
+    )
+    def it_converts_various_tesseract_languages_to_paddle_format(
+        self,
+        tesseract_lang: str,
+        expected_paddle_lang: str,
+        _get_ocr_agent_cls_qname_: Mock,
+        get_instance_: Mock,
+        ocr_agent_: Mock,
+    ):
+        _get_ocr_agent_cls_qname_.return_value = OCR_AGENT_PADDLE
+        get_instance_.return_value = ocr_agent_
+
+        OCRAgent.get_agent(language=tesseract_lang)
+
+        get_instance_.assert_called_once_with(OCR_AGENT_PADDLE, expected_paddle_lang)
+
     def but_it_raises_when_the_requested_agent_is_not_whitelisted(
         self, _get_ocr_agent_cls_qname_: Mock
     ):
diff --git a/unstructured/partition/utils/ocr_models/ocr_interface.py b/unstructured/partition/utils/ocr_models/ocr_interface.py
index fd1e0ab143..c54d688173 100644
--- a/unstructured/partition/utils/ocr_models/ocr_interface.py
+++ b/unstructured/partition/utils/ocr_models/ocr_interface.py
@@ -6,6 +6,7 @@
 from typing import TYPE_CHECKING
 
 from unstructured.logger import logger
+from unstructured.partition.common.lang import tesseract_to_paddle_language
 from unstructured.partition.utils.config import env_config
 from unstructured.partition.utils.constants import (
     OCR_AGENT_MODULES_WHITELIST,
@@ -29,8 +30,12 @@ def get_agent(cls, language: str) -> OCRAgent:
         """Get the configured OCRAgent instance.
 
         The OCR package used by the agent is determined by the `OCR_AGENT` environment variable.
+        If PaddleOCR is configured, Tesseract language codes are automatically converted
+        to PaddleOCR language codes.
         """
         ocr_agent_cls_qname = cls._get_ocr_agent_cls_qname()
+        if ocr_agent_cls_qname == OCR_AGENT_PADDLE:
+            language = tesseract_to_paddle_language(language)
         return cls.get_instance(ocr_agent_cls_qname, language)
 
     @staticmethod