Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
### Enhancements
- **Add page number support to v1 HTML parser**: The v1 HTML parser now reads `data-page-number` attributes from ancestor elements and includes the page number in element metadata, consistent with the v2 parser behavior.

### Fixes
- **Fix PaddleOCR language code conversion in `OCRAgent.get_agent()`**: When PaddleOCR is configured as the OCR agent, `get_agent()` now converts Tesseract language codes (e.g., `"eng"`) to PaddleOCR language codes (e.g., `"en"`) before instantiating the agent. Previously, Tesseract-format codes were passed unconverted to PaddleOCR in the `ocr_only` strategy path and in the `table_structure` module, causing language detection failures.

## 0.22.17

### Fixes
Expand Down
36 changes: 36 additions & 0 deletions test_unstructured/partition/utils/ocr_models/test_ocr_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,42 @@ def it_provides_access_to_the_configured_OCR_agent(
get_instance_.assert_called_once_with(OCR_AGENT_TESSERACT, "eng")
assert ocr_agent is ocr_agent_

def it_converts_tesseract_language_to_paddle_when_paddle_is_configured(
    self,
    _get_ocr_agent_cls_qname_: Mock,
    get_instance_: Mock,
    ocr_agent_: Mock,
):
    """`get_agent()` hands PaddleOCR the Paddle code ("en") for Tesseract "eng"."""
    # -- arrange: the configured agent is PaddleOCR and instantiation is mocked --
    get_instance_.return_value = ocr_agent_
    _get_ocr_agent_cls_qname_.return_value = OCR_AGENT_PADDLE

    # -- act: request an agent using a Tesseract-format language code --
    agent = OCRAgent.get_agent(language="eng")

    # -- assert: the qualified name was looked up once, with no arguments ... --
    _get_ocr_agent_cls_qname_.assert_called_once_with()
    # -- ... the language reached `get_instance()` already converted ... --
    get_instance_.assert_called_once_with(OCR_AGENT_PADDLE, "en")
    # -- ... and the instance produced by `get_instance()` is returned as-is --
    assert agent is ocr_agent_

@pytest.mark.parametrize(
    "tesseract_lang, expected_paddle_lang",
    [
        ("eng", "en"),
        ("ara", "ar"),
        ("chi_sim", "ch"),
        ("deu", "german"),
    ],
)
def it_converts_various_tesseract_languages_to_paddle_format(
    self,
    tesseract_lang: str,
    expected_paddle_lang: str,
    _get_ocr_agent_cls_qname_: Mock,
    get_instance_: Mock,
    ocr_agent_: Mock,
):
    """Each Tesseract code is converted to its PaddleOCR code before instantiation."""
    # -- arrange: the configured agent is PaddleOCR and instantiation is mocked --
    get_instance_.return_value = ocr_agent_
    _get_ocr_agent_cls_qname_.return_value = OCR_AGENT_PADDLE

    # -- act: only the call's effect on `get_instance()` matters here --
    OCRAgent.get_agent(language=tesseract_lang)

    # -- assert: `get_instance()` received the PaddleOCR-format language code --
    get_instance_.assert_called_once_with(OCR_AGENT_PADDLE, expected_paddle_lang)

def but_it_raises_when_the_requested_agent_is_not_whitelisted(
self, _get_ocr_agent_cls_qname_: Mock
):
Expand Down
5 changes: 5 additions & 0 deletions unstructured/partition/utils/ocr_models/ocr_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import TYPE_CHECKING

from unstructured.logger import logger
from unstructured.partition.common.lang import tesseract_to_paddle_language
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import (
OCR_AGENT_MODULES_WHITELIST,
Expand All @@ -29,8 +30,12 @@ def get_agent(cls, language: str) -> OCRAgent:
"""Get the configured OCRAgent instance.

The OCR package used by the agent is determined by the `OCR_AGENT` environment variable.
If PaddleOCR is configured, Tesseract language codes are automatically converted
to PaddleOCR language codes.
"""
ocr_agent_cls_qname = cls._get_ocr_agent_cls_qname()
if ocr_agent_cls_qname == OCR_AGENT_PADDLE:
language = tesseract_to_paddle_language(language)
return cls.get_instance(ocr_agent_cls_qname, language)

@staticmethod
Expand Down