Skip to content

Commit a311bf4

Browse files
authored
test: Paddle OCR - add unit tests (#3208)
1 parent 75fe023 commit a311bf4

1 file changed

Lines changed: 263 additions & 0 deletions

File tree

integrations/paddleocr/tests/test_paddleocr_vl_document_converter.py

Lines changed: 263 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,9 @@
1515
from haystack_integrations.components.converters.paddleocr import (
1616
PaddleOCRVLDocumentConverter,
1717
)
18+
from haystack_integrations.components.converters.paddleocr.paddleocr_vl_document_converter import (
19+
_infer_file_type_from_source,
20+
)
1821

1922

2023
def create_empty_pdf(tmp_path, filename="test.pdf"):
@@ -111,6 +114,21 @@ def test_init_with_all_optional_parameters(self):
111114
assert converter.visualize is True
112115
assert converter.additional_params == {}
113116

117+
@pytest.mark.parametrize(
    ("file_type", "error_match"),
    [
        ("invalid_string", "Invalid `file_type` string"),
        (123, "Invalid `file_type` value"),
    ],
)
def test_init_invalid_file_type_raises(self, file_type, error_match):
    """Constructing the converter with an unsupported `file_type` raises ValueError.

    Each parametrized case supplies a bad `file_type` value together with the
    pattern the resulting error message is expected to match.
    """
    init_kwargs = {
        "api_url": "http://test-api-url.com",
        "access_token": Secret.from_token("test-access-token"),
        "file_type": file_type,
    }
    with pytest.raises(ValueError, match=error_match):
        PaddleOCRVLDocumentConverter(**init_kwargs)
131+
114132
def test_to_dict(self, monkeypatch):
115133
monkeypatch.setenv("AISTUDIO_ACCESS_TOKEN", "test-access-token")
116134
converter = PaddleOCRVLDocumentConverter(api_url="http://test-api-url.com")
@@ -498,6 +516,70 @@ def test_run_handles_api_error(self, mock_ocr_response, tmp_path):
498516
assert len(result["documents"]) == 2
499517
assert len(result["raw_paddleocr_responses"]) == 2
500518

519+
def test_run_skips_source_that_cannot_be_read(self, mock_ocr_response):
    """A source path that does not exist yields no output and no HTTP call."""
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )

    # A successful reply is prepared, although the test expects it to go unused.
    fake_reply = MagicMock(status_code=200)
    fake_reply.json.return_value = mock_ocr_response

    with patch("requests.post", return_value=fake_reply) as mock_post:
        output = ocr_converter.run(sources=["/non/existent/file.png"])

    assert len(output["documents"]) == 0
    assert len(output["raw_paddleocr_responses"]) == 0
    mock_post.assert_not_called()
537+
def test_run_handles_empty_text_response(self, tmp_path):
    """An API reply whose markdown text is empty still produces one document.

    The resulting document is expected to carry an empty string as content.
    """
    empty_page = {"markdown": {"text": ""}, "prunedResult": {}}
    api_payload = {
        "result": {
            "layoutParsingResults": [empty_page],
            "dataInfo": {"width": 1024, "height": 1024, "type": "image"},
        },
    }
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )
    image_path = create_empty_image(tmp_path, "test.png")

    fake_reply = MagicMock(status_code=200)
    fake_reply.json.return_value = api_payload

    with patch("requests.post", return_value=fake_reply):
        output = ocr_converter.run(sources=[str(image_path)])

    documents = output["documents"]
    assert len(documents) == 1
    assert documents[0].content == ""
563+
def test_run_omits_auth_header_when_token_missing(self, mock_ocr_response, tmp_path, monkeypatch):
    """Without a resolvable access token, no `Authorization` header is sent.

    The env var is removed and the secret is declared non-strict, so the
    converter is built with a secret that has no value to resolve.
    """
    monkeypatch.delenv("AISTUDIO_ACCESS_TOKEN", raising=False)
    optional_token = Secret.from_env_var("AISTUDIO_ACCESS_TOKEN", strict=False)
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=optional_token,
    )
    image_path = create_empty_image(tmp_path, "test.png")

    fake_reply = MagicMock(status_code=200)
    fake_reply.json.return_value = mock_ocr_response

    with patch("requests.post", return_value=fake_reply) as mock_post:
        output = ocr_converter.run(sources=[str(image_path)])

    assert len(output["documents"]) == 1
    # call_args[1] holds the keyword arguments of the most recent call.
    sent_headers = mock_post.call_args[1]["headers"]
    assert "Authorization" not in sent_headers
501583
def test_run_with_meta_single_dict(self, mock_ocr_response, tmp_path):
502584
"""Test that meta parameter with single dict is applied to all documents"""
503585
converter = PaddleOCRVLDocumentConverter(
@@ -580,6 +662,167 @@ def test_run_with_meta_none(self, mock_ocr_response, tmp_path):
580662
# Only file path metadata should be present
581663
assert "file_path" in result["documents"][0].meta
582664

665+
def test_parse_sends_all_optional_parameters_in_request(self, mock_ocr_response, tmp_path):
    """Every optional converter setting shows up in the JSON request payload.

    The converter is built with all optional parameters set, `requests.post`
    is mocked, and the `json` keyword argument of the call is checked key by
    key against the values given at construction time.
    """
    optional_settings = {
        "use_doc_orientation_classify": True,
        "use_doc_unwarping": True,
        "use_layout_detection": True,
        "use_chart_recognition": True,
        "use_seal_recognition": True,
        "use_ocr_for_image_block": True,
        "layout_threshold": 0.5,
        "layout_nms": True,
        "layout_unclip_ratio": 1.5,
        "layout_merge_bboxes_mode": "large",
        "layout_shape_mode": "auto",
        "prompt_label": "ocr",
        "format_block_content": True,
        "repetition_penalty": 1.1,
        "temperature": 0.7,
        "top_p": 0.9,
        "min_pixels": 100,
        "max_pixels": 1000,
        "max_new_tokens": 500,
        "merge_layout_blocks": True,
        "markdown_ignore_labels": ["footer"],
        "vlm_extra_args": {"extra": "arg"},
        "prettify_markdown": True,
        "show_formula_number": True,
        "restructure_pages": True,
        "merge_tables": True,
        "relevel_titles": True,
        "visualize": True,
        "additional_params": {"logId": "custom-log-id"},
    }
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
        **optional_settings,
    )
    image_path = create_empty_image(tmp_path, "test.png")

    fake_reply = MagicMock(status_code=200)
    fake_reply.json.return_value = mock_ocr_response

    with patch("requests.post", return_value=fake_reply) as mock_post:
        output = ocr_converter.run(sources=[str(image_path)])

    assert len(output["documents"]) == 1
    payload = mock_post.call_args[1]["json"]

    # Ordered (payload key, expected value) pairs; the last entry comes from
    # `additional_params` rather than a dedicated constructor argument.
    expected_payload = [
        ("useDocOrientationClassify", True),
        ("useDocUnwarping", True),
        ("useLayoutDetection", True),
        ("useChartRecognition", True),
        ("useSealRecognition", True),
        ("useOcrForImageBlock", True),
        ("layoutThreshold", 0.5),
        ("layoutNms", True),
        ("layoutUnclipRatio", 1.5),
        ("layoutMergeBboxesMode", "large"),
        ("layoutShapeMode", "auto"),
        ("promptLabel", "ocr"),
        ("formatBlockContent", True),
        ("repetitionPenalty", 1.1),
        ("temperature", 0.7),
        ("topP", 0.9),
        ("minPixels", 100),
        ("maxPixels", 1000),
        ("maxNewTokens", 500),
        ("mergeLayoutBlocks", True),
        ("markdownIgnoreLabels", ["footer"]),
        ("vlmExtraArgs", {"extra": "arg"}),
        ("prettifyMarkdown", True),
        ("showFormulaNumber", True),
        ("restructurePages", True),
        ("mergeTables", True),
        ("relevelTitles", True),
        ("visualize", True),
        ("logId", "custom-log-id"),
    ]
    for key, expected in expected_payload:
        if expected is True:
            # Boolean flags must be the literal True, not merely truthy.
            assert payload[key] is True
        else:
            assert payload[key] == expected
740+
741+
def test_parse_skips_none_optional_parameters(self, mock_ocr_response, tmp_path):
    """Optional settings passed as None are left out of the request payload."""
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
        use_doc_orientation_classify=None,
        use_doc_unwarping=None,
    )
    image_path = create_empty_image(tmp_path, "test.png")

    fake_reply = MagicMock(status_code=200)
    fake_reply.json.return_value = mock_ocr_response

    with patch("requests.post", return_value=fake_reply) as mock_post:
        output = ocr_converter.run(sources=[str(image_path)])

    assert len(output["documents"]) == 1
    payload = mock_post.call_args[1]["json"]
    for omitted_key in ("useDocOrientationClassify", "useDocUnwarping"):
        assert omitted_key not in payload
762+
763+
def test_parse_raises_on_invalid_request_parameters(self, tmp_path):
    """A failure while building the request model results in zero documents.

    Despite the test name, nothing is expected to propagate out of `run`:
    the request model is patched to raise and the assertion only checks that
    the returned document list is empty.
    """
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )
    image_path = create_empty_image(tmp_path, "test.png")

    request_model_target = (
        "haystack_integrations.components.converters.paddleocr."
        "paddleocr_vl_document_converter.PaddleOCRVLInferRequest"
    )
    with patch(request_model_target, side_effect=Exception("Validation failed")):
        output = ocr_converter.run(sources=[str(image_path)])

    assert len(output["documents"]) == 0
777+
778+
def test_parse_raises_on_invalid_json_response(self, tmp_path):
    """A reply whose body cannot be decoded as JSON results in zero documents.

    Despite the test name, `run` is expected to return normally; the mocked
    `json()` call raises ValueError and the test checks for an empty result.
    """
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )
    image_path = create_empty_image(tmp_path, "test.png")

    fake_reply = MagicMock(status_code=200)
    fake_reply.json.side_effect = ValueError("Invalid JSON")

    with patch("requests.post", return_value=fake_reply):
        output = ocr_converter.run(sources=[str(image_path)])

    assert len(output["documents"]) == 0
793+
794+
def test_parse_raises_when_result_field_missing(self, tmp_path):
    """A reply without a `result` key results in zero documents.

    Despite the test name, `run` is expected to return normally; the test
    only checks that the returned document list is empty.
    """
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )
    image_path = create_empty_image(tmp_path, "test.png")

    fake_reply = MagicMock(status_code=200)
    fake_reply.json.return_value = {"logId": "123", "errorMsg": "Error"}

    with patch("requests.post", return_value=fake_reply):
        output = ocr_converter.run(sources=[str(image_path)])

    assert len(output["documents"]) == 0
809+
810+
def test_parse_raises_on_invalid_response_format(self, tmp_path):
    """A reply whose `result` has an unexpected shape results in zero documents.

    Despite the test name, `run` is expected to return normally; the test
    only checks that the returned document list is empty.
    """
    ocr_converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )
    image_path = create_empty_image(tmp_path, "test.png")

    fake_reply = MagicMock(status_code=200)
    fake_reply.json.return_value = {"result": {"invalid": "format"}}

    with patch("requests.post", return_value=fake_reply):
        output = ocr_converter.run(sources=[str(image_path)])

    assert len(output["documents"]) == 0
825+
583826
def test_file_type_auto_detection_pdf(self, mock_ocr_response_with_multiple_pages, tmp_path):
584827
"""Test that file_type is automatically detected as PDF from .pdf extension"""
585828
converter = PaddleOCRVLDocumentConverter(
@@ -732,6 +975,26 @@ def test_file_type_auto_detection_bytestream(self, mock_ocr_response, tmp_path):
732975
# Second call should be for image (fileType=1)
733976
assert calls[1][1]["json"]["fileType"] == 1
734977

978+
@pytest.mark.parametrize(
    ("mime_type", "expected"),
    [
        ("application/pdf", 0),
        ("APPLICATION/PDF", 0),
        ("image/png", 1),
        ("image/jpeg", 1),
        ("text/plain", None),
        (None, None),
    ],
)
def test_infer_file_type_from_mime_type(self, mime_type, expected):
    """`_infer_file_type_from_source` maps a MIME type to the expected code.

    The cases cover PDF in either letter case, two image types, an
    unrelated MIME type and a missing one.
    """
    stream = ByteStream(data=b"dummy")
    inferred = _infer_file_type_from_source(stream, mime_type)
    assert inferred == expected
992+
993+
def test_infer_file_type_falls_back_to_mime_type_when_extension_unknown(self, tmp_path):
    """With an unrecognised file extension, the MIME type decides the file type."""
    unknown_path = tmp_path / "file.unknown"
    pdf_result = _infer_file_type_from_source(unknown_path, "application/pdf")
    assert pdf_result == 0
    image_result = _infer_file_type_from_source(unknown_path, "image/png")
    assert image_result == 1
997+
735998
@pytest.mark.skipif(
736999
not os.environ.get("PADDLEOCR_VL_API_URL") or not os.environ.get("AISTUDIO_ACCESS_TOKEN"),
7371000
reason="Export env vars `PADDLEOCR_VL_API_URL` and `AISTUDIO_ACCESS_TOKEN` to run this test.",

0 commit comments

Comments
 (0)