|
15 | 15 | from haystack_integrations.components.converters.paddleocr import ( |
16 | 16 | PaddleOCRVLDocumentConverter, |
17 | 17 | ) |
| 18 | +from haystack_integrations.components.converters.paddleocr.paddleocr_vl_document_converter import ( |
| 19 | + _infer_file_type_from_source, |
| 20 | +) |
18 | 21 |
|
19 | 22 |
|
20 | 23 | def create_empty_pdf(tmp_path, filename="test.pdf"): |
@@ -111,6 +114,21 @@ def test_init_with_all_optional_parameters(self): |
111 | 114 | assert converter.visualize is True |
112 | 115 | assert converter.additional_params == {} |
113 | 116 |
|
@pytest.mark.parametrize(
    ("file_type", "error_match"),
    [
        ("invalid_string", "Invalid `file_type` string"),
        (123, "Invalid `file_type` value"),
    ],
)
def test_init_invalid_file_type_raises(self, file_type, error_match):
    """Test that an unsupported `file_type` is rejected with a `ValueError` at construction time"""
    init_kwargs = {
        "api_url": "http://test-api-url.com",
        "access_token": Secret.from_token("test-access-token"),
        "file_type": file_type,
    }

    # Each case pairs a bad value with the message fragment expected for it.
    with pytest.raises(ValueError, match=error_match):
        PaddleOCRVLDocumentConverter(**init_kwargs)
| 131 | + |
114 | 132 | def test_to_dict(self, monkeypatch): |
115 | 133 | monkeypatch.setenv("AISTUDIO_ACCESS_TOKEN", "test-access-token") |
116 | 134 | converter = PaddleOCRVLDocumentConverter(api_url="http://test-api-url.com") |
@@ -498,6 +516,70 @@ def test_run_handles_api_error(self, mock_ocr_response, tmp_path): |
498 | 516 | assert len(result["documents"]) == 2 |
499 | 517 | assert len(result["raw_paddleocr_responses"]) == 2 |
500 | 518 |
|
def test_run_skips_source_that_cannot_be_read(self, mock_ocr_response):
    """Test that a non-existent source path yields no output and triggers no HTTP request"""
    missing_source = "/non/existent/file.png"
    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )

    # A successful response is wired up so an unexpected request would not fail by itself;
    # the test then relies on `assert_not_called` to detect it.
    ok_response = MagicMock(status_code=200)
    ok_response.json.return_value = mock_ocr_response

    with patch("requests.post", return_value=ok_response) as mock_post:
        result = converter.run(sources=[missing_source])

    for output_key in ("documents", "raw_paddleocr_responses"):
        assert len(result[output_key]) == 0
    mock_post.assert_not_called()
| 536 | + |
def test_run_handles_empty_text_response(self, tmp_path):
    """Test that a page with empty markdown text still produces one document with empty content"""
    empty_page = {"markdown": {"text": ""}, "prunedResult": {}}
    api_payload = {
        "result": {
            "layoutParsingResults": [empty_page],
            "dataInfo": {"width": 1024, "height": 1024, "type": "image"},
        },
    }
    image_path = create_empty_image(tmp_path, "test.png")
    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )

    fake_response = MagicMock(status_code=200)
    fake_response.json.return_value = api_payload

    with patch("requests.post", return_value=fake_response):
        result = converter.run(sources=[str(image_path)])

    documents = result["documents"]
    assert len(documents) == 1
    assert documents[0].content == ""
| 562 | + |
def test_run_omits_auth_header_when_token_missing(self, mock_ocr_response, tmp_path, monkeypatch):
    """Test that no `Authorization` header is sent when the access token env var is unset"""
    token_env_var = "AISTUDIO_ACCESS_TOKEN"
    # Make sure the variable is absent regardless of the developer's environment.
    monkeypatch.delenv(token_env_var, raising=False)

    image_path = create_empty_image(tmp_path, "test.png")
    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        # Non-strict secret: created without the variable being set.
        access_token=Secret.from_env_var(token_env_var, strict=False),
    )

    fake_response = MagicMock(status_code=200)
    fake_response.json.return_value = mock_ocr_response

    with patch("requests.post", return_value=fake_response) as mock_post:
        result = converter.run(sources=[str(image_path)])

    assert len(result["documents"]) == 1
    sent_headers = mock_post.call_args.kwargs["headers"]
    assert "Authorization" not in sent_headers
| 582 | + |
501 | 583 | def test_run_with_meta_single_dict(self, mock_ocr_response, tmp_path): |
502 | 584 | """Test that meta parameter with single dict is applied to all documents""" |
503 | 585 | converter = PaddleOCRVLDocumentConverter( |
@@ -580,6 +662,167 @@ def test_run_with_meta_none(self, mock_ocr_response, tmp_path): |
580 | 662 | # Only file path metadata should be present |
581 | 663 | assert "file_path" in result["documents"][0].meta |
582 | 664 |
|
def test_parse_sends_all_optional_parameters_in_request(self, mock_ocr_response, tmp_path):
    """Test that every optional constructor argument is forwarded under its camelCase payload key"""
    # Boolean switches: constructor keyword -> expected JSON key (all set to True).
    flag_keys = {
        "use_doc_orientation_classify": "useDocOrientationClassify",
        "use_doc_unwarping": "useDocUnwarping",
        "use_layout_detection": "useLayoutDetection",
        "use_chart_recognition": "useChartRecognition",
        "use_seal_recognition": "useSealRecognition",
        "use_ocr_for_image_block": "useOcrForImageBlock",
        "layout_nms": "layoutNms",
        "format_block_content": "formatBlockContent",
        "merge_layout_blocks": "mergeLayoutBlocks",
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
        "restructure_pages": "restructurePages",
        "merge_tables": "mergeTables",
        "relevel_titles": "relevelTitles",
        "visualize": "visualize",
    }
    # Valued options: constructor keyword -> (expected JSON key, value).
    valued_keys = {
        "layout_threshold": ("layoutThreshold", 0.5),
        "layout_unclip_ratio": ("layoutUnclipRatio", 1.5),
        "layout_merge_bboxes_mode": ("layoutMergeBboxesMode", "large"),
        "layout_shape_mode": ("layoutShapeMode", "auto"),
        "prompt_label": ("promptLabel", "ocr"),
        "repetition_penalty": ("repetitionPenalty", 1.1),
        "temperature": ("temperature", 0.7),
        "top_p": ("topP", 0.9),
        "min_pixels": ("minPixels", 100),
        "max_pixels": ("maxPixels", 1000),
        "max_new_tokens": ("maxNewTokens", 500),
        "markdown_ignore_labels": ("markdownIgnoreLabels", ["footer"]),
        "vlm_extra_args": ("vlmExtraArgs", {"extra": "arg"}),
    }

    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
        additional_params={"logId": "custom-log-id"},
        **{keyword: True for keyword in flag_keys},
        **{keyword: value for keyword, (_, value) in valued_keys.items()},
    )
    image_path = create_empty_image(tmp_path, "test.png")

    fake_response = MagicMock(status_code=200)
    fake_response.json.return_value = mock_ocr_response

    with patch("requests.post", return_value=fake_response) as mock_post:
        result = converter.run(sources=[str(image_path)])

    assert len(result["documents"]) == 1
    payload = mock_post.call_args[1]["json"]

    # Identity check keeps the flags strict: a truthy non-bool would not pass.
    for json_key in flag_keys.values():
        assert payload[json_key] is True
    for json_key, expected_value in valued_keys.values():
        assert payload[json_key] == expected_value
    # Entries of `additional_params` are expected as top-level payload keys.
    assert payload["logId"] == "custom-log-id"
| 740 | + |
def test_parse_skips_none_optional_parameters(self, mock_ocr_response, tmp_path):
    """Test that optional parameters explicitly set to `None` are left out of the request payload"""
    image_path = create_empty_image(tmp_path, "test.png")
    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
        use_doc_orientation_classify=None,
        use_doc_unwarping=None,
    )

    fake_response = MagicMock(status_code=200)
    fake_response.json.return_value = mock_ocr_response

    with patch("requests.post", return_value=fake_response) as mock_post:
        result = converter.run(sources=[str(image_path)])

    assert len(result["documents"]) == 1
    payload = mock_post.call_args[1]["json"]
    for omitted_key in ("useDocOrientationClassify", "useDocUnwarping"):
        assert omitted_key not in payload
| 762 | + |
def test_parse_raises_on_invalid_request_parameters(self, tmp_path):
    """
    Test that a failure while building the request model results in no documents.

    The failure is observed through `run`, which returns an empty document list
    for the affected source instead of propagating the exception.
    """
    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )
    test_file = create_empty_image(tmp_path, "test.png")

    request_model_path = (
        "haystack_integrations.components.converters.paddleocr."
        "paddleocr_vl_document_converter.PaddleOCRVLInferRequest"
    )
    # `requests.post` is patched as well so that a regression in the validation path can
    # never turn this unit test into a real network call.
    with (
        patch(request_model_path, side_effect=Exception("Validation failed")),
        patch("requests.post") as mock_post,
    ):
        result = converter.run(sources=[str(test_file)])

    assert len(result["documents"]) == 0
    # NOTE(review): assumes the request model is built before the HTTP call is issued — confirm
    # against the converter implementation.
    mock_post.assert_not_called()
| 777 | + |
def test_parse_raises_on_invalid_json_response(self, tmp_path):
    """Test that a response whose body cannot be decoded as JSON results in no documents"""
    image_path = create_empty_image(tmp_path, "test.png")
    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )

    # HTTP status is fine, but decoding the body blows up.
    broken_response = MagicMock(status_code=200)
    broken_response.json.side_effect = ValueError("Invalid JSON")

    with patch("requests.post", return_value=broken_response):
        result = converter.run(sources=[str(image_path)])

    assert len(result["documents"]) == 0
| 793 | + |
def test_parse_raises_when_result_field_missing(self, tmp_path):
    """Test that a JSON response without a `result` field results in no documents"""
    image_path = create_empty_image(tmp_path, "test.png")
    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )

    # Well-formed JSON that carries only log/error information.
    error_only_body = {"logId": "123", "errorMsg": "Error"}
    fake_response = MagicMock(status_code=200)
    fake_response.json.return_value = error_only_body

    with patch("requests.post", return_value=fake_response):
        result = converter.run(sources=[str(image_path)])

    assert len(result["documents"]) == 0
| 809 | + |
def test_parse_raises_on_invalid_response_format(self, tmp_path):
    """Test that a `result` object with an unexpected structure results in no documents"""
    image_path = create_empty_image(tmp_path, "test.png")
    converter = PaddleOCRVLDocumentConverter(
        api_url="http://test-api-url.com",
        access_token=Secret.from_token("test-access-token"),
    )

    # `result` is present but contains none of the keys used by the other mocked responses.
    malformed_body = {"result": {"invalid": "format"}}
    fake_response = MagicMock(status_code=200)
    fake_response.json.return_value = malformed_body

    with patch("requests.post", return_value=fake_response):
        result = converter.run(sources=[str(image_path)])

    assert len(result["documents"]) == 0
| 825 | + |
583 | 826 | def test_file_type_auto_detection_pdf(self, mock_ocr_response_with_multiple_pages, tmp_path): |
584 | 827 | """Test that file_type is automatically detected as PDF from .pdf extension""" |
585 | 828 | converter = PaddleOCRVLDocumentConverter( |
@@ -732,6 +975,26 @@ def test_file_type_auto_detection_bytestream(self, mock_ocr_response, tmp_path): |
732 | 975 | # Second call should be for image (fileType=1) |
733 | 976 | assert calls[1][1]["json"]["fileType"] == 1 |
734 | 977 |
|
@pytest.mark.parametrize(
    ("mime_type", "expected"),
    [
        ("application/pdf", 0),
        ("APPLICATION/PDF", 0),
        ("image/png", 1),
        ("image/jpeg", 1),
        ("text/plain", None),
        (None, None),
    ],
)
def test_infer_file_type_from_mime_type(self, mime_type, expected):
    """Test file type inference from the MIME type alone, using a `ByteStream` source"""
    stream = ByteStream(data=b"dummy")
    inferred = _infer_file_type_from_source(stream, mime_type)
    assert inferred == expected
| 992 | + |
def test_infer_file_type_falls_back_to_mime_type_when_extension_unknown(self, tmp_path):
    """Test that the MIME type decides the file type when the path extension is not recognised"""
    unknown_ext_path = tmp_path / "file.unknown"
    mime_expectations = (("application/pdf", 0), ("image/png", 1))
    for mime_type, expected in mime_expectations:
        assert _infer_file_type_from_source(unknown_ext_path, mime_type) == expected
| 997 | + |
735 | 998 | @pytest.mark.skipif( |
736 | 999 | not os.environ.get("PADDLEOCR_VL_API_URL") or not os.environ.get("AISTUDIO_ACCESS_TOKEN"), |
737 | 1000 | reason="Export env vars `PADDLEOCR_VL_API_URL` and `AISTUDIO_ACCESS_TOKEN` to run this test.", |
|
0 commit comments