diff --git a/integrations/kreuzberg/tests/test_converter.py b/integrations/kreuzberg/tests/test_converter.py index ff5895bcbd..b84ba1d6de 100644 --- a/integrations/kreuzberg/tests/test_converter.py +++ b/integrations/kreuzberg/tests/test_converter.py @@ -1,10 +1,8 @@ # SPDX-FileCopyrightText: 2026-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -import json -import tempfile + from pathlib import Path -from typing import Any from unittest.mock import MagicMock, patch import pytest @@ -14,8 +12,6 @@ ExtractionConfig, ExtractionResult, LanguageDetectionConfig, - OcrConfig, - config_to_json, ) from haystack_integrations.components.converters.kreuzberg import KreuzbergConverter @@ -28,243 +24,6 @@ CONVERTER_MODULE = "haystack_integrations.components.converters.kreuzberg.converter" -def test_init_default() -> None: - converter = KreuzbergConverter() - assert converter.config is None - assert converter.config_path is None - assert converter.store_full_path is False - assert converter.batch is True - assert converter.easyocr_kwargs is None - - -def test_init_with_all_params() -> None: - config = ExtractionConfig(output_format="markdown") - converter = KreuzbergConverter( - config=config, - store_full_path=True, - batch=False, - easyocr_kwargs={"gpu": False}, - ) - assert converter.config is config - assert converter.config_path is None - assert converter.store_full_path is True - assert converter.batch is False - assert converter.easyocr_kwargs == {"gpu": False} - - -def test_serialization_roundtrip_defaults() -> None: - converter = KreuzbergConverter() - d = converter.to_dict() - assert d == { - "type": "haystack_integrations.components.converters.kreuzberg.converter.KreuzbergConverter", - "init_parameters": { - "config": None, - "config_path": None, - "store_full_path": False, - "batch": True, - "easyocr_kwargs": None, - }, - } - restored = KreuzbergConverter.from_dict(d) - assert restored.config is None - assert restored.config_path is None - assert restored.store_full_path is False - assert restored.batch is True - assert restored.easyocr_kwargs is None - - -def test_serialization_roundtrip_all_params() -> None: - converter = KreuzbergConverter( - config=ExtractionConfig( - output_format="html", - ocr=OcrConfig(backend="tesseract", language="deu"), - ), - store_full_path=True, - batch=False, - easyocr_kwargs={"gpu": True, "beam_width": 3}, - ) - # to_dict serializes config as JSON string - d = converter.to_dict() - params = d["init_parameters"] - assert isinstance(params["config"], str) - assert json.loads(params["config"])["output_format"] == "html" - assert params["store_full_path"] is True - assert params["batch"] is False - assert params["easyocr_kwargs"] == {"gpu": True, "beam_width": 3} - - # Save config JSON before from_dict mutates d in place - original_config_json = json.loads(params["config"]) - - # from_dict restores all fields - restored = KreuzbergConverter.from_dict(d) - assert restored.config.output_format == "html" - assert restored.config.ocr.backend == "tesseract" - assert restored.config.ocr.language == "deu" - assert restored.config_path is None - assert restored.store_full_path is True - assert restored.batch is False - assert restored.easyocr_kwargs == {"gpu": True, "beam_width": 3} - - # Double roundtrip produces identical to_dict output - d2 = restored.to_dict() - p1 = {k: v for k, v in d["init_parameters"].items() if k != "config"} - p2 = {k: v for k, v in d2["init_parameters"].items() if k != "config"} - assert p1 == p2 - assert original_config_json == json.loads(d2["init_parameters"]["config"]) - - -def test_serialization_roundtrip_config_path() -> None: - # String config_path - converter = KreuzbergConverter(config_path="/tmp/kreuzberg.toml") - d = converter.to_dict() - assert d["init_parameters"]["config_path"] == "/tmp/kreuzberg.toml" - restored = KreuzbergConverter.from_dict(d) - assert restored.config_path == "/tmp/kreuzberg.toml" - - # Path object is normalized to str - converter2 = KreuzbergConverter(config_path=Path("/some/path/config.json")) - d2 = converter2.to_dict() - assert d2["init_parameters"]["config_path"] == "/some/path/config.json" - assert isinstance(d2["init_parameters"]["config_path"], str) - - -def test_serialization_from_dict_empty_init_parameters() -> None: - d = { - "type": "haystack_integrations.components.converters.kreuzberg.converter.KreuzbergConverter", - "init_parameters": {}, - } - converter = KreuzbergConverter.from_dict(d) - assert converter.config is None - assert converter.config_path is None - assert converter.store_full_path is False - assert converter.batch is True - assert converter.easyocr_kwargs is None - - -def test_build_config_default(converter: KreuzbergConverter) -> None: - config = converter._build_config() - assert config.output_format == "plain" - assert config.language_detection is not None - assert config.language_detection.enabled is True - - -def test_build_config_does_not_mutate_self_config() -> None: - base = ExtractionConfig(output_format="html") - converter = KreuzbergConverter(config=base) - converter._build_config() - assert base.output_format == "html" - - -def test_build_config_from_file() -> None: - config = ExtractionConfig(output_format="markdown") - json_str = config_to_json(config) - with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - f.write(json_str) - path = f.name - - try: - converter = KreuzbergConverter(config_path=path) - built = converter._build_config() - assert built.output_format == "markdown" - finally: - Path(path).unlink(missing_ok=True) - - -def test_raises_when_both_config_and_config_path() -> None: - with pytest.raises(ValueError, match="Cannot specify both"): - KreuzbergConverter( - config=ExtractionConfig(output_format="markdown"), - config_path="/tmp/config.json", - ) - - -def test_table_assembly_appends_markdown_to_content() -> None: - table = MagicMock(spec=ExtractedTable) - table.markdown = "| A | B |\n|---|---|\n| 1 | 2 |" - content = KreuzbergConverter._assemble_content("Main text", [table], "plain") - assert content == "Main text\n\n| A | B |\n|---|---|\n| 1 | 2 |" - - # Also works with dict-style tables - dict_table = {"markdown": "| A |\n|---|\n| 1 |", "cells": [["A"], ["1"]], "page_number": 1} - content = KreuzbergConverter._assemble_content("Main text", [dict_table], "plain") - assert content == "Main text\n\n| A |\n|---|\n| 1 |" - - -def test_table_assembly_appends_multiple_tables() -> None: - t1 = MagicMock(spec=ExtractedTable) - t1.markdown = "| A |\n|---|\n| 1 |" - t2 = MagicMock(spec=ExtractedTable) - t2.markdown = "| B |\n|---|\n| 2 |" - content = KreuzbergConverter._assemble_content("Text", [t1, t2], "plain") - assert content == "Text\n\n| A |\n|---|\n| 1 |\n\n| B |\n|---|\n| 2 |" - - -def test_table_assembly_skips_tables_with_empty_markdown() -> None: - table = MagicMock(spec=ExtractedTable) - table.markdown = "" - content = KreuzbergConverter._assemble_content("Main text", [table], "plain") - assert content == "Main text" - - # Also works with dict-style tables - dict_table: dict[str, Any] = {"markdown": "", "cells": [], "page_number": 1} - content = KreuzbergConverter._assemble_content("Main text", [dict_table], "plain") - assert content == "Main text" - - -def test_table_assembly_no_tables_returns_text_unchanged() -> None: - assert KreuzbergConverter._assemble_content("text", None, "plain") == "text" - assert KreuzbergConverter._assemble_content("text", [], "plain") == "text" - - -def test_table_assembly_skipped_for_markdown_and_html_format() -> None: - table = MagicMock(spec=ExtractedTable) - table.markdown = "| A |" - assert KreuzbergConverter._assemble_content("text", [table], "markdown") == "text" - assert KreuzbergConverter._assemble_content("text", [table], "html") == "text" - - -def test_edge_empty_sources_list(sequential_converter: KreuzbergConverter) -> None: - converter = sequential_converter - result = converter.run(sources=[]) - assert result["documents"] == [] - - -@patch(CONVERTER_MODULE + ".extract_file_sync") -def test_edge_sequential_extraction_error_skipped( - mock_extract: MagicMock, sequential_converter: KreuzbergConverter, fixtures_dir: Path -) -> None: - mock_extract.side_effect = RuntimeError("extraction failed") - converter = sequential_converter - result = converter.run(sources=[fixtures_dir / "sample.txt"]) - assert result["documents"] == [] - - -def test_metadata_mock_images_excludes_binary_data(converter: KreuzbergConverter, make_mock_result) -> None: - result = make_mock_result( - images=[ - { - "format": "png", - "page_number": 1, - "width": 200, - "height": 100, - "description": "chart", - "image_index": 0, - "data": b"binary_data_here", - }, - ] - ) - - meta = converter._build_extraction_metadata(result) - assert meta["image_count"] == 1 - assert meta["images"][0]["format"] == "png" - assert meta["images"][0]["width"] == 200 - assert meta["images"][0]["height"] == 100 - assert meta["images"][0]["description"] == "chart" - assert meta["images"][0]["image_index"] == 0 - assert meta["images"][0]["page_number"] == 1 - assert "data" not in meta["images"][0] - - def test_metadata_mock_all_fields_populated(converter: KreuzbergConverter, make_mock_result) -> None: warning = MagicMock() warning.source = "parser"