deepset-ai · anakin87 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
@@ -1,10 +1,8 @@
 # SPDX-FileCopyrightText: 2026-present deepset GmbH <info@deepset.ai>
 #
 # SPDX-License-Identifier: Apache-2.0
-import json
-import tempfile
+
 from pathlib import Path
-from typing import Any
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -14,8 +12,6 @@
     ExtractionConfig,
     ExtractionResult,
     LanguageDetectionConfig,
-    OcrConfig,
-    config_to_json,
 )
 
 from haystack_integrations.components.converters.kreuzberg import KreuzbergConverter
@@ -28,243 +24,6 @@
 CONVERTER_MODULE = "haystack_integrations.components.converters.kreuzberg.converter"
 
 
-def test_init_default() -> None:
-    converter = KreuzbergConverter()
-    assert converter.config is None
-    assert converter.config_path is None
-    assert converter.store_full_path is False
-    assert converter.batch is True
-    assert converter.easyocr_kwargs is None
-
-
-def test_init_with_all_params() -> None:
-    config = ExtractionConfig(output_format="markdown")
-    converter = KreuzbergConverter(
-        config=config,
-        store_full_path=True,
-        batch=False,
-        easyocr_kwargs={"gpu": False},
-    )
-    assert converter.config is config
-    assert converter.config_path is None
-    assert converter.store_full_path is True
-    assert converter.batch is False
-    assert converter.easyocr_kwargs == {"gpu": False}
-
-
-def test_serialization_roundtrip_defaults() -> None:
-    converter = KreuzbergConverter()
-    d = converter.to_dict()
-    assert d == {
-        "type": "haystack_integrations.components.converters.kreuzberg.converter.KreuzbergConverter",
-        "init_parameters": {
-            "config": None,
-            "config_path": None,
-            "store_full_path": False,
-            "batch": True,
-            "easyocr_kwargs": None,
-        },
-    }
-    restored = KreuzbergConverter.from_dict(d)
-    assert restored.config is None
-    assert restored.config_path is None
-    assert restored.store_full_path is False
-    assert restored.batch is True
-    assert restored.easyocr_kwargs is None
-
-
-def test_serialization_roundtrip_all_params() -> None:
-    converter = KreuzbergConverter(
-        config=ExtractionConfig(
-            output_format="html",
-            ocr=OcrConfig(backend="tesseract", language="deu"),
-        ),
-        store_full_path=True,
-        batch=False,
-        easyocr_kwargs={"gpu": True, "beam_width": 3},
-    )
-    # to_dict serializes config as JSON string
-    d = converter.to_dict()
-    params = d["init_parameters"]
-    assert isinstance(params["config"], str)
-    assert json.loads(params["config"])["output_format"] == "html"
-    assert params["store_full_path"] is True
-    assert params["batch"] is False
-    assert params["easyocr_kwargs"] == {"gpu": True, "beam_width": 3}
-
-    # Save config JSON before from_dict mutates d in place
-    original_config_json = json.loads(params["config"])
-
-    # from_dict restores all fields
-    restored = KreuzbergConverter.from_dict(d)
-    assert restored.config.output_format == "html"
-    assert restored.config.ocr.backend == "tesseract"
-    assert restored.config.ocr.language == "deu"
-    assert restored.config_path is None
-    assert restored.store_full_path is True
-    assert restored.batch is False
-    assert restored.easyocr_kwargs == {"gpu": True, "beam_width": 3}
-
-    # Double roundtrip produces identical to_dict output
-    d2 = restored.to_dict()
-    p1 = {k: v for k, v in d["init_parameters"].items() if k != "config"}
-    p2 = {k: v for k, v in d2["init_parameters"].items() if k != "config"}
-    assert p1 == p2
-    assert original_config_json == json.loads(d2["init_parameters"]["config"])
-
-
-def test_serialization_roundtrip_config_path() -> None:
-    # String config_path
-    converter = KreuzbergConverter(config_path="/tmp/kreuzberg.toml")
-    d = converter.to_dict()
-    assert d["init_parameters"]["config_path"] == "/tmp/kreuzberg.toml"
-    restored = KreuzbergConverter.from_dict(d)
-    assert restored.config_path == "/tmp/kreuzberg.toml"
-
-    # Path object is normalized to str
-    converter2 = KreuzbergConverter(config_path=Path("/some/path/config.json"))
-    d2 = converter2.to_dict()
-    assert d2["init_parameters"]["config_path"] == "/some/path/config.json"
-    assert isinstance(d2["init_parameters"]["config_path"], str)
-
-
-def test_serialization_from_dict_empty_init_parameters() -> None:
-    d = {
-        "type": "haystack_integrations.components.converters.kreuzberg.converter.KreuzbergConverter",
-        "init_parameters": {},
-    }
-    converter = KreuzbergConverter.from_dict(d)
-    assert converter.config is None
-    assert converter.config_path is None
-    assert converter.store_full_path is False
-    assert converter.batch is True
-    assert converter.easyocr_kwargs is None
-
-
-def test_build_config_default(converter: KreuzbergConverter) -> None:
-    config = converter._build_config()
-    assert config.output_format == "plain"
-    assert config.language_detection is not None
-    assert config.language_detection.enabled is True
-
-
-def test_build_config_does_not_mutate_self_config() -> None:
-    base = ExtractionConfig(output_format="html")
-    converter = KreuzbergConverter(config=base)
-    converter._build_config()
-    assert base.output_format == "html"
-
-
-def test_build_config_from_file() -> None:
-    config = ExtractionConfig(output_format="markdown")
-    json_str = config_to_json(config)
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-        f.write(json_str)
-        path = f.name
-
-    try:
-        converter = KreuzbergConverter(config_path=path)
-        built = converter._build_config()
-        assert built.output_format == "markdown"
-    finally:
-        Path(path).unlink(missing_ok=True)
-
-
-def test_raises_when_both_config_and_config_path() -> None:
-    with pytest.raises(ValueError, match="Cannot specify both"):
-        KreuzbergConverter(
-            config=ExtractionConfig(output_format="markdown"),
-            config_path="/tmp/config.json",
-        )
-
-
-def test_table_assembly_appends_markdown_to_content() -> None:
-    table = MagicMock(spec=ExtractedTable)
-    table.markdown = "| A | B |\n|---|---|\n| 1 | 2 |"
-    content = KreuzbergConverter._assemble_content("Main text", [table], "plain")
-    assert content == "Main text\n\n| A | B |\n|---|---|\n| 1 | 2 |"
-
-    # Also works with dict-style tables
-    dict_table = {"markdown": "| A |\n|---|\n| 1 |", "cells": [["A"], ["1"]], "page_number": 1}
-    content = KreuzbergConverter._assemble_content("Main text", [dict_table], "plain")
-    assert content == "Main text\n\n| A |\n|---|\n| 1 |"
-
-
-def test_table_assembly_appends_multiple_tables() -> None:
-    t1 = MagicMock(spec=ExtractedTable)
-    t1.markdown = "| A |\n|---|\n| 1 |"
-    t2 = MagicMock(spec=ExtractedTable)
-    t2.markdown = "| B |\n|---|\n| 2 |"
-    content = KreuzbergConverter._assemble_content("Text", [t1, t2], "plain")
-    assert content == "Text\n\n| A |\n|---|\n| 1 |\n\n| B |\n|---|\n| 2 |"
-
-
-def test_table_assembly_skips_tables_with_empty_markdown() -> None:
-    table = MagicMock(spec=ExtractedTable)
-    table.markdown = ""
-    content = KreuzbergConverter._assemble_content("Main text", [table], "plain")
-    assert content == "Main text"
-
-    # Also works with dict-style tables
-    dict_table: dict[str, Any] = {"markdown": "", "cells": [], "page_number": 1}
-    content = KreuzbergConverter._assemble_content("Main text", [dict_table], "plain")
-    assert content == "Main text"
-
-
-def test_table_assembly_no_tables_returns_text_unchanged() -> None:
-    assert KreuzbergConverter._assemble_content("text", None, "plain") == "text"
-    assert KreuzbergConverter._assemble_content("text", [], "plain") == "text"
-
-
-def test_table_assembly_skipped_for_markdown_and_html_format() -> None:
-    table = MagicMock(spec=ExtractedTable)
-    table.markdown = "| A |"
-    assert KreuzbergConverter._assemble_content("text", [table], "markdown") == "text"
-    assert KreuzbergConverter._assemble_content("text", [table], "html") == "text"
-
-
-def test_edge_empty_sources_list(sequential_converter: KreuzbergConverter) -> None:
-    converter = sequential_converter
-    result = converter.run(sources=[])
-    assert result["documents"] == []
-
-
-@patch(CONVERTER_MODULE + ".extract_file_sync")
-def test_edge_sequential_extraction_error_skipped(
-    mock_extract: MagicMock, sequential_converter: KreuzbergConverter, fixtures_dir: Path
-) -> None:
-    mock_extract.side_effect = RuntimeError("extraction failed")
-    converter = sequential_converter
-    result = converter.run(sources=[fixtures_dir / "sample.txt"])
-    assert result["documents"] == []
-
-
-def test_metadata_mock_images_excludes_binary_data(converter: KreuzbergConverter, make_mock_result) -> None:
-    result = make_mock_result(
-        images=[
-            {
-                "format": "png",
-                "page_number": 1,
-                "width": 200,
-                "height": 100,
-                "description": "chart",
-                "image_index": 0,
-                "data": b"binary_data_here",
-            },
-        ]
-    )
-
-    meta = converter._build_extraction_metadata(result)
-    assert meta["image_count"] == 1
-    assert meta["images"][0]["format"] == "png"
-    assert meta["images"][0]["width"] == 200
-    assert meta["images"][0]["height"] == 100
-    assert meta["images"][0]["description"] == "chart"
-    assert meta["images"][0]["image_index"] == 0
-    assert meta["images"][0]["page_number"] == 1
-    assert "data" not in meta["images"][0]
-
-
 def test_metadata_mock_all_fields_populated(converter: KreuzbergConverter, make_mock_result) -> None:
     warning = MagicMock()
     warning.source = "parser"