Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 1 addition & 242 deletions integrations/kreuzberg/tests/test_converter.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# SPDX-FileCopyrightText: 2026-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import json
import tempfile

from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch

import pytest
Expand All @@ -14,8 +12,6 @@
ExtractionConfig,
ExtractionResult,
LanguageDetectionConfig,
OcrConfig,
config_to_json,
)

from haystack_integrations.components.converters.kreuzberg import KreuzbergConverter
Expand All @@ -28,243 +24,6 @@
# Dotted import path of the converter module; attribute names are appended to it
# to build ``patch()`` targets in the tests below.
CONVERTER_MODULE = "haystack_integrations.components.converters.kreuzberg.converter"


def test_init_default() -> None:
    """A converter created without arguments carries the default attribute values."""
    default_converter = KreuzbergConverter()

    # No configuration source and no EasyOCR options unless supplied by the caller.
    assert default_converter.config is None
    assert default_converter.config_path is None
    assert default_converter.easyocr_kwargs is None

    # Boolean switches: short paths in metadata, batch extraction enabled.
    assert default_converter.store_full_path is False
    assert default_converter.batch is True


def test_init_with_all_params() -> None:
    """Explicit constructor arguments are stored on the instance as given."""
    extraction_config = ExtractionConfig(output_format="markdown")

    custom = KreuzbergConverter(
        config=extraction_config,
        store_full_path=True,
        batch=False,
        easyocr_kwargs={"gpu": False},
    )

    # The very same config object is kept (identity, not just equality).
    assert custom.config is extraction_config
    assert custom.config_path is None
    assert custom.store_full_path is True
    assert custom.batch is False
    assert custom.easyocr_kwargs == {"gpu": False}


def test_serialization_roundtrip_defaults() -> None:
    """A default converter serializes to the expected dict and restores unchanged."""
    expected_payload = {
        "type": "haystack_integrations.components.converters.kreuzberg.converter.KreuzbergConverter",
        "init_parameters": {
            "config": None,
            "config_path": None,
            "store_full_path": False,
            "batch": True,
            "easyocr_kwargs": None,
        },
    }

    serialized = KreuzbergConverter().to_dict()
    assert serialized == expected_payload

    # Deserializing the payload must reproduce every default.
    rebuilt = KreuzbergConverter.from_dict(serialized)
    assert rebuilt.config is None
    assert rebuilt.config_path is None
    assert rebuilt.store_full_path is False
    assert rebuilt.batch is True
    assert rebuilt.easyocr_kwargs is None


def test_serialization_roundtrip_all_params() -> None:
    """A fully configured converter survives ``to_dict`` / ``from_dict`` twice over."""

    def _without_config(init_params: dict[str, Any]) -> dict[str, Any]:
        # The config entry is compared separately as parsed JSON.
        return {key: value for key, value in init_params.items() if key != "config"}

    source = KreuzbergConverter(
        config=ExtractionConfig(
            output_format="html",
            ocr=OcrConfig(backend="tesseract", language="deu"),
        ),
        store_full_path=True,
        batch=False,
        easyocr_kwargs={"gpu": True, "beam_width": 3},
    )

    # The config is serialized as a JSON string; the other fields stay plain values.
    first_dump = source.to_dict()
    init_params = first_dump["init_parameters"]
    assert isinstance(init_params["config"], str)
    # Parse now: from_dict mutates the dumped dict in place, so this snapshot is
    # reused for the final comparison.
    config_snapshot = json.loads(init_params["config"])
    assert config_snapshot["output_format"] == "html"
    assert init_params["store_full_path"] is True
    assert init_params["batch"] is False
    assert init_params["easyocr_kwargs"] == {"gpu": True, "beam_width": 3}

    # Every field comes back from the serialized form.
    rebuilt = KreuzbergConverter.from_dict(first_dump)
    rebuilt_config = rebuilt.config
    assert rebuilt_config.output_format == "html"
    assert rebuilt_config.ocr.backend == "tesseract"
    assert rebuilt_config.ocr.language == "deu"
    assert rebuilt.config_path is None
    assert rebuilt.store_full_path is True
    assert rebuilt.batch is False
    assert rebuilt.easyocr_kwargs == {"gpu": True, "beam_width": 3}

    # A second dump matches the first: plain fields directly, config as parsed JSON.
    second_dump = rebuilt.to_dict()
    assert _without_config(first_dump["init_parameters"]) == _without_config(second_dump["init_parameters"])
    assert config_snapshot == json.loads(second_dump["init_parameters"]["config"])


def test_serialization_roundtrip_config_path() -> None:
    """``config_path`` round-trips as a string, whether given as ``str`` or ``Path``."""
    # Case 1: a plain string path is serialized and restored verbatim.
    from_str = KreuzbergConverter(config_path="/tmp/kreuzberg.toml")
    str_dump = from_str.to_dict()
    assert str_dump["init_parameters"]["config_path"] == "/tmp/kreuzberg.toml"
    assert KreuzbergConverter.from_dict(str_dump).config_path == "/tmp/kreuzberg.toml"

    # Case 2: a Path object is serialized as a str.
    from_path = KreuzbergConverter(config_path=Path("/some/path/config.json"))
    serialized_path = from_path.to_dict()["init_parameters"]["config_path"]
    assert serialized_path == "/some/path/config.json"
    assert isinstance(serialized_path, str)


def test_serialization_from_dict_empty_init_parameters() -> None:
    """``from_dict`` with empty ``init_parameters`` yields a converter with defaults."""
    restored = KreuzbergConverter.from_dict(
        {
            "type": "haystack_integrations.components.converters.kreuzberg.converter.KreuzbergConverter",
            "init_parameters": {},
        }
    )

    # Missing init parameters fall back to the same values as a no-argument constructor.
    assert restored.config is None
    assert restored.config_path is None
    assert restored.easyocr_kwargs is None
    assert restored.store_full_path is False
    assert restored.batch is True


def test_build_config_default(converter: KreuzbergConverter) -> None:
    """The config built for the ``converter`` fixture is plain text with language detection on."""
    built = converter._build_config()

    assert built.output_format == "plain"

    detection = built.language_detection
    assert detection is not None
    assert detection.enabled is True


def test_build_config_does_not_mutate_self_config() -> None:
    """Building the effective config leaves the user-supplied config's output format untouched."""
    user_config = ExtractionConfig(output_format="html")

    KreuzbergConverter(config=user_config)._build_config()

    assert user_config.output_format == "html"


def test_build_config_from_file() -> None:
    """``_build_config`` picks up the output format from the JSON file at ``config_path``.

    The config is serialized with ``config_to_json`` and written into a temporary
    directory. The directory context manager removes the file even when an
    assertion fails, so no manual ``try``/``finally`` cleanup is needed.
    """
    serialized = config_to_json(ExtractionConfig(output_format="markdown"))

    with tempfile.TemporaryDirectory() as tmp_dir:
        config_file = Path(tmp_dir) / "config.json"
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        config_file.write_text(serialized, encoding="utf-8")

        converter = KreuzbergConverter(config_path=str(config_file))
        built = converter._build_config()
        assert built.output_format == "markdown"


def test_raises_when_both_config_and_config_path() -> None:
    """Passing ``config`` together with ``config_path`` is rejected with ``ValueError``."""
    inline_config = ExtractionConfig(output_format="markdown")

    with pytest.raises(ValueError, match="Cannot specify both"):
        KreuzbergConverter(config=inline_config, config_path="/tmp/config.json")


def test_table_assembly_appends_markdown_to_content() -> None:
    """For plain output, a table's markdown is appended to the text after a blank line."""
    # Object-style table exposing a ``markdown`` attribute.
    object_table = MagicMock(spec=ExtractedTable)
    object_table.markdown = "| A | B |\n|---|---|\n| 1 | 2 |"
    assembled = KreuzbergConverter._assemble_content("Main text", [object_table], "plain")
    assert assembled == "Main text\n\n| A | B |\n|---|---|\n| 1 | 2 |"

    # Dict-style table carrying the markdown under the "markdown" key.
    mapping_table = {"markdown": "| A |\n|---|\n| 1 |", "cells": [["A"], ["1"]], "page_number": 1}
    assembled = KreuzbergConverter._assemble_content("Main text", [mapping_table], "plain")
    assert assembled == "Main text\n\n| A |\n|---|\n| 1 |"


def test_table_assembly_appends_multiple_tables() -> None:
    """Several tables are appended in order, each separated by a blank line."""
    tables = []
    for markdown in ("| A |\n|---|\n| 1 |", "| B |\n|---|\n| 2 |"):
        mock_table = MagicMock(spec=ExtractedTable)
        mock_table.markdown = markdown
        tables.append(mock_table)

    assembled = KreuzbergConverter._assemble_content("Text", tables, "plain")

    assert assembled == "Text\n\n| A |\n|---|\n| 1 |\n\n| B |\n|---|\n| 2 |"


def test_table_assembly_skips_tables_with_empty_markdown() -> None:
    """Tables whose markdown is empty contribute nothing to the assembled content."""
    # Object-style table with an empty ``markdown`` attribute.
    blank_object_table = MagicMock(spec=ExtractedTable)
    blank_object_table.markdown = ""
    assert KreuzbergConverter._assemble_content("Main text", [blank_object_table], "plain") == "Main text"

    # Dict-style table with an empty "markdown" value.
    blank_mapping_table: dict[str, Any] = {"markdown": "", "cells": [], "page_number": 1}
    assert KreuzbergConverter._assemble_content("Main text", [blank_mapping_table], "plain") == "Main text"


def test_table_assembly_no_tables_returns_text_unchanged() -> None:
    """With ``None`` or an empty table list the text is returned as-is."""
    for no_tables in (None, []):
        assembled = KreuzbergConverter._assemble_content("text", no_tables, "plain")
        assert assembled == "text"


def test_table_assembly_skipped_for_markdown_and_html_format() -> None:
    """Table markdown is not appended when the output format is markdown or html."""
    mock_table = MagicMock(spec=ExtractedTable)
    mock_table.markdown = "| A |"

    for output_format in ("markdown", "html"):
        assembled = KreuzbergConverter._assemble_content("text", [mock_table], output_format)
        assert assembled == "text"


def test_edge_empty_sources_list(sequential_converter: KreuzbergConverter) -> None:
    """Running the converter on an empty source list produces no documents."""
    output = sequential_converter.run(sources=[])

    assert output["documents"] == []


@patch(CONVERTER_MODULE + ".extract_file_sync")
def test_edge_sequential_extraction_error_skipped(
    mock_extract: MagicMock, sequential_converter: KreuzbergConverter, fixtures_dir: Path
) -> None:
    """A source whose extraction raises is skipped instead of failing the whole run."""
    # Every call to the patched extractor raises.
    mock_extract.side_effect = RuntimeError("extraction failed")
    failing_source = fixtures_dir / "sample.txt"

    output = sequential_converter.run(sources=[failing_source])

    assert output["documents"] == []


def test_metadata_mock_images_excludes_binary_data(converter: KreuzbergConverter, make_mock_result) -> None:
    """Image metadata keeps the descriptive fields and leaves out the raw ``data`` bytes."""
    descriptive_fields = {
        "format": "png",
        "page_number": 1,
        "width": 200,
        "height": 100,
        "description": "chart",
        "image_index": 0,
    }
    # The mocked image carries the descriptive fields plus a binary payload.
    mock_result = make_mock_result(images=[{**descriptive_fields, "data": b"binary_data_here"}])

    meta = converter._build_extraction_metadata(mock_result)

    assert meta["image_count"] == 1
    image_meta = meta["images"][0]
    for field in ("format", "width", "height", "description", "image_index", "page_number"):
        assert image_meta[field] == descriptive_fields[field]
    assert "data" not in image_meta


def test_metadata_mock_all_fields_populated(converter: KreuzbergConverter, make_mock_result) -> None:
warning = MagicMock()
warning.source = "parser"
Expand Down
Loading