Skip to content

Commit 06f2c81

Browse files
authored
fix: fix kreuzberg for 4.7 breaking changes (#3110)
* fix: fix kreuzberg for 4.7 breaking changes * simplify
1 parent 200cb45 commit 06f2c81

4 files changed

Lines changed: 41 additions & 5 deletions

File tree

integrations/kreuzberg/src/haystack_integrations/components/converters/kreuzberg/converter.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
_is_batch_error,
3838
_serialize_annotations,
3939
_serialize_images,
40+
_serialize_keywords,
4041
_serialize_warnings,
4142
)
4243

@@ -266,13 +267,15 @@ def _build_extraction_metadata(result: ExtractionResult) -> dict[str, Any]:
266267
Build metadata dict from an `ExtractionResult`.
267268
268269
Flattens kreuzberg's metadata fields and enriches with top-level result attributes.
269-
270-
Fields already present in `result.metadata` (`quality_score`,
271-
`output_format`, `keywords`) are passed through as-is - they
272-
don't need separate serialization. None values are filtered out.
273270
"""
274271
meta: dict[str, Any] = {k: v for k, v in result.metadata.items() if v is not None}
275272

273+
if result.output_format:
274+
meta["output_format"] = result.output_format
275+
if result.quality_score:
276+
meta["quality_score"] = result.quality_score
277+
if result.extracted_keywords:
278+
meta["keywords"] = _serialize_keywords(result.extracted_keywords)
276279
if result.processing_warnings:
277280
meta["processing_warnings"] = _serialize_warnings(result.processing_warnings)
278281
if result.detected_languages:

integrations/kreuzberg/src/haystack_integrations/components/converters/kreuzberg/utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,19 @@ def _serialize_warnings(warnings: list[Any]) -> list[dict[str, str]]:
4949
return serialized
5050

5151

52+
def _serialize_keywords(keywords: list[Any]) -> list[dict[str, Any]]:
53+
"""Serialize kreuzberg `ExtractedKeyword` objects to plain dicts (PyO3 objects aren't picklable)."""
54+
return [
55+
{
56+
"text": k.text,
57+
"score": k.score,
58+
"algorithm": k.algorithm,
59+
"positions": list(k.positions) if k.positions is not None else None,
60+
}
61+
for k in keywords
62+
]
63+
64+
5265
def _serialize_annotations(annotations: list[Any]) -> list[dict[str, Any]]:
5366
"""Serialize PDF annotations to plain dicts."""
5467
serialized = []

integrations/kreuzberg/tests/test_converter.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from haystack_integrations.components.converters.kreuzberg import KreuzbergConverter
2222
from haystack_integrations.components.converters.kreuzberg.utils import (
2323
_is_batch_error,
24+
_serialize_keywords,
2425
_serialize_warnings,
2526
)
2627

@@ -522,6 +523,26 @@ def test_helper_serialize_warnings_with_objects() -> None:
522523
assert result == [{"source": "parser", "message": "skipped element"}]
523524

524525

526+
def test_helper_serialize_keywords() -> None:
    """A keyword with positions is serialized to a plain dict with a list of positions."""
    keyword = MagicMock(
        spec=["text", "score", "algorithm", "positions"],
        text="haystack",
        score=0.87,
        algorithm="yake",
        positions=[(0, 8), (42, 50)],
    )
    expected = [
        {"text": "haystack", "score": 0.87, "algorithm": "yake", "positions": [(0, 8), (42, 50)]},
    ]

    serialized = _serialize_keywords([keyword])

    assert serialized == expected
535+
536+
537+
def test_helper_serialize_keywords_with_none_positions() -> None:
    """A keyword whose `positions` is `None` keeps `None` in the serialized dict."""
    keyword = MagicMock(
        spec=["text", "score", "algorithm", "positions"],
        text="kreuzberg",
        score=0.5,
        algorithm="yake",
        positions=None,
    )
    expected = [{"text": "kreuzberg", "score": 0.5, "algorithm": "yake", "positions": None}]

    serialized = _serialize_keywords([keyword])

    assert serialized == expected
544+
545+
525546
def test_build_config_skips_auto_language_detection_when_already_set() -> None:
526547
config = ExtractionConfig(language_detection=LanguageDetectionConfig(enabled=False))
527548
converter = KreuzbergConverter(config=config)

integrations/kreuzberg/tests/test_converter_integration.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,6 @@ def test_metadata_keyword_extraction() -> None:
207207
)
208208
result = converter.run(sources=[FIXTURES_DIR / "sample.pdf"])
209209
doc = _docs(result)[0]
210-
# Keywords flow through from result.metadata as-is (plain dicts)
211210
assert "keywords" in doc.meta
212211
keywords = doc.meta["keywords"]
213212
assert len(keywords) == 3

0 commit comments

Comments
 (0)