fix: filter batch error results instead of passing them as Documents

v-tan · v-tan · commit df65ceb2ec6f · 2026-03-20T01:18:25.000+05:30
Kreuzberg's batch APIs do not raise when an individual source fails;
they return an ExtractionResult whose content holds the error message,
with empty metadata and quality_score=None. Previously these error
results were silently passed through and emitted as valid Documents.

- Add _is_batch_error() to utils.py to detect error results
- Add _collect_batch_results() static method to filter errors with
  structured warning logs (classify_error + error_code_name)
- Fix LogRecord conflict: use 'code_name' instead of 'name' as the
  logging kwarg in both batch and sequential error logging paths
  ('name' is a reserved LogRecord attribute)
- Add 3 unit tests for _is_batch_error detection logic
- Add 3 integration tests for corrupt file/bytestream filtering
diff --git a/integrations/kreuzberg/src/haystack_integrations/components/converters/kreuzberg/converter.py b/integrations/kreuzberg/src/haystack_integrations/components/converters/kreuzberg/converter.py
@@ -19,6 +19,7 @@
     LanguageDetectionConfig,
     batch_extract_bytes_sync,
     batch_extract_files_sync,
+    classify_error,
     config_to_json,
     detect_mime_type,
     error_code_name,
@@ -33,6 +34,7 @@
     _config_from_json_str,
     _copy_config,
     _get_table_markdown,
+    _is_batch_error,
     _serialize_annotations,
     _serialize_images,
     _serialize_warnings,
@@ -218,20 +220,43 @@ def _extract_batch(
 
         # Batch-extract file paths
         if file_paths:
-            file_results = batch_extract_files_sync(file_paths, config=config, easyocr_kwargs=self.easyocr_kwargs)
-            for idx, result in zip(file_indices, file_results, strict=True):
-                results[idx] = result
+            self._collect_batch_results(
+                file_indices, batch_extract_files_sync(file_paths, config=config, easyocr_kwargs=self.easyocr_kwargs),
+                sources, results,
+            )
 
         # Batch-extract byte streams
         if bytes_data:
-            bytes_results = batch_extract_bytes_sync(
-                bytes_data, bytes_mimes, config=config, easyocr_kwargs=self.easyocr_kwargs
+            self._collect_batch_results(
+                bytes_indices,
+                batch_extract_bytes_sync(bytes_data, bytes_mimes, config=config, easyocr_kwargs=self.easyocr_kwargs),
+                sources, results,
             )
-            for idx, result in zip(bytes_indices, bytes_results, strict=True):
-                results[idx] = result
 
         return results
 
+    @staticmethod
+    def _collect_batch_results(
+        indices: list[int],
+        batch_results: list[ExtractionResult],
+        sources: list[str | Path | ByteStream],
+        results: list[ExtractionResult | None],
+    ) -> None:
+        """Filter batch results, logging and skipping error entries."""
+        for idx, result in zip(indices, batch_results, strict=True):
+            if _is_batch_error(result):
+                err_code = classify_error(result.content)
+                logger.warning(
+                    "Could not convert {source} to Document. Error code: {code} ({code_name}). "
+                    "Details: {details}. Skipping it.",
+                    source=sources[idx],
+                    code=err_code,
+                    code_name=error_code_name(err_code),
+                    details=result.content,
+                )
+                continue
+            results[idx] = result
+
     @staticmethod
     def _build_extraction_metadata(result: ExtractionResult) -> dict[str, Any]:
         """
@@ -501,11 +526,11 @@ def _extract_sequential(
                     details = get_error_details()
                     code_name = error_code_name(err_code) if err_code is not None else "UNKNOWN"
                     logger.warning(
-                        "Could not convert {source} to Document. Error code: {code} ({name}). "
+                        "Could not convert {source} to Document. Error code: {code} ({code_name}). "
                         "Details: {details}. Skipping it.",
                         source=source,
                         code=err_code,
-                        name=code_name,
+                        code_name=code_name,
                         details=details.get("message", str(e)),
                     )
                 except Exception as diag_err:
diff --git a/integrations/kreuzberg/src/haystack_integrations/components/converters/kreuzberg/utils.py b/integrations/kreuzberg/src/haystack_integrations/components/converters/kreuzberg/utils.py
@@ -9,10 +9,21 @@
     ExtractedImage,
     ExtractedTable,
     ExtractionConfig,
+    ExtractionResult,
     config_to_json,
 )
 
 
+def _is_batch_error(result: ExtractionResult) -> bool:
+    """Detect error results returned by kreuzberg's batch APIs.
+
+    Batch APIs return ``ExtractionResult(content="Error: ...", metadata={},
+    quality_score=None)`` instead of raising exceptions. Valid results always
+    have populated metadata (at minimum ``output_format``).
+    """
+    return result.metadata == {} and result.quality_score is None
+
+
 def _get_table_markdown(table: ExtractedTable | dict[str, Any]) -> str | None:
     """Get markdown string from a table (`ExtractedTable` object or dict)."""
     if isinstance(table, dict):
diff --git a/integrations/kreuzberg/tests/test_converter.py b/integrations/kreuzberg/tests/test_converter.py
@@ -20,6 +20,7 @@
 
 from haystack_integrations.components.converters.kreuzberg import KreuzbergConverter
 from haystack_integrations.components.converters.kreuzberg.utils import (
+    _is_batch_error,
     _serialize_warnings,
 )
 
@@ -693,3 +694,33 @@ def test_run_batch_success(mock_get_bs: MagicMock, converter: KreuzbergConverter
         result = converter.run(sources=[Path("a.pdf"), Path("b.pdf")])
 
     assert len(result["documents"]) == 2
+
+
+def test_is_batch_error_detects_error_result() -> None:
+    """Batch error results have empty metadata and None quality_score."""
+    result = MagicMock(spec=ExtractionResult)
+    result.metadata = {}
+    result.quality_score = None
+    result.content = "Error: could not parse file"
+
+    assert _is_batch_error(result) is True
+
+
+def test_is_batch_error_passes_valid_result() -> None:
+    """Valid results have populated metadata and are not flagged."""
+    result = MagicMock(spec=ExtractionResult)
+    result.metadata = {"output_format": "plain", "quality_score": 0.85}
+    result.quality_score = 0.85
+    result.content = "Hello world"
+
+    assert _is_batch_error(result) is False
+
+
+def test_is_batch_error_no_false_positive_on_error_content() -> None:
+    """Content starting with 'Error:' but valid metadata is NOT a batch error."""
+    result = MagicMock(spec=ExtractionResult)
+    result.metadata = {"output_format": "plain"}
+    result.quality_score = 0.5
+    result.content = "Error: this is actual document content that starts with Error:"
+
+    assert _is_batch_error(result) is False
diff --git a/integrations/kreuzberg/tests/test_converter_integration.py b/integrations/kreuzberg/tests/test_converter_integration.py
@@ -514,3 +514,44 @@ def test_edge_config_not_mutated_across_runs() -> None:
     converter.run(sources=[FIXTURES_DIR / "sample.txt"])
     # Original config should not be mutated
     assert config.output_format == "html"
+
+
+@pytest.mark.integration
+def test_batch_filters_corrupt_files(tmp_path: Path) -> None:
+    """Batch mode filters corrupt files and keeps valid ones."""
+    corrupt = tmp_path / "corrupt.pdf"
+    corrupt.write_bytes(b"not a valid pdf")
+
+    converter = KreuzbergConverter(batch=True)
+    result = converter.run(sources=[FIXTURES_DIR / "sample.txt", corrupt])
+    docs = _docs(result)
+
+    assert len(docs) == 1
+    assert "sample text document" in docs[0].content
+
+
+@pytest.mark.integration
+def test_batch_filters_corrupt_bytestream(tmp_path: Path) -> None:
+    """Batch mode filters corrupt ByteStream and keeps valid file."""
+    corrupt_bs = ByteStream(data=b"not a valid pdf", mime_type="application/pdf")
+
+    converter = KreuzbergConverter(batch=True)
+    result = converter.run(sources=[FIXTURES_DIR / "sample.txt", corrupt_bs])
+    docs = _docs(result)
+
+    assert len(docs) == 1
+    assert "sample text document" in docs[0].content
+
+
+@pytest.mark.integration
+def test_batch_all_corrupt(tmp_path: Path) -> None:
+    """Batch mode with all corrupt sources returns empty documents list."""
+    corrupt1 = tmp_path / "corrupt1.pdf"
+    corrupt1.write_bytes(b"not a valid pdf")
+    corrupt2 = tmp_path / "corrupt2.pdf"
+    corrupt2.write_bytes(b"also not valid")
+
+    converter = KreuzbergConverter(batch=True)
+    result = converter.run(sources=[corrupt1, corrupt2])
+
+    assert result["documents"] == []