Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:

def _detect_language(self, document: Document) -> str | None:
language = None
if document.content is None:
logger.warning(
"Langdetect cannot detect the language of Document with id: {document_id} because its content is None",
document_id=document.id,
)
return language
try:
language = langdetect.detect(document.content)
except langdetect.LangDetectException:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,33 @@ def test_warning_if_no_language_detected(self, caplog):
classifier = DocumentLanguageClassifier()
classifier.run(documents=[Document(content=".")])
assert "Langdetect cannot detect the language of Document with id" in caplog.text

def test_content_none_does_not_raise(self):
    """Regression test for https://github.com/deepset-ai/haystack/issues/11418.

    A Document whose content is None (a blob-only document) must pass through
    the classifier without a TypeError and come back labelled 'unmatched'.
    """
    language_classifier = DocumentLanguageClassifier()
    blob_only_doc = Document(content=None)
    # Reaching the assertions at all proves that run() did not raise.
    output = language_classifier.run(documents=[blob_only_doc])
    classified = output["documents"]
    assert len(classified) == 1
    assert classified[0].meta["language"] == "unmatched"

def test_content_none_emits_warning(self, caplog):
    """Regression test: classifying a Document with content=None logs a warning."""
    expected_fragment = "Langdetect cannot detect the language of Document with id"
    with caplog.at_level(logging.WARNING):
        language_classifier = DocumentLanguageClassifier()
        blob_only_doc = Document(content=None)
        language_classifier.run(documents=[blob_only_doc])
        # Check the captured log output while the WARNING capture level is active.
        assert expected_fragment in caplog.text

def test_mixed_none_and_text_content(self):
    """A batch may mix a content=None Document with a regular text Document."""
    language_classifier = DocumentLanguageClassifier()
    english_doc = Document(content="This is an english sentence.")
    blob_only_doc = Document(content=None)
    output = language_classifier.run(documents=[english_doc, blob_only_doc])
    classified = output["documents"]
    # Output positions are checked against input order: text first, None second.
    assert classified[0].meta["language"] == "en"
    assert classified[1].meta["language"] == "unmatched"