diff --git a/haystack/components/classifiers/document_language_classifier.py b/haystack/components/classifiers/document_language_classifier.py
index 0697f22cb4..33cfd7a1a0 100644
--- a/haystack/components/classifiers/document_language_classifier.py
+++ b/haystack/components/classifiers/document_language_classifier.py
@@ -111,6 +111,12 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
 
     def _detect_language(self, document: Document) -> str | None:
         language = None
+        if document.content is None:
+            logger.warning(
+                "Langdetect cannot detect the language of Document with id: {document_id} because its content is None",
+                document_id=document.id,
+            )
+            return language
         try:
             language = langdetect.detect(document.content)
         except langdetect.LangDetectException:
diff --git a/test/components/classifiers/test_document_language_classifier.py b/test/components/classifiers/test_document_language_classifier.py
index b3f259f344..b489da7b81 100644
--- a/test/components/classifiers/test_document_language_classifier.py
+++ b/test/components/classifiers/test_document_language_classifier.py
@@ -50,3 +50,33 @@ def test_warning_if_no_language_detected(self, caplog):
             classifier = DocumentLanguageClassifier()
             classifier.run(documents=[Document(content=".")])
             assert "Langdetect cannot detect the language of Document with id" in caplog.text
+
+    def test_content_none_does_not_raise(self):
+        """Regression test for https://github.com/deepset-ai/haystack/issues/11418.
+
+        Documents with content=None (blob-only documents) must not raise TypeError.
+        They should be classified as 'unmatched' and a warning must be emitted.
+        """
+        classifier = DocumentLanguageClassifier()
+        # Should NOT raise TypeError
+        result = classifier.run(documents=[Document(content=None)])
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].meta["language"] == "unmatched"
+
+    def test_content_none_emits_warning(self, caplog):
+        """Regression test: a warning is logged for documents with content=None."""
+        with caplog.at_level(logging.WARNING):
+            classifier = DocumentLanguageClassifier()
+            classifier.run(documents=[Document(content=None)])
+            assert "Langdetect cannot detect the language of Document with id" in caplog.text
+
+    def test_mixed_none_and_text_content(self):
+        """Documents with content=None and normal documents can coexist in the same batch."""
+        classifier = DocumentLanguageClassifier()
+        docs = [
+            Document(content="This is an english sentence."),
+            Document(content=None),
+        ]
+        result = classifier.run(documents=docs)
+        assert result["documents"][0].meta["language"] == "en"
+        assert result["documents"][1].meta["language"] == "unmatched"