From d218205060e7f65bbe0d9c398d6674698fc7103f Mon Sep 17 00:00:00 2001
From: devteamaegis
Date: Thu, 28 May 2026 04:18:46 -0400
Subject: [PATCH 1/2] fix(classifiers): guard against None content in DocumentLanguageClassifier

---
 .../components/classifiers/document_language_classifier.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/haystack/components/classifiers/document_language_classifier.py b/haystack/components/classifiers/document_language_classifier.py
index 0697f22cb4..33cfd7a1a0 100644
--- a/haystack/components/classifiers/document_language_classifier.py
+++ b/haystack/components/classifiers/document_language_classifier.py
@@ -111,6 +111,12 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
 
     def _detect_language(self, document: Document) -> str | None:
         language = None
+        if document.content is None:
+            logger.warning(
+                "Langdetect cannot detect the language of Document with id: {document_id} because its content is None",
+                document_id=document.id,
+            )
+            return language
         try:
             language = langdetect.detect(document.content)
         except langdetect.LangDetectException:

From 345d9b27732a3e80a22495b6fbcb138c0bb3a502 Mon Sep 17 00:00:00 2001
From: devteamaegis
Date: Thu, 28 May 2026 04:18:52 -0400
Subject: [PATCH 2/2] test(classifiers): add regression tests for content=None in DocumentLanguageClassifier

---
 .../test_document_language_classifier.py | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/test/components/classifiers/test_document_language_classifier.py b/test/components/classifiers/test_document_language_classifier.py
index b3f259f344..b489da7b81 100644
--- a/test/components/classifiers/test_document_language_classifier.py
+++ b/test/components/classifiers/test_document_language_classifier.py
@@ -50,3 +50,33 @@ def test_warning_if_no_language_detected(self, caplog):
             classifier = DocumentLanguageClassifier()
             classifier.run(documents=[Document(content=".")])
             assert "Langdetect cannot detect the language of Document with id" in caplog.text
+
+    def test_content_none_does_not_raise(self):
+        """Regression test for https://github.com/deepset-ai/haystack/issues/11418.
+
+        Documents with content=None (blob-only documents) must not raise TypeError.
+        They should be classified as 'unmatched' and a warning must be emitted.
+        """
+        classifier = DocumentLanguageClassifier()
+        # Should NOT raise TypeError
+        result = classifier.run(documents=[Document(content=None)])
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].meta["language"] == "unmatched"
+
+    def test_content_none_emits_warning(self, caplog):
+        """Regression test: a warning is logged for documents with content=None."""
+        with caplog.at_level(logging.WARNING):
+            classifier = DocumentLanguageClassifier()
+            classifier.run(documents=[Document(content=None)])
+            assert "Langdetect cannot detect the language of Document with id" in caplog.text
+
+    def test_mixed_none_and_text_content(self):
+        """Documents with content=None and normal documents can coexist in the same batch."""
+        classifier = DocumentLanguageClassifier()
+        docs = [
+            Document(content="This is an english sentence."),
+            Document(content=None),
+        ]
+        result = classifier.run(documents=docs)
+        assert result["documents"][0].meta["language"] == "en"
+        assert result["documents"][1].meta["language"] == "unmatched"