diff --git a/haystack/components/classifiers/document_language_classifier.py b/haystack/components/classifiers/document_language_classifier.py index 0697f22cb4..64d4703d8a 100644 --- a/haystack/components/classifiers/document_language_classifier.py +++ b/haystack/components/classifiers/document_language_classifier.py @@ -110,11 +110,15 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: return {"documents": new_documents} def _detect_language(self, document: Document) -> str | None: - language = None + if document.content is None: + logger.warning( + "Langdetect cannot detect the language of Document with id: {document_id}", document_id=document.id + ) + return None try: - language = langdetect.detect(document.content) + return langdetect.detect(document.content) except langdetect.LangDetectException: logger.warning( "Langdetect cannot detect the language of Document with id: {document_id}", document_id=document.id ) - return language + return None diff --git a/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml b/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml new file mode 100644 index 0000000000..3a9832669d --- /dev/null +++ b/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Prevent DocumentLanguageClassifier from crashing when ``Document.content=None`` by + marking such documents as unmatched and logging a warning. 
diff --git a/test/components/classifiers/test_document_language_classifier.py b/test/components/classifiers/test_document_language_classifier.py index b3f259f344..6fe6e2cb89 100644 --- a/test/components/classifiers/test_document_language_classifier.py +++ b/test/components/classifiers/test_document_language_classifier.py @@ -50,3 +50,10 @@ def test_warning_if_no_language_detected(self, caplog): classifier = DocumentLanguageClassifier() classifier.run(documents=[Document(content=".")]) assert "Langdetect cannot detect the language of Document with id" in caplog.text + + def test_none_content_is_unmatched(self, caplog): + with caplog.at_level(logging.WARNING): + classifier = DocumentLanguageClassifier() + result = classifier.run(documents=[Document(content=None)]) + assert result["documents"][0].meta["language"] == "unmatched" + assert "Langdetect cannot detect the language of Document with id" in caplog.text