From c831fa524588eb972c7ec365f0687d9dd978c689 Mon Sep 17 00:00:00 2001 From: Aditya Raut Date: Wed, 27 May 2026 16:17:23 +0530 Subject: [PATCH 1/4] fix: handle Document with content=None in DocumentLanguageClassifier --- .../components/classifiers/document_language_classifier.py | 5 +++++ ...-language-classifier-none-content-a245d044b02a19b0.yaml | 5 +++++ .../classifiers/test_document_language_classifier.py | 7 +++++++ 3 files changed, 17 insertions(+) create mode 100644 releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml diff --git a/haystack/components/classifiers/document_language_classifier.py b/haystack/components/classifiers/document_language_classifier.py index 0697f22cb4..22bc097345 100644 --- a/haystack/components/classifiers/document_language_classifier.py +++ b/haystack/components/classifiers/document_language_classifier.py @@ -111,6 +111,11 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: def _detect_language(self, document: Document) -> str | None: language = None + if document.content is None: + logger.warning( + "Langdetect cannot detect the language of Document with id: {document_id}", document_id=document.id + ) + return language try: language = langdetect.detect(document.content) except langdetect.LangDetectException: diff --git a/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml b/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml new file mode 100644 index 0000000000..43588bd920 --- /dev/null +++ b/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Prevent DocumentLanguageClassifier from crashing on Documents with no text by + marking them as unmatched and logging a warning. diff --git a/test/components/classifiers/test_document_language_classifier.py b/test/components/classifiers/test_document_language_classifier.py index b3f259f344..6fe6e2cb89 100644 --- a/test/components/classifiers/test_document_language_classifier.py +++ b/test/components/classifiers/test_document_language_classifier.py @@ -50,3 +50,10 @@ def test_warning_if_no_language_detected(self, caplog): classifier = DocumentLanguageClassifier() classifier.run(documents=[Document(content=".")]) assert "Langdetect cannot detect the language of Document with id" in caplog.text + + def test_none_content_is_unmatched(self, caplog): + with caplog.at_level(logging.WARNING): + classifier = DocumentLanguageClassifier() + result = classifier.run(documents=[Document(content=None)]) + assert result["documents"][0].meta["language"] == "unmatched" + assert "Langdetect cannot detect the language of Document with id" in caplog.text From eeeea249b0f9e970f767c7b16b002acefe8bffb8 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 29 May 2026 12:02:23 +0200 Subject: [PATCH 2/4] add double backticks to release note and mention None --- ...ument-language-classifier-none-content-a245d044b02a19b0.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml b/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml index 43588bd920..3a9832669d 100644 --- a/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml +++ b/releasenotes/notes/document-language-classifier-none-content-a245d044b02a19b0.yaml @@ -1,5 +1,5 @@ --- fixes: - | - Prevent DocumentLanguageClassifier from crashing on Documents with no text by + Prevent DocumentLanguageClassifier from crashing when ``Document.content=None`` by marking them as unmatched and logging a warning. From 3dc15b2743aabdcb0e08bab64086a955cd2ae390 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 29 May 2026 12:04:30 +0200 Subject: [PATCH 3/4] Refactor _detect_language method to simplify return --- .../components/classifiers/document_language_classifier.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/haystack/components/classifiers/document_language_classifier.py b/haystack/components/classifiers/document_language_classifier.py index 22bc097345..e69b4f369d 100644 --- a/haystack/components/classifiers/document_language_classifier.py +++ b/haystack/components/classifiers/document_language_classifier.py @@ -110,16 +110,15 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]: return {"documents": new_documents} def _detect_language(self, document: Document) -> str | None: - language = None if document.content is None: logger.warning( "Langdetect cannot detect the language of Document with id: {document_id}", document_id=document.id ) - return language + return None try: language = langdetect.detect(document.content) except langdetect.LangDetectException: logger.warning( "Langdetect cannot detect the language of Document with id: {document_id}", document_id=document.id ) - return language + return None From e0365f1bc009a8fdba7ab3f1e39a01ca6576cb33 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 29 May 2026 12:12:31 +0200 Subject: [PATCH 4/4] Simplify language detection return statement --- haystack/components/classifiers/document_language_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/classifiers/document_language_classifier.py b/haystack/components/classifiers/document_language_classifier.py index e69b4f369d..64d4703d8a 100644 --- a/haystack/components/classifiers/document_language_classifier.py +++ b/haystack/components/classifiers/document_language_classifier.py @@ -116,7 +116,7 @@ def _detect_language(self, document: Document) -> str | None: ) return None try: - language = langdetect.detect(document.content) + return langdetect.detect(document.content) except langdetect.LangDetectException: logger.warning( "Langdetect cannot detect the language of Document with id: {document_id}", document_id=document.id