Skip to content

Commit cdeec75

Browse files
fix: handle Document with content=None in DocumentLanguageClassifier (#11419)
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
1 parent f822499 commit cdeec75

3 files changed

Lines changed: 19 additions & 3 deletions

File tree

haystack/components/classifiers/document_language_classifier.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,11 +110,15 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
110110
return {"documents": new_documents}
111111

112112
def _detect_language(self, document: Document) -> str | None:
113-
language = None
113+
if document.content is None:
114+
logger.warning(
115+
"Langdetect cannot detect the language of Document with id: {document_id}", document_id=document.id
116+
)
117+
return None
114118
try:
115-
language = langdetect.detect(document.content)
119+
return langdetect.detect(document.content)
116120
except langdetect.LangDetectException:
117121
logger.warning(
118122
"Langdetect cannot detect the language of Document with id: {document_id}", document_id=document.id
119123
)
120-
return language
124+
return None
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
fixes:
3+
- |
4+
Prevent DocumentLanguageClassifier from crashing when ``Document.content=None`` by
5+
marking such documents as unmatched and logging a warning.

test/components/classifiers/test_document_language_classifier.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,10 @@ def test_warning_if_no_language_detected(self, caplog):
5050
classifier = DocumentLanguageClassifier()
5151
classifier.run(documents=[Document(content=".")])
5252
assert "Langdetect cannot detect the language of Document with id" in caplog.text
53+
54+
def test_none_content_is_unmatched(self, caplog):
55+
with caplog.at_level(logging.WARNING):
56+
classifier = DocumentLanguageClassifier()
57+
result = classifier.run(documents=[Document(content=None)])
58+
assert result["documents"][0].meta["language"] == "unmatched"
59+
assert "Langdetect cannot detect the language of Document with id" in caplog.text

0 commit comments

Comments
 (0)