fix(presidio): address reviewer feedback on warm-up pattern and tests

SyedShahmeerAli12 · SyedShahmeerAli12 · commit 10d90f572eb2 · 2026-04-13T13:32:24.000+05:00
- Add `_is_warmed_up` guard to `warm_up()` so the method is idempotent (repeated calls return early instead of rebuilding the engines)
- Auto-warm on first `run()` call instead of raising RuntimeError
- Update component docstrings to reflect lazy loading behavior
- Fix broken Presidio doc link (supported_languages → analyzer/languages)
- Add `_make_*_with_mocks()` helper in each test class to centralize mock
  setup; the helper also sets `_is_warmed_up = True` so the auto-warm in `run()` does not overwrite the injected mocks
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
@@ -20,7 +20,8 @@ class PresidioDocumentCleaner:
 
     Documents without text content are passed through unchanged.
 
-    Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines.
+    The analyzer and anonymizer engines are loaded on the first call to `run()`,
+    or by calling `warm_up()` explicitly beforehand.
 
     ### Usage example
 
@@ -29,7 +30,6 @@ class PresidioDocumentCleaner:
     from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner
 
     cleaner = PresidioDocumentCleaner()
-    cleaner.warm_up()
     result = cleaner.run(documents=[Document(content="My name is John and my email is john@example.com")])
     print(result["documents"][0].content)
     # My name is <PERSON> and my email is <EMAIL_ADDRESS>
@@ -48,7 +48,7 @@ def __init__(
 
         :param language:
             Language code for PII detection. Defaults to `"en"`.
-            See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
+            See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
         :param entities:
             List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
             If `None`, all supported entity types are used.
@@ -62,18 +62,22 @@ def __init__(
         self.score_threshold = score_threshold
         self._analyzer: AnalyzerEngine | None = None
         self._anonymizer: AnonymizerEngine | None = None
+        self._is_warmed_up = False
 
     def warm_up(self) -> None:
         """
         Initializes the Presidio analyzer and anonymizer engines.
 
-        This method loads the underlying NLP models and should be called before `run()`.
-        In a Haystack Pipeline, this is called automatically before the first run.
+        This method loads the underlying NLP models. In a Haystack Pipeline,
+        this is called automatically before the first run.
         """
-        if self._analyzer is None:
-            self._analyzer = AnalyzerEngine()
-        if self._anonymizer is None:
-            self._anonymizer = AnonymizerEngine()
+        if self._is_warmed_up:
+            return
+
+        self._analyzer = AnalyzerEngine()
+        self._anonymizer = AnonymizerEngine()
+
+        self._is_warmed_up = True
 
     @component.output_types(documents=list[Document])
     def run(self, documents: list[Document]) -> dict[str, list[Document]]:
@@ -85,22 +89,22 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
         :returns:
             A dictionary with key `documents` containing the cleaned Documents.
         """
+        if not self._is_warmed_up:
+            self.warm_up()
+
         cleaned: list[Document] = []
         for doc in documents:
             if doc.content is None:
                 cleaned.append(doc)
                 continue
-            if self._analyzer is None or self._anonymizer is None:
-                msg = "The component was not warmed up. Call warm_up() before running it."
-                raise RuntimeError(msg)
             try:
-                analyzer_results = self._analyzer.analyze(
+                analyzer_results = self._analyzer.analyze(  # type: ignore[union-attr]
                     text=doc.content,
                     language=self.language,
                     entities=self.entities,
                     score_threshold=self.score_threshold,
                 )
-                anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results)  # type: ignore[arg-type]
+                anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results)  # type: ignore[arg-type, union-attr]
                 cleaned.append(Document(content=anonymized.text, meta=doc.meta.copy()))
             except Exception as e:
                 logger.warning(
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
@@ -21,7 +21,8 @@ class PresidioEntityExtractor:
 
     Original Documents are not mutated. Documents without text content are passed through unchanged.
 
-    Call `warm_up()` before running this component to load the Presidio analyzer engine.
+    The analyzer engine is loaded on the first call to `run()`,
+    or by calling `warm_up()` explicitly beforehand.
 
     ### Usage example
 
@@ -30,7 +31,6 @@ class PresidioEntityExtractor:
     from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor
 
     extractor = PresidioEntityExtractor()
-    extractor.warm_up()
     result = extractor.run(documents=[Document(content="Contact Alice at alice@example.com")])
     print(result["documents"][0].meta["entities"])
     # [{"entity_type": "PERSON", "start": 8, "end": 13, "score": 0.85},
@@ -50,7 +50,7 @@ def __init__(
 
         :param language:
             Language code for PII detection. Defaults to `"en"`.
-            See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
+            See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
         :param entities:
             List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
             If `None`, all supported entity types are detected.
@@ -63,16 +63,21 @@ def __init__(
         self.entities = entities
         self.score_threshold = score_threshold
         self._analyzer: AnalyzerEngine | None = None
+        self._is_warmed_up = False
 
     def warm_up(self) -> None:
         """
         Initializes the Presidio analyzer engine.
 
-        This method loads the underlying NLP models and should be called before `run()`.
-        In a Haystack Pipeline, this is called automatically before the first run.
+        This method loads the underlying NLP models. In a Haystack Pipeline,
+        this is called automatically before the first run.
         """
-        if self._analyzer is None:
-            self._analyzer = AnalyzerEngine()
+        if self._is_warmed_up:
+            return
+
+        self._analyzer = AnalyzerEngine()
+
+        self._is_warmed_up = True
 
     @component.output_types(documents=list[Document])
     def run(self, documents: list[Document]) -> dict[str, list[Document]]:
@@ -85,16 +90,16 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
             A dictionary with key `documents` containing Documents with detected entities
             stored in metadata under the key `"entities"`.
         """
+        if not self._is_warmed_up:
+            self.warm_up()
+
         result_docs: list[Document] = []
         for doc in documents:
             if doc.content is None:
                 result_docs.append(doc)
                 continue
-            if self._analyzer is None:
-                msg = "The component was not warmed up. Call warm_up() before running it."
-                raise RuntimeError(msg)
             try:
-                analyzer_results = self._analyzer.analyze(
+                analyzer_results = self._analyzer.analyze(  # type: ignore[union-attr]
                     text=doc.content,
                     language=self.language,
                     entities=self.entities,
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
@@ -18,15 +18,15 @@ class PresidioTextCleaner:
     a new list of strings with PII replaced by entity type placeholders (e.g. `<PERSON>`).
     Useful for sanitizing user queries before they are sent to an LLM.
 
-    Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines.
+    The analyzer and anonymizer engines are loaded on the first call to `run()`,
+    or by calling `warm_up()` explicitly beforehand.
 
     ### Usage example
 
     ```python
     from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner
 
     cleaner = PresidioTextCleaner()
-    cleaner.warm_up()
     result = cleaner.run(texts=["Hi, I am John Smith, call me at 212-555-1234"])
     print(result["texts"][0])
     # Hi, I am <PERSON>, call me at <PHONE_NUMBER>
@@ -45,7 +45,7 @@ def __init__(
 
         :param language:
             Language code for PII detection. Defaults to `"en"`.
-            See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
+            See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
         :param entities:
             List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`).
             If `None`, all supported entity types are used.
@@ -59,18 +59,22 @@ def __init__(
         self.score_threshold = score_threshold
         self._analyzer: AnalyzerEngine | None = None
         self._anonymizer: AnonymizerEngine | None = None
+        self._is_warmed_up = False
 
     def warm_up(self) -> None:
         """
         Initializes the Presidio analyzer and anonymizer engines.
 
-        This method loads the underlying NLP models and should be called before `run()`.
-        In a Haystack Pipeline, this is called automatically before the first run.
+        This method loads the underlying NLP models. In a Haystack Pipeline,
+        this is called automatically before the first run.
         """
-        if self._analyzer is None:
-            self._analyzer = AnalyzerEngine()
-        if self._anonymizer is None:
-            self._anonymizer = AnonymizerEngine()
+        if self._is_warmed_up:
+            return
+
+        self._analyzer = AnalyzerEngine()
+        self._anonymizer = AnonymizerEngine()
+
+        self._is_warmed_up = True
 
     @component.output_types(texts=list[str])
     def run(self, texts: list[str]) -> dict[str, list[str]]:
@@ -82,19 +86,19 @@ def run(self, texts: list[str]) -> dict[str, list[str]]:
         :returns:
             A dictionary with key `texts` containing the cleaned strings.
         """
-        if self._analyzer is None or self._anonymizer is None:
-            msg = "The component was not warmed up. Call warm_up() before running it."
-            raise RuntimeError(msg)
+        if not self._is_warmed_up:
+            self.warm_up()
+
         cleaned: list[str] = []
         for text in texts:
             try:
-                analyzer_results = self._analyzer.analyze(
+                analyzer_results = self._analyzer.analyze(  # type: ignore[union-attr]
                     text=text,
                     language=self.language,
                     entities=self.entities,
                     score_threshold=self.score_threshold,
                 )
-                anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results)  # type: ignore[arg-type]
+                anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results)  # type: ignore[arg-type, union-attr]
                 cleaned.append(anonymized.text)
             except Exception as e:
                 logger.warning(
diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py
@@ -49,13 +49,19 @@ def test_from_dict(self):
         assert cleaner.entities == ["PERSON"]
         assert cleaner.score_threshold == 0.6
 
+    def _make_cleaner_with_mocks(self, **kwargs):
+        """Return a cleaner with mocked engines so unit tests don't load real NLP models."""
+        cleaner = PresidioDocumentCleaner(**kwargs)
+        cleaner._analyzer = MagicMock()
+        cleaner._anonymizer = MagicMock()
+        cleaner._is_warmed_up = True
+        return cleaner
+
     def test_run_anonymizes_pii(self):
-        cleaner = PresidioDocumentCleaner()
+        cleaner = self._make_cleaner_with_mocks()
         mock_result = MagicMock()
         mock_result.text = "My name is <PERSON> and email is <EMAIL_ADDRESS>"
-        cleaner._anonymizer = MagicMock()
         cleaner._anonymizer.anonymize.return_value = mock_result
-        cleaner._analyzer = MagicMock()
         cleaner._analyzer.analyze.return_value = []
 
         docs = [Document(content="My name is John and email is john@example.com")]
@@ -65,12 +71,10 @@ def test_run_anonymizes_pii(self):
         assert result["documents"][0].content == "My name is <PERSON> and email is <EMAIL_ADDRESS>"
 
     def test_run_preserves_metadata(self):
-        cleaner = PresidioDocumentCleaner()
+        cleaner = self._make_cleaner_with_mocks()
         mock_result = MagicMock()
         mock_result.text = "Hello <PERSON>"
-        cleaner._anonymizer = MagicMock()
         cleaner._anonymizer.anonymize.return_value = mock_result
-        cleaner._analyzer = MagicMock()
         cleaner._analyzer.analyze.return_value = []
 
         docs = [Document(content="Hello John", meta={"source": "email", "page": 1})]
@@ -80,12 +84,10 @@ def test_run_preserves_metadata(self):
         assert result["documents"][0].meta["page"] == 1
 
     def test_run_does_not_mutate_original(self):
-        cleaner = PresidioDocumentCleaner()
+        cleaner = self._make_cleaner_with_mocks()
         mock_result = MagicMock()
         mock_result.text = "Hello <PERSON>"
-        cleaner._anonymizer = MagicMock()
         cleaner._anonymizer.anonymize.return_value = mock_result
-        cleaner._analyzer = MagicMock()
         cleaner._analyzer.analyze.return_value = []
 
         original = Document(content="Hello John")
@@ -94,7 +96,7 @@ def test_run_does_not_mutate_original(self):
         assert original.content == "Hello John"
 
     def test_run_passes_through_none_content(self):
-        cleaner = PresidioDocumentCleaner()
+        cleaner = self._make_cleaner_with_mocks()
         doc = Document(content=None, meta={"source": "test"})
         result = cleaner.run(documents=[doc])
 
@@ -103,10 +105,8 @@ def test_run_passes_through_none_content(self):
         assert result["documents"][0].meta["source"] == "test"
 
     def test_run_skips_on_error(self, caplog):
-        cleaner = PresidioDocumentCleaner()
-        cleaner._analyzer = MagicMock()
+        cleaner = self._make_cleaner_with_mocks()
         cleaner._analyzer.analyze.side_effect = Exception("Analyzer error")
-        cleaner._anonymizer = MagicMock()
 
         doc = Document(content="Some text with PII")
         with caplog.at_level(logging.WARNING):
@@ -117,12 +117,10 @@ def test_run_skips_on_error(self, caplog):
         assert "Could not anonymize" in caplog.text
 
     def test_run_multiple_documents(self):
-        cleaner = PresidioDocumentCleaner()
+        cleaner = self._make_cleaner_with_mocks()
         mock_result = MagicMock()
         mock_result.text = "cleaned"
-        cleaner._anonymizer = MagicMock()
         cleaner._anonymizer.anonymize.return_value = mock_result
-        cleaner._analyzer = MagicMock()
         cleaner._analyzer.analyze.return_value = []
 
         docs = [Document(content=f"doc {i}") for i in range(3)]
@@ -131,12 +129,10 @@ def test_run_multiple_documents(self):
         assert len(result["documents"]) == 3
 
     def test_run_passes_language_and_entities_to_analyzer(self):
-        cleaner = PresidioDocumentCleaner(language="de", entities=["PERSON"], score_threshold=0.8)
+        cleaner = self._make_cleaner_with_mocks(language="de", entities=["PERSON"], score_threshold=0.8)
         mock_result = MagicMock()
         mock_result.text = "cleaned"
-        cleaner._anonymizer = MagicMock()
         cleaner._anonymizer.anonymize.return_value = mock_result
-        cleaner._analyzer = MagicMock()
         cleaner._analyzer.analyze.return_value = []
 
         cleaner.run(documents=[Document(content="Hello John")])
diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py
diff --git a/integrations/presidio/tests/test_presidio_text_cleaner.py b/integrations/presidio/tests/test_presidio_text_cleaner.py