Skip to content

Commit 10d90f5

Browse files
fix(presidio): address reviewer feedback on warm-up pattern and tests
- Add `_is_warmed_up` guard to `warm_up()` so repeated calls are idempotent
- Auto-warm on first `run()` call instead of raising RuntimeError
- Update component docstrings to reflect lazy loading behavior
- Fix broken Presidio doc link (supported_languages → analyzer/languages)
- Add `_make_*_with_mocks()` helper in each test class to centralize mock setup and prevent auto-warm from overwriting injected mocks
1 parent 7e15ec3 commit 10d90f5

6 files changed

Lines changed: 92 additions & 81 deletions

File tree

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,8 @@ class PresidioDocumentCleaner:
2020
2121
Documents without text content are passed through unchanged.
2222
23-
Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines.
23+
The analyzer and anonymizer engines are loaded on the first call to `run()`,
24+
or by calling `warm_up()` explicitly beforehand.
2425
2526
### Usage example
2627
@@ -29,7 +30,6 @@ class PresidioDocumentCleaner:
2930
from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner
3031
3132
cleaner = PresidioDocumentCleaner()
32-
cleaner.warm_up()
3333
result = cleaner.run(documents=[Document(content="My name is John and my email is john@example.com")])
3434
print(result["documents"][0].content)
3535
# My name is <PERSON> and my email is <EMAIL_ADDRESS>
@@ -48,7 +48,7 @@ def __init__(
4848
4949
:param language:
5050
Language code for PII detection. Defaults to `"en"`.
51-
See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
51+
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
5252
:param entities:
5353
List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
5454
If `None`, all supported entity types are used.
@@ -62,18 +62,22 @@ def __init__(
6262
self.score_threshold = score_threshold
6363
self._analyzer: AnalyzerEngine | None = None
6464
self._anonymizer: AnonymizerEngine | None = None
65+
self._is_warmed_up = False
6566

6667
def warm_up(self) -> None:
6768
"""
6869
Initializes the Presidio analyzer and anonymizer engines.
6970
70-
This method loads the underlying NLP models and should be called before `run()`.
71-
In a Haystack Pipeline, this is called automatically before the first run.
71+
This method loads the underlying NLP models. In a Haystack Pipeline,
72+
this is called automatically before the first run.
7273
"""
73-
if self._analyzer is None:
74-
self._analyzer = AnalyzerEngine()
75-
if self._anonymizer is None:
76-
self._anonymizer = AnonymizerEngine()
74+
if self._is_warmed_up:
75+
return
76+
77+
self._analyzer = AnalyzerEngine()
78+
self._anonymizer = AnonymizerEngine()
79+
80+
self._is_warmed_up = True
7781

7882
@component.output_types(documents=list[Document])
7983
def run(self, documents: list[Document]) -> dict[str, list[Document]]:
@@ -85,22 +89,22 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
8589
:returns:
8690
A dictionary with key `documents` containing the cleaned Documents.
8791
"""
92+
if not self._is_warmed_up:
93+
self.warm_up()
94+
8895
cleaned: list[Document] = []
8996
for doc in documents:
9097
if doc.content is None:
9198
cleaned.append(doc)
9299
continue
93-
if self._analyzer is None or self._anonymizer is None:
94-
msg = "The component was not warmed up. Call warm_up() before running it."
95-
raise RuntimeError(msg)
96100
try:
97-
analyzer_results = self._analyzer.analyze(
101+
analyzer_results = self._analyzer.analyze( # type: ignore[union-attr]
98102
text=doc.content,
99103
language=self.language,
100104
entities=self.entities,
101105
score_threshold=self.score_threshold,
102106
)
103-
anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results) # type: ignore[arg-type]
107+
anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results) # type: ignore[arg-type, union-attr]
104108
cleaned.append(Document(content=anonymized.text, meta=doc.meta.copy()))
105109
except Exception as e:
106110
logger.warning(

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,8 @@ class PresidioEntityExtractor:
2121
2222
Original Documents are not mutated. Documents without text content are passed through unchanged.
2323
24-
Call `warm_up()` before running this component to load the Presidio analyzer engine.
24+
The analyzer engine is loaded on the first call to `run()`,
25+
or by calling `warm_up()` explicitly beforehand.
2526
2627
### Usage example
2728
@@ -30,7 +31,6 @@ class PresidioEntityExtractor:
3031
from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor
3132
3233
extractor = PresidioEntityExtractor()
33-
extractor.warm_up()
3434
result = extractor.run(documents=[Document(content="Contact Alice at alice@example.com")])
3535
print(result["documents"][0].meta["entities"])
3636
# [{"entity_type": "PERSON", "start": 8, "end": 13, "score": 0.85},
@@ -50,7 +50,7 @@ def __init__(
5050
5151
:param language:
5252
Language code for PII detection. Defaults to `"en"`.
53-
See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
53+
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
5454
:param entities:
5555
List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
5656
If `None`, all supported entity types are detected.
@@ -63,16 +63,21 @@ def __init__(
6363
self.entities = entities
6464
self.score_threshold = score_threshold
6565
self._analyzer: AnalyzerEngine | None = None
66+
self._is_warmed_up = False
6667

6768
def warm_up(self) -> None:
6869
"""
6970
Initializes the Presidio analyzer engine.
7071
71-
This method loads the underlying NLP models and should be called before `run()`.
72-
In a Haystack Pipeline, this is called automatically before the first run.
72+
This method loads the underlying NLP models. In a Haystack Pipeline,
73+
this is called automatically before the first run.
7374
"""
74-
if self._analyzer is None:
75-
self._analyzer = AnalyzerEngine()
75+
if self._is_warmed_up:
76+
return
77+
78+
self._analyzer = AnalyzerEngine()
79+
80+
self._is_warmed_up = True
7681

7782
@component.output_types(documents=list[Document])
7883
def run(self, documents: list[Document]) -> dict[str, list[Document]]:
@@ -85,16 +90,16 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
8590
A dictionary with key `documents` containing Documents with detected entities
8691
stored in metadata under the key `"entities"`.
8792
"""
93+
if not self._is_warmed_up:
94+
self.warm_up()
95+
8896
result_docs: list[Document] = []
8997
for doc in documents:
9098
if doc.content is None:
9199
result_docs.append(doc)
92100
continue
93-
if self._analyzer is None:
94-
msg = "The component was not warmed up. Call warm_up() before running it."
95-
raise RuntimeError(msg)
96101
try:
97-
analyzer_results = self._analyzer.analyze(
102+
analyzer_results = self._analyzer.analyze( # type: ignore[union-attr]
98103
text=doc.content,
99104
language=self.language,
100105
entities=self.entities,

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -18,15 +18,15 @@ class PresidioTextCleaner:
1818
a new list of strings with PII replaced by entity type placeholders (e.g. `<PERSON>`).
1919
Useful for sanitizing user queries before they are sent to an LLM.
2020
21-
Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines.
21+
The analyzer and anonymizer engines are loaded on the first call to `run()`,
22+
or by calling `warm_up()` explicitly beforehand.
2223
2324
### Usage example
2425
2526
```python
2627
from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner
2728
2829
cleaner = PresidioTextCleaner()
29-
cleaner.warm_up()
3030
result = cleaner.run(texts=["Hi, I am John Smith, call me at 212-555-1234"])
3131
print(result["texts"][0])
3232
# Hi, I am <PERSON>, call me at <PHONE_NUMBER>
@@ -45,7 +45,7 @@ def __init__(
4545
4646
:param language:
4747
Language code for PII detection. Defaults to `"en"`.
48-
See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
48+
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
4949
:param entities:
5050
List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`).
5151
If `None`, all supported entity types are used.
@@ -59,18 +59,22 @@ def __init__(
5959
self.score_threshold = score_threshold
6060
self._analyzer: AnalyzerEngine | None = None
6161
self._anonymizer: AnonymizerEngine | None = None
62+
self._is_warmed_up = False
6263

6364
def warm_up(self) -> None:
6465
"""
6566
Initializes the Presidio analyzer and anonymizer engines.
6667
67-
This method loads the underlying NLP models and should be called before `run()`.
68-
In a Haystack Pipeline, this is called automatically before the first run.
68+
This method loads the underlying NLP models. In a Haystack Pipeline,
69+
this is called automatically before the first run.
6970
"""
70-
if self._analyzer is None:
71-
self._analyzer = AnalyzerEngine()
72-
if self._anonymizer is None:
73-
self._anonymizer = AnonymizerEngine()
71+
if self._is_warmed_up:
72+
return
73+
74+
self._analyzer = AnalyzerEngine()
75+
self._anonymizer = AnonymizerEngine()
76+
77+
self._is_warmed_up = True
7478

7579
@component.output_types(texts=list[str])
7680
def run(self, texts: list[str]) -> dict[str, list[str]]:
@@ -82,19 +86,19 @@ def run(self, texts: list[str]) -> dict[str, list[str]]:
8286
:returns:
8387
A dictionary with key `texts` containing the cleaned strings.
8488
"""
85-
if self._analyzer is None or self._anonymizer is None:
86-
msg = "The component was not warmed up. Call warm_up() before running it."
87-
raise RuntimeError(msg)
89+
if not self._is_warmed_up:
90+
self.warm_up()
91+
8892
cleaned: list[str] = []
8993
for text in texts:
9094
try:
91-
analyzer_results = self._analyzer.analyze(
95+
analyzer_results = self._analyzer.analyze( # type: ignore[union-attr]
9296
text=text,
9397
language=self.language,
9498
entities=self.entities,
9599
score_threshold=self.score_threshold,
96100
)
97-
anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results) # type: ignore[arg-type]
101+
anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results) # type: ignore[arg-type, union-attr]
98102
cleaned.append(anonymized.text)
99103
except Exception as e:
100104
logger.warning(

integrations/presidio/tests/test_presidio_document_cleaner.py

Lines changed: 15 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -49,13 +49,19 @@ def test_from_dict(self):
4949
assert cleaner.entities == ["PERSON"]
5050
assert cleaner.score_threshold == 0.6
5151

52+
def _make_cleaner_with_mocks(self, **kwargs):
53+
"""Return a cleaner with mocked engines so unit tests don't load real NLP models."""
54+
cleaner = PresidioDocumentCleaner(**kwargs)
55+
cleaner._analyzer = MagicMock()
56+
cleaner._anonymizer = MagicMock()
57+
cleaner._is_warmed_up = True
58+
return cleaner
59+
5260
def test_run_anonymizes_pii(self):
53-
cleaner = PresidioDocumentCleaner()
61+
cleaner = self._make_cleaner_with_mocks()
5462
mock_result = MagicMock()
5563
mock_result.text = "My name is <PERSON> and email is <EMAIL_ADDRESS>"
56-
cleaner._anonymizer = MagicMock()
5764
cleaner._anonymizer.anonymize.return_value = mock_result
58-
cleaner._analyzer = MagicMock()
5965
cleaner._analyzer.analyze.return_value = []
6066

6167
docs = [Document(content="My name is John and email is john@example.com")]
@@ -65,12 +71,10 @@ def test_run_anonymizes_pii(self):
6571
assert result["documents"][0].content == "My name is <PERSON> and email is <EMAIL_ADDRESS>"
6672

6773
def test_run_preserves_metadata(self):
68-
cleaner = PresidioDocumentCleaner()
74+
cleaner = self._make_cleaner_with_mocks()
6975
mock_result = MagicMock()
7076
mock_result.text = "Hello <PERSON>"
71-
cleaner._anonymizer = MagicMock()
7277
cleaner._anonymizer.anonymize.return_value = mock_result
73-
cleaner._analyzer = MagicMock()
7478
cleaner._analyzer.analyze.return_value = []
7579

7680
docs = [Document(content="Hello John", meta={"source": "email", "page": 1})]
@@ -80,12 +84,10 @@ def test_run_preserves_metadata(self):
8084
assert result["documents"][0].meta["page"] == 1
8185

8286
def test_run_does_not_mutate_original(self):
83-
cleaner = PresidioDocumentCleaner()
87+
cleaner = self._make_cleaner_with_mocks()
8488
mock_result = MagicMock()
8589
mock_result.text = "Hello <PERSON>"
86-
cleaner._anonymizer = MagicMock()
8790
cleaner._anonymizer.anonymize.return_value = mock_result
88-
cleaner._analyzer = MagicMock()
8991
cleaner._analyzer.analyze.return_value = []
9092

9193
original = Document(content="Hello John")
@@ -94,7 +96,7 @@ def test_run_does_not_mutate_original(self):
9496
assert original.content == "Hello John"
9597

9698
def test_run_passes_through_none_content(self):
97-
cleaner = PresidioDocumentCleaner()
99+
cleaner = self._make_cleaner_with_mocks()
98100
doc = Document(content=None, meta={"source": "test"})
99101
result = cleaner.run(documents=[doc])
100102

@@ -103,10 +105,8 @@ def test_run_passes_through_none_content(self):
103105
assert result["documents"][0].meta["source"] == "test"
104106

105107
def test_run_skips_on_error(self, caplog):
106-
cleaner = PresidioDocumentCleaner()
107-
cleaner._analyzer = MagicMock()
108+
cleaner = self._make_cleaner_with_mocks()
108109
cleaner._analyzer.analyze.side_effect = Exception("Analyzer error")
109-
cleaner._anonymizer = MagicMock()
110110

111111
doc = Document(content="Some text with PII")
112112
with caplog.at_level(logging.WARNING):
@@ -117,12 +117,10 @@ def test_run_skips_on_error(self, caplog):
117117
assert "Could not anonymize" in caplog.text
118118

119119
def test_run_multiple_documents(self):
120-
cleaner = PresidioDocumentCleaner()
120+
cleaner = self._make_cleaner_with_mocks()
121121
mock_result = MagicMock()
122122
mock_result.text = "cleaned"
123-
cleaner._anonymizer = MagicMock()
124123
cleaner._anonymizer.anonymize.return_value = mock_result
125-
cleaner._analyzer = MagicMock()
126124
cleaner._analyzer.analyze.return_value = []
127125

128126
docs = [Document(content=f"doc {i}") for i in range(3)]
@@ -131,12 +129,10 @@ def test_run_multiple_documents(self):
131129
assert len(result["documents"]) == 3
132130

133131
def test_run_passes_language_and_entities_to_analyzer(self):
134-
cleaner = PresidioDocumentCleaner(language="de", entities=["PERSON"], score_threshold=0.8)
132+
cleaner = self._make_cleaner_with_mocks(language="de", entities=["PERSON"], score_threshold=0.8)
135133
mock_result = MagicMock()
136134
mock_result.text = "cleaned"
137-
cleaner._anonymizer = MagicMock()
138135
cleaner._anonymizer.anonymize.return_value = mock_result
139-
cleaner._analyzer = MagicMock()
140136
cleaner._analyzer.analyze.return_value = []
141137

142138
cleaner.run(documents=[Document(content="Hello John")])

0 commit comments

Comments
 (0)