Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from haystack import Document, component, logging
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -46,12 +47,15 @@ def __init__(
language: str = "en",
entities: list[str] | None = None,
score_threshold: float = 0.35,
models: list[dict[str, str]] | None = None,
) -> None:
"""
Initializes the PresidioEntityExtractor.

:param language:
Language code for PII detection. Defaults to `"en"`.
Presidio's default NLP engine only includes an English spaCy model. For non-English languages,
use the `models` parameter to specify which spaCy model to load for that language.
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
:param entities:
List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
Expand All @@ -60,10 +64,17 @@ def __init__(
:param score_threshold:
Minimum confidence score (0-1) for a detected entity to be included. Defaults to `0.35`.
See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
:param models:
List of spaCy model configurations for language support.
Each entry must contain `"lang_code"` and `"model_name"` keys,
e.g. `[{"lang_code": "fr", "model_name": "fr_core_news_lg"}]`.
The corresponding spaCy model will be loaded at warm-up time.
If `None`, Presidio's default English model (`en_core_web_lg`) is used.
"""
self.language = language
self.entities = entities
self.score_threshold = score_threshold
self.models = models
self._analyzer: AnalyzerEngine | None = None
self._is_warmed_up = False

Expand All @@ -77,7 +88,14 @@ def warm_up(self) -> None:
if self._is_warmed_up:
return

self._analyzer = AnalyzerEngine()
if self.models:
nlp_engine = NlpEngineProvider(
nlp_configuration={"nlp_engine_name": "spacy", "models": self.models}
).create_engine()
supported_languages = [m["lang_code"] for m in self.models]
self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=supported_languages)
else:
self._analyzer = AnalyzerEngine(supported_languages=[self.language])

self._is_warmed_up = True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from haystack import Document, component, logging
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -42,12 +43,15 @@ def __init__(
language: str = "en",
entities: list[str] | None = None,
score_threshold: float = 0.35,
models: list[dict[str, str]] | None = None,
) -> None:
"""
Initializes the PresidioDocumentCleaner.

:param language:
Language code for PII detection. Defaults to `"en"`.
Presidio's default NLP engine only includes an English spaCy model. For non-English languages,
use the `models` parameter to specify which spaCy model to load for that language.
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
:param entities:
List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
Expand All @@ -56,10 +60,17 @@ def __init__(
:param score_threshold:
Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`.
See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
:param models:
List of spaCy model configurations for language support.
Each entry must contain `"lang_code"` and `"model_name"` keys,
e.g. `[{"lang_code": "fr", "model_name": "fr_core_news_lg"}]`.
The corresponding spaCy model will be loaded at warm-up time.
If `None`, Presidio's default English model (`en_core_web_lg`) is used.
"""
self.language = language
self.entities = entities
self.score_threshold = score_threshold
self.models = models
self._analyzer: AnalyzerEngine | None = None
self._anonymizer: AnonymizerEngine | None = None
self._is_warmed_up = False
Expand All @@ -74,7 +85,14 @@ def warm_up(self) -> None:
if self._is_warmed_up:
return

self._analyzer = AnalyzerEngine()
if self.models:
nlp_engine = NlpEngineProvider(
nlp_configuration={"nlp_engine_name": "spacy", "models": self.models}
).create_engine()
supported_languages = [m["lang_code"] for m in self.models]
self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=supported_languages)
else:
self._analyzer = AnalyzerEngine(supported_languages=[self.language])
self._anonymizer = AnonymizerEngine()

self._is_warmed_up = True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from haystack import component, logging
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -39,12 +40,15 @@ def __init__(
language: str = "en",
entities: list[str] | None = None,
score_threshold: float = 0.35,
models: list[dict[str, str]] | None = None,
) -> None:
"""
Initializes the PresidioTextCleaner.

:param language:
Language code for PII detection. Defaults to `"en"`.
Presidio's default NLP engine only includes an English spaCy model. For non-English languages,
use the `models` parameter to specify which spaCy model to load for that language.
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
:param entities:
List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`).
Expand All @@ -53,10 +57,17 @@ def __init__(
:param score_threshold:
Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`.
See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
:param models:
List of spaCy model configurations for language support.
Each entry must contain `"lang_code"` and `"model_name"` keys,
e.g. `[{"lang_code": "fr", "model_name": "fr_core_news_lg"}]`.
The corresponding spaCy model will be loaded at warm-up time.
If `None`, Presidio's default English model (`en_core_web_lg`) is used.
"""
self.language = language
self.entities = entities
self.score_threshold = score_threshold
self.models = models
self._analyzer: AnalyzerEngine | None = None
self._anonymizer: AnonymizerEngine | None = None
self._is_warmed_up = False
Expand All @@ -71,7 +82,14 @@ def warm_up(self) -> None:
if self._is_warmed_up:
return

self._analyzer = AnalyzerEngine()
if self.models:
nlp_engine = NlpEngineProvider(
nlp_configuration={"nlp_engine_name": "spacy", "models": self.models}
).create_engine()
supported_languages = [m["lang_code"] for m in self.models]
self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=supported_languages)
else:
self._analyzer = AnalyzerEngine(supported_languages=[self.language])
self._anonymizer = AnonymizerEngine()

self._is_warmed_up = True
Expand Down
56 changes: 52 additions & 4 deletions integrations/presidio/tests/test_presidio_document_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import logging
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch

import pytest
from haystack import Document
Expand All @@ -18,6 +18,7 @@ def test_init_defaults(self):
assert cleaner.language == "en"
assert cleaner.entities is None
assert cleaner.score_threshold == 0.35
assert cleaner.models is None

def test_init_custom_params(self):
cleaner = PresidioDocumentCleaner(language="de", entities=["PERSON"], score_threshold=0.7)
Expand All @@ -26,28 +27,61 @@ def test_init_custom_params(self):
assert cleaner.score_threshold == 0.7

def test_to_dict(self):
    """Serialization must expose every init parameter, including the spaCy `models` configuration."""
    models = [{"lang_code": "fr", "model_name": "fr_core_news_lg"}]
    cleaner = PresidioDocumentCleaner(language="fr", entities=["EMAIL_ADDRESS"], score_threshold=0.5, models=models)
    data = component_to_dict(cleaner, "PresidioDocumentCleaner")
    expected_type = (
        "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner"
    )
    assert data["type"] == expected_type
    # Each init parameter must round-trip with the value passed to the constructor above.
    assert data["init_parameters"]["language"] == "fr"
    assert data["init_parameters"]["entities"] == ["EMAIL_ADDRESS"]
    assert data["init_parameters"]["score_threshold"] == 0.5
    assert data["init_parameters"]["models"] == models

def test_from_dict(self):
    """Deserialization must restore every init parameter, including the spaCy `models` configuration."""
    models = [{"lang_code": "de", "model_name": "de_core_news_lg"}]
    data = {
        "type": (
            "haystack_integrations.components.preprocessors.presidio"
            ".presidio_document_cleaner.PresidioDocumentCleaner"
        ),
        "init_parameters": {"language": "de", "entities": ["PERSON"], "score_threshold": 0.6, "models": models},
    }
    cleaner = component_from_dict(PresidioDocumentCleaner, data, "PresidioDocumentCleaner")
    assert cleaner.language == "de"
    assert cleaner.entities == ["PERSON"]
    assert cleaner.score_threshold == 0.6
    assert cleaner.models == models

def test_warm_up(self):
    """Without `models`, warm-up constructs the analyzer for the configured language only."""
    analyzer_target = (
        "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.AnalyzerEngine"
    )
    anonymizer_target = (
        "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.AnonymizerEngine"
    )
    component = PresidioDocumentCleaner(language="en")

    # Both engines are patched so no real NLP model is loaded.
    with patch(analyzer_target) as analyzer_factory, patch(anonymizer_target):
        component.warm_up()

    analyzer_factory.assert_called_once_with(supported_languages=["en"])

def test_warm_up_with_models(self):
    """With `models`, warm-up builds a spaCy NLP engine from them and passes it to the analyzer."""
    provider_target = (
        "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.NlpEngineProvider"
    )
    analyzer_target = (
        "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.AnalyzerEngine"
    )
    anonymizer_target = (
        "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.AnonymizerEngine"
    )
    spacy_models = [{"lang_code": "fr", "model_name": "fr_core_news_lg"}]
    component = PresidioDocumentCleaner(language="fr", models=spacy_models)
    fake_engine = MagicMock()

    with patch(provider_target) as provider_factory:
        # The patched provider hands back a sentinel engine so its hand-off to the analyzer can be checked.
        provider_factory.return_value.create_engine.return_value = fake_engine
        with patch(analyzer_target) as analyzer_factory, patch(anonymizer_target):
            component.warm_up()

    provider_factory.assert_called_once_with(
        nlp_configuration={"nlp_engine_name": "spacy", "models": spacy_models}
    )
    analyzer_factory.assert_called_once_with(nlp_engine=fake_engine, supported_languages=["fr"])

def _make_cleaner_with_mocks(self, **kwargs):
"""Return a cleaner with mocked engines so unit tests don't load real NLP models."""
Expand Down Expand Up @@ -150,3 +184,17 @@ def test_run_integration(self):
assert len(result["documents"]) == 1
assert "John Smith" not in result["documents"][0].content
assert "john@example.com" not in result["documents"][0].content

@pytest.mark.integration
def test_run_integration_german(self):
    """End-to-end check that a German person name and e-mail address are removed with a German model configured."""
    german_models = [{"lang_code": "de", "model_name": "de_core_news_lg"}]
    component = PresidioDocumentCleaner(language="de", models=german_models)
    component.warm_up()

    source = Document(content="Mein Name ist Hans Müller und meine E-Mail ist hans@example.com")
    cleaned_docs = component.run(documents=[source])["documents"]

    assert len(cleaned_docs) == 1
    cleaned_text = cleaned_docs[0].content
    assert "Hans Müller" not in cleaned_text
    assert "hans@example.com" not in cleaned_text
57 changes: 54 additions & 3 deletions integrations/presidio/tests/test_presidio_entity_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import logging
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch

import pytest
from haystack import Document
Expand All @@ -18,6 +18,32 @@ def test_init_defaults(self):
assert extractor.language == "en"
assert extractor.entities is None
assert extractor.score_threshold == 0.35
assert extractor.models is None

def test_warm_up(self):
    """Without `models`, warm-up constructs the analyzer for the configured language only."""
    analyzer_target = "haystack_integrations.components.extractors.presidio.presidio_entity_extractor.AnalyzerEngine"
    component = PresidioEntityExtractor(language="fr")

    # The analyzer class is patched so no real NLP model is loaded.
    with patch(analyzer_target) as analyzer_factory:
        component.warm_up()

    analyzer_factory.assert_called_once_with(supported_languages=["fr"])

def test_warm_up_with_models(self):
    """With `models`, warm-up builds a spaCy NLP engine from them and passes it to the analyzer."""
    provider_target = (
        "haystack_integrations.components.extractors.presidio.presidio_entity_extractor.NlpEngineProvider"
    )
    analyzer_target = "haystack_integrations.components.extractors.presidio.presidio_entity_extractor.AnalyzerEngine"
    spacy_models = [{"lang_code": "fr", "model_name": "fr_core_news_lg"}]
    component = PresidioEntityExtractor(language="fr", models=spacy_models)
    fake_engine = MagicMock()

    with patch(provider_target) as provider_factory:
        # The patched provider hands back a sentinel engine so its hand-off to the analyzer can be checked.
        provider_factory.return_value.create_engine.return_value = fake_engine
        with patch(analyzer_target) as analyzer_factory:
            component.warm_up()

    provider_factory.assert_called_once_with(
        nlp_configuration={"nlp_engine_name": "spacy", "models": spacy_models}
    )
    analyzer_factory.assert_called_once_with(nlp_engine=fake_engine, supported_languages=["fr"])

def _make_extractor_with_mocks(self, **kwargs):
"""Return an extractor with a mocked analyzer so unit tests don't load real NLP models."""
Expand All @@ -27,24 +53,34 @@ def _make_extractor_with_mocks(self, **kwargs):
return extractor

def test_to_dict(self):
    """Serialization must expose every init parameter, including the spaCy `models` configuration."""
    models = [{"lang_code": "fr", "model_name": "fr_core_news_lg"}]
    extractor = PresidioEntityExtractor(language="fr", entities=["PERSON"], score_threshold=0.6, models=models)
    data = component_to_dict(extractor, "PresidioEntityExtractor")
    expected_type = (
        "haystack_integrations.components.extractors.presidio.presidio_entity_extractor.PresidioEntityExtractor"
    )
    assert data["type"] == expected_type
    # Each init parameter must round-trip with the value passed to the constructor above.
    assert data["init_parameters"]["language"] == "fr"
    assert data["init_parameters"]["entities"] == ["PERSON"]
    assert data["init_parameters"]["score_threshold"] == 0.6
    assert data["init_parameters"]["models"] == models

def test_from_dict(self):
    """Deserialization must restore every init parameter, including the spaCy `models` configuration."""
    models = [{"lang_code": "fr", "model_name": "fr_core_news_lg"}]
    data = {
        "type": (
            "haystack_integrations.components.extractors.presidio.presidio_entity_extractor.PresidioEntityExtractor"
        ),
        "init_parameters": {
            "language": "fr",
            "entities": ["EMAIL_ADDRESS"],
            "score_threshold": 0.5,
            "models": models,
        },
    }
    extractor = component_from_dict(PresidioEntityExtractor, data, "PresidioEntityExtractor")
    # Check every restored field, not only a subset, so a dropped parameter is caught.
    assert extractor.language == "fr"
    assert extractor.entities == ["EMAIL_ADDRESS"]
    assert extractor.score_threshold == 0.5
    assert extractor.models == models

def test_run_extracts_entities_into_metadata(self):
extractor = self._make_extractor_with_mocks()
Expand Down Expand Up @@ -124,3 +160,18 @@ def test_run_integration(self):
entities = result["documents"][0].meta["entities"]
entity_types = [e["entity_type"] for e in entities]
assert "EMAIL_ADDRESS" in entity_types

@pytest.mark.integration
def test_run_integration_german(self):
    """End-to-end check that a German person name and e-mail address are detected with a German model configured."""
    german_models = [{"lang_code": "de", "model_name": "de_core_news_lg"}]
    component = PresidioEntityExtractor(language="de", models=german_models)
    component.warm_up()

    source = Document(content="Kontaktieren Sie Hans Müller unter hans@example.com")
    output = component.run(documents=[source])

    # Collect the detected entity types from the first document's metadata.
    found_types = [entity["entity_type"] for entity in output["documents"][0].meta["entities"]]
    assert "EMAIL_ADDRESS" in found_types
    assert "PERSON" in found_types
Loading
Loading