From 028361072bc4967df5e89cc6150af9654e9ad73d Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Wed, 1 Apr 2026 12:49:21 +0500 Subject: [PATCH 1/9] feat: add Presidio integration for PII detection and anonymization Implements three Haystack components using Microsoft Presidio: - PresidioDocumentCleaner: anonymizes PII in list[Document] - PresidioTextCleaner: anonymizes PII in list[str] (for query sanitization) - PresidioEntityExtractor: detects PII entities and stores them in Document metadata --- .github/workflows/presidio.yml | 72 ++++++++ integrations/presidio/pyproject.toml | 166 ++++++++++++++++++ .../preprocessors/presidio/__init__.py | 9 + .../presidio/presidio_document_cleaner.py | 92 ++++++++++ .../presidio/presidio_entity_extractor.py | 100 +++++++++++ .../presidio/presidio_text_cleaner.py | 85 +++++++++ .../tests/test_presidio_document_cleaner.py | 152 ++++++++++++++++ .../tests/test_presidio_entity_extractor.py | 121 +++++++++++++ .../tests/test_presidio_text_cleaner.py | 95 ++++++++++ 9 files changed, 892 insertions(+) create mode 100644 .github/workflows/presidio.yml create mode 100644 integrations/presidio/pyproject.toml create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py create mode 100644 integrations/presidio/tests/test_presidio_document_cleaner.py create mode 100644 integrations/presidio/tests/test_presidio_entity_extractor.py create mode 100644 integrations/presidio/tests/test_presidio_text_cleaner.py diff --git a/.github/workflows/presidio.yml b/.github/workflows/presidio.yml new file mode 100644 
index 0000000000..7395bde691 --- /dev/null +++ b/.github/workflows/presidio.yml @@ -0,0 +1,72 @@ +name: Test / presidio + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/presidio/**" + - "!integrations/presidio/*.md" + - ".github/workflows/presidio.yml" + +defaults: + run: + working-directory: integrations/presidio + +concurrency: + group: presidio-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install hatch + + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + - name: Run unit tests with lowest direct dependencies + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-latest + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ 
secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/integrations/presidio/pyproject.toml b/integrations/presidio/pyproject.toml new file mode 100644 index 0000000000..1a618957e4 --- /dev/null +++ b/integrations/presidio/pyproject.toml @@ -0,0 +1,166 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "presidio-haystack" +dynamic = ["version"] +description = "Haystack integration for Microsoft Presidio — PII detection and anonymization" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = ["Haystack", "Presidio", "PII", "anonymization", "privacy", "NLP"] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai>=2.9.0", + "presidio-analyzer>=2.2.0", + "presidio-anonymizer>=2.2.0", +] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/presidio-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." 
+git_describe_command = 'git describe --tags --match="integrations/presidio-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +types = "mypy -p haystack_integrations.components.preprocessors.presidio {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[[tool.mypy.overrides]] +module = [ + "presidio_analyzer", + "presidio_analyzer.*", + "presidio_anonymizer", + "presidio_anonymizer.*", +] +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", + "D103", + "D205", + "D209", + "D213", + "D417", + "D419", + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + "B027", + "B008", + "S105", + "S106", + "S107", + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true 
+relative_files = true +parallel = false + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py new file mode 100644 index 0000000000..bdaf79cba1 --- /dev/null +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner import PresidioDocumentCleaner +from haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor import PresidioEntityExtractor +from haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner import PresidioTextCleaner + +__all__ = ["PresidioDocumentCleaner", "PresidioEntityExtractor", "PresidioTextCleaner"] diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py new file mode 100644 index 0000000000..0bdcef7a70 --- /dev/null +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any + +from haystack import Document, component, logging +from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import 
AnonymizerEngine + +logger = logging.getLogger(__name__) + + +@component +class PresidioDocumentCleaner: + """ + Anonymizes PII in Haystack Documents using [Microsoft Presidio](https://microsoft.github.io/presidio/). + + Accepts a list of Documents, detects personally identifiable information (PII) in their + text content, and returns new Documents with PII replaced by entity type placeholders + (e.g. ``, ``). Original Documents are not mutated. + + Documents without text content are passed through unchanged. + + ### Usage example + + ```python + from haystack import Document + from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner + + cleaner = PresidioDocumentCleaner() + result = cleaner.run(documents=[Document(content="My name is John and my email is john@example.com")]) + print(result["documents"][0].content) + # My name is and my email is + ``` + """ + + def __init__( + self, + language: str = "en", + entities: list[str] | None = None, + score_threshold: float = 0.35, + ) -> None: + """ + Initializes the PresidioDocumentCleaner. + + :param language: + Language code for PII detection. Defaults to `"en"`. + :param entities: + List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`). + If `None`, all supported entity types are used. + :param score_threshold: + Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`. + """ + self.language = language + self.entities = entities + self.score_threshold = score_threshold + self._analyzer = AnalyzerEngine() + self._anonymizer = AnonymizerEngine() + + @component.output_types(documents=list[Document]) + def run(self, documents: list[Document]) -> dict[str, Any]: + """ + Anonymizes PII in the provided Documents. + + :param documents: + List of Documents whose text content will be anonymized. + :returns: + A dictionary with key `documents` containing the cleaned Documents. 
+ """ + cleaned: list[Document] = [] + for doc in documents: + if doc.content is None: + cleaned.append(doc) + continue + try: + analyzer_results = self._analyzer.analyze( + text=doc.content, + language=self.language, + entities=self.entities, + score_threshold=self.score_threshold, + ) + anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results) + cleaned.append(Document(content=anonymized.text, meta=doc.meta.copy())) + except Exception as e: + logger.warning( + "Could not anonymize document {doc_id}. Skipping it. Error: {error}", + doc_id=doc.id, + error=e, + ) + cleaned.append(doc) + return {"documents": cleaned} diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py new file mode 100644 index 0000000000..2f71219de8 --- /dev/null +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any + +from haystack import Document, component, logging +from presidio_analyzer import AnalyzerEngine + +logger = logging.getLogger(__name__) + + +@component +class PresidioEntityExtractor: + """ + Detects PII entities in Haystack Documents using [Microsoft Presidio Analyzer](https://microsoft.github.io/presidio/). + + Accepts a list of Documents and returns new Documents with detected PII entities stored + in each Document's metadata under the key `"entities"`. Each entry in the list contains + the entity type, start/end character offsets, and the confidence score. + + Original Documents are not mutated. Documents without text content are passed through unchanged. 
+ + ### Usage example + + ```python + from haystack import Document + from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor + + extractor = PresidioEntityExtractor() + result = extractor.run(documents=[Document(content="Contact Alice at alice@example.com")]) + print(result["documents"][0].meta["entities"]) + # [{"entity_type": "PERSON", "start": 8, "end": 13, "score": 0.85}, + # {"entity_type": "EMAIL_ADDRESS", "start": 17, "end": 34, "score": 1.0}] + ``` + """ + + def __init__( + self, + language: str = "en", + entities: list[str] | None = None, + score_threshold: float = 0.35, + ) -> None: + """ + Initializes the PresidioEntityExtractor. + + :param language: + Language code for PII detection. Defaults to `"en"`. + :param entities: + List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`). + If `None`, all supported entity types are detected. + :param score_threshold: + Minimum confidence score (0–1) for a detected entity to be included. Defaults to `0.35`. + """ + self.language = language + self.entities = entities + self.score_threshold = score_threshold + self._analyzer = AnalyzerEngine() + + @component.output_types(documents=list[Document]) + def run(self, documents: list[Document]) -> dict[str, Any]: + """ + Detects PII entities in the provided Documents. + + :param documents: + List of Documents to analyze for PII entities. + :returns: + A dictionary with key `documents` containing Documents with detected entities + stored in metadata under the key `"entities"`. 
+ """ + result_docs: list[Document] = [] + for doc in documents: + if doc.content is None: + result_docs.append(doc) + continue + try: + analyzer_results = self._analyzer.analyze( + text=doc.content, + language=self.language, + entities=self.entities, + score_threshold=self.score_threshold, + ) + entities = [ + { + "entity_type": r.entity_type, + "start": r.start, + "end": r.end, + "score": r.score, + } + for r in analyzer_results + ] + result_docs.append(Document(content=doc.content, meta={**doc.meta, "entities": entities})) + except Exception as e: + logger.warning( + "Could not extract entities from document {doc_id}. Skipping it. Error: {error}", + doc_id=doc.id, + error=e, + ) + result_docs.append(doc) + return {"documents": result_docs} diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py new file mode 100644 index 0000000000..9d19479486 --- /dev/null +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any + +from haystack import component, logging +from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import AnonymizerEngine + +logger = logging.getLogger(__name__) + + +@component +class PresidioTextCleaner: + """ + Anonymizes PII in plain strings using [Microsoft Presidio](https://microsoft.github.io/presidio/). + + Accepts a list of strings, detects personally identifiable information (PII), and returns + a new list of strings with PII replaced by entity type placeholders (e.g. ``). + Useful for sanitizing user queries before they are sent to an LLM. 
+ + ### Usage example + + ```python + from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner + + cleaner = PresidioTextCleaner() + result = cleaner.run(texts=["Hi, I am John Smith, call me at 212-555-1234"]) + print(result["texts"][0]) + # Hi, I am , call me at + ``` + """ + + def __init__( + self, + language: str = "en", + entities: list[str] | None = None, + score_threshold: float = 0.35, + ) -> None: + """ + Initializes the PresidioTextCleaner. + + :param language: + Language code for PII detection. Defaults to `"en"`. + :param entities: + List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`). + If `None`, all supported entity types are used. + :param score_threshold: + Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`. + """ + self.language = language + self.entities = entities + self.score_threshold = score_threshold + self._analyzer = AnalyzerEngine() + self._anonymizer = AnonymizerEngine() + + @component.output_types(texts=list[str]) + def run(self, texts: list[str]) -> dict[str, Any]: + """ + Anonymizes PII in the provided strings. + + :param texts: + List of strings to anonymize. + :returns: + A dictionary with key `texts` containing the cleaned strings. + """ + cleaned: list[str] = [] + for text in texts: + try: + analyzer_results = self._analyzer.analyze( + text=text, + language=self.language, + entities=self.entities, + score_threshold=self.score_threshold, + ) + anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results) + cleaned.append(anonymized.text) + except Exception as e: + logger.warning( + "Could not anonymize text. Skipping it. 
Error: {error}", + error=e, + ) + cleaned.append(text) + return {"texts": cleaned} diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py new file mode 100644 index 0000000000..9bd134437e --- /dev/null +++ b/integrations/presidio/tests/test_presidio_document_cleaner.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import logging +from unittest.mock import MagicMock, patch + +import pytest +from haystack import Document +from haystack.core.serialization import component_from_dict, component_to_dict + +from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner + + +class TestPresidioDocumentCleaner: + def test_init_defaults(self): + cleaner = PresidioDocumentCleaner() + assert cleaner.language == "en" + assert cleaner.entities is None + assert cleaner.score_threshold == 0.35 + + def test_init_custom_params(self): + cleaner = PresidioDocumentCleaner(language="de", entities=["PERSON"], score_threshold=0.7) + assert cleaner.language == "de" + assert cleaner.entities == ["PERSON"] + assert cleaner.score_threshold == 0.7 + + def test_to_dict(self): + cleaner = PresidioDocumentCleaner(language="en", entities=["EMAIL_ADDRESS"], score_threshold=0.5) + data = component_to_dict(cleaner, "PresidioDocumentCleaner") + assert ( + data["type"] + == "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner" + ) + assert data["init_parameters"]["language"] == "en" + assert data["init_parameters"]["entities"] == ["EMAIL_ADDRESS"] + assert data["init_parameters"]["score_threshold"] == 0.5 + + def test_from_dict(self): + data = { + "type": "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner", + "init_parameters": {"language": "de", "entities": ["PERSON"], "score_threshold": 0.6}, + } + cleaner = 
component_from_dict(PresidioDocumentCleaner, data, "PresidioDocumentCleaner") + assert cleaner.language == "de" + assert cleaner.entities == ["PERSON"] + assert cleaner.score_threshold == 0.6 + + def test_run_anonymizes_pii(self): + cleaner = PresidioDocumentCleaner() + mock_result = MagicMock() + mock_result.text = "My name is and email is " + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + docs = [Document(content="My name is John and email is john@example.com")] + result = cleaner.run(documents=docs) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "My name is and email is " + + def test_run_preserves_metadata(self): + cleaner = PresidioDocumentCleaner() + mock_result = MagicMock() + mock_result.text = "Hello " + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + docs = [Document(content="Hello John", meta={"source": "email", "page": 1})] + result = cleaner.run(documents=docs) + + assert result["documents"][0].meta["source"] == "email" + assert result["documents"][0].meta["page"] == 1 + + def test_run_does_not_mutate_original(self): + cleaner = PresidioDocumentCleaner() + mock_result = MagicMock() + mock_result.text = "Hello " + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + original = Document(content="Hello John") + cleaner.run(documents=[original]) + + assert original.content == "Hello John" + + def test_run_passes_through_none_content(self): + cleaner = PresidioDocumentCleaner() + doc = Document(content=None, meta={"source": "test"}) + result = cleaner.run(documents=[doc]) + + assert len(result["documents"]) == 1 + assert 
result["documents"][0].content is None + assert result["documents"][0].meta["source"] == "test" + + def test_run_skips_on_error(self, caplog): + cleaner = PresidioDocumentCleaner() + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.side_effect = Exception("Analyzer error") + + doc = Document(content="Some text with PII") + with caplog.at_level(logging.WARNING): + result = cleaner.run(documents=[doc]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Some text with PII" + assert "Could not anonymize" in caplog.text + + def test_run_multiple_documents(self): + cleaner = PresidioDocumentCleaner() + mock_result = MagicMock() + mock_result.text = "cleaned" + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + docs = [Document(content=f"doc {i}") for i in range(3)] + result = cleaner.run(documents=docs) + + assert len(result["documents"]) == 3 + + def test_run_passes_language_and_entities_to_analyzer(self): + cleaner = PresidioDocumentCleaner(language="de", entities=["PERSON"], score_threshold=0.8) + mock_result = MagicMock() + mock_result.text = "cleaned" + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + cleaner.run(documents=[Document(content="Hello John")]) + + cleaner._analyzer.analyze.assert_called_once_with( + text="Hello John", language="de", entities=["PERSON"], score_threshold=0.8 + ) + + @pytest.mark.integration + def test_run_integration(self): + cleaner = PresidioDocumentCleaner() + docs = [Document(content="My name is John Smith and my email is john@example.com")] + result = cleaner.run(documents=docs) + + assert len(result["documents"]) == 1 + assert "John Smith" not in result["documents"][0].content + assert "john@example.com" not in 
result["documents"][0].content diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py new file mode 100644 index 0000000000..f82fb013b9 --- /dev/null +++ b/integrations/presidio/tests/test_presidio_entity_extractor.py @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import logging +from unittest.mock import MagicMock + +import pytest +from haystack import Document +from haystack.core.serialization import component_from_dict, component_to_dict + +from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor + + +class TestPresidioEntityExtractor: + def test_init_defaults(self): + extractor = PresidioEntityExtractor() + assert extractor.language == "en" + assert extractor.entities is None + assert extractor.score_threshold == 0.35 + + def test_to_dict(self): + extractor = PresidioEntityExtractor(language="en", entities=["PERSON"], score_threshold=0.6) + data = component_to_dict(extractor, "PresidioEntityExtractor") + assert ( + data["type"] + == "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor" + ) + assert data["init_parameters"]["entities"] == ["PERSON"] + assert data["init_parameters"]["score_threshold"] == 0.6 + + def test_from_dict(self): + data = { + "type": "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor", + "init_parameters": {"language": "en", "entities": ["EMAIL_ADDRESS"], "score_threshold": 0.5}, + } + extractor = component_from_dict(PresidioEntityExtractor, data, "PresidioEntityExtractor") + assert extractor.entities == ["EMAIL_ADDRESS"] + + def test_run_extracts_entities_into_metadata(self): + extractor = PresidioEntityExtractor() + mock_entity = MagicMock() + mock_entity.entity_type = "PERSON" + mock_entity.start = 11 + mock_entity.end = 15 + mock_entity.score 
= 0.85 + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.return_value = [mock_entity] + + docs = [Document(content="My name is John")] + result = extractor.run(documents=docs) + + entities = result["documents"][0].meta["entities"] + assert len(entities) == 1 + assert entities[0]["entity_type"] == "PERSON" + assert entities[0]["start"] == 11 + assert entities[0]["end"] == 15 + assert entities[0]["score"] == 0.85 + + def test_run_does_not_mutate_original(self): + extractor = PresidioEntityExtractor() + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.return_value = [] + + original = Document(content="Hello John", meta={"source": "test"}) + extractor.run(documents=[original]) + + assert "entities" not in original.meta + + def test_run_passes_through_none_content(self): + extractor = PresidioEntityExtractor() + doc = Document(content=None, meta={"source": "test"}) + result = extractor.run(documents=[doc]) + + assert result["documents"][0].content is None + assert "entities" not in result["documents"][0].meta + + def test_run_empty_entities(self): + extractor = PresidioEntityExtractor() + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.return_value = [] + + docs = [Document(content="No PII here")] + result = extractor.run(documents=docs) + + assert result["documents"][0].meta["entities"] == [] + + def test_run_skips_on_error(self, caplog): + extractor = PresidioEntityExtractor() + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.side_effect = Exception("Analyzer error") + + doc = Document(content="Some text") + with caplog.at_level(logging.WARNING): + result = extractor.run(documents=[doc]) + + assert result["documents"][0].content == "Some text" + assert "entities" not in result["documents"][0].meta + assert "Could not extract entities" in caplog.text + + def test_run_preserves_existing_metadata(self): + extractor = PresidioEntityExtractor() + extractor._analyzer = MagicMock() + 
extractor._analyzer.analyze.return_value = [] + + docs = [Document(content="Hello", meta={"page": 3, "author": "Bob"})] + result = extractor.run(documents=docs) + + assert result["documents"][0].meta["page"] == 3 + assert result["documents"][0].meta["author"] == "Bob" + assert result["documents"][0].meta["entities"] == [] + + @pytest.mark.integration + def test_run_integration(self): + extractor = PresidioEntityExtractor() + docs = [Document(content="Contact Alice at alice@example.com")] + result = extractor.run(documents=docs) + + entities = result["documents"][0].meta["entities"] + entity_types = [e["entity_type"] for e in entities] + assert "EMAIL_ADDRESS" in entity_types diff --git a/integrations/presidio/tests/test_presidio_text_cleaner.py b/integrations/presidio/tests/test_presidio_text_cleaner.py new file mode 100644 index 0000000000..8c1922bbd3 --- /dev/null +++ b/integrations/presidio/tests/test_presidio_text_cleaner.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import logging +from unittest.mock import MagicMock + +import pytest +from haystack.core.serialization import component_from_dict, component_to_dict + +from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner + + +class TestPresidioTextCleaner: + def test_init_defaults(self): + cleaner = PresidioTextCleaner() + assert cleaner.language == "en" + assert cleaner.entities is None + assert cleaner.score_threshold == 0.35 + + def test_to_dict(self): + cleaner = PresidioTextCleaner(language="en", entities=["PHONE_NUMBER"], score_threshold=0.5) + data = component_to_dict(cleaner, "PresidioTextCleaner") + assert ( + data["type"] + == "haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner.PresidioTextCleaner" + ) + assert data["init_parameters"]["entities"] == ["PHONE_NUMBER"] + + def test_from_dict(self): + data = { + "type": 
"haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner.PresidioTextCleaner", + "init_parameters": {"language": "en", "entities": None, "score_threshold": 0.4}, + } + cleaner = component_from_dict(PresidioTextCleaner, data, "PresidioTextCleaner") + assert cleaner.score_threshold == 0.4 + + def test_run_anonymizes_pii(self): + cleaner = PresidioTextCleaner() + mock_result = MagicMock() + mock_result.text = "Call me at " + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + result = cleaner.run(texts=["Call me at 212-555-1234"]) + + assert result["texts"][0] == "Call me at " + + def test_run_multiple_texts(self): + cleaner = PresidioTextCleaner() + mock_result = MagicMock() + mock_result.text = "cleaned" + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + result = cleaner.run(texts=["text 1", "text 2", "text 3"]) + + assert len(result["texts"]) == 3 + + def test_run_skips_on_error(self, caplog): + cleaner = PresidioTextCleaner() + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.side_effect = Exception("error") + + with caplog.at_level(logging.WARNING): + result = cleaner.run(texts=["My name is John"]) + + assert result["texts"][0] == "My name is John" + assert "Could not anonymize" in caplog.text + + def test_run_empty_text(self): + cleaner = PresidioTextCleaner() + mock_result = MagicMock() + mock_result.text = "" + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + result = cleaner.run(texts=[""]) + + assert result["texts"][0] == "" + + @pytest.mark.integration + def test_run_integration(self): + cleaner = PresidioTextCleaner() + result = 
cleaner.run(texts=["Hi, I am Alice and my phone is 212-555-5678"]) + + assert len(result["texts"]) == 1 + assert "Alice" not in result["texts"][0] + assert "212-555-5678" not in result["texts"][0] From b7f035923466ef1732a17e90e2d8c63580dbc375 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Wed, 1 Apr 2026 12:56:54 +0500 Subject: [PATCH 2/9] fix(presidio): add missing README.md required by hatchling build --- integrations/presidio/README.md | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 integrations/presidio/README.md diff --git a/integrations/presidio/README.md b/integrations/presidio/README.md new file mode 100644 index 0000000000..7ad2e30cc3 --- /dev/null +++ b/integrations/presidio/README.md @@ -0,0 +1,44 @@ +# presidio-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) + +Haystack integration for [Microsoft Presidio](https://microsoft.github.io/presidio/) — PII detection and anonymization. 
+ +--- + +## Installation + +```bash +pip install presidio-haystack +``` + +You also need to download the spaCy model used by Presidio: + +```bash +python -m spacy download en_core_web_lg +``` + +## Components + +- **PresidioDocumentCleaner** — anonymizes PII in `list[Document]` +- **PresidioTextCleaner** — anonymizes PII in `list[str]` (useful for query sanitization) +- **PresidioEntityExtractor** — detects PII entities and stores them in Document metadata + +## Usage + +```python +from haystack import Document +from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner + +cleaner = PresidioDocumentCleaner() +result = cleaner.run(documents=[Document(content="My name is John, email: john@example.com")]) +print(result["documents"][0].content) +# My name is , email: +``` + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). From e1629493cc17c0d218bb3231850e8dd253e10b1c Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Wed, 1 Apr 2026 13:01:48 +0500 Subject: [PATCH 3/9] fix(presidio): fix lint errors and add missing pydoc config --- integrations/presidio/pydoc/config_docusaurus.yml | 15 +++++++++++++++ .../presidio/presidio_document_cleaner.py | 2 +- .../presidio/presidio_entity_extractor.py | 2 +- .../presidio/presidio_text_cleaner.py | 2 +- .../tests/test_presidio_document_cleaner.py | 14 +++++++++----- .../tests/test_presidio_entity_extractor.py | 12 ++++++++---- 6 files changed, 35 insertions(+), 12 deletions(-) create mode 100644 integrations/presidio/pydoc/config_docusaurus.yml diff --git a/integrations/presidio/pydoc/config_docusaurus.yml b/integrations/presidio/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..def818e2a9 --- /dev/null +++ b/integrations/presidio/pydoc/config_docusaurus.yml @@ -0,0 +1,15 @@ +loaders: + - modules: + - 
haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner + - haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner + - haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Presidio integration for Haystack + id: integrations-presidio + filename: presidio.md + title: Presidio diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py index 0bdcef7a70..f8e4551270 100644 --- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py @@ -50,7 +50,7 @@ def __init__( List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`). If `None`, all supported entity types are used. :param score_threshold: - Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`. + Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`. 
""" self.language = language self.entities = entities diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py index 2f71219de8..fb21b89a11 100644 --- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py @@ -50,7 +50,7 @@ def __init__( List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`). If `None`, all supported entity types are detected. :param score_threshold: - Minimum confidence score (0–1) for a detected entity to be included. Defaults to `0.35`. + Minimum confidence score (0-1) for a detected entity to be included. Defaults to `0.35`. """ self.language = language self.entities = entities diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py index 9d19479486..c9221d5804 100644 --- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py @@ -47,7 +47,7 @@ def __init__( List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`). If `None`, all supported entity types are used. :param score_threshold: - Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`. + Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`. 
""" self.language = language self.entities = entities diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py index 9bd134437e..2122743b04 100644 --- a/integrations/presidio/tests/test_presidio_document_cleaner.py +++ b/integrations/presidio/tests/test_presidio_document_cleaner.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest from haystack import Document @@ -28,17 +28,21 @@ def test_init_custom_params(self): def test_to_dict(self): cleaner = PresidioDocumentCleaner(language="en", entities=["EMAIL_ADDRESS"], score_threshold=0.5) data = component_to_dict(cleaner, "PresidioDocumentCleaner") - assert ( - data["type"] - == "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner" + expected_type = ( + "haystack_integrations.components.preprocessors.presidio" + ".presidio_document_cleaner.PresidioDocumentCleaner" ) + assert data["type"] == expected_type assert data["init_parameters"]["language"] == "en" assert data["init_parameters"]["entities"] == ["EMAIL_ADDRESS"] assert data["init_parameters"]["score_threshold"] == 0.5 def test_from_dict(self): data = { - "type": "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner", + "type": ( + "haystack_integrations.components.preprocessors.presidio" + ".presidio_document_cleaner.PresidioDocumentCleaner" + ), "init_parameters": {"language": "de", "entities": ["PERSON"], "score_threshold": 0.6}, } cleaner = component_from_dict(PresidioDocumentCleaner, data, "PresidioDocumentCleaner") diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py index f82fb013b9..137712bca9 100644 --- a/integrations/presidio/tests/test_presidio_entity_extractor.py +++ 
b/integrations/presidio/tests/test_presidio_entity_extractor.py @@ -22,16 +22,20 @@ def test_init_defaults(self): def test_to_dict(self): extractor = PresidioEntityExtractor(language="en", entities=["PERSON"], score_threshold=0.6) data = component_to_dict(extractor, "PresidioEntityExtractor") - assert ( - data["type"] - == "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor" + expected_type = ( + "haystack_integrations.components.preprocessors.presidio" + ".presidio_entity_extractor.PresidioEntityExtractor" ) + assert data["type"] == expected_type assert data["init_parameters"]["entities"] == ["PERSON"] assert data["init_parameters"]["score_threshold"] == 0.6 def test_from_dict(self): data = { - "type": "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor", + "type": ( + "haystack_integrations.components.preprocessors.presidio" + ".presidio_entity_extractor.PresidioEntityExtractor" + ), "init_parameters": {"language": "en", "entities": ["EMAIL_ADDRESS"], "score_threshold": 0.5}, } extractor = component_from_dict(PresidioEntityExtractor, data, "PresidioEntityExtractor") From cc518d19821fbf2fa7d560b1eed3bf6165d846f8 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Wed, 1 Apr 2026 13:05:07 +0500 Subject: [PATCH 4/9] fix(presidio): apply ruff format to test files --- integrations/presidio/tests/test_presidio_document_cleaner.py | 3 +-- integrations/presidio/tests/test_presidio_entity_extractor.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py index 2122743b04..4f170e4913 100644 --- a/integrations/presidio/tests/test_presidio_document_cleaner.py +++ b/integrations/presidio/tests/test_presidio_document_cleaner.py @@ -29,8 +29,7 @@ def test_to_dict(self): cleaner = PresidioDocumentCleaner(language="en", 
entities=["EMAIL_ADDRESS"], score_threshold=0.5) data = component_to_dict(cleaner, "PresidioDocumentCleaner") expected_type = ( - "haystack_integrations.components.preprocessors.presidio" - ".presidio_document_cleaner.PresidioDocumentCleaner" + "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner" ) assert data["type"] == expected_type assert data["init_parameters"]["language"] == "en" diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py index 137712bca9..0cbbbf0c72 100644 --- a/integrations/presidio/tests/test_presidio_entity_extractor.py +++ b/integrations/presidio/tests/test_presidio_entity_extractor.py @@ -23,8 +23,7 @@ def test_to_dict(self): extractor = PresidioEntityExtractor(language="en", entities=["PERSON"], score_threshold=0.6) data = component_to_dict(extractor, "PresidioEntityExtractor") expected_type = ( - "haystack_integrations.components.preprocessors.presidio" - ".presidio_entity_extractor.PresidioEntityExtractor" + "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor" ) assert data["type"] == expected_type assert data["init_parameters"]["entities"] == ["PERSON"] From a363f89b1aa26fc9d322aab5625595c72b552b26 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Wed, 1 Apr 2026 13:07:29 +0500 Subject: [PATCH 5/9] fix(presidio): add py.typed marker for mypy type checking --- .../src/haystack_integrations/components/preprocessors/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/py.typed diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/py.typed b/integrations/presidio/src/haystack_integrations/components/preprocessors/py.typed new file mode 100644 index 0000000000..e69de29bb2 From a8b2004c790405db8abb899cad73a3a93ac38688 
Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Wed, 1 Apr 2026 13:10:22 +0500 Subject: [PATCH 6/9] fix(presidio): suppress mypy arg-type error for presidio cross-package type mismatch --- .../preprocessors/presidio/presidio_document_cleaner.py | 2 +- .../components/preprocessors/presidio/presidio_text_cleaner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py index f8e4551270..0e576c1889 100644 --- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py @@ -80,7 +80,7 @@ def run(self, documents: list[Document]) -> dict[str, Any]: entities=self.entities, score_threshold=self.score_threshold, ) - anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results) + anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results) # type: ignore[arg-type] cleaned.append(Document(content=anonymized.text, meta=doc.meta.copy())) except Exception as e: logger.warning( diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py index c9221d5804..3b85194db8 100644 --- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py @@ -74,7 +74,7 @@ def run(self, texts: list[str]) -> dict[str, Any]: entities=self.entities, score_threshold=self.score_threshold, ) - anonymized = 
self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results) + anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results) # type: ignore[arg-type] cleaned.append(anonymized.text) except Exception as e: logger.warning( From 80c8c1d101d49fe216b9ece084ffd3d989b39acb Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Thu, 2 Apr 2026 19:36:10 +0500 Subject: [PATCH 7/9] Address PR review: update README, add Python 3.14 support, labeler and coverage entries --- .github/labeler.yml | 5 ++++ .github/workflows/CI_coverage_comment.yml | 1 + .github/workflows/presidio.yml | 2 +- README.md | 1 + integrations/presidio/README.md | 34 +---------------------- integrations/presidio/pyproject.toml | 1 + 6 files changed, 10 insertions(+), 34 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index 4d44b76b10..ffe59fb600 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -198,6 +198,11 @@ integration:pgvector: - any-glob-to-any-file: "integrations/pgvector/**/*" - any-glob-to-any-file: ".github/workflows/pgvector.yml" +integration:presidio: + - changed-files: + - any-glob-to-any-file: "integrations/presidio/**/*" + - any-glob-to-any-file: ".github/workflows/presidio.yml" + integration:pinecone: - changed-files: - any-glob-to-any-file: "integrations/pinecone/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index f4b83385a5..29373df7d8 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -42,6 +42,7 @@ on: - "Test / optimum" - "Test / paddleocr" - "Test / pgvector" + - "Test / presidio" - "Test / pinecone" - "Test / pyversity" - "Test / qdrant" diff --git a/.github/workflows/presidio.yml b/.github/workflows/presidio.yml index 7395bde691..597d5fb208 100644 --- a/.github/workflows/presidio.yml +++ b/.github/workflows/presidio.yml @@ -29,7 +29,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - 
python-version: ["3.10", "3.13"] + python-version: ["3.10", "3.14"] steps: - uses: actions/checkout@v4 diff --git a/README.md b/README.md index f52e01b852..e123e7bde9 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch-combined/htmlcov/index.html) | | [optimum-haystack](integrations/optimum/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum-combined/htmlcov/index.html) | | [paddleocr-haystack](integrations/paddleocr/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/paddleocr-haystack.svg)](https://pypi.org/project/paddleocr-haystack) | [![Test / paddleocr](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr-combined/htmlcov/index.html) | +| [presidio-haystack](integrations/presidio/) | Preprocessor | [![PyPI - 
Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml) | | | | [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone-combined/htmlcov/index.html) | | [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector-combined/htmlcov/index.html) | | [pyversity-haystack](integrations/pyversity/) | Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/pyversity-haystack.svg)](https://pypi.org/project/pyversity-haystack) | [![Test / pyversity](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pyversity/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pyversity/htmlcov/index.html) | | diff --git a/integrations/presidio/README.md b/integrations/presidio/README.md index 7ad2e30cc3..c29ac70462 100644 --- a/integrations/presidio/README.md +++ b/integrations/presidio/README.md @@ -3,39 +3,7 @@ [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) -Haystack integration for [Microsoft 
Presidio](https://microsoft.github.io/presidio/) — PII detection and anonymization. - ---- - -## Installation - -```bash -pip install presidio-haystack -``` - -You also need to download the spaCy model used by Presidio: - -```bash -python -m spacy download en_core_web_lg -``` - -## Components - -- **PresidioDocumentCleaner** — anonymizes PII in `list[Document]` -- **PresidioTextCleaner** — anonymizes PII in `list[str]` (useful for query sanitization) -- **PresidioEntityExtractor** — detects PII entities and stores them in Document metadata - -## Usage - -```python -from haystack import Document -from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner - -cleaner = PresidioDocumentCleaner() -result = cleaner.run(documents=[Document(content="My name is John, email: john@example.com")]) -print(result["documents"][0].content) -# My name is , email: -``` +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/presidio/CHANGELOG.md) --- diff --git a/integrations/presidio/pyproject.toml b/integrations/presidio/pyproject.toml index 1a618957e4..c8e166796f 100644 --- a/integrations/presidio/pyproject.toml +++ b/integrations/presidio/pyproject.toml @@ -19,6 +19,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] From b1377c0b4dbca548d1100b0bd58db07a4bc589d7 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Thu, 9 Apr 2026 16:51:39 +0500 Subject: [PATCH 8/9] =?UTF-8?q?fix(presidio):=20address=20reviewer=20feedb?= =?UTF-8?q?ack=20=E2=80=94=20fix=20README=20format,=20alphabetical=20order?= =?UTF-8?q?ing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/CI_coverage_comment.yml | 2 
+- README.md | 2 +- integrations/presidio/README.md | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index 29373df7d8..7c80a698cf 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -42,8 +42,8 @@ on: - "Test / optimum" - "Test / paddleocr" - "Test / pgvector" - - "Test / presidio" - "Test / pinecone" + - "Test / presidio" - "Test / pyversity" - "Test / qdrant" - "Test / ragas" diff --git a/README.md b/README.md index e123e7bde9..32c0508234 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,9 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch-combined/htmlcov/index.html) | | [optimum-haystack](integrations/optimum/) | Embedder | [![PyPI - 
Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum-combined/htmlcov/index.html) | | [paddleocr-haystack](integrations/paddleocr/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/paddleocr-haystack.svg)](https://pypi.org/project/paddleocr-haystack) | [![Test / paddleocr](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr-combined/htmlcov/index.html) | -| [presidio-haystack](integrations/presidio/) | Preprocessor | [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml) | | | | [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone-combined/htmlcov/index.html) | | [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - 
Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector-combined/htmlcov/index.html) | +| [presidio-haystack](integrations/presidio/) | Preprocessor | [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml) | | | | [pyversity-haystack](integrations/pyversity/) | Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/pyversity-haystack.svg)](https://pypi.org/project/pyversity-haystack) | [![Test / pyversity](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pyversity/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pyversity/htmlcov/index.html) | | | [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant-combined/htmlcov/index.html) | | [ragas-haystack](integrations/ragas/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/ragas-haystack.svg)](https://pypi.org/project/ragas-haystack) | [![Test / ragas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-ragas/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-ragas/htmlcov/index.html) | | diff --git a/integrations/presidio/README.md b/integrations/presidio/README.md index c29ac70462..2f9e57089b 100644 --- a/integrations/presidio/README.md +++ b/integrations/presidio/README.md @@ -7,6 +7,4 @@ --- -## Contributing - Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). From 7e15ec3c0706ff3dacd9dfc3afc180ff7c4239c3 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Thu, 9 Apr 2026 18:56:49 +0500 Subject: [PATCH 9/9] =?UTF-8?q?fix(presidio):=20address=20reviewer=20feedb?= =?UTF-8?q?ack=20=E2=80=94=20keyword-only=20args,=20warm=5Fup,=20type=20hi?= =?UTF-8?q?nts,=20dataclasses.replace,=20doc=20links?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add keyword-only arguments (*, ) to all three component __init__ methods - Move AnalyzerEngine/AnonymizerEngine initialization to warm_up() since they load spaCy ML models - Fix run() return types from dict[str, Any] to proper typed dicts - Use dataclasses.replace() in PresidioEntityExtractor instead of Document() - Add Presidio documentation links for language, entities, and score_threshold params - Update integration tests to call warm_up() before run() - Add missing _anonymizer mock in test_run_skips_on_error tests --- .../presidio/presidio_document_cleaner.py | 30 +++++++++++++++---- .../presidio/presidio_entity_extractor.py | 28 ++++++++++++++--- .../presidio/presidio_text_cleaner.py | 30 +++++++++++++++---- .../tests/test_presidio_document_cleaner.py | 2 ++ .../tests/test_presidio_entity_extractor.py | 1 + .../tests/test_presidio_text_cleaner.py | 2 
++ 6 files changed, 79 insertions(+), 14 deletions(-) diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py index 0e576c1889..348593f440 100644 --- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py @@ -2,8 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any - from haystack import Document, component, logging from presidio_analyzer import AnalyzerEngine from presidio_anonymizer import AnonymizerEngine @@ -22,6 +20,8 @@ class PresidioDocumentCleaner: Documents without text content are passed through unchanged. + Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines. + ### Usage example ```python @@ -29,6 +29,7 @@ class PresidioDocumentCleaner: from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner cleaner = PresidioDocumentCleaner() + cleaner.warm_up() result = cleaner.run(documents=[Document(content="My name is John and my email is john@example.com")]) print(result["documents"][0].content) # My name is <PERSON> and my email is <EMAIL_ADDRESS> @@ -37,6 +38,7 @@ class PresidioDocumentCleaner: def __init__( self, + *, language: str = "en", entities: list[str] | None = None, score_threshold: float = 0.35, @@ -46,20 +48,35 @@ def __init__( :param language: Language code for PII detection. Defaults to `"en"`. + See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/). :param entities: List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`). If `None`, all supported entity types are used. 
+ See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/). :param score_threshold: Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`. + See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/). """ self.language = language self.entities = entities self.score_threshold = score_threshold - self._analyzer = AnalyzerEngine() - self._anonymizer = AnonymizerEngine() + self._analyzer: AnalyzerEngine | None = None + self._anonymizer: AnonymizerEngine | None = None + + def warm_up(self) -> None: + """ + Initializes the Presidio analyzer and anonymizer engines. + + This method loads the underlying NLP models and should be called before `run()`. + In a Haystack Pipeline, this is called automatically before the first run. + """ + if self._analyzer is None: + self._analyzer = AnalyzerEngine() + if self._anonymizer is None: + self._anonymizer = AnonymizerEngine() @component.output_types(documents=list[Document]) - def run(self, documents: list[Document]) -> dict[str, Any]: + def run(self, documents: list[Document]) -> dict[str, list[Document]]: """ Anonymizes PII in the provided Documents. @@ -73,6 +90,9 @@ def run(self, documents: list[Document]) -> dict[str, Any]: if doc.content is None: cleaned.append(doc) continue + if self._analyzer is None or self._anonymizer is None: + msg = "The component was not warmed up. Call warm_up() before running it." 
+ raise RuntimeError(msg) try: analyzer_results = self._analyzer.analyze( text=doc.content, diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py index fb21b89a11..7b1b42b3d5 100644 --- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any +from dataclasses import replace from haystack import Document, component, logging from presidio_analyzer import AnalyzerEngine @@ -21,6 +21,8 @@ class PresidioEntityExtractor: Original Documents are not mutated. Documents without text content are passed through unchanged. + Call `warm_up()` before running this component to load the Presidio analyzer engine. + ### Usage example ```python @@ -28,6 +30,7 @@ class PresidioEntityExtractor: from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor extractor = PresidioEntityExtractor() + extractor.warm_up() result = extractor.run(documents=[Document(content="Contact Alice at alice@example.com")]) print(result["documents"][0].meta["entities"]) # [{"entity_type": "PERSON", "start": 8, "end": 13, "score": 0.85}, @@ -37,6 +40,7 @@ class PresidioEntityExtractor: def __init__( self, + *, language: str = "en", entities: list[str] | None = None, score_threshold: float = 0.35, @@ -46,19 +50,32 @@ def __init__( :param language: Language code for PII detection. Defaults to `"en"`. + See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/). :param entities: List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`). If `None`, all supported entity types are detected. 
+ See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/). :param score_threshold: Minimum confidence score (0-1) for a detected entity to be included. Defaults to `0.35`. + See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/). """ self.language = language self.entities = entities self.score_threshold = score_threshold - self._analyzer = AnalyzerEngine() + self._analyzer: AnalyzerEngine | None = None + + def warm_up(self) -> None: + """ + Initializes the Presidio analyzer engine. + + This method loads the underlying NLP models and should be called before `run()`. + In a Haystack Pipeline, this is called automatically before the first run. + """ + if self._analyzer is None: + self._analyzer = AnalyzerEngine() @component.output_types(documents=list[Document]) - def run(self, documents: list[Document]) -> dict[str, Any]: + def run(self, documents: list[Document]) -> dict[str, list[Document]]: """ Detects PII entities in the provided Documents. @@ -73,6 +90,9 @@ def run(self, documents: list[Document]) -> dict[str, Any]: if doc.content is None: result_docs.append(doc) continue + if self._analyzer is None: + msg = "The component was not warmed up. Call warm_up() before running it." + raise RuntimeError(msg) try: analyzer_results = self._analyzer.analyze( text=doc.content, @@ -89,7 +109,7 @@ def run(self, documents: list[Document]) -> dict[str, Any]: } for r in analyzer_results ] - result_docs.append(Document(content=doc.content, meta={**doc.meta, "entities": entities})) + result_docs.append(replace(doc, meta={**doc.meta, "entities": entities})) except Exception as e: logger.warning( "Could not extract entities from document {doc_id}. Skipping it. 
Error: {error}", diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py index 3b85194db8..d20f889c19 100644 --- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py @@ -2,8 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any - from haystack import component, logging from presidio_analyzer import AnalyzerEngine from presidio_anonymizer import AnonymizerEngine @@ -20,12 +18,15 @@ class PresidioTextCleaner: a new list of strings with PII replaced by entity type placeholders (e.g. `<PERSON>`). Useful for sanitizing user queries before they are sent to an LLM. + Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines. + ### Usage example ```python from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner cleaner = PresidioTextCleaner() + cleaner.warm_up() result = cleaner.run(texts=["Hi, I am John Smith, call me at 212-555-1234"]) print(result["texts"][0]) # Hi, I am <PERSON>, call me at <PHONE_NUMBER> @@ -34,6 +35,7 @@ class PresidioTextCleaner: def __init__( self, + *, language: str = "en", entities: list[str] | None = None, score_threshold: float = 0.35, @@ -43,20 +45,35 @@ def __init__( :param language: Language code for PII detection. Defaults to `"en"`. + See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/). :param entities: List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`). If `None`, all supported entity types are used. + See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/). 
:param score_threshold: Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`. + See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/). """ self.language = language self.entities = entities self.score_threshold = score_threshold - self._analyzer = AnalyzerEngine() - self._anonymizer = AnonymizerEngine() + self._analyzer: AnalyzerEngine | None = None + self._anonymizer: AnonymizerEngine | None = None + + def warm_up(self) -> None: + """ + Initializes the Presidio analyzer and anonymizer engines. + + This method loads the underlying NLP models and should be called before `run()`. + In a Haystack Pipeline, this is called automatically before the first run. + """ + if self._analyzer is None: + self._analyzer = AnalyzerEngine() + if self._anonymizer is None: + self._anonymizer = AnonymizerEngine() @component.output_types(texts=list[str]) - def run(self, texts: list[str]) -> dict[str, Any]: + def run(self, texts: list[str]) -> dict[str, list[str]]: """ Anonymizes PII in the provided strings. @@ -65,6 +82,9 @@ def run(self, texts: list[str]) -> dict[str, Any]: :returns: A dictionary with key `texts` containing the cleaned strings. """ + if self._analyzer is None or self._anonymizer is None: + msg = "The component was not warmed up. Call warm_up() before running it." 
+ raise RuntimeError(msg) cleaned: list[str] = [] for text in texts: try: diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py index 4f170e4913..7c1d35107f 100644 --- a/integrations/presidio/tests/test_presidio_document_cleaner.py +++ b/integrations/presidio/tests/test_presidio_document_cleaner.py @@ -106,6 +106,7 @@ def test_run_skips_on_error(self, caplog): cleaner = PresidioDocumentCleaner() cleaner._analyzer = MagicMock() cleaner._analyzer.analyze.side_effect = Exception("Analyzer error") + cleaner._anonymizer = MagicMock() doc = Document(content="Some text with PII") with caplog.at_level(logging.WARNING): @@ -147,6 +148,7 @@ def test_run_passes_language_and_entities_to_analyzer(self): @pytest.mark.integration def test_run_integration(self): cleaner = PresidioDocumentCleaner() + cleaner.warm_up() docs = [Document(content="My name is John Smith and my email is john@example.com")] result = cleaner.run(documents=docs) diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py index 0cbbbf0c72..77d73a0250 100644 --- a/integrations/presidio/tests/test_presidio_entity_extractor.py +++ b/integrations/presidio/tests/test_presidio_entity_extractor.py @@ -116,6 +116,7 @@ def test_run_preserves_existing_metadata(self): @pytest.mark.integration def test_run_integration(self): extractor = PresidioEntityExtractor() + extractor.warm_up() docs = [Document(content="Contact Alice at alice@example.com")] result = extractor.run(documents=docs) diff --git a/integrations/presidio/tests/test_presidio_text_cleaner.py b/integrations/presidio/tests/test_presidio_text_cleaner.py index 8c1922bbd3..030ac5057b 100644 --- a/integrations/presidio/tests/test_presidio_text_cleaner.py +++ b/integrations/presidio/tests/test_presidio_text_cleaner.py @@ -65,6 +65,7 @@ def test_run_skips_on_error(self, caplog): cleaner = 
PresidioTextCleaner() cleaner._analyzer = MagicMock() cleaner._analyzer.analyze.side_effect = Exception("error") + cleaner._anonymizer = MagicMock() with caplog.at_level(logging.WARNING): result = cleaner.run(texts=["My name is John"]) @@ -88,6 +89,7 @@ def test_run_empty_text(self): @pytest.mark.integration def test_run_integration(self): cleaner = PresidioTextCleaner() + cleaner.warm_up() result = cleaner.run(texts=["Hi, I am Alice and my phone is 212-555-5678"]) assert len(result["texts"]) == 1