diff --git a/.github/labeler.yml b/.github/labeler.yml index 4d44b76b10..ffe59fb600 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -198,6 +198,11 @@ integration:pgvector: - any-glob-to-any-file: "integrations/pgvector/**/*" - any-glob-to-any-file: ".github/workflows/pgvector.yml" +integration:presidio: + - changed-files: + - any-glob-to-any-file: "integrations/presidio/**/*" + - any-glob-to-any-file: ".github/workflows/presidio.yml" + integration:pinecone: - changed-files: - any-glob-to-any-file: "integrations/pinecone/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index f4b83385a5..7c80a698cf 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -43,6 +43,7 @@ on: - "Test / paddleocr" - "Test / pgvector" - "Test / pinecone" + - "Test / presidio" - "Test / pyversity" - "Test / qdrant" - "Test / ragas" diff --git a/.github/workflows/presidio.yml b/.github/workflows/presidio.yml new file mode 100644 index 0000000000..597d5fb208 --- /dev/null +++ b/.github/workflows/presidio.yml @@ -0,0 +1,72 @@ +name: Test / presidio + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/presidio/**" + - "!integrations/presidio/*.md" + - ".github/workflows/presidio.yml" + +defaults: + run: + working-directory: integrations/presidio + +concurrency: + group: presidio-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10", "3.14"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ 
matrix.python-version }} + + - name: Install Hatch + run: pip install hatch + + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + - name: Run unit tests with lowest direct dependencies + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-latest + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index f52e01b852..32c0508234 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [paddleocr-haystack](integrations/paddleocr/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/paddleocr-haystack.svg)](https://pypi.org/project/paddleocr-haystack) | [![Test / paddleocr](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr-combined/htmlcov/index.html) | | [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone-combined/htmlcov/index.html) | | [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - 
Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector-combined/htmlcov/index.html) | +| [presidio-haystack](integrations/presidio/) | Preprocessor | [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml) | | | | [pyversity-haystack](integrations/pyversity/) | Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/pyversity-haystack.svg)](https://pypi.org/project/pyversity-haystack) | [![Test / pyversity](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pyversity/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pyversity/htmlcov/index.html) | | | [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant-combined/htmlcov/index.html) | | [ragas-haystack](integrations/ragas/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/ragas-haystack.svg)](https://pypi.org/project/ragas-haystack) | [![Test / ragas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-ragas/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-ragas/htmlcov/index.html) | | diff --git a/integrations/presidio/README.md b/integrations/presidio/README.md new file mode 100644 index 0000000000..2f9e57089b --- /dev/null +++ b/integrations/presidio/README.md @@ -0,0 +1,10 @@ +# presidio-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/presidio/CHANGELOG.md) + +--- + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). 
diff --git a/integrations/presidio/pydoc/config_docusaurus.yml b/integrations/presidio/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..def818e2a9 --- /dev/null +++ b/integrations/presidio/pydoc/config_docusaurus.yml @@ -0,0 +1,15 @@ +loaders: + - modules: + - haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner + - haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner + - haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Presidio integration for Haystack + id: integrations-presidio + filename: presidio.md + title: Presidio diff --git a/integrations/presidio/pyproject.toml b/integrations/presidio/pyproject.toml new file mode 100644 index 0000000000..c8e166796f --- /dev/null +++ b/integrations/presidio/pyproject.toml @@ -0,0 +1,167 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "presidio-haystack" +dynamic = ["version"] +description = "Haystack integration for Microsoft Presidio — PII detection and anonymization" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = ["Haystack", "Presidio", "PII", "anonymization", "privacy", "NLP"] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai>=2.9.0", + 
"presidio-analyzer>=2.2.0", + "presidio-anonymizer>=2.2.0", +] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/presidio-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/presidio-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +types = "mypy -p haystack_integrations.components.preprocessors.presidio {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[[tool.mypy.overrides]] +module = [ + "presidio_analyzer", + "presidio_analyzer.*", + "presidio_anonymizer", + "presidio_anonymizer.*", +] +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", + "D103", + "D205", + "D209", + "D213", + "D417", + "D419", + "DTZ", + "E", + "EM", + "F", + "I", 
+ "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + "B027", + "B008", + "S105", + "S106", + "S107", + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +relative_files = true +parallel = false + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py new file mode 100644 index 0000000000..bdaf79cba1 --- /dev/null +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner import PresidioDocumentCleaner +from haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor import PresidioEntityExtractor +from haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner import PresidioTextCleaner + +__all__ = ["PresidioDocumentCleaner", "PresidioEntityExtractor", "PresidioTextCleaner"] diff --git 
a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py new file mode 100644 index 0000000000..348593f440 --- /dev/null +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py @@ -0,0 +1,112 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack import Document, component, logging +from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import AnonymizerEngine + +logger = logging.getLogger(__name__) + + +@component +class PresidioDocumentCleaner: + """ + Anonymizes PII in Haystack Documents using [Microsoft Presidio](https://microsoft.github.io/presidio/). + + Accepts a list of Documents, detects personally identifiable information (PII) in their + text content, and returns new Documents with PII replaced by entity type placeholders + (e.g. `<PERSON>`, `<EMAIL_ADDRESS>`). Original Documents are not mutated. + + Documents without text content are passed through unchanged. + + Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines. + + ### Usage example + + ```python + from haystack import Document + from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner + + cleaner = PresidioDocumentCleaner() + cleaner.warm_up() + result = cleaner.run(documents=[Document(content="My name is John and my email is john@example.com")]) + print(result["documents"][0].content) + # My name is <PERSON> and my email is <EMAIL_ADDRESS> + ``` + """ + + def __init__( + self, + *, + language: str = "en", + entities: list[str] | None = None, + score_threshold: float = 0.35, + ) -> None: + """ + Initializes the PresidioDocumentCleaner. + + :param language: + Language code for PII detection. Defaults to `"en"`. 
+ See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/). + :param entities: + List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`). + If `None`, all supported entity types are used. + See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/). + :param score_threshold: + Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`. + See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/). + """ + self.language = language + self.entities = entities + self.score_threshold = score_threshold + self._analyzer: AnalyzerEngine | None = None + self._anonymizer: AnonymizerEngine | None = None + + def warm_up(self) -> None: + """ + Initializes the Presidio analyzer and anonymizer engines. + + This method loads the underlying NLP models and should be called before `run()`. + In a Haystack Pipeline, this is called automatically before the first run. + """ + if self._analyzer is None: + self._analyzer = AnalyzerEngine() + if self._anonymizer is None: + self._anonymizer = AnonymizerEngine() + + @component.output_types(documents=list[Document]) + def run(self, documents: list[Document]) -> dict[str, list[Document]]: + """ + Anonymizes PII in the provided Documents. + + :param documents: + List of Documents whose text content will be anonymized. + :returns: + A dictionary with key `documents` containing the cleaned Documents. + """ + cleaned: list[Document] = [] + for doc in documents: + if doc.content is None: + cleaned.append(doc) + continue + if self._analyzer is None or self._anonymizer is None: + msg = "The component was not warmed up. Call warm_up() before running it." 
+ raise RuntimeError(msg) + try: + analyzer_results = self._analyzer.analyze( + text=doc.content, + language=self.language, + entities=self.entities, + score_threshold=self.score_threshold, + ) + anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results) # type: ignore[arg-type] + cleaned.append(Document(content=anonymized.text, meta=doc.meta.copy())) + except Exception as e: + logger.warning( + "Could not anonymize document {doc_id}. Skipping it. Error: {error}", + doc_id=doc.id, + error=e, + ) + cleaned.append(doc) + return {"documents": cleaned} diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py new file mode 100644 index 0000000000..7b1b42b3d5 --- /dev/null +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import replace + +from haystack import Document, component, logging +from presidio_analyzer import AnalyzerEngine + +logger = logging.getLogger(__name__) + + +@component +class PresidioEntityExtractor: + """ + Detects PII entities in Haystack Documents using [Microsoft Presidio Analyzer](https://microsoft.github.io/presidio/). + + Accepts a list of Documents and returns new Documents with detected PII entities stored + in each Document's metadata under the key `"entities"`. Each entry in the list contains + the entity type, start/end character offsets, and the confidence score. + + Original Documents are not mutated. Documents without text content are passed through unchanged. + + Call `warm_up()` before running this component to load the Presidio analyzer engine. 
+ + ### Usage example + + ```python + from haystack import Document + from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor + + extractor = PresidioEntityExtractor() + extractor.warm_up() + result = extractor.run(documents=[Document(content="Contact Alice at alice@example.com")]) + print(result["documents"][0].meta["entities"]) + # [{"entity_type": "PERSON", "start": 8, "end": 13, "score": 0.85}, + # {"entity_type": "EMAIL_ADDRESS", "start": 17, "end": 34, "score": 1.0}] + ``` + """ + + def __init__( + self, + *, + language: str = "en", + entities: list[str] | None = None, + score_threshold: float = 0.35, + ) -> None: + """ + Initializes the PresidioEntityExtractor. + + :param language: + Language code for PII detection. Defaults to `"en"`. + See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/). + :param entities: + List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`). + If `None`, all supported entity types are detected. + See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/). + :param score_threshold: + Minimum confidence score (0-1) for a detected entity to be included. Defaults to `0.35`. + See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/). + """ + self.language = language + self.entities = entities + self.score_threshold = score_threshold + self._analyzer: AnalyzerEngine | None = None + + def warm_up(self) -> None: + """ + Initializes the Presidio analyzer engine. + + This method loads the underlying NLP models and should be called before `run()`. + In a Haystack Pipeline, this is called automatically before the first run. + """ + if self._analyzer is None: + self._analyzer = AnalyzerEngine() + + @component.output_types(documents=list[Document]) + def run(self, documents: list[Document]) -> dict[str, list[Document]]: + """ + Detects PII entities in the provided Documents. 
+ + :param documents: + List of Documents to analyze for PII entities. + :returns: + A dictionary with key `documents` containing Documents with detected entities + stored in metadata under the key `"entities"`. + """ + result_docs: list[Document] = [] + for doc in documents: + if doc.content is None: + result_docs.append(doc) + continue + if self._analyzer is None: + msg = "The component was not warmed up. Call warm_up() before running it." + raise RuntimeError(msg) + try: + analyzer_results = self._analyzer.analyze( + text=doc.content, + language=self.language, + entities=self.entities, + score_threshold=self.score_threshold, + ) + entities = [ + { + "entity_type": r.entity_type, + "start": r.start, + "end": r.end, + "score": r.score, + } + for r in analyzer_results + ] + result_docs.append(replace(doc, meta={**doc.meta, "entities": entities})) + except Exception as e: + logger.warning( + "Could not extract entities from document {doc_id}. Skipping it. Error: {error}", + doc_id=doc.id, + error=e, + ) + result_docs.append(doc) + return {"documents": result_docs} diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py new file mode 100644 index 0000000000..d20f889c19 --- /dev/null +++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack import component, logging +from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import AnonymizerEngine + +logger = logging.getLogger(__name__) + + +@component +class PresidioTextCleaner: + """ + Anonymizes PII in plain strings using [Microsoft Presidio](https://microsoft.github.io/presidio/). 
+ + Accepts a list of strings, detects personally identifiable information (PII), and returns + a new list of strings with PII replaced by entity type placeholders (e.g. `<PERSON>`). + Useful for sanitizing user queries before they are sent to an LLM. + + Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines. + + ### Usage example + + ```python + from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner + + cleaner = PresidioTextCleaner() + cleaner.warm_up() + result = cleaner.run(texts=["Hi, I am John Smith, call me at 212-555-1234"]) + print(result["texts"][0]) + # Hi, I am <PERSON>, call me at <PHONE_NUMBER> + ``` + """ + + def __init__( + self, + *, + language: str = "en", + entities: list[str] | None = None, + score_threshold: float = 0.35, + ) -> None: + """ + Initializes the PresidioTextCleaner. + + :param language: + Language code for PII detection. Defaults to `"en"`. + See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/). + :param entities: + List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`). + If `None`, all supported entity types are used. + See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/). + :param score_threshold: + Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`. + See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/). + """ + self.language = language + self.entities = entities + self.score_threshold = score_threshold + self._analyzer: AnalyzerEngine | None = None + self._anonymizer: AnonymizerEngine | None = None + + def warm_up(self) -> None: + """ + Initializes the Presidio analyzer and anonymizer engines. + + This method loads the underlying NLP models and should be called before `run()`. + In a Haystack Pipeline, this is called automatically before the first run. 
+ """ + if self._analyzer is None: + self._analyzer = AnalyzerEngine() + if self._anonymizer is None: + self._anonymizer = AnonymizerEngine() + + @component.output_types(texts=list[str]) + def run(self, texts: list[str]) -> dict[str, list[str]]: + """ + Anonymizes PII in the provided strings. + + :param texts: + List of strings to anonymize. + :returns: + A dictionary with key `texts` containing the cleaned strings. + """ + if self._analyzer is None or self._anonymizer is None: + msg = "The component was not warmed up. Call warm_up() before running it." + raise RuntimeError(msg) + cleaned: list[str] = [] + for text in texts: + try: + analyzer_results = self._analyzer.analyze( + text=text, + language=self.language, + entities=self.entities, + score_threshold=self.score_threshold, + ) + anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results) # type: ignore[arg-type] + cleaned.append(anonymized.text) + except Exception as e: + logger.warning( + "Could not anonymize text. Skipping it. 
Error: {error}", + error=e, + ) + cleaned.append(text) + return {"texts": cleaned} diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/py.typed b/integrations/presidio/src/haystack_integrations/components/preprocessors/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py new file mode 100644 index 0000000000..7c1d35107f --- /dev/null +++ b/integrations/presidio/tests/test_presidio_document_cleaner.py @@ -0,0 +1,157 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import logging +from unittest.mock import MagicMock + +import pytest +from haystack import Document +from haystack.core.serialization import component_from_dict, component_to_dict + +from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner + + +class TestPresidioDocumentCleaner: + def test_init_defaults(self): + cleaner = PresidioDocumentCleaner() + assert cleaner.language == "en" + assert cleaner.entities is None + assert cleaner.score_threshold == 0.35 + + def test_init_custom_params(self): + cleaner = PresidioDocumentCleaner(language="de", entities=["PERSON"], score_threshold=0.7) + assert cleaner.language == "de" + assert cleaner.entities == ["PERSON"] + assert cleaner.score_threshold == 0.7 + + def test_to_dict(self): + cleaner = PresidioDocumentCleaner(language="en", entities=["EMAIL_ADDRESS"], score_threshold=0.5) + data = component_to_dict(cleaner, "PresidioDocumentCleaner") + expected_type = ( + "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner" + ) + assert data["type"] == expected_type + assert data["init_parameters"]["language"] == "en" + assert data["init_parameters"]["entities"] == ["EMAIL_ADDRESS"] + assert data["init_parameters"]["score_threshold"] == 0.5 + + def 
test_from_dict(self): + data = { + "type": ( + "haystack_integrations.components.preprocessors.presidio" + ".presidio_document_cleaner.PresidioDocumentCleaner" + ), + "init_parameters": {"language": "de", "entities": ["PERSON"], "score_threshold": 0.6}, + } + cleaner = component_from_dict(PresidioDocumentCleaner, data, "PresidioDocumentCleaner") + assert cleaner.language == "de" + assert cleaner.entities == ["PERSON"] + assert cleaner.score_threshold == 0.6 + + def test_run_anonymizes_pii(self): + cleaner = PresidioDocumentCleaner() + mock_result = MagicMock() + mock_result.text = "My name is and email is " + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + docs = [Document(content="My name is John and email is john@example.com")] + result = cleaner.run(documents=docs) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "My name is and email is " + + def test_run_preserves_metadata(self): + cleaner = PresidioDocumentCleaner() + mock_result = MagicMock() + mock_result.text = "Hello " + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + docs = [Document(content="Hello John", meta={"source": "email", "page": 1})] + result = cleaner.run(documents=docs) + + assert result["documents"][0].meta["source"] == "email" + assert result["documents"][0].meta["page"] == 1 + + def test_run_does_not_mutate_original(self): + cleaner = PresidioDocumentCleaner() + mock_result = MagicMock() + mock_result.text = "Hello " + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + original = Document(content="Hello John") + cleaner.run(documents=[original]) + + assert original.content == 
"Hello John" + + def test_run_passes_through_none_content(self): + cleaner = PresidioDocumentCleaner() + doc = Document(content=None, meta={"source": "test"}) + result = cleaner.run(documents=[doc]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content is None + assert result["documents"][0].meta["source"] == "test" + + def test_run_skips_on_error(self, caplog): + cleaner = PresidioDocumentCleaner() + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.side_effect = Exception("Analyzer error") + cleaner._anonymizer = MagicMock() + + doc = Document(content="Some text with PII") + with caplog.at_level(logging.WARNING): + result = cleaner.run(documents=[doc]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Some text with PII" + assert "Could not anonymize" in caplog.text + + def test_run_multiple_documents(self): + cleaner = PresidioDocumentCleaner() + mock_result = MagicMock() + mock_result.text = "cleaned" + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + docs = [Document(content=f"doc {i}") for i in range(3)] + result = cleaner.run(documents=docs) + + assert len(result["documents"]) == 3 + + def test_run_passes_language_and_entities_to_analyzer(self): + cleaner = PresidioDocumentCleaner(language="de", entities=["PERSON"], score_threshold=0.8) + mock_result = MagicMock() + mock_result.text = "cleaned" + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + cleaner.run(documents=[Document(content="Hello John")]) + + cleaner._analyzer.analyze.assert_called_once_with( + text="Hello John", language="de", entities=["PERSON"], score_threshold=0.8 + ) + + @pytest.mark.integration + def test_run_integration(self): + cleaner = PresidioDocumentCleaner() + 
cleaner.warm_up() + docs = [Document(content="My name is John Smith and my email is john@example.com")] + result = cleaner.run(documents=docs) + + assert len(result["documents"]) == 1 + assert "John Smith" not in result["documents"][0].content + assert "john@example.com" not in result["documents"][0].content diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py new file mode 100644 index 0000000000..77d73a0250 --- /dev/null +++ b/integrations/presidio/tests/test_presidio_entity_extractor.py @@ -0,0 +1,125 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import logging +from unittest.mock import MagicMock + +import pytest +from haystack import Document +from haystack.core.serialization import component_from_dict, component_to_dict + +from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor + + +class TestPresidioEntityExtractor: + def test_init_defaults(self): + extractor = PresidioEntityExtractor() + assert extractor.language == "en" + assert extractor.entities is None + assert extractor.score_threshold == 0.35 + + def test_to_dict(self): + extractor = PresidioEntityExtractor(language="en", entities=["PERSON"], score_threshold=0.6) + data = component_to_dict(extractor, "PresidioEntityExtractor") + expected_type = ( + "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor" + ) + assert data["type"] == expected_type + assert data["init_parameters"]["entities"] == ["PERSON"] + assert data["init_parameters"]["score_threshold"] == 0.6 + + def test_from_dict(self): + data = { + "type": ( + "haystack_integrations.components.preprocessors.presidio" + ".presidio_entity_extractor.PresidioEntityExtractor" + ), + "init_parameters": {"language": "en", "entities": ["EMAIL_ADDRESS"], "score_threshold": 0.5}, + } + extractor = 
component_from_dict(PresidioEntityExtractor, data, "PresidioEntityExtractor") + assert extractor.entities == ["EMAIL_ADDRESS"] + + def test_run_extracts_entities_into_metadata(self): + extractor = PresidioEntityExtractor() + mock_entity = MagicMock() + mock_entity.entity_type = "PERSON" + mock_entity.start = 11 + mock_entity.end = 15 + mock_entity.score = 0.85 + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.return_value = [mock_entity] + + docs = [Document(content="My name is John")] + result = extractor.run(documents=docs) + + entities = result["documents"][0].meta["entities"] + assert len(entities) == 1 + assert entities[0]["entity_type"] == "PERSON" + assert entities[0]["start"] == 11 + assert entities[0]["end"] == 15 + assert entities[0]["score"] == 0.85 + + def test_run_does_not_mutate_original(self): + extractor = PresidioEntityExtractor() + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.return_value = [] + + original = Document(content="Hello John", meta={"source": "test"}) + extractor.run(documents=[original]) + + assert "entities" not in original.meta + + def test_run_passes_through_none_content(self): + extractor = PresidioEntityExtractor() + doc = Document(content=None, meta={"source": "test"}) + result = extractor.run(documents=[doc]) + + assert result["documents"][0].content is None + assert "entities" not in result["documents"][0].meta + + def test_run_empty_entities(self): + extractor = PresidioEntityExtractor() + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.return_value = [] + + docs = [Document(content="No PII here")] + result = extractor.run(documents=docs) + + assert result["documents"][0].meta["entities"] == [] + + def test_run_skips_on_error(self, caplog): + extractor = PresidioEntityExtractor() + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.side_effect = Exception("Analyzer error") + + doc = Document(content="Some text") + with caplog.at_level(logging.WARNING): + result = 
extractor.run(documents=[doc]) + + assert result["documents"][0].content == "Some text" + assert "entities" not in result["documents"][0].meta + assert "Could not extract entities" in caplog.text + + def test_run_preserves_existing_metadata(self): + extractor = PresidioEntityExtractor() + extractor._analyzer = MagicMock() + extractor._analyzer.analyze.return_value = [] + + docs = [Document(content="Hello", meta={"page": 3, "author": "Bob"})] + result = extractor.run(documents=docs) + + assert result["documents"][0].meta["page"] == 3 + assert result["documents"][0].meta["author"] == "Bob" + assert result["documents"][0].meta["entities"] == [] + + @pytest.mark.integration + def test_run_integration(self): + extractor = PresidioEntityExtractor() + extractor.warm_up() + docs = [Document(content="Contact Alice at alice@example.com")] + result = extractor.run(documents=docs) + + entities = result["documents"][0].meta["entities"] + entity_types = [e["entity_type"] for e in entities] + assert "EMAIL_ADDRESS" in entity_types diff --git a/integrations/presidio/tests/test_presidio_text_cleaner.py b/integrations/presidio/tests/test_presidio_text_cleaner.py new file mode 100644 index 0000000000..030ac5057b --- /dev/null +++ b/integrations/presidio/tests/test_presidio_text_cleaner.py @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import logging +from unittest.mock import MagicMock + +import pytest +from haystack.core.serialization import component_from_dict, component_to_dict + +from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner + + +class TestPresidioTextCleaner: + def test_init_defaults(self): + cleaner = PresidioTextCleaner() + assert cleaner.language == "en" + assert cleaner.entities is None + assert cleaner.score_threshold == 0.35 + + def test_to_dict(self): + cleaner = PresidioTextCleaner(language="en", entities=["PHONE_NUMBER"], score_threshold=0.5) + data = 
component_to_dict(cleaner, "PresidioTextCleaner") + assert ( + data["type"] + == "haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner.PresidioTextCleaner" + ) + assert data["init_parameters"]["entities"] == ["PHONE_NUMBER"] + + def test_from_dict(self): + data = { + "type": "haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner.PresidioTextCleaner", + "init_parameters": {"language": "en", "entities": None, "score_threshold": 0.4}, + } + cleaner = component_from_dict(PresidioTextCleaner, data, "PresidioTextCleaner") + assert cleaner.score_threshold == 0.4 + + def test_run_anonymizes_pii(self): + cleaner = PresidioTextCleaner() + mock_result = MagicMock() + mock_result.text = "Call me at " + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + result = cleaner.run(texts=["Call me at 212-555-1234"]) + + assert result["texts"][0] == "Call me at " + + def test_run_multiple_texts(self): + cleaner = PresidioTextCleaner() + mock_result = MagicMock() + mock_result.text = "cleaned" + cleaner._anonymizer = MagicMock() + cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + result = cleaner.run(texts=["text 1", "text 2", "text 3"]) + + assert len(result["texts"]) == 3 + + def test_run_skips_on_error(self, caplog): + cleaner = PresidioTextCleaner() + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.side_effect = Exception("error") + cleaner._anonymizer = MagicMock() + + with caplog.at_level(logging.WARNING): + result = cleaner.run(texts=["My name is John"]) + + assert result["texts"][0] == "My name is John" + assert "Could not anonymize" in caplog.text + + def test_run_empty_text(self): + cleaner = PresidioTextCleaner() + mock_result = MagicMock() + mock_result.text = "" + cleaner._anonymizer = MagicMock() + 
cleaner._anonymizer.anonymize.return_value = mock_result + cleaner._analyzer = MagicMock() + cleaner._analyzer.analyze.return_value = [] + + result = cleaner.run(texts=[""]) + + assert result["texts"][0] == "" + + @pytest.mark.integration + def test_run_integration(self): + cleaner = PresidioTextCleaner() + cleaner.warm_up() + result = cleaner.run(texts=["Hi, I am Alice and my phone is 212-555-5678"]) + + assert len(result["texts"]) == 1 + assert "Alice" not in result["texts"][0] + assert "212-555-5678" not in result["texts"][0]