Skip to content

Commit 95335c8

Browse files
feat: add Presidio integration for PII detection and anonymization
Implements three Haystack components using Microsoft Presidio: - PresidioDocumentCleaner: anonymizes PII in list[Document] - PresidioTextCleaner: anonymizes PII in list[str] (for query sanitization) - PresidioEntityExtractor: detects PII entities and stores them in Document metadata Closes #3063 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent bf5c2a4 commit 95335c8

9 files changed

Lines changed: 892 additions & 0 deletions

File tree

.github/workflows/presidio.yml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
name: Test / presidio
2+
3+
on:
4+
schedule:
5+
- cron: "0 0 * * *"
6+
pull_request:
7+
paths:
8+
- "integrations/presidio/**"
9+
- "!integrations/presidio/*.md"
10+
- ".github/workflows/presidio.yml"
11+
12+
defaults:
13+
run:
14+
working-directory: integrations/presidio
15+
16+
concurrency:
17+
group: presidio-${{ github.head_ref }}
18+
cancel-in-progress: true
19+
20+
env:
21+
PYTHONUNBUFFERED: "1"
22+
FORCE_COLOR: "1"
23+
24+
jobs:
25+
run:
26+
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
27+
runs-on: ${{ matrix.os }}
28+
strategy:
29+
fail-fast: false
30+
matrix:
31+
os: [ubuntu-latest]
32+
python-version: ["3.10", "3.13"]
33+
34+
steps:
35+
- uses: actions/checkout@v4
36+
37+
- name: Set up Python ${{ matrix.python-version }}
38+
uses: actions/setup-python@v5
39+
with:
40+
python-version: ${{ matrix.python-version }}
41+
42+
- name: Install Hatch
43+
run: pip install hatch
44+
45+
- name: Lint
46+
if: matrix.python-version == '3.10' && runner.os == 'Linux'
47+
run: hatch run fmt-check && hatch run test:types
48+
49+
- name: Run unit tests
50+
run: hatch run test:unit-cov-retry
51+
52+
- name: Run unit tests with lowest direct dependencies
53+
run: |
54+
hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
55+
hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
56+
hatch run test:unit
57+
58+
- name: Nightly - run unit tests with Haystack main branch
59+
if: github.event_name == 'schedule'
60+
run: |
61+
hatch env prune
62+
hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
63+
hatch run test:unit
64+
65+
notify-slack-on-failure:
66+
needs: run
67+
if: failure() && github.event_name == 'schedule'
68+
runs-on: ubuntu-latest
69+
steps:
70+
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
71+
with:
72+
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
[build-system]
2+
requires = ["hatchling", "hatch-vcs"]
3+
build-backend = "hatchling.build"
4+
5+
[project]
6+
name = "presidio-haystack"
7+
dynamic = ["version"]
8+
description = "Haystack integration for Microsoft Presidio — PII detection and anonymization"
9+
readme = "README.md"
10+
requires-python = ">=3.10"
11+
license = "Apache-2.0"
12+
keywords = ["Haystack", "Presidio", "PII", "anonymization", "privacy", "NLP"]
13+
authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }]
14+
classifiers = [
15+
"License :: OSI Approved :: Apache Software License",
16+
"Development Status :: 4 - Beta",
17+
"Programming Language :: Python",
18+
"Programming Language :: Python :: 3.10",
19+
"Programming Language :: Python :: 3.11",
20+
"Programming Language :: Python :: 3.12",
21+
"Programming Language :: Python :: 3.13",
22+
"Programming Language :: Python :: Implementation :: CPython",
23+
"Programming Language :: Python :: Implementation :: PyPy",
24+
]
25+
dependencies = [
26+
"haystack-ai>=2.9.0",
27+
"presidio-analyzer>=2.2.0",
28+
"presidio-anonymizer>=2.2.0",
29+
]
30+
31+
[project.urls]
32+
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio#readme"
33+
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
34+
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio"
35+
36+
[tool.hatch.build.targets.wheel]
37+
packages = ["src/haystack_integrations"]
38+
39+
[tool.hatch.version]
40+
source = "vcs"
41+
tag-pattern = 'integrations\/presidio-v(?P<version>.*)'
42+
43+
[tool.hatch.version.raw-options]
44+
root = "../.."
45+
git_describe_command = 'git describe --tags --match="integrations/presidio-v[0-9]*"'
46+
47+
[tool.hatch.envs.default]
48+
installer = "uv"
49+
dependencies = ["haystack-pydoc-tools", "ruff"]
50+
51+
[tool.hatch.envs.default.scripts]
52+
docs = ["haystack-pydoc pydoc/config_docusaurus.yml"]
53+
fmt = "ruff check --fix {args}; ruff format {args}"
54+
fmt-check = "ruff check {args} && ruff format --check {args}"
55+
56+
[tool.hatch.envs.test]
57+
dependencies = [
58+
"pytest",
59+
"pytest-asyncio",
60+
"pytest-cov",
61+
"pytest-rerunfailures",
62+
"mypy",
63+
"pip",
64+
]
65+
66+
[tool.hatch.envs.test.scripts]
67+
unit = 'pytest -m "not integration" {args:tests}'
68+
integration = 'pytest -m "integration" {args:tests}'
69+
all = 'pytest {args:tests}'
70+
unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
71+
types = "mypy -p haystack_integrations.components.preprocessors.presidio {args}"
72+
73+
[tool.mypy]
74+
install_types = true
75+
non_interactive = true
76+
check_untyped_defs = true
77+
disallow_incomplete_defs = true
78+
79+
[[tool.mypy.overrides]]
80+
module = [
81+
"presidio_analyzer",
82+
"presidio_analyzer.*",
83+
"presidio_anonymizer",
84+
"presidio_anonymizer.*",
85+
]
86+
ignore_missing_imports = true
87+
88+
[tool.ruff]
89+
line-length = 120
90+
91+
[tool.ruff.lint]
92+
select = [
93+
"A",
94+
"ANN",
95+
"ARG",
96+
"B",
97+
"C",
98+
"D102",
99+
"D103",
100+
"D205",
101+
"D209",
102+
"D213",
103+
"D417",
104+
"D419",
105+
"DTZ",
106+
"E",
107+
"EM",
108+
"F",
109+
"I",
110+
"ICN",
111+
"ISC",
112+
"N",
113+
"PLC",
114+
"PLE",
115+
"PLR",
116+
"PLW",
117+
"Q",
118+
"RUF",
119+
"S",
120+
"T",
121+
"TID",
122+
"UP",
123+
"W",
124+
"YTT",
125+
]
126+
ignore = [
127+
"B027",
128+
"B008",
129+
"S105",
130+
"S106",
131+
"S107",
132+
"C901",
133+
"PLR0911",
134+
"PLR0912",
135+
"PLR0913",
136+
"PLR0915",
137+
"ANN401",
138+
]
139+
140+
[tool.ruff.lint.isort]
141+
known-first-party = ["haystack_integrations"]
142+
143+
[tool.ruff.lint.flake8-tidy-imports]
144+
ban-relative-imports = "parents"
145+
146+
[tool.ruff.lint.per-file-ignores]
147+
"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"]
148+
149+
[tool.coverage.run]
150+
source = ["haystack_integrations"]
151+
branch = true
152+
relative_files = true
153+
parallel = false
154+
155+
[tool.coverage.report]
156+
omit = ["*/tests/*", "*/__init__.py"]
157+
show_missing = true
158+
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
159+
160+
[tool.pytest.ini_options]
161+
addopts = "--strict-markers"
162+
markers = [
163+
"integration: integration tests",
164+
]
165+
log_cli = true
166+
asyncio_default_fixture_loop_scope = "function"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner import PresidioDocumentCleaner
6+
from haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor import PresidioEntityExtractor
7+
from haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner import PresidioTextCleaner
8+
9+
__all__ = ["PresidioDocumentCleaner", "PresidioEntityExtractor", "PresidioTextCleaner"]
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Any
6+
7+
from haystack import Document, component, logging
8+
from presidio_analyzer import AnalyzerEngine
9+
from presidio_anonymizer import AnonymizerEngine
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
@component
class PresidioDocumentCleaner:
    """
    Anonymizes PII in Haystack Documents using [Microsoft Presidio](https://microsoft.github.io/presidio/).

    Accepts a list of Documents, detects personally identifiable information (PII) in their
    text content, and returns new Documents with PII replaced by entity type placeholders
    (e.g. `<PERSON>`, `<EMAIL_ADDRESS>`). Original Documents are not mutated.

    Documents without text content are passed through unchanged.

    If analysis or anonymization of a Document raises, a warning is logged and the original
    Document is returned as-is. Such a Document may still contain PII, so callers with strict
    privacy requirements should monitor for that warning.

    Cleaned Documents are built from the anonymized text and a shallow copy of the original
    `meta` only; no other field of the source Document is carried over.

    ### Usage example

    ```python
    from haystack import Document
    from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner

    cleaner = PresidioDocumentCleaner()
    result = cleaner.run(documents=[Document(content="My name is John and my email is john@example.com")])
    print(result["documents"][0].content)
    # My name is <PERSON> and my email is <EMAIL_ADDRESS>
    ```
    """

    def __init__(
        self,
        language: str = "en",
        entities: list[str] | None = None,
        score_threshold: float = 0.35,
    ) -> None:
        """
        Initializes the PresidioDocumentCleaner.

        :param language:
            Language code for PII detection. Defaults to `"en"`.
        :param entities:
            List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
            If `None`, all supported entity types are used.
        :param score_threshold:
            Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`.
        """
        self.language = language
        self.entities = entities
        self.score_threshold = score_threshold
        # NOTE(review): both engines are created with their default configuration, independent of
        # `language` — confirm that the default AnalyzerEngine supports non-English language codes.
        self._analyzer = AnalyzerEngine()
        self._anonymizer = AnonymizerEngine()

    @component.output_types(documents=list[Document])
    def run(self, documents: list[Document]) -> dict[str, Any]:
        """
        Anonymizes PII in the provided Documents.

        :param documents:
            List of Documents whose text content will be anonymized.
        :returns:
            A dictionary with key `documents` containing the cleaned Documents, in input order.
            Documents whose content is `None`, and Documents that could not be anonymized,
            are returned unchanged.
        """
        cleaned: list[Document] = []
        for doc in documents:
            if doc.content is None:
                # Nothing to analyze: hand the Document back untouched.
                cleaned.append(doc)
                continue
            try:
                analyzer_results = self._analyzer.analyze(
                    text=doc.content,
                    language=self.language,
                    entities=self.entities,
                    score_threshold=self.score_threshold,
                )
                anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results)
                # Build a fresh Document so the caller's instance is never mutated.
                cleaned.append(Document(content=anonymized.text, meta=doc.meta.copy()))
            except Exception as e:
                # Best effort: keep the pipeline running, but state clearly that the Document is
                # forwarded as-is (it is not dropped) and may therefore still contain PII.
                logger.warning(
                    "Could not anonymize document {doc_id}. Returning it unchanged, so it may still "
                    "contain PII. Error: {error}",
                    doc_id=doc.id,
                    error=e,
                )
                cleaned.append(doc)
        return {"documents": cleaned}

0 commit comments

Comments
 (0)