From 028361072bc4967df5e89cc6150af9654e9ad73d Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Wed, 1 Apr 2026 12:49:21 +0500
Subject: [PATCH 1/9] feat: add Presidio integration for PII detection and
 anonymization

Implements three Haystack components using Microsoft Presidio:
- PresidioDocumentCleaner: anonymizes PII in list[Document]
- PresidioTextCleaner: anonymizes PII in list[str] (for query sanitization)
- PresidioEntityExtractor: detects PII entities and stores them in Document metadata
---
 .github/workflows/presidio.yml                |  72 ++++++++
 integrations/presidio/pyproject.toml          | 166 ++++++++++++++++++
 .../preprocessors/presidio/__init__.py        |   9 +
 .../presidio/presidio_document_cleaner.py     |  92 ++++++++++
 .../presidio/presidio_entity_extractor.py     | 100 +++++++++++
 .../presidio/presidio_text_cleaner.py         |  85 +++++++++
 .../tests/test_presidio_document_cleaner.py   | 152 ++++++++++++++++
 .../tests/test_presidio_entity_extractor.py   | 121 +++++++++++++
 .../tests/test_presidio_text_cleaner.py       |  95 ++++++++++
 9 files changed, 892 insertions(+)
 create mode 100644 .github/workflows/presidio.yml
 create mode 100644 integrations/presidio/pyproject.toml
 create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py
 create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
 create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
 create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
 create mode 100644 integrations/presidio/tests/test_presidio_document_cleaner.py
 create mode 100644 integrations/presidio/tests/test_presidio_entity_extractor.py
 create mode 100644 integrations/presidio/tests/test_presidio_text_cleaner.py

diff --git a/.github/workflows/presidio.yml b/.github/workflows/presidio.yml
new file mode 100644
index 0000000000..7395bde691
--- /dev/null
+++ b/.github/workflows/presidio.yml
@@ -0,0 +1,72 @@
+name: Test / presidio
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+  pull_request:
+    paths:
+      - "integrations/presidio/**"
+      - "!integrations/presidio/*.md"
+      - ".github/workflows/presidio.yml"
+
+defaults:
+  run:
+    working-directory: integrations/presidio
+
+concurrency:
+  group: presidio-${{ github.head_ref }}
+  cancel-in-progress: true
+
+env:
+  PYTHONUNBUFFERED: "1"
+  FORCE_COLOR: "1"
+
+jobs:
+  run:
+    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.10", "3.13"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Hatch
+        run: pip install hatch
+
+      - name: Lint
+        if: matrix.python-version == '3.10' && runner.os == 'Linux'
+        run: hatch run fmt-check && hatch run test:types
+
+      - name: Run unit tests
+        run: hatch run test:unit-cov-retry
+
+      - name: Run unit tests with lowest direct dependencies
+        run: |
+          hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
+          hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
+          hatch run test:unit
+
+      - name: Nightly - run unit tests with Haystack main branch
+        if: github.event_name == 'schedule'
+        run: |
+          hatch env prune
+          hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
+          hatch run test:unit
+
+  notify-slack-on-failure:
+    needs: run
+    if: failure() && github.event_name == 'schedule'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
+        with:
+          slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}
diff --git a/integrations/presidio/pyproject.toml b/integrations/presidio/pyproject.toml
new file mode 100644
index 0000000000..1a618957e4
--- /dev/null
+++ b/integrations/presidio/pyproject.toml
@@ -0,0 +1,166 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "presidio-haystack"
+dynamic = ["version"]
+description = "Haystack integration for Microsoft Presidio — PII detection and anonymization"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "Apache-2.0"
+keywords = ["Haystack", "Presidio", "PII", "anonymization", "privacy", "NLP"]
+authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }]
+classifiers = [
+  "License :: OSI Approved :: Apache Software License",
+  "Development Status :: 4 - Beta",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = [
+  "haystack-ai>=2.9.0",
+  "presidio-analyzer>=2.2.0",
+  "presidio-anonymizer>=2.2.0",
+]
+
+[project.urls]
+Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio#readme"
+Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
+Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/haystack_integrations"]
+
+[tool.hatch.version]
+source = "vcs"
+tag-pattern = 'integrations\/presidio-v(?P<version>.*)'
+
+[tool.hatch.version.raw-options]
+root = "../.."
+git_describe_command = 'git describe --tags --match="integrations/presidio-v[0-9]*"'
+
+[tool.hatch.envs.default]
+installer = "uv"
+dependencies = ["haystack-pydoc-tools", "ruff"]
+
+[tool.hatch.envs.default.scripts]
+docs = ["haystack-pydoc pydoc/config_docusaurus.yml"]
+fmt = "ruff check --fix {args}; ruff format {args}"
+fmt-check = "ruff check {args} && ruff format --check {args}"
+
+[tool.hatch.envs.test]
+dependencies = [
+    "pytest",
+    "pytest-asyncio",
+    "pytest-cov",
+    "pytest-rerunfailures",
+    "mypy",
+    "pip",
+]
+
+[tool.hatch.envs.test.scripts]
+unit = 'pytest -m "not integration" {args:tests}'
+integration = 'pytest -m "integration" {args:tests}'
+all = 'pytest {args:tests}'
+unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
+types = "mypy -p haystack_integrations.components.preprocessors.presidio {args}"
+
+[tool.mypy]
+install_types = true
+non_interactive = true
+check_untyped_defs = true
+disallow_incomplete_defs = true
+
+[[tool.mypy.overrides]]
+module = [
+  "presidio_analyzer",
+  "presidio_analyzer.*",
+  "presidio_anonymizer",
+  "presidio_anonymizer.*",
+]
+ignore_missing_imports = true
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.lint]
+select = [
+    "A",
+    "ANN",
+    "ARG",
+    "B",
+    "C",
+    "D102",
+    "D103",
+    "D205",
+    "D209",
+    "D213",
+    "D417",
+    "D419",
+    "DTZ",
+    "E",
+    "EM",
+    "F",
+    "I",
+    "ICN",
+    "ISC",
+    "N",
+    "PLC",
+    "PLE",
+    "PLR",
+    "PLW",
+    "Q",
+    "RUF",
+    "S",
+    "T",
+    "TID",
+    "UP",
+    "W",
+    "YTT",
+]
+ignore = [
+    "B027",
+    "B008",
+    "S105",
+    "S106",
+    "S107",
+    "C901",
+    "PLR0911",
+    "PLR0912",
+    "PLR0913",
+    "PLR0915",
+    "ANN401",
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["haystack_integrations"]
+
+[tool.ruff.lint.flake8-tidy-imports]
+ban-relative-imports = "parents"
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"]
+
+[tool.coverage.run]
+source = ["haystack_integrations"]
+branch = true
+relative_files = true
+parallel = false
+
+[tool.coverage.report]
+omit = ["*/tests/*", "*/__init__.py"]
+show_missing = true
+exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
+
+[tool.pytest.ini_options]
+addopts = "--strict-markers"
+markers = [
+  "integration: integration tests",
+]
+log_cli = true
+asyncio_default_fixture_loop_scope = "function"
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py
new file mode 100644
index 0000000000..bdaf79cba1
--- /dev/null
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/__init__.py
@@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner import PresidioDocumentCleaner
+from haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor import PresidioEntityExtractor
+from haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner import PresidioTextCleaner
+
+__all__ = ["PresidioDocumentCleaner", "PresidioEntityExtractor", "PresidioTextCleaner"]
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
new file mode 100644
index 0000000000..0bdcef7a70
--- /dev/null
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import replace
+from typing import Any
+
+from haystack import Document, component, logging
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer import AnonymizerEngine
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class PresidioDocumentCleaner:
+    """
+    Anonymizes PII in Haystack Documents using [Microsoft Presidio](https://microsoft.github.io/presidio/).
+
+    Accepts a list of Documents, detects personally identifiable information (PII) in their
+    text content, and returns new Documents with PII replaced by entity type placeholders
+    (e.g. `<PERSON>`, `<EMAIL_ADDRESS>`). Original Documents are not mutated.
+
+    Documents without text content are passed through unchanged. If analyzing or anonymizing
+    a Document fails, a warning is logged and that Document is returned unchanged, so its
+    content may still contain PII.
+
+    The Presidio engines are created lazily, by `warm_up()` or when `run()` first needs them,
+    so constructing or deserializing the component does not build them.
+
+    ### Usage example
+
+    ```python
+    from haystack import Document
+    from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner
+
+    cleaner = PresidioDocumentCleaner()
+    result = cleaner.run(documents=[Document(content="My name is John and my email is john@example.com")])
+    print(result["documents"][0].content)
+    # My name is <PERSON> and my email is <EMAIL_ADDRESS>
+    ```
+    """
+
+    def __init__(
+        self,
+        language: str = "en",
+        entities: list[str] | None = None,
+        score_threshold: float = 0.35,
+    ) -> None:
+        """
+        Initializes the PresidioDocumentCleaner.
+
+        :param language:
+            Language code for PII detection. Defaults to `"en"`.
+        :param entities:
+            List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
+            If `None`, all supported entity types are used.
+        :param score_threshold:
+            Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`.
+        """
+        self.language = language
+        self.entities = entities
+        self.score_threshold = score_threshold
+        # Built on demand by `_get_analyzer` / `_get_anonymizer`, not at construction time.
+        self._analyzer: AnalyzerEngine | None = None
+        self._anonymizer: AnonymizerEngine | None = None
+
+    def warm_up(self) -> None:
+        """
+        Creates the Presidio analyzer and anonymizer engines if they do not exist yet.
+
+        Safe to call repeatedly: engines that already exist are kept.
+        """
+        self._get_analyzer()
+        self._get_anonymizer()
+
+    def _get_analyzer(self) -> AnalyzerEngine:
+        """Returns the analyzer engine, creating it on first use."""
+        if self._analyzer is None:
+            # NOTE(review): built with Presidio defaults; confirm they cover non-English `language` values.
+            self._analyzer = AnalyzerEngine()
+        return self._analyzer
+
+    def _get_anonymizer(self) -> AnonymizerEngine:
+        """Returns the anonymizer engine, creating it on first use."""
+        if self._anonymizer is None:
+            self._anonymizer = AnonymizerEngine()
+        return self._anonymizer
+
+    @component.output_types(documents=list[Document])
+    def run(self, documents: list[Document]) -> dict[str, Any]:
+        """
+        Anonymizes PII in the provided Documents.
+
+        :param documents:
+            List of Documents whose text content will be anonymized.
+        :returns:
+            A dictionary with key `documents` containing one Document per input, in input order.
+            Documents without content, or whose anonymization failed, are returned unchanged.
+        """
+        cleaned: list[Document] = []
+        for doc in documents:
+            if doc.content is None:
+                cleaned.append(doc)
+                continue
+            # Fetch the engines outside the `try` so a failure to build them raises instead of
+            # being logged once per Document while the text passes through un-anonymized.
+            analyzer = self._get_analyzer()
+            anonymizer = self._get_anonymizer()
+            try:
+                analyzer_results = analyzer.analyze(
+                    text=doc.content,
+                    language=self.language,
+                    entities=self.entities,
+                    score_threshold=self.score_threshold,
+                )
+                anonymized = anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results)
+                # `replace` carries over blob, score and embeddings that a bare `Document(...)` would
+                # drop; the empty id lets the new Document derive its own id from the new content.
+                cleaned.append(replace(doc, id="", content=anonymized.text, meta=doc.meta.copy()))
+            except Exception as e:
+                logger.warning(
+                    "Could not anonymize document {doc_id}. Skipping it. Error: {error}",
+                    doc_id=doc.id,
+                    error=e,
+                )
+                cleaned.append(doc)
+        return {"documents": cleaned}
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
new file mode 100644
index 0000000000..2f71219de8
--- /dev/null
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import replace
+from typing import Any
+
+from haystack import Document, component, logging
+from presidio_analyzer import AnalyzerEngine
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class PresidioEntityExtractor:
+    """
+    Detects PII entities in Haystack Documents using [Microsoft Presidio Analyzer](https://microsoft.github.io/presidio/).
+
+    Accepts a list of Documents and returns new Documents with detected PII entities stored
+    in each Document's metadata under the key `"entities"`. Each entry in the list contains
+    the entity type, start/end character offsets, and the confidence score.
+
+    Original Documents are not mutated. Documents without text content are passed through unchanged.
+    If analysis of a Document fails, a warning is logged and that Document is returned unchanged,
+    without an `"entities"` key being added.
+
+    The Presidio analyzer is created lazily, by `warm_up()` or when `run()` first needs it,
+    so constructing or deserializing the component does not build it.
+
+    ### Usage example
+
+    ```python
+    from haystack import Document
+    from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor
+
+    extractor = PresidioEntityExtractor()
+    result = extractor.run(documents=[Document(content="Contact Alice at alice@example.com")])
+    print(result["documents"][0].meta["entities"])
+    # [{"entity_type": "PERSON", "start": 8, "end": 13, "score": 0.85},
+    #  {"entity_type": "EMAIL_ADDRESS", "start": 17, "end": 34, "score": 1.0}]
+    ```
+    """
+
+    def __init__(
+        self,
+        language: str = "en",
+        entities: list[str] | None = None,
+        score_threshold: float = 0.35,
+    ) -> None:
+        """
+        Initializes the PresidioEntityExtractor.
+
+        :param language:
+            Language code for PII detection. Defaults to `"en"`.
+        :param entities:
+            List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
+            If `None`, all supported entity types are detected.
+        :param score_threshold:
+            Minimum confidence score (0–1) for a detected entity to be included. Defaults to `0.35`.
+        """
+        self.language = language
+        self.entities = entities
+        self.score_threshold = score_threshold
+        # Built on demand by `_get_analyzer`, not at construction time.
+        self._analyzer: AnalyzerEngine | None = None
+
+    def warm_up(self) -> None:
+        """
+        Creates the Presidio analyzer engine if it does not exist yet.
+
+        Safe to call repeatedly: an engine that already exists is kept.
+        """
+        self._get_analyzer()
+
+    def _get_analyzer(self) -> AnalyzerEngine:
+        """Returns the analyzer engine, creating it on first use."""
+        if self._analyzer is None:
+            # NOTE(review): built with Presidio defaults; confirm they cover non-English `language` values.
+            self._analyzer = AnalyzerEngine()
+        return self._analyzer
+
+    @component.output_types(documents=list[Document])
+    def run(self, documents: list[Document]) -> dict[str, Any]:
+        """
+        Detects PII entities in the provided Documents.
+
+        :param documents:
+            List of Documents to analyze for PII entities.
+        :returns:
+            A dictionary with key `documents` containing Documents with detected entities
+            stored in metadata under the key `"entities"`.
+        """
+        result_docs: list[Document] = []
+        for doc in documents:
+            if doc.content is None:
+                result_docs.append(doc)
+                continue
+            # Fetch the engine outside the `try` so a failure to build it raises instead of
+            # being logged once per Document.
+            analyzer = self._get_analyzer()
+            try:
+                analyzer_results = analyzer.analyze(
+                    text=doc.content,
+                    language=self.language,
+                    entities=self.entities,
+                    score_threshold=self.score_threshold,
+                )
+                entities = [
+                    {
+                        "entity_type": r.entity_type,
+                        "start": r.start,
+                        "end": r.end,
+                        "score": r.score,
+                    }
+                    for r in analyzer_results
+                ]
+                # Only the metadata changes, so `replace` keeps the id, blob, score and embeddings
+                # of the input Document instead of dropping them.
+                result_docs.append(replace(doc, meta={**doc.meta, "entities": entities}))
+            except Exception as e:
+                logger.warning(
+                    "Could not extract entities from document {doc_id}. Skipping it. Error: {error}",
+                    doc_id=doc.id,
+                    error=e,
+                )
+                result_docs.append(doc)
+        return {"documents": result_docs}
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
new file mode 100644
index 0000000000..9d19479486
--- /dev/null
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
@@ -0,0 +1,119 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+from haystack import component, logging
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer import AnonymizerEngine
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class PresidioTextCleaner:
+    """
+    Anonymizes PII in plain strings using [Microsoft Presidio](https://microsoft.github.io/presidio/).
+
+    Accepts a list of strings, detects personally identifiable information (PII), and returns
+    a new list of strings with PII replaced by entity type placeholders (e.g. `<PERSON>`).
+    Useful for sanitizing user queries before they are sent to an LLM.
+
+    If analyzing or anonymizing a string fails, a warning is logged and that string is returned
+    unchanged, so it may still contain PII.
+
+    The Presidio engines are created lazily, by `warm_up()` or when `run()` first needs them,
+    so constructing or deserializing the component does not build them.
+
+    ### Usage example
+
+    ```python
+    from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner
+
+    cleaner = PresidioTextCleaner()
+    result = cleaner.run(texts=["Hi, I am John Smith, call me at 212-555-1234"])
+    print(result["texts"][0])
+    # Hi, I am <PERSON>, call me at <PHONE_NUMBER>
+    ```
+    """
+
+    def __init__(
+        self,
+        language: str = "en",
+        entities: list[str] | None = None,
+        score_threshold: float = 0.35,
+    ) -> None:
+        """
+        Initializes the PresidioTextCleaner.
+
+        :param language:
+            Language code for PII detection. Defaults to `"en"`.
+        :param entities:
+            List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`).
+            If `None`, all supported entity types are used.
+        :param score_threshold:
+            Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`.
+        """
+        self.language = language
+        self.entities = entities
+        self.score_threshold = score_threshold
+        # Built on demand by `_get_analyzer` / `_get_anonymizer`, not at construction time.
+        self._analyzer: AnalyzerEngine | None = None
+        self._anonymizer: AnonymizerEngine | None = None
+
+    def warm_up(self) -> None:
+        """
+        Creates the Presidio analyzer and anonymizer engines if they do not exist yet.
+
+        Safe to call repeatedly: engines that already exist are kept.
+        """
+        self._get_analyzer()
+        self._get_anonymizer()
+
+    def _get_analyzer(self) -> AnalyzerEngine:
+        """Returns the analyzer engine, creating it on first use."""
+        if self._analyzer is None:
+            # NOTE(review): built with Presidio defaults; confirm they cover non-English `language` values.
+            self._analyzer = AnalyzerEngine()
+        return self._analyzer
+
+    def _get_anonymizer(self) -> AnonymizerEngine:
+        """Returns the anonymizer engine, creating it on first use."""
+        if self._anonymizer is None:
+            self._anonymizer = AnonymizerEngine()
+        return self._anonymizer
+
+    @component.output_types(texts=list[str])
+    def run(self, texts: list[str]) -> dict[str, Any]:
+        """
+        Anonymizes PII in the provided strings.
+
+        :param texts:
+            List of strings to anonymize.
+        :returns:
+            A dictionary with key `texts` containing one string per input, in input order.
+            Strings whose anonymization failed are returned unchanged.
+        """
+        cleaned: list[str] = []
+        for text in texts:
+            # Fetch the engines outside the `try` so a failure to build them raises instead of
+            # being logged once per string while the text passes through un-anonymized.
+            analyzer = self._get_analyzer()
+            anonymizer = self._get_anonymizer()
+            try:
+                analyzer_results = analyzer.analyze(
+                    text=text,
+                    language=self.language,
+                    entities=self.entities,
+                    score_threshold=self.score_threshold,
+                )
+                anonymized = anonymizer.anonymize(text=text, analyzer_results=analyzer_results)
+                cleaned.append(anonymized.text)
+            except Exception as e:
+                logger.warning(
+                    "Could not anonymize text. Skipping it. Error: {error}",
+                    error=e,
+                )
+                cleaned.append(text)
+        return {"texts": cleaned}
diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py
new file mode 100644
index 0000000000..9bd134437e
--- /dev/null
+++ b/integrations/presidio/tests/test_presidio_document_cleaner.py
@@ -0,0 +1,143 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from unittest.mock import MagicMock, patch
+
+import pytest
+from haystack import Document
+from haystack.core.serialization import component_from_dict, component_to_dict
+
+from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner
+
+MODULE = "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner"
+
+
+@pytest.fixture(autouse=True)
+def mock_presidio_engines(request):
+    """Replace the Presidio engine classes with mocks in every test not marked `integration`."""
+    if request.node.get_closest_marker("integration"):
+        yield
+        return
+    with patch(f"{MODULE}.AnalyzerEngine"), patch(f"{MODULE}.AnonymizerEngine"):
+        yield
+
+
+def make_cleaner(anonymized_text, **init_kwargs):
+    """Build a cleaner whose analyzer finds nothing and whose anonymizer returns `anonymized_text`."""
+    cleaner = PresidioDocumentCleaner(**init_kwargs)
+    cleaner._analyzer = MagicMock()
+    cleaner._analyzer.analyze.return_value = []
+    cleaner._anonymizer = MagicMock()
+    cleaner._anonymizer.anonymize.return_value = MagicMock(text=anonymized_text)
+    return cleaner
+
+
+class TestPresidioDocumentCleaner:
+    """Unit tests run against mocked Presidio engines; the `integration` test uses the real ones."""
+
+    def test_init_defaults(self):
+        cleaner = PresidioDocumentCleaner()
+        assert cleaner.language == "en"
+        assert cleaner.entities is None
+        assert cleaner.score_threshold == 0.35
+
+    def test_init_custom_params(self):
+        cleaner = PresidioDocumentCleaner(language="de", entities=["PERSON"], score_threshold=0.7)
+        assert cleaner.language == "de"
+        assert cleaner.entities == ["PERSON"]
+        assert cleaner.score_threshold == 0.7
+
+    def test_to_dict(self):
+        cleaner = PresidioDocumentCleaner(language="en", entities=["EMAIL_ADDRESS"], score_threshold=0.5)
+        data = component_to_dict(cleaner, "PresidioDocumentCleaner")
+        assert data["type"] == f"{MODULE}.PresidioDocumentCleaner"
+        assert data["init_parameters"]["language"] == "en"
+        assert data["init_parameters"]["entities"] == ["EMAIL_ADDRESS"]
+        assert data["init_parameters"]["score_threshold"] == 0.5
+
+    def test_from_dict(self):
+        data = {
+            "type": f"{MODULE}.PresidioDocumentCleaner",
+            "init_parameters": {"language": "de", "entities": ["PERSON"], "score_threshold": 0.6},
+        }
+        cleaner = component_from_dict(PresidioDocumentCleaner, data, "PresidioDocumentCleaner")
+        assert cleaner.language == "de"
+        assert cleaner.entities == ["PERSON"]
+        assert cleaner.score_threshold == 0.6
+
+    def test_run_anonymizes_pii(self):
+        cleaner = make_cleaner("My name is <PERSON> and email is <EMAIL_ADDRESS>")
+
+        docs = [Document(content="My name is John and email is john@example.com")]
+        result = cleaner.run(documents=docs)
+
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].content == "My name is <PERSON> and email is <EMAIL_ADDRESS>"
+
+    def test_run_preserves_metadata(self):
+        cleaner = make_cleaner("Hello <PERSON>")
+
+        docs = [Document(content="Hello John", meta={"source": "email", "page": 1})]
+        result = cleaner.run(documents=docs)
+
+        assert result["documents"][0].meta["source"] == "email"
+        assert result["documents"][0].meta["page"] == 1
+
+    def test_run_does_not_mutate_original(self):
+        cleaner = make_cleaner("Hello <PERSON>")
+
+        original = Document(content="Hello John")
+        cleaner.run(documents=[original])
+
+        assert original.content == "Hello John"
+
+    def test_run_passes_through_none_content(self):
+        cleaner = PresidioDocumentCleaner()
+        doc = Document(content=None, meta={"source": "test"})
+        result = cleaner.run(documents=[doc])
+
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].content is None
+        assert result["documents"][0].meta["source"] == "test"
+
+    def test_run_skips_on_error(self, caplog):
+        cleaner = PresidioDocumentCleaner()
+        cleaner._analyzer = MagicMock()
+        cleaner._analyzer.analyze.side_effect = Exception("Analyzer error")
+
+        doc = Document(content="Some text with PII")
+        with caplog.at_level(logging.WARNING):
+            result = cleaner.run(documents=[doc])
+
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].content == "Some text with PII"
+        assert "Could not anonymize" in caplog.text
+
+    def test_run_multiple_documents(self):
+        cleaner = make_cleaner("cleaned")
+
+        docs = [Document(content=f"doc {i}") for i in range(3)]
+        result = cleaner.run(documents=docs)
+
+        assert len(result["documents"]) == 3
+
+    def test_run_passes_language_and_entities_to_analyzer(self):
+        cleaner = make_cleaner("cleaned", language="de", entities=["PERSON"], score_threshold=0.8)
+
+        cleaner.run(documents=[Document(content="Hello John")])
+
+        cleaner._analyzer.analyze.assert_called_once_with(
+            text="Hello John", language="de", entities=["PERSON"], score_threshold=0.8
+        )
+
+    @pytest.mark.integration
+    def test_run_integration(self):
+        cleaner = PresidioDocumentCleaner()
+        docs = [Document(content="My name is John Smith and my email is john@example.com")]
+        result = cleaner.run(documents=docs)
+
+        assert len(result["documents"]) == 1
+        assert "John Smith" not in result["documents"][0].content
+        assert "john@example.com" not in result["documents"][0].content
diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py
new file mode 100644
index 0000000000..f82fb013b9
--- /dev/null
+++ b/integrations/presidio/tests/test_presidio_entity_extractor.py
@@ -0,0 +1,132 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from unittest.mock import MagicMock, patch
+
+import pytest
+from haystack import Document
+from haystack.core.serialization import component_from_dict, component_to_dict
+
+from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor
+
+MODULE = "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor"
+
+
+@pytest.fixture(autouse=True)
+def mock_presidio_analyzer(request):
+    """Replace the Presidio analyzer class with a mock in every test not marked `integration`."""
+    if request.node.get_closest_marker("integration"):
+        yield
+        return
+    with patch(f"{MODULE}.AnalyzerEngine"):
+        yield
+
+
+class TestPresidioEntityExtractor:
+    """Unit tests run against a mocked Presidio analyzer; the `integration` test uses the real one."""
+
+    def test_init_defaults(self):
+        extractor = PresidioEntityExtractor()
+        assert extractor.language == "en"
+        assert extractor.entities is None
+        assert extractor.score_threshold == 0.35
+
+    def test_to_dict(self):
+        extractor = PresidioEntityExtractor(language="en", entities=["PERSON"], score_threshold=0.6)
+        data = component_to_dict(extractor, "PresidioEntityExtractor")
+        assert data["type"] == f"{MODULE}.PresidioEntityExtractor"
+        assert data["init_parameters"]["entities"] == ["PERSON"]
+        assert data["init_parameters"]["score_threshold"] == 0.6
+
+    def test_from_dict(self):
+        data = {
+            "type": f"{MODULE}.PresidioEntityExtractor",
+            "init_parameters": {"language": "en", "entities": ["EMAIL_ADDRESS"], "score_threshold": 0.5},
+        }
+        extractor = component_from_dict(PresidioEntityExtractor, data, "PresidioEntityExtractor")
+        assert extractor.entities == ["EMAIL_ADDRESS"]
+
+    def test_run_extracts_entities_into_metadata(self):
+        extractor = PresidioEntityExtractor()
+        mock_entity = MagicMock()
+        mock_entity.entity_type = "PERSON"
+        mock_entity.start = 11
+        mock_entity.end = 15
+        mock_entity.score = 0.85
+        extractor._analyzer = MagicMock()
+        extractor._analyzer.analyze.return_value = [mock_entity]
+
+        docs = [Document(content="My name is John")]
+        result = extractor.run(documents=docs)
+
+        entities = result["documents"][0].meta["entities"]
+        assert len(entities) == 1
+        assert entities[0]["entity_type"] == "PERSON"
+        assert entities[0]["start"] == 11
+        assert entities[0]["end"] == 15
+        assert entities[0]["score"] == 0.85
+
+    def test_run_does_not_mutate_original(self):
+        extractor = PresidioEntityExtractor()
+        extractor._analyzer = MagicMock()
+        extractor._analyzer.analyze.return_value = []
+
+        original = Document(content="Hello John", meta={"source": "test"})
+        extractor.run(documents=[original])
+
+        assert "entities" not in original.meta
+
+    def test_run_passes_through_none_content(self):
+        extractor = PresidioEntityExtractor()
+        doc = Document(content=None, meta={"source": "test"})
+        result = extractor.run(documents=[doc])
+
+        assert result["documents"][0].content is None
+        assert "entities" not in result["documents"][0].meta
+
+    def test_run_empty_entities(self):
+        extractor = PresidioEntityExtractor()
+        extractor._analyzer = MagicMock()
+        extractor._analyzer.analyze.return_value = []
+
+        docs = [Document(content="No PII here")]
+        result = extractor.run(documents=docs)
+
+        assert result["documents"][0].meta["entities"] == []
+
+    def test_run_skips_on_error(self, caplog):
+        extractor = PresidioEntityExtractor()
+        extractor._analyzer = MagicMock()
+        extractor._analyzer.analyze.side_effect = Exception("Analyzer error")
+
+        doc = Document(content="Some text")
+        with caplog.at_level(logging.WARNING):
+            result = extractor.run(documents=[doc])
+
+        assert result["documents"][0].content == "Some text"
+        assert "entities" not in result["documents"][0].meta
+        assert "Could not extract entities" in caplog.text
+
+    def test_run_preserves_existing_metadata(self):
+        extractor = PresidioEntityExtractor()
+        extractor._analyzer = MagicMock()
+        extractor._analyzer.analyze.return_value = []
+
+        docs = [Document(content="Hello", meta={"page": 3, "author": "Bob"})]
+        result = extractor.run(documents=docs)
+
+        assert result["documents"][0].meta["page"] == 3
+        assert result["documents"][0].meta["author"] == "Bob"
+        assert result["documents"][0].meta["entities"] == []
+
+    @pytest.mark.integration
+    def test_run_integration(self):
+        extractor = PresidioEntityExtractor()
+        docs = [Document(content="Contact Alice at alice@example.com")]
+        result = extractor.run(documents=docs)
+
+        entities = result["documents"][0].meta["entities"]
+        entity_types = [e["entity_type"] for e in entities]
+        assert "EMAIL_ADDRESS" in entity_types
diff --git a/integrations/presidio/tests/test_presidio_text_cleaner.py b/integrations/presidio/tests/test_presidio_text_cleaner.py
new file mode 100644
index 0000000000..8c1922bbd3
--- /dev/null
+++ b/integrations/presidio/tests/test_presidio_text_cleaner.py
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from unittest.mock import MagicMock
+
+import pytest
+from haystack.core.serialization import component_from_dict, component_to_dict
+
+from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner
+
+
+class TestPresidioTextCleaner:  # tests for PresidioTextCleaner: init defaults, (de)serialization, run()
+    def test_init_defaults(self):
+        cleaner = PresidioTextCleaner()
+        assert cleaner.language == "en"  # default language
+        assert cleaner.entities is None  # default: no entity list given
+        assert cleaner.score_threshold == 0.35  # default confidence threshold
+
+    def test_to_dict(self):
+        cleaner = PresidioTextCleaner(language="en", entities=["PHONE_NUMBER"], score_threshold=0.5)
+        data = component_to_dict(cleaner, "PresidioTextCleaner")
+        assert (  # "type" must be the fully qualified path of the component class
+            data["type"]
+            == "haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner.PresidioTextCleaner"
+        )
+        assert data["init_parameters"]["entities"] == ["PHONE_NUMBER"]  # constructor argument is serialized
+
+    def test_from_dict(self):
+        data = {
+            "type": "haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner.PresidioTextCleaner",
+            "init_parameters": {"language": "en", "entities": None, "score_threshold": 0.4},
+        }
+        cleaner = component_from_dict(PresidioTextCleaner, data, "PresidioTextCleaner")
+        assert cleaner.score_threshold == 0.4  # init_parameters are applied to the rebuilt component
+
+    def test_run_anonymizes_pii(self):  # NOTE: both engines are mocked, so this checks wiring only
+        cleaner = PresidioTextCleaner()
+        mock_result = MagicMock()
+        mock_result.text = "Call me at <PHONE_NUMBER>"  # canned anonymizer output
+        cleaner._anonymizer = MagicMock()  # swap in a mock anonymizer
+        cleaner._anonymizer.anonymize.return_value = mock_result
+        cleaner._analyzer = MagicMock()  # swap in a mock analyzer
+        cleaner._analyzer.analyze.return_value = []
+
+        result = cleaner.run(texts=["Call me at 212-555-1234"])
+
+        assert result["texts"][0] == "Call me at <PHONE_NUMBER>"  # output is the mocked anonymizer's .text
+
+    def test_run_multiple_texts(self):
+        cleaner = PresidioTextCleaner()
+        mock_result = MagicMock()
+        mock_result.text = "cleaned"  # same canned output for every input
+        cleaner._anonymizer = MagicMock()
+        cleaner._anonymizer.anonymize.return_value = mock_result
+        cleaner._analyzer = MagicMock()
+        cleaner._analyzer.analyze.return_value = []
+
+        result = cleaner.run(texts=["text 1", "text 2", "text 3"])
+
+        assert len(result["texts"]) == 3  # one output per input text
+
+    def test_run_skips_on_error(self, caplog):
+        cleaner = PresidioTextCleaner()
+        cleaner._analyzer = MagicMock()  # only the analyzer is mocked in this test
+        cleaner._analyzer.analyze.side_effect = Exception("error")  # every analyze() call raises
+
+        with caplog.at_level(logging.WARNING):  # capture WARNING-and-above records for the assertion below
+            result = cleaner.run(texts=["My name is John"])
+
+        assert result["texts"][0] == "My name is John"  # input text comes back unchanged on failure
+        assert "Could not anonymize" in caplog.text  # the failure is reported in the captured log
+
+    def test_run_empty_text(self):
+        cleaner = PresidioTextCleaner()
+        mock_result = MagicMock()
+        mock_result.text = ""  # canned anonymizer output for the empty input
+        cleaner._anonymizer = MagicMock()
+        cleaner._anonymizer.anonymize.return_value = mock_result
+        cleaner._analyzer = MagicMock()
+        cleaner._analyzer.analyze.return_value = []
+
+        result = cleaner.run(texts=[""])
+
+        assert result["texts"][0] == ""  # empty string in, empty string out
+
+    @pytest.mark.integration  # tagged with the "integration" marker
+    def test_run_integration(self):  # no mocks are installed in this test
+        cleaner = PresidioTextCleaner()
+        result = cleaner.run(texts=["Hi, I am Alice and my phone is 212-555-5678"])
+
+        assert len(result["texts"]) == 1
+        assert "Alice" not in result["texts"][0]  # the name must not survive in the output
+        assert "212-555-5678" not in result["texts"][0]  # the phone number must not survive in the output

From b7f035923466ef1732a17e90e2d8c63580dbc375 Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Wed, 1 Apr 2026 12:56:54 +0500
Subject: [PATCH 2/9] fix(presidio): add missing README.md required by
 hatchling build

---
 integrations/presidio/README.md | 44 +++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 integrations/presidio/README.md

diff --git a/integrations/presidio/README.md b/integrations/presidio/README.md
new file mode 100644
index 0000000000..7ad2e30cc3
--- /dev/null
+++ b/integrations/presidio/README.md
@@ -0,0 +1,44 @@
+# presidio-haystack
+
+[![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)
+
+Haystack integration for [Microsoft Presidio](https://microsoft.github.io/presidio/) — PII detection and anonymization.
+
+---
+
+## Installation
+
+```bash
+pip install presidio-haystack
+```
+
+You also need to download the spaCy model used by Presidio:
+
+```bash
+python -m spacy download en_core_web_lg
+```
+
+## Components
+
+- **PresidioDocumentCleaner** — anonymizes PII in `list[Document]`
+- **PresidioTextCleaner** — anonymizes PII in `list[str]` (useful for query sanitization)
+- **PresidioEntityExtractor** — detects PII entities and stores them in Document metadata
+
+## Usage
+
+```python
+from haystack import Document
+from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner
+
+cleaner = PresidioDocumentCleaner()
+result = cleaner.run(documents=[Document(content="My name is John, email: john@example.com")])
+print(result["documents"][0].content)
+# My name is <PERSON>, email: <EMAIL_ADDRESS>
+```
+
+---
+
+## Contributing
+
+Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).

From e1629493cc17c0d218bb3231850e8dd253e10b1c Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Wed, 1 Apr 2026 13:01:48 +0500
Subject: [PATCH 3/9] fix(presidio): fix lint errors and add missing pydoc
 config

---
 integrations/presidio/pydoc/config_docusaurus.yml | 15 +++++++++++++++
 .../presidio/presidio_document_cleaner.py         |  2 +-
 .../presidio/presidio_entity_extractor.py         |  2 +-
 .../presidio/presidio_text_cleaner.py             |  2 +-
 .../tests/test_presidio_document_cleaner.py       | 14 +++++++++-----
 .../tests/test_presidio_entity_extractor.py       | 12 ++++++++----
 6 files changed, 35 insertions(+), 12 deletions(-)
 create mode 100644 integrations/presidio/pydoc/config_docusaurus.yml

diff --git a/integrations/presidio/pydoc/config_docusaurus.yml b/integrations/presidio/pydoc/config_docusaurus.yml
new file mode 100644
index 0000000000..def818e2a9
--- /dev/null
+++ b/integrations/presidio/pydoc/config_docusaurus.yml
@@ -0,0 +1,15 @@
+loaders:
+  - modules:
+      - haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner
+      - haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner
+      - haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor
+    search_path: [../src]
+processors:
+  - type: filter
+    documented_only: true
+    skip_empty_modules: true
+renderer:
+  description: Presidio integration for Haystack
+  id: integrations-presidio
+  filename: presidio.md
+  title: Presidio
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
index 0bdcef7a70..f8e4551270 100644
--- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
@@ -50,7 +50,7 @@ def __init__(
             List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
             If `None`, all supported entity types are used.
         :param score_threshold:
-            Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`.
+            Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`.
         """
         self.language = language
         self.entities = entities
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
index 2f71219de8..fb21b89a11 100644
--- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
@@ -50,7 +50,7 @@ def __init__(
             List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
             If `None`, all supported entity types are detected.
         :param score_threshold:
-            Minimum confidence score (0–1) for a detected entity to be included. Defaults to `0.35`.
+            Minimum confidence score (0-1) for a detected entity to be included. Defaults to `0.35`.
         """
         self.language = language
         self.entities = entities
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
index 9d19479486..c9221d5804 100644
--- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
@@ -47,7 +47,7 @@ def __init__(
             List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`).
             If `None`, all supported entity types are used.
         :param score_threshold:
-            Minimum confidence score (0–1) for a detected entity to be anonymized. Defaults to `0.35`.
+            Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`.
         """
         self.language = language
         self.entities = entities
diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py
index 9bd134437e..2122743b04 100644
--- a/integrations/presidio/tests/test_presidio_document_cleaner.py
+++ b/integrations/presidio/tests/test_presidio_document_cleaner.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 
 import pytest
 from haystack import Document
@@ -28,17 +28,21 @@ def test_init_custom_params(self):
     def test_to_dict(self):
         cleaner = PresidioDocumentCleaner(language="en", entities=["EMAIL_ADDRESS"], score_threshold=0.5)
         data = component_to_dict(cleaner, "PresidioDocumentCleaner")
-        assert (
-            data["type"]
-            == "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner"
+        expected_type = (
+            "haystack_integrations.components.preprocessors.presidio"
+            ".presidio_document_cleaner.PresidioDocumentCleaner"
         )
+        assert data["type"] == expected_type
         assert data["init_parameters"]["language"] == "en"
         assert data["init_parameters"]["entities"] == ["EMAIL_ADDRESS"]
         assert data["init_parameters"]["score_threshold"] == 0.5
 
     def test_from_dict(self):
         data = {
-            "type": "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner",
+            "type": (
+                "haystack_integrations.components.preprocessors.presidio"
+                ".presidio_document_cleaner.PresidioDocumentCleaner"
+            ),
             "init_parameters": {"language": "de", "entities": ["PERSON"], "score_threshold": 0.6},
         }
         cleaner = component_from_dict(PresidioDocumentCleaner, data, "PresidioDocumentCleaner")
diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py
index f82fb013b9..137712bca9 100644
--- a/integrations/presidio/tests/test_presidio_entity_extractor.py
+++ b/integrations/presidio/tests/test_presidio_entity_extractor.py
@@ -22,16 +22,20 @@ def test_init_defaults(self):
     def test_to_dict(self):
         extractor = PresidioEntityExtractor(language="en", entities=["PERSON"], score_threshold=0.6)
         data = component_to_dict(extractor, "PresidioEntityExtractor")
-        assert (
-            data["type"]
-            == "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor"
+        expected_type = (
+            "haystack_integrations.components.preprocessors.presidio"
+            ".presidio_entity_extractor.PresidioEntityExtractor"
         )
+        assert data["type"] == expected_type
         assert data["init_parameters"]["entities"] == ["PERSON"]
         assert data["init_parameters"]["score_threshold"] == 0.6
 
     def test_from_dict(self):
         data = {
-            "type": "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor",
+            "type": (
+                "haystack_integrations.components.preprocessors.presidio"
+                ".presidio_entity_extractor.PresidioEntityExtractor"
+            ),
             "init_parameters": {"language": "en", "entities": ["EMAIL_ADDRESS"], "score_threshold": 0.5},
         }
         extractor = component_from_dict(PresidioEntityExtractor, data, "PresidioEntityExtractor")

From cc518d19821fbf2fa7d560b1eed3bf6165d846f8 Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Wed, 1 Apr 2026 13:05:07 +0500
Subject: [PATCH 4/9] fix(presidio): apply ruff format to test files

---
 integrations/presidio/tests/test_presidio_document_cleaner.py | 3 +--
 integrations/presidio/tests/test_presidio_entity_extractor.py | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py
index 2122743b04..4f170e4913 100644
--- a/integrations/presidio/tests/test_presidio_document_cleaner.py
+++ b/integrations/presidio/tests/test_presidio_document_cleaner.py
@@ -29,8 +29,7 @@ def test_to_dict(self):
         cleaner = PresidioDocumentCleaner(language="en", entities=["EMAIL_ADDRESS"], score_threshold=0.5)
         data = component_to_dict(cleaner, "PresidioDocumentCleaner")
         expected_type = (
-            "haystack_integrations.components.preprocessors.presidio"
-            ".presidio_document_cleaner.PresidioDocumentCleaner"
+            "haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner.PresidioDocumentCleaner"
         )
         assert data["type"] == expected_type
         assert data["init_parameters"]["language"] == "en"
diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py
index 137712bca9..0cbbbf0c72 100644
--- a/integrations/presidio/tests/test_presidio_entity_extractor.py
+++ b/integrations/presidio/tests/test_presidio_entity_extractor.py
@@ -23,8 +23,7 @@ def test_to_dict(self):
         extractor = PresidioEntityExtractor(language="en", entities=["PERSON"], score_threshold=0.6)
         data = component_to_dict(extractor, "PresidioEntityExtractor")
         expected_type = (
-            "haystack_integrations.components.preprocessors.presidio"
-            ".presidio_entity_extractor.PresidioEntityExtractor"
+            "haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor.PresidioEntityExtractor"
         )
         assert data["type"] == expected_type
         assert data["init_parameters"]["entities"] == ["PERSON"]

From a363f89b1aa26fc9d322aab5625595c72b552b26 Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Wed, 1 Apr 2026 13:07:29 +0500
Subject: [PATCH 5/9] fix(presidio): add py.typed marker for mypy type checking

---
 .../src/haystack_integrations/components/preprocessors/py.typed   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 integrations/presidio/src/haystack_integrations/components/preprocessors/py.typed

diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/py.typed b/integrations/presidio/src/haystack_integrations/components/preprocessors/py.typed
new file mode 100644
index 0000000000..e69de29bb2

From a8b2004c790405db8abb899cad73a3a93ac38688 Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Wed, 1 Apr 2026 13:10:22 +0500
Subject: [PATCH 6/9] fix(presidio): suppress mypy arg-type error for presidio
 cross-package type mismatch

---
 .../preprocessors/presidio/presidio_document_cleaner.py         | 2 +-
 .../components/preprocessors/presidio/presidio_text_cleaner.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
index f8e4551270..0e576c1889 100644
--- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
@@ -80,7 +80,7 @@ def run(self, documents: list[Document]) -> dict[str, Any]:
                     entities=self.entities,
                     score_threshold=self.score_threshold,
                 )
-                anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results)
+                anonymized = self._anonymizer.anonymize(text=doc.content, analyzer_results=analyzer_results)  # type: ignore[arg-type]
                 cleaned.append(Document(content=anonymized.text, meta=doc.meta.copy()))
             except Exception as e:
                 logger.warning(
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
index c9221d5804..3b85194db8 100644
--- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
@@ -74,7 +74,7 @@ def run(self, texts: list[str]) -> dict[str, Any]:
                     entities=self.entities,
                     score_threshold=self.score_threshold,
                 )
-                anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results)
+                anonymized = self._anonymizer.anonymize(text=text, analyzer_results=analyzer_results)  # type: ignore[arg-type]
                 cleaned.append(anonymized.text)
             except Exception as e:
                 logger.warning(

From 80c8c1d101d49fe216b9ece084ffd3d989b39acb Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Thu, 2 Apr 2026 19:36:10 +0500
Subject: [PATCH 7/9] Address PR review: update README, add Python 3.14
 support, labeler and coverage entries

---
 .github/labeler.yml                       |  5 ++++
 .github/workflows/CI_coverage_comment.yml |  1 +
 .github/workflows/presidio.yml            |  2 +-
 README.md                                 |  1 +
 integrations/presidio/README.md           | 34 +----------------------
 integrations/presidio/pyproject.toml      |  1 +
 6 files changed, 10 insertions(+), 34 deletions(-)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 4d44b76b10..ffe59fb600 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -198,6 +198,11 @@ integration:pgvector:
       - any-glob-to-any-file: "integrations/pgvector/**/*"
       - any-glob-to-any-file: ".github/workflows/pgvector.yml"
 
+integration:presidio:
+  - changed-files:
+      - any-glob-to-any-file: "integrations/presidio/**/*"
+      - any-glob-to-any-file: ".github/workflows/presidio.yml"
+
 integration:pinecone:
   - changed-files:
       - any-glob-to-any-file: "integrations/pinecone/**/*"
diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml
index f4b83385a5..29373df7d8 100644
--- a/.github/workflows/CI_coverage_comment.yml
+++ b/.github/workflows/CI_coverage_comment.yml
@@ -42,6 +42,7 @@ on:
       - "Test / optimum"
       - "Test / paddleocr"
       - "Test / pgvector"
+      - "Test / presidio"
       - "Test / pinecone"
       - "Test / pyversity"
       - "Test / qdrant"
diff --git a/.github/workflows/presidio.yml b/.github/workflows/presidio.yml
index 7395bde691..597d5fb208 100644
--- a/.github/workflows/presidio.yml
+++ b/.github/workflows/presidio.yml
@@ -29,7 +29,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.10", "3.13"]
+        python-version: ["3.10", "3.14"]
 
     steps:
       - uses: actions/checkout@v4
diff --git a/README.md b/README.md
index f52e01b852..e123e7bde9 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
 | [opensearch-haystack](integrations/opensearch/)                         | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack)                                 | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml)                                     | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch-combined/htmlcov/index.html) |
 | [optimum-haystack](integrations/optimum/)                               | Embedder                    | [![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack)                                       | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml)                                              | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum-combined/htmlcov/index.html) |
 | [paddleocr-haystack](integrations/paddleocr/)                           | Converter                    | [![PyPI - Version](https://img.shields.io/pypi/v/paddleocr-haystack.svg)](https://pypi.org/project/paddleocr-haystack)                                   | [![Test / paddleocr](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml)                                        | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr-combined/htmlcov/index.html) |
+| [presidio-haystack](integrations/presidio/)                             | Preprocessor                | [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)                                     | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml)                                           | | |
 | [pinecone-haystack](integrations/pinecone/)                             | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack)                        | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml)                                           | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone-combined/htmlcov/index.html) |
 | [pgvector-haystack](integrations/pgvector/)                             | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack)                        | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml)                                           | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector-combined/htmlcov/index.html) |
 | [pyversity-haystack](integrations/pyversity/)                           | Ranker                      | [![PyPI - Version](https://img.shields.io/pypi/v/pyversity-haystack.svg)](https://pypi.org/project/pyversity-haystack)                                   | [![Test / pyversity](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml)                                        | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pyversity/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pyversity/htmlcov/index.html) |  |
diff --git a/integrations/presidio/README.md b/integrations/presidio/README.md
index 7ad2e30cc3..c29ac70462 100644
--- a/integrations/presidio/README.md
+++ b/integrations/presidio/README.md
@@ -3,39 +3,7 @@
 [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)
 
-Haystack integration for [Microsoft Presidio](https://microsoft.github.io/presidio/) — PII detection and anonymization.
-
----
-
-## Installation
-
-```bash
-pip install presidio-haystack
-```
-
-You also need to download the spaCy model used by Presidio:
-
-```bash
-python -m spacy download en_core_web_lg
-```
-
-## Components
-
-- **PresidioDocumentCleaner** — anonymizes PII in `list[Document]`
-- **PresidioTextCleaner** — anonymizes PII in `list[str]` (useful for query sanitization)
-- **PresidioEntityExtractor** — detects PII entities and stores them in Document metadata
-
-## Usage
-
-```python
-from haystack import Document
-from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner
-
-cleaner = PresidioDocumentCleaner()
-result = cleaner.run(documents=[Document(content="My name is John, email: john@example.com")])
-print(result["documents"][0].content)
-# My name is <PERSON>, email: <EMAIL_ADDRESS>
-```
+- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/presidio/CHANGELOG.md)
 
 ---
 
diff --git a/integrations/presidio/pyproject.toml b/integrations/presidio/pyproject.toml
index 1a618957e4..c8e166796f 100644
--- a/integrations/presidio/pyproject.toml
+++ b/integrations/presidio/pyproject.toml
@@ -19,6 +19,7 @@ classifiers = [
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
   "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3.14",
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]

From b1377c0b4dbca548d1100b0bd58db07a4bc589d7 Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Thu, 9 Apr 2026 16:51:39 +0500
Subject: [PATCH 8/9] =?UTF-8?q?fix(presidio):=20address=20reviewer=20feedb?=
 =?UTF-8?q?ack=20=E2=80=94=20fix=20README=20format,=20alphabetical=20order?=
 =?UTF-8?q?ing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/CI_coverage_comment.yml | 2 +-
 README.md                                 | 2 +-
 integrations/presidio/README.md           | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml
index 29373df7d8..7c80a698cf 100644
--- a/.github/workflows/CI_coverage_comment.yml
+++ b/.github/workflows/CI_coverage_comment.yml
@@ -42,8 +42,8 @@ on:
       - "Test / optimum"
       - "Test / paddleocr"
       - "Test / pgvector"
-      - "Test / presidio"
       - "Test / pinecone"
+      - "Test / presidio"
       - "Test / pyversity"
       - "Test / qdrant"
       - "Test / ragas"
diff --git a/README.md b/README.md
index e123e7bde9..32c0508234 100644
--- a/README.md
+++ b/README.md
@@ -65,9 +65,9 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
 | [opensearch-haystack](integrations/opensearch/)                         | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack)                                 | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml)                                     | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch-combined/htmlcov/index.html) |
 | [optimum-haystack](integrations/optimum/)                               | Embedder                    | [![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack)                                       | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml)                                              | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum-combined/htmlcov/index.html) |
 | [paddleocr-haystack](integrations/paddleocr/)                           | Converter                    | [![PyPI - Version](https://img.shields.io/pypi/v/paddleocr-haystack.svg)](https://pypi.org/project/paddleocr-haystack)                                   | [![Test / paddleocr](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml)                                        | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr-combined/htmlcov/index.html) |
-| [presidio-haystack](integrations/presidio/)                             | Preprocessor                | [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)                                     | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml)                                           | | |
 | [pinecone-haystack](integrations/pinecone/)                             | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack)                        | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml)                                           | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone-combined/htmlcov/index.html) |
 | [pgvector-haystack](integrations/pgvector/)                             | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack)                        | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml)                                           | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector-combined/htmlcov/index.html) |
+| [presidio-haystack](integrations/presidio/)                             | Preprocessor                | [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)                                     | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml)                                           | | |
 | [pyversity-haystack](integrations/pyversity/)                           | Ranker                      | [![PyPI - Version](https://img.shields.io/pypi/v/pyversity-haystack.svg)](https://pypi.org/project/pyversity-haystack)                                   | [![Test / pyversity](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml)                                        | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pyversity/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pyversity/htmlcov/index.html) |  |
 | [qdrant-haystack](integrations/qdrant/)                                 | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack)                            | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml)                                                 | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant-combined/htmlcov/index.html) |
 | [ragas-haystack](integrations/ragas/)                                   | Evaluator                   | [![PyPI - Version](https://img.shields.io/pypi/v/ragas-haystack.svg)](https://pypi.org/project/ragas-haystack)                                           | [![Test / ragas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml)                                                    | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-ragas/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-ragas/htmlcov/index.html) |  |
diff --git a/integrations/presidio/README.md b/integrations/presidio/README.md
index c29ac70462..2f9e57089b 100644
--- a/integrations/presidio/README.md
+++ b/integrations/presidio/README.md
@@ -7,6 +7,4 @@
 
 ---
 
-## Contributing
-
 Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).

From 7e15ec3c0706ff3dacd9dfc3afc180ff7c4239c3 Mon Sep 17 00:00:00 2001
From: SyedShahmeerAli12 <syedshahmeerali196@gmail.com>
Date: Thu, 9 Apr 2026 18:56:49 +0500
Subject: [PATCH 9/9] =?UTF-8?q?fix(presidio):=20address=20reviewer=20feedb?=
 =?UTF-8?q?ack=20=E2=80=94=20keyword-only=20args,=20warm=5Fup,=20type=20hi?=
 =?UTF-8?q?nts,=20dataclasses.replace,=20doc=20links?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Make all parameters of the three component __init__ methods keyword-only (bare `*` marker)
- Move AnalyzerEngine/AnonymizerEngine initialization to warm_up() since they load spaCy ML models
- Narrow run() return types from dict[str, Any] to dict[str, list[Document]] / dict[str, list[str]]
- Use dataclasses.replace() in PresidioEntityExtractor instead of Document()
- Add Presidio documentation links for language, entities, and score_threshold params
- Update integration tests to call warm_up() before run()
- Add missing _anonymizer mock in test_run_skips_on_error tests
---
 .../presidio/presidio_document_cleaner.py     | 30 +++++++++++++++----
 .../presidio/presidio_entity_extractor.py     | 28 ++++++++++++++---
 .../presidio/presidio_text_cleaner.py         | 30 +++++++++++++++----
 .../tests/test_presidio_document_cleaner.py   |  2 ++
 .../tests/test_presidio_entity_extractor.py   |  1 +
 .../tests/test_presidio_text_cleaner.py       |  2 ++
 6 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
index 0e576c1889..348593f440 100644
--- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py
@@ -2,8 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Any
-
 from haystack import Document, component, logging
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
@@ -22,6 +20,8 @@ class PresidioDocumentCleaner:
 
     Documents without text content are passed through unchanged.
 
+    Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines.
+
     ### Usage example
 
     ```python
@@ -29,6 +29,7 @@ class PresidioDocumentCleaner:
     from haystack_integrations.components.preprocessors.presidio import PresidioDocumentCleaner
 
     cleaner = PresidioDocumentCleaner()
+    cleaner.warm_up()
     result = cleaner.run(documents=[Document(content="My name is John and my email is john@example.com")])
     print(result["documents"][0].content)
     # My name is <PERSON> and my email is <EMAIL_ADDRESS>
@@ -37,6 +38,7 @@ class PresidioDocumentCleaner:
 
     def __init__(
         self,
+        *,
         language: str = "en",
         entities: list[str] | None = None,
         score_threshold: float = 0.35,
@@ -46,20 +48,35 @@ def __init__(
 
         :param language:
             Language code for PII detection. Defaults to `"en"`.
+            See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
         :param entities:
             List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
             If `None`, all supported entity types are used.
+            See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/).
         :param score_threshold:
             Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`.
+            See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
         """
         self.language = language
         self.entities = entities
         self.score_threshold = score_threshold
-        self._analyzer = AnalyzerEngine()
-        self._anonymizer = AnonymizerEngine()
+        self._analyzer: AnalyzerEngine | None = None
+        self._anonymizer: AnonymizerEngine | None = None
+
+    def warm_up(self) -> None:
+        """
+        Initializes the Presidio analyzer and anonymizer engines.
+
+        This method loads the underlying NLP models and should be called before `run()`.
+        In a Haystack Pipeline, this is called automatically before the first run.
+        """
+        if self._analyzer is None:
+            self._analyzer = AnalyzerEngine()
+        if self._anonymizer is None:
+            self._anonymizer = AnonymizerEngine()
 
     @component.output_types(documents=list[Document])
-    def run(self, documents: list[Document]) -> dict[str, Any]:
+    def run(self, documents: list[Document]) -> dict[str, list[Document]]:
         """
         Anonymizes PII in the provided Documents.
 
@@ -73,6 +90,9 @@ def run(self, documents: list[Document]) -> dict[str, Any]:
             if doc.content is None:
                 cleaned.append(doc)
                 continue
+            if self._analyzer is None or self._anonymizer is None:
+                msg = "The component was not warmed up. Call warm_up() before running it."
+                raise RuntimeError(msg)
             try:
                 analyzer_results = self._analyzer.analyze(
                     text=doc.content,
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
index fb21b89a11..7b1b42b3d5 100644
--- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Any
+from dataclasses import replace
 
 from haystack import Document, component, logging
 from presidio_analyzer import AnalyzerEngine
@@ -21,6 +21,8 @@ class PresidioEntityExtractor:
 
     Original Documents are not mutated. Documents without text content are passed through unchanged.
 
+    Call `warm_up()` before running this component to load the Presidio analyzer engine.
+
     ### Usage example
 
     ```python
@@ -28,6 +30,7 @@ class PresidioEntityExtractor:
     from haystack_integrations.components.preprocessors.presidio import PresidioEntityExtractor
 
     extractor = PresidioEntityExtractor()
+    extractor.warm_up()
     result = extractor.run(documents=[Document(content="Contact Alice at alice@example.com")])
     print(result["documents"][0].meta["entities"])
     # [{"entity_type": "PERSON", "start": 8, "end": 13, "score": 0.85},
@@ -37,6 +40,7 @@ class PresidioEntityExtractor:
 
     def __init__(
         self,
+        *,
         language: str = "en",
         entities: list[str] | None = None,
         score_threshold: float = 0.35,
@@ -46,19 +50,32 @@ def __init__(
 
         :param language:
             Language code for PII detection. Defaults to `"en"`.
+            See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
         :param entities:
             List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
             If `None`, all supported entity types are detected.
+            See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/).
         :param score_threshold:
             Minimum confidence score (0-1) for a detected entity to be included. Defaults to `0.35`.
+            See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
         """
         self.language = language
         self.entities = entities
         self.score_threshold = score_threshold
-        self._analyzer = AnalyzerEngine()
+        self._analyzer: AnalyzerEngine | None = None
+
+    def warm_up(self) -> None:
+        """
+        Initializes the Presidio analyzer engine.
+
+        This method loads the underlying NLP models and should be called before `run()`.
+        In a Haystack Pipeline, this is called automatically before the first run.
+        """
+        if self._analyzer is None:
+            self._analyzer = AnalyzerEngine()
 
     @component.output_types(documents=list[Document])
-    def run(self, documents: list[Document]) -> dict[str, Any]:
+    def run(self, documents: list[Document]) -> dict[str, list[Document]]:
         """
         Detects PII entities in the provided Documents.
 
@@ -73,6 +90,9 @@ def run(self, documents: list[Document]) -> dict[str, Any]:
             if doc.content is None:
                 result_docs.append(doc)
                 continue
+            if self._analyzer is None:
+                msg = "The component was not warmed up. Call warm_up() before running it."
+                raise RuntimeError(msg)
             try:
                 analyzer_results = self._analyzer.analyze(
                     text=doc.content,
@@ -89,7 +109,7 @@ def run(self, documents: list[Document]) -> dict[str, Any]:
                     }
                     for r in analyzer_results
                 ]
-                result_docs.append(Document(content=doc.content, meta={**doc.meta, "entities": entities}))
+                result_docs.append(replace(doc, meta={**doc.meta, "entities": entities}))
             except Exception as e:
                 logger.warning(
                     "Could not extract entities from document {doc_id}. Skipping it. Error: {error}",
diff --git a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
index 3b85194db8..d20f889c19 100644
--- a/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
+++ b/integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py
@@ -2,8 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Any
-
 from haystack import component, logging
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
@@ -20,12 +18,15 @@ class PresidioTextCleaner:
     a new list of strings with PII replaced by entity type placeholders (e.g. `<PERSON>`).
     Useful for sanitizing user queries before they are sent to an LLM.
 
+    Call `warm_up()` before running this component to load the Presidio analyzer and anonymizer engines.
+
     ### Usage example
 
     ```python
     from haystack_integrations.components.preprocessors.presidio import PresidioTextCleaner
 
     cleaner = PresidioTextCleaner()
+    cleaner.warm_up()
     result = cleaner.run(texts=["Hi, I am John Smith, call me at 212-555-1234"])
     print(result["texts"][0])
     # Hi, I am <PERSON>, call me at <PHONE_NUMBER>
@@ -34,6 +35,7 @@ class PresidioTextCleaner:
 
     def __init__(
         self,
+        *,
         language: str = "en",
         entities: list[str] | None = None,
         score_threshold: float = 0.35,
@@ -43,20 +45,35 @@ def __init__(
 
         :param language:
             Language code for PII detection. Defaults to `"en"`.
+            See [Presidio supported languages](https://microsoft.github.io/presidio/supported_languages/).
         :param entities:
             List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`).
             If `None`, all supported entity types are used.
+            See [Presidio supported entities](https://microsoft.github.io/presidio/supported_entities/).
         :param score_threshold:
             Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`.
+            See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
         """
         self.language = language
         self.entities = entities
         self.score_threshold = score_threshold
-        self._analyzer = AnalyzerEngine()
-        self._anonymizer = AnonymizerEngine()
+        self._analyzer: AnalyzerEngine | None = None
+        self._anonymizer: AnonymizerEngine | None = None
+
+    def warm_up(self) -> None:
+        """
+        Initializes the Presidio analyzer and anonymizer engines.
+
+        This method loads the underlying NLP models and should be called before `run()`.
+        In a Haystack Pipeline, this is called automatically before the first run.
+        """
+        if self._analyzer is None:
+            self._analyzer = AnalyzerEngine()
+        if self._anonymizer is None:
+            self._anonymizer = AnonymizerEngine()
 
     @component.output_types(texts=list[str])
-    def run(self, texts: list[str]) -> dict[str, Any]:
+    def run(self, texts: list[str]) -> dict[str, list[str]]:
         """
         Anonymizes PII in the provided strings.
 
@@ -65,6 +82,9 @@ def run(self, texts: list[str]) -> dict[str, Any]:
         :returns:
             A dictionary with key `texts` containing the cleaned strings.
         """
+        if self._analyzer is None or self._anonymizer is None:
+            msg = "The component was not warmed up. Call warm_up() before running it."
+            raise RuntimeError(msg)
         cleaned: list[str] = []
         for text in texts:
             try:
diff --git a/integrations/presidio/tests/test_presidio_document_cleaner.py b/integrations/presidio/tests/test_presidio_document_cleaner.py
index 4f170e4913..7c1d35107f 100644
--- a/integrations/presidio/tests/test_presidio_document_cleaner.py
+++ b/integrations/presidio/tests/test_presidio_document_cleaner.py
@@ -106,6 +106,7 @@ def test_run_skips_on_error(self, caplog):
         cleaner = PresidioDocumentCleaner()
         cleaner._analyzer = MagicMock()
         cleaner._analyzer.analyze.side_effect = Exception("Analyzer error")
+        cleaner._anonymizer = MagicMock()
 
         doc = Document(content="Some text with PII")
         with caplog.at_level(logging.WARNING):
@@ -147,6 +148,7 @@ def test_run_passes_language_and_entities_to_analyzer(self):
     @pytest.mark.integration
     def test_run_integration(self):
         cleaner = PresidioDocumentCleaner()
+        cleaner.warm_up()
         docs = [Document(content="My name is John Smith and my email is john@example.com")]
         result = cleaner.run(documents=docs)
 
diff --git a/integrations/presidio/tests/test_presidio_entity_extractor.py b/integrations/presidio/tests/test_presidio_entity_extractor.py
index 0cbbbf0c72..77d73a0250 100644
--- a/integrations/presidio/tests/test_presidio_entity_extractor.py
+++ b/integrations/presidio/tests/test_presidio_entity_extractor.py
@@ -116,6 +116,7 @@ def test_run_preserves_existing_metadata(self):
     @pytest.mark.integration
     def test_run_integration(self):
         extractor = PresidioEntityExtractor()
+        extractor.warm_up()
         docs = [Document(content="Contact Alice at alice@example.com")]
         result = extractor.run(documents=docs)
 
diff --git a/integrations/presidio/tests/test_presidio_text_cleaner.py b/integrations/presidio/tests/test_presidio_text_cleaner.py
index 8c1922bbd3..030ac5057b 100644
--- a/integrations/presidio/tests/test_presidio_text_cleaner.py
+++ b/integrations/presidio/tests/test_presidio_text_cleaner.py
@@ -65,6 +65,7 @@ def test_run_skips_on_error(self, caplog):
         cleaner = PresidioTextCleaner()
         cleaner._analyzer = MagicMock()
         cleaner._analyzer.analyze.side_effect = Exception("error")
+        cleaner._anonymizer = MagicMock()
 
         with caplog.at_level(logging.WARNING):
             result = cleaner.run(texts=["My name is John"])
@@ -88,6 +89,7 @@ def test_run_empty_text(self):
     @pytest.mark.integration
     def test_run_integration(self):
         cleaner = PresidioTextCleaner()
+        cleaner.warm_up()
         result = cleaner.run(texts=["Hi, I am Alice and my phone is 212-555-5678"])
 
         assert len(result["texts"]) == 1