Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ integration:pgvector:
- any-glob-to-any-file: "integrations/pgvector/**/*"
- any-glob-to-any-file: ".github/workflows/pgvector.yml"

# Label for changes under integrations/presidio/ or to the Presidio CI workflow file.
integration:presidio:
  - changed-files:
      - any-glob-to-any-file: "integrations/presidio/**/*"
      - any-glob-to-any-file: ".github/workflows/presidio.yml"

integration:pinecone:
- changed-files:
- any-glob-to-any-file: "integrations/pinecone/**/*"
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/CI_coverage_comment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ on:
- "Test / paddleocr"
- "Test / pgvector"
- "Test / pinecone"
- "Test / presidio"
- "Test / pyversity"
- "Test / qdrant"
- "Test / ragas"
Expand Down
72 changes: 72 additions & 0 deletions .github/workflows/presidio.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# CI for the Presidio integration: lint, type-check and unit tests.
name: Test / presidio

on:
  # Scheduled run: the cron expression fires once a day at 00:00.
  schedule:
    - cron: "0 0 * * *"
  # Pull-request run: only when the integration or this workflow file changes.
  # The "!" pattern excludes Markdown files directly under integrations/presidio/.
  pull_request:
    paths:
      - "integrations/presidio/**"
      - "!integrations/presidio/*.md"
      - ".github/workflows/presidio.yml"

# Every `run:` step executes from the integration's own directory.
defaults:
  run:
    working-directory: integrations/presidio

# Runs that share a head ref share one group; a newer run cancels the one in progress.
# NOTE(review): github.head_ref is only populated for pull_request events, so
# scheduled runs presumably all fall into the group "presidio-" - confirm this is intended.
concurrency:
  group: presidio-${{ github.head_ref }}
  cancel-in-progress: true

env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"

jobs:
  run:
    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the other matrix legs finish even if one of them fails.
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        # Versions are quoted so YAML keeps "3.10" as a string rather than the float 3.1.
        python-version: ["3.10", "3.14"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Hatch
        run: pip install hatch

      # Formatting and type checks run on a single leg only (Python 3.10 on Linux).
      - name: Lint
        if: matrix.python-version == '3.10' && runner.os == 'Linux'
        run: hatch run fmt-check && hatch run test:types

      - name: Run unit tests
        run: hatch run test:unit-cov-retry

      # Resolve the lowest allowed direct dependency versions, install them into
      # the test environment, then run the unit tests again against that set.
      - name: Run unit tests with lowest direct dependencies
        run: |
          hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
          hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
          hatch run test:unit

      # Scheduled runs only: drop the existing environments, install Haystack
      # from its main branch, and run the unit tests against it.
      - name: Nightly - run unit tests with Haystack main branch
        if: github.event_name == 'schedule'
        run: |
          hatch env prune
          hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
          hatch run test:unit

  # Sends a Slack notification when the scheduled run of the `run` job fails.
  notify-slack-on-failure:
    needs: run
    if: failure() && github.event_name == 'schedule'
    runs-on: ubuntu-latest
    steps:
      - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
        with:
          slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
| [paddleocr-haystack](integrations/paddleocr/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/paddleocr-haystack.svg)](https://pypi.org/project/paddleocr-haystack) | [![Test / paddleocr](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr-combined/htmlcov/index.html) |
| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone-combined/htmlcov/index.html) |
| [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector-combined/htmlcov/index.html) |
| [presidio-haystack](integrations/presidio/) | Preprocessor | [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml) | | |
| [pyversity-haystack](integrations/pyversity/) | Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/pyversity-haystack.svg)](https://pypi.org/project/pyversity-haystack) | [![Test / pyversity](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pyversity/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pyversity/htmlcov/index.html) | |
| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant-combined/htmlcov/index.html) |
| [ragas-haystack](integrations/ragas/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/ragas-haystack.svg)](https://pypi.org/project/ragas-haystack) | [![Test / ragas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-ragas/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-ragas/htmlcov/index.html) | |
Expand Down
10 changes: 10 additions & 0 deletions integrations/presidio/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# presidio-haystack

[![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)

- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/presidio/CHANGELOG.md)

---

Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
15 changes: 15 additions & 0 deletions integrations/presidio/pydoc/config_docusaurus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# API-docs configuration for the Presidio integration
# (run via the `docs` script: `haystack-pydoc pydoc/config_docusaurus.yml`).
loaders:
  - modules:
      - haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner
      - haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner
      - haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor
    # Path is relative to this pydoc/ directory.
    search_path: [../src]
processors:
  - type: filter
    documented_only: true
    skip_empty_modules: true
renderer:
  description: Presidio integration for Haystack
  id: integrations-presidio
  filename: presidio.md
  title: Presidio
167 changes: 167 additions & 0 deletions integrations/presidio/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# Build backend: hatchling, with the hatch-vcs plugin available at build time.
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "presidio-haystack"
# The version is not hard-coded; it is supplied dynamically (see [tool.hatch.version]).
dynamic = ["version"]
description = "Haystack integration for Microsoft Presidio — PII detection and anonymization"
readme = "README.md"
requires-python = ">=3.10"
license = "Apache-2.0"
keywords = ["Haystack", "Presidio", "PII", "anonymization", "privacy", "NLP"]
authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }]
classifiers = [
  "License :: OSI Approved :: Apache Software License",
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: 3.14",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
]
# Runtime requirements: Haystack plus the two Presidio packages.
dependencies = [
  "haystack-ai>=2.9.0",
  "presidio-analyzer>=2.2.0",
  "presidio-anonymizer>=2.2.0",
]

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio#readme"
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio"

[tool.hatch.build.targets.wheel]
packages = ["src/haystack_integrations"]

# The version is read from VCS tags shaped like "integrations/presidio-v<version>".
[tool.hatch.version]
source = "vcs"
tag-pattern = 'integrations\/presidio-v(?P<version>.*)'

[tool.hatch.version.raw-options]
# The repository root is two levels above this project directory.
root = "../.."
git_describe_command = 'git describe --tags --match="integrations/presidio-v[0-9]*"'

# Default environment: documentation and formatting tools, installed with uv.
[tool.hatch.envs.default]
installer = "uv"
dependencies = ["haystack-pydoc-tools", "ruff"]

[tool.hatch.envs.default.scripts]
docs = ["haystack-pydoc pydoc/config_docusaurus.yml"]
# `fmt` applies fixes and reformats; `fmt-check` only reports and changes nothing.
fmt = "ruff check --fix {args}; ruff format {args}"
fmt-check = "ruff check {args} && ruff format --check {args}"

# Test environment: pytest with its plugins, plus mypy for the `types` script.
[tool.hatch.envs.test]
dependencies = [
  "pytest",
  "pytest-asyncio",
  "pytest-cov",
  "pytest-rerunfailures",
  "mypy",
  "pip",
]

[tool.hatch.envs.test.scripts]
# Tests are split by the "integration" pytest marker; `{args:tests}` defaults to the tests/ directory.
unit = 'pytest -m "not integration" {args:tests}'
integration = 'pytest -m "integration" {args:tests}'
all = 'pytest {args:tests}'
# Unit tests with coverage; failed tests are rerun up to 3 times, 30 seconds apart.
unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
types = "mypy -p haystack_integrations.components.preprocessors.presidio {args}"

[tool.mypy]
# Install missing stub packages automatically, without prompting.
install_types = true
non_interactive = true
check_untyped_defs = true
disallow_incomplete_defs = true

# Do not report missing type information for imports of the Presidio packages.
[[tool.mypy.overrides]]
module = [
  "presidio_analyzer",
  "presidio_analyzer.*",
  "presidio_anonymizer",
  "presidio_anonymizer.*",
]
ignore_missing_imports = true

[tool.ruff]
line-length = 120

[tool.ruff.lint]
# Enabled rule families and individual rules.
select = [
  "A",
  "ANN",
  "ARG",
  "B",
  "C",
  "D102",
  "D103",
  "D205",
  "D209",
  "D213",
  "D417",
  "D419",
  "DTZ",
  "E",
  "EM",
  "F",
  "I",
  "ICN",
  "ISC",
  "N",
  "PLC",
  "PLE",
  "PLR",
  "PLW",
  "Q",
  "RUF",
  "S",
  "T",
  "TID",
  "UP",
  "W",
  "YTT",
]
ignore = [
  # Empty method in an abstract base class without an abstract decorator
  "B027",
  # Function call in an argument default
  "B008",
  # Possible hardcoded passwords
  "S105",
  "S106",
  "S107",
  # Complexity limits (structure, returns, branches, arguments, statements)
  "C901",
  "PLR0911",
  "PLR0912",
  "PLR0913",
  "PLR0915",
  # `Any` used in a type annotation
  "ANN401",
]

[tool.ruff.lint.isort]
known-first-party = ["haystack_integrations"]

[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "parents"

[tool.ruff.lint.per-file-ignores]
# Tests may use magic values, asserts and relative imports, and skip docstrings and annotations.
"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"]

# Coverage is measured on the haystack_integrations package, including branch coverage.
[tool.coverage.run]
source = ["haystack_integrations"]
branch = true
relative_files = true
parallel = false

[tool.coverage.report]
# Test files and package __init__ modules are left out of the report.
omit = ["*/tests/*", "*/__init__.py"]
show_missing = true
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]

[tool.pytest.ini_options]
# --strict-markers: a marker that is not registered under `markers` is an error.
addopts = "--strict-markers"
markers = [
  "integration: integration tests",
]
log_cli = true
asyncio_default_fixture_loop_scope = "function"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner import PresidioDocumentCleaner
from haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor import PresidioEntityExtractor
from haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner import PresidioTextCleaner

# Names re-exported as the package's public API.
__all__ = [
    "PresidioDocumentCleaner",
    "PresidioEntityExtractor",
    "PresidioTextCleaner",
]
Loading
Loading