Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ integration:pgvector:
- any-glob-to-any-file: "integrations/pgvector/**/*"
- any-glob-to-any-file: ".github/workflows/pgvector.yml"

integration:presidio:
- changed-files:
- any-glob-to-any-file: "integrations/presidio/**/*"
- any-glob-to-any-file: ".github/workflows/presidio.yml"

integration:pinecone:
- changed-files:
- any-glob-to-any-file: "integrations/pinecone/**/*"
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/CI_coverage_comment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ on:
- "Test / paddleocr"
- "Test / pgvector"
- "Test / pinecone"
- "Test / presidio"
- "Test / pyversity"
- "Test / qdrant"
- "Test / ragas"
Expand Down
133 changes: 133 additions & 0 deletions .github/workflows/presidio.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / presidio
Comment thread
sjrl marked this conversation as resolved.

on:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- "integrations/presidio/**"
- "!integrations/presidio/*.md"
- ".github/workflows/presidio.yml"
push:
branches:
- main
paths:
- "integrations/presidio/**"
- "!integrations/presidio/*.md"
- ".github/workflows/presidio.yml"

defaults:
run:
working-directory: integrations/presidio

concurrency:
group: presidio-${{ github.head_ref || github.sha }}
cancel-in-progress: true

env:
PYTHONUNBUFFERED: "1"
FORCE_COLOR: "1"
TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]'
TEST_MATRIX_PYTHON: '["3.10", "3.14"]'

jobs:
compute-test-matrix:
runs-on: ubuntu-slim
defaults:
run:
working-directory: .
outputs:
os: ${{ steps.set.outputs.os }}
python-version: ${{ steps.set.outputs.python-version }}
steps:
- id: set
run: |
echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"

run:
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
needs: compute-test-matrix
permissions:
contents: write
pull-requests: write
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}

steps:
- name: Support longpaths
if: matrix.os == 'windows-latest'
working-directory: .
run: git config --system core.longpaths true

- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: ${{ matrix.python-version }}

- name: Install Hatch
run: pip install hatch

- name: Lint
if: matrix.python-version == '3.10' && runner.os == 'Linux'
run: hatch run fmt-check && hatch run test:types

- name: Run unit tests
run: hatch run test:unit-cov-retry

# On PR: generates coverage comment artifact. On push to main: stores coverage baseline on data branch.
- name: Store unit tests coverage
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/presidio
SUBPROJECT_ID: presidio
COMMENT_ARTIFACT_NAME: coverage-comment-presidio
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60

- name: Run integration tests
run: hatch run test:integration-cov-append-retry

- name: Store combined coverage
if: github.event_name == 'push'
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/presidio
SUBPROJECT_ID: presidio-combined
COMMENT_ARTIFACT_NAME: coverage-comment-presidio-combined
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60

- name: Run unit tests with lowest direct dependencies
if: github.event_name != 'push'
run: |
hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
hatch run test:unit

- name: Nightly - run unit tests with Haystack main branch
if: github.event_name == 'schedule'
run: |
hatch env prune
hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
hatch run test:unit

notify-slack-on-failure:
needs: run
if: failure() && github.event_name == 'schedule'
runs-on: ubuntu-slim
steps:
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
with:
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
| [paddleocr-haystack](integrations/paddleocr/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/paddleocr-haystack.svg)](https://pypi.org/project/paddleocr-haystack) | [![Test / paddleocr](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/paddleocr.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-paddleocr-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-paddleocr-combined/htmlcov/index.html) |
| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pinecone-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pinecone-combined/htmlcov/index.html) |
| [pgvector-haystack](integrations/pgvector/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pgvector-haystack.svg?color=orange)](https://pypi.org/project/pgvector-haystack) | [![Test / pgvector](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pgvector.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pgvector-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pgvector-combined/htmlcov/index.html) |
| [presidio-haystack](integrations/presidio/) | Preprocessor | [![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack) | [![Test / presidio](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/presidio.yml) | | |
| [pyversity-haystack](integrations/pyversity/) | Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/pyversity-haystack.svg)](https://pypi.org/project/pyversity-haystack) | [![Test / pyversity](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pyversity.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-pyversity/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-pyversity/htmlcov/index.html) | |
| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-qdrant-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-qdrant-combined/htmlcov/index.html) |
| [ragas-haystack](integrations/ragas/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/ragas-haystack.svg)](https://pypi.org/project/ragas-haystack) | [![Test / ragas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-ragas/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-ragas/htmlcov/index.html) | |
Expand Down
10 changes: 10 additions & 0 deletions integrations/presidio/README.md
Comment thread
sjrl marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# presidio-haystack

[![PyPI - Version](https://img.shields.io/pypi/v/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/presidio-haystack.svg)](https://pypi.org/project/presidio-haystack)

- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/presidio/CHANGELOG.md)

---

Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
15 changes: 15 additions & 0 deletions integrations/presidio/pydoc/config_docusaurus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
loaders:
- modules:
- haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner
- haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner
- haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor
search_path: [../src]
processors:
- type: filter
documented_only: true
skip_empty_modules: true
renderer:
description: Presidio integration for Haystack
id: integrations-presidio
filename: presidio.md
title: Presidio
168 changes: 168 additions & 0 deletions integrations/presidio/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "presidio-haystack"
dynamic = ["version"]
description = "Haystack integration for Microsoft Presidio — PII detection and anonymization"
readme = "README.md"
requires-python = ">=3.10"
license = "Apache-2.0"
keywords = ["Haystack", "Presidio", "PII", "anonymization", "privacy", "NLP"]
authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }]
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
Comment thread
sjrl marked this conversation as resolved.
"Programming Language :: Python :: 3.14",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"haystack-ai>=2.9.0",
"presidio-analyzer>=2.2.0",
"presidio-anonymizer>=2.2.0",
]

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio#readme"
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/presidio"

[tool.hatch.build.targets.wheel]
packages = ["src/haystack_integrations"]

[tool.hatch.version]
source = "vcs"
tag-pattern = 'integrations\/presidio-v(?P<version>.*)'

[tool.hatch.version.raw-options]
root = "../.."
git_describe_command = 'git describe --tags --match="integrations/presidio-v[0-9]*"'

[tool.hatch.envs.default]
installer = "uv"
dependencies = ["haystack-pydoc-tools", "ruff"]

[tool.hatch.envs.default.scripts]
docs = ["haystack-pydoc pydoc/config_docusaurus.yml"]
fmt = "ruff check --fix {args}; ruff format {args}"
fmt-check = "ruff check {args} && ruff format --check {args}"

[tool.hatch.envs.test]
dependencies = [
"pytest",
"pytest-asyncio",
"pytest-cov",
"pytest-rerunfailures",
"mypy",
"pip",
]

[tool.hatch.envs.test.scripts]
unit = 'pytest -m "not integration" {args:tests}'
integration = 'pytest -m "integration" {args:tests}'
all = 'pytest {args:tests}'
unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}'
types = "mypy -p haystack_integrations.components.preprocessors.presidio {args}"

[tool.mypy]
install_types = true
non_interactive = true
check_untyped_defs = true
disallow_incomplete_defs = true

[[tool.mypy.overrides]]
module = [
"presidio_analyzer",
"presidio_analyzer.*",
"presidio_anonymizer",
"presidio_anonymizer.*",
]
ignore_missing_imports = true

[tool.ruff]
line-length = 120

[tool.ruff.lint]
select = [
"A",
"ANN",
"ARG",
"B",
"C",
"D102",
"D103",
"D205",
"D209",
"D213",
"D417",
"D419",
"DTZ",
"E",
"EM",
"F",
"I",
"ICN",
"ISC",
"N",
"PLC",
"PLE",
"PLR",
"PLW",
"Q",
"RUF",
"S",
"T",
"TID",
"UP",
"W",
"YTT",
]
ignore = [
"B027",
"B008",
"S105",
"S106",
"S107",
"C901",
"PLR0911",
"PLR0912",
"PLR0913",
"PLR0915",
"ANN401",
]

[tool.ruff.lint.isort]
known-first-party = ["haystack_integrations"]

[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "parents"

[tool.ruff.lint.per-file-ignores]
"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"]

[tool.coverage.run]
source = ["haystack_integrations"]
branch = true
relative_files = true
parallel = false

[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing = true
exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]

[tool.pytest.ini_options]
addopts = "--strict-markers"
markers = [
"integration: integration tests",
]
log_cli = true
asyncio_default_fixture_loop_scope = "function"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from haystack_integrations.components.preprocessors.presidio.presidio_document_cleaner import PresidioDocumentCleaner
from haystack_integrations.components.preprocessors.presidio.presidio_entity_extractor import PresidioEntityExtractor
from haystack_integrations.components.preprocessors.presidio.presidio_text_cleaner import PresidioTextCleaner

__all__ = ["PresidioDocumentCleaner", "PresidioEntityExtractor", "PresidioTextCleaner"]
Loading
Loading