Skip to content

Commit 7c5aa5d

Browse files
fix: address review comments on presidio integration
- Regenerate presidio.yml workflow from template (compute-test-matrix job, pinned action versions, push trigger, coverage steps) - Add integration-cov-append-retry script to pyproject.toml - Drop un-anonymized documents on error in PresidioDocumentCleaner instead of passing them through unanonymized - Clarify warning message in PresidioEntityExtractor to say extraction is skipped but document is kept - Update test to assert failed docs are dropped in PresidioDocumentCleaner
1 parent 10d90f5 commit 7c5aa5d

5 files changed

Lines changed: 70 additions & 10 deletions

File tree

.github/workflows/presidio.yml

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# This workflow comes from https://github.com/ofek/hatch-mypyc
2+
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
13
name: Test / presidio
24

35
on:
@@ -8,34 +10,66 @@ on:
810
- "integrations/presidio/**"
911
- "!integrations/presidio/*.md"
1012
- ".github/workflows/presidio.yml"
13+
push:
14+
branches:
15+
- main
16+
paths:
17+
- "integrations/presidio/**"
18+
- "!integrations/presidio/*.md"
19+
- ".github/workflows/presidio.yml"
1120

1221
defaults:
1322
run:
1423
working-directory: integrations/presidio
1524

1625
concurrency:
17-
group: presidio-${{ github.head_ref }}
26+
group: presidio-${{ github.head_ref || github.sha }}
1827
cancel-in-progress: true
1928

2029
env:
2130
PYTHONUNBUFFERED: "1"
2231
FORCE_COLOR: "1"
32+
TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]'
33+
TEST_MATRIX_PYTHON: '["3.10", "3.14"]'
2334

2435
jobs:
36+
compute-test-matrix:
37+
runs-on: ubuntu-slim
38+
defaults:
39+
run:
40+
working-directory: .
41+
outputs:
42+
os: ${{ steps.set.outputs.os }}
43+
python-version: ${{ steps.set.outputs.python-version }}
44+
steps:
45+
- id: set
46+
run: |
47+
echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
48+
echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
49+
2550
run:
2651
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
52+
needs: compute-test-matrix
53+
permissions:
54+
contents: write
55+
pull-requests: write
2756
runs-on: ${{ matrix.os }}
2857
strategy:
2958
fail-fast: false
3059
matrix:
31-
os: [ubuntu-latest]
32-
python-version: ["3.10", "3.14"]
60+
os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
61+
python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
3362

3463
steps:
35-
- uses: actions/checkout@v4
64+
- name: Support longpaths
65+
if: matrix.os == 'windows-latest'
66+
working-directory: .
67+
run: git config --system core.longpaths true
68+
69+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
3670

3771
- name: Set up Python ${{ matrix.python-version }}
38-
uses: actions/setup-python@v5
72+
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
3973
with:
4074
python-version: ${{ matrix.python-version }}
4175

@@ -49,7 +83,34 @@ jobs:
4983
- name: Run unit tests
5084
run: hatch run test:unit-cov-retry
5185

86+
# On PR: generates coverage comment artifact. On push to main: stores coverage baseline on data branch.
87+
- name: Store unit tests coverage
88+
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
89+
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
90+
with:
91+
GITHUB_TOKEN: ${{ github.token }}
92+
COVERAGE_PATH: integrations/presidio
93+
SUBPROJECT_ID: presidio
94+
COMMENT_ARTIFACT_NAME: coverage-comment-presidio
95+
MINIMUM_GREEN: 90
96+
MINIMUM_ORANGE: 60
97+
98+
- name: Run integration tests
99+
run: hatch run test:integration-cov-append-retry
100+
101+
- name: Store combined coverage
102+
if: github.event_name == 'push'
103+
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
104+
with:
105+
GITHUB_TOKEN: ${{ github.token }}
106+
COVERAGE_PATH: integrations/presidio
107+
SUBPROJECT_ID: presidio-combined
108+
COMMENT_ARTIFACT_NAME: coverage-comment-presidio-combined
109+
MINIMUM_GREEN: 90
110+
MINIMUM_ORANGE: 60
111+
52112
- name: Run unit tests with lowest direct dependencies
113+
if: github.event_name != 'push'
53114
run: |
54115
hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
55116
hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
@@ -65,7 +126,7 @@ jobs:
65126
notify-slack-on-failure:
66127
needs: run
67128
if: failure() && github.event_name == 'schedule'
68-
runs-on: ubuntu-latest
129+
runs-on: ubuntu-slim
69130
steps:
70131
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
71132
with:

integrations/presidio/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ unit = 'pytest -m "not integration" {args:tests}'
6969
integration = 'pytest -m "integration" {args:tests}'
7070
all = 'pytest {args:tests}'
7171
unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
72+
integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}'
7273
types = "mypy -p haystack_integrations.components.preprocessors.presidio {args}"
7374

7475
[tool.mypy]

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,5 +112,4 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
112112
doc_id=doc.id,
113113
error=e,
114114
)
115-
cleaned.append(doc)
116115
return {"documents": cleaned}

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_entity_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:
117117
result_docs.append(replace(doc, meta={**doc.meta, "entities": entities}))
118118
except Exception as e:
119119
logger.warning(
120-
"Could not extract entities from document {doc_id}. Skipping it. Error: {error}",
120+
"Could not extract entities from document {doc_id}. Skipping extraction, keeping document. Error: {error}",
121121
doc_id=doc.id,
122122
error=e,
123123
)

integrations/presidio/tests/test_presidio_document_cleaner.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,7 @@ def test_run_skips_on_error(self, caplog):
112112
with caplog.at_level(logging.WARNING):
113113
result = cleaner.run(documents=[doc])
114114

115-
assert len(result["documents"]) == 1
116-
assert result["documents"][0].content == "Some text with PII"
115+
assert len(result["documents"]) == 0
117116
assert "Could not anonymize" in caplog.text
118117

119118
def test_run_multiple_documents(self):

0 commit comments

Comments
 (0)