Skip to content

Commit 6dba45b

Browse files
sjrl and bogdankostic authored
feat: Add better language support presidio (#3209)
Co-authored-by: bogdankostic <bogdankostic@web.de>
1 parent 4fd89ce commit 6dba45b

9 files changed

Lines changed: 353 additions & 16 deletions

File tree

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
# Maps ISO 639-1 language codes to the largest available spaCy model for that language.
6+
# Used to automatically configure the NLP engine when only `language` is specified.
7+
# See https://spacy.io/models for the full list of available models.
8+
SPACY_DEFAULT_MODELS: dict[str, str] = {
9+
"ca": "ca_core_news_lg",
10+
"zh": "zh_core_web_lg",
11+
"hr": "hr_core_news_lg",
12+
"da": "da_core_news_lg",
13+
"nl": "nl_core_news_lg",
14+
"en": "en_core_web_lg",
15+
"fi": "fi_core_news_lg",
16+
"fr": "fr_core_news_lg",
17+
"de": "de_core_news_lg",
18+
"el": "el_core_news_lg",
19+
"it": "it_core_news_lg",
20+
"ja": "ja_core_news_lg",
21+
"ko": "ko_core_news_lg",
22+
"lt": "lt_core_news_lg",
23+
"mk": "mk_core_news_lg",
24+
"nb": "nb_core_news_lg",
25+
"pl": "pl_core_news_lg",
26+
"pt": "pt_core_news_lg",
27+
"ro": "ro_core_news_lg",
28+
"ru": "ru_core_news_lg",
29+
"sl": "sl_core_news_lg",
30+
"es": "es_core_news_lg",
31+
"sv": "sv_core_news_lg",
32+
"uk": "uk_core_news_lg",
33+
}

integrations/presidio/src/haystack_integrations/components/common/py.typed

Whitespace-only changes.

integrations/presidio/src/haystack_integrations/components/extractors/presidio/presidio_entity_extractor.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,13 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
from dataclasses import replace
6+
from typing import ClassVar
67

78
from haystack import Document, component, logging
89
from presidio_analyzer import AnalyzerEngine
10+
from presidio_analyzer.nlp_engine import NlpEngineProvider
11+
12+
from haystack_integrations.components.common.presidio.utils import SPACY_DEFAULT_MODELS as _SPACY_DEFAULT_MODELS
913

1014
logger = logging.getLogger(__name__)
1115

@@ -40,18 +44,29 @@ class PresidioEntityExtractor:
4044
```
4145
"""
4246

47+
SPACY_DEFAULT_MODELS: ClassVar[dict[str, str]] = _SPACY_DEFAULT_MODELS
48+
"""Mapping from ISO 639-1 language code to the largest available spaCy model for that language.
49+
50+
Used to automatically select an NLP model when `models` is not specified.
51+
See [spaCy documentation](https://spacy.io/models) for the full list of available spaCy models.
52+
"""
53+
4354
def __init__(
4455
self,
4556
*,
4657
language: str = "en",
4758
entities: list[str] | None = None,
4859
score_threshold: float = 0.35,
60+
models: list[dict[str, str]] | None = None,
4961
) -> None:
5062
"""
5163
Initializes the PresidioEntityExtractor.
5264
5365
:param language:
54-
Language code for PII detection. Defaults to `"en"`.
66+
ISO 639-1 language code for PII detection. Defaults to `"en"`.
67+
For languages in the built-in mapping (e.g. `"de"`, `"fr"`, `"es"`), the appropriate
68+
spaCy model is loaded automatically at warm-up time — no need to set `models`.
69+
For unsupported languages, use the `models` parameter to configure a custom model.
5570
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
5671
:param entities:
5772
List of PII entity types to detect (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
@@ -60,10 +75,18 @@ def __init__(
6075
:param score_threshold:
6176
Minimum confidence score (0-1) for a detected entity to be included. Defaults to `0.35`.
6277
See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
78+
:param models:
79+
Advanced override: list of spaCy model configurations.
80+
Each entry must contain `"lang_code"` and `"model_name"` keys,
81+
e.g. `[{"lang_code": "fr", "model_name": "fr_core_news_md"}]`.
82+
Use this only when you need a specific model variant or a language not covered by the
83+
built-in mapping. If `None`, the model is selected automatically from `SPACY_DEFAULT_MODELS`
84+
based on `language`; `warm_up()` raises a `ValueError` if `language` has no entry in that mapping.
6385
"""
6486
self.language = language
6587
self.entities = entities
6688
self.score_threshold = score_threshold
89+
self.models = models
6790
self._analyzer: AnalyzerEngine | None = None
6891
self._is_warmed_up = False
6992

@@ -77,7 +100,21 @@ def warm_up(self) -> None:
77100
if self._is_warmed_up:
78101
return
79102

80-
self._analyzer = AnalyzerEngine()
103+
models = self.models
104+
if models is None:
105+
if self.language not in self.SPACY_DEFAULT_MODELS:
106+
supported = ", ".join(sorted(self.SPACY_DEFAULT_MODELS))
107+
msg = (
108+
f"No default spaCy model is available for language '{self.language}'. "
109+
f"Use the `models` parameter to specify a custom model. "
110+
f"Languages with built-in support: {supported}."
111+
)
112+
raise ValueError(msg)
113+
models = [{"lang_code": self.language, "model_name": self.SPACY_DEFAULT_MODELS[self.language]}]
114+
115+
nlp_engine = NlpEngineProvider(nlp_configuration={"nlp_engine_name": "spacy", "models": models}).create_engine()
116+
supported_languages = [m["lang_code"] for m in models]
117+
self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=supported_languages)
81118

82119
self._is_warmed_up = True
83120

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,15 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from typing import ClassVar
6+
57
from haystack import Document, component, logging
68
from presidio_analyzer import AnalyzerEngine
9+
from presidio_analyzer.nlp_engine import NlpEngineProvider
710
from presidio_anonymizer import AnonymizerEngine
811

12+
from haystack_integrations.components.common.presidio.utils import SPACY_DEFAULT_MODELS as _SPACY_DEFAULT_MODELS
13+
914
logger = logging.getLogger(__name__)
1015

1116

@@ -36,18 +41,29 @@ class PresidioDocumentCleaner:
3641
```
3742
"""
3843

44+
SPACY_DEFAULT_MODELS: ClassVar[dict[str, str]] = _SPACY_DEFAULT_MODELS
45+
"""Mapping from ISO 639-1 language code to the largest available spaCy model for that language.
46+
47+
Used to automatically select an NLP model when `models` is not specified.
48+
See [spaCy documentation](https://spacy.io/models) for the full list of available spaCy models.
49+
"""
50+
3951
def __init__(
4052
self,
4153
*,
4254
language: str = "en",
4355
entities: list[str] | None = None,
4456
score_threshold: float = 0.35,
57+
models: list[dict[str, str]] | None = None,
4558
) -> None:
4659
"""
4760
Initializes the PresidioDocumentCleaner.
4861
4962
:param language:
50-
Language code for PII detection. Defaults to `"en"`.
63+
ISO 639-1 language code for PII detection. Defaults to `"en"`.
64+
For languages in the built-in mapping (e.g. `"de"`, `"fr"`, `"es"`), the appropriate
65+
spaCy model is loaded automatically at warm-up time — no need to set `models`.
66+
For unsupported languages, use the `models` parameter to configure a custom model.
5167
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
5268
:param entities:
5369
List of PII entity types to detect and anonymize (e.g. `["PERSON", "EMAIL_ADDRESS"]`).
@@ -56,10 +72,18 @@ def __init__(
5672
:param score_threshold:
5773
Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`.
5874
See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
75+
:param models:
76+
Advanced override: list of spaCy model configurations.
77+
Each entry must contain `"lang_code"` and `"model_name"` keys,
78+
e.g. `[{"lang_code": "fr", "model_name": "fr_core_news_md"}]`.
79+
Use this only when you need a specific model variant or a language not covered by the
80+
built-in mapping. If `None`, the model is selected automatically from `SPACY_DEFAULT_MODELS`
81+
based on `language`; `warm_up()` raises a `ValueError` if `language` has no entry in that mapping.
5982
"""
6083
self.language = language
6184
self.entities = entities
6285
self.score_threshold = score_threshold
86+
self.models = models
6387
self._analyzer: AnalyzerEngine | None = None
6488
self._anonymizer: AnonymizerEngine | None = None
6589
self._is_warmed_up = False
@@ -74,7 +98,21 @@ def warm_up(self) -> None:
7498
if self._is_warmed_up:
7599
return
76100

77-
self._analyzer = AnalyzerEngine()
101+
models = self.models
102+
if models is None:
103+
if self.language not in self.SPACY_DEFAULT_MODELS:
104+
supported = ", ".join(sorted(self.SPACY_DEFAULT_MODELS))
105+
msg = (
106+
f"No default spaCy model is available for language '{self.language}'. "
107+
f"Use the `models` parameter to specify a custom model. "
108+
f"Languages with built-in support: {supported}."
109+
)
110+
raise ValueError(msg)
111+
models = [{"lang_code": self.language, "model_name": self.SPACY_DEFAULT_MODELS[self.language]}]
112+
113+
nlp_engine = NlpEngineProvider(nlp_configuration={"nlp_engine_name": "spacy", "models": models}).create_engine()
114+
supported_languages = [m["lang_code"] for m in models]
115+
self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=supported_languages)
78116
self._anonymizer = AnonymizerEngine()
79117

80118
self._is_warmed_up = True

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,15 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from typing import ClassVar
6+
57
from haystack import component, logging
68
from presidio_analyzer import AnalyzerEngine
9+
from presidio_analyzer.nlp_engine import NlpEngineProvider
710
from presidio_anonymizer import AnonymizerEngine
811

12+
from haystack_integrations.components.common.presidio.utils import SPACY_DEFAULT_MODELS as _SPACY_DEFAULT_MODELS
13+
914
logger = logging.getLogger(__name__)
1015

1116

@@ -33,18 +38,29 @@ class PresidioTextCleaner:
3338
```
3439
"""
3540

41+
SPACY_DEFAULT_MODELS: ClassVar[dict[str, str]] = _SPACY_DEFAULT_MODELS
42+
"""Mapping from ISO 639-1 language code to the largest available spaCy model for that language.
43+
44+
Used to automatically select an NLP model when `models` is not specified.
45+
See [spaCy documentation](https://spacy.io/models) for the full list of available spaCy models.
46+
"""
47+
3648
def __init__(
3749
self,
3850
*,
3951
language: str = "en",
4052
entities: list[str] | None = None,
4153
score_threshold: float = 0.35,
54+
models: list[dict[str, str]] | None = None,
4255
) -> None:
4356
"""
4457
Initializes the PresidioTextCleaner.
4558
4659
:param language:
47-
Language code for PII detection. Defaults to `"en"`.
60+
ISO 639-1 language code for PII detection. Defaults to `"en"`.
61+
For languages in the built-in mapping (e.g. `"de"`, `"fr"`, `"es"`), the appropriate
62+
spaCy model is loaded automatically at warm-up time — no need to set `models`.
63+
For unsupported languages, use the `models` parameter to configure a custom model.
4864
See [Presidio supported languages](https://microsoft.github.io/presidio/analyzer/languages/).
4965
:param entities:
5066
List of PII entity types to detect and anonymize (e.g. `["PERSON", "PHONE_NUMBER"]`).
@@ -53,10 +69,18 @@ def __init__(
5369
:param score_threshold:
5470
Minimum confidence score (0-1) for a detected entity to be anonymized. Defaults to `0.35`.
5571
See [Presidio analyzer documentation](https://microsoft.github.io/presidio/analyzer/).
72+
:param models:
73+
Advanced override: list of spaCy model configurations.
74+
Each entry must contain `"lang_code"` and `"model_name"` keys,
75+
e.g. `[{"lang_code": "fr", "model_name": "fr_core_news_md"}]`.
76+
Use this only when you need a specific model variant or a language not covered by the
77+
built-in mapping. If `None`, the model is selected automatically from `SPACY_DEFAULT_MODELS`
78+
based on `language`; `warm_up()` raises a `ValueError` if `language` has no entry in that mapping.
5679
"""
5780
self.language = language
5881
self.entities = entities
5982
self.score_threshold = score_threshold
83+
self.models = models
6084
self._analyzer: AnalyzerEngine | None = None
6185
self._anonymizer: AnonymizerEngine | None = None
6286
self._is_warmed_up = False
@@ -71,7 +95,21 @@ def warm_up(self) -> None:
7195
if self._is_warmed_up:
7296
return
7397

74-
self._analyzer = AnalyzerEngine()
98+
models = self.models
99+
if models is None:
100+
if self.language not in self.SPACY_DEFAULT_MODELS:
101+
supported = ", ".join(sorted(self.SPACY_DEFAULT_MODELS))
102+
msg = (
103+
f"No default spaCy model is available for language '{self.language}'. "
104+
f"Use the `models` parameter to specify a custom model. "
105+
f"Languages with built-in support: {supported}."
106+
)
107+
raise ValueError(msg)
108+
models = [{"lang_code": self.language, "model_name": self.SPACY_DEFAULT_MODELS[self.language]}]
109+
110+
nlp_engine = NlpEngineProvider(nlp_configuration={"nlp_engine_name": "spacy", "models": models}).create_engine()
111+
supported_languages = [m["lang_code"] for m in models]
112+
self._analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=supported_languages)
75113
self._anonymizer = AnonymizerEngine()
76114

77115
self._is_warmed_up = True

0 commit comments

Comments
 (0)