Skip to content

Commit 598ed16

Browse files
committed
refactor default mapping location and add it as a class var
1 parent bcc4545 commit 598ed16

6 files changed

Lines changed: 77 additions & 99 deletions

File tree

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
# Maps ISO 639-1 language codes to the largest non-transformer (`_lg`) spaCy model for that language.
6+
# Used to automatically configure the NLP engine when only `language` is specified.
7+
# See https://spacy.io/models for the full list of available models.
8+
SPACY_DEFAULT_MODELS: dict[str, str] = {
9+
"ca": "ca_core_news_lg",
10+
"zh": "zh_core_web_lg",
11+
"hr": "hr_core_news_lg",
12+
"da": "da_core_news_lg",
13+
"nl": "nl_core_news_lg",
14+
"en": "en_core_web_lg",
15+
"fi": "fi_core_news_lg",
16+
"fr": "fr_core_news_lg",
17+
"de": "de_core_news_lg",
18+
"el": "el_core_news_lg",
19+
"it": "it_core_news_lg",
20+
"ja": "ja_core_news_lg",
21+
"ko": "ko_core_news_lg",
22+
"lt": "lt_core_news_lg",
23+
"mk": "mk_core_news_lg",
24+
"nb": "nb_core_news_lg",
25+
"pl": "pl_core_news_lg",
26+
"pt": "pt_core_news_lg",
27+
"ro": "ro_core_news_lg",
28+
"ru": "ru_core_news_lg",
29+
"sl": "sl_core_news_lg",
30+
"es": "es_core_news_lg",
31+
"sv": "sv_core_news_lg",
32+
"uk": "uk_core_news_lg",
33+
}

integrations/presidio/src/haystack_integrations/components/common/py.typed

Whitespace-only changes.

integrations/presidio/src/haystack_integrations/components/extractors/presidio/presidio_entity_extractor.py

Lines changed: 13 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,42 +3,15 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
from dataclasses import replace
6+
from typing import ClassVar
67

78
from haystack import Document, component, logging
89
from presidio_analyzer import AnalyzerEngine
910
from presidio_analyzer.nlp_engine import NlpEngineProvider
1011

11-
logger = logging.getLogger(__name__)
12+
from haystack_integrations.components.common.presidio.utils import SPACY_DEFAULT_MODELS as _SPACY_DEFAULT_MODELS
1213

13-
# Maps ISO 639-1 language codes to the largest available spaCy model for that language.
14-
# Used to automatically configure the NLP engine when only `language` is specified.
15-
# See https://spacy.io/models for the full list of available models.
16-
SPACY_DEFAULT_MODELS: dict[str, str] = {
17-
"ca": "ca_core_news_lg",
18-
"zh": "zh_core_web_lg",
19-
"hr": "hr_core_news_lg",
20-
"da": "da_core_news_lg",
21-
"nl": "nl_core_news_lg",
22-
"en": "en_core_web_lg",
23-
"fi": "fi_core_news_lg",
24-
"fr": "fr_core_news_lg",
25-
"de": "de_core_news_lg",
26-
"el": "el_core_news_lg",
27-
"it": "it_core_news_lg",
28-
"ja": "ja_core_news_lg",
29-
"ko": "ko_core_news_lg",
30-
"lt": "lt_core_news_lg",
31-
"mk": "mk_core_news_lg",
32-
"nb": "nb_core_news_lg",
33-
"pl": "pl_core_news_lg",
34-
"pt": "pt_core_news_lg",
35-
"ro": "ro_core_news_lg",
36-
"ru": "ru_core_news_lg",
37-
"sl": "sl_core_news_lg",
38-
"es": "es_core_news_lg",
39-
"sv": "sv_core_news_lg",
40-
"uk": "uk_core_news_lg",
41-
}
14+
logger = logging.getLogger(__name__)
4215

4316

4417
@component
@@ -71,6 +44,13 @@ class PresidioEntityExtractor:
7144
```
7245
"""
7346

47+
SPACY_DEFAULT_MODELS: ClassVar[dict[str, str]] = _SPACY_DEFAULT_MODELS
48+
"""Mapping from ISO 639-1 language code to the largest available spaCy model for that language.
49+
50+
Used to automatically select an NLP model when `models` is not specified.
51+
See https://spacy.io/models for the full list of available spaCy models.
52+
"""
53+
7454
def __init__(
7555
self,
7656
*,
@@ -122,15 +102,15 @@ def warm_up(self) -> None:
122102

123103
models = self.models
124104
if models is None:
125-
if self.language not in SPACY_DEFAULT_MODELS:
126-
supported = ", ".join(sorted(SPACY_DEFAULT_MODELS))
105+
if self.language not in self.SPACY_DEFAULT_MODELS:
106+
supported = ", ".join(sorted(self.SPACY_DEFAULT_MODELS))
127107
msg = (
128108
f"No default spaCy model is available for language '{self.language}'. "
129109
f"Use the `models` parameter to specify a custom model. "
130110
f"Languages with built-in support: {supported}."
131111
)
132112
raise ValueError(msg)
133-
models = [{"lang_code": self.language, "model_name": SPACY_DEFAULT_MODELS[self.language]}]
113+
models = [{"lang_code": self.language, "model_name": self.SPACY_DEFAULT_MODELS[self.language]}]
134114

135115
nlp_engine = NlpEngineProvider(
136116
nlp_configuration={"nlp_engine_name": "spacy", "models": models}

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_document_cleaner.py

Lines changed: 14 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,16 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from typing import ClassVar
6+
57
from haystack import Document, component, logging
68
from presidio_analyzer import AnalyzerEngine
79
from presidio_analyzer.nlp_engine import NlpEngineProvider
810
from presidio_anonymizer import AnonymizerEngine
911

10-
logger = logging.getLogger(__name__)
12+
from haystack_integrations.components.common.presidio.utils import SPACY_DEFAULT_MODELS as _SPACY_DEFAULT_MODELS
1113

12-
# Maps ISO 639-1 language codes to the largest available spaCy model for that language.
13-
# Used to automatically configure the NLP engine when only `language` is specified.
14-
# See https://spacy.io/models for the full list of available models.
15-
SPACY_DEFAULT_MODELS: dict[str, str] = {
16-
"ca": "ca_core_news_lg",
17-
"zh": "zh_core_web_lg",
18-
"hr": "hr_core_news_lg",
19-
"da": "da_core_news_lg",
20-
"nl": "nl_core_news_lg",
21-
"en": "en_core_web_lg",
22-
"fi": "fi_core_news_lg",
23-
"fr": "fr_core_news_lg",
24-
"de": "de_core_news_lg",
25-
"el": "el_core_news_lg",
26-
"it": "it_core_news_lg",
27-
"ja": "ja_core_news_lg",
28-
"ko": "ko_core_news_lg",
29-
"lt": "lt_core_news_lg",
30-
"mk": "mk_core_news_lg",
31-
"nb": "nb_core_news_lg",
32-
"pl": "pl_core_news_lg",
33-
"pt": "pt_core_news_lg",
34-
"ro": "ro_core_news_lg",
35-
"ru": "ru_core_news_lg",
36-
"sl": "sl_core_news_lg",
37-
"es": "es_core_news_lg",
38-
"sv": "sv_core_news_lg",
39-
"uk": "uk_core_news_lg",
40-
}
14+
logger = logging.getLogger(__name__)
4115

4216

4317
@component
@@ -67,6 +41,13 @@ class PresidioDocumentCleaner:
6741
```
6842
"""
6943

44+
SPACY_DEFAULT_MODELS: ClassVar[dict[str, str]] = _SPACY_DEFAULT_MODELS
45+
"""Mapping from ISO 639-1 language code to the largest available spaCy model for that language.
46+
47+
Used to automatically select an NLP model when `models` is not specified.
48+
See https://spacy.io/models for the full list of available spaCy models.
49+
"""
50+
7051
def __init__(
7152
self,
7253
*,
@@ -119,15 +100,15 @@ def warm_up(self) -> None:
119100

120101
models = self.models
121102
if models is None:
122-
if self.language not in SPACY_DEFAULT_MODELS:
123-
supported = ", ".join(sorted(SPACY_DEFAULT_MODELS))
103+
if self.language not in self.SPACY_DEFAULT_MODELS:
104+
supported = ", ".join(sorted(self.SPACY_DEFAULT_MODELS))
124105
msg = (
125106
f"No default spaCy model is available for language '{self.language}'. "
126107
f"Use the `models` parameter to specify a custom model. "
127108
f"Languages with built-in support: {supported}."
128109
)
129110
raise ValueError(msg)
130-
models = [{"lang_code": self.language, "model_name": SPACY_DEFAULT_MODELS[self.language]}]
111+
models = [{"lang_code": self.language, "model_name": self.SPACY_DEFAULT_MODELS[self.language]}]
131112

132113
nlp_engine = NlpEngineProvider(
133114
nlp_configuration={"nlp_engine_name": "spacy", "models": models}

integrations/presidio/src/haystack_integrations/components/preprocessors/presidio/presidio_text_cleaner.py

Lines changed: 14 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,16 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from typing import ClassVar
6+
57
from haystack import component, logging
68
from presidio_analyzer import AnalyzerEngine
79
from presidio_analyzer.nlp_engine import NlpEngineProvider
810
from presidio_anonymizer import AnonymizerEngine
911

10-
logger = logging.getLogger(__name__)
12+
from haystack_integrations.components.common.presidio.utils import SPACY_DEFAULT_MODELS as _SPACY_DEFAULT_MODELS
1113

12-
# Maps ISO 639-1 language codes to the largest available spaCy model for that language.
13-
# Used to automatically configure the NLP engine when only `language` is specified.
14-
# See https://spacy.io/models for the full list of available models.
15-
SPACY_DEFAULT_MODELS: dict[str, str] = {
16-
"ca": "ca_core_news_lg",
17-
"zh": "zh_core_web_lg",
18-
"hr": "hr_core_news_lg",
19-
"da": "da_core_news_lg",
20-
"nl": "nl_core_news_lg",
21-
"en": "en_core_web_lg",
22-
"fi": "fi_core_news_lg",
23-
"fr": "fr_core_news_lg",
24-
"de": "de_core_news_lg",
25-
"el": "el_core_news_lg",
26-
"it": "it_core_news_lg",
27-
"ja": "ja_core_news_lg",
28-
"ko": "ko_core_news_lg",
29-
"lt": "lt_core_news_lg",
30-
"mk": "mk_core_news_lg",
31-
"nb": "nb_core_news_lg",
32-
"pl": "pl_core_news_lg",
33-
"pt": "pt_core_news_lg",
34-
"ro": "ro_core_news_lg",
35-
"ru": "ru_core_news_lg",
36-
"sl": "sl_core_news_lg",
37-
"es": "es_core_news_lg",
38-
"sv": "sv_core_news_lg",
39-
"uk": "uk_core_news_lg",
40-
}
14+
logger = logging.getLogger(__name__)
4115

4216

4317
@component
@@ -64,6 +38,13 @@ class PresidioTextCleaner:
6438
```
6539
"""
6640

41+
SPACY_DEFAULT_MODELS: ClassVar[dict[str, str]] = _SPACY_DEFAULT_MODELS
42+
"""Mapping from ISO 639-1 language code to the largest available spaCy model for that language.
43+
44+
Used to automatically select an NLP model when `models` is not specified.
45+
See https://spacy.io/models for the full list of available spaCy models.
46+
"""
47+
6748
def __init__(
6849
self,
6950
*,
@@ -116,15 +97,15 @@ def warm_up(self) -> None:
11697

11798
models = self.models
11899
if models is None:
119-
if self.language not in SPACY_DEFAULT_MODELS:
120-
supported = ", ".join(sorted(SPACY_DEFAULT_MODELS))
100+
if self.language not in self.SPACY_DEFAULT_MODELS:
101+
supported = ", ".join(sorted(self.SPACY_DEFAULT_MODELS))
121102
msg = (
122103
f"No default spaCy model is available for language '{self.language}'. "
123104
f"Use the `models` parameter to specify a custom model. "
124105
f"Languages with built-in support: {supported}."
125106
)
126107
raise ValueError(msg)
127-
models = [{"lang_code": self.language, "model_name": SPACY_DEFAULT_MODELS[self.language]}]
108+
models = [{"lang_code": self.language, "model_name": self.SPACY_DEFAULT_MODELS[self.language]}]
128109

129110
nlp_engine = NlpEngineProvider(
130111
nlp_configuration={"nlp_engine_name": "spacy", "models": models}

0 commit comments

Comments
 (0)