Skip to content

Commit 6ca2fcd

Browse files
committed
Remove automatic spaCy model initialization based on language name; the language model must now be specified explicitly.
1 parent 9c6b53d commit 6ca2fcd

File tree

3 files changed

+13
-71
lines changed

3 files changed

+13
-71
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setup(
77
name="text_preprocessing",
8-
version="1.0.5",
8+
version="1.1",
99
author="The ARTFL Project",
1010
author_email="clovisgladstone@gmail.com",
1111
packages=["text_preprocessing", "text_preprocessing.lang"],

text_preprocessing/preprocessor.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def __init__(
5757
word_regex: str = r"[\p{L}\p{M}\p{N}]+|'",
5858
sentence_boundaries: list[str] = [".", "!", "?"],
5959
language: str = "french",
60+
language_model: str | None = None,
6061
modernize: bool = False,
6162
strip_tags: bool = False,
6263
is_philo_db: bool = False,
@@ -100,7 +101,8 @@ def __init__(
100101
if nlp_model is not None:
101102
self.nlp = nlp_model
102103
else:
103-
self.nlp, using_gpu = load_language_model(self.language, self.normalize_options)
104+
if language_model is not None:
105+
self.nlp, using_gpu = load_language_model(language_model, self.normalize_options)
104106
self.using_gpu = using_gpu
105107
if workers is None:
106108
cpu_count = os.cpu_count() or 2

text_preprocessing/spacy_helpers.py

Lines changed: 9 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -17,35 +17,6 @@
1717
from thinc.api import prefer_gpu, set_gpu_allocator
1818
from unidecode import unidecode
1919

20-
# Updated as of 8/23/2022
21-
SPACY_LANGUAGE_MODEL_MAP: Dict[str, List[str]] = {
22-
"catalan": ["ca_core_news_sm", "ca_core_news_md", "ca_core_news_lg", "ca_core_news_trf"],
23-
"chinese": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg", "zh_core_web_trf"],
24-
"croation": ["hr_core_news_sm", "hr_core_news_md", "hr_core_news_lg"],
25-
"danish": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"],
26-
"dutch": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
27-
"english": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
28-
"finnish": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"],
29-
"german": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
30-
"greek": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"],
31-
"french": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
32-
"italian": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
33-
"japanese": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"],
34-
"korean": ["ko_core_news_sm", "ko_core_news_md", "ko_core_news_lg"],
35-
"lithuanian": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"],
36-
"macedonian": ["mk_core_news_sm", "mk_core_news_md", "mk_core_news_lg"],
37-
"norwegian": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"],
38-
"polish": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"],
39-
"portuguese": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"],
40-
"romanian": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"],
41-
"russian": ["ru_core_news_sm", "ru_core_news_md", "ru_core_news_lg"],
42-
"spanish": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
43-
"swedish": ["sv_core_news_sm", "sv_core_news_md", "sv_core_news_lg"],
44-
"ukrainian": ["uk_core_news_sm", "uk_core_news_md", "uk_core_news_lg"],
45-
"multi-language": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
46-
}
47-
48-
4920
PUNCTUATION_MAP = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P"))
5021
PUNCTUATION_CLASS = set([chr(i) for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")])
5122
NUMBERS = re.compile(r"\d")
@@ -303,24 +274,6 @@ def load(cls, path):
303274
return Tokens(tokens["tokens"], tokens["metadata"])
304275

305276

306-
def check_for_updates(language) -> List[str]:
307-
"""Check for spacy language model updates"""
308-
import requests
309-
310-
response = requests.get("https://raw.githubusercontent.com/explosion/spaCy/master/website/meta/languages.json")
311-
if response.status_code == 404:
312-
print("Unable to fetch language information from Spacy GitHub")
313-
return []
314-
try:
315-
languages = response.json()
316-
models = {lang["name"].lower(): lang["models"] for lang in languages["languages"] if "models" in lang}
317-
model: List[str] = models[language][::-1]
318-
print(model)
319-
except KeyError:
320-
return []
321-
return model
322-
323-
324277
@Language.factory(
325278
"postprocessor",
326279
default_config={
@@ -528,18 +481,9 @@ def clear_trf_data(doc):
528481
return doc
529482

530483

531-
def load_language_model(language, normalize_options: dict[str, Any]) -> tuple[Language, bool]:
484+
def load_language_model(language_model, normalize_options: dict[str, Any]) -> tuple[Language, bool]:
532485
"""Load language model based on name"""
533486
nlp = None
534-
language = language.lower()
535-
try:
536-
possible_models = SPACY_LANGUAGE_MODEL_MAP[language][::-1]
537-
except KeyError:
538-
try:
539-
possible_models = check_for_updates(language)
540-
except KeyError:
541-
print(f"Spacy does not support the {language} language.")
542-
exit(-1)
543487
if any(
544488
(
545489
normalize_options["lemmatizer"] == "spacy",
@@ -552,26 +496,22 @@ def load_language_model(language, normalize_options: dict[str, Any]) -> tuple[La
552496
disabled_pipelines.append("tagger")
553497
if not normalize_options["ents_to_keep"]:
554498
disabled_pipelines.append("ner")
555-
model_loaded = ""
556499
set_gpu_allocator("pytorch")
557500
use_gpu = prefer_gpu()
558-
for model in possible_models:
559-
try:
560-
nlp = spacy.load(model, exclude=disabled_pipelines)
561-
print("Using Spacy model", model)
562-
except OSError:
563-
pass
564-
if nlp is not None:
565-
model_loaded = model
566-
break
501+
try:
502+
nlp = spacy.load(language_model, exclude=disabled_pipelines)
503+
except OSError:
504+
pass
567505
if nlp is None:
568-
print(f"No Spacy model installed for the {language} language. Stopping...")
506+
print(
507+
f"The Spacy model {language_model} is not installed on your system. See https://spacy.io/models for instructions. Stopping..."
508+
)
569509
exit(-1)
570510
if use_gpu is True:
571511
nlp.add_pipe("clear_trf_data", last=True)
572512
nlp.add_pipe("postprocessor", config=normalize_options, last=True)
573513
if normalize_options["ents_to_keep"] and "ner" not in nlp.pipe_names:
574-
print(f"There is no NER pipeline for model {model_loaded}. Exiting...")
514+
print(f"There is no NER pipeline for model {language_model}. Exiting...")
575515
exit(-1)
576516
return nlp, use_gpu
577517
nlp = spacy.blank("en")

0 commit comments

Comments (0)