1717from thinc .api import prefer_gpu , set_gpu_allocator
1818from unidecode import unidecode
1919
# Updated as of 8/23/2022
# Maps a lowercase language name to its available spaCy model packages,
# ordered smallest -> largest (sm, md, lg, then transformer when one exists).
SPACY_LANGUAGE_MODEL_MAP: Dict[str, List[str]] = {
    "catalan": ["ca_core_news_sm", "ca_core_news_md", "ca_core_news_lg", "ca_core_news_trf"],
    "chinese": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg", "zh_core_web_trf"],
    # BUG FIX: the key was misspelled "croation", so looking up the correct
    # name "croatian" raised KeyError. The misspelled key is kept as an alias
    # for backward compatibility with any caller that relied on it.
    "croatian": ["hr_core_news_sm", "hr_core_news_md", "hr_core_news_lg"],
    "croation": ["hr_core_news_sm", "hr_core_news_md", "hr_core_news_lg"],
    "danish": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"],
    "dutch": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
    "english": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
    "finnish": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"],
    "german": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
    "greek": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"],
    "french": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
    "italian": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
    "japanese": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"],
    "korean": ["ko_core_news_sm", "ko_core_news_md", "ko_core_news_lg"],
    "lithuanian": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"],
    "macedonian": ["mk_core_news_sm", "mk_core_news_md", "mk_core_news_lg"],
    "norwegian": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"],
    "polish": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"],
    "portuguese": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"],
    "romanian": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"],
    "russian": ["ru_core_news_sm", "ru_core_news_md", "ru_core_news_lg"],
    "spanish": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
    "swedish": ["sv_core_news_sm", "sv_core_news_md", "sv_core_news_lg"],
    "ukrainian": ["uk_core_news_sm", "uk_core_news_md", "uk_core_news_lg"],
    "multi-language": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
}
47-
48-
# Codepoints whose Unicode general category is punctuation ("P*").
# Computed with a single pass over the Unicode range: the original built
# PUNCTUATION_MAP and PUNCTUATION_CLASS with two independent scans of all
# ~1.1M codepoints; one scan yields both. Note range(sys.maxunicode)
# excludes sys.maxunicode itself, matching the original behavior.
_PUNCTUATION_CODEPOINTS = [i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")]

# str.translate-style table: every punctuation codepoint maps to None
# (i.e. translating with this table deletes punctuation).
PUNCTUATION_MAP = dict.fromkeys(_PUNCTUATION_CODEPOINTS)

# The same punctuation as a set of single-character strings, for fast
# membership tests on characters.
PUNCTUATION_CLASS = {chr(i) for i in _PUNCTUATION_CODEPOINTS}

# Matches any single decimal digit.
NUMBERS = re.compile(r"\d")
@@ -303,24 +274,6 @@ def load(cls, path):
303274 return Tokens (tokens ["tokens" ], tokens ["metadata" ])
304275
305276
def check_for_updates(language) -> List[str]:
    """Fetch the list of spaCy models for *language* from the spaCy GitHub metadata.

    Returns the model names reversed from the upstream order (so larger models
    come first, matching how the module's other lookup tables are consumed),
    or an empty list when the metadata cannot be fetched, is malformed, or the
    language has no models listed.
    """
    import requests

    response = requests.get("https://raw.githubusercontent.com/explosion/spaCy/master/website/meta/languages.json")
    # BUG FIX: the original only treated 404 as failure; any other non-200
    # response (500, 403, ...) fell through to .json() and could raise.
    if response.status_code != 200:
        print("Unable to fetch language information from Spacy GitHub")
        return []
    try:
        languages = response.json()
        models = {lang["name"].lower(): lang["models"] for lang in languages["languages"] if "models" in lang}
        model: List[str] = models[language][::-1]
    # ValueError covers malformed JSON (json.JSONDecodeError is a subclass);
    # KeyError covers an unknown language or unexpected schema.
    except (KeyError, ValueError):
        return []
    # BUG FIX: removed stray debug `print(model)` that leaked to stdout.
    return model
323-
324277@Language .factory (
325278 "postprocessor" ,
326279 default_config = {
@@ -528,18 +481,9 @@ def clear_trf_data(doc):
528481 return doc
529482
530483
531- def load_language_model (language , normalize_options : dict [str , Any ]) -> tuple [Language , bool ]:
484+ def load_language_model (language_model , normalize_options : dict [str , Any ]) -> tuple [Language , bool ]:
532485 """Load language model based on name"""
533486 nlp = None
534- language = language .lower ()
535- try :
536- possible_models = SPACY_LANGUAGE_MODEL_MAP [language ][::- 1 ]
537- except KeyError :
538- try :
539- possible_models = check_for_updates (language )
540- except KeyError :
541- print (f"Spacy does not support the { language } language." )
542- exit (- 1 )
543487 if any (
544488 (
545489 normalize_options ["lemmatizer" ] == "spacy" ,
@@ -552,26 +496,22 @@ def load_language_model(language, normalize_options: dict[str, Any]) -> tuple[La
552496 disabled_pipelines .append ("tagger" )
553497 if not normalize_options ["ents_to_keep" ]:
554498 disabled_pipelines .append ("ner" )
555- model_loaded = ""
556499 set_gpu_allocator ("pytorch" )
557500 use_gpu = prefer_gpu ()
558- for model in possible_models :
559- try :
560- nlp = spacy .load (model , exclude = disabled_pipelines )
561- print ("Using Spacy model" , model )
562- except OSError :
563- pass
564- if nlp is not None :
565- model_loaded = model
566- break
501+ try :
502+ nlp = spacy .load (language_model , exclude = disabled_pipelines )
503+ except OSError :
504+ pass
567505 if nlp is None :
568- print (f"No Spacy model installed for the { language } language. Stopping..." )
506+ print (
507+ f"The Spacy model { language_model } is not installed on your system. See https://spacy.io/models for instructions. Stopping..."
508+ )
569509 exit (- 1 )
570510 if use_gpu is True :
571511 nlp .add_pipe ("clear_trf_data" , last = True )
572512 nlp .add_pipe ("postprocessor" , config = normalize_options , last = True )
573513 if normalize_options ["ents_to_keep" ] and "ner" not in nlp .pipe_names :
574- print (f"There is no NER pipeline for model { model_loaded } . Exiting..." )
514+ print (f"There is no NER pipeline for model { language_model } . Exiting..." )
575515 exit (- 1 )
576516 return nlp , use_gpu
577517 nlp = spacy .blank ("en" )
0 commit comments