11"""Helper functions for Spacy"""
22
33import os
4- import pickle
54import re
65import sys
76import unicodedata
109from typing import Any , Dict , Iterable , List , Optional , Union
1110from xml .sax .saxutils import unescape as unescape_xml
1211
12+ import pickle
1313import spacy
1414from spacy .language import Language
1515from spacy .tokens import Doc , Token
1616from Stemmer import Stemmer
1717from thinc .api import prefer_gpu , set_gpu_allocator
1818from unidecode import unidecode
1919
20-
2120# Updated as of 8/23/2022
2221SPACY_LANGUAGE_MODEL_MAP : Dict [str , List [str ]] = {
2322 "catalan" : ["ca_core_news_sm" , "ca_core_news_md" , "ca_core_news_lg" , "ca_core_news_trf" ],
@@ -71,8 +70,6 @@ class PreprocessorToken(str):
7170
7271 """
7372
74- ext : dict [str , Any ]
75-
7673 def __new__ (cls , value , pos_ = "" , ent_type_ = "" , ext = {}):
7774 return str .__new__ (cls , value )
7875
@@ -83,6 +80,7 @@ def __init__(
8380 ent_type_ : str = "" ,
8481 ext : dict [str , Any ] | None = None ,
8582 ):
83+ super ().__init__ ()
8684 self .text = text or ""
8785 self .ext = ext or {}
8886 if self .ext is not None :
@@ -149,8 +147,8 @@ def __get_tokens(self, doc: Doc):
149147 yield PreprocessorToken (token .text , token .pos_ , token .ent_type_ , token ._ .ext )
150148 elif self .keep_all is True :
151149 yield PreprocessorToken ("" , token .pos_ , token .ent_type_ , token ._ .ext )
152- if token .whitespace_ and index < max_index : # remove trailing whitespace
153- yield PreprocessorToken (token .whitespace_ , "" , "" , {** token ._ .ext , "token" : token .whitespace_ })
150+ if token .whitespace_ and index < max_index : # remove trailing whitespace
151+ yield PreprocessorToken (token .whitespace_ , "" , "" , {** token ._ .ext , "token" : token .whitespace_ })
154152
155153 def __iter__ (self ) -> Iterable [PreprocessorToken ]:
156154 for token in self .tokens :
0 commit comments