@@ -139,13 +139,14 @@ def __init__(self, doc: Doc | Iterable[PreprocessorToken], metadata=None, keep_a
139139
140140 def __get_tokens (self , doc : Doc ):
141141 """Return a generator of PreprocessorToken objects"""
142- for token in doc :
142+ max_index = len (doc ) - 1
143+ for index , token in enumerate (doc ):
143144 if token .text != "#DEL#" :
144145 yield PreprocessorToken (token .text , token .pos_ , token .ent_type_ , token ._ .ext )
145146 elif self .keep_all is True :
146147 yield PreprocessorToken ("" , token .pos_ , token .ent_type_ , token ._ .ext )
147- if token .whitespace_ :
148- yield PreprocessorToken (token .whitespace_ , "" , "" , {"token" : token .whitespace_ })
148+ if token .whitespace_ and index < max_index : # remove trailing whitespace
149+ yield PreprocessorToken (token .whitespace_ , "" , "" , {** token . _ . ext , "token" : token .whitespace_ })
149150
150151 def __iter__ (self ) -> Iterator [PreprocessorToken ]:
151152 for token in self .tokens :
@@ -267,7 +268,7 @@ def appendleft(self, token: PreprocessorToken):
267268
268269 def purge (self ):
269270 """Remove empty tokens"""
270- self .tokens = deque (token for token in self .tokens if token )
271+ self .tokens = deque (token for token in self .tokens if token . text and token . text != " " )
271272 self .length = len (self .tokens )
272273 if self .length :
273274 self .metadata ["start_byte" ] = self .tokens [0 ].ext ["start_byte" ]
@@ -331,6 +332,7 @@ def __init__(
331332 pos_to_keep : list [str ] | bool = False ,
332333 ents_to_keep : list [str ] | bool = False ,
333334 post_processing_function : Callable | None = None ,
335+ nlp_model : Language | None = None ,
334336 ** _ , # this is meant to make the constructor accept invalid keywords
335337 ):
336338 self .normalize_options = {
@@ -348,7 +350,10 @@ def __init__(
348350 "ents_to_keep" : ents_to_keep or [],
349351 }
350352 self .language = language
351- self .nlp = load_language_model (self .language , self .normalize_options )
353+ if nlp_model is not None :
354+ self .nlp = nlp_model
355+ else :
356+ self .nlp = load_language_model (self .language , self .normalize_options )
352357 if workers is None :
353358 cpu_count = os .cpu_count () or 2
354359 self .workers = cpu_count - 1
@@ -414,7 +419,7 @@ def process_texts(
414419 )
415420 if isinstance (tokens , PreparedDoc ):
416421 spacy_doc = make_spacy_doc (self .nlp , tokens )
417- if spacy_doc ._ .char_num > 10000 :
422+ if spacy_doc ._ .char_num > 100000 : # being conservative to preserve GPU RAM
418423 split_doc = self .__split_spacy_docs (spacy_doc )
419424 rebuilt_doc = Doc .from_docs (list (self .nlp .pipe (split_doc )))
420425 rebuilt_doc ._ .metadata = spacy_doc ._ .metadata
0 commit comments