Skip to content

Commit c2192f4

Browse files
committed
fixes to Tokens object
1 parent acbc879 commit c2192f4

1 file changed

Lines changed: 11 additions & 6 deletions

File tree

text_preprocessing/preprocessor.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -139,13 +139,14 @@ def __init__(self, doc: Doc | Iterable[PreprocessorToken], metadata=None, keep_a
139139

140140
def __get_tokens(self, doc: Doc):
141141
"""Return a generator of PreprocessorToken objects"""
142-
for token in doc:
142+
max_index = len(doc) - 1
143+
for index, token in enumerate(doc):
143144
if token.text != "#DEL#":
144145
yield PreprocessorToken(token.text, token.pos_, token.ent_type_, token._.ext)
145146
elif self.keep_all is True:
146147
yield PreprocessorToken("", token.pos_, token.ent_type_, token._.ext)
147-
if token.whitespace_:
148-
yield PreprocessorToken(token.whitespace_, "", "", {"token": token.whitespace_})
148+
if token.whitespace_ and index < max_index: # remove trailing whitespace
149+
yield PreprocessorToken(token.whitespace_, "", "", {**token._.ext, "token": token.whitespace_})
149150

150151
def __iter__(self) -> Iterator[PreprocessorToken]:
151152
for token in self.tokens:
@@ -267,7 +268,7 @@ def appendleft(self, token: PreprocessorToken):
267268

268269
def purge(self):
269270
"""Remove empty tokens"""
270-
self.tokens = deque(token for token in self.tokens if token)
271+
self.tokens = deque(token for token in self.tokens if token.text and token.text != " ")
271272
self.length = len(self.tokens)
272273
if self.length:
273274
self.metadata["start_byte"] = self.tokens[0].ext["start_byte"]
@@ -331,6 +332,7 @@ def __init__(
331332
pos_to_keep: list[str] | bool = False,
332333
ents_to_keep: list[str] | bool = False,
333334
post_processing_function: Callable | None = None,
335+
nlp_model: Language | None = None,
334336
**_, # this is meant to make the constructor accept invalid keywords
335337
):
336338
self.normalize_options = {
@@ -348,7 +350,10 @@ def __init__(
348350
"ents_to_keep": ents_to_keep or [],
349351
}
350352
self.language = language
351-
self.nlp = load_language_model(self.language, self.normalize_options)
353+
if nlp_model is not None:
354+
self.nlp = nlp_model
355+
else:
356+
self.nlp = load_language_model(self.language, self.normalize_options)
352357
if workers is None:
353358
cpu_count = os.cpu_count() or 2
354359
self.workers = cpu_count - 1
@@ -414,7 +419,7 @@ def process_texts(
414419
)
415420
if isinstance(tokens, PreparedDoc):
416421
spacy_doc = make_spacy_doc(self.nlp, tokens)
417-
if spacy_doc._.char_num > 10000:
422+
if spacy_doc._.char_num > 100000: # being conservative to preserve GPU RAM
418423
split_doc = self.__split_spacy_docs(spacy_doc)
419424
rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc)))
420425
rebuilt_doc._.metadata = spacy_doc._.metadata

0 commit comments

Comments (0)