Skip to content

Commit f9c7a7a

Browse files
committed
purge docs before creating ngrams
1 parent 2d9887a commit f9c7a7a

File tree

1 file changed

+2
-1
lines changed

1 file changed

+2
-1
lines changed

text_preprocessing/preprocessor.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ def __init__(
126 126
ngram_config=self.ngram_config,
127 127
workers=workers,
128 128
)
129 -
if self.normalize_options["pos_to_keep"] or self.normalize_options["ents_to_keep"]:
129 +
if self.normalize_options["pos_to_keep"] or self.normalize_options["ents_to_keep"] or lemmatizer == "spacy":
130 130
self.do_nlp = True
131 131
else:
132 132
self.do_nlp = False
@@ -461,6 +461,7 @@ def generate_ngrams(ngram_size: int, ngram_window: int, ngram_word_order: bool,
461 461
ngrams: list[PreprocessorToken] = []
462 462
ngram: Deque[PreprocessorToken] = deque()
463 463
ngram_text: str
464 +
tokens.purge() # remove empty tokens
464 465
for token in tokens:
465 466
ngram.append(token)
466 467
if len(ngram) == ngram_window:

0 commit comments

Comments
 (0)