Skip to content

Commit a6bc9b4

Browse files
committed
fixes
1 parent 6ca2fcd commit a6bc9b4

File tree

2 files changed

+4
-6
lines changed

2 files changed

+4
-6
lines changed

text_preprocessing/preprocessor.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -101,8 +101,7 @@ def __init__(
101101
if nlp_model is not None:
102102
self.nlp = nlp_model
103103
else:
104-
if language_model is not None:
105-
self.nlp, using_gpu = load_language_model(language_model, self.normalize_options)
104+
self.nlp, using_gpu = load_language_model(language_model, self.normalize_options)
106105
self.using_gpu = using_gpu
107106
if workers is None:
108107
cpu_count = os.cpu_count() or 2
@@ -173,7 +172,7 @@ def process_texts(
173172
)
174173
if isinstance(tokens, PreparedDoc):
175174
spacy_doc = make_spacy_doc(self.nlp, tokens)
176-
if spacy_doc._.char_num > 100000: # being conservative to preserve GPU RAM
175+
if spacy_doc._.char_num > 100000 and self.using_gpu is True: # being conservative to preserve GPU RAM
177176
split_doc = self.__split_spacy_docs(spacy_doc)
178177
rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc, batch_size=128)))
179178
rebuilt_doc._.metadata = spacy_doc._.metadata
@@ -260,9 +259,7 @@ def __init__(
260259
else:
261260
cls.modernize = False
262261
cls.strip_tags = strip_tags
263-
264262
cls.is_philo_db = is_philo_db
265-
266263
cls.text_object_type = text_object_type
267264
cls.token_regex = re.compile(rf"({word_regex})|([{''.join(sentence_boundaries)}])")
268265
cls.sentence_boundaries = sentence_boundaries

text_preprocessing/spacy_helpers.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -438,6 +438,7 @@ def __filter_token(self, token: Token | PreprocessorToken) -> bool:
438438
return True
439439
return False
440440
if self.pos_to_keep and token.pos_ not in self.pos_to_keep:
441+
print(token, self.pos_to_keep, token.pos_)
441442
return True
442443
return False
443444

@@ -484,7 +485,7 @@ def clear_trf_data(doc):
484485
def load_language_model(language_model, normalize_options: dict[str, Any]) -> tuple[Language, bool]:
485486
"""Load language model based on name"""
486487
nlp = None
487-
if any(
488+
if language_model is not None and any(
488489
(
489490
normalize_options["lemmatizer"] == "spacy",
490491
normalize_options["pos_to_keep"],

0 commit comments

Comments
 (0)