Skip to content

Commit efad180

Browse files
committed
various bug fixes
1 parent adfc335 commit efad180

File tree

8 files changed

+78
-18
lines changed

8 files changed

+78
-18
lines changed
2.06 KB
Binary file not shown.
1.81 KB
Binary file not shown.
2.65 KB
Binary file not shown.

tests/test_preprocessor.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,19 @@ def append_marker(tokens):
381381
assert "world_X" in texts
382382
assert "foo_X" in texts
383383

384+
def test_post_processing_function_applied_in_process_string(self):
385+
def append_marker(tokens):
386+
for tok in tokens:
387+
if tok.text and tok.text != " ":
388+
tok.text = tok.text + "_X"
389+
return tokens
390+
391+
p = PreProcessor(language="english", post_processing_function=append_marker, workers=1)
392+
texts = words(p.process_string("hello world foo"))
393+
assert "hello_X" in texts
394+
assert "world_X" in texts
395+
assert "foo_X" in texts
396+
384397
# --- workers=1 explicit ---
385398

386399
def test_single_worker_produces_same_output(self):

tests/test_spacy_models.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,17 @@ def test_en_lemmatizer_does_not_alter_already_base_form(self):
251251
assert "cat" in texts
252252
assert "dog" in texts
253253

254+
def test_en_lemmatizer_works_without_pos_to_keep(self):
255+
# Tagger must be kept even when pos_to_keep is unset, because the
256+
# English rule-based lemmatizer depends on POS annotations.
257+
p = PreProcessor(language="english", language_model=EN_MODEL,
258+
lemmatizer="spacy", workers=1)
259+
toks = tokens_with_pos(p.process_string("dogs are running in cities"))
260+
texts = [t.text for t in toks]
261+
assert "dog" in texts # dogs → dog
262+
assert "run" in texts # running → run
263+
assert "city" in texts # cities → city
264+
254265

255266
# ===========================================================================
256267
# Integration: language model + corpus files

tests/test_tokens.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,28 @@ def test_iter_text_values(self):
109109
tokens = make_tokens(["hello", "world", "foo"])
110110
assert [t.text for t in tokens] == ["hello", "world", "foo"]
111111

112+
def test_next_yields_all_tokens(self):
113+
tokens = make_tokens(["a", "b", "c"])
114+
result = []
115+
try:
116+
while True:
117+
result.append(next(tokens))
118+
except StopIteration:
119+
pass
120+
assert [t.text for t in result] == ["a", "b", "c"]
121+
122+
def test_next_raises_stop_iteration(self):
123+
tokens = make_tokens(["a"])
124+
next(tokens)
125+
with pytest.raises(StopIteration):
126+
next(tokens)
127+
128+
def test_next_resets_after_exhaustion(self):
129+
tokens = make_tokens(["a", "b"])
130+
list(tokens) # exhaust via __iter__
131+
# __next__ should work again from the start
132+
assert next(tokens).text == "a"
133+
112134
def test_getitem_int(self):
113135
tokens = make_tokens(["hello", "world"])
114136
assert tokens[0].text == "hello"

text_preprocessing/preprocessor.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,13 @@ class PreparedDoc:
5050

5151

5252
class PreProcessor:
53-
"""Text Preprocessing class"""
53+
"""Text preprocessing pipeline.
54+
55+
Only one instance should be active at a time: configuration is stored on
56+
TextFetcher as class variables so forked workers inherit it via
57+
copy-on-write, avoiding costly pickling of the Spacy model and language
58+
dictionaries.
59+
"""
5460

5561
def __init__(
5662
self,
@@ -192,7 +198,10 @@ def process_string(self, text: str, keep_all: bool = True) -> Tokens:
192198
"""Take a string and return a list of preprocessed tokens"""
193199
doc = self.text_fetcher.process_string(text)
194200
processed_doc = self.nlp(doc)
195-
return Tokens(processed_doc, keep_all=keep_all)
201+
tokens = Tokens(processed_doc, keep_all=keep_all)
202+
if self.post_func is not None:
203+
tokens = self.post_func(tokens)
204+
return tokens
196205

197206
def __split_spacy_docs(self, doc: Doc) -> list[Doc]:
198207
"""Split spacy doc into smaller docs of 10 sentences"""
@@ -202,23 +211,27 @@ def __split_spacy_docs(self, doc: Doc) -> list[Doc]:
202211
if len(sentence_group) == 10:
203212
docs.append(Doc.from_docs(sentence_group))
204213
sentence_group = []
205-
else:
206-
sent_starts = []
207-
words = []
208-
for token in sent:
209-
sent_starts.append(token.is_sent_start)
210-
words.append(token.text)
211-
sent_doc = Doc(self.nlp.vocab, words, sent_starts=sent_starts)
212-
for pos, token in enumerate(sent):
213-
sent_doc[pos]._.ext = token._.ext
214-
sentence_group.append(sent_doc)
214+
sent_starts = []
215+
words = []
216+
for token in sent:
217+
sent_starts.append(token.is_sent_start)
218+
words.append(token.text)
219+
sent_doc = Doc(self.nlp.vocab, words, sent_starts=sent_starts)
220+
for pos, token in enumerate(sent):
221+
sent_doc[pos]._.ext = token._.ext
222+
sentence_group.append(sent_doc)
215223
if sentence_group:
216224
docs.append(Doc.from_docs(sentence_group))
217225
return docs
218226

219227

220228
class TextFetcher:
221-
"""Text fetcher"""
229+
"""Tokeniser and file reader for PreProcessor.
230+
231+
Configuration is kept as class variables so that forked worker processes
232+
inherit the Spacy model and language dictionaries via copy-on-write without
233+
pickling them. Workers treat this state as read-only.
234+
"""
222235

223236
word_regex: str = r"[\p{L}\p{M}\p{N}]+|'"
224237
sentence_boundaries: list[str] = [".", "!", "?"]

text_preprocessing/spacy_helpers.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,12 @@ def __iter__(self) -> Iterable[PreprocessorToken]:
130130
yield token
131131

132132
def __next__(self):
133+
if self.iter_index >= self.length:
134+
self.iter_index = 0
135+
raise StopIteration
136+
token = self.tokens[self.iter_index]
133137
self.iter_index += 1
134-
if self.iter_index < self.length:
135-
return self.tokens[self.iter_index]
136-
else:
137-
raise IndexError
138+
return token
138139

139140
def __getitem__(self, index: Union[int, slice]) -> Union[PreprocessorToken, Iterable[PreprocessorToken]]:
140141
if isinstance(index, int):
@@ -484,7 +485,7 @@ def load_language_model(language_model, normalize_options: dict[str, Any]) -> tu
484485
)
485486
):
486487
disabled_pipelines = ["tokenizer", "textcat"]
487-
if not normalize_options["pos_to_keep"]:
488+
if not normalize_options["pos_to_keep"] and normalize_options["lemmatizer"] != "spacy":
488489
disabled_pipelines.append("tagger")
489490
if not normalize_options["ents_to_keep"]:
490491
disabled_pipelines.append("ner")

0 commit comments

Comments (0)