
Commit 2dcf4a3 ("various fixes and tweaks")
1 parent: c2192f4

4 files changed: +268 -269 lines

setup.py (1 addition & 1 deletion)

```diff
@@ -6,7 +6,7 @@
 
 setup(
     name="text_preprocessing",
-    version="1.0rc4",
+    version="1.0rc5",
     author="The ARTFL Project",
     author_email="clovisgladstone@gmail.com",
     packages=["text_preprocessing", "text_preprocessing.lang"],
```

text_preprocessing/__init__.py (2 additions & 2 deletions)

```diff
@@ -1,4 +1,4 @@
 from .modernizer import Modernizer
-from .preprocessor import PreProcessor, Tokens, PreprocessorToken as Token
+from .preprocessor import PreProcessor
 from .text_loader import text_loader
-from .spacy_helpers import load_language_model
+from .spacy_helpers import load_language_model, PreProcessingPipe, Tokens, PreprocessorToken as Token
```
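The public surface stays the same for downstream code: `Tokens` and `Token` are still importable from the package root, they simply resolve to `text_preprocessing.spacy_helpers` now, and `PreProcessingPipe` is newly re-exported. A quick sanity check of the import paths after this commit (assuming the package is installed):

```python
# Imports that worked before the refactor still work; Tokens and Token now
# resolve to text_preprocessing.spacy_helpers behind the scenes.
from text_preprocessing import PreProcessor, Tokens, Token

# PreProcessingPipe is newly re-exported at the package root in this commit.
from text_preprocessing import PreProcessingPipe
```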

text_preprocessing/preprocessor.py (6 additions & 259 deletions)

```diff
@@ -1,14 +1,13 @@
 #!/usr/bin/env python3
 """Text Preprocessor"""
 
-import json
 import os
 import sqlite3
 import sys
 from collections import defaultdict, deque
 from dataclasses import dataclass
 from itertools import combinations
-from typing import Any, Callable, DefaultDict, Deque, Iterable, Iterator, Union, overload
+from typing import Any, Callable, DefaultDict, Deque, Iterable
 
 import lz4.frame
 import orjson
@@ -18,7 +17,7 @@
 from spacy.tokens import Doc, Token
 
 from .modernizer import Modernizer
-from .spacy_helpers import load_language_model
+from .spacy_helpers import load_language_model, Tokens, PreprocessorToken
 
 Doc.set_extension("metadata", default={})
 Doc.set_extension("char_num", default=0)
@@ -50,260 +49,6 @@ class PreparedDoc
     char_num: int
 
 
-class PreprocessorToken(str):
-    """Token object class inheriting from string
-
-    Args:
-        text: a string value
-        surface_form: surface form to be changed. Defaults to text if none given
-        pos_: a string value describing part-of-speech
-        ext: a dictionary containing additional metadata
-
-    Attributes:
-        text: a string value
-        surface_form: surface form to be changed. Defaults to text if none given
-        pos_: a string value describing part-of-speech
-        ext: a dictionary containing additional metadata
-
-    """
-
-    ext: dict[str, Any]
-
-    def __new__(cls, value, pos_="", ent="", ext={}):
-        return str.__new__(cls, value)
-
-    def __init__(
-        self,
-        text: str,
-        pos_: str = "",
-        ent: str = "",
-        ext: dict[str, Any] | None = None,
-    ):
-        self.text = text or ""
-        self.ext = ext or {}
-        if self.ext is not None:
-            self.surface_form = ext["token"]
-        else:
-            self.surface_form = text
-        self.ext["pos"] = pos_
-        self.pos_ = pos_
-        self.ent = ent
-
-    def __hash__(self):
-        return hash(self.text)
-
-    def __eq__(self, other) -> bool:
-        if isinstance(other, PreprocessorToken):
-            return self.text == other.text
-        return self.text == other
-
-    def __str__(self) -> str:
-        return self.text
-
-    def __call__(self):
-        return self
-
-    def __repr__(self) -> str:
-        return f"text={repr(self.text)}, surface_form={repr(self.surface_form)}, pos={self.pos_}, ext={repr(self.ext)}"
-
-    def __add__(self, other) -> str:
-        return self.text + other
-
-
-class Tokens:
-    """Tokens object contains a list of tokens as well as metadata
-
-    Args:
-        tokens: a list of Token objects
-        metadata: a dict containing metadata
-
-    Attributes:
-        tokens: a list of Token objects
-        metadata: a dict containing metadata
-        length: length of Tokens.tokens
-
-    """
-
-    def __init__(self, doc: Doc | Iterable[PreprocessorToken], metadata=None, keep_all=False):
-        self.keep_all = keep_all
-        if isinstance(doc, Doc):
-            self.tokens: Deque[PreprocessorToken] = Deque(self.__get_tokens(doc))
-        else:
-            self.tokens = Deque(doc)
-        if metadata is None:
-            self.metadata: dict[str, Any] = doc._.metadata  # type: ignore
-        else:
-            self.metadata = metadata
-        self.length: int = len(self.tokens)
-        self.iter_index = 0
-
-    def __get_tokens(self, doc: Doc):
-        """Return a generator of PreprocessorToken objects"""
-        max_index = len(doc) - 1
-        for index, token in enumerate(doc):
-            if token.text != "#DEL#":
-                yield PreprocessorToken(token.text, token.pos_, token.ent_type_, token._.ext)
-            elif self.keep_all is True:
-                yield PreprocessorToken("", token.pos_, token.ent_type_, token._.ext)
-            if token.whitespace_ and index < max_index:  # remove trailing whitespace
-                yield PreprocessorToken(token.whitespace_, "", "", {**token._.ext, "token": token.whitespace_})
-
-    def __iter__(self) -> Iterator[PreprocessorToken]:
-        for token in self.tokens:
-            yield token
-
-    def __next__(self):
-        self.iter_index += 1
-        if self.iter_index < self.length:
-            return self.tokens[self.iter_index]
-        else:
-            raise IndexError
-
-    @overload
-    def __getitem__(self, index: int) -> PreprocessorToken:
-        ...
-
-    @overload
-    def __getitem__(self, index: slice) -> Iterable[PreprocessorToken]:
-        ...
-
-    def __getitem__(self, index: Union[int, slice]) -> Union[PreprocessorToken, Iterable[PreprocessorToken]]:
-        if isinstance(index, int):
-            return self.tokens[index]
-        elif isinstance(index, slice):
-            return Tokens(list(self.tokens)[index], self.metadata)
-        else:
-            print(f"{repr(index)} of type {type(index)} is not an index or slice")
-            raise TypeError
-
-    def __len__(self) -> int:
-        return self.length
-
-    def __bool__(self) -> bool:
-        if self.length == 0:
-            return False
-        return True
-
-    def __repr__(self):
-        return repr([repr(t) for t in self.tokens])
-
-    def __str__(self):
-        return repr([str(t) for t in self.tokens])
-
-    def split_tokens(self, n: int) -> Iterator["Tokens"]:
-        """Divide Tokens into smaller Tokens of n length
-
-        Args:
-            n: split Tokens obj into a list of Tokens of length n
-
-        Returns:
-            An Iterator of Tokens
-
-        """
-        max_index: int = self.length - 1
-        for i in range(0, len(self), n):
-            end: int = i + n
-            if end > max_index:
-                metadata: dict[str, Any] = {
-                    **self.metadata,
-                    "start_byte": self[i].ext["start_byte"],
-                    "end_byte": self[max_index].ext["end_byte"],
-                }
-                yield Tokens(self[i:max_index], metadata)
-            else:
-                metadata = {
-                    **self.metadata,
-                    "start_byte": self[i].ext["start_byte"],
-                    "end_byte": self[end - 1].ext["end_byte"],
-                }
-                yield Tokens(self[i:end], metadata)
-
-    def extend(self, tokens) -> None:
-        """Extend size of Tokens"""
-        self.tokens.extend(tokens)
-        if not self.metadata:
-            self.metadata = tokens.metadata
-        self.metadata["end_byte"] = tokens.metadata["end_byte"]
-
-    def pop(self) -> PreprocessorToken | None:
-        """Remove last token from self.tokens"""
-        if self.tokens:
-            token = self.tokens.pop()
-            try:
-                self.metadata["end_byte"] = self.tokens[-1].ext["end_byte"]
-                self.length -= 1
-                return token
-            except IndexError:
-                self.length = 0
-                return token
-        return None
-
-    def popleft(self) -> PreprocessorToken | None:
-        """Remove first token from self.tokens"""
-        if self.tokens:
-            token = self.tokens.popleft()
-            try:
-                self.metadata["start_byte"] = self.tokens[0].ext["start_byte"]
-                self.length -= 1
-            except IndexError:
-                self.length = 0
-            return token
-        return None
-
-    def append(self, token: PreprocessorToken):
-        """Append Token"""
-        if not self.tokens:
-            self.metadata["start_byte"] = token.ext["start_byte"]
-        self.tokens.append(token)
-        self.metadata["end_byte"] = token.ext["end_byte"]
-        self.length += 1
-
-    def appendleft(self, token: PreprocessorToken):
-        """Append Token to the left of tokens"""
-        if not self.tokens:
-            self.metadata["end_byte"] = token.ext["end_byte"]
-        self.tokens.appendleft(token)
-        self.metadata["start_byte"] = token.ext["start_byte"]
-        self.length += 1
-
-    def purge(self):
-        """Remove empty tokens"""
-        self.tokens = deque(token for token in self.tokens if token.text and token.text != " ")
-        self.length = len(self.tokens)
-        if self.length:
-            self.metadata["start_byte"] = self.tokens[0].ext["start_byte"]
-            self.metadata["end_byte"] = self.tokens[-1].ext["end_byte"]
-        else:
-            self.metadata["start_byte"] = 0
-            self.metadata["end_byte"] = 0
-
-    def save(self, path):
-        """Save Tokens to disk"""
-        tokens_to_serialize = {"tokens": [], "metadata": self.metadata}
-        for token in self:
-            tokens_to_serialize["tokens"].append((token.text, token.surface_form, token.pos_, token.ext))
-        with open(path, "w", encoding="utf8") as output:
-            json.dump(tokens_to_serialize, output)
-
-    def load(self, path):
-        """Load tokens from disk"""
-        with open(path, "r", encoding="utf8") as input_file:
-            tokens = json.load(input_file)
-        self.metadata = tokens["metadata"]
-        self.tokens = deque(PreprocessorToken(t[0], t[1], t[2], t[3]) for t in tokens["tokens"])
-
-
-def chunks(l, n):
-    """Yield n number of sequential chunks from l."""
-    l = list(l)
-    d, r = divmod(len(l), n)
-    for i in range(n):
-        si = (d + 1) * (i if i < r else r) + d * (0 if i < r else i - r)
-        result = l[si : si + (d + 1 if i < r else d)]
-        if result:
-            yield result
-
-
 class PreProcessor:
     """Text Preprocessing class"""
 
```
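The `PreprocessorToken` and `Tokens` classes (and the `chunks` helper) are deleted here rather than rewritten: per the new import at the top of the file, they now live in `spacy_helpers` alongside the new `PreProcessingPipe`. The core trick `PreprocessorToken` relies on, subclassing `str` so a token compares and hashes like plain text while still carrying annotations, is easy to miss in the deletion. A minimal illustrative sketch of that pattern (class and attribute names here are hypothetical, not the library's API):

```python
from typing import Any


class AnnotatedToken(str):
    """Hypothetical stand-in for the str-subclass pattern used by PreprocessorToken."""

    def __new__(cls, text: str, pos: str = "", ext: dict[str, Any] | None = None):
        # str is immutable, so extra state is attached in __new__ after creation
        obj = str.__new__(cls, text)
        obj.pos = pos        # part-of-speech rides along on the string
        obj.ext = ext or {}  # arbitrary metadata, as in the removed class
        return obj


token = AnnotatedToken("chien", pos="NOUN", ext={"start_byte": 42})
assert token == "chien"          # equality and hashing behave like plain str
assert token.upper() == "CHIEN"  # all str methods still work
print(token.pos, token.ext)      # annotations survive -> NOUN {'start_byte': 42}
```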

```diff
@@ -400,7 +145,9 @@ def process_texts(
             )
         if self.text_fetcher.text_object_type in ("para", "sent"):
             fetched_texts = self.nlp.pipe(
-                ((make_spacy_doc(self.nlp, tokens), c) for tokens, c in fetched_texts), as_tuples=True
+                ((make_spacy_doc(self.nlp, tokens), c) for tokens, c in fetched_texts),
+                as_tuples=True,
+                batch_size=500,
             )
         for tokens, doc_count in fetched_texts:
             count += 1
```
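For context on the hunk above: spaCy's `Language.pipe` accepts a `batch_size` argument controlling how many texts are buffered and processed per batch, so pinning it explicitly (500 here) trades throughput against peak memory, which matters when the units are short paragraphs or sentences. A minimal sketch of the call shape, using a blank pipeline so it runs without a model download:

```python
import spacy

# Blank French pipeline: tokenizer only, no downloaded model needed.
nlp = spacy.blank("fr")

# as_tuples=True threads a context value (here a doc count) through the pipe,
# mirroring the (doc, c) tuples in the diff above.
texts = [("Premier paragraphe.", 1), ("Deuxième paragraphe.", 2)]
for doc, doc_count in nlp.pipe(texts, as_tuples=True, batch_size=500):
    print(doc_count, [t.text for t in doc])
```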
```diff
@@ -421,7 +168,7 @@ def process_texts(
                 spacy_doc = make_spacy_doc(self.nlp, tokens)
                 if spacy_doc._.char_num > 100000:  # being conservative to preserve GPU RAM
                     split_doc = self.__split_spacy_docs(spacy_doc)
-                    rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc)))
+                    rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc, batch_size=128)))
                     rebuilt_doc._.metadata = spacy_doc._.metadata
                     tokens = Tokens(rebuilt_doc, keep_all=keep_all)
                 else:
```
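The `batch_size=128` added above applies the same idea to the oversized-document path: a doc longer than 100,000 characters is split, piped through the model in bounded batches, and reassembled with `Doc.from_docs`. That reassembly API (spaCy v3+) concatenates processed docs sharing a vocab into a single `Doc`; a minimal sketch:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")  # blank pipeline keeps the sketch self-contained

# Process slices independently, then stitch them back into a single Doc,
# as the split-then-rebuild path in the diff does for very long texts.
parts = list(nlp.pipe(["First slice.", "Second slice."], batch_size=128))
whole = Doc.from_docs(parts)
print(whole.text)  # slices joined back together; whitespace inserted by default
```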
