@@ -1,14 +1,13 @@
 #!/usr/bin/env python3
 """Text Preprocessor"""
 
-import json
 import os
 import sqlite3
 import sys
 from collections import defaultdict, deque
 from dataclasses import dataclass
 from itertools import combinations
-from typing import Any, Callable, DefaultDict, Deque, Iterable, Iterator, Union, overload
+from typing import Any, Callable, DefaultDict, Deque, Iterable
 
 import lz4.frame
 import orjson
@@ -18,7 +17,7 @@
 from spacy.tokens import Doc, Token
 
 from .modernizer import Modernizer
-from .spacy_helpers import load_language_model
+from .spacy_helpers import load_language_model, Tokens, PreprocessorToken
 
 Doc.set_extension("metadata", default={})
 Doc.set_extension("char_num", default=0)
@@ -50,260 +49,6 @@ class PreparedDoc:
     char_num: int
 
 
-class PreprocessorToken(str):
-    """Token Object class inheriting from string
-
-    Args:
-        text: a string value
-        surface_form: surface form to be changed. Defaults to text if none given
-        pos_: a string value describing part-of-speech
-        ext: a dictionary containing additional metadata
-
-    Attributes:
-        text: a string value
-        surface_form: surface form to be changed. Defaults to text if none given
-        pos_: a string value describing part-of-speech
-        ext: a dictionary containing additional metadata
-
-    """
-
-    ext: dict[str, Any]
-
-    def __new__(cls, value, pos_="", ent="", ext={}):
-        return str.__new__(cls, value)
-
-    def __init__(
-        self,
-        text: str,
-        pos_: str = "",
-        ent: str = "",
-        ext: dict[str, Any] | None = None,
-    ):
-        self.text = text or ""
-        self.ext = ext or {}
-        if self.ext is not None:
-            self.surface_form = ext["token"]
-        else:
-            self.surface_form = text
-        self.ext["pos"] = pos_
-        self.pos_ = pos_
-        self.ent = ent
-
-    def __hash__(self):
-        return hash(self.text)
-
-    def __eq__(self, other) -> bool:
-        if isinstance(other, PreprocessorToken):
-            return self.text == other.text
-        return self.text == other
-
-    def __str__(self) -> str:
-        return self.text
-
-    def __call__(self):
-        return self
-
-    def __repr__(self) -> str:
-        return f"text={repr(self.text)}, surface_form={repr(self.surface_form)}, pos={self.pos_}, ext={repr(self.ext)}"
-
-    def __add__(self, other) -> str:
-        return self.text + other
-
-
-class Tokens:
-    """Tokens object contains a list of tokens as well as metadata
-
-    Args:
-        tokens: a list of Token objects
-        metadata: a dict containing metadata
-
-    Attributes:
-        tokens: a list of Token objects
-        metadata: a dict containing metadata
-        length: length of Tokens.tokens
-
-    """
-
-    def __init__(self, doc: Doc | Iterable[PreprocessorToken], metadata=None, keep_all=False):
-        self.keep_all = keep_all
-        if isinstance(doc, Doc):
-            self.tokens: Deque[PreprocessorToken] = Deque(self.__get_tokens(doc))
-        else:
-            self.tokens = Deque(doc)
-        if metadata is None:
-            self.metadata: dict[str, Any] = doc._.metadata  # type: ignore
-        else:
-            self.metadata = metadata
-        self.length: int = len(self.tokens)
-        self.iter_index = 0
-
-    def __get_tokens(self, doc: Doc):
-        """Return a generator of PreprocessorToken objects"""
-        max_index = len(doc) - 1
-        for index, token in enumerate(doc):
-            if token.text != "#DEL#":
-                yield PreprocessorToken(token.text, token.pos_, token.ent_type_, token._.ext)
-            elif self.keep_all is True:
-                yield PreprocessorToken("", token.pos_, token.ent_type_, token._.ext)
-            if token.whitespace_ and index < max_index:  # remove trailing whitespace
-                yield PreprocessorToken(token.whitespace_, "", "", {**token._.ext, "token": token.whitespace_})
-
-    def __iter__(self) -> Iterator[PreprocessorToken]:
-        for token in self.tokens:
-            yield token
-
-    def __next__(self):
-        self.iter_index += 1
-        if self.iter_index < self.length:
-            return self.tokens[self.iter_index]
-        else:
-            raise IndexError
-
-    @overload
-    def __getitem__(self, index: int) -> PreprocessorToken:
-        ...
-
-    @overload
-    def __getitem__(self, index: slice) -> Iterable[PreprocessorToken]:
-        ...
-
-    def __getitem__(self, index: Union[int, slice]) -> Union[PreprocessorToken, Iterable[PreprocessorToken]]:
-        if isinstance(index, int):
-            return self.tokens[index]
-        elif isinstance(index, slice):
-            return Tokens(list(self.tokens)[index], self.metadata)
-        else:
-            print(f"{repr(index)} of type {type(index)} is not an index or slice")
-            raise TypeError
-
-    def __len__(self) -> int:
-        return self.length
-
-    def __bool__(self) -> bool:
-        if self.length == 0:
-            return False
-        return True
-
-    def __repr__(self):
-        return repr([repr(t) for t in self.tokens])
-
-    def __str__(self):
-        return repr([str(t) for t in self.tokens])
-
-    def split_tokens(self, n: int) -> Iterator["Tokens"]:
-        """Divide Tokens into smaller Tokens of n length
-
-        Args:
-            n: split Tokens obj into a list of Tokens of length n
-
-        Returns:
-            An Iterator of Tokens
-
-        """
-        max_index: int = self.length - 1
-        for i in range(0, len(self), n):
-            end: int = i + n
-            if end > max_index:
-                metadata: dict[str, Any] = {
-                    **self.metadata,
-                    "start_byte": self[i].ext["start_byte"],
-                    "end_byte": self[max_index].ext["end_byte"],
-                }
-                yield Tokens(self[i:max_index], metadata)
-            else:
-                metadata = {
-                    **self.metadata,
-                    "start_byte": self[i].ext["start_byte"],
-                    "end_byte": self[end - 1].ext["end_byte"],
-                }
-                yield Tokens(self[i:end], metadata)
-
-    def extend(self, tokens) -> None:
-        """Extend size of Tokens"""
-        self.tokens.extend(tokens)
-        if not self.metadata:
-            self.metadata = tokens.metadata
-        self.metadata["end_byte"] = tokens.metadata["end_byte"]
-
-    def pop(self) -> PreprocessorToken | None:
-        """Remove last token from self.tokens"""
-        if self.tokens:
-            token = self.tokens.pop()
-            try:
-                self.metadata["end_byte"] = self.tokens[-1].ext["end_byte"]
-                self.length -= 1
-                return token
-            except IndexError:
-                self.length = 0
-                return token
-        return None
-
-    def popleft(self) -> PreprocessorToken | None:
-        """Remove first token from self.tokens"""
-        if self.tokens:
-            token = self.tokens.popleft()
-            try:
-                self.metadata["start_byte"] = self.tokens[0].ext["start_byte"]
-                self.length -= 1
-            except IndexError:
-                self.length = 0
-            return token
-        return None
-
-    def append(self, token: PreprocessorToken):
-        """Append Token"""
-        if not self.tokens:
-            self.metadata["start_byte"] = token.ext["start_byte"]
-        self.tokens.append(token)
-        self.metadata["end_byte"] = token.ext["end_byte"]
-        self.length += 1
-
-    def appendleft(self, token: PreprocessorToken):
-        """Append Token to the left of tokens"""
-        if not self.tokens:
-            self.metadata["end_byte"] = token.ext["end_byte"]
-        self.tokens.appendleft(token)
-        self.metadata["start_byte"] = token.ext["start_byte"]
-        self.length += 1
-
-    def purge(self):
-        """Remove empty tokens"""
-        self.tokens = deque(token for token in self.tokens if token.text and token.text != " ")
-        self.length = len(self.tokens)
-        if self.length:
-            self.metadata["start_byte"] = self.tokens[0].ext["start_byte"]
-            self.metadata["end_byte"] = self.tokens[-1].ext["end_byte"]
-        else:
-            self.metadata["start_byte"] = 0
-            self.metadata["end_byte"] = 0
-
-    def save(self, path):
-        """Save Tokens to disk"""
-        tokens_to_serialize = {"tokens": [], "metadata": self.metadata}
-        for token in self:
-            tokens_to_serialize["tokens"].append((token.text, token.surface_form, token.pos_, token.ext))
-        with open(path, "w", encoding="utf8") as output:
-            json.dump(tokens_to_serialize, output)
-
-    def load(self, path):
-        """Load tokens from disk"""
-        with open(path, "r", encoding="utf8") as input_file:
-            tokens = json.load(input_file)
-            self.metadata = tokens["metadata"]
-            self.tokens = deque(PreprocessorToken(t[0], t[1], t[2], t[3]) for t in tokens["tokens"])
-
-
-def chunks(l, n):
-    """Yield n number of sequential chunks from l."""
-    l = list(l)
-    d, r = divmod(len(l), n)
-    for i in range(n):
-        si = (d + 1) * (i if i < r else r) + d * (0 if i < r else i - r)
-        result = l[si : si + (d + 1 if i < r else d)]
-        if result:
-            yield result
-
-
 class PreProcessor:
     """Text Preprocessing class"""
 
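Note on the hunk above: it deletes PreprocessorToken and Tokens from this module, and the import change at the top of the diff shows they now come from .spacy_helpers. A minimal usage sketch follows, assuming the relocated classes keep the interface of the deleted block (the ext dict with "token", "start_byte", and "end_byte" keys is what that interface reads; the token values themselves are made up):

    # Hedged sketch: assumes the relocated Tokens/PreprocessorToken keep the
    # deleted block's API; imported relatively, as in this module.
    from .spacy_helpers import PreprocessorToken, Tokens

    tok = PreprocessorToken(
        "chat",  # normalized text
        pos_="NOUN",
        ext={"token": "Chat", "start_byte": 0, "end_byte": 4},  # surface form + byte offsets
    )
    tokens = Tokens([tok], metadata={"start_byte": 0, "end_byte": 4})
    tokens.append(PreprocessorToken(" ", ext={"token": " ", "start_byte": 4, "end_byte": 5}))
    tokens.purge()  # drops whitespace-only tokens and recomputes byte-range metadata
    print(len(tokens), tokens.metadata["end_byte"])  # -> 1 4
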
@@ -400,7 +145,9 @@ def process_texts(
         )
         if self.text_fetcher.text_object_type in ("para", "sent"):
             fetched_texts = self.nlp.pipe(
-                ((make_spacy_doc(self.nlp, tokens), c) for tokens, c in fetched_texts), as_tuples=True
+                ((make_spacy_doc(self.nlp, tokens), c) for tokens, c in fetched_texts),
+                as_tuples=True,
+                batch_size=500,
             )
         for tokens, doc_count in fetched_texts:
             count += 1
@@ -421,7 +168,7 @@ def process_texts(
             spacy_doc = make_spacy_doc(self.nlp, tokens)
             if spacy_doc._.char_num > 100000:  # being conservative to preserve GPU RAM
                 split_doc = self.__split_spacy_docs(spacy_doc)
-                rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc)))
+                rebuilt_doc = Doc.from_docs(list(self.nlp.pipe(split_doc, batch_size=128)))
                 rebuilt_doc._.metadata = spacy_doc._.metadata
                 tokens = Tokens(rebuilt_doc, keep_all=keep_all)
             else:
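Both remaining hunks pass an explicit batch_size to nlp.pipe. A short illustration of that knob, hedged: the model name and texts below are placeholders, and the 500/128 values simply mirror the diff, where the smaller batch is used on the long-document path to conserve GPU memory:

    import spacy

    nlp = spacy.load("en_core_web_sm")  # placeholder model
    texts = ["First paragraph.", "Second paragraph."] * 1000

    # Larger batches raise throughput at the cost of peak (GPU) memory;
    # left unset, spaCy v3 falls back to nlp.batch_size (1000 by default).
    docs = list(nlp.pipe(texts, batch_size=500))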