@@ -142,13 +142,17 @@ def __init__(self, doc: Doc | Iterable[PreprocessorToken], metadata=None, keep_a
def __get_tokens(self, doc: Doc):
    """Lazily convert the tokens of ``doc`` into PreprocessorToken objects.

    For each token of ``doc`` (presumably a spaCy ``Doc`` -- confirm against
    the caller):

    * a token whose text is not the ``"#DEL#"`` marker is yielded with its
      text, ``pos_``, ``ent_type_`` and ``_.ext`` payload;
    * a ``"#DEL#"`` token is yielded with an empty text only when
      ``self.keep_all`` is exactly ``True``; otherwise it is dropped;
    * the token's ``whitespace_`` is yielded as its own PreprocessorToken
      (empty POS / entity type, ext copied with ``"token"`` overridden by the
      whitespace string) only if something was yielded since the previous
      whitespace token and the token is not the last one in ``doc``.

    Yields:
        PreprocessorToken objects, in document order.
    """
    last_position = len(doc) - 1
    # True while a word (or kept placeholder) was emitted and no whitespace
    # token has followed it yet; prevents runs of consecutive whitespace
    # tokens when deleted tokens are dropped.
    emitted_since_space = False
    for position, tok in enumerate(doc):
        is_deleted = tok.text == "#DEL#"
        if not is_deleted:
            yield PreprocessorToken(tok.text, tok.pos_, tok.ent_type_, tok._.ext)
            emitted_since_space = True
        elif self.keep_all is True:
            # Keep a placeholder with empty text so positions are preserved.
            yield PreprocessorToken("", tok.pos_, tok.ent_type_, tok._.ext)
            emitted_since_space = True

        space = tok.whitespace_
        # Keep whitespace except at the very end of the document.
        if space and emitted_since_space and position < last_position:
            space_ext = {**tok._.ext, "token": space}
            yield PreprocessorToken(space, "", "", space_ext)
            emitted_since_space = False
152156
153157 def __iter__ (self ) -> Iterable [PreprocessorToken ]:
154158 for token in self .tokens :
@@ -165,7 +169,20 @@ def __getitem__(self, index: Union[int, slice]) -> Union[PreprocessorToken, Iter
165169 if isinstance (index , int ):
166170 return self .tokens [index ]
167171 elif isinstance (index , slice ):
168- return Tokens (list (self .tokens )[index ], self .metadata )
172+ tokens = list (self .tokens )[index ]
173+ if tokens :
174+ metadata = {
175+ ** self .metadata ,
176+ "start_byte" : tokens [0 ].ext ["start_byte" ],
177+ "end_byte" : tokens [- 1 ].ext ["end_byte" ],
178+ }
179+ else :
180+ metadata = {
181+ ** self .metadata ,
182+ "start_byte" : 0 ,
183+ "end_byte" : 0 ,
184+ }
185+ return Tokens (tokens , metadata )
169186 else :
170187 print (f"{ repr (index )} of type { type (index )} is not an index or slice" )
171188 raise TypeError
@@ -215,6 +232,7 @@ def split_tokens(self, n: int) -> Iterable["Tokens"]:
def extend(self, tokens) -> None:
    """Extend this object in place with the tokens of ``tokens``.

    Args:
        tokens: another object exposing ``.tokens`` (an iterable of tokens)
            and ``.metadata`` (a mapping containing ``"end_byte"``).

    Side effects:
        * ``self.tokens`` receives all of ``tokens.tokens``;
        * ``self.length`` is refreshed to the new token count;
        * if ``self.metadata`` is empty it is initialised from a *copy* of
          ``tokens.metadata``;
        * ``self.metadata["end_byte"]`` is set to ``tokens.metadata["end_byte"]``.

    Raises:
        KeyError: if ``tokens.metadata`` has no ``"end_byte"`` entry.
    """
    self.tokens.extend(tokens.tokens)
    # Keep the cached length in sync with the underlying container.
    self.length = len(self.tokens)
    if not self.metadata:
        # Shallow-copy rather than alias: sharing the dict would let later
        # writes to self.metadata (e.g. the next extend() updating
        # "end_byte") silently modify the other object's metadata.
        self.metadata = dict(tokens.metadata)
    self.metadata["end_byte"] = tokens.metadata["end_byte"]
0 commit comments