Skip to content

Commit e453561

Browse files
committed
bug fixes
1 parent bb8f826 commit e453561

File tree

2 files changed

+32
-8
lines changed

2 files changed

+32
-8
lines changed

text_preprocessing/preprocessor.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,7 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
383383
sent_starts_list: list[list[bool]] = []
384384
sent_starts: list[bool] = []
385385
current_sent_id: str = ""
386+
word_count = 0
386387
with open_file(text) as philo_db_text:
387388
for line in philo_db_text:
388389
word_obj: dict[str, Any] = orjson.loads(line.strip())
@@ -409,28 +410,33 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
409410
text,
410411
)
411412
if cls.text_object_type == "sent":
412-
obj_metadata["philo_id"] = " ".join(
413-
current_text_object[0][1]["position"].split()[:6] + ["0"]
414-
)
415-
obj_metadata["philo_type"] = "sent"
416-
obj_metadata["start_byte"] = current_text_object[0][1]["start_byte"]
417-
obj_metadata["end_byte"] = current_text_object[-1][1]["end_byte"]
413+
obj_metadata = {
414+
**obj_metadata,
415+
"philo_id": " ".join(current_text_object[0][1]["position"].split()[:6] + ["0"]),
416+
"philo_type": "sent",
417+
"start_byte": current_text_object[0][1]["start_byte"],
418+
"end_byte": current_text_object[-1][1]["end_byte"],
419+
"word_count": word_count,
420+
}
418421
metadata.append(obj_metadata)
419422
else:
420423
metadata.append(os.path.basename(text))
421424
docs.append(current_text_object)
422425
sent_starts_list.append(sent_starts)
423426
sent_starts = []
424427
current_text_object = []
428+
word_count = 0
425429
current_object_id = object_id
426430
if current_sent_id == sent_id:
427431
sent_starts.append(False)
428432
else:
429433
sent_starts.append(True)
430434
if cls.modernize is not False:
431435
current_text_object.append((cls.modernize(word_obj["token"]), word_obj)) # type: ignore
436+
word_count += 1
432437
else:
433438
current_text_object.append((word_obj["token"], word_obj))
439+
word_count += 1
434440
sent_id = " ".join(philo_id[: PHILO_TEXT_OBJECT_TYPE["sent"]])
435441
if current_text_object:
436442
if fetch_metadata is True:

text_preprocessing/spacy_helpers.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,17 @@ def __init__(self, doc: Doc | Iterable[PreprocessorToken], metadata=None, keep_a
142142
def __get_tokens(self, doc: Doc):
143143
"""Return a generator of PreprocessorToken objects"""
144144
max_index = len(doc) - 1
145+
word_before = False
145146
for index, token in enumerate(doc):
146147
if token.text != "#DEL#":
147148
yield PreprocessorToken(token.text, token.pos_, token.ent_type_, token._.ext)
149+
word_before = True
148150
elif self.keep_all is True:
149151
yield PreprocessorToken("", token.pos_, token.ent_type_, token._.ext)
150-
if token.whitespace_ and index < max_index: # remove trailing whitespace
152+
word_before = True
153+
if all((token.whitespace_, word_before, index < max_index)): # keep whitespace except at the very end
151154
yield PreprocessorToken(token.whitespace_, "", "", {**token._.ext, "token": token.whitespace_})
155+
word_before = False
152156

153157
def __iter__(self) -> Iterable[PreprocessorToken]:
154158
for token in self.tokens:
@@ -165,7 +169,20 @@ def __getitem__(self, index: Union[int, slice]) -> Union[PreprocessorToken, Iter
165169
if isinstance(index, int):
166170
return self.tokens[index]
167171
elif isinstance(index, slice):
168-
return Tokens(list(self.tokens)[index], self.metadata)
172+
tokens = list(self.tokens)[index]
173+
if tokens:
174+
metadata = {
175+
**self.metadata,
176+
"start_byte": tokens[0].ext["start_byte"],
177+
"end_byte": tokens[-1].ext["end_byte"],
178+
}
179+
else:
180+
metadata = {
181+
**self.metadata,
182+
"start_byte": 0,
183+
"end_byte": 0,
184+
}
185+
return Tokens(tokens, metadata)
169186
else:
170187
print(f"{repr(index)} of type {type(index)} is not an index or slice")
171188
raise TypeError
@@ -215,6 +232,7 @@ def split_tokens(self, n: int) -> Iterable["Tokens"]:
215232
def extend(self, tokens) -> None:
216233
"""Extend size of Tokens"""
217234
self.tokens.extend(tokens.tokens)
235+
self.length = len(self.tokens)
218236
if not self.metadata:
219237
self.metadata = tokens.metadata
220238
self.metadata["end_byte"] = tokens.metadata["end_byte"]

0 commit comments

Comments (0)