Skip to content

Commit e453561

Browse files
committed
bug fixes
1 parent bb8f826 commit e453561

File tree

2 files changed

+32
-8
lines changed

2 files changed

+32
-8
lines changed

text_preprocessing/preprocessor.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,7 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
383383
sent_starts_list: list[list[bool]] = []
384384
sent_starts: list[bool] = []
385385
current_sent_id: str = ""
386+
word_count = 0
386387
with open_file(text) as philo_db_text:
387388
for line in philo_db_text:
388389
word_obj: dict[str, Any] = orjson.loads(line.strip())
@@ -409,28 +410,33 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
409410
text,
410411
)
411412
if cls.text_object_type == "sent":
412-
obj_metadata["philo_id"] = " ".join(
413-
current_text_object[0][1]["position"].split()[:6] + ["0"]
414-
)
415-
obj_metadata["philo_type"] = "sent"
416-
obj_metadata["start_byte"] = current_text_object[0][1]["start_byte"]
417-
obj_metadata["end_byte"] = current_text_object[-1][1]["end_byte"]
413+
obj_metadata = {
414+
**obj_metadata,
415+
"philo_id": " ".join(current_text_object[0][1]["position"].split()[:6] + ["0"]),
416+
"philo_type": "sent",
417+
"start_byte": current_text_object[0][1]["start_byte"],
418+
"end_byte": current_text_object[-1][1]["end_byte"],
419+
"word_count": word_count,
420+
}
418421
metadata.append(obj_metadata)
419422
else:
420423
metadata.append(os.path.basename(text))
421424
docs.append(current_text_object)
422425
sent_starts_list.append(sent_starts)
423426
sent_starts = []
424427
current_text_object = []
428+
word_count = 0
425429
current_object_id = object_id
426430
if current_sent_id == sent_id:
427431
sent_starts.append(False)
428432
else:
429433
sent_starts.append(True)
430434
if cls.modernize is not False:
431435
current_text_object.append((cls.modernize(word_obj["token"]), word_obj)) # type: ignore
436+
word_count += 1
432437
else:
433438
current_text_object.append((word_obj["token"], word_obj))
439+
word_count += 1
434440
sent_id = " ".join(philo_id[: PHILO_TEXT_OBJECT_TYPE["sent"]])
435441
if current_text_object:
436442
if fetch_metadata is True:

text_preprocessing/spacy_helpers.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,17 @@ def __init__(self, doc: Doc | Iterable[PreprocessorToken], metadata=None, keep_a
142142
def __get_tokens(self, doc: Doc):
143143
"""Return a generator of PreprocessorToken objects"""
144144
max_index = len(doc) - 1
145+
word_before = False
145146
for index, token in enumerate(doc):
146147
if token.text != "#DEL#":
147148
yield PreprocessorToken(token.text, token.pos_, token.ent_type_, token._.ext)
149+
word_before = True
148150
elif self.keep_all is True:
149151
yield PreprocessorToken("", token.pos_, token.ent_type_, token._.ext)
150-
if token.whitespace_ and index < max_index: # remove trailing whitespace
152+
word_before = True
153+
if all((token.whitespace_, word_before, index < max_index)): # keep whitespace except at the very end
151154
yield PreprocessorToken(token.whitespace_, "", "", {**token._.ext, "token": token.whitespace_})
155+
word_before = False
152156

153157
def __iter__(self) -> Iterable[PreprocessorToken]:
154158
for token in self.tokens:
@@ -165,7 +169,20 @@ def __getitem__(self, index: Union[int, slice]) -> Union[PreprocessorToken, Iter
165169
if isinstance(index, int):
166170
return self.tokens[index]
167171
elif isinstance(index, slice):
168-
return Tokens(list(self.tokens)[index], self.metadata)
172+
tokens = list(self.tokens)[index]
173+
if tokens:
174+
metadata = {
175+
**self.metadata,
176+
"start_byte": tokens[0].ext["start_byte"],
177+
"end_byte": tokens[-1].ext["end_byte"],
178+
}
179+
else:
180+
metadata = {
181+
**self.metadata,
182+
"start_byte": 0,
183+
"end_byte": 0,
184+
}
185+
return Tokens(tokens, metadata)
169186
else:
170187
print(f"{repr(index)} of type {type(index)} is not an index or slice")
171188
raise TypeError
@@ -215,6 +232,7 @@ def split_tokens(self, n: int) -> Iterable["Tokens"]:
215232
def extend(self, tokens) -> None:
216233
"""Extend size of Tokens"""
217234
self.tokens.extend(tokens.tokens)
235+
self.length = len(self.tokens)
218236
if not self.metadata:
219237
self.metadata = tokens.metadata
220238
self.metadata["end_byte"] = tokens.metadata["end_byte"]

0 commit comments

Comments (0)