Skip to content

Commit 2d9887a

Browse files
committed
Work around Philo4 parser bug that assigns punctuation to the wrong sentence
1 parent b95f56f commit 2d9887a

File tree

2 files changed

+13
-2
lines changed

2 files changed

+13
-2
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name="text_preprocessing",
9-
version="1.0rc5",
9+
version="1.0",
1010
author="The ARTFL Project",
1111
author_email="clovisgladstone@gmail.com",
1212
packages=["text_preprocessing", "text_preprocessing.lang"],

text_preprocessing/preprocessor.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -382,10 +382,17 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
382382
sent_id: str | None = None
383383
sent_starts_list: list[list[bool]] = []
384384
sent_starts: list[bool] = []
385+
current_sent_id: str = ""
385386
with open_file(text) as philo_db_text:
386387
for line in philo_db_text:
387388
word_obj: dict[str, Any] = orjson.loads(line.strip())
388-
philo_id = word_obj["position"].split()
389+
if (
390+
word_obj["philo_type"] == "punct" and current_sent_id
391+
): # workaround for a bug in the Philo4 parser where punctuation is assigned to the wrong sentence
392+
philo_id = current_sent_id.split()
393+
word_obj["position"] = current_sent_id + " 0"
394+
else:
395+
philo_id = word_obj["position"].split()
389396
object_id = " ".join(philo_id[: PHILO_TEXT_OBJECT_TYPE[cls.text_object_type]])
390397
current_sent_id = " ".join(philo_id[: PHILO_TEXT_OBJECT_TYPE["sent"]])
391398
if current_object_id == "":
@@ -406,6 +413,8 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
406413
current_text_object[0][1]["position"].split()[:6] + ["0"]
407414
)
408415
obj_metadata["philo_type"] = "sent"
416+
obj_metadata["start_byte"] = current_text_object[0][1]["start_byte"]
417+
obj_metadata["end_byte"] = current_text_object[-1][1]["end_byte"]
409418
metadata.append(obj_metadata)
410419
else:
411420
metadata.append(os.path.basename(text))
@@ -436,6 +445,8 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
436445
if cls.text_object_type == "sent":
437446
obj_metadata["philo_id"] = " ".join(current_text_object[0][1]["position"].split()[:6] + ["0"])
438447
obj_metadata["philo_type"] = "sent"
448+
obj_metadata["start_byte"] = current_text_object[0][1]["start_byte"]
449+
obj_metadata["end_byte"] = current_text_object[-1][1]["end_byte"]
439450
metadata.append(obj_metadata)
440451
docs.append(current_text_object)
441452
sent_starts_list.append(sent_starts)

0 commit comments

Comments
 (0)