@@ -382,10 +382,17 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
382382 sent_id : str | None = None
383383 sent_starts_list : list [list [bool ]] = []
384384 sent_starts : list [bool ] = []
385+ current_sent_id : str = ""
385386 with open_file (text ) as philo_db_text :
386387 for line in philo_db_text :
387388 word_obj : dict [str , Any ] = orjson .loads (line .strip ())
388- philo_id = word_obj ["position" ].split ()
389+ if (
390+ word_obj ["philo_type" ] == "punct" and current_sent_id
391+ ): # workaround for bug in Philo4 parser where punctuation is assigned to wrong sentence
392+ philo_id = current_sent_id .split ()
393+ word_obj ["position" ] = current_sent_id + " 0"
394+ else :
395+ philo_id = word_obj ["position" ].split ()
389396 object_id = " " .join (philo_id [: PHILO_TEXT_OBJECT_TYPE [cls .text_object_type ]])
390397 current_sent_id = " " .join (philo_id [: PHILO_TEXT_OBJECT_TYPE ["sent" ]])
391398 if current_object_id == "" :
@@ -406,6 +413,8 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
406413 current_text_object [0 ][1 ]["position" ].split ()[:6 ] + ["0" ]
407414 )
408415 obj_metadata ["philo_type" ] = "sent"
416+ obj_metadata ["start_byte" ] = current_text_object [0 ][1 ]["start_byte" ]
417+ obj_metadata ["end_byte" ] = current_text_object [- 1 ][1 ]["end_byte" ]
409418 metadata .append (obj_metadata )
410419 else :
411420 metadata .append (os .path .basename (text ))
@@ -436,6 +445,8 @@ def process_philo_text(cls, text: str, fetch_metadata: bool = True):
436445 if cls .text_object_type == "sent" :
437446 obj_metadata ["philo_id" ] = " " .join (current_text_object [0 ][1 ]["position" ].split ()[:6 ] + ["0" ])
438447 obj_metadata ["philo_type" ] = "sent"
448+ obj_metadata ["start_byte" ] = current_text_object [0 ][1 ]["start_byte" ]
449+ obj_metadata ["end_byte" ] = current_text_object [- 1 ][1 ]["end_byte" ]
439450 metadata .append (obj_metadata )
440451 docs .append (current_text_object )
441452 sent_starts_list .append (sent_starts )
0 commit comments