Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions docling/models/stages/reading_order/readingorder_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,13 @@ def _merge_elements(self, element, merged_elem, new_item, page_height):
),
bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
)
new_item.text += f" {merged_elem.text}"
new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
if new_item.text.endswith('\u00AD'):
# A trailing soft hyphen (U+00AD) marks a word that was hyphenated across a line break: drop the soft hyphen and join the two fragments without a space.
Comment thread
cau-git marked this conversation as resolved.
new_item.text = new_item.text[:-1] + merged_elem.text
new_item.orig = new_item.orig[:-1] + merged_elem.text # TODO: This is incomplete, we don't have the `orig` field of the merged element.
else:
new_item.text += f" {merged_elem.text}"
new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
new_item.prov.append(prov)

if new_item.hyperlink != merged_elem.hyperlink:
Expand Down
Loading