Skip to content

Commit d041ecc

Browse files
fix: chain merge continuations instead of stopping after one pair (#156)
Signed-off-by: stone <frank.schruefer@t-online.de>
1 parent 327990c commit d041ecc

2 files changed

Lines changed: 47 additions & 27 deletions

File tree

docling_ibm_models/reading_order/reading_order_rb.py

Lines changed: 46 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -174,6 +174,15 @@ def predict_merges(
174174

175175
merges: Dict[int, List[int]] = {}
176176

177+
skip_labels = [
178+
DocItemLabel.PAGE_HEADER,
179+
DocItemLabel.PAGE_FOOTER,
180+
DocItemLabel.TABLE,
181+
DocItemLabel.PICTURE,
182+
DocItemLabel.CAPTION,
183+
DocItemLabel.FOOTNOTE,
184+
]
185+
177186
curr_ind = -1
178187
for ind, elem in enumerate(sorted_elements):
179188

@@ -182,32 +191,43 @@ def predict_merges(
182191

183192
if elem.label in [DocItemLabel.TEXT]:
184193

185-
ind_p1 = ind + 1
186-
while ind_p1 < len(sorted_elements) and sorted_elements[ind_p1] in [
187-
DocItemLabel.PAGE_HEADER,
188-
DocItemLabel.PAGE_FOOTER,
189-
DocItemLabel.TABLE,
190-
DocItemLabel.PICTURE,
191-
DocItemLabel.CAPTION,
192-
DocItemLabel.FOOTNOTE,
193-
]:
194-
ind_p1 += 1
195-
196-
if (
197-
ind_p1 < len(sorted_elements)
198-
and sorted_elements[ind_p1].label == elem.label
199-
and (
200-
elem.page_no != sorted_elements[ind_p1].label
201-
or elem.is_strictly_left_of(sorted_elements[ind_p1])
202-
)
203-
):
204-
205-
m1 = re.fullmatch(r".+([a-z,\-])(\s*)", elem.text)
206-
m2 = re.fullmatch(r"(\s*[a-z])(.+)", sorted_elements[ind_p1].text)
194+
merge_list: List[int] = []
195+
check_ind = ind
196+
197+
while True:
198+
ind_p1 = check_ind + 1
199+
while (
200+
ind_p1 < len(sorted_elements)
201+
and sorted_elements[ind_p1].label in skip_labels
202+
):
203+
ind_p1 += 1
204+
205+
if (
206+
ind_p1 < len(sorted_elements)
207+
and sorted_elements[ind_p1].label == elem.label
208+
and (
209+
elem.page_no != sorted_elements[ind_p1].page_no
210+
or elem.is_strictly_left_of(sorted_elements[ind_p1])
211+
)
212+
):
213+
m1 = re.fullmatch(
214+
r".+([a-z,\-])(\s*)", sorted_elements[check_ind].text
215+
)
216+
m2 = re.fullmatch(
217+
r"(\s*[a-z])(.+)", sorted_elements[ind_p1].text
218+
)
219+
220+
if m1 and m2:
221+
merge_list.append(sorted_elements[ind_p1].cid)
222+
curr_ind = ind_p1
223+
check_ind = ind_p1
224+
else:
225+
break
226+
else:
227+
break
207228

208-
if m1 and m2:
209-
merges[elem.cid] = [sorted_elements[ind_p1].cid]
210-
curr_ind = ind_p1
229+
if merge_list:
230+
merges[elem.cid] = merge_list
211231

212232
return merges
213233

@@ -681,7 +701,7 @@ def _find_to_captions(
681701
"""
682702

683703
def _remove_overlapping_indexes(
684-
mapping: Dict[int, List[int]]
704+
mapping: Dict[int, List[int]],
685705
) -> Dict[int, List[int]]:
686706
used = set()
687707
result = {}

docling_ibm_models/tableformer/data_management/tf_predictor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -433,7 +433,7 @@ def resize_img(self, image, width=None, height=None, inter=cv2.INTER_AREA):
433433
# initialize the dimensions of the image to be resized and
434434
# grab the image size
435435
dim = None
436-
(h, w) = image.shape[:2]
436+
h, w = image.shape[:2]
437437
sf = 1.0
438438
# if both the width and height are None, then return the
439439
# original image

0 commit comments

Comments
 (0)