@@ -174,6 +174,15 @@ def predict_merges(
174174
175175 merges : Dict [int , List [int ]] = {}
176176
177+ skip_labels = [
178+ DocItemLabel .PAGE_HEADER ,
179+ DocItemLabel .PAGE_FOOTER ,
180+ DocItemLabel .TABLE ,
181+ DocItemLabel .PICTURE ,
182+ DocItemLabel .CAPTION ,
183+ DocItemLabel .FOOTNOTE ,
184+ ]
185+
177186 curr_ind = - 1
178187 for ind , elem in enumerate (sorted_elements ):
179188
@@ -182,32 +191,43 @@ def predict_merges(
182191
183192 if elem .label in [DocItemLabel .TEXT ]:
184193
185- ind_p1 = ind + 1
186- while ind_p1 < len (sorted_elements ) and sorted_elements [ind_p1 ] in [
187- DocItemLabel .PAGE_HEADER ,
188- DocItemLabel .PAGE_FOOTER ,
189- DocItemLabel .TABLE ,
190- DocItemLabel .PICTURE ,
191- DocItemLabel .CAPTION ,
192- DocItemLabel .FOOTNOTE ,
193- ]:
194- ind_p1 += 1
195-
196- if (
197- ind_p1 < len (sorted_elements )
198- and sorted_elements [ind_p1 ].label == elem .label
199- and (
200- elem .page_no != sorted_elements [ind_p1 ].label
201- or elem .is_strictly_left_of (sorted_elements [ind_p1 ])
202- )
203- ):
204-
205- m1 = re .fullmatch (r".+([a-z,\-])(\s*)" , elem .text )
206- m2 = re .fullmatch (r"(\s*[a-z])(.+)" , sorted_elements [ind_p1 ].text )
194+ merge_list : List [int ] = []
195+ check_ind = ind
196+
197+ while True :
198+ ind_p1 = check_ind + 1
199+ while (
200+ ind_p1 < len (sorted_elements )
201+ and sorted_elements [ind_p1 ].label in skip_labels
202+ ):
203+ ind_p1 += 1
204+
205+ if (
206+ ind_p1 < len (sorted_elements )
207+ and sorted_elements [ind_p1 ].label == elem .label
208+ and (
209+ elem .page_no != sorted_elements [ind_p1 ].page_no
210+ or elem .is_strictly_left_of (sorted_elements [ind_p1 ])
211+ )
212+ ):
213+ m1 = re .fullmatch (
214+ r".+([a-z,\-])(\s*)" , sorted_elements [check_ind ].text
215+ )
216+ m2 = re .fullmatch (
217+ r"(\s*[a-z])(.+)" , sorted_elements [ind_p1 ].text
218+ )
219+
220+ if m1 and m2 :
221+ merge_list .append (sorted_elements [ind_p1 ].cid )
222+ curr_ind = ind_p1
223+ check_ind = ind_p1
224+ else :
225+ break
226+ else :
227+ break
207228
208- if m1 and m2 :
209- merges [elem .cid ] = [sorted_elements [ind_p1 ].cid ]
210- curr_ind = ind_p1
229+ if merge_list :
230+ merges [elem .cid ] = merge_list
211231
212232 return merges
213233
@@ -681,7 +701,7 @@ def _find_to_captions(
681701 """
682702
683703 def _remove_overlapping_indexes (
684- mapping : Dict [int , List [int ]]
704+ mapping : Dict [int , List [int ]],
685705 ) -> Dict [int , List [int ]]:
686706 used = set ()
687707 result = {}
0 commit comments