diff --git a/docling_ibm_models/tableformer/data_management/matching_post_processor.py b/docling_ibm_models/tableformer/data_management/matching_post_processor.py
index a4037a7..428ef00 100644
--- a/docling_ibm_models/tableformer/data_management/matching_post_processor.py
+++ b/docling_ibm_models/tableformer/data_management/matching_post_processor.py
@@ -896,6 +896,9 @@ def _pick_orphan_cells(
         orphan_columns_bbox = []
         used_col_pdf_ids = []
         used_col_columnid = []
+        # Cache each column's X-centroid so we can fall back to nearest-column
+        # assignment for orphans that match a row band but no column band.
+        col_centroids: dict[int, float] = {}
         for col in range(tab_cols):
             bbox_x1s = []
             # y2 > y1
@@ -930,6 +933,10 @@ def _pick_orphan_cells(
             if len(bbox_x2s) > 0:
                 col_x2 = max(bbox_x2s)
 
+            # Cache centroid for nearest-column fallback (see orphan-rows loop).
+            if col_x1 >= 0 and col_x2 >= 0:
+                col_centroids[col] = (col_x1 + col_x2) / 2
+
             # Find "orphan" cells that intersect the band
             for pdf_cell in pdf_cells:
                 pdf_str_id = str(pdf_cell["id"])
@@ -1050,51 +1057,84 @@ def _pick_orphan_cells(
                 depth_index = orphan_columns[new_column_id].index(pdf_cell_id)
                 confidence = orphan_columns_depth[new_column_id][depth_index]
                 pdf_bbox = orphan_columns_bbox[new_column_id][depth_index]
-
-                # 1. Find table_cell_id by new_row_id / new_column_id
-                new_table_cell_id = -1
-                tcell = list(
-                    filter(
-                        lambda table_cell: table_cell["row_id"] == new_row_id
-                        and table_cell["column_id"] == new_column_id,
-                        table_cells,
-                    )
+            else:
+                # Row-band match but no column-band match. Without the
+                # following fallback the pdf cell is silently dropped (see
+                # https://github.com/docling-project/docling-ibm-models/issues/28
+                # and the symptom in the parent docling issue tracker:
+                # right-aligned values on tables whose predicted columns are
+                # narrower than the page region disappear from the output).
+                # Snap to the nearest column by X-centroid distance and emit
+                # a WARNING so the recovery is observable.
+                pdf_cell = next(
+                    (p for p in pdf_cells if str(p["id"]) == pdf_cell_id),
+                    None,
+                )
+                if pdf_cell is None or not col_centroids:
+                    # Nothing to attach to; preserve historic behaviour.
+                    continue
+                pdf_bbox = pdf_cell["bbox"]
+                cell_cx = (pdf_bbox[0] + pdf_bbox[2]) / 2
+                new_column_id, min_dist = min(
+                    ((c, abs(cell_cx - cx)) for c, cx in col_centroids.items()),
+                    key=lambda t: t[1],
+                )
+                # confidence < 0 marks "nearest-column fallback" so downstream
+                # consumers can distinguish high-confidence band matches from
+                # snapped fallbacks if they care.
+                confidence = -int(round(min_dist)) - 1
+                self._log().warning(
+                    "Orphan pdf_cell %s recovered to col=%s by nearest-column "
+                    "fallback (row=%s, x=%.1f, dist=%.1f)",
+                    pdf_cell_id,
+                    new_column_id,
+                    new_row_id,
+                    cell_cx,
+                    min_dist,
                 )
-                if len(tcell) > 0:
-                    new_table_cell_id = tcell[0]["cell_id"]
-                    self._log().debug(
-                        "reusing table_cell_id: {}".format(new_table_cell_id)
-                    )
+            # 1. Find table_cell_id by new_row_id / new_column_id
+            new_table_cell_id = -1
+            tcell = list(
+                filter(
+                    lambda table_cell: table_cell["row_id"] == new_row_id
+                    and table_cell["column_id"] == new_column_id,
+                    table_cells,
+                )
+            )
 
-                    for i in range(len(new_table_cells)):
-                        if new_table_cells[i]["cell_id"] == new_table_cell_id:
-                            bbox_tmp = self._merge_two_bboxes(
-                                new_table_cells[i]["bbox"], pdf_bbox
-                            )
-                            new_table_cells[i]["bbox"] = bbox_tmp
-
-                if new_table_cell_id < 0:
-                    max_cell_id += 1
-                    new_table_cell_id = max_cell_id
-
-                    new_table_cell = {
-                        "bbox": pdf_bbox,
-                        "cell_id": new_table_cell_id,
-                        "column_id": new_column_id,
-                        "label": "body",
-                        "row_id": new_row_id,
-                        "cell_class": 2,
-                    }
-                    self._log().debug(
-                        "making new table_cell_id: {}".format(new_table_cell_id)
-                    )
-                    new_table_cells.append(new_table_cell)
+            if len(tcell) > 0:
+                new_table_cell_id = tcell[0]["cell_id"]
+                self._log().debug("reusing table_cell_id: {}".format(new_table_cell_id))
+
+                for i in range(len(new_table_cells)):
+                    if new_table_cells[i]["cell_id"] == new_table_cell_id:
+                        bbox_tmp = self._merge_two_bboxes(
+                            new_table_cells[i]["bbox"], pdf_bbox
+                        )
+                        new_table_cells[i]["bbox"] = bbox_tmp
+
+            if new_table_cell_id < 0:
+                max_cell_id += 1
+                new_table_cell_id = max_cell_id
+
+                new_table_cell = {
+                    "bbox": pdf_bbox,
+                    "cell_id": new_table_cell_id,
+                    "column_id": new_column_id,
+                    "label": "body",
+                    "row_id": new_row_id,
+                    "cell_class": 2,
+                }
+                self._log().debug(
+                    "making new table_cell_id: {}".format(new_table_cell_id)
+                )
+                new_table_cells.append(new_table_cell)
 
-                # And then add new match to the new_matches
-                new_matches[str(pdf_cell_id)] = [
-                    {"post": confidence, "table_cell_id": new_table_cell_id}
-                ]
+            # And then add new match to the new_matches
+            new_matches[str(pdf_cell_id)] = [
+                {"post": confidence, "table_cell_id": new_table_cell_id}
+            ]
 
         return new_matches, new_table_cells, max_cell_id
 
     def _clear_pdf_cells(self, pdf_cells):
diff --git a/tests/test_matching_post_processor.py b/tests/test_matching_post_processor.py
new file mode 100644
index 0000000..bcb2873
--- /dev/null
+++ b/tests/test_matching_post_processor.py
@@ -0,0 +1,108 @@
+"""Unit tests for `MatchingPostProcessor._pick_orphan_cells`.
+
+Specifically the silent-drop case where a PDF text cell falls inside a row
+band but outside every column band: prior to the nearest-column fallback,
+such cells were dropped on the floor without warning. Now they are snapped
+to the closest column by X-centroid distance.
+
+See https://github.com/docling-project/docling-ibm-models/issues/28.
+"""
+
+from docling_ibm_models.tableformer.data_management.matching_post_processor import (
+    MatchingPostProcessor,
+)
+
+
+def _make_proc():
+    # `_pick_orphan_cells` does not exercise the cell matcher; a stub config
+    # is sufficient. CellMatcher init reads `pdf_cell_iou_thres`.
+    return MatchingPostProcessor({"predict": {"pdf_cell_iou_thres": 0.05}})
+
+
+def test_orphan_with_no_col_band_match_is_recovered_to_nearest_column():
+    """A pdf cell inside the row band but outside every column band must
+    still produce a match — historically it was silently dropped."""
+    proc = _make_proc()
+
+    # Two predicted columns whose x-bands lie at x≈10..200, in two rows.
+    # The orphan pdf cell sits in row 1's y-band but its x=560 is outside
+    # both column x-bands — exactly the silent-drop case.
+    table_cells = [
+        {"cell_id": 0, "row_id": 0, "column_id": 0, "label": "body",
+         "cell_class": 2, "bbox": [10, 10, 90, 25]},
+        {"cell_id": 1, "row_id": 0, "column_id": 1, "label": "body",
+         "cell_class": 2, "bbox": [110, 10, 200, 25]},
+        {"cell_id": 2, "row_id": 1, "column_id": 0, "label": "body",
+         "cell_class": 2, "bbox": [10, 30, 90, 45]},
+        {"cell_id": 3, "row_id": 1, "column_id": 1, "label": "body",
+         "cell_class": 2, "bbox": [110, 30, 200, 45]},
+    ]
+    pdf_cells = [
+        # In row 1's y band (30..45) but x=560 — outside col 0 (10..90)
+        # and col 1 (110..200). Pre-fix this gets dropped silently.
+        {"id": 99, "bbox": [550, 32, 590, 43], "text": "$4,129.51"},
+    ]
+    matches: dict = {}  # the orphan is unmatched at entry
+
+    new_matches, new_table_cells, _max_cell_id = proc._pick_orphan_cells(
+        tab_rows=2,
+        tab_cols=2,
+        max_cell_id=3,
+        table_cells=table_cells,
+        pdf_cells=pdf_cells,
+        matches=matches,
+    )
+
+    # After the fix: the orphan must appear in new_matches assigned to a
+    # column (the nearest-centroid one — column 1 at centroid x=155 vs
+    # column 0 at centroid x=50).
+    assert "99" in new_matches, (
+        "orphan pdf_cell was silently dropped (no match emitted)"
+    )
+    assigned = new_matches["99"][0]
+    target_table_cell = next(
+        tc for tc in new_table_cells if tc["cell_id"] == assigned["table_cell_id"]
+    )
+    assert target_table_cell["column_id"] == 1, (
+        f"expected nearest-column fallback to col 1 (centroid 155), "
+        f"got col {target_table_cell['column_id']}"
+    )
+    # confidence is negative for nearest-column fallback.
+    assert assigned["post"] < 0, (
+        f"nearest-column fallback should mark confidence < 0, got {assigned['post']}"
+    )
+
+
+def test_band_matched_orphans_use_normal_path():
+    """The fast-path (orphan inside a column band) must be unchanged."""
+    proc = _make_proc()
+
+    # Two rows of body cells. Orphan pdf cell sits in row 1's y-band AND
+    # within column 0's x-band — the existing happy path.
+    table_cells = [
+        {"cell_id": 0, "row_id": 0, "column_id": 0, "label": "body",
+         "cell_class": 2, "bbox": [10, 10, 90, 25]},
+        {"cell_id": 1, "row_id": 0, "column_id": 1, "label": "body",
+         "cell_class": 2, "bbox": [110, 10, 200, 25]},
+        {"cell_id": 2, "row_id": 1, "column_id": 0, "label": "body",
+         "cell_class": 2, "bbox": [10, 30, 90, 45]},
+        {"cell_id": 3, "row_id": 1, "column_id": 1, "label": "body",
+         "cell_class": 2, "bbox": [110, 30, 200, 45]},
+    ]
+    pdf_cells = [
+        # In col 0's x-band (10..90) AND row 1's y-band (30..45).
+        {"id": 7, "bbox": [20, 32, 80, 43], "text": "in band"},
+    ]
+    matches: dict = {}
+    new_matches, new_table_cells, _ = proc._pick_orphan_cells(
+        tab_rows=2, tab_cols=2, max_cell_id=3,
+        table_cells=table_cells, pdf_cells=pdf_cells, matches=matches,
+    )
+    assert "7" in new_matches
+    assigned = new_matches["7"][0]
+    target = next(tc for tc in new_table_cells if tc["cell_id"] == assigned["table_cell_id"])
+    assert target["column_id"] == 0, "in-band orphan should land in col 0"
+    # Normal-path confidence is non-negative.
+    assert assigned["post"] >= 0, (
+        f"in-band match should have non-negative confidence, got {assigned['post']}"
+    )