Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -896,6 +896,9 @@ def _pick_orphan_cells(
orphan_columns_bbox = []
used_col_pdf_ids = []
used_col_columnid = []
# Cache each column's X-centroid so we can fall back to nearest-column
# assignment for orphans that match a row band but no column band.
col_centroids: dict[int, float] = {}

for col in range(tab_cols):
bbox_x1s = [] # y2 > y1
Expand Down Expand Up @@ -930,6 +933,10 @@ def _pick_orphan_cells(
if len(bbox_x2s) > 0:
col_x2 = max(bbox_x2s)

# Cache centroid for nearest-column fallback (see orphan-rows loop).
if col_x1 >= 0 and col_x2 >= 0:
col_centroids[col] = (col_x1 + col_x2) / 2

# Find "orphan" cells that intersect the band
for pdf_cell in pdf_cells:
pdf_str_id = str(pdf_cell["id"])
Expand Down Expand Up @@ -1050,51 +1057,84 @@ def _pick_orphan_cells(
depth_index = orphan_columns[new_column_id].index(pdf_cell_id)
confidence = orphan_columns_depth[new_column_id][depth_index]
pdf_bbox = orphan_columns_bbox[new_column_id][depth_index]

# 1. Find table_cell_id by new_row_id / new_column_id
new_table_cell_id = -1
tcell = list(
filter(
lambda table_cell: table_cell["row_id"] == new_row_id
and table_cell["column_id"] == new_column_id,
table_cells,
)
else:
# Row-band match but no column-band match. Without the
# following fallback the pdf cell is silently dropped (see
# https://github.com/docling-project/docling-ibm-models/issues/28
# and the symptom in the parent docling issue tracker:
# right-aligned values on tables whose predicted columns are
# narrower than the page region disappear from the output).
# Snap to the nearest column by X-centroid distance and emit
# a WARNING so the recovery is observable.
pdf_cell = next(
(p for p in pdf_cells if str(p["id"]) == pdf_cell_id),
None,
)
if pdf_cell is None or not col_centroids:
# Nothing to attach to; preserve historic behaviour.
continue
pdf_bbox = pdf_cell["bbox"]
cell_cx = (pdf_bbox[0] + pdf_bbox[2]) / 2
new_column_id, min_dist = min(
((c, abs(cell_cx - cx)) for c, cx in col_centroids.items()),
key=lambda t: t[1],
)
# confidence < 0 marks "nearest-column fallback" so downstream
# consumers can distinguish high-confidence band matches from
# snapped fallbacks if they care.
confidence = -int(round(min_dist)) - 1
self._log().warning(
"Orphan pdf_cell %s recovered to col=%s by nearest-column "
"fallback (row=%s, x=%.1f, dist=%.1f)",
pdf_cell_id,
new_column_id,
new_row_id,
cell_cx,
min_dist,
)

if len(tcell) > 0:
new_table_cell_id = tcell[0]["cell_id"]
self._log().debug(
"reusing table_cell_id: {}".format(new_table_cell_id)
)
# 1. Find table_cell_id by new_row_id / new_column_id
new_table_cell_id = -1
tcell = list(
filter(
lambda table_cell: table_cell["row_id"] == new_row_id
and table_cell["column_id"] == new_column_id,
table_cells,
)
)

for i in range(len(new_table_cells)):
if new_table_cells[i]["cell_id"] == new_table_cell_id:
bbox_tmp = self._merge_two_bboxes(
new_table_cells[i]["bbox"], pdf_bbox
)
new_table_cells[i]["bbox"] = bbox_tmp

if new_table_cell_id < 0:
max_cell_id += 1
new_table_cell_id = max_cell_id

new_table_cell = {
"bbox": pdf_bbox,
"cell_id": new_table_cell_id,
"column_id": new_column_id,
"label": "body",
"row_id": new_row_id,
"cell_class": 2,
}
self._log().debug(
"making new table_cell_id: {}".format(new_table_cell_id)
)
new_table_cells.append(new_table_cell)
if len(tcell) > 0:
new_table_cell_id = tcell[0]["cell_id"]
self._log().debug("reusing table_cell_id: {}".format(new_table_cell_id))

for i in range(len(new_table_cells)):
if new_table_cells[i]["cell_id"] == new_table_cell_id:
bbox_tmp = self._merge_two_bboxes(
new_table_cells[i]["bbox"], pdf_bbox
)
new_table_cells[i]["bbox"] = bbox_tmp

if new_table_cell_id < 0:
max_cell_id += 1
new_table_cell_id = max_cell_id

new_table_cell = {
"bbox": pdf_bbox,
"cell_id": new_table_cell_id,
"column_id": new_column_id,
"label": "body",
"row_id": new_row_id,
"cell_class": 2,
}
self._log().debug(
"making new table_cell_id: {}".format(new_table_cell_id)
)
new_table_cells.append(new_table_cell)

# And then add new match to the new_matches
new_matches[str(pdf_cell_id)] = [
{"post": confidence, "table_cell_id": new_table_cell_id}
]
# And then add new match to the new_matches
new_matches[str(pdf_cell_id)] = [
{"post": confidence, "table_cell_id": new_table_cell_id}
]
return new_matches, new_table_cells, max_cell_id

def _clear_pdf_cells(self, pdf_cells):
Expand Down
108 changes: 108 additions & 0 deletions tests/test_matching_post_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Unit tests for `MatchingPostProcessor._pick_orphan_cells`.

Specifically the silent-drop case where a PDF text cell falls inside a row
band but outside every column band: prior to the nearest-column fallback,
such cells were dropped on the floor without warning. Now they are snapped
to the closest column by X-centroid distance.

See https://github.com/docling-project/docling-ibm-models/issues/28.
"""

from docling_ibm_models.tableformer.data_management.matching_post_processor import (
MatchingPostProcessor,
)


def _make_proc():
    """Build a `MatchingPostProcessor` from a minimal stub configuration.

    The tests in this module only call `_pick_orphan_cells`, which does not
    exercise the cell matcher, so the only setting supplied is
    `pdf_cell_iou_thres` (read by CellMatcher during initialisation).
    """
    stub_config = {"predict": {"pdf_cell_iou_thres": 0.05}}
    return MatchingPostProcessor(stub_config)


def test_orphan_with_no_col_band_match_is_recovered_to_nearest_column():
    """An orphan inside a row band but outside every column band is kept.

    Such a pdf cell used to be dropped without any match being emitted; it
    must now be attached to the column whose X-centroid is closest and be
    flagged with a negative confidence.
    """
    processor = _make_proc()

    # 2x2 grid of predicted body cells: rows span y=10..25 and y=30..45,
    # columns span x=10..90 and x=110..200. cell_id runs row-major 0..3.
    row_bands = [(10, 25), (30, 45)]
    col_bands = [(10, 90), (110, 200)]
    grid = [
        {
            "cell_id": row * len(col_bands) + col,
            "row_id": row,
            "column_id": col,
            "label": "body",
            "cell_class": 2,
            "bbox": [x1, y1, x2, y2],
        }
        for row, (y1, y2) in enumerate(row_bands)
        for col, (x1, x2) in enumerate(col_bands)
    ]

    # The orphan lies vertically inside row 1 (y=32..43) but its x-span
    # 550..590 (centre 570) is far to the right of both column bands.
    orphan = {"id": 99, "bbox": [550, 32, 590, 43], "text": "$4,129.51"}

    new_matches, new_table_cells, _max_cell_id = processor._pick_orphan_cells(
        tab_rows=2,
        tab_cols=2,
        max_cell_id=3,
        table_cells=grid,
        pdf_cells=[orphan],
        matches={},  # nothing is matched on entry, so the cell is an orphan
    )

    assert "99" in new_matches, (
        "orphan pdf_cell was silently dropped (no match emitted)"
    )

    match = new_matches["99"][0]
    matched_cell_id = match["table_cell_id"]
    landed = next(tc for tc in new_table_cells if tc["cell_id"] == matched_cell_id)

    # Column 1 spans 110..200 (mid-point 155) and column 0 spans 10..90
    # (mid-point 50), so column 1 is the closer one to the orphan.
    assert landed["column_id"] == 1, (
        "expected nearest-column fallback to col 1 (centroid 155), "
        f"got col {landed['column_id']}"
    )

    # A negative "post" value is the marker for a snapped fallback match.
    assert match["post"] < 0, (
        f"nearest-column fallback should mark confidence < 0, got {match['post']}"
    )


def test_band_matched_orphans_use_normal_path():
    """An orphan that falls inside a column band keeps the regular path.

    The pdf cell here overlaps both row 1's y-band and column 0's x-band, so
    it must land in column 0 with a non-negative confidence.
    """
    processor = _make_proc()

    # Same 2x2 body-cell grid as the fallback test: rows y=10..25 / 30..45,
    # columns x=10..90 / 110..200, cell ids numbered row-major 0..3.
    row_bands = [(10, 25), (30, 45)]
    col_bands = [(10, 90), (110, 200)]
    grid = [
        {
            "cell_id": row * len(col_bands) + col,
            "row_id": row,
            "column_id": col,
            "label": "body",
            "cell_class": 2,
            "bbox": [x1, y1, x2, y2],
        }
        for row, (y1, y2) in enumerate(row_bands)
        for col, (x1, x2) in enumerate(col_bands)
    ]

    # x=20..80 sits within column 0 (10..90); y=32..43 within row 1 (30..45).
    in_band_cell = {"id": 7, "bbox": [20, 32, 80, 43], "text": "in band"}

    new_matches, new_table_cells, _ = processor._pick_orphan_cells(
        tab_rows=2,
        tab_cols=2,
        max_cell_id=3,
        table_cells=grid,
        pdf_cells=[in_band_cell],
        matches={},
    )

    assert "7" in new_matches

    match = new_matches["7"][0]
    matched_cell_id = match["table_cell_id"]
    landed = next(tc for tc in new_table_cells if tc["cell_id"] == matched_cell_id)
    assert landed["column_id"] == 0, "in-band orphan should land in col 0"

    # Band matches report a confidence of zero or more; only the
    # nearest-column fallback is expected to go negative.
    assert match["post"] >= 0, (
        f"in-band match should have non-negative confidence, got {match['post']}"
    )
Loading