Skip to content

Commit 28d003b

Browse files
committed
Process tables in batches for Docling plugin
1 parent 9a4f2f6 commit 28d003b

1 file changed

Lines changed: 53 additions & 36 deletions

File tree

cells2table/docling.py

Lines changed: 53 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -136,56 +136,73 @@ def predict_tables(
136136
) -> Sequence[TableStructurePrediction]:
137137

138138
pages = list(pages)
139+
page_images: list[numpy.ndarray | None] = []
139140
predictions: list[TableStructurePrediction] = []
140141

141-
for page in pages:
142-
assert page._backend is not None
143-
if not page._backend.is_valid():
144-
existing_prediction = page.predictions.tablestructure or TableStructurePrediction()
145-
page.predictions.tablestructure = existing_prediction
146-
predictions.append(existing_prediction)
142+
table_images: list[numpy.ndarray] = []
143+
table_clusters: list[Cluster] = []
144+
cluster_page: list[int] = []
145+
146+
for i, page in enumerate(pages):
147+
table_prediction = page.predictions.tablestructure or TableStructurePrediction()
148+
page.predictions.tablestructure = table_prediction
149+
predictions.append(table_prediction)
150+
151+
if (
152+
page._backend is None
153+
or not page._backend.is_valid()
154+
or page.size is None
155+
or page.predictions.layout is None
156+
):
157+
page_images.append(None)
147158
continue
148159

149-
with TimeRecorder(conv_res, "table_structure"):
150-
assert page.predictions.layout is not None
151-
assert page.size is not None
160+
clusters = [
161+
cluster
162+
for cluster in page.predictions.layout.clusters
163+
if cluster.label in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
164+
]
152165

153-
table_prediction = TableStructurePrediction()
154-
page.predictions.tablestructure = table_prediction
166+
if not clusters:
167+
page_images.append(None)
168+
continue
155169

156-
in_tables = [
157-
cluster
158-
for cluster in page.predictions.layout.clusters
159-
if cluster.label in [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
160-
]
161-
if not in_tables:
162-
predictions.append(table_prediction)
163-
continue
170+
page_image = numpy.asarray(page.get_image(scale=self.scale))
171+
172+
page_images.append(page_image)
173+
cluster_page.extend([i] * len(clusters))
174+
table_clusters.extend(clusters)
164175

165-
page_image = numpy.asarray(page.get_image(scale=self.scale))
176+
for cluster in clusters:
177+
bbox = cluster.bbox
166178

167-
for table_cluster in in_tables:
168-
bbox = table_cluster.bbox
179+
table_image = page_image[
180+
round(bbox.t * self.scale) : round(bbox.b * self.scale),
181+
round(bbox.l * self.scale) : round(bbox.r * self.scale),
182+
]
169183

170-
table_image = page_image[
171-
round(bbox.t * self.scale) : round(bbox.b * self.scale),
172-
round(bbox.l * self.scale) : round(bbox.r * self.scale),
173-
]
184+
table_images.append(table_image)
174185

175-
table = self.pipeline([table_image], self.options.confidence_threshold)[0]
186+
if len(table_images) == 0:
187+
return predictions
176188

177-
docling_table = build_docling_table(table, table_cluster, page, self.scale)
189+
with TimeRecorder(conv_res, "table_structure"):
190+
tables = self.pipeline(table_images, self.options.confidence_threshold)
178191

179-
table_prediction.table_map[table_cluster.id] = docling_table
192+
for table, cluster, page_idx in zip(tables, table_clusters, cluster_page):
193+
page = pages[page_idx]
194+
assert page.predictions.tablestructure is not None
180195

181-
if settings.debug.visualize_tables:
182-
self.draw_table_and_cells(
183-
conv_res,
184-
page,
185-
page.predictions.tablestructure.table_map.values(),
186-
)
196+
docling_table = build_docling_table(table, cluster, page, self.scale)
197+
198+
page.predictions.tablestructure.table_map[cluster.id] = docling_table
187199

188-
predictions.append(table_prediction)
200+
if settings.debug.visualize_tables:
201+
self.draw_table_and_cells(
202+
conv_res,
203+
page,
204+
page.predictions.tablestructure.table_map.values(),
205+
)
189206

190207
return predictions
191208

0 commit comments

Comments
 (0)