Skip to content

Commit e1b25f4

Browse files
committed
fix: table extraction
1 parent f2eb06f commit e1b25f4

1 file changed

Lines changed: 37 additions & 2 deletions

File tree

packages/extractor/python/extract_server.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from docling.document_converter import DocumentConverter
2121
from docling.datamodel.base_models import InputFormat
22+
from docling_core.types.doc.labels import DocItemLabel
2223

2324

2425
@contextlib.contextmanager
@@ -43,12 +44,46 @@ def strip_image_data(extraction: dict) -> dict:
4344
return extraction
4445

4546

47+
def _table_to_plain_text(table) -> str:
    """Render one Docling table as newline-separated, ' | '-joined rows.

    Cells without text are skipped, and rows that end up with no cells are
    omitted entirely, so the result is an empty string for an empty table.

    Each cell object is emitted at most once per table. NOTE(review): Docling's
    ``table.data.grid`` is understood to place the *same* cell object at every
    grid position covered by a row/column span; without this de-duplication a
    merged cell's text would be repeated once per spanned position. Confirm
    against the docling-core ``TableData.grid`` implementation. If the grid
    never repeats objects, the identity check is simply a no-op.
    """
    # id() is safe as a key here: the grid (and therefore every cell in it)
    # stays referenced by the outer loop for the whole traversal.
    emitted_ids = set()
    rows = []
    for row in table.data.grid:
        cells = []
        for cell in row:
            text = getattr(cell, "text", None)
            if not text or id(cell) in emitted_ids:
                continue
            emitted_ids.add(id(cell))
            cells.append(text)
        if cells:
            rows.append(" | ".join(cells))
    return "\n".join(rows)


def smart_extract_text(doc) -> str:
    """Extract document text without markdown-table padding bloat.

    Docling's export_to_markdown() pads table cells with spaces for column
    alignment, which makes the output balloon when tables are nested (common
    in DOCX). This function instead combines:

    1. Non-table content exported as markdown (headings, lists, emphasis).
    2. Table cells as plain text, one line per row, cells joined by ' | '.

    Args:
        doc: A Docling document exposing ``export_to_markdown(labels=...)``
            and ``tables`` (each with ``data.grid`` rows of cells).

    Returns:
        The non-table markdown followed by every non-empty table, separated
        by blank lines. Tables are appended after the non-table content, so
        their original position in the document is not preserved.
    """
    # Export everything except items labelled as tables.
    non_table_labels = set(DocItemLabel) - {DocItemLabel.TABLE}
    non_table_content = doc.export_to_markdown(labels=non_table_labels)

    # Plain-text rendering of each table; tables with no text are dropped.
    table_texts = [
        text for text in map(_table_to_plain_text, doc.tables) if text
    ]

    # Join only non-empty parts so a table-only document does not start
    # with a stray blank-line separator.
    parts = [non_table_content] if non_table_content else []
    parts.extend(table_texts)
    return "\n\n".join(parts)
79+
80+
4681
def extract(converter: DocumentConverter, file_path: str) -> dict:
4782
"""Extract text and structure from a DOCX file using Docling."""
4883
result = converter.convert(file_path)
4984

50-
# Export as markdown for text extraction
51-
text = result.document.export_to_markdown()
85+
# Use smart extraction to avoid table padding bloat
86+
text = smart_extract_text(result.document)
5287

5388
# Get full structured extraction (stripped of image data)
5489
extraction = result.document.export_to_dict()

0 commit comments

Comments
 (0)