1919
2020from docling .document_converter import DocumentConverter
2121from docling .datamodel .base_models import InputFormat
22+ from docling_core .types .doc .labels import DocItemLabel
2223
2324
2425@contextlib .contextmanager
@@ -43,12 +44,46 @@ def strip_image_data(extraction: dict) -> dict:
4344 return extraction
4445
4546
def smart_extract_text(doc) -> str:
    """Extract document text without markdown table padding bloat.

    Docling's ``export_to_markdown()`` pads table cells with spaces for column
    alignment, which causes exponential growth when tables are nested (common
    in DOCX). To avoid that, this function emits:

    1. Non-table content as markdown (preserves headings, lists, emphasis).
    2. Table cells as plain text: one line per row, cells joined by ``" | "``.
       Cells with empty text and rows with no text are skipped.

    Table text is appended after the non-table content, so tables do not keep
    their original position within the document.

    Args:
        doc: Document object exposing ``export_to_markdown(labels=...)`` and a
            ``tables`` iterable whose items provide ``data.grid`` (rows of
            cells).

    Returns:
        The non-table markdown followed by the plain-text tables, with the
        sections separated by a blank line. Empty sections are omitted, so
        the result never starts with a stray separator.
    """
    # Export everything except tables as markdown.
    non_table_labels = set(DocItemLabel) - {DocItemLabel.TABLE}
    non_table_content = doc.export_to_markdown(labels=non_table_labels)

    # Collect table cell text directly (no markdown table formatting).
    # NOTE(review): if ``grid`` repeats a spanning cell at every position it
    # covers, that cell's text is repeated here as well - confirm against the
    # docling-core TableData.grid behaviour before relying on exact output.
    table_texts = []
    for table in doc.tables:
        rows = []
        for row in table.data.grid:
            # getattr tolerates cells without a ``text`` attribute; cells
            # whose text is missing or empty are dropped.
            texts = (getattr(cell, "text", None) for cell in row)
            cells = [text for text in texts if text]
            if cells:
                rows.append(" | ".join(cells))
        if rows:
            table_texts.append("\n".join(rows))

    # Join only the non-empty sections so a document that consists solely of
    # tables does not begin with a blank-line separator.
    sections = [non_table_content] if non_table_content else []
    sections.extend(table_texts)
    return "\n\n".join(sections)
79+
80+
4681def extract (converter : DocumentConverter , file_path : str ) -> dict :
4782 """Extract text and structure from a DOCX file using Docling."""
4883 result = converter .convert (file_path )
4984
50- # Export as markdown for text extraction
51- text = result .document . export_to_markdown ( )
85+ # Use smart extraction to avoid table padding bloat
86+ text = smart_extract_text ( result .document )
5287
5388 # Get full structured extraction (stripped of image data)
5489 extraction = result .document .export_to_dict ()
0 commit comments