-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathdocling_adapter.py
More file actions
233 lines (199 loc) · 7.38 KB
/
Copy pathdocling_adapter.py
File metadata and controls
233 lines (199 loc) · 7.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
"""
Docling worker: same NDJSON-on-stdio protocol as ks_adapter, but for the
IBM Docling DocumentConverter.
Docling treats xlsx as a document → markdown. We map its native objects
to the BenchmarkRecord schema as faithfully as we can; capabilities it
does NOT model are left as ``None`` (the schema's load-bearing distinction
between "feature absent in this file" and "feature not modeled at all").
What docling models for xlsx:
- sheets ← len(doc.pages) (docling uses one page per sheet); falls back to 1 when no pages are reported
- cells ← sum of table cells across tables
- tables ← len(doc.tables)
- merges ← cells with row_span/col_span > 1
What it does NOT model:
- formulas, formula_dependencies — computed values only, no `=…`
- charts, chart_types
- pivots
- conditional formatting / data validation
- named ranges
- hyperlinks per cell, comments, sparklines
- images
Chunks: docling has a chunker but we keep this adapter shallow. We export
the full document to markdown and emit it as a single chunk; the retrieval
metric splits on table boundaries and headings downstream.
"""
from __future__ import annotations
import json
import os
import sys
import time
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
# Directory containing this file; used to locate the repo root below.
_HERE = Path(__file__).resolve().parent
# The ``tests.benchmarks`` imports that follow are resolved via sys.path, so
# the repo root is inserted first (hence the E402 suppressions on them).
sys.path.insert(0, str(_HERE.parent.parent.parent)) # repo root
from tests.benchmarks._mem import peak_rss_mb # noqa: E402
from tests.benchmarks._schema import SCHEMA_VERSION, BenchmarkRecord # noqa: E402
# Written to the ``parser`` field of every record and to the "ready" event.
PARSER_NAME = "docling"
# Error text stored on error records is truncated to this many characters.
MAX_ERR_LEN = 500
def _write(obj: dict[str, Any]) -> None:
    """Emit *obj* on stdout as a single compact JSON line, then flush.

    Compact separators (no spaces after ``,`` or ``:``) are used, and a
    newline terminates the message so each call produces exactly one line.
    """
    encoded = json.dumps(obj, separators=(",", ":"))
    out = sys.stdout
    out.write(f"{encoded}\n")
    out.flush()
def _build_converter():
    """Return a new docling ``DocumentConverter`` constructed with no arguments.

    docling is imported inside the function, so the import only happens when
    a converter is actually requested. The caller is expected to build one
    converter and reuse it for every file.
    """
    from docling import document_converter

    return document_converter.DocumentConverter()
def _docling_version() -> str:
    """Return the docling version string, or ``"unknown"`` if unavailable.

    Two sources are tried in order: the installed distribution metadata,
    then the package's ``__version__`` attribute. Any exception from either
    attempt is swallowed so that version lookup can never fail the caller.
    """
    fallback = "unknown"

    # First choice: distribution metadata.
    try:
        from importlib import metadata as dist_metadata

        return dist_metadata.version("docling")
    except Exception:
        pass

    # Second choice: the package's own ``__version__`` attribute.
    try:
        import docling as pkg

        return getattr(pkg, "__version__", fallback)
    except Exception:
        return fallback
def _count_features(doc, file_size: int, path: str, parse_time_ms: float,
                    peak_mb: float, commit: str, markdown: str) -> BenchmarkRecord:
    """Build the ``status="ok"`` record for one successfully converted file.

    Counts come from ``doc.tables`` and ``doc.pages`` only; every feature
    this adapter does not extract is reported as ``None``.

    Args:
        doc: Converted document. Must expose ``tables`` (items whose ``data``
            carries a ``table_cells`` sequence) and ``pages``.
        file_size: Stored as ``file_size_bytes``.
        path: Stored as ``file``.
        parse_time_ms: Stored as ``parse_time_ms``.
        peak_mb: Stored as ``peak_memory_mb``.
        commit: Stored as ``harness_commit``.
        markdown: Markdown export of the document; kept under
            ``extra["markdown"]`` and used for the token estimate.

    Returns:
        The populated ``BenchmarkRecord``.
    """
    # One TableItem per detected table region.
    table_items = list(doc.tables)

    # Cells are only surfaced inside tables, so free-floating text cells are
    # not counted. A cell counts as merged when it spans more than one row
    # or more than one column.
    cell_total = 0
    merged_total = 0
    for table in table_items:
        for cell in (table.data.table_cells if table.data else []):
            cell_total += 1
            height = (cell.end_row_offset_idx or 0) - (cell.start_row_offset_idx or 0)
            width = (cell.end_col_offset_idx or 0) - (cell.start_col_offset_idx or 0)
            if max(height, width) > 1:
                merged_total += 1

    # Sheet count is not exposed directly; docling for xlsx uses one page per
    # sheet, so the page count is the proxy. Fall back to 1 when there are no
    # pages or the attribute cannot be read.
    try:
        sheet_total = len(doc.pages) if doc.pages else 1
    except Exception:
        sheet_total = 1

    # Crude estimate: roughly four characters per token, never below 1.
    approx_tokens = max(len(markdown) // 4, 1)

    # Features that are not modeled for xlsx are reported as None.
    not_modeled = dict.fromkeys(
        (
            "formulas",
            "formula_dependencies",
            "charts",
            "chart_types",
            "pivots",
            "cf_rules",
            "dv_rules",
            "named_ranges",
            "hyperlinks",
            "images",
            "comments",
            "sparklines",
        )
    )

    return BenchmarkRecord(
        file=path,
        file_size_bytes=file_size,
        parser=PARSER_NAME,
        parser_version=_docling_version(),
        status="ok",
        error=None,
        parse_time_ms=parse_time_ms,
        peak_memory_mb=peak_mb,
        sheets=sheet_total,
        cells=cell_total,
        tables=len(table_items),
        merges=merged_total,
        chunks=1,  # single markdown blob — retrieval step re-chunks
        token_count=approx_tokens,
        schema_version=SCHEMA_VERSION,
        timestamp=datetime.now(UTC).isoformat(),
        harness_commit=commit,
        extra={"markdown": markdown},
        **not_modeled,
    )
def _error_record(path: str, file_size: int, status: str, error: str,
                  parse_time_ms: float | None, peak_mb: float | None,
                  commit: str) -> BenchmarkRecord:
    """Build a record for a file that could not be processed.

    All feature counts, ``chunks`` and ``token_count`` are ``None``.

    Args:
        path: Stored as ``file``.
        file_size: Stored as ``file_size_bytes``.
        status: Stored as ``status`` unchanged.
        error: Error text; truncated to ``MAX_ERR_LEN`` characters. An empty
            or falsy value is stored as ``None``.
        parse_time_ms: Stored as ``parse_time_ms``; may be ``None``.
        peak_mb: Stored as ``peak_memory_mb``; may be ``None``.
        commit: Stored as ``harness_commit``.

    Returns:
        The populated ``BenchmarkRecord``.
    """
    clipped = (error or "")[:MAX_ERR_LEN]
    message = clipped if clipped else None

    # Nothing was extracted, so every measured field is None.
    empty_fields = dict.fromkeys(
        (
            "sheets",
            "cells",
            "formulas",
            "formula_dependencies",
            "charts",
            "chart_types",
            "tables",
            "pivots",
            "merges",
            "cf_rules",
            "dv_rules",
            "named_ranges",
            "hyperlinks",
            "images",
            "comments",
            "sparklines",
            "chunks",
            "token_count",
        )
    )

    return BenchmarkRecord(
        file=path,
        file_size_bytes=file_size,
        parser=PARSER_NAME,
        parser_version=_docling_version(),
        status=status,
        error=message,
        parse_time_ms=parse_time_ms,
        peak_memory_mb=peak_mb,
        schema_version=SCHEMA_VERSION,
        timestamp=datetime.now(UTC).isoformat(),
        harness_commit=commit,
        **empty_fields,
    )
def main() -> int:
    """Run the worker loop: one JSON request per stdin line, one reply per request.

    Messages written to stdout, each on its own line:

    * ``{"event": "ready", ...}`` once the converter has been built;
    * for every request ``{"path": "<file>"}``, either a ``BenchmarkRecord``
      line (status ``"ok"`` or ``"error"``), or ``{"event": "error", ...}``
      when the request line itself is unusable (invalid JSON, missing
      ``path``, or a ``path`` that is not a string);
    * ``{"event": "done"}`` after stdin is exhausted.

    The harness commit recorded on every record is read from the
    ``HARNESS_COMMIT`` environment variable (empty string when unset).

    Returns:
        ``0`` once stdin has been fully consumed.
    """
    commit = os.environ.get("HARNESS_COMMIT", "")
    converter = _build_converter()
    _write({"event": "ready", "parser": PARSER_NAME, "version": _docling_version()})

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        try:
            msg = json.loads(line)
            path = msg["path"]
            # JSON can carry null/numbers/lists here; anything but a string
            # would make os.path.getsize raise TypeError (or stat a file
            # descriptor) and take the whole worker down mid-batch.
            if not isinstance(path, str):
                raise TypeError(
                    f"'path' must be a string, got {type(path).__name__}"
                )
        except Exception as exc:
            _write({"event": "error", "error": f"bad input line: {exc}"})
            continue

        try:
            file_size = os.path.getsize(path)
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: path contains an
            # embedded NUL byte. Either way the size is unknown; the
            # conversion below reports the real failure as an error record.
            file_size = 0

        rss0 = peak_rss_mb()
        t0 = time.perf_counter()
        try:
            result = converter.convert(path)
            doc = result.document
            md = doc.export_to_markdown()
            # The timed region covers conversion plus markdown export.
            t1 = time.perf_counter()
            rss1 = peak_rss_mb()
            rec = _count_features(
                doc=doc,
                file_size=file_size,
                path=path,
                parse_time_ms=(t1 - t0) * 1000.0,
                # Clamp at zero in case the second reading is lower.
                peak_mb=max(rss1 - rss0, 0.0),
                commit=commit,
                markdown=md,
            )
        except Exception as exc:  # noqa: BLE001
            # Any failure for one file becomes an error record so the
            # remaining requests are still processed.
            t1 = time.perf_counter()
            rss1 = peak_rss_mb()
            rec = _error_record(
                path=path,
                file_size=file_size,
                status="error",
                error=f"{type(exc).__name__}: {exc}",
                parse_time_ms=(t1 - t0) * 1000.0,
                peak_mb=max(rss1 - rss0, 0.0),
                commit=commit,
            )

        # NOTE(review): to_json_line() is presumably newline-terminated,
        # since no newline is appended here — confirm against the schema.
        sys.stdout.write(rec.to_json_line())
        sys.stdout.flush()

    _write({"event": "done"})
    return 0
if __name__ == "__main__":
    # Script entry point: run the worker and use main()'s return value as
    # the process exit status.
    exit_status = main()
    sys.exit(exit_status)