-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmodels.py
More file actions
408 lines (326 loc) · 12.1 KB
/
models.py
File metadata and controls
408 lines (326 loc) · 12.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
from abc import ABC
from dataclasses import dataclass
from enum import IntEnum
from io import BytesIO
from typing import List, Optional, Any, Union
from pydantic import BaseModel
class BoundingBox(BaseModel):
    """Axis-aligned rectangle described by two corner points, (x0, y0) and (x1, y1).

    ``estimate_lines_from_block`` in this module treats ``y0`` as the top edge
    and ``y1 - y0`` as the height of the box.
    """

    # NOTE(review): units and coordinate origin are not defined here; they
    # presumably follow whatever the producing parser emits — confirm.
    x0: float
    y0: float
    x1: float
    y1: float
class Style(BaseModel):
    """Optional typographic attributes attached to a piece of text.

    Every field defaults to ``None``, meaning the value is not known.
    """

    font_name: Optional[str] = None
    # Read by estimate_lines_from_block to derive a line height.
    font_size: Optional[float] = None
    font_style: Optional[str] = None
    # NOTE(review): the color string format (hex, name, ...) is not defined here — confirm.
    color: Optional[str] = None
    # NOTE(review): presumably an opacity value; its range is not defined here — confirm.
    alpha: Optional[int] = None
    weight: Optional[float] = None
class Character(BaseModel):
    """A single character of text with optional geometry, style and provenance."""

    text: str
    bbox: Optional[BoundingBox] = None
    style: Optional[Style] = None
    page: Optional[int] = None
    source_data: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when ``text`` is falsy or consists only of whitespace."""
        content = self.text
        return not (content and content.strip())
class Span(BaseModel):
    """A run of text, optionally broken down into its individual characters."""

    text: str
    bbox: Optional[BoundingBox] = None
    style: Optional[Style] = None
    characters: Optional[List[Character]] = None
    page: Optional[int] = None
    source_data: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when ``text`` is falsy or consists only of whitespace."""
        content = self.text
        return not (content and content.strip())
class Line(BaseModel):
    """A line of text, optionally broken down into spans."""

    text: str
    bbox: Optional[BoundingBox] = None
    style: Optional[Style] = None
    spans: Optional[List[Span]] = None
    page: Optional[int] = None
    source_data: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when ``text`` is falsy or consists only of whitespace."""
        content = self.text
        return not (content and content.strip())
class Block(BaseModel, ABC):
    """Common base for the block types placed on a page.

    ``TextBlock``, ``ImageBlock`` and ``TableBlock`` in this module derive
    from it and add their own content fields.
    """

    # Required string naming the kind of block.
    # NOTE(review): the set of accepted values is not constrained here — confirm with the parsers.
    type: str
    role: Optional[str] = 'generic'
    """Document Structure role recognized for this block"""
    bbox: Optional[BoundingBox] = None
    page: Optional[int] = None
    source_data: Optional[dict[str, Any]] = None
    # Document.markdown() inspects this value (case-insensitively) to pick
    # heading or list rendering for text blocks.
    category: Optional[str] = None
    """Category attributed to this block by the parser"""
class TextBlock(Block):
    """Block carrying textual content, optionally broken down into lines."""

    style: Optional[Style] = None
    # Used as the heading depth by Document.markdown() for heading-like categories.
    level: Optional[int] = None
    lines: Optional[List[Line]] = None
    text: str

    def isEmpty(self) -> bool:
        """Return True when ``text`` is falsy or consists only of whitespace."""
        content = self.text
        return not (content and content.strip())
class ImageBlock(Block):
    """Block representing an image placed on a page."""

    # Document.markdown() takes the text after the last '.' in this value
    # as the image extension.
    name: Optional[str] = None
    # Emitted as the body of the fenced block Document.markdown() writes for images.
    alt_text: Optional[str] = None
class TableBlock(Block):
    """Block representing a table, held as a single text value."""

    # Document.markdown() emits this value stripped and otherwise unchanged.
    text: str

    def isEmpty(self) -> bool:
        """Return True when ``text`` is falsy or consists only of whitespace."""
        content = self.text
        return not (content and content.strip())
class Page(BaseModel):
    """One page of a document: its number, size, blocks and plain text."""

    # NOTE(review): whether numbering is 0- or 1-based is not established here — confirm.
    number: int
    width: Optional[float] = None
    height: Optional[float] = None
    blocks: Optional[List[TextBlock | ImageBlock | TableBlock]] = None
    text: str
    source_data: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when ``text`` is falsy or consists only of whitespace.

        Only ``text`` is inspected; ``blocks`` is not consulted.
        """
        content = self.text
        return not (content and content.strip())
class Metadata(BaseModel):
    """Optional descriptive metadata about a document.

    Every field is an optional string defaulting to ``None``.
    """

    title: Optional[str] = None
    author: Optional[str] = None
    subject: Optional[str] = None
    keywords: Optional[str] = None
    creator: Optional[str] = None
    producer: Optional[str] = None
    # NOTE(review): dates are stored as plain strings; the expected format
    # (presumably ISO 8601) is not enforced here — confirm.
    created_at: Optional[str] = None
    updated_at: Optional[str] = None
class Document(BaseModel):
    """A parsed document: an ordered list of pages plus optional metadata.

    Besides holding the data, the model can render itself as plain text
    (:meth:`text`), Markdown (:meth:`markdown`) or content-md
    (:meth:`contentmd`).
    """

    filename: Optional[str] = None
    language: Optional[str] = None
    metadata: Optional[Metadata] = None
    pages: List[Page]
    outline: Optional[List[str]] = None
    source_data: Optional[dict[str, Any]] = None
    parsing_metadata: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when every page is empty.

        A document without pages is reported as empty as well.
        """
        return all(page.isEmpty() for page in self.pages)

    def text(self, page_separator: Optional[str] = '---') -> str:
        """Get the full text content of the document.

        Parameters
        ----------
        page_separator : str, optional
            String to use as separator between pages, by default "---"
            Set to empty string or None to disable page separation

        Returns
        -------
        str
            The concatenated text of all non-empty pages with optional
            separators. Pages whose text is missing or whitespace-only are
            skipped, so no empty entries appear between separators.
        """
        if not self.pages:
            return ''

        # Strip first, then filter: a whitespace-only page must not survive
        # as an empty entry, otherwise the output gets doubled separators.
        stripped = ((page.text or '').strip() for page in self.pages)
        texts = [content for content in stripped if content]
        if not texts:
            return ''

        joiner = f'\n{page_separator}\n' if page_separator else '\n'
        return joiner.join(texts)

    def contentmd(
        self,
        title: Optional[str] = None,
        description: Optional[str] = None,
        date: Optional[str] = None,
        license: Optional[str] = None,
        author: Optional[str] = None,
        page_separators: bool = False,
    ) -> str:
        """Get the document content formatted as content-md.

        Delegates to :class:`~parxy_core.services.ContentMdService`.

        Parameters
        ----------
        title : str, optional
            Document title. Falls back to metadata.title, a heading inferred
            from the first page, filename, then 'Untitled'.
        description : str, optional
            Short summary (~200 characters). Falls back to a doc-abstract block,
            then the longest TextBlock across the first two pages.
        date : str, optional
            Creation/publication date in ISO 8601. Falls back to metadata dates.
        license : str, optional
            License name or SPDX identifier.
        author : str, optional
            Author name. Falls back to metadata.author.
        page_separators : bool, optional
            Forwarded unchanged to ``ContentMdService.render``, by default
            False. Presumably toggles per-page markers as in
            :meth:`markdown` — confirm against the service.

        Returns
        -------
        str
            The document content formatted as content-md.
        """
        # Imported at call time rather than at module import.
        # NOTE(review): presumably to avoid a circular import with the
        # services package — confirm.
        from parxy_core.services.contentmd_service import ContentMdService

        return ContentMdService.render(
            self,
            title=title,
            description=description,
            date=date,
            license=license,
            author=author,
            page_separators=page_separators,
        )

    @staticmethod
    def _block_markdown(block: Any) -> List[str]:
        """Render a single block as zero or more Markdown fragments."""
        if isinstance(block, TextBlock):
            category = (block.category or '').lower()

            if category in ('heading', 'title', 'header'):
                # Heading depth comes from the block level (default h2) and
                # is kept within the h1-h6 range Markdown supports.
                level = max(1, min(block.level or 2, 6))
                hashes = '#' * level
                return [f'{hashes} {block.text.strip()}']

            if category == 'list':
                # One bullet per non-blank line.
                return [
                    f'- {line.strip()}'
                    for line in block.text.splitlines()
                    if line.strip()
                ]

            # Regular paragraph
            paragraph = block.text.strip()
            return [paragraph] if paragraph else []

        if isinstance(block, ImageBlock):
            # Images become a fenced block tagged "image" or
            # "image:<extension>", with the alt text as its body.
            has_extension = bool(block.name) and '.' in block.name
            ext = block.name.rsplit('.', 1)[-1] if has_extension else ''
            lang = f'image:{ext}' if ext else 'image'
            alt = block.alt_text or ''
            return [f'```{lang}\n{alt}\n```']

        if isinstance(block, TableBlock):
            table_text = block.text.strip()
            return [table_text] if table_text else []

        return []

    def markdown(self, page_separators: bool = False) -> str:
        """Get the document content formatted as Markdown.

        The method attempts to preserve the document structure by:
        1. Converting TextBlocks to paragraphs based on their category
        2. Preserving line breaks where meaningful
        3. Adding section headers based on block levels

        Pages without blocks fall back to their stripped plain text.

        Parameters
        ----------
        page_separators : bool, optional
            When True, inserts an HTML comment ``<!-- page: N -->`` before
            each page's content, by default False

        Returns
        -------
        str
            The document content formatted as Markdown
        """
        if not self.pages:
            return ''

        markdown_parts = []
        for page in self.pages:
            page_parts = []
            if page_separators:
                page_parts.append(f'<!-- page: {page.number} -->')

            if page.blocks:
                for block in page.blocks:
                    page_parts.extend(self._block_markdown(block))
            else:
                page_text = page.text.strip()
                if page_text:
                    page_parts.append(page_text)

            if page_parts:
                markdown_parts.append('\n\n'.join(page_parts))

        return '\n\n'.join(markdown_parts)
@dataclass
class BatchTask:
    """Configuration for a single batch parsing task.

    Allows specifying per-file configuration including drivers and extraction level.

    Attributes
    ----------
    file : str | BytesIO | bytes
        The file to parse (path, URL, or binary data)
    drivers : List[str] | None
        Driver(s) to use for this file. If None, uses batch-level default
    level : str | None
        Extraction level for this file. If None, uses batch-level default

    Example
    -------
    >>> tasks = [
    ...     BatchTask(file='simple.pdf'),  # Uses defaults
    ...     BatchTask(file='complex.pdf', drivers=['llamaparse'], level='line'),
    ...     BatchTask(file=pdf_bytes, drivers=['pymupdf', 'pdfact']),
    ... ]
    >>> results = Parxy.batch(tasks)
    """

    # Only ``file`` is required; the other two fields default to None.
    file: Union[str, BytesIO, bytes]
    drivers: Optional[List[str]] = None
    level: Optional[str] = None
@dataclass
class BatchResult:
    """Outcome of one batch parsing task.

    Attributes
    ----------
    file : str | BytesIO | bytes
        The input file that was processed
    driver : str
        Name of the driver used for parsing
    document : Document | None
        The parsed document, or None when parsing did not produce one
    error : str | None
        Error message when parsing failed, None otherwise
    exception : Exception | None
        The original exception when parsing failed, None otherwise

    Notes
    -----
    ``success`` looks only at ``document`` and ``failed`` looks only at
    ``error``; the two properties are computed independently of each other.
    """

    file: Union[str, BytesIO, bytes]
    driver: str
    document: Optional['Document']
    error: Optional[str]
    exception: Optional[Exception] = None

    @property
    def success(self) -> bool:
        """Tell whether a document was produced."""
        parsed = self.document
        return parsed is not None

    @property
    def failed(self) -> bool:
        """Tell whether an error message was recorded."""
        message = self.error
        return message is not None
class HierarchyLevel(IntEnum):
    """Integer-valued levels of the document hierarchy, from PAGE (0) to CHARACTER (6).

    As an ``IntEnum``, members compare and sort like plain integers.
    """

    # NOTE(review): larger values presumably mean finer granularity, yet
    # PARAGRAPH (1) is ordered before BLOCK (2) — confirm this is intended.
    PAGE = 0
    PARAGRAPH = 1
    BLOCK = 2
    LINE = 3
    SPAN = 4
    WORD = 5
    CHARACTER = 6
def estimate_lines_from_block(
    block: TextBlock, default_font_size: float = 11
) -> TextBlock:
    """Estimate line-level layout inside a text block by splitting text and assigning bounding boxes.

    The text is split on explicit line breaks only; long lines are not
    wrapped. Each resulting line spans the full width of the block. With
    several lines the block height is divided evenly between them; with a
    single line the height is derived from the font size plus 10% spacing.

    Args:
        block (TextBlock): Text block to estimate lines for. It is modified
            in place.
        default_font_size (float): Font size used when the block has no style
            or its style carries no font size. Default to 11.

    Returns:
        TextBlock: The same block with its `lines` field populated. The block
            is returned untouched when it has no text, no bounding box, or
            already has `lines` set.
    """
    if not block.text or not block.bbox or block.lines is not None:
        return block

    # splitlines() yields at least one item for non-empty text; the fallback
    # only guards against that ever not being the case.
    raw_lines = block.text.splitlines() or [block.text]
    n_lines = len(raw_lines)

    if n_lines > 1:
        # Spread the lines evenly over the height of the block.
        line_height = (block.bbox.y1 - block.bbox.y0) / n_lines
    else:
        # A style may exist without a font size; fall back to the default
        # instead of multiplying None.
        font_size = block.style.font_size if block.style else None
        if font_size is None:
            font_size = default_font_size
        line_height = font_size * 1.1  # 10% line spacing

    x0 = block.bbox.x0
    x1 = block.bbox.x1
    y_top = block.bbox.y0

    lines = []
    for idx, line_text in enumerate(raw_lines):
        y0 = y_top + idx * line_height
        lines.append(
            Line(
                text=line_text,
                bbox=BoundingBox(x0=x0, y0=y0, x1=x1, y1=y0 + line_height),
                style=block.style,
                page=block.page,
                source_data={'source': 'split_from_block'},
                spans=None,
            )
        )

    block.lines = lines
    return block