parxy/src/parxy_core/models/models.py at main · OneOffTech/parxy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
from abc import ABC
from dataclasses import dataclass
from enum import IntEnum
from io import BytesIO
from typing import List, Optional, Any, Union

from pydantic import BaseModel


class BoundingBox(BaseModel):
    """Axis-aligned rectangle given by the two corner points (x0, y0) and (x1, y1).

    ``estimate_lines_from_block`` in this module treats ``y0`` as the top edge,
    ``y1 - y0`` as the height, and ``x0``/``x1`` as the horizontal extent.
    """

    # NOTE(review): units and coordinate origin are not fixed by this model;
    # presumably they are whatever the producing parser reports — confirm.
    x0: float
    y0: float
    x1: float
    y1: float


class Style(BaseModel):
    """Optional typographic attributes attached to text elements.

    Every field defaults to ``None``, meaning "not reported".
    """

    font_name: Optional[str] = None
    # Read by ``estimate_lines_from_block`` to derive a line height.
    font_size: Optional[float] = None
    font_style: Optional[str] = None
    # NOTE(review): the string format (hex, name, ...) is not enforced here.
    color: Optional[str] = None
    # NOTE(review): integer range (0-255? 0-100?) is not enforced here — confirm.
    alpha: Optional[int] = None
    weight: Optional[float] = None


class Character(BaseModel):
    """Text element stored in ``Span.characters``, with optional layout details."""

    # Only ``text`` is required; everything else defaults to None.
    text: str
    bbox: Optional[BoundingBox] = None
    style: Optional[Style] = None
    page: Optional[int] = None
    source_data: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when ``text`` is missing, empty, or whitespace only."""
        return not (self.text and self.text.strip())


class Span(BaseModel):
    """Text element stored in ``Line.spans``; may hold its own ``characters``."""

    # Only ``text`` is required; everything else defaults to None.
    text: str
    bbox: Optional[BoundingBox] = None
    style: Optional[Style] = None
    characters: Optional[List[Character]] = None
    page: Optional[int] = None
    source_data: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when ``text`` is missing, empty, or whitespace only."""
        if not self.text:
            return True
        return self.text.strip() == ''


class Line(BaseModel):
    """Text element stored in ``TextBlock.lines``; may hold its own ``spans``."""

    # Only ``text`` is required; everything else defaults to None.
    text: str
    bbox: Optional[BoundingBox] = None
    style: Optional[Style] = None
    spans: Optional[List[Span]] = None
    page: Optional[int] = None
    source_data: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when ``text`` is missing, empty, or whitespace only."""
        has_content = bool(self.text) and self.text.strip() != ''
        return not has_content


class Block(BaseModel, ABC):
    """Common base for the block kinds a ``Page`` can contain.

    ``TextBlock``, ``ImageBlock`` and ``TableBlock`` derive from it. No
    abstract methods are declared here.
    """

    # Required for every block; the set of valid values is not constrained here.
    type: str
    role: Optional[str] = 'generic'
    """Document Structure role recognized for this block"""
    bbox: Optional[BoundingBox] = None
    page: Optional[int] = None
    source_data: Optional[dict[str, Any]] = None
    # ``Document.markdown`` inspects this (case-insensitively) on text blocks
    # to detect 'heading'/'title'/'header' and 'list'.
    category: Optional[str] = None
    """Category attributed to this block by the parser"""


class TextBlock(Block):
    """Block carrying text, optionally broken down into ``lines``."""

    style: Optional[Style] = None
    # Used by ``Document.markdown`` as the heading depth for heading-like blocks.
    level: Optional[int] = None
    lines: Optional[List[Line]] = None
    text: str

    def isEmpty(self) -> bool:
        """Return True when ``text`` is missing, empty, or whitespace only."""
        return not (self.text and self.text.strip())


class ImageBlock(Block):
    """Block representing an image.

    ``Document.markdown`` renders it as a fenced block whose info string is
    derived from the extension of ``name`` and whose body is ``alt_text``.
    """

    # File name of the image; only its extension is read by ``Document.markdown``.
    name: Optional[str] = None
    # Alternative text; an empty string is used when it is None.
    alt_text: Optional[str] = None


class TableBlock(Block):
    """Block representing a table; ``Document.markdown`` emits its stripped ``text``."""

    text: str

    def isEmpty(self) -> bool:
        """Return True when ``text`` is missing, empty, or whitespace only."""
        if not self.text:
            return True
        return self.text.strip() == ''


class Page(BaseModel):
    """One page of a ``Document``: its number, plain text and optional blocks."""

    number: int
    width: Optional[float] = None
    height: Optional[float] = None
    # When absent or empty, ``Document.markdown`` falls back to ``text``.
    blocks: Optional[List[TextBlock | ImageBlock | TableBlock]] = None
    text: str
    source_data: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when the page ``text`` is missing, empty, or whitespace only."""
        return not (self.text and self.text.strip())


class Metadata(BaseModel):
    """Descriptive document properties; every field is an optional string.

    ``Document.contentmd`` documents ``title``, ``author`` and the date
    fields as fallbacks when the caller does not pass explicit values.
    """

    title: Optional[str] = None
    author: Optional[str] = None
    subject: Optional[str] = None
    keywords: Optional[str] = None
    creator: Optional[str] = None
    producer: Optional[str] = None
    # NOTE(review): dates are plain strings; no format is enforced here.
    created_at: Optional[str] = None
    updated_at: Optional[str] = None


class Document(BaseModel):
    """Parsed document: an ordered list of pages plus optional metadata.

    Offers plain-text, Markdown and content-md renderings of the pages.
    """

    filename: Optional[str] = None
    language: Optional[str] = None
    metadata: Optional[Metadata] = None
    pages: List[Page]
    outline: Optional[List[str]] = None
    source_data: Optional[dict[str, Any]] = None
    parsing_metadata: Optional[dict[str, Any]] = None

    def isEmpty(self) -> bool:
        """Return True when every page is empty (also True when there are no pages)."""
        return all(page.isEmpty() for page in self.pages)

    def text(self, page_separator: Optional[str] = '---') -> str:
        """Get the full text content of the document.

        Pages whose text is empty or whitespace-only are skipped, so they
        never contribute a dangling separator.

        Parameters
        ----------
        page_separator : str, optional
            String to use as separator between pages, by default "---"
            Set to empty string or None to disable page separation

        Returns
        -------
        str
            The concatenated text of all non-empty pages with optional
            separators, or an empty string when there is nothing to output
        """
        if not self.pages:
            return ''

        # Strip first, then filter: a whitespace-only page is as empty as ''.
        stripped_texts = (page.text.strip() for page in self.pages if page.text)
        texts = [page_text for page_text in stripped_texts if page_text]

        # The separator, when given, sits on its own line between pages.
        joiner = f'\n{page_separator}\n' if page_separator else '\n'
        return joiner.join(texts)

    def contentmd(
        self,
        title: Optional[str] = None,
        description: Optional[str] = None,
        date: Optional[str] = None,
        license: Optional[str] = None,
        author: Optional[str] = None,
        page_separators: bool = False,
    ) -> str:
        """Get the document content formatted as content-md.

        Delegates to :class:`~parxy_core.services.ContentMdService`.

        Parameters
        ----------
        title : str, optional
            Document title. Falls back to metadata.title, a heading inferred
            from the first page, filename, then 'Untitled'.
        description : str, optional
            Short summary (~200 characters). Falls back to a doc-abstract block,
            then the longest TextBlock across the first two pages.
        date : str, optional
            Creation/publication date in ISO 8601. Falls back to metadata dates.
        license : str, optional
            License name or SPDX identifier.
        author : str, optional
            Author name. Falls back to metadata.author.
        page_separators : bool, optional
            Forwarded unchanged to ``ContentMdService.render``, by default
            False. Presumably toggles per-page markers as in
            :meth:`markdown`; confirm against the service.

        Returns
        -------
        str
            The document content formatted as content-md.
        """
        # Imported lazily; NOTE(review): presumably to avoid a circular
        # import between the models and the services package.
        from parxy_core.services.contentmd_service import ContentMdService

        return ContentMdService.render(
            self,
            title=title,
            description=description,
            date=date,
            license=license,
            author=author,
            page_separators=page_separators,
        )

    def markdown(self, page_separators: bool = False) -> str:
        """Get the document content formatted as Markdown.

        The method attempts to preserve the document structure by:
        1. Converting TextBlocks to headings, bullet lists or paragraphs
           based on their category
        2. Rendering ImageBlocks as fenced blocks containing the alt text
        3. Emitting TableBlock text as-is
        4. Falling back to the raw page text when a page has no blocks

        Blocks with blank text (including headings) are skipped.

        Parameters
        ----------
        page_separators : bool, optional
            When True, inserts an HTML comment ``<!-- page: N -->`` before
            each page's content, by default False

        Returns
        -------
        str
            The document content formatted as Markdown
        """
        if not self.pages:
            return ''

        heading_categories = ('heading', 'title', 'header')
        markdown_parts = []

        for page in self.pages:
            page_parts = []

            if page_separators:
                page_parts.append(f'<!-- page: {page.number} -->')

            if not page.blocks:
                # No structural information: fall back to the raw page text.
                page_text = page.text.strip() if page.text else ''
                if page_text:
                    page_parts.append(page_text)
            else:
                for block in page.blocks:
                    if isinstance(block, TextBlock):
                        block_text = block.text.strip() if block.text else ''
                        category = block.category.lower() if block.category else ''

                        if category in heading_categories:
                            # A heading marker without text is noise; skip it.
                            if block_text:
                                # Clamp to h1-h6; a missing level defaults to h2.
                                level = max(1, min(block.level or 2, 6))
                                page_parts.append(f'{"#" * level} {block_text}')
                        elif category == 'list':
                            # One bullet per non-blank line of the block.
                            for line in block.text.splitlines():
                                item = line.strip()
                                if item:
                                    page_parts.append(f'- {item}')
                        elif block_text:
                            # Regular paragraph
                            page_parts.append(block_text)

                    elif isinstance(block, ImageBlock):
                        # The file extension (if any) qualifies the fence info string.
                        ext = (
                            block.name.rsplit('.', 1)[-1]
                            if block.name and '.' in block.name
                            else ''
                        )
                        lang = f'image:{ext}' if ext else 'image'
                        alt = block.alt_text or ''
                        page_parts.append(f'```{lang}\n{alt}\n```')

                    elif isinstance(block, TableBlock):
                        table_text = block.text.strip() if block.text else ''
                        if table_text:
                            page_parts.append(table_text)

            if page_parts:
                markdown_parts.append('\n\n'.join(page_parts))

        return '\n\n'.join(markdown_parts)


@dataclass
class BatchTask:
    """Configuration for a single batch parsing task.

    Allows specifying per-file configuration including drivers and extraction level.
    Only ``file`` is required; ``drivers`` and ``level`` default to ``None``.

    Attributes
    ----------
    file : str | BytesIO | bytes
        The file to parse (path, URL, or binary data)
    drivers : List[str] | None
        Driver(s) to use for this file. If None, uses batch-level default
    level : str | None
        Extraction level for this file. If None, uses batch-level default

    Example
    -------
    >>> tasks = [
    ...     BatchTask(file='simple.pdf'),  # Uses defaults
    ...     BatchTask(file='complex.pdf', drivers=['llamaparse'], level='line'),
    ...     BatchTask(file=pdf_bytes, drivers=['pymupdf', 'pdfact']),
    ... ]
    >>> results = Parxy.batch(tasks)
    """

    # Input to parse; the only required field.
    file: Union[str, BytesIO, bytes]
    # None means "use the batch-level default" for both fields below.
    drivers: Optional[List[str]] = None
    level: Optional[str] = None


@dataclass
class BatchResult:
    """Outcome of one batch parsing task.

    Attributes
    ----------
    file : str | BytesIO | bytes
        The input that was handed to the parser
    driver : str
        Name of the driver that handled the input
    document : Document | None
        The parsed document; None when no document was produced
    error : str | None
        Error message for a failed parse; None otherwise
    exception : Exception | None
        The exception behind a failure, when available; defaults to None
    """

    file: Union[str, BytesIO, bytes]
    driver: str
    document: Optional['Document']
    error: Optional[str]
    exception: Optional[Exception] = None

    @property
    def success(self) -> bool:
        """Whether a document was produced (``document`` is not None)."""
        if self.document is None:
            return False
        return True

    @property
    def failed(self) -> bool:
        """Whether an error message was recorded (``error`` is not None)."""
        if self.error is None:
            return False
        return True


class HierarchyLevel(IntEnum):
    """Integer-valued levels of the document hierarchy.

    Members are numbered from ``PAGE`` (0) to ``CHARACTER`` (6) and, being an
    ``IntEnum``, compare as plain integers.
    """

    # NOTE(review): the numbering presumably runs from coarsest to finest
    # granularity — confirm, in particular PARAGRAPH (1) preceding BLOCK (2).
    PAGE = 0
    PARAGRAPH = 1
    BLOCK = 2
    LINE = 3
    SPAN = 4
    WORD = 5
    CHARACTER = 6


def estimate_lines_from_block(
    block: TextBlock, default_font_size: float = 11
) -> TextBlock:
    """Estimate line-level layout inside a text block by splitting text and assigning bounding boxes.

    The text is split on explicit newlines only (no wrapping is attempted).
    With several lines, the block's bounding box height is divided evenly
    between them; with a single line, the line height is the font size plus
    10% spacing. Every line spans the full width of the block.

    The block is modified in place and also returned. It is returned
    untouched when it has no text, no bounding box, or already has ``lines``
    set (even to an empty list).

    Args:
        block (TextBlock): Text block to estimate lines for.
        default_font_size (float): Font size used when the block has no style
            or its style has no font size. Default to 11.

    Returns:
        TextBlock: The same block with its `lines` field populated.
    """
    if not block.text or not block.bbox or block.lines is not None:
        return block

    # Non-empty text always yields at least one entry; the fallback is purely
    # defensive so that a line is always produced.
    raw_lines = block.text.splitlines() or [block.text]
    n_lines = len(raw_lines)

    # A style may exist without a font size; fall back to the default then
    # instead of multiplying None.
    font_size = default_font_size
    if block.style is not None and block.style.font_size is not None:
        font_size = block.style.font_size

    if n_lines > 1:
        # Spread the lines evenly over the height of the block.
        estimated_line_height = (block.bbox.y1 - block.bbox.y0) / n_lines
    else:
        # Single line: font size plus 10% line spacing.
        estimated_line_height = font_size * 1.1

    x0 = block.bbox.x0
    x1 = block.bbox.x1
    y_top = block.bbox.y0

    block.lines = []
    for idx, line_text in enumerate(raw_lines):
        # Lines are stacked downwards from the top edge of the block.
        y0 = y_top + idx * estimated_line_height
        y1 = y0 + estimated_line_height

        block.lines.append(
            Line(
                text=line_text,
                bbox=BoundingBox(x0=x0, y0=y0, x1=x1, y1=y1),
                style=block.style,
                page=block.page,
                source_data={'source': 'split_from_block'},
                spans=None,
            )
        )
    return block