Skip to content

Commit 87f0293

Browse files
authored
Introduce Table and Image blocks (#53)
* Handle image and table blocks in LlamaParse * Add Typst sources for an empty file and a document with headings, images and a table * Fix page numbers to start from 1
1 parent 8a6744b commit 87f0293

10 files changed

Lines changed: 336 additions & 14 deletions

File tree

src/parxy_core/drivers/llamaparse.py

Lines changed: 113 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,15 @@
1818
LlamaPage = object
1919

2020
from parxy_core.drivers import Driver
21-
from parxy_core.models import Document, Page, BoundingBox, TextBlock, HierarchyLevel
21+
from parxy_core.models import (
22+
Document,
23+
Page,
24+
BoundingBox,
25+
TextBlock,
26+
TableBlock,
27+
ImageBlock,
28+
HierarchyLevel,
29+
)
2230
from parxy_core.utils import safe_json_dumps
2331
from parxy_core.exceptions import (
2432
ParsingException,
@@ -502,6 +510,96 @@ def _convert_text_block(text_block: PageItem, page_number: int) -> TextBlock:
502510
)
503511

504512

513+
def _convert_table_block(text_block: PageItem, page_number: int) -> TableBlock:
    """Convert a LlamaParse `PageItem` with table type to a `TableBlock`.

    Parameters
    ----------
    text_block : PageItem
        The LlamaParse page item containing table data.
    page_number : int
        The page number, stored unchanged on the block. `_convert_page`
        passes LlamaParse's `page.page` value, which is 1-based.

    Returns
    -------
    TableBlock
        The converted `TableBlock` object with markdown table content.
    """
    # LlamaParse reports origin + size; BoundingBox wants two corners.
    bbox = BoundingBox(
        x0=text_block.bBox.x,
        y0=text_block.bBox.y,
        x1=text_block.bBox.x + text_block.bBox.w,
        y1=text_block.bBox.y + text_block.bBox.h,
    )
    # Use markdown representation as the text content for tables; a missing
    # or None `md` attribute collapses to an empty string.
    text_value = getattr(text_block, 'md', '') or ''
    category = text_block.type
    # Unknown or absent categories fall back to the generic 'table' role.
    role = LLAMAPARSE_TO_ROLE.get(category, 'table') if category else 'table'
    return TableBlock(
        type='table',
        role=role,
        category=category,
        text=text_value,
        bbox=bbox,
        page=page_number,
        # Fields already mapped onto the block are left out of the raw dump.
        source_data=text_block.model_dump(exclude={'bBox', 'value', 'type', 'lvl'}),
    )
547+
548+
549+
def _convert_image_block(image_data, page_number: int) -> ImageBlock:
    """Convert a LlamaParse image entry to an `ImageBlock`.

    Parameters
    ----------
    image_data
        Image data from the LlamaParse page (model object or dict).
    page_number : int
        The page number, stored unchanged on the block. `_convert_page`
        passes LlamaParse's `page.page` value, which is 1-based.

    Returns
    -------
    ImageBlock
        The converted `ImageBlock` object.
    """
    # Normalise to dict so we can handle both Pydantic models and plain dicts
    if isinstance(image_data, dict):
        img = image_data
    elif hasattr(image_data, 'model_dump'):
        img = image_data.model_dump()
    else:
        img = vars(image_data)

    # `model_dump()` keeps keys whose value is None, so `.get(key, 0)` alone
    # would feed None into the arithmetic below. Treat None like a missing
    # key and fall back to 0.
    x = img.get('x') or 0
    y = img.get('y') or 0
    width = img.get('width') or 0
    height = img.get('height') or 0
    bbox = BoundingBox(x0=x, y0=y, x1=x + width, y1=y + height)

    # Build alt_text from OCR entries when available. Entries may be dicts or
    # objects; a None `text` is coerced to '' so `join` never sees a non-string.
    ocr_entries = img.get('ocr') or []
    ocr_texts = [
        (
            entry.get('text')
            if isinstance(entry, dict)
            else getattr(entry, 'text', None)
        )
        or ''
        for entry in ocr_entries
    ]
    # An empty or whitespace-only result is reported as None, not ''.
    alt_text = ' '.join(ocr_texts).strip() or None

    return ImageBlock(
        type='image',
        role='figure',
        category='figure',
        name=img.get('name'),
        alt_text=alt_text,
        bbox=bbox,
        page=page_number,
        source_data=img,
    )
601+
602+
505603
def _convert_page(
506604
page: LlamaPage,
507605
level: str,
@@ -520,15 +618,25 @@ def _convert_page(
520618
Page
521619
The converted `Page` object.
522620
"""
523-
text_blocks = None
621+
blocks = None
524622
if HierarchyLevel[level] >= HierarchyLevel.BLOCK:
525-
text_blocks = [_convert_text_block(item, page.page - 1) for item in page.items]
623+
blocks = []
624+
for item in page.items:
625+
if item.type in ('table', 'tables'):
626+
blocks.append(_convert_table_block(item, page.page))
627+
else:
628+
blocks.append(_convert_text_block(item, page.page))
629+
630+
# Process page-level images into ImageBlocks
631+
images = getattr(page, 'images', None) or []
632+
for image_data in images:
633+
blocks.append(_convert_image_block(image_data, page.page))
526634
return Page(
527-
number=page.page - 1,
635+
number=page.page,
528636
width=page.width,
529637
height=page.height,
530638
text=page.text if page.text != 'NO_CONTENT_HERE' else '',
531-
blocks=text_blocks,
639+
blocks=blocks,
532640
source_data=page.model_dump(
533641
exclude={'page', 'text', 'items', 'width', 'height'}
534642
),

src/parxy_core/drivers/llmwhisperer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def llmwhisperer_to_parxy(
285285
):
286286
pages.append(
287287
Page(
288-
number=page_number,
288+
number=page_number + 1,
289289
text=page_text,
290290
source_data=doc['extraction']['metadata'].get(str(page_number), None),
291291
)

src/parxy_core/models/models.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,16 @@ def isEmpty(self) -> bool:
7979
return not self.text or self.text.strip() == ''
8080

8181

82-
class ImageBlock(Block): ...
82+
class ImageBlock(Block):
    """Block representing an image, with an optional name and alternative text."""

    # Name of the image, if known; defaults to None.
    name: Optional[str] = None
    # Alternative text describing the image, if available; defaults to None.
    alt_text: Optional[str] = None
8385

8486

85-
class TableBlock(Block): ...
87+
class TableBlock(Block):
    """Block representing a table, whose content is carried as text."""

    # Textual content of the table (required).
    text: str

    def isEmpty(self) -> bool:
        """Return True when `text` is empty or contains only whitespace."""
        return not (self.text and self.text.strip())
8692

8793

8894
class Page(BaseModel):
@@ -197,12 +203,18 @@ def markdown(self) -> str:
197203
page_parts.append(block.text.strip())
198204

199205
elif isinstance(block, ImageBlock):
200-
# Placeholder for images - could be enhanced with actual image data
201-
page_parts.append('![Image]')
206+
ext = (
207+
block.name.rsplit('.', 1)[-1]
208+
if block.name and '.' in block.name
209+
else ''
210+
)
211+
lang = f'image:{ext}' if ext else 'image'
212+
alt = block.alt_text or ''
213+
page_parts.append(f'```{lang}\n{alt}\n```')
202214

203215
elif isinstance(block, TableBlock):
204-
# Placeholder for tables - could be enhanced with actual table data
205-
page_parts.append('| Table content |')
216+
if block.text.strip():
217+
page_parts.append(block.text.strip())
206218

207219
if page_parts:
208220
markdown_parts.append('\n\n'.join(page_parts))

tests/drivers/test_llamaparse.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def test_llamaparse_driver_read_document(self):
9696
assert document.metadata is None
9797
assert len(document.pages) == 1
9898
assert isinstance(document.pages[0], Page)
99+
assert document.pages[0].number == 1
99100
assert (
100101
document.pages[0].text
101102
== 'This is the header\n\nThis is a test PDF to be used as input in unit\ntests\n\nThis is a heading 1\nThis is a paragraph below heading 1\n\n1'

tests/drivers/test_llmwhisperer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ def test_llmwhisperer_driver_read_document(self):
8383
assert document.metadata is None
8484
assert len(document.pages) == 1
8585
assert isinstance(document.pages[0], Page)
86+
assert document.pages[0].number == 1
8687
assert (
8788
document.pages[0].text
8889
== '\n\nThis is the header \n\nThis is a test PDF to be used as input in unit \n\ntests \n\nThis is a heading 1 \nThis is a paragraph below heading 1 \n\n 1 \n'
1.9 MB
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
// This document results in a blank PDF on purpose
1.96 MB
Loading
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
= Introduction
2+
3+
In this report, we will write some _lorem ipsum_.
4+
5+
#lorem(90)
6+
7+
8+
= Section heading
9+
10+
#lorem(15)
11+
12+
+ The climate
13+
- Temperature
14+
- Precipitation
15+
+ The topography
16+
+ The geology
17+
18+
19+
20+
== Subsection with image and figure
21+
22+
#image("generated-image.png", width: 50%)
23+
24+
#lorem(15)
25+
26+
27+
#figure(
28+
image("generated-image.png", width: 50%),
29+
caption: [
30+
A generated image using _Google Nano Banana_ model of a winter landscape.
31+
],
32+
)
33+
34+
35+
36+
== Subsection with table
37+
38+
#lorem(15)
39+
40+
#table(
41+
columns: (1fr, auto, auto),
42+
inset: 10pt,
43+
align: horizon,
44+
table.header(
45+
[*Shape*], [*Volume*], [*Parameters*],
46+
),
47+
"cylinder",
48+
$ pi h (D^2 - d^2) / 4 $,
49+
[
50+
$h$: height \
51+
$D$: outer radius \
52+
$d$: inner radius
53+
],
54+
"tetrahedron",
55+
$ sqrt(2) / 12 a^3 $,
56+
[$a$: edge length]
57+
)

0 commit comments

Comments
 (0)