Introduce Table and Image blocks (#53)

avvertix · web-flow · commit 87f0293e94ba · 2026-02-12T15:31:34.000+01:00
* Handle image and table blocks in Llama Parse

* Add Typst sources for empty file and document with headings, images and a table

* Fix page numbers to start from 1
diff --git a/src/parxy_core/drivers/llamaparse.py b/src/parxy_core/drivers/llamaparse.py
@@ -18,7 +18,15 @@
     LlamaPage = object
 
 from parxy_core.drivers import Driver
-from parxy_core.models import Document, Page, BoundingBox, TextBlock, HierarchyLevel
+from parxy_core.models import (
+    Document,
+    Page,
+    BoundingBox,
+    TextBlock,
+    TableBlock,
+    ImageBlock,
+    HierarchyLevel,
+)
 from parxy_core.utils import safe_json_dumps
 from parxy_core.exceptions import (
     ParsingException,
@@ -502,6 +510,96 @@ def _convert_text_block(text_block: PageItem, page_number: int) -> TextBlock:
     )
 
 
+def _convert_table_block(text_block: PageItem, page_number: int) -> TableBlock:
+    """Convert a LlamaParse `PageItem` with table type to a `TableBlock`.
+
+    Parameters
+    ----------
+    text_block : PageItem
+        The LlamaParse page item containing table data.
+    page_number : int
+        The page number (1-based).
+
+    Returns
+    -------
+    TableBlock
+        The converted `TableBlock` object with markdown table content.
+    """
+    bbox = BoundingBox(
+        x0=text_block.bBox.x,
+        y0=text_block.bBox.y,
+        x1=text_block.bBox.x + text_block.bBox.w,
+        y1=text_block.bBox.y + text_block.bBox.h,
+    )
+    # Use markdown representation as the text content for tables
+    text_value = getattr(text_block, 'md', '') or ''
+    category = text_block.type
+    role = LLAMAPARSE_TO_ROLE.get(category, 'table') if category else 'table'
+    return TableBlock(
+        type='table',
+        role=role,
+        category=category,
+        text=text_value,
+        bbox=bbox,
+        page=page_number,
+        source_data=text_block.model_dump(exclude={'bBox', 'value', 'type', 'lvl'}),
+    )
+
+
+def _convert_image_block(image_data, page_number: int) -> ImageBlock:
+    """Convert a LlamaParse image entry to an `ImageBlock`.
+
+    Parameters
+    ----------
+    image_data
+        Image data from the LlamaParse page (model object or dict).
+    page_number : int
+        The page number (1-based).
+
+    Returns
+    -------
+    ImageBlock
+        The converted `ImageBlock` object.
+    """
+    # Normalise to dict so we can handle both Pydantic models and plain dicts
+    if isinstance(image_data, dict):
+        img = image_data
+    elif hasattr(image_data, 'model_dump'):
+        img = image_data.model_dump()
+    else:
+        img = vars(image_data)
+
+    bbox = BoundingBox(
+        x0=img.get('x', 0),
+        y0=img.get('y', 0),
+        x1=img.get('x', 0) + img.get('width', 0),
+        y1=img.get('y', 0) + img.get('height', 0),
+    )
+
+    # Build alt_text from OCR entries when available
+    ocr_entries = img.get('ocr') or []
+    alt_text = (
+        ' '.join(
+            entry.get('text', '')
+            if isinstance(entry, dict)
+            else getattr(entry, 'text', '')
+            for entry in ocr_entries
+        ).strip()
+        or None
+    )
+
+    return ImageBlock(
+        type='image',
+        role='figure',
+        category='figure',
+        name=img.get('name'),
+        alt_text=alt_text,
+        bbox=bbox,
+        page=page_number,
+        source_data=img,
+    )
+
+
 def _convert_page(
     page: LlamaPage,
     level: str,
@@ -520,15 +618,25 @@ def _convert_page(
     Page
         The converted `Page` object.
     """
-    text_blocks = None
+    blocks = None
     if HierarchyLevel[level] >= HierarchyLevel.BLOCK:
-        text_blocks = [_convert_text_block(item, page.page - 1) for item in page.items]
+        blocks = []
+        for item in page.items:
+            if item.type in ('table', 'tables'):
+                blocks.append(_convert_table_block(item, page.page))
+            else:
+                blocks.append(_convert_text_block(item, page.page))
+
+        # Process page-level images into ImageBlocks
+        images = getattr(page, 'images', None) or []
+        for image_data in images:
+            blocks.append(_convert_image_block(image_data, page.page))
     return Page(
-        number=page.page - 1,
+        number=page.page,
         width=page.width,
         height=page.height,
         text=page.text if page.text != 'NO_CONTENT_HERE' else '',
-        blocks=text_blocks,
+        blocks=blocks,
         source_data=page.model_dump(
             exclude={'page', 'text', 'items', 'width', 'height'}
         ),
diff --git a/src/parxy_core/drivers/llmwhisperer.py b/src/parxy_core/drivers/llmwhisperer.py
@@ -285,7 +285,7 @@ def llmwhisperer_to_parxy(
     ):
         pages.append(
             Page(
-                number=page_number,
+                number=page_number + 1,
                 text=page_text,
                 source_data=doc['extraction']['metadata'].get(str(page_number), None),
             )
diff --git a/src/parxy_core/models/models.py b/src/parxy_core/models/models.py
@@ -79,10 +79,16 @@ def isEmpty(self) -> bool:
         return not self.text or self.text.strip() == ''
 
 
-class ImageBlock(Block): ...
+class ImageBlock(Block):
+    name: Optional[str] = None
+    alt_text: Optional[str] = None
 
 
-class TableBlock(Block): ...
+class TableBlock(Block):
+    text: str
+
+    def isEmpty(self) -> bool:
+        return not self.text or self.text.strip() == ''
 
 
 class Page(BaseModel):
@@ -197,12 +203,18 @@ def markdown(self) -> str:
                             page_parts.append(block.text.strip())
 
                 elif isinstance(block, ImageBlock):
-                    # Placeholder for images - could be enhanced with actual image data
-                    page_parts.append('![Image]')
+                    ext = (
+                        block.name.rsplit('.', 1)[-1]
+                        if block.name and '.' in block.name
+                        else ''
+                    )
+                    lang = f'image:{ext}' if ext else 'image'
+                    alt = block.alt_text or ''
+                    page_parts.append(f'```{lang}\n{alt}\n```')
 
                 elif isinstance(block, TableBlock):
-                    # Placeholder for tables - could be enhanced with actual table data
-                    page_parts.append('| Table content |')
+                    if block.text.strip():
+                        page_parts.append(block.text.strip())
 
             if page_parts:
                 markdown_parts.append('\n\n'.join(page_parts))
diff --git a/tests/drivers/test_llamaparse.py b/tests/drivers/test_llamaparse.py
@@ -96,6 +96,7 @@ def test_llamaparse_driver_read_document(self):
         assert document.metadata is None
         assert len(document.pages) == 1
         assert isinstance(document.pages[0], Page)
+        assert document.pages[0].number == 1
         assert (
             document.pages[0].text
             == 'This is the header\n\nThis is a test PDF to be used as input in unit\ntests\n\nThis is a heading 1\nThis is a paragraph below heading 1\n\n1'
diff --git a/tests/drivers/test_llmwhisperer.py b/tests/drivers/test_llmwhisperer.py
@@ -83,6 +83,7 @@ def test_llmwhisperer_driver_read_document(self):
         assert document.metadata is None
         assert len(document.pages) == 1
         assert isinstance(document.pages[0], Page)
+        assert document.pages[0].number == 1
         assert (
             document.pages[0].text
             == '\n\nThis is the header \n\nThis is a test PDF to be used as input in unit \n\ntests \n\nThis is a heading 1 \nThis is a paragraph below heading 1 \n\n                                                       1 \n'
diff --git a/tests/fixtures/pdf-headings-images-tables.pdf b/tests/fixtures/pdf-headings-images-tables.pdf
diff --git a/tests/fixtures/sources/empty-document.typ b/tests/fixtures/sources/empty-document.typ
@@ -0,0 +1 @@
+// This document results in a blank PDF on purpose
diff --git a/tests/fixtures/sources/generated-image.png b/tests/fixtures/sources/generated-image.png
diff --git a/tests/fixtures/sources/headings-images-tables.typ b/tests/fixtures/sources/headings-images-tables.typ
@@ -0,0 +1,57 @@
+= Introduction
+
+In this report, we will write some _lorem ipsum_.
+
+#lorem(90)
+
+
+= Section heading
+
+#lorem(15)
+
++ The climate
+  - Temperature
+  - Precipitation
++ The topography
++ The geology
+
+
+
+== Subsection with image and figure
+
+#image("generated-image.png", width: 50%)
+
+#lorem(15)
+
+
+#figure(
+  image("generated-image.png", width: 50%),
+  caption: [
+    A generated image using _Google Nano Banana_ model of a winter landscape.
+  ],
+)
+
+
+
+== Subsection with table
+
+#lorem(15)
+
+#table(
+  columns: (1fr, auto, auto),
+  inset: 10pt,
+  align: horizon,
+  table.header(
+    [*Shape*], [*Volume*], [*Parameters*],
+  ),
+  "cylinder",
+  $ pi h (D^2 - d^2) / 4 $,
+  [
+    $h$: height \
+    $D$: outer radius \
+    $d$: inner radius
+  ],
+  "tetrahedron",
+  $ sqrt(2) / 12 a^3 $,
+  [$a$: edge length]
+)
diff --git a/tests/test_models.py b/tests/test_models.py

Original file line number	Diff line number	Diff line change
`@@ -285,7 +285,7 @@ def llmwhisperer_to_parxy(`
`285`	`285`	`):`
`286`	`286`	`pages.append(`
`287`	`287`	`Page(`
`288`		`- number=page_number,`
	`288`	`+ number=page_number + 1,`
`289`	`289`	`text=page_text,`
`290`	`290`	`source_data=doc['extraction']['metadata'].get(str(page_number), None),`
`291`	`291`	`)`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+// This document results in a blank PDF on purpose`