1818 LlamaPage = object
1919
2020from parxy_core .drivers import Driver
21- from parxy_core .models import Document , Page , BoundingBox , TextBlock , HierarchyLevel
21+ from parxy_core .models import (
22+ Document ,
23+ Page ,
24+ BoundingBox ,
25+ TextBlock ,
26+ TableBlock ,
27+ ImageBlock ,
28+ HierarchyLevel ,
29+ )
2230from parxy_core .utils import safe_json_dumps
2331from parxy_core .exceptions import (
2432 ParsingException ,
@@ -502,6 +510,96 @@ def _convert_text_block(text_block: PageItem, page_number: int) -> TextBlock:
502510 )
503511
504512
def _convert_table_block(text_block: PageItem, page_number: int) -> TableBlock:
    """Build a `TableBlock` from a table-typed LlamaParse `PageItem`.

    Parameters
    ----------
    text_block : PageItem
        The LlamaParse page item holding the table data.
    page_number : int
        Page number recorded on the resulting block; stored exactly as given.

    Returns
    -------
    TableBlock
        A block whose text is the item's markdown rendering, or an empty
        string when the item carries no markdown.
    """
    # LlamaParse describes the box as origin + size; BoundingBox wants corners.
    box = text_block.bBox
    left = box.x
    top = box.y
    table_bbox = BoundingBox(
        x0=left,
        y0=top,
        x1=left + box.w,
        y1=top + box.h,
    )

    # Tables expose their content through the markdown field.
    markdown = getattr(text_block, 'md', '')
    if not markdown:
        markdown = ''

    # Default to the generic 'table' role when the item has no type, or when
    # the type has no entry in the role mapping.
    item_type = text_block.type
    resolved_role = 'table'
    if item_type:
        resolved_role = LLAMAPARSE_TO_ROLE.get(item_type, 'table')

    # Fields already mapped onto the block are left out of the raw payload.
    leftover = text_block.model_dump(exclude={'bBox', 'value', 'type', 'lvl'})

    return TableBlock(
        type='table',
        role=resolved_role,
        category=item_type,
        text=markdown,
        bbox=table_bbox,
        page=page_number,
        source_data=leftover,
    )
547+
548+
def _convert_image_block(image_data, page_number: int) -> ImageBlock:
    """Convert a LlamaParse image entry to an `ImageBlock`.

    Parameters
    ----------
    image_data
        Image data from the LlamaParse page. Accepted forms are a plain
        dict, an object exposing `model_dump()`, or any other object whose
        attributes can be read through `vars()`.
    page_number : int
        Page number recorded on the resulting block; stored exactly as given.

    Returns
    -------
    ImageBlock
        A block with role and category fixed to ``'figure'``. Its bounding
        box is derived from the ``x``/``y``/``width``/``height`` entries
        (missing or ``None`` values count as 0) and its ``alt_text`` is the
        space-joined OCR text, or ``None`` when there is no OCR text.
    """
    # Normalise to a dict so every supported input shape is read the same way.
    if isinstance(image_data, dict):
        img = image_data
    elif hasattr(image_data, 'model_dump'):
        img = image_data.model_dump()
    else:
        img = vars(image_data)

    # `dict.get(key, 0)` only falls back when the key is missing; a key that
    # is present with value None would otherwise reach the additions below
    # and raise TypeError. `or 0` covers both cases.
    x0 = img.get('x') or 0
    y0 = img.get('y') or 0
    width = img.get('width') or 0
    height = img.get('height') or 0
    bbox = BoundingBox(
        x0=x0,
        y0=y0,
        x1=x0 + width,
        y1=y0 + height,
    )

    # Build alt_text from OCR entries when available. Entries may be dicts or
    # objects; a None text is treated as empty so str.join never sees it.
    ocr_texts = []
    for entry in img.get('ocr') or []:
        if isinstance(entry, dict):
            text = entry.get('text')
        else:
            text = getattr(entry, 'text', None)
        ocr_texts.append(text or '')
    alt_text = ' '.join(ocr_texts).strip() or None

    return ImageBlock(
        type='image',
        role='figure',
        category='figure',
        name=img.get('name'),
        alt_text=alt_text,
        bbox=bbox,
        page=page_number,
        source_data=img,
    )
601+
602+
505603def _convert_page (
506604 page : LlamaPage ,
507605 level : str ,
@@ -520,15 +618,25 @@ def _convert_page(
520618 Page
521619 The converted `Page` object.
522620 """
523- text_blocks = None
621+ blocks = None
524622 if HierarchyLevel [level ] >= HierarchyLevel .BLOCK :
525- text_blocks = [_convert_text_block (item , page .page - 1 ) for item in page .items ]
623+ blocks = []
624+ for item in page .items :
625+ if item .type in ('table' , 'tables' ):
626+ blocks .append (_convert_table_block (item , page .page ))
627+ else :
628+ blocks .append (_convert_text_block (item , page .page ))
629+
630+ # Process page-level images into ImageBlocks
631+ images = getattr (page , 'images' , None ) or []
632+ for image_data in images :
633+ blocks .append (_convert_image_block (image_data , page .page ))
526634 return Page (
527- number = page .page - 1 ,
635+ number = page .page ,
528636 width = page .width ,
529637 height = page .height ,
530638 text = page .text if page .text != 'NO_CONTENT_HERE' else '' ,
531- blocks = text_blocks ,
639+ blocks = blocks ,
532640 source_data = page .model_dump (
533641 exclude = {'page' , 'text' , 'items' , 'width' , 'height' }
534642 ),
0 commit comments