Sumanth077 · Tiioluwani · May 13, 2026 · May 18, 2026
diff --git a/multimodal/medical-document-parser/.env.example b/multimodal/medical-document-parser/.env.example
@@ -0,0 +1 @@
+ORQ_API_KEY=your_orq_api_key_here
diff --git a/multimodal/medical-document-parser/.gitignore b/multimodal/medical-document-parser/.gitignore
@@ -0,0 +1,4 @@
+.env
+.venv/
+__pycache__/
+*.pyc
diff --git a/multimodal/medical-document-parser/README.md b/multimodal/medical-document-parser/README.md
@@ -0,0 +1,100 @@
+# Medical Document Parser
+
+> Extract a structured clinical profile from medical PDFs and images using Gemma 4 vision via Orq.ai.
+
+## Overview
+
+Medical Document Parser is a Gradio app that ingests lab reports, prescriptions, imaging results, and clinical notes. PyMuPDF classifies each PDF page as text or vision, routes content to the appropriate extraction path, and uses Gemma 4 (`gemma-4-31b-it`) via [Orq.ai](https://orq.ai) to return a unified JSON clinical profile. Abnormal and critical values are surfaced in a dedicated flagged panel.
+
+## Demo
+
+![Demo](assets/demo.gif)
+
+
+## Features
+
+- Upload medical PDFs or images (PNG, JPG, WEBP, BMP, TIFF)
+- Per-page routing: text pages (>50 characters) vs vision pages (charts, scans, complex layouts)
+- Vision pages rendered at 150 DPI with Pillow before analysis
+- Structured JSON extraction with patient info, labs, imaging, and clinical signals
+- Multi-page PDF support with merged results across pages
+- Progress bar during processing
+- Abnormal and critical values highlighted in red
+
+## Tech Stack
+
+| Layer | Technology |
+|-------|------------|
+| LLM | Gemma 4 (`gemma-4-31b-it`) via [Orq.ai](https://orq.ai) (`openai` SDK) |
+| PDF parsing | PyMuPDF (`fitz`) |
+| Image processing | Pillow |
+| UI | Gradio |
+| Config | `python-dotenv`, Pydantic |
+
+## Prerequisites
+
+- Python 3.10+
+- [Orq.ai API key](https://orq.ai)
+
+## Installation
+
+```bash
+git clone https://github.com/Sumanth077/Hands-On-AI-Engineering.git
+cd Hands-On-AI-Engineering/multimodal/medical-document-parser
+```
+
+**Windows**
+
+```bash
+py -m venv .venv
+.venv\Scripts\activate
+pip install -r requirements.txt
+copy .env.example .env
+```
+
+**macOS / Linux**
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+cp .env.example .env
+```
+
+Edit `.env` and set your API key before running the app.
+
+## Usage
+
+```bash
+python app.py
+```
+
+Open the local Gradio URL shown in the terminal (typically `http://127.0.0.1:7860`). Upload a medical document, click **Extract Clinical Profile**, and review the JSON output and flagged values.
+
+## Environment Variables
+
+| Variable | Required | Description |
+|----------|----------|-------------|
+| `ORQ_API_KEY` | Yes | API key from [Orq.ai](https://orq.ai) |
+
+Copy `.env.example` to `.env` and add your key:
+
+```env
+ORQ_API_KEY=your_orq_api_key_here
+```
+
+## Project Structure
+
+```text
+medical-document-parser/
+├── app.py                  # Gradio UI and orchestration
+├── document_processor.py   # PDF/image page classification and rendering
+├── llm_extractor.py        # Gemma 4 API calls via Orq.ai (OpenAI SDK)
+├── merger.py               # Multi-page result merging
+├── schemas.py              # Pydantic clinical profile schema
+├── requirements.txt
+├── .env.example
+├── assets/
+│   └── demo.gif            # Demo recording
+└── README.md
+```
diff --git a/multimodal/medical-document-parser/app.py b/multimodal/medical-document-parser/app.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import gradio as gr
+from dotenv import load_dotenv
+
+from document_processor import process_upload
+from llm_extractor import _build_client, extract_from_page
+from merger import merge_profiles
+
+# Directory that contains this module; used to locate the project-local .env.
+PROJECT_DIR = Path(__file__).resolve().parent
+# Load environment variables from the .env file sitting next to this module.
+load_dotenv(PROJECT_DIR / ".env")
+
+# Stylesheet targeting the textarea inside elements with the "flagged-box"
+# class (the class is attached to the flagged-values textbox in build_app).
+CUSTOM_CSS = """
+.flagged-box textarea {
+    color: #dc3545 !important;
+    font-weight: 600 !important;
+    background: #fff5f5 !important;
+    border-color: #f5c2c7 !important;
+}
+"""
+
+
+def _format_flagged_text(flagged_items: list[str]) -> str:
+    """Render flagged values as a bulleted, newline-separated string.
+
+    Returns a fixed placeholder sentence when ``flagged_items`` is empty.
+    """
+    if flagged_items:
+        bullets = [f"• {entry}" for entry in flagged_items]
+        return "\n".join(bullets)
+    return "No abnormal or critical values flagged."
+
+
+def parse_document(
+    upload: str | None,
+    progress: gr.Progress = gr.Progress(),
+) -> tuple[dict, str]:
+    """Run the full extraction pipeline for one uploaded document.
+
+    Args:
+        upload: Filesystem path of the uploaded file, or ``None`` when nothing
+            was uploaded.
+        progress: Gradio progress tracker used to report per-page status.
+
+    Returns:
+        A ``(profile, flagged_text)`` tuple: the merged profile dumped via
+        ``model_dump()`` and its ``flagged_items`` formatted for display.
+
+    Raises:
+        gr.Error: If no file was supplied, the file is missing, it cannot be
+            read as a supported document, or it yields no pages.
+    """
+    if upload is None:
+        raise gr.Error("Please upload a medical PDF or image.")
+
+    file_path = Path(upload)
+    if not file_path.exists():
+        raise gr.Error("Uploaded file could not be found.")
+
+    progress(0, desc="Preparing document...")
+    try:
+        pages = process_upload(file_path)
+    except (ValueError, OSError, RuntimeError) as exc:
+        # process_upload raises ValueError for unsupported extensions.
+        # Unreadable or corrupt files are expected to surface as OSError or
+        # RuntimeError from the parsers (TODO confirm against the pinned
+        # PyMuPDF/Pillow versions). Re-raise as gr.Error so the UI shows the
+        # reason instead of a generic failure.
+        raise gr.Error(f"Could not read the uploaded document: {exc}") from exc
+    if not pages:
+        raise gr.Error("No pages were found in the uploaded document.")
+
+    client = _build_client()
+    profiles = []
+    total = len(pages)
+
+    for index, page in enumerate(pages, start=1):
+        # Report the fraction already finished, so the bar does not read 100%
+        # while the final page is still being analyzed.
+        progress(
+            (index - 1) / total,
+            desc=f"Analyzing page {index}/{total} ({page.kind}) with Gemma 4...",
+        )
+        profiles.append(extract_from_page(client, page))
+
+    progress(1.0, desc="Merging results...")
+    merged = merge_profiles(profiles)
+    payload = merged.model_dump()
+    return payload, _format_flagged_text(payload["flagged_items"])
+
+
+def build_app() -> gr.Blocks:
+    """Assemble the Gradio Blocks interface and return it without launching.
+
+    All components are declared inside a single ``gr.Blocks`` context. The
+    button's click event calls ``parse_document`` with the uploaded file and
+    writes its two return values to the JSON and flagged-values components.
+    """
+    with gr.Blocks(title="Medical Document Parser") as demo:
+        # Introductory heading and description.
+        gr.Markdown(
+            """
+            # Medical Document Parser
+            Upload a medical **PDF** or **image** (lab report, prescription, imaging result, or clinical notes).
+            The app routes each page through text or vision extraction, then uses **Gemma 4** to build a unified clinical profile.
+            """
+        )
+
+        with gr.Row():
+            # type="filepath" matches parse_document, which wraps the value in Path().
+            upload = gr.File(
+                label="Medical document",
+                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"],
+                type="filepath",
+            )
+
+        parse_button = gr.Button("Extract Clinical Profile", variant="primary")
+
+        with gr.Row():
+            json_output = gr.JSON(label="Structured Clinical Profile")
+            # The "flagged-box" class is the selector used by CUSTOM_CSS.
+            flagged_output = gr.Textbox(
+                label="Flagged Abnormal Values",
+                lines=10,
+                elem_classes=["flagged-box"],
+            )
+
+        # Outputs are listed in the same order as parse_document's return tuple.
+        parse_button.click(
+            fn=parse_document,
+            inputs=[upload],
+            outputs=[json_output, flagged_output],
+        )
+
+        # Static explanation of the processing pipeline.
+        gr.Markdown(
+            """
+            **How it works**
+            1. PyMuPDF classifies each PDF page as text (>50 characters) or vision.
+            2. Text pages are sent directly to Gemma 4; vision pages are rendered at 150 DPI.
+            3. Per-page extractions are merged into one JSON profile with abnormal values highlighted.
+            """
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    # NOTE(review): css and theme are passed to launch() here rather than to
+    # gr.Blocks(); confirm the pinned Gradio version accepts them at launch.
+    build_app().launch(css=CUSTOM_CSS, theme=gr.themes.Soft())
diff --git a/multimodal/medical-document-parser/assets/demo.gif b/multimodal/medical-document-parser/assets/demo.gif
diff --git a/multimodal/medical-document-parser/document_processor.py b/multimodal/medical-document-parser/document_processor.py
@@ -0,0 +1,83 @@
+from __future__ import annotations
+
+import io
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal
+
+import fitz
+from PIL import Image
+
+# A PDF page is classified as "text" only when its stripped text is longer
+# than this many characters; otherwise it is rendered as an image.
+TEXT_THRESHOLD = 50
+# DPI passed to get_pixmap() when rasterizing a PDF page.
+RENDER_DPI = 150
+
+# The two kinds a processed page can have.
+PageKind = Literal["text", "vision"]
+
+
+@dataclass
+class ProcessedPage:
+    """One page of an uploaded document, prepared for extraction.
+
+    The builders in this module set ``text`` for "text" pages and
+    ``image_bytes`` (PNG-encoded) for "vision" pages; the other field keeps
+    its ``None`` default.
+    """
+
+    # page.number + 1 for PDF pages; defaults to 1 for standalone images.
+    page_number: int
+    kind: PageKind
+    text: str | None = None
+    image_bytes: bytes | None = None
+
+def _text_page(page: fitz.Page) -> ProcessedPage:
+    """Build a "text" ProcessedPage from the page's stripped plain text."""
+    number = page.number + 1
+    extracted = page.get_text("text").strip()
+    return ProcessedPage(page_number=number, kind="text", text=extracted)
+
+
+def _vision_page(page: fitz.Page) -> ProcessedPage:
+    """Rasterize the page at RENDER_DPI and wrap it as a PNG "vision" page."""
+    rendered = page.get_pixmap(dpi=RENDER_DPI)
+    size = (rendered.width, rendered.height)
+    # The pixmap's raw samples are interpreted as packed RGB.
+    picture = Image.frombytes("RGB", size, rendered.samples)
+
+    png_stream = io.BytesIO()
+    picture.save(png_stream, format="PNG")
+
+    return ProcessedPage(
+        page_number=page.number + 1,
+        kind="vision",
+        image_bytes=png_stream.getvalue(),
+    )
+
+
+def _classify_pdf_page(page: fitz.Page) -> ProcessedPage:
+    """Route a PDF page to the text or vision path based on its text length.
+
+    Pages whose stripped text is longer than TEXT_THRESHOLD become "text"
+    pages carrying that text; every other page is rendered via _vision_page.
+    """
+    stripped = page.get_text("text").strip()
+    if len(stripped) <= TEXT_THRESHOLD:
+        return _vision_page(page)
+    return ProcessedPage(page_number=page.number + 1, kind="text", text=stripped)
+
+
+def _image_to_page(image_bytes: bytes, page_number: int = 1) -> ProcessedPage:
+    """Decode an uploaded image and re-encode it as an upright RGB PNG page.
+
+    Args:
+        image_bytes: Raw bytes of the uploaded image file.
+        page_number: Page number to record on the result (defaults to 1).
+
+    Returns:
+        A "vision" ProcessedPage whose ``image_bytes`` hold the PNG encoding.
+    """
+    # Local import keeps this function self-contained; Pillow is already a
+    # dependency of this module.
+    from PIL import ImageOps
+
+    # The context manager releases the decoder once the PNG has been written.
+    with Image.open(io.BytesIO(image_bytes)) as source:
+        # Bake the EXIF orientation into the pixels so photographed documents
+        # are not handed to the model sideways or upside down.
+        image = ImageOps.exif_transpose(source)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        buffer = io.BytesIO()
+        image.save(buffer, format="PNG")
+
+    return ProcessedPage(
+        page_number=page_number,
+        kind="vision",
+        image_bytes=buffer.getvalue(),
+    )
+
+
+def process_upload(file_path: str | Path) -> list[ProcessedPage]:
+    """Turn an uploaded PDF or image file into a list of ProcessedPage objects.
+
+    PDFs yield one entry per page, each classified by _classify_pdf_page.
+    Supported image files yield a single "vision" page.
+
+    Raises:
+        ValueError: If the file extension is not a supported type.
+    """
+    source = Path(file_path)
+    extension = source.suffix.lower()
+    image_extensions = {
+        ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff",
+    }
+
+    if extension == ".pdf":
+        # Pages are classified while the document is still open.
+        with fitz.open(source) as document:
+            return [_classify_pdf_page(pdf_page) for pdf_page in document]
+
+    if extension in image_extensions:
+        return [_image_to_page(source.read_bytes())]
+
+    raise ValueError(f"Unsupported file type: {extension or 'unknown'}")