Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions multimodal/medical-document-parser/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ORQ_API_KEY=your_orq_api_key_here
4 changes: 4 additions & 0 deletions multimodal/medical-document-parser/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.env
.venv/
__pycache__/
*.pyc
100 changes: 100 additions & 0 deletions multimodal/medical-document-parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Medical Document Parser

> Extract a structured clinical profile from medical PDFs and images using Gemma 4 vision via Orq.ai.

## Overview

Medical Document Parser is a Gradio app that ingests lab reports, prescriptions, imaging results, and clinical notes. PyMuPDF classifies each PDF page as text or vision, routes content to the appropriate extraction path, and uses Gemma 4 (`gemma-4-31b-it`) via [Orq.ai](https://orq.ai) to return a unified JSON clinical profile. Abnormal and critical values are surfaced in a dedicated flagged panel.

## Demo

![Demo](assets/demo.gif)


## Features

- Upload medical PDFs or images (PNG, JPG, WEBP, BMP, TIFF)
- Per-page routing: text pages (>50 characters) vs vision pages (charts, scans, complex layouts)
- Vision pages rendered at 150 DPI with Pillow before analysis
- Structured JSON extraction with patient info, labs, imaging, and clinical signals
- Multi-page PDF support with merged results across pages
- Progress bar during processing
- Abnormal and critical values highlighted in red

## Tech Stack

| Layer | Technology |
|-------|------------|
| LLM | Gemma 4 (`gemma-4-31b-it`) via [Orq.ai](https://orq.ai) (`openai` SDK) |
| PDF parsing | PyMuPDF (`fitz`) |
| Image processing | Pillow |
| UI | Gradio |
| Config | `python-dotenv`, Pydantic |

## Prerequisites

- Python 3.10+
- [Orq.ai API key](https://orq.ai)

## Installation

```bash
git clone https://github.com/Sumanth077/Hands-On-AI-Engineering.git
cd Hands-On-AI-Engineering/multimodal/medical-document-parser
```

**Windows**

```bash
py -m venv .venv
.venv\Scripts\activate
pip install -r requirements.txt
copy .env.example .env
```

**macOS / Linux**

```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
cp .env.example .env
```

Edit `.env` and set your API key before running the app.

## Usage

```bash
python app.py
```

Open the local Gradio URL shown in the terminal (typically `http://127.0.0.1:7860`). Upload a medical document, click **Extract Clinical Profile**, and review the JSON output and flagged values.

## Environment Variables

| Variable | Required | Description |
|----------|----------|-------------|
| `ORQ_API_KEY` | Yes | API key from [Orq.ai](https://orq.ai) |

Copy `.env.example` to `.env` and add your key:

```env
ORQ_API_KEY=your_orq_api_key_here
```

## Project Structure

```text
medical-document-parser/
├── app.py # Gradio UI and orchestration
├── document_processor.py # PDF/image page classification and rendering
├── llm_extractor.py # Gemma 4 API calls via Orq.ai (OpenAI SDK)
├── merger.py # Multi-page result merging
├── schemas.py # Pydantic clinical profile schema
├── requirements.txt
├── .env.example
├── assets/
│ └── demo.png # Demo screenshot
└── README.md
```
110 changes: 110 additions & 0 deletions multimodal/medical-document-parser/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from __future__ import annotations

from pathlib import Path

import gradio as gr
from dotenv import load_dotenv

from document_processor import process_upload
from llm_extractor import _build_client, extract_from_page
from merger import merge_profiles

# Directory that contains this script; used to locate the adjacent .env file.
PROJECT_DIR = Path(__file__).resolve().parent
# Load environment variables from the .env file next to this script.
load_dotenv(PROJECT_DIR / ".env")

# CSS for the textarea inside any component carrying the "flagged-box" class:
# red, semi-bold text on a light red background with a matching border.
CUSTOM_CSS = """
.flagged-box textarea {
color: #dc3545 !important;
font-weight: 600 !important;
background: #fff5f5 !important;
border-color: #f5c2c7 !important;
}
"""


def _format_flagged_text(flagged_items: list[str]) -> str:
if not flagged_items:
return "No abnormal or critical values flagged."
return "\n".join(f"• {item}" for item in flagged_items)


def parse_document(
    upload: str | None,
    progress: gr.Progress = gr.Progress(),
) -> tuple[dict, str]:
    """Extract a merged clinical profile from an uploaded document.

    Args:
        upload: Filesystem path of the uploaded file, or ``None`` when nothing
            was uploaded.
        progress: Gradio progress tracker used to report per-page status.

    Returns:
        A tuple of the merged profile dumped to a plain dict and the flagged
        values formatted as display text.

    Raises:
        gr.Error: If no file was uploaded, the file does not exist, its type
            is unsupported, or it yields no pages.
    """
    if upload is None:
        raise gr.Error("Please upload a medical PDF or image.")

    file_path = Path(upload)
    if not file_path.exists():
        raise gr.Error("Uploaded file could not be found.")

    progress(0, desc="Preparing document...")
    try:
        pages = process_upload(file_path)
    except ValueError as exc:
        # process_upload raises ValueError for unsupported file types; surface
        # it as gr.Error like the other input problems handled above.
        raise gr.Error(str(exc)) from exc
    if not pages:
        raise gr.Error("No pages were found in the uploaded document.")

    client = _build_client()
    profiles = []
    total = len(pages)

    for index, page in enumerate(pages, start=1):
        # Report the share of pages already completed, so the bar does not
        # show 100% while the last (or only) page is still being analysed.
        progress(
            (index - 1) / total,
            desc=f"Analyzing page {index}/{total} ({page.kind}) with Gemma 4...",
        )
        profiles.append(extract_from_page(client, page))

    progress(1.0, desc="Merging results...")
    merged = merge_profiles(profiles)
    payload = merged.model_dump()
    return payload, _format_flagged_text(payload["flagged_items"])


def build_app() -> gr.Blocks:
    """Assemble the Gradio Blocks UI and wire the extract button to parse_document.

    Returns:
        The constructed (not yet launched) Blocks application.
    """
    # NOTE(review): component nesting was reconstructed from a de-indented
    # source using its blank-line grouping — confirm against the real layout.
    accepted_suffixes = [".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"]
    intro_markdown = (
        "\n"
        "# Medical Document Parser\n"
        "Upload a medical **PDF** or **image** (lab report, prescription, imaging result, or clinical notes).\n"
        "The app routes each page through text or vision extraction, then uses **Gemma 4** to build a unified clinical profile.\n"
    )
    how_it_works_markdown = (
        "\n"
        "**How it works**\n"
        "1. PyMuPDF classifies each PDF page as text (>50 characters) or vision.\n"
        "2. Text pages are sent directly to Gemma 4; vision pages are rendered at 150 DPI.\n"
        "3. Per-page extractions are merged into one JSON profile with abnormal values highlighted.\n"
    )

    with gr.Blocks(title="Medical Document Parser") as demo:
        gr.Markdown(intro_markdown)

        # Input row: the document to analyse.
        with gr.Row():
            upload = gr.File(
                label="Medical document",
                file_types=accepted_suffixes,
                type="filepath",
            )

        parse_button = gr.Button("Extract Clinical Profile", variant="primary")

        # Output row: structured JSON beside the flagged-values panel.
        with gr.Row():
            json_output = gr.JSON(label="Structured Clinical Profile")
            flagged_output = gr.Textbox(
                label="Flagged Abnormal Values",
                lines=10,
                elem_classes=["flagged-box"],
            )

        parse_button.click(
            fn=parse_document,
            inputs=[upload],
            outputs=[json_output, flagged_output],
        )

        gr.Markdown(how_it_works_markdown)

    return demo


# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    app = build_app()
    app.launch(css=CUSTOM_CSS, theme=gr.themes.Soft())
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
83 changes: 83 additions & 0 deletions multimodal/medical-document-parser/document_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from __future__ import annotations

import io
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

import fitz
from PIL import Image

# A PDF page whose stripped text is longer than this many characters is
# classified as a text page; anything shorter is rendered as an image.
TEXT_THRESHOLD = 50
# Resolution, in dots per inch, used when rendering a PDF page to an image.
RENDER_DPI = 150

# The two extraction routes a page can take.
PageKind = Literal["text", "vision"]


@dataclass
class ProcessedPage:
    """A single page of an uploaded document, prepared for extraction."""

    # Page position; PDF pages are numbered as ``page.number + 1`` and a
    # standalone image defaults to 1.
    page_number: int
    # Extraction route chosen for this page: "text" or "vision".
    kind: PageKind
    # Extracted plain text; populated for text pages.
    text: str | None = None
    # PNG-encoded image data; populated for vision pages.
    image_bytes: bytes | None = None


def _text_page(page: fitz.Page) -> ProcessedPage:
    """Wrap a PDF page's stripped plain text in a text-kind ProcessedPage."""
    extracted = page.get_text("text").strip()
    return ProcessedPage(page_number=page.number + 1, kind="text", text=extracted)


def _vision_page(page: fitz.Page) -> ProcessedPage:
    """Render a PDF page at RENDER_DPI and return it as a vision-kind page.

    The rendered pixmap is re-encoded to PNG through Pillow.
    """
    rendered = page.get_pixmap(dpi=RENDER_DPI)
    dimensions = (rendered.width, rendered.height)
    with io.BytesIO() as png_sink:
        Image.frombytes("RGB", dimensions, rendered.samples).save(png_sink, format="PNG")
        png_bytes = png_sink.getvalue()
    return ProcessedPage(page_number=page.number + 1, kind="vision", image_bytes=png_bytes)


def _classify_pdf_page(page: fitz.Page) -> ProcessedPage:
    """Route a PDF page to text or vision handling based on its text length.

    Pages whose stripped text exceeds TEXT_THRESHOLD characters become text
    pages; all others are rendered as images via _vision_page.
    """
    extracted = page.get_text("text").strip()
    if len(extracted) <= TEXT_THRESHOLD:
        return _vision_page(page)
    return ProcessedPage(page_number=page.number + 1, kind="text", text=extracted)


def _image_to_page(image_bytes: bytes, page_number: int = 1) -> ProcessedPage:
    """Decode raw image bytes and return them as a PNG-encoded vision page.

    Images that are not already in RGB mode are converted to RGB first.
    """
    decoded = Image.open(io.BytesIO(image_bytes))
    rgb_image = decoded if decoded.mode == "RGB" else decoded.convert("RGB")
    with io.BytesIO() as png_sink:
        rgb_image.save(png_sink, format="PNG")
        png_bytes = png_sink.getvalue()
    return ProcessedPage(page_number=page_number, kind="vision", image_bytes=png_bytes)


def process_upload(file_path: str | Path) -> list[ProcessedPage]:
    """Turn an uploaded PDF or image file into a list of processed pages.

    Args:
        file_path: Location of the uploaded file.

    Returns:
        One ProcessedPage per PDF page, or a single-element list for an image.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor one of the
            supported image extensions.
    """
    source_path = Path(file_path)
    suffix = source_path.suffix.lower()

    if suffix == ".pdf":
        # Build the full list while the document is still open.
        with fitz.open(source_path) as document:
            return [_classify_pdf_page(pdf_page) for pdf_page in document]

    image_suffixes = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"}
    if suffix not in image_suffixes:
        raise ValueError(f"Unsupported file type: {suffix or 'unknown'}")

    return [_image_to_page(source_path.read_bytes())]
Loading