PyTorch-Vision-Transformers-ViT/app.py at main · jman4162/PyTorch-Vision-Transformers-ViT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""Gradio Web Interface for ViT Image Classifier.

Run with: python app.py
Opens at: http://localhost:7860

This creates an interactive web interface for classifying images
using a fine-tuned Vision Transformer model.
"""

import torch
import gradio as gr
from PIL import Image

from vit_trainer import (
    load_model,
    CIFAR10_CLASSES,
    get_val_transform,
    visualize_attention,
    show_attention_on_image,
)

# Configuration: which ViT variant to build, where the fine-tuned checkpoint
# is expected, and which device inference runs on (CUDA when available).
MODEL_VARIANT = "vit_b_16"
MODEL_PATH = "best_model_vit_b_16_cifar10.pt"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model, preferring the fine-tuned checkpoint. A missing checkpoint
# file is not fatal: the model is built again without a checkpoint path and a
# warning is printed so the operator knows the fine-tuned weights are absent.
try:
    model = load_model(
        MODEL_VARIANT, num_classes=10, checkpoint_path=MODEL_PATH, device=DEVICE
    )
except FileNotFoundError:
    model = load_model(MODEL_VARIANT, num_classes=10, device=DEVICE)
    print(f"Warning: {MODEL_PATH} not found. Using pretrained weights only.")
else:
    print(f"Loaded model from {MODEL_PATH}")

# The app only runs inference, so put the model in evaluation mode.
model.eval()

# Preprocessing pipeline applied to every uploaded image before inference.
transform = get_val_transform(image_size=224)


def predict(image: Image.Image) -> dict:
    """Predict class probabilities for an uploaded image.

    Args:
        image: PIL Image from Gradio, or None when no image is set.

    Returns:
        Dictionary mapping each class name in CIFAR10_CLASSES to its softmax
        probability. Every probability is 0.0 when ``image`` is None.
    """
    if image is None:
        return {cls: 0.0 for cls in CIFAR10_CLASSES}

    # Normalise the colour mode first: grayscale, palette or RGBA inputs would
    # otherwise yield a tensor whose channel count differs from an RGB image.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Preprocess into a single-image batch on the inference device.
    input_tensor = transform(image).unsqueeze(0).to(DEVICE)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model(input_tensor)
        probs = torch.softmax(outputs, dim=1)[0]

    # Pair class names with probabilities directly instead of indexing with a
    # hard-coded class count; tolist() converts all values to floats at once.
    return dict(zip(CIFAR10_CLASSES, probs.tolist()))


def predict_with_attention(image: Image.Image):
    """Predict class probabilities and build an attention visualization.

    Args:
        image: PIL Image from Gradio, or None when no image is set.

    Returns:
        Tuple of (predictions dict, image). The dict maps each class name in
        CIFAR10_CLASSES to its softmax probability. The image is the attention
        overlay when an attention map is available, otherwise the input
        resized to 224x224. For a None input the result is an all-zero
        predictions dict and None.
    """
    if image is None:
        return {cls: 0.0 for cls in CIFAR10_CLASSES}, None

    # Normalise the colour mode first: grayscale, palette or RGBA inputs would
    # otherwise yield a tensor whose channel count differs from an RGB image.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Preprocess into a single-image batch on the inference device.
    input_tensor = transform(image).unsqueeze(0).to(DEVICE)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model(input_tensor)
        probs = torch.softmax(outputs, dim=1)[0]

    # Pair class names with probabilities directly instead of indexing with a
    # hard-coded class count; tolist() converts all values to floats at once.
    predictions = dict(zip(CIFAR10_CLASSES, probs.tolist()))

    # Both return paths show the image at 224x224, so resize it once.
    resized = image.resize((224, 224))

    # The helper may return None, in which case the plain resized image is
    # shown instead of an overlay.
    attn_map = visualize_attention(model, input_tensor[0], device=DEVICE)
    if attn_map is None:
        return predictions, resized

    # NOTE(review): assumes show_attention_on_image returns an array that
    # Image.fromarray accepts (e.g. uint8 HxWx3) -- confirm against vit_trainer.
    overlay = show_attention_on_image(resized, attn_map)
    return predictions, Image.fromarray(overlay)


# Build the Gradio UI. Components are created inside nested context managers
# (Blocks -> Tab -> Row -> Column), so the order and nesting of the statements
# below define the page layout. `demo` is launched by the __main__ guard.
with gr.Blocks(title="ViT Image Classifier") as demo:
    # Page header: describes the model, its classes and a usage note.
    gr.Markdown(
        """
        # Vision Transformer Image Classifier

        Upload an image to classify it using a fine-tuned Vision Transformer (ViT).

        **Model**: vit_b_16 fine-tuned on CIFAR-10

        **Classes**: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck

        **Note**: This model was trained on 32x32 CIFAR-10 images. For best results,
        use images of single objects similar to the training data.
        """
    )

    # Tab 1: class probabilities only.
    with gr.Tab("Simple Classification"):
        with gr.Row():
            # Left column: image upload and the trigger button.
            with gr.Column():
                input_image = gr.Image(type="pil", label="Upload an Image")
                classify_btn = gr.Button("Classify", variant="primary")
            # Right column: label widget limited to the five top classes.
            with gr.Column():
                output_label = gr.Label(num_top_classes=5, label="Predictions")

        # predict runs both on button click and whenever the image component
        # changes; predict itself handles a None image by returning zeros.
        classify_btn.click(predict, inputs=input_image, outputs=output_label)
        input_image.change(predict, inputs=input_image, outputs=output_label)

    # Tab 2: class probabilities plus an attention overlay image.
    with gr.Tab("With Attention Visualization"):
        with gr.Row():
            # Left column: image upload and the trigger button.
            with gr.Column():
                input_image_attn = gr.Image(type="pil", label="Upload an Image")
                classify_attn_btn = gr.Button("Classify with Attention", variant="primary")
            # Right column: predictions above the overlay image.
            with gr.Column():
                output_label_attn = gr.Label(num_top_classes=5, label="Predictions")
                output_attention = gr.Image(type="pil", label="Attention Overlay")

        # Unlike the first tab, this one is wired to the button click only;
        # there is no change handler on the image component.
        classify_attn_btn.click(
            predict_with_attention,
            inputs=input_image_attn,
            outputs=[output_label_attn, output_attention],
        )

    # Page footer with a link to the project repository.
    gr.Markdown(
        """
        ---
        Built with [vit-trainer](https://github.com/jman4162/PyTorch-Vision-Transformers-ViT)
        """
    )


if __name__ == "__main__":
    # Report the selected compute device before the server starts.
    print(f"Running on device: {DEVICE}")
    print("Starting Gradio interface...")

    # Bind to 0.0.0.0 on port 7860 so the server listens on all network
    # interfaces; share=False means no public Gradio share link is requested.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)