MATBot-AI-Assistant/getimages.py at main · SingletLinkage/MATBot-AI-Assistant · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import os
import requests
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import io
from tqdm import tqdm

# ───── CONFIG ───────────────────────────────────────────────────────────────
# Directory that process_corpus_images() scans for *.json corpus files.
# NOTE(review): this is an absolute, machine-specific path — adjust per machine.
CORPUS_DIR = "/home/arka/Desktop/Hackathons/HCLTech_CS671/corpus"
# File the collected embeddings are written to; being a relative path, it is
# resolved against the current working directory.
JSON_LOG = "images_log.json"
# ────────────────────────────────────────────────────────────────────────────

# Load CLIP once, at import time, so that every call to
# fetch_image_embedding() reuses the same model and processor.
device    = "cuda" if torch.cuda.is_available() else "cpu"  # use CUDA when torch reports it available
model     = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.eval()  # put the model in evaluation mode; it is only used for inference here

def fetch_image_embedding(img_url: str) -> list[float]:
    """Download the image at *img_url* and return its L2-normalised CLIP features.

    The image is fetched over HTTP with a 10-second timeout, decoded with
    Pillow, converted to RGB and passed through the module-level CLIP
    ``processor`` and ``model`` on ``device``.

    Args:
        img_url: URL of the image to download.

    Returns:
        The first (only) row of the normalised image features as a plain
        list of floats.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` raise on network or
        HTTP-status failures, and whatever Pillow raises when the payload
        cannot be decoded as an image.
    """
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()

    payload = io.BytesIO(response.content)
    rgb_image = Image.open(payload).convert("RGB")

    batch = processor(images=rgb_image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = model.get_image_features(**batch)
        # Scale each row to unit L2 length.
        l2_norm = features.norm(p=2, dim=-1, keepdim=True)
        unit_features = features / l2_norm

    return unit_features[0].cpu().tolist()

def _first_heading(data: dict):
    """Return the ``heading`` of the first chunk in *data* that has one, else None."""
    chunks = data.get("chunks")
    if not isinstance(chunks, list):
        return None
    for chunk in chunks:
        if isinstance(chunk, dict) and "heading" in chunk:
            return chunk["heading"]
    return None


def _load_corpus_record(filepath: str) -> tuple:
    """Parse one corpus JSON file into ``(page_url, image_urls, heading)``.

    Raises:
        OSError: the file cannot be opened or read.
        ValueError: the file is not valid UTF-8 JSON, its top-level value is
            not an object, or its ``images`` value is not a list.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, dict):
        raise ValueError("top-level JSON value is not an object")

    # Fall back to a file:// URL when "link" is missing, null or empty so the
    # result key is always a usable string.
    page_url = data.get("link") or f"file://{filepath}"

    image_urls = data.get("images") or []
    if not isinstance(image_urls, list):
        raise ValueError('"images" is not a list')

    return page_url, image_urls, _first_heading(data)


def process_corpus_images():
    """Embed every image referenced by the corpus and write the results to JSON_LOG.

    Each ``*.json`` file directly inside ``CORPUS_DIR`` is parsed once. Its
    ``link`` (or a ``file://`` fallback) becomes the result key, and each URL
    in its ``images`` list is embedded with ``fetch_image_embedding``. Every
    successfully embedded image is stored as
    ``{"url": ..., "heading": ..., "embedding": ...}``, where ``heading`` is
    the heading of the first chunk in the file that has one (or ``None``).

    Failures are best-effort: an unreadable or malformed file is reported and
    skipped, and an image that cannot be fetched or embedded is reported and
    skipped without aborting the run. Files sharing the same ``link`` have
    their entries merged under that key rather than overwriting each other.

    Returns:
        None. Results are written to ``JSON_LOG``; progress and a summary are
        printed to the console.
    """
    # isdir (rather than exists) so that a regular file at this path is
    # reported here instead of making os.listdir raise.
    if not os.path.isdir(CORPUS_DIR):
        print(f"❌ Corpus directory not found: {CORPUS_DIR}")
        return

    # Sorted so the processing order, and therefore the output, is
    # deterministic; os.listdir returns entries in arbitrary order.
    corpus_files = sorted(f for f in os.listdir(CORPUS_DIR) if f.endswith('.json'))

    if not corpus_files:
        print(f"❌ No JSON files found in corpus directory: {CORPUS_DIR}")
        return

    print(f"🔍 Found {len(corpus_files)} JSON files in corpus directory")

    # Single parsing pass: keep only what the embedding pass needs, so each
    # file is read once and the image total is known up front.
    records = []
    for filename in corpus_files:
        filepath = os.path.join(CORPUS_DIR, filename)
        try:
            records.append(_load_corpus_record(filepath))
        except (OSError, ValueError) as e:
            print(f"❌ Error processing {filename}: {e}")

    total_images_count = sum(len(image_urls) for _, image_urls, _ in records)
    print(f"📊 Total images to process: {total_images_count}")

    results = {}
    processed_images_count = 0

    # The context manager closes the bar even if the run is interrupted.
    with tqdm(total=total_images_count, desc="Total progress", unit="img") as overall_pbar:
        for page_url, image_urls, heading in tqdm(records, desc="Processing files", unit="file"):
            # setdefault merges files that share a page URL and still records
            # an empty list for pages without images.
            entries = results.setdefault(page_url, [])

            for img_url in image_urls:
                try:
                    emb = fetch_image_embedding(img_url)
                except Exception as e:
                    # Deliberately broad: network, decoding and model errors
                    # should all skip just this one image.
                    overall_pbar.write(f"⚠️ Could not embed {img_url}: {e}")
                else:
                    entries.append({
                        "url":       img_url,
                        "heading":   heading,
                        "embedding": emb
                    })
                    processed_images_count += 1
                    overall_pbar.set_postfix({"success": f"{processed_images_count}/{total_images_count}"})

                overall_pbar.update(1)

    # Write JSON log
    with open(JSON_LOG, "w", encoding="utf-8") as jf:
        json.dump(results, jf, ensure_ascii=False, indent=2)

    print(f"\n🎉 Done—processed {processed_images_count}/{total_images_count} images from {len(records)} files")
    print(f"📄 Results logged to {JSON_LOG}")

# Run the corpus embedding pass only when this file is executed as a script.
if __name__ == "__main__":
    process_corpus_images()