-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathgetimages.py
More file actions
123 lines (98 loc) · 4.86 KB
/
getimages.py
File metadata and controls
123 lines (98 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import os
import requests
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import io
from tqdm import tqdm
# ───── CONFIG ───────────────────────────────────────────────────────────────
# JSON_LOG:   output file that receives the per-page embedding results.
# CORPUS_DIR: directory scanned for the corpus ``*.json`` files.
JSON_LOG = "images_log.json"
CORPUS_DIR = "/home/arka/Desktop/Hackathons/HCLTech_CS671/corpus"
# ────────────────────────────────────────────────────────────────────────────

# Pick the compute device: GPU when CUDA is usable, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load CLIP a single time at import so every embedding call reuses the same
# weights and preprocessing pipeline.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.eval()  # evaluation mode: the model is only used for inference here
def fetch_image_embedding(img_url: str) -> list[float]:
    """Download an image and return its unit-length CLIP image embedding.

    Args:
        img_url: URL of the image to download (10 second request timeout).

    Returns:
        The CLIP image-feature vector of the single downloaded image, divided
        by its L2 norm, as a plain list of floats.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        Any other exception raised by ``requests``, PIL or the model
        propagates to the caller unchanged.
    """
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()

    # Decode straight from memory and force three colour channels.
    raw_bytes = io.BytesIO(response.content)
    image = Image.open(raw_bytes).convert("RGB")

    batch = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = model.get_image_features(**batch)

    # Scale every row to unit L2 length.
    magnitude = features.norm(p=2, dim=-1, keepdim=True)
    unit_features = features / magnitude

    # One image went in, so row 0 is the only embedding.
    return unit_features[0].cpu().tolist()
def _first_heading(data):
    """Return the ``heading`` of the first chunk in ``data`` that has one, else ``None``."""
    chunks = data.get("chunks")
    if not isinstance(chunks, list):
        return None
    for chunk in chunks:
        # Skip malformed (non-dict) chunks instead of failing the whole file.
        if isinstance(chunk, dict) and "heading" in chunk:
            return chunk["heading"]
    return None


def _load_page(filepath):
    """Parse one corpus JSON file into ``(page_url, image_urls, heading)``."""
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError("top-level JSON value is not an object")

    # Fall back to a file:// URL when the document does not record its link.
    page_url = data.get("link", f"file://{filepath}")
    # ``or []`` also covers an explicit ``"images": null``.
    image_urls = list(data.get("images") or [])
    return page_url, image_urls, _first_heading(data)


def process_corpus_images():
    """Embed every image referenced by the corpus and write the results to ``JSON_LOG``.

    Each ``*.json`` file in ``CORPUS_DIR`` is read once. From each file the
    function uses the ``"link"`` value (page URL, defaulting to a ``file://``
    URL of the file itself), the ``"images"`` list of image URLs, and the
    ``"heading"`` of the first entry in ``"chunks"`` that has one. That single
    heading is attached to every image of the page.

    The output JSON maps each page URL to a list of
    ``{"url", "heading", "embedding"}`` entries; pages without images map to
    an empty list. Files that cannot be read or parsed are reported and
    skipped. Images that cannot be fetched or embedded are reported and
    skipped, but still advance the progress bar.

    Returns:
        None. Returns early (after printing a message) when ``CORPUS_DIR``
        does not exist or contains no ``.json`` files.
    """
    if not os.path.exists(CORPUS_DIR):
        print(f"❌ Corpus directory not found: {CORPUS_DIR}")
        return

    # Get all JSON files in the corpus directory
    corpus_files = [f for f in os.listdir(CORPUS_DIR) if f.endswith('.json')]
    if not corpus_files:
        print(f"❌ No JSON files found in corpus directory: {CORPUS_DIR}")
        return

    print(f"🔍 Found {len(corpus_files)} JSON files in corpus directory")

    # Read each file exactly once and keep only the small record needed later,
    # so the image total is known up front without parsing every file twice.
    pages = []
    for filename in corpus_files:
        filepath = os.path.join(CORPUS_DIR, filename)
        try:
            pages.append(_load_page(filepath))
        except (OSError, ValueError, TypeError) as e:
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            print(f"❌ Error processing {filename}: {e}")

    total_images_count = sum(len(image_urls) for _, image_urls, _ in pages)
    print(f"📊 Total images to process: {total_images_count}")

    results = {}
    processed_images_count = 0

    # The context manager closes the bar even if the run is interrupted.
    with tqdm(total=total_images_count, desc="Total progress", unit="img") as overall_pbar:
        for page_url, image_urls, heading in tqdm(pages, desc="Processing files", unit="file"):
            entries = []
            for img_url in image_urls:
                try:
                    emb = fetch_image_embedding(img_url)
                except Exception as e:
                    # Deliberately broad: network, decoding and model errors
                    # are all treated as "skip this image and carry on".
                    overall_pbar.write(f"⚠️ Could not embed {img_url}: {e}")
                else:
                    entries.append({
                        "url": img_url,
                        "heading": heading,
                        "embedding": emb,
                    })
                    processed_images_count += 1
                    overall_pbar.set_postfix({"success": f"{processed_images_count}/{total_images_count}"})
                # Count the image as handled whether or not it succeeded.
                overall_pbar.update(1)
            results[page_url] = entries

    # Write JSON log
    with open(JSON_LOG, "w", encoding="utf-8") as jf:
        json.dump(results, jf, ensure_ascii=False, indent=2)

    print(f"\n🎉 Done—processed {processed_images_count}/{total_images_count} images from {len(results)} files")
    print(f"📄 Results logged to {JSON_LOG}")
# Run the corpus pass only when executed as a script, not when imported.
if __name__ == "__main__":
    process_corpus_images()