-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfind_duplicate.py
More file actions
147 lines (123 loc) · 5.07 KB
/
find_duplicate.py
File metadata and controls
147 lines (123 loc) · 5.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import pickle
import argparse
from pathlib import Path
from PIL import Image
import pillow_avif
from imagededup.methods import CNN
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# --- Configuration ---
# File name of the pickled image index. find_duplicates() looks for it
# directly inside the collection's root directory.
INDEX_FILENAME = "image_database.pkl"
# Minimum cosine similarity (compared with >=) an indexed image must reach to
# be reported as a duplicate. The command-line entry point overwrites this
# value with the --threshold argument before searching.
SIMILARITY_THRESHOLD = 0.98  # Intended range: 0.0 to 1.0. Higher = stricter.
# --- Functions ---
def is_image_readable(filepath):
    """Return True when the file at ``filepath`` opens and decodes as an image.

    Two passes are made over the file: an integrity check via ``verify()``,
    then a full conversion to RGB. Any exception raised by either pass is
    treated as "not readable" and yields False instead of propagating.
    """
    try:
        # Pass 1: integrity check only.
        with Image.open(filepath) as integrity_probe:
            integrity_probe.verify()
        # Pass 2: the file is opened again (Pillow's docs require reopening
        # after verify()) and converted, which forces the pixel data to load.
        with Image.open(filepath) as decode_probe:
            decode_probe.convert('RGB')
    except Exception:
        return False
    else:
        return True
def find_duplicates(input_image, root_dir, threshold=None):
    """
    Search a pre-computed index for images that duplicate ``input_image``.

    The index is the pickled mapping stored as ``INDEX_FILENAME`` inside
    ``root_dir``; its keys are image paths (joined onto ``root_dir`` when
    reporting) and its values are the embeddings of those images. The input
    image is encoded with the imagededup CNN encoder and compared against
    every indexed embedding using cosine similarity.

    Args:
        input_image: Path to the image to look up.
        root_dir: Root folder of the indexed collection (where the index
            file lives).
        threshold: Minimum cosine similarity (inclusive) for a match. When
            None, the current value of the module-level
            ``SIMILARITY_THRESHOLD`` is used.

    Returns:
        None. Progress, errors and results are printed to stdout; every
        handled failure prints an "Error: ..." line and returns early.

    Security note:
        The index is read with ``pickle.load``, which can execute arbitrary
        code. Only use this on index files you generated yourself.
    """
    input_path = Path(input_image).resolve()
    root_path = Path(root_dir).resolve()

    # 1. Validate inputs
    if not input_path.is_file():
        print(f"Error: The input file '{input_image}' does not exist.")
        return
    if not root_path.is_dir():
        print(f"Error: The root directory '{root_dir}' does not exist.")
        return

    # Resolved at call time (not as a default argument) so that callers who
    # reassign the module-level constant before calling are still honored.
    if threshold is None:
        threshold = SIMILARITY_THRESHOLD

    index_file_path = root_path / INDEX_FILENAME
    if not index_file_path.is_file():
        print(f"Error: Index file '{INDEX_FILENAME}' not found in '{root_dir}'.")
        print("Please run the 'indexer.py' script on that directory first.")
        return

    # 2. Load the index
    print("Loading image index...")
    try:
        with open(index_file_path, "rb") as f:
            # SECURITY: unpickling runs arbitrary code from the file; the
            # index must come from a trusted source.
            image_index = pickle.load(f)
    except Exception as e:
        print(f"Error: Could not load the index file. Cause: {e}")
        return
    if not image_index:
        print("The index is empty. Nothing to compare.")
        return
    print(f"Index loaded with {len(image_index)} images.")

    # 3. Generate embedding for the input image
    if not is_image_readable(input_path):
        print(f"Error: The input image '{input_image}' is corrupt or unreadable.")
        return
    print("Generating fingerprint for the input image...")
    cnn_encoder = CNN()
    try:
        # Decode to an in-memory RGB array so the encoder does not depend on
        # the on-disk format.
        with Image.open(input_path) as img:
            img_array = np.array(img.convert('RGB'))
        input_embedding = cnn_encoder.encode_image(image_array=img_array)
        # scikit-learn expects a 2D (n_samples, n_features) array.
        input_embedding_2d = input_embedding.flatten().reshape(1, -1)
    except Exception as e:
        print(f"Error: Could not process the input image. Cause: {e}")
        return

    # 4. Compare and find duplicates
    print(f"Searching for duplicates with a similarity threshold of {threshold}...")
    indexed_paths = list(image_index.keys())
    try:
        # Flatten every stored embedding the same way as the input so the
        # stacked matrix is 2D whether the index holds (D,) or (1, D) arrays.
        indexed_embeddings = np.array(
            [np.asarray(embedding).flatten() for embedding in image_index.values()]
        )
        # One row of similarities: input embedding vs. every indexed embedding.
        sims = cosine_similarity(input_embedding_2d, indexed_embeddings)
    except ValueError as e:
        # Raised when embeddings have inconsistent lengths or do not match
        # the dimensionality of the input embedding.
        print(f"Error: Could not compare against the index. Cause: {e}")
        return

    duplicates_found = []
    for relative_path, similarity_score in zip(indexed_paths, sims[0]):
        if similarity_score >= threshold:
            absolute_path = root_path / relative_path
            # Skip the input image itself when it lives inside the indexed
            # directory (input_path is already resolved above).
            if absolute_path.resolve() != input_path:
                duplicates_found.append((str(absolute_path), similarity_score))

    # 5. Display results
    print("-" * 30)
    if not duplicates_found:
        print("No duplicates found.")
    else:
        print(f"Found {len(duplicates_found)} duplicate(s):")
        # Most similar first
        duplicates_found.sort(key=lambda match: match[1], reverse=True)
        for path, score in duplicates_found:
            print(f" - Path: {path} (Similarity: {score:.4f})")
    print("-" * 30)
# --- Script Execution ---
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, validate the threshold,
    # then run the duplicate search.
    parser = argparse.ArgumentParser(
        description="Finds duplicates of a specific image using a pre-generated index.",
        # Raw formatter keeps the explicit newline in the --threshold help.
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        "input_image",
        help="The path to the image for which you want to find duplicates."
    )
    parser.add_argument(
        "collection_dir",
        help="The root folder of the image collection (where the .pkl file is located)."
    )
    parser.add_argument(
        "-t", "--threshold",
        type=float,
        default=SIMILARITY_THRESHOLD,
        help=f"Similarity threshold to consider an image as a duplicate (0.0 to 1.0).\nDefault: {SIMILARITY_THRESHOLD}"
    )
    args = parser.parse_args()

    # Enforce the documented range. The chained comparison is also False for
    # NaN, which would otherwise be accepted and silently match nothing.
    if not 0.0 <= args.threshold <= 1.0:
        parser.error(f"threshold must be between 0.0 and 1.0, got {args.threshold}")

    # find_duplicates() reads the module-level threshold, so publish the
    # chosen value there before calling it.
    SIMILARITY_THRESHOLD = args.threshold
    find_duplicates(args.input_image, args.collection_dir)