Image-Deduplicator-CNN-PHash-with-AVIF-support/find_duplicate.py at main · specializeddevel/Image-Deduplicator-CNN-PHash-with-AVIF-support · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import pickle
import argparse
from pathlib import Path
from PIL import Image
import pillow_avif
from imagededup.methods import CNN
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- Configuration ---
# Name of the pickled index file that find_duplicates() looks for directly
# inside the collection's root directory.
INDEX_FILENAME = "image_database.pkl"
# Default minimum cosine similarity for a match (0.0 to 1.0). Higher = stricter.
# The command-line entry point overwrites this with the -t/--threshold value.
SIMILARITY_THRESHOLD = 0.98  # Similarity threshold (0.0 to 1.0). Higher = stricter.

# --- Functions ---

def is_image_readable(filepath):
    """
    Reports whether the image at `filepath` can be opened and decoded.

    The file is opened twice: once for an integrity check with verify(),
    and once more to attempt an actual conversion to RGB. Any exception
    raised along the way is treated as "not readable".

    Returns:
        True if both passes succeed, False otherwise.
    """
    try:
        # First pass: integrity check only.
        with Image.open(filepath) as probe:
            probe.verify()
        # Second pass on a fresh handle: force a real decode/conversion.
        with Image.open(filepath) as decoded:
            decoded.convert('RGB')
    except Exception:
        return False
    else:
        return True

def find_duplicates(input_image, root_dir, threshold=None):
    """
    Searches for duplicates of an input image in a pre-computed index.

    All progress, errors and results are reported with print(); the function
    always returns None.

    Args:
        input_image: Path to the image to look up.
        root_dir: Root directory of the image collection. It must contain the
            pickled index file named by INDEX_FILENAME.
        threshold: Minimum cosine similarity for an indexed image to count as
            a duplicate. When None (the default), the module-level
            SIMILARITY_THRESHOLD is read at call time, so existing
            two-argument callers keep their behavior.

    Returns:
        None.
    """
    input_path = Path(input_image).resolve()
    root_path = Path(root_dir).resolve()

    # 1. Validate inputs
    if not input_path.is_file():
        print(f"Error: The input file '{input_image}' does not exist.")
        return
    if not root_path.is_dir():
        print(f"Error: The root directory '{root_dir}' does not exist.")
        return

    index_file_path = root_path / INDEX_FILENAME
    if not index_file_path.is_file():
        print(f"Error: Index file '{INDEX_FILENAME}' not found in '{root_dir}'.")
        print("Please run the 'indexer.py' script on that directory first.")
        return

    # 2. Load the index
    # SECURITY NOTE: unpickling can execute arbitrary code embedded in the
    # file. Only use this on index files that come from a trusted source.
    print("Loading image index...")
    try:
        with open(index_file_path, "rb") as f:
            image_index = pickle.load(f)
    except Exception as e:
        print(f"Error: Could not load the index file. Cause: {e}")
        return

    if not image_index:
        print("The index is empty. Nothing to compare.")
        return

    print(f"Index loaded with {len(image_index)} images.")

    # 3. Generate embedding for the input image
    if not is_image_readable(input_path):
        print(f"Error: The input image '{input_image}' is corrupt or unreadable.")
        return

    print("Generating fingerprint for the input image...")
    cnn_encoder = CNN()
    try:
        # Convert the image to a numpy array in memory for greater compatibility
        with Image.open(input_path) as img:
            img_array = np.array(img.convert('RGB'))
        input_embedding = cnn_encoder.encode_image(image_array=img_array)

        # Flatten and then ensure the embedding is 2D for scikit-learn
        input_embedding_2d = input_embedding.flatten().reshape(1, -1)

    except Exception as e:
        print(f"Error: Could not process the input image. Cause: {e}")
        return

    # 4. Compare and find duplicates
    # Resolve the default here (not in the signature) so that a threshold set
    # on the module global after import is still honored.
    if threshold is None:
        threshold = SIMILARITY_THRESHOLD
    print(f"Searching for duplicates with a similarity threshold of {threshold}...")

    # Extract all embeddings and relative paths from the index.
    # The index is used as a mapping of relative path -> embedding.
    indexed_paths = list(image_index.keys())
    # Force one row per indexed image. This is a no-op for embeddings that are
    # already 1-D, and collapses any extra singleton axis (e.g. (1, D) per
    # image) that would otherwise produce a 3-D array scikit-learn rejects.
    indexed_embeddings = np.array(list(image_index.values())).reshape(
        len(indexed_paths), -1
    )

    # Cosine similarity between the input embedding and every indexed one;
    # the result has a single row because there is a single query vector.
    sims = cosine_similarity(input_embedding_2d, indexed_embeddings)

    duplicates_found = []
    for relative_path, similarity_score in zip(indexed_paths, sims[0]):
        if similarity_score < threshold:
            continue
        # Reconstruct the absolute path
        absolute_path = root_path / relative_path

        # Prevent the image from finding itself if it's inside the indexed
        # directory. input_path was already resolved at the top.
        if absolute_path.resolve() != input_path:
            duplicates_found.append((str(absolute_path), similarity_score))

    # 5. Display results
    print("-" * 30)
    if not duplicates_found:
        print("No duplicates found.")
    else:
        print(f"Found {len(duplicates_found)} duplicate(s):")
        # Sort by similarity score in descending order
        duplicates_found.sort(key=lambda x: x[1], reverse=True)
        for path, score in duplicates_found:
            print(f"  - Path: {path} (Similarity: {score:.4f})")
    print("-" * 30)


# --- Script Execution ---

if __name__ == "__main__":
    # Command-line entry point: parse arguments, apply the threshold override,
    # then run the duplicate search.
    parser = argparse.ArgumentParser(
        description="Finds duplicates of a specific image using a pre-generated index.",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        "input_image",
        help="The path to the image for which you want to find duplicates."
    )
    parser.add_argument(
        "collection_dir",
        help="The root folder of the image collection (where the .pkl file is located)."
    )
    parser.add_argument(
        "-t", "--threshold",
        type=float,
        default=SIMILARITY_THRESHOLD,
        help=f"Similarity threshold to consider an image as a duplicate (0.0 to 1.0).\nDefault: {SIMILARITY_THRESHOLD}"
    )
    args = parser.parse_args()

    # Reject values outside the documented range instead of silently
    # reporting "No duplicates found." (e.g. `-t 98` meant as a percentage).
    # The chained comparison is also False for NaN, so that is rejected too.
    if not 0.0 <= args.threshold <= 1.0:
        parser.error(
            f"threshold must be between 0.0 and 1.0 (got {args.threshold})"
        )

    # Update the threshold if provided; find_duplicates() reads this global.
    SIMILARITY_THRESHOLD = args.threshold

    find_duplicates(args.input_image, args.collection_dir)