search_engine_a/search_engine.py at main · AdarshDec/search_engine_a · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""
Search Engine Core Module
=========================
This module implements the fundamental concepts of search engines:
1. Inverted Index - Maps words to documents
2. TF-IDF Ranking - Calculates relevance scores
3. Document Indexing - Processes and stores documents
"""

import re
import math
from collections import defaultdict
from typing import List, Dict, Tuple
from dataclasses import dataclass


@dataclass
class Document:
    """Represents a document in the search engine"""
    id: int
    title: str
    content: str


class SearchEngine:
    """
    Core Search Engine Implementation

    Key Concepts:
    ------------
    1. INVERTED INDEX:
       - Instead of: Document → Words
       - We store: Word → Documents containing it
       - Example: "python" → [doc1, doc3, doc5]
       - This makes searching MUCH faster!

    2. TF-IDF (Term Frequency - Inverse Document Frequency):
       - TF: How often a word appears in a document (normalized)
       - IDF: How rare/common a word is across all documents
       - Score = TF × IDF
       - Higher score = More relevant document
    """

    def __init__(self):
        # Inverted Index: word → list of [document_id, positions]
        # positions: where the word appears in the document
        # Using lists instead of tuples so we can modify positions
        self.inverted_index: Dict[str, List[List]] = defaultdict(list)

        # Document storage: document_id → Document object
        self.documents: Dict[int, Document] = {}

        # Document word counts: document_id → total word count
        self.doc_word_counts: Dict[int, int] = {}

        # Word document frequency: word → number of documents containing it
        self.word_doc_frequency: Dict[str, int] = defaultdict(int)

        self.next_doc_id = 1

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenization: Break text into individual words

        Steps:
        1. Convert to lowercase (case-insensitive search)
        2. Remove punctuation
        3. Split into words
        4. Filter out empty strings
        """
        # Convert to lowercase and remove punctuation
        text = text.lower()
        # Keep only alphanumeric and spaces
        text = re.sub(r'[^a-z0-9\s]', ' ', text)
        # Split into words and filter empty strings
        words = [word for word in text.split() if word]
        return words

    def _calculate_tf(self, word: str, doc_id: int) -> float:
        """
        Term Frequency (TF): How often a word appears in a document

        Formula: count(word in document) / total words in document

        Why normalize? Longer documents would naturally have more word occurrences.
        Normalizing makes scores comparable across documents of different lengths.
        """
        # Count how many times word appears in this document
        word_count = 0
        if word in self.inverted_index:
            for entry in self.inverted_index[word]:
                doc_id_in_index, positions = entry[0], entry[1]
                if doc_id_in_index == doc_id:
                    word_count = len(positions)
                    break

        # Normalize by document length
        total_words = self.doc_word_counts.get(doc_id, 1)
        return word_count / total_words if total_words > 0 else 0

    def _calculate_idf(self, word: str) -> float:
        """
        Inverse Document Frequency (IDF): How rare/common a word is

        Formula: log(total_documents / documents_containing_word)

        Why log? To dampen the effect - common words shouldn't be weighted too heavily.

        Examples:
        - "the" appears in all documents → IDF = log(10/10) = 0 (not important)
        - "python" appears in 2/10 documents → IDF = log(10/2) = 1.61 (important!)
        """
        total_docs = len(self.documents)
        if total_docs == 0:
            return 0

        docs_with_word = self.word_doc_frequency.get(word, 0)
        if docs_with_word == 0:
            return 0

        # Add 1 to avoid division by zero
        return math.log(total_docs / docs_with_word)

    def _calculate_tfidf(self, word: str, doc_id: int) -> float:
        """
        TF-IDF Score: Combines TF and IDF

        Score = TF(word, document) × IDF(word)

        High score means:
        - Word appears frequently in the document (high TF)
        - Word is relatively rare across all documents (high IDF)
        - This document is likely very relevant!
        """
        tf = self._calculate_tf(word, doc_id)
        idf = self._calculate_idf(word)
        return tf * idf

    def add_document(self, title: str, content: str) -> int:
        """
        Index a new document

        Process:
        1. Create Document object
        2. Tokenize content into words
        3. Build inverted index (word → document mappings)
        4. Update statistics
        """
        doc_id = self.next_doc_id
        self.next_doc_id += 1

        # Create document
        doc = Document(id=doc_id, title=title, content=content)
        self.documents[doc_id] = doc

        # Tokenize content
        words = self._tokenize(content)
        self.doc_word_counts[doc_id] = len(words)

        # Track which documents contain each word (for IDF calculation)
        unique_words_in_doc = set()

        # Build inverted index
        # For each word, record which document it appears in and at what positions
        for position, word in enumerate(words):
            # Track document frequency (for IDF)
            if word not in unique_words_in_doc:
                unique_words_in_doc.add(word)
                self.word_doc_frequency[word] += 1

            # Add to inverted index
            # Check if this document is already in the list for this word
            found = False
            for i, (existing_doc_id, positions) in enumerate(self.inverted_index[word]):
                if existing_doc_id == doc_id:
                    # Update the positions list
                    positions.append(position)
                    found = True
                    break

            if not found:
                self.inverted_index[word].append([doc_id, [position]])

        return doc_id

    def search(self, query: str, top_k: int = 10) -> List[Dict]:
        """
        Search for documents matching a query

        Process:
        1. Tokenize query
        2. Find documents containing query words (using inverted index)
        3. Calculate TF-IDF score for each document
        4. Rank by score (highest first)
        5. Return top results

        Returns:
        List of dictionaries with document info and relevance score
        """
        # Tokenize query
        query_words = self._tokenize(query)

        if not query_words:
            return []

        # Find candidate documents (documents containing at least one query word)
        candidate_docs = set()
        for word in query_words:
            if word in self.inverted_index:
                for entry in self.inverted_index[word]:
                    doc_id = entry[0]
                    candidate_docs.add(doc_id)

        if not candidate_docs:
            return []

        # Calculate TF-IDF scores for each candidate document
        doc_scores = {}
        for doc_id in candidate_docs:
            score = 0.0
            # Sum TF-IDF scores for all query words
            for word in query_words:
                score += self._calculate_tfidf(word, doc_id)
            doc_scores[doc_id] = score

        # Sort by score (descending) and get top K
        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        top_docs = sorted_docs[:top_k]

        # Build results
        results = []
        for doc_id, score in top_docs:
            doc = self.documents[doc_id]

            # Create snippet (first 200 characters of content)
            snippet = doc.content[:200] + "..." if len(doc.content) > 200 else doc.content

            results.append({
                "id": doc_id,
                "title": doc.title,
                "content": doc.content,
                "snippet": snippet,
                "score": round(score, 4)
            })

        return results

    def get_stats(self) -> Dict:
        """Get statistics about the search engine"""
        total_words = sum(self.doc_word_counts.values())
        unique_words = len(self.inverted_index)

        return {
            "total_documents": len(self.documents),
            "total_words_indexed": total_words,
            "unique_words": unique_words,
            "average_words_per_document": round(total_words / len(self.documents), 2) if self.documents else 0
        }

    def get_all_documents(self) -> List[Dict]:
        """Get all indexed documents"""
        return [
            {
                "id": doc.id,
                "title": doc.title,
                "content": doc.content
            }
            for doc in self.documents.values()
        ]