-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path04_text_file_search_Engine.py
More file actions
76 lines (57 loc) · 2.39 KB
/
Copy path04_text_file_search_Engine.py
File metadata and controls
76 lines (57 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""Mini text file search engine — builds an inverted word index over .txt
files in a folder and ranks files by how many query words they contain."""
from pathlib import Path
from collections import defaultdict
import re
import sys
# Very common English function words that carry no search value; tokenise()
# removes them from both indexed text and queries.
STOPWORDS = {
    # articles, demonstratives and pronouns
    "a", "an", "the", "this", "that", "it",
    # forms of "to be" and modal verbs
    "is", "are", "was", "were", "be", "can", "will",
    # prepositions
    "in", "on", "of", "to", "for", "as", "by", "with", "at", "from",
    # conjunctions and negation
    "and", "or", "but", "not",
}
def tokenise(text: str) -> set[str]:
    """Return the distinct searchable words found in *text*.

    The text is lowercased, then every run of two or more ASCII letters
    that sits between word boundaries is taken as a word. Words listed in
    STOPWORDS are discarded. Because the result is a set, repeated words
    count once.
    """
    candidates = set(re.findall(r"\b[a-z]{2,}\b", text.lower()))
    return candidates.difference(STOPWORDS)
def build_index(folder: str) -> dict[str, set[str]]:
    """Build an inverted index over every .txt file under *folder*.

    Args:
        folder: Directory to scan; subdirectories are searched recursively.

    Returns:
        A plain dict mapping each word produced by tokenise() to the set of
        files containing it. Files are identified by their path relative to
        *folder* (with forward slashes), so a file directly inside *folder*
        is identified by its bare filename, while same-named files in
        different subfolders remain distinct entries.
    """
    root = Path(folder)
    index: dict[str, set[str]] = defaultdict(set)
    for path in root.glob("**/*.txt"):
        # The pattern also matches directories that happen to end in ".txt".
        if not path.is_file():
            continue
        try:
            # Pin the encoding so indexing does not depend on the platform's
            # locale default; undecodable bytes are dropped, not fatal.
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError as err:
            # One unreadable file should not abort indexing of the rest.
            print(f"Skipping unreadable file {path}: {err}", file=sys.stderr)
            continue
        # Key by relative path rather than path.name: the glob is recursive,
        # and bare names would merge "a/notes.txt" with "b/notes.txt".
        label = path.relative_to(root).as_posix()
        for word in tokenise(text):
            index[word].add(label)
    # Return a plain dict so lookups of unknown words do not create entries.
    return dict(index)
def search(index: dict[str, set[str]], query: str) -> list[tuple[str, int]]:
    """Rank indexed files by how many distinct query words they contain.

    Args:
        index: Mapping of word -> set of filenames containing that word,
            in the shape returned by build_index().
        query: Free-text query; it is split into words with tokenise(), so
            stopwords and repeated words do not affect the score.

    Returns:
        A list of (filename, match_count) pairs, highest match_count first.
        Files with equal counts are ordered by filename so the result is
        reproducible. Files matching no query word are omitted; the list is
        empty when nothing matches.
    """
    scores: dict[str, int] = defaultdict(int)
    for word in tokenise(query):
        for filename in index.get(word, ()):
            scores[filename] += 1
    # Tie-break on filename: without it, equal scores come out in set/dict
    # iteration order, which changes between runs under hash randomisation.
    return sorted(scores.items(), key=lambda item: (-item[1], item[0]))
def main() -> None:
    """Run the interactive search prompt.

    The folder to index is taken from the first command-line argument, or
    asked for on stdin when no argument is given. Exits with status 1 if it
    is not a directory. After indexing, queries are read in a loop until a
    blank line, end-of-input (Ctrl-D) or an interrupt (Ctrl-C) is received.
    Each result line shows the file and how many of the query's usable
    terms it matched.
    """
    folder = sys.argv[1] if len(sys.argv) >= 2 else input("Folder to search: ").strip()
    if not Path(folder).is_dir():
        print(f"Not a folder: {folder}")
        sys.exit(1)

    index = build_index(folder)
    if not index:
        # An empty index means either no .txt files or files containing only
        # stopwords/noise, so the message must cover both cases.
        print(f"No searchable words found in .txt files under {folder}")
        return
    print(f"Indexed {len(index)} unique words from {folder}\n")

    while True:
        try:
            query = input("Search (blank to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Treat Ctrl-D / Ctrl-C like a blank line instead of a traceback.
            print()
            break
        if not query:
            break

        # Tokenise here as well so the denominator shown to the user counts
        # only the terms that can actually be matched.
        query_words = tokenise(query)
        if not query_words:
            print(" (query had no usable words — try different terms)\n")
            continue

        results = search(index, query)
        if not results:
            print(" No matches.\n")
            continue

        for filename, score in results:
            print(f" {filename:30} {score}/{len(query_words)} terms matched")
        print()
# Start the interactive prompt only when executed as a script, so importing
# this module (e.g. to reuse build_index/search) has no side effects.
if __name__ == "__main__":
    main()