-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcustom_tokenization.py
More file actions
166 lines (132 loc) · 4.89 KB
/
custom_tokenization.py
File metadata and controls
166 lines (132 loc) · 4.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Example demonstrating custom tokenization with add_tokenized().
Shows how to use your own tokenizer instead of the default one.
"""
import re
from bm25comp import BM25Builder, BM25Reader
def custom_tokenizer(text: str) -> list[str]:
    """
    Split *text* into case-preserving word tokens.

    Steps:
    - Delete every character that is neither a word character nor
      whitespace (the characters are dropped, not replaced by a space,
      so "don't" becomes "dont"). Underscores count as word characters
      and are kept.
    - Split the remainder on runs of whitespace.
    - Discard tokens of fewer than 2 characters.

    Returns the surviving tokens in their original order and casing.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    return [word for word in without_punctuation.split() if len(word) > 1]
def stemming_tokenizer(text: str) -> list[str]:
    """
    Simple stemming tokenizer (naive implementation for demo).
    In production, use a real stemmer like Porter or Snowball.

    The text is lower-cased and split on whitespace; then at most one of
    the suffixes 'ing', 'ed' or 's' is removed from each token. A suffix
    is only removed when at least 3 characters remain, so short words
    such as "red", "king" or "bus" are left intact and a token can never
    be reduced to an empty string.

    Punctuation is not stripped: a token such as "dogs!" does not end
    with a known suffix and is returned unchanged.

    Returns one (possibly stemmed) token per whitespace-separated word,
    in input order.
    """
    # Same threshold the plural-'s' rule always used (len(token) > 3),
    # now applied uniformly to every suffix.
    min_stem_len = 3
    suffixes = ("ing", "ed", "s")

    stemmed = []
    for token in text.lower().split():
        for suffix in suffixes:
            if token.endswith(suffix):
                if len(token) - len(suffix) >= min_stem_len:
                    token = token[: -len(suffix)]
                # The suffixes are mutually exclusive, so stop at the
                # first one that matches whether or not it was removed.
                break
        stemmed.append(token)
    return stemmed
def _print_index_summary(builder) -> None:
    """Print the unique-term count and up to ten indexed terms of a built index."""
    stats = builder.get_stats()
    print(f" Unique terms: {stats['num_unique_terms']}")
    print(f" Sample terms: {list(builder.postings.keys())[:10]}")


def _print_results(label: str, results) -> None:
    """Print a heading for *label* followed by one line per (key, score) result."""
    print(f"\n {label} tokenization results:")
    for key, score in results:
        print(f" {key}: {score:.4f}")


def main():
    """
    Run the tokenization demo.

    Builds three indexes over the same four documents (built-in
    tokenization, the case-preserving custom tokenizer and the naive
    stemming tokenizer), prints a short summary of each, then saves,
    reloads and searches every index for the query "quick".

    The index files are written to a temporary directory that is removed
    when the search section finishes, including when it raises, so the
    demo leaves no files behind and never touches the current directory.
    """
    # Only the save/load step needs these, so they are imported locally.
    import os
    import tempfile

    print("=" * 70)
    print("Custom Tokenization Example")
    print("=" * 70)

    documents = {
        "doc1": "The Quick Brown Fox jumps over the lazy dog!",
        "doc2": "Python is a programming language.",
        "doc3": "Machine Learning and AI are transforming technology.",
        "doc4": "The quick fox ran quickly through the forest.",
    }

    # Example 1: Default tokenization
    print("\n1. Default Tokenization (built-in)")
    print("-" * 70)
    builder1 = BM25Builder()
    for key, content in documents.items():
        builder1.add(key, content)
    builder1.build()
    _print_index_summary(builder1)

    # Example 2: Custom tokenization (case-preserving)
    print("\n2. Custom Tokenization (case-preserving, no punctuation)")
    print("-" * 70)
    builder2 = BM25Builder()
    for key, content in documents.items():
        tokens = custom_tokenizer(content)
        print(f" {key}: {tokens[:5]}...")
        builder2.add_tokenized(key, tokens)
    builder2.build()
    _print_index_summary(builder2)

    # Example 3: Stemming tokenization
    print("\n3. Stemming Tokenization (naive stemming)")
    print("-" * 70)
    builder3 = BM25Builder()
    for key, content in documents.items():
        tokens = stemming_tokenizer(content)
        print(f" {key}: {tokens[:5]}...")
        builder3.add_tokenized(key, tokens)
    builder3.build()
    _print_index_summary(builder3)

    # Example 4: Search comparison
    print("\n4. Search Comparison")
    print("-" * 70)
    query = "quick"

    # A temporary directory guarantees cleanup even if saving, loading
    # or searching fails part-way through.
    with tempfile.TemporaryDirectory() as index_dir:
        default_path = os.path.join(index_dir, "index_default.bm25")
        custom_path = os.path.join(index_dir, "index_custom.bm25")
        stemmed_path = os.path.join(index_dir, "index_stemmed.bm25")

        # Save all indices
        builder1.save(default_path)
        builder2.save(custom_path)
        builder3.save(stemmed_path)

        print(f" Query: '{query}'")

        # Search with default tokenization
        reader1 = BM25Reader()
        reader1.load(default_path)
        _print_results("Default", reader1.search(query, top_k=3))

        # Search with custom tokenization
        reader2 = BM25Reader()
        reader2.load(custom_path)
        # Need to tokenize query the same way!
        query_tokens = custom_tokenizer(query)
        _print_results("Custom", reader2.search(" ".join(query_tokens), top_k=3))

        # Search with stemming
        reader3 = BM25Reader()
        reader3.load(stemmed_path)
        query_tokens = stemming_tokenizer(query)
        _print_results("Stemmed", reader3.search(" ".join(query_tokens), top_k=3))

    print("\n" + "=" * 70)
    print("Key Takeaways:")
    print(" • add_tokenized() lets you use any tokenizer you want")
    print(" • Remember to tokenize queries the same way as documents")
    print(" • Custom tokenization can improve search quality for your domain")
    print(" • Case-sensitive tokens create more unique terms")
    print(" • Stemming reduces unique terms and improves recall")
    print("=" * 70)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()