 
 from __future__ import annotations
 
+import math
 import re
 from collections.abc import Sequence
 from difflib import SequenceMatcher
@@ -96,65 +97,106 @@ async def delete_edge(self, edge_id: str) -> None:
 
     async def search_fts(self, query: str, *, limit: int = 20) -> list[Node]:
         query_lower = query.lower()
         terms = query_lower.split()
-        # No word boundary patterns — substring matching is better for diverse corpora
-        # (medical terms like "APOE4", Korean compounds, morphological variants)
-        term_patterns: dict[str, re.Pattern[str]] = {}
-        # Generate 2-gram substrings (for Korean compound word matching)
+        if not terms:
+            return []
+
+        # --- BM25 parameters ---
+        k1 = 1.5
+        b = 0.75
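+        # (k1 controls term-frequency saturation: higher = tf saturates more
+        # slowly; b controls document-length normalization: 0 = none, 1 = full)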
+        title_boost = 3.0  # title match weight (multiplied by IDF, added separately)
+
+        # Pre-compute corpus statistics for BM25
+        N = len(self._nodes)  # total documents
+        if N == 0:
+            return []
+
+        # Document frequencies: how many docs contain each term (substring match)
+        doc_freq: dict[str, int] = {}
+        doc_texts: dict[str, str] = {}  # node_id → full searchable text
+        doc_lengths: dict[str, int] = {}  # node_id → word count
+
+        for node in self._nodes.values():
+            text = f"{node.title.lower()} {node.content.lower()}"
+            if node.tags:
+                text += " " + " ".join(node.tags).lower()
+            if node.properties:
+                kw = node.properties.get("_search_keywords", "")
+                if kw:
+                    text += " " + kw.lower()
+            doc_texts[node.id] = text
+            doc_lengths[node.id] = len(text.split())
+
+        avgdl = sum(doc_lengths.values()) / N  # N > 0 guaranteed by the check above
+
+        for t in terms:
+            doc_freq[t] = sum(1 for text in doc_texts.values() if t in text)
+
+        # Bigrams for phrase matching
         bigrams: list[str] = []
         if len(terms) >= 2:
             for i in range(len(terms) - 1):
                 bigrams.append(f"{terms[i]} {terms[i + 1]}")
 
+        # --- Score each document ---
         scored: list[tuple[Node, float]] = []
         for node in self._nodes.values():
             title_lower = node.title.lower()
             content_lower = node.content.lower()
-            full_text = f"{title_lower} {content_lower}"
+            full_text = doc_texts[node.id]
+            dl = doc_lengths[node.id]
+
             score = 0.0
 
-            # High bonus if full query is contained in title
-            if query_lower in title_lower:
-                score += len(terms) * 3.0
-            else:
-                # Individual term matching in title (weight 2x)
-                for t in terms:
-                    pat = term_patterns.get(t)
-                    if pat is not None:
-                        if pat.search(title_lower):
-                            score += 2.0
-                    else:
-                        if t in title_lower:
-                            score += 2.0
-
-            # Individual term matching in content
             for t in terms:
-                pat = term_patterns.get(t)
-                if pat is not None:
-                    score += len(pat.findall(content_lower)) * 1.0
-                else:
-                    if t in content_lower:
-                        score += 1.0
+                # Term frequency (substring count)
+                tf_content = content_lower.count(t)
+                tf_title = title_lower.count(t)
+
+                if tf_content == 0 and tf_title == 0:
+                    continue
+
+                # IDF: log((N - df + 0.5) / (df + 0.5) + 1)
+                df = doc_freq.get(t, 0)
+                idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
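+                # (the +1 inside the log keeps IDF positive even for terms that
+                # appear in more than half of the documents, as in Lucene's BM25)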
 
-            # Bigram match bonus (higher relevance when 2 consecutive terms appear together)
-            score += sum(1.5 for bg in bigrams if bg in full_text)
+                # BM25 content score
+                if tf_content > 0:
+                    numerator = tf_content * (k1 + 1)
+                    denominator = tf_content + k1 * (1 - b + b * dl / avgdl)
+                    score += idf * numerator / denominator
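+                    # each term's contribution saturates toward idf * (k1 + 1)
+                    # as tf grows, so repeating a term cannot dominate the score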
 
-            # Tag match bonus
+                # Title bonus: additive and kept separate from the content score,
+                # so it is not affected by BM25 length normalization
+                if tf_title > 0:
+                    score += idf * title_boost
+
+            # Bigram bonus (phrase proximity)
+            for bg in bigrams:
+                if bg in full_text:
+                    score += 1.5
+
+            # Tag exact match bonus
             if node.tags:
                 tag_text = " ".join(node.tags).lower()
-                score += sum(1.0 for t in terms if t in tag_text)
+                for t in terms:
+                    if t in tag_text:
+                        score += 0.5
 
-            # _search_keywords matching (LLM-generated search-optimized keywords)
+            # LLM-generated search keywords bonus
             if node.properties:
                 search_kw = node.properties.get("_search_keywords", "").lower()
                 if search_kw:
-                    score += sum(1.5 for t in terms if t in search_kw)
-                summary = node.properties.get("_summary", "").lower()
-                if summary:
-                    score += sum(0.5 for t in terms if t in summary)
+                    for t in terms:
+                        if t in search_kw:
+                            score += 1.0
 
             if score > 0:
                 scored.append((node, score))
+
         scored.sort(key=lambda x: x[1], reverse=True)
         return [n for n, _ in scored[:limit]]
 
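
As a quick sanity check of the per-term scoring used above, here is a minimal standalone sketch. The corpus statistics and the bm25_term helper are invented for illustration; they are not values or code from this commit:

import math

# Hypothetical corpus statistics, assumed for this example only
N = 100           # total documents in the store
df = 10           # documents containing the term
dl, avgdl = 120, 100.0
k1, b = 1.5, 0.75

idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)

def bm25_term(tf: int) -> float:
    # same per-term shape as in search_fts above
    return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))

# Contributions grow sub-linearly and cap near idf * (k1 + 1) ≈ 5.66
print([round(bm25_term(tf), 2) for tf in (1, 2, 5, 20)])  # [2.08, 3.04, 4.21, 5.21]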