|
45 | 45 | # the vector/hybrid retrieval path must key rows by (type, id) to avoid collisions. |
46 | 46 | type SearchIndexKey = tuple[str, int] |
47 | 47 |
|
| 48 | +# --- Entity-aware ranking boost (#951) --- |
| 49 | + |
| 50 | +# Match word tokens (allowing internal apostrophes/hyphens) so we can inspect |
| 51 | +# their capitalization to detect proper-noun-like query terms. |
| 52 | +_ENTITY_TERM_TOKEN_PATTERN = re.compile(r"[A-Za-z][A-Za-z'\-]*") |
| 53 | + |
| 54 | +# Common capitalized sentence-starters and interrogatives that look like proper |
| 55 | +# nouns but are not entity references. Kept lowercase for case-insensitive checks. |
| 56 | +# Intentionally small: a candidate term only boosts a row when it actually matches |
| 57 | +# that row's title/relation names, so a stray non-entity term simply does nothing. |
| 58 | +_ENTITY_TERM_STOPWORDS = frozenset( |
| 59 | + { |
| 60 | + "a", |
| 61 | + "an", |
| 62 | + "and", |
| 63 | + "are", |
| 64 | + "as", |
| 65 | + "at", |
| 66 | + "be", |
| 67 | + "but", |
| 68 | + "by", |
| 69 | + "do", |
| 70 | + "does", |
| 71 | + "for", |
| 72 | + "from", |
| 73 | + "has", |
| 74 | + "have", |
| 75 | + "how", |
| 76 | + "i", |
| 77 | + "in", |
| 78 | + "is", |
| 79 | + "it", |
| 80 | + "of", |
| 81 | + "on", |
| 82 | + "or", |
| 83 | + "the", |
| 84 | + "their", |
| 85 | + "they", |
| 86 | + "this", |
| 87 | + "to", |
| 88 | + "was", |
| 89 | + "we", |
| 90 | + "were", |
| 91 | + "what", |
| 92 | + "when", |
| 93 | + "where", |
| 94 | + "which", |
| 95 | + "who", |
| 96 | + "whom", |
| 97 | + "whose", |
| 98 | + "why", |
| 99 | + "will", |
| 100 | + "with", |
| 101 | + "you", |
| 102 | + "your", |
| 103 | + } |
| 104 | +) |
| 105 | + |
48 | 106 |
|
49 | 107 | @dataclass |
50 | 108 | class VectorSyncBatchResult: |
@@ -166,6 +224,13 @@ class SearchRepositoryBase(ABC): |
166 | 224 | _vector_dimensions: int |
167 | 225 | _vector_tables_initialized: bool |
168 | 226 |
|
# Entity-aware ranking boost (#951). Defaults keep the feature off for any
# subclass or test double that does not explicitly configure it. Concrete
# backends overwrite these from BasicMemoryConfig in their __init__.
#
# Master switch: while False, _apply_entity_boost returns the fused scores
# unchanged and _search_hybrid skips query-term extraction altogether.
_entity_boost_enabled: bool = False
# Per-matched-term bonus used in ``score * (1 + weight * matches)``; a value
# <= 0 also turns the boost into a no-op.
_entity_boost_weight: float = 0.0
# Upper bound on how many matched query terms count toward a single row's
# bonus (``min(matches, max_terms)``).
_entity_boost_max_terms: int = 1
| 233 | + |
169 | 234 | def __init__(self, session_maker: async_sessionmaker[AsyncSession], project_id: int): |
170 | 235 | """Initialize with session maker and project_id filter. |
171 | 236 |
|
@@ -2147,6 +2212,105 @@ async def _fetch_search_index_rows_by_ids( |
2147 | 2212 | # Shared semantic search: hybrid score-based fusion |
2148 | 2213 | # ------------------------------------------------------------------ |
2149 | 2214 |
|
| 2215 | + # --- Entity-aware ranking boost (#951) --- |
| 2216 | + |
@staticmethod
def _extract_query_entity_terms(search_text: Optional[str]) -> set[str]:
    """Extract candidate entity (proper-noun) terms from a query string.

    Heuristic, lexical only (no model inference): a token is a candidate entity
    term when it begins with an uppercase letter (Title-Case or ALL-CAPS) and is
    not a common stopword. The result is lowercased so downstream matching is
    case-insensitive.

    Args:
        search_text: Raw query text; ``None`` or an empty string yields no terms.

    Returns:
        The set of lowercased candidate terms, with trailing apostrophes/hyphens
        and a possessive ``'s`` removed.

    Examples:
        "What are Joanna's hobbies?"      -> {"joanna"}
        "Who is Anthony?"                 -> {"anthony"}
        "Deborah and Jolene"              -> {"deborah", "jolene"}
        "Is 'Joanna' the Joneses' friend" -> {"joanna", "joneses"}
        "what is the weather"             -> set()  (no proper nouns)
    """
    if not search_text:
        return set()

    terms: set[str] = set()
    for match in _ENTITY_TERM_TOKEN_PATTERN.finditer(search_text):
        token = match.group(0)
        # Trigger: token begins with an uppercase letter (Title-Case or ALL-CAPS).
        # Why: proper nouns and named entities are conventionally capitalized; this
        # is the cheapest reliable signal without a NER model.
        # Outcome: lowercase, non-capitalized words are ignored as generic terms.
        if not token[0].isupper():
            continue
        # The token pattern lets apostrophes and hyphens trail a word, so a quoted
        # name ('Joanna'), a plural possessive (Joneses') or a dangling dash would
        # otherwise keep that punctuation and never equal the bare entity name.
        # Strip it, drop a possessive "'s" so "Joanna's" matches "Joanna", then
        # strip once more in case the possessive itself followed punctuation.
        normalized = token.lower().rstrip("'-").removesuffix("'s").rstrip("'-")
        # Single characters (e.g. a stray "I") carry no entity signal, and
        # stopwords are capitalized sentence-starters rather than entities.
        if len(normalized) < 2 or normalized in _ENTITY_TERM_STOPWORDS:
            continue
        terms.add(normalized)
    return terms
| 2254 | + |
@staticmethod
def _row_entity_match_count(row: SearchIndexRow, entity_terms: set[str]) -> int:
    """Count distinct query entity terms that a candidate row references.

    Only the row's ``title`` is inspected. Per the original design notes, relation
    rows encode their linked entity names in the title (``"From -> To"``), while
    the relation_type itself is not an entity name and is therefore not consulted.
    A match here is taken as evidence that the candidate is about the queried
    entity rather than a same-topic document.

    Args:
        row: Candidate search row; a missing or empty ``title`` matches nothing.
        entity_terms: Lowercased query terms, as produced by
            ``_extract_query_entity_terms``.

    Returns:
        The number of distinct ``entity_terms`` present among the title's tokens.
    """
    if not entity_terms:
        return 0

    title = row.title or ""
    if not title:
        return 0

    # Normalize title tokens the same way query terms are normalized: lowercase,
    # drop trailing apostrophes/hyphens (quoted names, plural possessives, dangling
    # dashes) and a possessive "'s", so a document titled "Joanna's Hobbies" or
    # "'Joanna' - notes" matches the query entity term "joanna".
    title_tokens = {
        match.group(0).lower().rstrip("'-").removesuffix("'s").rstrip("'-")
        for match in _ENTITY_TERM_TOKEN_PATTERN.finditer(title)
    }
    return len(entity_terms & title_tokens)
| 2283 | + |
| 2284 | + def _apply_entity_boost( |
| 2285 | + self, |
| 2286 | + fused_scores: dict[SearchIndexKey, float], |
| 2287 | + rows_by_key: dict[SearchIndexKey, SearchIndexRow], |
| 2288 | + entity_terms: set[str], |
| 2289 | + ) -> dict[SearchIndexKey, float]: |
| 2290 | + """Multiply fused scores by a per-matched-term bonus for entity-matching rows. |
| 2291 | +
|
| 2292 | + Trigger: entity boosting is enabled and the query contains proper-noun terms. |
| 2293 | + Why: a candidate whose entity/relation names contain a queried proper noun is a |
| 2294 | + stronger answer than a generic same-topic document (#951 cross-conversation |
| 2295 | + confusion). |
| 2296 | + Outcome: ``score * (1 + weight * min(matches, max_terms))``. Rows that match no |
| 2297 | + query entity term are returned unchanged, so relative order among non-matching |
| 2298 | + rows is preserved. |
| 2299 | + """ |
| 2300 | + if not self._entity_boost_enabled or not entity_terms or self._entity_boost_weight <= 0: |
| 2301 | + return fused_scores |
| 2302 | + |
| 2303 | + boosted: dict[SearchIndexKey, float] = {} |
| 2304 | + for row_key, score in fused_scores.items(): |
| 2305 | + row = rows_by_key.get(row_key) |
| 2306 | + matches = self._row_entity_match_count(row, entity_terms) if row is not None else 0 |
| 2307 | + if matches <= 0: |
| 2308 | + boosted[row_key] = score |
| 2309 | + continue |
| 2310 | + capped_matches = min(matches, self._entity_boost_max_terms) |
| 2311 | + boosted[row_key] = score * (1.0 + self._entity_boost_weight * capped_matches) |
| 2312 | + return boosted |
| 2313 | + |
2150 | 2314 | async def _search_hybrid( |
2151 | 2315 | self, |
2152 | 2316 | *, |
@@ -2250,6 +2414,15 @@ async def _search_hybrid( |
2250 | 2414 | f = fts_scores.get(row_key, 0.0) |
2251 | 2415 | fused_scores[row_key] = max(v, f) + FUSION_BONUS * min(v, f) |
2252 | 2416 |
|
| 2417 | + # Entity-aware ranking boost (#951): runs over the full fused candidate set |
| 2418 | + # before the limit/offset cut, so a boosted entity-matching candidate can be |
| 2419 | + # promoted into the returned window. No-op when the feature is disabled or the |
| 2420 | + # query contains no proper-noun terms, preserving the existing ordering. |
| 2421 | + entity_terms = ( |
| 2422 | + self._extract_query_entity_terms(query_text) if self._entity_boost_enabled else set() |
| 2423 | + ) |
| 2424 | + fused_scores = self._apply_entity_boost(fused_scores, rows_by_key, entity_terms) |
| 2425 | + |
2253 | 2426 | ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True) |
2254 | 2427 | output: list[SearchIndexRow] = [] |
2255 | 2428 | for row_key, fused_score in ranked[offset : offset + limit]: |
|
0 commit comments