 
 from __future__ import annotations
 
+import math
 import re
 from collections.abc import Sequence
 from difflib import SequenceMatcher
@@ -96,65 +97,106 @@ async def delete_edge(self, edge_id: str) -> None:
 
     async def search_fts(self, query: str, *, limit: int = 20) -> list[Node]:
         query_lower = query.lower()
         terms = query_lower.split()
-        # No word boundary patterns — substring matching is better for diverse corpora
-        # (medical terms like "APOE4", Korean compounds, morphological variants)
-        term_patterns: dict[str, re.Pattern[str]] = {}
-        # Generate 2-gram substrings (for Korean compound word matching)
+        if not terms:
+            return []
+
+        # --- BM25 parameters ---
+        k1 = 1.5
+        b = 0.75
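+        # (k1 controls term-frequency saturation: higher = tf saturates more
+        # slowly; b controls document-length normalization: 0 = none, 1 = full)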
+        title_boost = 3.0  # title match weight (multiplied by IDF, added separately)
+
+        # Pre-compute corpus statistics for BM25
+        N = len(self._nodes)  # total documents
+        if N == 0:
+            return []
+
+        # Document frequencies: how many docs contain each term (substring match)
+        doc_freq: dict[str, int] = {}
+        doc_texts: dict[str, str] = {}  # node_id → full searchable text
+        doc_lengths: dict[str, int] = {}  # node_id → word count
+
+        for node in self._nodes.values():
+            text = f"{node.title.lower()} {node.content.lower()}"
+            if node.tags:
+                text += " " + " ".join(node.tags).lower()
+            if node.properties:
+                kw = node.properties.get("_search_keywords", "")
+                if kw:
+                    text += " " + kw.lower()
+            doc_texts[node.id] = text
+            doc_lengths[node.id] = len(text.split())
+
+        avgdl = sum(doc_lengths.values()) / N  # N > 0 guaranteed by the check above
+
+        for t in terms:
+            doc_freq[t] = sum(1 for text in doc_texts.values() if t in text)
+
+        # Bigrams for phrase matching
         bigrams: list[str] = []
         if len(terms) >= 2:
             for i in range(len(terms) - 1):
                 bigrams.append(f"{terms[i]} {terms[i + 1]}")
 
+        # --- Score each document ---
         scored: list[tuple[Node, float]] = []
         for node in self._nodes.values():
             title_lower = node.title.lower()
             content_lower = node.content.lower()
-            full_text = f"{title_lower} {content_lower}"
+            full_text = doc_texts[node.id]
+            dl = doc_lengths[node.id]
+
             score = 0.0
 
-            # High bonus if full query is contained in title
-            if query_lower in title_lower:
-                score += len(terms) * 3.0
-            else:
-                # Individual term matching in title (weight 2x)
-                for t in terms:
-                    pat = term_patterns.get(t)
-                    if pat is not None:
-                        if pat.search(title_lower):
-                            score += 2.0
-                    else:
-                        if t in title_lower:
-                            score += 2.0
-
-            # Individual term matching in content
             for t in terms:
-                pat = term_patterns.get(t)
-                if pat is not None:
-                    score += len(pat.findall(content_lower)) * 1.0
-                else:
-                    if t in content_lower:
-                        score += 1.0
+                # Term frequency (substring count)
+                tf_content = content_lower.count(t)
+                tf_title = title_lower.count(t)
+
+                if tf_content == 0 and tf_title == 0:
+                    continue
+
+                # IDF: log((N - df + 0.5) / (df + 0.5) + 1)
+                df = doc_freq.get(t, 0)
+                idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
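+                # (the +1 inside the log keeps IDF positive even for terms that
+                # appear in more than half of the documents, as in Lucene's BM25)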
 
-            # Bigram match bonus (higher relevance when 2 consecutive terms appear together)
-            score += sum(1.5 for bg in bigrams if bg in full_text)
+                # BM25 content score
+                if tf_content > 0:
+                    numerator = tf_content * (k1 + 1)
+                    denominator = tf_content + k1 * (1 - b + b * dl / avgdl)
+                    score += idf * numerator / denominator
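+                    # each term's contribution saturates toward idf * (k1 + 1)
+                    # as tf grows, so repeating a term cannot dominate the score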
 
-            # Tag match bonus
+                # Title bonus: additive and kept separate from the content score,
+                # so it is not affected by BM25 length normalization
+                if tf_title > 0:
+                    score += idf * title_boost
+
+            # Bigram bonus (phrase proximity)
+            for bg in bigrams:
+                if bg in full_text:
+                    score += 1.5
+
+            # Tag exact match bonus
             if node.tags:
                 tag_text = " ".join(node.tags).lower()
-                score += sum(1.0 for t in terms if t in tag_text)
+                for t in terms:
+                    if t in tag_text:
+                        score += 0.5
 
-            # _search_keywords matching (LLM-generated search-optimized keywords)
+            # LLM-generated search keywords bonus
             if node.properties:
                 search_kw = node.properties.get("_search_keywords", "").lower()
                 if search_kw:
-                    score += sum(1.5 for t in terms if t in search_kw)
-                summary = node.properties.get("_summary", "").lower()
-                if summary:
-                    score += sum(0.5 for t in terms if t in summary)
+                    for t in terms:
+                        if t in search_kw:
+                            score += 1.0
 
             if score > 0:
                 scored.append((node, score))
+
         scored.sort(key=lambda x: x[1], reverse=True)
         return [n for n, _ in scored[:limit]]
 
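
As a quick sanity check of the per-term scoring used above, here is a minimal standalone sketch. The corpus statistics and the bm25_term helper are invented for illustration; they are not values or code from this commit:

import math

# Hypothetical corpus statistics, assumed for this example only
N = 100           # total documents in the store
df = 10           # documents containing the term
dl, avgdl = 120, 100.0
k1, b = 1.5, 0.75

idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)

def bm25_term(tf: int) -> float:
    # same per-term shape as in search_fts above
    return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))

# Contributions grow sub-linearly and cap near idf * (k1 + 1) ≈ 5.66
print([round(bm25_term(tf), 2) for tf in (1, 2, 5, 20)])  # [2.08, 3.04, 4.21, 5.21]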