Skip to content

Commit a875c1d

Browse files
committed
feat: hybrid dedup (embedding cosine) + immediate stale cleanup
- Add embedding cosine similarity (0.85 threshold) as a third dedup pass in UpsertNodeBySummary, run after the Jaccard text match fails. Catches semantic duplicates like "Repository pattern" vs "Repository abstraction".
- Change stale cleanup from two-strike (stale_count >= 2) to immediate (stale_count >= 1).
- Cascade-delete edges for stale nodes to prevent orphaned graph links.
- Remove the demotion step (no more confidence halving).
- Remove [stale] tags from the UI entirely (only [missing] remains).
1 parent 480fef4 commit a875c1d

2 files changed

Lines changed: 112 additions & 33 deletions

File tree

internal/memory/sqlite.go

Lines changed: 105 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"encoding/json"
66
"fmt"
77
"log/slog"
8+
"math"
89
"os"
910
"path/filepath"
1011
"strings"
@@ -1142,38 +1143,48 @@ func (s *SQLiteStore) ReconcileStaleNodes(agentName string, workspaces ...string
11421143
defer func() { rollbackWithLog(tx, "reconcile-stale") }()
11431144

11441145
var wsFilter string
1145-
var deleteArgs, demoteArgs []any
1146+
var deleteArgs []any
11461147
if len(workspaces) > 0 {
11471148
placeholders := "?" + strings.Repeat(",?", len(workspaces)-1)
11481149
wsFilter = fmt.Sprintf(" AND workspace IN (%s)", placeholders)
11491150
deleteArgs = []any{agentName}
1150-
demoteArgs = []any{agentName}
11511151
for _, ws := range workspaces {
11521152
deleteArgs = append(deleteArgs, ws)
1153-
demoteArgs = append(demoteArgs, ws)
11541153
}
11551154
} else {
11561155
deleteArgs = []any{agentName}
1157-
demoteArgs = []any{agentName}
11581156
}
11591157

1160-
result, err := tx.Exec("DELETE FROM nodes WHERE source_agent = ? AND stale_count >= 2"+wsFilter, deleteArgs...)
1158+
// Get IDs of stale nodes for cascading edge cleanup
1159+
idRows, err := tx.Query("SELECT id FROM nodes WHERE source_agent = ? AND stale_count >= 1"+wsFilter, deleteArgs...)
11611160
if err != nil {
1162-
return 0, 0, fmt.Errorf("delete stale nodes: %w", err)
1161+
return 0, 0, fmt.Errorf("query stale node IDs: %w", err)
1162+
}
1163+
var staleIDs []string
1164+
for idRows.Next() {
1165+
var id string
1166+
if err := idRows.Scan(&id); err == nil {
1167+
staleIDs = append(staleIDs, id)
1168+
}
1169+
}
1170+
_ = idRows.Close()
1171+
1172+
// Cascade: delete edges referencing stale nodes
1173+
for _, id := range staleIDs {
1174+
_, _ = tx.Exec("DELETE FROM node_edges WHERE from_node_id = ? OR to_node_id = ?", id, id)
11631175
}
1164-
deleted, _ := result.RowsAffected()
11651176

1166-
result, err = tx.Exec(`UPDATE nodes SET verification_status = 'stale', confidence_score = confidence_score * 0.5
1167-
WHERE source_agent = ? AND stale_count = 1`+wsFilter, demoteArgs...)
1177+
// Delete stale nodes immediately (no demotion, no two-strike)
1178+
result, err := tx.Exec("DELETE FROM nodes WHERE source_agent = ? AND stale_count >= 1"+wsFilter, deleteArgs...)
11681179
if err != nil {
1169-
return 0, 0, fmt.Errorf("demote stale nodes: %w", err)
1180+
return 0, 0, fmt.Errorf("delete stale nodes: %w", err)
11701181
}
1171-
demoted, _ := result.RowsAffected()
1182+
deleted, _ := result.RowsAffected()
11721183

11731184
if err := tx.Commit(); err != nil {
11741185
return 0, 0, fmt.Errorf("commit reconcile: %w", err)
11751186
}
1176-
return int(deleted), int(demoted), nil
1187+
return int(deleted), 0, nil
11771188
}
11781189

11791190
// DeleteNodesByFiles removes nodes from a specific agent that reference any of the given files.
@@ -1486,7 +1497,71 @@ func (s *SQLiteStore) UpsertNodeBySummary(n Node) error {
14861497
}
14871498
_ = rows.Close()
14881499

1489-
// No similar node found - insert new node (including evidence and debt columns)
1500+
// Third pass: embedding-based cosine similarity (catches semantic duplicates
1501+
// where wording differs but meaning is identical, e.g. "Repository pattern
1502+
// for persistence" vs "Repository abstraction unifies database access")
1503+
if len(n.Embedding) > 0 {
1504+
embRows, embErr := tx.Query(`
1505+
SELECT id, summary, content, embedding FROM nodes
1506+
WHERE source_agent = ? AND embedding IS NOT NULL AND length(embedding) > 0
1507+
`, n.SourceAgent)
1508+
if embErr == nil {
1509+
const cosineThreshold float32 = 0.85
1510+
var bestID, bestSummary, bestContent string
1511+
var bestScore float32
1512+
for embRows.Next() {
1513+
var eid, esummary, econtent string
1514+
var rawEmb []byte
1515+
if err := embRows.Scan(&eid, &esummary, &econtent, &rawEmb); err != nil {
1516+
continue
1517+
}
1518+
existing := bytesToFloat32Slice(rawEmb)
1519+
if len(existing) != len(n.Embedding) {
1520+
continue
1521+
}
1522+
score := cosineSimilarityF32(n.Embedding, existing)
1523+
if score >= cosineThreshold && score > bestScore {
1524+
bestScore = score
1525+
bestID = eid
1526+
bestSummary = esummary
1527+
bestContent = econtent
1528+
}
1529+
}
1530+
_ = embRows.Close()
1531+
1532+
if bestID != "" {
1533+
// Merge into the semantically matching node
1534+
_ = bestSummary // used for logging if needed
1535+
if n.Content != bestContent {
1536+
_, err = tx.Exec(`
1537+
UPDATE nodes SET content = ?, type = ?, embedding = ?, summary = ?,
1538+
evidence = ?, verification_status = ?, verification_result = ?, confidence_score = ?,
1539+
debt_score = ?, debt_reason = ?, refactor_hint = ?,
1540+
stale_count = 0
1541+
WHERE id = ?
1542+
`, n.Content, n.Type, embeddingBytes, n.Summary,
1543+
n.Evidence, n.VerificationStatus, n.VerificationResult, n.ConfidenceScore,
1544+
n.DebtScore, n.DebtReason, n.RefactorHint, bestID)
1545+
} else {
1546+
_, err = tx.Exec(`
1547+
UPDATE nodes SET type = ?, embedding = ?, summary = ?,
1548+
evidence = ?, verification_status = ?, verification_result = ?, confidence_score = ?,
1549+
debt_score = ?, debt_reason = ?, refactor_hint = ?,
1550+
stale_count = 0
1551+
WHERE id = ?
1552+
`, n.Type, embeddingBytes, n.Summary,
1553+
n.Evidence, n.VerificationStatus, n.VerificationResult, n.ConfidenceScore,
1554+
n.DebtScore, n.DebtReason, n.RefactorHint, bestID)
1555+
}
1556+
if err != nil {
1557+
return fmt.Errorf("update embedding-matched node: %w", err)
1558+
}
1559+
return tx.Commit()
1560+
}
1561+
}
1562+
}
1563+
1564+
// No match found by any method - insert new node
14901565
_, err = tx.Exec(`
14911566
INSERT INTO nodes (id, content, type, summary, source_agent, workspace, embedding, created_at,
14921567
evidence, verification_status, verification_result, confidence_score,
@@ -2118,6 +2193,23 @@ func bytesToFloat32Slice(buf []byte) []float32 {
21182193
return floats
21192194
}
21202195

2196+
// cosineSimilarityF32 computes the cosine similarity between two float32
// vectors.
//
// The dot product and the squared norms are accumulated in float64 to limit
// rounding error; the result is narrowed to float32 only on return.
//
// It returns 0 when the vectors differ in length, are empty, either one has
// zero magnitude, or the result is not a finite number (for example when a
// component is NaN or Inf). Otherwise the result is clamped to [-1, 1].
func cosineSimilarityF32(a, b []float32) float32 {
	if len(a) != len(b) || len(a) == 0 {
		return 0
	}

	var dot, normA, normB float64
	for i := range a {
		x, y := float64(a[i]), float64(b[i])
		dot += x * y
		normA += x * x
		normB += y * y
	}
	if normA == 0 || normB == 0 {
		// A zero vector has no direction; avoid dividing by zero.
		return 0
	}

	sim := dot / (math.Sqrt(normA) * math.Sqrt(normB))
	switch {
	case math.IsNaN(sim), math.IsInf(sim, 0):
		// Corrupt components must never surface as a score: NaN compares
		// false against everything and would poison any running maximum.
		return 0
	case sim > 1:
		// Rounding can push the quotient marginally past the bounds.
		return 1
	case sim < -1:
		return -1
	}
	return float32(sim)
}
2212+
21212213
const (
21222214
textSimilarityThreshold = 0.45
21232215
)

internal/ui/list_view.go

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -52,26 +52,16 @@ func renderNodeListInternal(nodes []memory.Node, verbose bool, basePath string)
5252
renderGroupedList(byType, typeOrder, showWorkspace, basePath)
5353
}
5454

55-
// Compact legend for stale/missing tags (only shown when relevant)
56-
var staleCount, missingCount int
55+
// Legend for [missing] tag (stale nodes are now deleted immediately, no tag needed)
56+
var missingCount int
5757
for _, n := range nodes {
58-
switch n.VerificationStatus {
59-
case "stale":
60-
staleCount++
61-
case "missing":
58+
if n.VerificationStatus == "missing" {
6259
missingCount++
6360
}
6461
}
65-
if staleCount > 0 || missingCount > 0 {
62+
if missingCount > 0 {
6663
dimStyle := lipgloss.NewStyle().Foreground(ColorDim)
67-
var parts []string
68-
if staleCount > 0 {
69-
parts = append(parts, fmt.Sprintf("[stale]=%d not re-extracted last run", staleCount))
70-
}
71-
if missingCount > 0 {
72-
parts = append(parts, fmt.Sprintf("[missing]=%d evidence files gone", missingCount))
73-
}
74-
fmt.Printf("\n %s\n", dimStyle.Render(strings.Join(parts, " ")))
64+
fmt.Printf("\n %s\n", dimStyle.Render(fmt.Sprintf("[missing]=%d evidence files no longer on disk", missingCount)))
7565
}
7666
}
7767

@@ -174,15 +164,12 @@ func renderGroupedList(byType map[string][]memory.Node, typeOrder []string, show
174164
summary = utils.Truncate(n.Text(), maxSummaryWidth-4)
175165
}
176166

177-
// Check freshness if basePath is available and node has evidence
167+
// Check freshness: only show [missing] (stale nodes are deleted immediately now)
178168
staleTag := ""
179169
staleTagWidth := 0
180170
if basePath != "" && n.Evidence != "" {
181171
result := knowledge.Check(basePath, n.Evidence, n.CreatedAt)
182-
if result.Status == knowledge.StatusStale {
183-
staleTag = staleStyle.Render(" [stale]")
184-
staleTagWidth = 8 // " [stale]"
185-
} else if result.Status == knowledge.StatusMissing {
172+
if result.Status == knowledge.StatusMissing {
186173
staleTag = staleStyle.Render(" [missing]")
187174
staleTagWidth = 10 // " [missing]"
188175
}

0 commit comments

Comments
 (0)