|
5 | 5 | "encoding/json" |
6 | 6 | "fmt" |
7 | 7 | "log/slog" |
| 8 | + "math" |
8 | 9 | "os" |
9 | 10 | "path/filepath" |
10 | 11 | "strings" |
@@ -1142,38 +1143,48 @@ func (s *SQLiteStore) ReconcileStaleNodes(agentName string, workspaces ...string |
1142 | 1143 | defer func() { rollbackWithLog(tx, "reconcile-stale") }() |
1143 | 1144 |
|
1144 | 1145 | var wsFilter string |
1145 | | - var deleteArgs, demoteArgs []any |
| 1146 | + var deleteArgs []any |
1146 | 1147 | if len(workspaces) > 0 { |
1147 | 1148 | placeholders := "?" + strings.Repeat(",?", len(workspaces)-1) |
1148 | 1149 | wsFilter = fmt.Sprintf(" AND workspace IN (%s)", placeholders) |
1149 | 1150 | deleteArgs = []any{agentName} |
1150 | | - demoteArgs = []any{agentName} |
1151 | 1151 | for _, ws := range workspaces { |
1152 | 1152 | deleteArgs = append(deleteArgs, ws) |
1153 | | - demoteArgs = append(demoteArgs, ws) |
1154 | 1153 | } |
1155 | 1154 | } else { |
1156 | 1155 | deleteArgs = []any{agentName} |
1157 | | - demoteArgs = []any{agentName} |
1158 | 1156 | } |
1159 | 1157 |
|
1160 | | - result, err := tx.Exec("DELETE FROM nodes WHERE source_agent = ? AND stale_count >= 2"+wsFilter, deleteArgs...) |
| 1158 | + // Get IDs of stale nodes for cascading edge cleanup |
| 1159 | + idRows, err := tx.Query("SELECT id FROM nodes WHERE source_agent = ? AND stale_count >= 1"+wsFilter, deleteArgs...) |
1161 | 1160 | if err != nil { |
1162 | | - return 0, 0, fmt.Errorf("delete stale nodes: %w", err) |
| 1161 | + return 0, 0, fmt.Errorf("query stale node IDs: %w", err) |
| 1162 | + } |
| 1163 | + var staleIDs []string |
| 1164 | + for idRows.Next() { |
| 1165 | + var id string |
| 1166 | + if err := idRows.Scan(&id); err == nil { |
| 1167 | + staleIDs = append(staleIDs, id) |
| 1168 | + } |
| 1169 | + } |
| 1170 | + _ = idRows.Close() |
| 1171 | + |
| 1172 | + // Cascade: delete edges referencing stale nodes |
| 1173 | + for _, id := range staleIDs { |
| 1174 | + _, _ = tx.Exec("DELETE FROM node_edges WHERE from_node_id = ? OR to_node_id = ?", id, id) |
1163 | 1175 | } |
1164 | | - deleted, _ := result.RowsAffected() |
1165 | 1176 |
|
1166 | | - result, err = tx.Exec(`UPDATE nodes SET verification_status = 'stale', confidence_score = confidence_score * 0.5 |
1167 | | - WHERE source_agent = ? AND stale_count = 1`+wsFilter, demoteArgs...) |
| 1177 | + // Delete stale nodes immediately (no demotion, no two-strike) |
| 1178 | + result, err := tx.Exec("DELETE FROM nodes WHERE source_agent = ? AND stale_count >= 1"+wsFilter, deleteArgs...) |
1168 | 1179 | if err != nil { |
1169 | | - return 0, 0, fmt.Errorf("demote stale nodes: %w", err) |
| 1180 | + return 0, 0, fmt.Errorf("delete stale nodes: %w", err) |
1170 | 1181 | } |
1171 | | - demoted, _ := result.RowsAffected() |
| 1182 | + deleted, _ := result.RowsAffected() |
1172 | 1183 |
|
1173 | 1184 | if err := tx.Commit(); err != nil { |
1174 | 1185 | return 0, 0, fmt.Errorf("commit reconcile: %w", err) |
1175 | 1186 | } |
1176 | | - return int(deleted), int(demoted), nil |
| 1187 | + return int(deleted), 0, nil |
1177 | 1188 | } |
1178 | 1189 |
|
1179 | 1190 | // DeleteNodesByFiles removes nodes from a specific agent that reference any of the given files. |
@@ -1486,7 +1497,71 @@ func (s *SQLiteStore) UpsertNodeBySummary(n Node) error { |
1486 | 1497 | } |
1487 | 1498 | _ = rows.Close() |
1488 | 1499 |
|
1489 | | - // No similar node found - insert new node (including evidence and debt columns) |
| 1500 | + // Third pass: embedding-based cosine similarity (catches semantic duplicates |
| 1501 | + // where wording differs but meaning is identical, e.g. "Repository pattern |
| 1502 | + // for persistence" vs "Repository abstraction unifies database access") |
| 1503 | + if len(n.Embedding) > 0 { |
| 1504 | + embRows, embErr := tx.Query(` |
| 1505 | + SELECT id, summary, content, embedding FROM nodes |
| 1506 | + WHERE source_agent = ? AND embedding IS NOT NULL AND length(embedding) > 0 |
| 1507 | + `, n.SourceAgent) |
| 1508 | + if embErr == nil { |
| 1509 | + const cosineThreshold float32 = 0.85 |
| 1510 | + var bestID, bestSummary, bestContent string |
| 1511 | + var bestScore float32 |
| 1512 | + for embRows.Next() { |
| 1513 | + var eid, esummary, econtent string |
| 1514 | + var rawEmb []byte |
| 1515 | + if err := embRows.Scan(&eid, &esummary, &econtent, &rawEmb); err != nil { |
| 1516 | + continue |
| 1517 | + } |
| 1518 | + existing := bytesToFloat32Slice(rawEmb) |
| 1519 | + if len(existing) != len(n.Embedding) { |
| 1520 | + continue |
| 1521 | + } |
| 1522 | + score := cosineSimilarityF32(n.Embedding, existing) |
| 1523 | + if score >= cosineThreshold && score > bestScore { |
| 1524 | + bestScore = score |
| 1525 | + bestID = eid |
| 1526 | + bestSummary = esummary |
| 1527 | + bestContent = econtent |
| 1528 | + } |
| 1529 | + } |
| 1530 | + _ = embRows.Close() |
| 1531 | + |
| 1532 | + if bestID != "" { |
| 1533 | + // Merge into the semantically matching node |
| 1534 | + _ = bestSummary // used for logging if needed |
| 1535 | + if n.Content != bestContent { |
| 1536 | + _, err = tx.Exec(` |
| 1537 | + UPDATE nodes SET content = ?, type = ?, embedding = ?, summary = ?, |
| 1538 | + evidence = ?, verification_status = ?, verification_result = ?, confidence_score = ?, |
| 1539 | + debt_score = ?, debt_reason = ?, refactor_hint = ?, |
| 1540 | + stale_count = 0 |
| 1541 | + WHERE id = ? |
| 1542 | + `, n.Content, n.Type, embeddingBytes, n.Summary, |
| 1543 | + n.Evidence, n.VerificationStatus, n.VerificationResult, n.ConfidenceScore, |
| 1544 | + n.DebtScore, n.DebtReason, n.RefactorHint, bestID) |
| 1545 | + } else { |
| 1546 | + _, err = tx.Exec(` |
| 1547 | + UPDATE nodes SET type = ?, embedding = ?, summary = ?, |
| 1548 | + evidence = ?, verification_status = ?, verification_result = ?, confidence_score = ?, |
| 1549 | + debt_score = ?, debt_reason = ?, refactor_hint = ?, |
| 1550 | + stale_count = 0 |
| 1551 | + WHERE id = ? |
| 1552 | + `, n.Type, embeddingBytes, n.Summary, |
| 1553 | + n.Evidence, n.VerificationStatus, n.VerificationResult, n.ConfidenceScore, |
| 1554 | + n.DebtScore, n.DebtReason, n.RefactorHint, bestID) |
| 1555 | + } |
| 1556 | + if err != nil { |
| 1557 | + return fmt.Errorf("update embedding-matched node: %w", err) |
| 1558 | + } |
| 1559 | + return tx.Commit() |
| 1560 | + } |
| 1561 | + } |
| 1562 | + } |
| 1563 | + |
| 1564 | + // No match found by any method - insert new node |
1490 | 1565 | _, err = tx.Exec(` |
1491 | 1566 | INSERT INTO nodes (id, content, type, summary, source_agent, workspace, embedding, created_at, |
1492 | 1567 | evidence, verification_status, verification_result, confidence_score, |
@@ -2118,6 +2193,23 @@ func bytesToFloat32Slice(buf []byte) []float32 { |
2118 | 2193 | return floats |
2119 | 2194 | } |
2120 | 2195 |
|
| 2196 | +// cosineSimilarityF32 computes cosine similarity between two float32 vectors. |
| 2197 | +func cosineSimilarityF32(a, b []float32) float32 { |
| 2198 | + if len(a) != len(b) || len(a) == 0 { |
| 2199 | + return 0 |
| 2200 | + } |
| 2201 | + var dot, normA, normB float64 |
| 2202 | + for i := range a { |
| 2203 | + dot += float64(a[i]) * float64(b[i]) |
| 2204 | + normA += float64(a[i]) * float64(a[i]) |
| 2205 | + normB += float64(b[i]) * float64(b[i]) |
| 2206 | + } |
| 2207 | + if normA == 0 || normB == 0 { |
| 2208 | + return 0 |
| 2209 | + } |
| 2210 | + return float32(dot / (math.Sqrt(normA) * math.Sqrt(normB))) |
| 2211 | +} |
| 2212 | + |
2121 | 2213 | const ( |
2122 | 2214 | textSimilarityThreshold = 0.45 |
2123 | 2215 | ) |
|
0 commit comments