|
| 1 | +import type { Hash, VectorStore } from "../core/types"; |
| 2 | +import type { ModelProfile } from "../core/ModelProfile"; |
| 3 | + |
/**
 * Dialectical probe returned by `buildMetroid`: a thesis page, an optional
 * antithesis page and, when both exist, their synthesis centroid.
 */
export interface Metroid {
  /**
   * Page id of the thesis: the candidate most similar to the query.
   * Set to the empty string when no candidates were supplied.
   */
  m1: Hash;
  /** Page id of the antithesis, or `null` when none could be selected. */
  m2: Hash | null;
  /** Synthesis centroid of m1 and m2; `null` whenever `m2` is `null`. */
  c: Float32Array | null;
  /** `true` when no antithesis was found (so `m2` and `c` are `null`). */
  knowledgeGap: boolean;
}
| 10 | + |
/** Collaborators required by `buildMetroid`. */
export interface MetroidBuilderOptions {
  /**
   * Embedding-model description. `buildMetroid` reads its
   * `matryoshkaProtectedDim`; when that is `undefined` no antithesis search
   * is attempted.
   */
  modelProfile: ModelProfile;
  /** Store the candidate embeddings are loaded from via `readVector(offset, dim)`. */
  vectorStore: VectorStore;
}
| 15 | + |
/**
 * Standard Matryoshka tier sizes, in ascending order.
 *
 * Used as the fallback protected-dimension boundaries when the antithesis
 * search has to widen its free-dimension region.
 */
const MATRYOSHKA_TIERS = [
  32,
  64,
  128,
  256,
  512,
  768,
  1024,
  2048,
] as const;
| 18 | + |
/**
 * Cosine similarity of two vectors.
 *
 * When the lengths differ, only the leading `min(a.length, b.length)`
 * components of each vector are used (for both the dot product and the norms).
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns A value in [-1, 1]; `0` when either (truncated) vector has zero
 *   norm, which includes empty vectors.
 */
function cosineSimilarity(a: Float32Array, b: Float32Array): number {
  const len = Math.min(a.length, b.length);

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < len; i++) {
    const x = a[i];
    const y = b[i];
    dotProduct += x * y;
    normA += x * x;
    normB += y * y;
  }

  // A zero vector has no direction; treat it as orthogonal to everything.
  if (normA === 0 || normB === 0) return 0;

  const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));

  // Rounding can push the quotient a hair outside [-1, 1] (e.g. for identical
  // vectors), which would make `1 - similarity` slightly negative. Clamp it.
  return Math.max(-1, Math.min(1, similarity));
}
| 32 | + |
/**
 * Cosine distance: one minus the cosine similarity of `a` and `b`.
 *
 * Smaller means more alike in direction. A zero-norm vector has similarity 0
 * and therefore distance 1 to everything.
 */
function cosineDistance(a: Float32Array, b: Float32Array): number {
  const similarity = cosineSimilarity(a, b);
  return 1 - similarity;
}
| 36 | + |
/**
 * Returns the index of the medoid: the element that minimises total cosine
 * distance to every other element in the set.
 *
 * Ties go to the lowest index. For an empty or single-element input the
 * result is `0`; callers must not index an empty array with it.
 *
 * @param embeddings - Vectors to compare pairwise.
 * @returns Index into `embeddings` of the medoid.
 */
function findMedoidIndex(embeddings: Float32Array[]): number {
  const count = embeddings.length;
  if (count <= 1) return 0;

  // Cosine distance is symmetric, so each unordered pair is evaluated once and
  // credited to both endpoints. Each total still receives its terms in
  // ascending partner-index order.
  const totals = new Array<number>(count).fill(0);
  for (let i = 0; i < count; i++) {
    for (let j = i + 1; j < count; j++) {
      const distance = cosineDistance(embeddings[i], embeddings[j]);
      totals[i] += distance;
      totals[j] += distance;
    }
  }

  let bestIdx = 0;
  let bestTotal = Infinity;
  for (let i = 0; i < count; i++) {
    // Strict comparison keeps the earliest index on ties.
    if (totals[i] < bestTotal) {
      bestTotal = totals[i];
      bestIdx = i;
    }
  }

  return bestIdx;
}
| 62 | + |
/** Locator for one candidate page and its stored embedding. */
interface CandidateEntry {
  /** Identifier of the candidate page. */
  pageId: Hash;
  /** First argument passed to `vectorStore.readVector` when loading the embedding. */
  embeddingOffset: number;
  /** Second argument passed to `vectorStore.readVector` when loading the embedding. */
  embeddingDim: number;
}
| 68 | + |
/** A candidate whose embedding has already been read from the vector store. */
interface CandidateWithEmbedding extends CandidateEntry {
  /** The vector loaded for this candidate. */
  embedding: Float32Array;
}
| 72 | + |
/**
 * Searches for m2 among `others` (candidates excluding m1) using the free
 * dimensions starting at `protectedDim`.
 *
 * Each candidate is scored by the negated cosine similarity between its free
 * dimensions and m1's. Candidates with a score >= 0 form the opposite set; if
 * fewer than two qualify, the best-scoring half of all candidates (rounded up)
 * is used instead. The medoid of that set, measured in the free dimensions,
 * is returned.
 *
 * @param others - Candidates other than m1.
 * @param m1Embedding - Embedding of the thesis candidate.
 * @param protectedDim - Index of the first free dimension.
 * @returns The selected medoid candidate, or `null` if no valid opposite set
 *   can be assembled: `others` is empty, m1 has no dimensions at or beyond
 *   `protectedDim`, or no candidate has any.
 */
function searchM2(
  others: CandidateWithEmbedding[],
  m1Embedding: Float32Array,
  protectedDim: number,
): CandidateWithEmbedding | null {
  // subarray() is a zero-copy view; nothing below mutates the vectors.
  const m1Free = m1Embedding.subarray(protectedDim);

  // With no free dimensions every similarity degenerates to 0, so "opposite"
  // is meaningless. Report failure so the caller can try a smaller boundary.
  if (m1Free.length === 0) return null;

  const scored = others
    .map((candidate) => ({
      candidate,
      free: candidate.embedding.subarray(protectedDim),
    }))
    // A candidate without free dimensions carries no directional information.
    .filter((entry) => entry.free.length > 0)
    .map((entry) => ({
      ...entry,
      score: -cosineSimilarity(entry.free, m1Free),
    }));

  if (scored.length === 0) return null;

  // Prefer candidates that are genuinely opposite (score >= 0).
  let oppositeSet = scored.filter((entry) => entry.score >= 0);

  // Fall back to the top 50% when the genuine-opposite set is too small.
  // `scored` is non-empty here, so the rounded-up half has at least one entry.
  if (oppositeSet.length < 2) {
    const byScore = [...scored].sort((x, y) => y.score - x.score);
    oppositeSet = byScore.slice(0, Math.ceil(byScore.length / 2));
  }

  const medoidIdx = findMedoidIndex(oppositeSet.map((entry) => entry.free));
  return oppositeSet[medoidIdx].candidate;
}
| 109 | + |
/**
 * Protected-dimension boundaries to try, in order: the configured value
 * first, then every standard tier below it from largest to smallest.
 */
function unwindingTierBoundaries(protectedDim: number): number[] {
  const smallerTiers = MATRYOSHKA_TIERS.filter((tier) => tier < protectedDim).reverse();
  return [protectedDim, ...smallerTiers];
}

/**
 * Synthesis centroid: m1's values everywhere, except that free dimensions
 * (index >= `protectedDim`) present in both vectors are averaged with m2.
 */
function synthesizeCentroid(
  m1Embedding: Float32Array,
  m2Embedding: Float32Array,
  protectedDim: number,
): Float32Array {
  // Start from a copy of m1 so protected dims, and any dims m2 lacks, keep
  // m1's values instead of becoming NaN.
  const centroid = new Float32Array(m1Embedding);
  const sharedDim = Math.min(m1Embedding.length, m2Embedding.length);
  for (let i = Math.max(0, protectedDim); i < sharedDim; i++) {
    centroid[i] = (m1Embedding[i] + m2Embedding[i]) / 2;
  }
  return centroid;
}

/**
 * Builds the dialectical probe (Metroid) for a given query embedding and a
 * ranked list of candidate memory nodes.
 *
 * Step overview
 * 1. Select m1 (thesis): the candidate with highest cosine similarity to the
 *    query; the earliest candidate wins ties.
 * 2. Select m2 (antithesis): the medoid of the cosine-opposite set in free dims.
 *    Uses Matryoshka dimensional unwinding when the initial tier yields no m2:
 *    the protected boundary is lowered through the smaller standard tiers
 *    until a search succeeds.
 * 3. Compute centroid c (synthesis): protected dims copied from m1, free dims
 *    averaged between m1 and m2. Dimensions m2 does not have keep m1's value.
 *
 * @param queryEmbedding - Embedding of the query.
 * @param candidateMedoids - Candidate pages with the location of their embeddings.
 * @param options - Model profile and vector store to use.
 * @returns The probe. `knowledgeGap` is `true` (with `m2` and `c` null) when
 *   there are no candidates (`m1` is then `""`), when the model profile has
 *   no `matryoshkaProtectedDim`, or when no antithesis is found at any tier.
 */
export async function buildMetroid(
  queryEmbedding: Float32Array,
  candidateMedoids: Array<{ pageId: Hash; embeddingOffset: number; embeddingDim: number }>,
  options: MetroidBuilderOptions,
): Promise<Metroid> {
  const { modelProfile, vectorStore } = options;

  if (candidateMedoids.length === 0) {
    return { m1: "", m2: null, c: null, knowledgeGap: true };
  }

  // Load all candidate embeddings in one pass.
  const candidates: CandidateWithEmbedding[] = await Promise.all(
    candidateMedoids.map(async (cand) => ({
      ...cand,
      embedding: await vectorStore.readVector(cand.embeddingOffset, cand.embeddingDim),
    })),
  );

  // Select m1: highest cosine similarity to the query.
  let m1Candidate = candidates[0];
  let m1Score = cosineSimilarity(queryEmbedding, m1Candidate.embedding);
  for (const candidate of candidates.slice(1)) {
    const score = cosineSimilarity(queryEmbedding, candidate.embedding);
    if (score > m1Score) {
      m1Score = score;
      m1Candidate = candidate;
    }
  }

  const gapResult: Metroid = { m1: m1Candidate.pageId, m2: null, c: null, knowledgeGap: true };

  const protectedDim = modelProfile.matryoshkaProtectedDim;
  if (protectedDim === undefined) {
    // Non-Matryoshka model: antithesis search is impossible.
    return gapResult;
  }

  const others = candidates.filter((c) => c.pageId !== m1Candidate.pageId);

  // --- Matryoshka dimensional unwinding ---
  // Start at the configured boundary; if m2 is not found, progressively shrink
  // the protected boundary (expanding the free-dimension search region).
  for (const tierBoundary of unwindingTierBoundaries(protectedDim)) {
    const m2Candidate = searchM2(others, m1Candidate.embedding, tierBoundary);
    if (m2Candidate !== null) {
      return {
        m1: m1Candidate.pageId,
        m2: m2Candidate.pageId,
        // Freeze the synthesis at the boundary that actually produced m2.
        c: synthesizeCentroid(m1Candidate.embedding, m2Candidate.embedding, tierBoundary),
        knowledgeGap: false,
      };
    }
  }

  return gapResult;
}
0 commit comments