Skip to content

Commit de785cd

Browse files
Copilot and devlux76
committed
fix: add prev/next SemanticNeighbor edges between consecutive book-slice pages in HierarchyBuilder
Co-authored-by: devlux76 <86517969+devlux76@users.noreply.github.com>
1 parent eafff4c commit de785cd

3 files changed

Lines changed: 140 additions & 15 deletions

File tree

hippocampus/HierarchyBuilder.ts

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import type { Book, Hash, MetadataStore, Shelf, Volume, VectorStore } from "../core/types";
1+
import type { Book, Hash, MetadataStore, SemanticNeighbor, Shelf, Volume, VectorStore } from "../core/types";
22
import type { ModelProfile } from "../core/ModelProfile";
33
import type { HotpathPolicy } from "../core/HotpathPolicy";
44
import { hashText } from "../core/crypto/hash";
@@ -12,6 +12,11 @@ const PAGES_PER_BOOK = 8;
1212
const BOOKS_PER_VOLUME = 4;
1313
const VOLUMES_PER_SHELF = 4;
1414

15+
// Max neighbors per page for the adjacency edges added by the hierarchy builder.
16+
// Adjacency edges represent document-order contiguity and bypass the cosine
17+
// cutoff used by FastNeighborInsert, so they must still be bounded by policy.
18+
const ADJACENCY_MAX_DEGREE = 16;
19+
1520
export interface BuildHierarchyOptions {
1621
modelProfile: ModelProfile;
1722
vectorStore: VectorStore;
@@ -80,6 +85,36 @@ function chunkArray<T>(arr: T[], size: number): T[][] {
8085
return chunks;
8186
}
8287

88+
/**
 * Merge one candidate edge into a page's neighbor list without letting the
 * list grow past `maxDegree`.
 *
 * Rules, in order:
 * - Any existing entry for the same `neighborPageId` is dropped first, so the
 *   candidate replaces it instead of duplicating it.
 * - While there is room, the candidate is appended.
 * - At capacity, the candidate replaces the weakest existing entry only if it
 *   is strictly stronger; otherwise the candidate is discarded.
 * - A list that was already over budget is trimmed to the strongest entries.
 *
 * The `existing` array is never mutated; a new array is returned.
 *
 * @param existing  Neighbor list currently stored for the page.
 * @param candidate Edge to merge in.
 * @param maxDegree Maximum number of neighbors to keep. Fractions are floored;
 *                  a value that is not greater than zero (including NaN)
 *                  yields an empty list.
 * @returns The merged list, sorted by `cosineSimilarity` descending and never
 *          longer than `maxDegree`.
 */
function mergeAdjacentNeighbor(
  existing: SemanticNeighbor[],
  candidate: SemanticNeighbor,
  maxDegree: number,
): SemanticNeighbor[] {
  const limit = Math.floor(maxDegree);

  // No budget means no edges. Without this guard the eviction scan below
  // would read index 0 of an empty list and throw a TypeError.
  if (!(limit > 0)) {
    return [];
  }

  // `filter` copies, so the caller's array is left untouched.
  const merged = existing.filter((n) => n.neighborPageId !== candidate.neighborPageId);

  if (merged.length < limit) {
    merged.push(candidate);
  } else {
    // merged.length >= limit >= 1 here, so index 0 always exists.
    let weakestIdx = 0;
    for (let i = 1; i < merged.length; i++) {
      if (merged[i].cosineSimilarity < merged[weakestIdx].cosineSimilarity) {
        weakestIdx = i;
      }
    }
    // Strict comparison: on a tie the incumbent edge is kept.
    if (candidate.cosineSimilarity > merged[weakestIdx].cosineSimilarity) {
      merged[weakestIdx] = candidate;
    }
  }

  merged.sort((a, b) => b.cosineSimilarity - a.cosineSimilarity);

  // A previously stored list may already exceed the budget; keep the strongest.
  return merged.length > limit ? merged.slice(0, limit) : merged;
}
117+
83118
export async function buildHierarchy(
84119
pageIds: Hash[],
85120
options: BuildHierarchyOptions,
@@ -99,6 +134,12 @@ export async function buildHierarchy(
99134
});
100135
const pageVectors = await vectorStore.readVectors(pageOffsets, dim);
101136

137+
// Build a Map<pageId, vector> for O(1) lookups throughout the hierarchy build.
138+
const pageVectorMap = new Map<Hash, Float32Array>();
139+
for (let i = 0; i < pageIds.length; i++) {
140+
pageVectorMap.set(pageIds[i], pageVectors[i]);
141+
}
142+
102143
// -------------------------------------------------------------------------
103144
// Level 1: Pages → Books
104145
// -------------------------------------------------------------------------
@@ -110,8 +151,9 @@ export async function buildHierarchy(
110151
const bookId = await hashText(sortedChunk.join("|"));
111152

112153
const chunkVectors = chunk.map((id) => {
113-
const idx = pageIds.indexOf(id);
114-
return pageVectors[idx];
154+
const vec = pageVectorMap.get(id);
155+
if (!vec) throw new Error(`Vector not found for page ${id}`);
156+
return vec;
115157
});
116158

117159
const medoidIdx = selectMedoidIndex(chunkVectors);
@@ -122,6 +164,32 @@ export async function buildHierarchy(
122164
books.push(book);
123165
}
124166

167+
// Add SemanticNeighbor edges between consecutive pages within each book slice.
168+
// These document-order adjacency edges skip the cosine cutoff (adjacent chunks of
169+
// one source are related) but can still be dropped once ADJACENCY_MAX_DEGREE is hit.
170+
for (const book of books) {
171+
for (let i = 0; i < book.pageIds.length - 1; i++) {
172+
const aId = book.pageIds[i];
173+
const bId = book.pageIds[i + 1];
174+
const aVec = pageVectorMap.get(aId);
175+
const bVec = pageVectorMap.get(bId);
176+
if (!aVec || !bVec) continue;
177+
178+
const sim = cosineSimilarity(aVec, bVec);
179+
const dist = 1 - sim;
180+
const forwardEdge: SemanticNeighbor = { neighborPageId: bId, cosineSimilarity: sim, distance: dist };
181+
const reverseEdge: SemanticNeighbor = { neighborPageId: aId, cosineSimilarity: sim, distance: dist };
182+
183+
// Forward: a → b
184+
const existingA = await metadataStore.getSemanticNeighbors(aId);
185+
await metadataStore.putSemanticNeighbors(aId, mergeAdjacentNeighbor(existingA, forwardEdge, ADJACENCY_MAX_DEGREE));
186+
187+
// Reverse: b → a
188+
const existingB = await metadataStore.getSemanticNeighbors(bId);
189+
await metadataStore.putSemanticNeighbors(bId, mergeAdjacentNeighbor(existingB, reverseEdge, ADJACENCY_MAX_DEGREE));
190+
}
191+
}
192+
125193
await runPromotionSweep(books.map((b) => b.bookId), metadataStore, policy);
126194

127195
// -------------------------------------------------------------------------
@@ -135,8 +203,9 @@ export async function buildHierarchy(
135203
const volumeId = await hashText(sortedBookIds.join("|"));
136204

137205
const medoidVectors = bookChunk.map((b) => {
138-
const idx = pageIds.indexOf(b.medoidPageId);
139-
return pageVectors[idx];
206+
const vec = pageVectorMap.get(b.medoidPageId);
207+
if (!vec) throw new Error(`Vector not found for medoid page ${b.medoidPageId}`);
208+
return vec;
140209
});
141210

142211
const centroid = computeCentroid(medoidVectors);

tests/hippocampus/FastNeighborInsert.test.ts

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ import { MemoryVectorStore } from "../../storage/MemoryVectorStore";
66
import { DeterministicDummyEmbeddingBackend } from "../../embeddings/DeterministicDummyEmbeddingBackend";
77
import { EmbeddingRunner } from "../../embeddings/EmbeddingRunner";
88
import { generateKeyPair } from "../../core/crypto/sign";
9-
import { ingestText } from "../../hippocampus/Ingest";
9+
import { buildPage } from "../../hippocampus/PageBuilder";
10+
import { chunkText } from "../../hippocampus/Chunker";
1011
import { insertSemanticNeighbors } from "../../hippocampus/FastNeighborInsert";
1112
import type { ModelProfile } from "../../core/ModelProfile";
1213

@@ -24,6 +25,11 @@ const PROFILE: ModelProfile = {
2425
source: "metadata",
2526
};
2627

28+
/**
29+
* Builds `pageCount` pages directly without calling ingestText/buildHierarchy,
30+
* so the SemanticNeighbor graph starts empty. This keeps FastNeighborInsert
31+
* tests fully isolated from HierarchyBuilder's adjacency-edge insertion.
32+
*/
2733
async function makeFixture(pageCount: number) {
2834
const metadataStore = await IndexedDbMetadataStore.open(freshDbName());
2935
const vectorStore = new MemoryVectorStore();
@@ -40,16 +46,31 @@ async function makeFixture(pageCount: number) {
4046

4147
const words = Array.from({ length: pageCount * 4 }, (_, i) => `word${i}`);
4248
const text = words.join(" ");
49+
const chunks = chunkText(text, PROFILE);
50+
const useChunks = chunks.slice(0, pageCount);
51+
const embeddings = await runner.embed(useChunks);
52+
53+
const createdAt = new Date().toISOString();
54+
const pageIds: string[] = [];
55+
56+
for (let i = 0; i < useChunks.length; i++) {
57+
const embedding = embeddings[i];
58+
const offset = await vectorStore.appendVector(embedding);
59+
const page = await buildPage({
60+
content: useChunks[i],
61+
embedding,
62+
embeddingOffset: offset,
63+
embeddingDim: PROFILE.embeddingDimension,
64+
creatorPubKey: keyPair.publicKey,
65+
signingKey: keyPair.signingKey,
66+
createdAt,
67+
});
68+
await metadataStore.putPage(page);
69+
await metadataStore.putPageActivity({ pageId: page.pageId, queryHitCount: 0, lastQueryAt: createdAt });
70+
pageIds.push(page.pageId);
71+
}
4372

44-
const result = await ingestText(text, {
45-
modelProfile: PROFILE,
46-
embeddingRunner: runner,
47-
vectorStore,
48-
metadataStore,
49-
keyPair,
50-
});
51-
52-
return { metadataStore, vectorStore, pageIds: result.pages.map((p) => p.pageId) };
73+
return { metadataStore, vectorStore, pageIds };
5374
}
5475

5576
describe("FastNeighborInsert", () => {

tests/hippocampus/HierarchyBuilder.test.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,4 +286,39 @@ describe("HierarchyBuilder", () => {
286286
expect(result.volumes!.length).toBeGreaterThanOrEqual(1);
287287
expect(result.shelves!.length).toBeGreaterThanOrEqual(1);
288288
});
289+
290+
it("adds SemanticNeighbor edges between consecutive pages within each book slice", async () => {
  const { metadataStore, vectorStore, pageIds } = await makeFixture(4);

  const { books } = await buildHierarchy(pageIds, {
    modelProfile: PROFILE,
    vectorStore,
    metadataStore,
  });

  // Counts the consecutive pairs actually verified, so the test cannot pass
  // vacuously when no book contains two or more pages.
  let checkedPairs = 0;

  // For each book with at least 2 pages, every consecutive pair should have
  // a SemanticNeighbor edge in both directions.
  for (const book of books) {
    for (let i = 0; i < book.pageIds.length - 1; i++) {
      const aId = book.pageIds[i];
      const bId = book.pageIds[i + 1];

      // Forward: a → b
      const aNeighbors = await metadataStore.getSemanticNeighbors(aId);
      const forwardEdge = aNeighbors.find((n) => n.neighborPageId === bId);
      if (forwardEdge === undefined) {
        throw new Error(`Missing forward adjacency edge ${aId} -> ${bId}`);
      }

      // Reverse: b → a
      const bNeighbors = await metadataStore.getSemanticNeighbors(bId);
      const reverseEdge = bNeighbors.find((n) => n.neighborPageId === aId);
      if (reverseEdge === undefined) {
        throw new Error(`Missing reverse adjacency edge ${bId} -> ${aId}`);
      }

      // Edge data should be structurally valid.
      expect(forwardEdge.cosineSimilarity).toBeGreaterThanOrEqual(-1);
      expect(forwardEdge.cosineSimilarity).toBeLessThanOrEqual(1);
      expect(forwardEdge.distance).toBeCloseTo(1 - forwardEdge.cosineSimilarity, 5);

      checkedPairs++;
    }
  }

  // Fails loudly if the fixture stops producing a multi-page book.
  expect(checkedPairs).toBeGreaterThanOrEqual(1);
});
289324
});

0 commit comments

Comments
 (0)