Skip to content

Commit fa746a2

Browse files
committed
feat: Add DeterministicDummyEmbeddingBackend for SHA-256 embedding and benchmarking; update execution plan and tests
1 parent d08d72f commit fa746a2

7 files changed

Lines changed: 223 additions & 3 deletions

CORTEX-DESIGN-PLAN-TODO.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ Completed since the prior snapshot:
3030
3. Guard command exists for model-related hardcoded numeric literals (`npm run guard:model-derived`).
3131
4. Runtime numeric constants are centralized for backend/storage internals (`core/NumericConstants.ts`).
3232
5. Runtime helper resolves model metadata and derives routing policy in one call (`resolveRoutingPolicyForModel` in `Policy.ts`).
33+
6. Deterministic dummy SHA-256 embedder exists for pre-model hotpath testing (`embeddings/DeterministicDummyEmbeddingBackend.ts`).
34+
7. Benchmark harness exists for dummy embedder throughput baselining (`npm run benchmark:dummy`).
3335

3436
Next focus:
3537
1. Wire resolved model profiles into runtime ingest/query entry points.

PROJECT-EXECUTION-PLAN.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ Completed in this pass:
2424
7. Added model-profile-to-policy bridge helper so runtime callers can resolve profile and derive routing in one step:
2525
- `resolveRoutingPolicyForModel(...)` in `Policy.ts`
2626
- integration tests in `tests/model/RoutingPolicy.test.ts`
27+
8. Added deterministic dummy embedder for hotpath benchmarking before real model wiring:
28+
- `embeddings/EmbeddingBackend.ts`
29+
- `embeddings/DeterministicDummyEmbeddingBackend.ts`
30+
- tests: `tests/embeddings/DeterministicDummyEmbeddingBackend.test.ts`
31+
9. Added executable dummy hotpath benchmark harness:
32+
- `tests/benchmarks/DummyEmbedderHotpath.bench.ts`
33+
- `npm run benchmark:dummy`
2734

2835
Open items carried to next pass:
2936
1. Wire resolved `ModelProfile` into first concrete ingest/query orchestrator path (once those runtime modules are added).
@@ -94,15 +101,17 @@ Available now:
94101
2. `npm run test:unit -- tests/model/ModelProfileResolver.test.ts`
95102
3. `npm run test:unit -- tests/model/ModelDefaults.test.ts`
96103
4. `npm run guard:model-derived`
97-
5. `npm run build && npm run lint`
104+
5. `npm run test:unit -- tests/embeddings/DeterministicDummyEmbeddingBackend.test.ts`
105+
6. `npm run benchmark:dummy`
106+
7. `npm run benchmark`
107+
8. `npm run build && npm run lint`
98108

99109
Planned commands to add in later passes:
100110
1. `npm run test:unit -- tests/embeddings/ProviderResolver.test.ts`
101111
2. `npm run test:unit -- tests/embeddings/OnnxEmbeddingRunner.test.ts`
102112
3. `npm run test:browser`
103113
4. `npm run test:electron`
104114
5. `npm run test:all`
105-
6. `npm run benchmark`
106115

107116
## Known Hardcoded Hotspots To Clean First
108117

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import type { EmbeddingBackend } from "./EmbeddingBackend";
2+
3+
/** Vector length used when the caller does not pass `dimension`. */
export const DEFAULT_DUMMY_EMBEDDING_DIMENSION = 1024;

/** Default padding boundary in bytes; 64 is the block size SHA-256 operates on. */
export const SHA256_BLOCK_BYTES = 64;

/** Number of bytes in a single SHA-256 digest. */
const SHA256_DIGEST_BYTES = 32;

/** Width in bytes of the counter appended to the hashed payload. */
const COUNTER_BYTES = 4;

/** Divisor (255 / 2) that, followed by subtracting 1, maps a byte 0..255 onto -1..1. */
const BYTE_TO_UNIT_SCALE = 127.5;
9+
10+
/** Optional construction settings for `DeterministicDummyEmbeddingBackend`. */
export interface DeterministicDummyEmbeddingBackendOptions {
  /**
   * Number of components in each produced vector. Must be a positive integer;
   * falls back to `DEFAULT_DUMMY_EMBEDDING_DIMENSION` when omitted.
   */
  dimension?: number;

  /**
   * Byte boundary the encoded text is zero-padded to before hashing. Must be a
   * positive integer; falls back to `SHA256_BLOCK_BYTES` when omitted.
   */
  blockBytes?: number;
}
14+
15+
/**
 * Validates that `value` is an integer strictly greater than zero.
 *
 * @param name - Label of the value being checked; used as the error message prefix.
 * @param value - Number to validate.
 * @throws Error with the message `<name> must be a positive integer` when the
 *   value is fractional, NaN, infinite, zero, or negative.
 */
function assertPositiveInteger(name: string, value: number): void {
  const isPositiveInteger = Number.isInteger(value) && value > 0;
  if (isPositiveInteger) {
    return;
  }

  throw new Error(`${name} must be a positive integer`);
}
20+
21+
/**
 * Looks up the Web Crypto `SubtleCrypto` implementation on the global object.
 *
 * @returns `globalThis.crypto.subtle`.
 * @throws Error when `globalThis.crypto` or its `subtle` property is missing.
 */
function getSubtleCrypto(): SubtleCrypto {
  const webCrypto = globalThis.crypto;
  const subtle = webCrypto?.subtle;
  if (subtle) {
    return subtle;
  }

  throw new Error("SubtleCrypto is required for DeterministicDummyEmbeddingBackend");
}
28+
29+
/**
 * Extends `input` with trailing zero bytes so its length is a multiple of `blockBytes`.
 *
 * @param input - Bytes to pad; never modified.
 * @param blockBytes - Boundary the resulting length must be a multiple of.
 * @returns `input` itself (same instance) when it is already aligned, otherwise a
 *   new array holding the input followed by zero bytes.
 */
function padBytesToBoundary(input: Uint8Array, blockBytes: number): Uint8Array {
  const overhang = input.byteLength % blockBytes;
  if (overhang !== 0) {
    // A fresh Uint8Array is zero-filled, so only the original bytes need copying.
    const aligned = new Uint8Array(input.byteLength + (blockBytes - overhang));
    aligned.set(input);
    return aligned;
  }

  return input;
}
40+
41+
/**
 * Rescales a byte so that 0 maps to -1 and 255 maps to 1.
 *
 * @param byteValue - Value to rescale; callers in this module pass digest bytes (0..255).
 * @returns `byteValue / BYTE_TO_UNIT_SCALE - 1`.
 */
function byteToUnitFloat(byteValue: number): number {
  const scaled = byteValue / BYTE_TO_UNIT_SCALE;
  return scaled - 1;
}
44+
45+
/**
 * Embedding backend that derives vectors from SHA-256 digests instead of a model.
 *
 * Each text is UTF-8 encoded, zero-padded to a multiple of `blockBytes`, and hashed
 * repeatedly with a big-endian 32-bit counter appended (0, 1, 2, ...). Every digest
 * byte becomes one vector component in the range -1..1 until `dimension` components
 * have been written. The same text therefore always yields the same vector. Because
 * the padding is zero bytes, a text and the same text followed by NUL characters up
 * to the block boundary produce identical vectors.
 */
export class DeterministicDummyEmbeddingBackend implements EmbeddingBackend {
  readonly kind = "dummy-sha256" as const;
  readonly dimension: number;

  private readonly blockBytes: number;
  private readonly subtle = getSubtleCrypto();
  private readonly encoder = new TextEncoder();

  /**
   * @param options - Optional overrides for the vector dimension and padding boundary.
   * @throws Error when SubtleCrypto is unavailable, or when `dimension` or
   *   `blockBytes` is not a positive integer.
   */
  constructor(options: DeterministicDummyEmbeddingBackendOptions = {}) {
    this.dimension = options.dimension ?? DEFAULT_DUMMY_EMBEDDING_DIMENSION;
    this.blockBytes = options.blockBytes ?? SHA256_BLOCK_BYTES;

    assertPositiveInteger("dimension", this.dimension);
    assertPositiveInteger("blockBytes", this.blockBytes);
  }

  /**
   * Embeds every text in the batch.
   *
   * @param texts - Inputs to embed; an empty array yields an empty result.
   * @returns One `Float32Array` of length `dimension` per input, in input order.
   */
  async embed(texts: string[]): Promise<Float32Array[]> {
    return Promise.all(texts.map((text) => this.embedOne(text)));
  }

  /** Produces the vector for a single text by chaining counter-suffixed digests. */
  private async embedOne(text: string): Promise<Float32Array> {
    const sourceBytes = padBytesToBoundary(
      this.encoder.encode(text),
      this.blockBytes,
    );

    // Payload layout: padded text bytes followed by a COUNTER_BYTES-wide counter.
    // It is built once per text; each round rewrites only the counter slot instead
    // of allocating and re-copying the whole payload.
    const payload = new Uint8Array(sourceBytes.byteLength + COUNTER_BYTES);
    payload.set(sourceBytes, 0);
    const counterView = new DataView(
      payload.buffer,
      payload.byteOffset + sourceBytes.byteLength,
      COUNTER_BYTES,
    );

    const embedding = new Float32Array(this.dimension);
    let writeIndex = 0;

    for (let counter = 0; writeIndex < this.dimension; counter++) {
      // Big-endian so the hashed bytes do not depend on host byte order.
      counterView.setUint32(0, counter, false);

      // Awaited before the next round touches the payload, so reusing the
      // buffer cannot race with an in-flight digest.
      const digest = new Uint8Array(
        await this.subtle.digest("SHA-256", payload),
      );

      // The final round may need only part of the digest.
      const take = Math.min(SHA256_DIGEST_BYTES, this.dimension - writeIndex);
      for (const byte of digest.subarray(0, take)) {
        embedding[writeIndex] = byteToUnitFloat(byte);
        writeIndex++;
      }
    }

    return embedding;
  }
}

embeddings/EmbeddingBackend.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
/** Contract shared by components that turn texts into embedding vectors. */
export interface EmbeddingBackend {
  /** String tag naming the backend implementation (e.g. `"dummy-sha256"`). */
  readonly kind: string;

  /**
   * Vector dimension reported by the backend.
   * NOTE(review): presumably the length of every vector returned by `embed`;
   * this interface does not enforce it, so confirm for each implementation.
   */
  readonly dimension: number;

  /**
   * Embeds a batch of texts.
   *
   * @param texts - Inputs to embed.
   * @returns A promise of `Float32Array` vectors.
   *   NOTE(review): presumably one vector per input, in input order; confirm
   *   per implementation.
   */
  embed(texts: string[]): Promise<Float32Array[]>;
}

package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
"test": "vitest run",
1111
"test:unit": "vitest run",
1212
"test:watch": "vitest",
13-
"guard:model-derived": "node scripts/guard-model-derived.mjs"
13+
"guard:model-derived": "node scripts/guard-model-derived.mjs",
14+
"benchmark:dummy": "vitest bench --watch=false tests/benchmarks/DummyEmbedderHotpath.bench.ts",
15+
"benchmark": "npm run benchmark:dummy"
1416
},
1517
"devDependencies": {
1618
"@eslint/js": "^9.38.0",
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import { bench, describe } from "vitest";
2+
3+
import { DeterministicDummyEmbeddingBackend } from "../../embeddings/DeterministicDummyEmbeddingBackend";
4+
5+
const backend = new DeterministicDummyEmbeddingBackend({ dimension: 1024 });
6+
7+
/**
 * Builds a reproducible batch of benchmark inputs.
 *
 * Entry `i` is the token `<tokenSeed>-<i as 4-digit zero-padded hex>` repeated four
 * times, separated by single spaces.
 *
 * @param count - Number of entries to generate.
 * @param tokenSeed - Prefix shared by every generated token.
 * @returns The generated strings in index order.
 */
function buildDeterministicBatch(count: number, tokenSeed: string): string[] {
  const slots = Array.from({ length: count });
  return slots.map((_, index) => {
    const hexIndex = index.toString(16).padStart(4, "0");
    const token = `${tokenSeed}-${hexIndex}`;
    return [token, token, token, token].join(" ");
  });
}
13+
14+
// Fixed inputs, built once at module load so each benchmark measures only embed().
const singleShort = ["where is the cache invalidation boundary?"];
const batch16Medium = buildDeterministicBatch(16, "cortex-medium");
const batch64Short = buildDeterministicBatch(64, "cortex-short");

// Benchmark label paired with the batch it embeds; registered in this order.
const scenarios: Array<[string, string[]]> = [
  ["single short input", singleShort],
  ["batch 16 medium inputs", batch16Medium],
  ["batch 64 short inputs", batch64Short],
];

describe("Dummy Embedder Hotpath", () => {
  for (const [label, inputs] of scenarios) {
    bench(label, async () => {
      await backend.embed(inputs);
    });
  }
});
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import { describe, expect, it } from "vitest";
2+
3+
import {
4+
DEFAULT_DUMMY_EMBEDDING_DIMENSION,
5+
SHA256_BLOCK_BYTES,
6+
DeterministicDummyEmbeddingBackend,
7+
} from "../../embeddings/DeterministicDummyEmbeddingBackend";
8+
9+
// Behavioural contract of the SHA-256 dummy embedder: vector shape, determinism,
// zero-padding, value range, batch handling, and constructor validation.
describe("DeterministicDummyEmbeddingBackend", () => {
  it("returns 1024 dimensions by default", async () => {
    const backend = new DeterministicDummyEmbeddingBackend();
    const [vector] = await backend.embed(["hello cortex"]);

    expect(vector).toHaveLength(DEFAULT_DUMMY_EMBEDDING_DIMENSION);
  });

  it("is deterministic for identical input", async () => {
    const backend = new DeterministicDummyEmbeddingBackend();

    const [first] = await backend.embed(["same input"]);
    const [second] = await backend.embed(["same input"]);

    expect(Array.from(first)).toEqual(Array.from(second));
  });

  it("produces different vectors for different input", async () => {
    const backend = new DeterministicDummyEmbeddingBackend();

    const [a, b] = await backend.embed(["alpha", "beta"]);

    const differs = a.some((value, index) => value !== b[index]);
    expect(differs).toBe(true);
  });

  it("pads input to SHA-256 block boundary using zero bytes", async () => {
    const backend = new DeterministicDummyEmbeddingBackend();
    const base = "abc";

    const inputBytes = new TextEncoder().encode(base).byteLength;
    const remainder = inputBytes % SHA256_BLOCK_BYTES;
    const padCount = remainder === 0 ? 0 : SHA256_BLOCK_BYTES - remainder;

    // "\0" encodes to a single zero byte, so this string is already block-aligned.
    const equivalentExplicitlyPadded = base + "\0".repeat(padCount);

    const [autoPadded] = await backend.embed([base]);
    const [alreadyPadded] = await backend.embed([equivalentExplicitlyPadded]);

    expect(Array.from(autoPadded)).toEqual(Array.from(alreadyPadded));
  });

  it("supports custom output dimensions", async () => {
    const backend = new DeterministicDummyEmbeddingBackend({ dimension: 97 });
    const [vector] = await backend.embed(["custom-dim"]);

    expect(vector).toHaveLength(97);
  });

  it("keeps every component within [-1, 1]", async () => {
    const backend = new DeterministicDummyEmbeddingBackend();
    const [vector] = await backend.embed(["range check"]);

    const inRange = vector.every((value) => value >= -1 && value <= 1);
    expect(inRange).toBe(true);
  });

  it("returns an empty array for an empty batch", async () => {
    const backend = new DeterministicDummyEmbeddingBackend();

    expect(await backend.embed([])).toEqual([]);
  });

  it("returns one vector per input in input order", async () => {
    const backend = new DeterministicDummyEmbeddingBackend({ dimension: 40 });

    const batch = await backend.embed(["alpha", "beta"]);
    const [alphaAlone] = await backend.embed(["alpha"]);
    const [betaAlone] = await backend.embed(["beta"]);

    expect(batch).toHaveLength(2);
    expect(Array.from(batch[0])).toEqual(Array.from(alphaAlone));
    expect(Array.from(batch[1])).toEqual(Array.from(betaAlone));
  });

  it("rejects invalid dimensions", () => {
    for (const dimension of [0, -1, 1.5, Number.NaN]) {
      expect(() => new DeterministicDummyEmbeddingBackend({ dimension })).toThrow(
        /dimension/i,
      );
    }
  });

  it("rejects invalid blockBytes", () => {
    for (const blockBytes of [0, -64, 0.5, Number.NaN]) {
      expect(() => new DeterministicDummyEmbeddingBackend({ blockBytes })).toThrow(
        /blockBytes/i,
      );
    }
  });
});

0 commit comments

Comments
 (0)