Skip to content

Commit d7613c4

Browse files
committed
Add bench-tpuf.js: apples-to-apples vs turbopuffer vector-1m
Reproduces the turbopuffer tpuf-benchmark vector-1m workload spec on hypvector: 1M Cohere Wikipedia embeddings (1024-dim, cosine, topK=10), random unit-vector queries, sweeps rerankFactor and probe across cold (fresh CloudFront client per query) and warm (single client + prefetched binary) modes. Also brute-force recall@10 vs ground truth. bench-tpuf-build.js downloads 10 Cohere shards, writes the hypvector parquet with binary + 256 clusters. Hosted at s3://hyperparam-public/tpuf-bench/tpuf-bench-1m.parquet.
1 parent acd3b41 commit d7613c4

2 files changed

Lines changed: 306 additions & 0 deletions

File tree

scripts/bench-tpuf-build.js

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/**
2+
* Build a hypvector parquet that matches the turbopuffer `vector-10m` workload
3+
* spec for apples-to-apples comparison. At 1M scale here (10 Cohere shards).
4+
*
5+
* Source: Cohere/wikipedia-2023-11-embed-multilingual-v3 (en), 1024-dim
6+
* Output: data/tpuf-bench-1m.parquet
7+
*
8+
* Run with extra heap; ~4GB of packed float32 bytes live in memory before
9+
* parquet write commits them:
10+
* node --max-old-space-size=16384 scripts/bench-tpuf-build.js
11+
*/
12+
import { createWriteStream, promises as fs } from 'node:fs'
13+
import { pipeline } from 'node:stream/promises'
14+
import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
15+
import { fileWriter } from 'hyparquet-writer'
16+
import { writeVectors } from '../src/writeVectors.js'
17+
18+
const SHARDS = parseInt(process.argv[2] ?? '10', 10)
19+
const SHARD_DIR = 'data/cohere-shards'
20+
const DEST = `data/tpuf-bench-${SHARDS}00k.parquet`
21+
const DIM = 1024
22+
const CLUSTERS = 256
23+
const URL_BASE = 'https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/resolve/main/en'
24+
25+
await fs.mkdir(SHARD_DIR, { recursive: true })
26+
27+
/**
28+
* Download a single shard if missing.
29+
* @param {number} i shard index
30+
* @returns {Promise<string>} local file path
31+
*/
32+
async function ensureShard(i) {
33+
const name = String(i).padStart(4, '0') + '.parquet'
34+
const dest = `${SHARD_DIR}/${name}`
35+
const existing = await fs.stat(dest).catch(() => undefined)
36+
if (existing) return dest
37+
const url = `${URL_BASE}/${name}`
38+
console.log(` downloading ${url}`)
39+
const res = await fetch(url)
40+
if (!res.ok) throw new Error(`fetch ${url} failed: ${res.status}`)
41+
if (!res.body) throw new Error(`no body for ${url}`)
42+
await pipeline(res.body, createWriteStream(dest))
43+
return dest
44+
}
45+
46+
console.log(`Downloading ${SHARDS} shards in parallel...`)
47+
const tDownload = performance.now()
48+
const paths = await Promise.all(Array.from({ length: SHARDS }, (_, i) => ensureShard(i)))
49+
console.log(` ${SHARDS} shards ready in ${((performance.now() - tDownload) / 1000).toFixed(1)}s`)
50+
51+
/**
52+
* Stream {id, vector} records from each shard in sequence.
53+
* @returns {AsyncGenerator<{id: string, vector: Float32Array}>}
54+
*/
55+
async function* iterRecords() {
56+
let yielded = 0
57+
for (const path of paths) {
58+
const file = await asyncBufferFromFile(path)
59+
const metadata = await parquetMetadataAsync(file)
60+
const rows = Number(metadata.num_rows)
61+
const BATCH = 1000
62+
for (let i = 0; i < rows; i += BATCH) {
63+
const end = Math.min(i + BATCH, rows)
64+
const objs = await parquetReadObjects({
65+
file, metadata, rowStart: i, rowEnd: end, columns: ['_id', 'emb'],
66+
})
67+
for (const r of objs) {
68+
// emb is { list: [{item: float}, ...] } in this parquet's nested schema
69+
const { emb } = r
70+
// hyparquet decodes list<float> as a plain Float32Array or number[] — both work for writeVectors.
71+
if (!emb || emb.length !== DIM) {
72+
throw new Error(`row ${yielded}: emb length ${emb?.length} != ${DIM}`)
73+
}
74+
yield { id: String(r._id), vector: emb }
75+
yielded += 1
76+
}
77+
}
78+
process.stdout.write(`\r read ${yielded.toLocaleString()} vectors `)
79+
}
80+
process.stdout.write('\n')
81+
}
82+
83+
console.log(`Writing ${DEST} (binary + clusters=${CLUSTERS})...`)
84+
const tWrite = performance.now()
85+
const writer = fileWriter(DEST)
86+
await writeVectors({
87+
writer,
88+
dimension: DIM,
89+
vectors: iterRecords(),
90+
metric: 'cosine',
91+
normalize: true,
92+
binary: true,
93+
clusters: CLUSTERS,
94+
})
95+
const stat = await fs.stat(DEST)
96+
const secs = (performance.now() - tWrite) / 1000
97+
console.log(` wrote ${(stat.size / 1e6).toFixed(1)} MB in ${secs.toFixed(1)}s`)

scripts/bench-tpuf.js

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
/**
2+
* Apples-to-apples benchmark against the turbopuffer `vector-10m` workload
3+
* spec, scaled to 1M Cohere Wikipedia embeddings.
4+
*
5+
* Dataset: Cohere/wikipedia-2023-11-embed-multilingual-v3 (1M × 1024-dim)
6+
* Metric: cosine
7+
* topK: 10
8+
* Queries: random unit vectors (NOT from dataset, matches tpuf)
9+
*
10+
* Cold mode: fresh AsyncBuffer per query (no metadata cache, no binary
11+
* prefetch). Mirrors tpuf's `disable_cache: true`.
12+
* Warm mode: single client; metadata parsed once and binary prefetched once
13+
* before any query is scored. Mirrors tpuf's warm-cache namespace.
14+
*
15+
* Recall@10 is also reported (tpuf reports latency only) so the config we
16+
* pick can't cheat by trading recall for speed.
17+
*
18+
* node --max-old-space-size=8192 scripts/bench-tpuf.js [url] [local-file] [n-cold] [n-warm]
19+
*
20+
* Defaults:
21+
* url https://s3.hyperparam.app/tpuf-bench/tpuf-bench-1m.parquet
22+
* local data/tpuf-bench-1000k.parquet
23+
* n-cold 30 (each cold query is a fresh CloudFront fetch — keep small)
24+
* n-warm 200 (per-config; sampled over the entire sweep)
25+
*/
26+
// Force IPv4: home network advertises IPv6 to CloudFront but drops packets.
27+
import dns from 'node:dns'
28+
const origLookup = dns.lookup
29+
// @ts-expect-error overload signature
30+
dns.lookup = (hostname, opts, cb) => {
31+
if (typeof opts === 'function') { cb = opts; opts = {} }
32+
return origLookup.call(dns, hostname, { ...opts ?? {}, family: 4 }, cb)
33+
}
34+
35+
import {
36+
asyncBufferFromFile, asyncBufferFromUrl, cachedAsyncBuffer, parquetMetadataAsync, parquetRead,
37+
} from 'hyparquet'
38+
import { defaultVectorColumn } from '../src/constants.js'
39+
import { prefetchBinary } from '../src/prefetch.js'
40+
import { searchVectors } from '../src/searchVectors.js'
41+
import { dotProduct, l2Normalize, parseKvMetadata, unpackFloat32 } from '../src/utils.js'
42+
43+
const URL = process.argv[2] ?? 'https://s3.hyperparam.app/tpuf-bench/tpuf-bench-1m.parquet'
44+
const LOCAL = process.argv[3] ?? 'data/tpuf-bench-1000k.parquet'
45+
const N_COLD = parseInt(process.argv[4] ?? '30', 10)
46+
const N_WARM = parseInt(process.argv[5] ?? '200', 10)
47+
const TOP_K = 10
48+
49+
// Configurations to sweep. Each is run in both cold and warm modes so we can
50+
// see the recall/latency frontier and pick a fair "apples-to-apples" pick.
51+
const CONFIGS = [
52+
{ name: 'rerank=10 probe=0.10', rerankFactor: 10, probe: 0.10 },
53+
{ name: 'rerank=10 probe=0.25', rerankFactor: 10, probe: 0.25 },
54+
{ name: 'rerank=100 probe=0.10', rerankFactor: 100, probe: 0.10 },
55+
{ name: 'rerank=100 probe=0.25', rerankFactor: 100, probe: 0.25 },
56+
]
57+
58+
console.log(`URL: ${URL}`)
59+
console.log(`Local: ${LOCAL}`)
60+
console.log(`Queries: cold=${N_COLD}/config, warm=${N_WARM}/config, topK=${TOP_K}`)
61+
62+
const local = await asyncBufferFromFile(LOCAL)
63+
const localMeta = await parquetMetadataAsync(local)
64+
const hv = parseKvMetadata(localMeta)
65+
console.log(`File: ${hv.count.toLocaleString()} × ${hv.dimension}-dim, metric=${hv.metric}, clusters=${hv.clusters}`)
66+
67+
// Random unit query vectors — matches tpuf's pseudorandom template.
68+
let lcg = 1
69+
function rand() {
70+
lcg = Math.imul(lcg, 1664525) + 1013904223 >>> 0
71+
return lcg / 0x100000000
72+
}
73+
function randomUnit(dim) {
74+
const v = new Float32Array(dim)
75+
for (let i = 0; i < dim; i += 1) v[i] = rand() * 2 - 1
76+
return l2Normalize(v)
77+
}
78+
const nQueries = Math.max(N_COLD, N_WARM)
79+
console.log(`\nGenerating ${nQueries} random ${hv.dimension}-dim unit queries...`)
80+
const queries = Array.from({ length: nQueries }, () => randomUnit(hv.dimension))
81+
82+
// Brute-force ground truth — heavy at this scale, so we limit to the cold
83+
// sample size. Warm reuses ground truth for queries < N_COLD; queries beyond
84+
// that count only toward latency, not recall (still statistically valid).
85+
const N_GT = Math.min(N_COLD, nQueries)
86+
console.log(`Computing brute-force ground truth for ${N_GT} queries (~50s)...`)
87+
const tGT = performance.now()
88+
/** @type {Array<{score: number, idx: number}[]>} */
89+
const groundTruthHeaps = Array.from({ length: N_GT }, () => [])
90+
let rowsScored = 0
91+
await parquetRead({
92+
file: local,
93+
metadata: localMeta,
94+
columns: [defaultVectorColumn],
95+
onChunk: ({ columnData, rowStart }) => {
96+
for (let r = 0; r < columnData.length; r += 1) {
97+
const vec = unpackFloat32(columnData[r])
98+
const rowIdx = rowStart + r
99+
for (let q = 0; q < N_GT; q += 1) {
100+
const score = dotProduct(queries[q], vec)
101+
const heap = groundTruthHeaps[q]
102+
if (heap.length < TOP_K) {
103+
heap.push({ score, idx: rowIdx })
104+
if (heap.length === TOP_K) heap.sort((a, b) => a.score - b.score)
105+
} else if (score > heap[0].score) {
106+
heap[0] = { score, idx: rowIdx }
107+
heap.sort((a, b) => a.score - b.score)
108+
}
109+
}
110+
}
111+
rowsScored += columnData.length
112+
if (rowsScored % 50000 < columnData.length) {
113+
process.stdout.write(`\r ${rowsScored.toLocaleString()} rows scored `)
114+
}
115+
},
116+
})
117+
process.stdout.write('\n')
118+
const groundTruth = groundTruthHeaps.map(h => new Set(h.map(e => e.idx)))
119+
console.log(` ground truth in ${((performance.now() - tGT) / 1000).toFixed(1)}s`)
120+
121+
/**
122+
* @param {{rowIndex: number}[]} results
123+
* @param {Set<number>} truth
124+
* @returns {number}
125+
*/
126+
function recallAt(results, truth) {
127+
let matches = 0
128+
for (const r of results) if (truth.has(r.rowIndex)) matches += 1
129+
return matches / truth.size
130+
}
131+
132+
/**
133+
* @param {number[]} arr sorted ascending
134+
* @param {number} q quantile in [0,1]
135+
* @returns {number}
136+
*/
137+
function pct(arr, q) { return arr[Math.min(arr.length - 1, Math.floor(q * arr.length))] }
138+
139+
/**
140+
* @param {string} label
141+
* @param {number[]} times
142+
* @param {number[]} recalls
143+
*/
144+
function printRow(label, times, recalls) {
145+
times.sort((a, b) => a - b)
146+
const avgRecall = recalls.length ? recalls.reduce((a, b) => a + b, 0) / recalls.length : NaN
147+
const recallStr = Number.isFinite(avgRecall) ? `${(avgRecall * 100).toFixed(1).padStart(5)}%` : ' n/a'
148+
console.log(
149+
` ${label.padEnd(28)} ` +
150+
`p50=${pct(times, 0.5).toFixed(0).padStart(5)}ms ` +
151+
`p90=${pct(times, 0.9).toFixed(0).padStart(5)}ms ` +
152+
`p99=${pct(times, 0.99).toFixed(0).padStart(5)}ms ` +
153+
`recall(n=${String(recalls.length).padStart(3)})=${recallStr}`
154+
)
155+
}
156+
157+
// ── COLD: fresh AsyncBuffer per query, per config ────────────────────────
158+
console.log('\n=== COLD (fresh CloudFront client per query) ===')
159+
for (const cfg of CONFIGS) {
160+
const times = []
161+
const recalls = []
162+
for (let q = 0; q < N_COLD; q += 1) {
163+
const raw = await asyncBufferFromUrl({ url: URL })
164+
const cached = cachedAsyncBuffer(raw)
165+
const t0 = performance.now()
166+
const hits = await searchVectors({
167+
source: cached, query: queries[q], topK: TOP_K,
168+
rerankFactor: cfg.rerankFactor, probe: cfg.probe,
169+
})
170+
times.push(performance.now() - t0)
171+
recalls.push(recallAt(hits, groundTruth[q]))
172+
process.stdout.write(`\r ${cfg.name}: ${q + 1}/${N_COLD} `)
173+
}
174+
process.stdout.write('\n')
175+
printRow(cfg.name, times, recalls)
176+
}
177+
178+
// ── WARM: single client + prefetched binary, sweep configs ──────────────
179+
console.log('\n=== WARM (single client + prefetched binary) ===')
180+
const warmRaw = await asyncBufferFromUrl({ url: URL })
181+
const warmCached = cachedAsyncBuffer(warmRaw)
182+
const warmMeta = await parquetMetadataAsync(warmCached)
183+
const tPrefetch = performance.now()
184+
const warmBin = await prefetchBinary({ source: warmCached, metadata: warmMeta })
185+
console.log(` setup: metadata + ${(warmBin.byteLength / 1e6).toFixed(0)} MB binary in ${((performance.now() - tPrefetch) / 1000).toFixed(2)}s`)
186+
for (const cfg of CONFIGS) {
187+
const times = []
188+
const recalls = []
189+
for (let q = 0; q < N_WARM; q += 1) {
190+
const t0 = performance.now()
191+
const hits = await searchVectors({
192+
source: warmCached, metadata: warmMeta, binary: warmBin,
193+
query: queries[q], topK: TOP_K,
194+
rerankFactor: cfg.rerankFactor, probe: cfg.probe,
195+
})
196+
times.push(performance.now() - t0)
197+
if (q < N_GT) recalls.push(recallAt(hits, groundTruth[q]))
198+
process.stdout.write(`\r ${cfg.name}: ${q + 1}/${N_WARM} `)
199+
}
200+
process.stdout.write('\n')
201+
printRow(cfg.name, times, recalls)
202+
}
203+
204+
console.log('\nReference (tpuf vector-1m, 768-dim, c2-standard-30 in GCP us-central1):')
205+
console.log(' warm: p50= 8ms p90= 10ms p99= 35ms')
206+
console.log(' cold: p50=343ms p90=444ms p99=554ms')
207+
console.log('Reference (tpuf vector-10m, 1024-dim — for scale, 10× our N):')
208+
console.log(' warm: p50= 14ms p90= 17ms p99= 27ms')
209+
console.log(' cold: p50=874ms p90=1214ms p99=1686ms')

0 commit comments

Comments
 (0)