Skip to content

Commit acd3b41

Browse files
committed
Add bench-zilliz.js: filter-recall comparison
Builds a random-unit-vector corpus with independent random tenant labels — matches the real "metadata filter on semantic embeddings" workload (e.g. agent_id on logs) the Zilliz benchmark exposes. Compares three search strategies for "top-K within tenant X": A) unfiltered whole-corpus B) search whole, post-filter to target tenant (with overfetch sweep) C) search just the target tenant's parquet file (shard-as-filter) Result at 150k vectors / 128 tenants / 30 queries / top-100: A) unfiltered recall= 1.8% 335 ms B) post-filter 1x recall= 1.8% 332 ms B) post-filter 10x recall= 9.3% 2283 ms B) post-filter 30x recall= 25.3% 4904 ms B) post-filter 100x recall= 81.9% 5768 ms C) shard-as-filter recall=100.0% 2.5 ms Shard-as-filter beats post-filter by 2300x on latency and 18pp on recall simultaneously, using the same binary+rerank engine on both sides — the difference is purely architectural (pre-filter via file sharding vs post-filter on a single mixed index). Also exercises the new multi-source searchVectors API (mode C') — results bit-identical to single-source.
1 parent 2172dab commit acd3b41

1 file changed

Lines changed: 193 additions & 0 deletions

File tree

scripts/bench-zilliz.js

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
/**
2+
* Approximate the Zilliz "filtered recall" experiment against hypvector.
3+
*
4+
* Mirror the real workload: vectors are semantically distributed (random
5+
* unit vectors across the embedding sphere), and tenant_id is a *metadata
6+
* filter*, independent of vector position. The interesting query is
7+
* "top-K similar to Q within tenant X", and the question is whether each
8+
* strategy actually recovers the true within-tenant top-K.
9+
*
10+
* A) unfiltered — search the whole corpus, ignore tenant. The
11+
* result is mostly cross-tenant noise; recall
12+
* against the within-tenant truth is ~fraction-of-
13+
* corpus-in-target-tenant.
14+
* B) post-filter — search the whole corpus with an inflated topK,
15+
* then reject rows from the wrong tenant. This is
16+
* the failure mode the Zilliz benchmark exposes:
17+
* the global top-K doesn't contain enough target-
18+
* tenant rows for the filter to recover them.
19+
* C) shard-as-filter — search just the target tenant's parquet file.
20+
* hypvector's native pre-filter via file sharding.
21+
*
22+
* Usage:
23+
* node scripts/bench-zilliz.js [vectors] [tenants] [queries] [dim]
24+
*
25+
* Defaults: 50000 / 16 / 50 / 384
26+
*/
27+
import { promises as fs } from 'node:fs'
28+
import { asyncBufferFromFile, parquetMetadataAsync } from 'hyparquet'
29+
import { fileWriter } from 'hyparquet-writer'
30+
import { prefetchBinary } from '../src/prefetch.js'
31+
import { searchVectors } from '../src/searchVectors.js'
32+
import { dotProduct, l2Normalize } from '../src/utils.js'
33+
import { writeVectors } from '../src/writeVectors.js'
34+
35+
/**
 * Read one positional CLI argument as a positive integer.
 *
 * An omitted argument yields `fallback`. A supplied argument is parsed with
 * `parseInt(raw, 10)`, exactly as before, but a result that is not an
 * integer >= 1 is rejected here instead of surfacing later as an unrelated
 * failure (invalid array length, indexing an undefined tenant, NaN results).
 *
 * @param {string | undefined} raw - the argv entry, or undefined when omitted
 * @param {number} fallback - value to use when the argument is omitted
 * @param {string} name - argument name shown in the error message
 * @returns {number}
 * @throws {RangeError} when `raw` is present but does not parse to an integer >= 1
 */
function positiveIntArg(raw, fallback, name) {
  if (raw === undefined) return fallback
  const value = parseInt(raw, 10)
  if (!Number.isInteger(value) || value < 1) {
    throw new RangeError(`${name} must be a positive integer, got ${JSON.stringify(raw)}`)
  }
  return value
}

// Positional arguments: [vectors] [tenants] [queries] [dim].
const TOTAL = positiveIntArg(process.argv[2], 50000, 'vectors')
const TENANTS = positiveIntArg(process.argv[3], 16, 'tenants')
const QUERIES = positiveIntArg(process.argv[4], 50, 'queries')
const DIM = positiveIntArg(process.argv[5], 384, 'dim')
const TOP_K = 100
// Sweep post-filter overfetch factors so we can see whether mode B catches up
// once we ask for enough global candidates. 1× = no overfetch, just topK.
const POST_FILTER_OVERFETCHES = [1, 10, 30, 100]
const WHOLE_FILE = 'data/zilliz_whole.parquet'
const TENANT_DIR = 'data/zilliz_tenants'
// Expected rows per tenant; only used in the banner below (labels are random,
// so actual tenant sizes vary around this).
const PER_TENANT = Math.ceil(TOTAL / TENANTS)

console.log(`Config: ${TOTAL.toLocaleString()} vectors × ${DIM}-dim, ${TENANTS} tenants (~${PER_TENANT.toLocaleString()} each), ${QUERIES} queries, top-${TOP_K}`)
48+
49+
// Linear congruential generator with a fixed seed, so every run of the script
// draws the same sequence of values.
let lcg = 1

/**
 * Advance the generator by one step and return the new 32-bit state scaled
 * into [0, 1).
 *
 * @returns {number}
 */
function rand() {
  // Math.imul keeps the multiply in 32-bit integer arithmetic; `>>> 0` then
  // reinterprets the sum as an unsigned 32-bit value.
  const product = Math.imul(lcg, 1664525)
  lcg = (product + 1013904223) >>> 0
  return lcg / 2 ** 32
}
55+
/**
 * Build a pseudo-random direction: draw each coordinate uniformly from
 * [-1, 1) with `rand()`, then hand the vector to `l2Normalize`.
 *
 * Sampling the cube and normalizing is not strictly uniform on the sphere,
 * but in dim=384 the bias is tiny (concentration of measure) and it's ~10×
 * faster than Box-Muller, which matters at 1M scale.
 *
 * @param {number} dim - number of coordinates to draw
 * @returns {Float32Array} whatever `l2Normalize` returns for the sampled coordinates
 */
function unitVec(dim) {
  // The mapper runs once per index in ascending order, so `rand()` is consumed
  // exactly `dim` times, in coordinate order.
  const coords = Float32Array.from({ length: dim }, () => rand() * 2 - 1)
  return l2Normalize(coords)
}
63+
64+
console.log(`Generating ${TOTAL.toLocaleString()} random unit vectors with independent tenant labels...`)
// Each row gets a fresh vector and then a tenant label drawn separately from
// the same generator, so the label says nothing about where the vector sits.
// That mirrors the real-world "metadata filter on semantic embeddings"
// workload (e.g., agent_id on logs).
const genStart = performance.now()
/** @type {Float32Array[]} */
const allVecs = new Array(TOTAL)
/** @type {Int32Array} */
const tenantOf = new Int32Array(TOTAL)
for (let row = 0; row < TOTAL; row += 1) {
  // Vector first, label second: this draw order is part of the reproducible sequence.
  allVecs[row] = unitVec(DIM)
  tenantOf[row] = Math.floor(rand() * TENANTS)
}
const genSeconds = (performance.now() - genStart) / 1000
console.log(` generated in ${genSeconds.toFixed(1)}s`)
77+
78+
// Row indexes grouped by tenant label, in ascending row order. Used for the
// brute-force ground truth and for writing the per-tenant files.
/** @type {number[][]} */
const tenantRows = []
for (let t = 0; t < TENANTS; t += 1) tenantRows.push([])
tenantOf.forEach((tenant, row) => {
  tenantRows[tenant].push(row)
})
82+
83+
console.log('\nWriting whole-corpus parquet...')
{
  // The whole-corpus file is written without clustering: tenant_id is
  // independent of vector position, so k-means partitions would be orthogonal
  // to tenants and probe<1 would silently cap recall. Plain binary+rerank
  // keeps the comparison with the per-tenant files on equal footing.

  // Lazily yields every row in index order; the id encodes tenant and row.
  const everyRow = function* () {
    for (let row = 0; row < TOTAL; row += 1) {
      yield { id: `t${tenantOf[row]}-r${row}`, vector: allVecs[row] }
    }
  }
  await writeVectors({
    writer: fileWriter(WHOLE_FILE),
    dimension: DIM,
    normalize: false,
    binary: true,
    vectors: everyRow(),
  })
  const { size } = await fs.stat(WHOLE_FILE)
  console.log(` ${WHOLE_FILE}: ${(size / 1e6).toFixed(1)} MB`)
}
99+
100+
console.log('Writing per-tenant parquets...')
await fs.mkdir(TENANT_DIR, { recursive: true })
// Paths of the per-tenant files, indexed by tenant number.
const tenantFiles = []
for (const [tenant, members] of tenantRows.entries()) {
  const shardPath = `${TENANT_DIR}/t${tenant}.parquet`
  tenantFiles.push(shardPath)
  // Same id scheme and writer options as the whole-corpus file, restricted to
  // this tenant's rows.
  const shardRows = function* () {
    for (const row of members) yield { id: `t${tenant}-r${row}`, vector: allVecs[row] }
  }
  await writeVectors({
    writer: fileWriter(shardPath),
    dimension: DIM,
    normalize: false,
    binary: true,
    vectors: shardRows(),
  })
}
console.log(` ${tenantFiles.length} files in ${TENANT_DIR}/`)
116+
117+
/**
 * Exact reference answer for one query: score every row of tenant `t` with
 * `dotProduct` against the query and keep the ids of the TOP_K best scores.
 *
 * @param {Float32Array} query - query vector
 * @param {number} t - tenant whose rows are searched
 * @returns {Set<string>} ids (`t<tenant>-r<row>`) of the highest-scoring rows,
 *   at most TOP_K of them
 */
function trueTopKInTenant(query, t) {
  const ranked = []
  for (const row of tenantRows[t]) {
    ranked.push({ id: `t${t}-r${row}`, score: dotProduct(query, allVecs[row]) })
  }
  // Highest score first.
  ranked.sort((left, right) => right.score - left.score)
  const winners = new Set()
  for (const { id } of ranked.slice(0, TOP_K)) winners.add(id)
  return winners
}
124+
125+
// Build the query set. Each query targets a randomly chosen tenant and is a
// randomly chosen row of that tenant, perturbed slightly and re-normalized so
// the query isn't a perfect self-hit.
/** @type {{ tenant: number, vec: Float32Array }[]} */
const queries = []
// Tenant labels are drawn at random, so a small corpus can leave some tenants
// with no rows. Fail with a clear message if there is nothing to sample from.
if (QUERIES > 0 && tenantRows.every(rows => rows.length === 0)) {
  throw new Error('Cannot build queries: no tenant contains any vectors')
}
for (let q = 0; q < QUERIES; q += 1) {
  let t = Math.floor(rand() * TENANTS)
  // Redraw when the chosen tenant is empty; indexing an empty row list would
  // yield `undefined` and crash below. When the first draw is non-empty the
  // random sequence is consumed exactly as before.
  while (tenantRows[t].length === 0) t = Math.floor(rand() * TENANTS)
  const rows = tenantRows[t]
  const pickRow = rows[Math.floor(rand() * rows.length)]
  const base = allVecs[pickRow]
  // Small uniform perturbation: each coordinate moves by at most ±0.05.
  const v = new Float32Array(DIM)
  for (let d = 0; d < DIM; d += 1) v[d] = base[d] + 0.05 * (rand() * 2 - 1)
  queries.push({ tenant: t, vec: l2Normalize(v) })
}
138+
139+
console.log('\nComputing ground truth (brute-force within-tenant top-K)...')
// One exact answer set per query, index-aligned with `queries`.
const groundTruth = []
for (const { tenant, vec } of queries) groundTruth.push(trueTopKInTenant(vec, tenant))
141+
142+
// One-time setup, done before any query runs: open each parquet file, parse
// its metadata, and call prefetchBinary on it.
const wholeBuf = await asyncBufferFromFile(WHOLE_FILE)
const wholeMeta = await parquetMetadataAsync(wholeBuf)
const wholeBin = await prefetchBinary({ source: wholeBuf, metadata: wholeMeta })

// Same three steps for the per-tenant files. Each step is issued for all
// tenants at once; the three arrays are index-aligned by tenant number.
const tenantBufs = await Promise.all(tenantFiles.map(shardPath => asyncBufferFromFile(shardPath)))
const tenantMetas = await Promise.all(tenantBufs.map(buf => parquetMetadataAsync(buf)))
const tenantBins = await Promise.all(
  tenantBufs.map((source, shard) => prefetchBinary({ source, metadata: tenantMetas[shard] }))
)
150+
151+
/**
 * Run every prepared query through one search strategy, score it against the
 * brute-force ground truth, and print a one-line summary.
 *
 * Queries run sequentially. Recall for a query is the share of its
 * ground-truth ids that appear among the returned hits. The reported time is
 * the wall-clock duration of the whole loop divided by the number of queries.
 *
 * @param {string} label - name of the strategy, shown in the summary line
 * @param {(query: { tenant: number, vec: Float32Array }) => Promise<Array<{ id: string }>>} queryFn
 * @returns {Promise<{ recall: number, ms: number }>} mean recall and mean milliseconds per query
 */
async function runMode(label, queryFn) {
  let recallSum = 0
  const startedAt = performance.now()
  for (const [index, query] of queries.entries()) {
    const returned = await queryFn(query)
    const expected = groundTruth[index]
    let found = 0
    for (const { id } of returned) {
      // Ids are compared as strings, whatever type the search returns.
      if (expected.has(String(id))) found += 1
    }
    recallSum += found / expected.size
  }
  const elapsed = performance.now() - startedAt
  const ms = elapsed / queries.length
  const recall = recallSum / queries.length
  const recallText = (recall * 100).toFixed(1).padStart(5)
  const msText = ms.toFixed(1).padStart(6)
  console.log(`${label.padEnd(28)} recall@${TOP_K}=${recallText}% ${msText} ms/query`)
  return { recall, ms }
}
171+
172+
console.log('\n=== Results ===')

// Whole-corpus search shared by modes A and B; only the requested topK differs.
const searchWhole = (query, topK) =>
  searchVectors({ source: wholeBuf, metadata: wholeMeta, binary: wholeBin, query, topK })

// A) Search everything and ignore the tenant entirely.
await runMode('A) unfiltered whole-corpus', async ({ vec }) => searchWhole(vec, TOP_K))

// B) Search everything with an inflated topK, then keep only the target
// tenant's hits (recognised by their id prefix), capped at TOP_K.
for (const f of POST_FILTER_OVERFETCHES) {
  const label = `B) post-filter (overfetch ${f.toString().padStart(3)}×)`
  await runMode(label, async ({ tenant, vec }) => {
    const candidates = await searchWhole(vec, TOP_K * f)
    const wanted = `t${tenant}-r`
    const sameTenant = candidates.filter(hit => String(hit.id).startsWith(wanted))
    return sameTenant.slice(0, TOP_K)
  })
}

// C) Search only the target tenant's own file.
await runMode('C) shard-as-filter (1 file)', async ({ tenant, vec }) => {
  const source = tenantBufs[tenant]
  const metadata = tenantMetas[tenant]
  const binary = tenantBins[tenant]
  return searchVectors({ source, metadata, binary, query: vec, topK: TOP_K })
})

// C') Same single file, passed through the array (multi-source) form of the
// searchVectors arguments.
await runMode("C') shard via array (1 file)", async ({ tenant, vec }) => {
  const source = [tenantBufs[tenant]]
  const metadata = [tenantMetas[tenant]]
  const binary = [tenantBins[tenant]]
  return searchVectors({ source, metadata, binary, query: vec, topK: TOP_K })
})

0 commit comments

Comments
 (0)