|
| 1 | +/** |
| 2 | + * Validation harness for the still-open experiments in PLAN_AUTO.md. |
| 3 | + * |
| 4 | + * Subcommands: |
| 5 | + * |
| 6 | + * recall <src> [N] — clusters sweep reporting BOTH recall@10 and recall@100, |
| 7 | + * to discriminate the ~94% recall@10 ceiling on LLM logs. |
| 8 | + * smalln <src> — binary-column crossover: for small N, compare file size |
| 9 | + * and search latency/recall of binary-rerank vs exact scan. |
| 10 | + * scale <src> <Ns> — clusters sweep at one or more N subsets (comma-separated), |
| 11 | + * to confirm the sqrt(N)/2 latency optimum holds across sizes. |
| 12 | + * |
| 13 | + * All files are written under data/_vp_*.parquet and reused if present. |
| 14 | + * |
| 15 | + * Usage: |
| 16 | + * node scripts/validate-params.js recall data/llmlog.vectors.parquet |
| 17 | + * node scripts/validate-params.js smalln data/llmlog.vectors.parquet |
| 18 | + * node scripts/validate-params.js scale data/tpuf-bench-1000k.parquet 250000,500000,1000000 |
| 19 | + */ |
| 20 | +import { promises as fs } from 'node:fs' |
| 21 | +import { asyncBufferFromFile, cachedAsyncBuffer, parquetMetadataAsync } from 'hyparquet' |
| 22 | +import { fileWriter } from 'hyparquet-writer' |
| 23 | +import { readVectors } from '../src/readVectors.js' |
| 24 | +import { searchVectors } from '../src/searchVectors.js' |
| 25 | +import { parseKvMetadata } from '../src/utils.js' |
| 26 | +import { writeVectors } from '../src/writeVectors.js' |
| 27 | + |
| 28 | +/** @import { AsyncBuffer } from 'hyparquet' */ |
| 29 | + |
// CLI arguments: argv[2] is the subcommand, argv[3] the source parquet path and
// argv[4] an optional subcommand-specific argument (a count or a comma-separated list).
const [, , MODE, SRC, ARG] = process.argv
// Number of query vectors requested from the corpus for each benchmark run.
const QUERY_COUNT = 20

// The subcommand and the source path are both mandatory; print usage and stop otherwise.
if (!(MODE && SRC)) {
  console.error('Usage: node scripts/validate-params.js <recall|smalln|scale> <src> [arg]')
  process.exit(1)
}
| 39 | + |
/**
 * Collect records from a vectors parquet file into an in-memory array,
 * stopping as soon as `limit` records have been gathered.
 * A missing, zero or NaN `limit` is falsy and therefore means "no limit".
 * @param {string} src path of the vectors parquet file
 * @param {number} [limit] maximum number of records to collect
 * @returns {Promise<{ records: { id: string, vector: Float32Array }[], meta: any }>}
 */
async function loadRecords(src, limit) {
  const file = await asyncBufferFromFile(src)
  const metadata = await parquetMetadataAsync(file)
  // Key/value metadata parsed from the parquet footer; callers read
  // `dimension`, `metric` and `normalized` from it.
  const meta = parseKvMetadata(metadata)
  const records = []
  const rows = readVectors({ file, metadata, includeMetadata: false })
  for await (const row of rows) {
    records.push(row)
    if (!limit) continue
    if (records.length >= limit) break
  }
  return { records, meta }
}
| 57 | + |
/**
 * Sample query vectors at a fixed stride through the corpus.
 *
 * The stride is `floor(records.length / (count + 1))`, but never less than 1.
 * Records at index stride, 2*stride, 3*stride, ... are taken until either
 * `count` vectors were collected or the end of `records` is reached, so index 0
 * is never picked and fewer than `count` vectors may be returned.
 * @param {{ vector: Float32Array }[]} records
 * @param {number} count
 * @returns {Float32Array[]}
 */
function pickQueries(records, count) {
  const total = records.length
  const stride = Math.max(1, Math.floor(total / (count + 1)))
  const picked = []
  for (let idx = stride; idx < total && picked.length < count; idx += stride) {
    picked.push(records[idx].vector)
  }
  return picked
}
| 72 | + |
/**
 * Wrap an AsyncBuffer so that every `slice` call made through it is counted.
 *
 * The wrapper exposes the same `byteLength` and forwards `slice` to the
 * underlying buffer unchanged, while accumulating:
 *  - `fetches`: number of `slice` calls made through the wrapper
 *  - `bytes`: number of bytes requested by those calls
 *
 * Offsets are resolved the way `ArrayBuffer.prototype.slice` resolves them
 * (negative values count back from the end, everything is clamped to
 * `[0, byteLength]`), so a suffix read such as `slice(-8)` is charged 8 bytes
 * rather than `byteLength + 8`.
 * @param {AsyncBuffer} buf
 * @returns {AsyncBuffer & { bytes: number, fetches: number }}
 */
function instrument(buf) {
  const size = buf.byteLength
  const slice = buf.slice.bind(buf)
  // Turn a possibly negative / out-of-range offset into an absolute one.
  const resolve = (offset) => offset < 0 ? Math.max(0, size + offset) : Math.min(offset, size)
  const w = {
    byteLength: size, bytes: 0, fetches: 0,
    slice(s, e) {
      const from = resolve(s ?? 0)
      const to = resolve(e ?? size)
      w.bytes += Math.max(0, to - from)
      w.fetches += 1
      // Forward the caller's original arguments untouched.
      return slice(s, e)
    },
  }
  return w
}
| 85 | + |
/**
 * Arithmetic mean of a list of numbers.
 * An empty list yields NaN (0 / 0).
 * @param {number[]} a
 * @returns {number}
 */
function avg(a) {
  let total = 0
  for (let i = 0; i < a.length; i += 1) {
    total += a[i]
  }
  return total / a.length
}
| 87 | + |
/**
 * Search the file at `path` once per query and summarise the runs.
 *
 * Each query gets its own freshly opened, instrumented file buffer wrapped in a
 * new `cachedAsyncBuffer`, so byte and fetch counters start from zero for every
 * query. Only the `searchVectors` call itself is timed.
 * @param {string} path parquet file to search
 * @param {Float32Array[]} queries query vectors
 * @param {number} topK number of results requested per query
 * @param {object} extra additional options spread into the `searchVectors` call
 * @returns {Promise<{ ms: number, mb: number, fetches: number, tops: string[][] }>}
 *   mean latency in ms, mean megabytes requested, mean fetch count, and the
 *   stringified result ids for each query
 */
async function bench(path, queries, topK, extra) {
  const samples = []
  for (const query of queries) {
    const counted = instrument(await asyncBufferFromFile(path))
    const source = cachedAsyncBuffer(counted)
    const t0 = performance.now()
    const results = await searchVectors({ source, query, topK, ...extra })
    const elapsed = performance.now() - t0
    samples.push({
      elapsed,
      bytes: counted.bytes,
      fetches: counted.fetches,
      ids: results.map(hit => String(hit.id)),
    })
  }
  return {
    ms: avg(samples.map(s => s.elapsed)),
    mb: avg(samples.map(s => s.bytes)) / 1e6,
    fetches: avg(samples.map(s => s.fetches)),
    tops: samples.map(s => s.ids),
  }
}
| 108 | + |
/**
 * Fraction of reference ids that also appear in the candidate results,
 * looking only at the first `k` entries of each list and pooling all queries.
 *
 * The denominator is the total number of distinct reference ids considered,
 * so the result is NaN when there are no reference ids at all.
 * @param {string[][]} refTops reference id lists, one per query
 * @param {string[][]} tops candidate id lists, index-aligned with `refTops`
 * @param {number} k cut-off applied to both lists
 * @returns {number}
 */
function recallAt(refTops, tops, k) {
  let matched = 0
  let possible = 0
  for (const [q, refIds] of refTops.entries()) {
    const expected = new Set(refIds.slice(0, k))
    const found = tops[q].slice(0, k).filter(id => expected.has(id))
    matched += found.length
    possible += expected.size
  }
  return matched / possible
}
| 125 | + |
/**
 * Write a vectors parquet variant to `data/_vp_<tag>.parquet`, unless a file
 * already exists at that path, in which case it is reused as-is.
 *
 * Reuse is decided purely by existence: the contents of an existing file are
 * not checked against `records`, `clusters` or `binary`, so `tag` must encode
 * everything that distinguishes one variant from another.
 * @param {string} tag unique suffix used to build the output path
 * @param {{ id: string, vector: Float32Array }[]} records vectors to write
 * @param {any} meta source metadata; `dimension`, `metric` and `normalized` are read
 * @param {number} clusters cluster count passed to `writeVectors`
 * @param {boolean} binary `binary` flag passed to `writeVectors`
 * @returns {Promise<string>} path of the existing or newly written file
 */
async function writeVariant(tag, records, meta, clusters, binary) {
  const path = `data/_vp_${tag}.parquet`
  // Only "file does not exist" means we should write; any other stat failure
  // (permissions, bad path component, ...) is a real error and is surfaced.
  const existing = await fs.stat(path).catch(err => {
    if (err?.code === 'ENOENT') return undefined
    throw err
  })
  if (existing) return path
  const start = performance.now()
  await writeVectors({
    writer: fileWriter(path),
    dimension: meta.dimension,
    metric: meta.metric,
    normalize: meta.normalized,
    vectors: records,
    binary,
    clusters,
  })
  console.log(` wrote ${path} (clusters=${clusters}, binary=${binary}) in ${((performance.now() - start) / 1000).toFixed(1)}s`)
  return path
}
| 151 | + |
// --- recall@10 + recall@100 sweep ---------------------------------------
/**
 * `recall` subcommand: write one binary variant per cluster count
 * (0, √N/2, √N, 2√N), then report latency, fetch count, bytes read and
 * recall@10 / recall@100 for each against a top-100 reference run on the
 * unclustered file with `rerankFactor: 0`.
 *
 * The optional CLI argument caps how many records are loaded. It must be a
 * positive integer; anything else is rejected instead of silently being
 * treated as "no limit".
 * @returns {Promise<void>}
 */
async function runRecall() {
  let limit
  if (ARG) {
    limit = Number(ARG)
    if (!Number.isInteger(limit) || limit <= 0) {
      console.error(`recall: N must be a positive integer, got "${ARG}"`)
      process.exit(1)
    }
  }
  const { records, meta } = await loadRecords(SRC, limit)
  const N = records.length
  const sqrtN = Math.round(Math.sqrt(N))
  const queries = pickQueries(records, QUERY_COUNT)
  console.log(`recall: ${SRC} N=${N.toLocaleString()} dim=${meta.dimension} sqrtN=${sqrtN}`)

  const clusterValues = [0, Math.round(sqrtN / 2), sqrtN, 2 * sqrtN]
  const base = `${SRC.replace(/\.parquet$/, '').split('/').pop()}_N${N}`
  const paths = {}
  for (const c of clusterValues) paths[c] = await writeVariant(`${base}_c${c}`, records, meta, c, true)

  // Reference: exact full scan, top-100.
  console.log('Reference: exact top-100 full scan...')
  const ref = await bench(paths[0], queries, 100, { rerankFactor: 0 })

  console.log('\n=== clusters sweep, recall@10 vs recall@100 (probe/rerank default) ===')
  console.log(`${'clusters'.padStart(10)} ${'ms'.padStart(7)} ${'fetches'.padStart(8)} ${'MB read'.padStart(9)} ${'r@10'.padStart(7)} ${'r@100'.padStart(7)}`)
  console.log('-'.repeat(58))
  for (const c of clusterValues) {
    // The unclustered file is searched with an explicit rerankFactor; clustered
    // files use searchVectors' own defaults.
    const opts = c === 0 ? { rerankFactor: 10 } : {}
    const r = await bench(paths[c], queries, 100, opts)
    const r10 = recallAt(ref.tops, r.tops, 10)
    const r100 = recallAt(ref.tops, r.tops, 100)
    console.log(`${String(c).padStart(10)} ${r.ms.toFixed(1).padStart(7)} ${r.fetches.toFixed(0).padStart(8)} ${r.mb.toFixed(2).padStart(9)} ${(r10 * 100).toFixed(1).padStart(6)}% ${(r100 * 100).toFixed(1).padStart(6)}%`)
  }
}
| 181 | + |
// --- small-N binary crossover -------------------------------------------
/**
 * `smalln` subcommand: for each corpus size N, write a no-binary and a binary
 * file (both without clusters), then compare file size, search latency and
 * recall@10 of the binary file (default search options) against an exact
 * top-10 reference on the no-binary file.
 *
 * Sizes come from the optional comma-separated CLI argument and must be
 * positive integers. A size larger than the number of records available in
 * the source is skipped, so that no row (or cached file) is labelled with an
 * N it does not actually contain.
 * @returns {Promise<void>}
 */
async function runSmallN() {
  const sizes = (ARG ?? '500,1000,2000,5000,10000,20000').split(',').map(Number)
  if (sizes.some(n => !Number.isInteger(n) || n <= 0)) {
    console.error('smalln needs comma-separated positive integer sizes')
    process.exit(1)
  }
  const maxN = Math.max(...sizes)
  const { records: all, meta } = await loadRecords(SRC, maxN)
  console.log(`smalln: ${SRC} dim=${meta.dimension}, sizes=${sizes.join(',')}`)
  console.log(`\n${'N'.padStart(7)} ${'noBin MB'.padStart(9)} ${'bin MB'.padStart(8)} ${'+%'.padStart(6)} ${'exact ms'.padStart(9)} ${'rerank ms'.padStart(10)} ${'speedup'.padStart(8)} ${'recall'.padStart(7)}`)
  console.log('-'.repeat(74))
  for (const N of sizes) {
    if (N > all.length) {
      console.warn(`  skipping N=${N}: source only has ${all.length} records`)
      continue
    }
    const records = all.slice(0, N)
    const queries = pickQueries(records, Math.min(QUERY_COUNT, N))
    const tag = `${SRC.replace(/\.parquet$/, '').split('/').pop()}_sn${N}`
    // No-binary file (exact scan only) and binary file (no clusters, rerank path).
    const exactPath = await writeVariant(`${tag}_nobin`, records, meta, 0, false)
    const binPath = await writeVariant(`${tag}_bin`, records, meta, 0, true)
    const exactSize = (await fs.stat(exactPath)).size
    const binSize = (await fs.stat(binPath)).size
    // Reference = exact top-10 on the no-binary file.
    const ref = await bench(exactPath, queries, 10, { rerankFactor: 0 })
    const rerank = await bench(binPath, queries, 10, {})
    const recall = recallAt(ref.tops, rerank.tops, 10)
    // Relative size overhead of the binary file, in percent.
    const pct = (binSize - exactSize) / exactSize * 100
    const speedup = ref.ms / rerank.ms
    console.log(`${String(N).padStart(7)} ${(exactSize / 1e6).toFixed(2).padStart(9)} ${(binSize / 1e6).toFixed(2).padStart(8)} ${pct.toFixed(1).padStart(5)}% ${ref.ms.toFixed(2).padStart(9)} ${rerank.ms.toFixed(2).padStart(10)} ${speedup.toFixed(2).padStart(7)}x ${(recall * 100).toFixed(1).padStart(6)}%`)
  }
}
| 208 | + |
// --- clusters sweep at scale --------------------------------------------
/**
 * `scale` subcommand: for each requested corpus size N, write binary files
 * with √N/2, √N and 2√N clusters plus an unclustered reference file, then
 * report latency, fetch count, bytes read and recall@10 for each cluster count
 * against a top-10 reference run on the unclustered file with `rerankFactor: 0`.
 *
 * The CLI argument is a mandatory comma-separated list of positive integers.
 * A size larger than the number of records available in the source is
 * skipped, so that no table (or cached file) is labelled with an N it does not
 * actually contain.
 * @returns {Promise<void>}
 */
async function runScale() {
  const Ns = (ARG ?? '').split(',').filter(Boolean).map(Number)
  if (!Ns.length) { console.error('scale needs comma-separated N list'); process.exit(1) }
  if (Ns.some(n => !Number.isInteger(n) || n <= 0)) {
    console.error(`scale: every N must be a positive integer, got "${ARG}"`)
    process.exit(1)
  }
  const maxN = Math.max(...Ns)
  console.log(`scale: loading up to ${maxN.toLocaleString()} from ${SRC}...`)
  const { records: all, meta } = await loadRecords(SRC, maxN)
  console.log(` loaded ${all.length.toLocaleString()} × ${meta.dimension}-dim`)
  for (const N of Ns) {
    if (N > all.length) {
      console.warn(`  skipping N=${N}: source only has ${all.length} records`)
      continue
    }
    const records = all.slice(0, N)
    const sqrtN = Math.round(Math.sqrt(N))
    const queries = pickQueries(records, QUERY_COUNT)
    const clusterValues = [Math.round(sqrtN / 2), sqrtN, 2 * sqrtN]
    const base = `${SRC.replace(/\.parquet$/, '').split('/').pop()}_sc${N}`
    console.log(`\n=== N=${N.toLocaleString()} (sqrtN=${sqrtN}) ===`)
    const paths = {}
    // c=0 reference file (binary, no clusters) for exact top-10.
    const refPath = await writeVariant(`${base}_c0`, records, meta, 0, true)
    for (const c of clusterValues) paths[c] = await writeVariant(`${base}_c${c}`, records, meta, c, true)
    const ref = await bench(refPath, queries, 10, { rerankFactor: 0 })
    console.log(`${'clusters'.padStart(10)} ${'ms'.padStart(7)} ${'fetches'.padStart(8)} ${'MB read'.padStart(9)} ${'recall'.padStart(8)}`)
    console.log('-'.repeat(50))
    for (const c of clusterValues) {
      const r = await bench(paths[c], queries, 10, {})
      const rec = recallAt(ref.tops, r.tops, 10)
      const label = c === Math.round(sqrtN / 2) ? `${c} (√N/2)` : c === sqrtN ? `${c} (√N)` : `${c} (2√N)`
      console.log(`${label.padStart(10)} ${r.ms.toFixed(1).padStart(7)} ${r.fetches.toFixed(0).padStart(8)} ${r.mb.toFixed(2).padStart(9)} ${(rec * 100).toFixed(1).padStart(7)}%`)
    }
  }
}
| 239 | + |
// Run the subcommand named on the command line; any other value is an error.
switch (MODE) {
  case 'recall':
    await runRecall()
    break
  case 'smalln':
    await runSmallN()
    break
  case 'scale':
    await runScale()
    break
  default:
    console.error(`unknown mode: ${MODE}`)
    process.exit(1)
}
0 commit comments