hyparam
diff --git a/‎README.md‎
Lines changed: 11 additions & 2 deletions b/‎README.md‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎bin/inspect.js‎
Lines changed: 5 additions & 0 deletions b/‎bin/inspect.js‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎scripts/ablation.js‎
Lines changed: 4 additions & 2 deletions b/‎scripts/ablation.js‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎src/constants.js‎
Lines changed: 10 additions & 0 deletions b/‎src/constants.js‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎src/index.d.ts‎
Lines changed: 1 addition & 0 deletions b/‎src/index.d.ts‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/pq.js‎
Lines changed: 243 additions & 0 deletions b/‎src/pq.js‎
Lines changed: 243 additions & 0 deletions
@@ -66,6 +66,7 @@ await writeVectors({
   normalize: true,    // L2-normalize on write; lets search skip sqrt for cosine
   binary: true,       // also write 1-bit-per-dim sign column for binary+rerank search
   clusters: 128,      // k-means clusters for phase-1 pruning (implies binary: true)
+  pq: true,           // optional product-quantized codes for approximate scoring before rerank
   vectors: myEmbedder(), // any sync or async iterable of { id, vector }
 })
 ```
@@ -155,6 +156,7 @@ const results = await searchVectors({
   source: 'https://example.com/vectors.parquet', // URL, local file path, or an open AsyncBuffer
   query: queryVec,    // Float32Array of length `dimension`
   topK: 10,
+  algorithm: 'auto', // 'auto' | 'exact' | 'binary' | 'pq'
   rerankFactor: 10,   // candidate pool = topK * rerankFactor (default 10). Set to 0 to force exact full scan.
   probe: 0.25,        // fraction of clusters to scan in phase 1 (default 0.25). Set to 1 to scan all clusters; pass an integer > 1 for an absolute count.
 })
@@ -166,11 +168,11 @@ const results = await searchVectors({
 
 ### How it works
 
-Three columns: `id` (STRING), `vector` (`FIXED_LEN_BYTE_ARRAY(4 × dim)`, raw float32 bytes, `UNCOMPRESSED`), and — when `binary: true` — `vector_bin` (`FIXED_LEN_BYTE_ARRAY(dim/8)`, 1 bit per dim).
+Core columns: `id` (STRING) and `vector` (`FIXED_LEN_BYTE_ARRAY(4 × dim)`, raw float32 bytes, `UNCOMPRESSED`). Optional ANN columns: `vector_bin` (`FIXED_LEN_BYTE_ARRAY(dim/8)`, 1 bit per dim) when `binary: true`, and `vector_pq` (`FIXED_LEN_BYTE_ARRAY(pqSegments)`, one centroid index per PQ segment) when `pq: true`.
 
 **Exact search path** (no binary column, or `rerankFactor: 0`): single pass over the float32 column via `parquetRead({ onChunk })`. Each row-group's decoded `Uint8Array[]` shares a backing buffer, so we view it as one aligned `Float32Array` and stride by `dim` — zero per-row allocations.
 
-**Binary + cluster + rerank path** (default when `binary: true`):
+**Binary + cluster + rerank path** (default when `binary: true` and no PQ column is present):
 
 1. **Build-time clustering** (when `clusters > 0`): k-means on the 1-bit codes using Hamming distance and bit-majority voting. Cluster ids are then renumbered via a greedy nearest-neighbor walk so that adjacent ids = similar centroids — this makes the top-N nearest clusters at query time tend to land in fewer contiguous row ranges. Rows are sorted by the new cluster id. Centroids and per-cluster row counts go into KV metadata.
 2. **Phase 1 — cluster pruning**: rank clusters by Hamming(query, centroid), pick the top `probe` fraction, and Hamming-scan only those clusters' row ranges. With 32 KB pages and `useOffsetIndex`, hyparquet fetches only the pages covering each cluster's rows.
@@ -179,6 +181,8 @@ Three columns: `id` (STRING), `vector` (`FIXED_LEN_BYTE_ARRAY(4 × dim)`, raw fl
 
 A `cachedAsyncBuffer` deduplicates footer / offset-index byte ranges across all the parallel `parquetRead` calls.
 
+**PQ + rerank path** (`algorithm: 'pq'`, or `auto` when a file has PQ but no binary column): scan compact `vector_pq` codes over the selected cluster ranges, approximate-score candidates with lookup tables built from the query and stored PQ codebooks, then fetch full float32 vectors only for the candidate pool and exact-rerank as above. When `clusters > 0`, PQ uses the same contiguous cluster row ranges as the binary path.
+
 For pre-normalized vectors with `metric: 'cosine'`, the search normalizes the query once and scores via dot product to skip the per-candidate sqrt loop.
 
 ### File layout
@@ -188,6 +192,7 @@ For pre-normalized vectors with `metric: 'cosine'`, the search normalizes the qu
 | `id` | `STRING` (UTF8) | variable | always |
 | `vector` | `FIXED_LEN_BYTE_ARRAY(4 × dim)` | `4 × dim` | always |
 | `vector_bin` | `FIXED_LEN_BYTE_ARRAY(dim/8)` | `dim/8` | when `binary: true` |
+| `vector_pq` | `FIXED_LEN_BYTE_ARRAY(pqSegments)` | `pqSegments` | when `pq: true` |
 
 Key-value metadata:
 
@@ -198,10 +203,14 @@ Key-value metadata:
 | `hypvector.metric` | `cosine` \| `dot` \| `euclidean` |
 | `hypvector.normalized` | `true` if vectors were L2-normalized on write |
 | `hypvector.binary` | `true` if the `vector_bin` column is present |
+| `hypvector.pq` | `true` if the `vector_pq` column is present |
 | `hypvector.count` | number of vectors |
 | `hypvector.clusters` | number of k-means clusters (0 if not clustered) |
 | `hypvector.centroids` | base64-encoded centroid binary codes (`clusters × dim/8` bytes); present when `clusters > 0` |
 | `hypvector.clusterCounts` | base64-encoded `Uint32Array` of per-cluster row counts; present when `clusters > 0` |
+| `hypvector.pq.segments` | number of PQ sub-vectors / bytes per code; present when `pq: true` |
+| `hypvector.pq.centroids` | centroids per PQ sub-vector; present when `pq: true` |
+| `hypvector.pq.codebooks` | base64-encoded `Float32Array` codebooks (`pq.centroids × dim` floats); present when `pq: true` |
 
 ### CLI
 
 
@@ -22,6 +22,11 @@ export async function inspect({ path }) {
   console.log(`Metric: ${meta.metric}`)
   console.log(`Normalized: ${meta.normalized}`)
   console.log(`Binary column: ${meta.hasBinary}`)
+  console.log(`PQ column: ${meta.hasPq}`)
+  if (meta.hasPq) {
+    console.log(`PQ segments: ${meta.pqSegments}`)
+    console.log(`PQ centroids: ${meta.pqCentroids}`)
+  }
   console.log(`Row groups: ${metadata.row_groups.length.toLocaleString()}`)
   console.log(`Raw float32 size: ${rawSize.toLocaleString()} bytes`)
   console.log(`Overhead: ${(ratio * 100).toFixed(1)}% of raw`)
 
@@ -6,8 +6,8 @@
  * Variants:
  *   A) base               vector + id only  (search must use exact full scan)
  *   B) +binary            adds vector_bin column (binary phase 1 + per-cand phase 2 reads)
- *   C) +cluster           B plus k-means clustering + cluster_id col + centroids/counts KV
- *   D) +int8              C plus vector_i8 column (int8 cascade between phases 1 and 2)
+ *   C) +cluster           B plus k-means clustering + centroids/counts KV
+ *   D) +PQ                C plus vector_pq column + PQ codebooks
  *
  * Page size is held at 32 KB for B-D so we isolate the feature contribution
  * from the page-size knob.
@@ -41,6 +41,7 @@ const variants = [
   { name: 'A_base', label: 'A) base (vec only)', opts: { binary: false } },
   { name: 'B_binary', label: 'B) +binary', opts: { binary: true } },
   { name: 'C_cluster', label: 'C) +cluster', opts: { binary: true, clusters: 128 } },
+  { name: 'D_pq', label: 'D) +cluster+PQ', opts: { binary: true, clusters: 128, pq: true }, search: { algorithm: 'pq' } },
 ]
 
 for (const v of variants) {
@@ -130,6 +131,7 @@ for (const v of variants) {
   const opts = {}
   // For base file, rerankFactor=0 forces exact path. Others use default rerank/probe plus any per-variant `search` overrides.
   if (v.name === 'A_base') opts.rerankFactor = 0
+  Object.assign(opts, v.search)
   const r = await bench(v.path, opts)
   let hits = 0, total = 0
   for (let q = 0; q < ref.tops.length; q += 1) {
 
@@ -13,6 +13,9 @@ export const defaultVectorColumn = 'vector'
 // Default name of the binary (sign-bit) rerank column
 export const defaultBinaryColumn = 'vector_bin'
 
+// Default name of the product-quantized vector code column
+export const defaultPqColumn = 'vector_pq'
+
 // Default name of the id column
 export const defaultIdColumn = 'id'
 
@@ -29,3 +32,10 @@ export const defaultClusterIterations = 6
 // Default fraction of clusters scanned in phase 1 at query time when the
 // file has cluster metadata. Lower = faster but lower recall.
 export const defaultClusterProbeFraction = 0.25
+
+// Default product quantization (PQ) settings. The initial PQ path stores one
+// code byte per segment, holding a centroid index in [0, pqCentroids).
+export const defaultPqSegments = 32
+export const defaultPqCentroids = 16
+export const defaultPqIterations = 8
+export const defaultPqSampleSize = 4096
@@ -12,6 +12,7 @@ export type {
   HypVectorMetadata,
   PrefetchBinaryOptions,
   ReadVectorsOptions,
+  SearchAlgorithm,
   SearchResult,
   SearchVectorsOptions,
   VectorRecord,
 
@@ -0,0 +1,243 @@
+/**
+ * Product quantization helpers.
+ *
+ * Codebooks are stored segment-major. For segment s with bounds
+ * [bounds[s], bounds[s + 1]), the codebook block starts at
+ * `centroids * bounds[s]` and contains `centroids * segmentDim` float32s.
+ */
+
+/**
+ * @import { DistanceMetric, HypVectorMetadata } from './types.js'
+ */
+
+/**
+ * Build product-quantized codes for a set of vectors.
+ *
+ * @param {object} options
+ * @param {Float32Array[]} options.vectors
+ * @param {number} options.dimension
+ * @param {number} options.segments
+ * @param {number} options.centroids
+ * @param {number} options.iterations
+ * @param {number} options.sampleSize
+ * @param {number} options.seed
+ * @returns {{ codes: Uint8Array[], codebooks: Float32Array, segments: number, centroids: number }}
+ */
+export function buildPq({ vectors, dimension, segments, centroids, iterations, sampleSize, seed }) {
+  if (!Number.isInteger(segments) || segments <= 0) {
+    throw new Error(`pqSegments must be a positive integer, got ${segments}`)
+  }
+  if (!Number.isInteger(centroids) || centroids <= 1 || centroids > 256) {
+    throw new Error(`pqCentroids must be an integer in [2, 256], got ${centroids}`)
+  }
+  const effectiveSegments = Math.min(segments, dimension)
+  const bounds = pqSegmentBounds(dimension, effectiveSegments)
+  const sample = sampleIndices(vectors.length, sampleSize)
+  const codebooks = new Float32Array(centroids * dimension)
+
+  for (let s = 0; s < effectiveSegments; s += 1) {
+    trainSegment({
+      vectors,
+      sample,
+      start: bounds[s],
+      end: bounds[s + 1],
+      centroids,
+      iterations,
+      seed: seed + s * 1009,
+      out: codebooks,
+    })
+  }
+
+  const codes = new Array(vectors.length)
+  for (let i = 0; i < vectors.length; i += 1) {
+    codes[i] = encodePqVector(vectors[i], codebooks, dimension, effectiveSegments, centroids)
+  }
+
+  return { codes, codebooks, segments: effectiveSegments, centroids }
+}
+
+/**
+ * Return segment boundaries that cover [0, dimension).
+ *
+ * @param {number} dimension
+ * @param {number} segments
+ * @returns {Uint32Array}
+ */
+export function pqSegmentBounds(dimension, segments) {
+  const bounds = new Uint32Array(segments + 1)
+  for (let s = 0; s <= segments; s += 1) {
+    bounds[s] = Math.floor(s * dimension / segments)
+  }
+  return bounds
+}
+
+/**
+ * Encode one vector against trained PQ codebooks.
+ *
+ * @param {Float32Array} vector
+ * @param {Float32Array} codebooks
+ * @param {number} dimension
+ * @param {number} segments
+ * @param {number} centroids
+ * @returns {Uint8Array}
+ */
+export function encodePqVector(vector, codebooks, dimension, segments, centroids) {
+  const bounds = pqSegmentBounds(dimension, segments)
+  const code = new Uint8Array(segments)
+  for (let s = 0; s < segments; s += 1) {
+    const start = bounds[s]
+    const end = bounds[s + 1]
+    code[s] = nearestCentroid(vector, codebooks, start, end, centroids)
+  }
+  return code
+}
+
+/**
+ * Build per-segment lookup tables for approximate PQ scoring.
+ *
+ * For euclidean search the table stores squared L2 contributions and lower
+ * values are better. For dot/cosine search it stores dot-product
+ * contributions and higher values are better.
+ *
+ * @param {Float32Array} query
+ * @param {HypVectorMetadata} meta
+ * @param {DistanceMetric} metric
+ * @returns {{ table: Float32Array, approxMetric: DistanceMetric }}
+ */
+export function buildPqTables(query, meta, metric) {
+  if (!meta.hasPq || !meta.pqCodebooks || !meta.pqSegments || !meta.pqCentroids) {
+    throw new Error('PQ metadata is missing')
+  }
+  const table = new Float32Array(meta.pqSegments * meta.pqCentroids)
+  const bounds = pqSegmentBounds(meta.dimension, meta.pqSegments)
+  for (let s = 0; s < meta.pqSegments; s += 1) {
+    const start = bounds[s]
+    const end = bounds[s + 1]
+    const dim = end - start
+    const block = meta.pqCentroids * start
+    for (let c = 0; c < meta.pqCentroids; c += 1) {
+      const centroid = block + c * dim
+      let score = 0
+      if (metric === 'euclidean') {
+        for (let d = 0; d < dim; d += 1) {
+          const delta = query[start + d] - meta.pqCodebooks[centroid + d]
+          score += delta * delta
+        }
+      } else {
+        for (let d = 0; d < dim; d += 1) {
+          score += query[start + d] * meta.pqCodebooks[centroid + d]
+        }
+      }
+      table[s * meta.pqCentroids + c] = score
+    }
+  }
+  return { table, approxMetric: metric === 'euclidean' ? 'euclidean' : 'dot' }
+}
+
+/**
+ * Train one subspace codebook with k-means over a deterministic sample.
+ *
+ * @param {object} options
+ * @param {Float32Array[]} options.vectors
+ * @param {Int32Array} options.sample
+ * @param {number} options.start
+ * @param {number} options.end
+ * @param {number} options.centroids
+ * @param {number} options.iterations
+ * @param {number} options.seed
+ * @param {Float32Array} options.out
+ */
+function trainSegment({ vectors, sample, start, end, centroids, iterations, seed, out }) {
+  const dim = end - start
+  const block = centroids * start
+  const sampleCount = sample.length
+  if (sampleCount === 0) return
+
+  for (let c = 0; c < centroids; c += 1) {
+    const src = vectors[sample[Math.floor(c * sampleCount / centroids)]]
+    out.set(src.subarray(start, end), block + c * dim)
+  }
+
+  for (let iter = 0; iter < iterations; iter += 1) {
+    const counts = new Int32Array(centroids)
+    const sums = new Float32Array(centroids * dim)
+
+    for (let i = 0; i < sampleCount; i += 1) {
+      const vector = vectors[sample[i]]
+      const best = nearestCentroid(vector, out, start, end, centroids)
+      counts[best] += 1
+      const sumOff = best * dim
+      for (let d = 0; d < dim; d += 1) sums[sumOff + d] += vector[start + d]
+    }
+
+    for (let c = 0; c < centroids; c += 1) {
+      const dst = block + c * dim
+      if (counts[c] === 0) {
+        const src = vectors[sample[reseedIndex(seed, iter, c, sampleCount)]]
+        out.set(src.subarray(start, end), dst)
+        continue
+      }
+      const inv = 1 / counts[c]
+      const sumOff = c * dim
+      for (let d = 0; d < dim; d += 1) out[dst + d] = sums[sumOff + d] * inv
+    }
+  }
+}
+
+/**
+ * Find the nearest centroid for one segment under squared L2.
+ *
+ * @param {Float32Array} vector
+ * @param {Float32Array} codebooks
+ * @param {number} start
+ * @param {number} end
+ * @param {number} centroids
+ * @returns {number}
+ */
+function nearestCentroid(vector, codebooks, start, end, centroids) {
+  const dim = end - start
+  const block = centroids * start
+  let best = 0
+  let bestDist = Infinity
+  for (let c = 0; c < centroids; c += 1) {
+    const off = block + c * dim
+    let dist = 0
+    for (let d = 0; d < dim; d += 1) {
+      const delta = vector[start + d] - codebooks[off + d]
+      dist += delta * delta
+      if (dist >= bestDist) break
+    }
+    if (dist < bestDist) {
+      bestDist = dist
+      best = c
+    }
+  }
+  return best
+}
+
+/**
+ * Deterministic evenly-spaced sample indices.
+ *
+ * @param {number} count
+ * @param {number} sampleSize
+ * @returns {Int32Array}
+ */
+function sampleIndices(count, sampleSize) {
+  const n = Math.min(count, Math.max(1, sampleSize))
+  const out = new Int32Array(n)
+  for (let i = 0; i < n; i += 1) out[i] = Math.floor(i * count / n)
+  return out
+}
+
+/**
+ * @param {number} seed
+ * @param {number} iter
+ * @param {number} centroid
+ * @param {number} sampleCount
+ * @returns {number}
+ */
+function reseedIndex(seed, iter, centroid, sampleCount) {
+  let s = (seed ^ Math.imul(iter + 1, 2654435761) ^ Math.imul(centroid + 1, 2246822519)) >>> 0
+  s = Math.imul(s, 1664525) + 1013904223 >>> 0
+  return s % sampleCount
+}