asg017
diff --git a/.gitignore b/.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/benchmarks-ann/.gitignore b/benchmarks-ann/.gitignore
Lines changed: 6 additions & 0 deletions
diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile
Lines changed: 14 additions & 14 deletions
diff --git a/benchmarks-ann/README.md b/benchmarks-ann/README.md
Lines changed: 68 additions & 38 deletions
@@ -31,3 +31,6 @@ poetry.lock
 
 memstat.c
 memstat.*
+
+
+.DS_Store
@@ -1,2 +1,8 @@
 *.db
+*.db-shm
+*.db-wal
+*.parquet
 runs/
+
+viewer/
+searcher/
@@ -1,5 +1,5 @@
 BENCH = python bench.py
-BASE_DB = seed/base.db
+BASE_DB = cohere1m/base.db
 EXT = ../dist/vec0
 
 # --- Baseline (brute-force) configs ---
@@ -33,7 +33,7 @@ ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS) $(DISKANN_CONFIGS)
 
 # --- Data preparation ---
 seed:
-	$(MAKE) -C seed
+	$(MAKE) -C cohere1m
 
 ground-truth: seed
 	python ground_truth.py --subset-size 10000
@@ -42,43 +42,43 @@ ground-truth: seed
 
 # --- Quick smoke test ---
 bench-smoke: seed
-	$(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \
+	$(BENCH) --subset-size 5000 -k 10 -n 20 --dataset cohere1m -o runs \
 		"brute-float:type=baseline,variant=float" \
 		"ivf-quick:type=ivf,nlist=16,nprobe=4" \
 		"diskann-quick:type=diskann,R=48,L=64,quantizer=binary"
 
 bench-rescore: seed
-	$(BENCH) --subset-size 10000 -k 10 -o runs/rescore \
+	$(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs \
 		$(RESCORE_CONFIGS)
 
 
 # --- Standard sizes ---
 bench-10k: seed
-	$(BENCH) --subset-size 10000 -k 10 -o runs/10k $(ALL_CONFIGS)
+	$(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS)
 
 bench-50k: seed
-	$(BENCH) --subset-size 50000 -k 10 -o runs/50k $(ALL_CONFIGS)
+	$(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS)
 
 bench-100k: seed
-	$(BENCH) --subset-size 100000 -k 10 -o runs/100k $(ALL_CONFIGS)
+	$(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS)
 
 bench-all: bench-10k bench-50k bench-100k
 
 # --- IVF across sizes ---
 bench-ivf: seed
-	$(BENCH) --subset-size 10000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS)
-	$(BENCH) --subset-size 50000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS)
-	$(BENCH) --subset-size 100000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS)
+	$(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS)
+	$(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS)
+	$(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS)
 
 # --- DiskANN across sizes ---
 bench-diskann: seed
-	$(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS)
-	$(BENCH) --subset-size 50000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS)
-	$(BENCH) --subset-size 100000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS)
+	$(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS)
+	$(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS)
+	$(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS)
 
 # --- Report ---
 report:
-	@echo "Use: sqlite3 runs/<dir>/results.db 'SELECT * FROM bench_results ORDER BY recall DESC'"
+	@echo "Use: sqlite3 runs/cohere1m/<size>/results.db 'SELECT run_id, config_name, status, recall FROM runs JOIN run_results USING(run_id)'"
 
 # --- Cleanup ---
 clean:
 
@@ -1,81 +1,111 @@
 # KNN Benchmarks for sqlite-vec
 
 Benchmarking infrastructure for vec0 KNN configurations. Includes brute-force
-baselines (float, int8, bit); index-specific branches add their own types
-via the `INDEX_REGISTRY` in `bench.py`.
+baselines (float, int8, bit) plus the rescore, IVF, and DiskANN index types.
+
+## Datasets
+
+Each dataset is a subdirectory containing a `Makefile` and `build_base_db.py`
+that produce a `base.db`. The benchmark runner auto-discovers any subdirectory
+with a `base.db` file.
+
+```
+cohere1m/           # Cohere 768d cosine, 1M vectors
+  Makefile          # downloads parquets from Zilliz, builds base.db
+  build_base_db.py
+  base.db           # (generated)
+
+cohere10m/          # Cohere 768d cosine, 10M vectors (10 train shards)
+  Makefile          # make -j12 download to fetch all shards in parallel
+  build_base_db.py
+  base.db           # (generated)
+```
+
+Every `base.db` has the same schema:
+
+| Table | Columns | Description |
+|-------|---------|-------------|
+| `train` | `id INTEGER PRIMARY KEY, vector BLOB` | Indexed vectors (f32 blobs) |
+| `query_vectors` | `id INTEGER PRIMARY KEY, vector BLOB` | Query vectors for KNN evaluation |
+| `neighbors` | `query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT` | Ground-truth nearest neighbors |
+
+To add a new dataset, create a directory with a `Makefile` that builds `base.db`
+with the tables above. It will be available via `--dataset <dirname>` automatically.
+
+### Building datasets
+
+```bash
+# Cohere 1M
+cd cohere1m && make download && make && cd ..
+
+# Cohere 10M (parallel download recommended — 10 train shards + test + neighbors)
+cd cohere10m && make -j12 download && make && cd ..
+```
 
 ## Prerequisites
 
-- Built `dist/vec0` extension (run `make` from repo root)
+- Built `dist/vec0` extension (run `make loadable` from repo root)
 - Python 3.10+
-- `uv` (for seed data prep): `pip install uv`
+- `uv` (runs the benchmark scripts, e.g. `uv run python bench.py ...`)
 
 ## Quick start
 
 ```bash
-# 1. Download dataset and build seed DB (~3 GB download, ~5 min)
-make seed
+# 1. Build a dataset
+cd cohere1m && make && cd ..
 
-# 2. Run a quick smoke test (5k vectors, ~1 min)
+# 2. Quick smoke test (5k vectors)
 make bench-smoke
 
-# 3. Run full benchmark at 10k
+# 3. Full benchmark at 10k
 make bench-10k
 ```
 
 ## Usage
 
-### Direct invocation
-
 ```bash
-python bench.py --subset-size 10000 \
+uv run python bench.py --subset-size 10000 -k 10 -n 50 --dataset cohere1m \
   "brute-float:type=baseline,variant=float" \
-  "brute-int8:type=baseline,variant=int8" \
-  "brute-bit:type=baseline,variant=bit"
+  "rescore-bit-os8:type=rescore,quantizer=bit,oversample=8"
 ```
 
 ### Config format
 
 `name:type=<index_type>,key=val,key=val`
 
-| Index type | Keys | Branch |
-|-----------|------|--------|
-| `baseline` | `variant` (float/int8/bit), `oversample` | this branch |
-
-Index branches register additional types in `INDEX_REGISTRY`. See the
-docstring in `bench.py` for the extension API.
+| Index type | Keys |
+|-----------|------|
+| `baseline` | `variant` (float/int8/bit), `oversample` |
+| `rescore` | `quantizer` (bit/int8), `oversample` |
+| `ivf` | `nlist`, `nprobe` |
+| `diskann` | `R`, `L`, `quantizer` (binary/int8), `buffer_threshold` |
 
 ### Make targets
 
 | Target | Description |
 |--------|-------------|
-| `make seed` | Download COHERE 1M dataset |
-| `make ground-truth` | Pre-compute ground truth for 10k/50k/100k |
-| `make bench-smoke` | Quick 5k baseline test |
+| `make seed` | Download and build the default dataset (`cohere1m`) |
+| `make bench-smoke` | Quick 5k test (3 configs) |
 | `make bench-10k` | All configs at 10k vectors |
 | `make bench-50k` | All configs at 50k vectors |
 | `make bench-100k` | All configs at 100k vectors |
 | `make bench-all` | 10k + 50k + 100k |
+| `make bench-ivf` | Baselines + IVF across 10k/50k/100k |
+| `make bench-diskann` | Baselines + DiskANN across 10k/50k/100k |
 
-## Adding an index type
-
-In your index branch, add an entry to `INDEX_REGISTRY` in `bench.py` and
-append your configs to `ALL_CONFIGS` in the `Makefile`. See the existing
-`baseline` entry and the comments in both files for the pattern.
-
-## Results
+## Results DB
 
-Results are stored in `runs/<dir>/results.db` using the schema in `schema.sql`.
+Each run writes to `runs/<dataset>/<subset_size>/results.db` (SQLite, WAL mode).
+Progress is written continuously — query from another terminal to monitor:
 
 ```bash
-sqlite3 runs/10k/results.db "
-  SELECT config_name, recall, mean_ms, qps
-  FROM bench_results
-  ORDER BY recall DESC
-"
+sqlite3 runs/cohere1m/10000/results.db "SELECT run_id, config_name, status FROM runs"
 ```
 
-## Dataset
+See `results_schema.sql` for the full schema (tables: `runs`, `run_results`,
+`insert_batches`, `queries`).
+
+## Adding an index type
 
-[Zilliz COHERE Medium 1M](https://zilliz.com/learn/datasets-for-vector-database-benchmarks):
-768 dimensions, cosine distance, 1M train vectors + 10k query vectors with precomputed neighbors.
+Add an entry to `INDEX_REGISTRY` in `bench.py` and append configs to
+`ALL_CONFIGS` in the `Makefile`. See existing entries for the pattern.