docs: Update README and benchmarks to reflect new overquery_factor settings and results

tae898 · tae898 · commit f10366881626 · 2026-01-20T00:10:16.000+01:00
diff --git a/bindings/python/examples/benchmark-vector/README.md b/bindings/python/examples/benchmark-vector/README.md
@@ -11,6 +11,25 @@
 - Take the duration with a grain of salt, since there are other processes running on the machine. RSS and DB size are more stable. 4 threads were allocated per task, but there aren't always the same number of tasks running in parallel, so effective CPU usage may vary.
 - If not mentioned, `MAX_CONNECTIONS` is fixed as 12, `BEAM_WIDTHS` as 64, and `OVERQUERY_FACTORS` as 1
 
+### 17 January 2026 Update
+
+- Ran the experiments with varying `overquery_factor` again to see how it affects recall and latency. Increasing `overquery_factor` should improve recall at the cost of latency, as it increases the number of candidates considered during search (more effort spent on search).
+
+#### MSMARCO-1M (1000 queries, Recall@50)
+
+`quantization=INT8`, `store_vectors_in_graph=False`, `add_hierarchy=True`, `max_connections=16`, `beam_width=100`, `batch_size=10000`, 4 threads
+
+| overquery_factor | search_s | recall@50_before_close | peak_rss_mb | db_size_mb | total_duration |
+| ---------------: | -------: | ---------------------: | ----------: | ---------: | :------------- |
+|               16 |  280.916 |                 0.9919 |     4531.02 |    6753.95 | 4m 59s         |
+|                8 |  155.385 |                 0.9902 |     4495.21 |    6753.95 | 2m 54s         |
+|                4 |   96.178 |                 0.9835 |     4477.92 |    6753.95 | 1m 53s         |
+|                2 |   56.095 |                 0.9758 |     4465.29 |    6753.95 | 1m 15s         |
+|                1 |   39.802 |                 0.9486 |     4409.59 |    6753.95 | 1m 8s          |
+
+- For this dataset, increasing `overquery_factor` improves recall but increases search latency. The peak RSS and DB size remain relatively stable across different `overquery_factor` settings. Even increasing from 1 to 2 helps a lot.
+- A "good" overquery factor will heavily depend on the nature of the dataset. We'll try 10M and even possibly 20M vectors next to see how it behaves.
+
 ### Commit/Date: main @ 6ef8858 (Thu Jan 15 16:40:51 2026 -0500)
 
 - This commit adds Product Quantization (PQ) support to JVector index.
diff --git a/bindings/python/examples/benchmark-vector/benchmark_arcadedb_msmarco.py b/bindings/python/examples/benchmark-vector/benchmark_arcadedb_msmarco.py
@@ -388,7 +388,7 @@ def main():
     ap.add_argument(
         "--count",
         type=int,
-        help="Override corpus count; default = dataset size (1M/10M/100M)",
+        help="Override corpus count; default = dataset size (1M/10M/20M/100M)",
     )
     ap.add_argument(
         "--max-connections", type=int, default=32, help="JVector max_connections"
diff --git a/bindings/python/examples/benchmark-vector/run_arcadedb_sweep.sh b/bindings/python/examples/benchmark-vector/run_arcadedb_sweep.sh
@@ -26,9 +26,10 @@ fi
 # Choose heap based on dataset size if ARCADEDB_JVM_ARGS not already set
 if [[ -z "${ARCADEDB_JVM_ARGS:-}" ]]; then
     case "$(basename "$DATASET_DIR")" in
-        *MSMARCO-1M*) XMX="4g" ;;
-        *MSMARCO-10M*) XMX="16g" ;;
-        *MSMARCO-100M*) XMX="64g" ;;
+        *MSMARCO-1M*) XMX="4g" ;;    # 4G works for 1M
+        *MSMARCO-10M*) XMX="16g" ;;  # 16G testing ...
+        *MSMARCO-20M*) XMX="32g" ;;  # 32G testing ...
+        *MSMARCO-100M*) XMX="64g" ;; # Not sure if I'll ever test this. It's humongous.
     esac
     if [[ -n "${XMX:-}" ]]; then
         JVM_ARGS="-Xmx${XMX} -Xms${XMX}"
@@ -54,8 +55,8 @@ cmds=()
 # Parameter sweeps (cosine only):
 # max_connections x beam_width x overquery_factor x quantization x store_vectors_in_graph x add_hierarchy
 # Expanded for real runs; still bounded to avoid explosion
-MAX_CONNECTIONS=(12)
-BEAM_WIDTHS=(64)
+MAX_CONNECTIONS=(16)
+BEAM_WIDTHS=(100)
 OVERQUERY_FACTORS=(1)
 QUANTIZATIONS=(INT8)
 STORE_GRAPH_FLAGS=(false)
diff --git a/bindings/python/examples/benchmark-vector/run_overquery_sweep.py b/bindings/python/examples/benchmark-vector/run_overquery_sweep.py
@@ -0,0 +1,325 @@
+#!/usr/bin/env python3
+"""
+Run search-only sweeps on an existing ArcadeDB MSMARCO database to compare
+different `overquery_factor` values without rebuilding the index.
+
+For each factor, we:
+- load the 1,000 query vectors and ground-truth labels
+- open the existing DB
+- warm up once
+- run a full search pass (recall + latency)
+- write results.json / results.md under an output directory that the
+  existing summarizer can consume.
+
+Place the outputs under `arcadedb_runs/*/results.json` (default) so
+`summarize_arcadedb_msmarco.py` will include them in its markdown tables.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Dict, List
+
+import arcadedb_embedded as arcadedb
+import numpy as np
+from benchmark_arcadedb_msmarco import (
+    dir_size_mb,
+    load_ground_truth,
+    load_queries,
+    materialize_queries,
+    resolve_dataset,
+    rss_mb,
+    search_index,
+    timed_section,
+    warmup,
+)
+
+
+def parse_overqueries(raw: str) -> List[int]:
+    vals: List[int] = []
+    for part in raw.split(","):
+        part = part.strip()
+        if not part:
+            continue
+        try:
+            v = int(part)
+        except ValueError:
+            raise SystemExit(f"Invalid overquery value: {part}") from None
+        if v <= 0:
+            raise SystemExit("overquery values must be positive")
+        vals.append(v)
+    if not vals:
+        raise SystemExit("No overquery values provided")
+    return vals
+
+
+def load_existing_config(db_path: Path) -> Dict:
+    cfg: Dict = {}
+    res_json = db_path / "results.json"
+    if res_json.exists():
+        try:
+            cfg = json.loads(res_json.read_text()).get("config", {})
+        except Exception:
+            cfg = {}
+    return cfg
+
+
+def record(
+    phases: Dict[str, dict],
+    name: str,
+    result,
+    dur: float,
+    rss_start: float,
+    rss_end: float,
+) -> None:
+    phases[name] = {
+        "time_sec": dur,
+        "rss_before_mb": rss_start,
+        "rss_after_mb": rss_end,
+        "rss_delta_mb": rss_end - rss_start,
+    }
+    if isinstance(result, dict):
+        phases[name].update(result)
+
+
+def run_single(
+    db_path: Path,
+    dataset_dir: Path,
+    overquery: int,
+    k: int,
+    quantization: str,
+    output_root: Path,
+    tag: str | None,
+    base_config: Dict,
+) -> Path:
+    sources, gt_path, dim, label = resolve_dataset(dataset_dir)
+    total_rows = sum(s["count"] for s in sources)
+    gt_full = load_ground_truth(gt_path)
+
+    qids = load_queries(gt_path, limit=1000)
+    qids = [qid for qid in qids if qid < total_rows][:1000]
+    qids = [qid for qid in qids if qid in gt_full][:1000]
+    if not qids:
+        raise SystemExit("No valid query IDs with ground truth found")
+
+    phases: Dict[str, dict] = {}
+
+    (queries, dur, r0, r1) = timed_section(
+        "load_queries", lambda: materialize_queries(sources, qids, dim=dim)
+    )
+    record(phases, "load_queries", {"queries": len(queries)}, dur, r0, r1)
+
+    (db, dur, r0, r1) = timed_section(
+        "open_db", lambda: arcadedb.open_database(str(db_path))
+    )
+    record(phases, "open_db", {}, dur, r0, r1)
+
+    index = db.schema.get_vector_index("VectorData", "vector")
+
+    (warm_info, dur, r0, r1) = timed_section(
+        "warmup",
+        lambda: warmup(index, queries, overquery, k, quantization),
+    )
+    record(phases, "warmup", warm_info, dur, r0, r1)
+
+    (search_stats, dur, r0, r1) = timed_section(
+        "search",
+        lambda: search_index(
+            index,
+            queries,
+            qids,
+            gt_full,
+            k=k,
+            overquery_factor=overquery,
+            quantization=quantization,
+        ),
+    )
+    record(phases, "search", search_stats, dur, r0, r1)
+
+    try:
+        (_, dur, r0, r1) = timed_section("close_db_final", lambda: db.close())
+        record(phases, "close_db_final", {}, dur, r0, r1)
+    except Exception:
+        pass
+
+    rss_after_vals = [
+        v.get("rss_after_mb")
+        for v in phases.values()
+        if v.get("rss_after_mb") is not None
+    ]
+    peak_rss = max(rss_after_vals) if rss_after_vals else None
+
+    recall_stats = {
+        "search": {
+            "mean": phases.get("search", {}).get("recall_mean"),
+            "n": phases.get("search", {}).get("recall_count"),
+        },
+        "search_after_reopen": {"mean": None, "n": None},
+    }
+
+    latency_ms = {
+        "search": {
+            "mean": phases.get("search", {}).get("latency_ms_mean"),
+            "p95": phases.get("search", {}).get("latency_ms_p95"),
+        },
+        "search_after_reopen": {"mean": None, "p95": None},
+    }
+
+    dataset_info = {
+        "label": label or "dataset",
+        "dim": dim,
+        "shards": len(sources),
+        "rows": total_rows,
+    }
+
+    config_info = {
+        **{k: v for k, v in base_config.items() if v is not None},
+        "overquery_factor": overquery,
+        "quantization": quantization,
+        "queries": len(qids),
+        "k": k,
+    }
+
+    results = {
+        "dataset": dataset_info,
+        "config": config_info,
+        "phases": phases,
+        "recall": recall_stats,
+        "latency_ms": latency_ms,
+        "db_path": str(db_path),
+        "db_size_mb": dir_size_mb(db_path),
+    }
+
+    run_dir_name_parts = [
+        f"dataset={dataset_dir.name}",
+        f"label={label or 'dataset'}",
+        f"oq={overquery}",
+        f"reuse={db_path.name}",
+    ]
+    if tag:
+        run_dir_name_parts.append(f"tag={tag}")
+    run_dir = output_root / "_".join(run_dir_name_parts)
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    results_json = run_dir / "results.json"
+    results_json.write_text(json.dumps(results, indent=2))
+
+    md_lines = [
+        f"# ArcadeDB overquery sweep ({dataset_info['label']})",
+        "",
+        "## Config",
+        f"- overquery_factor: {overquery}",
+        f"- quantization: {quantization}",
+        f"- k: {k}",
+        f"- db_path: {db_path}",
+        "",
+        "## Recall",
+        (
+            f"- search: {recall_stats['search']['mean']:.4f} (n={recall_stats['search']['n']})"
+            if recall_stats["search"]["mean"] is not None
+            else "- search: n/a"
+        ),
+        "",
+        "## Latency (ms)",
+        (
+            f"- search mean: {latency_ms['search']['mean']:.2f} | p95: {latency_ms['search']['p95']:.2f}"
+            if latency_ms["search"]["mean"] is not None
+            else "- search: n/a"
+        ),
+        "",
+        "## Phases (time sec / RSS MB)",
+    ]
+
+    for name in ("load_queries", "open_db", "warmup", "search", "close_db_final"):
+        if name not in phases:
+            continue
+        p = phases[name]
+        line = (
+            f"- {name}: time={p['time_sec']:.3f}s, rss_before={p['rss_before_mb']:.1f} MB, "
+            f"rss_after={p['rss_after_mb']:.1f} MB, delta={p['rss_delta_mb']:.1f} MB"
+        )
+        if "recall_mean" in p:
+            line += f", recall@{k}={p['recall_mean']:.4f}"
+        if "latency_ms_mean" in p and p["latency_ms_mean"] is not None:
+            line += f", latency_ms={p['latency_ms_mean']:.2f}"
+        md_lines.append(line)
+
+    def fmt(val: float | None) -> str:
+        return "nan" if val is None else f"{val:.1f}"
+
+    md_lines.extend(
+        [
+            "",
+            f"- db_size_mb: {fmt(results['db_size_mb'])}",
+            f"- peak_rss_mb: {fmt(peak_rss)}",
+        ]
+    )
+
+    results_md = run_dir / "results.md"
+    results_md.write_text("\n".join(md_lines))
+    print(f"Wrote {results_json}")
+    print(f"Wrote {results_md}")
+
+    return run_dir
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(
+        description="Run overquery sweeps against an existing ArcadeDB MSMARCO DB"
+    )
+    ap.add_argument("--db-path", required=True, help="Path to existing ArcadeDB DB")
+    ap.add_argument("--dataset-dir", required=True, help="Path to MSMARCO dataset dir")
+    ap.add_argument(
+        "--overquery-factors",
+        required=True,
+        help="Comma-separated overquery values (e.g., 1,2,4,8,16)",
+    )
+    ap.add_argument(
+        "--output-root",
+        default="arcadedb_runs",
+        help="Where to place per-factor results (default: arcadedb_runs)",
+    )
+    ap.add_argument("--k", type=int, default=50, help="Top-K for recall/latency")
+    ap.add_argument(
+        "--quantization",
+        choices=["NONE", "INT8", "BINARY", "PRODUCT"],
+        help="Override quantization (default: from existing results.json or NONE)",
+    )
+    ap.add_argument("--tag", help="Optional tag appended to output directory name")
+
+    args = ap.parse_args()
+
+    overqueries = parse_overqueries(args.overquery_factors)
+    db_path = Path(args.db_path)
+    dataset_dir = Path(args.dataset_dir)
+    output_root = Path(args.output_root)
+
+    if not db_path.exists():
+        raise SystemExit(f"DB path not found: {db_path}")
+
+    base_config = load_existing_config(db_path)
+    quant = (args.quantization or base_config.get("quantization") or "NONE").upper()
+
+    created: List[Path] = []
+    for oq in overqueries:
+        created.append(
+            run_single(
+                db_path=db_path,
+                dataset_dir=dataset_dir,
+                overquery=oq,
+                k=args.k,
+                quantization=quant,
+                output_root=output_root,
+                tag=args.tag,
+                base_config=base_config,
+            )
+        )
+
+    print("\nCompleted runs:")
+    for p in created:
+        print(f"- {p}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bindings/python/examples/benchmark-vector/summaries/arcadedb_msmarco_MSMARCO-1M.md b/bindings/python/examples/benchmark-vector/summaries/arcadedb_msmarco_MSMARCO-1M.md
diff --git a/plan_update_python_bindings.md b/plan_update_python_bindings.md

Original file line number	Diff line number	Diff line change
`@@ -388,7 +388,7 @@ def main():`
`388`	`388`	`ap.add_argument(`
`389`	`389`	`"--count",`
`390`	`390`	`type=int,`
`391`		`- help="Override corpus count; default = dataset size (1M/10M/100M)",`
	`391`	`+ help="Override corpus count; default = dataset size (1M/10M/20M/100M)",`
`392`	`392`	`)`
`393`	`393`	`ap.add_argument(`
`394`	`394`	`"--max-connections", type=int, default=32, help="JVector max_connections"`