vortex-data
diff --git a/‎.github/workflows/bench-pr.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/bench-pr.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/bench.yml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/bench.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎Cargo.lock‎
Lines changed: 22 additions & 0 deletions b/‎Cargo.lock‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/datafusion-bench/src/lib.rs‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/datafusion-bench/src/lib.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/vector-search-bench/Cargo.toml‎
Lines changed: 37 additions & 0 deletions b/‎benchmarks/vector-search-bench/Cargo.toml‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎benchmarks/vector-search-bench/README.md‎
Lines changed: 120 additions & 0 deletions b/‎benchmarks/vector-search-bench/README.md‎
Lines changed: 120 additions & 0 deletions
@@ -33,6 +33,8 @@ jobs:
             build_args: "--features lance"
           - id: compress-bench
             name: Compression
+          - id: vector-search-bench
+            name: Vector Similarity Search
     steps:
       - uses: runs-on/action@v2
         if: github.event.pull_request.head.repo.fork == false
 
@@ -49,6 +49,10 @@ jobs:
             name: Compression
             build_args: "--features lance"
             formats: "parquet,lance,vortex"
+          - id: vector-search-bench
+            name: Vector Similarity Search
+            build_args: ""
+            formats: "handrolled,vortex-uncompressed,vortex-default,vortex-turboquant"
     steps:
       - uses: runs-on/action@v2
         if: github.repository == 'vortex-data/vortex'
 
@@ -59,6 +59,7 @@ members = [
     "benchmarks/datafusion-bench",
     "benchmarks/duckdb-bench",
     "benchmarks/random-access-bench",
+    "benchmarks/vector-search-bench",
 ]
 exclude = ["java/testfiles", "wasm-test"]
 resolver = "2"
 
@@ -114,7 +114,7 @@ pub fn format_to_df_format(format: Format) -> Arc<dyn FileFormat> {
         Format::Csv => Arc::new(CsvFormat::default()) as _,
         Format::Arrow => Arc::new(ArrowFormat),
         Format::Parquet => Arc::new(ParquetFormat::new()),
-        Format::OnDiskVortex | Format::VortexCompact => {
+        Format::OnDiskVortex | Format::VortexCompact | Format::VortexLossy => {
             Arc::new(VortexFormat::new(SESSION.clone()))
         }
         Format::OnDiskDuckDB | Format::Lance => {
 
@@ -0,0 +1,37 @@
+[package]
+name = "vector-search-bench"
+description = "Vector similarity search benchmarks for Vortex on public embedding datasets"
+authors.workspace = true
+categories.workspace = true
+edition.workspace = true
+homepage.workspace = true
+include.workspace = true
+keywords.workspace = true
+license.workspace = true
+readme.workspace = true
+repository.workspace = true
+rust-version.workspace = true
+version.workspace = true
+publish = false
+
+[dependencies]
+anyhow = { workspace = true }
+arrow-array = { workspace = true }
+arrow-buffer = { workspace = true }
+arrow-schema = { workspace = true }
+clap = { workspace = true, features = ["derive"] }
+indicatif = { workspace = true }
+parquet = { workspace = true }
+tabled = { workspace = true, features = ["std"] }
+tokio = { workspace = true, features = ["full"] }
+tracing = { workspace = true }
+vortex = { workspace = true }
+vortex-bench = { workspace = true }
+vortex-btrblocks = { workspace = true }
+vortex-tensor = { workspace = true }
+
+[dev-dependencies]
+tempfile = { workspace = true }
+
+[lints]
+workspace = true
@@ -0,0 +1,120 @@
+# vector-search-bench
+
+Brute-force cosine-similarity benchmark for Vortex on public VectorDBBench
+embedding corpora.
+
+The current benchmark pipeline supports source embedding columns with `f32`
+or `f64` elements. The lower-level `list_to_vector_ext` conversion helper can
+rewrap `f16` lists as `Vector` extension arrays, but `vector-search-bench`
+itself does not yet support `f16` query extraction or the hand-rolled parquet
+baseline.
+
+## What it measures
+
+For each `(dataset, format)` pair, the benchmark records:
+
+1. **`nbytes`** — in-memory footprint of the variant's array tree, in bytes.
+   Reporting the in-memory `.nbytes()` instead of an on-disk file size is
+   deliberate: the Vortex default write path runs BtrBlocks on every tree
+   regardless of whether it's already compressed, so "on-disk size" would
+   collapse `vortex-uncompressed` and `vortex-default` to the same bytes
+   even though their in-memory trees are different. The `nbytes()`
+   number is consistent with what the *compute* measurements actually
+   operate on.
+   - The `handrolled` baseline reports the canonical parquet file size
+     on disk — that's the only encoded representation it has.
+2. **Compress time** — wall time to build the variant tree from the
+   materialized uncompressed source. ~0 for `vortex-uncompressed` (identity),
+   meaningful for the two compressed variants.
+3. **Decompress time** — wall time to execute the variant tree all the way
+   back into a canonical `FixedSizeListArray<f32>` with a materialized f32
+   element buffer. For `vortex-uncompressed` this is a no-op; for
+   `vortex-default` it includes ALP-RD bit-unpacking; for
+   `vortex-turboquant` it includes the inverse SORF rotation and
+   dictionary lookup.
+4. **Cosine-similarity time** — `CosineSimilarity(data, const_query)`
+   executed to a materialized f32 array.
+5. **Cosine-filter time** — `Binary(Gt, [CosineSimilarity, threshold])`
+   executed to a `BoolArray`.
+6. **Recall@10** (TurboQuant only) — the fraction of the exact top-10
+   nearest neighbours that TurboQuant recovers, using the uncompressed
+   Vortex scan as local ground truth.
+
+Before any timing starts, the benchmark runs a **correctness verification
+pass**: cosine scores for a single query are computed against every
+variant and compared to the uncompressed baseline. Lossless variants must
+match within `1e-4` max-abs-diff; TurboQuant must stay within `0.2`. A
+mismatch bails the run — you cannot publish throughput numbers for a
+variant that returns wrong answers.
+
+## Formats
+
+- `handrolled` — Hand-rolled Rust scalar cosine loop over a flat
+  `Vec<f32>` that was decoded from the canonical parquet file via
+  `parquet-rs` / `arrow-rs`. The **decompress** phase does the parquet
+  read, downcasts to `Float32Array`, and memcpies into a plain `Vec<f32>`.
+  The **compute** phase is a plain scalar loop over `&[f32]` — no Arrow
+  compute kernels, no scalar-function dispatch, no SIMD annotations.
+
+  This is a **compute-cost floor**, not a realistic parquet-on-DBMS
+  baseline. It answers the question "what's the minimum cost you could
+  get away with if you wrote a vector-search scan by hand with no query
+  engine?" Real parquet users would pay substantially more (DuckDB
+  `list_cosine_similarity`, DataFusion with a vector UDF, etc.) —
+  adding those as additional baselines is a natural v2 direction.
+- `vortex-uncompressed` — Raw `Vector<dim, f32>` extension array, no
+  encoding-level compression applied.
+- `vortex-default` — `BtrBlocksCompressor::default()` applied to the FSL
+  storage child. On float vectors this typically finds ~15% lossless
+  savings via ALP-RD (mantissa/exponent split + bitpacking).
+- `vortex-turboquant` — The full
+  `L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)` pipeline.
+  Lossy; recall@10 is reported alongside throughput. At the default 8-bit
+  config this typically gives ~3× storage reduction at >90% top-10
+  recall.
+
+## Datasets
+
+The smallest built-in dataset is **Cohere-100K** (`cohere-small`): 100K
+rows × 768 dims, cosine metric, ~150 MB zstd-parquet. It's the smallest
+VectorDBBench-supplied corpus that still exercises every encoding path.
+Larger variants (`cohere-medium`, `openai-small`, `openai-medium`,
+`bioasq-medium`, `glove-medium`) are wired up for local / on-demand
+experiments; see `vortex-bench/src/vector_dataset.rs` for the full list.
+
+The upstream URL for Cohere-100K is
+`https://assets.zilliz.com/benchmark/cohere_small_100k/train.parquet`.
+The public Zilliz bucket is anonymous-readable so the code can hit it
+directly.
+
+## Running locally
+
+```bash
+cargo run -p vector-search-bench --release -- \
+    --datasets cohere-small \
+    --formats handrolled,vortex-uncompressed,vortex-default,vortex-turboquant \
+    --iterations 5 \
+    -d table
+```
+
+The first run downloads the parquet file into
+`vortex-bench/data/cohere-small/cohere-small.parquet` and caches it
+idempotently for subsequent runs.
+
+## CI note: dataset mirror
+
+CI runs after every develop-branch merge. Hitting `assets.zilliz.com`
+from every merge would create recurring egress traffic on a third-party
+bucket — the same courtesy reason `RPlace` / `AirQuality` are excluded
+from CI in `compress-bench`.
+
+Before enabling the `vector-search-bench` entry in `.github/workflows/bench.yml`
+on a fork, either:
+
+1. **Mirror the file into an internal bucket** and swap the URL in
+   `vortex-bench/src/vector_dataset.rs::VectorDataset::parquet_url`, or
+2. **Accept the upstream egress cost** and leave the URL as-is.
+
+The mirror step is a one-off `aws s3 cp` and is documented here rather
+than automated in the build because the destination bucket is
+organization-specific.
Original file line number	Diff line number	Diff line change
`@@ -59,6 +59,7 @@ members = [`
`59`	`59`	`"benchmarks/datafusion-bench",`
`60`	`60`	`"benchmarks/duckdb-bench",`
`61`	`61`	`"benchmarks/random-access-bench",`
	`62`	`+ "benchmarks/vector-search-bench",`
`62`	`63`	`]`
`63`	`64`	`exclude = ["java/testfiles", "wasm-test"]`
`64`	`65`	`resolver = "2"`
Original file line number	Diff line number	Diff line change
`@@ -114,7 +114,7 @@ pub fn format_to_df_format(format: Format) -> Arc<dyn FileFormat> {`
`114`	`114`	`Format::Csv => Arc::new(CsvFormat::default()) as _,`
`115`	`115`	`Format::Arrow => Arc::new(ArrowFormat),`
`116`	`116`	`Format::Parquet => Arc::new(ParquetFormat::new()),`
`117`		`- Format::OnDiskVortex \| Format::VortexCompact => {`
	`117`	`+ Format::OnDiskVortex \| Format::VortexCompact \| Format::VortexLossy => {`
`118`	`118`	`Arc::new(VortexFormat::new(SESSION.clone()))`
`119`	`119`	`}`
`120`	`120`	`Format::OnDiskDuckDB \| Format::Lance => {`