Skip to content

Commit 609e9aa

Browse files
authored
Vector similarity search scan benchmarks (#7499)
## Summary Tracking Issue: #7297 Adds a basic vector similarity search benchmark to the Vortex benchmark suite as a binary. Here is an example of how to run this: ```sh cargo run -p vector-search-bench --release -- --dataset cohere-small-100k --layout single --iterations 100 cargo run -p vector-search-bench --release -- --dataset bioasq-large-10m --layout partitioned-shuffled --iterations 1 ``` The main scan logic is in `scan_one_file` in `benchmarks/vector-search-bench/src/scan.rs`, and everything else is just setup for that. ## Future Work - This does not measure recall, but that can come in a followup PR. - We will want to have a handrolled baseline implementation that is the "theoretical" minimum over uncompressed f32 vectors that we can compare against. We can then add more flavors of compression and quantization in the future. - No filter pushdown still (doing filter pushdown before similarity search), have to figure out why that isn't working when we eventually benchmark that - Recall over filtered cosine search ## Testing The benchmark running successfully for all datasets is sufficient. Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent f308dfe commit 609e9aa

8 files changed

Lines changed: 690 additions & 77 deletions

File tree

benchmarks/vector-search-bench/src/compression.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
//!
99
//! The benchmark writes one `.vortex` file per flavor per data file, then scans them all with the
1010
//! same query so the comparison is apples-to-apples with the Parquet files.
11-
//!
12-
//! Note that the handrolled `&[f32]` parquet baseline is **not** a flavor here.
1311
1412
use clap::ValueEnum;
1513
use vortex::array::ArrayId;
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Local table renderer for the vector-search benchmark.
5+
//!
6+
//! Groups columns by **flavor** (`vortex-uncompressed`, `vortex-turboquant`) rather than by
7+
//! [`vortex_bench::Format`], because the two Vortex flavors share a single
8+
//! `Format::OnDiskVortex`/`Format::VortexLossy` pair and the generic
9+
//! [`vortex_bench::display::render_table`] groups by Format. Local renderer keeps the
10+
//! column-per-flavor invariant intact without introducing a new global Format value.
11+
//!
12+
//! Output rows:
13+
//!
14+
//! ```text
15+
//! Metric | vortex-uncompressed | vortex-turboquant
16+
//! ------------------ + ------------------- + -----------------
17+
//! scan wall (mean) | 485 ms | 212 ms
18+
//! scan wall (median) | 490 ms | 215 ms
19+
//! matches | 42 | 39
20+
//! rows scanned | 10,000,000 | 10,000,000
21+
//! bytes scanned | 30.5 GB | 7.62 GB
22+
//! rows / sec | 5.2e6 | 1.2e7
23+
//! ```
24+
25+
use std::io::Write;
26+
27+
use anyhow::Result;
28+
use tabled::settings::Style;
29+
30+
use crate::compression::VectorFlavor;
31+
use crate::prepare::CompressedVortexDataset;
32+
use crate::scan::ScanTiming;
33+
34+
/// Final column-per-flavor row set for one dataset.
pub struct DatasetReport<'a> {
    // Human-readable dataset name, printed as the `##` heading above the table.
    pub dataset_name: &'a str,
    // One entry per Vortex flavor column: the flavor, its prepared on-disk dataset,
    // and the scan timing measured for it. Column order in the table follows slice order.
    pub vortex_results: &'a [(VectorFlavor, &'a CompressedVortexDataset, &'a ScanTiming)],
}
39+
40+
/// Render the full report into the given writer as a tabled table.
41+
pub fn render(report: &DatasetReport<'_>, writer: &mut dyn Write) -> Result<()> {
42+
let mut headers: Vec<String> = vec!["metric".to_owned()];
43+
for &(flavor, ..) in report.vortex_results {
44+
headers.push(flavor.label().to_owned());
45+
}
46+
47+
let rows: Vec<Vec<String>> = vec![
48+
make_row("scan wall (mean)", report, |_, _, scan| {
49+
format_duration(scan.mean)
50+
}),
51+
make_row("scan wall (median)", report, |_, _, scan| {
52+
format_duration(scan.median)
53+
}),
54+
make_row("matches", report, |_, _, scan| scan.matches.to_string()),
55+
make_row("rows scanned", report, |_, _, scan| {
56+
scan.rows_scanned.to_string()
57+
}),
58+
make_row("bytes scanned", report, |_, _, scan| {
59+
format_bytes(scan.bytes_scanned)
60+
}),
61+
make_row("rows / sec", report, |_, _, scan| {
62+
format_throughput_rows(scan.rows_scanned, scan.mean)
63+
}),
64+
];
65+
66+
writeln!(writer, "## {}", report.dataset_name)?;
67+
let mut builder = tabled::builder::Builder::new();
68+
builder.push_record(headers);
69+
for row in rows {
70+
builder.push_record(row);
71+
}
72+
let mut table = builder.build();
73+
table.with(Style::modern());
74+
writeln!(writer, "{table}")?;
75+
Ok(())
76+
}
77+
78+
fn make_row<F>(metric: &str, report: &DatasetReport<'_>, vortex_cell: F) -> Vec<String>
79+
where
80+
F: Fn(VectorFlavor, &CompressedVortexDataset, &ScanTiming) -> String,
81+
{
82+
let mut row = vec![metric.to_owned()];
83+
for &(flavor, prep, scan) in report.vortex_results {
84+
row.push(vortex_cell(flavor, prep, scan));
85+
}
86+
row
87+
}
88+
89+
/// Render a duration as a short human-readable string, picking seconds,
/// milliseconds, or microseconds based on magnitude.
fn format_duration(d: std::time::Duration) -> String {
    let secs = d.as_secs_f64();
    match secs {
        s if s >= 1.0 => format!("{s:.2} s"),
        s if s >= 1e-3 => format!("{:.1} ms", s * 1e3),
        s => format!("{:.1} µs", s * 1e6),
    }
}
99+
100+
/// Render a byte count with binary (1024-based) units.
///
/// Exact integer bytes are shown below 1 KiB (`"512 B"`); larger values are
/// scaled and shown with two decimals (`"1.50 MiB"`). Values past TiB stay in
/// TiB. Fix: the scraped original wrote `unit = next;` with `next: &&str`,
/// which does not type-check against `unit: &str` — destructure with `&next`.
fn format_bytes(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
    let mut value = bytes as f64;
    let mut unit = UNITS[0];
    for &next in &UNITS[1..] {
        if value < 1024.0 {
            break;
        }
        value /= 1024.0;
        unit = next;
    }
    if unit == "B" {
        // Below 1 KiB the count is exact; print the integer rather than "512.00 B".
        format!("{bytes} B")
    } else {
        format!("{value:.2} {unit}")
    }
}
117+
118+
/// Render a rows-per-second throughput with a G/M/K suffix.
///
/// Returns an em-dash when the wall time is zero or negative, since the rate
/// is undefined in that case.
fn format_throughput_rows(rows: u64, wall: std::time::Duration) -> String {
    let secs = wall.as_secs_f64();
    if secs <= 0.0 {
        return "—".to_owned();
    }
    let rps = rows as f64 / secs;
    // Walk the scale table from largest to smallest and use the first that fits.
    for &(threshold, suffix) in &[(1e9, "G"), (1e6, "M"), (1e3, "K")] {
        if rps >= threshold {
            return format!("{:.2}{suffix}", rps / threshold);
        }
    }
    format!("{rps:.0}")
}

benchmarks/vector-search-bench/src/ingest.rs

Lines changed: 47 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
//! 1. Project the `emb` column out of each struct chunk.
99
//! 2. Rewrap the `emb` column as `Extension<Vector<f32, dim>>` via
1010
//! [`vortex_bench::vector_dataset::list_to_vector_ext`].
11-
//! 3. Cast the FSL element buffer from `f64` -> `f32` if the source is `f64`. After this point all
11+
//! 3. Detect the FSL element ptype at runtime and cast `f64` -> `f32` when needed. Detection is
12+
//! from the arrow schema rather than a catalog declaration so upstream parquets whose actual
13+
//! precision disagrees with the catalog still ingest correctly. After this point all
1214
//! downstream code (compression, scan, recall) is f32-only.
1315
//! 4. Optionally project the `scalar_labels` column through unchanged so future filtered-search
1416
//! benchmarks have it without re-ingest.
@@ -39,48 +41,50 @@ use vortex_bench::vector_dataset::list_to_vector_ext;
3941
use vortex_tensor::vector::AnyVector;
4042
use vortex_tensor::vector::Vector;
4143

42-
/// Configuration passed alongside each chunk so the transform can stay stateless.
43-
#[derive(Debug, Clone, Copy)]
44-
pub struct ChunkTransform {
45-
/// Source element ptype as declared by the dataset catalog. Used purely to decide whether the
46-
/// f64 -> f32 cast is needed.
47-
pub src_ptype: PType,
48-
// /// Whether to project the `scalar_labels` column through the output struct.
49-
// pub include_scalar_labels: bool,
50-
}
44+
/// Apply the transform to a single struct chunk and return the rebuilt chunk.
45+
///
46+
/// `chunk` must be a non-chunked `Struct { id: i64, emb: List<f32> }`, where all of the list
47+
/// elements are
48+
///
49+
/// The returned array is always a `Struct { id: i64, emb: Vector<f32, dim> }`.
50+
pub fn transform_chunk(chunk: ArrayRef, ctx: &mut ExecutionCtx) -> Result<ArrayRef> {
51+
let struct_view = chunk
52+
.as_opt::<Struct>()
53+
.with_context(|| format!("ingest: expected struct chunk, got dtype {}", chunk.dtype()))?;
54+
55+
let id = struct_view
56+
.unmasked_field_by_name("id")
57+
.context("ingest: chunk missing `id` column")?
58+
.clone();
59+
let emb = struct_view
60+
.unmasked_field_by_name("emb")
61+
.context("ingest: chunk missing `emb` column")?
62+
.clone();
63+
64+
let emb_ext: ExtensionArray = list_to_vector_ext(emb)?.execute(ctx)?;
65+
66+
// Detect the actual FSL element ptype from the extension storage dtype. The dataset catalog
67+
// cannot be trusted here: at least one upstream parquet (`sift-medium-5m`) ships f64
68+
// embeddings despite the catalog advertising f32.
69+
let element_ptype = {
70+
let storage_dtype = emb_ext.storage_array().dtype();
71+
match storage_dtype {
72+
DType::FixedSizeList(elem, ..) => match elem.as_ref() {
73+
DType::Primitive(ptype, _) => *ptype,
74+
other => bail!("ingest: expected primitive FSL element dtype, got {other}"),
75+
},
76+
other => bail!("ingest: expected FSL storage dtype, got {other}"),
77+
}
78+
};
5179

52-
impl ChunkTransform {
53-
/// Apply the transform to a single struct chunk and return the rebuilt chunk.
54-
///
55-
/// `chunk` must be a non-chunked `Struct { id: i64, emb: List<f32> }`, where all of the list
56-
/// elements are
57-
///
58-
/// The returned array is always a `Struct { id: i64, emb: Vector<f32, dim> }`.
59-
pub fn apply(&self, chunk: ArrayRef, ctx: &mut ExecutionCtx) -> Result<ArrayRef> {
60-
let struct_view = chunk.as_opt::<Struct>().with_context(|| {
61-
format!("ingest: expected struct chunk, got dtype {}", chunk.dtype())
62-
})?;
63-
64-
let id = struct_view
65-
.unmasked_field_by_name("id")
66-
.context("ingest: chunk missing `id` column")?
67-
.clone();
68-
let emb = struct_view
69-
.unmasked_field_by_name("emb")
70-
.context("ingest: chunk missing `emb` column")?
71-
.clone();
72-
73-
let emb_ext: ExtensionArray = list_to_vector_ext(emb)?.execute(ctx)?;
74-
75-
let f32_vector_array = if self.src_ptype == PType::F64 {
76-
convert_f64_to_f32_vectors(&emb_ext, ctx)?
77-
} else {
78-
emb_ext.into_array()
79-
};
80+
let f32_vector_array = match element_ptype {
81+
PType::F32 => emb_ext.into_array(),
82+
PType::F64 => convert_f64_to_f32_vectors(&emb_ext, ctx)?,
83+
other => bail!("ingest: unsupported emb element ptype {other}, expected f32 or f64"),
84+
};
8085

81-
let fields = [("id", id), ("emb", f32_vector_array)];
82-
Ok(StructArray::from_fields(&fields)?.into_array())
83-
}
86+
let fields = [("id", id), ("emb", f32_vector_array)];
87+
Ok(StructArray::from_fields(&fields)?.into_array())
8488
}
8589

8690
/// Convert a `Vector<f64, dim>` extension array down to `Vector<f32, dim>`.
@@ -164,10 +168,7 @@ mod tests {
164168
let emb = list_chunk_f64(&[&[1.0, 2.0, 3.0], &[4.0, 5.0, 6.0]]);
165169
let chunk =
166170
StructArray::from_fields(&[("id", id_array(&[0, 1])), ("emb", emb)])?.into_array();
167-
let transform = ChunkTransform {
168-
src_ptype: PType::F64,
169-
};
170-
let out = transform.apply(chunk, &mut ctx)?;
171+
let out = transform_chunk(chunk, &mut ctx)?;
171172
let out_struct = out.as_opt::<Struct>().expect("returns Struct");
172173
let out_emb = out_struct.unmasked_field_by_name("emb").unwrap().clone();
173174
let DType::Extension(ext) = out_emb.dtype() else {
@@ -207,10 +208,7 @@ mod tests {
207208
let chunk =
208209
StructArray::from_fields(&[("id", id_array(&[0, 1])), ("emb", emb)])?.into_array();
209210

210-
let transform = ChunkTransform {
211-
src_ptype: PType::F32,
212-
};
213-
let out = transform.apply(chunk, &mut ctx)?;
211+
let out = transform_chunk(chunk, &mut ctx)?;
214212
let out_struct = out.as_opt::<Struct>().expect("returns Struct");
215213
assert_eq!(out_struct.len(), 2);
216214
Ok(())

benchmarks/vector-search-bench/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
//! `vector-search-bench` vector similarity-search benchmark over several datasets.
55
66
pub mod compression;
7+
pub mod display;
78
pub mod expression;
89
pub mod ingest;
910
pub mod prepare;
11+
pub mod query;
12+
pub mod scan;
1013

1114
use std::sync::LazyLock;
1215

0 commit comments

Comments
 (0)