Skip to content

Commit ea9d592

Browse files
Merge branch 'develop' into ji/fsst-like-kernel
2 parents 9f2ff66 + ad7b09f commit ea9d592

6 files changed

Lines changed: 800 additions & 124 deletions

File tree

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ rust-version = "1.90"
8181
version = "0.1.0"
8282

8383
[workspace.dependencies]
84+
aho-corasick = "1.1.3"
8485
anyhow = "1.0.97"
8586
arbitrary = "1.3.2"
8687
arc-swap = "1.8"
@@ -121,6 +122,7 @@ cudarc = { version = "0.18.2", features = [
121122
"cuda-12050",
122123
] }
123124
custom-labels = "0.4.4"
125+
daachorse = "1.0.0"
124126
dashmap = "6.1.0"
125127
datafusion = { version = "52", default-features = false, features = ["sql"] }
126128
datafusion-catalog = { version = "52" }
@@ -155,6 +157,7 @@ indicatif = "0.18.0"
155157
insta = "1.43"
156158
inventory = "0.3.20"
157159
itertools = "0.14.0"
160+
jetscii = "0.5.3"
158161
jiff = "0.2.0"
159162
kanal = "0.1.1"
160163
lending-iterator = "0.1.7"
@@ -163,6 +166,7 @@ libloading = "0.8"
163166
liblzma = "0.4"
164167
log = { version = "0.4.21" }
165168
loom = { version = "0.7", features = ["checkpoint"] }
169+
memchr = "2.8.0"
166170
memmap2 = "0.9.5"
167171
mimalloc = "0.1.42"
168172
moka = { version = "0.12.10", default-features = false }
@@ -196,6 +200,7 @@ rand = "0.9.0"
196200
rand_distr = "0.5"
197201
ratatui = { version = "0.30", default-features = false }
198202
regex = "1.11.0"
203+
regex-automata = "0.4"
199204
reqwest = { version = "0.12.4", features = [
200205
"charset",
201206
"http2",

docs/developer-guide/benchmarking.md

Lines changed: 146 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,158 @@
11
# Benchmarking
22

33
Vortex has two categories of benchmarks: microbenchmarks for individual operations, and SQL
4-
benchmarks for end-to-end query performance. The `bench-orchestrator` tool coordinates running
5-
SQL benchmarks across different engines without compiling them all into a single binary.
4+
benchmarks for end-to-end query performance.
65

76
## Microbenchmarks
87

9-
Microbenchmarks use the Divan framework and live in `benches/` directories within individual
10-
crates. They cover low-level operations such as encoding, decoding, compute kernels, buffer
11-
operations, and scalar access.
8+
Microbenchmarks use the Divan framework and live in `benches/` directories within individual crates.
129

1310
Run microbenchmarks for a specific crate with:
1411

1512
```bash
1613
cargo bench -p <crate-name>
1714
```
1815

16+
## Best Practices
17+
18+
### Separate setup from profiled code
19+
20+
Always use `bencher.with_inputs(|| ...)` so fixture construction is excluded from timing:
21+
22+
```rust
23+
bencher
24+
.with_inputs(|| bench_fixture())
25+
.bench_refs(|(array, indices)| {
26+
array.take(indices.to_array()).unwrap()
27+
});
28+
```
29+
30+
### Exclude `Drop` from measurements
31+
32+
Divan measures only the closure body, **not** the `Drop` of its return value.
33+
Structure your benchmark so that expensive drops happen via the return value or
34+
via `bench_refs` inputs.
35+
36+
- **Return the value** from the closure — Divan will drop it after timing stops:
37+
38+
```rust
39+
bencher
40+
.with_inputs(|| make_big_vec())
41+
.bench_values(|v| transform(v)); // drop of the result is NOT timed
42+
```
43+
44+
- **Use `bench_refs`** — the input is dropped after the entire sample loop, not per-iteration:
45+
46+
```rust
47+
bencher
48+
.with_inputs(|| make_big_vec())
49+
.bench_refs(|v| v.sort()); // v is dropped outside the timed region
50+
```
51+
52+
Structure your benchmark so that expensive drops happen via the return value or via `bench_refs` inputs.
53+
54+
### Black-box inputs to prevent compiler optimization
55+
56+
The compiler can constant-fold or eliminate work if it can prove that inputs are known at
57+
compile time.
58+
59+
Values provided through `with_inputs` are automatically black-boxed by Divan — no action
60+
needed:
61+
62+
```rust
63+
// ✓ `array` and `indices` are automatically black-boxed by Divan
64+
bencher
65+
.with_inputs(|| (&prebuilt_array, &prebuilt_indices))
66+
.bench_refs(|(array, indices)| array.take(indices.to_array()).unwrap());
67+
```
68+
69+
### Captured variables
70+
71+
Variables captured from the surrounding scope are _not_ black-boxed. Wrap them with
72+
`divan::black_box()` or pass them through `with_inputs` instead:
73+
74+
```rust
75+
let array = make_array();
76+
77+
// ✗ `array` is captured — the compiler may optimize based on its known contents
78+
bencher.bench(|| process(&array));
79+
80+
// ✓ Option A: pass through with_inputs
81+
bencher
82+
.with_inputs(|| &array)
83+
.bench_refs(|array| process(array));
84+
85+
// ✓ Option B: explicit black_box on the capture
86+
bencher.bench(|| process(divan::black_box(&array)));
87+
```
88+
89+
### Return values and manual loops
90+
91+
Return values are automatically black-boxed. You only need explicit
92+
`black_box` for side-effect-free results inside manual loops:
93+
94+
```rust
95+
bencher.with_inputs(|| &array).bench_refs(|array| {
96+
for idx in 0..len {
97+
divan::black_box(array.scalar_at(idx).unwrap());
98+
}
99+
});
100+
```
101+
102+
### Use deterministic, seeded RNG
103+
104+
Always use `StdRng::seed_from_u64(N)` for reproducible data generation:
105+
106+
```rust
107+
let mut rng = StdRng::seed_from_u64(0);
108+
```
109+
110+
### Parameterize with `args`, `consts`, and `types`
111+
112+
Use Divan's parameterization features and define parameter arrays as named constants:
113+
114+
```rust
115+
const NUM_INDICES: &[usize] = &[1_000, 10_000, 100_000];
116+
const VECTOR_SIZE: &[usize] = &[16, 256, 2048, 8192];
117+
118+
#[divan::bench(args = NUM_INDICES, consts = VECTOR_SIZE)]
119+
fn my_bench<const N: usize>(bencher: Bencher, num_indices: usize) { ... }
120+
```
121+
122+
### Keep per-iteration execution time under ~1 ms
123+
124+
Each individual iteration of the benchmarked closure should complete in
125+
**less than 1 ms**. This is to keep benchmarks snappy, locally and on CI.
126+
127+
### Gate CodSpeed-incompatible benchmarks
128+
129+
Use `#[cfg(not(codspeed))]` for benchmarks that are incompatible with CodSpeed.
130+
131+
### CodSpeed's single-run model
132+
133+
CI benchmarks run under [CodSpeed's CPU simulation](https://codspeed.io/docs/instruments/cpu),
134+
which executes each benchmark **exactly once** and estimates CPU cycles from the instruction
135+
trace — including cache and memory access costs. This has several implications:
136+
137+
- **`sample_count` and `sample_size` have no effect** — CodSpeed always runs one iteration.
138+
- **Results are deterministic** — the simulated cycle count is derived from the instruction
139+
trace, not wall-clock time, so there is no noise from system load or scheduling.
140+
- **System calls are excluded** — CodSpeed only measures user-space code. Benchmarks that
141+
rely on I/O or kernel interactions will not reflect those costs, so they should use the
142+
[walltime instrument](https://codspeed.io/docs/instruments/walltime) or be gated with
143+
`#[cfg(not(codspeed))]`.
144+
145+
### Prefer `mimalloc` for throughput benchmarks
146+
147+
Throughput benchmarks should use `mimalloc` as the global allocator to reduce system allocator
148+
noise:
149+
150+
```rust
151+
use mimalloc::MiMalloc;
152+
#[global_allocator]
153+
static GLOBAL: MiMalloc = MiMalloc;
154+
```
155+
19156
## SQL Benchmarks
20157

21158
SQL benchmarks measure end-to-end query performance across different engines and file formats.
@@ -48,51 +185,11 @@ cargo run --release --bin duckdb-bench -- <benchmark>
48185

49186
## Orchestrator
50187

51-
The `bench-orchestrator` is a Python CLI tool (`vx-bench`) that coordinates running benchmarks
52-
across multiple engines. It builds and invokes the per-engine binaries, stores results, and
53-
provides comparison tooling. This avoids compiling all engines into a single binary, which
54-
would be slow and create dependency conflicts.
55-
56-
Install it with:
57-
58-
```bash
59-
uv tool install "bench_orchestrator @ ./bench-orchestrator/"
60-
```
61-
62-
### Running Benchmarks
63-
64-
```bash
65-
# Run TPC-H on DataFusion and DuckDB, comparing Parquet and Vortex
66-
vx-bench run tpch --engine datafusion,duckdb --format parquet,vortex
67-
68-
# Run a subset of queries with fewer iterations
69-
vx-bench run tpch -q 1,6,12 -i 3
70-
71-
# Run with memory tracking
72-
vx-bench run tpch --track-memory
73-
74-
# Run with CPU profiling
75-
vx-bench run tpch --samply
76-
```
77-
78-
### Comparing Results
79-
80-
```bash
81-
# Compare formats/engines within the most recent run
82-
vx-bench compare --run latest
83-
84-
# Compare across two labeled runs
85-
vx-bench compare --runs baseline,feature
86-
```
87-
88-
Comparison output is color-coded: green for improvements (>10%), yellow for neutral, red for
89-
regressions.
90-
91-
### Result Storage
188+
The `bench-orchestrator` is a Python CLI tool (`vx-bench`) that coordinates running SQL
189+
benchmarks across multiple engines, stores results, and provides comparison tooling.
92190

93-
Results are stored as JSON Lines files under `target/vortex-bench/runs/`, with each run
94-
containing metadata (git commit, timestamp, configuration) and per-query timing data. The
95-
`vx-bench list` command shows recent runs.
191+
See [`bench-orchestrator/README.md`](https://github.com/vortex-data/vortex/blob/develop/bench-orchestrator/README.md) for installation,
192+
commands, and example workflows.
96193

97194
## CI Benchmarks
98195

encodings/fsst/Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,17 @@ vortex-array = { workspace = true, features = ["_test-harness"] }
3838
[[bench]]
3939
name = "fsst_compress"
4040
harness = false
41+
required-features = ["_test-harness"]
42+
43+
[[bench]]
44+
name = "fsst_contains"
45+
harness = false
46+
required-features = ["_test-harness"]
4147

4248
[[bench]]
4349
name = "fsst_url_compare"
4450
harness = false
51+
required-features = ["_test-harness"]
4552

4653
[[bench]]
4754
name = "chunked_dict_fsst_builder"
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
#![allow(clippy::unwrap_used)]
5+
6+
use std::fmt;
7+
use std::sync::LazyLock;
8+
9+
use divan::Bencher;
10+
use vortex_array::Canonical;
11+
use vortex_array::IntoArray;
12+
use vortex_array::VortexSessionExecute;
13+
use vortex_array::arrays::ConstantArray;
14+
use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
15+
use vortex_array::scalar_fn::fns::like::Like;
16+
use vortex_array::scalar_fn::fns::like::LikeOptions;
17+
use vortex_array::session::ArraySession;
18+
use vortex_fsst::FSSTArray;
19+
use vortex_fsst::test_utils::NUM_STRINGS;
20+
use vortex_fsst::test_utils::make_fsst_clickbench_urls;
21+
use vortex_fsst::test_utils::make_fsst_emails;
22+
use vortex_fsst::test_utils::make_fsst_file_paths;
23+
use vortex_fsst::test_utils::make_fsst_json_strings;
24+
use vortex_fsst::test_utils::make_fsst_log_lines;
25+
use vortex_fsst::test_utils::make_fsst_rare_match;
26+
use vortex_fsst::test_utils::make_fsst_short_urls;
27+
use vortex_session::VortexSession;
28+
29+
fn main() {
30+
divan::main();
31+
}
32+
33+
static SESSION: LazyLock<VortexSession> =
34+
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
35+
36+
const N: usize = NUM_STRINGS;
37+
38+
static FSST_URLS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_short_urls(N));
39+
static FSST_CB_URLS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_clickbench_urls(N));
40+
static FSST_LOG_LINES: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_log_lines(N));
41+
static FSST_JSON_STRINGS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_json_strings(N));
42+
static FSST_FILE_PATHS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_file_paths(N));
43+
static FSST_EMAILS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_emails(N));
44+
static FSST_RARE_MATCH: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_rare_match(N));
45+
46+
/// Selects which pre-built FSST fixture (and matching `LIKE` pattern) a
/// benchmark run operates on.
enum Dataset {
    /// Short URLs fixture.
    Urls,
    /// ClickBench URLs fixture.
    Cb,
    /// Log lines fixture.
    Log,
    /// JSON strings fixture.
    Json,
    /// File paths fixture.
    Path,
    /// Email addresses fixture.
    Email,
    /// Rare-match fixture.
    Rare,
}
55+
56+
impl fmt::Display for Dataset {
57+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
58+
match self {
59+
Self::Urls => f.write_str("urls"),
60+
Self::Cb => f.write_str("cb"),
61+
Self::Log => f.write_str("log"),
62+
Self::Json => f.write_str("json"),
63+
Self::Path => f.write_str("path"),
64+
Self::Email => f.write_str("email"),
65+
Self::Rare => f.write_str("rare"),
66+
}
67+
}
68+
}
69+
70+
impl Dataset {
71+
fn fsst_array(&self) -> &'static FSSTArray {
72+
match self {
73+
Self::Urls => &FSST_URLS,
74+
Self::Cb => &FSST_CB_URLS,
75+
Self::Log => &FSST_LOG_LINES,
76+
Self::Json => &FSST_JSON_STRINGS,
77+
Self::Path => &FSST_FILE_PATHS,
78+
Self::Email => &FSST_EMAILS,
79+
Self::Rare => &FSST_RARE_MATCH,
80+
}
81+
}
82+
83+
fn pattern(&self) -> &'static str {
84+
match self {
85+
Self::Urls => "%google%",
86+
Self::Cb => "%yandex%",
87+
Self::Log => "%Googlebot%",
88+
Self::Json => "%enterprise%",
89+
Self::Path => "%target/release%",
90+
Self::Email => "%gmail%",
91+
Self::Rare => "%xyzzy%",
92+
}
93+
}
94+
}
95+
96+
#[divan::bench(args = [
97+
Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
98+
Dataset::Path, Dataset::Email, Dataset::Rare,
99+
])]
100+
fn fsst_like(bencher: Bencher, dataset: &Dataset) {
101+
let fsst = dataset.fsst_array();
102+
let len = fsst.len();
103+
let arr = fsst.clone().into_array();
104+
let pattern = ConstantArray::new(dataset.pattern(), len).into_array();
105+
bencher.bench_local(|| {
106+
Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()])
107+
.unwrap()
108+
.into_array()
109+
.execute::<Canonical>(&mut SESSION.create_execution_ctx())
110+
.unwrap()
111+
});
112+
}

0 commit comments

Comments
 (0)