Merge branch 'develop' into ji/fsst-like-kernel

joseph-isaacs · web-flow · commit ea9d592acc27 · 2026-03-13T11:52:17.000Z
diff --git a/Cargo.toml b/Cargo.toml
@@ -81,6 +81,7 @@ rust-version = "1.90"
 version = "0.1.0"
 
 [workspace.dependencies]
+aho-corasick = "1.1.3"
 anyhow = "1.0.97"
 arbitrary = "1.3.2"
 arc-swap = "1.8"
@@ -121,6 +122,7 @@ cudarc = { version = "0.18.2", features = [
     "cuda-12050",
 ] }
 custom-labels = "0.4.4"
+daachorse = "1.0.0"
 dashmap = "6.1.0"
 datafusion = { version = "52", default-features = false, features = ["sql"] }
 datafusion-catalog = { version = "52" }
@@ -155,6 +157,7 @@ indicatif = "0.18.0"
 insta = "1.43"
 inventory = "0.3.20"
 itertools = "0.14.0"
+jetscii = "0.5.3"
 jiff = "0.2.0"
 kanal = "0.1.1"
 lending-iterator = "0.1.7"
@@ -163,6 +166,7 @@ libloading = "0.8"
 liblzma = "0.4"
 log = { version = "0.4.21" }
 loom = { version = "0.7", features = ["checkpoint"] }
+memchr = "2.8.0"
 memmap2 = "0.9.5"
 mimalloc = "0.1.42"
 moka = { version = "0.12.10", default-features = false }
@@ -196,6 +200,7 @@ rand = "0.9.0"
 rand_distr = "0.5"
 ratatui = { version = "0.30", default-features = false }
 regex = "1.11.0"
+regex-automata = "0.4"
 reqwest = { version = "0.12.4", features = [
     "charset",
     "http2",
diff --git a/docs/developer-guide/benchmarking.md b/docs/developer-guide/benchmarking.md
@@ -1,21 +1,158 @@
 # Benchmarking
 
 Vortex has two categories of benchmarks: microbenchmarks for individual operations, and SQL
-benchmarks for end-to-end query performance. The `bench-orchestrator` tool coordinates running
-SQL benchmarks across different engines without compiling them all into a single binary.
+benchmarks for end-to-end query performance.
 
 ## Microbenchmarks
 
-Microbenchmarks use the Divan framework and live in `benches/` directories within individual
-crates. They cover low-level operations such as encoding, decoding, compute kernels, buffer
-operations, and scalar access.
+Microbenchmarks use the Divan framework and live in `benches/` directories within individual crates.
 
 Run microbenchmarks for a specific crate with:
 
 ```bash
 cargo bench -p <crate-name>
 ```
 
+## Best Practices
+
+### Separate setup from profiled code
+
+Always use `bencher.with_inputs(|| ...)` so fixture construction is excluded from timing:
+
+```rust
+bencher
+    .with_inputs(|| bench_fixture())
+    .bench_refs(|(array, indices)| {
+        array.take(indices.to_array()).unwrap()
+    });
+```
+
+### Exclude `Drop` from measurements
+
+Divan measures only the closure body, **not** the `Drop` of its return value.
+There are two ways to keep an expensive drop out of the timed region:
+
+- **Return the value** from the closure — Divan will drop it after timing stops:
+
+  ```rust
+  bencher
+      .with_inputs(|| make_big_vec())
+      .bench_values(|v| transform(v))  // drop of the result is NOT timed
+  ```
+
+- **Use `bench_refs`** — the input is dropped after the entire sample loop, not per-iteration:
+
+  ```rust
+  bencher
+      .with_inputs(|| make_big_vec())
+      .bench_refs(|v| v.sort())  // v is dropped outside the timed region
+  ```
+
+Structure your benchmark so that expensive drops happen via the return value or via `bench_refs` inputs.
+
+### Black-box inputs to prevent compiler optimization
+
+The compiler can constant-fold or eliminate work if it can prove that inputs are known at
+compile time.
+
+Values provided through `with_inputs` are automatically black-boxed by Divan — no action
+needed:
+
+```rust
+// ✓ `array` and `indices` are automatically black-boxed by Divan
+bencher
+    .with_inputs(|| (&prebuilt_array, &prebuilt_indices))
+    .bench_refs(|(array, indices)| array.take(indices.to_array()).unwrap());
+```
+
+### Captured variables
+
+Variables captured from the surrounding scope are _not_ black-boxed. Wrap them with
+`divan::black_box()` or pass them through `with_inputs` instead:
+
+```rust
+let array = make_array();
+
+// ✗ `array` is captured — the compiler may optimize based on its known contents
+bencher.bench(|| process(&array));
+
+// ✓ Option A: pass through with_inputs
+bencher
+    .with_inputs(|| &array)
+    .bench_refs(|array| process(array));
+
+// ✓ Option B: explicit black_box on the capture
+bencher.bench(|| process(divan::black_box(&array)));
+```
+
+### Return values and manual loops
+
+Return values are automatically black-boxed. You only need explicit
+`black_box` for side-effect-free results inside manual loops:
+
+```rust
+bencher.with_inputs(|| &array).bench_refs(|array| {
+    for idx in 0..len {
+        divan::black_box(array.scalar_at(idx).unwrap());
+    }
+});
+```
+
+### Use deterministic, seeded RNG
+
+Always use `StdRng::seed_from_u64(N)` for reproducible data generation:
+
+```rust
+let mut rng = StdRng::seed_from_u64(0);
+```
+
+### Parameterize with `args`, `consts`, and `types`
+
+Use Divan's parameterization features and define parameter arrays as named constants:
+
+```rust
+const NUM_INDICES: &[usize] = &[1_000, 10_000, 100_000];
+const VECTOR_SIZE: &[usize] = &[16, 256, 2048, 8192];
+
+#[divan::bench(args = NUM_INDICES, consts = VECTOR_SIZE)]
+fn my_bench<const N: usize>(bencher: Bencher, num_indices: usize) { ... }
+```
+
+### Keep per-iteration execution time under ~1 ms
+
+Each individual iteration of the benchmarked closure should complete in
+**less than 1 ms**. This keeps benchmark runs fast, both locally and on CI.
+
+### Gate CodSpeed-incompatible benchmarks
+
+Use `#[cfg(not(codspeed))]` for benchmarks that are incompatible with CodSpeed.
+
+### CodSpeed's single-run model
+
+CI benchmarks run under [CodSpeed's CPU simulation](https://codspeed.io/docs/instruments/cpu),
+which executes each benchmark **exactly once** and estimates CPU cycles from the instruction
+trace — including cache and memory access costs. This has several implications:
+
+- **`sample_count` and `sample_size` have no effect** — CodSpeed always runs one iteration.
+- **Results are deterministic** — the simulated cycle count is derived from the instruction
+  trace, not wall-clock time, so there is no noise from system load or scheduling.
+- **System calls are excluded** — CodSpeed only measures user-space code. Benchmarks that
+  rely on I/O or kernel interactions will not reflect those costs, so they should use the
+  [walltime instrument](https://codspeed.io/docs/instruments/walltime) or be gated with
+  `#[cfg(not(codspeed))]`.
+
+### Prefer `mimalloc` for throughput benchmarks
+
+Throughput benchmarks should use `mimalloc` as the global allocator to reduce system allocator
+noise:
+
+```rust
+use mimalloc::MiMalloc;
+#[global_allocator]
+static GLOBAL: MiMalloc = MiMalloc;
+```
+
 ## SQL Benchmarks
 
 SQL benchmarks measure end-to-end query performance across different engines and file formats.
@@ -48,51 +185,11 @@ cargo run --release --bin duckdb-bench -- <benchmark>
 
 ## Orchestrator
 
-The `bench-orchestrator` is a Python CLI tool (`vx-bench`) that coordinates running benchmarks
-across multiple engines. It builds and invokes the per-engine binaries, stores results, and
-provides comparison tooling. This avoids compiling all engines into a single binary, which
-would be slow and create dependency conflicts.
-
-Install it with:
-
-```bash
-uv tool install "bench_orchestrator @ ./bench-orchestrator/"
-```
-
-### Running Benchmarks
-
-```bash
-# Run TPC-H on DataFusion and DuckDB, comparing Parquet and Vortex
-vx-bench run tpch --engine datafusion,duckdb --format parquet,vortex
-
-# Run a subset of queries with fewer iterations
-vx-bench run tpch -q 1,6,12 -i 3
-
-# Run with memory tracking
-vx-bench run tpch --track-memory
-
-# Run with CPU profiling
-vx-bench run tpch --samply
-```
-
-### Comparing Results
-
-```bash
-# Compare formats/engines within the most recent run
-vx-bench compare --run latest
-
-# Compare across two labeled runs
-vx-bench compare --runs baseline,feature
-```
-
-Comparison output is color-coded: green for improvements (>10%), yellow for neutral, red for
-regressions.
-
-### Result Storage
+The `bench-orchestrator` is a Python CLI tool (`vx-bench`) that coordinates running SQL
+benchmarks across multiple engines, stores results, and provides comparison tooling.
 
-Results are stored as JSON Lines files under `target/vortex-bench/runs/`, with each run
-containing metadata (git commit, timestamp, configuration) and per-query timing data. The
-`vx-bench list` command shows recent runs.
+See [`bench-orchestrator/README.md`](https://github.com/vortex-data/vortex/blob/develop/bench-orchestrator/README.md) for installation,
+commands, and example workflows.
 
 ## CI Benchmarks
 
diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml
@@ -38,10 +38,17 @@ vortex-array = { workspace = true, features = ["_test-harness"] }
 [[bench]]
 name = "fsst_compress"
 harness = false
+required-features = ["_test-harness"]
+
+[[bench]]
+name = "fsst_contains"
+harness = false
+required-features = ["_test-harness"]
 
 [[bench]]
 name = "fsst_url_compare"
 harness = false
+required-features = ["_test-harness"]
 
 [[bench]]
 name = "chunked_dict_fsst_builder"
diff --git a/encodings/fsst/benches/fsst_contains.rs b/encodings/fsst/benches/fsst_contains.rs
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![allow(clippy::unwrap_used)]
+
+use std::fmt;
+use std::sync::LazyLock;
+
+use divan::Bencher;
+use vortex_array::Canonical;
+use vortex_array::IntoArray;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
+use vortex_array::scalar_fn::fns::like::Like;
+use vortex_array::scalar_fn::fns::like::LikeOptions;
+use vortex_array::session::ArraySession;
+use vortex_fsst::FSSTArray;
+use vortex_fsst::test_utils::NUM_STRINGS;
+use vortex_fsst::test_utils::make_fsst_clickbench_urls;
+use vortex_fsst::test_utils::make_fsst_emails;
+use vortex_fsst::test_utils::make_fsst_file_paths;
+use vortex_fsst::test_utils::make_fsst_json_strings;
+use vortex_fsst::test_utils::make_fsst_log_lines;
+use vortex_fsst::test_utils::make_fsst_rare_match;
+use vortex_fsst::test_utils::make_fsst_short_urls;
+use vortex_session::VortexSession;
+
+/// Benchmark entry point: hands control to Divan's runner.
+fn main() {
+    divan::main()
+}
+
+/// Number of strings requested from every fixture generator below.
+const N: usize = NUM_STRINGS;
+
+/// Session shared by all benchmarks: an empty session extended with `ArraySession`.
+/// Execution contexts for the benchmarked expression are created from it.
+static SESSION: LazyLock<VortexSession> =
+    LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+
+// One FSST fixture per dataset. Each is built on first access and reused afterwards.
+static FSST_URLS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_short_urls(N));
+static FSST_CB_URLS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_clickbench_urls(N));
+static FSST_LOG_LINES: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_log_lines(N));
+static FSST_JSON_STRINGS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_json_strings(N));
+static FSST_FILE_PATHS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_file_paths(N));
+static FSST_EMAILS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_emails(N));
+static FSST_RARE_MATCH: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_rare_match(N));
+
+/// The string corpora exercised by the `fsst_like` benchmark.
+///
+/// Each variant selects one lazily built FSST fixture and the `LIKE` pattern searched in it.
+enum Dataset {
+    /// `make_fsst_short_urls` fixture, searched with `%google%`.
+    Urls,
+    /// `make_fsst_clickbench_urls` fixture, searched with `%yandex%`.
+    Cb,
+    /// `make_fsst_log_lines` fixture, searched with `%Googlebot%`.
+    Log,
+    /// `make_fsst_json_strings` fixture, searched with `%enterprise%`.
+    Json,
+    /// `make_fsst_file_paths` fixture, searched with `%target/release%`.
+    Path,
+    /// `make_fsst_emails` fixture, searched with `%gmail%`.
+    Email,
+    /// `make_fsst_rare_match` fixture, searched with `%xyzzy%`.
+    Rare,
+}
+
+/// Short lowercase label for each dataset; Divan shows it as the benchmark argument name.
+impl fmt::Display for Dataset {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let label = match self {
+            Dataset::Urls => "urls",
+            Dataset::Cb => "cb",
+            Dataset::Log => "log",
+            Dataset::Json => "json",
+            Dataset::Path => "path",
+            Dataset::Email => "email",
+            Dataset::Rare => "rare",
+        };
+        f.write_str(label)
+    }
+}
+
+impl Dataset {
+    /// Returns the shared FSST fixture for this dataset, building it on first access.
+    fn fsst_array(&self) -> &'static FSSTArray {
+        let fixture: &'static LazyLock<FSSTArray> = match self {
+            Dataset::Urls => &FSST_URLS,
+            Dataset::Cb => &FSST_CB_URLS,
+            Dataset::Log => &FSST_LOG_LINES,
+            Dataset::Json => &FSST_JSON_STRINGS,
+            Dataset::Path => &FSST_FILE_PATHS,
+            Dataset::Email => &FSST_EMAILS,
+            Dataset::Rare => &FSST_RARE_MATCH,
+        };
+        LazyLock::force(fixture)
+    }
+
+    /// Returns the `LIKE` pattern (a `%`-wrapped substring) searched for in this dataset.
+    fn pattern(&self) -> &'static str {
+        match self {
+            Dataset::Urls => "%google%",
+            Dataset::Cb => "%yandex%",
+            Dataset::Log => "%Googlebot%",
+            Dataset::Json => "%enterprise%",
+            Dataset::Path => "%target/release%",
+            Dataset::Email => "%gmail%",
+            Dataset::Rare => "%xyzzy%",
+        }
+    }
+}
+
+/// Benchmarks a `LIKE '%needle%'` scan over each FSST-compressed dataset.
+///
+/// The FSST array, the constant pattern array and the shared session are all prepared before
+/// the timed region. The two arrays are handed over through `with_inputs`, so their clones are
+/// excluded from timing and Divan black-boxes them. Only building the `Like` expression and
+/// executing it to `Canonical` is measured; the result is returned so its `Drop` is not timed.
+#[divan::bench(args = [
+    Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
+    Dataset::Path, Dataset::Email, Dataset::Rare,
+])]
+fn fsst_like(bencher: Bencher, dataset: &Dataset) {
+    let fsst = dataset.fsst_array();
+    let len = fsst.len();
+    let arr = fsst.clone().into_array();
+    let pattern = ConstantArray::new(dataset.pattern(), len).into_array();
+    // Force the lazily initialised session now, so its one-time construction is never part
+    // of the first measured iteration.
+    let session: &VortexSession = &SESSION;
+    bencher
+        // Setup: clone the two child arrays outside the timed region.
+        .with_inputs(|| [arr.clone(), pattern.clone()])
+        // The `_local` variant mirrors the `bench_local` call this replaces.
+        .bench_local_values(|children| {
+            Like.try_new_array(len, LikeOptions::default(), children)
+                .unwrap()
+                .into_array()
+                .execute::<Canonical>(&mut session.create_execution_ctx())
+                .unwrap()
+        });
+}
diff --git a/encodings/fsst/benches/fsst_url_compare.rs b/encodings/fsst/benches/fsst_url_compare.rs
diff --git a/encodings/fsst/src/test_utils.rs b/encodings/fsst/src/test_utils.rs