Skip to content

Commit 051bb91

Browse files
committed
feat: add clickhouse-bench for ClickBench Parquet benchmarks
Introduce a new clickhouse-bench benchmark crate that runs ClickBench queries against Parquet data via clickhouse-local, providing a baseline for comparing Vortex performance against ClickHouse. Key design decisions: - The ClickHouse binary path comes from the CLICKHOUSE_BINARY env var, which the build script captures at compile time; if it is unset, `clickhouse` is resolved from $PATH at runtime. CI downloads a pinned ClickHouse release tarball and exports CLICKHOUSE_BINARY before the binaries are built. - ClickHouse runs the same SQL query file as the other engines (ClickHouse handles quoted identifiers correctly). - Each query spawns a fresh clickhouse-local process for complete isolation (no cross-query caching, no warm-up effects). - CI workflows are updated to include the clickhouse:parquet target in ClickBench benchmarks and to conditionally build clickhouse-bench. Closes #6425 Signed-off-by: fastio <niclas@fastio.me>
1 parent 1c8667c commit 051bb91

12 files changed

Lines changed: 475 additions & 18 deletions

File tree

.github/scripts/run-sql-bench.sh

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,22 @@
22
# SPDX-License-Identifier: Apache-2.0
33
# SPDX-FileCopyrightText: Copyright the Vortex contributors
44
#
5-
# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets.
6-
# This script is used by the sql-benchmarks.yml workflow.
5+
# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench, clickhouse-bench)
6+
# for the given targets. This script is used by the sql-benchmarks.yml workflow.
77
#
88
# Usage:
99
# run-sql-bench.sh <subcommand> <targets> [options]
1010
#
1111
# Arguments:
1212
# subcommand The benchmark subcommand (e.g., tpch, clickbench, tpcds)
1313
# targets Comma-separated list of engine:format pairs
14-
# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet")
14+
# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet,clickhouse:parquet")
1515
#
1616
# Options:
1717
# --scale-factor <sf> Scale factor for the benchmark (e.g., 1.0, 10.0)
1818
# --iterations <n> Number of iterations to pass to each benchmark binary
1919
# --remote-storage <url> Remote storage URL (e.g., s3://bucket/path/)
20-
# If provided, runs in remote mode (no lance support).
20+
# If provided, runs in remote mode (no lance/clickhouse support).
2121
# --benchmark-id <id> Benchmark ID for error messages (e.g., tpch-s3)
2222

2323
set -Eeu -o pipefail
@@ -71,6 +71,13 @@ if $is_remote && echo "$targets" | grep -q 'lance'; then
7171
exit 1
7272
fi
7373

74+
# ClickHouse on remote storage is not supported. clickhouse-local reads local files only.
75+
if $is_remote && echo "$targets" | grep -q 'clickhouse:'; then
76+
echo "ERROR: ClickHouse benchmarks are not supported for remote storage."
77+
echo "Remove 'clickhouse:' targets for benchmark '${benchmark_id:-unknown}'."
78+
exit 1
79+
fi
80+
7481
# Extract formats for each engine from the targets string.
7582
# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
7683
#
@@ -84,6 +91,7 @@ fi
8491
df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
8592
ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
8693
has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false")
94+
has_clickhouse=$(echo "$targets" | grep -q 'clickhouse:' && echo "true" || echo "false")
8795

8896
# Build options string.
8997
opts=""
@@ -136,3 +144,14 @@ if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/l
136144

137145
cat lance-results.json >> results.json
138146
fi
147+
148+
# ClickHouse-bench only runs for local benchmarks (clickhouse-local reads local files).
149+
if ! $is_remote && [[ "$has_clickhouse" == "true" ]] && [[ -f "target/release_debug/clickhouse-bench" ]]; then
150+
# shellcheck disable=SC2086
151+
target/release_debug/clickhouse-bench "$subcommand" \
152+
-d gh-json \
153+
$opts \
154+
-o ch-results.json
155+
156+
cat ch-results.json >> results.json
157+
fi

.github/workflows/bench.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ jobs:
121121
"id": "clickbench-nvme",
122122
"subcommand": "clickbench",
123123
"name": "Clickbench on NVME",
124-
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
124+
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb,clickhouse:parquet",
125125
"build_lance": true
126126
},
127127
{

.github/workflows/sql-benchmarks.yml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ on:
2121
"id": "clickbench-nvme",
2222
"subcommand": "clickbench",
2323
"name": "Clickbench on NVME",
24-
"targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb"
24+
"targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb,clickhouse:parquet"
2525
},
2626
{
2727
"id": "tpch-nvme",
@@ -135,6 +135,16 @@ jobs:
135135
136136
- uses: ./.github/actions/system-info
137137

138+
- name: Install ClickHouse
139+
if: contains(matrix.targets, 'clickhouse:')
140+
env:
141+
CLICKHOUSE_VERSION: "25.8.18.1"
142+
run: |
143+
wget -qO- "https://github.com/ClickHouse/ClickHouse/releases/download/v${CLICKHOUSE_VERSION}-lts/clickhouse-common-static-${CLICKHOUSE_VERSION}-amd64.tgz" | tar xz
144+
cp clickhouse-common-static-${CLICKHOUSE_VERSION}/usr/bin/clickhouse .
145+
chmod +x clickhouse
146+
echo "CLICKHOUSE_BINARY=$PWD/clickhouse" >> $GITHUB_ENV
147+
138148
- name: Build binaries
139149
shell: bash
140150
env:
@@ -144,6 +154,9 @@ jobs:
144154
if [ "${{ matrix.build_lance }}" = "true" ]; then
145155
packages="$packages --bin lance-bench"
146156
fi
157+
if echo "${{ matrix.targets }}" | grep -q 'clickhouse:'; then
158+
packages="$packages --bin clickhouse-bench"
159+
fi
147160
cargo build $packages --profile release_debug
148161
149162
- name: Generate data

Cargo.lock

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ members = [
4949
"encodings/zstd",
5050
"encodings/bytebool",
5151
# Benchmarks
52+
"benchmarks/clickhouse-bench",
5253
"benchmarks/lance-bench",
5354
"benchmarks/compress-bench",
5455
"benchmarks/datafusion-bench",
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[package]
2+
name = "clickhouse-bench"
3+
description = "ClickHouse (clickhouse-local) benchmark runner for Vortex"
4+
authors.workspace = true
5+
edition.workspace = true
6+
homepage.workspace = true
7+
license.workspace = true
8+
readme.workspace = true
9+
repository.workspace = true
10+
rust-version.workspace = true
11+
version.workspace = true
12+
publish = false
13+
14+
[dependencies]
15+
anyhow = { workspace = true }
16+
clap = { workspace = true, features = ["derive"] }
17+
parking_lot = { workspace = true }
18+
tokio = { workspace = true }
19+
tracing = { workspace = true }
20+
vortex-bench = { workspace = true }
21+
22+
[lints]
23+
workspace = true
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Build script that exports the ClickHouse binary path.
5+
//!
6+
//! Resolution order:
7+
//! 1. `CLICKHOUSE_BINARY` env var (read when this build script runs, i.e. at build time) — use as-is.
8+
//! 2. Falls back to `"clickhouse"` (i.e., resolve from `$PATH` at runtime).
9+
//!
10+
//! Users must install ClickHouse themselves for local runs.
11+
//! In CI, it is installed via the workflow before the benchmark step.
12+
13+
/// Build-script entry point: publishes the ClickHouse binary location to the crate.
///
/// Emits two Cargo directives on stdout:
/// - `cargo:rerun-if-env-changed=CLICKHOUSE_BINARY`, so Cargo re-runs this script when the
///   variable changes.
/// - `cargo:rustc-env=CLICKHOUSE_BINARY=<value>`, which sets the variable for the crate's
///   compilation.
///
/// `<value>` is the build-time `CLICKHOUSE_BINARY` when it is set to a non-blank string, and
/// `"clickhouse"` otherwise.
fn main() {
    println!("cargo:rerun-if-env-changed=CLICKHOUSE_BINARY");

    // A blank value (e.g. a stray `CLICKHOUSE_BINARY=` in the environment) is treated like an
    // unset variable; otherwise an empty program name would be baked into the crate.
    // `std::env::var` also returns an error for non-Unicode values, which take the same fallback.
    let binary = std::env::var("CLICKHOUSE_BINARY")
        .ok()
        .filter(|value| !value.trim().is_empty())
        .unwrap_or_else(|| "clickhouse".to_string());
    println!("cargo:rustc-env=CLICKHOUSE_BINARY={binary}");
}

0 commit comments

Comments
 (0)