Skip to content

Commit f793584

Browse files
authored
Add clickbench-sorted benchmark (#8559)
## Summary Adds a new benchmark that is a Clickbench derivative, using the same data sorted by `("EventDate", "EventTime", "WatchID")`, and only the subset of queries that use the sorting. It's worth noting that even with the extra time spent sorting the input data, it's still faster than the full clickbench run. ## Results Seems like it fails because there's no baseline, but the results are: ``` ┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓ ┃ Query ┃ datafusion:parquet (base) ┃ datafusion:vortex ┃ duckdb:parquet ┃ duckdb:vortex ┃ ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩ │ 23 │ 4.81s │ 674.3ms (0.14x) │ 138.5ms (0.03x) │ 178.7ms (0.04x) │ │ 24 │ 32.7ms │ 17.7ms (0.54x) │ 20.8ms (0.64x) │ 27.3ms (0.84x) │ │ 26 │ 32.5ms │ 16.3ms (0.50x) │ 19.0ms (0.58x) │ 47.5ms (1.46x) │ │ 36 │ 180.2ms │ 60.1ms (0.33x) │ 109.2ms (0.61x) │ 61.8ms (0.34x) │ │ 37 │ 111.7ms │ 44.4ms (0.40x) │ 97.0ms (0.87x) │ 50.6ms (0.45x) │ │ 38 │ 167.0ms │ 49.6ms (0.30x) │ 95.6ms (0.57x) │ 55.4ms (0.33x) │ │ 39 │ 301.0ms │ 119.7ms (0.40x) │ 182.3ms (0.61x) │ 116.9ms (0.39x) │ │ 40 │ 65.7ms │ 21.6ms (0.33x) │ 41.0ms (0.62x) │ 28.6ms (0.44x) │ │ 41 │ 62.0ms │ 19.5ms (0.32x) │ 40.0ms (0.64x) │ 26.9ms (0.43x) │ │ 42 │ 32.3ms │ 14.7ms (0.46x) │ 29.2ms (0.90x) │ 22.8ms (0.71x) │ └───────┴───────────────────────────┴───────────────────┴─────────────────┴─────────────────┘ ``` --------- Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent 638f0c1 commit f793584

9 files changed

Lines changed: 398 additions & 17 deletions

File tree

.github/workflows/sql-benchmarks.yml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,29 @@ on:
3939
{"engine": "duckdb", "format": "duckdb"}
4040
]
4141
},
42+
{
43+
"id": "clickbench-sorted-nvme",
44+
"subcommand": "clickbench-sorted",
45+
"name": "Clickbench Sorted on NVME",
46+
"data_formats": ["parquet", "vortex", "vortex-compact", "duckdb"],
47+
"pr_targets": [
48+
{"engine": "datafusion", "format": "parquet"},
49+
{"engine": "datafusion", "format": "vortex"},
50+
{"engine": "duckdb", "format": "parquet"},
51+
{"engine": "duckdb", "format": "vortex"},
52+
{"engine": "duckdb", "format": "duckdb"}
53+
],
54+
"develop_targets": [
55+
{"engine": "datafusion", "format": "parquet"},
56+
{"engine": "datafusion", "format": "vortex"},
57+
{"engine": "datafusion", "format": "vortex-compact"},
58+
{"engine": "datafusion", "format": "lance"},
59+
{"engine": "duckdb", "format": "parquet"},
60+
{"engine": "duckdb", "format": "vortex"},
61+
{"engine": "duckdb", "format": "vortex-compact"},
62+
{"engine": "duckdb", "format": "duckdb"}
63+
]
64+
},
4265
{
4366
"id": "tpch-nvme",
4467
"subcommand": "tpch",
@@ -302,6 +325,25 @@ on:
302325
{"engine": "duckdb", "format": "vortex"}
303326
]
304327
},
328+
{
329+
"id": "clickbench-sorted-nvme",
330+
"subcommand": "clickbench-sorted",
331+
"name": "Clickbench Sorted on NVME",
332+
"data_formats": ["parquet", "vortex"],
333+
"pr_targets": [
334+
{"engine": "datafusion", "format": "parquet"},
335+
{"engine": "datafusion", "format": "vortex"},
336+
{"engine": "duckdb", "format": "parquet"},
337+
{"engine": "duckdb", "format": "vortex"}
338+
],
339+
"develop_targets": [
340+
{"engine": "datafusion", "format": "parquet"},
341+
{"engine": "datafusion", "format": "vortex"},
342+
{"engine": "datafusion", "format": "lance"},
343+
{"engine": "duckdb", "format": "parquet"},
344+
{"engine": "duckdb", "format": "vortex"}
345+
]
346+
},
305347
{
306348
"id": "tpch-nvme",
307349
"subcommand": "tpch",

bench-orchestrator/bench_orchestrator/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class Benchmark(Enum):
4646
TPCH = "tpch"
4747
TPCDS = "tpcds"
4848
CLICKBENCH = "clickbench"
49+
CLICKBENCH_SORTED = "clickbench-sorted"
4950
FINEWEB = "fineweb"
5051
GHARCHIVE = "gh-archive"
5152
POLARSIGNALS = "polarsignals"

benchmarks-website/src/config.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,15 @@ export const QUERY_SUITES = [
1414
tags: ["Queries (NVMe)"],
1515
hiddenDatasets: ["datafusion:lance"],
1616
},
17+
{
18+
prefix: "clickbench-sorted",
19+
displayName: "Clickbench Sorted",
20+
queryPrefix: "CLICKBENCH SORTED",
21+
description:
22+
"ClickBench queries over data globally sorted by event date and event time",
23+
tags: ["Queries (NVMe)"],
24+
hiddenDatasets: ["datafusion:lance"],
25+
},
1726
{
1827
prefix: "statpopgen",
1928
displayName: "Statistical and Population Genetics",

benchmarks-website/src/utils.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ export function getBenchmarkDescription(categoryName) {
9797
'Compression': 'Measures encoding and decoding throughput (MB/s) for Vortex and Parquet files',
9898
'Compression Size': 'Compares compressed file sizes across different encoding strategies',
9999
'Clickbench': "ClickHouse's analytical benchmark suite on web analytics data",
100+
'Clickbench Sorted': 'ClickBench queries over data globally sorted by event date and event time',
100101
'Statistical and Population Genetics': 'Statistical and population genetics queries on gnomAD dataset',
101102
};
102103
return descriptions[categoryName] || '';

vortex-bench/src/clickbench/benchmark.rs

Lines changed: 75 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -41,21 +41,41 @@ impl ClickBenchBenchmark {
4141
}
4242
}
4343

44+
/// ClickBench sorted by event date and event time.
///
/// Only the queries whose ids appear in `CLICKBENCH_SORTED_QUERY_IDS` are run
/// (see the `Benchmark::queries` implementation for this type).
45+
pub struct ClickBenchSortedBenchmark {
46+
/// Optional path to a query file. When `None`, the `clickbench_queries.sql`
/// file under `CARGO_MANIFEST_DIR` is used.
pub queries_file: Option<String>,
47+
/// Location of the benchmark data, as produced by `resolve_data_url` in `new`.
pub data_url: Url,
48+
}
49+
50+
impl ClickBenchSortedBenchmark {
51+
/// Create the sorted ClickBench benchmark, optionally using a remote data directory.
///
/// # Errors
///
/// Propagates any error returned by `resolve_data_url`.
52+
pub fn new(use_remote_data_dir: Option<String>) -> Result<Self> {
53+
Ok(Self {
54+
// No override here: `None` makes query loading fall back to the default query file.
queries_file: None,
55+
// NOTE(review): presumably yields a `file://` URL when no remote dir is given — confirm
// against `resolve_data_url`.
data_url: resolve_data_url(use_remote_data_dir.as_deref(), CLICKBENCH_SORTED_NAME)?,
56+
})
57+
}
58+
}
59+
60+
/// Load ClickBench queries and pair each one with its zero-based position in the file.
///
/// `queries_file` is an optional path to the query file; when it is `None`, the
/// `clickbench_queries.sql` file under `CARGO_MANIFEST_DIR` is read instead.
///
/// The file contents are split on `;`, each piece is trimmed, and empty pieces are
/// dropped before numbering, so the index counts only non-empty statements.
///
/// # Errors
///
/// Returns an error if the query file cannot be read.
fn read_clickbench_queries(queries_file: Option<&str>) -> Result<Vec<(usize, String)>> {
61+
let queries_filepath = match queries_file {
62+
Some(file) => file.into(),
63+
None => Path::new(env!("CARGO_MANIFEST_DIR")).join("clickbench_queries.sql"),
64+
};
65+
66+
// NOTE(review): a plain split on ';' would also split a statement containing a
// semicolon inside a string literal — assumes the query file has none; confirm.
Ok(fs::read_to_string(queries_filepath)?
67+
.split(';')
68+
.map(|s| s.trim())
69+
.filter(|s| !s.is_empty())
70+
.map(|s| s.to_string())
71+
// Number the statements after filtering so indices are contiguous.
.enumerate()
72+
.collect())
73+
}
74+
4475
#[async_trait::async_trait]
4576
impl Benchmark for ClickBenchBenchmark {
4677
fn queries(&self) -> Result<Vec<(usize, String)>> {
47-
let queries_filepath = match &self.queries_file {
48-
Some(file) => file.into(),
49-
None => Path::new(env!("CARGO_MANIFEST_DIR")).join("clickbench_queries.sql"),
50-
};
51-
52-
Ok(fs::read_to_string(queries_filepath)?
53-
.split(';')
54-
.map(|s| s.trim())
55-
.filter(|s| !s.is_empty())
56-
.map(|s| s.to_string())
57-
.enumerate()
58-
.collect())
78+
read_clickbench_queries(self.queries_file.as_deref())
5979
}
6080

6181
async fn generate_base_data(&self) -> Result<()> {
@@ -70,10 +90,7 @@ impl Benchmark for ClickBenchBenchmark {
7090
}
7191

7292
fn expected_row_counts(&self) -> Option<Vec<usize>> {
73-
Some(vec![
74-
1, 1, 1, 1, 1, 1, 1, 18, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 1, 10, 10, 10,
75-
10, 10, 10, 25, 25, 1, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
76-
])
93+
Some(clickbench_expected_row_counts())
7794
}
7895

7996
fn dataset(&self) -> BenchmarkDataset {
@@ -99,6 +116,48 @@ impl Benchmark for ClickBenchBenchmark {
99116
}
100117
}
101118

119+
/// `Benchmark` implementation for the sorted ClickBench variant: the subset of
/// ClickBench queries listed in `CLICKBENCH_SORTED_QUERY_IDS`, run over the `hits` table.
#[async_trait::async_trait]
120+
impl Benchmark for ClickBenchSortedBenchmark {
121+
/// Returns the queries whose index is in `CLICKBENCH_SORTED_QUERY_IDS`.
///
/// Filtering happens after the full file has been numbered, so each query keeps
/// the index it has in the complete query list.
fn queries(&self) -> Result<Vec<(usize, String)>> {
122+
Ok(read_clickbench_queries(self.queries_file.as_deref())?
123+
.into_iter()
124+
.filter(|(idx, _)| CLICKBENCH_SORTED_QUERY_IDS.contains(idx))
125+
.collect())
126+
}
127+
128+
/// Generates the sorted base data, but only when the data URL uses the `file` scheme.
async fn generate_base_data(&self) -> Result<()> {
129+
// Any non-`file` data URL: nothing is generated here.
if self.data_url.scheme() != "file" {
130+
return Ok(());
131+
}
132+
133+
generate_sorted_clickbench(CLICKBENCH_SORTED_NAME.to_data_path()).await
134+
}
135+
136+
/// Returns the row counts from `clickbench_expected_row_counts()`.
///
/// NOTE(review): presumably this list is indexed by the original query index, which
/// `queries` preserves — confirm against the caller that checks row counts.
fn expected_row_counts(&self) -> Option<Vec<usize>> {
137+
Some(clickbench_expected_row_counts())
138+
}
139+
140+
/// Identifies this benchmark as `BenchmarkDataset::ClickBenchSorted`.
fn dataset(&self) -> BenchmarkDataset {
141+
BenchmarkDataset::ClickBenchSorted
142+
}
143+
144+
/// Returns `CLICKBENCH_SORTED_NAME`.
fn dataset_name(&self) -> &str {
145+
CLICKBENCH_SORTED_NAME
146+
}
147+
148+
/// Returns `CLICKBENCH_SORTED_NAME` as an owned string.
fn dataset_display(&self) -> String {
149+
CLICKBENCH_SORTED_NAME.to_string()
150+
}
151+
152+
/// Returns the data URL resolved when the benchmark was constructed.
fn data_url(&self) -> &Url {
153+
&self.data_url
154+
}
155+
156+
/// Declares a single table, `hits`, with the `HITS_SCHEMA` schema.
fn table_specs(&self) -> Vec<TableSpec> {
157+
vec![TableSpec::new("hits", Some(HITS_SCHEMA.clone()))]
158+
}
159+
}
160+
102161
/// Builds the name `clickbench_<flavor>`, formatting `flavor` with its `Display` output.
fn clickbench_flavor(flavor: Flavor) -> String {
103162
format!("clickbench_{flavor}")
104163
}

0 commit comments

Comments
 (0)