-
Notifications
You must be signed in to change notification settings - Fork 56
Expand file tree
/
Copy pathgroup_by.rs
More file actions
82 lines (73 loc) · 2.89 KB
/
Copy pathgroup_by.rs
File metadata and controls
82 lines (73 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
//! W8 — `SELECT k, COUNT(*) FROM big GROUP BY k` at three cardinalities.
//!
//! Three buckets:
//! - **k_10** — 10 distinct groups; lots of rows per group, the
//! hash-aggregator is mostly in cache.
//! - **k_1k** — 1k distinct groups.
//! - **k_100k** — 100k distinct groups; only about 10 input rows per
//!   group, the high-cardinality stress test.
//!
//! All three queries scan the same 1M-row `big` table. The shape is
//! "scan + group + count" — no WHERE, no ORDER BY, no LIMIT — so
//! the comparison is "engine's hash-aggregator vs SQLite's sort-then-
//! aggregate / hash-aggregate planner choice".
//!
//! Per-iter cost is dominated by:
//! - parse + plan (every iter for SQLRite; cached on SQLite)
//! - full table scan
//! - hash insertion / counter increment per row
//! - result materialization (10 / 1k / 100k rows depending on bucket)
use std::path::Path;
use anyhow::{Context, Result};
use crate::data::{GROUP_B_ROW_COUNT, GroupBDataset, group_b_dataset};
use crate::workloads::aggregate as w7;
use crate::{Driver, WorkloadId};
/// Identity record for this workload: id `"W8"`, name `"group-by"`,
/// and a version tag (`"v2"`).
pub const W8: WorkloadId = WorkloadId {
id: "W8",
name: "group-by",
version: "v2",
};
/// Cardinality buckets, one tuple per GROUP BY variant:
/// `(label, group-key column, expected group count)`.
///
/// The column names match the `k_*` columns of the `big` table created
/// in [`setup`]; the expected count is the value [`correctness_check`]
/// compares the number of returned groups against.
pub const BUCKETS: [(&str, &str, usize); 3] = [
("card-10", "k_10", 10),
("card-1k", "k_1k", 1_000),
("card-100k", "k_100k", 100_000),
];
/// Prepares the W8 fixture: opens the database at `path`, creates the
/// `big` table, and fills it with the dataset from `group_b_dataset()`.
///
/// Rows are loaded through W7's `insert_rows`, so the schema declared
/// here is expected to stay in sync with that workload's `big` table.
/// The connection is handed back together with the dataset so the
/// caller can run every cardinality bucket against the same table.
///
/// # Errors
///
/// Propagates any failure from opening the database, creating the
/// table, or inserting the rows.
pub fn setup<D: Driver>(driver: &D, path: &Path) -> Result<(D::Conn, GroupBDataset)> {
    const CREATE_BIG: &str = "CREATE TABLE big (id INTEGER PRIMARY KEY, v INTEGER, k_10 INTEGER, k_1k INTEGER, k_100k INTEGER)";

    let mut connection = driver.open(path)?;
    driver.execute(&mut connection, CREATE_BIG)?;

    let rows = group_b_dataset();
    w7::insert_rows(driver, &mut connection, &rows)?;

    Ok((connection, rows))
}
/// Renders the W8 query text for one group-key column.
///
/// `bucket` is spliced into the statement verbatim as a column name
/// (no quoting or escaping is applied), so it must be a trusted
/// identifier such as the column entries of [`BUCKETS`].
pub fn select_sql(bucket: &str) -> String {
    format!(
        "SELECT {column}, COUNT(*) FROM big GROUP BY {column}",
        column = bucket
    )
}
/// A single benchmark iteration for one group-key column.
///
/// Builds the GROUP BY statement for `bucket` via [`select_sql`], runs
/// it through `driver.query_all` with an empty argument list, and
/// returns how many rows came back — one row per group.
///
/// # Errors
///
/// Propagates whatever error `driver.query_all` reports.
pub fn bench_iter<D: Driver>(driver: &D, conn: &mut D::Conn, bucket: &str) -> Result<usize> {
    let groups = driver.query_all(conn, &select_sql(bucket), &[])?;
    Ok(groups.len())
}
/// Correctness gate for W8.
///
/// Runs the GROUP BY once per entry in [`BUCKETS`] and checks that the
/// number of groups returned equals the expected count for that entry.
/// Only the group count is verified, not the per-group `COUNT(*)`
/// values.
///
/// # Errors
///
/// Returns an error naming the failing bucket if the query fails or if
/// the group count differs from the expected value.
pub fn correctness_check<D: Driver>(driver: &D, conn: &mut D::Conn) -> Result<()> {
    // Compile-time guard: with fewer than 100k rows the dataset could
    // not contain 100k distinct group keys.
    const _: () = assert!(GROUP_B_ROW_COUNT >= 100_000);

    for (label, bucket, expected) in BUCKETS {
        let got = bench_iter(driver, conn, bucket)
            .with_context(|| format!("W8 correctness {label}"))?;
        anyhow::ensure!(
            got == expected,
            "W8 correctness ({label}): GROUP BY {bucket} returned {got} groups, expected {expected}"
        );
    }

    Ok(())
}