-
Notifications
You must be signed in to change notification settings - Fork 2.1k
chore: Add existence (semi / anti ) benchmarks for hashjoinexec #21821
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a9d6d68
6eb40b7
57cfc3e
ad83914
4cfb0c5
499c5d7
436cce4
eff28d9
89307de
e483e59
1ee87f7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,8 +25,6 @@ use std::path::PathBuf; | |
|
|
||
| use futures::StreamExt; | ||
|
|
||
| // TODO: Add existence joins | ||
|
|
||
| /// Run the Hash Join benchmark | ||
| /// | ||
| /// This micro-benchmark focuses on the performance characteristics of Hash Joins. | ||
|
|
@@ -303,6 +301,110 @@ const HASH_QUERIES: &[HashJoinQuery] = &[ | |
| build_size: "100K_(20%_dups)", | ||
| probe_size: "60M", | ||
| }, | ||
| // RightSemi Join benchmarks with Int32 keys | ||
| // | ||
| // Fanout (average build rows matched per probe row, as measured by running | ||
| // the equivalent INNER JOIN under `EXPLAIN ANALYZE` and reading the | ||
| // `HashJoinExec` metrics): 1 for Q16-Q18. Build keys here are primary | ||
| // keys (`n_nationkey`, `s_suppkey`), so each probe row matches at most | ||
| // one build row. `prob_hit` controls what fraction of probe rows find | ||
| // that one match. | ||
| // | ||
| // Fanout still matters because semi joins short-circuit after the first | ||
| // match. Coverage of fanout > 1 (build-side duplicates) is left for a | ||
| // follow-up. | ||
| // | ||
| // Q16: RightSemi, Small build (25 rows), 100% Hit rate | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's also doc the fanout here, it means if we change the join type to inner join, for each probe row, how many matches can be found on average. This can be automatically calculated from And later we should ensure those queries have covered different fanouts.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, my previous explanation may have been confusing. Let me try again. Suppose we have the query: SELECT *
FROM generate_series(100) AS t1(v1)
RIGHT SEMI JOIN generate_series(10) AS t2(v1)
ON (t1.v1 % 10) = t2.v1Here, each probe row from Although a semi join only returns whether a match exists, this ratio still matters for execution behavior, because we are evaluating short-circuit optimizations here. So I suggest we could doc this metric here. See the original reply for how to get this matching ratio metric automatically.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for your patience. I added more documentation around what fanout is and why it matters here |
||
| // Build Side: nation (25 rows) | Probe Side: customer (1.5M rows) | ||
| HashJoinQuery { | ||
| sql: r###"SELECT c.k | ||
| FROM (SELECT CAST(n_nationkey AS INT) as k FROM nation) n | ||
| RIGHT SEMI JOIN (SELECT CAST(c_nationkey AS INT) as k FROM customer) c | ||
| ON n.k = c.k"###, | ||
| density: 1.0, | ||
| prob_hit: 1.0, | ||
| build_size: "25", | ||
| probe_size: "1.5M_RightSemi", | ||
| }, | ||
| // Q17: RightSemi, Medium build (100K rows), 100% Hit rate | ||
| // Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows) | ||
| HashJoinQuery { | ||
| sql: r###"SELECT l.k | ||
| FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s | ||
| RIGHT SEMI JOIN (SELECT CAST(l_suppkey AS INT) as k FROM lineitem) l | ||
| ON s.k = l.k"###, | ||
| density: 1.0, | ||
| prob_hit: 1.0, | ||
| build_size: "100K", | ||
| probe_size: "60M_RightSemi", | ||
| }, | ||
| // Q18: RightSemi, Medium build (100K rows), 10% Hit rate | ||
| // Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows) | ||
| HashJoinQuery { | ||
| sql: r###"SELECT l.k | ||
| FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s | ||
| RIGHT SEMI JOIN ( | ||
| SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k | ||
| FROM lineitem | ||
| ) l | ||
| ON s.k = l.k"###, | ||
| density: 1.0, | ||
| prob_hit: 0.1, | ||
| build_size: "100K", | ||
| probe_size: "60M_RightSemi", | ||
| }, | ||
| // RightAnti Join benchmarks with Int32 keys | ||
| // | ||
| // Fanout (average build rows matched per probe row, as measured by running | ||
| // the equivalent INNER JOIN under `EXPLAIN ANALYZE` and reading the | ||
| // `HashJoinExec` metrics): 1 for Q19-Q21. Build keys here are primary | ||
| // keys (`n_nationkey`, `s_suppkey`), so each probe row matches at most | ||
| // one build row. `prob_hit` controls what fraction of probe rows find | ||
| // that one match (and are therefore filtered *out* by anti). | ||
| // | ||
| // Fanout still matters because anti joins short-circuit after the first | ||
| // match. Coverage of fanout > 1 (build-side duplicates) is left for a | ||
| // follow-up. | ||
| // | ||
| // Q19: RightAnti, Small build (25 rows), 100% Hit rate (no output) | ||
| // Build Side: nation (25 rows) | Probe Side: customer (1.5M rows) | ||
| HashJoinQuery { | ||
| sql: r###"SELECT c.k | ||
| FROM (SELECT CAST(n_nationkey AS INT) as k FROM nation) n | ||
| RIGHT ANTI JOIN (SELECT CAST(c_nationkey AS INT) as k FROM customer) c | ||
| ON n.k = c.k"###, | ||
| density: 1.0, | ||
| prob_hit: 1.0, | ||
| build_size: "25", | ||
| probe_size: "1.5M_RightAnti", | ||
| }, | ||
| // Q20: RightAnti, Medium build (100K rows), 100% Hit rate (no output) | ||
| // Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows) | ||
| HashJoinQuery { | ||
| sql: r###"SELECT l.k | ||
| FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s | ||
| RIGHT ANTI JOIN (SELECT CAST(l_suppkey AS INT) as k FROM lineitem) l | ||
| ON s.k = l.k"###, | ||
| density: 1.0, | ||
| prob_hit: 1.0, | ||
| build_size: "100K", | ||
| probe_size: "60M_RightAnti", | ||
| }, | ||
| // Q21: RightAnti, Medium build (100K rows), 10% Hit rate (90% output) | ||
| // Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows) | ||
| HashJoinQuery { | ||
| sql: r###"SELECT l.k | ||
| FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s | ||
| RIGHT ANTI JOIN ( | ||
| SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k | ||
| FROM lineitem | ||
| ) l | ||
| ON s.k = l.k"###, | ||
| density: 1.0, | ||
| prob_hit: 0.1, | ||
| build_size: "100K", | ||
| probe_size: "60M_RightAnti", | ||
| }, | ||
| ]; | ||
|
|
||
| impl RunOpt { | ||
|
|
@@ -323,7 +425,9 @@ impl RunOpt { | |
| None => 1..=HASH_QUERIES.len(), | ||
| }; | ||
|
|
||
| let config = self.common.config()?; | ||
| let mut config = self.common.config()?; | ||
| // Disable join reordering to ensure the optimizer doesn't swap join sides | ||
| config.options_mut().optimizer.join_reordering = false; | ||
| let rt = self.common.build_runtime()?; | ||
| let ctx = SessionContext::new_with_config_rt(config, rt); | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it would be great to add this coverage in a follow-up.