Skip to content

Commit 4f45193

Browse files
authored
chore: Add existence (semi / anti ) benchmarks for hashjoinexec (#21821)
# Add Existence Join Benchmarks ### What changes are included in this PR? #### 1. End-to-end benchmarks (`benchmarks/src/hj.rs`) Adds Q16-Q21 for RightSemi and RightAnti joins, following reviewer feedback to focus on core axes: | Query | Join Type | Build Size | Probe Size | Hit Rate | |-------|-----------|------------|------------|----------| | Q16 | RightSemi | 25 (nation) | 1.5M (customer) | 100% | | Q17 | RightSemi | 100K (supplier) | 60M (lineitem) | 100% | | Q18 | RightSemi | 100K (supplier) | 60M (lineitem) | 10% | | Q19 | RightAnti | 25 (nation) | 1.5M (customer) | 100% | | Q20 | RightAnti | 100K (supplier) | 60M (lineitem) | 100% | | Q21 | RightAnti | 100K (supplier) | 60M (lineitem) | 10% | #### 2. Criterion micro-benchmark (`datafusion/physical-plan/benches/hash_join_semi_anti.rs`) Density variations : | Benchmark | Join Type | Density | Hit Rate | |-----------|-----------|---------|----------| | right_semi_d100_h100 | RightSemi | 100% | 100% | | right_semi_d100_h10 | RightSemi | 100% | 10% | | right_semi_d50_h100 | RightSemi | 50% | 100% | | right_semi_d50_h10 | RightSemi | 50% | 10% | | right_semi_d10_h100 | RightSemi | 10% | 100% | | right_semi_d10_h10 | RightSemi | 10% | 10% | | right_anti_d100_h100 | RightAnti | 100% | 100% | | right_anti_d100_h10 | RightAnti | 100% | 10% | | right_anti_d50_h100 | RightAnti | 50% | 100% | | right_anti_d50_h10 | RightAnti | 50% | 10% | | right_anti_d10_h100 | RightAnti | 10% | 100% | | right_anti_d10_h10 | RightAnti | 10% | 10% |
1 parent ba240b2 commit 4f45193

3 files changed

Lines changed: 479 additions & 3 deletions

File tree

benchmarks/src/hj.rs

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@ use std::path::PathBuf;
2525

2626
use futures::StreamExt;
2727

28-
// TODO: Add existence joins
29-
3028
/// Run the Hash Join benchmark
3129
///
3230
/// This micro-benchmark focuses on the performance characteristics of Hash Joins.
@@ -303,6 +301,110 @@ const HASH_QUERIES: &[HashJoinQuery] = &[
303301
build_size: "100K_(20%_dups)",
304302
probe_size: "60M",
305303
},
304+
// RightSemi Join benchmarks with Int32 keys
305+
//
306+
// Fanout (average build rows matched per probe row, as measured by running
307+
// the equivalent INNER JOIN under `EXPLAIN ANALYZE` and reading the
308+
// `HashJoinExec` metrics): 1 for Q16-Q18. Build keys here are primary
309+
// keys (`n_nationkey`, `s_suppkey`), so each probe row matches at most
310+
// one build row. `prob_hit` controls what fraction of probe rows find
311+
// that one match.
312+
//
313+
// Fanout still matters because semi joins short-circuit after the first
314+
// match. Coverage of fanout > 1 (build-side duplicates) is left for a
315+
// follow-up.
316+
//
317+
// Q16: RightSemi, Small build (25 rows), 100% Hit rate
318+
// Build Side: nation (25 rows) | Probe Side: customer (1.5M rows)
319+
HashJoinQuery {
320+
sql: r###"SELECT c.k
321+
FROM (SELECT CAST(n_nationkey AS INT) as k FROM nation) n
322+
RIGHT SEMI JOIN (SELECT CAST(c_nationkey AS INT) as k FROM customer) c
323+
ON n.k = c.k"###,
324+
density: 1.0,
325+
prob_hit: 1.0,
326+
build_size: "25",
327+
probe_size: "1.5M_RightSemi",
328+
},
329+
// Q17: RightSemi, Medium build (100K rows), 100% Hit rate
330+
// Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows)
331+
HashJoinQuery {
332+
sql: r###"SELECT l.k
333+
FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s
334+
RIGHT SEMI JOIN (SELECT CAST(l_suppkey AS INT) as k FROM lineitem) l
335+
ON s.k = l.k"###,
336+
density: 1.0,
337+
prob_hit: 1.0,
338+
build_size: "100K",
339+
probe_size: "60M_RightSemi",
340+
},
341+
// Q18: RightSemi, Medium build (100K rows), 10% Hit rate
342+
// Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows)
343+
HashJoinQuery {
344+
sql: r###"SELECT l.k
345+
FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s
346+
RIGHT SEMI JOIN (
347+
SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k
348+
FROM lineitem
349+
) l
350+
ON s.k = l.k"###,
351+
density: 1.0,
352+
prob_hit: 0.1,
353+
build_size: "100K",
354+
probe_size: "60M_RightSemi",
355+
},
356+
// RightAnti Join benchmarks with Int32 keys
357+
//
358+
// Fanout (average build rows matched per probe row, as measured by running
359+
// the equivalent INNER JOIN under `EXPLAIN ANALYZE` and reading the
360+
// `HashJoinExec` metrics): 1 for Q19-Q21. Build keys here are primary
361+
// keys (`n_nationkey`, `s_suppkey`), so each probe row matches at most
362+
// one build row. `prob_hit` controls what fraction of probe rows find
363+
// that one match (and are therefore filtered *out* by anti).
364+
//
365+
// Fanout still matters because anti joins short-circuit after the first
366+
// match. Coverage of fanout > 1 (build-side duplicates) is left for a
367+
// follow-up.
368+
//
369+
// Q19: RightAnti, Small build (25 rows), 100% Hit rate (no output)
370+
// Build Side: nation (25 rows) | Probe Side: customer (1.5M rows)
371+
HashJoinQuery {
372+
sql: r###"SELECT c.k
373+
FROM (SELECT CAST(n_nationkey AS INT) as k FROM nation) n
374+
RIGHT ANTI JOIN (SELECT CAST(c_nationkey AS INT) as k FROM customer) c
375+
ON n.k = c.k"###,
376+
density: 1.0,
377+
prob_hit: 1.0,
378+
build_size: "25",
379+
probe_size: "1.5M_RightAnti",
380+
},
381+
// Q20: RightAnti, Medium build (100K rows), 100% Hit rate (no output)
382+
// Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows)
383+
HashJoinQuery {
384+
sql: r###"SELECT l.k
385+
FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s
386+
RIGHT ANTI JOIN (SELECT CAST(l_suppkey AS INT) as k FROM lineitem) l
387+
ON s.k = l.k"###,
388+
density: 1.0,
389+
prob_hit: 1.0,
390+
build_size: "100K",
391+
probe_size: "60M_RightAnti",
392+
},
393+
// Q21: RightAnti, Medium build (100K rows), 10% Hit rate (90% output)
394+
// Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows)
395+
HashJoinQuery {
396+
sql: r###"SELECT l.k
397+
FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s
398+
RIGHT ANTI JOIN (
399+
SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k
400+
FROM lineitem
401+
) l
402+
ON s.k = l.k"###,
403+
density: 1.0,
404+
prob_hit: 0.1,
405+
build_size: "100K",
406+
probe_size: "60M_RightAnti",
407+
},
306408
];
307409

308410
impl RunOpt {
@@ -323,7 +425,9 @@ impl RunOpt {
323425
None => 1..=HASH_QUERIES.len(),
324426
};
325427

326-
let config = self.common.config()?;
428+
let mut config = self.common.config()?;
429+
// Disable join reordering to ensure the optimizer doesn't swap join sides
430+
config.options_mut().optimizer.join_reordering = false;
327431
let rt = self.common.build_runtime()?;
328432
let ctx = SessionContext::new_with_config_rt(config, rt);
329433

datafusion/physical-plan/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@ harness = false
113113
name = "aggregate_vectorized"
114114
required-features = ["test_utils"]
115115

116+
[[bench]]
117+
harness = false
118+
name = "hash_join_semi_anti"
119+
required-features = ["test_utils"]
120+
116121
[[bench]]
117122
harness = false
118123
name = "dictionary_group_values"

0 commit comments

Comments
 (0)