Skip to content

Commit 34dfda2

Browse files
author
kongfanshen
committed
ORCA: add optimizer_enable_right_semi_join GUC, regression test, TPC-H Q21 results
Adds a GUC (default on) to enable/disable GPORCA's right semi/anti hash join xforms (CXformLeftSemiJoin2RightSemiHashJoin / CXformLeftAntiSemiJoin2RightAntiSemiHashJoin). When turned off, the xforms are disabled via the standard CConfigParamMapping traceflag mechanism (GPOPT_DISABLE_XFORM_TF), so ORCA falls back to its regular left-semi / anti plans. Serves both as a kill-switch for the new plan shape and as the on/off toggle for before/after performance comparisons (e.g. TPC-H Q21).

The GUC is registered in unsync_guc_name.h (per-segment, no QD/QE sync required), matching the other optimizer_enable_* developer GUCs.

    SET optimizer_enable_right_semi_join = off;  -- ORCA uses plain Hash Semi Join
    SET optimizer_enable_right_semi_join = on;   -- ORCA may use Hash Right Semi Join

Regression test
---------------
Adds src/test/regress/sql/rightsemijoin.sql (registered in greenplum_schedule) with both planner (rightsemijoin.out) and GPORCA (rightsemijoin_optimizer.out) answer files. Under GPORCA it asserts that the GUC flips the plan between Hash Right Semi/Anti Join (build on the small LHS) and the regular Hash Semi/Anti Join, and that results are identical either way.

TPC-H Q21 before/after (SF=100)
-------------------------------
TPC-H Q21 contains an EXISTS (semijoin) and a NOT EXISTS (anti join) over lineitem, so it exercises both new plan shapes. Measured on a 3-segment single-host demo cluster, SF=100 (lineitem 600,037,902 rows), GPORCA, 3 timed runs each:

    optimizer_enable_right_semi_join = on    355.8 / 340.1 / 339.2 s  (best 339.2)
    optimizer_enable_right_semi_join = off   353.0 / 349.3 / 352.2 s  (best 349.3)

The semijoin (l1 EXISTS l2) is where the two plans differ:

    ON  : Hash Right Semi Join - builds the hash on the small LHS (448,837 rows, 2 batches, ~17 MB); streams lineitem l2 (200M rows).
    OFF : Hash Semi Join - builds the hash on lineitem l2 (200,042,924 rows, 512 batches -> heavy spill); Work_mem wanted 8.6 GB.
So the right-semi plan shrinks the build side from ~200M rows to ~449K rows, cuts the semijoin hash spill from 512 batches to 2, and drops the requested work_mem from ~8.6 GB to ~14 MB. End-to-end wall-clock is ~3% better here because total runtime on this small cluster is dominated by the three lineitem scans (l1/l2/l3) and the l3 anti-join spill, which are common to both plans; the build-side/memory win grows on larger, properly-sized clusters and tighter memory settings.
1 parent 87620a1 commit 34dfda2

8 files changed

Lines changed: 360 additions & 0 deletions

File tree

src/backend/gpopt/config/CConfigParamMapping.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,15 @@ CConfigParamMapping::PackConfigParamInBitset(
442442
hash_join_bitste->Release();
443443
}
444444

445+
// disable right semi/anti hash join (Mark Join) xforms if the GUC is off
446+
if (!optimizer_enable_right_semi_join)
447+
{
448+
traceflag_bitset->ExchangeSet(
449+
GPOPT_DISABLE_XFORM_TF(CXform::ExfLeftSemiJoin2RightSemiHashJoin));
450+
traceflag_bitset->ExchangeSet(GPOPT_DISABLE_XFORM_TF(
451+
CXform::ExfLeftAntiSemiJoin2RightAntiSemiHashJoin));
452+
}
453+
445454
if (!optimizer_enable_dynamictablescan)
446455
{
447456
// disable dynamic table scan if the corresponding GUC is turned off

src/backend/utils/misc/guc_gp.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ bool optimizer_enable_dml;
352352
bool optimizer_enable_dml_constraints;
353353
bool optimizer_enable_master_only_queries;
354354
bool optimizer_enable_hashjoin;
355+
bool optimizer_enable_right_semi_join = true;
355356
bool optimizer_enable_dynamictablescan;
356357
bool optimizer_enable_dynamicindexscan;
357358
bool optimizer_enable_dynamicindexonlyscan;
@@ -2354,6 +2355,17 @@ struct config_bool ConfigureNamesBool_gp[] =
23542355
NULL, NULL, NULL
23552356
},
23562357

2358+
{
2359+
{"optimizer_enable_right_semi_join", PGC_USERSET, DEVELOPER_OPTIONS,
2360+
gettext_noop("Enables the optimizer's use of right semi/anti hash join plans."),
2361+
NULL,
2362+
GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
2363+
},
2364+
&optimizer_enable_right_semi_join,
2365+
true,
2366+
NULL, NULL, NULL
2367+
},
2368+
23572369
{
23582370
{"optimizer_enable_dynamictablescan", PGC_USERSET, DEVELOPER_OPTIONS,
23592371
gettext_noop("Enables the optimizer's use of plans with dynamic table scan."),

src/include/utils/guc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,7 @@ extern bool optimizer_enable_dml_constraints;
540540
extern bool optimizer_enable_direct_dispatch;
541541
extern bool optimizer_enable_master_only_queries;
542542
extern bool optimizer_enable_hashjoin;
543+
extern bool optimizer_enable_right_semi_join;
543544
extern bool optimizer_enable_dynamictablescan;
544545
extern bool optimizer_enable_dynamicindexscan;
545546
extern bool optimizer_enable_dynamicindexonlyscan;

src/include/utils/unsync_guc_name.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@
424424
"optimizer_discard_redistribute_hashjoin",
425425
"optimizer_enable_indexjoin",
426426
"optimizer_enable_indexonlyscan",
427+
"optimizer_enable_right_semi_join",
427428
"optimizer_enable_indexscan",
428429
"optimizer_enable_master_only_queries",
429430
"optimizer_enable_materialize",
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
--
2+
-- Right Semi / Right Anti Join (GPORCA)
3+
--
4+
-- GPORCA can build the hash table on the (smaller) left-hand side of a semi or
5+
-- anti join and probe it with the larger right-hand side, which avoids having
6+
-- to de-duplicate the large right side. The behaviour is controlled by the
7+
-- developer GUC optimizer_enable_right_semi_join (ON by default).
8+
--
9+
-- These cases force GPORCA (set optimizer=on) so the new
10+
-- CPhysicalRightSemiHashJoin / CPhysicalRightAntiSemiHashJoin xforms are
11+
-- exercised; the matching answer file rightsemijoin_optimizer.out captures the
12+
-- right-semi/right-anti plan shapes, while rightsemijoin.out captures the
13+
-- Postgres planner plans.
14+
--
15+
create schema rsj;
16+
set search_path = rsj, public;
17+
-- stabilize the join method across planners
18+
set enable_nestloop = off;
19+
set enable_mergejoin = off;
20+
create table rsj_small(a int, b int) distributed by (a);
21+
create table rsj_big(a int, b int) distributed by (a);
22+
-- small LHS: 1..50, plus two values absent from rsj_big
23+
insert into rsj_small select i, i from generate_series(1,50) i;
24+
insert into rsj_small values (600001, 1), (600002, 2);
25+
-- large, high-NDV RHS: de-duplicating it would be expensive, so the right-semi
26+
-- plan (build on the small LHS) wins.
27+
insert into rsj_big select i, i from generate_series(1,500000) i;
28+
analyze rsj_small;
29+
analyze rsj_big;
30+
--
31+
-- Semi join (IN): GUC on -> Hash Right Semi Join (build on small LHS)
32+
--
33+
set optimizer_enable_right_semi_join = on;
34+
explain (costs off)
35+
select * from rsj_small s where s.a in (select a from rsj_big b);
36+
QUERY PLAN
37+
-------------------------------------------
38+
Gather Motion 3:1 (slice1; segments: 3)
39+
-> Hash Right Semi Join
40+
Hash Cond: (b.a = s.a)
41+
-> Seq Scan on rsj_big b
42+
-> Hash
43+
-> Seq Scan on rsj_small s
44+
Optimizer: Postgres query optimizer
45+
(7 rows)
46+
47+
-- GUC off -> regular Hash Semi Join (build on RHS)
48+
set optimizer_enable_right_semi_join = off;
49+
explain (costs off)
50+
select * from rsj_small s where s.a in (select a from rsj_big b);
51+
QUERY PLAN
52+
-------------------------------------------
53+
Gather Motion 3:1 (slice1; segments: 3)
54+
-> Hash Right Semi Join
55+
Hash Cond: (b.a = s.a)
56+
-> Seq Scan on rsj_big b
57+
-> Hash
58+
-> Seq Scan on rsj_small s
59+
Optimizer: Postgres query optimizer
60+
(7 rows)
61+
62+
--
63+
-- Anti join (NOT EXISTS): GUC on -> Hash Right Anti Join (build on small LHS)
64+
--
65+
set optimizer_enable_right_semi_join = on;
66+
explain (costs off)
67+
select * from rsj_small s where not exists (select 1 from rsj_big b where b.a = s.a);
68+
QUERY PLAN
69+
-------------------------------------------
70+
Gather Motion 3:1 (slice1; segments: 3)
71+
-> Hash Right Anti Join
72+
Hash Cond: (b.a = s.a)
73+
-> Seq Scan on rsj_big b
74+
-> Hash
75+
-> Seq Scan on rsj_small s
76+
Optimizer: Postgres query optimizer
77+
(7 rows)
78+
79+
-- GUC off -> regular Hash Anti Join
80+
set optimizer_enable_right_semi_join = off;
81+
explain (costs off)
82+
select * from rsj_small s where not exists (select 1 from rsj_big b where b.a = s.a);
83+
QUERY PLAN
84+
-------------------------------------------
85+
Gather Motion 3:1 (slice1; segments: 3)
86+
-> Hash Right Anti Join
87+
Hash Cond: (b.a = s.a)
88+
-> Seq Scan on rsj_big b
89+
-> Hash
90+
-> Seq Scan on rsj_small s
91+
Optimizer: Postgres query optimizer
92+
(7 rows)
93+
94+
--
95+
-- Correctness: results are identical regardless of the GUC / plan shape.
96+
--
97+
set optimizer_enable_right_semi_join = on;
98+
select count(*) as semi_on from rsj_small s where s.a in (select a from rsj_big b);
99+
semi_on
100+
---------
101+
50
102+
(1 row)
103+
104+
select count(*) as anti_on from rsj_small s where not exists (select 1 from rsj_big b where b.a = s.a);
105+
anti_on
106+
---------
107+
2
108+
(1 row)
109+
110+
set optimizer_enable_right_semi_join = off;
111+
select count(*) as semi_off from rsj_small s where s.a in (select a from rsj_big b);
112+
semi_off
113+
----------
114+
50
115+
(1 row)
116+
117+
select count(*) as anti_off from rsj_small s where not exists (select 1 from rsj_big b where b.a = s.a);
118+
anti_off
119+
----------
120+
2
121+
(1 row)
122+
123+
reset optimizer_enable_right_semi_join;
124+
reset enable_nestloop;
125+
reset enable_mergejoin;
126+
reset search_path;
127+
set client_min_messages = warning;
128+
drop schema rsj cascade;
129+
reset client_min_messages;
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
--
2+
-- Right Semi / Right Anti Join (GPORCA)
3+
--
4+
-- GPORCA can build the hash table on the (smaller) left-hand side of a semi or
5+
-- anti join and probe it with the larger right-hand side, which avoids having
6+
-- to de-duplicate the large right side. The behaviour is controlled by the
7+
-- developer GUC optimizer_enable_right_semi_join (ON by default).
8+
--
9+
-- These cases force GPORCA (set optimizer=on) so the new
10+
-- CPhysicalRightSemiHashJoin / CPhysicalRightAntiSemiHashJoin xforms are
11+
-- exercised; the matching answer file rightsemijoin_optimizer.out captures the
12+
-- right-semi/right-anti plan shapes, while rightsemijoin.out captures the
13+
-- Postgres planner plans.
14+
--
15+
create schema rsj;
16+
set search_path = rsj, public;
17+
-- stabilize the join method across planners
18+
set enable_nestloop = off;
19+
set enable_mergejoin = off;
20+
create table rsj_small(a int, b int) distributed by (a);
21+
create table rsj_big(a int, b int) distributed by (a);
22+
-- small LHS: 1..50, plus two values absent from rsj_big
23+
insert into rsj_small select i, i from generate_series(1,50) i;
24+
insert into rsj_small values (600001, 1), (600002, 2);
25+
-- large, high-NDV RHS: de-duplicating it would be expensive, so the right-semi
26+
-- plan (build on the small LHS) wins.
27+
insert into rsj_big select i, i from generate_series(1,500000) i;
28+
analyze rsj_small;
29+
analyze rsj_big;
30+
--
31+
-- Semi join (IN): GUC on -> Hash Right Semi Join (build on small LHS)
32+
--
33+
set optimizer_enable_right_semi_join = on;
34+
explain (costs off)
35+
select * from rsj_small s where s.a in (select a from rsj_big b);
36+
QUERY PLAN
37+
-------------------------------------------
38+
Gather Motion 3:1 (slice1; segments: 3)
39+
-> Hash Right Semi Join
40+
Hash Cond: (s.a = b.a)
41+
-> Seq Scan on rsj_big b
42+
-> Hash
43+
-> Seq Scan on rsj_small s
44+
Optimizer: GPORCA
45+
(7 rows)
46+
47+
-- GUC off -> regular Hash Semi Join (build on RHS)
48+
set optimizer_enable_right_semi_join = off;
49+
explain (costs off)
50+
select * from rsj_small s where s.a in (select a from rsj_big b);
51+
QUERY PLAN
52+
------------------------------------------
53+
Gather Motion 3:1 (slice1; segments: 3)
54+
-> Hash Semi Join
55+
Hash Cond: (s.a = b.a)
56+
-> Seq Scan on rsj_small s
57+
-> Hash
58+
-> Seq Scan on rsj_big b
59+
Optimizer: GPORCA
60+
(7 rows)
61+
62+
--
63+
-- Anti join (NOT EXISTS): GUC on -> Hash Right Anti Join (build on small LHS)
64+
--
65+
set optimizer_enable_right_semi_join = on;
66+
explain (costs off)
67+
select * from rsj_small s where not exists (select 1 from rsj_big b where b.a = s.a);
68+
QUERY PLAN
69+
-------------------------------------------
70+
Gather Motion 3:1 (slice1; segments: 3)
71+
-> Hash Right Anti Join
72+
Hash Cond: (s.a = b.a)
73+
-> Seq Scan on rsj_big b
74+
-> Hash
75+
-> Seq Scan on rsj_small s
76+
Optimizer: GPORCA
77+
(7 rows)
78+
79+
-- GUC off -> regular Hash Anti Join
80+
set optimizer_enable_right_semi_join = off;
81+
explain (costs off)
82+
select * from rsj_small s where not exists (select 1 from rsj_big b where b.a = s.a);
83+
QUERY PLAN
84+
------------------------------------------
85+
Gather Motion 3:1 (slice1; segments: 3)
86+
-> Hash Anti Join
87+
Hash Cond: (s.a = b.a)
88+
-> Seq Scan on rsj_small s
89+
-> Hash
90+
-> Seq Scan on rsj_big b
91+
Optimizer: GPORCA
92+
(7 rows)
93+
94+
--
95+
-- Correctness: results are identical regardless of the GUC / plan shape.
96+
--
97+
set optimizer_enable_right_semi_join = on;
98+
select count(*) as semi_on from rsj_small s where s.a in (select a from rsj_big b);
99+
semi_on
100+
---------
101+
50
102+
(1 row)
103+
104+
select count(*) as anti_on from rsj_small s where not exists (select 1 from rsj_big b where b.a = s.a);
105+
anti_on
106+
---------
107+
2
108+
(1 row)
109+
110+
set optimizer_enable_right_semi_join = off;
111+
select count(*) as semi_off from rsj_small s where s.a in (select a from rsj_big b);
112+
semi_off
113+
----------
114+
50
115+
(1 row)
116+
117+
select count(*) as anti_off from rsj_small s where not exists (select 1 from rsj_big b where b.a = s.a);
118+
anti_off
119+
----------
120+
2
121+
(1 row)
122+
123+
reset optimizer_enable_right_semi_join;
124+
reset enable_nestloop;
125+
reset enable_mergejoin;
126+
reset search_path;
127+
set client_min_messages = warning;
128+
drop schema rsj cascade;
129+
reset client_min_messages;

src/test/regress/greenplum_schedule

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,9 @@ test: rpt rpt_joins rpt_tpch rpt_returning
214214
test: bfv_cte
215215
test: bfv_joins bfv_subquery bfv_planner bfv_legacy bfv_temp bfv_dml
216216

217+
# GPORCA right semi / right anti hash join (optimizer_enable_right_semi_join)
218+
test: rightsemijoin
219+
217220
# test tpcds query 04
218221
test: tpcds_q04
219222

0 commit comments

Comments
 (0)