Skip to content

Commit c3be6e5

Browse files
authored
ORCA: skip MVCC system columns for standalone AO tables (#1409)
Fix ORCA fallback error when querying AO tables with multiple DISTINCT aggregates: "Invalid system target list found for AO table". AO tables don't support MVCC system columns (xmin, xmax, cmin, cmax). Skip these columns when building metadata for standalone AO tables, but preserve them for partitioned tables to maintain column mapping consistency.
1 parent 5cbac92 commit c3be6e5

7 files changed

Lines changed: 359 additions & 195 deletions

File tree

contrib/pax_storage/src/test/regress/expected/gporca_optimizer.out

Lines changed: 202 additions & 174 deletions
Large diffs are not rendered by default.

contrib/pax_storage/src/test/regress/expected/tsrf_optimizer.out

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,15 +88,16 @@ ANALYZE few;
8888
-- SRF with a provably-dummy relation
8989
explain (verbose, costs off)
9090
SELECT unnest(ARRAY[1, 2]) FROM few WHERE false;
91-
QUERY PLAN
92-
--------------------------------------
91+
QUERY PLAN
92+
-------------------------------------------------------------------------------
9393
ProjectSet
9494
Output: unnest('{1,2}'::integer[])
9595
-> Result
96-
Output: NULL::integer, NULL::tid, NULL::xid, NULL::cid, NULL::xid, NULL::cid, NULL::oid, NULL::integer, NULL::oid
96+
Output: NULL::integer, NULL::tid, NULL::oid, NULL::integer, NULL::oid
9797
One-Time Filter: false
98-
Optimizer: Pivotal Optimizer (GPORCA)
99-
(6 rows)
98+
Settings: optimizer = 'on'
99+
Optimizer: GPORCA
100+
(7 rows)
100101

101102
SELECT unnest(ARRAY[1, 2]) FROM few WHERE false;
102103
unnest
@@ -117,12 +118,12 @@ SELECT * FROM few f1,
117118
-> ProjectSet
118119
Output: unnest('{1,2}'::integer[])
119120
-> Result
120-
Output: NULL::integer, NULL::tid, NULL::xid, NULL::cid, NULL::xid, NULL::cid, NULL::oid, NULL::integer, NULL::oid
121+
Output: NULL::integer, NULL::tid, NULL::oid, NULL::integer, NULL::oid
121122
One-Time Filter: false
122123
-> Seq Scan on public.few f1
123124
Output: id, dataa, datab
124-
Settings: enable_parallel = 'off', optimizer = 'on'
125-
Optimizer: Pivotal Optimizer (GPORCA)
125+
Settings: optimizer = 'on'
126+
Optimizer: GPORCA
126127
(14 rows)
127128

128129
SELECT * FROM few f1,

src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -862,13 +862,31 @@ CTranslatorRelcacheToDXL::RetrieveRelDistributionOpFamilies(CMemoryPool *mp,
862862
void
863863
CTranslatorRelcacheToDXL::AddSystemColumns(CMemoryPool *mp,
864864
CMDColumnArray *mdcol_array,
865-
Relation /*rel*/)
865+
Relation rel)
866866
{
867+
// Get storage type to determine which system columns are supported
868+
IMDRelation::Erelstoragetype rel_storage_type = RetrieveRelStorageType(rel);
869+
BOOL is_standalone_ao_table = ((rel_storage_type == IMDRelation::ErelstorageAppendOnlyRows ||
870+
rel_storage_type == IMDRelation::ErelstorageAppendOnlyCols ||
871+
rel_storage_type == IMDRelation::ErelstoragePAX)) &&
872+
rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE &&
873+
!rel->rd_rel->relispartition;
874+
867875
for (INT i = SelfItemPointerAttributeNumber;
868876
i > FirstLowInvalidHeapAttributeNumber; i--)
869877
{
870878
AttrNumber attno = AttrNumber(i);
871879
GPOS_ASSERT(0 != attno);
880+
// AO tables don't support MVCC-related system columns (xmin, cmin, xmax, cmax)
881+
// Skip these columns for AO tables to avoid "Invalid system target list" errors
882+
if (is_standalone_ao_table &&
883+
(attno == MinTransactionIdAttributeNumber || // xmin (-2)
884+
attno == MinCommandIdAttributeNumber || // cmin (-3)
885+
attno == MaxTransactionIdAttributeNumber || // xmax (-4)
886+
attno == MaxCommandIdAttributeNumber)) // cmax (-5)
887+
{
888+
continue;
889+
}
872890

873891
const FormData_pg_attribute *att_tup = SystemAttributeDefinition(attno);
874892

src/test/regress/expected/gp_dqa.out

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3067,5 +3067,51 @@ select count(distinct a), count(distinct b) from dqa_f4 group by c;
30673067
1 | 1
30683068
(3 rows)
30693069

3070+
-- Test AO table dqa with multiple distinct aggs
3071+
create table dqa_f5(a int, b int, c int) using ao_column;
3072+
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
3073+
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
3074+
insert into dqa_f5 values(null, null, null);
3075+
insert into dqa_f5 values(1, 1, 1);
3076+
insert into dqa_f5 values(2, 2, 2);
3077+
explain (verbose on, costs off) select count(distinct a), count(distinct b) from dqa_f5 group by c;
3078+
QUERY PLAN
3079+
-----------------------------------------------------------------------------------------------------------
3080+
Finalize HashAggregate
3081+
Output: count(a), count(b), c
3082+
Group Key: dqa_f5.c
3083+
-> Gather Motion 3:1 (slice1; segments: 3)
3084+
Output: c, (PARTIAL count(a)), (PARTIAL count(b))
3085+
-> Partial HashAggregate
3086+
Output: c, PARTIAL count(a), PARTIAL count(b)
3087+
Group Key: dqa_f5.c
3088+
-> HashAggregate
3089+
Output: c, a, b, (AggExprId)
3090+
Group Key: (AggExprId), dqa_f5.a, dqa_f5.b, dqa_f5.c
3091+
-> Redistribute Motion 3:3 (slice2; segments: 3)
3092+
Output: c, a, b, (AggExprId)
3093+
Hash Key: c, a, b, (AggExprId)
3094+
-> Streaming HashAggregate
3095+
Output: c, a, b, (AggExprId)
3096+
Group Key: AggExprId, dqa_f5.a, dqa_f5.b, dqa_f5.c
3097+
-> TupleSplit
3098+
Output: c, a, b, AggExprId
3099+
Split by Col: (dqa_f5.a), (dqa_f5.b)
3100+
Group Key: dqa_f5.c
3101+
-> Seq Scan on public.dqa_f5
3102+
Output: c, a, b
3103+
Settings: enable_groupagg = 'off', enable_hashagg = 'on', gp_motion_cost_per_row = '2', optimizer = 'off'
3104+
Optimizer: Postgres query optimizer
3105+
(25 rows)
3106+
3107+
select count(distinct a), count(distinct b) from dqa_f5 group by c;
3108+
count | count
3109+
-------+-------
3110+
0 | 0
3111+
1 | 1
3112+
1 | 1
3113+
(3 rows)
3114+
30703115
reset optimizer_enable_multiple_distinct_aggs;
30713116
drop table dqa_f4;
3117+
drop table dqa_f5;

src/test/regress/expected/gp_dqa_optimizer.out

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3304,5 +3304,67 @@ select count(distinct a), count(distinct b) from dqa_f4 group by c;
33043304
0 | 0
33053305
(3 rows)
33063306

3307+
-- Test AO table dqa with multiple distinct aggs
3308+
create table dqa_f5(a int, b int, c int) using ao_column;
3309+
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
3310+
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
3311+
insert into dqa_f5 values(null, null, null);
3312+
insert into dqa_f5 values(1, 1, 1);
3313+
insert into dqa_f5 values(2, 2, 2);
3314+
explain (verbose on, costs off) select count(distinct a), count(distinct b) from dqa_f5 group by c;
3315+
QUERY PLAN
3316+
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
3317+
Gather Motion 3:1 (slice1; segments: 3)
3318+
Output: (count(DISTINCT share0_ref3.a)), (count(DISTINCT share0_ref2.b))
3319+
-> Sequence
3320+
Output: (count(DISTINCT share0_ref3.a)), (count(DISTINCT share0_ref2.b))
3321+
-> Shared Scan (share slice:id 1:0)
3322+
Output: share0_ref1.a, share0_ref1.b, share0_ref1.c, share0_ref1.ctid, share0_ref1.tableoid, share0_ref1.gp_segment_id, share0_ref1.gp_foreign_server
3323+
-> Seq Scan on public.dqa_f5
3324+
Output: dqa_f5.a, dqa_f5.b, dqa_f5.c, dqa_f5.ctid, dqa_f5.tableoid, dqa_f5.gp_segment_id, dqa_f5.gp_foreign_server
3325+
-> Hash Join
3326+
Output: (count(DISTINCT share0_ref3.a)), (count(DISTINCT share0_ref2.b))
3327+
Hash Cond: (NOT (share0_ref3.c IS DISTINCT FROM share0_ref2.c))
3328+
-> GroupAggregate
3329+
Output: count(DISTINCT share0_ref3.a), share0_ref3.c
3330+
Group Key: share0_ref3.c
3331+
-> Sort
3332+
Output: share0_ref3.a, share0_ref3.c
3333+
Sort Key: share0_ref3.c
3334+
-> Redistribute Motion 3:3 (slice2; segments: 3)
3335+
Output: share0_ref3.a, share0_ref3.c
3336+
Hash Key: share0_ref3.c
3337+
-> Result
3338+
Output: share0_ref3.a, share0_ref3.c
3339+
-> Shared Scan (share slice:id 2:0)
3340+
Output: share0_ref3.a, share0_ref3.b, share0_ref3.c
3341+
-> Hash
3342+
Output: (count(DISTINCT share0_ref2.b)), share0_ref2.c
3343+
-> GroupAggregate
3344+
Output: count(DISTINCT share0_ref2.b), share0_ref2.c
3345+
Group Key: share0_ref2.c
3346+
-> Sort
3347+
Output: share0_ref2.b, share0_ref2.c
3348+
Sort Key: share0_ref2.c
3349+
-> Redistribute Motion 3:3 (slice3; segments: 3)
3350+
Output: share0_ref2.b, share0_ref2.c
3351+
Hash Key: share0_ref2.c
3352+
-> Result
3353+
Output: share0_ref2.b, share0_ref2.c
3354+
-> Shared Scan (share slice:id 3:0)
3355+
Output: share0_ref2.a, share0_ref2.b, share0_ref2.c
3356+
Settings: enable_groupagg = 'off', enable_hashagg = 'on', gp_motion_cost_per_row = '2'
3357+
Optimizer: GPORCA
3358+
(41 rows)
3359+
3360+
select count(distinct a), count(distinct b) from dqa_f5 group by c;
3361+
count | count
3362+
-------+-------
3363+
1 | 1
3364+
0 | 0
3365+
1 | 1
3366+
(3 rows)
3367+
33073368
reset optimizer_enable_multiple_distinct_aggs;
33083369
drop table dqa_f4;
3370+
drop table dqa_f5;

src/test/regress/expected/rowhints_optimizer.out

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ ANALYZE my_table, your_table, our_table;
3333
EXPLAIN SELECT t1.a, t2.a FROM my_table AS t1, your_table AS t2, our_table AS t3;
3434
QUERY PLAN
3535
-----------------------------------------------------------------------------------------------------------
36-
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1356765552.28 rows=1000000 width=8)
37-
-> Nested Loop (cost=0.00..1356765522.47 rows=333334 width=8)
36+
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..1356765398.57 rows=1000000 width=8)
37+
-> Nested Loop (cost=0.00..1356765368.75 rows=333334 width=8)
3838
Join Filter: true
3939
-> Broadcast Motion 3:3 (slice3; segments: 3) (cost=0.00..431.01 rows=100 width=4)
40-
-> Seq Scan on your_table t2 (cost=0.00..431.00 rows=34 width=4)
41-
-> Nested Loop (cost=0.00..1324091.78 rows=3334 width=4)
42-
Join Filter: true
4340
-> Seq Scan on my_table t1 (cost=0.00..431.00 rows=34 width=4)
41+
-> Nested Loop (cost=0.00..1324091.63 rows=3334 width=4)
42+
Join Filter: true
43+
-> Seq Scan on your_table t2 (cost=0.00..431.00 rows=34 width=4)
4444
-> Materialize (cost=0.00..431.00 rows=100 width=1)
4545
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=100 width=1)
4646
-> Dynamic Seq Scan on our_table t3 (cost=0.00..431.00 rows=34 width=1)
@@ -78,10 +78,10 @@ not used hint:
7878
-> Nested Loop (cost=xxx..xxx rows=41 width=xxx)
7979
Join Filter: true
8080
-> Broadcast Motion 3:3 (slice3; segments: 3) (cost=xxx..xxx rows=100 width=xxx)
81-
-> Seq Scan on your_table t2 (cost=xxx..xxx rows=34 width=xxx)
81+
-> Seq Scan on my_table t1 (cost=xxx..xxx rows=34 width=xxx)
8282
-> Nested Loop (cost=xxx..xxx rows=3334 width=xxx)
8383
Join Filter: true
84-
-> Seq Scan on my_table t1 (cost=xxx..xxx rows=34 width=xxx)
84+
-> Seq Scan on your_table t2 (cost=xxx..xxx rows=34 width=xxx)
8585
-> Materialize (cost=xxx..xxx rows=100 width=xxx)
8686
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=xxx..xxx rows=100 width=xxx)
8787
-> Dynamic Seq Scan on our_table t3 (cost=xxx..xxx rows=34 width=xxx)
@@ -110,10 +110,10 @@ not used hint:
110110
-> Nested Loop (cost=xxx..xxx rows=333375 width=xxx)
111111
Join Filter: true
112112
-> Broadcast Motion 3:3 (slice3; segments: 3) (cost=xxx..xxx rows=100 width=xxx)
113-
-> Seq Scan on your_table t2 (cost=xxx..xxx rows=34 width=xxx)
113+
-> Seq Scan on my_table t1 (cost=xxx..xxx rows=34 width=xxx)
114114
-> Nested Loop (cost=xxx..xxx rows=3334 width=xxx)
115115
Join Filter: true
116-
-> Seq Scan on my_table t1 (cost=xxx..xxx rows=34 width=xxx)
116+
-> Seq Scan on your_table t2 (cost=xxx..xxx rows=34 width=xxx)
117117
-> Materialize (cost=xxx..xxx rows=100 width=xxx)
118118
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=xxx..xxx rows=100 width=xxx)
119119
-> Dynamic Seq Scan on our_table t3 (cost=xxx..xxx rows=34 width=xxx)
@@ -142,10 +142,10 @@ not used hint:
142142
-> Nested Loop (cost=xxx..xxx rows=333293 width=xxx)
143143
Join Filter: true
144144
-> Broadcast Motion 3:3 (slice3; segments: 3) (cost=xxx..xxx rows=100 width=xxx)
145-
-> Seq Scan on your_table t2 (cost=xxx..xxx rows=34 width=xxx)
145+
-> Seq Scan on my_table t1 (cost=xxx..xxx rows=34 width=xxx)
146146
-> Nested Loop (cost=xxx..xxx rows=3334 width=xxx)
147147
Join Filter: true
148-
-> Seq Scan on my_table t1 (cost=xxx..xxx rows=34 width=xxx)
148+
-> Seq Scan on your_table t2 (cost=xxx..xxx rows=34 width=xxx)
149149
-> Materialize (cost=xxx..xxx rows=100 width=xxx)
150150
-> Broadcast Motion 3:3 (slice2; segments: 3) (cost=xxx..xxx rows=100 width=xxx)
151151
-> Dynamic Seq Scan on our_table t3 (cost=xxx..xxx rows=34 width=xxx)

src/test/regress/sql/gp_dqa.sql

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,15 @@ select count(distinct a), count(distinct b) from dqa_f4 group by c;
604604
set optimizer_enable_multiple_distinct_aggs=on;
605605
explain (verbose on, costs off) select count(distinct a), count(distinct b) from dqa_f4 group by c;
606606
select count(distinct a), count(distinct b) from dqa_f4 group by c;
607-
reset optimizer_enable_multiple_distinct_aggs;
608607

608+
-- Test AO table dqa with multiple distinct aggs
609+
create table dqa_f5(a int, b int, c int) using ao_column;
610+
insert into dqa_f5 values(null, null, null);
611+
insert into dqa_f5 values(1, 1, 1);
612+
insert into dqa_f5 values(2, 2, 2);
613+
explain (verbose on, costs off) select count(distinct a), count(distinct b) from dqa_f5 group by c;
614+
select count(distinct a), count(distinct b) from dqa_f5 group by c;
615+
616+
reset optimizer_enable_multiple_distinct_aggs;
609617
drop table dqa_f4;
618+
drop table dqa_f5;

0 commit comments

Comments
 (0)