Skip to content

Commit 56cb1fc

Browse files
committed
ORCA: order-preserving LINT mapping for string statistics
Replace the hash-based LINT mapping used by ORCA's varchar/text/bpchar statistics with an order-preserving 7-byte locale sort-key prefix, so that single-column MCV and histogram estimates respect the column's collation order instead of collapsing to hash values.
1 parent 9052b7a commit 56cb1fc

12 files changed

Lines changed: 336 additions & 172 deletions

File tree

contrib/pax_storage/src/test/regress/expected/stats_ext_optimizer.out

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1219,19 +1219,19 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
12191219
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
12201220
estimated | actual
12211221
-----------+--------
1222-
1815 | 2400
1222+
2551 | 2400
12231223
(1 row)
12241224

12251225
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
12261226
estimated | actual
12271227
-----------+--------
1228-
4801 | 1250
1228+
1301 | 1250
12291229
(1 row)
12301230

12311231
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
12321232
estimated | actual
12331233
-----------+--------
1234-
1133 | 2550
1234+
2651 | 2550
12351235
(1 row)
12361236

12371237
-- ALL (should not benefit from functional dependencies)
@@ -1385,19 +1385,19 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
13851385
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
13861386
estimated | actual
13871387
-----------+--------
1388-
1815 | 2400
1388+
2551 | 2400
13891389
(1 row)
13901390

13911391
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
13921392
estimated | actual
13931393
-----------+--------
1394-
4801 | 1250
1394+
1301 | 1250
13951395
(1 row)
13961396

13971397
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
13981398
estimated | actual
13991399
-----------+--------
1400-
1133 | 2550
1400+
2651 | 2550
14011401
(1 row)
14021402

14031403
-- ALL (should not benefit from functional dependencies)
@@ -1944,25 +1944,25 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = a AND ''1'
19441944
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 1 AND b < ''1''');
19451945
estimated | actual
19461946
-----------+--------
1947-
36 | 50
1947+
2 | 50
19481948
(1 row)
19491949

19501950
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > a AND ''1'' > b');
19511951
estimated | actual
19521952
-----------+--------
1953-
36 | 50
1953+
2 | 50
19541954
(1 row)
19551955

19561956
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 0 AND b <= ''0''');
19571957
estimated | actual
19581958
-----------+--------
1959-
36 | 50
1959+
2 | 50
19601960
(1 row)
19611961

19621962
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 0 >= a AND ''0'' >= b');
19631963
estimated | actual
19641964
-----------+--------
1965-
36 | 50
1965+
2 | 50
19661966
(1 row)
19671967

19681968
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
@@ -1974,25 +1974,25 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
19741974
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 5 AND b < ''1'' AND c < 5');
19751975
estimated | actual
19761976
-----------+--------
1977-
85 | 50
1977+
5 | 50
19781978
(1 row)
19791979

19801980
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 5 AND ''1'' > b AND 5 > c');
19811981
estimated | actual
19821982
-----------+--------
1983-
85 | 50
1983+
5 | 50
19841984
(1 row)
19851985

19861986
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 4 AND b <= ''0'' AND c <= 4');
19871987
estimated | actual
19881988
-----------+--------
1989-
85 | 50
1989+
5 | 50
19901990
(1 row)
19911991

19921992
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 4 >= a AND ''0'' >= b AND 4 >= c');
19931993
estimated | actual
19941994
-----------+--------
1995-
85 | 50
1995+
5 | 50
19961996
(1 row)
19971997

19981998
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1');
@@ -2085,25 +2085,25 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = a AND ''1'
20852085
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 1 AND b < ''1''');
20862086
estimated | actual
20872087
-----------+--------
2088-
36 | 50
2088+
2 | 50
20892089
(1 row)
20902090

20912091
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > a AND ''1'' > b');
20922092
estimated | actual
20932093
-----------+--------
2094-
36 | 50
2094+
2 | 50
20952095
(1 row)
20962096

20972097
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 0 AND b <= ''0''');
20982098
estimated | actual
20992099
-----------+--------
2100-
36 | 50
2100+
2 | 50
21012101
(1 row)
21022102

21032103
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 0 >= a AND ''0'' >= b');
21042104
estimated | actual
21052105
-----------+--------
2106-
36 | 50
2106+
2 | 50
21072107
(1 row)
21082108

21092109
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
@@ -2115,25 +2115,25 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
21152115
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 5 AND b < ''1'' AND c < 5');
21162116
estimated | actual
21172117
-----------+--------
2118-
85 | 50
2118+
5 | 50
21192119
(1 row)
21202120

21212121
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 5 AND ''1'' > b AND 5 > c');
21222122
estimated | actual
21232123
-----------+--------
2124-
85 | 50
2124+
5 | 50
21252125
(1 row)
21262126

21272127
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 4 AND b <= ''0'' AND c <= 4');
21282128
estimated | actual
21292129
-----------+--------
2130-
85 | 50
2130+
5 | 50
21312131
(1 row)
21322132

21332133
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 4 >= a AND ''0'' >= b AND 4 >= c');
21342134
estimated | actual
21352135
-----------+--------
2136-
85 | 50
2136+
5 | 50
21372137
(1 row)
21382138

21392139
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1');

src/backend/gpopt/gpdbwrappers.cpp

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ extern "C" {
4343
#include "catalog/pg_inherits.h"
4444
#include "cdb/cdbvars.h"
4545
#include "foreign/fdwapi.h"
46+
#include "mb/pg_wchar.h"
4647
#include "nodes/nodeFuncs.h"
4748
#include "optimizer/clauses.h"
4849
#include "optimizer/optimizer.h"
@@ -56,10 +57,13 @@ extern "C" {
5657
#include "utils/lsyscache.h"
5758
#include "utils/memutils.h"
5859
#include "utils/partcache.h"
60+
#include "utils/pg_locale.h"
5961

6062
extern bool enable_parallel;
6163
extern int max_parallel_workers_per_gather;
64+
6265
}
66+
6367
#define GP_WRAP_START \
6468
sigjmp_buf local_sigjmp_buf; \
6569
{ \
@@ -2450,6 +2454,133 @@ gpdb::MakeGpPolicy(GpPolicyType ptype, int nattrs, int numsegments)
24502454
GP_WRAP_END;
24512455
}
24522456

2457+
size_t
2458+
gpdb::ComputeLocaleSortKey(char *dest, size_t destsize, const char *src,
2459+
size_t srclen, Oid collation)
2460+
{
2461+
GP_WRAP_START;
2462+
{
2463+
if (destsize == 0)
2464+
{
2465+
return 0;
2466+
}
2467+
2468+
// C/POSIX collation: byte order already matches sort order, just copy.
2469+
// Treat InvalidOid the same way (no collation info available).
2470+
if (!OidIsValid(collation) || lc_collate_is_c(collation))
2471+
{
2472+
size_t n = (srclen < destsize) ? srclen : destsize;
2473+
memcpy(dest, src, n);
2474+
return n;
2475+
}
2476+
2477+
pg_locale_t locale = pg_newlocale_from_collation(collation);
2478+
2479+
// Non-deterministic collations (case/accent-insensitive ICU) don't
2480+
// produce a totally-ordered sort key for our purposes; bail.
2481+
if (locale != NULL && !locale->deterministic)
2482+
{
2483+
return 0;
2484+
}
2485+
2486+
#ifdef USE_ICU
2487+
if (locale != NULL && locale->provider == COLLPROVIDER_ICU)
2488+
{
2489+
// Best path for UTF-8 databases: ICU iterator gives us exactly
2490+
// the prefix we want without computing the full sort key.
2491+
if (GetDatabaseEncoding() == PG_UTF8)
2492+
{
2493+
UCharIterator iter;
2494+
uint32_t state[2] = {0, 0};
2495+
UErrorCode status = U_ZERO_ERROR;
2496+
uiter_setUTF8(&iter, src, (int32_t) srclen);
2497+
int32_t bsize = ucol_nextSortKeyPart(
2498+
locale->info.icu.ucol, &iter, state,
2499+
(uint8_t *) dest, (int32_t) destsize, &status);
2500+
if (U_FAILURE(status))
2501+
{
2502+
return 0;
2503+
}
2504+
return (size_t) bsize;
2505+
}
2506+
// Non-UTF8 ICU: skip the conversion dance; caller falls back.
2507+
return 0;
2508+
}
2509+
#endif
2510+
2511+
// libc path: strxfrm[_l] needs a NUL-terminated input and an output
2512+
// buffer big enough to hold the full transform (the C standard
2513+
// leaves dest indeterminate if the result didn't fit). Allocate a
2514+
// big-enough work buffer, then copy the prefix we actually need.
2515+
char nullterm_stack[256];
2516+
char *nullterm = nullterm_stack;
2517+
if (srclen + 1 > sizeof(nullterm_stack))
2518+
{
2519+
nullterm = (char *) palloc(srclen + 1);
2520+
}
2521+
memcpy(nullterm, src, srclen);
2522+
nullterm[srclen] = '\0';
2523+
2524+
char xfrm_stack[1024];
2525+
char *xfrm = xfrm_stack;
2526+
size_t xfrm_size = sizeof(xfrm_stack);
2527+
size_t bsize;
2528+
for (;;)
2529+
{
2530+
#ifdef HAVE_LOCALE_T
2531+
if (locale != NULL && locale->provider == COLLPROVIDER_LIBC)
2532+
{
2533+
bsize =
2534+
strxfrm_l(xfrm, nullterm, xfrm_size, locale->info.lt);
2535+
}
2536+
else
2537+
#endif
2538+
{
2539+
bsize = strxfrm(xfrm, nullterm, xfrm_size);
2540+
}
2541+
if (bsize < xfrm_size)
2542+
{
2543+
break;
2544+
}
2545+
// grow and retry; cap to avoid runaway allocations.
2546+
if (xfrm_size >= (size_t) 64 * 1024)
2547+
{
2548+
if (xfrm != xfrm_stack)
2549+
{
2550+
pfree(xfrm);
2551+
}
2552+
if (nullterm != nullterm_stack)
2553+
{
2554+
pfree(nullterm);
2555+
}
2556+
return 0;
2557+
}
2558+
if (xfrm != xfrm_stack)
2559+
{
2560+
pfree(xfrm);
2561+
}
2562+
xfrm_size = (bsize + 1 > xfrm_size * 2) ? bsize + 1
2563+
: xfrm_size * 2;
2564+
xfrm = (char *) palloc(xfrm_size);
2565+
}
2566+
2567+
size_t out = (bsize < destsize) ? bsize : destsize;
2568+
memcpy(dest, xfrm, out);
2569+
2570+
if (xfrm != xfrm_stack)
2571+
{
2572+
pfree(xfrm);
2573+
}
2574+
if (nullterm != nullterm_stack)
2575+
{
2576+
pfree(nullterm);
2577+
}
2578+
return out;
2579+
}
2580+
GP_WRAP_END;
2581+
return 0;
2582+
}
2583+
24532584
uint32
24542585
gpdb::HashChar(Datum d)
24552586
{

0 commit comments

Comments
 (0)