-- Correlated-proxy dataset: one cheap integer predicate that perfectly predicts
-- three string predicates, alongside a fourth string predicate that is
-- independent of it.
--
--   c0          1 on ~30% of rows, 0 elsewhere          (the cheap proxy)
--   s1, s2, s3  carry a marker exactly where c0 = 1      (correlated)
--   s4          carries a marker on a separate ~30%      (independent)
--
-- All four string columns are deliberately built to the same shape: equal
-- width, a single three-character marker slot at the same offset, and ~30% of
-- rows marked. The intent is that equally cheap regexes over them look alike
-- to anything that judges predicates one at a time -- same cost and same
-- marginal selectivity in every position -- so neither a marginal
-- cost/selectivity estimator nor runtime timing has grounds to prefer one over
-- another. They differ only *conditionally*: once `c0 = 1` has filtered the
-- rows, the s1/s2/s3 regexes keep every survivor (each re-tests the proxy's
-- condition), while the s4 regex still discards ~70%. Only joint statistics
-- expose that; under an independence assumption all four regexes are priced
-- identically in every position.
--
-- Template parameters (shell-style `${NAME:-default}` placeholders that must be
-- expanded before the statement is run):
--   PRED_ROWS  number of rows generated (default 1000000)
--   PRED_FILL  number of 'q' filler characters on each side of the marker
--              (default 30); a non-matching `regexp_like` has to scan the
--              whole value, so the filler width is part of the experiment
CREATE TABLE t AS
WITH row_flags AS (
    -- Each flag is defined exactly once and reused by every column that
    -- depends on it, so the perfect-proxy and independence invariants cannot
    -- drift apart silently.
    --
    -- 7 and 13 are both coprime to 100, so each flag is TRUE for exactly 30 of
    -- every 100 consecutive values, and exactly 9 of those 100 values set both
    -- flags (0.30 * 0.30) -- which is what makes the two flags independent.
    SELECT
        ((value * 7) % 100) < 30 AS marks_proxy,   -- drives c0 and s1/s2/s3
        ((value * 13) % 100) < 30 AS marks_indep,  -- drives s4 only
        -- Filler placed on both sides of every marker slot.
        repeat('q', ${PRED_FILL:-30}) AS pad
    FROM generate_series(1, ${PRED_ROWS:-1000000})
)
SELECT
    CASE WHEN marks_proxy THEN 1 ELSE 0 END AS c0,
    -- Layout of every string column: pad, 3-character marker slot, pad.
    -- 'zzz' fills the slot on unmarked rows.
    pad || CASE WHEN marks_proxy THEN 'aaa' ELSE 'zzz' END || pad AS s1,
    pad || CASE WHEN marks_proxy THEN 'ccc' ELSE 'zzz' END || pad AS s2,
    pad || CASE WHEN marks_proxy THEN 'ddd' ELSE 'zzz' END || pad AS s3,
    pad || CASE WHEN marks_indep THEN 'bbb' ELSE 'zzz' END || pad AS s4
FROM row_flags;
0 commit comments