web: recursive-CTE skip scans for summary, discovery, and filter universe

connortsui20 · connortsui20 · commit 15b778b01281 · 2026-06-11T14:38:51.000-04:00
Replace the three remaining whole-group/whole-table scans on the read path with
loose index scans (PR-5.1.5, cold-render fix):

- collectQuerySummary: latest-per-(query_idx, engine, format) via a recursive
  successor walk of idx_query_measurements_summary plus a per-series latest
  probe (NULLS LAST semantics preserved via an IS NOT NULL descent with an
  IS NULL fallback). Prod tpcds: 2796ms -> 63ms.
- collectQueryGroups: distinct discovery tuples via a 15-branch NULL-aware
  successor walk of idx_query_measurements_chart instead of a 4.85M-row
  GROUP BY. Prod: 2333ms -> 20ms.
- collectFilterUniverse: distinct engine/format on query_measurements via
  single-column skip scans over the 006 indexes. Prod: 565ms -> 0.2ms.

Every probe spells out the full index-prefix ORDER BY because IS NULL pins do
not join planner equivalence classes (a bare ORDER BY on the suffix columns
forces a Sort over the whole group), and successor seeks are single-inequality
branches under a lazy Append because row comparisons are only btree index quals
from the index's first column. Results verified byte-identical against the
replaced queries on both the testcontainer seed and the full prod seed.

Signed-off-by: "Connor Tsui" <connor@spiraldb.com>
diff --git a/benchmarks-website/web/lib/queries.ts b/benchmarks-website/web/lib/queries.ts
@@ -755,17 +755,118 @@ type QueryGroupRow = {
   query_idx: number;
 };
 
+/**
+ * The five discovery dimensions of `query_measurements`, in the column order of
+ * `idx_query_measurements_chart`, qualified by the probes' table alias `q`.
+ * Doubles as each probe's SELECT list and ORDER BY: spelling out the full index
+ * prefix lets the planner prove each probe is an ordered index descent even
+ * under `IS NULL` pins (see `collectQuerySummary` in `summary.ts` for why).
+ */
+const DISCOVERY_COLS = 'q.dataset, q.dataset_variant, q.scale_factor, q.storage, q.query_idx';
+
+/**
+ * The successor probe of the discovery skip scan: given the current tuple `s`,
+ * find the next distinct `(dataset, dataset_variant, scale_factor, storage,
+ * query_idx)` tuple in index order (ASC, NULLS LAST on the two nullable
+ * columns). A single row comparison cannot express this (it would not be a
+ * btree index qual past column 1, and NULL components poison it), so the
+ * successor is a `UNION ALL` of mutually-ordered single-inequality branches,
+ * evaluated lazily under the caller's `LIMIT 1` (Append stops at the first
+ * row): deepest level first (next query_idx within the same group), then next
+ * storage, scale_factor, dataset_variant, dataset.
+ *
+ * The nullable levels (scale_factor, dataset_variant) follow NULLS LAST order
+ * with two branches each: `col > s.col` walks the non-NULL values (vacuously
+ * empty when `s.col` is NULL, since a NULL comparison is never true), then
+ * `col IS NULL AND s.col IS NOT NULL` steps from the last non-NULL value into
+ * the NULL partition; the `s.col IS NOT NULL` guard keeps the NULL partition
+ * from succeeding itself forever. Equality pins on a nullable column likewise
+ * need both forms (`= s.col` / `IS NULL AND s.col IS NULL`) because
+ * `IS NOT DISTINCT FROM` is not index-sargable; the dead combination returns
+ * no rows at the btree layer for free. Every branch is a pure O(log n) descent
+ * of `idx_query_measurements_chart`.
+ */
+function discoverySuccessorSql(): string {
+  const variantPins = [
+    'q.dataset_variant = s.dataset_variant',
+    'q.dataset_variant IS NULL AND s.dataset_variant IS NULL',
+  ]; // NULL-safe equality on dataset_variant: at most one form can match a given s
+  const scalePins = [
+    'q.scale_factor = s.scale_factor',
+    'q.scale_factor IS NULL AND s.scale_factor IS NULL',
+  ]; // same two-form pin for scale_factor
+  const branches: string[] = []; // WHERE predicates in successor-priority order; 15 in total
+  for (const variantPin of variantPins) {
+    for (const scalePin of scalePins) {
+      branches.push(
+        `q.dataset = s.dataset AND ${variantPin} AND ${scalePin}
+              AND q.storage = s.storage AND q.query_idx > s.query_idx`,
+      );
+    }
+  } // 4 branches: next query_idx, all four group columns pinned
+  for (const variantPin of variantPins) {
+    for (const scalePin of scalePins) {
+      branches.push(
+        `q.dataset = s.dataset AND ${variantPin} AND ${scalePin} AND q.storage > s.storage`,
+      );
+    }
+  } // 4 branches: next storage within (dataset, dataset_variant, scale_factor)
+  for (const variantPin of variantPins) {
+    branches.push(`q.dataset = s.dataset AND ${variantPin} AND q.scale_factor > s.scale_factor`);
+  } // 2 branches: next non-NULL scale_factor within (dataset, dataset_variant)
+  for (const variantPin of variantPins) {
+    branches.push(
+      `q.dataset = s.dataset AND ${variantPin}
+            AND q.scale_factor IS NULL AND s.scale_factor IS NOT NULL`,
+    );
+  } // 2 branches: step from the non-NULL scale_factors into the NULL partition
+  branches.push('q.dataset = s.dataset AND q.dataset_variant > s.dataset_variant');
+  branches.push(
+    'q.dataset = s.dataset AND q.dataset_variant IS NULL AND s.dataset_variant IS NOT NULL',
+  ); // previous two pushes: next dataset_variant, non-NULL values first, then NULL
+  branches.push('q.dataset > s.dataset'); // fallback: first tuple of the next dataset
+  return branches
+    .map(
+      (branch) => `(SELECT ${DISCOVERY_COLS}
+             FROM query_measurements q
+            WHERE ${branch}
+            ORDER BY ${DISCOVERY_COLS}
+            LIMIT 1)`,
+    )
+    .join('\n          UNION ALL\n          '); // join preserves push order across the UNION ALL arms
+}
+
 /**
  * Distinct query groups, one per `(dataset, dataset_variant, scale_factor,
  * storage)` tuple, each with one `Q{idx}` chart link per query index. Rows
  * arrive grouped by the tuple (ORDER BY matches `groups.rs`), so a new group
  * starts whenever the tuple changes.
+ *
+ * The distinct tuples come from a recursive-CTE skip scan (anchor = first index
+ * tuple, step = {@link discoverySuccessorSql}) instead of a `GROUP BY` that scans
+ * all of `query_measurements` (~1.3s at the prod seed vs ~ms; PR-5.1.5, the
+ * same loose-index-scan treatment as `collectQuerySummary`). The skip scan
+ * walks `idx_query_measurements_chart` in its native order (NULLS LAST), so the
+ * outer ORDER BY re-sorts the few hundred result tuples into the v3-parity
+ * NULLS FIRST order the group builder expects.
  */
 async function collectQueryGroups(): Promise<Group[]> {
   const text = `
+    WITH RECURSIVE tuples AS (
+      (SELECT ${DISCOVERY_COLS}
+         FROM query_measurements q
+        ORDER BY ${DISCOVERY_COLS}
+        LIMIT 1)
+      UNION ALL
+      SELECT nxt.dataset, nxt.dataset_variant, nxt.scale_factor, nxt.storage, nxt.query_idx
+        FROM tuples s
+        CROSS JOIN LATERAL (
+          ${discoverySuccessorSql()}
+          LIMIT 1
+        ) nxt
+    )
     SELECT dataset, dataset_variant, scale_factor, storage, query_idx
-      FROM query_measurements
-     GROUP BY dataset, dataset_variant, scale_factor, storage, query_idx
+      FROM tuples
      ORDER BY dataset, dataset_variant NULLS FIRST,
               scale_factor NULLS FIRST, storage, query_idx
   `;
@@ -1076,18 +1177,57 @@ export async function collectGroupCharts(
 export async function collectFilterUniverse(): Promise<FilterUniverse> {
   // The two queries are independent; run them on parallel pool connections
   // since this collector sits on every force-dynamic page render.
+  //
+  // The `query_measurements` DISTINCTs are recursive-CTE skip scans over the 006
+  // single-column indexes (`idx_query_measurements_engine` / `_format`) instead
+  // of full index scans (~460ms each at the prod seed vs ~ms; PR-5.1.5, the same
+  // loose-index-scan treatment as `collectQuerySummary` -- see `summary.ts` for
+  // the mechanics). One descent per distinct value: the anchor takes the
+  // smallest, each step seeks the first value strictly greater than the last.
+  // `engine`/`format` are NOT NULL by schema; the `IS NOT NULL` anchor guards
+  // are kept so an all-NULL column would yield `[]` rather than `[null]`,
+  // preserving the prior queries' explicit-filter semantics. The three small
+  // fact tables (a few thousand rows each) stay plain DISTINCT arms; `UNION`
+  // dedupes across all four sources.
   const [engines, formats] = await Promise.all([
     getPool().query<{ value: string }>(
-      `SELECT DISTINCT engine AS value FROM query_measurements
-        WHERE engine IS NOT NULL`,
+      `WITH RECURSIVE engines AS (
+        (SELECT q.engine AS value FROM query_measurements q
+          WHERE q.engine IS NOT NULL
+          ORDER BY q.engine
+          LIMIT 1)
+        UNION ALL
+        SELECT nxt.value
+          FROM engines e
+          CROSS JOIN LATERAL (
+            SELECT q.engine AS value FROM query_measurements q
+             WHERE q.engine > e.value
+             ORDER BY q.engine
+             LIMIT 1
+          ) nxt
+      )
+      SELECT value FROM engines`,
     ),
     getPool().query<{ value: string }>(
-      `SELECT DISTINCT value FROM (
-              SELECT format AS value FROM query_measurements   WHERE format IS NOT NULL
-        UNION SELECT format AS value FROM compression_times    WHERE format IS NOT NULL
-        UNION SELECT format AS value FROM compression_sizes    WHERE format IS NOT NULL
-        UNION SELECT format AS value FROM random_access_times  WHERE format IS NOT NULL
-       ) AS f`,
+      `WITH RECURSIVE qm_formats AS (
+        (SELECT q.format AS value FROM query_measurements q
+          WHERE q.format IS NOT NULL
+          ORDER BY q.format
+          LIMIT 1)
+        UNION ALL
+        SELECT nxt.value
+          FROM qm_formats f
+          CROSS JOIN LATERAL (
+            SELECT q.format AS value FROM query_measurements q
+             WHERE q.format > f.value
+             ORDER BY q.format
+             LIMIT 1
+          ) nxt
+      )
+            SELECT value FROM qm_formats
+      UNION SELECT format AS value FROM compression_times    WHERE format IS NOT NULL
+      UNION SELECT format AS value FROM compression_sizes    WHERE format IS NOT NULL
+      UNION SELECT format AS value FROM random_access_times  WHERE format IS NOT NULL`,
     ),
   ]);
   return {
diff --git a/benchmarks-website/web/lib/summary.ts b/benchmarks-website/web/lib/summary.ts
@@ -352,23 +352,56 @@ async function collectQuerySummary(
   // penalty model: each series scores the geomean of `(10 + value) / (10 +
   // best)` over every query, imputing a penalty where the series has no value.
   //
-  // "Latest per series" is a `DISTINCT ON (query_idx, engine, format)` ordered by
-  // the DENORMALIZED `commit_timestamp DESC` (migrations 006/007, PR-5.1.5 fix c)
-  // instead of joining `commits` and `row_number()`-windowing the whole group.
-  // The covering index `idx_query_measurements_summary` (dataset, dataset_variant,
+  // "Latest per series" is a recursive-CTE skip scan (loose index scan) over the
+  // covering index `idx_query_measurements_summary` (dataset, dataset_variant,
   // scale_factor, storage, query_idx, engine, format, commit_timestamp DESC)
-  // INCLUDE (value_ns) makes this an Index Only Scan: the sargable group filter
-  // seeks the index prefix, the remaining (query_idx, engine, format,
-  // commit_timestamp DESC) order matches the DISTINCT ON + ORDER BY, and value_ns
-  // (both the `> 0` filter and the projection) is read from the index leaf so
-  // there is no heap fetch. ~1.95s warm for tpcds at the prod seed, vs ~6.2s for
-  // the original commits-join `row_number()` window (006's non-covering index was
-  // ignored because value_ns forced a per-row heap fetch -- see 007). `NULLS LAST`
-  // keeps a transient NULL (a row from a writer not yet populating
-  // `commit_timestamp`, before the post-deploy re-backfill) from winning "latest";
-  // the timestamp value is the same `commits.timestamp` the join used, so the
-  // same-second-tie behavior (an accepted tradeoff) is unchanged. `$1` = dataset,
-  // `$2` = storage; variant/scale params append only when non-null.
+  // INCLUDE (value_ns) from migrations 006/007 (PR-5.1.5 fix c). The previous
+  // `DISTINCT ON (query_idx, engine, format) ... ORDER BY commit_timestamp DESC
+  // NULLS LAST` form scanned the group's entire history (~1.8M index entries,
+  // ~2.4s warm for tpcds at the prod seed); the skip scan instead does one
+  // O(log n) index descent per distinct series tuple plus one per series for its
+  // latest value (~1.3K descents, ~20ms), which is what makes the cache-cold
+  // `/api/groups` render fast.
+  //
+  // Three non-obvious constructions keep every probe a pure index descent:
+  //
+  //  - The successor probe cannot be a single row comparison
+  //    `(query_idx, engine, format) > (...)`: a row comparison is only a btree
+  //    index qual when its first column is the index's FIRST column, and these
+  //    are index columns 5-7 (the planner degrades the row form to a filter over
+  //    a full scan). It is instead three single-column-inequality branches (next
+  //    format within the series' query_idx+engine, then next engine within its
+  //    query_idx, then next query_idx), each fully index-sargable. `UNION ALL`
+  //    under `LIMIT 1` evaluates the branches lazily in order (Append stops at
+  //    the first row), and branch order equals tuple-successor order, so the
+  //    first non-empty branch IS the successor.
+  //
+  //  - Every probe's ORDER BY spells out the full index prefix (dataset,
+  //    dataset_variant, scale_factor, storage, ...) even though those columns
+  //    are pinned by the WHERE. An `IS NULL` pin (NULL-variant/scale groups)
+  //    does not join the planner's equivalence classes, so with the short
+  //    `ORDER BY query_idx, engine, format` form the planner cannot prove the
+  //    index already provides the order and inserts a Sort over the whole
+  //    group. The pinned columns are constant per group, so the long form is
+  //    semantically identical.
+  //
+  //  - "Latest" must order `commit_timestamp DESC NULLS LAST` (a transient NULL
+  //    -- a row from a writer not yet populating `commit_timestamp`, before the
+  //    post-deploy re-backfill -- must not win over real timestamps), but the
+  //    index is `commit_timestamp DESC`, i.e. NULLS FIRST, so that order is not
+  //    index-provided. The per-series latest probe therefore takes the newest
+  //    `commit_timestamp IS NOT NULL` row first (index-ordered descent past the
+  //    NULL block) and falls back to an arbitrary NULL-timestamp row only when
+  //    the series has no timestamped rows, which is exactly the NULLS LAST
+  //    semantics. (Which row wins among all-NULL ties is unspecified, as it
+  //    already was under `DISTINCT ON`.)
+  //
+  // The `value_ns > 0` filter rides inside every probe: it is read from the
+  // index leaf (INCLUDE), so the enumeration lands directly on series that have
+  // at least one valid row, the same set `DISTINCT ON` produced. The timestamp
+  // value is the same `commits.timestamp` the original join used, so the
+  // same-second-tie behavior (an accepted tradeoff) is unchanged. `$1` =
+  // dataset, `$2` = storage; variant/scale params append only when non-null.
   const params: unknown[] = [dataset, storage];
   const variantPred =
     datasetVariant === null
@@ -378,22 +411,81 @@ async function collectQuerySummary(
     scaleFactor === null
       ? 'q.scale_factor IS NULL'
       : `q.scale_factor = $${params.push(scaleFactor)}`;
-  const text = `
-    SELECT query_idx, series, value_ns
-      FROM (
-        SELECT DISTINCT ON (q.query_idx, q.engine, q.format)
-               q.query_idx AS query_idx,
-               q.engine || ':' || q.format AS series,
-               q.value_ns::float8 AS value_ns
-          FROM query_measurements q
-         WHERE q.dataset = $1
+  const groupPred = `q.dataset = $1
            AND ${variantPred}
            AND ${scalePred}
-           AND q.storage = $2
-           AND q.value_ns > 0
-         ORDER BY q.query_idx, q.engine, q.format, q.commit_timestamp DESC NULLS LAST
+           AND q.storage = $2`;
+  const indexOrder =
+    'q.dataset, q.dataset_variant, q.scale_factor, q.storage, q.query_idx, q.engine, q.format';
+  const text = `
+    WITH RECURSIVE series AS (
+      (SELECT q.query_idx, q.engine, q.format
+         FROM query_measurements q
+        WHERE ${groupPred}
+          AND q.value_ns > 0
+        ORDER BY ${indexOrder}
+        LIMIT 1)
+      UNION ALL
+      SELECT nxt.query_idx, nxt.engine, nxt.format
+        FROM series s
+        CROSS JOIN LATERAL (
+          (SELECT q.query_idx, q.engine, q.format
+             FROM query_measurements q
+            WHERE ${groupPred}
+              AND q.query_idx = s.query_idx
+              AND q.engine = s.engine
+              AND q.format > s.format
+              AND q.value_ns > 0
+            ORDER BY ${indexOrder}
+            LIMIT 1)
+          UNION ALL
+          (SELECT q.query_idx, q.engine, q.format
+             FROM query_measurements q
+            WHERE ${groupPred}
+              AND q.query_idx = s.query_idx
+              AND q.engine > s.engine
+              AND q.value_ns > 0
+            ORDER BY ${indexOrder}
+            LIMIT 1)
+          UNION ALL
+          (SELECT q.query_idx, q.engine, q.format
+             FROM query_measurements q
+            WHERE ${groupPred}
+              AND q.query_idx > s.query_idx
+              AND q.value_ns > 0
+            ORDER BY ${indexOrder}
+            LIMIT 1)
+          LIMIT 1
+        ) nxt
+    )
+    SELECT s.query_idx AS query_idx,
+           s.engine || ':' || s.format AS series,
+           latest.value_ns AS value_ns
+      FROM series s
+      CROSS JOIN LATERAL (
+        (SELECT q.value_ns::float8 AS value_ns
+           FROM query_measurements q
+          WHERE ${groupPred}
+            AND q.query_idx = s.query_idx
+            AND q.engine = s.engine
+            AND q.format = s.format
+            AND q.value_ns > 0
+            AND q.commit_timestamp IS NOT NULL
+          ORDER BY ${indexOrder}, q.commit_timestamp DESC
+          LIMIT 1)
+        UNION ALL
+        (SELECT q.value_ns::float8 AS value_ns
+           FROM query_measurements q
+          WHERE ${groupPred}
+            AND q.query_idx = s.query_idx
+            AND q.engine = s.engine
+            AND q.format = s.format
+            AND q.value_ns > 0
+            AND q.commit_timestamp IS NULL
+          LIMIT 1)
+        LIMIT 1
       ) latest
-     ORDER BY query_idx, series
+     ORDER BY s.query_idx, s.engine || ':' || s.format
   `;
   const rows = (
     await getPool().query<{ query_idx: number; series: string; value_ns: number }>(text, params)