@@ -352,23 +352,56 @@ async function collectQuerySummary(
352352 // penalty model: each series scores the geomean of `(10 + value) / (10 +
353353 // best)` over every query, imputing a penalty where the series has no value.
354354 //
355- // "Latest per series" is a `DISTINCT ON (query_idx, engine, format)` ordered by
356- // the DENORMALIZED `commit_timestamp DESC` (migrations 006/007, PR-5.1.5 fix c)
357- // instead of joining `commits` and `row_number()`-windowing the whole group.
358- // The covering index `idx_query_measurements_summary` (dataset, dataset_variant,
355+ // "Latest per series" is a recursive-CTE skip scan (loose index scan) over the
356+ // covering index `idx_query_measurements_summary` (dataset, dataset_variant,
359357 // scale_factor, storage, query_idx, engine, format, commit_timestamp DESC)
360- // INCLUDE (value_ns) makes this an Index Only Scan: the sargable group filter
361- // seeks the index prefix, the remaining (query_idx, engine, format,
362- // commit_timestamp DESC) order matches the DISTINCT ON + ORDER BY, and value_ns
363- // (both the `> 0` filter and the projection) is read from the index leaf so
364- // there is no heap fetch. ~1.95s warm for tpcds at the prod seed, vs ~6.2s for
365- // the original commits-join `row_number()` window (006's non-covering index was
366- // ignored because value_ns forced a per-row heap fetch -- see 007). `NULLS LAST`
367- // keeps a transient NULL (a row from a writer not yet populating
368- // `commit_timestamp`, before the post-deploy re-backfill) from winning "latest";
369- // the timestamp value is the same `commits.timestamp` the join used, so the
370- // same-second-tie behavior (an accepted tradeoff) is unchanged. `$1` = dataset,
371- // `$2` = storage; variant/scale params append only when non-null.
358+ // INCLUDE (value_ns) from migrations 006/007 (PR-5.1.5 fix c). The previous
359+ // `DISTINCT ON (query_idx, engine, format) ... ORDER BY commit_timestamp DESC
360+ // NULLS LAST` form scanned the group's entire history (~1.8M index entries,
361+ // ~2.4s warm for tpcds at the prod seed); the skip scan instead does one
362+ // O(log n) index descent per distinct series tuple plus one per series for its
363+ // latest value (~1.3K descents, ~20ms), which is what makes the cache-cold
364+ // `/api/groups` render fast.
365+ //
366+ // Three non-obvious constructions keep every probe a pure index descent:
367+ //
368+ // - The successor probe cannot be a single row comparison
369+ // `(query_idx, engine, format) > (...)`: a row comparison is only a btree
370+ // index qual when its first column is the index's FIRST column, and these
371+ // are index columns 5-7 (the planner degrades the row form to a filter over
372+ // a full scan). It is instead three single-column-inequality branches (next
373+ // format within the series' query_idx+engine, then next engine within its
374+ // query_idx, then next query_idx), each fully index-sargable. `UNION ALL`
375+ // under `LIMIT 1` evaluates the branches lazily in order (Append stops at
376+ // the first row), and branch order equals tuple-successor order, so the
377+ // first non-empty branch IS the successor.
378+ //
379+ // - Every probe's ORDER BY spells out the full index prefix (dataset,
380+ // dataset_variant, scale_factor, storage, ...) even though those columns
381+ // are pinned by the WHERE. An `IS NULL` pin (NULL-variant/scale groups)
382+ // does not join the planner's equivalence classes, so with the short
383+ // `ORDER BY query_idx, engine, format` form the planner cannot prove the
384+ // index already provides the order and inserts a Sort over the whole
385+ // group. The pinned columns are constant per group, so the long form is
386+ // semantically identical.
387+ //
388+ // - "Latest" must order `commit_timestamp DESC NULLS LAST` (a transient NULL
389+ // -- a row from a writer not yet populating `commit_timestamp`, before the
390+ // post-deploy re-backfill -- must not win over real timestamps), but the
391+ // index is `commit_timestamp DESC`, i.e. NULLS FIRST, so that order is not
392+ // index-provided. The per-series latest probe therefore takes the newest
393+ // `commit_timestamp IS NOT NULL` row first (index-ordered descent past the
394+ // NULL block) and falls back to an arbitrary NULL-timestamp row only when
395+ // the series has no timestamped rows, which is exactly the NULLS LAST
396+ // semantics. (Which row wins among all-NULL ties is unspecified, as it
397+ // already was under `DISTINCT ON`.)
398+ //
399+ // The `value_ns > 0` filter rides inside every probe: it is read from the
400+ // index leaf (INCLUDE), so the enumeration lands directly on series that have
401+ // at least one valid row, the same set `DISTINCT ON` produced. The timestamp
402+ // value is the same `commits.timestamp` the original join used, so the
403+ // same-second-tie behavior (an accepted tradeoff) is unchanged. `$1` =
404+ // dataset, `$2` = storage; variant/scale params append only when non-null.
372405 const params : unknown [ ] = [ dataset , storage ] ;
373406 const variantPred =
374407 datasetVariant === null
@@ -378,22 +411,81 @@ async function collectQuerySummary(
378411 scaleFactor === null
379412 ? 'q.scale_factor IS NULL'
380413 : `q.scale_factor = $${ params . push ( scaleFactor ) } ` ;
381- const text = `
382- SELECT query_idx, series, value_ns
383- FROM (
384- SELECT DISTINCT ON (q.query_idx, q.engine, q.format)
385- q.query_idx AS query_idx,
386- q.engine || ':' || q.format AS series,
387- q.value_ns::float8 AS value_ns
388- FROM query_measurements q
389- WHERE q.dataset = $1
414+ const groupPred = `q.dataset = $1
390415 AND ${ variantPred }
391416 AND ${ scalePred }
392- AND q.storage = $2
393- AND q.value_ns > 0
394- ORDER BY q.query_idx, q.engine, q.format, q.commit_timestamp DESC NULLS LAST
417+ AND q.storage = $2` ;
418+ const indexOrder =
419+ 'q.dataset, q.dataset_variant, q.scale_factor, q.storage, q.query_idx, q.engine, q.format' ;
420+ const text = `
421+ WITH RECURSIVE series AS (
422+ (SELECT q.query_idx, q.engine, q.format
423+ FROM query_measurements q
424+ WHERE ${ groupPred }
425+ AND q.value_ns > 0
426+ ORDER BY ${ indexOrder }
427+ LIMIT 1)
428+ UNION ALL
429+ SELECT nxt.query_idx, nxt.engine, nxt.format
430+ FROM series s
431+ CROSS JOIN LATERAL (
432+ (SELECT q.query_idx, q.engine, q.format
433+ FROM query_measurements q
434+ WHERE ${ groupPred }
435+ AND q.query_idx = s.query_idx
436+ AND q.engine = s.engine
437+ AND q.format > s.format
438+ AND q.value_ns > 0
439+ ORDER BY ${ indexOrder }
440+ LIMIT 1)
441+ UNION ALL
442+ (SELECT q.query_idx, q.engine, q.format
443+ FROM query_measurements q
444+ WHERE ${ groupPred }
445+ AND q.query_idx = s.query_idx
446+ AND q.engine > s.engine
447+ AND q.value_ns > 0
448+ ORDER BY ${ indexOrder }
449+ LIMIT 1)
450+ UNION ALL
451+ (SELECT q.query_idx, q.engine, q.format
452+ FROM query_measurements q
453+ WHERE ${ groupPred }
454+ AND q.query_idx > s.query_idx
455+ AND q.value_ns > 0
456+ ORDER BY ${ indexOrder }
457+ LIMIT 1)
458+ LIMIT 1
459+ ) nxt
460+ )
461+ SELECT s.query_idx AS query_idx,
462+ s.engine || ':' || s.format AS series,
463+ latest.value_ns AS value_ns
464+ FROM series s
465+ CROSS JOIN LATERAL (
466+ (SELECT q.value_ns::float8 AS value_ns
467+ FROM query_measurements q
468+ WHERE ${ groupPred }
469+ AND q.query_idx = s.query_idx
470+ AND q.engine = s.engine
471+ AND q.format = s.format
472+ AND q.value_ns > 0
473+ AND q.commit_timestamp IS NOT NULL
474+ ORDER BY ${ indexOrder } , q.commit_timestamp DESC
475+ LIMIT 1)
476+ UNION ALL
477+ (SELECT q.value_ns::float8 AS value_ns
478+ FROM query_measurements q
479+ WHERE ${ groupPred }
480+ AND q.query_idx = s.query_idx
481+ AND q.engine = s.engine
482+ AND q.format = s.format
483+ AND q.value_ns > 0
484+ AND q.commit_timestamp IS NULL
485+ LIMIT 1)
486+ LIMIT 1
395487 ) latest
396- ORDER BY query_idx, series
488+ ORDER BY s. query_idx, s.engine || ':' || s.format
397489 ` ;
398490 const rows = (
399491 await getPool ( ) . query < { query_idx : number ; series : string ; value_ns : number } > ( text , params )
0 commit comments