fix(metrics): port pgwatch2 top-N + 'other' bucket to pg_stat/statio_all_*

claude · claude · commit 0668e6a59b8f · 2026-05-15T14:50:34.000Z
The four per-relation metrics (pg_stat_all_indexes, pg_stat_all_tables,
pg_statio_all_tables, pg_statio_all_indexes) had no schema filter and a
flat LIMIT 5000 truncation. On extension- or schema-heavy databases this
overran prometheus.yml's sample_limit (10000) so the entire scrape was
silently rejected, and the LIMIT tail was dropped without any aggregate
row left behind — dashboard sums drifted.

Port the gen2 (gitlab.com/postgres-ai/pgwatch2) approach faithfully
instead of reinventing it:

- Read pg_stat_user_*/pg_statio_user_* — pg_catalog, information_schema
  and pg_toast are excluded by the Postgres view itself, so we don't
  maintain a hand-curated nspname LIKE pattern that has to grow every
  time a new extension ships its own schema.
- row_number() OVER (ORDER BY <relevance>) <= 100 per database. Rank by
  pg_total_relation_size for tables (big tables are the interesting ones;
  n_live_tup+n_dead_tup starved big-but-static tables) and by activity
  for indexes/IO views.
- UNION ALL an 'other' row that sums the tail so totals stay correct
  under the cap. HAVING count(*) > 0 suppresses the row when nothing
  was truncated.
- Skip rows with no I/O activity in the statio views — most of the tail
  on schema-heavy DBs is dead-cold relations.
- Filter pg_temp% from index metrics so leftover temp objects from dead
  sessions stop leaking samples.

Metric names and exposed tag_* labels are unchanged so Dashboards 8–11
keep working. Adds two compliance-vector tests that pin the pattern.
diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml
@@ -1552,28 +1552,85 @@ metrics:
       - total_relation_size_bytes
     statement_timeout_seconds: 15
   pg_stat_all_indexes:
+    # Top-N + "other" bucket pattern ported from pgwatch2 (gen2). Reads
+    # pg_stat_user_indexes so pg_catalog/information_schema/pg_toast are
+    # excluded by the Postgres view itself, no hand-curated nspname pattern.
+    # The "other" row aggregates the tail so totals stay correct under a
+    # hard cardinality cap.
     sqls:
       11: |
-        select /* pgwatch_generated */
+        with ranked as ( /* pgwatch_generated */
+          select
+            row_number() over (order by idx_scan desc nulls last) as rownum,
+            schemaname,
+            relname,
+            indexrelname,
+            idx_scan,
+            idx_tup_read,
+            idx_tup_fetch
+          from pg_stat_user_indexes
+          where not schemaname like E'pg\\_temp%'
+        )
+        select
           current_database() as tag_datname,
           schemaname as tag_schemaname,
           relname as tag_relname,
           indexrelname as tag_indexrelname,
           idx_scan,
           idx_tup_read,
           idx_tup_fetch
-        from pg_stat_all_indexes
-        order by idx_scan desc
-        limit 5000
+        from ranked
+        where rownum <= 100
+        union all
+        select
+          current_database() as tag_datname,
+          'other'::text as tag_schemaname,
+          'other'::text as tag_relname,
+          'other'::text as tag_indexrelname,
+          coalesce(sum(idx_scan), 0)::int8 as idx_scan,
+          coalesce(sum(idx_tup_read), 0)::int8 as idx_tup_read,
+          coalesce(sum(idx_tup_fetch), 0)::int8 as idx_tup_fetch
+        from ranked
+        where rownum > 100
+        having count(*) > 0
     gauges:
       - idx_scan
       - idx_tup_read
       - idx_tup_fetch
     statement_timeout_seconds: 15
   pg_stat_all_tables:
+    # Top-N + "other" bucket pattern ported from pgwatch2 (gen2). Ranks by
+    # pg_total_relation_size — large tables are usually the interesting ones,
+    # which avoids starving big-but-static tables out of the top-N (the old
+    # n_live_tup+n_dead_tup ordering did exactly that).
     sqls:
       11: |
-        select /* pgwatch_generated */
+        with ranked as ( /* pgwatch_generated */
+          select
+            row_number() over (order by pg_total_relation_size(relid) desc nulls last) as rownum,
+            schemaname,
+            relname,
+            seq_scan,
+            seq_tup_read,
+            idx_scan,
+            idx_tup_fetch,
+            n_tup_ins,
+            n_tup_upd,
+            n_tup_del,
+            n_tup_hot_upd,
+            n_live_tup,
+            n_dead_tup,
+            last_vacuum,
+            last_autovacuum,
+            last_analyze,
+            last_autoanalyze,
+            vacuum_count,
+            autovacuum_count,
+            analyze_count,
+            autoanalyze_count
+          from pg_stat_user_tables
+        )
+        select
           current_database() as tag_datname,
           schemaname as tag_schemaname,
           relname as tag_relname,
@@ -1592,10 +1649,30 @@ metrics:
           extract(epoch from greatest(last_autoanalyze, last_analyze, '1970-01-01Z'))::int8 as last_analyze,
           (vacuum_count + autovacuum_count) as vacuum_count,
           (analyze_count + autoanalyze_count) as analyze_count
-        from
-          pg_stat_all_tables
-        order by n_live_tup + n_dead_tup desc
-        limit 5000
+        from ranked
+        where rownum <= 100
+        union all
+        select
+          current_database() as tag_datname,
+          'other'::text as tag_schemaname,
+          'other'::text as tag_relname,
+          coalesce(sum(seq_scan), 0)::int8 as seq_scan,
+          coalesce(sum(seq_tup_read), 0)::int8 as seq_tup_read,
+          coalesce(sum(idx_scan), 0)::int8 as idx_scan,
+          coalesce(sum(idx_tup_fetch), 0)::int8 as idx_tup_fetch,
+          coalesce(sum(n_tup_ins), 0)::int8 as n_tup_ins,
+          coalesce(sum(n_tup_upd), 0)::int8 as n_tup_upd,
+          coalesce(sum(n_tup_del), 0)::int8 as n_tup_del,
+          coalesce(sum(n_tup_hot_upd), 0)::int8 as n_tup_hot_upd,
+          coalesce(sum(n_live_tup), 0)::int8 as n_live_tup,
+          coalesce(sum(n_dead_tup), 0)::int8 as n_dead_tup,
+          0::int8 as last_vacuum,
+          0::int8 as last_analyze,
+          coalesce(sum(vacuum_count + autovacuum_count), 0)::int8 as vacuum_count,
+          coalesce(sum(analyze_count + autoanalyze_count), 0)::int8 as analyze_count
+        from ranked
+        where rownum > 100
+        having count(*) > 0
     gauges:
       - seq_scan
       - seq_tup_read
@@ -2881,60 +2958,115 @@ metrics:
     statement_timeout_seconds: 15
   pg_statio_all_tables:
     description: >
-      Retrieves table-level I/O statistics from the PostgreSQL `pg_statio_all_tables` view, providing insights into I/O operations for all tables.
-      It returns block-level read and hit statistics for heap, index, TOAST, and TOAST index operations broken down by schema and table.
-      Joined with pg_class for efficient ordering by table size.
-      This metric helps administrators monitor table-level I/O performance and identify which tables are generating the most I/O activity.
+      Retrieves table-level I/O statistics from `pg_statio_user_tables`, returning
+      block-level read and hit counters for heap, index, TOAST and TOAST-index pages.
+      Ports the pgwatch2 (gen2) top-N + `'other'` bucket pattern: ranks tables by
+      heap_blks_read, keeps the top 100, and folds the tail into a single `'other'`
+      row so totals remain accurate while cardinality stays bounded. Drops rows
+      with no I/O activity at all (every counter zero).
       Compatible with all PostgreSQL versions.
     sqls:
       11: |-
-        select /* pgwatch_generated */
+        with ranked as ( /* pgwatch_generated */
+          select
+            row_number() over (order by heap_blks_read desc nulls last) as rownum,
+            schemaname,
+            relname,
+            heap_blks_read,
+            heap_blks_hit,
+            idx_blks_read,
+            idx_blks_hit,
+            toast_blks_read,
+            toast_blks_hit,
+            tidx_blks_read,
+            tidx_blks_hit
+          from pg_statio_user_tables
+          where
+            heap_blks_read > 0 or heap_blks_hit > 0
+            or idx_blks_read > 0 or idx_blks_hit > 0
+            or toast_blks_read > 0 or toast_blks_hit > 0
+            or tidx_blks_read > 0 or tidx_blks_hit > 0
+        )
+        select
           (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
           current_database() as tag_datname,
-          s.schemaname as tag_schemaname,
-          s.relname as tag_relname,
-          s.heap_blks_read,
-          s.heap_blks_hit,
-          s.idx_blks_read,
-          s.idx_blks_hit,
-          s.toast_blks_read,
-          s.toast_blks_hit,
-          s.tidx_blks_read,
-          s.tidx_blks_hit
-        from
-          pg_statio_all_tables as s
-          join pg_class as c on
-            s.relname = c.relname
-            and s.schemaname = c.relnamespace::regnamespace::name
-        order by c.relpages desc
-        limit 5000;
+          schemaname as tag_schemaname,
+          relname as tag_relname,
+          heap_blks_read,
+          heap_blks_hit,
+          idx_blks_read,
+          idx_blks_hit,
+          toast_blks_read,
+          toast_blks_hit,
+          tidx_blks_read,
+          tidx_blks_hit
+        from ranked
+        where rownum <= 100
+        union all
+        select
+          (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
+          current_database() as tag_datname,
+          'other'::text as tag_schemaname,
+          'other'::text as tag_relname,
+          coalesce(sum(heap_blks_read), 0)::int8 as heap_blks_read,
+          coalesce(sum(heap_blks_hit), 0)::int8 as heap_blks_hit,
+          coalesce(sum(idx_blks_read), 0)::int8 as idx_blks_read,
+          coalesce(sum(idx_blks_hit), 0)::int8 as idx_blks_hit,
+          coalesce(sum(toast_blks_read), 0)::int8 as toast_blks_read,
+          coalesce(sum(toast_blks_hit), 0)::int8 as toast_blks_hit,
+          coalesce(sum(tidx_blks_read), 0)::int8 as tidx_blks_read,
+          coalesce(sum(tidx_blks_hit), 0)::int8 as tidx_blks_hit
+        from ranked
+        where rownum > 100
+        having count(*) > 0;
     gauges:
       - '*'
     statement_timeout_seconds: 15
   pg_statio_all_indexes:
     description: >
-      Retrieves index-level I/O statistics from the PostgreSQL `pg_statio_all_indexes` view, providing insights into I/O operations for all indexes.
-      It returns block-level read and hit statistics for index operations broken down by schema, table, and index name.
-      Joined with pg_class for efficient ordering by index size.
-      This metric helps administrators monitor index-level I/O performance and identify which indexes are generating the most I/O activity.
+      Retrieves index-level I/O statistics from `pg_statio_user_indexes`, returning
+      block-level read and hit counters per index. Ports the pgwatch2 (gen2)
+      top-N + `'other'` bucket pattern: ranks indexes by idx_blks_read, keeps the
+      top 100, folds the tail into a single `'other'` row, and drops indexes with
+      no I/O activity. Filters temp schemas.
       Compatible with all PostgreSQL versions.
     sqls:
       11: |-
-        select /* pgwatch_generated */
+        with ranked as ( /* pgwatch_generated */
+          select
+            row_number() over (order by idx_blks_read desc nulls last) as rownum,
+            schemaname,
+            relname,
+            indexrelname,
+            idx_blks_read,
+            idx_blks_hit
+          from pg_statio_user_indexes
+          where
+            not schemaname like E'pg\\_temp%'
+            and (idx_blks_read > 0 or idx_blks_hit > 0)
+        )
+        select
           (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
           current_database() as tag_datname,
-          s.schemaname as tag_schemaname,
-          s.relname as tag_relname,
-          s.indexrelname as tag_indexrelname,
-          s.idx_blks_read,
-          s.idx_blks_hit
-        from
-          pg_statio_all_indexes as s
-          join pg_class as c on
-            s.indexrelname = c.relname
-            and s.schemaname = c.relnamespace::regnamespace::name
-        order by c.relpages desc
-        limit 5000;
+          schemaname as tag_schemaname,
+          relname as tag_relname,
+          indexrelname as tag_indexrelname,
+          idx_blks_read,
+          idx_blks_hit
+        from ranked
+        where rownum <= 100
+        union all
+        select
+          (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
+          current_database() as tag_datname,
+          'other'::text as tag_schemaname,
+          'other'::text as tag_relname,
+          'other'::text as tag_indexrelname,
+          coalesce(sum(idx_blks_read), 0)::int8 as idx_blks_read,
+          coalesce(sum(idx_blks_hit), 0)::int8 as idx_blks_hit
+        from ranked
+        where rownum > 100
+        having count(*) > 0;
     gauges:
       - '*'
     statement_timeout_seconds: 15
diff --git a/tests/compliance_vectors/test_mr219_monitoring_guards.py b/tests/compliance_vectors/test_mr219_monitoring_guards.py
@@ -80,6 +80,52 @@ def test_pgwatch_metrics_yml_pg_stat_statements_has_top_n_filter():
             assert "limit 100" in compact_sql
 
 
+def test_pgwatch_stat_views_use_topn_and_other_bucket():
+    """High-cardinality per-relation metrics must port the pgwatch2 (gen2)
+    pattern: read pg_stat_user_*/pg_statio_user_* (so pg_catalog,
+    information_schema and pg_toast are excluded by the Postgres view
+    itself, no hand-curated nspname pattern), keep the top 100 by relevance,
+    and aggregate the tail into a single `'other'` tag row so dashboard
+    totals stay correct under a hard cardinality cap. Hand-rolled nspname
+    LIKE filters or LIMIT-only truncation silently drop the tail and break
+    sums on extension-heavy or schema-heavy databases.
+    """
+    metrics = yaml.safe_load(
+        (PROJECT_ROOT / "config/pgwatch-prometheus/metrics.yml").read_text()
+    )
+    expectations = {
+        "pg_stat_all_indexes": "pg_stat_user_indexes",
+        "pg_stat_all_tables": "pg_stat_user_tables",
+        "pg_statio_all_tables": "pg_statio_user_tables",
+        "pg_statio_all_indexes": "pg_statio_user_indexes",
+    }
+    for metric_name, base_view in expectations.items():
+        for sql in metrics["metrics"][metric_name]["sqls"].values():
+            compact_sql = _compact_sql(sql)
+            assert base_view in compact_sql, metric_name
+            # Top-N window + tail aggregation
+            assert "row_number() over" in compact_sql, metric_name
+            assert "rownum <= 100" in compact_sql, metric_name
+            assert "rownum > 100" in compact_sql, metric_name
+            assert "'other'" in compact_sql, metric_name
+            # No unfiltered LIMIT-only truncation left in place
+            assert "limit 5000" not in compact_sql, metric_name
+
+
+def test_pgwatch_statio_skips_zero_activity_rows():
+    """pg_statio_user_* tail is mostly zero-I/O rows on schema-heavy DBs.
+    Filtering them out (pgwatch2 behavior) cuts cardinality before the
+    top-N cap is even reached and keeps the `'other'` bucket meaningful.
+    """
+    metrics = yaml.safe_load(
+        (PROJECT_ROOT / "config/pgwatch-prometheus/metrics.yml").read_text()
+    )
+    for sql in metrics["metrics"]["pg_statio_all_tables"]["sqls"].values():
+        assert "heap_blks_read > 0" in _compact_sql(sql)
+    for sql in metrics["metrics"]["pg_statio_all_indexes"]["sqls"].values():
+        assert "idx_blks_read > 0" in _compact_sql(sql)
+
+
 def test_pgwatch_dockerfile_sha_pin_and_patch_present():
     dockerfile = (PROJECT_ROOT / "pgwatch/Dockerfile").read_text()