Merge branch 'feature/fix-pgwatch-prometheus-drain' into 'main'

Sarumyan · Sarumyan · commit afbc4f6a4b18 · 2026-05-25T14:24:57.000Z
fix(pgwatch): keep Prometheus sink cache across scrapes

Closes #198

See merge request postgres-ai/postgresai!266
diff --git a/pgwatch/Dockerfile b/pgwatch/Dockerfile
@@ -1,12 +1,19 @@
 # Patched pgwatch build
 #
-# Fixes: "unexpected extension X version input: 0.0" error that kills all
-# metric gathering when the monitored DB has extensions whose version
-# parses to 0 (e.g. supabase-dbdev 0.0.4 → regex extracts "0.0"
-# → VersionToInt returns 0 → pgwatch treats it as invalid and aborts).
+# Fixes:
+#   1. "unexpected extension X version input: 0.0" error that kills all metric
+#      gathering when the monitored DB has extensions whose version parses to 0
+#      (e.g. supabase-dbdev 0.0.4 → regex extracts "0.0" → VersionToInt returns
+#      0 → pgwatch treats it as invalid and aborts). The one-line fix: skip the
+#      extension instead of returning a fatal error from FetchRuntimeInfo.
 #
-# The one-line fix: skip the extension instead of returning a fatal
-# error from FetchRuntimeInfo.
+#   2. Prometheus sink wipes its per-DB metric cache on every scrape (regression
+#      introduced upstream in v3.6.0, commit fb7abf39 / PR #790). This turns the
+#      /pgwatch endpoint into a transient drain: scrapes between collector polls
+#      return zero pg metrics; scrapes after multiple polls return the union and
+#      can exceed Prometheus sample_limit. Both halves are the same bug. Fix:
+#      remove the 3-line wipe so the cache holds the latest sample per metric
+#      until the next poll overwrites it. See gitlab.com/postgres-ai/postgresai#195.
 #
 # Based on: cybertec-postgresql/pgwatch v3.7.0
 
@@ -45,7 +52,7 @@ ARG TARGETARCH
 COPY --from=uibuilder /src /pgwatch
 COPY --from=uibuilder /src/internal/webui/build /pgwatch/internal/webui/build
 
-# Apply the fix: skip extensions with unparseable versions instead of aborting.
+# Patch 1: skip extensions with unparseable versions instead of aborting.
 # pgwatch's regex extracts only major.minor from extension versions. For
 # extensions like supabase-dbdev (0.0.4), this yields "0.0" which
 # VersionToInt() maps to 0 — treated as invalid, killing all metrics.
@@ -56,6 +63,39 @@ RUN grep -q 'return fmt.Errorf("unexpected extension %s version input: %s", ext,
 RUN sed -i 's|return fmt.Errorf("unexpected extension %s version input: %s", ext, ver)|return nil /* skip unparseable extension version */|' \
     /pgwatch/internal/sources/conn.go
 
+# Patch 2: keep the Prometheus sink's per-DB metric cache across scrapes.
+# Upstream commit fb7abf39 (v3.6.0, "improve Prometheus scrapping (#790)") added
+# a wipe of promAsyncMetricCache[dbname] at the end of every Collect(), turning
+# the cache into a transient drain. Effect: VM scrapes landing between collector
+# polls return zero pg metrics (empty Grafana); scrapes landing after multiple
+# poll cycles return the union of all of them at once and routinely exceed the
+# configured Prometheus sample_limit (rejected scrapes). Both halves are the
+# same bug.
+# Fix: remove the 3-line wipe so the cache holds the latest sample per metric
+# until overwritten by the next poll. Samples are emitted with their original
+# collection epoch via NewMetricWithTimestamp(), so VM deduplicates repeats at
+# storage time — re-emitting the same (metric, timestamp) across scrapes is a
+# no-op. The 10-min promScrapingStalenessHardDropLimit guard in
+# MetricStoreMessageToPromMetrics already covers the "collection stalled, stop
+# emitting" case the wipe was defending against.
+# Refs: gitlab.com/postgres-ai/postgresai#195
+RUN grep -Fq 'clear the cache for this db after metrics are collected' \
+      /pgwatch/internal/sinks/prometheus.go \
+    || (echo "ERROR: drain patch target not found in /pgwatch/internal/sinks/prometheus.go — upstream may have changed"; exit 1)
+RUN grep -Fq 'for dbname, metricsMessages := range promAsyncMetricCache' \
+      /pgwatch/internal/sinks/prometheus.go \
+    || (echo "ERROR: drain patch loop header not found in /pgwatch/internal/sinks/prometheus.go — upstream may have changed"; exit 1)
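+# Remove the 3-line wipe (Lock + assignment-with-unique-comment + Unlock): sed joins each line ending in promAsyncMetricCacheLock.Lock() with the next two (N;N) and deletes the trio only if it contains the marker comment, so other Lock() sites are printed unchanged.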
+RUN sed -i '/promAsyncMetricCacheLock\.Lock()$/{N;N;/clear the cache for this db after metrics are collected/d;}' \
+    /pgwatch/internal/sinks/prometheus.go
+# Removing the wipe leaves the `dbname` loop variable unused, which the Go compiler rejects ("declared and not used") — rename it to `_`.
+RUN sed -i 's|for dbname, metricsMessages := range promAsyncMetricCache|for _, metricsMessages := range promAsyncMetricCache|' \
+    /pgwatch/internal/sinks/prometheus.go
+RUN ! grep -Fq 'clear the cache for this db after metrics are collected' /pgwatch/internal/sinks/prometheus.go \
+    || (echo "ERROR: drain patch applied but wipe comment still present in /pgwatch/internal/sinks/prometheus.go"; exit 1)
+RUN ! grep -Fq 'for dbname, metricsMessages := range promAsyncMetricCache' /pgwatch/internal/sinks/prometheus.go \
+    || (echo "ERROR: drain patch applied but dbname loop var still present in /pgwatch/internal/sinks/prometheus.go"; exit 1)
+
 RUN cd /pgwatch && CGO_ENABLED=0 GOOS=$TARGETOS GOARCH=$TARGETARCH go build \
       -ldflags "-X 'main.version=3.7.0-patched'" \
       ./cmd/pgwatch