Skip to content

Commit 7d41d6c

Browse files
committed
Merge branch 'maya/fix-work-item-196-monitoring' into 'main'
fix(monitoring): address work item 196 release blockers. Closes #196. See merge request postgres-ai/postgresai!277.
2 parents 14abda1 + dbcd138 commit 7d41d6c

5 files changed

Lines changed: 229 additions & 12 deletions

File tree

config/grafana/dashboards/Dashboard_1_Node_performance_overview.json

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2946,7 +2946,7 @@
29462946
{
29472947
"disableTextWrap": false,
29482948
"editorMode": "code",
2949-
"expr": "(\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"7\"}) * 0 + 7 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"6\"}) * 0 + 6 or\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"5\"}) * 0 + 5 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"4\"}) * 0 + 4 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"3\"}) * 0 + 3 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"2\"}) * 0 + 2 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"1\"}) * 0 + 1\n)",
2949+
"expr": "(\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"7\"}) * 0 + 7 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"6\"}) * 0 + 6 or\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"5\"}) * 0 + 5 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"4\"}) * 0 + 4 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"3\"}) * 0 + 3 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"2\"}) * 0 + 2 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"1\"}) * 0 + 1\n)",
29502950
"fullMetaSearch": false,
29512951
"includeNullMetadata": true,
29522952
"interval": "",
@@ -3354,12 +3354,12 @@
33543354
"overrides": []
33553355
},
33563356
"gridPos": {
3357-
"h": 14,
3357+
"h": 10,
33583358
"w": 12,
33593359
"x": 12,
33603360
"y": 113
33613361
},
3362-
"id": 50,
3362+
"id": 51,
33633363
"options": {
33643364
"legend": {
33653365
"calcs": [
@@ -3383,7 +3383,7 @@
33833383
"uid": "P7A0D6631BB10B34F"
33843384
},
33853385
"editorMode": "code",
3386-
"expr": "last_over_time(pgwatch_pg_wal_size_bytes{cluster=\"$cluster_name\", node_name=\"$node_name\"}[1h])",
3386+
"expr": "last_over_time(pgwatch_pg_wal_size_bytes{cluster=\"$cluster_name\", node_name=\"$node_name\"}[10m])",
33873387
"legendFormat": "pg_wal_size",
33883388
"range": true,
33893389
"refId": "A"
@@ -3392,6 +3392,89 @@
33923392
"title": "pg_wal directory size",
33933393
"type": "timeseries"
33943394
},
3395+
{
3396+
"datasource": {
3397+
"type": "prometheus",
3398+
"uid": "P7A0D6631BB10B34F"
3399+
},
3400+
"description": "Explains why pg_wal size bytes are missing. 0 = OK, 1 = pg_ls_waldir() is unavailable, 2 = monitoring role lacks EXECUTE privilege.",
3401+
"fieldConfig": {
3402+
"defaults": {
3403+
"color": {
3404+
"mode": "thresholds"
3405+
},
3406+
"mappings": [
3407+
{
3408+
"options": {
3409+
"0": {
3410+
"text": "OK"
3411+
},
3412+
"1": {
3413+
"text": "pg_ls_waldir() unavailable"
3414+
},
3415+
"2": {
3416+
"text": "EXECUTE missing"
3417+
}
3418+
},
3419+
"type": "value"
3420+
}
3421+
],
3422+
"thresholds": {
3423+
"mode": "absolute",
3424+
"steps": [
3425+
{
3426+
"color": "green"
3427+
},
3428+
{
3429+
"color": "red",
3430+
"value": 1
3431+
}
3432+
]
3433+
}
3434+
},
3435+
"overrides": []
3436+
},
3437+
"gridPos": {
3438+
"h": 4,
3439+
"w": 12,
3440+
"x": 12,
3441+
"y": 123
3442+
},
3443+
"id": 52,
3444+
"options": {
3445+
"colorMode": "value",
3446+
"graphMode": "none",
3447+
"justifyMode": "auto",
3448+
"orientation": "auto",
3449+
"percentChangeColorMode": "standard",
3450+
"reduceOptions": {
3451+
"calcs": [
3452+
"lastNotNull"
3453+
],
3454+
"fields": "",
3455+
"values": false
3456+
},
3457+
"showPercentChange": false,
3458+
"textMode": "auto",
3459+
"wideLayout": true
3460+
},
3461+
"pluginVersion": "12.0.2",
3462+
"targets": [
3463+
{
3464+
"datasource": {
3465+
"type": "prometheus",
3466+
"uid": "P7A0D6631BB10B34F"
3467+
},
3468+
"editorMode": "code",
3469+
"expr": "last_over_time(pgwatch_pg_wal_size_status_code{cluster=\"$cluster_name\", node_name=\"$node_name\"}[10m])",
3470+
"legendFormat": "status_code",
3471+
"range": true,
3472+
"refId": "A"
3473+
}
3474+
],
3475+
"title": "pg_wal size collection status",
3476+
"type": "stat"
3477+
},
33953478
{
33963479
"fieldConfig": {
33973480
"defaults": {},

config/grafana/dashboards/Dashboard_7_Autovacuum_and_xmin_horizon.json

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@
249249
"uid": "P7A0D6631BB10B34F"
250250
},
251251
"editorMode": "code",
252-
"expr": "topk($top_n, last_over_time(pgwatch_table_stats_tx_freeze_age{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"}[5m]))",
252+
"expr": "topk($top_n, last_over_time(pgwatch_table_stats_tx_freeze_age{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\", schema!~\"pg_.*|information_schema|_timescaledb.*\"}[5m]))",
253253
"legendFormat": "{{schema}}.{{table_name}}",
254254
"range": true,
255255
"refId": "A"
@@ -492,7 +492,7 @@
492492
"uid": "P7A0D6631BB10B34F"
493493
},
494494
"editorMode": "code",
495-
"expr": "topk($top_n, last_over_time(pgwatch_table_stats_mxid_freeze_age{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\"}[5m]))",
495+
"expr": "topk($top_n, last_over_time(pgwatch_table_stats_mxid_freeze_age{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=~\"$db_name\", schema!~\"pg_.*|information_schema|_timescaledb.*\"}[5m]))",
496496
"legendFormat": "{{schema}}.{{table_name}}",
497497
"range": true,
498498
"refId": "A"
@@ -1598,7 +1598,7 @@
15981598
{
15991599
"disableTextWrap": false,
16001600
"editorMode": "code",
1601-
"expr": "(\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"7\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 7 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"6\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 6 or\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"5\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 5 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"4\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 4 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"3\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 3 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"2\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 2 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{phase=\"1\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 1\n)",
1601+
"expr": "(\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"7\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 7 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"6\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 6 or\n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"5\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 5 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"4\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 4 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"3\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 3 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"2\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 2 or \n group by (schema_name, table_name, vacuum_mode) (\n pgwatch_pg_vacuum_progress_index_vacuum_count{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", phase=\"1\", schema_name=~\"$schema_name\", table_name=~\"$table_name\"}) * 0 + 1\n)",
16021602
"fullMetaSearch": false,
16031603
"includeNullMetadata": true,
16041604
"interval": "",
@@ -1678,18 +1678,17 @@
16781678
"type": "query"
16791679
},
16801680
{
1681-
"allValue": ".+",
16821681
"current": {
16831682
"text": "public",
16841683
"value": "public"
16851684
},
1686-
"definition": "label_values(pgwatch_table_stats_tx_freeze_age,schema)",
1685+
"definition": "label_values(pgwatch_table_stats_tx_freeze_age{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", schema!~\"pg_.*|information_schema|_timescaledb.*\"},schema)",
16871686
"includeAll": true,
16881687
"name": "schema_name",
16891688
"options": [],
16901689
"query": {
16911690
"qryType": 1,
1692-
"query": "label_values(pgwatch_table_stats_tx_freeze_age,schema)",
1691+
"query": "label_values(pgwatch_table_stats_tx_freeze_age{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", schema!~\"pg_.*|information_schema|_timescaledb.*\"},schema)",
16931692
"refId": "PrometheusVariableQueryEditor-VariableQuery"
16941693
},
16951694
"refresh": 1,
@@ -1704,14 +1703,14 @@
17041703
"$__all"
17051704
]
17061705
},
1707-
"definition": "label_values(pgwatch_table_stats_tx_freeze_age{schema=~\"$schema_name\"},table_name)",
1706+
"definition": "label_values(pgwatch_table_stats_tx_freeze_age{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", schema=~\"$schema_name\"},table_name)",
17081707
"includeAll": true,
17091708
"multi": true,
17101709
"name": "table_name",
17111710
"options": [],
17121711
"query": {
17131712
"qryType": 1,
1714-
"query": "label_values(pgwatch_table_stats_tx_freeze_age{schema=~\"$schema_name\"},table_name)",
1713+
"query": "label_values(pgwatch_table_stats_tx_freeze_age{cluster=\"$cluster_name\", node_name=\"$node_name\", datname=\"$db_name\", schema=~\"$schema_name\"},table_name)",
17151714
"refId": "PrometheusVariableQueryEditor-VariableQuery"
17161715
},
17171716
"refresh": 1,

config/pgwatch-prometheus/metrics.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,6 +1061,7 @@ metrics:
10611061
pid <> pg_backend_pid()
10621062
and backend_type = 'client backend'
10631063
and backend_xmin is not null
1064+
and usename <> current_user
10641065
),
10651066
slots as (
10661067
select
@@ -1179,6 +1180,7 @@ metrics:
11791180
pid <> pg_backend_pid()
11801181
and backend_type = 'client backend'
11811182
and backend_xmin is not null
1183+
and usename <> current_user
11821184
order by age(backend_xmin) desc, pid asc
11831185
limit 1
11841186
),
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""Regression coverage for work item 196 monitoring release fixes."""
2+
3+
from __future__ import annotations
4+
5+
import json
6+
from collections import Counter
7+
from pathlib import Path
8+
from typing import Any
9+
10+
11+
# Third ancestor of this file (parents[0] is the containing directory).
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Dashboard trees checked by these tests: index 0 is the primary config tree,
# index 1 is the copy under postgres_ai_helm that is compared against it.
DASHBOARD_DIRS = (
    PROJECT_ROOT / "config" / "grafana" / "dashboards",
    PROJECT_ROOT / "postgres_ai_helm" / "config" / "grafana" / "dashboards",
)

# Individual dashboards (from the primary tree) that have dedicated checks.
DASHBOARD_1 = DASHBOARD_DIRS[0] / "Dashboard_1_Node_performance_overview.json"
DASHBOARD_7 = DASHBOARD_DIRS[0] / "Dashboard_7_Autovacuum_and_xmin_horizon.json"
18+
19+
20+
def load_dashboard(path: Path) -> dict[str, Any]:
    """Parse the dashboard JSON file at *path* and return the decoded object.

    The file is always decoded as UTF-8 so the result does not depend on the
    locale of the machine running the tests.

    Raises:
        OSError: if the file cannot be read.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))
22+
23+
24+
def iter_panels(value: Any):
    """Yield every panel-like dict found anywhere inside *value*.

    A dict counts as a panel when it has both an ``"id"`` and a ``"type"``
    key. The walk is depth-first and pre-order: a matching dict is yielded
    before its own children are searched, so panels nested inside other
    panels are yielded after their parent. Values that are neither dicts nor
    lists yield nothing.
    """
    if isinstance(value, dict):
        if "id" in value and "type" in value:
            yield value
        children = value.values()
    elif isinstance(value, list):
        children = value
    else:
        # Scalars (str, int, None, ...) cannot contain panels.
        return
    for child in children:
        yield from iter_panels(child)
33+
34+
35+
def panel_by_title(dashboard: dict[str, Any], title: str) -> dict[str, Any]:
    """Return the first panel in *dashboard* whose ``"title"`` equals *title*.

    Panels are discovered with ``iter_panels``, so nested panels are searched
    as well.

    Raises:
        AssertionError: if no panel carries that title.
    """
    for panel in iter_panels(dashboard):
        if panel.get("title") == title:
            return panel
    raise AssertionError(f"Panel {title!r} not found")
40+
41+
42+
def variable_by_name(dashboard: dict[str, Any], name: str) -> dict[str, Any]:
    """Return the templating variable of *dashboard* called *name*.

    Variables are read from ``dashboard["templating"]["list"]``. A missing or
    null ``templating`` / ``list`` entry is treated as "no variables".

    Raises:
        AssertionError: if no variable with that name exists.
    """
    # ``or`` fallbacks also cover keys that are present but null, which would
    # otherwise surface as AttributeError/TypeError instead of a clear failure.
    templating = dashboard.get("templating") or {}
    for variable in templating.get("list") or []:
        if variable.get("name") == name:
            return variable
    raise AssertionError(f"Variable {name!r} not found")
47+
48+
49+
def target_expr(panel: dict[str, Any]) -> str:
    """Return the ``"expr"`` of the first target of *panel*.

    Returns an empty string when the first target has no ``"expr"`` key.

    Raises:
        AssertionError: if the panel has no targets (key missing, null or
            empty list).
    """
    targets = panel.get("targets") or []
    if not targets:
        # Raised explicitly (rather than via ``assert``) so the guard is not
        # stripped when Python runs with -O.
        raise AssertionError(f"Panel {panel.get('title')!r} has no targets")
    return targets[0].get("expr", "")
53+
54+
55+
def test_dashboard_panel_ids_are_unique() -> None:
    """Every dashboard in both trees must use each panel id at most once."""
    for dashboard_dir in DASHBOARD_DIRS:
        # Sorted so failures are reported in a stable order.
        paths = sorted(dashboard_dir.glob("*.json"))
        # An empty glob would let this test pass without checking anything.
        assert paths, f"No dashboards found in {dashboard_dir}"
        for path in paths:
            ids = [panel["id"] for panel in iter_panels(load_dashboard(path))]
            duplicates = sorted(
                panel_id for panel_id, count in Counter(ids).items() if count > 1
            )
            assert not duplicates, (
                f"{path.relative_to(PROJECT_ROOT)} duplicate ids: {duplicates}"
            )
63+
64+
65+
def test_helm_dashboard_parity_is_preserved() -> None:
    """Each primary dashboard must have a textually identical helm copy."""
    source_dir, helm_dir = DASHBOARD_DIRS
    source_paths = sorted(source_dir.glob("*.json"))
    # Guard against a vacuous pass when the source tree cannot be found.
    assert source_paths, f"No dashboards found in {source_dir}"
    for source_path in source_paths:
        helm_path = helm_dir / source_path.name
        assert helm_path.exists(), f"Missing helm dashboard {helm_path.name}"
        # Decode explicitly as UTF-8 so the comparison is locale-independent.
        source_text = source_path.read_text(encoding="utf-8")
        helm_text = helm_path.read_text(encoding="utf-8")
        assert source_text == helm_text, source_path.name
71+
72+
73+
def test_vacuum_timeline_queries_are_scoped() -> None:
    """"Vacuum timeline" queries must filter by cluster, node and database.

    For each dashboard the panel expression must contain, for phases 1-7, a
    selector that starts with the cluster/node/database matchers followed by
    the phase matcher, plus any dashboard-specific extra filters. The old
    unscoped selector (phase as the first matcher) must be gone.
    """
    # Dashboard path -> additional matchers its expression must contain.
    expectations = {
        DASHBOARD_1: (),
        DASHBOARD_7: ('schema_name=~"$schema_name"', 'table_name=~"$table_name"'),
    }
    for path, extra_filters in expectations.items():
        expr = target_expr(panel_by_title(load_dashboard(path), "Vacuum timeline"))
        for phase in range(1, 8):
            fragment = (
                "pgwatch_pg_vacuum_progress_index_vacuum_count"
                '{cluster="$cluster_name", node_name="$node_name", '
                f'datname="$db_name", phase="{phase}"'
            )
            assert fragment in expr, f"{path.name}: phase {phase} is not scoped"
        for extra_filter in extra_filters:
            assert extra_filter in expr, f"{path.name}: missing {extra_filter}"
        assert 'pgwatch_pg_vacuum_progress_index_vacuum_count{phase="' not in expr, (
            f"{path.name}: unscoped selector still present"
        )
86+
87+
88+
def test_dashboard_7_schema_and_table_variables_are_scoped() -> None:
    """Dashboard 7's schema/table variables must be scoped to the selection.

    Both variable queries must filter by cluster, node and database. The
    schema variable must additionally exclude system schemas and must not
    define ``allValue``; the table variable must filter by the chosen schema.
    """
    dashboard = load_dashboard(DASHBOARD_7)
    schema_variable = variable_by_name(dashboard, "schema_name")
    schema_query = schema_variable["query"]["query"]
    table_query = variable_by_name(dashboard, "table_name")["query"]["query"]

    for query in (schema_query, table_query):
        assert 'cluster="$cluster_name"' in query, query
        assert 'node_name="$node_name"' in query, query
        assert 'datname="$db_name"' in query, query

    assert 'schema!~"pg_.*|information_schema|_timescaledb.*"' in schema_query
    assert "allValue" not in schema_variable
    assert 'schema=~"$schema_name"' in table_query
101+
102+
103+
def test_dashboard_7_wraparound_topn_excludes_system_schemas() -> None:
    """Top-N XID/MultiXID age panels must be scoped and skip system schemas."""
    dashboard = load_dashboard(DASHBOARD_7)
    system_schema_filter = 'schema!~"pg_.*|information_schema|_timescaledb.*"'

    xid_expr = target_expr(
        panel_by_title(dashboard, "Top-N tables by XID age (relfrozenxid)")
    )
    mxid_expr = target_expr(
        panel_by_title(dashboard, "Top-N tables by MultiXID age (relminmxid)")
    )

    for expr in (xid_expr, mxid_expr):
        assert 'cluster="$cluster_name"' in expr, expr
        assert 'node_name="$node_name"' in expr, expr
        # These panels use a regex matcher for the database variable.
        assert 'datname=~"$db_name"' in expr, expr
        assert system_schema_filter in expr, expr
115+
116+
117+
def test_dashboard_1_surfaces_pg_wal_size_status_code() -> None:
    """Dashboard 1 must show a status panel below the pg_wal size panel.

    Checks that the two panels have distinct ids, sit at the expected grid
    rows, both query over a 10 minute window, and that the status panel maps
    its codes to the human-readable explanations.
    """
    dashboard = load_dashboard(DASHBOARD_1)
    size_panel = panel_by_title(dashboard, "pg_wal directory size")
    status_panel = panel_by_title(dashboard, "pg_wal size collection status")

    assert size_panel["id"] != status_panel["id"]
    assert size_panel["gridPos"]["y"] == 113
    assert status_panel["gridPos"]["y"] == 123
    assert "[10m]" in target_expr(size_panel)

    status_expr = target_expr(status_panel)
    assert "pgwatch_pg_wal_size_status_code" in status_expr
    assert "[10m]" in status_expr

    # Serialise the field config so the mapping texts can be found regardless
    # of where exactly they are nested.
    mapping_text = json.dumps(status_panel.get("fieldConfig", {}))
    assert "pg_ls_waldir() unavailable" in mapping_text
    assert "EXECUTE missing" in mapping_text

tests/xmin_horizon/test_metrics_sql_static.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ def test_summary_activity_filters_client_backends_with_xmin(self) -> None:
208208
self.assertIn("pid <> pg_backend_pid()", body)
209209
self.assertIn("backend_type = 'client backend'", body)
210210
self.assertIn("backend_xmin is not null", body)
211+
self.assertIn("usename <> current_user", body)
211212

212213
def test_blocker_activity_filters_client_backends_with_xmin(self) -> None:
213214
body = cte_body("xmin_horizon_blockers", "activity")
@@ -216,6 +217,7 @@ def test_blocker_activity_filters_client_backends_with_xmin(self) -> None:
216217
self.assertIn("pid <> pg_backend_pid()", body)
217218
self.assertIn("backend_type = 'client backend'", body)
218219
self.assertIn("backend_xmin is not null", body)
220+
self.assertIn("usename <> current_user", body)
219221

220222
def test_blocker_slot_detail_separates_xmin_and_catalog_xmin(self) -> None:
221223
slots_body = cte_body("xmin_horizon_blockers", "slots")

0 commit comments

Comments
 (0)