Skip to content

Commit 5d61a0a

Browse files
authored
Align pgmonitor with Containers (#471)
Containers alerts/dashboards have some changes; this PR aligns the two sources.
1 parent 29e6386 commit 5d61a0a

5 files changed

Lines changed: 31 additions & 8 deletions

File tree

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
minor_changes:
2+
- Updated containers dashboards/alerts to allow OTel or postgres-exporter values
3+
- Added requested PGNoPrimary and PGNoReplica alerts for containers

grafana/containers/postgresql_details.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@
151151
"pluginVersion": "7.4.5",
152152
"targets": [
153153
{
154-
"expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} < ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"} or ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\"}) ",
154+
"expr": "min({__name__=~\"ccp_backrest_last_(incr|diff|full)_backup_time_since_completion_seconds\", pg_cluster=\"[[cluster]]\"})",
155155
"format": "time_series",
156156
"interval": "",
157157
"intervalFactor": 1,

grafana/containers/postgresql_overview.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@
163163
"targets": [
164164
{
165165
"$hashKey": "object:243",
166-
"expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
166+
"expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"}) or sum(patroni_postgres_running{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
167167
"format": "time_series",
168168
"interval": "",
169169
"intervalFactor": 1,

grafana/containers/prometheus_alerts.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@
136136
"pluginVersion": "7.4.5",
137137
"targets": [
138138
{
139-
"expr": "count(count by (kubernetes_namespace) (pg_up)) or count(count by (kubernetes_namespace) (up))",
139+
"expr": "sum(count by (kubernetes_namespace) (pg_up{pg_cluster!=''})) + sum(count by (kubernetes_namespace) (patroni_postgres_running{pg_cluster!=''}))",
140140
"format": "time_series",
141141
"instant": true,
142142
"interval": "",
@@ -208,7 +208,7 @@
208208
"pluginVersion": "7.4.5",
209209
"targets": [
210210
{
211-
"expr": "count(count by (pg_cluster) (pg_up)) or count(count by (pg_cluster) (up))",
211+
"expr": "sum(count by (pg_cluster) (pg_up{pg_cluster!=''})) + sum(count by (pg_cluster) (patroni_postgres_running{pg_cluster!=''}))",
212212
"format": "time_series",
213213
"instant": true,
214214
"interval": "",
@@ -280,7 +280,7 @@
280280
"pluginVersion": "7.4.5",
281281
"targets": [
282282
{
283-
"expr": "count(pg_up) or count(up)",
283+
"expr": "sum(count(pg_up{pg_cluster!=''})) + sum(count(patroni_postgres_running{pg_cluster!=''}))",
284284
"format": "time_series",
285285
"instant": true,
286286
"interval": "",

prometheus/containers/alert-rules.d/crunchy-alert-rules-pg.yml.containers.example

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ groups:
2222

2323
########## SYSTEM RULES ##########
2424
- alert: ExporterDown
25-
expr: avg_over_time(up[5m]) < 0.5
25+
expr: avg_over_time(up{job=~"crunchy-otel-collector|crunchy-postgres-exporter",exported_job!="patroni"}[5m]) < 0.5
2626
for: 10s
2727
labels:
2828
service: system
@@ -35,15 +35,35 @@ groups:
3535

3636
########## POSTGRESQL RULES ##########
3737
- alert: PGIsUp
38-
expr: pg_up < 1
38+
expr: pg_up < 1 or patroni_postgres_running < 1
3939
for: 60s
4040
labels:
4141
service: postgresql
4242
severity: critical
4343
severity_num: 300
4444
annotations:
45-
summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
45+
summary: 'Metrics exporter running on {{ $labels.job }} is unable to communicate with the configured database'
4646

47+
- alert: PGNoPrimary
48+
expr: max by (cluster_name) (ccp_is_in_recovery_status) < 2
49+
for: 30s
50+
labels:
51+
service: postgresql
52+
severity: critical
53+
severity_num: 300
54+
annotations:
55+
summary: 'cluster {{ $labels.cluster_name }} does not have a primary instance'
56+
57+
# Alert when a cluster has no replica instances (every instance reports as primary)
58+
# - alert: PGNoReplica
59+
# expr: min by (cluster_name) (ccp_is_in_recovery_status) > 1
60+
# for: 30s
61+
# labels:
62+
# service: postgresql
63+
# severity: critical
64+
# severity_num: 300
65+
# annotations:
66+
# summary: 'cluster {{ $labels.cluster_name }} does not have a replica instance'
4767

4868
# Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num".
4969
#

0 commit comments

Comments
 (0)