Skip to content

Commit d8205c6

Browse files
committed
feat: implement monitoring alert manager
1 parent 645c404 commit d8205c6

4 files changed

Lines changed: 193 additions & 1 deletion

File tree

docker-compose.monitoring.yml

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ services:
1818
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
1919

2020
cadvisor:
21-
image: gcr.io/cadvisor/cadvisor:latest
21+
image: gcr.io/cadvisor/cadvisor:v0.49.1
2222
restart: unless-stopped
2323
privileged: true
2424
networks:
@@ -41,13 +41,34 @@ services:
4141
- "127.0.0.1:9090:9090"
4242
volumes:
4343
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
44+
- ./monitoring/alerts.yml:/etc/prometheus/alerts.yml:ro
4445
- prometheus-data:/prometheus
4546
command:
4647
- '--config.file=/etc/prometheus/prometheus.yml'
4748
- '--storage.tsdb.path=/prometheus'
4849
- '--storage.tsdb.retention.time=30d'
4950
- '--web.enable-lifecycle'
5051

52+
# Alertmanager: receives alerts from Prometheus and delivers e-mail
# notifications according to monitoring/alertmanager.yml.
alertmanager:
  # Pinned to a release tag (like cadvisor in this file) so that a container
  # restart cannot silently pull a different Alertmanager version.
  image: prom/alertmanager:v0.27.0
  restart: unless-stopped
  networks:
    - monitoring
  ports:
    # Web UI / API is published on the host loopback interface only.
    - "127.0.0.1:9093:9093"
  volumes:
    - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    # Named volume given to --storage.path below.
    - alertmanager-data:/alertmanager
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
  # NOTE(review): Alertmanager does not substitute environment variables into
  # its configuration file, so these values only take effect if the mounted
  # alertmanager.yml is rendered from them before start-up -- confirm how the
  # config is generated in deployment.
  environment:
    SMTP_ADDRESS: '${SMTP_ADDRESS:-smtp.gmail.com}'
    SMTP_PORT: '${SMTP_PORT:-587}'
    SMTP_USERNAME: '${SMTP_USERNAME}'
    SMTP_PASSWORD: '${SMTP_PASSWORD}'
    ALERT_EMAIL_TO: '${ALERT_EMAIL_TO}'
71+
5172
grafana:
5273
image: grafana/grafana:latest
5374
restart: unless-stopped
@@ -86,6 +107,8 @@ volumes:
86107
driver: local
87108
grafana-data:
88109
driver: local
110+
alertmanager-data:
111+
driver: local
89112

90113
networks:
91114
monitoring:

monitoring/alertmanager.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Alertmanager configuration: SMTP delivery, routing and inhibition.
#
# NOTE(review): Alertmanager reads this file literally and does not expand
# ${VAR} / ${VAR:-default} placeholders. The values below must be rendered
# into real settings before the file is mounted -- confirm the deployment
# does this, otherwise SMTP delivery will fail.
global:
  smtp_smarthost: '${SMTP_ADDRESS:-smtp.gmail.com}:${SMTP_PORT:-587}'
  smtp_from: '${SMTP_USERNAME}'
  smtp_auth_username: '${SMTP_USERNAME}'
  smtp_auth_password: '${SMTP_PASSWORD}'
  smtp_require_tls: true

route:
  # Default route: every alert goes to the single e-mail receiver.
  receiver: email
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts use the same receiver but are re-sent every hour.
    # `matchers` replaces the deprecated `match` key.
    - matchers:
        - 'severity="critical"'
      receiver: email
      repeat_interval: 1h

receivers:
  - name: email
    email_configs:
      - to: '${ALERT_EMAIL_TO}'
        send_resolved: true
        headers:
          Subject: '[ProStaff] {{ .Status | toUpper }}: {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        html: |
          {{ range .Alerts }}
          <b>{{ .Status | toUpper }}</b> — {{ .Annotations.summary }}<br>
          {{ .Annotations.description }}<br>
          <small>Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}</small>
          <hr>
          {{ end }}

inhibit_rules:
  # The warning/critical pairs defined in alerts.yml use different alert
  # names, so a rule that only requires an equal `alertname` never matches
  # them. Pair them explicitly. Labels listed in `equal` that are missing on
  # both alerts count as equal, so these rules work with or without an
  # `instance` label on the CPU alerts.
  - source_matchers:
      - 'alertname="HighCPUCritical"'
    target_matchers:
      - 'alertname="HighCPU"'
    equal: ['instance']
  - source_matchers:
      - 'alertname="DiskCritical"'
    target_matchers:
      - 'alertname="DiskAlmostFull"'
    equal: ['instance', 'mountpoint']
  # Generic rule, kept for alerts that share one name across severities.
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname']

monitoring/alerts.yml

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
groups:
2+
# Host-level alerts built from the `up` series and node_* metrics.
- name: host
  rules:
    # Any scrape target that Prometheus cannot reach.
    - alert: InstanceDown
      expr: up == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Scrape target down: {{ $labels.job }}"
        description: "{{ $labels.job }} ({{ $labels.instance }}) has been unreachable for more than 1 minute."

    # CPU usage = 100 - idle%. Aggregated per instance so that one saturated
    # host is not averaged away by idle ones and the alert keeps its
    # `instance` label.
    - alert: HighCPU
      expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host CPU above 80%"
        description: "CPU usage is {{ $value | printf \"%.1f\" }}% (threshold: 80%)."

    - alert: HighCPUCritical
      expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Host CPU above 95%"
        description: "CPU usage is {{ $value | printf \"%.1f\" }}% — host may be unresponsive."

    # Memory usage = 1 - MemAvailable / MemTotal.
    - alert: HighMemory
      expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host memory above 85%"
        description: "Memory usage is {{ $value | printf \"%.1f\" }}% (threshold: 85%)."

    # Root filesystem usage, warning and critical thresholds.
    - alert: DiskAlmostFull
      expr: (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) * 100 > 85
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Disk (/) above 85%"
        description: "Disk usage is {{ $value | printf \"%.1f\" }}%. Clean up soon."

    - alert: DiskCritical
      expr: (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) * 100 > 95
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Disk (/) above 95% — critical"
        description: "Disk usage is {{ $value | printf \"%.1f\" }}%. Writes may fail."

    # Share of CPU time spent waiting on I/O, per instance.
    - alert: HighIOWait
      expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 20
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "High I/O wait"
        description: "IOWait is {{ $value | printf \"%.1f\" }}% — storage may be saturated."
66+
67+
# Container-level alerts built from container_* metrics.
- name: containers
  rules:
    # absent() returns a value only when no series matches the selector, so
    # each *ContainerDown rule fires once the named container's
    # container_last_seen series has been gone for the `for` duration.
    - alert: APIContainerDown
      expr: absent(container_last_seen{name=~".*x8ogsg0s4gws0840w8kksokk-api.*"})
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "ProStaff API container is down"
        description: "The API container has not been seen for more than 2 minutes."

    - alert: SidekiqContainerDown
      expr: absent(container_last_seen{name=~".*x8ogsg0s4gws0840w8kksokk-sidekiq.*"})
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Sidekiq container is down"
        description: "The Sidekiq container has not been seen for more than 2 minutes."

    - alert: PostgresContainerDown
      expr: absent(container_last_seen{name=~".*x8ogsg0s4gws0840w8kksokk-postgres.*"})
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Postgres container is down"
        description: "The Postgres container has not been seen for more than 2 minutes."

    - alert: RedisContainerDown
      expr: absent(container_last_seen{name=~".*x8ogsg0s4gws0840w8kksokk-redis.*"})
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "Redis container is down"
        description: "The Redis container has not been seen for more than 2 minutes. ActionCable and Sidekiq will fail."

    # Per-container CPU as a percentage of one core; unnamed and pause
    # containers are excluded by the selector.
    - alert: ContainerHighCPU
      expr: sum by (name) (rate(container_cpu_usage_seconds_total{name!="",name!~".*pause.*"}[5m])) * 100 > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Container CPU above 80%: {{ $labels.name }}"
        description: "{{ $labels.name }} CPU is {{ $value | printf \"%.1f\" }}%."

    # container_start_time_seconds is a timestamp gauge, so count how many
    # times its value changed. increase() would return the jump in seconds
    # rather than a number of restarts.
    - alert: ContainerRestarting
      expr: changes(container_start_time_seconds{name!="",name!~".*pause.*"}[30m]) > 2
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "Container restarting: {{ $labels.name }}"
        description: "{{ $labels.name }} has restarted {{ $value | printf \"%.0f\" }} times in the last 30 minutes."

monitoring/prometheus.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@ global:
55
project: 'prostaff'
66
env: 'production'
77

8+
# Alerting rule definitions to load.
rule_files:
  - '/etc/prometheus/alerts.yml'

# Alertmanager endpoint that Prometheus sends firing alerts to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
15+
816
scrape_configs:
917
- job_name: 'prometheus'
1018
static_configs:

0 commit comments

Comments
 (0)