feat: implement monitoring alert manager

Bulletdev · Bulletdev · commit d8205c64fb63 · 2026-05-08T10:01:35.000-03:00
diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml
@@ -18,7 +18,7 @@ services:
       - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
 
   cadvisor:
-    image: gcr.io/cadvisor/cadvisor:latest
+    image: gcr.io/cadvisor/cadvisor:v0.49.1
     restart: unless-stopped
     privileged: true
     networks:
@@ -41,13 +41,34 @@ services:
       - "127.0.0.1:9090:9090"
     volumes:
       - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./monitoring/alerts.yml:/etc/prometheus/alerts.yml:ro
       - prometheus-data:/prometheus
     command:
       - '--config.file=/etc/prometheus/prometheus.yml'
       - '--storage.tsdb.path=/prometheus'
       - '--storage.tsdb.retention.time=30d'
       - '--web.enable-lifecycle'
 
+  # Alertmanager: receives alerts from Prometheus and sends the e-mail
+  # notifications. Its port is published on the host loopback interface only.
+  alertmanager:
+    # Pinned to a release tag (like cadvisor above) so upgrades are deliberate
+    # and a restart cannot silently pull a different version.
+    image: prom/alertmanager:v0.27.0
+    restart: unless-stopped
+    networks:
+      - monitoring
+    ports:
+      - "127.0.0.1:9093:9093"
+    volumes:
+      - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      # Named volume backing --storage.path below.
+      - alertmanager-data:/alertmanager
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    # NOTE(review): Alertmanager does not expand ${VAR} placeholders in its
+    # config file, so these variables reach the container but are not applied
+    # to alertmanager.yml unless that file is rendered before startup — confirm
+    # how the config is templated at deploy time.
+    environment:
+      SMTP_ADDRESS: '${SMTP_ADDRESS:-smtp.gmail.com}'
+      SMTP_PORT: '${SMTP_PORT:-587}'
+      SMTP_USERNAME: '${SMTP_USERNAME}'
+      SMTP_PASSWORD: '${SMTP_PASSWORD}'
+      ALERT_EMAIL_TO: '${ALERT_EMAIL_TO}'
+
   grafana:
     image: grafana/grafana:latest
     restart: unless-stopped
@@ -86,6 +107,8 @@ volumes:
     driver: local
   grafana-data:
     driver: local
+  # Backs /alertmanager in the alertmanager service (its --storage.path).
+  alertmanager-data:
+    driver: local
 
 networks:
   monitoring:
diff --git a/monitoring/alertmanager.yml b/monitoring/alertmanager.yml
@@ -0,0 +1,40 @@
+# Global SMTP settings used for e-mail notifications.
+# NOTE(review): Alertmanager loads this file verbatim and does not perform
+# shell-style ${VAR} / ${VAR:-default} expansion. As written, smtp_smarthost is
+# not a valid host:port and the credentials are literal placeholder strings.
+# These values must be rendered before startup (or replaced with literals and
+# the *_file options) — confirm how this file is templated at deploy time.
+global:
+  smtp_smarthost: '${SMTP_ADDRESS:-smtp.gmail.com}:${SMTP_PORT:-587}'
+  smtp_from: '${SMTP_USERNAME}'
+  smtp_auth_username: '${SMTP_USERNAME}'
+  smtp_auth_password: '${SMTP_PASSWORD}'
+  smtp_require_tls: true
+
+# Routing tree: every alert is delivered to the `email` receiver.
+route:
+  receiver: email
+  group_by:
+    - alertname
+    - severity
+  # Wait 30s before the first notification of a new group, 5m between updates
+  # to a group, and re-send still-firing alerts every 4h.
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+  routes:
+    # Critical alerts are re-sent every hour instead of every 4h.
+    - receiver: email
+      repeat_interval: 1h
+      match:
+        severity: critical
+
+# Single e-mail receiver; also notifies when alerts resolve.
+receivers:
+  - name: email
+    email_configs:
+      # NOTE(review): literal placeholder unless this file is templated before
+      # Alertmanager starts — confirm.
+      - to: '${ALERT_EMAIL_TO}'
+        send_resolved: true
+        headers:
+          # A grouped notification can carry several alerts; join their
+          # summaries with "; " instead of running them together.
+          Subject: '[ProStaff] {{ .Status | toUpper }}: {{ range $i, $a := .Alerts }}{{ if $i }}; {{ end }}{{ $a.Annotations.summary }}{{ end }}'
+        # One section per alert: status, summary, description and its labels.
+        html: |
+          {{ range .Alerts }}
+          <b>{{ .Status | toUpper }}</b> — {{ .Annotations.summary }}<br>
+          {{ .Annotations.description }}<br>
+          <small>Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}</small>
+          <hr>
+          {{ end }}
+
+# Suppress a warning while its critical counterpart is firing.
+inhibit_rules:
+  # Generic case: a critical and a warning alert that share one alert name.
+  - source_match:
+      severity: critical
+    target_match:
+      severity: warning
+    equal: ['alertname']
+  # The CPU and disk alerts use a different alert name per severity, so the
+  # generic rule above never pairs them; each pair is matched explicitly.
+  # The CPU expressions aggregate with avg() and carry no per-host label, so
+  # no `equal` constraint is needed for that pair.
+  - source_match:
+      alertname: HighCPUCritical
+    target_match:
+      alertname: HighCPU
+  # Disk alerts keep the filesystem series labels; pair them per instance.
+  - source_match:
+      alertname: DiskCritical
+    target_match:
+      alertname: DiskAlmostFull
+    equal: ['instance']
diff --git a/monitoring/alerts.yml b/monitoring/alerts.yml
@@ -0,0 +1,121 @@
+groups:
+  - name: host
+    rules:
+      # Critical when a scrape target has reported up == 0 for one minute.
+      - alert: InstanceDown
+        expr: up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Scrape target down: {{ $labels.job }}'
+          description: '{{ $labels.job }} ({{ $labels.instance }}) has been unreachable for more than 1 minute.'
+
+      # Warning when average non-idle CPU stays above 80% for five minutes.
+      - alert: HighCPU
+        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Host CPU above 80%'
+          description: 'CPU usage is {{ $value | printf "%.1f" }}% (threshold: 80%).'
+
+      # Critical when average non-idle CPU stays above 95% for two minutes.
+      - alert: HighCPUCritical
+        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Host CPU above 95%'
+          description: 'CPU usage is {{ $value | printf "%.1f" }}% — host may be unresponsive.'
+
+      # Warning when used memory (1 - MemAvailable / MemTotal) exceeds 85%
+      # for five minutes.
+      - alert: HighMemory
+        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Host memory above 85%'
+          description: 'Memory usage is {{ $value | printf "%.1f" }}% (threshold: 85%).'
+
+      # Warning when the root filesystem is more than 85% full for five minutes.
+      - alert: DiskAlmostFull
+        expr: (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) * 100 > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Disk (/) above 85%'
+          description: 'Disk usage is {{ $value | printf "%.1f" }}%. Clean up soon.'
+
+      # Critical when the root filesystem is more than 95% full for one minute.
+      - alert: DiskCritical
+        expr: (1 - node_filesystem_avail_bytes{mountpoint="/",fstype!="tmpfs"} / node_filesystem_size_bytes{mountpoint="/",fstype!="tmpfs"}) * 100 > 95
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Disk (/) above 95% — critical'
+          description: 'Disk usage is {{ $value | printf "%.1f" }}%. Writes may fail.'
+
+      # Warning when the average share of CPU time spent in iowait exceeds 20%
+      # for five minutes.
+      - alert: HighIOWait
+        expr: avg(rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 20
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'High I/O wait'
+          description: 'IOWait is {{ $value | printf "%.1f" }}% — storage may be saturated.'
+
+  - name: containers
+    rules:
+      # Critical when no container_last_seen series matches the API container
+      # name pattern for two minutes.
+      - alert: APIContainerDown
+        expr: absent(container_last_seen{name=~".*x8ogsg0s4gws0840w8kksokk-api.*"})
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'ProStaff API container is down'
+          description: 'The API container has not been seen for more than 2 minutes.'
+
+      # Critical when no container_last_seen series matches the Sidekiq
+      # container name pattern for two minutes.
+      - alert: SidekiqContainerDown
+        expr: absent(container_last_seen{name=~".*x8ogsg0s4gws0840w8kksokk-sidekiq.*"})
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Sidekiq container is down'
+          description: 'The Sidekiq container has not been seen for more than 2 minutes.'
+
+      # Critical when no container_last_seen series matches the Postgres
+      # container name pattern for two minutes.
+      - alert: PostgresContainerDown
+        expr: absent(container_last_seen{name=~".*x8ogsg0s4gws0840w8kksokk-postgres.*"})
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Postgres container is down'
+          description: 'The Postgres container has not been seen for more than 2 minutes.'
+
+      # Critical when no container_last_seen series matches the Redis
+      # container name pattern for two minutes.
+      - alert: RedisContainerDown
+        expr: absent(container_last_seen{name=~".*x8ogsg0s4gws0840w8kksokk-redis.*"})
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Redis container is down'
+          description: 'The Redis container has not been seen for more than 2 minutes. ActionCable and Sidekiq will fail.'
+
+      # Warning when a named container (pause containers excluded) uses more
+      # than 80% CPU, summed per container name, for five minutes.
+      - alert: ContainerHighCPU
+        expr: sum by (name) (rate(container_cpu_usage_seconds_total{name!="",name!~".*pause.*"}[5m])) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Container CPU above 80%: {{ $labels.name }}'
+          description: '{{ $labels.name }} CPU is {{ $value | printf "%.1f" }}%.'
+
+      # Warning when a named container (pause containers excluded) has
+      # restarted more than twice in the last 30 minutes.
+      # container_start_time_seconds is a timestamp gauge, so increase() on it
+      # yields a difference in seconds, not a restart count. changes() counts
+      # how many times the start time changed in the window, which is the
+      # number the description reports.
+      - alert: ContainerRestarting
+        expr: changes(container_start_time_seconds{name!="",name!~".*pause.*"}[30m]) > 2
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container restarting: {{ $labels.name }}"
+          description: "{{ $labels.name }} has restarted {{ $value | printf \"%.0f\" }} times in the last 30 minutes."
diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
@@ -5,6 +5,14 @@ global:
     project: 'prostaff'
     env: 'production'
 
+# Alerting rules loaded by Prometheus.
+rule_files:
+  - '/etc/prometheus/alerts.yml'
+
+# Send firing alerts to the Alertmanager instance at alertmanager:9093.
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - 'alertmanager:9093'
+
 scrape_configs:
   - job_name: 'prometheus'
     static_configs: