title: Set Up Alerting
id: observability-alerting
skillLevel: advanced
applicationPatternId: observability
summary: Configure alerts to notify you when your Effect application has problems.
tags:
  - observability
  - alerting
  - monitoring
  - sre
rule:
  description: Create alerts based on SLOs and symptoms, not causes.
author: PaulJPhilp
related:
  - observability-prometheus
  - observability-dashboards
lessonOrder: 5

Guideline

Set up alerts based on user-facing symptoms (SLO violations) rather than system metrics (CPU usage).


Rationale

Good alerting:

  1. Catches real problems - Fires when users are actually affected
  2. Reduces noise - Fewer false positives, so on-call engineers trust the alerts
  3. Enables response - Every alert carries the information needed to act on it
  4. Supports SLOs - Tracks service level objectives and the error budget they imply (see the quick calculation below)
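
To make the SLO connection concrete: a 99.9% availability target over a 30-day window allows roughly 43 minutes of downtime, and the error budget is whatever part of that allowance is still unspent. A quick sketch of the arithmetic (plain TypeScript, illustrative numbers):

// Downtime allowed by a 99.9% SLO over a rolling 30-day window
const sloTarget = 0.999
const windowMinutes = 30 * 24 * 60                              // 43,200 minutes
const allowedDowntime = windowMinutes * (1 - sloTarget)         // ≈ 43.2 minutes

const observedDowntime = 10                                     // minutes of downtime so far (example)
const budgetRemaining = 1 - observedDowntime / allowedDowntime  // ≈ 0.77, i.e. 77% left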

Good Example

import { Effect, Duration, Ref } from "effect"

// ============================================
// 1. Define alertable conditions
// ============================================

interface Alert {
  readonly name: string
  readonly severity: "critical" | "warning" | "info"
  readonly message: string
  readonly timestamp: Date
  readonly labels: Record<string, string>
}

interface AlertRule {
  readonly name: string
  readonly condition: Effect.Effect<boolean>
  readonly severity: "critical" | "warning" | "info"
  readonly message: string
  readonly labels: Record<string, string>
  readonly forDuration: Duration.DurationInput
}

// ============================================
// 2. Define alert rules
// ============================================

const createAlertRules = (metrics: {
  errorRate: () => Effect.Effect<number>
  latencyP99: () => Effect.Effect<number>
  availability: () => Effect.Effect<number>
}): AlertRule[] => [
  {
    name: "HighErrorRate",
    condition: metrics.errorRate().pipe(Effect.map((rate) => rate > 0.01)),
    severity: "critical",
    message: "Error rate exceeds 1%",
    labels: { team: "backend", service: "api" },
    forDuration: "5 minutes",
  },
  {
    name: "HighLatency",
    condition: metrics.latencyP99().pipe(Effect.map((p99) => p99 > 2)),
    severity: "warning",
    message: "P99 latency exceeds 2 seconds",
    labels: { team: "backend", service: "api" },
    forDuration: "10 minutes",
  },
  {
    name: "LowAvailability",
    condition: metrics.availability().pipe(Effect.map((avail) => avail < 99.9)),
    severity: "critical",
    message: "Availability below 99.9% SLO",
    labels: { team: "backend", service: "api" },
    forDuration: "5 minutes",
  },
  {
    name: "ErrorBudgetLow",
    condition: Effect.succeed(false), // Implement based on error budget calc
    severity: "warning",
    message: "Error budget below 25%",
    labels: { team: "backend", service: "api" },
    forDuration: "0 seconds",
  },
]
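
// The ErrorBudgetLow condition above is left as a placeholder. One possible
// sketch, assuming `metrics.availability` reports availability as a percentage
// over the SLO window against a 99.9% target (the helper name and thresholds
// are illustrative, not a prescribed API):
const errorBudgetRemaining = (
  availability: () => Effect.Effect<number>,
  sloTarget = 99.9
) =>
  availability().pipe(
    Effect.map((avail) => {
      const totalBudget = 100 - sloTarget    // allowed error, e.g. 0.1 percentage points
      const spent = Math.max(0, 100 - avail) // error observed so far
      return Math.max(0, 1 - spent / totalBudget)
    })
  )
// The placeholder condition could then become:
//   errorBudgetRemaining(metrics.availability).pipe(Effect.map((r) => r < 0.25))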

// ============================================
// 3. Alert manager
// ============================================

interface AlertState {
  readonly firing: Map<string, { since: Date; alert: Alert }>
  readonly resolved: Alert[]
}

const makeAlertManager = Effect.gen(function* () {
  const state = yield* Ref.make<AlertState>({
    firing: new Map(),
    resolved: [],
  })

  // Note: this simplified manager fires as soon as a condition becomes true;
  // rule.forDuration is not enforced here (see the sketch after this manager).
  const checkRule = (rule: AlertRule) =>
    Effect.gen(function* () {
      const isTriggered = yield* rule.condition

      yield* Ref.update(state, (s) => {
        const firing = new Map(s.firing)
        const resolved = [...s.resolved]
        const key = rule.name

        if (isTriggered) {
          if (!firing.has(key)) {
            // Condition newly true: start firing
            firing.set(key, {
              since: new Date(),
              alert: {
                name: rule.name,
                severity: rule.severity,
                message: rule.message,
                timestamp: new Date(),
                labels: rule.labels,
              },
            })
          }
        } else if (firing.has(key)) {
          // Condition cleared: record the alert as resolved
          const prev = firing.get(key)!
          resolved.push({
            ...prev.alert,
            message: `[RESOLVED] ${prev.alert.message}`,
            timestamp: new Date(),
          })
          firing.delete(key)
        }

        return { firing, resolved }
      })
    })

  const getActiveAlerts = () =>
    Ref.get(state).pipe(
      Effect.map((s) => Array.from(s.firing.values()).map((f) => f.alert))
    )

  const getRecentResolved = () =>
    Ref.get(state).pipe(Effect.map((s) => s.resolved.slice(-10)))

  return {
    checkRule,
    getActiveAlerts,
    getRecentResolved,
  }
})
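
// Note: checkRule above fires the moment a condition is true; rule.forDuration
// is not enforced. A hedged sketch of one way to honor it: remember when each
// condition first became true and only report "firing" once it has held for the
// configured duration. (Illustrative only; not wired into the manager above.)
const makePendingTracker = Effect.gen(function* () {
  // rule name -> epoch millis at which its condition first became true
  const pendingSince = yield* Ref.make(new Map<string, number>())

  const hasHeldLongEnough = (rule: AlertRule, isTriggered: boolean) =>
    Ref.modify(pendingSince, (pending) => {
      const next = new Map(pending)
      const now = Date.now()

      if (!isTriggered) {
        // Condition cleared: forget any pending start time
        next.delete(rule.name)
        return [false, next]
      }

      const since = next.get(rule.name) ?? now
      next.set(rule.name, since)

      const required = Duration.toMillis(Duration.decode(rule.forDuration))
      return [now - since >= required, next]
    })

  return { hasHeldLongEnough }
})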

// ============================================
// 4. Alert notification
// ============================================

interface NotificationChannel {
  readonly send: (alert: Alert) => Effect.Effect<void>
}

const slackChannel: NotificationChannel = {
  send: (alert) =>
    Effect.gen(function* () {
      const emoji =
        alert.severity === "critical"
          ? "🔴"
          : alert.severity === "warning"
            ? "🟡"
            : "🔵"

      yield* Effect.log(`${emoji} [${alert.severity.toUpperCase()}] ${alert.name}`).pipe(
        Effect.annotateLogs({
          message: alert.message,
          labels: JSON.stringify(alert.labels),
        })
      )

      // In real implementation: call Slack API
    }),
}

const pagerDutyChannel: NotificationChannel = {
  send: (alert) =>
    Effect.gen(function* () {
      if (alert.severity === "critical") {
        yield* Effect.log("PagerDuty: Creating incident").pipe(
          Effect.annotateLogs({ alert: alert.name })
        )
        // In real implementation: call PagerDuty API
      }
    }),
}
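
// As the comments above note, both channels only log. A hedged sketch of a real
// Slack call using an incoming-webhook URL (the URL handling and payload shape
// are assumptions for illustration; `fetch` requires Node 18+ or a polyfill):
const sendToSlackWebhook = (alert: Alert, webhookUrl: string) =>
  Effect.tryPromise({
    try: () =>
      fetch(webhookUrl, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          text: `[${alert.severity.toUpperCase()}] ${alert.name}: ${alert.message}`,
        }),
      }),
    catch: (cause) => new Error(`Slack notification failed: ${String(cause)}`),
  })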

// ============================================
// 5. Alert evaluation loop
// ============================================

const runAlertEvaluation = (
  rules: AlertRule[],
  channels: NotificationChannel[],
  interval: Duration.DurationInput
) =>
  Effect.gen(function* () {
    const alertManager = yield* makeAlertManager
    const previousAlerts = yield* Ref.make(new Set<string>())

    yield* Effect.forever(
      Effect.gen(function* () {
        // Check all rules
        for (const rule of rules) {
          yield* alertManager.checkRule(rule)
        }

        // Get current active alerts
        const active = yield* alertManager.getActiveAlerts()
        const current = new Set(active.map((a) => a.name))
        const previous = yield* Ref.get(previousAlerts)

        // Find newly firing alerts
        for (const alert of active) {
          if (!previous.has(alert.name)) {
            // New alert - send notifications
            for (const channel of channels) {
              yield* channel.send(alert)
            }
          }
        }

        yield* Ref.set(previousAlerts, current)
        yield* Effect.sleep(interval)
      })
    )
  })
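
// Example wiring: placeholder metric effects stand in for real metric queries,
// both channels are attached, and rules are evaluated every 30 seconds. The
// hard-coded values are illustrative only.
const program = runAlertEvaluation(
  createAlertRules({
    errorRate: () => Effect.succeed(0.005),    // replace with a real error-rate query
    latencyP99: () => Effect.succeed(1.2),     // seconds
    availability: () => Effect.succeed(99.95), // percent
  }),
  [slackChannel, pagerDutyChannel],
  "30 seconds"
)
// Run it alongside the application, e.g. Effect.runPromise(program) or as a forked fiber.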

// ============================================
// 6. Prometheus alerting rules (YAML)
// ============================================

const prometheusAlertRules = `
groups:
  - name: effect-app-alerts
    rules:
      - alert: HighErrorRate
        expr: |
          sum(rate(http_errors_total[5m]))
          /
          sum(rate(http_requests_total[5m]))
          > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: HighLatency
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency"
          description: "P99 latency is {{ $value }}s"

      - alert: SLOViolation
        expr: |
          sum(rate(http_requests_total{status!~"5.."}[30m]))
          /
          sum(rate(http_requests_total[30m]))
          < 0.999
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "SLO violation"
          description: "Availability is {{ $value | humanizePercentage }}"
`
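
// For the rules above to take effect, Prometheus must be told where to load
// them from and where to send firing alerts. A minimal prometheus.yml fragment
// (the file name and Alertmanager address are assumptions):
const prometheusConfigFragment = `
rule_files:
  - alert-rules.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
`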

Alert Severity Guidelines

| Severity | Response          | Example                       |
| -------- | ----------------- | ----------------------------- |
| Critical | Page immediately  | SLO violation, data loss      |
| Warning  | Investigate soon  | Elevated errors, slow queries |
| Info     | Review daily      | Capacity trending             |

Good vs Bad Alerts

| ❌ Bad Alert        | ✅ Good Alert          |
| ------------------ | --------------------- |
| CPU > 80%          | Error rate > 1%       |
| Memory > 90%       | P99 latency > 2s      |
| Disk > 85%         | Availability < 99.9%  |
| Process restarted  | Error budget < 25%    |

Best Practices

  1. Alert on symptoms - User impact, not causes
  2. Use for-duration - Require the condition to hold before firing, to avoid flapping
  3. Include runbook - Link how-to-respond documentation from the alert annotations
  4. Route by severity - Critical → page, warning → ticket (see the routing sketch below)
  5. Review regularly - Remove or tune alerts that are routinely ignored
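
For practice 4, routing is usually configured in Alertmanager rather than in application code. A hedged sketch of a severity-based route, written as a config string in the same style as the Prometheus rules above (receiver names, integration keys, and the Slack channel are placeholders):

const alertmanagerRouting = `
route:
  receiver: slack-tickets
  routes:
    - matchers:
        - severity = "critical"
      receiver: pagerduty-oncall
    - matchers:
        - severity = "warning"
      receiver: slack-tickets

receivers:
  - name: pagerduty-oncall
    pagerduty_configs:
      - routing_key: <pagerduty-integration-key>
  - name: slack-tickets
    slack_configs:
      - api_url: <slack-incoming-webhook-url>
        channel: "#alerts"
`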