title: Set Up Alerting
id: observability-alerting
skillLevel: advanced
applicationPatternId: observability
summary: Configure alerts to notify you when your Effect application has problems.
tags:
  - observability
  - alerting
  - monitoring
  - sre
rule:
  description: Create alerts based on SLOs and symptoms, not causes.
author: PaulJPhilp
related:
  - observability-prometheus
  - observability-dashboards
lessonOrder: 5

Guideline

Set up alerts based on user-facing symptoms (SLO violations) rather than system metrics (CPU usage).


Rationale

Good alerting:

  1. Catches real problems - Fires when users are actually affected
  2. Reduces noise - Fewer false positives, so on-call engineers trust the alerts
  3. Enables response - Every alert carries the information needed to act on it
  4. Supports SLOs - Tracks service level objectives and the error budget they imply (see the quick calculation below)
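
To make the SLO connection concrete: a 99.9% availability target over a 30-day window allows roughly 43 minutes of downtime, and the error budget is whatever part of that allowance is still unspent. A quick sketch of the arithmetic (plain TypeScript, illustrative numbers):

// Downtime allowed by a 99.9% SLO over a rolling 30-day window
const sloTarget = 0.999
const windowMinutes = 30 * 24 * 60                              // 43,200 minutes
const allowedDowntime = windowMinutes * (1 - sloTarget)         // ≈ 43.2 minutes

const observedDowntime = 10                                     // minutes of downtime so far (example)
const budgetRemaining = 1 - observedDowntime / allowedDowntime  // ≈ 0.77, i.e. 77% left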

Good Example

import { Effect, Duration, Ref } from "effect"

// ============================================
// 1. Define alertable conditions
// ============================================

interface Alert {
  readonly name: string
  readonly severity: "critical" | "warning" | "info"
  readonly message: string
  readonly timestamp: Date
  readonly labels: Record<string, string>
}

interface AlertRule {
  readonly name: string
  readonly condition: Effect.Effect<boolean>
  readonly severity: "critical" | "warning" | "info"
  readonly message: string
  readonly labels: Record<string, string>
  readonly forDuration: Duration.DurationInput
}

// ============================================
// 2. Define alert rules
// ============================================

const createAlertRules = (metrics: {
  errorRate: () => Effect.Effect<number>
  latencyP99: () => Effect.Effect<number>
  availability: () => Effect.Effect<number>
}): AlertRule[] => [
  {
    name: "HighErrorRate",
    condition: metrics.errorRate().pipe(Effect.map((rate) => rate > 0.01)),
    severity: "critical",
    message: "Error rate exceeds 1%",
    labels: { team: "backend", service: "api" },
    forDuration: "5 minutes",
  },
  {
    name: "HighLatency",
    condition: metrics.latencyP99().pipe(Effect.map((p99) => p99 > 2)),
    severity: "warning",
    message: "P99 latency exceeds 2 seconds",
    labels: { team: "backend", service: "api" },
    forDuration: "10 minutes",
  },
  {
    name: "LowAvailability",
    condition: metrics.availability().pipe(Effect.map((avail) => avail < 99.9)),
    severity: "critical",
    message: "Availability below 99.9% SLO",
    labels: { team: "backend", service: "api" },
    forDuration: "5 minutes",
  },
  {
    name: "ErrorBudgetLow",
    condition: Effect.succeed(false), // Implement based on error budget calc
    severity: "warning",
    message: "Error budget below 25%",
    labels: { team: "backend", service: "api" },
    forDuration: "0 seconds",
  },
]
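
// The ErrorBudgetLow condition above is left as a placeholder. One possible
// sketch, assuming `metrics.availability` reports availability as a percentage
// over the SLO window against a 99.9% target (the helper name and thresholds
// are illustrative, not a prescribed API):
const errorBudgetRemaining = (
  availability: () => Effect.Effect<number>,
  sloTarget = 99.9
) =>
  availability().pipe(
    Effect.map((avail) => {
      const totalBudget = 100 - sloTarget    // allowed error, e.g. 0.1 percentage points
      const spent = Math.max(0, 100 - avail) // error observed so far
      return Math.max(0, 1 - spent / totalBudget)
    })
  )
// The placeholder condition could then become:
//   errorBudgetRemaining(metrics.availability).pipe(Effect.map((r) => r < 0.25))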

// ============================================
// 3. Alert manager
// ============================================

interface AlertState {
  readonly firing: Map<string, { since: Date; alert: Alert }>
  readonly resolved: Alert[]
}

const makeAlertManager = Effect.gen(function* () {
  const state = yield* Ref.make<AlertState>({
    firing: new Map(),
    resolved: [],
  })

  // Note: this simplified manager fires as soon as a condition becomes true;
  // rule.forDuration is not enforced here (see the sketch after this manager).
  const checkRule = (rule: AlertRule) =>
    Effect.gen(function* () {
      const isTriggered = yield* rule.condition

      yield* Ref.update(state, (s) => {
        const firing = new Map(s.firing)
        const resolved = [...s.resolved]
        const key = rule.name

        if (isTriggered) {
          if (!firing.has(key)) {
            // Condition newly true: start firing
            firing.set(key, {
              since: new Date(),
              alert: {
                name: rule.name,
                severity: rule.severity,
                message: rule.message,
                timestamp: new Date(),
                labels: rule.labels,
              },
            })
          }
        } else if (firing.has(key)) {
          // Condition cleared: record the alert as resolved
          const prev = firing.get(key)!
          resolved.push({
            ...prev.alert,
            message: `[RESOLVED] ${prev.alert.message}`,
            timestamp: new Date(),
          })
          firing.delete(key)
        }

        return { firing, resolved }
      })
    })

  const getActiveAlerts = () =>
    Ref.get(state).pipe(
      Effect.map((s) => Array.from(s.firing.values()).map((f) => f.alert))
    )

  const getRecentResolved = () =>
    Ref.get(state).pipe(Effect.map((s) => s.resolved.slice(-10)))

  return {
    checkRule,
    getActiveAlerts,
    getRecentResolved,
  }
})
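
// Note: checkRule above fires the moment a condition is true; rule.forDuration
// is not enforced. A hedged sketch of one way to honor it: remember when each
// condition first became true and only report "firing" once it has held for the
// configured duration. (Illustrative only; not wired into the manager above.)
const makePendingTracker = Effect.gen(function* () {
  // rule name -> epoch millis at which its condition first became true
  const pendingSince = yield* Ref.make(new Map<string, number>())

  const hasHeldLongEnough = (rule: AlertRule, isTriggered: boolean) =>
    Ref.modify(pendingSince, (pending) => {
      const next = new Map(pending)
      const now = Date.now()

      if (!isTriggered) {
        // Condition cleared: forget any pending start time
        next.delete(rule.name)
        return [false, next]
      }

      const since = next.get(rule.name) ?? now
      next.set(rule.name, since)

      const required = Duration.toMillis(Duration.decode(rule.forDuration))
      return [now - since >= required, next]
    })

  return { hasHeldLongEnough }
})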

// ============================================
// 4. Alert notification
// ============================================

interface NotificationChannel {
  readonly send: (alert: Alert) => Effect.Effect<void>
}

const slackChannel: NotificationChannel = {
  send: (alert) =>
    Effect.gen(function* () {
      const emoji =
        alert.severity === "critical"
          ? "🔴"
          : alert.severity === "warning"
            ? "🟡"
            : "🔵"

      yield* Effect.log(`${emoji} [${alert.severity.toUpperCase()}] ${alert.name}`).pipe(
        Effect.annotateLogs({
          message: alert.message,
          labels: JSON.stringify(alert.labels),
        })
      )

      // In real implementation: call Slack API
    }),
}

const pagerDutyChannel: NotificationChannel = {
  send: (alert) =>
    Effect.gen(function* () {
      if (alert.severity === "critical") {
        yield* Effect.log("PagerDuty: Creating incident").pipe(
          Effect.annotateLogs({ alert: alert.name })
        )
        // In real implementation: call PagerDuty API
      }
    }),
}
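
// As the comments above note, both channels only log. A hedged sketch of a real
// Slack call using an incoming-webhook URL (the URL handling and payload shape
// are assumptions for illustration; `fetch` requires Node 18+ or a polyfill):
const sendToSlackWebhook = (alert: Alert, webhookUrl: string) =>
  Effect.tryPromise({
    try: () =>
      fetch(webhookUrl, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          text: `[${alert.severity.toUpperCase()}] ${alert.name}: ${alert.message}`,
        }),
      }),
    catch: (cause) => new Error(`Slack notification failed: ${String(cause)}`),
  })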

// ============================================
// 5. Alert evaluation loop
// ============================================

const runAlertEvaluation = (
  rules: AlertRule[],
  channels: NotificationChannel[],
  interval: Duration.DurationInput
) =>
  Effect.gen(function* () {
    const alertManager = yield* makeAlertManager
    const previousAlerts = yield* Ref.make(new Set<string>())

    yield* Effect.forever(
      Effect.gen(function* () {
        // Check all rules
        for (const rule of rules) {
          yield* alertManager.checkRule(rule)
        }

        // Get current active alerts
        const active = yield* alertManager.getActiveAlerts()
        const current = new Set(active.map((a) => a.name))
        const previous = yield* Ref.get(previousAlerts)

        // Find newly firing alerts
        for (const alert of active) {
          if (!previous.has(alert.name)) {
            // New alert - send notifications
            for (const channel of channels) {
              yield* channel.send(alert)
            }
          }
        }

        yield* Ref.set(previousAlerts, current)
        yield* Effect.sleep(interval)
      })
    )
  })
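
// Example wiring: placeholder metric effects stand in for real metric queries,
// both channels are attached, and rules are evaluated every 30 seconds. The
// hard-coded values are illustrative only.
const program = runAlertEvaluation(
  createAlertRules({
    errorRate: () => Effect.succeed(0.005),    // replace with a real error-rate query
    latencyP99: () => Effect.succeed(1.2),     // seconds
    availability: () => Effect.succeed(99.95), // percent
  }),
  [slackChannel, pagerDutyChannel],
  "30 seconds"
)
// Run it alongside the application, e.g. Effect.runPromise(program) or as a forked fiber.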

// ============================================
// 6. Prometheus alerting rules (YAML)
// ============================================

const prometheusAlertRules = `
groups:
  - name: effect-app-alerts
    rules:
      - alert: HighErrorRate
        expr: |
          sum(rate(http_errors_total[5m]))
          /
          sum(rate(http_requests_total[5m]))
          > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: HighLatency
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High P99 latency"
          description: "P99 latency is {{ $value }}s"

      - alert: SLOViolation
        expr: |
          sum(rate(http_requests_total{status!~"5.."}[30m]))
          /
          sum(rate(http_requests_total[30m]))
          < 0.999
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "SLO violation"
          description: "Availability is {{ $value | humanizePercentage }}"
`
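
// For the rules above to take effect, Prometheus must be told where to load
// them from and where to send firing alerts. A minimal prometheus.yml fragment
// (the file name and Alertmanager address are assumptions):
const prometheusConfigFragment = `
rule_files:
  - alert-rules.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
`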

Alert Severity Guidelines

| Severity | Response          | Example                       |
| -------- | ----------------- | ----------------------------- |
| Critical | Page immediately  | SLO violation, data loss      |
| Warning  | Investigate soon  | Elevated errors, slow queries |
| Info     | Review daily      | Capacity trending             |

Good vs Bad Alerts

| ❌ Bad Alert        | ✅ Good Alert          |
| ------------------ | --------------------- |
| CPU > 80%          | Error rate > 1%       |
| Memory > 90%       | P99 latency > 2s      |
| Disk > 85%         | Availability < 99.9%  |
| Process restarted  | Error budget < 25%    |

Best Practices

  1. Alert on symptoms - User impact, not causes
  2. Use for-duration - Require the condition to hold before firing, to avoid flapping
  3. Include runbook - Link how-to-respond documentation from the alert annotations
  4. Route by severity - Critical → page, warning → ticket (see the routing sketch below)
  5. Review regularly - Remove or tune alerts that are routinely ignored
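
For practice 4, routing is usually configured in Alertmanager rather than in application code. A hedged sketch of a severity-based route, written as a config string in the same style as the Prometheus rules above (receiver names, integration keys, and the Slack channel are placeholders):

const alertmanagerRouting = `
route:
  receiver: slack-tickets
  routes:
    - matchers:
        - severity = "critical"
      receiver: pagerduty-oncall
    - matchers:
        - severity = "warning"
      receiver: slack-tickets

receivers:
  - name: pagerduty-oncall
    pagerduty_configs:
      - routing_key: <pagerduty-integration-key>
  - name: slack-tickets
    slack_configs:
      - api_url: <slack-incoming-webhook-url>
        channel: "#alerts"
`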