# Source path: infra/k8s/prometheus-rules.yaml
# PrometheusRule for prometheus-operator (monitoring.coreos.com/v1).
# Apply to the namespace where your Prometheus instance is configured to discover rules
# (often "monitoring"). If you do not use the operator, extract the inner "groups" YAML
# into a standalone rules file and reference it from prometheus.yml under `rule_files`.
---
# Single PrometheusRule custom resource carrying every alert group in this file.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: instant-platform
  # NOTE(review): confirm the Prometheus instance's rule namespace selector
  # covers the "instant" namespace, otherwise none of these rules are loaded.
  namespace: instant
  labels:
    # Presumably these labels are what the Prometheus ruleSelector matches on —
    # verify against the Prometheus CR before renaming either of them.
    release: prometheus
    role: alert-rules
spec:
  # Each list entry below is one named rule group.
  groups:
    - name: instant-api
      rules:
        # Share of provisioning requests (the five */new routes) that ended in
        # a 5xx over the last 5m. clamp_min keeps the denominator non-zero
        # when the routes are idle, so the ratio is 0 rather than NaN.
        - alert: HighProvisionErrorRate
          expr: |
            sum(
              rate(
                instant_http_errors_total{
                  route=~"/db/new|/cache/new|/nosql/new|/queue/new|/storage/new",
                  status_class="5xx"
                }[5m]
              )
            )
            /
            clamp_min(
              sum(
                rate(
                  instant_http_request_duration_seconds_count{
                    route=~"/db/new|/cache/new|/nosql/new|/queue/new|/storage/new"
                  }[5m]
                )
              ),
              1e-9
            )
            > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Provision 5xx error rate > 5% (instant_http_errors_total / request count)"

        # 99th-percentile latency across the provisioning routes, derived from
        # the request-duration histogram buckets over 5m; alerts above 5s.
        - alert: HighProvisionLatency
          expr: |
            histogram_quantile(
              0.99,
              sum(
                rate(
                  instant_http_request_duration_seconds_bucket{
                    route=~"/db/new|/cache/new|/nosql/new|/queue/new|/storage/new"
                  }[5m]
                )
              ) by (le)
            )
            > 5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "P99 provision latency > 5s (instant_http_request_duration_seconds)"

        # Fires after the instant-api scrape target has reported up == 0 for 1m.
        # NOTE(review): a target that vanishes from service discovery produces
        # no `up` series at all, so this expression stays silent in that case.
        - alert: APIDown
          expr: |
            up{job="instant-api"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "instant-api scrape target is down"

        # Fires after the instant-provisioner scrape target has reported
        # up == 0 for 1m.
        # NOTE(review): a target that vanishes from service discovery produces
        # no `up` series at all, so this expression stays silent in that case.
        - alert: ProvisionerGRPCDown
          expr: |
            up{job="instant-provisioner"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "instant-provisioner scrape target is down"

        # Share of /claim requests that ended in a 4xx or 5xx over the last
        # 10m; alerts above 10%. clamp_min guards the idle-route division.
        - alert: HighClaimFailureRate
          expr: |
            sum(
              rate(instant_http_errors_total{route="/claim",status_class=~"4xx|5xx"}[10m])
            )
            /
            clamp_min(
              sum(rate(instant_http_request_duration_seconds_count{route="/claim"}[10m])),
              1e-9
            )
            > 0.10
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Claim route 4xx/5xx rate > 10%"

        # Both conditions must hold on the same label set: the expiry counter
        # did not move in the last hour AND more than 100 anonymous resources
        # are still active.
        # NOTE(review): `and` matches on identical labels — confirm the two
        # metrics are exported with the same label set.
        - alert: AnonymousResourceExpiryStopped
          expr: |
            (increase(instant_expired_resources_total[1h]) == 0)
            and
            (instant_active_anonymous_resources > 100)
          for: 2h
          labels:
            severity: warning
          annotations:
            summary: "No anonymous expirations in 1h while many active TTL resources remain (worker metrics)"

        # Storage IAM user creation failures in the /storage/new hot path.
        # A failure here blocks a paying customer's signup-to-first-byte flow.
        # Counter source: api/internal/handlers/storage.go (instant_storage_iam_users_failed_total,
        # labelled op=create|delete). This is the guaranteed-working equivalent of the
        # New Relic storage-iam-create-failed alert, which depends on the NR Prometheus
        # OpenMetrics integration being configured.
        - alert: StorageIamCreateFailures
          # Any increase of the create-failure counter inside the 5m window
          # trips the rule; `for: 0m` fires on the first such evaluation.
          expr: |
            increase(instant_storage_iam_users_failed_total{op="create"}[5m]) > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Storage IAM user create failures > 0 in 5m (signup-blocking)"

        # NATS credential issuance failures — the queueprovider abstraction
        # could not mint an isolated per-tenant account JWT for a /queue/new
        # call. MR-P0-5 (NATS per-tenant isolation, 2026-05-20). Each failure
        # means that tenant landed in legacy_open mode = no isolation =
        # shares the unauthenticated NATS namespace with everyone else.
        # Counter source: api/internal/handlers/queue.go (queue.cred_issue_failed
        # → metrics.NatsAuthFailures.Inc()). Common cause: operator seed in
        # the nats-operator Secret is out of sync with the running
        # nats-server's operator JWT.
        - alert: NATSAuthFailures
          # Evaluated per series (no aggregation), so $value in the description
          # is the 5m increase of the individual counter series that fired.
          expr: |
            increase(nats_auth_failures_total[5m]) > 0
          for: 1m
          labels:
            severity: critical
            service: queue
          annotations:
            summary: "NATS auth failures > 0 in 5m — tenants landing in legacy_open (no isolation)"
            description: "{{ $value }} NATS credential-issuance failures in 5m. Each failure = one tenant /queue/new that fell back to unauthenticated NATS. Check operator seed in nats-operator Secret matches running nats-server."

    # =========================================================================
    # Nightly backup CronJobs — postgres-customers / mongodb / redis-provision
    # =========================================================================
    # Source: kube-state-metrics emits `kube_cronjob_status_last_successful_time`
    # (unix seconds of last successful Job run). Compare against `time()` and
    # alert when the gap exceeds 36h (=1 missed night) or 60h (=2 missed nights).
    # These rules activate only when kube-state-metrics is scraped — until then
    # the New Relic log-based alert (newrelic/alerts/backup-stale-36h.json) is
    # the authoritative coverage. Drill: bash infra/scripts/restore-drill.sh.
    # Runbook: infra/BACKUP-RESTORE-RUNBOOK.md.
    - name: instant-backups
      rules:
        # Seconds since the newest successful run of each backup CronJob;
        # 129600s = 36h. $value is that gap, which the description humanizes.
        - alert: BackupCronJobStale36h
          expr: |
            (
              time()
              - max(
                  kube_cronjob_status_last_successful_time{namespace="instant-data",cronjob=~"postgres-customers-backup|mongodb-backup|redis-provision-backup"}
                ) by (cronjob)
            )
            > 129600
          for: 30m
          labels:
            severity: warning
            service: backup
          annotations:
            summary: "Backup CronJob {{ $labels.cronjob }} has no successful run in >36h (one missed night)"
            description: "Last successful run of {{ $labels.cronjob }} was {{ humanizeDuration $value }} ago. Schedule is 03:00 UTC daily; >36h gap = at least one missed night. See infra/BACKUP-RESTORE-RUNBOOK.md."

        # Same gap computation as the 36h rule with a 216000s (60h) threshold
        # and critical severity.
        - alert: BackupCronJobStale60h
          expr: |
            (
              time()
              - max(
                  kube_cronjob_status_last_successful_time{namespace="instant-data",cronjob=~"postgres-customers-backup|mongodb-backup|redis-provision-backup"}
                ) by (cronjob)
            )
            > 216000
          for: 30m
          labels:
            severity: critical
            service: backup
          annotations:
            summary: "Backup CronJob {{ $labels.cronjob }} has no successful run in >60h (two missed nights) — durability incident"
            description: "Last successful run of {{ $labels.cronjob }} was {{ humanizeDuration $value }} ago. Two consecutive missed nights = the Pro-tier backup promise is now broken for any data written since the last good backup. Follow infra/BACKUP-RESTORE-RUNBOOK.md."

        # kube_job_status_failed is labelled with `job_name` (the per-run Job),
        # not `cronjob`. label_replace derives the owning CronJob name from the
        # job_name prefix so the aggregation yields one series per CronJob and
        # the `cronjob` label referenced by the summary is actually populated.
        # NOTE(review): this matches any retained failed Job, not strictly the
        # most recent run — confirm failedJobsHistoryLimit makes that acceptable.
        - alert: BackupCronJobFailedLastRun
          expr: |
            max by (cronjob) (
              label_replace(
                kube_job_status_failed{
                  namespace="instant-data",
                  job_name=~"(postgres-customers-backup|mongodb-backup|redis-provision-backup)-.*"
                },
                "cronjob", "$1",
                "job_name", "(postgres-customers-backup|mongodb-backup|redis-provision-backup)-.*"
              )
            ) > 0
          for: 15m
          labels:
            severity: warning
            service: backup
          annotations:
            summary: "Backup Job {{ $labels.cronjob }} most recent run is in Failed state"
            description: "kube_job_status_failed > 0 for backup Job in instant-data. Pull pod logs and re-run manually if needed: see infra/BACKUP-RESTORE-RUNBOOK.md."

    # instant-worker — propagation_runner dead-letter alerts (CHAOS F3, 2026-05-20).
    # Fires on the worker's instant_propagation_dead_lettered_total counter
    # (introduced in worker repo, see metrics.go::PropagationDeadLetteredTotal).
    # Two triggers feed this metric, distinguished by the `reason` label:
    #   reason="max_attempts" — propagationMaxAttempts (10) of exponential
    #                           backoff retries exhausted (the modal real-failure
    #                           path: provisioner gRPC down, F1's
    #                           unexpected_skip-as-failure, markApplied DB blip).
    #   reason="unknown_kind" — CHAOS F2: old worker image saw a kind the api
    #                           started enqueuing in a newer release.
    # Both mean a paying customer's regrade fell through. The companion NR
    # log-based alert (newrelic/alerts/propagation-dead-lettered.json) keys on
    # the slog ERROR line; this Prom rule keys on the counter for the case
    # where NR Log ingest is brown-outed but Prometheus is still scraping.
    - name: instant-worker
      rules:
        # The lookback (15m) is deliberately longer than `for` (5m). A single
        # counter increment keeps rate() > 0 only while it sits inside the
        # range window, so a window no longer than `for` can never outlast the
        # pending period and a lone dead-letter would never fire.
        - alert: PropagationDeadLettered
          expr: |
            sum(rate(instant_propagation_dead_lettered_total[15m])) > 0
          for: 5m
          labels:
            severity: critical
            service: worker
          annotations:
            summary: "instant-worker propagation_runner dead-lettered a row in the last 15m (Pro-on-paper, hobby-grade infra)"
            description: "instant_propagation_dead_lettered_total > 0 for >5m. propagation_runner is the last line of defence between Razorpay webhook delivery and customer infra; any dead-letter means a paying customer's regrade fell through. Inspect pending_propagations.last_error + the team's resources, fix the underlying issue, then either DELETE the row to let entitlement_reconciler converge OR reset failed_at=NULL+attempts=0 to re-arm the runner. See CHAOS-DRILL-2026-05-20.md F1/F2/F3."

        # Per-tick image-skew leading indicator: PropagationUnknownKindTotal
        # fires on EVERY tick while an unknown-kind row is alive (not just
        # at the eventual dead-letter, which is ~24h away on the default
        # backoff). Lets the operator catch a half-rolled-out worker pod
        # within seconds rather than waiting the full backoff exhaustion.
        # Lower-severity than PropagationDeadLettered — the fix here is to
        # finish the rollout, NOT to wake on-call.
        - alert: PropagationUnknownKind
          # One alert per `kind`: the aggregation keeps that label so the
          # annotations can name the unrecognised kind.
          expr: |
            sum(
              rate(instant_propagation_unknown_kind_total[5m])
            ) by (kind)
            > 0
          for: 5m
          labels:
            severity: warning
            service: worker
          annotations:
            summary: "instant-worker saw a pending_propagations kind it doesn't recognise (kind={{ $labels.kind }})"
            description: "instant_propagation_unknown_kind_total{kind=\"{{ $labels.kind }}\"} > 0 for >5m. A worker pod is running an older image than the api enqueued (kind={{ $labels.kind }} is not in propagationHandlers). Finish the rollout — `kubectl rollout status deploy/instant-worker -n instant-infra` and confirm pods are on the same image as instant-api. The row will dead-letter after propagationMaxAttempts (10) attempts (~24h cumulative backoff) which will fire PropagationDeadLettered above; this is the early warning."

    # instant-worker — orphan_sweep PASS 3/4/5/6 reap alerts (2026-05-20).
    # Fires on the worker's instant_orphan_sweep_reaped_total counter
    # (introduced in worker repo, see metrics.go::OrphanSweepReapedTotal).
    # Each reap is labelled by `reason`; the alerts here key on the
    # reasons that imply a distinct upstream bug worth paging on.
    - name: instant-worker-orphan-sweep
      rules:
        # The lookback (3h) is deliberately longer than `for` (1h). A single
        # reap keeps rate() > 0 only while the increment sits inside the range
        # window, so a 1h window could never stay true for the full 1h pending
        # period and an isolated no_db_row reap would never fire.
        - alert: OrphanSweepNoDBRowReap
          expr: |
            sum(rate(instant_orphan_sweep_reaped_total{reason="no_db_row"}[3h])) > 0
          for: 1h
          labels:
            severity: critical
            service: worker
          annotations:
            summary: "orphan_sweep reaped an instant-deploy-* namespace with NO backing deployments row (P0-3 atomic-provision bug)"
            description: |
              instant_orphan_sweep_reaped_total{reason="no_db_row"} > 0 for >1h.
              A no_db_row event means a k8s namespace was provisioned (instant-deploy-<appID>)
              but no deployments row exists for that app_id — the api created the namespace
              but the INSERT into deployments never landed. This is the P0-3 atomic-provision
              symptom surfacing in prod.
              Investigate same hour: search NR Logs for `jobs.orphan_sweep.proposed_reap`
              with reason=no_db_row, capture the app_id, then trace back through the api
              POST /deploy/new logs for the same time window to find the partial-commit
              path that needs the atomic-rollback fix.

        # rate() is per-second; multiplying by 900 converts it to events per
        # 15m so the threshold reads as "more than 5 failed_build reaps".
        - alert: OrphanSweepStuckBuildSpike
          expr: |
            (
              sum(rate(instant_orphan_sweep_reaped_total{reason="failed_build"}[15m]))
              * 900
            )
            > 5
          for: 15m
          labels:
            severity: warning
            service: worker
          annotations:
            summary: "orphan_sweep PASS 6 flipped >5 stuck builds to failed in 15m (build pipeline degraded)"
            description: |
              instant_orphan_sweep_reaped_total{reason="failed_build"} > 5 events in 15m.
              PASS 6 catches deployments stuck in 'building'/'deploying' for >30min whose
              pod is in ImagePullBackOff/ErrImagePull/CrashLoopBackOff. A burst means many
              customers' builds are wedged at once — the most likely cause is a ghcr.io
              outage, a Kaniko image-push 403 (worker-rbac.yaml GHCR_PUSH_TOKEN scope), or
              an upstream registry auth failure. Check ghcr.io status, the deploy.yml CI
              push step, and the kaniko build pod logs in instant-deploy-* namespaces.

        # One alert per `reason`. The 30m `for` exceeds the 15m window, so only
        # repeated failures keep the expression true long enough to fire.
        - alert: OrphanSweepReapFailureRate
          expr: |
            sum by (reason) (
              rate(instant_orphan_sweep_reap_failed_total[15m])
            )
            > 0
          for: 30m
          labels:
            severity: warning
            service: worker
          annotations:
            summary: "orphan_sweep reap_failed > 0 sustained for 30m (reason={{ $labels.reason }})"
            description: |
              instant_orphan_sweep_reap_failed_total{reason="{{ $labels.reason }}"} > 0
              sustained for >30 minutes. The reconciler detected an orphan but could not
              clean it — a k8s API outage or a DB write failure. Single transient events
              are fine; a sustained rate means the reap path itself is broken. Check
              instant-worker pod logs for `jobs.orphan_sweep.*_delete_failed` lines.

    # instant-* — code-defect signals (BugBash 2026-05-20).
    # Both counters are incremented by the safego.Go wrapper's deferred
    # recover() when a panic would otherwise crash a background goroutine.
    # Recovered panics keep the pod up, but they ALMOST ALWAYS indicate a
    # real code defect that escaped the test suite. Page on any occurrence.
    - name: instant-code-defects
      rules:
        # Two safeguards so a single recovered panic actually pages:
        #  * `or vector(0)` on each operand — binary `+` returns nothing when
        #    either side has no series, so one counter that has not been
        #    exported yet would otherwise mute panics recorded by the other.
        #  * the lookback (15m) is longer than `for` (5m) — a lone increment
        #    keeps rate() > 0 only while it is inside the range window, so a
        #    window no longer than `for` could never outlast the pending period.
        - alert: GoroutinePanicsRecovered
          expr: |
            (sum(rate(instant_goroutine_panics_total[15m])) or vector(0))
              + (sum(rate(instant_worker_goroutine_panics_recovered_total[15m])) or vector(0))
            > 0
          for: 5m
          labels:
            severity: critical
            service: platform
          annotations:
            summary: "instant-* recovered a goroutine panic — code defect shipped to prod"
            description: |
              instant_goroutine_panics_total (api) + instant_worker_goroutine_panics_recovered_total (worker)
              > 0 for >5m. Some goroutine panicked and the safego.Go wrapper caught it. The
              pod stayed up, but the panic almost certainly indicates a missed error path or
              nil-deref shipped past the test gates. Grep NR Logs for `safego.panic_recovered`
              within the same time window to find the stack trace; fix the root cause and ship.

    # instant-worker — entitlement_regrade_failed > 0 (BugBash 2026-05-20).
    # The entitlement_reconciler job calls provisioner.RegradeResource() to
    # raise a tier-drifted resource's backend limits to the team's current
    # plan tier. A failure here = a paying customer is still on lower-tier
    # backend limits despite paying for the higher tier. Pair with the
    # billing-charge-undeliverable alert (inverse failure mode: tier-not-
    # translated-to-DB).
    - name: instant-worker-entitlements
      rules:
        # Aggregated without a `by` clause on purpose: the static
        # `service: worker` label below overwrites any `service` label coming
        # from the expression, so grouping by it would at best be a no-op and
        # at worst produce several series with identical final labels, which
        # makes the rule evaluation fail.
        # The lookback (30m) is longer than `for` (10m) so that a single failed
        # regrade keeps the expression true through the whole pending period.
        - alert: EntitlementRegradeFailed
          expr: |
            sum(rate(instant_entitlement_regrade_failed_total[30m])) > 0
          for: 10m
          labels:
            severity: critical
            service: worker
          annotations:
            summary: "entitlement_regrade_failed > 0 — paying customer on wrong tier limits"
            description: |
              instant_entitlement_regrade_failed_total > 0 for >10m. The entitlement_reconciler
              failed to call provisioner.RegradeResource() to raise a resource's backend limits
              to match the team's current paid tier. A paying customer is getting lower-tier
              infrastructure. Grep worker logs for `jobs.entitlement_reconciler.regrade_failed`;
              pair with billing-charge-undeliverable (inverse: tier not translated to DB at all).

    # CHAOS F1 (2026-05-20) — propagation_runner used to silently mark APPLIED on
    # any row whose target resource was missing/in an unexpected state. The fix
    # added unexpected_skip counting AND treats those rows as Failure (counts
    # against maxAttempts). This rule pages if the counter ticks more than once
    # — every unexpected_skip is one class of customer-facing nondeterminism
    # that the operator should investigate, not just the eventual dead-letter.
    - name: instant-worker-propagation-skip
      rules:
        # One alert per (kind, resource_type, skip_reason) combination; those
        # three labels are kept by the aggregation for use in the annotations.
        - alert: PropagationUnexpectedSkip
          expr: |
            sum by (kind, resource_type, skip_reason) (
              rate(instant_propagation_unexpected_skip_total[15m])
            )
            > 0
          for: 15m
          labels:
            severity: critical
            service: worker
          annotations:
            summary: "propagation_runner unexpected_skip {{ $labels.kind }} / {{ $labels.resource_type }} / {{ $labels.skip_reason }}"
            description: |
              instant_propagation_unexpected_skip_total{kind="{{ $labels.kind }}",resource_type="{{ $labels.resource_type }}",skip_reason="{{ $labels.skip_reason }}"} > 0 for >15m.
              The post-CHAOS-F1 fix means the row is now counted as a Failure (will eventually dead-letter and fire PropagationDeadLettered above) — but the operator should investigate the schema/state drift NOW. Find the row in pending_propagations, walk back to the api caller, and decide whether the missing resource should be inserted, the propagation row should be aborted, or the underlying race condition should be patched.

    # Email pipeline SEND-side (worker → Brevo /v3/smtp/email). RECEIVE-side
    # (Brevo → /webhooks/brevo) is covered by the NR email-delivery-ratio
    # alert. The classification label separates fixable-by-key-rotation
    # (permanent: 401/403) from fixable-by-backoff (transient: 5xx, 429) so the
    # operator knows which runbook to follow.
    - name: instant-worker-brevo-send
      rules:
        # Per (classification, status_code) pair. rate() is per-second, so the
        # factor 600 expresses it as errors per 10m for the "> 10" threshold.
        - alert: BrevoSendErrorsSpike
          expr: |
            (
              sum by (classification, status_code) (
                rate(brevo_send_errors_total[10m])
              )
              * 600
            )
            > 10
          for: 10m
          labels:
            severity: critical
            service: worker
          annotations:
            summary: "brevo_send_errors_total > 10 in 10m (classification={{ $labels.classification }} status={{ $labels.status_code }})"
            description: |
              Worker → Brevo /v3/smtp/email is failing at >10 errors / 10m. Brevo
              returns 201 the instant it accepts a POST, so a degraded send path
              never affects the delivery-ratio alert — the messages simply never
              queue at Brevo. Check classification: `permanent` (401/403) means
              rotate BREVO_API_KEY; `transient` (5xx/429) means upstream
              brown-out, the worker's exponential backoff will catch up.

        # Same expression as the spike rule with a lower threshold (> 3 errors
        # per 10m) and warning severity.
        - alert: BrevoSendErrorsWarning
          expr: |
            (
              sum by (classification, status_code) (
                rate(brevo_send_errors_total[10m])
              )
              * 600
            )
            > 3
          for: 10m
          labels:
            severity: warning
            service: worker
          annotations:
            summary: "brevo_send_errors_total > 3 in 10m (early warning) classification={{ $labels.classification }}"
            description: |
              Lower-severity early-warning twin of BrevoSendErrorsSpike. Lets the
              operator catch a slow upstream brown-out before it crosses the page
              threshold. No paging — just investigate during business hours.

    # Login throttle — magic-link rate limiter denial counter. Noisier than a
    # page; fires WARNING on >10 events in 10m so the operator can correlate
    # against a /auth/login spike (legitimate users locked out by NAT, dashboard
    # retry bug, bot sweep).
    - name: instant-api-magic-link
      rules:
        # rate() is per-second; the factor 600 expresses it as denials per 10m
        # so the threshold reads as "more than 10 denied requests".
        - alert: MagicLinkEmailRateLimited
          expr: |
            (
              sum(rate(instant_magic_link_email_rate_limited_total[10m]))
              * 600
            )
            > 10
          for: 10m
          labels:
            severity: warning
            service: api
          annotations:
            summary: "magic-link rate-limiter denied > 10 requests in 10m"
            description: |
              instant_magic_link_email_rate_limited_total > 10 events in 10m.
              Per-email/per-IP magic-link throttle is denying real users at a
              meaningful rate. Cross-check /auth/login traffic — a spike in
              denials with no spike in requests means the dashboard is firing
              retry storms; a spike in both means a bot sweep is wedging the
              per-IP quota for legitimate users behind the same NAT.

    # Provisioner circuit breaker state. 0=CLOSED (healthy), 1=HALF_OPEN
    # (probing), 2=OPEN (fail-fast). Sustained OPEN means the provisioner
    # stopped attempting calls to a backend after N consecutive failures —
    # better than a hung call, but the customer flow is degraded.
    - name: instant-provisioner-circuit
      rules:
        # Takes the highest breaker state reported per backend and fires when
        # that value equals 2.
        - alert: ProvisionerCircuitOpen
          expr: |
            max by (backend) (instant_provisioner_circuit_state) == 2
          for: 5m
          labels:
            severity: critical
            service: provisioner
          annotations:
            summary: "provisioner circuit breaker OPEN for backend={{ $labels.backend }}"
            description: |
              instant_provisioner_circuit_state{backend="{{ $labels.backend }}"} = 2
              (OPEN) for >5m. The provisioner has stopped attempting calls to
              this backend; all /db/new, /cache/new, /nosql/new, or /queue/new
              requests targeting this backend are failing fast. Check the
              backend's own /healthz, kubectl logs the upstream pod, and decide:
              roll back the failing change, restart the backend pod, or wait
              for the half-open probe to close the breaker naturally.

        # Fires when the highest breaker state reported per backend equals 1
        # for 10m; a backend whose maximum is 2 does not match this rule.
        - alert: ProvisionerCircuitHalfOpen
          expr: |
            max by (backend) (instant_provisioner_circuit_state) == 1
          for: 10m
          labels:
            severity: warning
            service: provisioner
          annotations:
            summary: "provisioner circuit breaker HALF_OPEN sustained 10m for backend={{ $labels.backend }}"
            description: |
              Backend hasn't closed back to CLOSED after 10 minutes in
              HALF_OPEN. Probes are still failing intermittently; investigate
              the upstream before the breaker flips back to OPEN.

    # Go-rendered email registry sentinel. After 2026-05-15 expiry-email retro
    # the platform moved 18 email kinds off Brevo templates onto Go renderers.
    # This counter ticks every time the worker tries to send an email kind
    # that isn't in the eventEmailBuilders registry — a silent user-facing
    # breakage, so its page priority must be P0.
    - name: instant-worker-email-renderer
      rules:
        # One alert per `kind`. The lookback (15m) is deliberately longer than
        # `for` (5m): a single missing-renderer send keeps rate() > 0 only
        # while the increment is inside the range window, so a window no longer
        # than `for` could never outlast the pending period and one dropped
        # email would never page.
        - alert: EmailMissingRenderer
          expr: |
            sum(rate(email_missing_renderer_total[15m])) by (kind) > 0
          for: 5m
          labels:
            severity: critical
            service: worker
          annotations:
            summary: "email_missing_renderer kind={{ $labels.kind }} — silent user-facing breakage"
            description: |
              email_missing_renderer_total{kind="{{ $labels.kind }}"} > 0 for >5m.
              The worker tried to send an email with a kind that has no Go
              renderer registered in eventEmailBuilders — the message body
              dropped silently on the floor. Add the renderer to
              worker/internal/email/renderers/ AND add the kind to the
              coverage test that iterates eventEmailBuilders so this can't
              happen again for the same kind.

    # /readyz deep-health gauge, exposed by api, worker, and provisioner via
    # readyz_check_status{service,check}. 1=ok, 0.5=degraded, 0=failed. This
    # rule mirrors the NR alert readyz-component-failed.json (which depends
    # on the OTLP forwarder) so a Prometheus-only operator gets the same
    # coverage. Same thresholds. Same labels.
    - name: instant-readyz
      rules:
        # Uses the maximum gauge value per (service, check), so the rule fires
        # only when every series for that pair reports 0.
        - alert: ReadyzCheckFailed
          expr: |
            max by (service, check) (readyz_check_status) == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "{{ $labels.service }} /readyz check {{ $labels.check }} is FAILED"
            description: |
              readyz_check_status{service="{{ $labels.service }}",check="{{ $labels.check }}"} == 0 for >5m.
              The upstream this check probes is unreachable or rejecting calls.
              Open the failing pod's /readyz body for last_error, cross-check
              against the upstream status page, rotate keys if 401, file an
              incident if 5xx.

        # Fires when the maximum gauge value per (service, check) is exactly
        # 0.5, i.e. no series for that pair reports 1.
        - alert: ReadyzCheckDegraded
          expr: |
            max by (service, check) (readyz_check_status) == 0.5
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "{{ $labels.service }} /readyz check {{ $labels.check }} is DEGRADED"
            description: |
              readyz_check_status{service="{{ $labels.service }}",check="{{ $labels.check }}"} == 0.5 for >10m.
              Upstream reachable but creds/policy off (the exact pattern that
              would have caught the 2026-05-20 Brevo silent-rejection bug weeks
              earlier — Brevo /v3/account returns 401 on every probe and the
              brevo check goes 0.5). Rotate the credential / re-validate the
              policy before the breaker escalates to failed.

    # Brevo RECEIVE-side delivery-ratio alert (Prom mirror of NR
    # email-delivery-ratio-low.json). Webhook receiver counter
    # brevo_webhook_events_total{event} — `delivered` / total over 1h.
    # The 2026-05-20 sender-domain dropout would have shown here within an
    # hour as a falling ratio. Closed-set event labels keep cardinality
    # bounded.
    - name: instant-api-brevo-webhook
      rules:
        # delivered / all webhook events over 1h, alerting below 95%.
        # The trailing `and ... > 0` guard suppresses the alert when no webhook
        # events arrived in the window: an idle period has no ratio to judge
        # and must not be read as "0% delivered". Comparison operators bind
        # tighter than `and`, so $value remains the delivery ratio.
        - alert: BrevoDeliveryRatioLow
          expr: |
            (
              sum(rate(brevo_webhook_events_total{event="delivered"}[1h]))
              /
              sum(rate(brevo_webhook_events_total{event=~"delivered|bounced_hard|bounced_soft|rejected|complaint|deferred|unsubscribed|error"}[1h]))
            )
            < 0.95
            and
            sum(rate(brevo_webhook_events_total{event=~"delivered|bounced_hard|bounced_soft|rejected|complaint|deferred|unsubscribed|error"}[1h])) > 0
          for: 1h
          labels:
            severity: critical
            service: api
          annotations:
            summary: "Brevo delivery ratio < 95% over 1h"
            description: |
              <95% of webhook-confirmed Brevo POSTs are landing as `delivered`.
              Common causes: sender domain dropped from validated list (the
              2026-05-20 launch incident), DKIM record stale, IP on a blocklist,
              or a broad bounce/rejected event class spiking.

        # delivered / all webhook events over 1h, warning below 98%.
        # The trailing `and ... > 0` guard suppresses the alert when no webhook
        # events arrived in the window: an idle period has no ratio to judge
        # and must not be read as "0% delivered". Comparison operators bind
        # tighter than `and`, so $value remains the delivery ratio.
        - alert: BrevoDeliveryRatioWarn
          expr: |
            (
              sum(rate(brevo_webhook_events_total{event="delivered"}[1h]))
              /
              sum(rate(brevo_webhook_events_total{event=~"delivered|bounced_hard|bounced_soft|rejected|complaint|deferred|unsubscribed|error"}[1h]))
            )
            < 0.98
            and
            sum(rate(brevo_webhook_events_total{event=~"delivered|bounced_hard|bounced_soft|rejected|complaint|deferred|unsubscribed|error"}[1h])) > 0
          for: 1h
          labels:
            severity: warning
            service: api
          annotations:
            summary: "Brevo delivery ratio < 98% over 1h (early warning)"

    # Make-good worklist signal — billing.charge_undeliverable. Customer paid
    # via Razorpay, platform could NOT translate the charge into a delivered
    # upgrade (team unresolvable, or resolved plan tier not in plans.yaml).
    # Each occurrence is a real customer charged but not upgraded; operator
    # must reconcile in the Razorpay dashboard.
    - name: instant-worker-billing
      rules:
        # The 1h lookback is much longer than the 5m `for`, so a single counter
        # increment stays visible long enough to get past the pending period.
        - alert: BillingChargeUndeliverable
          expr: |
            sum(
              rate(instant_billing_charge_undeliverable_total[1h])
            )
            > 0
          for: 5m
          labels:
            severity: critical
            service: worker
          annotations:
            summary: "billing.charge_undeliverable > 0 — paid customer not upgraded"
            description: |
              instant_billing_charge_undeliverable_total > 0 in the last hour.
              A subscription.charged Razorpay webhook confirmed a real card
              charge that the platform could NOT translate into a delivered
              upgrade. Find the team in audit_log where audit_kind = 'billing.charge_undeliverable',
              cross-check against the Razorpay dashboard, refund or hand-grant
              the tier.