Skip to content

Commit 695f026

Browse files
feat(email): failover ESP provider — secondary on primary reject/error (#35) (#93)
Add a FailoverProvider that wraps an ordered chain of EmailProvider (primary first) and retries the same EventEmail through each secondary ESP when the primary errors or hard-rejects a send. A send is "successful" (forwarder claims the row, preserving worker convention 6 claim-after-2xx) iff ANY provider in the chain returns success. This directly hardens the standing P0 (Brevo sender unvalidated → every send rejected): wire EMAIL_PROVIDER_FALLBACK=ses and a Brevo account-level reject falls through to SES on the same payload. Inert by default: EMAIL_PROVIDER_FALLBACK unset → NewProvider returns the bare single provider, byte-identical to today (no wrapper, no metric). Proven by TestFactory_NoFallback_ReturnsBareProvider.
- factory: single provider build factored into buildOne(name, cfg) so primary + fallbacks share ONE switch (no duplicated provider switch). Unknown fallback name → hard boot error; valid name with unset creds → skip-with-warning (fail-open, lets ops stage the flag ahead of the SES_* secret). If all fallbacks skip, boots with the primary only.
- config: EMAIL_PROVIDER_FALLBACK comma-separated → ordered Fallbacks.
- metric: instant_email_failover_total{outcome=primary_ok|fallback_ok|all_failed}. INFO on fallback engage (work performed), DEBUG on primary-ok (idle), ERROR on all_failed. Emails masked in all logs. Lazy *Vec — absent from /metrics until first emit (inert default).
- tests: in-package failover_provider_test.go (fake recording provider): primary-ok-no-fallback-call, primary-fail→fallback-ok (transient/permanent/skip), all-fail→last-error, all-skip→no all_failed metric, ordering, Name(), empty/nil guards, per-outcome metric increments, email-masking on every log path. Factory inert-default + with-fallback + unknown-name-errors + missing-creds-skipped. Config parse table. internal/email per-package coverage 96.4% (≥95 floor); all new funcs 100%. make gate green.
Infra alert/tile/catalog ship in a separate infra PR (rule 25) — referenced in the PR body.
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 53078dd commit 695f026

7 files changed

Lines changed: 884 additions & 31 deletions

File tree

internal/config/config.go

Lines changed: 57 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"log/slog"
77
"os"
8+
"strings"
89
)
910

1011
// Config holds all runtime configuration for the worker service.
@@ -53,6 +54,16 @@ type Config struct {
5354
SESFromEmail string // SES_FROM_EMAIL (must be a verified SES identity)
5455
SESTemplateNames map[string]string // SES_TEMPLATE_NAMES (JSON object: audit_log.kind → SES template name)
5556

57+
// EmailProviderFallback is the EMAIL_PROVIDER_FALLBACK env var: a
58+
// comma-separated, ordered list of secondary provider names (e.g. "ses")
59+
// the failover provider retries through when the primary errors or hard-
60+
// rejects a send. Inert by default — unset/empty means single-provider
61+
// mode, byte-identical to the pre-failover worker. Each named fallback
62+
// reuses that provider's existing sub-config (Brevo*/SES*). A fallback
63+
// whose creds are unset is skipped-with-warning at boot (not fatal) so an
64+
// operator can stage EMAIL_PROVIDER_FALLBACK ahead of the secret.
65+
EmailProviderFallback []string // EMAIL_PROVIDER_FALLBACK (comma-separated)
66+
5667
Environment string // ENVIRONMENT
5768
MaxMindLicenseKey string // MAXMIND_LICENSE_KEY — for GeoLite2 refresh job
5869
GeoLite2DBPath string // GEOLITE2_DB_PATH — local path to the GeoLite2 City MMDB
@@ -196,30 +207,31 @@ func require(key string) string {
196207
// Load reads configuration from environment variables. Panics on missing required fields.
197208
func Load() *Config {
198209
cfg := &Config{
199-
DatabaseURL: require("DATABASE_URL"),
200-
RedisURL: getenv("REDIS_URL", "redis://localhost:6379"),
201-
ProvisionerAddr: os.Getenv("PROVISIONER_ADDR"),
202-
ProvisionerSecret: os.Getenv("PROVISIONER_SECRET"),
203-
EmailProvider: os.Getenv("EMAIL_PROVIDER"),
204-
BrevoAPIKey: os.Getenv("BREVO_API_KEY"),
205-
BrevoTemplateIDs: parseBrevoTemplateIDs(os.Getenv("BREVO_TEMPLATE_IDS")),
206-
BrevoSenderEmail: os.Getenv("BREVO_SENDER_EMAIL"),
207-
BrevoSenderName: os.Getenv("BREVO_SENDER_NAME"),
208-
SESAWSRegion: os.Getenv("SES_AWS_REGION"),
209-
SESAWSAccessKey: os.Getenv("SES_AWS_ACCESS_KEY_ID"),
210-
SESAWSSecretKey: os.Getenv("SES_AWS_SECRET_ACCESS_KEY"),
211-
SESFromEmail: os.Getenv("SES_FROM_EMAIL"),
212-
SESTemplateNames: parseSESTemplateNames(os.Getenv("SES_TEMPLATE_NAMES")),
213-
Environment: getenv("ENVIRONMENT", "development"),
214-
MaxMindLicenseKey: os.Getenv("MAXMIND_LICENSE_KEY"),
215-
GeoLite2DBPath: getenv("GEOLITE2_DB_PATH", "./GeoLite2-City.mmdb"),
216-
PlansPath: os.Getenv("PLANS_PATH"),
217-
ObjectStoreEndpoint: os.Getenv("OBJECT_STORE_ENDPOINT"),
218-
ObjectStoreAccessKey: os.Getenv("OBJECT_STORE_ACCESS_KEY"),
219-
ObjectStoreSecretKey: os.Getenv("OBJECT_STORE_SECRET_KEY"),
220-
ObjectStoreBucket: getenv("OBJECT_STORE_BUCKET", "instant-shared"),
221-
ObjectStoreRegion: os.Getenv("OBJECT_STORE_REGION"),
222-
ObjectStoreSecure: os.Getenv("OBJECT_STORE_SECURE") == "true",
210+
DatabaseURL: require("DATABASE_URL"),
211+
RedisURL: getenv("REDIS_URL", "redis://localhost:6379"),
212+
ProvisionerAddr: os.Getenv("PROVISIONER_ADDR"),
213+
ProvisionerSecret: os.Getenv("PROVISIONER_SECRET"),
214+
EmailProvider: os.Getenv("EMAIL_PROVIDER"),
215+
BrevoAPIKey: os.Getenv("BREVO_API_KEY"),
216+
BrevoTemplateIDs: parseBrevoTemplateIDs(os.Getenv("BREVO_TEMPLATE_IDS")),
217+
BrevoSenderEmail: os.Getenv("BREVO_SENDER_EMAIL"),
218+
BrevoSenderName: os.Getenv("BREVO_SENDER_NAME"),
219+
SESAWSRegion: os.Getenv("SES_AWS_REGION"),
220+
SESAWSAccessKey: os.Getenv("SES_AWS_ACCESS_KEY_ID"),
221+
SESAWSSecretKey: os.Getenv("SES_AWS_SECRET_ACCESS_KEY"),
222+
SESFromEmail: os.Getenv("SES_FROM_EMAIL"),
223+
SESTemplateNames: parseSESTemplateNames(os.Getenv("SES_TEMPLATE_NAMES")),
224+
EmailProviderFallback: parseCSVProviders(os.Getenv("EMAIL_PROVIDER_FALLBACK")),
225+
Environment: getenv("ENVIRONMENT", "development"),
226+
MaxMindLicenseKey: os.Getenv("MAXMIND_LICENSE_KEY"),
227+
GeoLite2DBPath: getenv("GEOLITE2_DB_PATH", "./GeoLite2-City.mmdb"),
228+
PlansPath: os.Getenv("PLANS_PATH"),
229+
ObjectStoreEndpoint: os.Getenv("OBJECT_STORE_ENDPOINT"),
230+
ObjectStoreAccessKey: os.Getenv("OBJECT_STORE_ACCESS_KEY"),
231+
ObjectStoreSecretKey: os.Getenv("OBJECT_STORE_SECRET_KEY"),
232+
ObjectStoreBucket: getenv("OBJECT_STORE_BUCKET", "instant-shared"),
233+
ObjectStoreRegion: os.Getenv("OBJECT_STORE_REGION"),
234+
ObjectStoreSecure: os.Getenv("OBJECT_STORE_SECURE") == "true",
223235

224236
MinioEndpoint: os.Getenv("MINIO_ENDPOINT"),
225237
MinioRootUser: os.Getenv("MINIO_ROOT_USER"),
@@ -372,3 +384,24 @@ func parseSESTemplateNames(raw string) map[string]string {
372384
}
373385
return m
374386
}
387+
388+
// parseCSVProviders turns the raw EMAIL_PROVIDER_FALLBACK value into an
// ordered list of provider names.
//
// The input is split on commas; each segment is whitespace-trimmed and empty
// segments are discarded, so "ses,", ",ses" and " ses , , brevo " are all
// tolerated. Order of the surviving names is preserved.
//
// When nothing usable remains ("" / "   " / " , , ") the result is nil rather
// than an empty slice, which keeps the "no failover configured" check in the
// email factory (len(cfg.Fallbacks) == 0) unambiguous.
func parseCSVProviders(raw string) []string {
	// Start from a nil slice: if no segment survives trimming, nil is what
	// gets returned, with no extra bookkeeping.
	var names []string
	for _, segment := range strings.Split(raw, ",") {
		trimmed := strings.TrimSpace(segment)
		if trimmed == "" {
			continue
		}
		names = append(names, trimmed)
	}
	return names
}

internal/config/config_test.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,3 +306,56 @@ func TestParseSESTemplateNames(t *testing.T) {
306306
t.Errorf("malformed should be empty map: %v", bad)
307307
}
308308
}
309+
310+
func TestParseCSVProviders(t *testing.T) {
311+
cases := []struct {
312+
name string
313+
raw string
314+
want []string
315+
}{
316+
{"empty", "", nil},
317+
{"whitespace-only", " ", nil},
318+
{"single", "ses", []string{"ses"}},
319+
{"ordered-pair", "ses,brevo", []string{"ses", "brevo"}},
320+
{"trims-and-drops-blanks", " ses , , brevo ", []string{"ses", "brevo"}},
321+
{"trailing-comma", "ses,", []string{"ses"}},
322+
{"leading-comma", ",ses", []string{"ses"}},
323+
{"all-blank-segments", " , , ", nil},
324+
}
325+
for _, tc := range cases {
326+
t.Run(tc.name, func(t *testing.T) {
327+
got := parseCSVProviders(tc.raw)
328+
if len(got) != len(tc.want) {
329+
t.Fatalf("parseCSVProviders(%q) = %v; want %v", tc.raw, got, tc.want)
330+
}
331+
for i := range got {
332+
if got[i] != tc.want[i] {
333+
t.Errorf("parseCSVProviders(%q)[%d] = %q; want %q", tc.raw, i, got[i], tc.want[i])
334+
}
335+
}
336+
})
337+
}
338+
}
339+
340+
// TestLoad_EmailProviderFallback — the env var flows through Load() into the
341+
// ordered Fallbacks slice; unset → nil (inert default).
342+
func TestLoad_EmailProviderFallback(t *testing.T) {
343+
t.Setenv("DATABASE_URL", "postgres://x")
344+
t.Setenv("EMAIL_PROVIDER_FALLBACK", "ses, brevo")
345+
cfg := Load()
346+
if len(cfg.EmailProviderFallback) != 2 ||
347+
cfg.EmailProviderFallback[0] != "ses" || cfg.EmailProviderFallback[1] != "brevo" {
348+
t.Errorf("EmailProviderFallback = %v; want [ses brevo]", cfg.EmailProviderFallback)
349+
}
350+
}
351+
352+
// TestLoad_EmailProviderFallback_UnsetIsNil — the inert default at the config
353+
// layer: no env var → nil slice → single-provider mode.
354+
func TestLoad_EmailProviderFallback_UnsetIsNil(t *testing.T) {
355+
t.Setenv("DATABASE_URL", "postgres://x")
356+
t.Setenv("EMAIL_PROVIDER_FALLBACK", "")
357+
cfg := Load()
358+
if cfg.EmailProviderFallback != nil {
359+
t.Errorf("EmailProviderFallback = %v; want nil (inert default)", cfg.EmailProviderFallback)
360+
}
361+
}

internal/email/factory.go

Lines changed: 112 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ package email
1111
import (
1212
"fmt"
1313
"log/slog"
14+
"strings"
1415
)
1516

1617
// Provider identifier constants — used both by env-var parsing in
@@ -48,14 +49,101 @@ type Config struct {
4849

4950
// Future: SendGrid SendGridConfig. Keep grouped so adding a
5051
// provider doesn't pollute the top-level Config namespace.
52+
53+
// Fallbacks is the ordered EMAIL_PROVIDER_FALLBACK list (comma-separated
54+
// provider names, e.g. ["ses"]). When non-empty, NewProvider builds the
55+
// primary (cfg.Provider) plus each named fallback via the SAME switch and
56+
// returns a FailoverProvider that tries them in order. When empty/unset
57+
// the behavior is byte-identical to single-provider mode — this is the
58+
// inert-by-default safety contract (root CLAUDE.md flag-protect rule):
59+
// with no secondary configured the worker returns the bare single
60+
// provider exactly as it did before this feature existed.
61+
//
62+
// Each fallback uses the same per-provider sub-config already on Config
63+
// (Brevo/SES). An operator wiring SES as a fallback sets EMAIL_PROVIDER=
64+
// brevo, EMAIL_PROVIDER_FALLBACK=ses, and the SES_* env vars; both share
65+
// this one Config.
66+
Fallbacks []string
5167
}
5268

53-
// NewProvider builds the EmailProvider selected by cfg.Provider. Returns
54-
// an error only when the provider name is set to an unknown value — empty
55-
// or "noop" deliberately succeeds with NoopProvider so a missing env var
56-
// doesn't crash the worker at boot.
69+
// NewProvider builds the EmailProvider selected by cfg.Provider, optionally
70+
// wrapped in a FailoverProvider when cfg.Fallbacks is non-empty.
71+
//
72+
// Returns an error only when a provider name (primary OR a fallback) is set to
73+
// an unknown value — empty or "noop" for the PRIMARY deliberately succeeds
74+
// with NoopProvider so a missing env var doesn't crash the worker at boot.
75+
//
76+
// Inert default: cfg.Fallbacks empty/unset → returns the bare single provider
77+
// exactly as before (no FailoverProvider wrapper). This is the safety contract;
78+
// TestFactory_NoFallback_ReturnsBareProvider pins it.
79+
//
80+
// Fallback construction is fail-OPEN on missing creds: a fallback whose
81+
// provider name is valid but whose required creds are unset (e.g. SES_* not
82+
// yet wired) is logged with a clear warning and SKIPPED, not fatal. This lets
83+
// an operator set EMAIL_PROVIDER_FALLBACK=ses ahead of the SES_* secrets
84+
// landing without wedging the worker boot. An UNKNOWN fallback name is still a
85+
// hard error (same strictness as the primary switch) — that's a config typo,
86+
// not a not-yet-provisioned credential. If every fallback skips, the worker
87+
// boots with just the primary (no wrapper), preserving today's behavior.
5788
func NewProvider(cfg Config) (EmailProvider, error) {
58-
switch cfg.Provider {
89+
primary, err := buildOne(cfg.Provider, cfg)
90+
if err != nil {
91+
return nil, err
92+
}
93+
94+
// Inert default — no fallbacks configured → single provider, byte-identical
95+
// to the pre-failover behavior. No FailoverProvider, no metric, no wrapper.
96+
if len(cfg.Fallbacks) == 0 {
97+
return primary, nil
98+
}
99+
100+
chain := []EmailProvider{primary}
101+
for _, name := range cfg.Fallbacks {
102+
name = strings.TrimSpace(name)
103+
if name == "" {
104+
continue // tolerate "ses," / " , ses" sloppiness in the env var
105+
}
106+
fb, ferr := buildOne(name, cfg)
107+
if ferr != nil {
108+
if isMissingCredsErr(name, ferr) {
109+
// Fail-open: valid provider, creds not yet wired. Skip it so
110+
// the operator can stage EMAIL_PROVIDER_FALLBACK ahead of the
111+
// secret. Warn loudly so the gap is visible.
112+
slog.Warn("email.failover.fallback_skipped",
113+
"fallback", name,
114+
"reason", "required credentials unset — fallback unavailable until configured",
115+
"error", ferr,
116+
)
117+
continue
118+
}
119+
// Unknown provider name (config typo) → hard boot error.
120+
return nil, fmt.Errorf("email: invalid EMAIL_PROVIDER_FALLBACK %q: %w", name, ferr)
121+
}
122+
chain = append(chain, fb)
123+
}
124+
125+
// Every fallback skipped (creds unset) → fall back to the bare primary so
126+
// the worker still boots and sends via the primary. No wrapper, same as
127+
// the inert default.
128+
if len(chain) == 1 {
129+
slog.Warn("email.failover.no_usable_fallbacks",
130+
"reason", "EMAIL_PROVIDER_FALLBACK set but every fallback was skipped (creds unset) — using primary only",
131+
"primary", primary.Name(),
132+
)
133+
return primary, nil
134+
}
135+
136+
return NewFailoverProvider(chain...)
137+
}
138+
139+
// buildOne constructs a single concrete EmailProvider by name, sharing the
140+
// per-provider sub-configs on cfg. This is the ONE switch used by both the
141+
// primary and every fallback — adding a provider means one new case here and
142+
// it is instantly available as both a primary and a fallback (no duplicated
143+
// switch). An empty/"noop" name yields NoopProvider (fail-open default for the
144+
// primary); used as a fallback it is a harmless no-op that always skips.
145+
func buildOne(name string, cfg Config) (EmailProvider, error) {
146+
switch name {
59147
case "", providerNameNoop:
60148
// Fail-open: an operator who hasn't configured an email provider
61149
// gets silent no-ops, not a boot crash. The warning surfaces
@@ -75,7 +163,24 @@ func NewProvider(cfg Config) (EmailProvider, error) {
75163
// return NewSendGridProvider(cfg.SendGrid)
76164

77165
default:
78-
return nil, fmt.Errorf("email: unknown EMAIL_PROVIDER %q (supported: %q, %q, %q, %q)",
79-
cfg.Provider, providerNameNoop, providerNameBrevo, providerNameSES, "")
166+
return nil, fmt.Errorf("email: unknown provider %q (supported: %q, %q, %q, %q)",
167+
name, providerNameNoop, providerNameBrevo, providerNameSES, "")
168+
}
169+
}
170+
171+
// isMissingCredsErr reports whether a fallback construction error is a
172+
// "required credential unset" condition (skip-with-warning) rather than an
173+
// unknown-name typo (hard error). The provider constructors return a known
174+
// non-nil error in exactly the missing-creds case; an unknown name is caught
175+
// by the buildOne default branch and its message begins "email: unknown
176+
// provider". We classify by "is the name a real provider" — if buildOne knew
177+
// the name, any error from it is a construction/creds error, which we treat as
178+
// skip-with-warning. This keeps the policy decision in one place.
179+
func isMissingCredsErr(name string, _ error) bool {
180+
switch name {
181+
case providerNameBrevo, providerNameSES:
182+
return true // known provider whose constructor failed → missing creds
183+
default:
184+
return false // unknown name → hard error
80185
}
81186
}

0 commit comments

Comments
 (0)