Skip to content
This repository was archived by the owner on Apr 16, 2026. It is now read-only.

Commit 3695a93

Browse files
authored
Add siphon production ops profile (#33)
1 parent 935a16b commit 3695a93

10 files changed

Lines changed: 61 additions & 2 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ When any admin token is configured (`server.admin_token`, `server.admin_token_se
160160
- Replay is capped by `server.admin_replay_max_limit` (default `2000`, valid range `1..100000`); accepted response includes replay job metadata (`job_id`, `status`, `effective_limit`, `max_limit`, `capped`, `dry_run`).
161161
- Replay job metadata retention/capacity is configurable (`server.admin_replay_job_ttl`, `server.admin_replay_job_max_jobs`), and backend is configurable (`server.admin_replay_store_backend=memory|sqlite`, `server.admin_replay_sqlite_path`).
162162
- Replay execution is configurable (`server.admin_replay_job_timeout`, `server.admin_replay_max_concurrent_jobs`) for bounded runtime and concurrency.
163+
- The HTTP shutdown grace period is configurable via `server.shutdown_timeout` (default `10s`, must be greater than `0`); raise it when production drains are slow.
163164
- Queue fan-out safety rails are configurable (`server.admin_replay_max_queued_per_ip`, `server.admin_replay_max_queued_per_token`) and return `409` when exceeded.
164165
- `GET /admin/replay-dlq`
165166
- Requires header `X-Admin-Token` with read permission (`admin_token`/`admin_token_secondary`/`admin_token_read`/`admin_token_replay`/`admin_token_cancel`).

charts/siphon/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ Auth notes:
185185
- `config.nats.stream_compression` supports `none|s2`; `config.nats.stream_max_consumers` and `config.nats.stream_max_msgs_per_subject` must be `>= 0`.
186186
- `config.clickhouse.consumer_backoff` values must be positive and non-decreasing; when `config.clickhouse.consumer_max_deliver > 0`, it must equal the backoff list length.
187187
- Keep `config.clickhouse.consumer_fetch_max_wait < config.clickhouse.consumer_ack_wait` and `config.clickhouse.insert_timeout + config.clickhouse.flush_interval < config.clickhouse.consumer_ack_wait`.
188+
- `config.server.shutdown_timeout` controls how long the tap server waits for in-flight work to drain before forcing HTTP shutdown.
188189

189190
## Ops hardening defaults
190191

@@ -197,6 +198,7 @@ Auth notes:
197198
- `networkPolicy.natsEgressTo=[]` and `networkPolicy.clickhouseEgressTo=[]` optionally scope derived transport rules to destination selectors (`namespaceSelector`, `podSelector`, `ipBlock`) for least-privilege egress.
198199
- `envSecrets` supports direct `env` values from secret key references.
199200
- `autoscaling.customMetrics` enables HPA custom metrics in addition to CPU/memory targets.
201+
- `values-production.yaml` enables HPA by default for production installs (`minReplicas=2`, CPU+memory targets enabled).
200202

201203
Example selector-based transport policy:
202204

@@ -209,6 +211,14 @@ helm upgrade --install siphon ./charts/siphon \
209211
--set networkPolicy.clickhouseEgressTo[0].ipBlock.cidr=10.42.0.0/16
210212
```
211213

214+
## Production profile
215+
216+
```bash
217+
helm upgrade --install siphon ./charts/siphon \
218+
--namespace siphon \
219+
-f ./charts/siphon/values-production.yaml
220+
```
221+
212222
## Enable sqlite state persistence
213223

214224
```bash
charts/siphon/values-production.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
autoscaling:
2+
enabled: true
3+
minReplicas: 2
4+
maxReplicas: 10
5+
targetCPUUtilizationPercentage: 80
6+
targetMemoryUtilizationPercentage: 80

charts/siphon/values.schema.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,10 @@
638638
"type": "string",
639639
"description": "HTTP server write timeout."
640640
},
641+
"shutdown_timeout": {
642+
"type": "string",
643+
"description": "HTTP server shutdown grace timeout."
644+
},
641645
"max_body_size": {
642646
"type": "integer",
643647
"minimum": 1,

charts/siphon/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ config:
230230
base_path: /webhooks
231231
read_timeout: 10s
232232
write_timeout: 5s
233+
shutdown_timeout: 10s
233234
max_body_size: 1048576
234235
admin_token: ${TAP_ADMIN_TOKEN}
235236
admin_token_secondary: ${TAP_ADMIN_TOKEN_SECONDARY}

cmd/tap/run.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ func run(ctx context.Context, cfg config.Config, logger *slog.Logger) error {
163163
}
164164
}
165165

166-
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
166+
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), cfg.Server.ShutdownTimeout)
167167
defer shutdownCancel()
168168

169169
if err := ingressServer.Shutdown(shutdownCtx); err != nil && !errors.Is(err, http.ErrServerClosed) {

config.example.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ server:
191191
base_path: /webhooks
192192
read_timeout: 10s
193193
write_timeout: 5s
194+
shutdown_timeout: 10s
194195
max_body_size: 1048576
195196
admin_token: ${TAP_ADMIN_TOKEN}
196197
admin_token_secondary: ${TAP_ADMIN_TOKEN_SECONDARY}

config/config.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ type ServerConfig struct {
190190
BasePath string `koanf:"base_path"`
191191
ReadTimeout time.Duration `koanf:"read_timeout"`
192192
WriteTimeout time.Duration `koanf:"write_timeout"`
193+
ShutdownTimeout time.Duration `koanf:"shutdown_timeout"`
193194
MaxBodySize int64 `koanf:"max_body_size"`
194195
AdminToken string `koanf:"admin_token"`
195196
AdminTokenSecondary string `koanf:"admin_token_secondary"`
@@ -339,6 +340,9 @@ func (c *Config) ApplyDefaults() {
339340
if c.Server.WriteTimeout == 0 {
340341
c.Server.WriteTimeout = 5 * time.Second
341342
}
343+
if c.Server.ShutdownTimeout == 0 {
344+
c.Server.ShutdownTimeout = 10 * time.Second
345+
}
342346
if c.Server.MaxBodySize == 0 {
343347
c.Server.MaxBodySize = 1 << 20
344348
}
@@ -420,6 +424,9 @@ func (c Config) Validate() error {
420424
if c.Server.AdminReplayMaxLimit <= 0 || c.Server.AdminReplayMaxLimit > maxAdminReplayMaxLimit {
421425
return fmt.Errorf("server.admin_replay_max_limit must be in range 1..%d", maxAdminReplayMaxLimit)
422426
}
427+
if c.Server.ShutdownTimeout <= 0 {
428+
return fmt.Errorf("server.shutdown_timeout must be greater than 0")
429+
}
423430
if c.Server.AdminReplayJobTTL <= 0 {
424431
return fmt.Errorf("server.admin_replay_job_ttl must be greater than 0")
425432
}

config/config_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ server:
3535

3636
t.Setenv("STRIPE_WEBHOOK_SECRET", "whsec_123")
3737
t.Setenv("TAP_SERVER_PORT", "9091")
38+
t.Setenv("TAP_SERVER_SHUTDOWN_TIMEOUT", "17s")
3839

3940
cfg, err := Load(path)
4041
if err != nil {
@@ -43,6 +44,9 @@ server:
4344
if got := cfg.Server.Port; got != 9091 {
4445
t.Fatalf("expected env override port 9091, got %d", got)
4546
}
47+
if got := cfg.Server.ShutdownTimeout; got != 17*time.Second {
48+
t.Fatalf("expected env override shutdown timeout 17s, got %s", got)
49+
}
4650
if cfg.Providers["stripe"].Secret != "whsec_123" {
4751
t.Fatalf("expected secret expansion")
4852
}
@@ -179,6 +183,9 @@ func TestLoadConfigMissingFileAppliesDefaults(t *testing.T) {
179183
if cfg.Server.BasePath != "/webhooks" {
180184
t.Fatalf("expected default base path, got %q", cfg.Server.BasePath)
181185
}
186+
if cfg.Server.ShutdownTimeout != 10*time.Second {
187+
t.Fatalf("expected default shutdown timeout 10s, got %s", cfg.Server.ShutdownTimeout)
188+
}
182189
if cfg.Server.AdminReplayMaxLimit != 2000 {
183190
t.Fatalf("expected default admin replay max limit 2000, got %d", cfg.Server.AdminReplayMaxLimit)
184191
}
@@ -327,6 +334,16 @@ server:
327334
}
328335
}
329336

337+
// TestConfigValidateRejectsNonPositiveShutdownTimeout checks that Validate
// returns an error mentioning "server.shutdown_timeout" when the shutdown
// timeout is zero at validation time.
// NOTE(review): only the zero case is exercised here; a negative duration is
// not covered by this test.
func TestConfigValidateRejectsNonPositiveShutdownTimeout(t *testing.T) {
338+
cfg := Config{}
339+
cfg.ApplyDefaults()
340+
// Zero the field after ApplyDefaults: defaults replace a zero timeout with
// 10s, so the invalid value has to be set afterwards to reach Validate.
cfg.Server.ShutdownTimeout = 0
341+
342+
if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "server.shutdown_timeout") {
343+
t.Fatalf("expected shutdown timeout validation error, got %v", err)
344+
}
345+
}
346+
330347
func TestLoadConfigVaultReferenceRequiresAddress(t *testing.T) {
331348
t.Setenv("VAULT_ADDR", "")
332349

@@ -417,6 +434,7 @@ func TestLoadConfigSnakeCaseEnvOverrides(t *testing.T) {
417434
t.Setenv("TAP_NATS_SECURE", "true")
418435
t.Setenv("TAP_NATS_CA_FILE", "/var/run/secrets/nats/ca.crt")
419436
t.Setenv("TAP_SERVER_MAX_BODY_SIZE", "2097152")
437+
t.Setenv("TAP_SERVER_SHUTDOWN_TIMEOUT", "11s")
420438
t.Setenv("TAP_SERVER_ADMIN_REPLAY_MAX_LIMIT", "1234")
421439
t.Setenv("TAP_SERVER_ADMIN_REPLAY_JOB_TTL", "12h")
422440
t.Setenv("TAP_SERVER_ADMIN_REPLAY_JOB_MAX_JOBS", "777")
@@ -481,6 +499,9 @@ func TestLoadConfigSnakeCaseEnvOverrides(t *testing.T) {
481499
if cfg.Server.MaxBodySize != 2097152 {
482500
t.Fatalf("expected server.max_body_size override, got %d", cfg.Server.MaxBodySize)
483501
}
502+
if cfg.Server.ShutdownTimeout != 11*time.Second {
503+
t.Fatalf("expected server.shutdown_timeout override, got %s", cfg.Server.ShutdownTimeout)
504+
}
484505
if cfg.Server.AdminReplayMaxLimit != 1234 {
485506
t.Fatalf("expected server.admin_replay_max_limit override, got %d", cfg.Server.AdminReplayMaxLimit)
486507
}

scripts/assert-chart-render.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@ rendered_default="$(mktemp)"
2626
rendered_fixture="$(mktemp)"
2727
rendered_automount="$(mktemp)"
2828
rendered_startup_disabled="$(mktemp)"
29+
rendered_production="$(mktemp)"
2930
fixture_values="$(mktemp)"
3031
cleanup() {
31-
rm -f "${rendered_default}" "${rendered_fixture}" "${rendered_automount}" "${rendered_startup_disabled}" "${fixture_values}"
32+
rm -f "${rendered_default}" "${rendered_fixture}" "${rendered_automount}" "${rendered_startup_disabled}" "${rendered_production}" "${fixture_values}"
3233
}
3334
trap cleanup EXIT
3435

@@ -65,6 +66,7 @@ helm template siphon charts/siphon >"${rendered_default}"
6566
helm template siphon charts/siphon -f "${fixture_values}" >"${rendered_fixture}"
6667
helm template siphon charts/siphon --set serviceAccount.automount=true >"${rendered_automount}"
6768
helm template siphon charts/siphon --set startupProbe.enabled=false >"${rendered_startup_disabled}"
69+
helm template siphon charts/siphon -f charts/siphon/values-production.yaml >"${rendered_production}"
6870

6971
default_automount="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.automountServiceAccountToken' "${rendered_default}")"
7072
[[ "${default_automount}" == "false" ]] || fail "default automountServiceAccountToken should be false, got ${default_automount}"
@@ -93,4 +95,10 @@ startup_path="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.conta
9395
startup_disabled="$(yq -r 'select(.kind == "Deployment") | .spec.template.spec.containers[] | select(.name == "tap") | has("startupProbe")' "${rendered_startup_disabled}")"
9496
[[ "${startup_disabled}" == "false" ]] || fail "startupProbe should be omitted when startupProbe.enabled=false, got ${startup_disabled}"
9597

98+
production_hpa="$(yq -r 'select(.kind == "HorizontalPodAutoscaler") | .kind' "${rendered_production}")"
99+
[[ "${production_hpa}" == "HorizontalPodAutoscaler" ]] || fail "production profile should render an HPA, got ${production_hpa}"
100+
101+
production_replicas="$(yq -r 'select(.kind == "Deployment") | .spec | has("replicas")' "${rendered_production}" | head -n1)"
102+
[[ "${production_replicas}" == "false" ]] || fail "production profile should omit deployment replicas when autoscaling is enabled, got ${production_replicas}"
103+
96104
echo "ok: chart render assertions passed"

0 commit comments

Comments
 (0)