Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ All configuration via **environment variables**. Bold = required.
| `REDIS_REQUIRED` | `true` | Fail startup when `REDIS_URL` is unset. Set `false` only for dev / single-replica; stateless mode leaves codes/refresh replayable within TTL |
| `REDIS_KEY_PREFIX` | `mcp-auth-proxy:` | Key prefix for shared Redis; set to empty to opt out of namespacing |
| `REFRESH_RACE_GRACE_SEC` | `2` | Grace window in seconds during which a refresh-rotation collision is treated as a benign concurrent submit (parallel-tab refresh, slow-network double-submit) and returns 429 `refresh_concurrent_submit` without revoking the family. Outside the window every collision still revokes. Range `[0, 10]`; `0` disables (every collision = reuse). The 10s ceiling is a security cap — wider windows are statistically attacker-shaped |
| `IDP_EXCHANGE_RATE_PER_SEC` | _(empty / disabled)_ | Cap on outbound proxy → IdP token-endpoint requests at `/callback`. Defense in depth: a flood of `/callback` hits that slips past the per-IP limiter (distributed sources, permissive XFF trust matrix) is bounded by this token bucket before reaching the IdP. Denied requests get 503 `temporarily_unavailable` + `error_code=idp_exchange_throttled` + `Retry-After: 1`. Set to a positive number (e.g. `20`) to enable |
| `IDP_EXCHANGE_BURST` | `50` | Burst size for the IdP-exchange limiter when `IDP_EXCHANGE_RATE_PER_SEC > 0`. Higher burst absorbs a short spike (e.g. a deploy-time reconnect storm) without 503s; lower burst keeps the ceiling tighter. Ignored when `IDP_EXCHANGE_RATE_PER_SEC` is unset/zero (limiter is not constructed) |
| `RATE_LIMIT_ENABLED` | `true` | Per-IP rate limiting on pre-auth endpoints and on the authenticated MCP route. Disable only behind a WAF that already enforces it |
| `TOKEN_SIGNING_SECRETS_PREVIOUS` | _(empty)_ | Whitespace-separated retired signing secrets accepted on Open during a rolling rotation. New seals always use `TOKEN_SIGNING_SECRET` (primary); Open tries primary first, then each previous. See [`docs/runbooks/key-rotation.md`](./docs/runbooks/key-rotation.md) |
| `TRUSTED_PROXY_CIDRS` | _(empty)_ | Comma-separated CIDRs of peers whose forwarding header (default `X-Forwarded-For`) is walked right-to-left for rate-limit keying. The first hop NOT in the trusted set is the bucket key; everything left of it (typically appended by the client) is ignored. Other peers fall back to RemoteAddr. Preferred over the legacy `TRUST_PROXY_HEADERS` bool |
Expand Down Expand Up @@ -245,6 +247,12 @@ rollout notes, and K8s deployment shape.
`consent`, or `callback_state` replays caught by the Redis-backed
store
- `mcp_auth_rate_limited_total{endpoint}` — httprate 429s by endpoint
- `mcp_auth_idp_exchange_throttled_total` — outbound proxy → IdP
token-endpoint exchanges denied by the rate-limit bucket
(`IDP_EXCHANGE_RATE_PER_SEC`). A spike under steady inbound
traffic usually means a distributed flood is slipping past the
per-IP limiter, or the IdP is slow enough that the bucket
drains faster than it refills
- `mcp_auth_clients_registered_total` — RFC 7591 registrations
- `mcp_auth_groups_claim_shape_mismatch_total` — id_token `groups`
claim failed to decode as `[]string`; user is admitted with empty
Expand Down
37 changes: 37 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,16 @@ type Config struct {
// applies. Default 2s; clamped to [0, 10s] at Load. Set to 0 to
// keep the strict pre-grace behavior. env: REFRESH_RACE_GRACE_SEC.
RefreshRaceGrace time.Duration
// IdPExchangeRatePerSec caps the proxy → IdP token-endpoint
// fan-out at /callback. Defense in depth: a flood of /callback
// hits that slips past the per-IP limiter (distributed sources,
// permissive XFF trust matrix) is bounded by this token-bucket
// before reaching the IdP. 0 disables (no outbound throttling).
// env: IDP_EXCHANGE_RATE_PER_SEC.
IdPExchangeRatePerSec float64
// IdPExchangeBurst is the burst size for the IdP-exchange limiter
// when IdPExchangeRatePerSec > 0. env: IDP_EXCHANGE_BURST.
IdPExchangeBurst int
// CompatAllowStateless keeps the legacy Cursor/MCP Inspector behavior of
// accepting /authorize requests without a client-supplied state. Default
// false — strict mode refuses stateless requests so the client cannot
Expand Down Expand Up @@ -315,6 +325,33 @@ func Load() (*Config, error) {
c.RefreshRaceGrace = time.Duration(n) * time.Second
}

// IDP_EXCHANGE_RATE_PER_SEC + IDP_EXCHANGE_BURST tune the
// outbound rate-limit bucket on the proxy → IdP /token leg.
// 0 disables. Default off — operators behind a permissive XFF
// trust matrix or facing distributed flood patterns opt in
// explicitly.
if raw := os.Getenv("IDP_EXCHANGE_RATE_PER_SEC"); raw != "" {
f, err := strconv.ParseFloat(raw, 64)
if err != nil {
return nil, fmt.Errorf("IDP_EXCHANGE_RATE_PER_SEC must be a number: %w", err)
}
if f < 0 {
return nil, fmt.Errorf("IDP_EXCHANGE_RATE_PER_SEC must be >= 0; got %v", f)
}
c.IdPExchangeRatePerSec = f
}
c.IdPExchangeBurst = 50
if raw := os.Getenv("IDP_EXCHANGE_BURST"); raw != "" {
n, err := strconv.Atoi(raw)
if err != nil {
return nil, fmt.Errorf("IDP_EXCHANGE_BURST must be an integer: %w", err)
}
if n < 1 {
return nil, fmt.Errorf("IDP_EXCHANGE_BURST must be >= 1; got %d", n)
}
c.IdPExchangeBurst = n
}

if ag := os.Getenv("ALLOWED_GROUPS"); ag != "" {
for _, g := range strings.Split(ag, ",") {
if g = strings.TrimSpace(g); g != "" {
Expand Down
57 changes: 57 additions & 0 deletions config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,63 @@ func TestLoad_RefreshRaceGrace_RejectsAboveCeiling(t *testing.T) {
}
}

// TestLoad_IdPExchangeRate_DefaultDisabled checks the defaults Load applies
// when this test sets neither IDP_EXCHANGE_RATE_PER_SEC nor
// IDP_EXCHANGE_BURST: the rate stays at 0 (outbound limiter disabled) and
// the burst is 50.
func TestLoad_IdPExchangeRate_DefaultDisabled(t *testing.T) {
	setAllRequired(t)

	loaded, loadErr := Load()
	if loadErr != nil {
		t.Fatalf("unexpected error: %v", loadErr)
	}

	// A zero rate is the "limiter off" sentinel.
	if got := loaded.IdPExchangeRatePerSec; got != 0 {
		t.Errorf("IdPExchangeRatePerSec default = %v, want 0 (disabled)", got)
	}
	// The burst default is populated even while the limiter is off.
	if got := loaded.IdPExchangeBurst; got != 50 {
		t.Errorf("IdPExchangeBurst default = %d, want 50", got)
	}
}

// TestLoad_IdPExchangeRate_Custom checks that explicit values for both
// IdP-exchange limiter variables are parsed and stored on the Config.
func TestLoad_IdPExchangeRate_Custom(t *testing.T) {
	setAllRequired(t)

	// Override both knobs; t.Setenv restores the previous values on cleanup.
	overrides := [][2]string{
		{"IDP_EXCHANGE_RATE_PER_SEC", "20"},
		{"IDP_EXCHANGE_BURST", "100"},
	}
	for _, kv := range overrides {
		t.Setenv(kv[0], kv[1])
	}

	loaded, loadErr := Load()
	if loadErr != nil {
		t.Fatalf("unexpected error: %v", loadErr)
	}

	if got := loaded.IdPExchangeRatePerSec; got != 20 {
		t.Errorf("IdPExchangeRatePerSec = %v, want 20", got)
	}
	if got := loaded.IdPExchangeBurst; got != 100 {
		t.Errorf("IdPExchangeBurst = %d, want 100", got)
	}
}

// TestLoad_IdPExchangeRate_RejectsNegative checks that a negative
// IDP_EXCHANGE_RATE_PER_SEC makes Load fail with an error that names the
// offending variable.
func TestLoad_IdPExchangeRate_RejectsNegative(t *testing.T) {
	const envName = "IDP_EXCHANGE_RATE_PER_SEC"

	setAllRequired(t)
	t.Setenv(envName, "-1")

	_, loadErr := Load()
	// Pass only when Load failed AND the message points at the variable.
	if loadErr != nil && strings.Contains(loadErr.Error(), envName) {
		return
	}
	t.Fatalf("want error mentioning IDP_EXCHANGE_RATE_PER_SEC, got %v", loadErr)
}

// TestLoad_IdPExchangeRate_RejectsNonNumber checks that a value for
// IDP_EXCHANGE_RATE_PER_SEC that is not parseable as a number makes Load
// fail with the "must be a number" parse error.
func TestLoad_IdPExchangeRate_RejectsNonNumber(t *testing.T) {
	setAllRequired(t)
	t.Setenv("IDP_EXCHANGE_RATE_PER_SEC", "fast")

	_, loadErr := Load()
	// Pass only when Load failed AND the failure is the parse error.
	if loadErr != nil && strings.Contains(loadErr.Error(), "must be a number") {
		return
	}
	t.Fatalf("want parse error, got %v", loadErr)
}

// TestLoad_IdPExchangeBurst_RejectsZero checks that IDP_EXCHANGE_BURST=0 is
// refused by Load with the ">= 1" range error. The test does not set
// IDP_EXCHANGE_RATE_PER_SEC.
func TestLoad_IdPExchangeBurst_RejectsZero(t *testing.T) {
	setAllRequired(t)
	t.Setenv("IDP_EXCHANGE_BURST", "0")

	_, loadErr := Load()
	// Pass only when Load failed AND the message states the lower bound.
	if loadErr != nil && strings.Contains(loadErr.Error(), ">= 1") {
		return
	}
	t.Fatalf("want >= 1 rejection, got %v", loadErr)
}

func TestLoad_PKCERequired_Default(t *testing.T) {
setAllRequired(t)
cfg, err := Load()
Expand Down
31 changes: 31 additions & 0 deletions docs/runbooks/idp-outage.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,37 @@ rollout-restart.
`OIDC_ISSUER_URL`, `OIDC_CLIENT_ID`, `OIDC_CLIENT_SECRET` and a
rollout.

### IdP overload — proxy → IdP rate-bucket

If `mcp_auth_idp_exchange_throttled_total` is climbing while
inbound traffic stays steady, the optional outbound rate-bucket
(`IDP_EXCHANGE_RATE_PER_SEC` + `IDP_EXCHANGE_BURST`) is doing its
job — capping proxy → IdP fan-out at `/callback` so the IdP isn't
hammered. Throttled requests return 503 `temporarily_unavailable`
with `error_code=idp_exchange_throttled` and `Retry-After: 1`; the
user retries and gets through once the bucket refills.

Tuning playbook (only if the bucket is wired):

1. **Start liberal, narrow if alerting fires.** The limiter is off by
default (`IDP_EXCHANGE_RATE_PER_SEC` unset). When enabling it, 20/sec
with the default burst of 50 is generous for a typical MCP deployment
doing <1 auth/sec. Most operators never need this enabled.
2. **Per-replica scope.** The limiter is in-process. An
`N`-replica Deployment admits up to `N × IDP_EXCHANGE_RATE_PER_SEC`
to the IdP. Divide your IdP-side ceiling by replica count.
3. **If `mcp_auth_idp_exchange_throttled_total` climbs under steady
inbound traffic, there are two distinct causes:**
a. A distributed flood is slipping past the per-IP limiter
(check `TRUSTED_PROXY_CIDRS` — a permissive XFF trust
matrix can be the culprit).
b. The IdP itself is slow enough that the bucket refills
slower than it drains. In that case raise the rate
cautiously after confirming the IdP can handle it.
4. **Do not raise the rate to "make the alert go away".** The
bucket exists to protect the IdP; bypassing it can cascade
the IdP outage into a proxy outage when the IdP eventually
drops requests on the floor.

## Prevention

- **Monitor the IdP's availability independently** of the proxy.
Expand Down
2 changes: 1 addition & 1 deletion docs/threat-model.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ privilege.
| 6 | S | **Consent-token replay** — captured `consent_token` POSTed twice within its 5-min TTL | `JTI` `ClaimOnce` at POST `/consent` before either Approve or Deny; per-render `JTI` so back-button = re-consent (a new claim slot) | `handlers/consent.go` | `handlers/single_use_replay_test.go` (`TestConsent_SingleUse_*`) | — |
| 7 | S | **Callback-state replay** — captured `/callback` URL replayed (e.g. attacker-observable redirect) | `SessionID` `ClaimOnce` BEFORE the upstream OIDC token-endpoint exchange — replay never fans out to the IdP and never produces audit-log noise | `handlers/callback.go` | `handlers/single_use_replay_test.go` (`TestCallback_SingleUse_*`) | — |
| 8 | D | **Redis outage** — replay store unreachable | Fail-closed at every claim site: 503 `server_error` + `error_code=replay_store_unavailable`. `mcp_auth_access_denied_total{reason="replay_store_unavailable"}` is the single alerting source. `PROD_MODE=true` validates `REDIS_REQUIRED=true` + `REDIS_URL` set at startup | `replay/redis.go`, `config/config.go` (`Validate`), every claim site | `config/config_test.go` (production posture) | `docs/runbooks/redis-outage.md` |
| 9 | D | **IdP outage** — upstream OIDC unavailable | 10s context timeout on `oauth2Cfg.Exchange`; surfaces 502 `server_error`; readiness probe lives on the metrics port only so an unauthenticated public-listener flood cannot flip every replica out of the K8s Service | `handlers/callback.go` (`oidcExchangeTTL`), `main.go` (port split) | `handlers/handlers_test.go` (callback timeout tests) | `docs/runbooks/idp-outage.md` |
| 9 | D | **IdP outage** — upstream OIDC unavailable | 10s context timeout on `oauth2Cfg.Exchange`; surfaces 502 `server_error`; readiness probe lives on the metrics port only so an unauthenticated public-listener flood cannot flip every replica out of the K8s Service. Optional outbound rate-bucket (`IDP_EXCHANGE_RATE_PER_SEC` + `IDP_EXCHANGE_BURST`) caps proxy → IdP fan-out at `/callback`; throttled requests return 503 `idp_exchange_throttled` (T2.2). **Per-replica scope:** the limiter is in-process, so an `N`-replica deployment admits up to `N × IDP_EXCHANGE_RATE_PER_SEC` to the IdP — divide the desired IdP-side ceiling by replica count when sizing the env var | `handlers/callback.go` (`oidcExchangeTTL`, `IdPExchangeLimiter`), `main.go` (port split) | `handlers/handlers_test.go` (callback timeout tests), `handlers/single_use_replay_test.go` (`TestCallback_IdPExchange*`) | `docs/runbooks/idp-outage.md` |
| 10 | S | **XFF / proxy-header spoofing** — `X-Forwarded-For` set by an attacker to bypass per-IP rate limiting | `TRUSTED_PROXY_CIDRS` allowlist scopes which upstream hops can set forwarding headers; `TRUSTED_PROXY_HEADER` names the trusted header; `PROD_MODE=true` rejects `TRUST_PROXY_HEADERS=true` without a `TRUSTED_PROXY_CIDRS` allowlist | `config/config.go` (`TrustedProxyCIDRs`), `main.go` (`ipKeyFunc`) | `config/config_test.go` (XFF cases) | — |
| 11 | R | **Multi-replica drift** — two proxy replicas issue inconsistent decisions on the same code, refresh, or callback state | All claim sites use a shared Redis store; `FamilyIssuedAt` + `REVOKE_BEFORE` propagate bulk-cutoff revocations across replicas without per-replica state | `replay/redis.go`, `token/` (`FamilyIssuedAt`) | `replay/redis_test.go` | `docs/runbooks/key-rotation.md` |
| 12 | D | **Sealed-input parse exhaustion** — oversized sealed blob slows `token.Open()` under load | Per-input length cap on every sealed-token open call (PR #18); 1 MB body cap via `MaxBytesReader` on `/token`, `/consent`, `/register` | `token/seal.go`, every handler entry point | `token/seal_test.go` (`FuzzOpenJSON`, `FuzzValidate`) | — |
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
golang.org/x/oauth2 v0.36.0
golang.org/x/sync v0.13.0
golang.org/x/term v0.41.0
golang.org/x/time v0.15.0
)

require (
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU=
golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A=
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
Expand Down
35 changes: 35 additions & 0 deletions handlers/callback.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/google/uuid"
"go.uber.org/zap"
"golang.org/x/oauth2"
"golang.org/x/time/rate"
)

const (
Expand Down Expand Up @@ -86,6 +87,15 @@ type CallbackConfig struct {
// burn the IdP's `invalid_grant` quota anyway). nil = stateless
// fallback (configured opt-out).
ReplayStore replay.Store
// IdPExchangeLimiter, when non-nil, throttles the proxy → IdP
// token-endpoint exchange (the second leg of /callback). Defense
// in depth: if a flood of /callback hits slips past the per-IP
// rate limiter (e.g. distributed across IPs, behind a permissive
// XFF trust matrix), the limiter caps the rate at which the
// proxy can fan out to the IdP. Denied requests get a 503 +
// `idp_exchange_throttled` log + metric; the user can retry once
// the bucket refills. nil = no outbound throttling.
IdPExchangeLimiter *rate.Limiter
}

// Callback handles GET /callback (IdP redirect after user authentication).
Expand Down Expand Up @@ -196,6 +206,31 @@ func callbackHandler(tm *token.Manager, logger *zap.Logger, audience string, oau
return
}

// Outbound rate-limit (defense in depth). When a limiter is
// wired, fail fast with 503 if the bucket is empty rather
// than queueing. Queueing ties up proxy connection budget on
// behalf of the IdP and cascades the IdP outage into a proxy
// outage. The limiter is nil for unit tests / operators who
// opt out by leaving IDP_EXCHANGE_RATE_PER_SEC unset.
//
// The throttle check runs BEFORE the SessionID replay claim
// so a transiently-throttled legitimate user can retry the
// same /callback URL once the bucket refills — placing it
// after the claim would burn the slot and force a 400
// callback_state_replay on the retry. Replay defense is
// per-state (claim slot is unique to one user); throttling
// is global IdP-protection — they don't need to share a
// fail-fast point.
if cbCfg.IdPExchangeLimiter != nil && !cbCfg.IdPExchangeLimiter.Allow() {
metrics.IdPExchangeThrottled.Inc()
logger.Warn("idp_exchange_throttled",
zap.String("client_id", session.ClientID),
)
w.Header().Set("Retry-After", "1")
writeOAuthError(w, http.StatusServiceUnavailable, "temporarily_unavailable", "upstream IdP exchange throttled; retry shortly", "idp_exchange_throttled")
return
}

// Single-use claim on the sealed state's SessionID. Applies
// BEFORE the upstream IdP exchange so a replayed /callback
// URL never fans out to the IdP. Same nil-store /
Expand Down
Loading