From 1c8559d11a1be45b7ea745f15be9b16b16817fd8 Mon Sep 17 00:00:00 2001 From: Damien Degois Date: Mon, 27 Apr 2026 23:32:55 +0200 Subject: [PATCH] feat(replay): single-use claim on consent + callback state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the only documented security trade-off in specs.md (consent_token replay) and removes IdP-fan-out + audit noise on retried /callback. When a replay store is wired (REDIS_URL): - GET /authorize seals a fresh JTI on every consent render and a fresh SessionID on every silent-redirect session. Per-render JTI keeps back-button safe (a re-render gets a new claim slot) while still single-use within one render. - POST /consent ClaimOnce on JTI before either approve or deny — a captured consent_token cannot be replayed for either decision. - GET /callback ClaimOnce on session.SessionID before the upstream OIDC exchange — a replayed /callback URL never fans out to the IdP token endpoint and never produces audit-log noise. - Replay detected → 400 invalid_request with error_code consent_replay / callback_state_replay, increments mcp_auth_replay_detected_total{kind=consent|callback_state}. - Store unreachable → 503 with error_code replay_store_unavailable, increments mcp_auth_access_denied_total{reason="replay_store_unavailable"} (reuses the existing /token counter so one alert rule covers every claim site). - nil store → stateless fallback (configured opt-out, mirrors /token). - Empty JTI / SessionID on tokens sealed by older binaries falls through to stateless behavior so an in-flight rollout doesn't 503 active users for the duration of consentTTL / sessionTTL. - The /callback IdP-error redirect path deliberately does NOT claim — the redirect is idempotent (no IdP fan-out, no token issuance) and claiming would force re-auth on every legit transient-error retry. - All four replay_store_error log sites now carry an `op` field (claim_authz_code / claim_refresh_family / claim_consent / claim_callback_state) for site-level log correlation. specs.md and README updated. 10 new handler tests cover replay detection, deny-replayed, nil-store fallback, legacy empty JTI/SID, store-error fail-closed, and JTI/SID population by /authorize for both forks. --- README.md | 5 +- handlers/authorize.go | 6 + handlers/callback.go | 50 ++- handlers/consent.go | 67 +++- handlers/helpers.go | 17 +- handlers/single_use_replay_test.go | 495 +++++++++++++++++++++++++++++ handlers/token.go | 4 +- main.go | 5 +- metrics/metrics.go | 5 +- specs.md | 19 +- 10 files changed, 639 insertions(+), 34 deletions(-) create mode 100644 handlers/single_use_replay_test.go diff --git a/README.md b/README.md index 7a562bf..f7a7de3 100644 --- a/README.md +++ b/README.md @@ -238,8 +238,9 @@ rollout notes, and K8s deployment shape. expected interaction, not a policy rejection. Pair with `mcp_auth_tokens_issued_total{grant_type="authorization_code"}` to derive abandoned-after-approve as a funnel signal - - `mcp_auth_replay_detected_total{kind}` — `code` or `refresh` replays - caught by the Redis-backed store + - `mcp_auth_replay_detected_total{kind}` — `code`, `refresh`, + `consent`, or `callback_state` replays caught by the Redis-backed + store - `mcp_auth_rate_limited_total{endpoint}` — httprate 429s by endpoint - `mcp_auth_clients_registered_total` — RFC 7591 registrations - `mcp_auth_groups_claim_shape_mismatch_total` — id_token `groups` diff --git a/handlers/authorize.go b/handlers/authorize.go index fded8a7..bc70c80 100644 --- a/handlers/authorize.go +++ b/handlers/authorize.go @@ -9,6 +9,7 @@ import ( "github.com/babs/mcp-auth-proxy/metrics" "github.com/babs/mcp-auth-proxy/token" + "github.com/google/uuid" "go.uber.org/zap" "golang.org/x/oauth2" ) @@ -226,6 +227,10 @@ func Authorize(tm *token.Manager, logger *zap.Logger, baseURL string, oauth2Cfg // redirect) replays from POST /consent on approval. if authzCfg.RenderConsentPage { renderConsent(w, r, tm, logger, baseURL, authzCfg.ResourceName, sealedConsent{ + // Per-render JTI: a fresh id every GET /authorize so + // back-button = re-consent (each render gets its own + // single-use claim slot) rather than dead-state errors. + JTI: uuid.New().String(), ClientID: client.ID, ClientName: client.ClientName, RedirectURI: redirectURI, @@ -280,6 +285,7 @@ func Authorize(tm *token.Manager, logger *zap.Logger, baseURL string, oauth2Cfg PKCEVerifier: upstreamVerifier, SvrVerifier: svrVerifier, // empty unless H6 server-side PKCE kicked in SvrChallenge: svrChallenge, // mirrors sessionChallenge in that case + SessionID: uuid.New().String(), Typ: token.PurposeSession, Audience: baseURL, Resource: authzCfg.CanonicalResource, diff --git a/handlers/callback.go b/handlers/callback.go index 17e498e..faf7c89 100644 --- a/handlers/callback.go +++ b/handlers/callback.go @@ -3,12 +3,14 @@ package handlers import ( "context" "encoding/json" + "errors" "net/http" "net/url" "strings" "time" "github.com/babs/mcp-auth-proxy/metrics" + "github.com/babs/mcp-auth-proxy/replay" "github.com/babs/mcp-auth-proxy/token" "github.com/coreos/go-oidc/v3/oidc" "github.com/google/uuid" @@ -72,10 +74,18 @@ func sanitizeErrorDescription(s string) string { return string(b) } -// CallbackConfig holds the group filtering parameters for the callback handler. +// CallbackConfig holds optional dependencies for the callback handler. type CallbackConfig struct { AllowedGroups []string // empty = allow all authenticated users GroupsClaim string // flat claim name in id_token (default "groups") + // ReplayStore, when non-nil, enforces single-use semantics on the + // sealed `state` parameter via its embedded SessionID: a captured + // /callback URL cannot be replayed to fan out to the upstream IdP + // (audit-noise + outbound-fan-out defense — the IdP authorization + // code is itself single-use, so a successful replay would just + // burn the IdP's `invalid_grant` quota anyway). nil = stateless + // fallback (configured opt-out). + ReplayStore replay.Store } // Callback handles GET /callback (IdP redirect after user authentication). @@ -118,6 +128,17 @@ func callbackHandler(tm *token.Manager, logger *zap.Logger, audience string, oau internalState := q.Get("state") if idpError != "" { + // Deliberately NO ClaimOnce on this branch even when the + // session opens cleanly. The error redirect produces no + // IdP fan-out (no token-endpoint call) and no token + // issuance — just an idempotent 302 to the registered + // redirect_uri carrying `error=…`. Claiming here would + // burn the state on the first transient IdP error and + // force the user through full re-auth on the next legit + // retry. Trade-off accepted: a stolen state can replay + // the same error redirect, but the attacker gains + // nothing not already producible by sending the user a + // crafted /authorize URL. safeError := normalizeIdPError(idpError) // `error_description` from a /callback hit is fully // attacker-controlled — anyone can craft a /callback URL @@ -175,6 +196,33 @@ func callbackHandler(tm *token.Manager, logger *zap.Logger, audience string, oau return } + // Single-use claim on the sealed state's SessionID. Applies + // BEFORE the upstream IdP exchange so a replayed /callback + // URL never fans out to the IdP. Same nil-store / + // ErrAlreadyClaimed / fail-closed-on-error policy as the + // /token code claim. Empty SessionID is the in-flight-rollout + // fallback (older binary sealed the session before this + // field existed). + if cbCfg.ReplayStore != nil && session.SessionID != "" { + remaining := max(time.Until(session.ExpiresAt), time.Second) + key := replay.NamespacedKey("callback_state", session.SessionID) + if err := cbCfg.ReplayStore.ClaimOnce(r.Context(), key, remaining); err != nil { + if errors.Is(err, replay.ErrAlreadyClaimed) { + metrics.ReplayDetected.WithLabelValues("callback_state").Inc() + logger.Warn("callback_state_replay", + zap.String("session_id", session.SessionID), + zap.String("client_id", session.ClientID), + ) + writeOAuthError(w, http.StatusBadRequest, "invalid_request", "callback state already used", "callback_state_replay") + return + } + logger.Error("replay_store_error", zap.String("op", "claim_callback_state"), zap.Error(err)) + metrics.AccessDenied.WithLabelValues("replay_store_unavailable").Inc() + writeOAuthError(w, http.StatusServiceUnavailable, "server_error", "replay store unavailable", "replay_store_unavailable") + return + } + } + // Explicit timeout for upstream OIDC token exchange exchangeCtx, cancel := context.WithTimeout(r.Context(), oidcExchangeTTL) defer cancel() diff --git a/handlers/consent.go b/handlers/consent.go index 0ecea95..14e6720 100644 --- a/handlers/consent.go +++ b/handlers/consent.go @@ -10,7 +10,9 @@ import ( "time" "github.com/babs/mcp-auth-proxy/metrics" + "github.com/babs/mcp-auth-proxy/replay" "github.com/babs/mcp-auth-proxy/token" + "github.com/google/uuid" "go.uber.org/zap" "golang.org/x/oauth2" ) @@ -183,10 +185,14 @@ func redirectHost(redirectURI string) string { } // ConsentConfig holds optional dependencies for the consent -// approval handler. Mirrors the shape of CallbackConfig — no -// runtime tunables today, kept as a struct so future fields don't -// re-break the constructor signature. -type ConsentConfig struct{} +// approval handler. Mirrors the shape of CallbackConfig. +type ConsentConfig struct { + // ReplayStore, when non-nil, enforces single-use semantics on the + // consent token's JTI: a captured consent_token can be POSTed at + // most once. nil = stateless fallback (configured opt-out — the + // token is still audience- and TTL-bound). + ReplayStore replay.Store +} // Consent handles POST /consent (consent-page approval submit). // @@ -206,17 +212,14 @@ type ConsentConfig struct{} // purpose-bound, 5-min TTL). A POST without a valid consent_token // is rejected. // -// Replay note: the consent token is NOT integrated with the replay -// store. A given token can be redeemed multiple times within its -// 5-min window. Each Approve mints a fresh sealedSession (different -// nonce / verifier), and the IdP login still has to complete; each -// Deny just emits another error=access_denied to the registered -// redirect_uri. Neither path produces an effect the original user -// couldn't reproduce by clicking again. Wiring single-use through -// the replay store would be belt-and-braces against an attacker -// who exfiltrated the consent token (a stronger compromise than -// this defense addresses) — accepted as a deliberate trade-off. -func Consent(tm *token.Manager, logger *zap.Logger, baseURL string, oauth2Cfg *oauth2.Config, _ ConsentConfig) http.HandlerFunc { +// Replay defense: when ConsentConfig.ReplayStore is wired, the +// consent token's JTI is claimed single-use before either branch +// runs. Each GET /authorize render mints a fresh JTI so the +// back-button case still works (a re-render gets a new claim +// slot); a stolen consent_token can be POSTed at most once. Empty +// JTI (token sealed by an older binary still in flight during +// rollout) falls through to the prior stateless behavior. +func Consent(tm *token.Manager, logger *zap.Logger, baseURL string, oauth2Cfg *oauth2.Config, cfg ConsentConfig) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { r.Body = http.MaxBytesReader(w, r.Body, maxBodySize) @@ -278,6 +281,39 @@ func Consent(tm *token.Manager, logger *zap.Logger, baseURL string, oauth2Cfg *o return } + // Single-use claim on the consent JTI — applies BEFORE the + // approve/deny branch so a captured token cannot be replayed + // for either decision. Mirrors the /token authorization-code + // claim policy: nil store = stateless fallback (configured + // opt-out); ErrAlreadyClaimed = 400 + replay metric; other + // backend errors = fail-closed 503 so we never proceed + // against an uncertain replay-state result. Empty JTI is the + // in-flight-rollout fallback (older binary sealed the token + // before this field existed). + if cfg.ReplayStore != nil && consent.JTI != "" { + remaining := max(time.Until(consent.ExpiresAt), time.Second) + key := replay.NamespacedKey("consent", consent.JTI) + if err := cfg.ReplayStore.ClaimOnce(r.Context(), key, remaining); err != nil { + if errors.Is(err, replay.ErrAlreadyClaimed) { + metrics.ReplayDetected.WithLabelValues("consent").Inc() + logger.Warn("consent_token_replay", + zap.String("jti", consent.JTI), + zap.String("client_id", consent.ClientID), + ) + writeOAuthError(w, http.StatusBadRequest, "invalid_request", "consent token already used", "consent_replay") + return + } + // Reuse the same access_denied{replay_store_unavailable} + // counter as /token rather than a per-site counter — a + // Redis outage hits every claim site at once and a single + // alerting rule on this counter covers all of them. + logger.Error("replay_store_error", zap.String("op", "claim_consent"), zap.Error(err)) + metrics.AccessDenied.WithLabelValues("replay_store_unavailable").Inc() + writeOAuthError(w, http.StatusServiceUnavailable, "server_error", "replay store unavailable", "replay_store_unavailable") + return + } + } + if action == "deny" { // Counted on a dedicated funnel counter rather than // AccessDenied: clicking Deny is a normal expected user @@ -335,6 +371,7 @@ func Consent(tm *token.Manager, logger *zap.Logger, baseURL string, oauth2Cfg *o PKCEVerifier: upstreamVerifier, SvrVerifier: svrVerifier, SvrChallenge: svrChallenge, + SessionID: uuid.New().String(), Typ: token.PurposeSession, Audience: baseURL, Resource: consent.Resource, diff --git a/handlers/helpers.go b/handlers/helpers.go index 7479310..c917d14 100644 --- a/handlers/helpers.go +++ b/handlers/helpers.go @@ -60,8 +60,14 @@ type sealedSession struct { PKCEVerifier string `json:"pv"` SvrVerifier string `json:"sv,omitempty"` SvrChallenge string `json:"sch,omitempty"` - Typ string `json:"typ"` - Audience string `json:"aud"` + // SessionID is a per-session unique id used as the replay-store + // claim key at /callback so a captured `state` cannot be replayed + // against the IdP token endpoint (audit + outbound-fan-out + // defense). Empty on sessions sealed before this field existed — + // /callback then falls through to the prior stateless behavior. + SessionID string `json:"sid,omitempty"` + Typ string `json:"typ"` + Audience string `json:"aud"` // Resource is the canonical RFC 8707 resource indicator the // downstream tokens will be bound to. Captured at /authorize so // the binding is fixed BEFORE the upstream IdP round trip — a @@ -125,6 +131,13 @@ type sealedCode struct { // usefulness). AAD purpose binding keeps it from being opened as // any other sealed type. type sealedConsent struct { + // JTI is a per-render unique id used as the single-use claim key + // at POST /consent. Each GET /authorize render mints a fresh JTI + // (back-button-safe: a re-render gets a new claim slot, the prior + // one is dead once redeemed). Empty on tokens sealed before this + // field existed — /consent then falls through to the prior + // stateless behavior for that token. + JTI string `json:"jti,omitempty"` ClientID string `json:"cid"` ClientName string `json:"cn,omitempty"` RedirectURI string `json:"ru"` diff --git a/handlers/single_use_replay_test.go b/handlers/single_use_replay_test.go new file mode 100644 index 0000000..e7fb11d --- /dev/null +++ b/handlers/single_use_replay_test.go @@ -0,0 +1,495 @@ +package handlers + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "net/url" + "strings" + "testing" + "time" + + "github.com/babs/mcp-auth-proxy/metrics" + "github.com/babs/mcp-auth-proxy/replay" + "github.com/babs/mcp-auth-proxy/token" + "github.com/coreos/go-oidc/v3/oidc" + "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus/testutil" + "go.uber.org/zap" + "golang.org/x/oauth2" +) + +// erroringStore is a replay.Store stub for driving the +// replay-store-error fail-closed branch on /consent and /callback. +// Each method has its own optional override so future tests can +// fail one operation while leaving the others healthy. A nil +// override falls back to the shared `err` field. `err` itself nil +// makes the method a no-op success. +type erroringStore struct { + err error + claimOnceErr error + markErr error + existsErr error + claimOrCheckFamErr error +} + +func (e *erroringStore) effective(specific error) error { + if specific != nil { + return specific + } + return e.err +} + +func (e *erroringStore) ClaimOnce(_ context.Context, _ string, _ time.Duration) error { + return e.effective(e.claimOnceErr) +} +func (e *erroringStore) Mark(_ context.Context, _ string, _ time.Duration) error { + return e.effective(e.markErr) +} +func (e *erroringStore) Exists(_ context.Context, _ string) (bool, error) { + if err := e.effective(e.existsErr); err != nil { + return false, err + } + return false, nil +} +func (e *erroringStore) ClaimOrCheckFamily(_ context.Context, _, _ string, _, _ time.Duration) (bool, bool, error) { + if err := e.effective(e.claimOrCheckFamErr); err != nil { + return false, false, err + } + return false, false, nil +} +func (e *erroringStore) Close() error { return nil } + +// mintConsentTokenWithJTI seals a consent blob carrying the given +// JTI. Mirrors mintConsentToken from consent_test.go but exposes the +// claim key so tests can drive the replay branch deterministically. +func mintConsentTokenWithJTI(t *testing.T, tm *token.Manager, jti string) string { + t.Helper() + consent := sealedConsent{ + JTI: jti, + ClientID: uuid.New().String(), + ClientName: "Friendly App", + RedirectURI: "https://app.example.com/cb", + OriginalState: "client-state", + CodeChallenge: pkceChallenge("dBjftJeZ4CVP-mB92K27uhbUJU1p1r_wW1gFWFOEjXk"), + Resource: testBaseURL + "/mcp", + Typ: token.PurposeConsent, + Audience: testBaseURL, + ExpiresAt: time.Now().Add(consentTTL), + } + tok, err := tm.SealJSON(consent, token.PurposeConsent) + if err != nil { + t.Fatalf("SealJSON: %v", err) + } + return tok +} + +// TestConsent_SingleUse_ReplayDetected pins T1.2: when a replay store +// is wired and the consent token carries a JTI, the second POST of +// the same token is rejected with 400 invalid_request + +// error_code=consent_replay, and mcp_auth_replay_detected_total{kind="consent"} +// increments by exactly one. +func TestConsent_SingleUse_ReplayDetected(t *testing.T) { + tm := newTestTokenManager(t) + store := replay.NewMemoryStore() + defer store.Close() + + tok := mintConsentTokenWithJTI(t, tm, uuid.NewString()) + + before := testutil.ToFloat64(metrics.ReplayDetected.WithLabelValues("consent")) + approvedBefore := testutil.ToFloat64(metrics.ConsentDecisions.WithLabelValues("approved")) + + post := func() *httptest.ResponseRecorder { + form := url.Values{"consent_token": {tok}, "action": {"approve"}} + req := httptest.NewRequestWithContext(context.Background(), http.MethodPost, "/consent", strings.NewReader(form.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + rr := httptest.NewRecorder() + Consent(tm, zap.NewNop(), testBaseURL, testOAuth2Config(), ConsentConfig{ReplayStore: store})(rr, req) + return rr + } + + first := post() + if first.Code != http.StatusFound { + t.Fatalf("first POST: want 302, got %d: %s", first.Code, first.Body.String()) + } + + second := post() + if second.Code != http.StatusBadRequest { + t.Fatalf("replayed POST: want 400, got %d: %s", second.Code, second.Body.String()) + } + var oauthErr OAuthError + if err := json.NewDecoder(second.Body).Decode(&oauthErr); err != nil { + t.Fatalf("decode: %v", err) + } + if oauthErr.Error != "invalid_request" { + t.Errorf("error = %q, want invalid_request", oauthErr.Error) + } + if oauthErr.ErrorCode != "consent_replay" { + t.Errorf("error_code = %q, want consent_replay", oauthErr.ErrorCode) + } + + if got := testutil.ToFloat64(metrics.ReplayDetected.WithLabelValues("consent")); got-before != 1 { + t.Errorf("ReplayDetected{kind=consent} delta = %v, want 1", got-before) + } + // First POST took the approve branch; only that one should + // have ticked the approved counter. Pins that the replay + // rejection sits BEFORE the decision counter so a flooded + // replay does not inflate consent-funnel metrics. + if got := testutil.ToFloat64(metrics.ConsentDecisions.WithLabelValues("approved")); got-approvedBefore != 1 { + t.Errorf("ConsentDecisions{decision=approved} delta = %v, want 1 (replay must not double-count)", got-approvedBefore) + } +} + +// TestConsent_SingleUse_ClaimsBeforeDeny pins that the JTI is +// claimed BEFORE the deny branch — a captured token cannot be +// replayed for either decision. +func TestConsent_SingleUse_ClaimsBeforeDeny(t *testing.T) { + tm := newTestTokenManager(t) + store := replay.NewMemoryStore() + defer store.Close() + + tok := mintConsentTokenWithJTI(t, tm, uuid.NewString()) + + post := func(action string) *httptest.ResponseRecorder { + form := url.Values{"consent_token": {tok}, "action": {action}} + req := httptest.NewRequestWithContext(context.Background(), http.MethodPost, "/consent", strings.NewReader(form.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + rr := httptest.NewRecorder() + Consent(tm, zap.NewNop(), testBaseURL, testOAuth2Config(), ConsentConfig{ReplayStore: store})(rr, req) + return rr + } + + if got := post("deny").Code; got != http.StatusFound { + t.Fatalf("first deny: want 302, got %d", got) + } + // Replayed deny — token already claimed; must be rejected. + second := post("deny") + if second.Code != http.StatusBadRequest { + t.Fatalf("replayed deny: want 400, got %d: %s", second.Code, second.Body.String()) + } +} + +// TestConsent_SingleUse_NilStoreFallback pins the configured-opt-out +// path: when ReplayStore is nil the handler falls through to the +// prior stateless behavior (token replayable within TTL). Mirrors +// the /token nil-replayStore semantics. +func TestConsent_SingleUse_NilStoreFallback(t *testing.T) { + tm := newTestTokenManager(t) + tok := mintConsentTokenWithJTI(t, tm, uuid.NewString()) + + post := func() *httptest.ResponseRecorder { + form := url.Values{"consent_token": {tok}, "action": {"approve"}} + req := httptest.NewRequestWithContext(context.Background(), http.MethodPost, "/consent", strings.NewReader(form.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + rr := httptest.NewRecorder() + Consent(tm, zap.NewNop(), testBaseURL, testOAuth2Config(), ConsentConfig{})(rr, req) + return rr + } + if got := post().Code; got != http.StatusFound { + t.Fatalf("first POST: want 302, got %d", got) + } + if got := post().Code; got != http.StatusFound { + t.Fatalf("second POST (nil store): want 302, got %d", got) + } +} + +// TestConsent_SingleUse_LegacyEmptyJTI pins the in-flight-rollout +// fallback: a consent token sealed by an older binary lacks JTI +// and must still be redeemable within its TTL even when ReplayStore +// is wired. Without this back-compat, every in-flight token +// during the deploy window would 503. +func TestConsent_SingleUse_LegacyEmptyJTI(t *testing.T) { + tm := newTestTokenManager(t) + store := replay.NewMemoryStore() + defer store.Close() + + tok := mintConsentTokenWithJTI(t, tm, "") // legacy + + form := url.Values{"consent_token": {tok}, "action": {"approve"}} + req := httptest.NewRequestWithContext(context.Background(), http.MethodPost, "/consent", strings.NewReader(form.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + rr := httptest.NewRecorder() + Consent(tm, zap.NewNop(), testBaseURL, testOAuth2Config(), ConsentConfig{ReplayStore: store})(rr, req) + + if rr.Code != http.StatusFound { + t.Fatalf("legacy token (empty JTI): want 302, got %d: %s", rr.Code, rr.Body.String()) + } +} + +// TestConsent_SingleUse_StoreErrorFailClosed pins the +// fail-closed-on-backend-error policy: the user opted into replay +// defense by configuring a store, so a transient store outage must +// not silently downgrade to stateless behavior. Returns 503 with +// error_code=replay_store_unavailable, mirroring /token. +func TestConsent_SingleUse_StoreErrorFailClosed(t *testing.T) { + tm := newTestTokenManager(t) + store := &erroringStore{err: errors.New("redis blew up")} + + tok := mintConsentTokenWithJTI(t, tm, uuid.NewString()) + + before := testutil.ToFloat64(metrics.AccessDenied.WithLabelValues("replay_store_unavailable")) + + form := url.Values{"consent_token": {tok}, "action": {"approve"}} + req := httptest.NewRequestWithContext(context.Background(), http.MethodPost, "/consent", strings.NewReader(form.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + rr := httptest.NewRecorder() + Consent(tm, zap.NewNop(), testBaseURL, testOAuth2Config(), ConsentConfig{ReplayStore: store})(rr, req) + + if rr.Code != http.StatusServiceUnavailable { + t.Fatalf("want 503, got %d: %s", rr.Code, rr.Body.String()) + } + var oauthErr OAuthError + if err := json.NewDecoder(rr.Body).Decode(&oauthErr); err != nil { + t.Fatalf("decode: %v", err) + } + if oauthErr.ErrorCode != "replay_store_unavailable" { + t.Errorf("error_code = %q, want replay_store_unavailable", oauthErr.ErrorCode) + } + if got := testutil.ToFloat64(metrics.AccessDenied.WithLabelValues("replay_store_unavailable")); got-before != 1 { + t.Errorf("AccessDenied{reason=replay_store_unavailable} delta = %v, want 1", got-before) + } +} + +// --- T2.1: callback sealed-state ClaimOnce --- + +// mintCallbackSession seals a session with the given SessionID for +// driving /callback through to the upstream-exchange step. The +// upstream exchange is stubbed via verifyFunc panicking — the +// replay rejection MUST fire BEFORE that point. +func mintCallbackSession(t *testing.T, tm *token.Manager, sid string) string { + t.Helper() + s := sealedSession{ + ClientID: uuid.New().String(), + RedirectURI: "https://app.example.com/cb", + OriginalState: "client-state", + Nonce: "n", + PKCEVerifier: oauth2.GenerateVerifier(), + SessionID: sid, + Typ: token.PurposeSession, + Audience: testBaseURL, + ExpiresAt: time.Now().Add(5 * time.Minute), + } + state, err := tm.SealJSON(s, token.PurposeSession) + if err != nil { + t.Fatalf("seal session: %v", err) + } + return state +} + +// TestCallback_SingleUse_ReplayDetected pins T2.1: when a replay +// store is wired and the sealed state carries a SessionID, a second +// /callback hit on the same state is rejected BEFORE the upstream +// IdP exchange (no fan-out, no audit-log noise). Returns 400 +// invalid_request + error_code=callback_state_replay, +// mcp_auth_replay_detected_total{kind="callback_state"} increments. +func TestCallback_SingleUse_ReplayDetected(t *testing.T) { + tm := newTestTokenManager(t) + store := replay.NewMemoryStore() + defer store.Close() + oauth2Cfg := testOAuth2Config() + exchanged := 0 + verifyFunc := func(_ context.Context, _ string) (*oidc.IDToken, error) { + // Replay branch must short-circuit before this is called on + // the second hit. The first hit will reach Exchange and fail + // (no real IdP) — that's fine, the replay test is about + // hit #2. + exchanged++ + return nil, errors.New("not used") + } + + state := mintCallbackSession(t, tm, uuid.NewString()) + + before := testutil.ToFloat64(metrics.ReplayDetected.WithLabelValues("callback_state")) + + hit := func() *httptest.ResponseRecorder { + req := httptest.NewRequest(http.MethodGet, "/callback?code=fake&state="+url.QueryEscape(state), nil) + rr := httptest.NewRecorder() + CallbackWithVerifyFunc(tm, zap.NewNop(), testBaseURL, oauth2Cfg, verifyFunc, CallbackConfig{ReplayStore: store})(rr, req) + return rr + } + + first := hit() + // First hit reaches Exchange (which fails: no real IdP) — the + // replay claim already happened. This is what we want: the + // claim must be recorded even when the downstream exchange + // fails, otherwise a flaky upstream lets a stolen state be + // retried. + _ = first + + second := hit() + if second.Code != http.StatusBadRequest { + t.Fatalf("replayed /callback: want 400, got %d: %s", second.Code, second.Body.String()) + } + var oauthErr OAuthError + if err := json.NewDecoder(second.Body).Decode(&oauthErr); err != nil { + t.Fatalf("decode: %v", err) + } + if oauthErr.ErrorCode != "callback_state_replay" { + t.Errorf("error_code = %q, want callback_state_replay", oauthErr.ErrorCode) + } + + if got := testutil.ToFloat64(metrics.ReplayDetected.WithLabelValues("callback_state")); got-before != 1 { + t.Errorf("ReplayDetected{kind=callback_state} delta = %v, want 1", got-before) + } +} + +// TestCallback_SingleUse_LegacyEmptySID pins the in-flight-rollout +// fallback for /callback: a state sealed by an older binary has no +// SessionID and must not 503 the user out of their auth flow. +// +// Asserts negatively — the replay-rejection and store-error error +// codes must NOT appear. The downstream IdP exchange will fail +// (no real IdP) producing some other error; that's expected and +// not what this test pins. +func TestCallback_SingleUse_LegacyEmptySID(t *testing.T) { + tm := newTestTokenManager(t) + store := replay.NewMemoryStore() + defer store.Close() + oauth2Cfg := testOAuth2Config() + verifyFunc := func(_ context.Context, _ string) (*oidc.IDToken, error) { + return nil, errors.New("not used") + } + + state := mintCallbackSession(t, tm, "") // legacy + + req := httptest.NewRequest(http.MethodGet, "/callback?code=fake&state="+url.QueryEscape(state), nil) + rr := httptest.NewRecorder() + CallbackWithVerifyFunc(tm, zap.NewNop(), testBaseURL, oauth2Cfg, verifyFunc, CallbackConfig{ReplayStore: store})(rr, req) + + var oauthErr OAuthError + _ = json.NewDecoder(rr.Body).Decode(&oauthErr) // body shape is best-effort here + if oauthErr.ErrorCode == "callback_state_replay" { + t.Fatalf("legacy state (empty SessionID) hit the replay-rejection branch") + } + if oauthErr.ErrorCode == "replay_store_unavailable" { + t.Fatalf("legacy state (empty SessionID) hit the store-error branch") + } +} + +// TestCallback_SingleUse_StoreErrorFailClosed pins the +// fail-closed-on-store-error policy on /callback. Same shape as +// the consent test. +func TestCallback_SingleUse_StoreErrorFailClosed(t *testing.T) { + tm := newTestTokenManager(t) + store := &erroringStore{err: errors.New("redis blew up")} + oauth2Cfg := testOAuth2Config() + verifyFunc := func(_ context.Context, _ string) (*oidc.IDToken, error) { + t.Fatalf("verifyFunc must not run after replay-store error") + return nil, nil + } + + state := mintCallbackSession(t, tm, uuid.NewString()) + + before := testutil.ToFloat64(metrics.AccessDenied.WithLabelValues("replay_store_unavailable")) + + req := httptest.NewRequest(http.MethodGet, "/callback?code=fake&state="+url.QueryEscape(state), nil) + rr := httptest.NewRecorder() + CallbackWithVerifyFunc(tm, zap.NewNop(), testBaseURL, oauth2Cfg, verifyFunc, CallbackConfig{ReplayStore: store})(rr, req) + + if rr.Code != http.StatusServiceUnavailable { + t.Fatalf("want 503, got %d: %s", rr.Code, rr.Body.String()) + } + var oauthErr OAuthError + if err := json.NewDecoder(rr.Body).Decode(&oauthErr); err != nil { + t.Fatalf("decode: %v", err) + } + if oauthErr.ErrorCode != "replay_store_unavailable" { + t.Errorf("error_code = %q, want replay_store_unavailable", oauthErr.ErrorCode) + } + if got := testutil.ToFloat64(metrics.AccessDenied.WithLabelValues("replay_store_unavailable")); got-before != 1 { + t.Errorf("AccessDenied{reason=replay_store_unavailable} delta = %v, want 1", got-before) + } +} + +// TestAuthorize_RenderConsent_PopulatesJTI pins that GET /authorize +// (consent-fork enabled) seals a non-empty JTI into the consent +// blob. Without this, the per-render single-use defense degrades +// silently to the legacy back-compat fallback. +func TestAuthorize_RenderConsent_PopulatesJTI(t *testing.T) { + tm := newTestTokenManager(t) + encClientID, _ := registerClientNamed(t, tm, []string{"https://app.example.com/callback"}, "App") + + codeVerifier := "dBjftJeZ4CVP-mB92K27uhbUJU1p1r_wW1gFWFOEjXk" + q := url.Values{ + "response_type": {"code"}, + "client_id": {encClientID}, + "redirect_uri": {"https://app.example.com/callback"}, + "code_challenge": {pkceChallenge(codeVerifier)}, + "code_challenge_method": {"S256"}, + "state": {"client-state"}, + } + req := httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/authorize?"+q.Encode(), nil) + rr := httptest.NewRecorder() + authorizeConsentEnabled(tm)(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("want 200, got %d", rr.Code) + } + body := rr.Body.String() + // Extract the consent_token from the form. The page is small + // enough that a substring search is cheaper than an HTML parse. + const marker = `name="consent_token" value="` + idx := strings.Index(body, marker) + if idx < 0 { + t.Fatalf("consent_token input not found in body") + } + rest := body[idx+len(marker):] + end := strings.Index(rest, `"`) + if end < 0 { + t.Fatalf("consent_token close-quote not found") + } + tok := rest[:end] + + var consent sealedConsent + if err := tm.OpenJSON(tok, &consent, token.PurposeConsent); err != nil { + t.Fatalf("OpenJSON consent: %v", err) + } + if consent.JTI == "" { + t.Errorf("sealedConsent.JTI is empty — per-render claim slot not populated") + } +} + +// TestAuthorize_SilentRedirect_PopulatesSessionID pins that the +// inline /authorize path (RenderConsentPage=false) seals a non-empty +// SessionID into the session blob. /callback's per-state replay +// claim depends on this. +func TestAuthorize_SilentRedirect_PopulatesSessionID(t *testing.T) { + tm := newTestTokenManager(t) + encClientID, _ := registerClientNamed(t, tm, []string{"https://app.example.com/callback"}, "App") + + codeVerifier := "dBjftJeZ4CVP-mB92K27uhbUJU1p1r_wW1gFWFOEjXk" + q := url.Values{ + "response_type": {"code"}, + "client_id": {encClientID}, + "redirect_uri": {"https://app.example.com/callback"}, + "code_challenge": {pkceChallenge(codeVerifier)}, + "code_challenge_method": {"S256"}, + "state": {"client-state"}, + } + req := httptest.NewRequestWithContext(context.Background(), http.MethodGet, "/authorize?"+q.Encode(), nil) + rr := httptest.NewRecorder() + // Silent path — consent fork off. + Authorize(tm, zap.NewNop(), testBaseURL, testOAuth2Config(), AuthorizeConfig{ + PKCERequired: true, + ResourceURIs: []string{testBaseURL + "/mcp"}, + CanonicalResource: testBaseURL + "/mcp", + })(rr, req) + + if rr.Code != http.StatusFound { + t.Fatalf("want 302, got %d: %s", rr.Code, rr.Body.String()) + } + loc, err := url.Parse(rr.Header().Get("Location")) + if err != nil { + t.Fatalf("parse Location: %v", err) + } + idpState := loc.Query().Get("state") + var sess sealedSession + if err := tm.OpenJSON(idpState, &sess, token.PurposeSession); err != nil { + t.Fatalf("OpenJSON session: %v", err) + } + if sess.SessionID == "" { + t.Errorf("sealedSession.SessionID is empty — per-session claim slot not populated") + } +} diff --git a/handlers/token.go b/handlers/token.go index 2cc0b0c..20c66d2 100644 --- a/handlers/token.go +++ b/handlers/token.go @@ -251,7 +251,7 @@ func handleAuthorizationCode(w http.ResponseWriter, r *http.Request, tm *token.M // `replay_store_unavailable` reason on the same denial counter // the runbook tells operators to alert on (a Redis outage hits // both the code and refresh paths under load). - logger.Error("replay_store_error", zap.Error(err)) + logger.Error("replay_store_error", zap.String("op", "claim_authz_code"), zap.Error(err)) metrics.AccessDenied.WithLabelValues("replay_store_unavailable").Inc() writeOAuthError(w, http.StatusServiceUnavailable, "server_error", "replay store unavailable", "replay_store_unavailable") return @@ -415,7 +415,7 @@ func handleRefreshToken(w http.ResponseWriter, r *http.Request, tm *token.Manage // path a client cancel could cut short. revoked, alreadyClaimed, err := replayStore.ClaimOrCheckFamily(r.Context(), familyKey, claimKey, claimTTL, refreshTokenTTL) if err != nil { - logger.Error("replay_store_error", zap.Error(err)) + logger.Error("replay_store_error", zap.String("op", "claim_refresh_family"), zap.Error(err)) metrics.AccessDenied.WithLabelValues("replay_store_unavailable").Inc() writeOAuthError(w, http.StatusServiceUnavailable, "server_error", "replay store unavailable", "replay_store_unavailable") return diff --git a/main.go b/main.go index 8b85009..6914a7a 100644 --- a/main.go +++ b/main.go @@ -351,10 +351,13 @@ func main() { // /consent POST, and the two consume from independent buckets // so a human who clicks Approve quickly after Authorize doesn't // halve the per-IP budget for either path. - r.With(consentLimit).Post("/consent", handlers.Consent(tm, logger, cfg.ProxyBaseURL, oauth2Cfg, handlers.ConsentConfig{})) + r.With(consentLimit).Post("/consent", handlers.Consent(tm, logger, cfg.ProxyBaseURL, oauth2Cfg, handlers.ConsentConfig{ + ReplayStore: replayStore, + })) r.With(callbackLimit).Get("/callback", handlers.Callback(tm, logger, cfg.ProxyBaseURL, oauth2Cfg, idTokenVerifier, handlers.CallbackConfig{ AllowedGroups: cfg.AllowedGroups, GroupsClaim: cfg.GroupsClaim, + ReplayStore: replayStore, })) r.With(tokenLimit).Post("/token", handlers.Token(tm, logger, cfg.ProxyBaseURL, cfg.RevokeBefore, replayStore, cfg.ProxyBaseURL+cfg.UpstreamMCPMountPath)) diff --git a/metrics/metrics.go b/metrics/metrics.go index b98796e..35ea43a 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -81,10 +81,11 @@ var ( }, []string{"reason"}) // ReplayDetected counts replay attempts caught by the replay store - // (requires REDIS_URL). Labelled by kind (code / refresh). + // (requires REDIS_URL). Labelled by kind: code, refresh, consent, + // callback_state. ReplayDetected = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "mcp_auth_replay_detected_total", - Help: "Replay attempts detected by the replay store, by kind.", + Help: "Replay attempts detected by the replay store, by kind (code/refresh/consent/callback_state).", }, []string{"kind"}) // ClientsRegistered counts dynamic client registrations (RFC 7591). diff --git a/specs.md b/specs.md index 4f8bc18..30725b2 100644 --- a/specs.md +++ b/specs.md @@ -334,7 +334,7 @@ Always registered. Driven by the consent form rendered at step 3 of `/authorize` 3. **`action=approve`:** mint upstream OIDC `nonce` (random 32 hex) + upstream PKCE verifier, regenerate the H6 server-side PKCE pair if the consent blob recorded one, seal a `sealedSession` (same shape as step 4 of `/authorize`), redirect 302 to the IdP. Increment `mcp_auth_consent_decisions_total{decision="approved"}`. 4. **Anything else:** 400 `invalid_request`. -**Why no replay-store integration on `consent_token`:** within the 5-min TTL the token can be redeemed multiple times. Each Approve mints a *new* `sealedSession` (different `nonce` / verifier), and the IdP login still has to complete; each Deny just emits another `error=access_denied` to the registered `redirect_uri`. Neither path produces an effect the original user couldn't produce themselves by clicking again. Adding a Redis claim would be belt-and-braces against an attacker who exfiltrated the consent_token (a stronger compromise than this defense addresses) — accepted as a deliberate trade-off rather than a default. +**Single-use replay defense on `consent_token`:** when a replay store is wired (`REDIS_URL`), the consent token's `jti` is claimed atomically before either branch runs — a captured `consent_token` can be redeemed at most once for either Approve or Deny. Each GET `/authorize` render mints a fresh `jti`, so the back-button case still works (a re-render gets a new claim slot, the prior one is dead once redeemed). On replay: 400 `invalid_request` with `error_code=consent_replay`, `mcp_auth_replay_detected_total{kind="consent"}` increments. With no replay store wired (configured opt-out) the handler falls back to the prior stateless behavior (token unique, audience-bound, TTL-bounded, but replayable within the 5-min window). --- @@ -345,13 +345,14 @@ Always registered. Driven by the consent form rendered at step 3 of `/authorize` **Behavior:** 1. If the IdP returns an `error` (RFC 6749 §4.1.2.1), propagate it to the client 2. Decrypt the `state` → retrieve the session, verify not expired -3. Exchange the code with the IdP (POST token endpoint, 10s timeout) to obtain `id_token` + `access_token` -4. Validate the `id_token` via go-oidc (JWKS signature auto-discovery, issuer, audience) -5. Extract claims: `sub`, `email`, `email_verified`, `name` -6. If `email_verified` is present and false, reject with 403 `access_denied` + `error_code: email_not_verified` (absent claim is accepted — not all IdPs emit it) -7. Extract groups from the configured claim (`GROUPS_CLAIM`, default `groups`) -8. If `ALLOWED_GROUPS` is configured, verify the user belongs to at least one allowed group → 403 otherwise -9. Encrypt an internal authorization code with AES-GCM (60s TTL): +3. **Single-use claim** on the session's `sid` when a replay store is wired — a replayed `/callback` is rejected with 400 `invalid_request` + `error_code=callback_state_replay` BEFORE the upstream IdP exchange runs (no fan-out, no audit-log noise). `mcp_auth_replay_detected_total{kind="callback_state"}` increments. Empty `sid` (legacy session in flight during rollout) falls through to stateless behavior. Store-error path fail-closes to 503 + `replay_store_unavailable`. +4. Exchange the code with the IdP (POST token endpoint, 10s timeout) to obtain `id_token` + `access_token` +5. Validate the `id_token` via go-oidc (JWKS signature auto-discovery, issuer, audience) +6. Extract claims: `sub`, `email`, `email_verified`, `name` +7. If `email_verified` is present and false, reject with 403 `access_denied` + `error_code: email_not_verified` (absent claim is accepted — not all IdPs emit it) +8. Extract groups from the configured claim (`GROUPS_CLAIM`, default `groups`) +9. If `ALLOWED_GROUPS` is configured, verify the user belongs to at least one allowed group → 403 otherwise +10. Encrypt an internal authorization code with AES-GCM (60s TTL): ``` { token_id (UUID, used for single-use replay check), @@ -361,7 +362,7 @@ Always registered. Driven by the consent form rendered at step 3 of `/authorize` expires_at } ``` -10. Redirect 302 to `redirect_uri?code={encrypted_code}&state={original_state}&iss={PROXY_BASE_URL}` +11. Redirect 302 to `redirect_uri?code={encrypted_code}&state={original_state}&iss={PROXY_BASE_URL}` - Built via `url.Parse` + merged query params (safe even if redirect_uri already contains query params) - `iss` is emitted per RFC 9700 §2.1.4 (mix-up defense): a client that talks to multiple ASes can verify the response came from the AS it actually sent the request to. Value matches the `issuer` field in the RFC 8414 metadata document.