Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions docs/arch/13-vmcp-scalability.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,30 @@ line 177). This means:
| Inactivity beyond TTL | Redis TTL expiry (automatic, no application-side action needed) |
| Pod-local cache eviction (LRU) | `onEvict` callback closes backend connections only; the Redis metadata key is **not** deleted and expires via TTL |

### Identity-binding storage and Redis access control

Each vMCP session carries an identity binding stored in session metadata under the
key `vmcp.identity.binding`. The canonical format is defined in
`pkg/vmcp/session/binding/binding.go`: a NUL-separated `iss + "\x00" + sub` for
authenticated sessions, and the literal string `"unauthenticated"` for sessions
without an auth identity.

The binding is stored as **plaintext** in the session store (Redis/Valkey). It is
not a credential — it identifies but does not authenticate a principal — but it is
personally-identifying information (a combination of issuer URL and user subject).

Operators are responsible for access-controlling the Redis/Valkey instance
equivalently to any other identity store. Concretely: enable Redis ACLs (Redis 6+)
or `requirepass`, restrict network reach with a Kubernetes `NetworkPolicy`, and
avoid sharing the cluster with untrusted workloads.

The session store prior to issue #5306 held an HMAC of the bearer token rather than
the raw `(iss, sub)` pair. That scheme reduced the value of a Redis dump at the cost
of breaking on every legitimate OAuth token refresh. The current scheme accepts
plaintext PII at rest as the price of correctness; operators who require additional
protection against a Redis compromise must layer Redis-side access controls as
described above.

## File descriptor limits

Each open backend connection consumes one file descriptor on the vMCP pod. A
Expand Down
68 changes: 19 additions & 49 deletions pkg/vmcp/cli/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,15 @@ func Serve(ctx context.Context, cfg ServeConfig) error {

slog.Info(fmt.Sprintf("Setting up incoming authentication (type: %s)", vmcpCfg.IncomingAuth.Type))

if vmcpCfg.IncomingAuth.Type == config.IncomingAuthTypeAnonymous {
slog.Warn(
"vMCP is configured with anonymous incoming auth; all anonymous sessions share a single sentinel binding, "+
"so possession of a session ID is sufficient to act as that session from any source. "+
"Anonymous mode is intended for development only.",
"incoming_auth_type", config.IncomingAuthTypeAnonymous,
)
}

// Configure health monitoring if enabled.
var healthMonitorConfig *health.MonitorConfig
if vmcpCfg.Operational != nil &&
Expand Down Expand Up @@ -336,15 +345,11 @@ func Serve(ctx context.Context, cfg ServeConfig) error {
}

envReader := &env.OSReader{}
sessionFactory, err := createSessionFactory(
envReader.Getenv("VMCP_SESSION_HMAC_SECRET"),
runtime.IsKubernetesRuntimeWithEnv(envReader),
outgoingRegistry,
agg,
)
if err != nil {
return err
if hmacSecret := envReader.Getenv("VMCP_SESSION_HMAC_SECRET"); hmacSecret != "" {
slog.Debug("VMCP_SESSION_HMAC_SECRET is set but no longer used after #5306; ignoring",
"env_var", "VMCP_SESSION_HMAC_SECRET")
}
sessionFactory := createSessionFactory(outgoingRegistry, agg)

// When the optimizer is enabled, its meta-tools must pass through the authz
// response filter so they appear in tools/list.
Expand Down Expand Up @@ -645,51 +650,16 @@ func runDiscovery(
return backends, backendClient, outgoingRegistry, nil
}

// createSessionFactory creates a MultiSessionFactory with HMAC-SHA256 token binding.
// The HMAC secret and Kubernetes detection are passed in as parameters (typically sourced
// from the VMCP_SESSION_HMAC_SECRET environment variable and runtime environment detection
// by the caller).
//
// Behavior:
// - If hmacSecret is non-empty: validates length and creates factory with the secret.
// - If running in Kubernetes without secret: returns error (production safety requirement).
// - Otherwise: logs warning and creates factory with default insecure secret.
// createSessionFactory creates a MultiSessionFactory backed by the provided outgoing
// auth registry and optional aggregator. When agg is non-nil, sessions gain access
// to aggregated backend metadata; pass nil for single-backend deployments.
func createSessionFactory(
hmacSecret string,
isKubernetes bool,
outgoingRegistry vmcpauth.OutgoingAuthRegistry,
agg aggregator.Aggregator,
) (vmcpsession.MultiSessionFactory, error) {
const minRecommendedSecretLen = 32

opts := []vmcpsession.MultiSessionFactoryOption{}
) vmcpsession.MultiSessionFactory {
var opts []vmcpsession.MultiSessionFactoryOption
if agg != nil {
opts = append(opts, vmcpsession.WithAggregator(agg))
}

if hmacSecret != "" {
if secretLen := len(hmacSecret); secretLen < minRecommendedSecretLen {
// G706: Safe - only logging integer length, not the secret itself.
slog.Warn( //nolint:gosec
"HMAC secret is shorter than recommended length - consider using a longer secret",
"actual_length", secretLen,
"recommended_length", minRecommendedSecretLen,
)
}
slog.Info("using provided HMAC secret for session token binding")
opts = append(opts, vmcpsession.WithHMACSecret([]byte(hmacSecret)))
return vmcpsession.NewSessionFactory(outgoingRegistry, opts...), nil
}

// No secret provided — fail fast in Kubernetes (production environment).
if isKubernetes {
return nil, fmt.Errorf(
"an HMAC secret is required when running in Kubernetes (set VMCP_SESSION_HMAC_SECRET). " +
"Generate a secure secret with: openssl rand -base64 32",
)
}

// Development mode: use default insecure secret with warning.
slog.Warn("no HMAC secret provided - using default insecure secret (NOT recommended for production)")
return vmcpsession.NewSessionFactory(outgoingRegistry, opts...), nil
return vmcpsession.NewSessionFactory(outgoingRegistry, opts...)
}
59 changes: 21 additions & 38 deletions pkg/vmcp/cli/serve_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
authserverconfig "github.com/stacklok/toolhive/pkg/authserver"
"github.com/stacklok/toolhive/pkg/groups"
"github.com/stacklok/toolhive/pkg/vmcp"
"github.com/stacklok/toolhive/pkg/vmcp/aggregator"
aggregatormocks "github.com/stacklok/toolhive/pkg/vmcp/aggregator/mocks"
clientmocks "github.com/stacklok/toolhive/pkg/vmcp/client/mocks"
"github.com/stacklok/toolhive/pkg/vmcp/config"
Expand Down Expand Up @@ -165,45 +166,27 @@ func newSessionFactoryMocks(t *testing.T) (*clientmocks.MockOutgoingAuthRegistry
return clientmocks.NewMockOutgoingAuthRegistry(ctrl), aggregatormocks.NewMockAggregator(ctrl)
}

func TestCreateSessionFactory_WithHMACSecret(t *testing.T) {
func TestCreateSessionFactory(t *testing.T) {
t.Parallel()
registry, agg := newSessionFactoryMocks(t)
factory, err := createSessionFactory("a-sufficiently-long-hmac-secret-value-32b", false, registry, agg)
require.NoError(t, err)
require.NotNil(t, factory)
}

func TestCreateSessionFactory_HMACSecretExactly32Bytes(t *testing.T) {
t.Parallel()
registry, agg := newSessionFactoryMocks(t)
factory, err := createSessionFactory("12345678901234567890123456789012", false, registry, agg)
require.NoError(t, err)
require.NotNil(t, factory)
}

func TestCreateSessionFactory_ShortHMACSecret(t *testing.T) {
t.Parallel()
registry, agg := newSessionFactoryMocks(t)
factory, err := createSessionFactory("short", false, registry, agg)
require.NoError(t, err)
require.NotNil(t, factory)
}

func TestCreateSessionFactory_NoSecretNonKubernetes(t *testing.T) {
t.Parallel()
registry, agg := newSessionFactoryMocks(t)
factory, err := createSessionFactory("", false, registry, agg)
require.NoError(t, err)
require.NotNil(t, factory)
}

func TestCreateSessionFactory_NoSecretKubernetes(t *testing.T) {
t.Parallel()
registry, agg := newSessionFactoryMocks(t)
factory, err := createSessionFactory("", true, registry, agg)
require.Error(t, err)
require.ErrorContains(t, err, "an HMAC secret is required when running in Kubernetes")
require.Nil(t, factory)
tests := []struct {
name string
useAgg bool
}{
{name: "with aggregator", useAgg: true},
{name: "without aggregator", useAgg: false},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
registry, agg := newSessionFactoryMocks(t)
var aggArg aggregator.Aggregator
if tc.useAgg {
aggArg = agg
}
factory := createSessionFactory(registry, aggArg)
require.NotNil(t, factory)
})
}
}

// TestRunDiscovery_KubernetesGroupNotFound exercises the Kubernetes-specific branch
Expand Down
4 changes: 2 additions & 2 deletions pkg/vmcp/server/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -506,8 +506,8 @@ func TestIntegration_AuditLogging(t *testing.T) {
// table needed for tool calls and resource reads to be audit-logged correctly.
auditSessionFactory := sessionfactorymocks.NewMockMultiSessionFactory(ctrl)
auditSessionFactory.EXPECT().
MakeSessionWithID(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
DoAndReturn(func(_ context.Context, id string, _ *auth.Identity, _ bool, _ []*vmcp.Backend) (vmcpsession.MultiSession, error) {
MakeSessionWithID(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
DoAndReturn(func(_ context.Context, id string, _ *auth.Identity, _ []*vmcp.Backend) (vmcpsession.MultiSession, error) {
mock := sessionmocks.NewMockMultiSession(ctrl)
mock.EXPECT().ID().Return(id).AnyTimes()
mock.EXPECT().UpdatedAt().Return(time.Time{}).AnyTimes()
Expand Down
17 changes: 8 additions & 9 deletions pkg/vmcp/server/session_management_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ func newNoopMockFactory(t *testing.T) *sessionfactorymocks.MockMultiSessionFacto
t.Helper()
ctrl := gomock.NewController(t)
factory := sessionfactorymocks.NewMockMultiSessionFactory(ctrl)
factory.EXPECT().MakeSessionWithID(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
DoAndReturn(func(_ context.Context, id string, _ *auth.Identity, _ bool, _ []*vmcp.Backend) (vmcpsession.MultiSession, error) {
factory.EXPECT().MakeSessionWithID(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
DoAndReturn(func(_ context.Context, id string, _ *auth.Identity, _ []*vmcp.Backend) (vmcpsession.MultiSession, error) {
mock := sessionmocks.NewMockMultiSession(ctrl)
mock.EXPECT().ID().Return(id).AnyTimes()
mock.EXPECT().UpdatedAt().Return(time.Time{}).AnyTimes()
Expand Down Expand Up @@ -90,13 +90,12 @@ func newMockFactory(t *testing.T, ctrl *gomock.Controller, tools []vmcp.Tool) (*
t.Helper()
state := &mockFactoryState{}
factory := sessionfactorymocks.NewMockMultiSessionFactory(ctrl)
factory.EXPECT().MakeSessionWithID(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
DoAndReturn(func(_ context.Context, id string, identity *auth.Identity, allowAnonymous bool, _ []*vmcp.Backend) (vmcpsession.MultiSession, error) {
factory.EXPECT().MakeSessionWithID(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
DoAndReturn(func(_ context.Context, id string, _ *auth.Identity, _ []*vmcp.Backend) (vmcpsession.MultiSession, error) {
state.makeWithIDCalled.Store(true)
tokenHash := ""
if identity != nil && identity.Token != "" && !allowAnonymous {
tokenHash = "fake-hash-for-testing"
}
// All sessions carry MetadataKeyIdentityBinding so Terminate takes the
// Phase 2 (storage.Delete) path. The sentinel value is sufficient for
// tests that don't validate the binding content.
mock := sessionmocks.NewMockMultiSession(ctrl)
mock.EXPECT().ID().Return(id).AnyTimes()
mock.EXPECT().UpdatedAt().Return(time.Time{}).AnyTimes()
Expand All @@ -105,7 +104,7 @@ func newMockFactory(t *testing.T, ctrl *gomock.Controller, tools []vmcp.Tool) (*
mock.EXPECT().GetData().Return(nil).AnyTimes()
mock.EXPECT().SetData(gomock.Any()).AnyTimes()
mock.EXPECT().GetMetadata().Return(map[string]string{
vmcpsession.MetadataKeyTokenHash: tokenHash,
vmcpsession.MetadataKeyIdentityBinding: "unauthenticated",
}).AnyTimes()
mock.EXPECT().SetMetadata(gomock.Any(), gomock.Any()).AnyTimes()
toolsCopy := make([]vmcp.Tool, len(tools))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@ import (
sessiontypes "github.com/stacklok/toolhive/pkg/vmcp/session/types"
)

// hmacSecret is a fixed 32-byte secret used across all integration tests.
var hmacSecret = []byte("test-hmac-secret-32bytes-exactly")

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -63,9 +60,8 @@ func newSharedRedisStorage(t *testing.T, mr *miniredis.Miniredis) transportsessi
}

// newTestManagerWithSharedStorage creates a Manager backed by the given
// DataStorage, a real session factory with the package-level hmacSecret, and
// an ImmutableRegistry containing backends. Cleanup is registered via
// t.Cleanup.
// DataStorage, a real session factory, and an ImmutableRegistry containing
// backends. Cleanup is registered via t.Cleanup.
func newTestManagerWithSharedStorage(t *testing.T, storage transportsession.DataStorage, backends []*vmcp.Backend) *Manager {
t.Helper()
backendList := make([]vmcp.Backend, len(backends))
Expand All @@ -75,7 +71,6 @@ func newTestManagerWithSharedStorage(t *testing.T, storage transportsession.Data
registry := vmcp.NewImmutableRegistry(backendList)
factory := vmcpsession.NewSessionFactory(
newUnauthenticatedAuthRegistry(t),
vmcpsession.WithHMACSecret(hmacSecret),
)
sm, cleanup, err := New(storage, &FactoryConfig{Base: factory}, registry)
require.NoError(t, err)
Expand Down Expand Up @@ -215,13 +210,26 @@ func TestHorizontalScaling_CrossPodHijackPrevention(t *testing.T) {
storage := newSharedRedisStorage(t, mr)
backend := startMCPBackend(t, "backend-alpha", "echo")

// Both alice and eve need Claims with iss+sub so the identity-binding
// decorator can extract their (iss, sub) pairs (Token is not used for binding
// in the #5306 model; Claims are the canonical source).
identity := &auth.Identity{
PrincipalInfo: auth.PrincipalInfo{Subject: "alice"},
Token: "alice-bearer-token",
PrincipalInfo: auth.PrincipalInfo{
Subject: "alice",
Claims: map[string]any{
"iss": "https://idp.example",
"sub": "alice",
},
},
}
wrongCaller := &auth.Identity{
PrincipalInfo: auth.PrincipalInfo{Subject: "eve"},
Token: "eve-bearer-token",
PrincipalInfo: auth.PrincipalInfo{
Subject: "eve",
Claims: map[string]any{
"iss": "https://idp.example",
"sub": "eve",
},
},
}

// Pod A: create session bound to alice.
Expand Down
22 changes: 8 additions & 14 deletions pkg/vmcp/server/sessionmanager/session_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,18 +304,11 @@ func (sm *Manager) CreateSession(
// Resolve the caller identity (may be nil for anonymous access).
identity, _ := auth.IdentityFromContext(ctx)

// Note: Token hash and salt are computed and stored by the session factory
// (MakeSessionWithID below). Token binding enforcement happens at the session
// level via validateCaller(), which uses HMAC-SHA256 with a per-session salt.

// List all available backends from the registry.
backends := sm.listAllBackends(ctx)

// Build the fully-formed MultiSession using the SDK-assigned session ID.
// Sessions created with an identity are bound to that identity (allowAnonymous=false).
// Sessions created without an identity allow anonymous access (allowAnonymous=true).
allowAnonymous := sessiontypes.ShouldAllowAnonymous(identity)
sess, err := sm.factory.MakeSessionWithID(ctx, sessionID, identity, allowAnonymous, backends)
sess, err := sm.factory.MakeSessionWithID(ctx, sessionID, identity, backends)
if err != nil {
sm.cleanupFailedPlaceholder(sessionID, placeholder)
return nil, fmt.Errorf("Manager.CreateSession: failed to create multi-session: %w", err)
Expand Down Expand Up @@ -482,7 +475,7 @@ func (sm *Manager) Terminate(sessionID string) (isNotAllowed bool, err error) {
return false, fmt.Errorf("Manager.Terminate: failed to load session %q: %w", sessionID, loadErr)
}

if _, isFullSession := metadata[sessiontypes.MetadataKeyTokenHash]; isFullSession {
if _, isFullSession := metadata[sessiontypes.MetadataKeyIdentityBinding]; isFullSession {
// Phase 2 (full MultiSession): delete from storage. The cache entry will be
// evicted lazily on the next Get when checkSession finds the session gone.
if deleteErr := sm.storage.Delete(ctx, sessionID); deleteErr != nil {
Expand Down Expand Up @@ -701,16 +694,17 @@ func (sm *Manager) loadSession(sessionID string) (vmcpsession.MultiSession, erro
}

// Don't restore placeholder sessions (Phase 2 never ran).
// PreventSessionHijacking always writes MetadataKeyTokenHash during Phase 2
// (empty sentinel for anonymous, non-empty hash for authenticated). Its
// absence means Generate() stored this record but CreateSession() never
// completed — treat it as "not found" rather than "corrupted".
// BindSession always writes MetadataKeyIdentityBinding during Phase 2
// (the unauthenticated sentinel for anonymous sessions, a bound (iss, sub)
// binding for authenticated ones). Its absence means Generate() stored
// this record but CreateSession() never completed — treat it as "not
// found" rather than "corrupted".
//
// Note: this is intentionally different from RestoreSession's fail-closed
// check (absent key → error). Here we know a placeholder's empty metadata
// is valid storage state produced by Generate(), so we return the
// SDK-standard ErrSessionNotFound instead of an error.
if _, hashPresent := metadata[sessiontypes.MetadataKeyTokenHash]; !hashPresent {
if _, bindingPresent := metadata[sessiontypes.MetadataKeyIdentityBinding]; !bindingPresent {
return nil, transportsession.ErrSessionNotFound
}

Expand Down
Loading
Loading