@@ -15,6 +15,7 @@ import (
1515 "log/slog"
1616 "net"
1717 "net/http"
18+ "net/http/httptrace"
1819 "net/http/httputil"
1920 "net/url"
2021 "os"
@@ -153,6 +154,17 @@ const (
153154 // It is written on initialize and read in the Rewrite closure to route follow-up requests
154155 // to the same backend pod that handled the session's initialize request.
155156 sessionMetadataBackendURL = "backend_url"
157+
158+ // sessionMetadataInitBody stores the raw JSON-RPC initialize request body.
159+ // It is used to transparently re-initialize a backend session when the pod that
160+ // originally handled initialize has been replaced (new IP or lost in-memory state).
161+ sessionMetadataInitBody = "init_body"
162+
163+ // sessionMetadataBackendSID stores the backend's assigned Mcp-Session-Id when it
164+ // diverges from the client-facing session ID after a transparent re-initialization.
165+ // The Rewrite closure rewrites the outbound Mcp-Session-Id header to this value so
166+ // the backend sees its own session ID while the client keeps its original one.
167+ sessionMetadataBackendSID = "backend_sid"
156168)
157169
158170// Option is a functional option for configuring TransparentProxy
@@ -436,12 +448,33 @@ func (t *tracingTransport) RoundTrip(req *http.Request) (*http.Response, error)
436448 }
437449 }
438450
451+ // Attach an httptrace to capture the actual backend pod IP after kube-proxy
452+ // DNAT resolves the ClusterIP to a specific pod. The captured address is stored
453+ // as backend_url so follow-up requests always reach the same pod, even after a
454+ // proxy runner restart that would otherwise lose the in-memory routing state.
455+ var capturedPodAddr string
456+ if sawInitialize {
457+ trace := & httptrace.ClientTrace {
458+ GotConn : func (info httptrace.GotConnInfo ) {
459+ capturedPodAddr = info .Conn .RemoteAddr ().String ()
460+ },
461+ }
462+ req = req .WithContext (httptrace .WithClientTrace (req .Context (), trace ))
463+ }
464+
439465 resp , err := t .forward (req )
440466 if err != nil {
441467 if errors .Is (err , context .Canceled ) {
442468 // Expected during shutdown or client disconnect—silently ignore
443469 return nil , err
444470 }
471+ // Dial error against a stored pod IP means the pod has been replaced.
472+ // Attempt transparent re-initialization so the client sees no error.
473+ if isDialError (err ) {
474+ if reInitResp , reInitErr := t .reinitializeAndReplay (req , reqBody ); reInitResp != nil || reInitErr != nil {
475+ return reInitResp , reInitErr
476+ }
477+ }
445478 slog .Error ("failed to forward request" , "error" , err )
446479 return nil , err
447480 }
@@ -471,6 +504,20 @@ func (t *tracingTransport) RoundTrip(req *http.Request) (*http.Response, error)
471504 }
472505 }
473506
507+ // Backend returned 404 for a non-initialize, non-DELETE request whose session IS
508+ // known to the proxy. This means the backend pod lost its in-memory session state
509+ // (e.g. it was restarted but got the same IP). Attempt transparent re-initialization
510+ // so the client sees no error. DELETE is excluded because the session has already
511+ // been cleaned up above and the 404 is the expected terminal response.
512+ if resp .StatusCode == http .StatusNotFound && ! sawInitialize && req .Method != http .MethodDelete {
513+ if sid := req .Header .Get ("Mcp-Session-Id" ); sid != "" {
514+ if reInitResp , reInitErr := t .reinitializeAndReplay (req , reqBody ); reInitResp != nil || reInitErr != nil {
515+ _ = resp .Body .Close ()
516+ return reInitResp , reInitErr
517+ }
518+ }
519+ }
520+
474521 if resp .StatusCode == http .StatusOK {
475522 // check if we saw a valid mcp header
476523 ct := resp .Header .Get ("Mcp-Session-Id" )
@@ -480,14 +527,15 @@ func (t *tracingTransport) RoundTrip(req *http.Request) (*http.Response, error)
480527 internalID := normalizeSessionID (ct )
481528 if _ , ok := t .p .sessionManager .Get (internalID ); ! ok {
482529 sess := session .NewProxySession (internalID )
483- // Store targetURI as the default backend_url for this session.
484- // In single-replica deployments targetURI is already the pod address,
485- // so no override is needed. In multi-replica deployments the
486- // vMCP/operator layer is responsible for setting backend_url to the
487- // actual pod DNS name (e.g. http://mcp-server-0.mcp-server.default.svc:8080)
488- // before the request reaches this proxy; the Rewrite closure then reads
489- // that value and routes follow-up requests to the correct pod.
490- sess .SetMetadata (sessionMetadataBackendURL , t .p .targetURI )
530+ // Store the actual pod IP (captured via GotConn) as backend_url so that
531+ // after a proxy runner restart the session is routed to the same backend
532+ // pod that handled initialize, not a random pod via ClusterIP.
533+ sess .SetMetadata (sessionMetadataBackendURL , t .podBackendURL (capturedPodAddr ))
534+ // Store the initialize body so we can transparently re-initialize the
535+ // backend session if the pod is later replaced or loses session state.
536+ if len (reqBody ) > 0 {
537+ sess .SetMetadata (sessionMetadataInitBody , string (reqBody ))
538+ }
491539 if err := t .p .sessionManager .AddSession (sess ); err != nil {
492540 //nolint:gosec // G706: session ID from HTTP response header
493541 slog .Error ("failed to create session from header" ,
@@ -553,6 +601,133 @@ func (t *tracingTransport) detectInitialize(body []byte) bool {
553601 return false
554602}
555603
604+ // podBackendURL constructs a backend URL that targets the specific pod IP captured
605+ // via httptrace.GotConn, using the scheme from targetURI. Falls back to targetURI
606+ // when no address was captured (e.g. single-replica, connection reuse without a new conn).
607+ func (t * tracingTransport ) podBackendURL (capturedAddr string ) string {
608+ if capturedAddr == "" {
609+ return t .p .targetURI
610+ }
611+ parsed , err := url .Parse (t .p .targetURI )
612+ if err != nil {
613+ return t .p .targetURI
614+ }
615+ parsed .Host = capturedAddr
616+ return parsed .String ()
617+ }
618+
// isDialError reports whether the first *net.OpError found in err's chain was
// produced by a "dial" operation. Callers treat such an error as a sign that
// the target host could not be reached. A nil err, or a chain without any
// *net.OpError, yields false.
func isDialError(err error) bool {
	var netErr *net.OpError
	if !errors.As(err, &netErr) {
		return false
	}
	return netErr.Op == "dial"
}
625+
626+ // reinitializeAndReplay is called when the proxy detects that the backend pod
627+ // that owned a session is no longer reachable (dial error) or has lost its
628+ // in-memory session state (backend returned 404). It transparently:
629+ // 1. Re-sends the stored initialize body to the ClusterIP service so kube-proxy
630+ // selects a healthy pod and the backend creates a new session.
631+ // 2. Captures the new pod IP via httptrace.GotConn and stores it as backend_url.
632+ // 3. Maps the client's original session ID to the new backend session ID.
633+ // 4. Replays the original client request so the client sees no error.
634+ //
635+ // Returns (nil, nil) when re-initialization is not applicable (no stored init
636+ // body, session unknown, or already routing via ClusterIP).
637+ func (t * tracingTransport ) reinitializeAndReplay (req * http.Request , origBody []byte ) (* http.Response , error ) {
638+ sid := req .Header .Get ("Mcp-Session-Id" )
639+ if sid == "" {
640+ return nil , nil
641+ }
642+ internalSID := normalizeSessionID (sid )
643+ sess , ok := t .p .sessionManager .Get (internalSID )
644+ if ! ok {
645+ return nil , nil
646+ }
647+
648+ initBody , hasInit := sess .GetMetadataValue (sessionMetadataInitBody )
649+ if ! hasInit || initBody == "" {
650+ // No stored init body — cannot re-initialize transparently.
651+ // Reset backend_url to ClusterIP so the next request goes through
652+ // kube-proxy and lets the client receive a clean 404 to re-initialize.
653+ sess .SetMetadata (sessionMetadataBackendURL , t .p .targetURI )
654+ _ = t .p .sessionManager .UpsertSession (sess )
655+ return nil , nil
656+ }
657+
658+ slog .Debug ("backend session lost; transparently re-initializing" ,
659+ "session_id" , sid , "target" , t .p .targetURI )
660+
661+ // Capture the new pod IP via GotConn on the re-initialize connection.
662+ var capturedPodAddr string
663+ trace := & httptrace.ClientTrace {
664+ GotConn : func (info httptrace.GotConnInfo ) {
665+ capturedPodAddr = info .Conn .RemoteAddr ().String ()
666+ },
667+ }
668+ initCtx := httptrace .WithClientTrace (req .Context (), trace )
669+
670+ // Build a fresh initialize request to the ClusterIP (no Mcp-Session-Id —
671+ // the backend assigns a new session ID in the response).
672+ parsedTarget , err := url .Parse (t .p .targetURI )
673+ if err != nil {
674+ return nil , nil
675+ }
676+ initURL := * req .URL
677+ initURL .Scheme = parsedTarget .Scheme
678+ initURL .Host = parsedTarget .Host
679+
680+ initReq , err := http .NewRequestWithContext (initCtx , http .MethodPost , initURL .String (), bytes .NewReader ([]byte (initBody )))
681+ if err != nil {
682+ return nil , nil
683+ }
684+ initReq .Header .Set ("Content-Type" , "application/json" )
685+
686+ initResp , err := t .forward (initReq )
687+ if err != nil {
688+ slog .Error ("transparent re-initialize failed" , "error" , err )
689+ return nil , err
690+ }
691+ _ , _ = io .Copy (io .Discard , initResp .Body )
692+ _ = initResp .Body .Close ()
693+
694+ newBackendSID := initResp .Header .Get ("Mcp-Session-Id" )
695+ if newBackendSID == "" {
696+ slog .Debug ("re-initialize response contained no Mcp-Session-Id; falling back to ClusterIP" )
697+ sess .SetMetadata (sessionMetadataBackendURL , t .p .targetURI )
698+ _ = t .p .sessionManager .UpsertSession (sess )
699+ return nil , nil
700+ }
701+
702+ // Update session: point backend_url at the newly-discovered pod and record
703+ // the backend session ID so Rewrite rewrites Mcp-Session-Id on outbound requests.
704+ newPodURL := t .podBackendURL (capturedPodAddr )
705+ sess .SetMetadata (sessionMetadataBackendURL , newPodURL )
706+ sess .SetMetadata (sessionMetadataBackendSID , normalizeSessionID (newBackendSID ))
707+ if upsertErr := t .p .sessionManager .UpsertSession (sess ); upsertErr != nil {
708+ slog .Debug ("failed to update session after re-initialize" , "error" , upsertErr )
709+ }
710+
711+ // Replay the original client request to the new pod with the new backend SID.
712+ // Use the captured pod address directly so we bypass the Rewrite closure
713+ // (which still holds the old backend_url until the next session load).
714+ replayHost := capturedPodAddr
715+ if replayHost == "" {
716+ replayHost = parsedTarget .Host
717+ }
718+ replayReq := req .Clone (req .Context ())
719+ replayReq .URL .Scheme = parsedTarget .Scheme
720+ replayReq .URL .Host = replayHost
721+ replayReq .Host = replayHost // keep Host header consistent with URL to avoid backend validation errors
722+ replayReq .Header .Set ("Mcp-Session-Id" , newBackendSID )
723+ replayReq .Body = io .NopCloser (bytes .NewReader (origBody ))
724+ replayReq .ContentLength = int64 (len (origBody ))
725+
726+ slog .Debug ("replaying original request after transparent re-initialization" ,
727+ "new_pod_url" , newPodURL , "new_backend_sid" , newBackendSID )
728+ return t .forward (replayReq )
729+ }
730+
556731// modifyResponse modifies HTTP responses based on transport-specific requirements.
557732// Delegates to the appropriate ResponseProcessor based on transport type.
558733func (p * TransparentProxy ) modifyResponse (resp * http.Response ) error {
@@ -601,6 +776,13 @@ func (p *TransparentProxy) Start(ctx context.Context) error {
601776 pr .Out .URL .Host = parsed .Host
602777 }
603778 }
779+ // After a transparent re-initialization the proxy maps the client's
780+ // session ID to the backend's newly-assigned session ID. Rewrite the
781+ // outbound header so the backend sees its own ID while the client
782+ // continues to use its original session ID unchanged.
783+ if backendSID , exists := sess .GetMetadataValue (sessionMetadataBackendSID ); exists && backendSID != "" {
784+ pr .Out .Header .Set ("Mcp-Session-Id" , backendSID )
785+ }
604786 }
605787 }
606788
0 commit comments