Skip to content

Commit a976713

Browse files
scotwellsclaude
andcommitted
fix(extension-server): return tunnel-offline 503 on user path for offline connector
When a Connector is offline, NSO still emits its backendRef, so Envoy Gateway programs an endpoint-less cluster for the connector backend. The ext-server's offline handling only prepended a connect_matcher route (503 "Tunnel not online"), which matches CONNECT (tunnel-control) clients only. A normal user GET fell through to the prefix:"/" route, still targeting the zero-endpoint cluster, so Envoy returned a generic 503 no_healthy_upstream (UH) plus retry/cluster-stat noise instead of a deterministic tunnel-offline response. ApplyConnectorRoutes now, in the offline branch, also rewrites the user-facing forwarding route(s) that target the offline connector cluster into an immediate direct_response 503, reusing the same "Tunnel not online" body. Only the route's Action oneof is replaced, so each route's match, metadata, and typed_per_filter_config are preserved. The existing connect_matcher offline route is still prepended for CONNECT clients. The online (replaced) path is unchanged. Idempotent: converted routes become direct_responses, so routeCluster returns "" for them and they never re-match the connector cluster. Surfaces a new counter (nso_extension_connector_offline_routes_total) and a "connector_offline_routes" field on the PostTranslateModify log line. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 8b8cda1 commit a976713

4 files changed

Lines changed: 205 additions & 20 deletions

File tree

internal/extensionserver/metrics/metrics.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,19 @@ var (
141141
},
142142
)
143143

144+
// ConnectorOfflineRoutesTotal counts total user-facing forwarding routes
145+
// rewritten to a tunnel-offline 503 direct_response across all hook
146+
// invocations. Each increment = one route that previously targeted an
147+
// endpoint-less offline-connector cluster and now returns a deterministic
148+
// 503 (instead of Envoy's generic 503 no_healthy_upstream). Use rate()/sum()
149+
// to derive per-build averages.
150+
ConnectorOfflineRoutesTotal = promauto.NewCounter(
151+
prometheus.CounterOpts{
152+
Name: "nso_extension_connector_offline_routes_total",
153+
Help: "Total user-facing forwarding routes rewritten to a tunnel-offline 503 direct_response across all hook invocations.",
154+
},
155+
)
156+
144157
// CacheSynced is 1 when the informer cache has synced and the extension server
145158
// is accepting gRPC connections, 0 during startup or if the cache lost sync.
146159
CacheSynced = promauto.NewGauge(

internal/extensionserver/mutate/connector.go

Lines changed: 62 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,22 @@ func ReplaceConnectorClusters(
7474
// ApplyConnectorRoutes applies connector route mutations to a RouteConfiguration:
7575
// - Online (replaced) connector found in a VH: prepend a CONNECT upgrade route
7676
// targeting the replaced cluster and append info.TargetHost to VH domains.
77-
// - Offline connector found in a VH: prepend a direct_response 503 CONNECT route.
77+
// - Offline connector found in a VH: prepend a direct_response 503 CONNECT
78+
// route (for CONNECT/tunnel-control clients) AND rewrite the user-facing
79+
// forwarding routes that target the (endpoint-less) offline connector
80+
// cluster into an immediate direct_response 503. Without the latter, a
81+
// normal user GET falls through to the prefix:"/" route, which still targets
82+
// the zero-endpoint cluster — Envoy then returns a generic 503
83+
// no_healthy_upstream (UH) instead of a deterministic tunnel-offline
84+
// response, and the empty cluster triggers retries/cluster-stat noise.
7885
//
79-
// Returns the number of VirtualHosts mutated.
86+
// Returns the number of VirtualHosts mutated and the number of forwarding
87+
// routes converted to a tunnel-offline direct_response.
8088
func ApplyConnectorRoutes(
8189
rc *routev3.RouteConfiguration,
8290
idx *extcache.PolicyIndex,
8391
replaced, offline map[string]*extcache.ConnectorInfo,
84-
) (int, error) {
85-
mutated := 0
92+
) (mutated, converted int, err error) {
8693
for _, vh := range rc.GetVirtualHosts() {
8794
// Find any connector cluster referenced by routes in this VH.
8895
var connectorCluster string
@@ -102,7 +109,6 @@ func ApplyConnectorRoutes(
102109
}
103110

104111
var newRoute *routev3.Route
105-
var err error
106112

107113
if info, ok := replaced[connectorCluster]; ok {
108114
// Online: CONNECT route targeting the replaced cluster.
@@ -111,7 +117,7 @@ func ApplyConnectorRoutes(
111117
connectorCluster,
112118
)
113119
if err != nil {
114-
return mutated, fmt.Errorf("build connect route for %q: %w", vh.GetName(), err)
120+
return mutated, converted, fmt.Errorf("build connect route for %q: %w", vh.GetName(), err)
115121
}
116122
// Prepend the CONNECT route (NSO inserts at /virtual_hosts/0/routes/0).
117123
vh.Routes = append([]*routev3.Route{newRoute}, vh.Routes...)
@@ -120,17 +126,39 @@ func ApplyConnectorRoutes(
120126
// NOT a synthetic .connector.local domain.
121127
vh.Domains = appendUnique(vh.Domains, info.TargetHost)
122128
} else {
123-
// Offline: direct_response 503 CONNECT route ("Tunnel not online").
129+
// Offline: prepend the direct_response 503 connect_matcher route so
130+
// CONNECT (tunnel-control) clients still get "Tunnel not online".
124131
newRoute, err = buildOfflineRoute("connector-offline-" + sanitizeID(vh.GetName()))
125132
if err != nil {
126-
return mutated, fmt.Errorf("build offline route for %q: %w", vh.GetName(), err)
133+
return mutated, converted, fmt.Errorf("build offline route for %q: %w", vh.GetName(), err)
127134
}
128135
vh.Routes = append([]*routev3.Route{newRoute}, vh.Routes...)
129136
// No domain appended for offline connectors (no live tunnel).
137+
138+
// Rewrite the user-facing forwarding route(s) that target the
139+
// endpoint-less offline connector cluster into an immediate 503
140+
// direct_response. Otherwise a normal GET falls through to the
141+
// prefix:"/" route, hits the zero-endpoint cluster, and Envoy
142+
// returns a generic 503 no_healthy_upstream (UH) plus retry/stat
143+
// noise. We only replace the route's Action oneof, so each route's
144+
// match, metadata, and typed_per_filter_config are preserved.
145+
//
146+
// Idempotent: the prepended connect_matcher route and any already
147+
// converted forwarding route are direct_responses, so routeCluster
148+
// returns "" for them and they never re-match connectorCluster.
149+
for _, rt := range vh.GetRoutes() {
150+
if routeCluster(rt) != connectorCluster {
151+
continue
152+
}
153+
if derr := setRouteDirectResponse(rt, 503, offlineResponseBody); derr != nil {
154+
return mutated, converted, fmt.Errorf("convert offline forward route for %q: %w", vh.GetName(), derr)
155+
}
156+
converted++
157+
}
130158
}
131159
mutated++
132160
}
133-
return mutated, nil
161+
return mutated, converted, nil
134162
}
135163

136164
// --- Internal helpers ---
@@ -234,18 +262,39 @@ func buildConnectRoute(name, cluster string) (*routev3.Route, error) {
234262
return rt, nil
235263
}
236264

265+
// offlineResponseBody is the inline body returned for tunnel-offline 503
266+
// responses. Shared by the connect_matcher offline route (buildOfflineRoute)
267+
// and the user-facing forwarding-route conversion (setRouteDirectResponse) so
268+
// both paths return an identical body. The exact string is part of the
269+
// STATE.md metadata contract.
270+
const offlineResponseBody = "Tunnel not online"
271+
237272
// buildOfflineRoute builds the offline CONNECT route (direct_response 503
238-
// "Tunnel not online"). Mirrors buildOfflineRoute in the seed. The exact body
239-
// string is part of the STATE.md metadata contract.
273+
// "Tunnel not online"). Mirrors buildOfflineRoute in the seed.
240274
func buildOfflineRoute(name string) (*routev3.Route, error) {
241275
j := fmt.Sprintf(`{
242276
"name": %q,
243277
"match": { "connect_matcher": {} },
244-
"direct_response": { "status": 503, "body": { "inline_string": "Tunnel not online" } }
245-
}`, name)
278+
"direct_response": { "status": 503, "body": { "inline_string": %q } }
279+
}`, name, offlineResponseBody)
246280
rt := &routev3.Route{}
247281
if err := protojson.Unmarshal([]byte(j), rt); err != nil {
248282
return nil, fmt.Errorf("unmarshal offline route JSON: %w", err)
249283
}
250284
return rt, nil
251285
}
286+
287+
// setRouteDirectResponse rewrites a route's action to an immediate
288+
// direct_response with the given status and inline body. Only the Action oneof
289+
// is replaced, so the route's match, metadata, and typed_per_filter_config are
290+
// preserved. Used to convert a forwarding route that targets an endpoint-less
291+
// offline connector cluster into a deterministic tunnel-offline response.
292+
func setRouteDirectResponse(rt *routev3.Route, status uint32, body string) error {
293+
j := fmt.Sprintf(`{ "status": %d, "body": { "inline_string": %q } }`, status, body)
294+
dr := &routev3.DirectResponseAction{}
295+
if err := protojson.Unmarshal([]byte(j), dr); err != nil {
296+
return fmt.Errorf("unmarshal direct_response action JSON: %w", err)
297+
}
298+
rt.Action = &routev3.Route_DirectResponse{DirectResponse: dr}
299+
return nil
300+
}

internal/extensionserver/mutate/connector_test.go

Lines changed: 121 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
clusterv3 "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3"
1010
routev3 "github.com/envoyproxy/go-control-plane/envoy/config/route/v3"
11+
"google.golang.org/protobuf/types/known/anypb"
1112

1213
extcache "go.datum.net/network-services-operator/internal/extensionserver/cache"
1314
)
@@ -206,9 +207,10 @@ func TestApplyConnectorRoutes_Online_PrependsCONNECTRouteAndAppendsTargetHost(t
206207
},
207208
}
208209

209-
n, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
210+
n, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
210211
require.NoError(t, err)
211212
assert.Equal(t, 1, n, "one VH should be mutated")
213+
assert.Equal(t, 0, converted, "online connector must not convert any forwarding routes")
212214

213215
vh := rc.VirtualHosts[0]
214216
require.Len(t, vh.Routes, 2, "CONNECT route must be prepended; want 2 routes total")
@@ -252,7 +254,7 @@ func TestApplyConnectorRoutes_Online_TargetHostDeduplicated(t *testing.T) {
252254
},
253255
}
254256

255-
_, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
257+
_, _, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
256258
require.NoError(t, err)
257259

258260
// Domain must not be duplicated.
@@ -288,27 +290,139 @@ func TestApplyConnectorRoutes_Offline_Prepends503Route_NoDomain(t *testing.T) {
288290
},
289291
}
290292

291-
n, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
293+
n, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
292294
require.NoError(t, err)
293295
assert.Equal(t, 1, n, "offline VH must be mutated (503 route prepended)")
296+
assert.Equal(t, 1, converted, "the user-facing forwarding route must be converted to a direct_response")
294297

295298
vh := rc.VirtualHosts[0]
296299
require.Len(t, vh.Routes, 2, "503 direct_response route must be prepended")
297300

301+
// First route: the connect_matcher offline route for CONNECT clients.
298302
offlineRoute := vh.Routes[0]
299303
dr := offlineRoute.GetDirectResponse()
300304
require.NotNil(t, dr, "first route must be a direct_response")
305+
assert.NotNil(t, offlineRoute.GetMatch().GetConnectMatcher(),
306+
"prepended offline route must keep its connect_matcher")
301307
assert.Equal(t, uint32(503), dr.GetStatus(),
302308
"offline route must return 503")
303309
assert.Equal(t, "Tunnel not online", dr.GetBody().GetInlineString(),
304310
"offline route body must be 'Tunnel not online' per STATE.md contract")
305311

312+
// Second route: the original user-facing forwarding route must now be a
313+
// direct_response 503 too (Gap B), NOT a cluster route to the empty cluster.
314+
fwd := vh.Routes[1]
315+
assert.Equal(t, "fwd", fwd.GetName(), "forwarding route identity preserved")
316+
assert.Empty(t, routeCluster(fwd),
317+
"forwarding route must no longer target the endpoint-less connector cluster")
318+
fwdDR := fwd.GetDirectResponse()
319+
require.NotNil(t, fwdDR, "forwarding route must be converted to a direct_response")
320+
assert.Equal(t, uint32(503), fwdDR.GetStatus(), "converted forwarding route must return 503")
321+
assert.Equal(t, "Tunnel not online", fwdDR.GetBody().GetInlineString(),
322+
"converted forwarding route must reuse the offline body")
323+
306324
// No domain must be appended for offline connectors.
307325
assert.NotContains(t, vh.Domains, testTargetHost,
308326
"target host must NOT be appended for offline connector")
309327
assert.Len(t, vh.Domains, 1, "domains must remain unchanged for offline connector")
310328
}
311329

330+
// TestApplyConnectorRoutes_Offline_PreservesMatchAndConfig verifies that
331+
// converting a forwarding route to a direct_response only replaces the Action
332+
// oneof — the route's match and typed_per_filter_config survive — and that a
333+
// co-located non-connector route in the same VH is untouched.
334+
func TestApplyConnectorRoutes_Offline_PreservesMatchAndUntouchedRoute(t *testing.T) {
335+
idx := connectorPolicyIndex(false)
336+
clusterName := testClusterName()
337+
offlineInfo := &extcache.ConnectorInfo{Online: false, TargetHost: testTargetHost, TargetPort: testTargetPort}
338+
replaced := map[string]*extcache.ConnectorInfo{}
339+
offline := map[string]*extcache.ConnectorInfo{clusterName: offlineInfo}
340+
341+
// Connector forwarding route carries a prefix match + typed_per_filter_config.
342+
connRoute := routeTargeting(clusterName)
343+
connRoute.Match = &routev3.RouteMatch{
344+
PathSpecifier: &routev3.RouteMatch_Prefix{Prefix: "/"},
345+
}
346+
connRoute.TypedPerFilterConfig = map[string]*anypb.Any{
347+
"envoy.filters.http.cors": {TypeUrl: "type.googleapis.com/example.Cfg"},
348+
}
349+
350+
// A second route in the same VH targets an unrelated cluster.
351+
otherRoute := routeTargeting("infra-cluster")
352+
otherRoute.Name = "other"
353+
354+
rc := &routev3.RouteConfiguration{
355+
VirtualHosts: []*routev3.VirtualHost{
356+
{
357+
Name: "vh",
358+
Domains: []string{"app.local.test"},
359+
Routes: []*routev3.Route{connRoute, otherRoute},
360+
},
361+
},
362+
}
363+
364+
_, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
365+
require.NoError(t, err)
366+
assert.Equal(t, 1, converted, "only the connector forwarding route must be converted")
367+
368+
vh := rc.VirtualHosts[0]
369+
require.Len(t, vh.Routes, 3, "connect_matcher route prepended to the two originals")
370+
371+
// Converted forwarding route: match + typed_per_filter_config preserved.
372+
gotConn := vh.Routes[1]
373+
require.NotNil(t, gotConn.GetDirectResponse(), "connector forwarding route must be a direct_response")
374+
assert.Equal(t, "/", gotConn.GetMatch().GetPrefix(), "prefix match must be preserved")
375+
assert.Contains(t, gotConn.GetTypedPerFilterConfig(), "envoy.filters.http.cors",
376+
"typed_per_filter_config must be preserved on the converted route")
377+
378+
// The non-connector route must be completely untouched.
379+
gotOther := vh.Routes[2]
380+
assert.Equal(t, "other", gotOther.GetName())
381+
assert.Equal(t, "infra-cluster", routeCluster(gotOther),
382+
"non-connector route must still target its cluster")
383+
assert.Nil(t, gotOther.GetDirectResponse(), "non-connector route must not be converted")
384+
}
385+
386+
// TestApplyConnectorRoutes_Offline_Idempotent verifies a second pass does not
387+
// double-apply: the connect_matcher route is prepended once more (it is keyed by
388+
// VH name and matches no cluster), but no forwarding route is re-converted
389+
// because converted routes are direct_responses and no longer target the cluster.
390+
func TestApplyConnectorRoutes_Offline_Idempotent(t *testing.T) {
391+
idx := connectorPolicyIndex(false)
392+
clusterName := testClusterName()
393+
offlineInfo := &extcache.ConnectorInfo{Online: false, TargetHost: testTargetHost, TargetPort: testTargetPort}
394+
replaced := map[string]*extcache.ConnectorInfo{}
395+
offline := map[string]*extcache.ConnectorInfo{clusterName: offlineInfo}
396+
397+
rc := &routev3.RouteConfiguration{
398+
VirtualHosts: []*routev3.VirtualHost{
399+
{
400+
Name: "vh",
401+
Domains: []string{"app.local.test"},
402+
Routes: []*routev3.Route{routeTargeting(clusterName)},
403+
},
404+
},
405+
}
406+
407+
_, converted1, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
408+
require.NoError(t, err)
409+
assert.Equal(t, 1, converted1, "first pass converts the forwarding route")
410+
411+
// Second pass: the cluster is gone from all routes, so nothing converts.
412+
_, converted2, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
413+
require.NoError(t, err)
414+
assert.Equal(t, 0, converted2, "second pass must not re-convert any route")
415+
416+
// Every direct_response route still returns the offline 503 body.
417+
for _, rt := range rc.VirtualHosts[0].Routes {
418+
if dr := rt.GetDirectResponse(); dr != nil {
419+
assert.Equal(t, uint32(503), dr.GetStatus())
420+
assert.Equal(t, "Tunnel not online", dr.GetBody().GetInlineString())
421+
}
422+
assert.Empty(t, routeCluster(rt), "no route may target the offline connector cluster after conversion")
423+
}
424+
}
425+
312426
func TestApplyConnectorRoutes_NoConnector_VHUntouched(t *testing.T) {
313427
idx := connectorPolicyIndex(true)
314428

@@ -326,9 +440,10 @@ func TestApplyConnectorRoutes_NoConnector_VHUntouched(t *testing.T) {
326440
},
327441
}
328442

329-
n, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
443+
n, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
330444
require.NoError(t, err)
331445
assert.Equal(t, 0, n, "VH with non-connector cluster must not be mutated")
446+
assert.Equal(t, 0, converted, "no forwarding routes converted when no connector present")
332447
assert.Len(t, rc.VirtualHosts[0].Routes, 1, "route count must not change")
333448
assert.Len(t, rc.VirtualHosts[0].Domains, 1, "domain list must not change")
334449
}
@@ -340,7 +455,8 @@ func TestApplyConnectorRoutes_EmptyRouteConfiguration_NoOp(t *testing.T) {
340455

341456
rc := &routev3.RouteConfiguration{Name: "empty"}
342457

343-
n, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
458+
n, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
344459
require.NoError(t, err)
345460
assert.Equal(t, 0, n)
461+
assert.Equal(t, 0, converted)
346462
}

internal/extensionserver/server/server.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ func (s *Server) PostTranslateModify(
135135
localReplyCount int
136136
tppCount int
137137
vhCount int
138+
offlineRtCount int
138139
replaced map[string]*extcache.ConnectorInfo
139140
connOffline map[string]*extcache.ConnectorInfo
140141
)
@@ -236,7 +237,7 @@ func (s *Server) PostTranslateModify(
236237

237238
_, connRoutesSpan := tr.Start(mctx, "connector.routes")
238239
for _, rc := range routes {
239-
n, mutErr := mutate.ApplyConnectorRoutes(rc, idx, replaced, connOffline)
240+
n, offlineRt, mutErr := mutate.ApplyConnectorRoutes(rc, idx, replaced, connOffline)
240241
if mutErr != nil {
241242
s.log.Error("apply connector routes", "route_config", rc.GetName(), "err", mutErr)
242243
connRoutesSpan.RecordError(mutErr)
@@ -249,8 +250,12 @@ func (s *Server) PostTranslateModify(
249250
return nil, mutErr
250251
}
251252
vhCount += n
253+
offlineRtCount += offlineRt
252254
}
253-
connRoutesSpan.SetAttributes(attribute.Int("vhosts.connector_applied", vhCount))
255+
connRoutesSpan.SetAttributes(
256+
attribute.Int("vhosts.connector_applied", vhCount),
257+
attribute.Int("routes.connector_offline", offlineRtCount),
258+
)
254259
connRoutesSpan.End()
255260
mspan.End()
256261

@@ -263,6 +268,7 @@ func (s *Server) PostTranslateModify(
263268
extmetrics.WAFRouteMutationsTotal.Add(float64(tppCount))
264269
extmetrics.ConnectorClustersTotal.Add(float64(len(replaced)))
265270
extmetrics.ConnectorRoutesTotal.Add(float64(vhCount))
271+
extmetrics.ConnectorOfflineRoutesTotal.Add(float64(offlineRtCount))
266272

267273
s.log.Info("PostTranslateModify",
268274
"clusters", len(clusters),
@@ -274,6 +280,7 @@ func (s *Server) PostTranslateModify(
274280
"clusters_replaced", len(replaced),
275281
"clusters_offline", len(connOffline),
276282
"vhosts_connector_applied", vhCount,
283+
"connector_offline_routes", offlineRtCount,
277284
)
278285

279286
return &pb.PostTranslateModifyResponse{

0 commit comments

Comments
 (0)