Skip to content

Commit 6ca76df

Browse files
authored
Merge pull request #207 from datum-cloud/fix/connector-offline-user-path
2 parents 8b8cda1 + cf70fa4 commit 6ca76df

4 files changed

Lines changed: 190 additions & 23 deletions

File tree

internal/extensionserver/metrics/metrics.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,19 @@ var (
141141
},
142142
)
143143

144+
// ConnectorOfflineRoutesTotal counts total user-facing forwarding routes
145+
// rewritten to a tunnel-offline 503 direct_response across all hook
146+
// invocations. Each increment = one route that previously targeted an
147+
// endpoint-less offline-connector cluster and now returns a deterministic
148+
// 503 (instead of Envoy's generic 503 no_healthy_upstream). Use rate()/sum()
149+
// to derive per-build averages.
150+
ConnectorOfflineRoutesTotal = promauto.NewCounter(
151+
prometheus.CounterOpts{
152+
Name: "nso_extension_connector_offline_routes_total",
153+
Help: "Total user-facing forwarding routes rewritten to a tunnel-offline 503 direct_response across all hook invocations.",
154+
},
155+
)
156+
144157
// CacheSynced is 1 when the informer cache has synced and the extension server
145158
// is accepting gRPC connections, 0 during startup or if the cache lost sync.
146159
CacheSynced = promauto.NewGauge(

internal/extensionserver/mutate/connector.go

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -72,17 +72,19 @@ func ReplaceConnectorClusters(
7272
}
7373

7474
// ApplyConnectorRoutes applies connector route mutations to a RouteConfiguration:
75-
// - Online (replaced) connector found in a VH: prepend a CONNECT upgrade route
76-
// targeting the replaced cluster and append info.TargetHost to VH domains.
77-
// - Offline connector found in a VH: prepend a direct_response 503 CONNECT route.
75+
// - Online (replaced) connector: prepend a CONNECT upgrade route targeting the
76+
// replaced cluster and append info.TargetHost to VH domains.
77+
// - Offline connector: prepend a CONNECT direct_response 503 (tunnel-control
78+
// clients) and rewrite the user-facing forwarding routes to a 503
79+
// direct_response (see the offline branch for why).
7880
//
79-
// Returns the number of VirtualHosts mutated.
81+
// Returns the number of VirtualHosts mutated and the number of forwarding
82+
// routes converted to a tunnel-offline direct_response.
8083
func ApplyConnectorRoutes(
8184
rc *routev3.RouteConfiguration,
8285
idx *extcache.PolicyIndex,
8386
replaced, offline map[string]*extcache.ConnectorInfo,
84-
) (int, error) {
85-
mutated := 0
87+
) (mutated, converted int, err error) {
8688
for _, vh := range rc.GetVirtualHosts() {
8789
// Find any connector cluster referenced by routes in this VH.
8890
var connectorCluster string
@@ -102,7 +104,6 @@ func ApplyConnectorRoutes(
102104
}
103105

104106
var newRoute *routev3.Route
105-
var err error
106107

107108
if info, ok := replaced[connectorCluster]; ok {
108109
// Online: CONNECT route targeting the replaced cluster.
@@ -111,7 +112,7 @@ func ApplyConnectorRoutes(
111112
connectorCluster,
112113
)
113114
if err != nil {
114-
return mutated, fmt.Errorf("build connect route for %q: %w", vh.GetName(), err)
115+
return mutated, converted, fmt.Errorf("build connect route for %q: %w", vh.GetName(), err)
115116
}
116117
// Prepend the CONNECT route (NSO inserts at /virtual_hosts/0/routes/0).
117118
vh.Routes = append([]*routev3.Route{newRoute}, vh.Routes...)
@@ -120,17 +121,31 @@ func ApplyConnectorRoutes(
120121
// NOT a synthetic .connector.local domain.
121122
vh.Domains = appendUnique(vh.Domains, info.TargetHost)
122123
} else {
123-
// Offline: direct_response 503 CONNECT route ("Tunnel not online").
124+
// Offline connect_matcher route for tunnel-control clients.
124125
newRoute, err = buildOfflineRoute("connector-offline-" + sanitizeID(vh.GetName()))
125126
if err != nil {
126-
return mutated, fmt.Errorf("build offline route for %q: %w", vh.GetName(), err)
127+
return mutated, converted, fmt.Errorf("build offline route for %q: %w", vh.GetName(), err)
127128
}
128129
vh.Routes = append([]*routev3.Route{newRoute}, vh.Routes...)
129-
// No domain appended for offline connectors (no live tunnel).
130+
131+
// Route user traffic to a deterministic 503 instead of the
132+
// endpoint-less offline cluster, which would yield a generic
133+
// no_healthy_upstream plus retry/cluster-stat noise. Replacing only
134+
// the Action oneof preserves each route's match/metadata; idempotent
135+
// because direct_responses carry no cluster to re-match.
136+
for _, rt := range vh.GetRoutes() {
137+
if routeCluster(rt) != connectorCluster {
138+
continue
139+
}
140+
if derr := setRouteDirectResponse(rt, 503, offlineResponseBody); derr != nil {
141+
return mutated, converted, fmt.Errorf("convert offline forward route for %q: %w", vh.GetName(), derr)
142+
}
143+
converted++
144+
}
130145
}
131146
mutated++
132147
}
133-
return mutated, nil
148+
return mutated, converted, nil
134149
}
135150

136151
// --- Internal helpers ---
@@ -234,18 +249,34 @@ func buildConnectRoute(name, cluster string) (*routev3.Route, error) {
234249
return rt, nil
235250
}
236251

237-
// buildOfflineRoute builds the offline CONNECT route (direct_response 503
238-
// "Tunnel not online"). Mirrors buildOfflineRoute in the seed. The exact body
252+
// offlineResponseBody is the inline body for tunnel-offline 503 responses,
253+
// shared by both offline paths so they return an identical body. The exact
239254
// string is part of the STATE.md metadata contract.
255+
const offlineResponseBody = "Tunnel not online"
256+
257+
// buildOfflineRoute builds the offline CONNECT route (direct_response 503
258+
// "Tunnel not online"). Mirrors buildOfflineRoute in the seed.
240259
func buildOfflineRoute(name string) (*routev3.Route, error) {
241260
j := fmt.Sprintf(`{
242261
"name": %q,
243262
"match": { "connect_matcher": {} },
244-
"direct_response": { "status": 503, "body": { "inline_string": "Tunnel not online" } }
245-
}`, name)
263+
"direct_response": { "status": 503, "body": { "inline_string": %q } }
264+
}`, name, offlineResponseBody)
246265
rt := &routev3.Route{}
247266
if err := protojson.Unmarshal([]byte(j), rt); err != nil {
248267
return nil, fmt.Errorf("unmarshal offline route JSON: %w", err)
249268
}
250269
return rt, nil
251270
}
271+
272+
// setRouteDirectResponse swaps a route's action for a direct_response. Only the
273+
// Action oneof is replaced, so match/metadata/typed_per_filter_config survive.
274+
func setRouteDirectResponse(rt *routev3.Route, status uint32, body string) error {
275+
j := fmt.Sprintf(`{ "status": %d, "body": { "inline_string": %q } }`, status, body)
276+
dr := &routev3.DirectResponseAction{}
277+
if err := protojson.Unmarshal([]byte(j), dr); err != nil {
278+
return fmt.Errorf("unmarshal direct_response action JSON: %w", err)
279+
}
280+
rt.Action = &routev3.Route_DirectResponse{DirectResponse: dr}
281+
return nil
282+
}

internal/extensionserver/mutate/connector_test.go

Lines changed: 121 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
clusterv3 "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3"
1010
routev3 "github.com/envoyproxy/go-control-plane/envoy/config/route/v3"
11+
"google.golang.org/protobuf/types/known/anypb"
1112

1213
extcache "go.datum.net/network-services-operator/internal/extensionserver/cache"
1314
)
@@ -206,9 +207,10 @@ func TestApplyConnectorRoutes_Online_PrependsCONNECTRouteAndAppendsTargetHost(t
206207
},
207208
}
208209

209-
n, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
210+
n, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
210211
require.NoError(t, err)
211212
assert.Equal(t, 1, n, "one VH should be mutated")
213+
assert.Equal(t, 0, converted, "online connector must not convert any forwarding routes")
212214

213215
vh := rc.VirtualHosts[0]
214216
require.Len(t, vh.Routes, 2, "CONNECT route must be prepended; want 2 routes total")
@@ -252,7 +254,7 @@ func TestApplyConnectorRoutes_Online_TargetHostDeduplicated(t *testing.T) {
252254
},
253255
}
254256

255-
_, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
257+
_, _, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
256258
require.NoError(t, err)
257259

258260
// Domain must not be duplicated.
@@ -288,27 +290,139 @@ func TestApplyConnectorRoutes_Offline_Prepends503Route_NoDomain(t *testing.T) {
288290
},
289291
}
290292

291-
n, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
293+
n, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
292294
require.NoError(t, err)
293295
assert.Equal(t, 1, n, "offline VH must be mutated (503 route prepended)")
296+
assert.Equal(t, 1, converted, "the user-facing forwarding route must be converted to a direct_response")
294297

295298
vh := rc.VirtualHosts[0]
296299
require.Len(t, vh.Routes, 2, "503 direct_response route must be prepended")
297300

301+
// First route: the connect_matcher offline route for CONNECT clients.
298302
offlineRoute := vh.Routes[0]
299303
dr := offlineRoute.GetDirectResponse()
300304
require.NotNil(t, dr, "first route must be a direct_response")
305+
assert.NotNil(t, offlineRoute.GetMatch().GetConnectMatcher(),
306+
"prepended offline route must keep its connect_matcher")
301307
assert.Equal(t, uint32(503), dr.GetStatus(),
302308
"offline route must return 503")
303309
assert.Equal(t, "Tunnel not online", dr.GetBody().GetInlineString(),
304310
"offline route body must be 'Tunnel not online' per STATE.md contract")
305311

312+
// Second route: the original user-facing forwarding route must now be a
313+
// direct_response 503 too, NOT a cluster route to the empty cluster.
314+
fwd := vh.Routes[1]
315+
assert.Equal(t, "fwd", fwd.GetName(), "forwarding route identity preserved")
316+
assert.Empty(t, routeCluster(fwd),
317+
"forwarding route must no longer target the endpoint-less connector cluster")
318+
fwdDR := fwd.GetDirectResponse()
319+
require.NotNil(t, fwdDR, "forwarding route must be converted to a direct_response")
320+
assert.Equal(t, uint32(503), fwdDR.GetStatus(), "converted forwarding route must return 503")
321+
assert.Equal(t, "Tunnel not online", fwdDR.GetBody().GetInlineString(),
322+
"converted forwarding route must reuse the offline body")
323+
306324
// No domain must be appended for offline connectors.
307325
assert.NotContains(t, vh.Domains, testTargetHost,
308326
"target host must NOT be appended for offline connector")
309327
assert.Len(t, vh.Domains, 1, "domains must remain unchanged for offline connector")
310328
}
311329

330+
// TestApplyConnectorRoutes_Offline_PreservesMatchAndConfig verifies that
331+
// converting a forwarding route to a direct_response only replaces the Action
332+
// oneof — the route's match and typed_per_filter_config survive — and that a
333+
// co-located non-connector route in the same VH is untouched.
334+
func TestApplyConnectorRoutes_Offline_PreservesMatchAndUntouchedRoute(t *testing.T) {
335+
idx := connectorPolicyIndex(false)
336+
clusterName := testClusterName()
337+
offlineInfo := &extcache.ConnectorInfo{Online: false, TargetHost: testTargetHost, TargetPort: testTargetPort}
338+
replaced := map[string]*extcache.ConnectorInfo{}
339+
offline := map[string]*extcache.ConnectorInfo{clusterName: offlineInfo}
340+
341+
// Connector forwarding route carries a prefix match + typed_per_filter_config.
342+
connRoute := routeTargeting(clusterName)
343+
connRoute.Match = &routev3.RouteMatch{
344+
PathSpecifier: &routev3.RouteMatch_Prefix{Prefix: "/"},
345+
}
346+
connRoute.TypedPerFilterConfig = map[string]*anypb.Any{
347+
"envoy.filters.http.cors": {TypeUrl: "type.googleapis.com/example.Cfg"},
348+
}
349+
350+
// A second route in the same VH targets an unrelated cluster.
351+
otherRoute := routeTargeting("infra-cluster")
352+
otherRoute.Name = "other"
353+
354+
rc := &routev3.RouteConfiguration{
355+
VirtualHosts: []*routev3.VirtualHost{
356+
{
357+
Name: "vh",
358+
Domains: []string{"app.local.test"},
359+
Routes: []*routev3.Route{connRoute, otherRoute},
360+
},
361+
},
362+
}
363+
364+
_, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
365+
require.NoError(t, err)
366+
assert.Equal(t, 1, converted, "only the connector forwarding route must be converted")
367+
368+
vh := rc.VirtualHosts[0]
369+
require.Len(t, vh.Routes, 3, "connect_matcher route prepended to the two originals")
370+
371+
// Converted forwarding route: match + typed_per_filter_config preserved.
372+
gotConn := vh.Routes[1]
373+
require.NotNil(t, gotConn.GetDirectResponse(), "connector forwarding route must be a direct_response")
374+
assert.Equal(t, "/", gotConn.GetMatch().GetPrefix(), "prefix match must be preserved")
375+
assert.Contains(t, gotConn.GetTypedPerFilterConfig(), "envoy.filters.http.cors",
376+
"typed_per_filter_config must be preserved on the converted route")
377+
378+
// The non-connector route must be completely untouched.
379+
gotOther := vh.Routes[2]
380+
assert.Equal(t, "other", gotOther.GetName())
381+
assert.Equal(t, "infra-cluster", routeCluster(gotOther),
382+
"non-connector route must still target its cluster")
383+
assert.Nil(t, gotOther.GetDirectResponse(), "non-connector route must not be converted")
384+
}
385+
386+
// TestApplyConnectorRoutes_Offline_Idempotent verifies a second pass does not
387+
// double-apply: the connect_matcher route is prepended once more (it is keyed by
388+
// VH name and matches no cluster), but no forwarding route is re-converted
389+
// because converted routes are direct_responses and no longer target the cluster.
390+
func TestApplyConnectorRoutes_Offline_Idempotent(t *testing.T) {
391+
idx := connectorPolicyIndex(false)
392+
clusterName := testClusterName()
393+
offlineInfo := &extcache.ConnectorInfo{Online: false, TargetHost: testTargetHost, TargetPort: testTargetPort}
394+
replaced := map[string]*extcache.ConnectorInfo{}
395+
offline := map[string]*extcache.ConnectorInfo{clusterName: offlineInfo}
396+
397+
rc := &routev3.RouteConfiguration{
398+
VirtualHosts: []*routev3.VirtualHost{
399+
{
400+
Name: "vh",
401+
Domains: []string{"app.local.test"},
402+
Routes: []*routev3.Route{routeTargeting(clusterName)},
403+
},
404+
},
405+
}
406+
407+
_, converted1, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
408+
require.NoError(t, err)
409+
assert.Equal(t, 1, converted1, "first pass converts the forwarding route")
410+
411+
// Second pass: the cluster is gone from all routes, so nothing converts.
412+
_, converted2, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
413+
require.NoError(t, err)
414+
assert.Equal(t, 0, converted2, "second pass must not re-convert any route")
415+
416+
// Every direct_response route still returns the offline 503 body.
417+
for _, rt := range rc.VirtualHosts[0].Routes {
418+
if dr := rt.GetDirectResponse(); dr != nil {
419+
assert.Equal(t, uint32(503), dr.GetStatus())
420+
assert.Equal(t, "Tunnel not online", dr.GetBody().GetInlineString())
421+
}
422+
assert.Empty(t, routeCluster(rt), "no route may target the offline connector cluster after conversion")
423+
}
424+
}
425+
312426
func TestApplyConnectorRoutes_NoConnector_VHUntouched(t *testing.T) {
313427
idx := connectorPolicyIndex(true)
314428

@@ -326,9 +440,10 @@ func TestApplyConnectorRoutes_NoConnector_VHUntouched(t *testing.T) {
326440
},
327441
}
328442

329-
n, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
443+
n, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
330444
require.NoError(t, err)
331445
assert.Equal(t, 0, n, "VH with non-connector cluster must not be mutated")
446+
assert.Equal(t, 0, converted, "no forwarding routes converted when no connector present")
332447
assert.Len(t, rc.VirtualHosts[0].Routes, 1, "route count must not change")
333448
assert.Len(t, rc.VirtualHosts[0].Domains, 1, "domain list must not change")
334449
}
@@ -340,7 +455,8 @@ func TestApplyConnectorRoutes_EmptyRouteConfiguration_NoOp(t *testing.T) {
340455

341456
rc := &routev3.RouteConfiguration{Name: "empty"}
342457

343-
n, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
458+
n, converted, err := ApplyConnectorRoutes(rc, idx, replaced, offline)
344459
require.NoError(t, err)
345460
assert.Equal(t, 0, n)
461+
assert.Equal(t, 0, converted)
346462
}

internal/extensionserver/server/server.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ func (s *Server) PostTranslateModify(
135135
localReplyCount int
136136
tppCount int
137137
vhCount int
138+
offlineRtCount int
138139
replaced map[string]*extcache.ConnectorInfo
139140
connOffline map[string]*extcache.ConnectorInfo
140141
)
@@ -236,7 +237,7 @@ func (s *Server) PostTranslateModify(
236237

237238
_, connRoutesSpan := tr.Start(mctx, "connector.routes")
238239
for _, rc := range routes {
239-
n, mutErr := mutate.ApplyConnectorRoutes(rc, idx, replaced, connOffline)
240+
n, offlineRt, mutErr := mutate.ApplyConnectorRoutes(rc, idx, replaced, connOffline)
240241
if mutErr != nil {
241242
s.log.Error("apply connector routes", "route_config", rc.GetName(), "err", mutErr)
242243
connRoutesSpan.RecordError(mutErr)
@@ -249,8 +250,12 @@ func (s *Server) PostTranslateModify(
249250
return nil, mutErr
250251
}
251252
vhCount += n
253+
offlineRtCount += offlineRt
252254
}
253-
connRoutesSpan.SetAttributes(attribute.Int("vhosts.connector_applied", vhCount))
255+
connRoutesSpan.SetAttributes(
256+
attribute.Int("vhosts.connector_applied", vhCount),
257+
attribute.Int("routes.connector_offline", offlineRtCount),
258+
)
254259
connRoutesSpan.End()
255260
mspan.End()
256261

@@ -263,6 +268,7 @@ func (s *Server) PostTranslateModify(
263268
extmetrics.WAFRouteMutationsTotal.Add(float64(tppCount))
264269
extmetrics.ConnectorClustersTotal.Add(float64(len(replaced)))
265270
extmetrics.ConnectorRoutesTotal.Add(float64(vhCount))
271+
extmetrics.ConnectorOfflineRoutesTotal.Add(float64(offlineRtCount))
266272

267273
s.log.Info("PostTranslateModify",
268274
"clusters", len(clusters),
@@ -274,6 +280,7 @@ func (s *Server) PostTranslateModify(
274280
"clusters_replaced", len(replaced),
275281
"clusters_offline", len(connOffline),
276282
"vhosts_connector_applied", vhCount,
283+
"connector_offline_routes", offlineRtCount,
277284
)
278285

279286
return &pb.PostTranslateModifyResponse{

0 commit comments

Comments
 (0)