Skip to content

Commit 34d65cf

Browse files
committed
Reap orphaned bridge tc state on TAP GC
1 parent 3b85349 commit 34d65cf

5 files changed

Lines changed: 356 additions & 138 deletions

File tree

lib/instances/network_test.go

Lines changed: 152 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,18 +65,20 @@ func TestCreateInstanceWithNetwork(t *testing.T) {
6565
t.Log("Initializing network...")
6666
err = manager.networkManager.Initialize(ctx, nil)
6767
require.NoError(t, err)
68+
require.NoError(t, manager.networkManager.SetupHTB(ctx, 100*1024*1024))
6869
t.Log("Network initialized")
6970

7071
// Create instance with nginx:alpine and default network
7172
t.Log("Creating instance with default network...")
7273
inst, err := manager.CreateInstance(ctx, CreateInstanceRequest{
73-
Name: "test-net-instance",
74-
Image: integrationTestImageRef(t, "docker.io/library/nginx:alpine"),
75-
Size: 2 * 1024 * 1024 * 1024, // 2GB (needs extra room for initrd with NVIDIA libs)
76-
HotplugSize: 512 * 1024 * 1024,
77-
OverlaySize: 5 * 1024 * 1024 * 1024,
78-
Vcpus: 1,
79-
NetworkEnabled: true,
74+
Name: "test-net-instance",
75+
Image: integrationTestImageRef(t, "docker.io/library/nginx:alpine"),
76+
Size: 2 * 1024 * 1024 * 1024, // 2GB (needs extra room for initrd with NVIDIA libs)
77+
HotplugSize: 512 * 1024 * 1024,
78+
OverlaySize: 5 * 1024 * 1024 * 1024,
79+
Vcpus: 1,
80+
NetworkEnabled: true,
81+
NetworkBandwidthUpload: 1024 * 1024,
8082
HealthCheck: &healthcheck.Policy{
8183
Type: healthcheck.TypeHTTP,
8284
Interval: "1s",
@@ -126,6 +128,18 @@ func TestCreateInstanceWithNetwork(t *testing.T) {
126128
require.NoError(t, err)
127129
_, isBridge := master.(*netlink.Bridge)
128130
assert.True(t, isBridge, "TAP should be attached to a bridge")
131+
bridgeName := master.Attrs().Name
132+
133+
t.Log("Verifying orphaned bridge tc cleanup preserves live TAP state...")
134+
liveFlowID := waitForUploadFlowID(t, ctx, manager, inst.Id, bridgeName)
135+
require.True(t, bridgeClassExists(t, bridgeName, liveFlowID), "live upload class should exist before cleanup")
136+
staleFlowID := createStaleBridgeTCForTest(t, bridgeName)
137+
deletedTC := manager.networkManager.CleanupOrphanedClasses(ctx)
138+
require.GreaterOrEqual(t, deletedTC, 2, "expected stale filter and class to be deleted")
139+
assert.True(t, bridgeFilterExistsForFlowID(t, bridgeName, liveFlowID), "live upload filter should remain")
140+
assert.True(t, bridgeClassExists(t, bridgeName, liveFlowID), "live upload class should remain")
141+
assert.False(t, bridgeFilterExistsForFlowID(t, bridgeName, staleFlowID), "stale upload filter should be deleted")
142+
assert.False(t, bridgeClassExists(t, bridgeName, staleFlowID), "stale upload class should be deleted")
129143

130144
// Wait for nginx to start
131145
t.Log("Waiting for nginx to start...")
@@ -301,6 +315,137 @@ func TestCreateInstanceWithNetwork(t *testing.T) {
301315
t.Log("Network integration test complete!")
302316
}
303317

318+
func tcForTest(t *testing.T) string {
319+
t.Helper()
320+
if path, err := exec.LookPath("tc"); err == nil {
321+
return path
322+
}
323+
return "/usr/sbin/tc"
324+
}
325+
326+
func runTCForTest(t *testing.T, args ...string) string {
327+
t.Helper()
328+
output, err := exec.Command(tcForTest(t), args...).CombinedOutput()
329+
require.NoError(t, err, "tc %s output: %s", strings.Join(args, " "), string(output))
330+
return string(output)
331+
}
332+
333+
func bridgeClassesForTest(t *testing.T, bridgeName string) []string {
334+
t.Helper()
335+
output := runTCForTest(t, "class", "show", "dev", bridgeName)
336+
var classes []string
337+
for _, line := range strings.Split(output, "\n") {
338+
if !strings.Contains(line, "class htb 1:") {
339+
continue
340+
}
341+
fields := strings.Fields(line)
342+
if len(fields) >= 3 {
343+
classes = append(classes, fields[2])
344+
}
345+
}
346+
return classes
347+
}
348+
349+
func bridgeClassExists(t *testing.T, bridgeName, classID string) bool {
350+
t.Helper()
351+
for _, class := range bridgeClassesForTest(t, bridgeName) {
352+
if class == classID {
353+
return true
354+
}
355+
}
356+
return false
357+
}
358+
359+
func bridgeFilterExistsForFlowID(t *testing.T, bridgeName, flowID string) bool {
360+
t.Helper()
361+
return len(bridgeFilterHandlesForFlowID(t, bridgeName, flowID)) > 0
362+
}
363+
364+
func bridgeFilterHandlesForFlowID(t *testing.T, bridgeName, flowID string) []string {
365+
t.Helper()
366+
output := runTCForTest(t, "filter", "show", "dev", bridgeName, "parent", "1:")
367+
var handles []string
368+
for _, line := range strings.Split(output, "\n") {
369+
if !strings.HasPrefix(line, "filter ") {
370+
continue
371+
}
372+
fields := strings.Fields(line)
373+
handle, gotFlowID := "", ""
374+
for i, field := range fields {
375+
if i+1 >= len(fields) {
376+
break
377+
}
378+
switch field {
379+
case "handle":
380+
handle = fields[i+1]
381+
case "flowid":
382+
gotFlowID = fields[i+1]
383+
}
384+
}
385+
if handle != "" && gotFlowID == flowID {
386+
handles = append(handles, handle)
387+
}
388+
}
389+
return handles
390+
}
391+
392+
func waitForUploadFlowID(t *testing.T, ctx context.Context, manager *manager, instanceID, bridgeName string) string {
393+
t.Helper()
394+
var flowID string
395+
require.Eventually(t, func() bool {
396+
alloc, err := manager.networkManager.GetAllocation(ctx, instanceID)
397+
if err != nil || alloc == nil || alloc.ClassID == "" {
398+
return false
399+
}
400+
flowID = "1:" + alloc.ClassID
401+
return bridgeClassExists(t, bridgeName, flowID) && bridgeFilterExistsForFlowID(t, bridgeName, flowID)
402+
}, integrationTestTimeout(5*time.Second), 100*time.Millisecond)
403+
return flowID
404+
}
405+
406+
func createStaleBridgeTCForTest(t *testing.T, bridgeName string) string {
407+
t.Helper()
408+
used := make(map[string]bool)
409+
for _, classID := range bridgeClassesForTest(t, bridgeName) {
410+
used[classID] = true
411+
}
412+
413+
staleFlowID := ""
414+
for id := 0xff00; id <= 0xffff; id++ {
415+
candidate := fmt.Sprintf("1:%04x", id)
416+
if !used[candidate] {
417+
staleFlowID = candidate
418+
break
419+
}
420+
}
421+
require.NotEmpty(t, staleFlowID, "expected an unused test class id")
422+
423+
t.Cleanup(func() {
424+
bestEffortDeleteBridgeFiltersForFlowID(t, bridgeName, staleFlowID)
425+
_ = exec.Command(tcForTest(t), "qdisc", "del", "dev", bridgeName, "parent", staleFlowID).Run()
426+
_ = exec.Command(tcForTest(t), "class", "del", "dev", bridgeName, "classid", staleFlowID).Run()
427+
})
428+
429+
runTCForTest(t, "class", "add", "dev", bridgeName, "parent", "1:1",
430+
"classid", staleFlowID, "htb", "rate", "1mbit", "ceil", "1mbit")
431+
runTCForTest(t, "qdisc", "add", "dev", bridgeName, "parent", staleFlowID, "fq_codel")
432+
runTCForTest(t, "filter", "add", "dev", bridgeName, "parent", "1:",
433+
"protocol", "all", "prio", "1", "basic",
434+
"match", "meta(rt_iif eq 1)", "flowid", staleFlowID)
435+
436+
require.True(t, bridgeClassExists(t, bridgeName, staleFlowID), "staged stale class should exist")
437+
require.True(t, bridgeFilterExistsForFlowID(t, bridgeName, staleFlowID), "staged stale filter should exist")
438+
return staleFlowID
439+
}
440+
441+
func bestEffortDeleteBridgeFiltersForFlowID(t *testing.T, bridgeName, flowID string) {
442+
t.Helper()
443+
for _, handle := range bridgeFilterHandlesForFlowID(t, bridgeName, flowID) {
444+
_ = exec.Command(tcForTest(t), "filter", "del", "dev", bridgeName, "parent", "1:",
445+
"protocol", "all", "prio", "1", "handle", handle, "basic").Run()
446+
}
447+
}
448+
304449
func startRestartPolicyControllerForTest(t *testing.T, ctx context.Context, manager *manager) {
305450
t.Helper()
306451

lib/instances/tap_gc.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,12 @@ func (m *manager) reconcileTAPs(ctx context.Context) {
6363
// CleanupOrphanedTAPs short-circuits on empty preserve set to avoid
6464
// clobbering TAPs from concurrent hypeman processes/tests. Mirror that here
6565
// rather than calling it with an empty slice.
66-
log.DebugContext(ctx, "TAP GC: no preserve candidates, skipping pass")
67-
return
68-
}
69-
if deleted := m.networkManager.CleanupOrphanedTAPs(ctx, preserve, tapGCMinAge); deleted > 0 {
66+
log.DebugContext(ctx, "TAP GC: no preserve candidates, skipping TAP pass")
67+
} else if deleted := m.networkManager.CleanupOrphanedTAPs(ctx, preserve, tapGCMinAge); deleted > 0 {
7068
log.InfoContext(ctx, "TAP GC: cleaned up orphaned TAP devices", "count", deleted)
7169
}
70+
71+
if deleted := m.networkManager.CleanupOrphanedClasses(ctx); deleted > 0 {
72+
log.InfoContext(ctx, "TAP GC: cleaned up orphaned tc filters/classes", "count", deleted)
73+
}
7274
}

0 commit comments

Comments
 (0)