Skip to content

Commit c76e9e9

Browse files
authored
fix: allocate fresh network on start VM (#36)
* fix: allocate fresh network on start VM
* Update config disk on start
* Run clean in CI
1 parent a076e43 commit c76e9e9

3 files changed

Lines changed: 42 additions & 33 deletions

File tree

.github/workflows/test.yml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,9 @@ jobs:
3434
username: ${{ secrets.DOCKERHUB_USERNAME }}
3535
password: ${{ secrets.DOCKERHUB_PASSWORD }}
3636

37+
- name: Clean cached binaries
38+
run: make clean
39+
3740
- name: Build
3841
run: make build
3942

Makefile

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
SHELL := /bin/bash
2-
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep
2+
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean
33

44
# Directory where local binaries will be installed
55
BIN_DIR ?= $(CURDIR)/bin
@@ -192,10 +192,8 @@ gen-jwt: $(GODOTENV)
192192
# Clean generated files and binaries
193193
clean:
194194
rm -rf $(BIN_DIR)
195-
rm -f lib/oapi/oapi.go
196-
rm -f lib/vmm/vmm.go
197-
rm -f lib/exec/exec.pb.go
198-
rm -f lib/exec/exec_grpc.pb.go
195+
rm -rf lib/vmm/binaries/cloud-hypervisor/
196+
rm -rf lib/ingress/binaries/
199197
rm -f lib/system/exec_agent/exec-agent
200198

201199
# Prepare for release build (called by GoReleaser)

lib/instances/start.go

Lines changed: 36 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import (
88
"github.com/onkernel/hypeman/lib/logger"
99
"github.com/onkernel/hypeman/lib/network"
1010
"go.opentelemetry.io/otel/trace"
11+
"gvisor.dev/gvisor/pkg/cleanup"
1112
)
1213

1314
// startInstance starts a stopped instance
@@ -52,46 +53,53 @@ func (m *manager) startInstance(
5253
return nil, fmt.Errorf("get image: %w", err)
5354
}
5455

55-
// 4. Recreate network allocation if network enabled
56+
// Setup cleanup stack for automatic rollback on errors
57+
cu := cleanup.Make(func() {})
58+
defer cu.Clean()
59+
60+
// 4. Allocate fresh network if network enabled
5661
var netConfig *network.NetworkConfig
5762
if stored.NetworkEnabled {
58-
log.DebugContext(ctx, "recreating network for start", "instance_id", id, "network", "default")
59-
if err := m.networkManager.RecreateAllocation(ctx, id); err != nil {
60-
log.ErrorContext(ctx, "failed to recreate network", "instance_id", id, "error", err)
61-
return nil, fmt.Errorf("recreate network: %w", err)
62-
}
63-
// Get the network config for VM configuration
64-
netAlloc, err := m.networkManager.GetAllocation(ctx, id)
63+
log.DebugContext(ctx, "allocating network for start", "instance_id", id, "network", "default")
64+
netConfig, err = m.networkManager.CreateAllocation(ctx, network.AllocateRequest{
65+
InstanceID: id,
66+
InstanceName: stored.Name,
67+
})
6568
if err != nil {
66-
log.ErrorContext(ctx, "failed to get network allocation", "instance_id", id, "error", err)
67-
// Cleanup network on failure
68-
if netAlloc != nil {
69-
m.networkManager.ReleaseAllocation(ctx, netAlloc)
70-
}
71-
return nil, fmt.Errorf("get network allocation: %w", err)
72-
}
73-
netConfig = &network.NetworkConfig{
74-
TAPDevice: netAlloc.TAPDevice,
75-
IP: netAlloc.IP,
76-
MAC: netAlloc.MAC,
77-
Netmask: "255.255.255.0", // Default netmask
69+
log.ErrorContext(ctx, "failed to allocate network", "instance_id", id, "error", err)
70+
return nil, fmt.Errorf("allocate network: %w", err)
7871
}
72+
// Update stored metadata with new IP/MAC
73+
stored.IP = netConfig.IP
74+
stored.MAC = netConfig.MAC
75+
// Add network cleanup to stack
76+
cu.Add(func() {
77+
m.networkManager.ReleaseAllocation(ctx, &network.Allocation{
78+
InstanceID: id,
79+
TAPDevice: netConfig.TAPDevice,
80+
})
81+
})
7982
}
8083

81-
// 5. Start VMM and boot VM (reuses logic from create)
84+
// 5. Regenerate config disk with new network configuration
85+
instForConfig := &Instance{StoredMetadata: *stored}
86+
log.DebugContext(ctx, "regenerating config disk", "instance_id", id)
87+
if err := m.createConfigDisk(instForConfig, imageInfo, netConfig); err != nil {
88+
log.ErrorContext(ctx, "failed to create config disk", "instance_id", id, "error", err)
89+
return nil, fmt.Errorf("create config disk: %w", err)
90+
}
91+
92+
// 6. Start VMM and boot VM (reuses logic from create)
8293
log.InfoContext(ctx, "starting VMM and booting VM", "instance_id", id)
8394
if err := m.startAndBootVM(ctx, stored, imageInfo, netConfig); err != nil {
8495
log.ErrorContext(ctx, "failed to start and boot VM", "instance_id", id, "error", err)
85-
// Cleanup network on failure
86-
if stored.NetworkEnabled {
87-
if netAlloc, err := m.networkManager.GetAllocation(ctx, id); err == nil {
88-
m.networkManager.ReleaseAllocation(ctx, netAlloc)
89-
}
90-
}
9196
return nil, err
9297
}
9398

94-
// 6. Update metadata (set PID, StartedAt)
99+
// Success - release cleanup stack (prevent cleanup)
100+
cu.Release()
101+
102+
// 7. Update metadata (set PID, StartedAt)
95103
now := time.Now()
96104
stored.StartedAt = &now
97105

0 commit comments

Comments
 (0)