Skip to content

Commit 561e34f

Browse files
sjmiller609 and rgarcia authored
Add standby / restore + fork support for Mac vz hypervisor (#115)
* Describe implementing vz standby
* Fix VZ standby review issues
* Increase CI test timeout and fix standby test nil deref
* Add VZ fork preparation and running-fork integration test
* Generalize vsock socket naming and add VZ standby fork integration
* Update readme
* Delete redundant test
* fix: wire DockerSocket config and keep VM alive after entrypoint exits

  Two fixes for macOS development:

  1. Wire cfg.Build.DockerSocket into builds.Config so the config file value (e.g. Colima socket path) is actually used instead of always falling back to /var/run/docker.sock.
  2. Restore pre-PR#99 behavior of keeping the VM alive after the entrypoint exits by waiting on the guest-agent. PR#99 changed init to immediately power off the VM when the entrypoint exits, which breaks images like alpine:latest whose CMD is /bin/sh — the shell gets no stdin and exits instantly, killing the VM before anyone can `hm exec` into it. The guest-agent keeps the VM alive and accessible until an explicit stop/delete.

* Revert "fix: wire DockerSocket config and keep VM alive after entrypoint exits"

  This reverts commit 37272ca.

* fix: wire DockerSocket config into build manager (#121)

  cfg.Build.DockerSocket was parsed from config.yaml but never passed to builds.Config, so the docker_socket setting was silently ignored and the build manager always fell back to /var/run/docker.sock.

---------

Co-authored-by: Rafael Garcia <raf@kernel.sh>
1 parent e57e856 commit 561e34f

21 files changed

Lines changed: 900 additions & 73 deletions

.github/workflows/test.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ jobs:
6161
6262
- name: Run tests
6363
env:
64+
GO_TEST_TIMEOUT: 600s
6465
# Docker auth for tests running as root (sudo)
6566
DOCKER_CONFIG: /home/debianuser/.docker
6667
# TLS/ACME testing (optional - tests will skip if not configured)
@@ -118,6 +119,7 @@ jobs:
118119
119120
- name: Run tests
120121
env:
122+
GO_TEST_TIMEOUT: 600s
121123
DEFAULT_HYPERVISOR: vz
122124
JWT_SECRET: ci-test-secret
123125
run: make test

DEVELOPMENT.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ Note: Full integration tests require Linux. On macOS, focus on unit tests and ma
469469

470470
1. **Disk Format**: vz only supports raw disk images (not qcow2). The image pipeline handles conversion automatically.
471471

472-
2. **Snapshots**: Not currently supported on the vz hypervisor.
472+
2. **Snapshot Compatibility**: vz save/restore requires macOS 14.0+ on Apple Silicon and a VM configuration that passes save/restore validation.
473473

474474
### Troubleshooting
475475

@@ -496,6 +496,7 @@ brew install caddy
496496
**"snapshot not supported"**
497497
- Requires macOS 14.0+ on Apple Silicon
498498
- Check: `sw_vers` and `uname -m` (should be arm64)
499+
- Ensure the VM has been paused before standby and has a save/restore-compatible configuration
499500

500501
**VM fails to start**
501502
- Check serial log: `<data_dir>/instances/<id>/serial.log`

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ SHELL := /bin/bash
33

44
# Directory where local binaries will be installed
55
BIN_DIR ?= $(CURDIR)/bin
6+
GO_TEST_TIMEOUT ?= 300s
67

78
$(BIN_DIR):
89
mkdir -p $(BIN_DIR)
@@ -13,7 +14,7 @@ OAPI_CODEGEN_VERSION ?= v2.5.1
1314
AIR ?= $(BIN_DIR)/air
1415
WIRE ?= $(BIN_DIR)/wire
1516
XCADDY ?= $(BIN_DIR)/xcaddy
16-
TEST_TIMEOUT ?= 600s
17+
TEST_TIMEOUT ?= $(GO_TEST_TIMEOUT)
1718

1819
# Install oapi-codegen (pinned to match committed generated code)
1920
$(OAPI_CODEGEN): | $(BIN_DIR)

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,10 @@ hypeman stop my-app
123123
# Start a stopped VM
124124
hypeman start my-app
125125
126-
# Put the VM to sleep (paused)
126+
# Put the VM in standby (snapshot to disk, stop hypervisor)
127127
hypeman standby my-app
128128
129-
# Wake the VM (resumed)
129+
# Restore the VM from standby
130130
hypeman restore my-app
131131
132132
# Delete all VMs

cmd/vz-shim/main.go

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,36 @@ func main() {
4646
slog.Info("vz-shim starting", "control_socket", config.ControlSocket, "vsock_socket", config.VsockSocket)
4747

4848
// Create the VM
49-
vm, vmConfig, err := createVM(config)
49+
vm, vmConfig, err := createVM(&config)
5050
if err != nil {
5151
slog.Error("failed to create VM", "error", err)
5252
fmt.Fprintf(os.Stderr, "failed to create VM: %v\n", err)
5353
os.Exit(1)
5454
}
5555

56-
if err := vm.Start(); err != nil {
57-
slog.Error("failed to start VM", "error", err)
58-
fmt.Fprintf(os.Stderr, "failed to start VM: %v\n", err)
59-
os.Exit(1)
56+
if config.RestoreMachineStatePath != "" {
57+
if err := validateSaveRestoreSupport(vmConfig); err != nil {
58+
slog.Error("save/restore not supported for VM config", "error", err)
59+
fmt.Fprintf(os.Stderr, "save/restore not supported for VM config: %v\n", err)
60+
os.Exit(1)
61+
}
62+
if err := restoreMachineState(vm, config.RestoreMachineStatePath); err != nil {
63+
slog.Error("failed to restore VM machine state", "error", err, "path", config.RestoreMachineStatePath)
64+
fmt.Fprintf(os.Stderr, "failed to restore VM machine state: %v\n", err)
65+
os.Exit(1)
66+
}
67+
slog.Info("VM restored from machine state", "path", config.RestoreMachineStatePath, "state", vm.State())
68+
} else {
69+
if err := vm.Start(); err != nil {
70+
slog.Error("failed to start VM", "error", err)
71+
fmt.Fprintf(os.Stderr, "failed to start VM: %v\n", err)
72+
os.Exit(1)
73+
}
74+
slog.Info("VM started", "vcpus", config.VCPUs, "memory_mb", config.MemoryBytes/1024/1024)
6075
}
61-
slog.Info("VM started", "vcpus", config.VCPUs, "memory_mb", config.MemoryBytes/1024/1024)
6276

6377
// Create the shim server
64-
server := NewShimServer(vm, vmConfig)
78+
server := NewShimServer(vm, vmConfig, config)
6579

6680
// Start control socket listener (remove stale socket from previous run)
6781
os.Remove(config.ControlSocket)

cmd/vz-shim/save_restore_arm64.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
//go:build darwin && arm64
2+
3+
package main
4+
5+
import (
6+
"fmt"
7+
8+
"github.com/Code-Hex/vz/v3"
9+
)
10+
11+
func validateSaveRestoreSupport(vmConfig *vz.VirtualMachineConfiguration) error {
12+
ok, err := vmConfig.ValidateSaveRestoreSupport()
13+
if err != nil {
14+
return err
15+
}
16+
if !ok {
17+
return fmt.Errorf("virtual machine configuration does not support save/restore")
18+
}
19+
return nil
20+
}
21+
22+
func saveMachineState(vm *vz.VirtualMachine, snapshotPath string) error {
23+
return vm.SaveMachineStateToPath(snapshotPath)
24+
}
25+
26+
func restoreMachineState(vm *vz.VirtualMachine, snapshotPath string) error {
27+
// The vz wrapper accepts a filesystem path and constructs a file URL internally.
28+
return vm.RestoreMachineStateFromURL(snapshotPath)
29+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
//go:build darwin && !arm64
2+
3+
package main
4+
5+
import (
6+
"fmt"
7+
"runtime"
8+
9+
"github.com/Code-Hex/vz/v3"
10+
)
11+
12+
func validateSaveRestoreSupport(vmConfig *vz.VirtualMachineConfiguration) error {
13+
return fmt.Errorf("save/restore is only supported on darwin/arm64 (current arch: %s)", runtime.GOARCH)
14+
}
15+
16+
func saveMachineState(vm *vz.VirtualMachine, snapshotPath string) error {
17+
return fmt.Errorf("save/restore is only supported on darwin/arm64 (current arch: %s)", runtime.GOARCH)
18+
}
19+
20+
func restoreMachineState(vm *vz.VirtualMachine, snapshotPath string) error {
21+
return fmt.Errorf("save/restore is only supported on darwin/arm64 (current arch: %s)", runtime.GOARCH)
22+
}

cmd/vz-shim/server.go

Lines changed: 91 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,28 @@ import (
1010
"log/slog"
1111
"net"
1212
"net/http"
13+
"os"
14+
"path/filepath"
1315
"sync"
1416

1517
"github.com/Code-Hex/vz/v3"
18+
"github.com/kernel/hypeman/lib/hypervisor/vz/shimconfig"
1619
)
1720

1821
// ShimServer handles control API and vsock proxy for a vz VM.
1922
type ShimServer struct {
20-
vm *vz.VirtualMachine
21-
vmConfig *vz.VirtualMachineConfiguration
22-
mu sync.RWMutex
23+
vm *vz.VirtualMachine
24+
vmConfig *vz.VirtualMachineConfiguration
25+
shimConfig shimconfig.ShimConfig
26+
mu sync.RWMutex
2327
}
2428

2529
// NewShimServer creates a new shim server.
26-
func NewShimServer(vm *vz.VirtualMachine, vmConfig *vz.VirtualMachineConfiguration) *ShimServer {
30+
func NewShimServer(vm *vz.VirtualMachine, vmConfig *vz.VirtualMachineConfiguration, config shimconfig.ShimConfig) *ShimServer {
2731
return &ShimServer{
28-
vm: vm,
29-
vmConfig: vmConfig,
32+
vm: vm,
33+
vmConfig: vmConfig,
34+
shimConfig: config,
3035
}
3136
}
3237

@@ -35,6 +40,10 @@ type VMInfoResponse struct {
3540
State string `json:"state"`
3641
}
3742

43+
type snapshotRequest struct {
44+
DestinationPath string `json:"destination_path"`
45+
}
46+
3847
// Handler returns the HTTP handler for the control API.
3948
func (s *ShimServer) Handler() http.Handler {
4049
mux := http.NewServeMux()
@@ -44,6 +53,7 @@ func (s *ShimServer) Handler() http.Handler {
4453
mux.HandleFunc("PUT /api/v1/vm.pause", s.handlePause)
4554
mux.HandleFunc("PUT /api/v1/vm.resume", s.handleResume)
4655
mux.HandleFunc("PUT /api/v1/vm.shutdown", s.handleShutdown)
56+
mux.HandleFunc("PUT /api/v1/vm.snapshot", s.handleSnapshot)
4757
mux.HandleFunc("PUT /api/v1/vm.power-button", s.handlePowerButton)
4858
mux.HandleFunc("GET /api/v1/vmm.ping", s.handlePing)
4959
mux.HandleFunc("PUT /api/v1/vmm.shutdown", s.handleVMMShutdown)
@@ -118,6 +128,77 @@ func (s *ShimServer) handleShutdown(w http.ResponseWriter, r *http.Request) {
118128
w.WriteHeader(http.StatusNoContent)
119129
}
120130

131+
func (s *ShimServer) handleSnapshot(w http.ResponseWriter, r *http.Request) {
132+
s.mu.Lock()
133+
defer s.mu.Unlock()
134+
135+
var req snapshotRequest
136+
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
137+
http.Error(w, fmt.Sprintf("invalid snapshot request: %v", err), http.StatusBadRequest)
138+
return
139+
}
140+
if req.DestinationPath == "" {
141+
http.Error(w, "destination_path is required", http.StatusBadRequest)
142+
return
143+
}
144+
if s.vm.State() != vz.VirtualMachineStatePaused {
145+
http.Error(w, "vm must be paused before snapshot", http.StatusBadRequest)
146+
return
147+
}
148+
if err := validateSaveRestoreSupport(s.vmConfig); err != nil {
149+
http.Error(w, fmt.Sprintf("save/restore not supported: %v", err), http.StatusBadRequest)
150+
return
151+
}
152+
153+
if err := os.MkdirAll(req.DestinationPath, 0755); err != nil {
154+
http.Error(w, fmt.Sprintf("create snapshot dir failed: %v", err), http.StatusInternalServerError)
155+
return
156+
}
157+
snapshotComplete := false
158+
defer func() {
159+
if !snapshotComplete {
160+
_ = os.RemoveAll(req.DestinationPath)
161+
}
162+
}()
163+
164+
machineStatePath := filepath.Join(req.DestinationPath, shimconfig.SnapshotMachineStateFile)
165+
if err := os.RemoveAll(machineStatePath); err != nil {
166+
http.Error(w, fmt.Sprintf("prepare machine state path failed: %v", err), http.StatusInternalServerError)
167+
return
168+
}
169+
if err := saveMachineState(s.vm, machineStatePath); err != nil {
170+
http.Error(w, fmt.Sprintf("save machine state failed: %v", err), http.StatusInternalServerError)
171+
return
172+
}
173+
174+
manifestPath := filepath.Join(req.DestinationPath, shimconfig.SnapshotManifestFile)
175+
tmpManifestPath := manifestPath + ".tmp"
176+
manifest := shimconfig.SnapshotManifest{
177+
Hypervisor: "vz",
178+
MachineStateFile: shimconfig.SnapshotMachineStateFile,
179+
ShimConfig: s.shimConfig,
180+
}
181+
// This field is runtime-only; restore path is populated by the caller on restore.
182+
manifest.ShimConfig.RestoreMachineStatePath = ""
183+
manifestBytes, err := json.Marshal(manifest)
184+
if err != nil {
185+
http.Error(w, fmt.Sprintf("marshal manifest failed: %v", err), http.StatusInternalServerError)
186+
return
187+
}
188+
if err := os.WriteFile(tmpManifestPath, manifestBytes, 0644); err != nil {
189+
http.Error(w, fmt.Sprintf("write manifest failed: %v", err), http.StatusInternalServerError)
190+
return
191+
}
192+
if err := os.Rename(tmpManifestPath, manifestPath); err != nil {
193+
http.Error(w, fmt.Sprintf("finalize manifest failed: %v", err), http.StatusInternalServerError)
194+
return
195+
}
196+
197+
snapshotComplete = true
198+
slog.Info("VM snapshot saved", "destination", req.DestinationPath, "machine_state", machineStatePath)
199+
w.WriteHeader(http.StatusNoContent)
200+
}
201+
121202
func (s *ShimServer) handlePowerButton(w http.ResponseWriter, r *http.Request) {
122203
s.mu.Lock()
123204
defer s.mu.Unlock()
@@ -173,6 +254,10 @@ func vzStateToString(state vz.VirtualMachineState) string {
173254
return "Resuming"
174255
case vz.VirtualMachineStateStopping:
175256
return "Stopping"
257+
case vz.VirtualMachineStateSaving:
258+
return "Saving"
259+
case vz.VirtualMachineStateRestoring:
260+
return "Restoring"
176261
default:
177262
return "Unknown"
178263
}

cmd/vz-shim/vm.go

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
package main
44

55
import (
6+
"encoding/base64"
67
"fmt"
78
"log/slog"
89
"net"
@@ -15,7 +16,7 @@ import (
1516
)
1617

1718
// createVM creates and configures a vz.VirtualMachine from ShimConfig.
18-
func createVM(config shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMachineConfiguration, error) {
19+
func createVM(config *shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMachineConfiguration, error) {
1920
// Prepare kernel command line (vz uses hvc0 for serial console)
2021
kernelArgs := config.KernelArgs
2122
if kernelArgs == "" {
@@ -61,15 +62,19 @@ func createVM(config shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMach
6162
return nil, nil, fmt.Errorf("configure storage: %w", err)
6263
}
6364

65+
if err := configurePlatform(vmConfig, config); err != nil {
66+
return nil, nil, fmt.Errorf("configure platform: %w", err)
67+
}
68+
6469
vsockConfig, err := vz.NewVirtioSocketDeviceConfiguration()
6570
if err != nil {
6671
return nil, nil, fmt.Errorf("create vsock device: %w", err)
6772
}
6873
vmConfig.SetSocketDevicesVirtualMachineConfiguration([]vz.SocketDeviceConfiguration{vsockConfig})
6974

70-
if balloonConfig, err := vz.NewVirtioTraditionalMemoryBalloonDeviceConfiguration(); err == nil {
71-
vmConfig.SetMemoryBalloonDevicesVirtualMachineConfiguration([]vz.MemoryBalloonDeviceConfiguration{balloonConfig})
72-
}
75+
// Do not attach memory balloon for now.
76+
// Save/restore compatibility on VZ can fail with "invalid argument" for some
77+
// Linux guest configurations when a balloon device is present.
7378

7479
if validated, err := vmConfig.Validate(); !validated || err != nil {
7580
return nil, nil, fmt.Errorf("invalid vm configuration: %w", err)
@@ -83,6 +88,37 @@ func createVM(config shimconfig.ShimConfig) (*vz.VirtualMachine, *vz.VirtualMach
8388
return vm, vmConfig, nil
8489
}
8590

91+
func configurePlatform(vmConfig *vz.VirtualMachineConfiguration, config *shimconfig.ShimConfig) error {
92+
var machineID *vz.GenericMachineIdentifier
93+
var err error
94+
95+
if config.MachineIdentifierData != "" {
96+
b, decodeErr := base64.StdEncoding.DecodeString(config.MachineIdentifierData)
97+
if decodeErr != nil {
98+
return fmt.Errorf("decode machine identifier data: %w", decodeErr)
99+
}
100+
machineID, err = vz.NewGenericMachineIdentifierWithData(b)
101+
if err != nil {
102+
return fmt.Errorf("recreate machine identifier: %w", err)
103+
}
104+
} else {
105+
machineID, err = vz.NewGenericMachineIdentifier()
106+
if err != nil {
107+
return fmt.Errorf("create machine identifier: %w", err)
108+
}
109+
config.MachineIdentifierData = base64.StdEncoding.EncodeToString(machineID.DataRepresentation())
110+
}
111+
112+
platformConfig, err := vz.NewGenericPlatformConfiguration(
113+
vz.WithGenericMachineIdentifier(machineID),
114+
)
115+
if err != nil {
116+
return fmt.Errorf("create generic platform config: %w", err)
117+
}
118+
vmConfig.SetPlatformVirtualMachineConfiguration(platformConfig)
119+
return nil
120+
}
121+
86122
func configureSerialConsole(vmConfig *vz.VirtualMachineConfiguration, logPath string) error {
87123
var serialAttachment *vz.FileHandleSerialPortAttachment
88124

0 commit comments

Comments
 (0)