Skip to content

Commit e57e856

Browse files
authored
Fork VM (#114)
* Implement cloud-hypervisor fork * move fork support checks behind hypervisor starter interface * api: add fork instance endpoint * refactor: move CH fork snapshot rewrite into CH package * fork: support running source via standby-resume flow * Fix fork restore guest IP reconfiguration * Add QEMU fork support for running standby flow * Add fork target state and fix running-fork cleanup * Stabilize Linux CI test timeouts * Deep-copy metadata refs in fork path * Fix fork CID persistence semantics and cleanup validation * Stabilize firecracker fork integration test assertions * Harden fork rewrite safety and volume fork validation * Add Firecracker standby fork support * Address remaining fork review findings * Add firecracker fork support * Use running Firecracker fork test and gate on guest-agent readiness * Fail running firecracker fork when guest agent is not ready * Serialize firecracker snapshot source aliasing during restore * Fix fork name collisions and running-source restore ordering
1 parent 50f4539 commit e57e856

31 files changed

Lines changed: 3150 additions & 211 deletions

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ jobs:
7070
CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
7171
TLS_TEST_DOMAIN: "test.hypeman-development.com"
7272
TLS_ALLOWED_DOMAINS: '*.hypeman-development.com'
73-
run: make test
73+
run: make test TEST_TIMEOUT=20m
7474

7575
test-darwin:
7676
runs-on: [self-hosted, macos, arm64]

cmd/api/api/instances.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,74 @@ func (s *ApiService) RestoreInstance(ctx context.Context, request oapi.RestoreIn
447447
return oapi.RestoreInstance200JSONResponse(instanceToOAPI(*result)), nil
448448
}
449449

450+
// ForkInstance forks a stopped or standby instance (or a running source when the request sets FromRunning) into a new instance.
451+
// The id parameter can be an instance ID, name, or ID prefix.
452+
// Note: Resolution is handled by ResolveResource middleware.
453+
func (s *ApiService) ForkInstance(ctx context.Context, request oapi.ForkInstanceRequestObject) (oapi.ForkInstanceResponseObject, error) {
454+
inst := mw.GetResolvedInstance[instances.Instance](ctx)
455+
if inst == nil {
456+
return oapi.ForkInstance500JSONResponse{
457+
Code: "internal_error",
458+
Message: "resource not resolved",
459+
}, nil
460+
}
461+
log := logger.FromContext(ctx)
462+
463+
if request.Body == nil {
464+
return oapi.ForkInstance400JSONResponse{
465+
Code: "invalid_request",
466+
Message: "request body is required",
467+
}, nil
468+
}
469+
470+
targetState := instances.State("")
471+
if request.Body.TargetState != nil {
472+
targetState = instances.State(*request.Body.TargetState)
473+
}
474+
475+
result, err := s.InstanceManager.ForkInstance(ctx, inst.Id, instances.ForkInstanceRequest{
476+
Name: request.Body.Name,
477+
FromRunning: request.Body.FromRunning != nil && *request.Body.FromRunning,
478+
TargetState: targetState,
479+
})
480+
if err != nil {
481+
switch {
482+
case errors.Is(err, instances.ErrNotFound):
483+
return oapi.ForkInstance404JSONResponse{
484+
Code: "not_found",
485+
Message: "instance not found",
486+
}, nil
487+
case errors.Is(err, instances.ErrInvalidState):
488+
return oapi.ForkInstance409JSONResponse{
489+
Code: "invalid_state",
490+
Message: err.Error(),
491+
}, nil
492+
case errors.Is(err, instances.ErrInvalidRequest):
493+
return oapi.ForkInstance400JSONResponse{
494+
Code: "invalid_request",
495+
Message: err.Error(),
496+
}, nil
497+
case errors.Is(err, instances.ErrAlreadyExists), errors.Is(err, network.ErrNameExists):
498+
return oapi.ForkInstance409JSONResponse{
499+
Code: "name_conflict",
500+
Message: err.Error(),
501+
}, nil
502+
case errors.Is(err, instances.ErrNotSupported):
503+
return oapi.ForkInstance501JSONResponse{
504+
Code: "not_supported",
505+
Message: err.Error(),
506+
}, nil
507+
default:
508+
log.ErrorContext(ctx, "failed to fork instance", "error", err)
509+
return oapi.ForkInstance500JSONResponse{
510+
Code: "internal_error",
511+
Message: "failed to fork instance",
512+
}, nil
513+
}
514+
}
515+
return oapi.ForkInstance201JSONResponse(instanceToOAPI(*result)), nil
516+
}
517+
450518
// StopInstance gracefully stops a running instance
451519
// The id parameter can be an instance ID, name, or ID prefix
452520
// Note: Resolution is handled by ResolveResource middleware

cmd/api/api/instances_test.go

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@ package api
22

33
import (
44
"context"
5+
"fmt"
56
"os"
67
"testing"
78
"time"
89

910
"github.com/c2h5oh/datasize"
1011
"github.com/kernel/hypeman/lib/hypervisor"
1112
"github.com/kernel/hypeman/lib/instances"
13+
mw "github.com/kernel/hypeman/lib/middleware"
1214
"github.com/kernel/hypeman/lib/oapi"
1315
"github.com/kernel/hypeman/lib/paths"
1416
"github.com/kernel/hypeman/lib/system"
@@ -137,6 +139,24 @@ type captureCreateManager struct {
137139
lastReq *instances.CreateInstanceRequest
138140
}
139141

142+
type captureForkManager struct {
143+
instances.Manager
144+
lastID string
145+
lastReq *instances.ForkInstanceRequest
146+
result *instances.Instance
147+
err error
148+
}
149+
150+
func (m *captureForkManager) ForkInstance(ctx context.Context, id string, req instances.ForkInstanceRequest) (*instances.Instance, error) {
151+
reqCopy := req
152+
m.lastID = id
153+
m.lastReq = &reqCopy
154+
if m.err != nil {
155+
return nil, m.err
156+
}
157+
return m.result, nil
158+
}
159+
140160
func (m *captureCreateManager) CreateInstance(ctx context.Context, req instances.CreateInstanceRequest) (*instances.Instance, error) {
141161
reqCopy := req
142162
m.lastReq = &reqCopy
@@ -190,6 +210,185 @@ func TestCreateInstance_OmittedHotplugSizeDefaultsToZero(t *testing.T) {
190210
assert.Equal(t, int64(0), int64(hotplugBytes), "response should report zero hotplug_size when omitted")
191211
}
192212

213+
func TestForkInstance_Success(t *testing.T) {
214+
svc := newTestService(t)
215+
216+
now := time.Now()
217+
source := instances.Instance{
218+
StoredMetadata: instances.StoredMetadata{
219+
Id: "src-instance",
220+
Name: "src-instance",
221+
Image: "docker.io/library/alpine:latest",
222+
CreatedAt: now,
223+
HypervisorType: hypervisor.TypeCloudHypervisor,
224+
},
225+
State: instances.StateStopped,
226+
}
227+
228+
forked := &instances.Instance{
229+
StoredMetadata: instances.StoredMetadata{
230+
Id: "forked-instance",
231+
Name: "forked-instance",
232+
Image: "docker.io/library/alpine:latest",
233+
CreatedAt: now,
234+
HypervisorType: hypervisor.TypeCloudHypervisor,
235+
},
236+
State: instances.StateStopped,
237+
}
238+
239+
mockMgr := &captureForkManager{
240+
Manager: svc.InstanceManager,
241+
result: forked,
242+
}
243+
svc.InstanceManager = mockMgr
244+
245+
resp, err := svc.ForkInstance(
246+
mw.WithResolvedInstance(ctx(), source.Id, source),
247+
oapi.ForkInstanceRequestObject{
248+
Id: source.Id,
249+
Body: &oapi.ForkInstanceRequest{
250+
Name: "forked-instance",
251+
},
252+
},
253+
)
254+
require.NoError(t, err)
255+
256+
created, ok := resp.(oapi.ForkInstance201JSONResponse)
257+
require.True(t, ok, "expected 201 response")
258+
assert.Equal(t, "forked-instance", created.Name)
259+
assert.Equal(t, source.Id, mockMgr.lastID)
260+
require.NotNil(t, mockMgr.lastReq)
261+
assert.Equal(t, "forked-instance", mockMgr.lastReq.Name)
262+
assert.False(t, mockMgr.lastReq.FromRunning)
263+
assert.Equal(t, instances.State(""), mockMgr.lastReq.TargetState)
264+
}
265+
266+
func TestForkInstance_NotSupported(t *testing.T) {
267+
svc := newTestService(t)
268+
269+
source := instances.Instance{
270+
StoredMetadata: instances.StoredMetadata{
271+
Id: "src-instance",
272+
Name: "src-instance",
273+
Image: "docker.io/library/alpine:latest",
274+
CreatedAt: time.Now(),
275+
HypervisorType: hypervisor.TypeQEMU,
276+
},
277+
State: instances.StateStopped,
278+
}
279+
280+
mockMgr := &captureForkManager{
281+
Manager: svc.InstanceManager,
282+
err: instances.ErrNotSupported,
283+
}
284+
svc.InstanceManager = mockMgr
285+
286+
resp, err := svc.ForkInstance(
287+
mw.WithResolvedInstance(ctx(), source.Id, source),
288+
oapi.ForkInstanceRequestObject{
289+
Id: source.Id,
290+
Body: &oapi.ForkInstanceRequest{
291+
Name: "forked-instance",
292+
},
293+
},
294+
)
295+
require.NoError(t, err)
296+
297+
notSupported, ok := resp.(oapi.ForkInstance501JSONResponse)
298+
require.True(t, ok, "expected 501 response")
299+
assert.Equal(t, "not_supported", notSupported.Code)
300+
}
301+
302+
func TestForkInstance_InvalidRequest(t *testing.T) {
303+
svc := newTestService(t)
304+
305+
source := instances.Instance{
306+
StoredMetadata: instances.StoredMetadata{
307+
Id: "src-instance",
308+
Name: "src-instance",
309+
Image: "docker.io/library/alpine:latest",
310+
CreatedAt: time.Now(),
311+
HypervisorType: hypervisor.TypeCloudHypervisor,
312+
},
313+
State: instances.StateStopped,
314+
}
315+
316+
mockMgr := &captureForkManager{
317+
Manager: svc.InstanceManager,
318+
err: fmt.Errorf("%w: name is required", instances.ErrInvalidRequest),
319+
}
320+
svc.InstanceManager = mockMgr
321+
322+
resp, err := svc.ForkInstance(
323+
mw.WithResolvedInstance(ctx(), source.Id, source),
324+
oapi.ForkInstanceRequestObject{
325+
Id: source.Id,
326+
Body: &oapi.ForkInstanceRequest{
327+
Name: "",
328+
},
329+
},
330+
)
331+
require.NoError(t, err)
332+
333+
badReq, ok := resp.(oapi.ForkInstance400JSONResponse)
334+
require.True(t, ok, "expected 400 response")
335+
assert.Equal(t, "invalid_request", badReq.Code)
336+
}
337+
338+
func TestForkInstance_FromRunningFlagForwarded(t *testing.T) {
339+
svc := newTestService(t)
340+
341+
now := time.Now()
342+
source := instances.Instance{
343+
StoredMetadata: instances.StoredMetadata{
344+
Id: "src-instance",
345+
Name: "src-instance",
346+
Image: "docker.io/library/alpine:latest",
347+
CreatedAt: now,
348+
HypervisorType: hypervisor.TypeCloudHypervisor,
349+
},
350+
State: instances.StateRunning,
351+
}
352+
353+
forked := &instances.Instance{
354+
StoredMetadata: instances.StoredMetadata{
355+
Id: "forked-instance",
356+
Name: "forked-instance",
357+
Image: "docker.io/library/alpine:latest",
358+
CreatedAt: now,
359+
HypervisorType: hypervisor.TypeCloudHypervisor,
360+
},
361+
State: instances.StateStandby,
362+
}
363+
364+
mockMgr := &captureForkManager{
365+
Manager: svc.InstanceManager,
366+
result: forked,
367+
}
368+
svc.InstanceManager = mockMgr
369+
370+
fromRunning := true
371+
targetState := oapi.ForkTargetStateRunning
372+
resp, err := svc.ForkInstance(
373+
mw.WithResolvedInstance(ctx(), source.Id, source),
374+
oapi.ForkInstanceRequestObject{
375+
Id: source.Id,
376+
Body: &oapi.ForkInstanceRequest{
377+
Name: "forked-instance",
378+
FromRunning: &fromRunning,
379+
TargetState: &targetState,
380+
},
381+
},
382+
)
383+
require.NoError(t, err)
384+
385+
_, ok := resp.(oapi.ForkInstance201JSONResponse)
386+
require.True(t, ok, "expected 201 response")
387+
require.NotNil(t, mockMgr.lastReq)
388+
assert.True(t, mockMgr.lastReq.FromRunning)
389+
assert.Equal(t, instances.StateRunning, mockMgr.lastReq.TargetState)
390+
}
391+
193392
func TestInstanceLifecycle_StopStart(t *testing.T) {
194393
// Require KVM access for VM creation
195394
if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) {

lib/builds/manager_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@ func (m *mockInstanceManager) DeleteInstance(ctx context.Context, id string) err
8484
return nil
8585
}
8686

87+
func (m *mockInstanceManager) ForkInstance(ctx context.Context, id string, req instances.ForkInstanceRequest) (*instances.Instance, error) {
88+
return nil, instances.ErrNotFound
89+
}
90+
8791
func (m *mockInstanceManager) StandbyInstance(ctx context.Context, id string) (*instances.Instance, error) {
8892
return nil, nil
8993
}

lib/forkvm/README.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# VM Forking: Hypervisor Behavior
2+
3+
This document describes hypervisor-specific fork behavior and how fork is made
4+
to work across implementations.
5+
6+
## Common fork model
7+
8+
- **Stopped source**: clone VM data and start a new VM from copied state.
9+
- **Standby source**: clone data + snapshot artifacts, then adapt snapshot
10+
identity for the fork (paths, network, and vsock; the details vary by hypervisor).
11+
- **Running source**: transition source to standby, fork from that standby
12+
snapshot, then restore the source.
13+
14+
For networked forks, the fork gets a fresh host/guest identity (IP, MAC, TAP)
15+
instead of reusing the source identity.
16+
17+
## Cloud Hypervisor
18+
19+
- Snapshot-based forks are supported by rewriting snapshot configuration before
20+
restore.
21+
- Path rewrites are constrained to exact source-directory matches or source-dir
22+
path prefixes to avoid mutating unrelated values.
23+
- Serial log path, vsock socket path, and network fields are updated for the
24+
fork.
25+
- Vsock CID is intentionally kept stable for snapshot restore compatibility.
26+
- Running-source fork works by standby -> fork -> restore source, with source
27+
and fork separated by rewritten runtime endpoints.
28+
29+
## QEMU
30+
31+
- Snapshot-based forks are supported by rewriting QEMU snapshot VM config.
32+
- Rewrites are explicit and path-safe (source-dir exact/prefix replacement),
33+
applied to disk/kernel/initrd/serial/vsock socket paths.
34+
- Kernel arguments are left unchanged (not blanket-rewritten), to avoid
35+
accidental mutation of non-path text.
36+
- Network identity is updated in snapshot config for the fork.
37+
- Vsock CID updates are supported for snapshot state, so running-source fork can
38+
rotate the source CID when needed to avoid a CID collision after restore.
39+
40+
## Firecracker
41+
42+
- Firecracker snapshot restore supports **network overrides** but does not
43+
expose a full snapshot-config rewrite surface for arbitrary embedded paths.
44+
- To make standby/running fork work, fork preparation stores desired network
45+
override data and source->target data-directory mapping.
46+
- During restore, the source data path is temporarily aliased to the fork data
47+
path so embedded snapshot paths resolve for the fork, then aliasing is
48+
cleaned up.
49+
- Network override fields are supplied at snapshot load to bind the fork to its
50+
own TAP device.
51+
- Vsock CID remains stable for snapshot-based flows.
52+
53+
## VZ (Virtualization.framework)
54+
55+
- Fork is not supported.
56+
- Snapshot restore for Linux guests is not available in this mode, so standby
57+
snapshot-based fork mechanics cannot be implemented.
58+
59+
## Operational constraints
60+
61+
- Writable attached volumes are rejected for fork to prevent concurrent
62+
cross-VM writes to the same backing data.
63+
- If a post-fork target-state transition fails, the partially created fork is
64+
cleaned up rather than left orphaned.

0 commit comments

Comments
 (0)