Skip to content

Commit 2b58e51

Browse files
authored
Preserve snapshot fork restore paths (#239)
* Add Firecracker warm fork chain regression
* Preserve Firecracker snapshot source aliases
* Test warm fork chains on QEMU and Cloud Hypervisor
* Persist QEMU restore config
* Skip source CID rotation for stopped forks
* Refresh stopped fork vsock CID
* Return Firecracker source stat errors

---------

Co-authored-by: sjmiller609 <7516283+sjmiller609@users.noreply.github.com>
1 parent 1b7d0b7 commit 2b58e51

9 files changed

Lines changed: 386 additions & 7 deletions

File tree

lib/hypervisor/firecracker/config.go

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -101,8 +101,9 @@ type instanceInfo struct {
101101
}
102102

103103
type restoreMetadata struct {
104-
NetworkOverrides []networkOverride `json:"network_overrides,omitempty"`
105-
SnapshotSourceDataDir string `json:"snapshot_source_data_dir,omitempty"`
104+
NetworkOverrides []networkOverride `json:"network_overrides,omitempty"`
105+
SnapshotSourceDataDir string `json:"snapshot_source_data_dir,omitempty"`
106+
RetainSnapshotSourceDataDirAlias bool `json:"retain_snapshot_source_data_dir_alias,omitempty"`
106107
}
107108

108109
func toBootSource(cfg hypervisor.VMConfig) bootSource {

lib/hypervisor/firecracker/fork.go

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@ package firecracker
22

33
import (
44
"context"
5+
"fmt"
6+
"os"
57
"path/filepath"
68

79
"github.com/kernel/hypeman/lib/hypervisor"
@@ -48,9 +50,26 @@ func (s *Starter) PrepareFork(ctx context.Context, req hypervisor.ForkPrepareReq
4850
}
4951
}
5052
if req.SourceDataDir != "" && req.TargetDataDir != "" && req.SourceDataDir != req.TargetDataDir {
51-
if meta.SnapshotSourceDataDir != req.SourceDataDir {
52-
meta.SnapshotSourceDataDir = req.SourceDataDir
53-
changed = true
53+
if meta.RetainSnapshotSourceDataDirAlias && meta.SnapshotSourceDataDir != "" {
54+
// Keep the upstream source path for snapshot-derived forks. The retained
55+
// Firecracker base can still reference that path after later diff snapshots.
56+
} else {
57+
retainAlias := false
58+
if _, err := os.Stat(req.SourceDataDir); err != nil {
59+
if os.IsNotExist(err) {
60+
retainAlias = true
61+
} else {
62+
return hypervisor.ForkPrepareResult{}, fmt.Errorf("stat snapshot source data dir %q: %w", req.SourceDataDir, err)
63+
}
64+
}
65+
if meta.SnapshotSourceDataDir != req.SourceDataDir {
66+
meta.SnapshotSourceDataDir = req.SourceDataDir
67+
changed = true
68+
}
69+
if meta.RetainSnapshotSourceDataDirAlias != retainAlias {
70+
meta.RetainSnapshotSourceDataDirAlias = retainAlias
71+
changed = true
72+
}
5473
}
5574
}
5675

lib/hypervisor/firecracker/fork_test.go

Lines changed: 96 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,4 +40,100 @@ func TestPrepareFork_SnapshotRewritePersistsRestoreMetadata(t *testing.T) {
4040
require.Len(t, meta.NetworkOverrides, 1)
4141
assert.Equal(t, "tap-new", meta.NetworkOverrides[0].HostDevName)
4242
assert.Equal(t, filepath.Join(tmp, "source"), meta.SnapshotSourceDataDir)
43+
assert.True(t, meta.RetainSnapshotSourceDataDirAlias)
44+
}
45+
46+
func TestPrepareFork_DoesNotRetainExistingSourceAlias(t *testing.T) {
47+
starter := NewStarter()
48+
tmp := t.TempDir()
49+
sourceDir := filepath.Join(tmp, "source")
50+
targetDir := filepath.Join(tmp, "target")
51+
require.NoError(t, os.MkdirAll(sourceDir, 0755))
52+
require.NoError(t, os.MkdirAll(targetDir, 0755))
53+
require.NoError(t, saveRestoreMetadata(targetDir, nil))
54+
55+
_, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{
56+
SnapshotConfigPath: filepath.Join(targetDir, "snapshots", "snapshot-latest", "config.json"),
57+
SourceDataDir: sourceDir,
58+
TargetDataDir: targetDir,
59+
})
60+
require.NoError(t, err)
61+
62+
meta, err := loadRestoreMetadata(targetDir)
63+
require.NoError(t, err)
64+
assert.Equal(t, sourceDir, meta.SnapshotSourceDataDir)
65+
assert.False(t, meta.RetainSnapshotSourceDataDirAlias)
66+
}
67+
68+
func TestPrepareFork_ReturnsSourceStatErrors(t *testing.T) {
69+
starter := NewStarter()
70+
tmp := t.TempDir()
71+
targetDir := filepath.Join(tmp, "target")
72+
require.NoError(t, os.MkdirAll(targetDir, 0755))
73+
require.NoError(t, saveRestoreMetadata(targetDir, nil))
74+
75+
_, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{
76+
SnapshotConfigPath: filepath.Join(targetDir, "snapshots", "snapshot-latest", "config.json"),
77+
SourceDataDir: filepath.Join(tmp, "source") + "\x00",
78+
TargetDataDir: targetDir,
79+
})
80+
require.Error(t, err)
81+
assert.Contains(t, err.Error(), "stat snapshot source data dir")
82+
}
83+
84+
func TestPrepareFork_PreservesRetainedUpstreamAlias(t *testing.T) {
85+
starter := NewStarter()
86+
tmp := t.TempDir()
87+
upstreamDir := filepath.Join(tmp, "upstream")
88+
sourceDir := filepath.Join(tmp, "source")
89+
targetDir := filepath.Join(tmp, "target")
90+
require.NoError(t, os.MkdirAll(sourceDir, 0755))
91+
require.NoError(t, os.MkdirAll(targetDir, 0755))
92+
require.NoError(t, saveRestoreMetadataState(targetDir, &restoreMetadata{
93+
SnapshotSourceDataDir: upstreamDir,
94+
RetainSnapshotSourceDataDirAlias: true,
95+
}))
96+
97+
_, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{
98+
SnapshotConfigPath: filepath.Join(targetDir, "snapshots", "snapshot-latest", "config.json"),
99+
SourceDataDir: sourceDir,
100+
TargetDataDir: targetDir,
101+
})
102+
require.NoError(t, err)
103+
104+
meta, err := loadRestoreMetadata(targetDir)
105+
require.NoError(t, err)
106+
assert.Equal(t, upstreamDir, meta.SnapshotSourceDataDir)
107+
assert.True(t, meta.RetainSnapshotSourceDataDirAlias)
108+
}
109+
110+
func TestPrepareFork_NetworkRewritePreservesRetainedAlias(t *testing.T) {
111+
starter := NewStarter()
112+
tmp := t.TempDir()
113+
upstreamDir := filepath.Join(tmp, "upstream")
114+
targetDir := filepath.Join(tmp, "target")
115+
require.NoError(t, os.MkdirAll(targetDir, 0755))
116+
require.NoError(t, saveRestoreMetadataState(targetDir, &restoreMetadata{
117+
SnapshotSourceDataDir: upstreamDir,
118+
RetainSnapshotSourceDataDirAlias: true,
119+
NetworkOverrides: []networkOverride{{
120+
IfaceID: "eth0",
121+
HostDevName: "tap-old",
122+
}},
123+
}))
124+
125+
_, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{
126+
SnapshotConfigPath: filepath.Join(targetDir, "snapshots", "snapshot-latest", "config.json"),
127+
Network: &hypervisor.ForkNetworkConfig{
128+
TAPDevice: "tap-new",
129+
},
130+
})
131+
require.NoError(t, err)
132+
133+
meta, err := loadRestoreMetadata(targetDir)
134+
require.NoError(t, err)
135+
require.Len(t, meta.NetworkOverrides, 1)
136+
assert.Equal(t, "tap-new", meta.NetworkOverrides[0].HostDevName)
137+
assert.Equal(t, upstreamDir, meta.SnapshotSourceDataDir)
138+
assert.True(t, meta.RetainSnapshotSourceDataDirAlias)
43139
}

lib/hypervisor/firecracker/process.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -125,7 +125,7 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
125125
if err != nil {
126126
return 0, nil, fmt.Errorf("load firecracker snapshot: %w", err)
127127
}
128-
if meta.SnapshotSourceDataDir != "" {
128+
if meta.SnapshotSourceDataDir != "" && !meta.RetainSnapshotSourceDataDirAlias {
129129
meta.SnapshotSourceDataDir = ""
130130
if err := saveRestoreMetadataState(filepath.Dir(socketPath), meta); err != nil {
131131
return 0, nil, fmt.Errorf("clear firecracker snapshot source alias metadata: %w", err)

lib/hypervisor/qemu/process.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -466,6 +466,10 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string,
466466
}
467467
log.DebugContext(ctx, "VM ready", "duration_ms", time.Since(migrationWaitStart).Milliseconds())
468468

469+
if err := saveVMConfig(filepath.Dir(socketPath), config); err != nil {
470+
return 0, nil, fmt.Errorf("save restored vm config: %w", err)
471+
}
472+
469473
cu.Release()
470474
log.DebugContext(ctx, "QEMU restore complete", "pid", pid, "total_duration_ms", time.Since(startTime).Milliseconds())
471475
return pid, hv, nil

lib/instances/firecracker_test.go

Lines changed: 90 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ import (
2525
"github.com/kernel/hypeman/lib/network"
2626
"github.com/kernel/hypeman/lib/paths"
2727
"github.com/kernel/hypeman/lib/resources"
28+
snapshottest "github.com/kernel/hypeman/lib/snapshot/testsupport"
2829
"github.com/kernel/hypeman/lib/system"
2930
"github.com/kernel/hypeman/lib/volumes"
3031
"github.com/stretchr/testify/assert"
@@ -557,6 +558,95 @@ func TestFirecrackerSnapshotFeature(t *testing.T) {
557558
})
558559
}
559560

561+
func TestFirecrackerWarmForkChain(t *testing.T) {
562+
t.Parallel()
563+
requireFirecrackerIntegrationPrereqs(t)
564+
565+
mgr, tmpDir := setupTestManagerForFirecrackerNoNetwork(t)
566+
ctx := context.Background()
567+
p := paths.New(tmpDir)
568+
569+
imageManager, err := images.NewManager(p, 1, nil)
570+
require.NoError(t, err)
571+
imageName := integrationTestImageRef(t, "docker.io/library/alpine:latest")
572+
snapshottest.EnsureImageReady(t, ctx, p, imageManager, imageName)
573+
574+
systemManager := system.NewManager(p)
575+
require.NoError(t, systemManager.EnsureSystemFiles(ctx))
576+
577+
source, err := mgr.CreateInstance(ctx, CreateInstanceRequest{
578+
Name: "fc-warm-chain-src",
579+
Image: imageName,
580+
Size: 1024 * 1024 * 1024,
581+
OverlaySize: 1024 * 1024 * 1024,
582+
Vcpus: 1,
583+
NetworkEnabled: false,
584+
Hypervisor: hypervisor.TypeFirecracker,
585+
Cmd: []string{"sleep", "infinity"},
586+
})
587+
require.NoError(t, err)
588+
sourceID := source.Id
589+
sourceDeleted := false
590+
t.Cleanup(func() {
591+
if !sourceDeleted {
592+
_ = mgr.DeleteInstance(context.Background(), sourceID)
593+
}
594+
})
595+
596+
source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, integrationTestTimeout(20*time.Second))
597+
require.NoError(t, err)
598+
require.NoError(t, waitForExecAgent(ctx, mgr, sourceID, 30*time.Second))
599+
600+
snapshot, err := mgr.CreateSnapshot(ctx, sourceID, CreateSnapshotRequest{
601+
Kind: SnapshotKindStandby,
602+
Name: "fc-warm-chain-snap",
603+
})
604+
require.NoError(t, err)
605+
require.Equal(t, SnapshotKindStandby, snapshot.Kind)
606+
607+
require.NoError(t, mgr.DeleteInstance(ctx, sourceID))
608+
sourceDeleted = true
609+
610+
warm, err := mgr.ForkSnapshot(ctx, snapshot.Id, ForkSnapshotRequest{
611+
Name: "fc-warm-chain-warm",
612+
TargetState: StateRunning,
613+
})
614+
require.NoError(t, err)
615+
warmID := warm.Id
616+
warmDeleted := false
617+
t.Cleanup(func() {
618+
if !warmDeleted {
619+
_ = mgr.DeleteInstance(context.Background(), warmID)
620+
}
621+
})
622+
warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, integrationTestTimeout(20*time.Second))
623+
require.NoError(t, err)
624+
require.NoError(t, waitForExecAgent(ctx, mgr, warmID, 30*time.Second))
625+
626+
child, err := mgr.ForkInstance(ctx, warmID, ForkInstanceRequest{
627+
Name: "fc-warm-chain-child",
628+
FromRunning: true,
629+
TargetState: StateStopped,
630+
})
631+
require.NoError(t, err)
632+
require.Equal(t, StateStopped, child.State)
633+
childID := child.Id
634+
t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), childID) })
635+
636+
warm, err = mgr.GetInstance(ctx, warmID)
637+
require.NoError(t, err)
638+
if warm.State != StateRunning {
639+
warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, integrationTestTimeout(20*time.Second))
640+
require.NoError(t, err)
641+
}
642+
require.Equal(t, StateRunning, warm.State)
643+
require.NoError(t, waitForExecAgent(ctx, mgr, warmID, 30*time.Second))
644+
645+
require.NoError(t, mgr.DeleteInstance(ctx, warmID))
646+
warmDeleted = true
647+
require.NoError(t, mgr.DeleteSnapshot(ctx, snapshot.Id))
648+
}
649+
560650
// TestFirecrackerForkIsolation verifies CoW isolation between a firecracker
561651
// source's standby snapshot and a fork derived from it. A fork must end up
562652
// with its own mem-file inode (reflink-cloned, not hardlinked) so that

lib/instances/fork.go

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
6868
}
6969

7070
forked, forkErr := m.forkInstanceFromStoppedOrStandby(ctx, id, req, true)
71-
if forkErr == nil {
71+
if forkErr == nil && targetState != StateStopped {
7272
if err := m.rotateSourceVsockForRestore(ctx, id, forked.Id); err != nil {
7373
forkErr = fmt.Errorf("prepare source snapshot for restore: %w", err)
7474
if cleanupErr := m.cleanupForkInstanceOnError(ctx, forked.Id); cleanupErr != nil {
@@ -437,6 +437,14 @@ func (m *manager) applyForkTargetState(ctx context.Context, forkID string, targe
437437
if err := os.RemoveAll(m.paths.InstanceSnapshotLatest(forkID)); err != nil {
438438
return nil, fmt.Errorf("remove fork snapshot: %w", err)
439439
}
440+
meta, err := m.loadMetadata(forkID)
441+
if err != nil {
442+
return nil, fmt.Errorf("load stopped fork metadata: %w", err)
443+
}
444+
meta.StoredMetadata.VsockCID = generateVsockCID(forkID)
445+
if err := m.saveMetadata(meta); err != nil {
446+
return nil, fmt.Errorf("save stopped fork metadata: %w", err)
447+
}
440448
return returnWithReadiness(m.getInstance(ctx, forkID))
441449
}
442450
case StateRunning:

0 commit comments

Comments
 (0)