Skip to content

Commit 8c23e9f

Browse files
committed
Model template as an instance state instead of a separate registry
Adds StateTemplate to the instance state machine. A Standby instance is auto-promoted to Template the first time it is forked from its snapshot, and ForkCount is incremented on that first fork and on every subsequent fork. A template can't be woken (restored) while ForkCount > 0; un-promote (Template -> Standby) and delete (Template -> Stopped) are both refused until its forks drain. Fork bookkeeping lives on StoredMetadata (IsTemplate, ForkCount, ForkOfTemplate, plus a reserved HotPagesPath for the prefetch path). Deleting a fork decrements the parent template's ForkCount under the parent's lock; by then the fork's own data has already been deleted, so a failed decrement is only logged and the worst case is refcount drift that a future reconciliation pass fixes. The running-fork flow keeps skipping promotion: it restores the source back to Running afterward, and promotion would leave the source a template with ForkCount > 0, which can't be woken.
1 parent ba4a02d commit 8c23e9f

11 files changed

Lines changed: 600 additions & 290 deletions

File tree

lib/instances/delete.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ func (m *manager) deleteInstance(
3535
stored := &meta.StoredMetadata
3636
log.DebugContext(ctx, "loaded instance", "instance_id", id, "state", inst.State)
3737

38+
if inst.State == StateTemplate && stored.ForkCount > 0 {
39+
return fmt.Errorf("%w: cannot delete template %s with %d live fork(s); delete forks first", ErrInvalidState, id, stored.ForkCount)
40+
}
41+
parentTemplateID := stored.ForkOfTemplate
42+
3843
target, err := m.cancelAndWaitCompressionJob(ctx, m.snapshotJobKeyForInstance(id))
3944
if err != nil {
4045
return fmt.Errorf("wait for instance compression to stop: %w", err)
@@ -136,10 +141,41 @@ func (m *manager) deleteInstance(
136141
return fmt.Errorf("delete instance data: %w", err)
137142
}
138143

144+
if parentTemplateID != "" {
145+
m.decrementTemplateForkCount(ctx, parentTemplateID)
146+
}
147+
139148
log.InfoContext(ctx, "instance deleted successfully", "instance_id", id)
140149
return nil
141150
}
142151

152+
// decrementTemplateForkCount drops the parent template's ForkCount by one under
153+
// the parent's lock. Logs but does not return errors: the fork is already gone,
154+
// so the worst case is refcount drift that a future reconciliation pass fixes.
//
// ctx supplies the logger used for the warnings below. parentID identifies the
// template whose metadata is loaded, decremented, and saved. A ForkCount that
// is already <= 0 is logged and left unchanged rather than driven negative.
155+
func (m *manager) decrementTemplateForkCount(ctx context.Context, parentID string) {
156+
log := logger.FromContext(ctx)
157+
// Hold the parent's lock for the whole load/modify/save sequence.
lock := m.getInstanceLock(parentID)
158+
lock.Lock()
159+
defer lock.Unlock()
160+
161+
parent, err := m.loadMetadata(parentID)
162+
if err != nil {
163+
// Best effort: a parent whose metadata cannot be loaded is only logged.
log.WarnContext(ctx, "failed to load parent template for refcount decrement",
164+
"parent_template_id", parentID, "error", err)
165+
return
166+
}
167+
// Guard against underflow; the count is not touched when it is already <= 0.
if parent.ForkCount <= 0 {
168+
log.WarnContext(ctx, "parent template fork count is non-positive at decrement; leaving as-is",
169+
"parent_template_id", parentID, "fork_count", parent.ForkCount)
170+
return
171+
}
172+
parent.ForkCount--
173+
// Persist the new count; a save failure is logged, not returned.
if err := m.saveMetadata(parent); err != nil {
174+
log.WarnContext(ctx, "failed to save parent template after refcount decrement",
175+
"parent_template_id", parentID, "error", err)
176+
}
177+
}
178+
143179
// killHypervisor force kills the hypervisor process without graceful shutdown
144180
// Used only for delete operations where we're removing all data anyway.
145181
// For operations that need graceful shutdown (like standby), use the hypervisor API directly.

lib/instances/fork.go

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
6666
return nil, "", fmt.Errorf("standby source instance: %w", err)
6767
}
6868

69-
forked, forkErr := m.forkInstanceFromStoppedOrStandby(ctx, id, req, true)
69+
// Running fork is a one-shot clone that restores the source afterward;
70+
// promoting to Template would block the restore (templates can't wake).
71+
forked, forkErr := m.forkInstanceFromStoppedOrStandby(ctx, id, req, true, true)
7072
if forkErr == nil {
7173
if err := m.rotateSourceVsockForRestore(ctx, id, forked.Id); err != nil {
7274
forkErr = fmt.Errorf("prepare source snapshot for restore: %w", err)
@@ -104,14 +106,14 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR
104106
return nil, "", forkErr
105107
}
106108
return forked, targetState, nil
107-
case StateStopped, StateStandby:
108-
forked, err := m.forkInstanceFromStoppedOrStandby(ctx, id, req, false)
109+
case StateStopped, StateStandby, StateTemplate:
110+
forked, err := m.forkInstanceFromStoppedOrStandby(ctx, id, req, false, false)
109111
if err != nil {
110112
return nil, "", err
111113
}
112114
return forked, targetState, nil
113115
default:
114-
return nil, "", fmt.Errorf("%w: cannot fork from state %s (must be Stopped or Standby, or Running with from_running=true)", ErrInvalidState, source.State)
116+
return nil, "", fmt.Errorf("%w: cannot fork from state %s (must be Stopped, Standby, or Template, or Running with from_running=true)", ErrInvalidState, source.State)
115117
}
116118
}
117119

@@ -193,7 +195,7 @@ func generateForkSourceVsockCID(sourceID, forkID string, current int64) int64 {
193195
return cid
194196
}
195197

196-
func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id string, req ForkInstanceRequest, supportValidated bool) (*Instance, error) {
198+
func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id string, req ForkInstanceRequest, supportValidated, skipTemplatePromotion bool) (*Instance, error) {
197199
log := logger.FromContext(ctx)
198200

199201
meta, err := m.loadMetadata(id)
@@ -205,10 +207,10 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
205207
stored := &meta.StoredMetadata
206208

207209
switch source.State {
208-
case StateStopped, StateStandby:
210+
case StateStopped, StateStandby, StateTemplate:
209211
// allowed
210212
default:
211-
return nil, fmt.Errorf("%w: cannot fork from state %s (must be Stopped or Standby)", ErrInvalidState, source.State)
213+
return nil, fmt.Errorf("%w: cannot fork from state %s (must be Stopped, Standby, or Template)", ErrInvalidState, source.State)
212214
}
213215

214216
if !supportValidated {
@@ -250,7 +252,9 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
250252
})
251253
defer cu.Clean()
252254

253-
if source.State == StateStandby {
255+
fromSnapshot := source.State == StateStandby || source.State == StateTemplate
256+
257+
if fromSnapshot {
254258
if err := m.ensureSnapshotMemoryReady(ctx, m.paths.InstanceSnapshotLatest(id), m.snapshotJobKeyForInstance(id), stored.HypervisorType); err != nil {
255259
return nil, fmt.Errorf("prepare standby snapshot for fork: %w", err)
256260
}
@@ -286,17 +290,23 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
286290
// phase (Standby for snapshot forks, Stopped for stopped forks) will be
287291
// recorded by the appropriate operation when the fork is acted on.
288292
forkMeta.Phases.Reset()
289-
switch source.State {
290-
case StateStandby:
293+
if fromSnapshot {
291294
forkMeta.Phases.Record(phasetracking.PhaseStandby, now)
292-
case StateStopped:
295+
} else {
293296
forkMeta.Phases.Record(phasetracking.PhaseStopped, now)
294297
}
295298

299+
// Template-only fields don't carry forward to the fork; the fork is a fresh
300+
// instance regardless of whether the parent is a template.
301+
forkMeta.IsTemplate = false
302+
forkMeta.ForkCount = 0
303+
forkMeta.HotPagesPath = ""
304+
forkMeta.ForkOfTemplate = ""
305+
296306
// Keep the original CID for snapshot-based forks.
297307
// Rewriting CID in restored memory snapshots is not reliable across
298308
// hypervisors.
299-
if source.State == StateStandby {
309+
if fromSnapshot {
300310
forkMeta.VsockCID = stored.VsockCID
301311
} else {
302312
forkMeta.VsockCID = generateVsockCID(forkID)
@@ -309,7 +319,7 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
309319
forkMeta.MAC = ""
310320
}
311321

312-
if source.State == StateStandby {
322+
if fromSnapshot {
313323
snapshotConfigPath := m.paths.InstanceSnapshotConfig(forkID)
314324
netCfg := (*hypervisor.ForkNetworkConfig)(nil)
315325
if forkMeta.NetworkEnabled {
@@ -331,6 +341,25 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
331341
}
332342
}
333343

344+
// Promote source to Template (or bump existing template ForkCount) so the
345+
// snapshot we just cloned can't be woken or deleted while this fork lives.
346+
// Skipped for the running-fork flow, where the source is restored afterward.
347+
if fromSnapshot && !skipTemplatePromotion {
348+
priorIsTemplate := stored.IsTemplate
349+
priorForkCount := stored.ForkCount
350+
stored.IsTemplate = true
351+
stored.ForkCount = priorForkCount + 1
352+
if err := m.saveMetadata(meta); err != nil {
353+
return nil, fmt.Errorf("promote source to template: %w", err)
354+
}
355+
cu.Add(func() {
356+
stored.IsTemplate = priorIsTemplate
357+
stored.ForkCount = priorForkCount
358+
_ = m.saveMetadata(meta)
359+
})
360+
forkMeta.ForkOfTemplate = stored.Id
361+
}
362+
334363
newMeta := &metadata{StoredMetadata: forkMeta}
335364
if err := m.saveMetadata(newMeta); err != nil {
336365
return nil, fmt.Errorf("save fork metadata: %w", err)
@@ -384,6 +413,10 @@ func resolveForkTargetState(requested State, sourceState State) (State, error) {
384413
switch sourceState {
385414
case StateRunning, StateStandby, StateStopped:
386415
return sourceState, nil
416+
case StateTemplate:
417+
// Forks of a template are plain Standby instances; the fork itself
418+
// is never a template.
419+
return StateStandby, nil
387420
default:
388421
return "", fmt.Errorf("%w: cannot derive fork target state from source state %s", ErrInvalidState, sourceState)
389422
}

lib/instances/fork_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ func TestForkInstanceFromStandbyCancelsCompressionJobAndCopiesRawMemory(t *testi
266266
forked, err := manager.forkInstanceFromStoppedOrStandby(ctx, sourceID, ForkInstanceRequest{
267267
Name: "fork-standby-compressed-copy",
268268
TargetState: StateStopped,
269-
}, true)
269+
}, true, false)
270270
require.NoError(t, err)
271271
require.NotNil(t, forked)
272272

lib/instances/manager.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,13 +447,39 @@ func (m *manager) RestoreInstance(ctx context.Context, id string) (*Instance, er
447447
if current.State == StateRunning || current.State == StateInitializing {
448448
return current, nil
449449
}
450+
if current.State == StateTemplate {
451+
if err := m.demoteTemplate(ctx, id); err != nil {
452+
return nil, err
453+
}
454+
}
450455
inst, err := m.restoreInstance(ctx, id)
451456
if err == nil {
452457
m.notifyLifecycleEvent(ctx, LifecycleEventRestore, inst)
453458
}
454459
return inst, err
455460
}
456461

462+
// demoteTemplate un-promotes a Template back to Standby so it can be restored.
463+
// Requires ForkCount==0. Must be called with the instance lock held.
//
// On success the stored metadata has IsTemplate cleared and HotPagesPath
// emptied, and is saved. Returns the loadMetadata error as-is, an
// ErrInvalidState-wrapped error when ForkCount > 0, the CanTransitionTo error
// if Template -> Standby is not a permitted transition, or a wrapped
// saveMetadata error. ctx is not used by the current body.
464+
func (m *manager) demoteTemplate(ctx context.Context, id string) error {
465+
meta, err := m.loadMetadata(id)
466+
if err != nil {
467+
return err
468+
}
469+
// Refuse while any fork still references this template.
if meta.ForkCount > 0 {
470+
return fmt.Errorf("%w: cannot un-promote template %s with %d live fork(s); delete forks first", ErrInvalidState, id, meta.ForkCount)
471+
}
472+
// Check the move against the state machine's transition rules before mutating.
if err := StateTemplate.CanTransitionTo(StateStandby); err != nil {
473+
return err
474+
}
475+
// Clear the template-only fields before persisting.
meta.IsTemplate = false
476+
meta.HotPagesPath = ""
477+
if err := m.saveMetadata(meta); err != nil {
478+
return fmt.Errorf("save metadata after template un-promote: %w", err)
479+
}
480+
return nil
481+
}
482+
457483
func (m *manager) RestoreSnapshot(ctx context.Context, id string, snapshotID string, req RestoreSnapshotRequest) (*Instance, error) {
458484
lock := m.getInstanceLock(id)
459485
lock.Lock()

lib/instances/manager_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1497,12 +1497,17 @@ func TestStateTransitions(t *testing.T) {
14971497
{"Standby to Paused", StateStandby, StatePaused, false},
14981498
{"Shutdown to Stopped", StateShutdown, StateStopped, false},
14991499
{"Standby to Stopped", StateStandby, StateStopped, false},
1500+
{"Standby to Template", StateStandby, StateTemplate, false},
1501+
{"Template to Standby", StateTemplate, StateStandby, false},
1502+
{"Template to Stopped", StateTemplate, StateStopped, false},
15001503
// Invalid transitions
15011504
{"Running to Standby", StateRunning, StateStandby, true},
15021505
{"Stopped to Running", StateStopped, StateRunning, true},
15031506
{"Stopped to Initializing", StateStopped, StateInitializing, true},
15041507
{"Standby to Running", StateStandby, StateRunning, true},
15051508
{"Initializing to Paused", StateInitializing, StatePaused, true},
1509+
{"Template to Running", StateTemplate, StateRunning, true},
1510+
{"Template to Paused", StateTemplate, StatePaused, true},
15061511
}
15071512

15081513
for _, tt := range tests {

lib/instances/query.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,12 @@ func (m *manager) deriveStateWithOptions(ctx context.Context, stored *StoredMeta
7272

7373
// 1. Check if socket exists
7474
if _, err := os.Stat(stored.SocketPath); err != nil {
75-
// No socket - check for snapshot to distinguish Stopped vs Standby
75+
// No socket - check for snapshot to distinguish Stopped vs Standby/Template
7676
m.invalidateCachedHypervisorState(stored.Id)
7777
if m.hasSnapshot(stored.DataDir) {
78+
if stored.IsTemplate {
79+
return stateResult{State: StateTemplate}
80+
}
7881
return stateResult{State: StateStandby}
7982
}
8083
return stateResult{State: StateStopped}

lib/instances/state.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,13 @@ var ValidTransitions = map[State][]State{
3434
StateCreated, // start VMM process
3535
},
3636
StateStandby: {
37-
StatePaused, // start VMM + restore (atomic operation)
38-
StateStopped, // delete snapshot + cleanup (terminal)
37+
StatePaused, // start VMM + restore (atomic operation)
38+
StateStopped, // delete snapshot + cleanup (terminal)
39+
StateTemplate, // promote to fork-only parent (gated by fork intent)
40+
},
41+
StateTemplate: {
42+
StateStandby, // un-promote (allowed only when ForkCount==0)
43+
StateStopped, // delete (allowed only when ForkCount==0)
3944
},
4045
// StateUnknown means we failed to determine state - no transitions allowed.
4146
// Operations on instances in Unknown state should fail with an error
@@ -75,7 +80,7 @@ func (s State) RequiresVMM() bool {
7580
switch s {
7681
case StateCreated, StateInitializing, StateRunning, StatePaused, StateShutdown:
7782
return true
78-
case StateStopped, StateStandby, StateUnknown:
83+
case StateStopped, StateStandby, StateTemplate, StateUnknown:
7984
return false
8085
default:
8186
return false

0 commit comments

Comments
 (0)