Skip to content

Commit 4acbfdc

Browse files
authored
Add instance restart policy (#233)
* Add instance restart policy * Integrate health checks with restart policy * Simplify restart policy controller * Add restart policy network integration coverage * Expect restart policy lifecycle metrics * Update instance patch validation test * Fix restart policy event reconciliation * Preserve manual restart stop status * Address restart policy review comments * Preserve restart reason during attempts * Clear restart status before start no-op * Recheck state before policy restart * Drop stale restart strategy test fields --------- Co-authored-by: sjmiller609 <7516283+sjmiller609@users.noreply.github.com>
1 parent 9db1f75 commit 4acbfdc

24 files changed

Lines changed: 1820 additions & 300 deletions

cmd/api/api/instances.go

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,13 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
293293
Message: err.Error(),
294294
}, nil
295295
}
296+
restartPolicy, err := toDomainRestartPolicy(request.Body.RestartPolicy)
297+
if err != nil {
298+
return oapi.CreateInstance400JSONResponse{
299+
Code: "invalid_restart_policy",
300+
Message: err.Error(),
301+
}, nil
302+
}
296303

297304
domainReq := instances.CreateInstanceRequest{
298305
Name: request.Body.Name,
@@ -319,6 +326,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
319326
SkipGuestAgent: request.Body.SkipGuestAgent != nil && *request.Body.SkipGuestAgent,
320327
AutoStandby: autoStandby,
321328
HealthCheck: healthCheck,
329+
RestartPolicy: restartPolicy,
322330
}
323331
if request.Body.SnapshotPolicy != nil {
324332
snapshotPolicy, err := toInstanceSnapshotPolicy(*request.Body.SnapshotPolicy)
@@ -970,11 +978,20 @@ func (s *ApiService) UpdateInstance(ctx context.Context, request oapi.UpdateInst
970978
Message: err.Error(),
971979
}, nil
972980
}
981+
restartPolicy, err := toDomainRestartPolicy(request.Body.RestartPolicy)
982+
if err != nil {
983+
return oapi.UpdateInstance400JSONResponse{
984+
Code: "invalid_restart_policy",
985+
Message: err.Error(),
986+
}, nil
987+
}
973988

974989
result, err := s.InstanceManager.UpdateInstance(ctx, inst.Id, instances.UpdateInstanceRequest{
975-
Env: env,
976-
AutoStandby: autoStandby,
977-
HealthCheck: healthCheck,
990+
Env: env,
991+
AutoStandby: autoStandby,
992+
HealthCheck: healthCheck,
993+
RestartPolicy: restartPolicy,
994+
RestartPolicySet: request.Body.RestartPolicy != nil,
978995
})
979996
if err != nil {
980997
switch {
@@ -1108,6 +1125,8 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance {
11081125
oapiInst.AutoStandby = toOAPIAutoStandbyPolicy(inst.AutoStandby)
11091126
oapiInst.HealthCheck = toOAPIHealthCheck(inst.HealthCheck)
11101127
oapiInst.HealthStatus = toOAPIHealthStatus(healthcheck.Snapshot(inst.HealthCheck, string(inst.State), inst.HealthCheckRuntime))
1128+
oapiInst.RestartPolicy = toOAPIRestartPolicy(inst.RestartPolicy)
1129+
oapiInst.RestartStatus = toOAPIRestartStatus(inst.RestartStatus)
11111130

11121131
// Convert volume attachments
11131132
if len(inst.Volumes) > 0 {

cmd/api/api/instances_test.go

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
mw "github.com/kernel/hypeman/lib/middleware"
1717
"github.com/kernel/hypeman/lib/oapi"
1818
"github.com/kernel/hypeman/lib/paths"
19+
restartpolicy "github.com/kernel/hypeman/lib/restart-policy"
1920
"github.com/kernel/hypeman/lib/system"
2021
"github.com/stretchr/testify/assert"
2122
"github.com/stretchr/testify/require"
@@ -281,6 +282,7 @@ func (m *captureUpdateManager) UpdateInstance(ctx context.Context, id string, re
281282
Env: req.Env,
282283
AutoStandby: req.AutoStandby,
283284
HealthCheck: req.HealthCheck,
285+
RestartPolicy: req.RestartPolicy,
284286
CreatedAt: now,
285287
HypervisorType: hypervisor.TypeCloudHypervisor,
286288
},
@@ -304,6 +306,7 @@ func (m *captureCreateManager) CreateInstance(ctx context.Context, req instances
304306
Vcpus: req.Vcpus,
305307
AutoStandby: req.AutoStandby,
306308
HealthCheck: req.HealthCheck,
309+
RestartPolicy: req.RestartPolicy,
307310
CreatedAt: now,
308311
HypervisorType: hypervisor.TypeCloudHypervisor,
309312
},
@@ -705,6 +708,48 @@ func TestCreateInstance_MapsHealthCheckPolicy(t *testing.T) {
705708
assert.Equal(t, oapi.InstanceHealthStatusStatusStarting, instance.HealthStatus.Status)
706709
}
707710

711+
func TestCreateInstance_MapsRestartPolicy(t *testing.T) {
712+
t.Parallel()
713+
714+
svc := newTestService(t)
715+
origMgr := svc.InstanceManager
716+
mockMgr := &captureCreateManager{Manager: origMgr}
717+
svc.InstanceManager = mockMgr
718+
719+
policy := oapi.OnFailure
720+
backoff := "7s"
721+
stableAfter := "2m"
722+
maxAttempts := 4
723+
724+
resp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{
725+
Body: &oapi.CreateInstanceRequest{
726+
Name: "test-restart-policy",
727+
Image: "docker.io/library/alpine:latest",
728+
RestartPolicy: &oapi.RestartPolicy{
729+
Policy: &policy,
730+
Backoff: &backoff,
731+
StableAfter: &stableAfter,
732+
MaxAttempts: &maxAttempts,
733+
},
734+
},
735+
})
736+
require.NoError(t, err)
737+
738+
created, ok := resp.(oapi.CreateInstance201JSONResponse)
739+
require.True(t, ok, "expected 201 response")
740+
require.NotNil(t, mockMgr.lastReq)
741+
require.NotNil(t, mockMgr.lastReq.RestartPolicy)
742+
assert.Equal(t, restartpolicy.PolicyOnFailure, mockMgr.lastReq.RestartPolicy.Policy)
743+
assert.Equal(t, "7s", mockMgr.lastReq.RestartPolicy.Backoff)
744+
assert.Equal(t, "2m0s", mockMgr.lastReq.RestartPolicy.StableAfter)
745+
assert.Equal(t, 4, mockMgr.lastReq.RestartPolicy.MaxAttempts)
746+
747+
instance := oapi.Instance(created)
748+
require.NotNil(t, instance.RestartPolicy)
749+
require.NotNil(t, instance.RestartPolicy.Policy)
750+
assert.Equal(t, oapi.OnFailure, *instance.RestartPolicy.Policy)
751+
}
752+
708753
func TestUpdateInstance_MapsEnvPatch(t *testing.T) {
709754
t.Parallel()
710755
svc := newTestService(t)
@@ -883,6 +928,108 @@ func TestUpdateInstance_MapsHealthCheckPatch(t *testing.T) {
883928
assert.Equal(t, oapi.InstanceHealthStatusStatusUnknown, instance.HealthStatus.Status)
884929
}
885930

931+
func TestUpdateInstance_MapsRestartPolicyPatch(t *testing.T) {
932+
t.Parallel()
933+
svc := newTestService(t)
934+
935+
origMgr := svc.InstanceManager
936+
now := time.Now()
937+
mockMgr := &captureUpdateManager{
938+
Manager: origMgr,
939+
result: &instances.Instance{
940+
StoredMetadata: instances.StoredMetadata{
941+
Id: "inst-update-restart-policy",
942+
Name: "inst-update-restart-policy",
943+
Image: "docker.io/library/alpine:latest",
944+
CreatedAt: now,
945+
HypervisorType: hypervisor.TypeCloudHypervisor,
946+
RestartPolicy: &restartpolicy.Policy{
947+
Policy: restartpolicy.PolicyAlways,
948+
Backoff: "5s",
949+
StableAfter: "10m0s",
950+
},
951+
RestartStatus: restartpolicy.Status{
952+
BlockedReason: restartpolicy.BlockedReasonManualStop,
953+
},
954+
},
955+
State: instances.StateStopped,
956+
},
957+
}
958+
svc.InstanceManager = mockMgr
959+
960+
policy := oapi.Always
961+
resolved := &instances.Instance{
962+
StoredMetadata: instances.StoredMetadata{
963+
Id: "inst-update-restart-policy",
964+
Name: "inst-update-restart-policy",
965+
Image: "docker.io/library/alpine:latest",
966+
CreatedAt: now,
967+
HypervisorType: hypervisor.TypeCloudHypervisor,
968+
},
969+
State: instances.StateStopped,
970+
}
971+
972+
resp, err := svc.UpdateInstance(mw.WithResolvedInstance(ctx(), resolved.Id, resolved), oapi.UpdateInstanceRequestObject{
973+
Id: resolved.Id,
974+
Body: &oapi.UpdateInstanceRequest{
975+
RestartPolicy: &oapi.RestartPolicy{Policy: &policy},
976+
},
977+
})
978+
require.NoError(t, err)
979+
updated, ok := resp.(oapi.UpdateInstance200JSONResponse)
980+
require.True(t, ok, "expected 200 response")
981+
982+
require.NotNil(t, mockMgr.lastReq)
983+
assert.True(t, mockMgr.lastReq.RestartPolicySet)
984+
require.NotNil(t, mockMgr.lastReq.RestartPolicy)
985+
assert.Equal(t, restartpolicy.PolicyAlways, mockMgr.lastReq.RestartPolicy.Policy)
986+
987+
instance := oapi.Instance(updated)
988+
require.NotNil(t, instance.RestartPolicy)
989+
require.NotNil(t, instance.RestartStatus)
990+
require.NotNil(t, instance.RestartStatus.BlockedReason)
991+
assert.Equal(t, oapi.ManualStop, *instance.RestartStatus.BlockedReason)
992+
}
993+
994+
func TestUpdateInstance_RejectsInvalidRestartPolicy(t *testing.T) {
995+
t.Parallel()
996+
svc := newTestService(t)
997+
998+
origMgr := svc.InstanceManager
999+
mockMgr := &captureUpdateManager{Manager: origMgr}
1000+
svc.InstanceManager = mockMgr
1001+
1002+
now := time.Now()
1003+
resolved := &instances.Instance{
1004+
StoredMetadata: instances.StoredMetadata{
1005+
Id: "inst-update-restart-policy",
1006+
Name: "inst-update-restart-policy",
1007+
Image: "docker.io/library/alpine:latest",
1008+
CreatedAt: now,
1009+
HypervisorType: hypervisor.TypeCloudHypervisor,
1010+
},
1011+
State: instances.StateStopped,
1012+
}
1013+
policy := oapi.OnFailure
1014+
backoff := "0s"
1015+
1016+
resp, err := svc.UpdateInstance(mw.WithResolvedInstance(ctx(), resolved.Id, resolved), oapi.UpdateInstanceRequestObject{
1017+
Id: resolved.Id,
1018+
Body: &oapi.UpdateInstanceRequest{
1019+
RestartPolicy: &oapi.RestartPolicy{
1020+
Policy: &policy,
1021+
Backoff: &backoff,
1022+
},
1023+
},
1024+
})
1025+
require.NoError(t, err)
1026+
1027+
badReq, ok := resp.(oapi.UpdateInstance400JSONResponse)
1028+
require.True(t, ok, "expected 400 response")
1029+
assert.Equal(t, "invalid_restart_policy", badReq.Code)
1030+
assert.Nil(t, mockMgr.lastReq)
1031+
}
1032+
8861033
func TestUpdateInstance_RejectsZeroAutoStandbyIgnoreDestinationPort(t *testing.T) {
8871034
t.Parallel()
8881035
svc := newTestService(t)

cmd/api/api/restart_policy.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
package api
2+
3+
import (
4+
"github.com/kernel/hypeman/lib/oapi"
5+
restartpolicy "github.com/kernel/hypeman/lib/restart-policy"
6+
"github.com/samber/lo"
7+
)
8+
9+
func toDomainRestartPolicy(policy *oapi.RestartPolicy) (*restartpolicy.Policy, error) {
10+
if policy == nil {
11+
return nil, nil
12+
}
13+
14+
out := &restartpolicy.Policy{}
15+
if policy.Policy != nil {
16+
out.Policy = restartpolicy.PolicyMode(*policy.Policy)
17+
}
18+
if policy.Backoff != nil {
19+
out.Backoff = *policy.Backoff
20+
}
21+
if policy.MaxAttempts != nil {
22+
out.MaxAttempts = *policy.MaxAttempts
23+
}
24+
if policy.StableAfter != nil {
25+
out.StableAfter = *policy.StableAfter
26+
}
27+
normalized, err := restartpolicy.NormalizePolicy(out)
28+
if err != nil {
29+
return nil, err
30+
}
31+
return normalized, nil
32+
}
33+
34+
func toOAPIRestartPolicy(policy *restartpolicy.Policy) *oapi.RestartPolicy {
35+
if policy == nil {
36+
return nil
37+
}
38+
39+
mode := oapi.RestartPolicyPolicy(policy.Policy)
40+
out := &oapi.RestartPolicy{
41+
Policy: &mode,
42+
}
43+
if policy.Backoff != "" {
44+
out.Backoff = lo.ToPtr(policy.Backoff)
45+
}
46+
if policy.MaxAttempts > 0 {
47+
out.MaxAttempts = lo.ToPtr(policy.MaxAttempts)
48+
}
49+
if policy.StableAfter != "" {
50+
out.StableAfter = lo.ToPtr(policy.StableAfter)
51+
}
52+
return out
53+
}
54+
55+
func toOAPIRestartStatus(status restartpolicy.Status) *oapi.RestartStatus {
56+
if status.IsZero() {
57+
return nil
58+
}
59+
60+
out := &oapi.RestartStatus{
61+
Attempts: lo.ToPtr(status.Attempts),
62+
}
63+
if status.BlockedReason != "" {
64+
reason := oapi.RestartStatusBlockedReason(status.BlockedReason)
65+
out.BlockedReason = &reason
66+
}
67+
if status.LastAttemptAt != nil {
68+
lastAttemptAt := status.LastAttemptAt.UTC()
69+
out.LastAttemptAt = &lastAttemptAt
70+
}
71+
if status.NextAttemptAt != nil {
72+
nextAttemptAt := status.NextAttemptAt.UTC()
73+
out.NextAttemptAt = &nextAttemptAt
74+
}
75+
if status.LastReason != "" {
76+
reason := oapi.RestartStatusLastReason(status.LastReason)
77+
out.LastReason = &reason
78+
}
79+
return out
80+
}

cmd/api/main.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,14 @@ func run() error {
571571
return app.HealthCheckController.Run(gctx)
572572
})
573573
}
574+
if restartController, ok := app.InstanceManager.(interface {
575+
StartRestartPolicyController(context.Context) error
576+
}); ok {
577+
grp.Go(func() error {
578+
logger.Info("starting restart policy controller")
579+
return restartController.StartRestartPolicyController(gctx)
580+
})
581+
}
574582

575583
// Run the server
576584
grp.Go(func() error {

lib/healthcheck/README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ Stopping, deleting, standing by, or restoring an instance stops active checks. S
7575

7676
## Restart Policy
7777

78-
Health checks only report health. They do not restart instances.
78+
Health checks do not restart instances by themselves.
7979

80-
If Hypeman later adds restart-on-unhealthy behavior, it should consume `health_status=unhealthy` explicitly rather than making health checks mutate lifecycle state.
80+
When an instance also has `restart_policy.policy=on_failure` or `restart_policy.policy=always`, an `unhealthy` health status becomes a restart-policy failure signal. The restart policy applies its normal backoff, max attempts, manual-stop suppression, and stable-window reset before Hypeman restarts the whole instance.
81+
82+
With `restart_policy.policy=never` or no restart policy, health checks only report status.
83+
84+
Health checks still do not mutate lifecycle state directly. The instance remains `Running` while unhealthy until restart policy chooses to stop and start it.

lib/instances/create.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ func (m *manager) createInstance(
375375
SnapshotPolicy: cloneSnapshotPolicy(req.SnapshotPolicy),
376376
AutoStandby: cloneAutoStandbyPolicy(req.AutoStandby),
377377
HealthCheck: cloneHealthCheckPolicy(req.HealthCheck),
378+
RestartPolicy: cloneRestartPolicy(req.RestartPolicy),
378379
}
379380

380381
// 12. Ensure directories
@@ -638,6 +639,11 @@ func validateCreateRequest(req *CreateInstanceRequest) error {
638639
if err := validateHealthCheckCompatibility(req.HealthCheck, req.NetworkEnabled, req.SkipGuestAgent); err != nil {
639640
return err
640641
}
642+
normalizedRestartPolicy, err := normalizeRestartPolicy(req.RestartPolicy)
643+
if err != nil {
644+
return err
645+
}
646+
req.RestartPolicy = normalizedRestartPolicy
641647

642648
// Validate volume attachments
643649
if err := validateVolumeAttachments(req.Volumes); err != nil {

lib/instances/fork.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/kernel/hypeman/lib/instances/phasetracking"
1717
"github.com/kernel/hypeman/lib/logger"
1818
"github.com/kernel/hypeman/lib/network"
19+
restartpolicy "github.com/kernel/hypeman/lib/restart-policy"
1920
"github.com/nrednav/cuid2"
2021
"go.opentelemetry.io/otel/attribute"
2122
"gvisor.dev/gvisor/pkg/cleanup"
@@ -281,6 +282,7 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
281282
forkMeta.VsockSocket = m.paths.InstanceSocket(forkID, hypervisor.VsockSocketNameForType(forkMeta.HypervisorType))
282283
forkMeta.ExitCode = nil
283284
forkMeta.ExitMessage = ""
285+
forkMeta.RestartStatus = restartpolicy.Status{}
284286
// Forks are new instances; phase accounting must not inherit the source's
285287
// cumulative durations. The first transition into the fork's runtime
286288
// phase (Standby for snapshot forks, Stopped for stopped forks) will be
@@ -481,6 +483,9 @@ func cloneStoredMetadata(src StoredMetadata) StoredMetadata {
481483
if src.HealthCheck != nil {
482484
dst.HealthCheck = cloneHealthCheckPolicy(src.HealthCheck)
483485
}
486+
if src.RestartPolicy != nil {
487+
dst.RestartPolicy = cloneRestartPolicy(src.RestartPolicy)
488+
}
484489
if src.SnapshotPolicy != nil {
485490
dst.SnapshotPolicy = cloneSnapshotPolicy(src.SnapshotPolicy)
486491
}

0 commit comments

Comments
 (0)