Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,14 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
}, nil
}

stopTimeout, err := validateStopTimeout(request.Body.StopTimeout)
if err != nil {
return oapi.CreateInstance400JSONResponse{
Code: "invalid_stop_timeout",
Message: err.Error(),
}, nil
}

domainReq := instances.CreateInstanceRequest{
Name: request.Body.Name,
Image: request.Body.Image,
Expand All @@ -326,6 +334,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
Cmd: cmd,
SkipKernelHeaders: request.Body.SkipKernelHeaders != nil && *request.Body.SkipKernelHeaders,
SkipGuestAgent: request.Body.SkipGuestAgent != nil && *request.Body.SkipGuestAgent,
StopTimeout: lo.FromPtr(stopTimeout),
AutoStandby: autoStandby,
HealthCheck: healthCheck,
RestartPolicy: restartPolicy,
Expand Down Expand Up @@ -754,7 +763,19 @@ func (s *ApiService) StopInstance(ctx context.Context, request oapi.StopInstance
}
log := logger.FromContext(ctx)

result, err := s.InstanceManager.StopInstance(ctx, inst.Id)
var requestedStopTimeout *int
if request.Body != nil {
requestedStopTimeout = request.Body.StopTimeout
}
stopTimeout, err := validateStopTimeout(requestedStopTimeout)
if err != nil {
return oapi.StopInstance400JSONResponse{
Code: "invalid_stop_timeout",
Message: err.Error(),
}, nil
}

result, err := s.InstanceManager.StopInstance(ctx, inst.Id, stopTimeout)
if err != nil {
switch {
case errors.Is(err, instances.ErrInvalidState):
Expand Down Expand Up @@ -1162,6 +1183,10 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance {
oapiInst.ExitMessage = lo.ToPtr(inst.ExitMessage)
}

if inst.StopTimeout > 0 {
oapiInst.StopTimeout = lo.ToPtr(inst.StopTimeout)
}

if len(inst.Env) > 0 {
oapiInst.Env = &inst.Env
}
Expand Down
38 changes: 38 additions & 0 deletions cmd/api/api/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,44 @@ func TestInstanceToOAPI_OmitsPlatformWhenUnset(t *testing.T) {
assert.Nil(t, oapiInst.Platform)
}

func TestInstanceToOAPI_EchoesStopTimeout(t *testing.T) {
t.Parallel()

inst := instances.Instance{
StoredMetadata: instances.StoredMetadata{
Id: "inst-stop-timeout",
Name: "inst-stop-timeout",
Image: "docker.io/library/alpine:latest",
CreatedAt: time.Now(),
HypervisorType: hypervisor.TypeCloudHypervisor,
StopTimeout: 30,
},
State: instances.StateRunning,
}

oapiInst := instanceToOAPI(inst)
require.NotNil(t, oapiInst.StopTimeout)
assert.Equal(t, 30, *oapiInst.StopTimeout)
}

func TestInstanceToOAPI_OmitsStopTimeoutWhenUnset(t *testing.T) {
t.Parallel()

inst := instances.Instance{
StoredMetadata: instances.StoredMetadata{
Id: "inst-no-stop-timeout",
Name: "inst-no-stop-timeout",
Image: "docker.io/library/alpine:latest",
CreatedAt: time.Now(),
HypervisorType: hypervisor.TypeCloudHypervisor,
},
State: instances.StateStopped,
}

oapiInst := instanceToOAPI(inst)
assert.Nil(t, oapiInst.StopTimeout)
}

// errCreateInstanceManager is a fake whose CreateInstance always fails with a
// preset error, used to assert the handler maps typed image errors to statuses.
type errCreateInstanceManager struct {
Expand Down
13 changes: 13 additions & 0 deletions cmd/api/api/stop_timeout.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package api

import "fmt"

// validateStopTimeout normalizes an optional stop_timeout (seconds) from a
// request body, rejecting non-positive values. A nil input or result means the
// caller falls back to the instance's configured value or the server default.
func validateStopTimeout(v *int) (*int, error) {
if v != nil && *v < 1 {
return nil, fmt.Errorf("stop_timeout must be at least 1 second")
}
return v, nil
}
2 changes: 1 addition & 1 deletion lib/builds/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ func (m *mockInstanceManager) RestoreSnapshot(ctx context.Context, id string, sn
return nil, instances.ErrNotSupported
}

func (m *mockInstanceManager) StopInstance(ctx context.Context, id string) (*instances.Instance, error) {
func (m *mockInstanceManager) StopInstance(ctx context.Context, id string, stopTimeout *int) (*instances.Instance, error) {
if m.stopFunc != nil {
return m.stopFunc(ctx, id)
}
Expand Down
1 change: 1 addition & 0 deletions lib/instances/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ func (m *manager) createInstance(
Cmd: req.Cmd,
SkipKernelHeaders: req.SkipKernelHeaders,
SkipGuestAgent: req.SkipGuestAgent,
StopTimeout: req.StopTimeout,
EnableRosetta: enableRosetta,
SnapshotPolicy: cloneSnapshotPolicy(req.SnapshotPolicy),
AutoStandby: cloneAutoStandbyPolicy(req.AutoStandby),
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func (m *manager) deleteInstance(
// 4. If active, try graceful guest shutdown before force kill.
gracefulShutdown := false
if inst.State == StateRunning || inst.State == StateInitializing {
stopTimeout := resolveStopTimeout(stored)
stopTimeout := resolveStopTimeout(stored, nil)
if stopTimeout > deleteGracefulShutdownTimeout {
stopTimeout = deleteGracefulShutdownTimeout
}
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/firecracker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ func TestFirecrackerStopClearsStaleSnapshot(t *testing.T) {
require.NoError(t, err)
require.True(t, beforeStop.HasSnapshot, "test setup should create visible stale snapshot")

inst, err = mgr.StopInstance(ctx, inst.Id)
inst, err = mgr.StopInstance(ctx, inst.Id, nil)
require.NoError(t, err)
assert.Equal(t, StateStopped, inst.State)
assert.False(t, inst.HasSnapshot, "stopped instances should not retain stale snapshots")
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/fork.go
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ func (m *manager) applyForkTargetState(ctx context.Context, forkID string, targe
inst, err := m.standbyInstance(ctx, forkID, StandbyInstanceRequest{}, false)
return returnWithReadiness(inst, err, false)
case StateStopped:
inst, err := m.stopInstance(ctx, forkID)
inst, err := m.stopInstance(ctx, forkID, nil)
return returnWithReadiness(inst, err, false)
}
}
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/fork_speed_darwin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func TestVZForkSpeed(t *testing.T) {
require.Equalf(t, 0, code, "dd to fill overlay should succeed")

// Stop the source so each fork copies a stable on-disk state without booting.
_, err = mgr.StopInstance(ctx, source.Id)
_, err = mgr.StopInstance(ctx, source.Id, nil)
require.NoError(t, err)

// Reset the global reflink flag however the test exits — a require failure
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/lifecycle_noop_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ func TestLifecycleNoopTransitionsReturnCurrentInstanceWithoutEvent(t *testing.T)
name: "stop already stopped",
state: StateStopped,
action: func(ctx context.Context, m *manager, id string) (*Instance, error) {
return m.StopInstance(ctx, id)
return m.StopInstance(ctx, id, nil)
},
},
}
Expand Down
9 changes: 5 additions & 4 deletions lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ type Manager interface {
StandbyInstance(ctx context.Context, id string, req StandbyInstanceRequest) (*Instance, error)
RestoreInstance(ctx context.Context, id string) (*Instance, error)
RestoreSnapshot(ctx context.Context, id string, snapshotID string, req RestoreSnapshotRequest) (*Instance, error)
StopInstance(ctx context.Context, id string) (*Instance, error)
StopInstance(ctx context.Context, id string, stopTimeout *int) (*Instance, error)
StartInstance(ctx context.Context, id string, req StartInstanceRequest) (*Instance, error)
UpdateInstance(ctx context.Context, id string, req UpdateInstanceRequest) (*Instance, error)
StreamInstanceLogs(ctx context.Context, id string, tail int, follow bool, source LogSource) (<-chan string, error)
Expand Down Expand Up @@ -609,8 +609,9 @@ func (m *manager) RestoreSnapshot(ctx context.Context, id string, snapshotID str
return inst, err
}

// StopInstance gracefully stops a running instance
func (m *manager) StopInstance(ctx context.Context, id string) (*Instance, error) {
// StopInstance gracefully stops a running instance. A non-nil stopTimeout
// overrides the instance's configured grace period for this call only.
func (m *manager) StopInstance(ctx context.Context, id string, stopTimeout *int) (*Instance, error) {
lock := m.getInstanceLock(id)
lock.Lock()
defer lock.Unlock()
Expand All @@ -631,7 +632,7 @@ func (m *manager) StopInstance(ctx context.Context, id string) (*Instance, error
if err := m.markRestartManualStopLocked(ctx, id); err != nil {
return nil, err
}
inst, err := m.stopInstance(ctx, id)
inst, err := m.stopInstance(ctx, id, stopTimeout)
if err == nil {
m.notifyLifecycleEvent(ctx, LifecycleEventStop, inst)
}
Expand Down
6 changes: 3 additions & 3 deletions lib/instances/manager_darwin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func TestVZBasicLifecycle(t *testing.T) {

// Graceful shutdown test
t.Log("Stopping instance (graceful shutdown)...")
inst, err = mgr.StopInstance(ctx, inst.Id)
inst, err = mgr.StopInstance(ctx, inst.Id, nil)
require.NoError(t, err)
assert.Equal(t, StateStopped, inst.State)
t.Log("Instance stopped")
Expand Down Expand Up @@ -255,7 +255,7 @@ func TestVZBasicLifecycle(t *testing.T) {

// Stop again before delete
t.Log("Stopping instance before delete...")
inst, err = mgr.StopInstance(ctx, inst.Id)
inst, err = mgr.StopInstance(ctx, inst.Id, nil)
require.NoError(t, err)
assert.Equal(t, StateStopped, inst.State)

Expand Down Expand Up @@ -359,7 +359,7 @@ func TestVZExecAndShutdown(t *testing.T) {

// Graceful shutdown
t.Log("Stopping instance...")
inst, err = mgr.StopInstance(ctx, inst.Id)
inst, err = mgr.StopInstance(ctx, inst.Id, nil)
require.NoError(t, err)
assert.Equal(t, StateStopped, inst.State)
t.Log("Instance stopped gracefully")
Expand Down
4 changes: 2 additions & 2 deletions lib/instances/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -882,7 +882,7 @@ func TestBasicEndToEnd(t *testing.T) {
// Test graceful stop: StopInstance sends Shutdown RPC -> init forwards SIGTERM
// -> app exits -> init writes exit sentinel -> reboot(POWER_OFF) -> VM stops cleanly
t.Log("Testing graceful stop via StopInstance...")
stoppedInst, err := manager.StopInstance(ctx, inst.Id)
stoppedInst, err := manager.StopInstance(ctx, inst.Id, nil)
require.NoError(t, err, "StopInstance should succeed")
assert.Equal(t, StateStopped, stoppedInst.State, "Instance should be in Stopped state after StopInstance")

Expand Down Expand Up @@ -911,7 +911,7 @@ func TestBasicEndToEnd(t *testing.T) {
t.Log("Restart test passed -- exit info cleared!")

// Stop again before deleting
_, err = manager.StopInstance(ctx, inst.Id)
_, err = manager.StopInstance(ctx, inst.Id, nil)
require.NoError(t, err)

// Delete instance
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/qemu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ func TestQEMUBasicEndToEnd(t *testing.T) {
// Test graceful stop: StopInstance sends Shutdown RPC -> init forwards SIGTERM
// -> app exits -> init writes exit sentinel -> reboot(POWER_OFF) -> VM stops cleanly
t.Log("Testing graceful stop via StopInstance...")
stoppedInst, err := manager.StopInstance(ctx, inst.Id)
stoppedInst, err := manager.StopInstance(ctx, inst.Id, nil)
require.NoError(t, err, "StopInstance should succeed")
assert.Equal(t, StateStopped, stoppedInst.State, "Instance should be in Stopped state after StopInstance")

Expand Down
2 changes: 1 addition & 1 deletion lib/instances/restart_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ func (m *manager) HandleHealthCheckUnhealthy(ctx context.Context, id string) err
return err
}

stopped, err := m.stopInstance(ctx, id)
stopped, err := m.stopInstance(ctx, id, nil)
if err != nil {
_ = m.updateRestartStatusLocked(id, restartStatusAfterFailedAttempt(policy, nextStatus, reason, now))
return err
Expand Down
13 changes: 9 additions & 4 deletions lib/instances/stop.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,13 @@ const shutdownFailureFallbackWait = 500 * time.Millisecond

var errGracefulShutdownFailed = errors.New("graceful guest shutdown did not complete")

// resolveStopTimeout returns the configured stop timeout in seconds,
// falling back to the package default when unset/invalid.
func resolveStopTimeout(stored *StoredMetadata) int {
// resolveStopTimeout returns the stop timeout in seconds. A positive override
// (e.g. from a per-call stop request) wins; otherwise the instance's configured
// value is used, falling back to the package default when unset/invalid.
func resolveStopTimeout(stored *StoredMetadata, override *int) int {
if override != nil && *override > 0 {
return *override
}
stopTimeout := stored.StopTimeout
if stopTimeout <= 0 {
stopTimeout = DefaultStopTimeout
Expand Down Expand Up @@ -143,6 +147,7 @@ func (m *manager) forceKillHypervisorProcess(ctx context.Context, inst *Instance
func (m *manager) stopInstance(
ctx context.Context,
id string,
stopTimeoutOverride *int,
) (_ *Instance, retErr error) {
start := time.Now()
log := logger.FromContext(ctx)
Expand Down Expand Up @@ -185,7 +190,7 @@ func (m *manager) stopInstance(

// 4. Graceful shutdown: send signal to guest init via Shutdown RPC,
// then wait for VM to power off cleanly. Fall back to hypervisor shutdown on timeout.
stopTimeout := resolveStopTimeout(stored)
stopTimeout := resolveStopTimeout(stored, stopTimeoutOverride)
gracefulCtx, gracefulSpanEnd := m.startLifecycleStep(ctx, "graceful_guest_shutdown",
attribute.String("instance_id", id),
attribute.String("hypervisor", string(stored.HypervisorType)),
Expand Down
32 changes: 32 additions & 0 deletions lib/instances/stop_timeout_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package instances

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestResolveStopTimeout(t *testing.T) {
intPtr := func(v int) *int { return &v }

tests := []struct {
name string
stored int
override *int
want int
}{
{name: "unset uses default", stored: 0, override: nil, want: DefaultStopTimeout},
{name: "negative stored uses default", stored: -3, override: nil, want: DefaultStopTimeout},
{name: "configured value", stored: 12, override: nil, want: 12},
{name: "override wins over configured", stored: 12, override: intPtr(30), want: 30},
{name: "override wins over default", stored: 0, override: intPtr(30), want: 30},
{name: "non-positive override ignored", stored: 12, override: intPtr(0), want: 12},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := resolveStopTimeout(&StoredMetadata{StopTimeout: tt.stored}, tt.override)
assert.Equal(t, tt.want, got)
})
}
}
1 change: 1 addition & 0 deletions lib/instances/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ type CreateInstanceRequest struct {
Cmd []string // Override image cmd (nil = use image default)
SkipKernelHeaders bool // Skip kernel headers installation (disables DKMS)
SkipGuestAgent bool // Skip guest-agent installation (disables exec/stat API)
StopTimeout int // Grace period in seconds for graceful stop (0 = use default 5s)
SnapshotPolicy *SnapshotPolicy // Optional snapshot policy defaults for this instance
AutoStandby *autostandby.Policy // Optional automatic standby policy
HealthCheck *healthcheck.Policy // Optional workload health check policy
Expand Down
2 changes: 1 addition & 1 deletion lib/instances/wait_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func (s *stubManager) RestoreInstance(context.Context, string) (*Instance, error
func (s *stubManager) RestoreSnapshot(context.Context, string, string, RestoreSnapshotRequest) (*Instance, error) {
return nil, nil
}
func (s *stubManager) StopInstance(context.Context, string) (*Instance, error) { return nil, nil }
func (s *stubManager) StopInstance(context.Context, string, *int) (*Instance, error) { return nil, nil }
func (s *stubManager) StartInstance(context.Context, string, StartInstanceRequest) (*Instance, error) {
return nil, nil
}
Expand Down
Loading
Loading