Skip to content

Commit b5db2bb

Browse files
JAORMXclaude
andcommitted
Send signals to process group for complete cleanup
Use kill(-pid, sig) instead of kill(pid, sig) in Process.Stop() and terminateStaleRunner() so the entire runner process group is terminated. The runner already starts with Setsid: true (PGID == PID), so this is safe and strictly better when the runner has children. Guard against unsafe PIDs (0 and 1) to prevent catastrophic kill(0)/kill(-1) calls. Use errors.Is for ESRCH checks. Closes #4 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ec0b7ea commit b5db2bb

5 files changed

Lines changed: 182 additions & 10 deletions

File tree

propolis.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -285,16 +285,21 @@ func terminateStaleRunner(cfg *config) {
285285
slog.Debug("could not load state for stale runner check", "error", err)
286286
return
287287
}
288-
if st.PID <= 0 {
288+
if st.PID <= 1 {
289289
return
290290
}
291291
if !cfg.processAlive(st.PID) {
292292
slog.Debug("stale runner already dead", "pid", st.PID)
293293
return
294294
}
295295

296-
slog.Warn("terminating stale runner process", "pid", st.PID)
297-
if err := cfg.killProcess(st.PID, syscall.SIGTERM); err != nil {
296+
// Use negative PID to signal the entire process group (PGID == PID
297+
// because the runner starts with Setsid: true). This ensures any
298+
// children spawned by the runner are also terminated.
299+
target := -st.PID
300+
301+
slog.Warn("terminating stale runner process group", "pid", st.PID)
302+
if err := cfg.killProcess(target, syscall.SIGTERM); err != nil {
298303
slog.Warn("failed to send SIGTERM to stale runner", "pid", st.PID, "error", err)
299304
return
300305
}
@@ -311,7 +316,7 @@ func terminateStaleRunner(cfg *config) {
311316

312317
// Force kill.
313318
slog.Warn("stale runner did not exit after SIGTERM, sending SIGKILL", "pid", st.PID)
314-
if err := cfg.killProcess(st.PID, syscall.SIGKILL); err != nil {
319+
if err := cfg.killProcess(target, syscall.SIGKILL); err != nil {
315320
slog.Warn("failed to send SIGKILL to stale runner", "pid", st.PID, "error", err)
316321
}
317322
}

propolis_test.go

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -791,7 +791,7 @@ func TestTerminateStaleRunner_AliveProcess_GracefulExit(t *testing.T) {
791791
aliveCount := 0
792792

793793
cfg.killProcess = func(pid int, sig syscall.Signal) error {
794-
assert.Equal(t, 99999, pid)
794+
assert.Equal(t, -99999, pid, "should signal the process group (negative PID)")
795795
mu.Lock()
796796
signals = append(signals, sig)
797797
mu.Unlock()
@@ -834,7 +834,7 @@ func TestTerminateStaleRunner_AliveProcess_RequiresKill(t *testing.T) {
834834
var signals []syscall.Signal
835835

836836
cfg.killProcess = func(pid int, sig syscall.Signal) error {
837-
assert.Equal(t, 99999, pid)
837+
assert.Equal(t, -99999, pid, "should signal the process group (negative PID)")
838838
mu.Lock()
839839
signals = append(signals, sig)
840840
mu.Unlock()
@@ -852,6 +852,75 @@ func TestTerminateStaleRunner_AliveProcess_RequiresKill(t *testing.T) {
852852
assert.Equal(t, syscall.SIGKILL, signals[1])
853853
}
854854

855+
func TestTerminateStaleRunner_SendsToProcessGroup(t *testing.T) {
856+
t.Parallel()
857+
858+
dataDir := t.TempDir()
859+
860+
mgr := state.NewManager(dataDir)
861+
ls, err := mgr.LoadAndLock(context.Background())
862+
require.NoError(t, err)
863+
ls.State.Active = true
864+
ls.State.PID = 55555
865+
require.NoError(t, ls.Save())
866+
ls.Release()
867+
868+
cfg := defaultConfig()
869+
cfg.dataDir = dataDir
870+
871+
var mu sync.Mutex
872+
var receivedPIDs []int
873+
aliveCount := 0
874+
875+
cfg.killProcess = func(pid int, _ syscall.Signal) error {
876+
mu.Lock()
877+
receivedPIDs = append(receivedPIDs, pid)
878+
mu.Unlock()
879+
return nil
880+
}
881+
cfg.processAlive = func(_ int) bool {
882+
mu.Lock()
883+
defer mu.Unlock()
884+
aliveCount++
885+
return aliveCount <= 1
886+
}
887+
888+
terminateStaleRunner(cfg)
889+
890+
mu.Lock()
891+
defer mu.Unlock()
892+
require.Len(t, receivedPIDs, 1)
893+
assert.Equal(t, -55555, receivedPIDs[0], "killProcess should receive negative PID for process group")
894+
}
895+
896+
func TestTerminateStaleRunner_PID1_Skipped(t *testing.T) {
897+
t.Parallel()
898+
899+
dataDir := t.TempDir()
900+
901+
// Write state with PID=1 (init). This must never produce kill(-1, sig).
902+
mgr := state.NewManager(dataDir)
903+
ls, err := mgr.LoadAndLock(context.Background())
904+
require.NoError(t, err)
905+
ls.State.Active = true
906+
ls.State.PID = 1
907+
require.NoError(t, ls.Save())
908+
ls.Release()
909+
910+
cfg := defaultConfig()
911+
cfg.dataDir = dataDir
912+
913+
var killCalled bool
914+
cfg.killProcess = func(_ int, _ syscall.Signal) error {
915+
killCalled = true
916+
return nil
917+
}
918+
cfg.processAlive = func(_ int) bool { return true }
919+
920+
terminateStaleRunner(cfg)
921+
assert.False(t, killCalled, "should not attempt to kill PID 1")
922+
}
923+
855924
func TestTerminateStaleRunner_ZeroPID(t *testing.T) {
856925
t.Parallel()
857926

runner/spawn.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package runner
66
import (
77
"context"
88
"encoding/json"
9+
"errors"
910
"fmt"
1011
"log/slog"
1112
"os"
@@ -68,6 +69,17 @@ type Process struct {
6869
// PID returns the process ID (implements ProcessHandle).
6970
func (p *Process) PID() int { return p.pid }
7071

72+
// killTarget returns the negative PID for process-group-wide signals.
73+
// Because the runner starts with Setsid: true, its PGID == PID,
74+
// so kill(-pid, sig) targets the entire process group.
75+
// Panics if pid <= 1 to prevent kill(0) (own group) or kill(-1) (all processes).
76+
func (p *Process) killTarget() int {
77+
if p.pid <= 1 {
78+
panic(fmt.Sprintf("killTarget called with unsafe pid %d", p.pid))
79+
}
80+
return -p.pid
81+
}
82+
7183
// Spawn starts the propolis-runner binary as a detached subprocess.
7284
// Deprecated: Use SpawnProcess or DefaultSpawner instead.
7385
func Spawn(ctx context.Context, cfg Config) (*Process, error) {
@@ -164,7 +176,9 @@ func (p *Process) Stop(ctx context.Context) error {
164176
}
165177

166178
// Send SIGTERM for graceful shutdown.
167-
if err := p.deps.kill(p.pid, syscall.SIGTERM); err != nil {
179+
// Use killTarget() (negative PID) to signal the entire process group,
180+
// ensuring any children spawned by the runner are also terminated.
181+
if err := p.deps.kill(p.killTarget(), syscall.SIGTERM); err != nil {
168182
// If the process is already gone, that is fine.
169183
if !isNoSuchProcess(err) {
170184
return fmt.Errorf("send SIGTERM to pid %d: %w", p.pid, err)
@@ -181,15 +195,15 @@ func (p *Process) Stop(ctx context.Context) error {
181195
select {
182196
case <-ctx.Done():
183197
// Context canceled — force kill.
184-
_ = p.deps.kill(p.pid, syscall.SIGKILL)
198+
_ = p.deps.kill(p.killTarget(), syscall.SIGKILL)
185199
return ctx.Err()
186200
case <-ticker.C:
187201
if !p.IsAlive() {
188202
return nil
189203
}
190204
if time.Now().After(deadline) {
191205
// Timeout — force kill.
192-
if err := p.deps.kill(p.pid, syscall.SIGKILL); err != nil && !isNoSuchProcess(err) {
206+
if err := p.deps.kill(p.killTarget(), syscall.SIGKILL); err != nil && !isNoSuchProcess(err) {
193207
return fmt.Errorf("send SIGKILL to pid %d: %w", p.pid, err)
194208
}
195209
return nil
@@ -260,5 +274,5 @@ func libPathEnvVar() string {
260274

261275
// isNoSuchProcess returns true if the error indicates the process does not exist.
262276
func isNoSuchProcess(err error) bool {
263-
return err == syscall.ESRCH
277+
return errors.Is(err, syscall.ESRCH)
264278
}

runner/spawn_test.go

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,88 @@ func TestIsNoSuchProcess(t *testing.T) {
309309
assert.False(t, isNoSuchProcess(os.ErrNotExist))
310310
}
311311

312+
// --- Process group cleanup tests ---
313+
314+
func TestProcess_Stop_SendsToProcessGroup(t *testing.T) {
315+
t.Parallel()
316+
317+
terminated := false
318+
319+
var receivedPID int
320+
p := &Process{
321+
pid: 12345,
322+
deps: processDeps{
323+
findProcess: func(_ int) (*os.Process, error) {
324+
if terminated {
325+
return nil, os.ErrNotExist
326+
}
327+
return os.FindProcess(os.Getpid())
328+
},
329+
kill: func(pid int, sig syscall.Signal) error {
330+
receivedPID = pid
331+
if sig == syscall.SIGTERM {
332+
terminated = true
333+
}
334+
return nil
335+
},
336+
},
337+
}
338+
339+
err := p.Stop(context.Background())
340+
require.NoError(t, err)
341+
assert.Equal(t, -12345, receivedPID, "kill should receive negative PID for process group")
342+
}
343+
344+
func TestProcess_Stop_KillSendsToProcessGroup(t *testing.T) {
345+
t.Parallel()
346+
347+
// Process stays alive forever: SIGTERM succeeds but process never exits.
348+
// A short-lived context should cause SIGKILL, also to the process group.
349+
var killPIDs []int
350+
p := &Process{
351+
pid: 12345,
352+
deps: processDeps{
353+
findProcess: func(_ int) (*os.Process, error) {
354+
return os.FindProcess(os.Getpid())
355+
},
356+
kill: func(pid int, _ syscall.Signal) error {
357+
killPIDs = append(killPIDs, pid)
358+
return nil
359+
},
360+
},
361+
}
362+
363+
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
364+
defer cancel()
365+
366+
_ = p.Stop(ctx)
367+
// Both SIGTERM and SIGKILL should target the process group (negative PID).
368+
for i, pid := range killPIDs {
369+
assert.Equal(t, -12345, pid, "kill call %d should use negative PID", i)
370+
}
371+
}
372+
373+
func TestProcess_killTarget(t *testing.T) {
374+
t.Parallel()
375+
376+
p := &Process{pid: 42}
377+
assert.Equal(t, -42, p.killTarget())
378+
}
379+
380+
func TestProcess_killTarget_PanicsOnPID0(t *testing.T) {
381+
t.Parallel()
382+
383+
p := &Process{pid: 0}
384+
assert.Panics(t, func() { p.killTarget() }, "killTarget with PID 0 should panic")
385+
}
386+
387+
func TestProcess_killTarget_PanicsOnPID1(t *testing.T) {
388+
t.Parallel()
389+
390+
p := &Process{pid: 1}
391+
assert.Panics(t, func() { p.killTarget() }, "killTarget with PID 1 should panic")
392+
}
393+
312394
// --- Interface compliance ---
313395

314396
func TestDefaultSpawner_ImplementsInterface(t *testing.T) {

vm.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ func (vm *VM) Remove(ctx context.Context) error {
127127

128128
// RunnerPID returns the runner process ID, or 0 if the ID cannot be parsed
129129
// as a PID (e.g. non-process-based backends).
130+
// Because the runner starts with Setsid: true, its PGID equals its PID.
131+
// Callers needing process-group signals can use syscall.Kill(-pid, sig).
130132
func (vm *VM) RunnerPID() int {
131133
pid, err := pidFromID(vm.handle.ID())
132134
if err != nil {

0 commit comments

Comments
 (0)