Skip to content

Commit 2772623

Browse files
Pangjipingclaude
andauthored
fix(execd): kill entire process group on command cancel (#924)
* fix(execd): kill entire process group on command cancel When a foreground command was cancelled (client disconnect, timeout, or DELETE /command), only the bash group leader received SIGKILL — child processes spawned via `&` or pipelines kept running as orphans because exec.CommandContext's internal kill targets a single pid, and killPid sent signals to the leader only. Fix runCommand's ctx.Done() branch to send SIGKILL to -pid (the whole group, since the leader is launched with Setpgid: true), mirroring runBackgroundCommand. Rewrite killPid to signal -pid for SIGTERM/SIGKILL and to use kill(-pid, 0) for liveness probing, so Interrupt() also terminates descendants. Adds regression tests covering both cancel and Interrupt paths. Fixes #922 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * fix(execd): guard Interrupt against stale PID after command exit killPid now signals the whole process group (-pid). Combined with the fact that commandClientMap retains finished sessions, a late or retried Interrupt could otherwise terminate every process in an unrelated process group whose PGID has reused the recorded PID. - markCommandFinished clears kernel.pid alongside kernel.running so the stale PID is no longer accessible. - commandSnapshot now reads under c.mu.RLock for a consistent view of running/pid relative to markCommandFinished's write under c.mu.Lock. - Interrupt() (unix and windows) snapshots the kernel and refuses to signal when the command has already finished. Adds a regression test ensuring Interrupt on a completed session returns an error and that pid is cleared. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * fix(execd): don't surface slow group teardown as Interrupt failure kill(2) on a process group only guarantees delivery to at least one member, and kill(-pid, 0) keeps reporting the group as observable while any unreaped zombie lingers. The previous post-SIGKILL probe ran for only 150ms and then returned a hard error, so Interrupt could surface a 500 even though the kill signal had already been delivered. Likewise on macOS, SIGKILL on a group that has been reduced to zombies returns EPERM, which the previous code reported as a kill failure even though SIGTERM had already taken effect. - After a successful SIGKILL, log a warning when the probe loop still observes the group instead of returning an error. - When SIGTERM was delivered but the SIGKILL syscall fails (commonly EPERM on a zombie-only group), log and return nil — the kill is in flight and the kernel will reap the group once Wait() runs. Adds a regression test that runs killPid against a Setpgid group with no concurrent reaper, exercising the zombie-lingering path. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * fix(execd): only group-kill on real cancellation, not after success Execute() defers cancel() for every foreground command, including successful ones, so the signal-forwarding goroutine's ctx.Done() branch also fired on the normal-success path. With the new group-wide SIGKILL on -cmd.Process.Pid, that post-completion signal could hit a recycled pid/pgid and kill an unrelated process group inside the sandbox. Gate the goroutine on the existing `done` channel (closed after cmd.Wait() returns or on start failure): exit cleanly when the command has finished, so only genuine cancellations — timeout, client abort, Interrupt — trigger the group kill. A double-check inside the ctx.Done() branch handles the race where ctx is cancelled at the same instant cmd.Wait() returns. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
1 parent d10f0e3 commit 2772623

5 files changed

Lines changed: 343 additions & 37 deletions

File tree

components/execd/pkg/runtime/command.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,32 @@ func (c *Controller) runCommand(ctx context.Context, request *ExecuteCodeRequest
170170
safego.Go(func() {
171171
for {
172172
select {
173+
case <-done:
174+
// cmd.Wait() has returned (or start failed). The pid is
175+
// about to be — or already has been — reaped, so we
176+
// must not signal it. Execute()'s defer cancel() fires
177+
// after every foreground command, including successful
178+
// ones, so without this gate the SIGKILL below would
179+
// run on a recycled pid/pgid and could kill an
180+
// unrelated process group.
181+
return
173182
case <-ctx.Done():
183+
// Re-check `done` to avoid a race with cmd.Wait()
184+
// returning concurrently. If cmd.Wait() has just
185+
// finished, the leader pid may be reaped and recycled
186+
// at any moment; signaling -pid would then target a
187+
// foreign process group.
188+
select {
189+
case <-done:
190+
return
191+
default:
192+
}
193+
// Genuine cancellation (timeout, client disconnect,
194+
// Interrupt). Kill the whole process group so children
195+
// don't outlive the cancelled context.
196+
if cmd.Process != nil {
197+
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
198+
}
174199
return
175200
case sig := <-signals:
176201
if sig == nil {
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
// Copyright 2025 Alibaba Group Holding Ltd.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
//go:build !windows
16+
// +build !windows
17+
18+
package runtime
19+
20+
import (
21+
"context"
22+
"errors"
23+
"os"
24+
"os/exec"
25+
"path/filepath"
26+
"strconv"
27+
"strings"
28+
"sync"
29+
"syscall"
30+
"testing"
31+
"time"
32+
33+
"github.com/alibaba/opensandbox/execd/pkg/jupyter/execute"
34+
"github.com/stretchr/testify/require"
35+
)
36+
37+
// TestRunCommand_CancelKillsChildren verifies that cancelling the context
38+
// terminates not only the bash group leader but also its descendant
39+
// processes. Regression test for
40+
// https://github.com/alibaba/OpenSandbox/issues/922.
41+
func TestRunCommand_CancelKillsChildren(t *testing.T) {
42+
if _, err := exec.LookPath("bash"); err != nil {
43+
t.Skip("bash not found in PATH")
44+
}
45+
46+
pidFile := filepath.Join(t.TempDir(), "child.pid")
47+
48+
c := NewController("", "")
49+
ctx, cancel := context.WithCancel(context.Background())
50+
defer cancel()
51+
52+
started := make(chan struct{})
53+
var once sync.Once
54+
55+
req := &ExecuteCodeRequest{
56+
// Spawn a sleep child, record its pid, then wait so the bash
57+
// leader stays alive until the context is cancelled.
58+
Code: `sleep 30 & echo $! > "` + pidFile + `"; echo READY; wait`,
59+
Cwd: t.TempDir(),
60+
Timeout: 30 * time.Second,
61+
Hooks: ExecuteResultHook{
62+
OnExecuteInit: func(_ string) {},
63+
OnExecuteStdout: func(s string) {
64+
if strings.TrimSpace(s) == "READY" {
65+
once.Do(func() { close(started) })
66+
}
67+
},
68+
OnExecuteStderr: func(_ string) {},
69+
OnExecuteError: func(_ *execute.ErrorOutput) {},
70+
OnExecuteComplete: func(_ time.Duration) {},
71+
},
72+
}
73+
74+
done := make(chan struct{})
75+
go func() {
76+
_ = c.runCommand(ctx, req)
77+
close(done)
78+
}()
79+
80+
select {
81+
case <-started:
82+
case <-time.After(10 * time.Second):
83+
cancel()
84+
<-done
85+
t.Fatal("command did not emit READY in time")
86+
}
87+
88+
pidBytes, err := os.ReadFile(pidFile)
89+
require.NoError(t, err, "expected child pid file")
90+
childPid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
91+
require.NoError(t, err)
92+
require.Positive(t, childPid)
93+
94+
require.NoError(t, syscall.Kill(childPid, 0), "child should be alive before cancel")
95+
96+
cancel()
97+
98+
select {
99+
case <-done:
100+
case <-time.After(5 * time.Second):
101+
t.Fatal("runCommand did not return after cancel")
102+
}
103+
104+
deadline := time.Now().Add(2 * time.Second)
105+
for time.Now().Before(deadline) {
106+
if err := syscall.Kill(childPid, 0); err != nil {
107+
require.True(t, errors.Is(err, syscall.ESRCH),
108+
"unexpected liveness probe error: %v", err)
109+
return
110+
}
111+
time.Sleep(50 * time.Millisecond)
112+
}
113+
t.Fatalf("child pid %d still alive 2s after cancel — process leak", childPid)
114+
}
115+
116+
// TestInterrupt_AfterFinished_ReturnsError verifies that an Interrupt
117+
// arriving after the command has completed does not signal a recycled PID.
118+
// Without this guard, group-wide kill would amplify the stale-PID hazard
119+
// to every process in an unrelated process group.
120+
func TestInterrupt_AfterFinished_ReturnsError(t *testing.T) {
121+
if _, err := exec.LookPath("bash"); err != nil {
122+
t.Skip("bash not found in PATH")
123+
}
124+
125+
c := NewController("", "")
126+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
127+
defer cancel()
128+
129+
var session string
130+
completeCh := make(chan struct{}, 1)
131+
req := &ExecuteCodeRequest{
132+
Code: `echo done`,
133+
Cwd: t.TempDir(),
134+
Timeout: 5 * time.Second,
135+
Hooks: ExecuteResultHook{
136+
OnExecuteInit: func(s string) { session = s },
137+
OnExecuteStdout: func(_ string) {},
138+
OnExecuteStderr: func(_ string) {},
139+
OnExecuteError: func(_ *execute.ErrorOutput) {},
140+
OnExecuteComplete: func(_ time.Duration) { completeCh <- struct{}{} },
141+
},
142+
}
143+
require.NoError(t, c.runCommand(ctx, req))
144+
145+
select {
146+
case <-completeCh:
147+
case <-time.After(3 * time.Second):
148+
t.Fatal("command did not complete in time")
149+
}
150+
require.NotEmpty(t, session)
151+
152+
err := c.Interrupt(session)
153+
require.Error(t, err, "Interrupt on finished session must error")
154+
require.Contains(t, err.Error(), "not running")
155+
156+
snap := c.commandSnapshot(session)
157+
require.NotNil(t, snap)
158+
require.False(t, snap.running, "running flag should be cleared")
159+
require.Equal(t, 0, snap.pid, "pid should be cleared to avoid stale-PID kill")
160+
}
161+
162+
// TestKillPid_ZombieLeaderDoesNotFail verifies that killPid does not
163+
// return an error when a group leader becomes a zombie before its parent
164+
// has reaped it. kill(-pid, 0) keeps reporting the group as observable
165+
// while the zombie lingers, but SIGKILL has already been delivered and
166+
// the kernel will tear the group down once Wait() runs. Treating that
167+
// state as a failure caused Interrupt to surface a 500 even though the
168+
// kill succeeded.
169+
func TestKillPid_ZombieLeaderDoesNotFail(t *testing.T) {
170+
if _, err := exec.LookPath("bash"); err != nil {
171+
t.Skip("bash not found in PATH")
172+
}
173+
174+
cmd := exec.Command("bash", "-c", `sleep 30 & wait`)
175+
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
176+
require.NoError(t, cmd.Start())
177+
// Deliberately omit a reaper goroutine so the leader stays as a
178+
// zombie after kill — that is the condition we want to exercise.
179+
t.Cleanup(func() {
180+
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
181+
_, _ = cmd.Process.Wait()
182+
})
183+
184+
// Give bash a moment to spawn the sleep child so the group has more
185+
// than just the leader.
186+
time.Sleep(100 * time.Millisecond)
187+
188+
c := &Controller{}
189+
require.NoError(t, c.killPid(cmd.Process.Pid),
190+
"slow post-SIGKILL teardown must not be reported as a hard failure")
191+
}
192+
193+
// TestKillPid_TerminatesEntireProcessGroup verifies that killPid signals
194+
// the whole process group, not just the leader. Regression test for
195+
// https://github.com/alibaba/OpenSandbox/issues/922.
196+
func TestKillPid_TerminatesEntireProcessGroup(t *testing.T) {
197+
if _, err := exec.LookPath("bash"); err != nil {
198+
t.Skip("bash not found in PATH")
199+
}
200+
201+
pidFile := filepath.Join(t.TempDir(), "child.pid")
202+
cmd := exec.Command("bash", "-c",
203+
`sleep 30 & echo $! > "`+pidFile+`"; wait`)
204+
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
205+
require.NoError(t, cmd.Start())
206+
// Reap the leader concurrently so it doesn't linger as a zombie that
207+
// keeps the process group "alive" from killPid's liveness probe
208+
// perspective. Mirrors how runCommand's cmd.Wait() reaps in production.
209+
waitDone := make(chan struct{})
210+
go func() {
211+
_, _ = cmd.Process.Wait()
212+
close(waitDone)
213+
}()
214+
t.Cleanup(func() {
215+
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
216+
<-waitDone
217+
})
218+
219+
var childPid int
220+
deadline := time.Now().Add(3 * time.Second)
221+
for time.Now().Before(deadline) {
222+
if data, err := os.ReadFile(pidFile); err == nil {
223+
if pid, perr := strconv.Atoi(strings.TrimSpace(string(data))); perr == nil && pid > 0 {
224+
childPid = pid
225+
break
226+
}
227+
}
228+
time.Sleep(50 * time.Millisecond)
229+
}
230+
require.Positive(t, childPid, "failed to capture child pid")
231+
require.NoError(t, syscall.Kill(childPid, 0), "child should be alive before kill")
232+
233+
c := &Controller{}
234+
require.NoError(t, c.killPid(cmd.Process.Pid))
235+
236+
deadline = time.Now().Add(2 * time.Second)
237+
for time.Now().Before(deadline) {
238+
if err := syscall.Kill(childPid, 0); err != nil {
239+
require.True(t, errors.Is(err, syscall.ESRCH),
240+
"unexpected liveness probe error: %v", err)
241+
return
242+
}
243+
time.Sleep(50 * time.Millisecond)
244+
}
245+
t.Fatalf("child pid %d still alive 2s after killPid — process leak", childPid)
246+
}

components/execd/pkg/runtime/command_status.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ type CommandOutput struct {
4040
}
4141

4242
func (c *Controller) commandSnapshot(session string) *commandKernel {
43+
c.mu.RLock()
44+
defer c.mu.RUnlock()
45+
4346
var kernel *commandKernel
4447
if v, ok := c.commandClientMap.Load(session); ok {
4548
kernel, _ = v.(*commandKernel)
@@ -128,4 +131,8 @@ func (c *Controller) markCommandFinished(session string, exitCode int, errMsg st
128131
kernel.errMsg = errMsg
129132
kernel.running = false
130133
kernel.finishedAt = &now
134+
// Clear the PID so a late or retried Interrupt cannot signal a recycled
135+
// process. Group-wide kill would otherwise amplify the impact of a
136+
// stale-PID hit to every process in the unrelated process group.
137+
kernel.pid = 0
131138
}

0 commit comments

Comments
 (0)