-
Notifications
You must be signed in to change notification settings - Fork 333
fix(envd): kill command's whole process tree on signal #2933
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,10 @@ | |
| "net/http/httptest" | ||
| "os" | ||
| "os/user" | ||
| "slices" | ||
| "strconv" | ||
| "strings" | ||
| "syscall" | ||
| "testing" | ||
| "time" | ||
|
|
||
|
|
@@ -84,6 +88,113 @@ | |
| assert.NotNil(t, events[len(events)-1].GetEnd(), "last event should be End") | ||
| } | ||
|
|
||
| // TestSendSignal_KillsProcessTree verifies that killing a command also | ||
| // terminates the child processes it spawned. envd's per-process handle only | ||
| // signaled the leader, so children kept running after a kill; non-PTY commands | ||
| // now run in their own process group and SIGKILL reaches the whole tree. | ||
| func TestSendSignal_KillsProcessTree(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| client, cleanup := newTestService(t) | ||
| defer cleanup() | ||
|
|
||
| ctx, cancel := context.WithTimeout(t.Context(), 20*time.Second) | ||
| defer cancel() | ||
|
|
||
| // The leader spawns two long-lived children and waits on them. | ||
| stream, err := client.Start(ctx, connect.NewRequest(&rpc.StartRequest{ | ||
| Process: &rpc.ProcessConfig{ | ||
| Cmd: "/bin/sh", | ||
| Args: []string{"-c", "sleep 120 & sleep 120 & wait"}, | ||
| }, | ||
| })) | ||
| require.NoError(t, err) | ||
|
|
||
| // Read the start event to capture the leader pid. | ||
| require.True(t, stream.Receive(), "expected a start event") | ||
| pid := stream.Msg().GetEvent().GetStart().GetPid() | ||
| require.NotZero(t, pid) | ||
|
|
||
| // Wait for the children to come up, then capture their pids. | ||
| var childPids []int | ||
| require.Eventually(t, func() bool { | ||
| childPids = childrenOf(t, int(pid)) | ||
|
|
||
| return len(childPids) == 2 | ||
| }, 5*time.Second, 50*time.Millisecond, "expected leader to spawn two children") | ||
|
|
||
| // Kill the command — this should take down the whole process group. | ||
| _, err = client.SendSignal(ctx, connect.NewRequest(&rpc.SendSignalRequest{ | ||
| Process: &rpc.ProcessSelector{ | ||
| Selector: &rpc.ProcessSelector_Pid{Pid: pid}, | ||
| }, | ||
| Signal: rpc.Signal_SIGNAL_SIGKILL, | ||
| })) | ||
| require.NoError(t, err) | ||
|
|
||
| // Drain the stream so the leader is reaped. | ||
| for stream.Receive() { | ||
| } | ||
| _ = stream.Close() | ||
|
|
||
| // The leader and every child must be gone. | ||
| require.Eventually(t, func() bool { | ||
| return !slices.ContainsFunc(childPids, processAlive) | ||
| }, 5*time.Second, 50*time.Millisecond, "child processes should be killed with the leader") | ||
| } | ||
|
|
||
| // childrenOf returns the pids whose parent is ppid, read from /proc. | ||
| func childrenOf(t *testing.T, ppid int) []int { | ||
| t.Helper() | ||
|
|
||
| entries, err := os.ReadDir("/proc") | ||
| require.NoError(t, err) | ||
|
|
||
| var children []int | ||
| for _, entry := range entries { | ||
| if !entry.IsDir() { | ||
| continue | ||
| } | ||
|
|
||
| pid, err := strconv.Atoi(entry.Name()) | ||
| if err != nil { | ||
| continue | ||
| } | ||
|
|
||
| data, err := os.ReadFile("/proc/" + entry.Name() + "/stat") | ||
| if err != nil { | ||
| // The process may have exited between listing and reading. | ||
| continue | ||
| } | ||
|
|
||
| // /proc/<pid>/stat fields: pid (comm) state ppid ... | ||
| // comm can contain spaces/parens, so parse after the closing paren. | ||
| stat := string(data) | ||
| idx := strings.LastIndex(stat, ")") | ||
| if idx == -1 { | ||
| continue | ||
| } | ||
|
|
||
| fields := strings.Fields(stat[idx+1:]) | ||
| // fields[0]=state, fields[1]=ppid | ||
| if len(fields) < 2 { | ||
| continue | ||
| } | ||
|
|
||
| if parsed, err := strconv.Atoi(fields[1]); err == nil && parsed == ppid { | ||
| children = append(children, pid) | ||
| } | ||
| } | ||
|
|
||
| return children | ||
| } | ||
|
|
||
| // processAlive reports whether the given pid still exists. | ||
| func processAlive(pid int) bool { | ||
| // Signal 0 performs error checking without actually sending a signal. | ||
| return syscall.Kill(pid, 0) == nil | ||
| } | ||
|
|
||
|
Check warning on line 197 in packages/envd/internal/services/process/start_test.go
|
||
|
Comment on lines
+193
to
+197
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 processAlive uses syscall.Kill(pid, 0) which succeeds for zombies — kill(2) explicitly notes the existence check passes "as long as the process still exists" (i.e. has a /proc entry, even if unreaped). After the test SIGKILLs the process group, the two sleep children are orphaned to PID 1 / nearest subreaper; if the test ever runs in a container where PID 1 is the Go test binary itself (no tini/dumb-init), the orphans stay zombies, processAlive returns true forever, and the 5s Eventually times out. Test-only nit — childrenOf already parses /proc//stat, so reading fields[0] and treating '''Z''' as not-alive is a one-line fix. Extended reasoning...What the bug is
Why the test can flake
Why the refutation is partly right but not a reason to skipThe refutation is correct that in the environments this code is normally tested in (dev host, The issue is what happens in the one environment where it doesn't: Step-by-step proof
How to fix
func processAlive(pid int) bool {
data, err := os.ReadFile("/proc/" + strconv.Itoa(pid) + "/stat")
if err != nil {
return false // gone from /proc → definitely not alive
}
idx := strings.LastIndex(string(data), ")")
if idx == -1 {
return false
}
fields := strings.Fields(string(data)[idx+1:])
return len(fields) > 0 && fields[0] != "Z"
}This makes the test robust regardless of who PID 1 is. |
||
| // TestStart_ClientDisconnectMidStream verifies that when a client | ||
| // cancels mid-stream, the handler returns without racing. This is | ||
| // the scenario that caused the nil-pointer panic in production: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,3 @@ | ||
| package pkg | ||
|
|
||
| const Version = "0.6.1" | ||
| const Version = "0.6.2" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔴 After this PR,
SendSignalon a non-PTY handler bypasses Go'sErrProcessDoneguard:signalProcessGroupcallssyscall.Kill(-cmd.Process.Pid, signal)directly. There is a window aftercmd.Wait()reaps the leader (freeing its PID) but before the deferreds.processes.Delete(pid)runs instart.go:184-188— a staleSendSignalarriving in that window can target a recycled PID and silently kill an unrelated command's whole process group. Fix: gatesignalProcessGrouponcmd.ProcessState != nil(or a handler-level atomic flag set aroundcmd.Wait()).Extended reasoning...
What the bug is
Pre-PR,
SendSignaldelivered the signal viap.cmd.Process.Signal(signal). Go'sos.Processkeeps adoneatomic (set insideWait()before/afterWait4), andpidSignal(/usr/local/go/src/os/exec_unix.go) returnsos.ErrProcessDoneonce the process has been reaped. That guard makes signaling a reaped leader a no-op — important because the kernel is free to reuse a reaped PID immediately.This PR introduces
signalProcessGroup:It bypasses Go's
doneguard entirely and issues a rawkill(-pid, sig)on whatever PIDcmd.Process.Pidstill records — which is exactly the PID that may have just been reassigned by the kernel.The race window
start.go:184-188reaps and unregisters like this:handler.Wait()then does, in order:<-p.outCtx.Done()→p.cmd.Wait()(leader is reaped here — kernel can reuse the PID from this point) →p.tty.Close()→ build endEvent →p.EndEvent.Source <- event(buffer-0 multiplex channel — blocks until thehandleStartconsumer reads) → log →p.cancel(). Only after all of that returns doess.processes.Delete(pid)run.During that window:
getProcess(service.go:47).p.cmd.Process.Pidstill holds the now-free leader PID.SysProcAttr.Setpgid = true(handler.go:151), so if its leader gets the recycled PID it becomes a process-group leader withpgid == pid.A stale
SendSignal(SIGKILL)(client retry, tag lookup, UI race) arriving against the old handler in that window callssignalProcessGroup→kill(-recycled_pid, SIGKILL), which lands on the brand-new, unrelated command's entire process group.Step-by-step proof
Setpgid=truemakes it pgid=1234.handler.Wait(),p.cmd.Wait()returns — kernel marks PID 1234 reusable. Handler is still ins.processesbecause thedefer s.processes.Delete(pid)hasn't fired yet.p.EndEvent.Source <- eventblocks: the SDK consumer is slow / coalescing reads. Window stretches into ms+ territory.Setpgid=true). The kernel reassigns PID 1234 to B; B becomes pgid=1234 leader.SendSignal(SIGKILL)for command A arrives —getProcess(pid=1234)(or by tag) still returns A's handler.A.SendSignalcallssignalProcessGroup(A.cmd, SIGKILL)→syscall.Kill(-1234, SIGKILL)→ entire process group 1234 (= command B's leader and all its children) is killed.Pre-PR step 6 would have returned
ErrProcessDoneand made no syscall.Why existing code doesn't prevent it
if cmd.Process == nilcheck only covers "not started yet"; it does nothing about "already reaped".errors.Is(err, syscall.ESRCH)swallow makes things worse, not better: it only triggers when the group is truly gone — which is exactly not the case when the PID has been recycled.p.cmd.Process.Signaland retains Go's guard, so only non-PTY commands regress.pid_maxdoesn't save us: PIDs are recycled FIFO within the in-use set, and a sandbox that churns short-lived commands can wrap quickly (much faster in PID namespaces with smaller ranges).Impact
A stale signal can silently kill a different command's whole process tree — exactly the blast radius this PR was supposed to introduce for legitimate kills, now applied to the wrong target. The error returned to the caller is
nil, so the caller has no signal that anything went wrong; the victim command just dies. Likelihood is low (small window × PID reuse), but the failure mode is silent cross-command termination, which is hard to diagnose after the fact.Fix
Restore the equivalent of
ErrProcessDone. Simplest:or track a handler-level
atomic.Boolset immediately aroundp.cmd.Wait()and consult it inSendSignalbefore falling through. Thecmd.Cancelcallback (also new in this PR) has the same hazard and should use the same guard.