diff --git a/packages/envd/internal/services/process/handler/handler.go b/packages/envd/internal/services/process/handler/handler.go index ed33d68fbc..001dab588a 100644 --- a/packages/envd/internal/services/process/handler/handler.go +++ b/packages/envd/internal/services/process/handler/handler.go @@ -51,6 +51,13 @@ type Handler struct { cmd *exec.Cmd tty *os.File + // processGroup is true when the command was started in its own process + // group (Setpgid). When set, signals are delivered to the whole group so + // the command's child processes are terminated together with the leader. + // PTY processes are excluded — the pty package already puts them in their + // own session and we leave their signal handling untouched. + processGroup bool + cancel context.CancelFunc outCtx context.Context //nolint:containedctx // todo: refactor so this can be removed @@ -134,6 +141,25 @@ func New( } applyCgroupFD(cmd.SysProcAttr, cgroupFD, ok) + // Non-PTY commands run in their own process group so that SendSignal can + // reach the command's entire process tree (the leader plus any children it + // spawned), not just the single process envd manages. envd's per-process + // handle could not otherwise terminate those children, leaving them running + // — and consuming resources — after the command was killed. + // PTY processes are left alone: the pty package starts them in their own + // session already, and we keep their existing signal behavior. + isPTY := req.GetPty() != nil + if !isPTY { + cmd.SysProcAttr.Setpgid = true + + // exec.CommandContext kills only the leader when the context (e.g. the + // command timeout) fires. Override the canceller to signal the whole + // process group instead, matching SendSignal's behavior. + cmd.Cancel = func() error { + return signalProcessGroup(cmd, syscall.SIGKILL) + } + } + resolvedPath, err := permissions.ExpandAndResolve(req.GetProcess().GetCwd(), user, defaults.Workdir) if err != nil { return nil, connect.NewError(connect.CodeInvalidArgument, err) @@ -178,15 +204,16 @@ func New( outCtx, outCancel := context.WithCancel(ctx) h := &Handler{ - Config: req.GetProcess(), - cmd: cmd, - Tag: req.Tag, - DataEvent: outMultiplex, - cancel: cancel, - outCtx: outCtx, - outCancel: outCancel, - EndEvent: NewMultiplexedChannel[rpc.ProcessEvent_End](0), - logger: logger, + Config: req.GetProcess(), + cmd: cmd, + Tag: req.Tag, + DataEvent: outMultiplex, + cancel: cancel, + outCtx: outCtx, + outCancel: outCancel, + EndEvent: NewMultiplexedChannel[rpc.ProcessEvent_End](0), + logger: logger, + processGroup: !isPTY, } if req.GetPty() != nil { @@ -358,9 +385,33 @@ func (p *Handler) SendSignal(signal syscall.Signal) error { p.outCancel() } + // For non-PTY commands the signal is delivered to the whole process group so + // the command's children are terminated together with the leader. PTY + // processes keep the original single-process behavior. + if p.processGroup { + return signalProcessGroup(p.cmd, signal) + } + return p.cmd.Process.Signal(signal) } +// signalProcessGroup delivers signal to every process in cmd's process group. +// The command is started with Setpgid, so the process group id equals the +// leader's pid; signaling the negative pid targets the entire group. An already +// gone group (ESRCH) is treated as success — there is nothing left to kill. +func signalProcessGroup(cmd *exec.Cmd, signal syscall.Signal) error { + if cmd.Process == nil { + return errors.New("process not started") + } + + err := syscall.Kill(-cmd.Process.Pid, signal) + if err != nil && !errors.Is(err, syscall.ESRCH) { + return err + } + + return nil +} + func (p *Handler) ResizeTty(size *pty.Winsize) error { if p.tty == nil { return errors.New("tty not assigned to process") diff --git a/packages/envd/internal/services/process/start_test.go b/packages/envd/internal/services/process/start_test.go index 4ccd39f4a0..ede35db9e7 100644 --- a/packages/envd/internal/services/process/start_test.go +++ b/packages/envd/internal/services/process/start_test.go @@ -6,6 +6,10 @@ import ( "net/http/httptest" "os" "os/user" + "slices" + "strconv" + "strings" + "syscall" "testing" "time" @@ -84,6 +88,113 @@ func TestStart_ShortCommand(t *testing.T) { assert.NotNil(t, events[len(events)-1].GetEnd(), "last event should be End") } +// TestSendSignal_KillsProcessTree verifies that killing a command also +// terminates the child processes it spawned. envd's per-process handle only +// signaled the leader, so children kept running after a kill; non-PTY commands +// now run in their own process group and SIGKILL reaches the whole tree. +func TestSendSignal_KillsProcessTree(t *testing.T) { + t.Parallel() + + client, cleanup := newTestService(t) + defer cleanup() + + ctx, cancel := context.WithTimeout(t.Context(), 20*time.Second) + defer cancel() + + // The leader spawns two long-lived children and waits on them. + stream, err := client.Start(ctx, connect.NewRequest(&rpc.StartRequest{ + Process: &rpc.ProcessConfig{ + Cmd: "/bin/sh", + Args: []string{"-c", "sleep 120 & sleep 120 & wait"}, + }, + })) + require.NoError(t, err) + + // Read the start event to capture the leader pid. + require.True(t, stream.Receive(), "expected a start event") + pid := stream.Msg().GetEvent().GetStart().GetPid() + require.NotZero(t, pid) + + // Wait for the children to come up, then capture their pids. + var childPids []int + require.Eventually(t, func() bool { + childPids = childrenOf(t, int(pid)) + + return len(childPids) == 2 + }, 5*time.Second, 50*time.Millisecond, "expected leader to spawn two children") + + // Kill the command — this should take down the whole process group. + _, err = client.SendSignal(ctx, connect.NewRequest(&rpc.SendSignalRequest{ + Process: &rpc.ProcessSelector{ + Selector: &rpc.ProcessSelector_Pid{Pid: pid}, + }, + Signal: rpc.Signal_SIGNAL_SIGKILL, + })) + require.NoError(t, err) + + // Drain the stream so the leader is reaped. + for stream.Receive() { + } + _ = stream.Close() + + // The leader and every child must be gone. + require.Eventually(t, func() bool { + return !slices.ContainsFunc(childPids, processAlive) + }, 5*time.Second, 50*time.Millisecond, "child processes should be killed with the leader") +} + +// childrenOf returns the pids whose parent is ppid, read from /proc. +func childrenOf(t *testing.T, ppid int) []int { + t.Helper() + + entries, err := os.ReadDir("/proc") + require.NoError(t, err) + + var children []int + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + pid, err := strconv.Atoi(entry.Name()) + if err != nil { + continue + } + + data, err := os.ReadFile("/proc/" + entry.Name() + "/stat") + if err != nil { + // The process may have exited between listing and reading. + continue + } + + // /proc//stat fields: pid (comm) state ppid ... + // comm can contain spaces/parens, so parse after the closing paren. + stat := string(data) + idx := strings.LastIndex(stat, ")") + if idx == -1 { + continue + } + + fields := strings.Fields(stat[idx+1:]) + // fields[0]=state, fields[1]=ppid + if len(fields) < 2 { + continue + } + + if parsed, err := strconv.Atoi(fields[1]); err == nil && parsed == ppid { + children = append(children, pid) + } + } + + return children +} + +// processAlive reports whether the given pid still exists. +func processAlive(pid int) bool { + // Signal 0 performs error checking without actually sending a signal. + return syscall.Kill(pid, 0) == nil +} + // TestStart_ClientDisconnectMidStream verifies that when a client // cancels mid-stream, the handler returns without racing. This is // the scenario that caused the nil-pointer panic in production: diff --git a/packages/envd/pkg/version.go b/packages/envd/pkg/version.go index e22f35a718..d041ae4222 100644 --- a/packages/envd/pkg/version.go +++ b/packages/envd/pkg/version.go @@ -1,3 +1,3 @@ package pkg -const Version = "0.6.1" +const Version = "0.6.2"