Skip to content

Commit 10746a7

Browse files
authored
fix(envd): fix process cleanup in cgroup test to prevent OOM and data race (#2588)
The cgroup round-trip test spawns `tail /dev/zero` under a memory-limited cgroup to verify the OOM kill. Three problems caused host OOM and flaky failures when running with `-count` or `-race`: (1) Orphaned child: killing only bash left `tail` running as an orphan that ate unbounded memory. The fix starts the child in its own process group (`Setpgid`) and kills the entire group on timeout and in `t.Cleanup`; the command also now uses `exec` so bash replaces itself with the child process. (2) Double wait: both `waitForProcess` and `t.Cleanup` called `cmd.Wait()`, causing a data race. Now `t.Cleanup` only kills, and `waitForProcess` owns the wait — it drains the goroutine after killing on timeout so there's no leak or race. (3) Unbounded allocation: `tail /dev/zero` keeps growing for as long as it runs. The second commit replaces `tail /dev/zero` with a perl one-liner that allocates a fixed 512 MiB and sleeps, so if the process escapes cleanup it holds bounded memory rather than growing until the kernel intervenes.
1 parent 6c0bcb1 commit 10746a7

1 file changed

Lines changed: 18 additions & 2 deletions

File tree

packages/envd/internal/services/cgroups/cgroup2_test.go

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -157,21 +157,34 @@ func createCgroupPath(t *testing.T, s string) string {
157157
func startProcess(t *testing.T, m *Cgroup2Manager, pt ProcessType) *exec.Cmd {
158158
t.Helper()
159159

160-
cmdName, args := "bash", []string{"-c", `sleep 1 && tail /dev/zero`}
160+
cmdName, args := "bash", []string{"-c", `sleep 1 && exec perl -e 'my $x = "A" x (512*1024*1024); sleep 300'`}
161161
cmd := exec.CommandContext(t.Context(), cmdName, args...)
162162

163163
fd, ok := m.GetFileDescriptor(pt)
164164
cmd.SysProcAttr = &syscall.SysProcAttr{
165165
UseCgroupFD: ok,
166166
CgroupFD: fd,
167+
Setpgid: true,
167168
}
168169

169170
err := cmd.Start()
170171
require.NoError(t, err)
171172

173+
t.Cleanup(func() { killProcessGroup(cmd) })
174+
172175
return cmd
173176
}
174177

178+
func killProcessGroup(cmd *exec.Cmd) {
179+
if cmd.Process == nil {
180+
return
181+
}
182+
183+
if err := syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL); err != nil {
184+
_ = cmd.Process.Kill()
185+
}
186+
}
187+
175188
func waitForProcess(t *testing.T, cmd *exec.Cmd, timeout time.Duration) error {
176189
t.Helper()
177190

@@ -183,10 +196,13 @@ func waitForProcess(t *testing.T, cmd *exec.Cmd, timeout time.Duration) error {
183196
}()
184197

185198
ctx, cancel := context.WithTimeout(t.Context(), timeout)
186-
t.Cleanup(cancel)
199+
defer cancel()
187200

188201
select {
189202
case <-ctx.Done():
203+
killProcessGroup(cmd)
204+
<-done
205+
190206
return ctx.Err()
191207
case err := <-done:
192208
return err

0 commit comments

Comments
 (0)