Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .tool-versions
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
golang 1.24.13
2 changes: 2 additions & 0 deletions internal/executor/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ func NewReal() *Real { return &Real{} }
func (r *Real) Run(ctx context.Context, name string, args ...string) (string, string, int, error) {
cmd := exec.CommandContext(ctx, name, args...)
winproc.HideWindow(cmd)
setupKillgroupOnCancel(cmd)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
Expand Down Expand Up @@ -115,6 +116,7 @@ func (r *Real) RunInDir(ctx context.Context, dir string, timeout time.Duration,
defer cancel()
cmd := exec.CommandContext(ctx, name, args...)
winproc.HideWindow(cmd)
setupKillgroupOnCancel(cmd)
cmd.Dir = dir
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
Expand Down
31 changes: 31 additions & 0 deletions internal/executor/executor_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,42 @@ import (
"context"
"fmt"
"os"
"os/exec"
"runtime"
"strings"
"syscall"
"time"
)

// setupKillgroupOnCancel arranges for cmd's whole process group to be killed
// when its context is cancelled or times out, and bounds how long Wait() will
// block on hung pipe copies. It must be called before cmd.Start.
//
// Without this, exec.CommandContext only SIGKILLs the immediate child PID on
// timeout. If that child has forked grandchildren that inherit stdout/stderr
// (Electron helpers, npm sandbox workers, bash backgrounded subprocesses),
// the pipes remain open from the surviving descendants and cmd.Wait() blocks
// forever — the deadline is effectively ignored. Seen in production as
// node_scan hangs averaging 3.6 min per project under a 30s per-call ceiling.
//
// What gets configured:
//   - SysProcAttr.Setpgid = true makes cmd the leader of a new process group,
//     so kill(-pid, SIGKILL) reaches the whole subtree. An existing
//     SysProcAttr is reused rather than replaced.
//   - cmd.Cancel (run by os/exec when the context is done) sends SIGKILL to
//     that group. If the group no longer exists (ESRCH) it reports
//     os.ErrProcessDone, which os/exec treats as "the process already
//     finished" instead of surfacing a spurious cancellation error from Wait.
//   - cmd.WaitDelay bounds the pipe-copy wait independently of the kill, so
//     Wait still returns if a descendant survives and keeps a pipe open.
func setupKillgroupOnCancel(cmd *exec.Cmd) {
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &syscall.SysProcAttr{}
	}
	cmd.SysProcAttr.Setpgid = true
	cmd.Cancel = func() error {
		if cmd.Process == nil {
			// Nothing was started, so there is nothing to signal.
			return nil
		}
		// A negative PID addresses the process group led by that PID.
		err := syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
		if err == syscall.ESRCH {
			// The group is already gone: tell os/exec the process is done
			// rather than reporting "no such process" as a cancel failure.
			return os.ErrProcessDone
		}
		return err
	}
	cmd.WaitDelay = 2 * time.Second
}

// IsRoot reports whether the current process's real user ID is 0.
func (r *Real) IsRoot() bool {
	uid := os.Getuid()
	return uid == 0
}
Expand Down
60 changes: 60 additions & 0 deletions internal/executor/executor_unix_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
//go:build !windows

package executor

import (
"context"
"os"
"path/filepath"
"testing"
"time"
)

// TestRunWithTimeoutHangsOnUnreapedChild is a regression test for the
// Antigravity hang (StepSecurity Device Agent v1.11.6, Coveo deployment).
//
// The IDE detector runs `Contents/MacOS/Antigravity --version`. Antigravity
// is Electron-based and forks helper processes (GPU, renderer, utility) that
// inherit the parent's stdout/stderr. When the context timeout fired, only
// the parent PID was SIGKILLed — not its process group — so the helpers kept
// the inherited pipe write-ends open and cmd.Wait() blocked waiting for an
// EOF that never arrived.
//
// The failure is modelled with a bash script that backgrounds a long sleep
// (the "helper") and then exits (the "parent"). Before the fix the call
// blocked until the sleep finished, i.e. the timeout was ignored. With the
// fix (Setpgid + cmd.Cancel killing the process group, plus cmd.WaitDelay)
// the call is expected to return within roughly the requested timeout.
func TestRunWithTimeoutHangsOnUnreapedChild(t *testing.T) {
	const (
		timeout = 2 * time.Second
		// Upper bound on the whole call: the timeout plus WaitDelay (2s),
		// with slack for slow CI runners. Without the fix the call takes
		// about as long as the backgrounded sleep (~60s), so exceeding this
		// bound means the process-group kill or WaitDelay regressed.
		ceiling = 7 * time.Second
	)

	scriptPath := filepath.Join(t.TempDir(), "fake-version.sh")
	// The backgrounded `sleep` inherits stdout/stderr from the script, and
	// the script exits right after echoing — mirroring Electron's "print
	// version, exit, leave helpers running" behavior.
	contents := []byte("#!/bin/bash\nsleep 60 &\necho version-1.0\n")
	if err := os.WriteFile(scriptPath, contents, 0o755); err != nil {
		t.Fatalf("write script: %v", err)
	}

	runner := &Real{}
	began := time.Now()
	// Only the wall-clock duration matters here; the results are discarded.
	_, _, _, _ = runner.RunWithTimeout(context.Background(), timeout, scriptPath)
	took := time.Since(began)

	if took <= ceiling {
		return
	}
	t.Fatalf(
		"RunWithTimeout hung for %s (expected ~%s). "+
			"The context timeout fired but cmd.Wait() blocked because a "+
			"backgrounded child still holds stdout open. Same failure mode "+
			"as running /Applications/Antigravity.app/Contents/MacOS/Antigravity --version.",
		took, timeout,
	)
}
9 changes: 9 additions & 0 deletions internal/executor/executor_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,20 @@ package executor

import (
"context"
"os/exec"
"strings"

"golang.org/x/sys/windows"
)

// setupKillgroupOnCancel is intentionally a no-op on Windows for now: it
// leaves cmd untouched, so cmd.Cancel, cmd.WaitDelay and cmd.SysProcAttr keep
// whatever values they already had when this is called.
//
// The Unix equivalent uses Setpgid + kill(-pgid) to kill grandchildren on
// ctx cancel. The Windows analogue (JobObject + CREATE_BREAKAWAY_FROM_JOB)
// is a larger change and is tracked separately — Windows hosts are
// considered less exposed to the unreaped-helper hang because most scanned
// binaries are not Electron apps invoked under launchd.
func setupKillgroupOnCancel(cmd *exec.Cmd) {}

// IsRoot reports whether the current process token is elevated.
func (r *Real) IsRoot() bool {
	token := windows.GetCurrentProcessToken()
	return token.IsElevated()
}
Expand Down
96 changes: 96 additions & 0 deletions internal/telemetry/log_tail_emitter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package telemetry

import (
"bytes"
"compress/gzip"
"encoding/base64"
"sync"
"time"
)

const (
	// captureTailBytes is the number of trailing bytes of the captured
	// stderr stream shipped on each eligible heartbeat (256 KiB). The size
	// is a trade-off: enough context to diagnose a hang, yet small enough
	// that backend storage stays manageable at scale. The original sizing
	// estimate assumes a ~10x gzip ratio on agent log lines, i.e. roughly
	// 25 KB of compressed data per tail before base64 encoding.
	captureTailBytes = 256 << 10

	// logTailHeartbeatInterval is the minimum spacing between tail
	// attachments. Phase boundaries can fire at sub-second rates during
	// fast scans and would otherwise put a tail on every status_info post.
	// Two minutes keeps the tail fresh enough to diagnose a stuck device
	// while capping traffic at 30 tails per hour per device.
	logTailHeartbeatInterval = 2 * time.Minute
)

// logTailEmitter attaches a gzip+base64-encoded log tail to RunStatusInfo
// snapshots on a fixed interval. Safe for concurrent use across the
// heartbeat goroutine and the inline phase-boundary callers, which can
// both reach postPhase.
type logTailEmitter struct {
	// capture is the source of the log tail; MaybeAttach is a no-op when
	// it is nil.
	capture *LogCapture
	// interval is the minimum time that must pass after one attachment
	// before MaybeAttach attaches another.
	interval time.Duration
	// now returns the current time. newLogTailEmitter sets it to time.Now;
	// tests replace it with a deterministic clock.
	now func() time.Time

	// mu guards lastSent.
	mu sync.Mutex
	// lastSent is the time of the most recent attachment; the zero value
	// means nothing has been attached yet.
	lastSent time.Time
}

// newLogTailEmitter builds an emitter that reads tails from capture and
// attaches at most one per interval, using time.Now as its clock.
func newLogTailEmitter(capture *LogCapture, interval time.Duration) *logTailEmitter {
	emitter := new(logTailEmitter)
	emitter.capture = capture
	emitter.interval = interval
	emitter.now = time.Now
	return emitter
}

// MaybeAttach populates info.LogTailGzipBase64 iff the throttle window has
// elapsed since the last attachment and the capture buffer is non-empty.
// No-op on nil receiver, nil capture (tests, or pre-StartCapture early
// returns), or nil info.
//
// The throttle slot is claimed under the lock before the tail is read and
// encoded, so concurrent callers do not both attach within one window. If
// nothing ends up attached — the buffer is empty or encoding fails — the
// claim is rolled back so the next caller can try again immediately instead
// of waiting out a full interval with no tail shipped.
func (e *logTailEmitter) MaybeAttach(info *RunStatusInfo) {
	if e == nil || e.capture == nil || info == nil {
		return
	}
	now := e.now()

	e.mu.Lock()
	if !e.lastSent.IsZero() && now.Sub(e.lastSent) < e.interval {
		e.mu.Unlock()
		return
	}
	// Tentatively claim the slot; the expensive work happens outside the lock.
	previous := e.lastSent
	e.lastSent = now
	e.mu.Unlock()

	// release undoes the tentative claim when no tail was attached.
	release := func() {
		e.mu.Lock()
		e.lastSent = previous
		e.mu.Unlock()
	}

	tail := e.capture.Tail(captureTailBytes)
	if len(tail) == 0 {
		release()
		return
	}

	encoded, err := gzipBase64(tail)
	if err != nil {
		// Compression failure shouldn't happen for in-memory writes; if it
		// does, drop the tail rather than ship raw bytes — the progress
		// upsert still succeeds with the rest of the snapshot. Nothing was
		// attached, so the throttle window must not be consumed either.
		release()
		return
	}
	info.LogTailGzipBase64 = encoded
}

// gzipBase64 compresses b with gzip at the default level and returns the
// compressed bytes encoded as standard (padded) base64. It is a standalone
// helper so the emitter and its tests go through one encoding pipeline.
//
// The gzip writer is always closed. A write error takes precedence over a
// close error, and either one yields an empty string.
func gzipBase64(b []byte) (string, error) {
	compressed := new(bytes.Buffer)
	gz := gzip.NewWriter(compressed)

	_, writeErr := gz.Write(b)
	closeErr := gz.Close()

	switch {
	case writeErr != nil:
		return "", writeErr
	case closeErr != nil:
		return "", closeErr
	}
	return base64.StdEncoding.EncodeToString(compressed.Bytes()), nil
}
111 changes: 111 additions & 0 deletions internal/telemetry/log_tail_emitter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package telemetry

import (
"bytes"
"compress/gzip"
"encoding/base64"
"io"
"strings"
"testing"
"time"
)

// stepClock is a manually advanced time source used to make the throttle
// test deterministic.
type stepClock struct {
	t time.Time
}

// now returns the clock's current time.
func (c *stepClock) now() time.Time {
	return c.t
}

// advance moves the clock forward by d.
func (c *stepClock) advance(d time.Duration) {
	next := c.t.Add(d)
	c.t = next
}

// newEmitterWithFakeClock builds a logTailEmitter via newLogTailEmitter and
// swaps its time source for clk, so tests control the throttle window.
func newEmitterWithFakeClock(cap *LogCapture, interval time.Duration, clk *stepClock) *logTailEmitter {
	emitter := newLogTailEmitter(cap, interval)
	emitter.now = clk.now
	return emitter
}

// TestLogTailEmitter_ThrottlesAttachments checks the throttle contract of
// MaybeAttach with a fake clock: the first call attaches, a call inside the
// window is skipped, and a call after the window attaches the latest tail.
func TestLogTailEmitter_ThrottlesAttachments(t *testing.T) {
	capture := &LogCapture{ring: newRingBuffer(64 * 1024)}
	capture.ring.Write([]byte("first batch of logs\n"))

	clock := &stepClock{t: time.Unix(1_700_000_000, 0)}
	emitter := newEmitterWithFakeClock(capture, 2*time.Minute, clock)

	// attach runs MaybeAttach against a fresh snapshot and returns the
	// encoded tail, or "" when the emitter skipped the attachment.
	attach := func() string {
		var snapshot RunStatusInfo
		emitter.MaybeAttach(&snapshot)
		return snapshot.LogTailGzipBase64
	}

	// There is no prior send timestamp right after construction, so the
	// very first heartbeat MUST carry a tail.
	first := attach()
	if first == "" {
		t.Fatalf("first MaybeAttach must attach a tail; got empty field")
	}
	if decoded := decodeTail(t, first); !strings.Contains(decoded, "first batch") {
		t.Fatalf("attached tail missing expected content; got %q", decoded)
	}

	// 30s later is still inside the 2-minute window: nothing is attached.
	capture.ring.Write([]byte("second batch (within throttle window)\n"))
	clock.advance(30 * time.Second)
	if attach() != "" {
		t.Fatalf("MaybeAttach within throttle window must skip; attached anyway")
	}

	// Once the window has passed, attachment resumes with the newest data.
	clock.advance(2 * time.Minute)
	capture.ring.Write([]byte("third batch after window\n"))
	third := attach()
	if third == "" {
		t.Fatalf("MaybeAttach after throttle window must attach")
	}
	if decoded := decodeTail(t, third); !strings.Contains(decoded, "third batch") {
		t.Fatalf("post-window tail must include latest content; got %q", decoded)
	}
}

// TestLogTailEmitter_NilSafe verifies that MaybeAttach does not panic when
// the receiver, the capture, or the info argument is nil. The test passes
// simply by running to completion.
func TestLogTailEmitter_NilSafe(t *testing.T) {
	// Nil receiver.
	var nilEmitter *logTailEmitter
	nilEmitter.MaybeAttach(&RunStatusInfo{})

	// Nil capture.
	withoutCapture := newLogTailEmitter(nil, 2*time.Minute)
	withoutCapture.MaybeAttach(&RunStatusInfo{})

	// Nil info.
	capture := &LogCapture{ring: newRingBuffer(1024)}
	withCapture := newLogTailEmitter(capture, 2*time.Minute)
	withCapture.MaybeAttach(nil)
}

// TestRingBuffer_TailRespectsWraparound checks that Tail returns the most
// recently written bytes in write order, both before the buffer has filled
// and after writes have wrapped past its capacity.
func TestRingBuffer_TailRespectsWraparound(t *testing.T) {
	// A tiny capacity makes the wrap point deterministic.
	ring := newRingBuffer(8)

	// expectTail fails the test when Tail(n) differs from want.
	expectTail := func(label string, n int, want string) {
		t.Helper()
		if got := string(ring.Tail(n)); got != want {
			t.Fatalf(label+": got %q, want %q", got, want)
		}
	}

	// Six bytes into an eight-byte buffer: not yet full.
	ring.Write([]byte("abcdef"))
	expectTail("Tail before fill", 10, "abcdef")

	// Ten bytes written in total ("abcdefghij"): the buffer has wrapped and
	// now holds only the last eight, "cdefghij".
	ring.Write([]byte("ghij"))
	expectTail("Tail after wrap", 8, "cdefghij")
	expectTail("Tail(3) after wrap", 3, "hij")
}

// decodeTail reverses the emitter's encoding: it base64-decodes encoded,
// gunzips the result, and returns the plain text. Any decoding failure
// aborts the calling test via t.Fatalf.
func decodeTail(t *testing.T, encoded string) string {
	t.Helper()

	compressed, err := base64.StdEncoding.DecodeString(encoded)
	if err != nil {
		t.Fatalf("base64 decode: %v", err)
	}

	reader, err := gzip.NewReader(bytes.NewReader(compressed))
	if err != nil {
		t.Fatalf("gzip reader: %v", err)
	}
	defer reader.Close()

	var plain bytes.Buffer
	if _, err := io.Copy(&plain, reader); err != nil {
		t.Fatalf("gzip read: %v", err)
	}
	return plain.String()
}
Loading
Loading