Skip to content

Commit 078df68

Browse files
fix(telemetry): sandbox invocation test + make postPhase non-blocking
1 parent 80526e0 commit 078df68

3 files changed

Lines changed: 126 additions & 20 deletions

File tree

internal/telemetry/invocation_test.go

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"os"
55
"path/filepath"
66
"runtime"
7+
"strings"
78
"testing"
89

910
"github.com/step-security/dev-machine-guard/internal/launchd"
@@ -51,12 +52,24 @@ func TestDetectInvocationMethod_HostMachine(t *testing.T) {
5152
// TestDetectInvocationMethod_RespondsToFilesystem covers the darwin/linux
5253
// path that stats a scheduler artifact. On Windows the check shells out to
5354
// schtasks, which we can't safely stub without an executor seam — skip there.
55+
//
56+
// Sandboxes HOME (Unix) and USERPROFILE (Windows-safe no-op on Unix) under
57+
// t.TempDir() so launchd.UserPlistPath / systemd.TimerUnitPath compute paths
58+
// that live entirely inside the temp tree. Without this the test would write
59+
// markers (and MkdirAll-created parent dirs) into the developer's real
60+
// ~/Library/LaunchAgents or ~/.config/systemd/user — leaving stray files
61+
// behind on CI and risking a tiny TOCTOU window against a real install.
5462
func TestDetectInvocationMethod_RespondsToFilesystem(t *testing.T) {
5563
if runtime.GOOS == "windows" {
5664
t.Skip("windows uses schtasks /query, not filesystem")
5765
}
5866

59-
// Resolve the platform's expected artifact path.
67+
tempHome := t.TempDir()
68+
t.Setenv("HOME", tempHome)
69+
t.Setenv("USERPROFILE", tempHome) // no-op on Unix but cheap and keeps the seam consistent
70+
71+
// Resolve the platform's expected artifact path AFTER the env override
72+
// so os.UserHomeDir() returns tempHome.
6073
var path string
6174
switch runtime.GOOS {
6275
case "darwin":
@@ -69,24 +82,25 @@ func TestDetectInvocationMethod_RespondsToFilesystem(t *testing.T) {
6982
if path == "" {
7083
t.Skip("could not resolve scheduler artifact path on this host")
7184
}
85+
if !strings.HasPrefix(path, tempHome) {
86+
t.Fatalf("resolved path %q escaped tempHome %q — env sandbox is not effective", path, tempHome)
87+
}
7288

73-
// We don't want to clobber a real installation. If the artifact already
74-
// exists, just confirm the detector agrees and bail; otherwise create a
75-
// fake marker, assert detection flips, then clean up.
76-
if _, err := os.Stat(path); err == nil {
77-
if got := DetectInvocationMethod(); got != InvocationInstall {
78-
t.Fatalf("artifact %q exists but detector returned %q", path, got)
79-
}
80-
return
89+
// Fresh temp home — detector starts at one_time, flips to install when
90+
// the marker appears, flips back when it's removed.
91+
if got := DetectInvocationMethod(); got != InvocationOneTime {
92+
t.Fatalf("on clean temp home, detector returned %q, want %q",
93+
got, InvocationOneTime)
8194
}
8295

8396
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
84-
t.Skipf("cannot prepare scheduler artifact dir for test: %v", err)
97+
t.Fatalf("prepare scheduler artifact dir: %v", err)
8598
}
8699
if err := os.WriteFile(path, []byte("x"), 0o600); err != nil {
87-
t.Skipf("cannot write fake scheduler artifact: %v", err)
100+
t.Fatalf("write fake scheduler artifact: %v", err)
88101
}
89-
t.Cleanup(func() { _ = os.Remove(path) })
102+
// No explicit cleanup: everything lives under t.TempDir() and is
103+
// removed by the testing framework when the test ends.
90104

91105
if got := DetectInvocationMethod(); got != InvocationInstall {
92106
t.Fatalf("after creating %q, detector returned %q, want %q",
@@ -98,8 +112,6 @@ func TestDetectInvocationMethod_RespondsToFilesystem(t *testing.T) {
98112
if err := os.Remove(path); err != nil {
99113
t.Fatalf("remove fake artifact: %v", err)
100114
}
101-
// Avoid double-remove in cleanup.
102-
t.Cleanup(func() {})
103115

104116
if got := DetectInvocationMethod(); got != InvocationOneTime {
105117
t.Fatalf("after removing %q, detector returned %q, want %q",

internal/telemetry/run_status.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,12 @@ func postRunStatusWithRetry(ctx context.Context, log *progress.Logger,
146146
select {
147147
case <-time.After(runStatusRetryBackoff):
148148
case <-ctx.Done():
149-
log.Progress("run-status: parent context done, abandoning retries")
149+
// Demoted to Debug: parent ctx done means clean shutdown
150+
// (or operator-initiated cancel that already logged its
151+
// own reason). No need to add a second progress-level
152+
// noise line in the normal "scan finished successfully,
153+
// last progress post got cut off by cancelRun()" flow.
154+
log.Debug("run-status: parent context done, abandoning retries")
150155
return
151156
}
152157
}
@@ -159,9 +164,22 @@ func postRunStatusWithRetry(ctx context.Context, log *progress.Logger,
159164
// postRunStatusOnce performs a single HTTP attempt. Returns true on a 2xx
160165
// or 4xx (terminal — retrying a bad request will not help). Returns false
161166
// on transport errors or 5xx so the caller can retry.
167+
//
168+
// Treats parent-ctx cancellation as terminal-and-quiet: the only time we
169+
// observe ctx.Err() != nil here is shutdown (cancelRun fired). Logging a
170+
// "POST error: context canceled" at progress level on every successful
171+
// scan — because the final phase-boundary post raced the deferred
172+
// cancelRun — would mislead operators into thinking the run failed.
162173
func postRunStatusOnce(ctx context.Context, log *progress.Logger,
163174
endpoint string, body []byte, status string, attempt, maxAttempts int) bool {
164175

176+
// Short-circuit when the scan ctx is already done — no point opening a
177+
// connection just to have it cancelled mid-handshake.
178+
if ctx.Err() != nil {
179+
log.Debug("run-status[%s %d/%d]: skipped (parent context done)", status, attempt, maxAttempts)
180+
return true
181+
}
182+
165183
cctx, cancel := context.WithTimeout(ctx, runStatusHTTPTimeout)
166184
defer cancel()
167185

@@ -177,6 +195,14 @@ func postRunStatusOnce(ctx context.Context, log *progress.Logger,
177195
client := &http.Client{Timeout: runStatusHTTPTimeout}
178196
resp, err := client.Do(req)
179197
if err != nil {
198+
// Distinguish shutdown-induced cancellation from a real transport
199+
// error. If the parent ctx is done, this was clean shutdown — log
200+
// at debug and return terminal so the caller doesn't burn a retry
201+
// on an already-cancelled context.
202+
if ctx.Err() != nil {
203+
log.Debug("run-status[%s %d/%d]: aborted at shutdown: %v", status, attempt, maxAttempts, err)
204+
return true
205+
}
180206
log.Progress("run-status[%s %d/%d]: POST error: %v", status, attempt, maxAttempts, err)
181207
return false
182208
}

internal/telemetry/telemetry.go

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"net/http"
1111
"os"
1212
"os/signal"
13+
"sync"
1314
"sync/atomic"
1415
"syscall"
1516
"time"
@@ -153,11 +154,76 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err
153154
}
154155
}
155156

157+
// Phase-boundary progress posts run on a dedicated worker so the scan
158+
// never blocks on HTTP at a call site. Buffer=1 + drop-oldest send
159+
// gives us two properties together:
160+
// - Strict ordering: a single consumer means the backend can never
161+
// see an older snapshot land after a newer one (which would cause
162+
// the console UI to briefly regress on degraded networks).
163+
// - Bounded resources: at most one pending snapshot is queued; a
164+
// slow-network backlog can't grow across the 11+ inline call
165+
// sites. The latest snapshot always wins, which matches what an
166+
// operator watching progress actually cares about.
167+
//
168+
// Without this, blocking phase posts could add the per-call retry
169+
// budget (~6s: 2 attempts × 3s HTTP timeout + 500ms backoff) to each
170+
// call site, compounding to over a minute of added scan latency on a
171+
// degraded link for purely best-effort progress data.
172+
phaseCh := make(chan RunStatusInfo, 1)
173+
phaseDone := make(chan struct{})
174+
var phaseSendMu sync.Mutex // serialises drain+send so concurrent producers (main scan + heartbeat) don't race
175+
go func() {
176+
defer close(phaseDone)
177+
// process posts one snapshot using a Background-derived ctx with
178+
// a bounded per-post timeout. We deliberately do NOT chain off the
179+
// scan ctx here: the final phase-boundary post (which is the only
180+
// snapshot that includes "telemetry_upload" in phases_completed)
181+
// arrives at the worker *after* the function body returns and the
182+
// deferred cancelRun() fires. If we shared the scan ctx, that post
183+
// would always be cancelled mid-flight and the backend would never
184+
// learn the upload completed. The 10s budget covers postProgress's
185+
// own internal retry window (2×3s + 500ms backoff) with slack.
186+
process := func(snap RunStatusInfo) {
187+
postCtx, postCancel := context.WithTimeout(context.Background(), 10*time.Second)
188+
defer postCancel()
189+
postProgress(postCtx, log, executionID, deviceID, invocationMethod, snap)
190+
}
191+
for {
192+
select {
193+
case snap := <-phaseCh:
194+
process(snap)
195+
case <-ctx.Done():
196+
// Drain any queued snapshot before exiting. Without this,
197+
// a naïve select on the next iteration would 50/50 between
198+
// the ready ctx.Done() and the ready phaseCh — dropping
199+
// the final post is exactly what the user reports as
200+
// "telemetry_upload missing from phases_completed".
201+
for {
202+
select {
203+
case snap := <-phaseCh:
204+
process(snap)
205+
default:
206+
return
207+
}
208+
}
209+
}
210+
}
211+
}()
212+
156213
// postPhase is the convergence point for phase-boundary and heartbeat
157214
// progress updates. Captured here so the heartbeat goroutine and the
158215
// inline phase wrappers share a single call site.
159216
postPhase := func() {
160-
postProgress(ctx, log, executionID, deviceID, invocationMethod, tracker.Snapshot())
217+
snap := tracker.Snapshot()
218+
phaseSendMu.Lock()
219+
defer phaseSendMu.Unlock()
220+
// Drop any queued (older) snapshot so the freshest one always lands.
221+
select {
222+
case <-phaseCh:
223+
default:
224+
}
225+
// Always succeeds: buffer=1, just drained, single sender under the mutex.
226+
phaseCh <- snap
161227
}
162228

163229
// Catch SIGINT / SIGTERM so cancellation (Ctrl+C, launchd stop, kill)
@@ -278,13 +344,15 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err
278344
}
279345
}
280346
}()
281-
// Shut the heartbeat down cleanly on return. Order matters: cancel
282-
// first so the goroutine sees ctx.Done() and exits, THEN wait for it
283-
// to close heartbeatDone. Splitting these into two `defer` statements
284-
// would deadlock — LIFO would block on the wait before cancel fires.
347+
// Shut down both the heartbeat goroutine and the phase-post worker
348+
// cleanly on return. Order matters: cancel first so both goroutines
349+
// see ctx.Done() and exit, THEN wait for each to close its done
350+
// channel. Splitting these into separate `defer` statements would
351+
// deadlock — LIFO would block on the waits before cancel fires.
285352
defer func() {
286353
cancelRun()
287354
<-heartbeatDone
355+
<-phaseDone
288356
}()
289357

290358
// Detect logged-in user for running commands as the real user when root.

0 commit comments

Comments
 (0)