Skip to content

Commit db2b2bc

Browse files
fix(telemetry): capture log tail through upload
1 parent 796a917 commit db2b2bc

7 files changed

Lines changed: 276 additions & 25 deletions

File tree

internal/telemetry/execution_deadline_test.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,14 @@ func TestExecutionDeadlineFromEnv(t *testing.T) {
2424
}
2525
for _, tc := range cases {
2626
t.Run(tc.name, func(t *testing.T) {
27+
// Always set the var (to "" for the unset cases) so the test is
28+
// hermetic: a value inherited from the host or a CI job must not
29+
// leak into the "unset" cases. os.Getenv treats truly-unset and
30+
// empty identically, so "" exercises the same fall-through path.
2731
if tc.set {
2832
t.Setenv("STEPSEC_MAX_EXECUTION_DURATION", tc.env)
2933
} else {
30-
// Belt-and-braces: t.Setenv resets at test exit, but
31-
// nothing in this package mutates the env at init.
32-
_ = tc.env
34+
t.Setenv("STEPSEC_MAX_EXECUTION_DURATION", "")
3335
}
3436
got := ExecutionDeadlineFromEnv()
3537
if got != tc.want {
@@ -63,8 +65,12 @@ func TestExecutionDeadline_EnvThenConfigThenDefault(t *testing.T) {
6365
}
6466
for _, tc := range cases {
6567
t.Run(tc.name, func(t *testing.T) {
68+
// Always set the var (to "" when setEnv is false) so an inherited
69+
// host/CI value can't bypass the config-fallback cases under test.
6670
if tc.setEnv {
6771
t.Setenv("STEPSEC_MAX_EXECUTION_DURATION", tc.env)
72+
} else {
73+
t.Setenv("STEPSEC_MAX_EXECUTION_DURATION", "")
6874
}
6975
got := ExecutionDeadline(tc.configVal)
7076
if got != tc.want {

internal/telemetry/log_tail_emitter.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,30 @@ func (e *logTailEmitter) MaybeAttach(info *RunStatusInfo) {
7979
info.LogTailGzipBase64 = encoded
8080
}
8181

82+
// ForceAttach attaches the current tail regardless of the throttle window.
83+
// Used for the single final post-upload snapshot so the run-status row carries
84+
// the most complete tail — including the upload's own output and the
85+
// completion line — which a throttled MaybeAttach would otherwise drop on a
86+
// sub-2-minute run. Updates lastSent so any later MaybeAttach still respects
87+
// the interval. No-op on nil capture / empty buffer.
88+
func (e *logTailEmitter) ForceAttach(info *RunStatusInfo) {
89+
if e == nil || e.capture == nil || info == nil {
90+
return
91+
}
92+
tail := e.capture.Tail(captureTailBytes)
93+
if len(tail) == 0 {
94+
return
95+
}
96+
encoded, err := gzipBase64(tail)
97+
if err != nil {
98+
return
99+
}
100+
e.mu.Lock()
101+
e.lastSent = e.now()
102+
e.mu.Unlock()
103+
info.LogTailGzipBase64 = encoded
104+
}
105+
82106
// gzipBase64 gzips b at default compression and base64-encodes the
83107
// result. Wrapped here rather than inlined so the emitter and tests
84108
// share a single encoding pipeline.

internal/telemetry/log_tail_emitter_test.go

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,16 +63,65 @@ func TestLogTailEmitter_ThrottlesAttachments(t *testing.T) {
6363
}
6464
}
6565

66+
// ForceAttach must attach the current tail even inside the throttle window —
67+
// the final post-upload snapshot depends on it so the last lines (the upload's
68+
// own output) aren't dropped by the throttle on a sub-2-minute run.
69+
func TestLogTailEmitter_ForceAttachBypassesThrottle(t *testing.T) {
70+
lc := &LogCapture{ring: newRingBuffer(64 * 1024)}
71+
lc.ring.Write([]byte("startup output\n"))
72+
73+
clk := &stepClock{t: time.Unix(1_700_000_000, 0)}
74+
em := newEmitterWithFakeClock(lc, 2*time.Minute, clk)
75+
76+
// First MaybeAttach consumes the throttle slot.
77+
var snap RunStatusInfo
78+
em.MaybeAttach(&snap)
79+
if snap.LogTailGzipBase64 == "" {
80+
t.Fatal("first MaybeAttach should attach")
81+
}
82+
83+
// Still within the throttle window: MaybeAttach skips, ForceAttach does not.
84+
clk.advance(10 * time.Second)
85+
lc.ring.Write([]byte("upload completed\n"))
86+
87+
var throttled RunStatusInfo
88+
em.MaybeAttach(&throttled)
89+
if throttled.LogTailGzipBase64 != "" {
90+
t.Fatal("MaybeAttach within the window should skip")
91+
}
92+
93+
var forced RunStatusInfo
94+
em.ForceAttach(&forced)
95+
if forced.LogTailGzipBase64 == "" {
96+
t.Fatal("ForceAttach must attach even within the throttle window")
97+
}
98+
if decoded := decodeTail(t, forced.LogTailGzipBase64); !strings.Contains(decoded, "upload completed") {
99+
t.Fatalf("ForceAttach must reflect the latest buffer; got %q", decoded)
100+
}
101+
102+
// ForceAttach updated lastSent, so a follow-on MaybeAttach still throttles.
103+
clk.advance(10 * time.Second)
104+
var after RunStatusInfo
105+
em.MaybeAttach(&after)
106+
if after.LogTailGzipBase64 != "" {
107+
t.Fatal("MaybeAttach should still be throttled after ForceAttach")
108+
}
109+
}
110+
66111
func TestLogTailEmitter_NilSafe(t *testing.T) {
67-
// Nil receiver, nil capture, and nil info should all be no-ops, not panics.
112+
// Nil receiver, nil capture, and nil info should all be no-ops, not panics —
113+
// for both MaybeAttach and ForceAttach.
68114
var em *logTailEmitter
69115
em.MaybeAttach(&RunStatusInfo{}) // nil receiver
116+
em.ForceAttach(&RunStatusInfo{}) // nil receiver
70117

71118
em2 := newLogTailEmitter(nil, 2*time.Minute)
72119
em2.MaybeAttach(&RunStatusInfo{}) // nil capture
120+
em2.ForceAttach(&RunStatusInfo{}) // nil capture
73121

74122
em3 := newLogTailEmitter(&LogCapture{ring: newRingBuffer(1024)}, 2*time.Minute)
75123
em3.MaybeAttach(nil) // nil info
124+
em3.ForceAttach(nil) // nil info
76125
}
77126

78127
func TestRingBuffer_TailRespectsWraparound(t *testing.T) {

internal/telemetry/logcapture.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,17 @@ func (lc *LogCapture) Finalize() string {
175175
return base64.StdEncoding.EncodeToString(lc.ringBytesLocked())
176176
}
177177

178+
// SnapshotBase64 returns the base64-encoded buffer contents WITHOUT stopping
179+
// capture, so a caller can embed the session-so-far in the telemetry payload
180+
// while the capture keeps recording (e.g. through the upload that follows).
181+
// The real teardown — closing the pipe and restoring os.Stderr — stays in
182+
// Finalize, which the caller still defers. Safe to call during active capture.
183+
func (lc *LogCapture) SnapshotBase64() string {
184+
lc.mu.Lock()
185+
defer lc.mu.Unlock()
186+
return base64.StdEncoding.EncodeToString(lc.ringBytesLocked())
187+
}
188+
178189
// Tail returns the last n captured bytes as a fresh slice. Safe to call
179190
// concurrently with active capture; returns nil if the buffer is empty
180191
// or n ≤ 0. Used by heartbeat posts to ship the most recent diagnostic

internal/telemetry/phase_deadline.go

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package telemetry
22

33
import (
44
"context"
5+
"os"
6+
"strings"
57
"time"
68

79
"github.com/step-security/dev-machine-guard/internal/progress"
@@ -13,13 +15,15 @@ import (
1315
// production heartbeat data — they exist to bound the pathological tail,
1416
// not to clip normal scans.
1517
//
16-
// Order of overrides at a single phase site:
17-
// 1. STEPSEC_PHASE_BUDGET_<NAME> env var (Go duration, e.g. "10m") — if set
18-
// 2. this map's entry
19-
// 3. defaultPhaseBudget (5m) when neither is set
18+
// Order of overrides at a single phase site (resolved by resolvePhaseBudget):
19+
// 1. STEPSEC_PHASE_BUDGET_<NAME> env var — <NAME> is the phase name
20+
// upper-cased (e.g. node_scan → STEPSEC_PHASE_BUDGET_NODE_SCAN=20m).
21+
// "0" or "off" disables the deadline; an unparseable value is ignored.
22+
// 2. this map's entry (an explicit 0 here also disables the deadline)
23+
// 3. defaultPhaseBudget (5m) when the phase has no entry above
2024
//
21-
// A budget of 0 disables the per-phase deadline for that phase (the parent
22-
// scan deadline still applies).
25+
// "Disabled" means no per-phase deadline; the parent scan deadline
26+
// (STEPSEC_MAX_SCAN_DURATION) and the whole-process watchdog still apply.
2327
var phaseBudgets = map[string]time.Duration{
2428
"device_info": 30 * time.Second,
2529
"ide_scan": 2 * time.Minute,
@@ -35,18 +39,48 @@ var phaseBudgets = map[string]time.Duration{
3539

3640
const defaultPhaseBudget = 5 * time.Minute
3741

42+
// resolvePhaseBudget computes a phase's effective budget, honoring (in order)
43+
// the STEPSEC_PHASE_BUDGET_<NAME> env override, the phaseBudgets map entry,
44+
// then defaultPhaseBudget. The bool is false when the phase has NO deadline:
45+
// either source can disable it ("0"/"off" in the env var, or an explicit 0 in
46+
// the map). A missing map entry — distinct from an entry of 0 — falls back to
47+
// defaultPhaseBudget. An unparseable/empty env value is ignored so a typo in
48+
// an unattended scheduler context can't strip every phase's deadline.
49+
func resolvePhaseBudget(name string) (time.Duration, bool) {
50+
if v := os.Getenv("STEPSEC_PHASE_BUDGET_" + strings.ToUpper(name)); v != "" {
51+
switch {
52+
case v == "0" || v == "off":
53+
return 0, false
54+
default:
55+
if d, err := time.ParseDuration(v); err == nil && d > 0 {
56+
return d, true
57+
}
58+
// Unparseable or non-positive: ignore and fall through.
59+
}
60+
}
61+
if budget, ok := phaseBudgets[name]; ok {
62+
if budget <= 0 {
63+
return 0, false // explicit 0 in the map disables the deadline
64+
}
65+
return budget, true
66+
}
67+
return defaultPhaseBudget, true
68+
}
69+
3870
// startPhase opens a new phase and returns a derived context that carries
3971
// the phase's budget as its deadline. Callers must invoke endPhase (or
4072
// otherwise call the returned cancel func) before opening the next phase.
73+
// When the phase budget is disabled (see resolvePhaseBudget) the returned
74+
// context carries no per-phase deadline — only the parent scan deadline.
4175
//
4276
// The caller continues to own postPhase() — endPhase only handles the
4377
// tracker.Finish + cancel + deadline-overrun log line so the per-phase
4478
// edit at each site stays small.
4579
func startPhase(parent context.Context, tracker *PhaseTracker, name string) (context.Context, context.CancelFunc) {
4680
tracker.Start(name)
47-
budget := phaseBudgets[name]
48-
if budget == 0 {
49-
budget = defaultPhaseBudget
81+
budget, hasDeadline := resolvePhaseBudget(name)
82+
if !hasDeadline {
83+
return context.WithCancel(parent)
5084
}
5185
return context.WithTimeout(parent, budget)
5286
}
@@ -62,10 +96,9 @@ func startPhase(parent context.Context, tracker *PhaseTracker, name string) (con
6296
func endPhase(phaseCtx context.Context, cancel context.CancelFunc,
6397
tracker *PhaseTracker, log *progress.Logger, name string) {
6498
if phaseCtx.Err() == context.DeadlineExceeded {
65-
budget := phaseBudgets[name]
66-
if budget == 0 {
67-
budget = defaultPhaseBudget
68-
}
99+
// Only reachable when the phase had a deadline, so hasDeadline is true
100+
// and budget is the real value used to set the timeout.
101+
budget, _ := resolvePhaseBudget(name)
69102
log.Warn("phase %s exceeded budget %s — continuing with partial results", name, budget)
70103
}
71104
cancel()
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
package telemetry
2+
3+
import (
4+
"context"
5+
"testing"
6+
"time"
7+
)
8+
9+
// resolvePhaseBudget honors STEPSEC_PHASE_BUDGET_<NAME> > map entry > default,
10+
// and treats an env "0"/"off" or an explicit map 0 as "no deadline". Every
11+
// subtest sets the env var explicitly (to "" when it shouldn't apply) so an
12+
// inherited host/CI value can't make the result nondeterministic.
13+
func TestResolvePhaseBudget(t *testing.T) {
14+
cases := []struct {
15+
name string
16+
phase string
17+
env string // value for STEPSEC_PHASE_BUDGET_<PHASE>
18+
wantBudget time.Duration
19+
wantEnabled bool
20+
}{
21+
{"env override wins over map", "node_scan", "20m", 20 * time.Minute, true},
22+
{"env off disables", "node_scan", "off", 0, false},
23+
{"env 0 disables", "node_scan", "0", 0, false},
24+
{"env junk ignored, falls to map", "node_scan", "junk", 15 * time.Minute, true},
25+
{"env negative ignored, falls to map", "node_scan", "-5m", 15 * time.Minute, true},
26+
{"map entry used when env empty", "ide_scan", "", 2 * time.Minute, true},
27+
{"unlisted phase falls to default", "totally_made_up_phase", "", defaultPhaseBudget, true},
28+
}
29+
for _, tc := range cases {
30+
t.Run(tc.name, func(t *testing.T) {
31+
// Hermetic: set the exact env key this phase reads (to "" when the
32+
// case shouldn't have an override).
33+
t.Setenv("STEPSEC_PHASE_BUDGET_"+upper(tc.phase), tc.env)
34+
gotBudget, gotEnabled := resolvePhaseBudget(tc.phase)
35+
if gotEnabled != tc.wantEnabled {
36+
t.Fatalf("enabled = %v, want %v", gotEnabled, tc.wantEnabled)
37+
}
38+
if tc.wantEnabled && gotBudget != tc.wantBudget {
39+
t.Errorf("budget = %v, want %v", gotBudget, tc.wantBudget)
40+
}
41+
})
42+
}
43+
}
44+
45+
// An explicit 0 in the phaseBudgets map disables the deadline — distinct from a
46+
// missing entry, which falls back to the default. (Copilot finding: the old
47+
// `if budget == 0 { budget = default }` conflated the two.)
48+
func TestResolvePhaseBudget_ExplicitZeroInMapDisables(t *testing.T) {
49+
const phase = "zzz_explicit_zero_phase"
50+
phaseBudgets[phase] = 0
51+
t.Cleanup(func() { delete(phaseBudgets, phase) })
52+
t.Setenv("STEPSEC_PHASE_BUDGET_"+upper(phase), "") // no env override
53+
54+
d, enabled := resolvePhaseBudget(phase)
55+
if enabled || d != 0 {
56+
t.Errorf("explicit map 0 must disable the deadline: got (%v, enabled=%v)", d, enabled)
57+
}
58+
}
59+
60+
// startPhase must produce a context with NO deadline when the budget is
61+
// disabled (only the parent scan deadline applies), and a deadline bounded by
62+
// the budget otherwise.
63+
func TestStartPhase_DeadlineWiring(t *testing.T) {
64+
tracker := NewPhaseTracker()
65+
66+
t.Run("disabled budget → no phase deadline", func(t *testing.T) {
67+
t.Setenv("STEPSEC_PHASE_BUDGET_NODE_SCAN", "off")
68+
ctx, cancel := startPhase(context.Background(), tracker, "node_scan")
69+
defer cancel()
70+
if _, ok := ctx.Deadline(); ok {
71+
t.Error("disabled phase budget must not set a context deadline")
72+
}
73+
})
74+
75+
t.Run("enabled budget → bounded deadline", func(t *testing.T) {
76+
t.Setenv("STEPSEC_PHASE_BUDGET_NODE_SCAN", "20m")
77+
ctx, cancel := startPhase(context.Background(), tracker, "node_scan")
78+
defer cancel()
79+
dl, ok := ctx.Deadline()
80+
if !ok {
81+
t.Fatal("enabled phase budget must set a context deadline")
82+
}
83+
if remaining := time.Until(dl); remaining <= 0 || remaining > 20*time.Minute {
84+
t.Errorf("deadline %v out of expected ~20m window", remaining)
85+
}
86+
})
87+
}
88+
89+
// upper uppercases an ASCII phase name for the env-var key, mirroring
90+
// strings.ToUpper without importing it into the test for one call.
91+
func upper(s string) string {
92+
b := []byte(s)
93+
for i := range b {
94+
if b[i] >= 'a' && b[i] <= 'z' {
95+
b[i] -= 'a' - 'A'
96+
}
97+
}
98+
return string(b)
99+
}

0 commit comments

Comments
 (0)