Skip to content

Commit a4b4251

Browse files
authored
Merge pull request #39 from shubham-stepsecurity/sm/feat/add-support
feat(mdm): add telemetry run status reporting
2 parents 0fba903 + 70aecdd commit a4b4251

10 files changed

Lines changed: 506 additions & 28 deletions

File tree

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ module github.com/step-security/dev-machine-guard
33
go 1.24
44

55
require golang.org/x/sys v0.33.0
6+
7+
require github.com/google/uuid v1.6.0

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
1+
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
2+
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
13
golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
24
golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=

internal/model/model.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@ type ScanResult struct {
2020
PythonPkgManagers []PkgManager `json:"python_package_managers"`
2121
PythonPackages []PythonPackage `json:"python_packages"`
2222
PythonProjects []ProjectInfo `json:"python_projects"`
23-
SystemPkgManager *PkgManager `json:"system_package_manager,omitempty"`
24-
SystemPackages []SystemPackage `json:"system_packages"`
25-
SnapPkgManager *PkgManager `json:"snap_package_manager,omitempty"`
26-
SnapPackages []SystemPackage `json:"snap_packages"`
27-
FlatpakPkgManager *PkgManager `json:"flatpak_package_manager,omitempty"`
28-
FlatpakPackages []SystemPackage `json:"flatpak_packages"`
29-
Summary Summary `json:"summary"`
23+
SystemPkgManager *PkgManager `json:"system_package_manager,omitempty"`
24+
SystemPackages []SystemPackage `json:"system_packages"`
25+
SnapPkgManager *PkgManager `json:"snap_package_manager,omitempty"`
26+
SnapPackages []SystemPackage `json:"snap_packages"`
27+
FlatpakPkgManager *PkgManager `json:"flatpak_package_manager,omitempty"`
28+
FlatpakPackages []SystemPackage `json:"flatpak_packages"`
29+
Summary Summary `json:"summary"`
3030
}
3131

3232
type Device struct {
@@ -153,7 +153,7 @@ type PythonPackage struct {
153153
// Unlike BrewScanResult (which sends raw base64), this sends pre-parsed packages
154154
// since syspkg.go already handles the format-specific parsing edge cases.
155155
type SystemPackageScanResult struct {
156-
ScanType string `json:"scan_type"` // "rpm", "dpkg", "pacman", "apk", "snap", "flatpak"
156+
ScanType string `json:"scan_type"` // "rpm", "dpkg", "pacman", "apk", "snap", "flatpak"
157157
PackageManager *PkgManager `json:"package_manager,omitempty"`
158158
Packages []SystemPackage `json:"packages"`
159159
PackagesCount int `json:"packages_count"`

internal/telemetry/run_status.go

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
package telemetry
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"encoding/json"
7+
"fmt"
8+
"net/http"
9+
"runtime"
10+
"time"
11+
12+
"github.com/step-security/dev-machine-guard/internal/buildinfo"
13+
"github.com/step-security/dev-machine-guard/internal/config"
14+
"github.com/step-security/dev-machine-guard/internal/progress"
15+
)
16+
17+
const (
18+
runStatusStarted = "started"
19+
runStatusFailed = "failed"
20+
runStatusCancelled = "cancelled by user"
21+
runStatusMaxErrorChars = 2000
22+
runStatusHTTPTimeout = 3 * time.Second
23+
24+
// Retry counts per status. "started" is load-bearing for attempt
25+
// visibility — we retry harder so a single transient network blip
26+
// does not lose the signal that the run was attempted. "failed"
27+
// fires during shutdown, so one retry covers the common case.
28+
runStatusStartedAttempts = 3
29+
runStatusFailedAttempts = 2
30+
runStatusRetryBackoff = 500 * time.Millisecond
31+
)
32+
33+
// reportRunStatus POSTs a lifecycle transition to the backend with a small
34+
// retry budget. Never returns an error: running the scan is the priority.
35+
//
36+
// status must be "started" or "failed". Passing "succeeded" (or any other
37+
// value) is a defensive no-op — success is written by the backend worker
38+
// after it persists the uploaded telemetry.
39+
func reportRunStatus(ctx context.Context, log *progress.Logger,
40+
executionID, deviceID, status, errMsg string) {
41+
42+
if !config.IsEnterpriseMode() {
43+
return
44+
}
45+
if status != runStatusStarted && status != runStatusFailed {
46+
return
47+
}
48+
if executionID == "" {
49+
return
50+
}
51+
52+
payload := map[string]string{
53+
"execution_id": executionID,
54+
"device_id": deviceID,
55+
"status": status,
56+
"agent_version": buildinfo.Version,
57+
"platform": runtime.GOOS,
58+
}
59+
if status == runStatusFailed {
60+
if errMsg == "" {
61+
// Backend rejects a "failed" report with no error_message.
62+
errMsg = "unspecified failure"
63+
}
64+
if len(errMsg) > runStatusMaxErrorChars {
65+
errMsg = errMsg[:runStatusMaxErrorChars]
66+
}
67+
payload["error_message"] = errMsg
68+
}
69+
70+
body, err := json.Marshal(payload)
71+
if err != nil {
72+
log.Progress("run-status: marshal error: %v", err)
73+
return
74+
}
75+
76+
endpoint := fmt.Sprintf("%s/v1/%s/developer-mdm-agent/telemetry/run-status",
77+
config.APIEndpoint, config.CustomerID)
78+
79+
attempts := runStatusFailedAttempts
80+
if status == runStatusStarted {
81+
attempts = runStatusStartedAttempts
82+
}
83+
84+
for i := 1; i <= attempts; i++ {
85+
if i > 1 {
86+
// Fixed short backoff. Keeps the total time budget bounded so
87+
// retries don't visibly delay the scan start.
88+
select {
89+
case <-time.After(runStatusRetryBackoff):
90+
case <-ctx.Done():
91+
log.Progress("run-status: parent context done, abandoning retries")
92+
return
93+
}
94+
}
95+
if postRunStatusOnce(ctx, log, endpoint, body, status, i, attempts) {
96+
return
97+
}
98+
}
99+
}
100+
101+
// postRunStatusOnce performs a single HTTP attempt. Returns true on a 2xx
102+
// or 4xx (terminal — retrying a bad request will not help). Returns false
103+
// on transport errors or 5xx so the caller can retry.
104+
func postRunStatusOnce(ctx context.Context, log *progress.Logger,
105+
endpoint string, body []byte, status string, attempt, maxAttempts int) bool {
106+
107+
cctx, cancel := context.WithTimeout(ctx, runStatusHTTPTimeout)
108+
defer cancel()
109+
110+
req, err := http.NewRequestWithContext(cctx, http.MethodPost, endpoint, bytes.NewReader(body))
111+
if err != nil {
112+
log.Progress("run-status[%s %d/%d]: request error: %v", status, attempt, maxAttempts, err)
113+
return false
114+
}
115+
req.Header.Set("Content-Type", "application/json")
116+
req.Header.Set("Authorization", "Bearer "+config.APIKey)
117+
req.Header.Set("X-Agent-Version", buildinfo.Version)
118+
119+
client := &http.Client{Timeout: runStatusHTTPTimeout}
120+
resp, err := client.Do(req)
121+
if err != nil {
122+
log.Progress("run-status[%s %d/%d]: POST error: %v", status, attempt, maxAttempts, err)
123+
return false
124+
}
125+
defer func() { _ = resp.Body.Close() }()
126+
127+
if resp.StatusCode < 300 {
128+
return true
129+
}
130+
if resp.StatusCode >= 400 && resp.StatusCode < 500 {
131+
log.Progress("run-status[%s]: HTTP %d (terminal, no retry)", status, resp.StatusCode)
132+
return true
133+
}
134+
log.Progress("run-status[%s %d/%d]: HTTP %d from backend", status, attempt, maxAttempts, resp.StatusCode)
135+
return false
136+
}
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
package telemetry
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"io"
7+
"net/http"
8+
"net/http/httptest"
9+
"sync/atomic"
10+
"testing"
11+
"time"
12+
13+
"github.com/step-security/dev-machine-guard/internal/config"
14+
"github.com/step-security/dev-machine-guard/internal/progress"
15+
)
16+
17+
// withEnterpriseConfig temporarily patches config to look enterprise-enabled
18+
// and points APIEndpoint at the given test server. Restores on return.
19+
func withEnterpriseConfig(t *testing.T, endpoint string) func() {
20+
t.Helper()
21+
savedKey, savedCustomer, savedEndpoint := config.APIKey, config.CustomerID, config.APIEndpoint
22+
config.APIKey = "sk-test-123"
23+
config.CustomerID = "test-customer"
24+
config.APIEndpoint = endpoint
25+
return func() {
26+
config.APIKey, config.CustomerID, config.APIEndpoint = savedKey, savedCustomer, savedEndpoint
27+
}
28+
}
29+
30+
func TestReportRunStatus_StartedRetriesOn5xx(t *testing.T) {
31+
var calls int32
32+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
33+
atomic.AddInt32(&calls, 1)
34+
w.WriteHeader(http.StatusInternalServerError)
35+
}))
36+
defer srv.Close()
37+
defer withEnterpriseConfig(t, srv.URL)()
38+
39+
log := progress.NewLogger(progress.LevelInfo)
40+
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
41+
42+
if got := atomic.LoadInt32(&calls); got != int32(runStatusStartedAttempts) {
43+
t.Fatalf("expected %d retries on 5xx, got %d", runStatusStartedAttempts, got)
44+
}
45+
}
46+
47+
func TestReportRunStatus_StartedStopsAfter2xx(t *testing.T) {
48+
var calls int32
49+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
50+
atomic.AddInt32(&calls, 1)
51+
w.WriteHeader(http.StatusOK)
52+
}))
53+
defer srv.Close()
54+
defer withEnterpriseConfig(t, srv.URL)()
55+
56+
log := progress.NewLogger(progress.LevelInfo)
57+
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
58+
59+
if got := atomic.LoadInt32(&calls); got != 1 {
60+
t.Fatalf("expected exactly 1 call on 2xx, got %d", got)
61+
}
62+
}
63+
64+
func TestReportRunStatus_DoesNotRetryOn4xx(t *testing.T) {
65+
// 4xx is terminal: validation or auth rejection; retrying cannot help.
66+
var calls int32
67+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
68+
atomic.AddInt32(&calls, 1)
69+
w.WriteHeader(http.StatusBadRequest)
70+
}))
71+
defer srv.Close()
72+
defer withEnterpriseConfig(t, srv.URL)()
73+
74+
log := progress.NewLogger(progress.LevelInfo)
75+
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
76+
77+
if got := atomic.LoadInt32(&calls); got != 1 {
78+
t.Fatalf("expected 1 call for 4xx (no retry), got %d", got)
79+
}
80+
}
81+
82+
func TestReportRunStatus_FailedRetriesOn5xx(t *testing.T) {
83+
var calls int32
84+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
85+
atomic.AddInt32(&calls, 1)
86+
w.WriteHeader(http.StatusInternalServerError)
87+
}))
88+
defer srv.Close()
89+
defer withEnterpriseConfig(t, srv.URL)()
90+
91+
log := progress.NewLogger(progress.LevelInfo)
92+
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusFailed, "boom")
93+
94+
if got := atomic.LoadInt32(&calls); got != int32(runStatusFailedAttempts) {
95+
t.Fatalf("expected %d retries on 5xx for failed, got %d", runStatusFailedAttempts, got)
96+
}
97+
}
98+
99+
func TestReportRunStatus_FailedIncludesErrorMessage(t *testing.T) {
100+
var gotBody map[string]string
101+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
102+
body, _ := io.ReadAll(r.Body)
103+
_ = json.Unmarshal(body, &gotBody)
104+
w.WriteHeader(http.StatusOK)
105+
}))
106+
defer srv.Close()
107+
defer withEnterpriseConfig(t, srv.URL)()
108+
109+
log := progress.NewLogger(progress.LevelInfo)
110+
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusFailed, "context deadline exceeded")
111+
112+
if gotBody["status"] != runStatusFailed {
113+
t.Errorf("status = %q, want %q", gotBody["status"], runStatusFailed)
114+
}
115+
if gotBody["error_message"] != "context deadline exceeded" {
116+
t.Errorf("error_message = %q, want %q", gotBody["error_message"], "context deadline exceeded")
117+
}
118+
if gotBody["execution_id"] == "" {
119+
t.Errorf("execution_id missing from body: %+v", gotBody)
120+
}
121+
}
122+
123+
func TestReportRunStatus_SkipsSucceededAndUnknownStatus(t *testing.T) {
124+
var calls int32
125+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
126+
atomic.AddInt32(&calls, 1)
127+
}))
128+
defer srv.Close()
129+
defer withEnterpriseConfig(t, srv.URL)()
130+
131+
log := progress.NewLogger(progress.LevelInfo)
132+
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", "succeeded", "")
133+
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", "cancelled", "")
134+
135+
if got := atomic.LoadInt32(&calls); got != 0 {
136+
t.Fatalf("expected zero HTTP calls for non-agent statuses, got %d", got)
137+
}
138+
}
139+
140+
func TestReportRunStatus_SkipsWhenNotEnterprise(t *testing.T) {
141+
var calls int32
142+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
143+
atomic.AddInt32(&calls, 1)
144+
}))
145+
defer srv.Close()
146+
147+
// Restore config after test. Default config.APIKey is the placeholder, which
148+
// makes IsEnterpriseMode return false.
149+
savedKey := config.APIKey
150+
config.APIKey = "{{API_KEY}}"
151+
defer func() { config.APIKey = savedKey }()
152+
153+
log := progress.NewLogger(progress.LevelInfo)
154+
reportRunStatus(context.Background(), log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
155+
156+
if got := atomic.LoadInt32(&calls); got != 0 {
157+
t.Fatalf("expected zero calls when not in enterprise mode, got %d", got)
158+
}
159+
}
160+
161+
func TestReportRunStatus_SkipsEmptyExecutionID(t *testing.T) {
162+
var calls int32
163+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
164+
atomic.AddInt32(&calls, 1)
165+
}))
166+
defer srv.Close()
167+
defer withEnterpriseConfig(t, srv.URL)()
168+
169+
log := progress.NewLogger(progress.LevelInfo)
170+
reportRunStatus(context.Background(), log, "", "dev-1", runStatusStarted, "")
171+
172+
if got := atomic.LoadInt32(&calls); got != 0 {
173+
t.Fatalf("expected zero calls when execution_id is empty, got %d", got)
174+
}
175+
}
176+
177+
func TestReportRunStatus_AbortsRetriesOnCtxCancel(t *testing.T) {
178+
// Server hangs — every attempt will hit the per-attempt timeout, but we
179+
// cancel the parent context mid-run to confirm retries stop.
180+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
181+
time.Sleep(runStatusHTTPTimeout + 2*time.Second)
182+
}))
183+
defer srv.Close()
184+
defer withEnterpriseConfig(t, srv.URL)()
185+
186+
ctx, cancel := context.WithCancel(context.Background())
187+
// Cancel after the first attempt completes (~runStatusHTTPTimeout) so we
188+
// land in the backoff select where ctx.Done wins.
189+
time.AfterFunc(runStatusHTTPTimeout+100*time.Millisecond, cancel)
190+
191+
log := progress.NewLogger(progress.LevelInfo)
192+
done := make(chan struct{})
193+
start := time.Now()
194+
go func() {
195+
reportRunStatus(ctx, log, "11111111-2222-4333-8444-555555555555", "dev-1", runStatusStarted, "")
196+
close(done)
197+
}()
198+
199+
select {
200+
case <-done:
201+
elapsed := time.Since(start)
202+
// Should be close to the first-attempt timeout, well under the full
203+
// retry budget (~runStatusHTTPTimeout * runStatusStartedAttempts).
204+
budget := runStatusHTTPTimeout*2 + 500*time.Millisecond
205+
if elapsed > budget {
206+
t.Fatalf("reportRunStatus took %s, expected ≤ %s once ctx is cancelled", elapsed, budget)
207+
}
208+
case <-time.After(runStatusHTTPTimeout*int64Attempts() + 5*time.Second):
209+
t.Fatal("reportRunStatus did not return")
210+
}
211+
}
212+
213+
func int64Attempts() time.Duration {
214+
return time.Duration(runStatusStartedAttempts)
215+
}

0 commit comments

Comments
 (0)