Skip to content

Commit 859ef3b

Browse files
Sayan-claude
andcommitted
add VM-internal failure telemetry (OOM + service crashes)
Two new telemetry event types surface VM-level failures alongside the existing browser events: - `system_oom_kill` — emitted by an in-process /dev/kmsg reader in the api server whenever the kernel OOM-killer terminates a process, including unsupervised Chrome renderer subprocesses. - `service_crashed` — emitted by a tiny supervisord eventlistener binary that POSTs to the local /telemetry/events endpoint whenever a supervised service unexpectedly exits (PROCESS_STATE_EXITED with expected=0, or PROCESS_STATE_FATAL). Both events flow through the existing EventStream and inherit the SSE and S2 sinks for free. Categorized as `system` so they're always-on. The shim is shipped in both the chromium-headful and chromium-headless images and registered as `[eventlistener:supervisord-shim]`. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 805213e commit 859ef3b

13 files changed

Lines changed: 1382 additions & 322 deletions

File tree

images/chromium-headful/Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ RUN --mount=type=cache,target=/root/.cache/go-build,id=$CACHEIDPREFIX-go-build \
3333
GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} \
3434
go build -ldflags="-s -w" -o /out/wrapper ./cmd/wrapper
3535

36+
# Build supervisord eventlistener shim
37+
RUN --mount=type=cache,target=/root/.cache/go-build,id=$CACHEIDPREFIX-go-build \
38+
--mount=type=cache,target=/go/pkg/mod,id=$CACHEIDPREFIX-go-pkg-mod \
39+
GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} \
40+
go build -ldflags="-s -w" -o /out/kernel-images-supervisord-shim ./cmd/supervisord-shim
41+
3642
# webrtc client
3743
FROM node:22-bullseye-slim AS client
3844
WORKDIR /src
@@ -378,6 +384,7 @@ RUN chmod +x /usr/local/bin/init-envoy.sh
378384
# copy the kernel-images API binary built in the builder stage
379385
COPY --from=server-builder /out/kernel-images-api /usr/local/bin/kernel-images-api
380386
COPY --from=server-builder /out/chromium-launcher /usr/local/bin/chromium-launcher
387+
COPY --from=server-builder /out/kernel-images-supervisord-shim /usr/local/bin/kernel-images-supervisord-shim
381388
COPY --from=server-builder /out/wrapper /wrapper
382389

383390
# Copy and compile the Playwright daemon
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[eventlistener:supervisord-shim]
2+
command=/usr/local/bin/kernel-images-supervisord-shim
3+
events=PROCESS_STATE_EXITED,PROCESS_STATE_FATAL
4+
autostart=true
5+
autorestart=true
6+
stderr_logfile=/var/log/supervisord/supervisord-shim
7+
; stdout is the eventlistener protocol channel; do not redirect.

images/chromium-headless/image/Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ RUN --mount=type=cache,target=/root/.cache/go-build,id=$CACHEIDPREFIX-go-build \
3434
GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} \
3535
go build -ldflags="-s -w" -o /out/wrapper ./cmd/wrapper
3636

37+
# Build supervisord eventlistener shim
38+
RUN --mount=type=cache,target=/root/.cache/go-build,id=$CACHEIDPREFIX-go-build \
39+
--mount=type=cache,target=/go/pkg/mod,id=$CACHEIDPREFIX-go-pkg-mod \
40+
GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} \
41+
go build -ldflags="-s -w" -o /out/kernel-images-supervisord-shim ./cmd/supervisord-shim
42+
3743
FROM docker.io/ubuntu:22.04 AS ffmpeg-downloader
3844

3945
# Allow cross-compilation when building with BuildKit platforms
@@ -256,6 +262,7 @@ RUN chmod +x /usr/local/bin/bake-certs.sh && /usr/local/bin/bake-certs.sh && rm
256262
# Copy the kernel-images API binary built in the builder stage
257263
COPY --from=server-builder /out/kernel-images-api /usr/local/bin/kernel-images-api
258264
COPY --from=server-builder /out/chromium-launcher /usr/local/bin/chromium-launcher
265+
COPY --from=server-builder /out/kernel-images-supervisord-shim /usr/local/bin/kernel-images-supervisord-shim
259266

260267
# Copy and compile the Playwright daemon
261268
COPY server/runtime/playwright-daemon.ts /tmp/playwright-daemon.ts
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[eventlistener:supervisord-shim]
2+
command=/usr/local/bin/kernel-images-supervisord-shim
3+
events=PROCESS_STATE_EXITED,PROCESS_STATE_FATAL
4+
autostart=true
5+
autorestart=true
6+
stderr_logfile=/var/log/supervisord/supervisord-shim
7+
; stdout is the eventlistener protocol channel; do not redirect.

server/cmd/api/main.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
oapi "github.com/kernel/kernel-images/server/lib/oapi"
3131
"github.com/kernel/kernel-images/server/lib/recorder"
3232
"github.com/kernel/kernel-images/server/lib/scaletozero"
33+
"github.com/kernel/kernel-images/server/lib/sysmon"
3334
"github.com/kernel/kernel-images/server/lib/telemetry"
3435
)
3536

@@ -103,6 +104,11 @@ func main() {
103104
}
104105
telemetrySession := telemetry.NewTelemetrySession(eventStream)
105106

107+
// VM-internal failure telemetry (OOM kills via /dev/kmsg).
108+
// service_crashed events arrive via POST /telemetry/events from the
109+
// supervisord-shim child process, not through this monitor.
110+
sysmon.New(eventStream, slogger).Start(ctx)
111+
106112
// Optional S2 storage sink.
107113
var s2Writer *events.S2StorageWriter
108114
if config.S2Basin != "" && config.S2AccessToken != "" && config.S2Stream != "" {
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
// Command supervisord-shim is a tiny supervisord eventlistener that
2+
// translates PROCESS_STATE_EXITED (expected=0) and PROCESS_STATE_FATAL events
3+
// into BrowserServiceCrashedEvent payloads and POSTs them to the local
4+
// kernel-images-api telemetry endpoint.
5+
//
6+
// All schema-mapping and event publishing logic lives here; lib/sysmon does
7+
// not handle supervisord events. Keeping the shim as the sole owner of the
8+
// supervisord protocol means lib/sysmon stays single-purpose (kmsg only).
9+
//
10+
// Wire protocol per supervisord docs:
11+
//
12+
// stdout: "READY\n"
13+
// stdin: header line ("ver:3.0 ... eventname:PROCESS_STATE_EXITED len:54\n")
14+
// stdin: payload of `len` bytes (no trailing newline)
15+
// stdout: "RESULT 2\nOK\n" (always; ACK regardless of downstream success)
16+
//
17+
// We always ACK with OK so supervisord doesn't quarantine us when the
18+
// downstream HTTP target is briefly unavailable. The events are
19+
// best-effort; if the API is down, we drop and log.
20+
//
21+
// All logging goes to stderr — stdout is the supervisord protocol channel.
22+
package main
23+
24+
import (
25+
"bufio"
26+
"bytes"
27+
"context"
28+
"encoding/json"
29+
"fmt"
30+
"io"
31+
"log"
32+
"net/http"
33+
"os"
34+
"strconv"
35+
"strings"
36+
"time"
37+
)
38+
39+
const (
40+
defaultTelemetryURL = "http://127.0.0.1:10001/telemetry/events"
41+
httpTimeout = 2 * time.Second
42+
)
43+
44+
func main() {
45+
log.SetOutput(os.Stderr)
46+
log.SetFlags(log.LstdFlags | log.Lmicroseconds)
47+
48+
telemetryURL := os.Getenv("KERNEL_IMAGES_TELEMETRY_URL")
49+
if telemetryURL == "" {
50+
telemetryURL = defaultTelemetryURL
51+
}
52+
53+
pub := &publisher{
54+
url: telemetryURL,
55+
client: &http.Client{Timeout: httpTimeout},
56+
}
57+
58+
in := bufio.NewReader(os.Stdin)
59+
out := bufio.NewWriter(os.Stdout)
60+
61+
for {
62+
if _, err := out.WriteString("READY\n"); err != nil {
63+
log.Fatalf("write READY: %v", err)
64+
}
65+
if err := out.Flush(); err != nil {
66+
log.Fatalf("flush READY: %v", err)
67+
}
68+
69+
header, payload, err := readEvent(in)
70+
if err != nil {
71+
if err == io.EOF {
72+
return
73+
}
74+
log.Fatalf("read event: %v", err)
75+
}
76+
77+
// Try to publish but always ACK supervisord.
78+
if ev, ok := mapEvent(header, payload); ok {
79+
if perr := pub.publish(context.Background(), ev); perr != nil {
80+
log.Printf("publish telemetry event: %v", perr)
81+
}
82+
}
83+
84+
if _, err := out.WriteString("RESULT 2\nOK\n"); err != nil {
85+
log.Fatalf("write RESULT: %v", err)
86+
}
87+
if err := out.Flush(); err != nil {
88+
log.Fatalf("flush RESULT: %v", err)
89+
}
90+
}
91+
}
92+
93+
// readEvent reads one supervisord event: a header line followed by a payload
94+
// of declared length.
95+
func readEvent(in *bufio.Reader) (map[string]string, map[string]string, error) {
96+
headerLine, err := in.ReadString('\n')
97+
if err != nil {
98+
return nil, nil, err
99+
}
100+
header := parseFields(strings.TrimRight(headerLine, "\n"))
101+
102+
lenStr, ok := header["len"]
103+
if !ok {
104+
return nil, nil, fmt.Errorf("missing len in header: %q", headerLine)
105+
}
106+
n, err := strconv.Atoi(lenStr)
107+
if err != nil {
108+
return nil, nil, fmt.Errorf("invalid len %q: %w", lenStr, err)
109+
}
110+
111+
buf := make([]byte, n)
112+
if _, err := io.ReadFull(in, buf); err != nil {
113+
return nil, nil, fmt.Errorf("read payload: %w", err)
114+
}
115+
payload := parseFields(string(buf))
116+
return header, payload, nil
117+
}
118+
119+
// parseFields parses supervisord's "key:value key:value" tokenization.
120+
// Values are split on the first colon; supervisord does not escape colons in
121+
// values, but in practice the values we care about (process names, states,
122+
// ints) never contain them.
123+
func parseFields(s string) map[string]string {
124+
out := make(map[string]string)
125+
for _, tok := range strings.Fields(s) {
126+
i := strings.IndexByte(tok, ':')
127+
if i < 0 {
128+
continue
129+
}
130+
out[tok[:i]] = tok[i+1:]
131+
}
132+
return out
133+
}
134+
135+
// telemetryEventBody mirrors oapi.TelemetryEvent but is duplicated here so the
136+
// shim does not pull in the entire server module — keeps the binary tiny.
137+
type telemetryEventBody struct {
138+
Type string `json:"type"`
139+
Category string `json:"category"`
140+
Source telemetryEventSource `json:"source"`
141+
Data serviceCrashedPayload `json:"data"`
142+
}
143+
144+
type telemetryEventSource struct {
145+
Kind string `json:"kind"`
146+
Event string `json:"event"`
147+
}
148+
149+
type serviceCrashedPayload struct {
150+
ServiceName string `json:"service_name"`
151+
FromState string `json:"from_state"`
152+
Pid *int `json:"pid,omitempty"`
153+
}
154+
155+
// mapEvent decides whether to publish and constructs the event payload.
156+
// Returns ok=false for events we deliberately skip (intentional stops,
157+
// non-crash event types).
158+
func mapEvent(header, payload map[string]string) (telemetryEventBody, bool) {
159+
eventName := header["eventname"]
160+
switch eventName {
161+
case "PROCESS_STATE_EXITED":
162+
// expected=0 means the exit was not in `exitcodes` — i.e. a crash.
163+
// expected=1 means clean shutdown (supervisorctl stop, or a configured
164+
// exitcode). Skip the latter.
165+
if payload["expected"] != "0" {
166+
return telemetryEventBody{}, false
167+
}
168+
case "PROCESS_STATE_FATAL":
169+
// FATAL: supervisord exhausted startretries. Always a crash.
170+
default:
171+
return telemetryEventBody{}, false
172+
}
173+
174+
name := payload["processname"]
175+
if name == "" {
176+
return telemetryEventBody{}, false
177+
}
178+
fromState := payload["from_state"]
179+
if fromState == "" {
180+
return telemetryEventBody{}, false
181+
}
182+
183+
body := telemetryEventBody{
184+
Type: "service_crashed",
185+
Category: "system",
186+
Source: telemetryEventSource{
187+
Kind: "local_process",
188+
Event: "supervisord.process_" + strings.ToLower(strings.TrimPrefix(eventName, "PROCESS_STATE_")),
189+
},
190+
Data: serviceCrashedPayload{
191+
ServiceName: name,
192+
FromState: fromState,
193+
},
194+
}
195+
if pidStr := payload["pid"]; pidStr != "" {
196+
if pid, err := strconv.Atoi(pidStr); err == nil {
197+
body.Data.Pid = &pid
198+
}
199+
}
200+
return body, true
201+
}
202+
203+
type publisher struct {
204+
url string
205+
client *http.Client
206+
}
207+
208+
func (p *publisher) publish(ctx context.Context, body telemetryEventBody) error {
209+
buf, err := json.Marshal(body)
210+
if err != nil {
211+
return fmt.Errorf("marshal: %w", err)
212+
}
213+
req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.url, bytes.NewReader(buf))
214+
if err != nil {
215+
return fmt.Errorf("new request: %w", err)
216+
}
217+
req.Header.Set("Content-Type", "application/json")
218+
resp, err := p.client.Do(req)
219+
if err != nil {
220+
return err
221+
}
222+
defer resp.Body.Close()
223+
if resp.StatusCode >= 300 {
224+
b, _ := io.ReadAll(resp.Body)
225+
return fmt.Errorf("status %d: %s", resp.StatusCode, bytes.TrimSpace(b))
226+
}
227+
return nil
228+
}

0 commit comments

Comments
 (0)