Skip to content

Commit fe04dc1

Browse files
committed
otel: add tracing to VM startup path
Add OpenTelemetry instrumentation to trace VM boot latency: - Add libkrun.VMStart and libkrun.WaitForTTRPC spans to instance.go Start() - Add otelttrpc.UnaryClientInterceptor to the TTRPC client - Add build-tagged (//go:build tracing) integration test TestTraceVMBoot - Add Makefile targets: jaeger-start, jaeger-stop, test-tracing, jaeger-open, jaeger-gantt - Add hack/trace-to-mermaid.py to render Jaeger traces as Mermaid gantt charts - Add docs/tracing.md with usage instructions Signed-off-by: David Scott <dave@recoil.org>
1 parent ffbf23e commit fe04dc1

File tree

7 files changed

+384
-5
lines changed

7 files changed

+384
-5
lines changed

Makefile

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,3 +188,32 @@ test-unit:
188188
test-integration: _output/integration.test
189189
@echo "$(WHALE) $@"
190190
gotestsum -f testname --raw-command ./integration/test.sh
191+
192+
jaeger-start:
193+
@if docker inspect nerdbox-jaeger >/dev/null 2>&1; then \
194+
docker start nerdbox-jaeger; \
195+
else \
196+
docker run -d --name nerdbox-jaeger \
197+
-p 16686:16686 -p 4318:4318 \
198+
jaegertracing/all-in-one:latest; \
199+
fi
200+
@echo "Jaeger UI: http://localhost:16686"
201+
202+
jaeger-stop:
203+
docker stop nerdbox-jaeger 2>/dev/null || true
204+
205+
test-tracing: jaeger-start
206+
$(GO) test -c -o _output/integration.test -tags tracing ./integration
207+
ifeq ($(OS),Darwin)
208+
codesign --entitlements cmd/containerd-shim-nerdbox-v1/containerd-shim-nerdbox-v1.entitlements --force -s - _output/integration.test
209+
endif
210+
./_output/integration.test -test.v -test.count=1 -test.run TestTraceVMBoot -test.timeout 2m
211+
212+
jaeger-open:
213+
@TRACE_ID=$$(curl -sf "http://localhost:16686/api/traces?service=nerdbox&limit=1" | \
214+
python3 -c "import sys,json; print(json.load(sys.stdin)['data'][0]['traceID'])") && \
215+
echo "Opening trace $$TRACE_ID" && \
216+
open "http://localhost:16686/trace/$$TRACE_ID"
217+
218+
jaeger-gantt:
219+
@python3 hack/trace-to-mermaid.py nerdbox

docs/tracing.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# OTel Tracing
2+
3+
Nerdbox supports OpenTelemetry tracing.
4+
5+
## Tracing with Jaeger
6+
7+
This requires docker to run the Jaeger service.
8+
9+
```bash
10+
make test-tracing
11+
12+
# Open the Jaeger UI in the browser
13+
make jaeger-open
14+
15+
# Make a Mermaid gantt diagram
16+
make jaeger-gantt
17+
18+
# Stop Jaeger
19+
make jaeger-stop
20+
```

go.mod

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ require (
2626
github.com/stretchr/testify v1.11.1
2727
github.com/vishvananda/netlink v1.3.1
2828
github.com/vishvananda/netns v0.0.5
29+
go.opentelemetry.io/otel v1.42.0
30+
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.42.0
31+
go.opentelemetry.io/otel/sdk v1.42.0
32+
go.opentelemetry.io/otel/trace v1.42.0
2933
golang.org/x/sync v0.20.0
3034
golang.org/x/sys v0.42.0
3135
google.golang.org/grpc v1.79.3
@@ -34,6 +38,7 @@ require (
3438

3539
require (
3640
github.com/Microsoft/hcsshim v0.14.0-rc.1 // indirect
41+
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
3742
github.com/cespare/xxhash/v2 v2.3.0 // indirect
3843
github.com/cilium/ebpf v0.16.0 // indirect
3944
github.com/containerd/continuity v0.4.5 // indirect
@@ -47,6 +52,8 @@ require (
4752
github.com/gogo/protobuf v1.3.2 // indirect
4853
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
4954
github.com/google/go-cmp v0.7.0 // indirect
55+
github.com/google/uuid v1.6.0 // indirect
56+
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect
5057
github.com/josharian/native v1.1.0 // indirect
5158
github.com/klauspost/compress v1.18.4 // indirect
5259
github.com/mdlayher/packet v1.1.2 // indirect
@@ -61,12 +68,13 @@ require (
6168
github.com/u-root/uio v0.0.0-20230220225925-ffce2a382923 // indirect
6269
go.opencensus.io v0.24.0 // indirect
6370
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
64-
go.opentelemetry.io/otel v1.42.0 // indirect
71+
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.42.0 // indirect
6572
go.opentelemetry.io/otel/metric v1.42.0 // indirect
66-
go.opentelemetry.io/otel/trace v1.42.0 // indirect
73+
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
6774
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect
6875
golang.org/x/net v0.51.0 // indirect
6976
golang.org/x/text v0.34.0 // indirect
77+
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 // indirect
7078
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect
7179
gopkg.in/yaml.v3 v3.0.1 // indirect
7280
)

go.sum

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo
44
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
55
github.com/Microsoft/hcsshim v0.14.0-rc.1 h1:qAPXKwGOkVn8LlqgBN8GS0bxZ83hOJpcjxzmlQKxKsQ=
66
github.com/Microsoft/hcsshim v0.14.0-rc.1/go.mod h1:hTKFGbnDtQb1wHiOWv4v0eN+7boSWAHyK/tNAaYZL0c=
7+
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
8+
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
79
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
810
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
911
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
@@ -94,6 +96,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX
9496
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
9597
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
9698
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
99+
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
100+
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c=
97101
github.com/hugelgupf/socketpair v0.0.0-20190730060125-05d35a94e714 h1:/jC7qQFrv8CrSJVmaolDVOxTfS9kc36uB6H40kdbQq8=
98102
github.com/hugelgupf/socketpair v0.0.0-20190730060125-05d35a94e714/go.mod h1:2Goc3h8EklBH5mspfHFxBnEoURQCGzQQH1ga9Myjvis=
99103
github.com/insomniacslk/dhcp v0.0.0-20250919081422-f80a1952f48e h1:nu5z6Kg+gMNW6tdqnVjg/QEJ8Nw71IJQqOtWj00XHEU=
@@ -166,14 +170,20 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ
166170
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
167171
go.opentelemetry.io/otel v1.42.0 h1:lSQGzTgVR3+sgJDAU/7/ZMjN9Z+vUip7leaqBKy4sho=
168172
go.opentelemetry.io/otel v1.42.0/go.mod h1:lJNsdRMxCUIWuMlVJWzecSMuNjE7dOYyWlqOXWkdqCc=
173+
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.42.0 h1:THuZiwpQZuHPul65w4WcwEnkX2QIuMT+UFoOrygtoJw=
174+
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.42.0/go.mod h1:J2pvYM5NGHofZ2/Ru6zw/TNWnEQp5crgyDeSrYpXkAw=
175+
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.42.0 h1:uLXP+3mghfMf7XmV4PkGfFhFKuNWoCvvx5wP/wOXo0o=
176+
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.42.0/go.mod h1:v0Tj04armyT59mnURNUJf7RCKcKzq+lgJs6QSjHjaTc=
169177
go.opentelemetry.io/otel/metric v1.42.0 h1:2jXG+3oZLNXEPfNmnpxKDeZsFI5o4J+nz6xUlaFdF/4=
170178
go.opentelemetry.io/otel/metric v1.42.0/go.mod h1:RlUN/7vTU7Ao/diDkEpQpnz3/92J9ko05BIwxYa2SSI=
171179
go.opentelemetry.io/otel/sdk v1.42.0 h1:LyC8+jqk6UJwdrI/8VydAq/hvkFKNHZVIWuslJXYsDo=
172180
go.opentelemetry.io/otel/sdk v1.42.0/go.mod h1:rGHCAxd9DAph0joO4W6OPwxjNTYWghRWmkHuGbayMts=
173-
go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8=
174-
go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew=
181+
go.opentelemetry.io/otel/sdk/metric v1.42.0 h1:D/1QR46Clz6ajyZ3G8SgNlTJKBdGp84q9RKCAZ3YGuA=
182+
go.opentelemetry.io/otel/sdk/metric v1.42.0/go.mod h1:Ua6AAlDKdZ7tdvaQKfSmnFTdHx37+J4ba8MwVCYM5hc=
175183
go.opentelemetry.io/otel/trace v1.42.0 h1:OUCgIPt+mzOnaUTpOQcBiM/PLQ/Op7oq6g4LenLmOYY=
176184
go.opentelemetry.io/otel/trace v1.42.0/go.mod h1:f3K9S+IFqnumBkKhRJMeaZeNk9epyhnCmQh/EysQCdc=
185+
go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A=
186+
go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4=
177187
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
178188
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
179189
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
@@ -239,6 +249,8 @@ google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7
239249
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
240250
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
241251
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
252+
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0=
253+
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY=
242254
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac=
243255
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ=
244256
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=

hack/trace-to-mermaid.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
#!/usr/bin/env python3
2+
"""Fetch the latest Jaeger trace for a service and render it as a Mermaid Gantt chart.
3+
4+
Usage: trace-to-mermaid.py <service_name> [min_ms]
5+
6+
Arguments:
7+
service_name Jaeger service name (e.g., "nerdbox")
8+
min_ms Minimum span duration in ms to include (default: 1)
9+
10+
Environment:
11+
JAEGER_URL Jaeger base URL (default: http://localhost:16686)
12+
"""
13+
import json
14+
import sys
15+
import tempfile
16+
import urllib.request
17+
18+
if len(sys.argv) < 2:
19+
print("Usage: trace-to-mermaid.py <service_name> [min_ms]", file=sys.stderr)
20+
sys.exit(1)
21+
22+
service = sys.argv[1]
23+
min_ms = int(sys.argv[2]) if len(sys.argv) > 2 else 1
24+
jaeger_url = __import__("os").environ.get("JAEGER_URL", "http://localhost:16686")
25+
26+
# Fetch latest trace for the service
27+
url = f"{jaeger_url}/api/traces?service={service}&limit=1"
28+
try:
29+
with urllib.request.urlopen(url) as resp:
30+
trace_data = json.load(resp)
31+
except Exception as e:
32+
print(f"Error: could not fetch traces from {jaeger_url}: {e}", file=sys.stderr)
33+
print("Is Jaeger running? Try: make jaeger-start", file=sys.stderr)
34+
sys.exit(1)
35+
36+
if not trace_data.get("data"):
37+
print(f"Error: no traces found for service '{service}'", file=sys.stderr)
38+
print("Run some operations first, e.g.:", file=sys.stderr)
39+
print(" make test-tracing", file=sys.stderr)
40+
sys.exit(1)
41+
42+
trace = trace_data["data"][0]
43+
spans = trace["spans"]
44+
processes = trace["processes"]
45+
46+
if not spans:
47+
print("Error: trace has no spans", file=sys.stderr)
48+
sys.exit(1)
49+
50+
# Find trace start time (earliest span)
51+
trace_start = min(s["startTime"] for s in spans)
52+
53+
# Build span lookup and parent map
54+
span_map = {}
55+
children = {}
56+
for s in spans:
57+
sid = s["spanID"]
58+
span_map[sid] = s
59+
children.setdefault(sid, [])
60+
for ref in s.get("references", []):
61+
if ref["refType"] == "CHILD_OF":
62+
parent_id = ref["spanID"]
63+
children.setdefault(parent_id, [])
64+
children[parent_id].append(sid)
65+
66+
# Find root spans (no CHILD_OF reference within this trace)
67+
all_span_ids = {s["spanID"] for s in spans}
68+
root_spans = []
69+
for s in spans:
70+
refs = s.get("references", [])
71+
parent_refs = [r for r in refs if r["refType"] == "CHILD_OF" and r["spanID"] in all_span_ids]
72+
if not parent_refs:
73+
root_spans.append(s)
74+
75+
# Sort roots by start time
76+
root_spans.sort(key=lambda s: s["startTime"])
77+
78+
79+
def span_duration_ms(s):
80+
return s["duration"] / 1000.0
81+
82+
83+
def span_start_ms(s):
84+
return (s["startTime"] - trace_start) / 1000.0
85+
86+
87+
def safe_id(name, sid):
88+
"""Create a mermaid-safe task ID."""
89+
clean = name.replace(" ", "_").replace(".", "_").replace("/", "_").replace("-", "_")
90+
# Append short span ID suffix for uniqueness
91+
return f"{clean}_{sid[:6]}"
92+
93+
94+
def collect_spans(span_id, depth=0):
95+
"""Collect span and descendants, depth-first."""
96+
s = span_map[span_id]
97+
result = [(s, depth)]
98+
kids = children.get(span_id, [])
99+
kids.sort(key=lambda cid: span_map[cid]["startTime"])
100+
for cid in kids:
101+
result.extend(collect_spans(cid, depth + 1))
102+
return result
103+
104+
105+
# Build output
106+
service_name = processes[spans[0]["processID"]]["serviceName"]
107+
lines = []
108+
lines.append("```mermaid")
109+
lines.append("gantt")
110+
lines.append(" dateFormat x")
111+
lines.append(" axisFormat %s.%L s")
112+
lines.append(f" title Trace for {service_name}")
113+
lines.append("")
114+
115+
for root in root_spans:
116+
root_name = root["operationName"]
117+
118+
# Section per root span
119+
lines.append(f" section {root_name}")
120+
121+
all_spans = collect_spans(root["spanID"])
122+
for s, depth in all_spans:
123+
dur_ms = span_duration_ms(s)
124+
if dur_ms < min_ms:
125+
continue
126+
start_ms = span_start_ms(s)
127+
end_ms = start_ms + dur_ms
128+
129+
# Indent name by depth for readability
130+
prefix = ". " * depth
131+
op = s["operationName"]
132+
proc = processes[s["processID"]]["serviceName"]
133+
label = f"{prefix}{op}"
134+
if proc != service_name:
135+
label = f"{prefix}{proc}/{op}"
136+
137+
tid = safe_id(op, s["spanID"])
138+
# Mermaid gantt uses milliseconds with dateFormat x
139+
lines.append(f" {label} :{tid}, {int(start_ms)}, {int(end_ms)}")
140+
141+
lines.append("```")
142+
143+
print("\n".join(lines))

0 commit comments

Comments
 (0)