Skip to content

Commit ebbdfef

Browse files
fix(telemetry): TLS + NR api-key header for OTLP exporter (P0-2)
Distributed tracing was 100% dead in prod across api, worker, and provisioner — every pod logged a trace-export failure every 15-30s and `trace_id` was empty on every log line, making cross-service request correlation impossible. Root cause from BUGHUNT-2026-05-20-T21 P0-2: tracer.go called `otlptracegrpc.WithInsecure()` (plaintext gRPC) but the OTel endpoint pointed at the TLS host `https://otlp.nr-data.net:4317`. The exporter sent cleartext HTTP/2 into a TLS listener; the TLS ServerHello bytes were misread as an oversized HTTP/2 frame and every export silently dropped. Compounding the issue, no `api-key` header was sent — NR OTLP ingest requires it, so even a fixed handshake would have returned UNAUTHENTICATED. Fix (mirrors the api repo change): - Auto-detect TLS from endpoint scheme: `https://` prefix, NR's `otlp.*nr-data.net` host, or a `:443` suffix → TLS; everything else → plaintext (for local dev / in-cluster collectors). - When NEW_RELIC_LICENSE_KEY is set, attach it as the `api-key` gRPC header on every export (NR ingest contract). - Fail-open: empty endpoint = noop shutdown; empty/CHANGE_ME license key = WARN + exporter is still constructed (it dials lazily so boot never blocks). - Stamp INFO `telemetry.tracer_initialized` with endpoint + tls + nr_auth flags at boot so an operator can verify the config from a single log line. Regression tests (tracer_test.go) — same set as api / provisioner: - TestInitTracer_EmptyEndpointNoop - TestInitTracer_Boots - TestShouldUseTLS - TestStripScheme `go test ./internal/telemetry/...` green (the rest of the worker tree currently has an unrelated in-flight edit in internal/jobs/ expire.go from a parallel agent — out of scope for this commit). Refs: BUGHUNT-2026-05-20-T21.md (P0-2), CLAUDE-AGENT-RELIABILITY-RULES.md rule 22 (contract changes touch all surfaces — mirror commits in api, provisioner, infra). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 5721fce commit ebbdfef

2 files changed

Lines changed: 183 additions & 10 deletions

File tree

internal/telemetry/tracer.go

Lines changed: 109 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package telemetry
22

33
import (
44
"context"
5+
"crypto/tls"
56
"fmt"
67
"log/slog"
78
"os"
@@ -14,31 +15,84 @@ import (
1415
"go.opentelemetry.io/otel/sdk/resource"
1516
sdktrace "go.opentelemetry.io/otel/sdk/trace"
1617
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
18+
"google.golang.org/grpc/credentials"
1719
)
1820

1921
// InitTracer configures the global OpenTelemetry tracer provider.
20-
// When otlpEndpoint is empty, the noop provider remains (fail open).
22+
//
23+
// Endpoint selection (in order of precedence):
24+
// 1. otlpEndpoint argument (typically os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT"))
25+
// 2. if empty → tracing disabled (noop), return a no-op shutdown
26+
//
27+
// TLS vs plaintext is auto-detected from the scheme: an `https://` prefix
28+
// (or an endpoint targeting the well-known NR OTLP host `otlp.*nr-data.net`)
29+
// uses TLS; everything else (no scheme, `http://`, an in-cluster host like
30+
// `otel-collector:4317`) falls back to plaintext for local dev.
31+
//
32+
// New Relic auth: when NEW_RELIC_LICENSE_KEY is set (and non-sentinel), it
33+
// is sent as the `api-key` gRPC header on every export — this is the NR
34+
// OTLP ingest contract. When unset/empty/`CHANGE_ME`, we still construct
35+
// a working exporter (it just won't be accepted by NR) and log a WARN so
36+
// the operator knows tracing is configured-but-unauthenticated.
37+
//
38+
// Returns shutdown which should be deferred; shutdown is a no-op when
39+
// tracing is disabled. NEVER crashes — every failure mode falls back to
40+
// a no-op shutdown so a misconfigured tracer can never block service boot.
41+
//
42+
// Historical note (2026-05-20 P0-2): the prior implementation called
43+
// `otlptracegrpc.WithInsecure()` against the TLS endpoint
44+
// `https://otlp.nr-data.net:4317` and never sent the NR `api-key` header.
45+
// Result: every export silently failed (`http2 frame too large`), every
46+
// log line had `trace_id=""`. The TLS-by-scheme + NR-key-header pair
47+
// below is the fix; do not revert.
2148
func InitTracer(serviceName, otlpEndpoint string) func(context.Context) error {
22-
ep := strings.TrimSpace(otlpEndpoint)
23-
if ep == "" {
49+
raw := strings.TrimSpace(otlpEndpoint)
50+
if raw == "" {
2451
return func(context.Context) error { return nil }
2552
}
53+
2654
if s := strings.TrimSpace(os.Getenv("OTEL_SERVICE_NAME")); s != "" {
2755
serviceName = s
2856
}
2957

30-
ep = strings.TrimPrefix(ep, "https://")
31-
ep = strings.TrimPrefix(ep, "http://")
58+
useTLS := shouldUseTLS(raw)
59+
ep := stripScheme(raw)
60+
61+
licenseKey := strings.TrimSpace(os.Getenv("NEW_RELIC_LICENSE_KEY"))
62+
if licenseKey == "" || licenseKey == "CHANGE_ME" {
63+
slog.Warn("telemetry.nr_license_missing",
64+
"endpoint", ep,
65+
"detail", "OTLP exporter constructed but NEW_RELIC_LICENSE_KEY is empty/sentinel; exports will be rejected by NR")
66+
licenseKey = ""
67+
}
68+
69+
opts := []otlptracegrpc.Option{
70+
otlptracegrpc.WithEndpoint(ep),
71+
}
72+
if useTLS {
73+
opts = append(opts,
74+
otlptracegrpc.WithTLSCredentials(credentials.NewTLS(&tls.Config{
75+
MinVersion: tls.VersionTLS12,
76+
})),
77+
)
78+
} else {
79+
opts = append(opts, otlptracegrpc.WithInsecure())
80+
}
81+
if licenseKey != "" {
82+
// NR OTLP requires this header on every request; without it the
83+
// ingest path returns UNAUTHENTICATED and the exporter silently
84+
// drops every span.
85+
opts = append(opts, otlptracegrpc.WithHeaders(map[string]string{
86+
"api-key": licenseKey,
87+
}))
88+
}
3289

3390
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
3491
defer cancel()
3592

36-
exporter, err := otlptracegrpc.New(ctx,
37-
otlptracegrpc.WithEndpoint(ep),
38-
otlptracegrpc.WithInsecure(),
39-
)
93+
exporter, err := otlptracegrpc.New(ctx, opts...)
4094
if err != nil {
41-
slog.Error("telemetry.otlp_exporter_failed", "error", err, "endpoint", ep)
95+
slog.Error("telemetry.otlp_exporter_failed", "error", err, "endpoint", ep, "tls", useTLS)
4296
return func(context.Context) error { return nil }
4397
}
4498

@@ -61,6 +115,12 @@ func InitTracer(serviceName, otlpEndpoint string) func(context.Context) error {
61115
propagation.Baggage{},
62116
))
63117

118+
slog.Info("telemetry.tracer_initialized",
119+
"service", serviceName,
120+
"endpoint", ep,
121+
"tls", useTLS,
122+
"nr_auth", licenseKey != "")
123+
64124
return func(shutdownCtx context.Context) error {
65125
ctx, cancel := context.WithTimeout(shutdownCtx, 10*time.Second)
66126
defer cancel()
@@ -70,3 +130,42 @@ func InitTracer(serviceName, otlpEndpoint string) func(context.Context) error {
70130
return nil
71131
}
72132
}
133+
134+
// shouldUseTLS returns true when the OTLP endpoint should be dialed over
135+
// TLS. Heuristics, in order:
136+
// 1. explicit `https://` scheme → TLS
137+
// 2. explicit `http://` scheme → plaintext
138+
// 3. host matches `otlp.*nr-data.net` (NR's OTLP ingest hosts, all TLS) → TLS
139+
// 4. host ends in `:443` → TLS
140+
// 5. otherwise (no scheme, in-cluster collector, etc) → plaintext
141+
//
142+
// Exported only for tests.
143+
func shouldUseTLS(endpoint string) bool {
144+
e := strings.ToLower(strings.TrimSpace(endpoint))
145+
if strings.HasPrefix(e, "https://") {
146+
return true
147+
}
148+
if strings.HasPrefix(e, "http://") {
149+
return false
150+
}
151+
host := stripScheme(e)
152+
// Bare host[:port] — sniff for NR's well-known OTLP hosts and the
153+
// 443 port suffix.
154+
if strings.Contains(host, "nr-data.net") {
155+
return true
156+
}
157+
if strings.HasSuffix(host, ":443") {
158+
return true
159+
}
160+
return false
161+
}
162+
163+
// stripScheme removes a leading `http://` or `https://` from the endpoint,
164+
// returning the bare `host:port` form that otlptracegrpc.WithEndpoint
165+
// expects.
166+
func stripScheme(endpoint string) string {
167+
e := strings.TrimSpace(endpoint)
168+
e = strings.TrimPrefix(e, "https://")
169+
e = strings.TrimPrefix(e, "http://")
170+
return e
171+
}

internal/telemetry/tracer_test.go

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
package telemetry
2+
3+
import (
4+
"context"
5+
"testing"
6+
)
7+
8+
// TestInitTracer_EmptyEndpointNoop — when the endpoint is unset, the
9+
// returned shutdown must be a working no-op. This is the fail-open
10+
// contract for local dev / CI runs where OTel is intentionally off.
11+
func TestInitTracer_EmptyEndpointNoop(t *testing.T) {
12+
shutdown := InitTracer("instant-worker", "")
13+
if shutdown == nil {
14+
t.Fatal("InitTracer returned nil shutdown for empty endpoint")
15+
}
16+
if err := shutdown(context.Background()); err != nil {
17+
t.Fatalf("noop shutdown returned error: %v", err)
18+
}
19+
}
20+
21+
// TestInitTracer_Boots — with a non-empty endpoint, InitTracer constructs
22+
// a real exporter without crashing even if NEW_RELIC_LICENSE_KEY is unset.
23+
// The exporter dials lazily on the first export, so construction must
24+
// succeed regardless of whether the endpoint is reachable.
25+
func TestInitTracer_Boots(t *testing.T) {
26+
t.Setenv("NEW_RELIC_LICENSE_KEY", "")
27+
shutdown := InitTracer("instant-worker", "https://otlp.nr-data.net:4317")
28+
if shutdown == nil {
29+
t.Fatal("InitTracer returned nil shutdown")
30+
}
31+
_ = shutdown(context.Background())
32+
}
33+
34+
// TestShouldUseTLS — the regression case for P0-2: every `https://`
35+
// endpoint AND every `*nr-data.net` host MUST resolve to TLS=true.
36+
// Reverting to WithInsecure() for these would silently kill tracing
37+
// again (the symptom that produced this test).
38+
func TestShouldUseTLS(t *testing.T) {
39+
cases := []struct {
40+
endpoint string
41+
want bool
42+
}{
43+
{"https://otlp.nr-data.net:4317", true},
44+
{"https://otlp.eu01.nr-data.net:4317", true},
45+
{"otlp.nr-data.net:4317", true},
46+
{"otlp.eu01.nr-data.net:4317", true},
47+
{"foo.example.com:443", true},
48+
{"http://otel-collector.observability:4317", false},
49+
{"otel-collector.observability:4317", false},
50+
{"localhost:4317", false},
51+
{"", false},
52+
}
53+
for _, c := range cases {
54+
got := shouldUseTLS(c.endpoint)
55+
if got != c.want {
56+
t.Errorf("shouldUseTLS(%q) = %v, want %v", c.endpoint, got, c.want)
57+
}
58+
}
59+
}
60+
61+
// TestStripScheme — strips http:// and https:// uniformly.
62+
func TestStripScheme(t *testing.T) {
63+
cases := map[string]string{
64+
"https://otlp.nr-data.net:4317": "otlp.nr-data.net:4317",
65+
"http://localhost:4317": "localhost:4317",
66+
"otlp.nr-data.net:4317": "otlp.nr-data.net:4317",
67+
"": "",
68+
}
69+
for in, want := range cases {
70+
if got := stripScheme(in); got != want {
71+
t.Errorf("stripScheme(%q) = %q, want %q", in, got, want)
72+
}
73+
}
74+
}

0 commit comments

Comments
 (0)