Skip to content

Commit fd68082

Browse files
committed
fix(tracing): properly shutdown tracer provider
Fix tracer provider reloads in different scenarios: - disabled -> enabled: install new tracer provider - enabled -> enabled: keep provider up and swap with new one - enabled -> disabled: shutdown old tracer provider, use noop provider - build failure: keep current provider Signed-off-by: Siavash Safi <siavash@cloudflare.com>
1 parent b8725fe commit fd68082

2 files changed

Lines changed: 158 additions & 18 deletions

File tree

tracing/tracing.go

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"fmt"
1919
"log/slog"
2020
"reflect"
21+
"sync"
2122
"sync/atomic"
2223
"time"
2324

@@ -65,6 +66,7 @@ func (t *conditionalTracer) Start(ctx context.Context, spanName string, opts ...
6566
// Manager is capable of building, (re)installing and shutting down
6667
// the tracer provider.
6768
type Manager struct {
69+
mtx sync.Mutex
6870
logger *slog.Logger
6971
done chan struct{}
7072
config TracingConfig
@@ -92,6 +94,9 @@ func (m *Manager) Run() {
9294
// ApplyConfig takes care of refreshing the tracing configuration by shutting down
9395
// the current tracer provider (if any is registered) and installing a new one.
9496
func (m *Manager) ApplyConfig(cfg TracingConfig) error {
97+
m.mtx.Lock()
98+
defer m.mtx.Unlock()
99+
95100
// Update only if a config change is detected. If TLS configuration is
96101
// set, we have to restart the manager to make sure that new TLS
97102
// certificates are picked up.
@@ -100,31 +105,40 @@ func (m *Manager) ApplyConfig(cfg TracingConfig) error {
100105
return nil
101106
}
102107

103-
if m.shutdownFunc != nil {
104-
if err := m.shutdownFunc(); err != nil {
105-
return fmt.Errorf("failed to shut down the tracer provider: %w", err)
106-
}
107-
}
108-
109-
// If no endpoint is set, assume tracing should be disabled.
108+
// If no endpoint is set, disable tracing and shut down the old provider.
110109
if cfg.Endpoint == "" {
111110
tracingEnabled.Store(false)
112-
m.config = cfg
113-
m.shutdownFunc = nil
114111
otel.SetTracerProvider(noop.NewTracerProvider())
112+
if m.shutdownFunc != nil {
113+
if err := m.shutdownFunc(); err != nil {
114+
m.logger.Warn("Failed to shut down the previous tracer provider", "err", err)
115+
}
116+
m.shutdownFunc = nil
117+
}
118+
m.config = cfg
115119
m.logger.Info("Tracing provider uninstalled.")
116120
return nil
117121
}
118122

123+
// Build the new provider before tearing down the old one so that
124+
// tracing remains available throughout the reload.
119125
tp, shutdownFunc, err := buildTracerProvider(context.Background(), cfg)
120126
if err != nil {
121-
return fmt.Errorf("failed to install a new tracer provider: %w", err)
127+
return fmt.Errorf("failed to build a new tracer provider: %w", err)
122128
}
123129

124-
m.shutdownFunc = shutdownFunc
125-
m.config = cfg
130+
// Swap to the new provider, then shut down the old one.
131+
oldShutdown := m.shutdownFunc
126132
otel.SetTracerProvider(tp)
127133
tracingEnabled.Store(true)
134+
m.shutdownFunc = shutdownFunc
135+
m.config = cfg
136+
137+
if oldShutdown != nil {
138+
if err := oldShutdown(); err != nil {
139+
m.logger.Warn("Failed to shut down the previous tracer provider", "err", err)
140+
}
141+
}
128142

129143
m.logger.Info("Successfully installed a new tracer provider.")
130144
return nil
@@ -134,13 +148,20 @@ func (m *Manager) ApplyConfig(cfg TracingConfig) error {
134148
func (m *Manager) Stop() {
135149
defer close(m.done)
136150

151+
m.mtx.Lock()
152+
defer m.mtx.Unlock()
153+
137154
if m.shutdownFunc == nil {
138155
return
139156
}
140157

158+
tracingEnabled.Store(false)
159+
otel.SetTracerProvider(noop.NewTracerProvider())
160+
141161
if err := m.shutdownFunc(); err != nil {
142162
m.logger.Error("failed to shut down the tracer provider", "err", err)
143163
}
164+
m.shutdownFunc = nil
144165

145166
m.logger.Info("Tracing manager stopped")
146167
}
@@ -190,12 +211,7 @@ func buildTracerProvider(ctx context.Context, tracingCfg TracingConfig) (trace.T
190211
return tp, func() error {
191212
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
192213
defer cancel()
193-
err := tp.Shutdown(ctx)
194-
if err != nil {
195-
return err
196-
}
197-
198-
return nil
214+
return tp.Shutdown(ctx)
199215
}, nil
200216
}
201217

tracing/tracing_test.go

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,139 @@
1414
package tracing
1515

1616
import (
17+
"context"
18+
"fmt"
1719
"testing"
1820

1921
commoncfg "github.com/prometheus/common/config"
2022
"github.com/prometheus/common/promslog"
2123
"github.com/stretchr/testify/require"
2224
"go.opentelemetry.io/otel"
25+
"go.opentelemetry.io/otel/attribute"
2326
"go.opentelemetry.io/otel/trace/noop"
2427
)
2528

29+
func TestConditionalTracerDisabled(t *testing.T) {
30+
tracingEnabled.Store(false)
31+
t.Cleanup(func() { tracingEnabled.Store(false) })
32+
33+
tracer := NewTracer("test")
34+
ctx := context.Background()
35+
retCtx, span := tracer.Start(ctx, "test-span")
36+
37+
// When disabled, Start should return the original context and a noop span.
38+
require.Equal(t, ctx, retCtx)
39+
require.IsType(t, noop.Span{}, span)
40+
41+
// Noop span operations should not panic.
42+
span.SetAttributes(attribute.String("key", "value"))
43+
span.AddEvent("event")
44+
span.End()
45+
}
46+
47+
func TestConditionalTracerEnabled(t *testing.T) {
48+
m := NewManager(promslog.NewNopLogger())
49+
cfg := TracingConfig{
50+
Endpoint: "localhost:1234",
51+
ClientType: TracingClientGRPC,
52+
}
53+
require.NoError(t, m.ApplyConfig(cfg))
54+
t.Cleanup(func() {
55+
tracingEnabled.Store(false)
56+
m.shutdownFunc = nil
57+
})
58+
59+
tracer := NewTracer("test")
60+
ctx := context.Background()
61+
_, span := tracer.Start(ctx, "test-span")
62+
defer span.End()
63+
64+
// When enabled, Start should return a real span (not noop).
65+
require.NotEqual(t, noop.Span{}, span)
66+
require.True(t, span.SpanContext().IsValid())
67+
}
68+
69+
func TestTracingEnabledFlagTransitions(t *testing.T) {
70+
m := NewManager(promslog.NewNopLogger())
71+
t.Cleanup(func() {
72+
tracingEnabled.Store(false)
73+
m.shutdownFunc = nil
74+
})
75+
76+
// Initially disabled.
77+
require.False(t, tracingEnabled.Load())
78+
79+
// Enable tracing.
80+
cfg := TracingConfig{
81+
Endpoint: "localhost:1234",
82+
ClientType: TracingClientGRPC,
83+
}
84+
require.NoError(t, m.ApplyConfig(cfg))
85+
require.True(t, tracingEnabled.Load())
86+
87+
// Disable tracing.
88+
require.NoError(t, m.ApplyConfig(TracingConfig{}))
89+
require.False(t, tracingEnabled.Load())
90+
}
91+
92+
func TestApplyConfigNoGapDuringReload(t *testing.T) {
93+
m := NewManager(promslog.NewNopLogger())
94+
cfg := TracingConfig{
95+
Endpoint: "localhost:1234",
96+
ClientType: TracingClientGRPC,
97+
}
98+
require.NoError(t, m.ApplyConfig(cfg))
99+
t.Cleanup(func() {
100+
tracingEnabled.Store(false)
101+
m.shutdownFunc = nil
102+
})
103+
104+
require.True(t, tracingEnabled.Load())
105+
tpBefore := otel.GetTracerProvider()
106+
107+
// Reload with a different config — tracing should never be disabled.
108+
cfg2 := TracingConfig{
109+
Endpoint: "localhost:5678",
110+
ClientType: TracingClientHTTP,
111+
}
112+
require.NoError(t, m.ApplyConfig(cfg2))
113+
114+
// tracingEnabled should still be true (was never set to false).
115+
require.True(t, tracingEnabled.Load())
116+
require.NotEqual(t, tpBefore, otel.GetTracerProvider())
117+
}
118+
119+
func TestApplyConfigBuildFailurePreservesState(t *testing.T) {
120+
m := NewManager(promslog.NewNopLogger())
121+
cfg := TracingConfig{
122+
Endpoint: "localhost:1234",
123+
ClientType: TracingClientGRPC,
124+
}
125+
require.NoError(t, m.ApplyConfig(cfg))
126+
t.Cleanup(func() {
127+
tracingEnabled.Store(false)
128+
m.shutdownFunc = nil
129+
})
130+
131+
require.True(t, tracingEnabled.Load())
132+
tpBefore := otel.GetTracerProvider()
133+
shutdownBefore := m.shutdownFunc
134+
135+
// Apply an invalid config that will cause buildTracerProvider to fail.
136+
badCfg := TracingConfig{
137+
Endpoint: "localhost:1234",
138+
ClientType: "invalid-client-type",
139+
}
140+
require.Error(t, m.ApplyConfig(badCfg))
141+
142+
// State should be preserved — tracing still enabled with original provider.
143+
require.True(t, tracingEnabled.Load())
144+
require.Equal(t, tpBefore, otel.GetTracerProvider())
145+
require.NotNil(t, m.shutdownFunc)
146+
// shutdownFunc should be the same as before the failed apply.
147+
require.Equal(t, fmt.Sprintf("%p", shutdownBefore), fmt.Sprintf("%p", m.shutdownFunc))
148+
}
149+
26150
func TestInstallingNewTracerProvider(t *testing.T) {
27151
tpBefore := otel.GetTracerProvider()
28152

0 commit comments

Comments
 (0)