Skip to content

Commit 2731d2f

Browse files
authored
fix(tracing): properly shutdown tracer provider (#5131)
Fix tracer provider reloads in different scenarios: - disabled -> enabled: install new tracer provider - enabled -> enabled: keep provider up and swap with new one - enabled -> disabled: shutdown old tracer provider, use noop provider - build failure: keep current provider Signed-off-by: Siavash Safi <siavash@cloudflare.com>
1 parent 97c8a53 commit 2731d2f

2 files changed

Lines changed: 146 additions & 18 deletions

File tree

tracing/tracing.go

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"fmt"
1919
"log/slog"
2020
"reflect"
21+
"sync"
2122
"sync/atomic"
2223
"time"
2324

@@ -65,6 +66,7 @@ func (t *conditionalTracer) Start(ctx context.Context, spanName string, opts ...
6566
// Manager is capable of building, (re)installing and shutting down
6667
// the tracer provider.
6768
type Manager struct {
69+
mtx sync.Mutex
6870
logger *slog.Logger
6971
done chan struct{}
7072
config TracingConfig
@@ -92,6 +94,9 @@ func (m *Manager) Run() {
9294
// ApplyConfig takes care of refreshing the tracing configuration by shutting down
9395
// the current tracer provider (if any is registered) and installing a new one.
9496
func (m *Manager) ApplyConfig(cfg TracingConfig) error {
97+
m.mtx.Lock()
98+
defer m.mtx.Unlock()
99+
95100
// Update only if a config change is detected. If TLS configuration is
96101
// set, we have to restart the manager to make sure that new TLS
97102
// certificates are picked up.
@@ -100,31 +105,40 @@ func (m *Manager) ApplyConfig(cfg TracingConfig) error {
100105
return nil
101106
}
102107

103-
if m.shutdownFunc != nil {
104-
if err := m.shutdownFunc(); err != nil {
105-
return fmt.Errorf("failed to shut down the tracer provider: %w", err)
106-
}
107-
}
108-
109-
// If no endpoint is set, assume tracing should be disabled.
108+
// If no endpoint is set, disable tracing and shut down the old provider.
110109
if cfg.Endpoint == "" {
111110
tracingEnabled.Store(false)
112-
m.config = cfg
113-
m.shutdownFunc = nil
114111
otel.SetTracerProvider(noop.NewTracerProvider())
112+
if m.shutdownFunc != nil {
113+
if err := m.shutdownFunc(); err != nil {
114+
m.logger.Warn("Failed to shut down the previous tracer provider", "err", err)
115+
}
116+
m.shutdownFunc = nil
117+
}
118+
m.config = cfg
115119
m.logger.Info("Tracing provider uninstalled.")
116120
return nil
117121
}
118122

123+
// Build the new provider before tearing down the old one so that
124+
// tracing remains available throughout the reload.
119125
tp, shutdownFunc, err := buildTracerProvider(context.Background(), cfg)
120126
if err != nil {
121-
return fmt.Errorf("failed to install a new tracer provider: %w", err)
127+
return fmt.Errorf("failed to build a new tracer provider: %w", err)
122128
}
123129

124-
m.shutdownFunc = shutdownFunc
125-
m.config = cfg
130+
// Swap to the new provider, then shut down the old one.
131+
oldShutdown := m.shutdownFunc
126132
otel.SetTracerProvider(tp)
127133
tracingEnabled.Store(true)
134+
m.shutdownFunc = shutdownFunc
135+
m.config = cfg
136+
137+
if oldShutdown != nil {
138+
if err := oldShutdown(); err != nil {
139+
m.logger.Warn("Failed to shut down the previous tracer provider", "err", err)
140+
}
141+
}
128142

129143
m.logger.Info("Successfully installed a new tracer provider.")
130144
return nil
@@ -134,13 +148,20 @@ func (m *Manager) ApplyConfig(cfg TracingConfig) error {
134148
func (m *Manager) Stop() {
135149
defer close(m.done)
136150

151+
m.mtx.Lock()
152+
defer m.mtx.Unlock()
153+
137154
if m.shutdownFunc == nil {
138155
return
139156
}
140157

158+
tracingEnabled.Store(false)
159+
otel.SetTracerProvider(noop.NewTracerProvider())
160+
141161
if err := m.shutdownFunc(); err != nil {
142162
m.logger.Error("failed to shut down the tracer provider", "err", err)
143163
}
164+
m.shutdownFunc = nil
144165

145166
m.logger.Info("Tracing manager stopped")
146167
}
@@ -190,12 +211,7 @@ func buildTracerProvider(ctx context.Context, tracingCfg TracingConfig) (trace.T
190211
return tp, func() error {
191212
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
192213
defer cancel()
193-
err := tp.Shutdown(ctx)
194-
if err != nil {
195-
return err
196-
}
197-
198-
return nil
214+
return tp.Shutdown(ctx)
199215
}, nil
200216
}
201217

tracing/tracing_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,127 @@
1414
package tracing
1515

1616
import (
17+
"context"
18+
"reflect"
1719
"testing"
1820

1921
commoncfg "github.com/prometheus/common/config"
2022
"github.com/prometheus/common/promslog"
2123
"github.com/stretchr/testify/require"
2224
"go.opentelemetry.io/otel"
25+
"go.opentelemetry.io/otel/attribute"
2326
"go.opentelemetry.io/otel/trace/noop"
2427
)
2528

29+
func TestConditionalTracerDisabled(t *testing.T) {
30+
tracingEnabled.Store(false)
31+
t.Cleanup(func() { tracingEnabled.Store(false) })
32+
33+
tracer := NewTracer("test")
34+
ctx := context.Background()
35+
retCtx, span := tracer.Start(ctx, "test-span")
36+
37+
// When disabled, Start should return the original context and a noop span.
38+
require.Equal(t, ctx, retCtx)
39+
require.IsType(t, noop.Span{}, span)
40+
41+
// Noop span operations should not panic.
42+
span.SetAttributes(attribute.String("key", "value"))
43+
span.AddEvent("event")
44+
span.End()
45+
}
46+
47+
func TestConditionalTracerEnabled(t *testing.T) {
48+
m := NewManager(promslog.NewNopLogger())
49+
cfg := TracingConfig{
50+
Endpoint: "localhost:1234",
51+
ClientType: TracingClientGRPC,
52+
}
53+
require.NoError(t, m.ApplyConfig(cfg))
54+
t.Cleanup(m.Stop)
55+
56+
tracer := NewTracer("test")
57+
ctx := context.Background()
58+
_, span := tracer.Start(ctx, "test-span")
59+
defer span.End()
60+
61+
// When enabled, Start should return a real span (not noop).
62+
require.NotEqual(t, noop.Span{}, span)
63+
require.True(t, span.SpanContext().IsValid())
64+
}
65+
66+
func TestTracingEnabledFlagTransitions(t *testing.T) {
67+
m := NewManager(promslog.NewNopLogger())
68+
t.Cleanup(m.Stop)
69+
70+
// Initially disabled.
71+
require.False(t, tracingEnabled.Load())
72+
73+
// Enable tracing.
74+
cfg := TracingConfig{
75+
Endpoint: "localhost:1234",
76+
ClientType: TracingClientGRPC,
77+
}
78+
require.NoError(t, m.ApplyConfig(cfg))
79+
require.True(t, tracingEnabled.Load())
80+
81+
// Disable tracing.
82+
require.NoError(t, m.ApplyConfig(TracingConfig{}))
83+
require.False(t, tracingEnabled.Load())
84+
}
85+
86+
func TestApplyConfigNoGapDuringReload(t *testing.T) {
87+
m := NewManager(promslog.NewNopLogger())
88+
cfg := TracingConfig{
89+
Endpoint: "localhost:1234",
90+
ClientType: TracingClientGRPC,
91+
}
92+
require.NoError(t, m.ApplyConfig(cfg))
93+
t.Cleanup(m.Stop)
94+
95+
require.True(t, tracingEnabled.Load())
96+
tpBefore := otel.GetTracerProvider()
97+
98+
// Reload with a different config — tracing should never be disabled.
99+
cfg2 := TracingConfig{
100+
Endpoint: "localhost:5678",
101+
ClientType: TracingClientHTTP,
102+
}
103+
require.NoError(t, m.ApplyConfig(cfg2))
104+
105+
// tracingEnabled should still be true (was never set to false).
106+
require.True(t, tracingEnabled.Load())
107+
require.NotEqual(t, tpBefore, otel.GetTracerProvider())
108+
}
109+
110+
func TestApplyConfigBuildFailurePreservesState(t *testing.T) {
111+
m := NewManager(promslog.NewNopLogger())
112+
cfg := TracingConfig{
113+
Endpoint: "localhost:1234",
114+
ClientType: TracingClientGRPC,
115+
}
116+
require.NoError(t, m.ApplyConfig(cfg))
117+
t.Cleanup(m.Stop)
118+
119+
require.True(t, tracingEnabled.Load())
120+
tpBefore := otel.GetTracerProvider()
121+
shutdownBefore := m.shutdownFunc
122+
123+
// Apply an invalid config that will cause buildTracerProvider to fail.
124+
badCfg := TracingConfig{
125+
Endpoint: "localhost:1234",
126+
ClientType: "invalid-client-type",
127+
}
128+
require.Error(t, m.ApplyConfig(badCfg))
129+
130+
// State should be preserved — tracing still enabled with original provider.
131+
require.True(t, tracingEnabled.Load())
132+
require.Equal(t, tpBefore, otel.GetTracerProvider())
133+
require.NotNil(t, m.shutdownFunc)
134+
// shutdownFunc should be the same as before the failed apply.
135+
require.Equal(t, reflect.ValueOf(shutdownBefore).Pointer(), reflect.ValueOf(m.shutdownFunc).Pointer())
136+
}
137+
26138
func TestInstallingNewTracerProvider(t *testing.T) {
27139
tpBefore := otel.GetTracerProvider()
28140

0 commit comments

Comments
 (0)