From fd680824a6e645303a4a6f8e0e45d5a5af63d99a Mon Sep 17 00:00:00 2001 From: Siavash Safi Date: Mon, 30 Mar 2026 11:57:42 +0200 Subject: [PATCH 1/3] fix(tracing): properly shutdown tracer provider Fix tracer provider reloads in different scenarios: - disabled -> enabled: install new tracer provider - enabled -> enabled: keep provider up and swap with new one - enabled -> disabled: shutdown old tracer provider, use noop provider - build failure: keep current provider Signed-off-by: Siavash Safi --- tracing/tracing.go | 52 +++++++++++------ tracing/tracing_test.go | 124 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 18 deletions(-) diff --git a/tracing/tracing.go b/tracing/tracing.go index b9c8f9413f..6c1fd0a7a9 100644 --- a/tracing/tracing.go +++ b/tracing/tracing.go @@ -18,6 +18,7 @@ import ( "fmt" "log/slog" "reflect" + "sync" "sync/atomic" "time" @@ -65,6 +66,7 @@ func (t *conditionalTracer) Start(ctx context.Context, spanName string, opts ... // Manager is capable of building, (re)installing and shutting down // the tracer provider. type Manager struct { + mtx sync.Mutex logger *slog.Logger done chan struct{} config TracingConfig @@ -92,6 +94,9 @@ func (m *Manager) Run() { // ApplyConfig takes care of refreshing the tracing configuration by shutting down // the current tracer provider (if any is registered) and installing a new one. func (m *Manager) ApplyConfig(cfg TracingConfig) error { + m.mtx.Lock() + defer m.mtx.Unlock() + // Update only if a config change is detected. If TLS configuration is // set, we have to restart the manager to make sure that new TLS // certificates are picked up. @@ -100,31 +105,40 @@ func (m *Manager) ApplyConfig(cfg TracingConfig) error { return nil } - if m.shutdownFunc != nil { - if err := m.shutdownFunc(); err != nil { - return fmt.Errorf("failed to shut down the tracer provider: %w", err) - } - } - - // If no endpoint is set, assume tracing should be disabled. + // If no endpoint is set, disable tracing and shut down the old provider. if cfg.Endpoint == "" { tracingEnabled.Store(false) - m.config = cfg - m.shutdownFunc = nil otel.SetTracerProvider(noop.NewTracerProvider()) + if m.shutdownFunc != nil { + if err := m.shutdownFunc(); err != nil { + m.logger.Warn("Failed to shut down the previous tracer provider", "err", err) + } + m.shutdownFunc = nil + } + m.config = cfg m.logger.Info("Tracing provider uninstalled.") return nil } + // Build the new provider before tearing down the old one so that + // tracing remains available throughout the reload. tp, shutdownFunc, err := buildTracerProvider(context.Background(), cfg) if err != nil { - return fmt.Errorf("failed to install a new tracer provider: %w", err) + return fmt.Errorf("failed to build a new tracer provider: %w", err) } - m.shutdownFunc = shutdownFunc - m.config = cfg + // Swap to the new provider, then shut down the old one. + oldShutdown := m.shutdownFunc otel.SetTracerProvider(tp) tracingEnabled.Store(true) + m.shutdownFunc = shutdownFunc + m.config = cfg + + if oldShutdown != nil { + if err := oldShutdown(); err != nil { + m.logger.Warn("Failed to shut down the previous tracer provider", "err", err) + } + } m.logger.Info("Successfully installed a new tracer provider.") return nil @@ -134,13 +148,20 @@ func (m *Manager) ApplyConfig(cfg TracingConfig) error { func (m *Manager) Stop() { defer close(m.done) + m.mtx.Lock() + defer m.mtx.Unlock() + if m.shutdownFunc == nil { return } + tracingEnabled.Store(false) + otel.SetTracerProvider(noop.NewTracerProvider()) + if err := m.shutdownFunc(); err != nil { m.logger.Error("failed to shut down the tracer provider", "err", err) } + m.shutdownFunc = nil m.logger.Info("Tracing manager stopped") } @@ -190,12 +211,7 @@ func buildTracerProvider(ctx context.Context, tracingCfg TracingConfig) (trace.T return tp, func() error { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() - err := tp.Shutdown(ctx) - if err != nil { - return err - } - - return nil + return tp.Shutdown(ctx) }, nil } diff --git a/tracing/tracing_test.go b/tracing/tracing_test.go index 392f26115d..dec8b68ea1 100644 --- a/tracing/tracing_test.go +++ b/tracing/tracing_test.go @@ -14,15 +14,139 @@ package tracing import ( + "context" + "fmt" "testing" commoncfg "github.com/prometheus/common/config" "github.com/prometheus/common/promslog" "github.com/stretchr/testify/require" "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace/noop" ) +func TestConditionalTracerDisabled(t *testing.T) { + tracingEnabled.Store(false) + t.Cleanup(func() { tracingEnabled.Store(false) }) + + tracer := NewTracer("test") + ctx := context.Background() + retCtx, span := tracer.Start(ctx, "test-span") + + // When disabled, Start should return the original context and a noop span. + require.Equal(t, ctx, retCtx) + require.IsType(t, noop.Span{}, span) + + // Noop span operations should not panic. + span.SetAttributes(attribute.String("key", "value")) + span.AddEvent("event") + span.End() +} + +func TestConditionalTracerEnabled(t *testing.T) { + m := NewManager(promslog.NewNopLogger()) + cfg := TracingConfig{ + Endpoint: "localhost:1234", + ClientType: TracingClientGRPC, + } + require.NoError(t, m.ApplyConfig(cfg)) + t.Cleanup(func() { + tracingEnabled.Store(false) + m.shutdownFunc = nil + }) + + tracer := NewTracer("test") + ctx := context.Background() + _, span := tracer.Start(ctx, "test-span") + defer span.End() + + // When enabled, Start should return a real span (not noop). + require.NotEqual(t, noop.Span{}, span) + require.True(t, span.SpanContext().IsValid()) +} + +func TestTracingEnabledFlagTransitions(t *testing.T) { + m := NewManager(promslog.NewNopLogger()) + t.Cleanup(func() { + tracingEnabled.Store(false) + m.shutdownFunc = nil + }) + + // Initially disabled. + require.False(t, tracingEnabled.Load()) + + // Enable tracing. + cfg := TracingConfig{ + Endpoint: "localhost:1234", + ClientType: TracingClientGRPC, + } + require.NoError(t, m.ApplyConfig(cfg)) + require.True(t, tracingEnabled.Load()) + + // Disable tracing. + require.NoError(t, m.ApplyConfig(TracingConfig{})) + require.False(t, tracingEnabled.Load()) +} + +func TestApplyConfigNoGapDuringReload(t *testing.T) { + m := NewManager(promslog.NewNopLogger()) + cfg := TracingConfig{ + Endpoint: "localhost:1234", + ClientType: TracingClientGRPC, + } + require.NoError(t, m.ApplyConfig(cfg)) + t.Cleanup(func() { + tracingEnabled.Store(false) + m.shutdownFunc = nil + }) + + require.True(t, tracingEnabled.Load()) + tpBefore := otel.GetTracerProvider() + + // Reload with a different config — tracing should never be disabled. + cfg2 := TracingConfig{ + Endpoint: "localhost:5678", + ClientType: TracingClientHTTP, + } + require.NoError(t, m.ApplyConfig(cfg2)) + + // tracingEnabled should still be true (was never set to false). + require.True(t, tracingEnabled.Load()) + require.NotEqual(t, tpBefore, otel.GetTracerProvider()) +} + +func TestApplyConfigBuildFailurePreservesState(t *testing.T) { + m := NewManager(promslog.NewNopLogger()) + cfg := TracingConfig{ + Endpoint: "localhost:1234", + ClientType: TracingClientGRPC, + } + require.NoError(t, m.ApplyConfig(cfg)) + t.Cleanup(func() { + tracingEnabled.Store(false) + m.shutdownFunc = nil + }) + + require.True(t, tracingEnabled.Load()) + tpBefore := otel.GetTracerProvider() + shutdownBefore := m.shutdownFunc + + // Apply an invalid config that will cause buildTracerProvider to fail. + badCfg := TracingConfig{ + Endpoint: "localhost:1234", + ClientType: "invalid-client-type", + } + require.Error(t, m.ApplyConfig(badCfg)) + + // State should be preserved — tracing still enabled with original provider. + require.True(t, tracingEnabled.Load()) + require.Equal(t, tpBefore, otel.GetTracerProvider()) + require.NotNil(t, m.shutdownFunc) + // shutdownFunc should be the same as before the failed apply. + require.Equal(t, fmt.Sprintf("%p", shutdownBefore), fmt.Sprintf("%p", m.shutdownFunc)) +} + func TestInstallingNewTracerProvider(t *testing.T) { tpBefore := otel.GetTracerProvider() From e3d714ad5fd530a74b346b76e4b388334d6ba165 Mon Sep 17 00:00:00 2001 From: Siavash Safi Date: Fri, 3 Apr 2026 15:46:42 +0200 Subject: [PATCH 2/3] use better func comparison in tests Signed-off-by: Siavash Safi --- tracing/tracing_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tracing/tracing_test.go b/tracing/tracing_test.go index dec8b68ea1..683e5ae45f 100644 --- a/tracing/tracing_test.go +++ b/tracing/tracing_test.go @@ -15,7 +15,7 @@ package tracing import ( "context" - "fmt" + "reflect" "testing" commoncfg "github.com/prometheus/common/config" @@ -144,7 +144,7 @@ func TestApplyConfigBuildFailurePreservesState(t *testing.T) { require.Equal(t, tpBefore, otel.GetTracerProvider()) require.NotNil(t, m.shutdownFunc) // shutdownFunc should be the same as before the failed apply. - require.Equal(t, fmt.Sprintf("%p", shutdownBefore), fmt.Sprintf("%p", m.shutdownFunc)) + require.Equal(t, reflect.ValueOf(shutdownBefore).Pointer(), reflect.ValueOf(m.shutdownFunc).Pointer()) } func TestInstallingNewTracerProvider(t *testing.T) { From bd8085a562c18af000c24c3f241a0f6c50cab6a3 Mon Sep 17 00:00:00 2001 From: Siavash Safi Date: Fri, 3 Apr 2026 15:59:36 +0200 Subject: [PATCH 3/3] fix(tracing): call stop for test cleanup Signed-off-by: Siavash Safi --- tracing/tracing_test.go | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/tracing/tracing_test.go b/tracing/tracing_test.go index 683e5ae45f..8000926c6e 100644 --- a/tracing/tracing_test.go +++ b/tracing/tracing_test.go @@ -51,10 +51,7 @@ func TestConditionalTracerEnabled(t *testing.T) { ClientType: TracingClientGRPC, } require.NoError(t, m.ApplyConfig(cfg)) - t.Cleanup(func() { - tracingEnabled.Store(false) - m.shutdownFunc = nil - }) + t.Cleanup(m.Stop) tracer := NewTracer("test") ctx := context.Background() @@ -68,10 +65,7 @@ func TestConditionalTracerEnabled(t *testing.T) { func TestTracingEnabledFlagTransitions(t *testing.T) { m := NewManager(promslog.NewNopLogger()) - t.Cleanup(func() { - tracingEnabled.Store(false) - m.shutdownFunc = nil - }) + t.Cleanup(m.Stop) // Initially disabled. require.False(t, tracingEnabled.Load()) @@ -96,10 +90,7 @@ func TestApplyConfigNoGapDuringReload(t *testing.T) { ClientType: TracingClientGRPC, } require.NoError(t, m.ApplyConfig(cfg)) - t.Cleanup(func() { - tracingEnabled.Store(false) - m.shutdownFunc = nil - }) + t.Cleanup(m.Stop) require.True(t, tracingEnabled.Load()) tpBefore := otel.GetTracerProvider() @@ -123,10 +114,7 @@ func TestApplyConfigBuildFailurePreservesState(t *testing.T) { ClientType: TracingClientGRPC, } require.NoError(t, m.ApplyConfig(cfg)) - t.Cleanup(func() { - tracingEnabled.Store(false) - m.shutdownFunc = nil - }) + t.Cleanup(m.Stop) require.True(t, tracingEnabled.Load()) tpBefore := otel.GetTracerProvider()