Skip to content

Commit 6f4b819

Browse files
bm1549claudedarcciomtoffl01
authored
fix(tracer): only set _dd.p.ksr after agent rates are received (#4523)
### What does this PR do? Fixes `_dd.p.ksr` (Knuth Sampling Rate) to only be set on spans when the agent has provided sampling rates via `readRatesJSON()`. Previously, ksr was unconditionally set in `prioritySampler.apply()`, including when the rate was the initial client-side default (1.0) before any agent response arrived. Also refactors `prioritySampler` to consolidate lock acquisitions: extracts `getRateLocked()` so `apply()` acquires `ps.mu.RLock` only once to read both the rate and `agentRatesLoaded`. ### Motivation Cross-language consistency: Python, Java, PHP, and other tracers only set ksr when actual agent rates or sampling rules are applied, not for the default fallback. This aligns Go with that behavior. See RFC: "Transmit Knuth sampling rate to backend" ### Additional Notes - Added `agentRatesLoaded` bool field to `prioritySampler`, set to `true` in `readRatesJSON()` - `apply()` now gates ksr behind `agentRatesLoaded` check - Extracted `getRateLocked()` to avoid double lock acquisition in `apply()` - Rule-based sampling path (`applyTraceRuleSampling` in span.go) unchanged — correctly always sets ksr - Tests added: `ksr-not-set-without-agent-rates` and `ksr-set-after-agent-rates-received` Related PRs across tracers: - Java: DataDog/dd-trace-java#10802 - .NET: DataDog/dd-trace-dotnet#8287 - Ruby: DataDog/dd-trace-rb#5436 - Node.js: DataDog/dd-trace-js#7741 - PHP: DataDog/dd-trace-php#3701 - Rust: DataDog/dd-trace-rs#180 - C++: DataDog/dd-trace-cpp#288 - System tests: DataDog/system-tests#6466 ### Reviewer's Checklist - [x] Changed code has unit tests for its functionality at or near 100% coverage. - [x] [System-Tests](https://github.com/DataDog/system-tests/) covering this feature have been added and enabled with the va.b.c-dev version tag. - [ ] There is a benchmark for any new code, or changes to existing code. - [x] If this interacts with the agent in a new way, a system test has been added. - [x] New code is free of linting errors. You can check this by running `make lint` locally. - [x] New code doesn't break existing tests. You can check this by running `make test` locally. - [ ] Add an appropriate team label so this PR gets put in the right place for the release notes. - [ ] All generated files are up to date. You can check this by running `make generate` locally. - [ ] Non-trivial go.mod changes, e.g. adding new modules, are reviewed by @DataDog/dd-trace-go-guild. Make sure all nested modules are up to date by running `make fix-modules` locally. Unsure? Have a question? Request a review! 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Dario Castañé <dario.castane@datadoghq.com> Co-authored-by: Mikayla Toffler <46911781+mtoffl01@users.noreply.github.com>
1 parent 0b8e268 commit 6f4b819

2 files changed

Lines changed: 87 additions & 8 deletions

File tree

ddtrace/tracer/sampler.go

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
"github.com/DataDog/dd-trace-go/v2/ddtrace/ext"
1717
"github.com/DataDog/dd-trace-go/v2/internal/locking"
18+
"github.com/DataDog/dd-trace-go/v2/internal/locking/assert"
1819
"github.com/DataDog/dd-trace-go/v2/internal/samplernames"
1920
)
2021

@@ -137,10 +138,11 @@ const rampUpInterval = time.Second
137138
// prioritySampler holds a set of per-service sampling rates and applies
138139
// them to spans.
139140
type prioritySampler struct {
140-
mu locking.RWMutex
141-
rates map[serviceEnvKey]float64 // +checklocks:mu
142-
defaultRate float64 // +checklocks:mu
143-
lastCapped time.Time // +checklocks:mu
141+
mu locking.RWMutex
142+
rates map[serviceEnvKey]float64 // +checklocks:mu
143+
defaultRate float64 // +checklocks:mu
144+
agentRatesLoaded bool // +checklocks:mu
145+
lastCapped time.Time // +checklocks:mu
144146
}
145147

146148
func newPrioritySampler() *prioritySampler {
@@ -198,6 +200,7 @@ func (ps *prioritySampler) readRatesJSON(rc io.ReadCloser) error {
198200
}
199201
ps.mu.Lock()
200202
defer ps.mu.Unlock()
203+
ps.agentRatesLoaded = true
201204
now := time.Now()
202205
canIncrease := ps.lastCapped.IsZero() || now.Sub(ps.lastCapped) >= rampUpInterval
203206
capApplied := false
@@ -225,9 +228,17 @@ func (ps *prioritySampler) readRatesJSON(rc io.ReadCloser) error {
225228
// guard the span.
226229
// +checklocksignore — Called during initialization in StartSpan, span not yet shared.
227230
func (ps *prioritySampler) getRate(spn *Span) float64 {
228-
key := serviceEnvKey{service: spn.service, env: spn.meta[ext.Environment]}
229231
ps.mu.RLock()
230232
defer ps.mu.RUnlock()
233+
return ps.getRateLocked(spn)
234+
}
235+
236+
// getRateLocked returns the sampling rate for the given span.
237+
// Caller must hold ps.mu (at least RLock).
238+
// +checklocksignore — Called during initialization in StartSpan, span not yet shared.
239+
func (ps *prioritySampler) getRateLocked(spn *Span) float64 {
240+
assert.RWMutexRLocked(&ps.mu)
241+
key := serviceEnvKey{service: spn.service, env: spn.meta[ext.Environment]}
231242
if rate, ok := ps.rates[key]; ok {
232243
return rate
233244
}
@@ -245,13 +256,21 @@ func (ps *prioritySampler) getDefaultRate() float64 {
245256
// to modify the span.
246257
// +checklocksignore — Called during initialization in StartSpan, span not yet shared.
247258
func (ps *prioritySampler) apply(spn *Span) {
248-
rate := ps.getRate(spn)
259+
ps.mu.RLock()
260+
rate := ps.getRateLocked(spn)
261+
fromAgent := ps.agentRatesLoaded
262+
ps.mu.RUnlock()
249263
if sampledByRate(spn.traceID, rate) {
250264
spn.setSamplingPriority(ext.PriorityAutoKeep, samplernames.AgentRate)
251265
} else {
252266
spn.setSamplingPriority(ext.PriorityAutoReject, samplernames.AgentRate)
253267
}
254268
spn.SetTag(keySamplingPriorityRate, rate)
255-
// Set the Knuth sampling rate tag when sampled by agent rate
256-
spn.SetTag(keyKnuthSamplingRate, formatKnuthSamplingRate(rate))
269+
// Only set the Knuth sampling rate tag when actual agent rates have been
270+
// received. The initial default rate (1.0) is a client-side fallback that
271+
// does not represent an agent-configured rate, so it must not propagate
272+
// as _dd.p.ksr to stay consistent with other tracers.
273+
if fromAgent {
274+
spn.SetTag(keyKnuthSamplingRate, formatKnuthSamplingRate(rate))
275+
}
257276
}

ddtrace/tracer/sampler_test.go

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,66 @@ func TestPrioritySampler(t *testing.T) {
202202
assert.EqualValues(ext.PriorityAutoReject, priority)
203203
assert.EqualValues(0.5, rate)
204204
})
205+
206+
t.Run("ksr-not-set-without-agent-rates", func(t *testing.T) {
207+
// When no agent rates have been received, the priority sampler uses
208+
// its initial default rate (1.0). This is a client-side fallback and
209+
// should NOT propagate as _dd.p.ksr to stay consistent with other
210+
// Datadog tracers.
211+
assert := assert.New(t)
212+
ps := newPrioritySampler()
213+
spn := newBasicSpan("http.request")
214+
spn.service = "my-service"
215+
spn.traceID = 1
216+
217+
ps.apply(spn)
218+
_, ok := getMeta(spn, keyKnuthSamplingRate)
219+
assert.False(ok, "_dd.p.ksr must not be set when no agent rates have been received")
220+
221+
// Sampling priority and rate metric should still be set normally
222+
priority, _ := getMetric(spn, keySamplingPriority)
223+
assert.EqualValues(ext.PriorityAutoKeep, priority)
224+
rate, _ := getMetric(spn, keySamplingPriorityRate)
225+
assert.EqualValues(1.0, rate)
226+
})
227+
228+
t.Run("ksr-set-after-agent-rates-received", func(t *testing.T) {
229+
// After agent rates are received via readRatesJSON, apply should
230+
// set _dd.p.ksr — even when the span falls through to the default
231+
// rate provided by the agent.
232+
assert := assert.New(t)
233+
ps := newPrioritySampler()
234+
assert.NoError(ps.readRatesJSON(
235+
io.NopCloser(strings.NewReader(
236+
`{
237+
"rate_by_service":{
238+
"service:,env:":0.8,
239+
"service:obfuscate.http,env:":0.5
240+
}
241+
}`,
242+
)),
243+
))
244+
245+
// Span matching a per-service rate
246+
spn1 := newBasicSpan("http.request")
247+
spn1.service = "obfuscate.http"
248+
spn1.traceID = 1
249+
250+
ps.apply(spn1)
251+
ksr, ok := getMeta(spn1, keyKnuthSamplingRate)
252+
assert.True(ok, "_dd.p.ksr must be set when agent rates have been received (per-service)")
253+
assert.Equal("0.5", ksr)
254+
255+
// Span falling through to agent-provided default rate
256+
spn2 := newBasicSpan("http.request")
257+
spn2.service = "unknown-service"
258+
spn2.traceID = 1
259+
260+
ps.apply(spn2)
261+
ksr, ok = getMeta(spn2, keyKnuthSamplingRate)
262+
assert.True(ok, "_dd.p.ksr must be set when agent rates have been received (default)")
263+
assert.Equal("0.8", ksr)
264+
})
205265
}
206266

207267
func BenchmarkPrioritySamplerGetRate(b *testing.B) {

0 commit comments

Comments
 (0)